rpdf2txt 0.8.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (127) hide show
  1. data/History.txt +5 -0
  2. data/LICENCE +515 -0
  3. data/Manifest.txt +126 -0
  4. data/README.txt +30 -0
  5. data/Rakefile +24 -0
  6. data/bin/rpdf2txt +58 -0
  7. data/config.save +12 -0
  8. data/install.rb +1098 -0
  9. data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
  10. data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
  11. data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
  12. data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
  13. data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
  14. data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
  15. data/lib/rpdf2txt-rockit/grammar.rb +644 -0
  16. data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
  17. data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
  18. data/lib/rpdf2txt-rockit/indexable.rb +53 -0
  19. data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
  20. data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
  21. data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
  22. data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
  23. data/lib/rpdf2txt-rockit/profiler.rb +168 -0
  24. data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
  25. data/lib/rpdf2txt-rockit/rockit.rb +76 -0
  26. data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
  27. data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
  28. data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
  29. data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
  30. data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
  31. data/lib/rpdf2txt-rockit/token.rb +364 -0
  32. data/lib/rpdf2txt-rockit/version.rb +3 -0
  33. data/lib/rpdf2txt/attributesparser.rb +42 -0
  34. data/lib/rpdf2txt/cmapparser.rb +65 -0
  35. data/lib/rpdf2txt/data/_cmap.grammar +11 -0
  36. data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
  37. data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
  38. data/lib/rpdf2txt/data/cmap.grammar +11 -0
  39. data/lib/rpdf2txt/data/cmap.rb +37 -0
  40. data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
  41. data/lib/rpdf2txt/data/cmap_range.rb +43 -0
  42. data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
  43. data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
  44. data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
  45. data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
  46. data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
  47. data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
  48. data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
  49. data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
  50. data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
  51. data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
  52. data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
  53. data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
  54. data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
  55. data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
  56. data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
  57. data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
  58. data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
  59. data/lib/rpdf2txt/data/pdftext.grammar +102 -0
  60. data/lib/rpdf2txt/data/pdftext.rb +146 -0
  61. data/lib/rpdf2txt/default_handler.rb +352 -0
  62. data/lib/rpdf2txt/lzw.rb +69 -0
  63. data/lib/rpdf2txt/object.rb +1114 -0
  64. data/lib/rpdf2txt/parser.rb +169 -0
  65. data/lib/rpdf2txt/symbol.rb +408 -0
  66. data/lib/rpdf2txt/text.rb +182 -0
  67. data/lib/rpdf2txt/text_state.rb +434 -0
  68. data/lib/rpdf2txt/textparser.rb +42 -0
  69. data/test/data/3392_obj +0 -0
  70. data/test/data/397_decrypted +15 -0
  71. data/test/data/450_decrypted +153 -0
  72. data/test/data/450_obj +0 -0
  73. data/test/data/452_decrypted +125 -0
  74. data/test/data/454_decrypted +108 -0
  75. data/test/data/456_decrypted +106 -0
  76. data/test/data/458_decrypted +111 -0
  77. data/test/data/458_obj +0 -0
  78. data/test/data/460_decrypted +118 -0
  79. data/test/data/460_obj +0 -0
  80. data/test/data/463_decrypted +117 -0
  81. data/test/data/465_decrypted +107 -0
  82. data/test/data/465_obj +0 -0
  83. data/test/data/90_obj +0 -0
  84. data/test/data/90_obj_comp +1 -0
  85. data/test/data/decrypted +0 -0
  86. data/test/data/encrypt_obj +0 -0
  87. data/test/data/encrypt_string +0 -0
  88. data/test/data/encrypt_string_128bit +0 -0
  89. data/test/data/encrypted_object_stream.pdf +0 -0
  90. data/test/data/firststream +1 -0
  91. data/test/data/index.pdfobj +0 -0
  92. data/test/data/index_2bit.pdfobj +0 -0
  93. data/test/data/index_masked.pdfobj +0 -0
  94. data/test/data/indexed.pdfobj +0 -0
  95. data/test/data/indexed_2bit.pdfobj +0 -0
  96. data/test/data/indexed_masked.pdfobj +0 -0
  97. data/test/data/inline.png +0 -0
  98. data/test/data/logo.png +0 -0
  99. data/test/data/lzw.pdfobj +0 -0
  100. data/test/data/lzw_index.pdfobj +0 -0
  101. data/test/data/page_tree.pdf +148 -0
  102. data/test/data/pdf_20.png +0 -0
  103. data/test/data/pdf_21.png +0 -0
  104. data/test/data/pdf_22.png +0 -0
  105. data/test/data/pdf_50.png +0 -0
  106. data/test/data/png.pdfobj +0 -0
  107. data/test/data/space_bug_stream.txt +119 -0
  108. data/test/data/stream.txt +292 -0
  109. data/test/data/stream_kerning_bug.txt +13 -0
  110. data/test/data/stream_kerning_bug2.txt +6 -0
  111. data/test/data/test.pdf +0 -0
  112. data/test/data/test.txt +8 -0
  113. data/test/data/test_text.txt +42 -0
  114. data/test/data/working_obj +0 -0
  115. data/test/data/working_obj2 +0 -0
  116. data/test/mock.rb +149 -0
  117. data/test/suite.rb +30 -0
  118. data/test/test_pdf_object.rb +1802 -0
  119. data/test/test_pdf_parser.rb +1340 -0
  120. data/test/test_pdf_text.rb +789 -0
  121. data/test/test_space_bug_05_2004.rb +87 -0
  122. data/test/test_stream.rb +194 -0
  123. data/test/test_text_state.rb +315 -0
  124. data/usage-en.txt +112 -0
  125. data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
  126. data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
  127. metadata +220 -0
@@ -0,0 +1,112 @@
1
+ Installing Programs with install.rb / setup.rb
2
+ ==============================================
3
+
4
+ Overview
5
+ --------
6
+
7
+ Type these lines on command line:
8
+ ("#" line may require root privilege)
9
+
10
+ $ ruby install.rb config
11
+ $ ruby install.rb setup
12
+ # ruby install.rb install
13
+
14
+
15
+ There's no difference in usage between install.rb
16
+ and setup.rb.
17
+
18
+ $ ruby setup.rb config
19
+ $ ruby setup.rb setup
20
+ # ruby setup.rb install
21
+
22
+
23
+ Details
24
+ -------
25
+
26
+ Usage of install.rb/setup.rb is:
27
+
28
+ ruby install.rb <global options>
29
+ ruby install.rb [<global options>] <task> [<task options>]
30
+
31
+
32
+ -q,--quiet
33
+ suppress message outputs
34
+ --verbose
35
+ output messages verbosely (default)
36
+ -h,--help
37
+ prints help and quits
38
+ -v,--version
39
+ prints version and quits
40
+ --copyright
41
+ prints copyright and quits
42
+
43
+ These are acceptable tasks:
44
+ config
45
+ saves configurations
46
+ show
47
+ prints current configurations
48
+ setup
49
+ compiles extensions
50
+ install
51
+ installs files
52
+ clean
53
+ cleans created files
54
+
55
+ Task Options for Config
56
+ -----------------------
57
+
58
+ --prefix=PATH
59
+ a prefix of the installing directory path
60
+ --std-ruby=PATH
61
+ the directory for standard ruby libraries
62
+ --site-ruby-common=PATH
63
+ the directory for version-independent non-standard
64
+ ruby libraries
65
+ --site-ruby=PATH
66
+ the directory for non-standard ruby libraries
67
+ --bin-dir=PATH
68
+ the directory for commands
69
+ --rb-dir=PATH
70
+ the directory for ruby scripts
71
+ --so-dir=PATH
72
+ the directory for ruby extensions
73
+ --data-dir=PATH
74
+ the directory for shared data
75
+ --ruby-path=PATH
76
+ path to set to #! line
77
+ --ruby-prog=PATH
78
+ the ruby program used for installation
79
+ --make-prog=NAME
80
+ the make program to compile ruby extensions
81
+ --without-ext
82
+ forces install.rb never to compile/install
83
+ ruby extensions.
84
+ --rbconfig=PATH
85
+ your rbconfig.rb to load
86
+
87
+ You can view default values of these options by typing
88
+
89
+ $ ruby install.rb --help
90
+
91
+
92
+ In addition, setup.rb accepts these options:
93
+ --with=NAME,NAME,NAME...
94
+ package names which you want to install
95
+ --without=NAME,NAME,NAME...
96
+ package names which you do not want to install
97
+
98
+ [NOTE] You can pass options for extconf.rb like this:
99
+
100
+ ruby install.rb config -- --with-tklib=/usr/lib/libtk-ja.so.8.0
101
+
102
+
103
+ Task Options for Install
104
+ ------------------------
105
+
106
+ --no-harm
107
+ prints what would be done without actually doing anything.
108
+ --prefix=PATH
109
+ a prefix of the installing directory path.
110
+ This option may help binary package maintainers.
111
+ A default value is an empty string.
112
+
@@ -0,0 +1,34 @@
1
+ 2004-06-16 Zeno Davatz <zdavatz@ywesee.com>
2
+ * Fi und Pi neu aus PDF parsen.
3
+ + hwyss@ywesee.com: PDF-Parser so anpassen, dass er optional zusätzlich zum Text auch
4
+ Formatierungen mitliefert (das brauchen wir, um die Dokument-Struktur zu
5
+ erfassen).
6
+
7
+ 2004-05-28 Hannes Wyss <hwyss@ywesee.com>
8
+ * Analysis of the Erroneous Space Chars Bug:
9
+ DESCRIPTION:
10
+ At the Moment, Rpdf2txt::Text performs an Iconv-Operation to unify the
11
+ Encoding of all Text-Snippets. This does not work if the resulting
12
+ Character-Id is not defined in the original Font (and only works fuzzy if it
13
+ does).
14
+ PROPOSED SOLUTION:
15
+ Have a Parser-Wide target-encoding, which is propagated to all
16
+ Pointer-Instances. Perform the conversion in Pointer#txt, according to its
17
+ Font-Encoding.
18
+ CLEANUP:
19
+ Return an Array of Pointers instead of a Hash.
20
+ 2004-05-26 Zeno Davatz <zdavatz@ywesee.com>
21
+ * If possible try before the 1.6.2004
22
+ 2004-05-26 Hannes Wyss <hwyss@ywesee.com>
23
+ * Bug-Report: Erroneous Space Chars
24
+ DESCRIPTION:
25
+ - Umlauts and Accented characters are not given the right width and
26
+ subsequently the resulting ascii has erroneous spaces.
27
+ - Example-File in
28
+ ./user-stories/documents/swissmedicjournal/04_2004.pdf
29
+ ESTIMATE: 8h
30
+ - 4h Finding the Problem
31
+ - 2h Writing Tests
32
+ - 2h Writing Code
33
+ STATUS: pending Assignment
34
+
metadata ADDED
@@ -0,0 +1,220 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rpdf2txt
3
+ version: !ruby/object:Gem::Version
4
+ hash: 59
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 8
9
+ - 2
10
+ version: 0.8.2
11
+ platform: ruby
12
+ authors:
13
+ - Masaomi Hatakeyama, Zeno R.R. Davatz
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-12-15 00:00:00 +01:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: hoe
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 47
30
+ segments:
31
+ - 2
32
+ - 8
33
+ - 0
34
+ version: 2.8.0
35
+ type: :development
36
+ version_requirements: *id001
37
+ description: rpdf2txt will turn a PDF-file into a textfile.
38
+ email:
39
+ - mhatakeyama@ywesee.com, zdavatz@ywesee.com
40
+ executables:
41
+ - rpdf2txt
42
+ extensions: []
43
+
44
+ extra_rdoc_files:
45
+ - History.txt
46
+ - Manifest.txt
47
+ - README.txt
48
+ - bin/rpdf2txt
49
+ - lib/rpdf2txt/data/fonts/License-Adobe.txt
50
+ - usage-en.txt
51
+ - user-stories/UserStories_Rpdf2Txt.txt
52
+ files:
53
+ - LICENCE
54
+ - History.txt
55
+ - Manifest.txt
56
+ - README.txt
57
+ - Rakefile
58
+ - bin/rpdf2txt
59
+ - config.save
60
+ - install.rb
61
+ - lib/rpdf2txt-rockit/base_extensions.rb
62
+ - lib/rpdf2txt-rockit/bootstrap.rb
63
+ - lib/rpdf2txt-rockit/bounded_lru_cache.rb
64
+ - lib/rpdf2txt-rockit/conflict_resolution.rb
65
+ - lib/rpdf2txt-rockit/directed_graph.rb
66
+ - lib/rpdf2txt-rockit/glr_parser.rb
67
+ - lib/rpdf2txt-rockit/grammar.rb
68
+ - lib/rpdf2txt-rockit/graphdrawing.rb
69
+ - lib/rpdf2txt-rockit/graphviz_dot.rb
70
+ - lib/rpdf2txt-rockit/indexable.rb
71
+ - lib/rpdf2txt-rockit/lalr_parsetable_generator.rb
72
+ - lib/rpdf2txt-rockit/parse_table.rb
73
+ - lib/rpdf2txt-rockit/parsetable_generation.rb
74
+ - lib/rpdf2txt-rockit/parsing_ambiguities.rb
75
+ - lib/rpdf2txt-rockit/profiler.rb
76
+ - lib/rpdf2txt-rockit/reduce_actions_generator.rb
77
+ - lib/rpdf2txt-rockit/rockit.rb
78
+ - lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb
79
+ - lib/rpdf2txt-rockit/rockit_grammars_parser.rb
80
+ - lib/rpdf2txt-rockit/sourcecode_dumpable.rb
81
+ - lib/rpdf2txt-rockit/stringscanner.rb
82
+ - lib/rpdf2txt-rockit/syntax_tree.rb
83
+ - lib/rpdf2txt-rockit/token.rb
84
+ - lib/rpdf2txt-rockit/version.rb
85
+ - lib/rpdf2txt/attributesparser.rb
86
+ - lib/rpdf2txt/cmapparser.rb
87
+ - lib/rpdf2txt/data/cmap.grammar
88
+ - lib/rpdf2txt/data/cmap.rb
89
+ - lib/rpdf2txt/data/cmap_range.grammar
90
+ - lib/rpdf2txt/data/cmap_range.rb
91
+ - lib/rpdf2txt/data/_cmap.grammar
92
+ - lib/rpdf2txt/data/_cmap_range.grammar
93
+ - lib/rpdf2txt/data/_pdfattributes.grammar
94
+ - lib/rpdf2txt/data/fonts/Courier-Bold.afm
95
+ - lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm
96
+ - lib/rpdf2txt/data/fonts/Courier-Oblique.afm
97
+ - lib/rpdf2txt/data/fonts/Courier.afm
98
+ - lib/rpdf2txt/data/fonts/Helvetica-Bold.afm
99
+ - lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm
100
+ - lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm
101
+ - lib/rpdf2txt/data/fonts/Helvetica.afm
102
+ - lib/rpdf2txt/data/fonts/License-Adobe.txt
103
+ - lib/rpdf2txt/data/fonts/Symbol.afm
104
+ - lib/rpdf2txt/data/fonts/Times-Bold.afm
105
+ - lib/rpdf2txt/data/fonts/Times-BoldItalic.afm
106
+ - lib/rpdf2txt/data/fonts/Times-Italic.afm
107
+ - lib/rpdf2txt/data/fonts/Times-Roman.afm
108
+ - lib/rpdf2txt/data/fonts/ZapfDingbats.afm
109
+ - lib/rpdf2txt/data/pdfattributes.grammar
110
+ - lib/rpdf2txt/data/pdfattributes.rb
111
+ - lib/rpdf2txt/data/pdftext.grammar
112
+ - lib/rpdf2txt/data/pdftext.rb
113
+ - lib/rpdf2txt/default_handler.rb
114
+ - lib/rpdf2txt/lzw.rb
115
+ - lib/rpdf2txt/object.rb
116
+ - lib/rpdf2txt/parser.rb
117
+ - lib/rpdf2txt/symbol.rb
118
+ - lib/rpdf2txt/text.rb
119
+ - lib/rpdf2txt/text_state.rb
120
+ - lib/rpdf2txt/textparser.rb
121
+ - test/data/3392_obj
122
+ - test/data/397_decrypted
123
+ - test/data/450_decrypted
124
+ - test/data/450_obj
125
+ - test/data/452_decrypted
126
+ - test/data/454_decrypted
127
+ - test/data/456_decrypted
128
+ - test/data/458_decrypted
129
+ - test/data/458_obj
130
+ - test/data/460_decrypted
131
+ - test/data/460_obj
132
+ - test/data/463_decrypted
133
+ - test/data/465_decrypted
134
+ - test/data/465_obj
135
+ - test/data/90_obj
136
+ - test/data/90_obj_comp
137
+ - test/data/decrypted
138
+ - test/data/encrypt_obj
139
+ - test/data/encrypt_string
140
+ - test/data/encrypt_string_128bit
141
+ - test/data/encrypted_object_stream.pdf
142
+ - test/data/firststream
143
+ - test/data/index.pdfobj
144
+ - test/data/index_2bit.pdfobj
145
+ - test/data/index_masked.pdfobj
146
+ - test/data/indexed.pdfobj
147
+ - test/data/indexed_2bit.pdfobj
148
+ - test/data/indexed_masked.pdfobj
149
+ - test/data/inline.png
150
+ - test/data/logo.png
151
+ - test/data/lzw.pdfobj
152
+ - test/data/lzw_index.pdfobj
153
+ - test/data/page_tree.pdf
154
+ - test/data/pdf_20.png
155
+ - test/data/pdf_21.png
156
+ - test/data/pdf_22.png
157
+ - test/data/pdf_50.png
158
+ - test/data/png.pdfobj
159
+ - test/data/space_bug_stream.txt
160
+ - test/data/stream.txt
161
+ - test/data/stream_kerning_bug.txt
162
+ - test/data/stream_kerning_bug2.txt
163
+ - test/data/test.pdf
164
+ - test/data/test.txt
165
+ - test/data/test_text.txt
166
+ - test/data/working_obj
167
+ - test/data/working_obj2
168
+ - test/mock.rb
169
+ - test/suite.rb
170
+ - test/test_pdf_object.rb
171
+ - test/test_pdf_parser.rb
172
+ - test/test_pdf_text.rb
173
+ - test/test_space_bug_05_2004.rb
174
+ - test/test_stream.rb
175
+ - test/test_text_state.rb
176
+ - usage-en.txt
177
+ - user-stories/UserStories_Rpdf2Txt.txt
178
+ - user-stories/documents/swissmedicjournal/04_2004.pdf
179
+ has_rdoc: true
180
+ homepage: http://scm.ywesee.com/?p=rpdf2txt/.git;a=summary
181
+ licenses: []
182
+
183
+ post_install_message:
184
+ rdoc_options:
185
+ - --main
186
+ - README.txt
187
+ require_paths:
188
+ - lib
189
+ required_ruby_version: !ruby/object:Gem::Requirement
190
+ none: false
191
+ requirements:
192
+ - - ">="
193
+ - !ruby/object:Gem::Version
194
+ hash: 3
195
+ segments:
196
+ - 0
197
+ version: "0"
198
+ required_rubygems_version: !ruby/object:Gem::Requirement
199
+ none: false
200
+ requirements:
201
+ - - ">="
202
+ - !ruby/object:Gem::Version
203
+ hash: 3
204
+ segments:
205
+ - 0
206
+ version: "0"
207
+ requirements: []
208
+
209
+ rubyforge_project: rpdf2txt
210
+ rubygems_version: 1.3.7
211
+ signing_key:
212
+ specification_version: 3
213
+ summary: rpdf2txt will turn a PDF-file into a textfile.
214
+ test_files:
215
+ - test/test_pdf_object.rb
216
+ - test/test_pdf_parser.rb
217
+ - test/test_pdf_text.rb
218
+ - test/test_space_bug_05_2004.rb
219
+ - test/test_stream.rb
220
+ - test/test_text_state.rb