rpdf2txt 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. data/History.txt +5 -0
  2. data/LICENCE +515 -0
  3. data/Manifest.txt +126 -0
  4. data/README.txt +30 -0
  5. data/Rakefile +24 -0
  6. data/bin/rpdf2txt +58 -0
  7. data/config.save +12 -0
  8. data/install.rb +1098 -0
  9. data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
  10. data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
  11. data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
  12. data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
  13. data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
  14. data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
  15. data/lib/rpdf2txt-rockit/grammar.rb +644 -0
  16. data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
  17. data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
  18. data/lib/rpdf2txt-rockit/indexable.rb +53 -0
  19. data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
  20. data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
  21. data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
  22. data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
  23. data/lib/rpdf2txt-rockit/profiler.rb +168 -0
  24. data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
  25. data/lib/rpdf2txt-rockit/rockit.rb +76 -0
  26. data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
  27. data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
  28. data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
  29. data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
  30. data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
  31. data/lib/rpdf2txt-rockit/token.rb +364 -0
  32. data/lib/rpdf2txt-rockit/version.rb +3 -0
  33. data/lib/rpdf2txt/attributesparser.rb +42 -0
  34. data/lib/rpdf2txt/cmapparser.rb +65 -0
  35. data/lib/rpdf2txt/data/_cmap.grammar +11 -0
  36. data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
  37. data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
  38. data/lib/rpdf2txt/data/cmap.grammar +11 -0
  39. data/lib/rpdf2txt/data/cmap.rb +37 -0
  40. data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
  41. data/lib/rpdf2txt/data/cmap_range.rb +43 -0
  42. data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
  43. data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
  44. data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
  45. data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
  46. data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
  47. data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
  48. data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
  49. data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
  50. data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
  51. data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
  52. data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
  53. data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
  54. data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
  55. data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
  56. data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
  57. data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
  58. data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
  59. data/lib/rpdf2txt/data/pdftext.grammar +102 -0
  60. data/lib/rpdf2txt/data/pdftext.rb +146 -0
  61. data/lib/rpdf2txt/default_handler.rb +352 -0
  62. data/lib/rpdf2txt/lzw.rb +69 -0
  63. data/lib/rpdf2txt/object.rb +1114 -0
  64. data/lib/rpdf2txt/parser.rb +169 -0
  65. data/lib/rpdf2txt/symbol.rb +408 -0
  66. data/lib/rpdf2txt/text.rb +182 -0
  67. data/lib/rpdf2txt/text_state.rb +434 -0
  68. data/lib/rpdf2txt/textparser.rb +42 -0
  69. data/test/data/3392_obj +0 -0
  70. data/test/data/397_decrypted +15 -0
  71. data/test/data/450_decrypted +153 -0
  72. data/test/data/450_obj +0 -0
  73. data/test/data/452_decrypted +125 -0
  74. data/test/data/454_decrypted +108 -0
  75. data/test/data/456_decrypted +106 -0
  76. data/test/data/458_decrypted +111 -0
  77. data/test/data/458_obj +0 -0
  78. data/test/data/460_decrypted +118 -0
  79. data/test/data/460_obj +0 -0
  80. data/test/data/463_decrypted +117 -0
  81. data/test/data/465_decrypted +107 -0
  82. data/test/data/465_obj +0 -0
  83. data/test/data/90_obj +0 -0
  84. data/test/data/90_obj_comp +1 -0
  85. data/test/data/decrypted +0 -0
  86. data/test/data/encrypt_obj +0 -0
  87. data/test/data/encrypt_string +0 -0
  88. data/test/data/encrypt_string_128bit +0 -0
  89. data/test/data/encrypted_object_stream.pdf +0 -0
  90. data/test/data/firststream +1 -0
  91. data/test/data/index.pdfobj +0 -0
  92. data/test/data/index_2bit.pdfobj +0 -0
  93. data/test/data/index_masked.pdfobj +0 -0
  94. data/test/data/indexed.pdfobj +0 -0
  95. data/test/data/indexed_2bit.pdfobj +0 -0
  96. data/test/data/indexed_masked.pdfobj +0 -0
  97. data/test/data/inline.png +0 -0
  98. data/test/data/logo.png +0 -0
  99. data/test/data/lzw.pdfobj +0 -0
  100. data/test/data/lzw_index.pdfobj +0 -0
  101. data/test/data/page_tree.pdf +148 -0
  102. data/test/data/pdf_20.png +0 -0
  103. data/test/data/pdf_21.png +0 -0
  104. data/test/data/pdf_22.png +0 -0
  105. data/test/data/pdf_50.png +0 -0
  106. data/test/data/png.pdfobj +0 -0
  107. data/test/data/space_bug_stream.txt +119 -0
  108. data/test/data/stream.txt +292 -0
  109. data/test/data/stream_kerning_bug.txt +13 -0
  110. data/test/data/stream_kerning_bug2.txt +6 -0
  111. data/test/data/test.pdf +0 -0
  112. data/test/data/test.txt +8 -0
  113. data/test/data/test_text.txt +42 -0
  114. data/test/data/working_obj +0 -0
  115. data/test/data/working_obj2 +0 -0
  116. data/test/mock.rb +149 -0
  117. data/test/suite.rb +30 -0
  118. data/test/test_pdf_object.rb +1802 -0
  119. data/test/test_pdf_parser.rb +1340 -0
  120. data/test/test_pdf_text.rb +789 -0
  121. data/test/test_space_bug_05_2004.rb +87 -0
  122. data/test/test_stream.rb +194 -0
  123. data/test/test_text_state.rb +315 -0
  124. data/usage-en.txt +112 -0
  125. data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
  126. data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
  127. metadata +220 -0
@@ -0,0 +1,112 @@
1
+ Installing Programs with install.rb / setup.rb
2
+ ==============================================
3
+
4
+ Overview
5
+ --------
6
+
7
+ Type these lines on command line:
8
+ ("#" line may require root privilege)
9
+
10
+ $ ruby install.rb config
11
+ $ ruby install.rb setup
12
+ # ruby install.rb install
13
+
14
+
15
+ There's no difference in usage between install.rb
16
+ and setup.rb.
17
+
18
+ $ ruby setup.rb config
19
+ $ ruby setup.rb setup
20
+ # ruby setup.rb install
21
+
22
+
23
+ Details
24
+ -------
25
+
26
+ Usage of install.rb/setup.rb is:
27
+
28
+ ruby install.rb <global options>
29
+ ruby install.rb [<global options>] <task> [<task options>]
30
+
31
+
32
+ -q,--quiet
33
+ suppress message outputs
34
+ --verbose
35
+ output messages verbosely (default)
36
+ -h,--help
37
+ prints help and quits
38
+ -v,--version
39
+ prints version and quits
40
+ --copyright
41
+ prints copyright and quits
42
+
43
+ These are acceptable tasks:
44
+ config
45
+ saves configurations
46
+ show
47
+ prints current configurations
48
+ setup
49
+ compiles extensions
50
+ install
51
+ installs files
52
+ clean
53
+ cleans created files
54
+
55
+ Task Options for Config
56
+ -----------------------
57
+
58
+ --prefix=PATH
59
+ a prefix of the installing directory path
60
+ --std-ruby=PATH
61
+ the directory for standard ruby libraries
62
+ --site-ruby-common=PATH
63
+ the directory for version-independent non-standard
64
+ ruby libraries
65
+ --site-ruby=PATH
66
+ the directory for non-standard ruby libraries
67
+ --bin-dir=PATH
68
+ the directory for commands
69
+ --rb-dir=PATH
70
+ the directory for ruby scripts
71
+ --so-dir=PATH
72
+ the directory for ruby extensions
73
+ --data-dir=PATH
74
+ the directory for shared data
75
+ --ruby-path=PATH
76
+ path to set to #! line
77
+ --ruby-prog=PATH
78
+ the ruby program used for installation
79
+ --make-prog=NAME
80
+ the make program to compile ruby extensions
81
+ --without-ext
82
+ forces install.rb never to compile/install
83
+ ruby extensions.
84
+ --rbconfig=PATH
85
+ your rbconfig.rb to load
86
+
87
+ You can view default values of these options by typing
88
+
89
+ $ ruby install.rb --help
90
+
91
+
92
+ In addition, setup.rb accepts these options:
93
+ --with=NAME,NAME,NAME...
94
+ package names which you want to install
95
+ --without=NAME,NAME,NAME...
96
+ package names which you do not want to install
97
+
98
+ [NOTE] You can pass options for extconf.rb like this:
99
+
100
+ ruby install.rb config -- --with-tklib=/usr/lib/libtk-ja.so.8.0
101
+
102
+
103
+ Task Options for Install
104
+ ------------------------
105
+
106
+ --no-harm
107
+ prints what would be done, but does nothing.
108
+ --prefix=PATH
109
+ a prefix of the installing directory path.
110
+ This option may help binary package maintainers.
111
+ A default value is an empty string.
112
+
@@ -0,0 +1,34 @@
1
+ 2004-06-16 Zeno Davatz <zdavatz@ywesee.com>
2
+ * Fi und Pi neu aus PDF parsen.
3
+ + hwyss@ywesee.com: PDF-Parser so anpassen, dass er optional zusätzlich zum Text auch
4
+ Formatierungen mitliefert (das brauchen wir, um die Dokument-Struktur zu
5
+ erfassen).
6
+
7
+ 2004-05-28 Hannes Wyss <hwyss@ywesee.com>
8
+ * Analysis of the Erroneous Space Chars Bug:
9
+ DESCRIPTION:
10
+ At the Moment, Rpdf2txt::Text performs an Iconv-Operation to unify the
11
+ Encoding of all Text-Snippets. This does not work if the resulting
12
+ Character-Id is not defined in the original Font (and only works fuzzy if it
13
+ does).
14
+ PROPOSED SOLUTION:
15
+ Have a Parser-Wide target-encoding, which is propagated to all
16
+ Pointer-Instances. Perform the conversion in Pointer#txt, according to its
17
+ Font-Encoding.
18
+ CLEANUP:
19
+ Return an Array of Pointers instead of a Hash.
20
+ 2004-05-26 Zeno Davatz <zdavatz@ywesee.com>
21
+ * If possible try before the 1.6.2004
22
+ 2004-05-26 Hannes Wyss <hwyss@ywesee.com>
23
+ * Bug-Report: Erroneous Space Chars
24
+ DESCRIPTION:
25
+ - Umlauts and Accented characters are not given the right width and
26
+ subsequently the resulting ascii has erroneous spaces.
27
+ - Example-File in
28
+ ./user-stories/documents/swissmedicjournal/04_2004.pdf
29
+ ESTIMATE: 8h
30
+ - 4h Finding the Problem
31
+ - 2h Writing Tests
32
+ - 2h Writing Code
33
+ STATUS: pending Assignment
34
+
metadata ADDED
@@ -0,0 +1,220 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rpdf2txt
3
+ version: !ruby/object:Gem::Version
4
+ hash: 59
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 8
9
+ - 2
10
+ version: 0.8.2
11
+ platform: ruby
12
+ authors:
13
+ - Masaomi Hatakeyama, Zeno R.R. Davatz
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-12-15 00:00:00 +01:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: hoe
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 47
30
+ segments:
31
+ - 2
32
+ - 8
33
+ - 0
34
+ version: 2.8.0
35
+ type: :development
36
+ version_requirements: *id001
37
+ description: rpdf2txt will turn a PDF-file into a textfile.
38
+ email:
39
+ - mhatakeyama@ywesee.com, zdavatz@ywesee.com
40
+ executables:
41
+ - rpdf2txt
42
+ extensions: []
43
+
44
+ extra_rdoc_files:
45
+ - History.txt
46
+ - Manifest.txt
47
+ - README.txt
48
+ - bin/rpdf2txt
49
+ - lib/rpdf2txt/data/fonts/License-Adobe.txt
50
+ - usage-en.txt
51
+ - user-stories/UserStories_Rpdf2Txt.txt
52
+ files:
53
+ - LICENCE
54
+ - History.txt
55
+ - Manifest.txt
56
+ - README.txt
57
+ - Rakefile
58
+ - bin/rpdf2txt
59
+ - config.save
60
+ - install.rb
61
+ - lib/rpdf2txt-rockit/base_extensions.rb
62
+ - lib/rpdf2txt-rockit/bootstrap.rb
63
+ - lib/rpdf2txt-rockit/bounded_lru_cache.rb
64
+ - lib/rpdf2txt-rockit/conflict_resolution.rb
65
+ - lib/rpdf2txt-rockit/directed_graph.rb
66
+ - lib/rpdf2txt-rockit/glr_parser.rb
67
+ - lib/rpdf2txt-rockit/grammar.rb
68
+ - lib/rpdf2txt-rockit/graphdrawing.rb
69
+ - lib/rpdf2txt-rockit/graphviz_dot.rb
70
+ - lib/rpdf2txt-rockit/indexable.rb
71
+ - lib/rpdf2txt-rockit/lalr_parsetable_generator.rb
72
+ - lib/rpdf2txt-rockit/parse_table.rb
73
+ - lib/rpdf2txt-rockit/parsetable_generation.rb
74
+ - lib/rpdf2txt-rockit/parsing_ambiguities.rb
75
+ - lib/rpdf2txt-rockit/profiler.rb
76
+ - lib/rpdf2txt-rockit/reduce_actions_generator.rb
77
+ - lib/rpdf2txt-rockit/rockit.rb
78
+ - lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb
79
+ - lib/rpdf2txt-rockit/rockit_grammars_parser.rb
80
+ - lib/rpdf2txt-rockit/sourcecode_dumpable.rb
81
+ - lib/rpdf2txt-rockit/stringscanner.rb
82
+ - lib/rpdf2txt-rockit/syntax_tree.rb
83
+ - lib/rpdf2txt-rockit/token.rb
84
+ - lib/rpdf2txt-rockit/version.rb
85
+ - lib/rpdf2txt/attributesparser.rb
86
+ - lib/rpdf2txt/cmapparser.rb
87
+ - lib/rpdf2txt/data/cmap.grammar
88
+ - lib/rpdf2txt/data/cmap.rb
89
+ - lib/rpdf2txt/data/cmap_range.grammar
90
+ - lib/rpdf2txt/data/cmap_range.rb
91
+ - lib/rpdf2txt/data/_cmap.grammar
92
+ - lib/rpdf2txt/data/_cmap_range.grammar
93
+ - lib/rpdf2txt/data/_pdfattributes.grammar
94
+ - lib/rpdf2txt/data/fonts/Courier-Bold.afm
95
+ - lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm
96
+ - lib/rpdf2txt/data/fonts/Courier-Oblique.afm
97
+ - lib/rpdf2txt/data/fonts/Courier.afm
98
+ - lib/rpdf2txt/data/fonts/Helvetica-Bold.afm
99
+ - lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm
100
+ - lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm
101
+ - lib/rpdf2txt/data/fonts/Helvetica.afm
102
+ - lib/rpdf2txt/data/fonts/License-Adobe.txt
103
+ - lib/rpdf2txt/data/fonts/Symbol.afm
104
+ - lib/rpdf2txt/data/fonts/Times-Bold.afm
105
+ - lib/rpdf2txt/data/fonts/Times-BoldItalic.afm
106
+ - lib/rpdf2txt/data/fonts/Times-Italic.afm
107
+ - lib/rpdf2txt/data/fonts/Times-Roman.afm
108
+ - lib/rpdf2txt/data/fonts/ZapfDingbats.afm
109
+ - lib/rpdf2txt/data/pdfattributes.grammar
110
+ - lib/rpdf2txt/data/pdfattributes.rb
111
+ - lib/rpdf2txt/data/pdftext.grammar
112
+ - lib/rpdf2txt/data/pdftext.rb
113
+ - lib/rpdf2txt/default_handler.rb
114
+ - lib/rpdf2txt/lzw.rb
115
+ - lib/rpdf2txt/object.rb
116
+ - lib/rpdf2txt/parser.rb
117
+ - lib/rpdf2txt/symbol.rb
118
+ - lib/rpdf2txt/text.rb
119
+ - lib/rpdf2txt/text_state.rb
120
+ - lib/rpdf2txt/textparser.rb
121
+ - test/data/3392_obj
122
+ - test/data/397_decrypted
123
+ - test/data/450_decrypted
124
+ - test/data/450_obj
125
+ - test/data/452_decrypted
126
+ - test/data/454_decrypted
127
+ - test/data/456_decrypted
128
+ - test/data/458_decrypted
129
+ - test/data/458_obj
130
+ - test/data/460_decrypted
131
+ - test/data/460_obj
132
+ - test/data/463_decrypted
133
+ - test/data/465_decrypted
134
+ - test/data/465_obj
135
+ - test/data/90_obj
136
+ - test/data/90_obj_comp
137
+ - test/data/decrypted
138
+ - test/data/encrypt_obj
139
+ - test/data/encrypt_string
140
+ - test/data/encrypt_string_128bit
141
+ - test/data/encrypted_object_stream.pdf
142
+ - test/data/firststream
143
+ - test/data/index.pdfobj
144
+ - test/data/index_2bit.pdfobj
145
+ - test/data/index_masked.pdfobj
146
+ - test/data/indexed.pdfobj
147
+ - test/data/indexed_2bit.pdfobj
148
+ - test/data/indexed_masked.pdfobj
149
+ - test/data/inline.png
150
+ - test/data/logo.png
151
+ - test/data/lzw.pdfobj
152
+ - test/data/lzw_index.pdfobj
153
+ - test/data/page_tree.pdf
154
+ - test/data/pdf_20.png
155
+ - test/data/pdf_21.png
156
+ - test/data/pdf_22.png
157
+ - test/data/pdf_50.png
158
+ - test/data/png.pdfobj
159
+ - test/data/space_bug_stream.txt
160
+ - test/data/stream.txt
161
+ - test/data/stream_kerning_bug.txt
162
+ - test/data/stream_kerning_bug2.txt
163
+ - test/data/test.pdf
164
+ - test/data/test.txt
165
+ - test/data/test_text.txt
166
+ - test/data/working_obj
167
+ - test/data/working_obj2
168
+ - test/mock.rb
169
+ - test/suite.rb
170
+ - test/test_pdf_object.rb
171
+ - test/test_pdf_parser.rb
172
+ - test/test_pdf_text.rb
173
+ - test/test_space_bug_05_2004.rb
174
+ - test/test_stream.rb
175
+ - test/test_text_state.rb
176
+ - usage-en.txt
177
+ - user-stories/UserStories_Rpdf2Txt.txt
178
+ - user-stories/documents/swissmedicjournal/04_2004.pdf
179
+ has_rdoc: true
180
+ homepage: http://scm.ywesee.com/?p=rpdf2txt/.git;a=summary
181
+ licenses: []
182
+
183
+ post_install_message:
184
+ rdoc_options:
185
+ - --main
186
+ - README.txt
187
+ require_paths:
188
+ - lib
189
+ required_ruby_version: !ruby/object:Gem::Requirement
190
+ none: false
191
+ requirements:
192
+ - - ">="
193
+ - !ruby/object:Gem::Version
194
+ hash: 3
195
+ segments:
196
+ - 0
197
+ version: "0"
198
+ required_rubygems_version: !ruby/object:Gem::Requirement
199
+ none: false
200
+ requirements:
201
+ - - ">="
202
+ - !ruby/object:Gem::Version
203
+ hash: 3
204
+ segments:
205
+ - 0
206
+ version: "0"
207
+ requirements: []
208
+
209
+ rubyforge_project: rpdf2txt
210
+ rubygems_version: 1.3.7
211
+ signing_key:
212
+ specification_version: 3
213
+ summary: rpdf2txt will turn a PDF-file into a textfile.
214
+ test_files:
215
+ - test/test_pdf_object.rb
216
+ - test/test_pdf_parser.rb
217
+ - test/test_pdf_text.rb
218
+ - test/test_space_bug_05_2004.rb
219
+ - test/test_stream.rb
220
+ - test/test_text_state.rb