rpdf2txt 0.8.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (127) hide show
  1. data/History.txt +5 -0
  2. data/LICENCE +515 -0
  3. data/Manifest.txt +126 -0
  4. data/README.txt +30 -0
  5. data/Rakefile +24 -0
  6. data/bin/rpdf2txt +58 -0
  7. data/config.save +12 -0
  8. data/install.rb +1098 -0
  9. data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
  10. data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
  11. data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
  12. data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
  13. data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
  14. data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
  15. data/lib/rpdf2txt-rockit/grammar.rb +644 -0
  16. data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
  17. data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
  18. data/lib/rpdf2txt-rockit/indexable.rb +53 -0
  19. data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
  20. data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
  21. data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
  22. data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
  23. data/lib/rpdf2txt-rockit/profiler.rb +168 -0
  24. data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
  25. data/lib/rpdf2txt-rockit/rockit.rb +76 -0
  26. data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
  27. data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
  28. data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
  29. data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
  30. data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
  31. data/lib/rpdf2txt-rockit/token.rb +364 -0
  32. data/lib/rpdf2txt-rockit/version.rb +3 -0
  33. data/lib/rpdf2txt/attributesparser.rb +42 -0
  34. data/lib/rpdf2txt/cmapparser.rb +65 -0
  35. data/lib/rpdf2txt/data/_cmap.grammar +11 -0
  36. data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
  37. data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
  38. data/lib/rpdf2txt/data/cmap.grammar +11 -0
  39. data/lib/rpdf2txt/data/cmap.rb +37 -0
  40. data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
  41. data/lib/rpdf2txt/data/cmap_range.rb +43 -0
  42. data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
  43. data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
  44. data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
  45. data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
  46. data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
  47. data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
  48. data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
  49. data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
  50. data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
  51. data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
  52. data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
  53. data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
  54. data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
  55. data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
  56. data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
  57. data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
  58. data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
  59. data/lib/rpdf2txt/data/pdftext.grammar +102 -0
  60. data/lib/rpdf2txt/data/pdftext.rb +146 -0
  61. data/lib/rpdf2txt/default_handler.rb +352 -0
  62. data/lib/rpdf2txt/lzw.rb +69 -0
  63. data/lib/rpdf2txt/object.rb +1114 -0
  64. data/lib/rpdf2txt/parser.rb +169 -0
  65. data/lib/rpdf2txt/symbol.rb +408 -0
  66. data/lib/rpdf2txt/text.rb +182 -0
  67. data/lib/rpdf2txt/text_state.rb +434 -0
  68. data/lib/rpdf2txt/textparser.rb +42 -0
  69. data/test/data/3392_obj +0 -0
  70. data/test/data/397_decrypted +15 -0
  71. data/test/data/450_decrypted +153 -0
  72. data/test/data/450_obj +0 -0
  73. data/test/data/452_decrypted +125 -0
  74. data/test/data/454_decrypted +108 -0
  75. data/test/data/456_decrypted +106 -0
  76. data/test/data/458_decrypted +111 -0
  77. data/test/data/458_obj +0 -0
  78. data/test/data/460_decrypted +118 -0
  79. data/test/data/460_obj +0 -0
  80. data/test/data/463_decrypted +117 -0
  81. data/test/data/465_decrypted +107 -0
  82. data/test/data/465_obj +0 -0
  83. data/test/data/90_obj +0 -0
  84. data/test/data/90_obj_comp +1 -0
  85. data/test/data/decrypted +0 -0
  86. data/test/data/encrypt_obj +0 -0
  87. data/test/data/encrypt_string +0 -0
  88. data/test/data/encrypt_string_128bit +0 -0
  89. data/test/data/encrypted_object_stream.pdf +0 -0
  90. data/test/data/firststream +1 -0
  91. data/test/data/index.pdfobj +0 -0
  92. data/test/data/index_2bit.pdfobj +0 -0
  93. data/test/data/index_masked.pdfobj +0 -0
  94. data/test/data/indexed.pdfobj +0 -0
  95. data/test/data/indexed_2bit.pdfobj +0 -0
  96. data/test/data/indexed_masked.pdfobj +0 -0
  97. data/test/data/inline.png +0 -0
  98. data/test/data/logo.png +0 -0
  99. data/test/data/lzw.pdfobj +0 -0
  100. data/test/data/lzw_index.pdfobj +0 -0
  101. data/test/data/page_tree.pdf +148 -0
  102. data/test/data/pdf_20.png +0 -0
  103. data/test/data/pdf_21.png +0 -0
  104. data/test/data/pdf_22.png +0 -0
  105. data/test/data/pdf_50.png +0 -0
  106. data/test/data/png.pdfobj +0 -0
  107. data/test/data/space_bug_stream.txt +119 -0
  108. data/test/data/stream.txt +292 -0
  109. data/test/data/stream_kerning_bug.txt +13 -0
  110. data/test/data/stream_kerning_bug2.txt +6 -0
  111. data/test/data/test.pdf +0 -0
  112. data/test/data/test.txt +8 -0
  113. data/test/data/test_text.txt +42 -0
  114. data/test/data/working_obj +0 -0
  115. data/test/data/working_obj2 +0 -0
  116. data/test/mock.rb +149 -0
  117. data/test/suite.rb +30 -0
  118. data/test/test_pdf_object.rb +1802 -0
  119. data/test/test_pdf_parser.rb +1340 -0
  120. data/test/test_pdf_text.rb +789 -0
  121. data/test/test_space_bug_05_2004.rb +87 -0
  122. data/test/test_stream.rb +194 -0
  123. data/test/test_text_state.rb +315 -0
  124. data/usage-en.txt +112 -0
  125. data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
  126. data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
  127. metadata +220 -0
@@ -0,0 +1,352 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Rpdf2txt -- PDF to Text Parser
4
+ # Copyright (C) 2004 Mike Walder, Raphael Waltert, Hannes Wyss
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com, mwalder@ywesee.com, rwaltert@ywesee.com
22
+ #
23
+ # PdfParser -- Rpdf2txt-- 14.11.2002 -- mwalder@ywesee.com, rwaltert@ywesee.com
24
+
25
+ module Rpdf2txt
26
# Lookup table mixed into the output handlers.
#
# Keys are Unicode code points; values are single-byte character codes
# (presumably positions in the Symbol font, since HTMLHandler switches to
# <font face="Symbol"> -- TODO confirm against the consumer of this table).
module SymbolMap
  # NOTE: this map is not complete!
  # Frozen so that a handler cannot accidentally modify the shared table.
  SYMBOL_ENTITIES = {
    3    => 65,
    8804 => 163,
    8805 => 179,
    # alpha-omega (upper case)
    913 => 65,
    914 => 66,
    915 => 71,
    916 => 68,
    917 => 69,
    918 => 90,
    919 => 72,
    920 => 81,
    921 => 73,
    922 => 75,
    923 => 76,
    924 => 77,
    925 => 78,
    926 => 88,
    927 => 79,
    928 => 80,
    929 => 82,
    931 => 83,
    932 => 84,
    933 => 85,
    934 => 70,
    935 => 67,
    936 => 89,
    # alpha-omega (lower case)
    945 => 97,
    946 => 98,
    947 => 103,
    948 => 100,
    949 => 101,
    950 => 122,
    951 => 104,
    952 => 113,
    953 => 105,
    954 => 107,
    955 => 108,
    956 => 109,
    957 => 110,
    958 => 120,
    959 => 111,
    960 => 112,
    961 => 114,
    963 => 115,
    964 => 116,
    965 => 117,
    966 => 102,
    967 => 99,
    968 => 121,
    969 => 119,
    # dot?
    8901 => 46,
    # intersection (cap)
    8745 => 199,
    # union (cup)
    8746 => 200,
    # infinity
    8734 => 165,
    # integral
    8747 => 166,
    # partial differential
    8706 => 182,
    # not equal to
    8800 => 185,
    # equal
    61 => 186,
    # almost equal to
    8776 => 187,
    # superset of
    8835 => 201,
    # superset or equal to
    8839 => 202,
    # not a subset of
    8836 => 203,
    # subset of
    8834 => 204,
    # subset or equal to
    8838 => 205,
    # element of
    8712 => 206,
    # not an element of
    8713 => 207,
    # n-ary product
    8719 => 213,
    # radic
    8730 => 214,
    # n-ary sum
    8721 => 229,
  }.freeze
end
120
# Baseline implementation of the callback interface the text extractor
# drives. Every event is a no-op except the ones that produce plain text,
# which are appended to the lazily created buffer returned by #out.
module DefaultHandler
  # Column hints; nil means "no hint".
  def column_count
  end
  def column_width
  end
  # Whether the extractor should try to detect table columns.
  def identify_columns?
    false
  end
  def send_image(handle)
  end
  def new_font(font)
  end
  def new_fontsize(size)
  end
  def send_column
  end
  def send_colspan
  end
  # End of document: hands back the accumulated output.
  # Goes through #out so that an empty document yields an empty buffer
  # instead of nil (the instance variable is only created on first write).
  def send_eof
    out
  end
  def send_flowing_data(data)
    out << data
  end
  def send_hr
  end
  def send_line_break
    out << "\n"
  end
  def send_paragraph
  end
  def send_page
    out << "\n\n"
  end
  # Output buffer; created on first use when the including class did not
  # set @out itself.
  def out
    @out ||= ""
  end
end
158
# Plain-text handler: DefaultHandler behaviour writing into a caller
# supplied sink (anything that responds to <<).
class SimpleHandler
  include DefaultHandler
  include SymbolMap

  # io:: output sink; defaults to a fresh String
  def initialize(io="")
    @out = io
  end
end
165
# Handler that lays text out in padded columns. Cells are collected per
# line in @lines and only written to @out when a page is finished.
class ColumnHandler < SimpleHandler
  # outstream:: output sink
  # padding::   string every new cell starts with
  def initialize(outstream="", padding=' ')
    super(outstream)
    @lines = []
    @padding = padding
    send_line_break
  end

  # Computes the width of every column for the lines collected so far.
  # Side effects: right-strips the String cells and pads every line with
  # nil up to the widest line.
  def column_widths
    last_column = @lines.collect { |row| row.size }.push(1).max - 1

    # first pass: widest cell per column, looking only at cells that are
    # followed by another cell (or sit in the last column)
    widest = []
    @lines.each do |row|
      row.each_with_index do |cell, idx|
        next unless cell.is_a?(String)
        cell.rstrip!
        if row[idx.next] || idx == last_column
          widest[idx] = [cell.length, widest[idx].to_i].max
        end
      end
    end

    # second pass: a cell followed by empty cells may borrow their width
    max_lengths = []
    @lines.each do |row|
      row.each_with_index do |cell, idx|
        next unless cell.is_a?(String)
        remaining = cell.length
        follower = idx.next
        while row[follower].nil? && (borrowed = widest[follower])
          remaining -= borrowed
          follower += 1
        end
        max_lengths[idx] = [remaining, max_lengths[idx].to_i].max
      end
      row[last_column] ||= nil
    end
    max_lengths
  end

  def identify_columns?
    true
  end

  # Advances to the next cell (honouring a pending colspan) and makes sure
  # it holds a string to append to.
  def send_column
    @current_column += @colspan
    @colspan = 1
    @columns[@current_column] ||= begin
      fresh = @padding.dup
      # u() is not defined in this file; presumably provided by the
      # unicode string library that also supplies #foldcase
      @padding.respond_to?(:foldcase) ? u(fresh) : fresh
    end
  end

  def send_colspan
    @colspan += 1
  end

  def send_hr
    @columns << :hr
  end

  def send_image(image)
    @columns << :image
  end

  def send_flowing_data(data)
    @columns[@current_column] << data
  end

  # Starts a new, empty line.
  def send_line_break
    @columns = []
    @lines.push(@columns)
    @current_column = -1
    @colspan = 1
  end

  # Writes all collected lines, padded to the computed column widths,
  # then resets the line buffer and emits the inherited page separator.
  def send_page
    widths = column_widths
    @lines.each do |row|
      row.each_with_index do |cell, idx|
        next unless cell
        span = widths[idx].to_i
        follower = idx.next
        # empty cells to the right give their width to this cell
        while (extra = widths[follower]) && !row[follower]
          span += extra
          follower += 1
        end
        rendered = case cell
                   when :image
                     " #IMAGE# ".ljust(span)
                   when :hr
                     @padding.dup.ljust(span, '-')
                   else
                     cell.ljust(span)
                   end
        @out << rendered
      end
      @out << "\n"
    end
    @lines.clear
    send_line_break
    super
  end
end
263
# Handler that does not render anything: every event it receives is
# written to +out+ as a line of Ruby code replaying the call on @writer.
class RecordingHandler
  # out::     sink for the generated script (anything responding to <<)
  # columns:: value reported by #identify_columns?
  def initialize(out = $stdout, columns=false)
    require 'yaml' # loaded lazily: only needed when recording
    @out = out
    @out << "require 'yaml'\n"
    @columns = columns
  end

  def identify_columns?
    @columns
  end

  # Column hints are queries, not events. They must return nil ("no hint")
  # rather than fall through to method_missing, whose return value (the
  # output sink) would be used as a number by the column detection.
  def column_count
    nil
  end

  def column_width
    nil
  end

  # Fonts cannot be replayed via #inspect, so they are serialised as YAML.
  # The heredoc body is deliberately unindented: it becomes YAML input.
  def new_font(font)
    @out << <<-EOS
font = YAML.load <<-EOF
#{font.to_yaml}
EOF
@writer.new_font(font)
    EOS
  end

  # Records any other event together with its inspected arguments.
  def method_missing(symbol, *args, &block)
    argstr = args.collect { |arg| arg.inspect }.join(', ')
    @out << <<-EOS
@writer.#{symbol}(#{argstr})
    EOS
  end

  # Advertise the handler events (send_*, new_*) that method_missing
  # records; implicit conversions such as to_ary stay unadvertised.
  def respond_to_missing?(symbol, include_private = false)
    !(/\A(send|new)_/ =~ symbol.to_s).nil? || super
  end
end
288
# Handler producing a very simple HTML rendition: bold/italic runs,
# Symbol and Courier fonts, line breaks and paragraphs.
class HTMLHandler
  include DefaultHandler
  include SymbolMap

  def initialize
    super
    # which inline elements are currently open
    @state = {
      :italic => false,
      :bold   => false,
      :font   => false,
      :pre    => false,
    }
  end

  # Opens/closes <b>, <i>, <font face="Symbol"> and <pre> so that the
  # markup reflects +font+. A nil font leaves everything untouched.
  def new_font(font)
    return if font.nil?
    switch_tag(:bold, 'b', font.bold?)
    switch_tag(:italic, 'i', font.italic?)
    # font and pre are always closed first and re-opened if still needed
    close_tag(:font, 'font')
    close_tag(:pre, 'pre')
    font_name = font.basefont_name
    open_tag(:font, "<font face=\"Symbol\">") if /symbol/i.match(font_name)
    open_tag(:pre, "<pre>") if /courier/i.match(font_name)
  end

  # Uses the lazy #out accessor like every other writer: @out does not
  # exist yet when a line break is the first event of a document.
  def send_line_break
    out << "<br>"
  end

  def send_paragraph
    out << "<p>"
  end

  def send_page
    out << "<p>"
  end

  private

  # Brings the element tracked under +key+ into the +wanted+ state.
  def switch_tag(key, tag, wanted)
    if wanted
      open_tag(key, "<#{tag}>") unless @state[key]
    else
      close_tag(key, tag)
    end
  end

  # Emits +markup+ and remembers that the element is open.
  def open_tag(key, markup)
    out << markup
    @state[key] = true
  end

  # Emits the closing tag if (and only if) the element is open.
  def close_tag(key, tag)
    return unless @state[key]
    out << "</#{tag}>"
    @state[key] = false
  end
end
352
+ end
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env ruby
2
+ # LZW -- rpdf2txt -- 09.07.2008 -- hwyss@ywesee.com
3
+
4
+ module Rpdf2txt
5
# Decoder for LZW compressed data with variable code length (9 to 12
# bits), a clear-table code (256) and an end-of-data code (257).
class LZW
  CLEAR = 256
  EOD = 257

  # Convenience wrapper: decodes +data+ with a fresh decoder.
  def self.decode(data, early_change=1)
    new(early_change).decode(data)
  end

  # early_change:: how many entries before the table limit the code
  #                length is increased (see #update_dictionary)
  def initialize(early_change=1)
    @early_change = early_change
    # pristine table: the 256 single-byte strings plus the two markers
    @__dict = (0..255).collect { |num| num.chr }.push(:clear, :eod)
    init_dictionary
  end

  # Resets table, code length and growth boundary to their start values.
  def init_dictionary
    @dictionary = @__dict.dup
    @code_length = 9
    @boundary = 512 - @early_change
  end

  # Decodes +data+ (a String of bytes) and returns the decoded String.
  # Decoding stops at the EOD code or when fewer bits than one code are
  # left. Raises RuntimeError on a code that cannot occur at its position.
  def decode(data)
    bits, = data.unpack('B*')
    result = ''
    old_code = nil # nil: the next code is the first one after a reset
    while (code = get_next_code(bits)) && code != EOD
      if code == CLEAR
        init_dictionary
        old_code = nil
        next
      end
      if old_code.nil?
        # first code of a run: must be a literal, and there is no
        # predecessor yet to build a new table entry from
        string = @dictionary[code]
        raise 'Bad compressed code: %s' % code unless string
        result << string
      elsif (string = @dictionary[code])
        result << string
        update_dictionary(@dictionary[old_code] + string[0, 1])
      elsif code == @dictionary.size
        # the code refers to the entry that is about to be created
        string = @dictionary[old_code]
        string += string[0, 1]
        result << string
        update_dictionary(string)
      else
        raise 'Bad compressed code: %s' % code
      end
      old_code = code
    end
    result
  end

  # Removes the next code from the front of +bits+ (a String of '0'/'1')
  # and returns it as Integer; nil when less than one full code is left
  # (e.g. the padding bits of the last byte).
  def get_next_code(bits)
    return nil if bits.size < @code_length
    bits.slice!(0, @code_length).to_i(2)
  end

  # Appends +str+ to the table and widens the codes (up to 12 bits) once
  # the table reaches the current boundary. Returns +str+.
  def update_dictionary(str)
    @dictionary.push str
    if @dictionary.size >= @boundary && @code_length < 12
      @code_length += 1
      @boundary = (2**@code_length - @early_change)
    end
    str
  end
end
69
+ end
@@ -0,0 +1,1114 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Rpdf2txt -- PDF to Text Parser
4
+ # Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com, aschrafl@ywesee.com
22
+ #
23
+ # PdfObject -- Rpdf2txt -- 21.11.2002 -- aschrafl@ywesee.com
24
+
25
+ require 'zlib'
26
+ require 'rpdf2txt/text'
27
+ require 'rpdf2txt/attributesparser'
28
+ require 'rpdf2txt/cmapparser'
29
+ require 'rpdf2txt/symbol'
30
+ require 'md5'
31
+ require 'matrix'
32
+
33
+ module Rpdf2txt
34
# Base class of everything read from a PDF file. Keeps the raw source of
# the object and the attributes parsed from its dictionary (<< ... >>).
class PdfObject
  attr_reader :attributes
  attr_accessor :decoder, :src
  attr_writer :oid # the reader below falls back to the source

  # src::             raw source of the object, or nil
  # target_encoding:: stored for subclasses
  def initialize(src=nil, target_encoding='utf8')
    @attributes = {}
    @src = src
    @target_encoding = target_encoding
    parse_attributes() unless @src.nil?
  end

  # Replaces references ("12 0 R") found directly in the attributes, or
  # inside Array attributes, by the referenced objects from
  # +object_catalogue+ (oid => object). Unknown references are kept.
  def build_tree(object_catalogue, parent=nil)
    @attributes.each { |key, value|
      case value
      when Array
        value.collect! { |obj| catalogue_object(object_catalogue, obj) || obj }
      when String
        if obj = catalogue_object(object_catalogue, value)
          @attributes.store(key, obj)
        end
      end
    }
  end

  # Returns the catalogue entry +reference+ points to, or nil when
  # +reference+ is not a reference String or the object is unknown.
  def catalogue_object(catalogue, reference)
    if reference.is_a?(String) && (match = /^(\d+)\s+\d+\s+R/n.match(reference))
      catalogue[match[1].to_i]
    end
  end

  def decoded_stream
    raise "abstract method decoded_stream called in #{self.class}; built from source: \n #{@src.tr("\r", "\n")}"
  end

  # Object number: the value assigned via #oid=, else the leading number
  # of the source.
  def oid
    @oid ||= extract_oid(@src)
  end

  # First whitespace-preceded number of the source.
  def revision_id
    @revision_id ||= extract_revision_id(@src)
  end

  # Returns the dictionary part of the source: from the first '<<' to the
  # last '>>' that starts at or before the first 'stream' keyword; nil if
  # there is none.
  def extract_attribute_stream
    lastindex = @src.index('stream') || -1
    index = @src.index('<<')
    rindex = @src.rindex('>>', lastindex)
    @src[index..(rindex+1)] if(index && rindex)
  end

  def _parse_attributes(src)
    Rpdf2txt.attributes_parser.parse(src)
  end

  # (Re)builds @attributes from the source; an object without dictionary
  # ends up with an empty Hash.
  def parse_attributes
    src = self.extract_attribute_stream
    if(src.nil?)
      @attributes = {}
    else
      ast = self._parse_attributes(src)
      ast.compact!
      @attributes = extract_attributes(ast)
    end
  end

  private

  def extract_oid(string)
    /^\d+/n.match(string).to_s.to_i
  end

  def extract_revision_id(string)
    /\s\d+/n.match(string).to_s.to_i
  end

  # Converts a parsed syntax tree into plain Ruby values: Strings, Arrays
  # and Hashes keyed by the downcased name without slashes, as Symbol.
  def extract_attributes(ast)
    names = ast.children_names
    if(names.include?('value'))
      pdf_unescape(ast.value)
    elsif(names.include?('text'))
      pdf_unescape(ast.text.value[1...-1])
    elsif(names.include?('values'))
      ast.values.collect { |child| extract_attributes(child) }
    elsif(names.include?('pairs'))
      result = {}
      ast.pairs.each { |pair|
        k, v = pair
        keystr = k.value.strip.tr('/','')
        unless(keystr.empty?)
          result.store(keystr.downcase.intern, extract_attributes(v))
        end
      }
      result
    else
      ast
    end
  end

  # NOTE(review): the replacement strings '\\\\' and '\\&' are interpreted
  # by gsub as "one backslash" and "the match", so backslashes and
  # parentheses come out unchanged and only CR/LF are escaped. Left as is
  # because PdfEncrypt compares the result with the stored user key --
  # confirm the intended behaviour before changing it.
  def pdf_escape(input)
    input.gsub(/\\/, '\\\\').gsub(/\n/n, '\n').
      gsub(/\r/n, '\r').gsub(/[()]/n, '\\&')
  end

  # Resolves the escapes \n, \r, \), \( and \\ (in this order).
  def pdf_unescape(input)
    input.gsub(/\\n/n, "\n").gsub(/\\r/n, "\r").
      gsub(/\\\)/n, ')').gsub(/\\\(/n, '(').gsub(/\\\\/n, '\\')
  end
end
125
# The encryption dictionary of a document: derives the keys and decrypts
# the streams of other objects with an RC4 style cipher.
class PdfEncrypt < PdfObject
  class DecryptionError < RuntimeError
  end

  PADDING = "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A"

  # Stream cipher: returns +input+ en-/decrypted with +key+.
  def arc4(key, input)
    output = ''
    state = (0..255).to_a
    key_bytes = (key*256)[0,256].unpack('C*')
    # key scheduling
    j = 0
    256.times do |x|
      j = (j + state[x] + key_bytes[x]) % 256
      state[x], state[j] = state[j], state[x]
    end
    # key stream xor-ed onto the input
    i = j = 0
    input.each_byte do |b|
      i = (i + 1) % 256
      j = (j + state[i]) % 256
      state[i], state[j] = state[j], state[i]
      output << (b ^ state[(state[i] + state[j])%256]).chr
    end
    output
  end

  # Computes the user key belonging to +encryption_key+, in the escaped
  # form produced by pdf_escape.
  def compute_user_key encryption_key
    return pdf_escape(arc4(encryption_key, PADDING)) if revision < 3
    crypt = Digest::MD5.digest(PADDING + file_id)
    key_bytes = encryption_key.unpack('C*')
    # 20 rounds, the key xor-ed with the round number (0..19)
    20.times do |round|
      round_key = key_bytes.collect { |byte| byte ^ round }.pack('C*')
      crypt = arc4(round_key, crypt)
    end
    pdf_escape(crypt)
  end

  # Returns the decrypted raw stream of +pdf_object+.
  def decrypt(pdf_object)
    arc4(decrypt_key(pdf_object), pdf_object.raw_stream)
  end

  # Key for one object: MD5 over the encryption key plus the low three
  # bytes of the object number and the low two bytes of the revision.
  def decrypt_key(pdf_object)
    packed_oid = [pdf_object.oid].pack('I*')
    packed_rev = [pdf_object.revision_id].pack('I*')
    # native byte order is used; on big endian machines (ppc) reverse it
    if(self.big_endian?)
      packed_oid = packed_oid.reverse
      packed_rev = packed_rev.reverse
    end
    input = encryption_key << packed_oid[0,3] << packed_rev[0,2]
    Digest::MD5.digest(input)[0,[keylength + 5,16].min]
  end

  # big endian (ppc) / little endian (x86)
  def big_endian?
    [1].pack('I*') == "\000\000\000\001"
  end

  # Derives the document key and verifies it against the stored user key.
  # Raises DecryptionError when they do not match.
  def encryption_key
    input_string = PADDING.dup
    ## we don't support a user-password. if we did, it would have to replace
    #  the first [n..32] bytes of the padding string here.
    input_string << owner_key
    input_string << permission_flag
    input_string << file_id
    ## revision >= 4: add 0xffffffff if document metadata is not encrypted
    digest = Digest::MD5.digest(input_string)
    uk = user_key
    if revision >= 3
      50.times { digest = Digest::MD5.digest(digest[0,keylength]) }
      uk = uk[0,16]
    end
    key = digest[0,keylength]
    test_key = compute_user_key(key)
    unless(test_key == uk)
      raise DecryptionError, "test-key did not match user-key ('#{test_key.inspect}' / '#{uk.inspect}')"
    end
    key
  end

  # The file id is assigned as hex string ...
  def file_id= (file_id)
    @file_id = file_id
  end

  # ... and read back as the bytes it encodes.
  def file_id
    [@file_id].pack("H*")
  end

  # Key length in bytes (/Length is given in bits, default 40).
  def keylength
    @keylength ||= (@attributes[:length] || 40).to_i / 8
  end

  def owner_key
    @attributes[:o].to_s
  end

  # /P packed as native unsigned int, low byte first.
  def permission_flag
    packed = [@attributes[:p].to_i].pack('I*')
    self.big_endian? ? packed.reverse : packed
  end

  def revision
    @attributes[:r].to_i
  end

  def user_key
    @attributes[:u].to_s
  end
end
234
# An encoding dictionary; maps character codes to glyph names via its
# /Differences array.
class Encoding < PdfObject
  # Returns { character code => glyph name } built from /Differences.
  #
  # The array consists of runs: a number gives the code of the glyph name
  # that follows it, further names get consecutive codes, and the next
  # number starts a new run at that code. A missing array yields {}.
  def differences
    return @differences if @differences
    table = {}
    code = 0
    (@attributes[:differences] || []).each { |diff|
      if(/^\d+$/n.match(diff.to_s))
        # start of a new run: counting restarts at this code
        code = diff.to_i
      else
        table.store(code, diff.to_s[/\w+/n])
        code += 1
      end
    }
    @differences = table
  end

  # Returns a copy of +txt+ in which every byte that has a glyph name in
  # #differences is replaced by Symbol.byte(name), where that is known.
  def convert_symbol(txt)
    res = ''
    txt.each_byte { |byte|
      if(name = differences[byte])
        byte = Symbol.byte(name) || byte
      end
      res << byte
    }
    res
  end

  # Glyph name for +byte+, or nil.
  def symbol_name(byte)
    differences[byte]
  end
end
266
# A font dictionary: answers questions about style, encoding and
# character widths.
class Font < PdfObject
  attr_accessor :cmap, :descriptor, :rendering_mode, :skewed

  # PDF encoding name => encoding name used by this library
  ENCODINGS = {
    '/Identity-H'       => 'ascii',
    '/MacRomanEncoding' => 'mac',
    '/UTF8'             => 'utf8',
    '/WinAnsiEncoding'  => 'ms-ansi',
  }

  # one character metric line of an AFM file: code, width, glyph name
  AFM_PTRN = /^C\s*(\d+)\s*;\s*
              WX\s*(\d+)\s*;\s*
              N\s*(\w+)/xn

  def basefont_name
    @attributes[:basefont]
  end

  # Width of +char+ (code or glyph name) from the bundled AFM metrics.
  def basefont_width(char)
    basefont_widths[char]
  end

  def basefont_widths
    @basefont_widths ||= load_basefont_widths
  end

  def bold?
    (!!/bold/in.match(basefont_name.to_s)) || @rendering_mode == "2"
  end

  def build_tree(object_catalogue, parent = nil)
    super
    if(desc = @attributes[:descriptor])
      @descriptor = desc
      desc.build_tree(object_catalogue, self)
    end
  end

  # Returns the Encoding object, the mapped name of a known standard
  # encoding, or (with a warning) the unmapped value.
  def encoding
    enc = @attributes[:encoding]
    if(enc.is_a?(Encoding))
      enc
    else
      ENCODINGS.fetch(enc) {
        warn "unknown encoding #{enc}"
        enc
      }
    end
  end

  def italic?
    @skewed || !!/italic/in.match(basefont_name.to_s)
  end

  # Width of +char+ as Integer, or nil when unknown. One-character
  # Strings are reduced with char[0] first.
  def width(char)
    if(char.is_a?(String) && char.length == 1)
      char = char[0]
    end
    _width(char) || named_width(char)
  end

  def widths
    @widths ||= (@attributes[:widths] || [])
  end

  def symbol?
    !!/symbol/in.match(basefont_name.to_s)
  end

  def to_unicode
    @to_unicode ||= (tu = @attributes[:tounicode]) && tu.to_cmap
  end

  private

  def first_char
    @attributes[:firstchar].to_i
  end

  # Reads data/fonts/<basefont>.afm, if present, into a Hash that maps
  # both character codes (Integer) and glyph names (String) to widths.
  def load_basefont_widths
    widths = {}
    # The font name comes from the document: keep only its last path
    # component so it cannot point outside the fonts directory.
    file_name = "%s.afm" % File.basename(basefont_name.to_s)
    path = File.join(File.dirname(__FILE__), 'data', 'fonts', file_name)
    if(File.readable?(path))
      File.read(path).scan(AFM_PTRN) { |char, width, name|
        widths.store(char.to_i, width)
        widths.store(name, width)
      }
    end
    widths
  end

  # Width looked up via the glyph name the encoding assigns to +char+.
  def named_width(char)
    enc = @attributes[:encoding]
    if(enc.is_a?(Encoding))
      _width(enc.symbol_name(char))
    end
  end

  # Width from /Widths (for codes at or above /FirstChar), else from the
  # base font metrics.
  def _width(char)
    if(char.is_a?(Integer))
      idx = char - first_char
      # a negative index would count from the end of the array
      width = widths.at(idx) if idx >= 0
    end
    width ||= basefont_width(char)
    width.to_i if(width)
  end
end
353
# Font descriptor dictionary; adds nothing to PdfObject, the class only
# serves to tell descriptors apart from other objects.
class FontDescriptor < PdfObject; end
355
# Object of no known type.
class Unknown < PdfObject
  # Integer value of the first number that follows 'obj' in the source
  # (optional String#to_i arguments such as a base are passed on);
  # 0 when there is none.
  def to_i(*args)
    match = /obj\s*(\d+)/n.match(@src)
    # nil.to_i accepts no arguments, so the no-match case is answered here
    match ? match[1].to_i(*args) : 0
  end
end
360
# Resource dictionary of a page: gives access to its fonts and xobjects
# by name once #build_tree has resolved them.
class Resource < PdfObject
  # src may be an attribute Hash (used as is), the source String of an
  # object (parsed by PdfObject), or anything else (no attributes).
  def initialize(src=nil, target_encoding='utf8')
    case src
    when Hash
      @attributes = src
    when String
      super
    else
      @attributes = {}
    end
    @fonts = {}
    @xobjects = {}
  end

  def build_tree(object_catalogue, parent=nil)
    super
    build_attributes(:font, object_catalogue)
    build_attributes(:xobject, object_catalogue)
  end

  def font(key)
    @fonts[key]
  end

  def xobject(key)
    @xobjects[key]
  end

  private

  # Fills @fonts / @xobjects from the attribute of the same name, which
  # may be a Hash or a PdfHash.
  def build_attributes(type, object_catalogue)
    storage = instance_variable_get "@#{type}s"
    attribute = @attributes[type]
    case attribute
    when Hash
      _build_attributes(storage, attribute, object_catalogue)
    when PdfHash
      _build_attributes(storage, attribute.contents, object_catalogue)
    end
  end

  # Stores, for every name in +hash+, the object its reference points to
  # (nil if unknown) after building that object's tree.
  def _build_attributes(storage, hash, object_catalogue)
    hash.each do |key, val|
      oid = /^\d+/n.match(val).to_s.to_i
      obj = object_catalogue[oid]
      obj.build_tree(object_catalogue) if obj
      storage.store(key, obj)
    end
  end
end
402
# The trailer of a PDF file: points to the root and the encryption
# dictionary and carries the file id.
class TrailerDictionary < PdfObject
  # First element of /ID reduced to its first alphanumeric run;
  # '' when the trailer has no /ID entry.
  def file_id
    first_id = Array(@attributes[:id]).first
    /[a-zA-Z0-9]+/n.match(first_id.to_s).to_s
  end

  # Object number of the encryption dictionary (0 if there is none).
  def encrypt_id
    extract_oid(@attributes[:encrypt])
  end

  # Disabled trailer-specific parser, kept for reference:
  #
  #   def parse_attributes
  #     index = @src.index('trailer')
  #     rindex = @src.rindex('startxref')
  #     unless(index && rindex)
  #       @attibutes = {}
  #     else
  #       #set correct offsets <<(begin) >>(end)
  #       src = @src[index+7..(rindex-1)]
  #       ast = Rpdf2txt.attributes_parser.parse(src)
  #       ast.compact!
  #       @attributes = extract_attributes(ast)
  #     end
  #   end

  # Object number of the document catalog.
  def root_id
    extract_oid(@attributes[:root])
  end

  # Merges the entries of another trailer into this one.
  def update(trailer_dict)
    @attributes.update(trailer_dict.attributes)
  end

  protected

  # readable by other trailers only (see #update)
  attr_reader :attributes
end
433
# Common behaviour of the nodes of the page tree.
class TreeNode < PdfObject
  include Enumerable
  attr_reader :parent

  # Resolves the attributes and remembers +parent+. Returns self.
  def build_tree(object_catalogue, parent=nil)
    super
    @parent = parent
    self
  end

  # A plain node enumerates just itself.
  def each
    yield self
  end

  # Returns the first number of every element of +array+; elements
  # without a number are dropped. nil yields [], and a single reference
  # is accepted in place of an Array.
  def extract_oids(array)
    Array(array).collect{ |dirty_id|
      if(match = /\d+/on.match(dirty_id))
        match[0].to_i
      end
    }.compact
  end

  # True when the node neither was given a parent nor names one.
  def root?
    !(@parent || @attributes[:parent])
  end
end
455
# The document catalog: the entry point into the page tree.
class CatalogNode < TreeNode
  # Looks up the root of the page tree (/Pages) and builds it with this
  # node as its parent, then resolves the catalog's own attributes.
  def build_tree(object_catalogue, parent=nil)
    pages_oid = extract_oids(@attributes[:pages]).first
    @pages = object_catalogue[pages_oid]
    @pages.build_tree(object_catalogue, self)
    super
  end

  # Enumerates whatever the page tree enumerates.
  def each(&block)
    @pages.each(&block)
  end
end
466
# Inner node of the page tree; its /Kids are further nodes or leaves.
class PageNode < TreeNode
  attr_reader :kids

  # Collects the objects listed in /Kids and builds their trees with this
  # node as parent. References to objects that are not in the catalogue
  # are skipped, so #each never meets a nil kid.
  def build_tree(object_catalogue, parent=nil)
    @kids = []
    extract_oids(@attributes[:kids]).each { |id|
      child = object_catalogue[id]
      next if child.nil?
      @kids.push(child)
      child.build_tree(object_catalogue, self)
    }
    super
  end

  # Yields everything the kids enumerate, in order.
  def each
    @kids.each { |kid|
      kid.each { |result| yield result }
    }
  end

  # /MediaBox as Array of Floats. The entry is inheritable: a node without
  # its own box asks its parent, if the parent can answer. nil if no
  # ancestor defines one.
  def media_box
    if(mb = @attributes[:mediabox])
      mb.collect { |val| val.to_f }
    elsif(@parent.respond_to?(:media_box))
      @parent.media_box
    end
  end
end
488
+ class PageLeaf < TreeNode
489
+ attr_reader :contents, :resources
490
+ def initialize(*args)
491
+ super
492
+ @text_state = TextState.new(@target_encoding)
493
+ end
494
+ def build_tree(object_catalogue, parent=nil)
495
+ @contents=[]
496
+ extract_oids(@attributes[:contents]).each{ |id|
497
+ content = object_catalogue[id]
498
+ @contents.push(content)
499
+ content.build_tree(object_catalogue, self) if content.respond_to?(:build_tree)
500
+ }
501
+ resources = @attributes[:resources]
502
+ if(resources.is_a? String)
503
+ @resources = object_catalogue[extract_oids([resources]).first]
504
+ elsif resources.is_a? Hash
505
+ @resources = Resource.new(resources)
506
+ elsif(resources.nil? && @parent)
507
+ @resources = @parent.resources
508
+ else
509
+ @resources = Resource.new()
510
+ end
511
+ @resources.build_tree(object_catalogue) if @resources.is_a? Resource
512
+ super
513
+ end
514
+ def font(key)
515
+ @resources.font(key)
516
+ end
517
+ def media_box
518
+ if(parent)
519
+ parent.media_box
520
+ end
521
+ end
522
+ def text(callback_handler)
523
+ concat_stream = Stream.new('')
524
+ if(@contents.size == 1 && @contents.first.is_a?(ReferenceArray))
525
+ @contents.first.build_stream(concat_stream)
526
+ else
527
+ @contents.each { |stream|
528
+ concat_stream.append(stream.decoded_stream)
529
+ }
530
+ end
531
+ @text_state.media_box = self.media_box
532
+ text_snippets = concat_stream.extract_text_objects(self, @text_state)
533
+ join_snippets(text_snippets, callback_handler)
534
+ end
535
+ private
536
+ def each_pair(text_snippets, &block)
537
+ text_snippets.inject(nil) { |last_text_state, text_state|
538
+ if text_state.whitespace_overlap?(last_text_state)
539
+ last_text_state
540
+ else
541
+ block.call(last_text_state, text_state)
542
+ text_state
543
+ end
544
+ }
545
+ end
546
## Guesses the x-positions of the text columns on a page.
#
# text_snippets:: snippets responding to #space_width, #x, #empty? and
#                 #same_column; they are walked through #each_pair
# hints::         optional Hash.
#                 :count - when at least that many candidate cells
#                 exist, the positions of the most populated ones are
#                 returned directly.
#                 :width - minimal distance between two columns, in
#                 grid cells; a "dividend/divisor" String is applied
#                 to the summed distance between candidate cells.
#
# Returns an ascending Array of positions (grid index * grid width),
# or [] when no snippet was counted.
def identify_columns(text_snippets, hints={})
  ## the narrowest positive space on the page is used as grid width
  widths = text_snippets.map { |snip| snip.space_width }
  grid = widths.select { |w| w > 0 }.min || 100.0

  ## count occurrences of snippets in each grid cell
  tally = {}
  each_pair(text_snippets) do |prev_state, state|
    next if state.empty?
    next if prev_state && state.same_column(prev_state) && !prev_state.empty?
    cell = (state.x / grid).floor
    tally[cell] = tally[cell].to_i + 1
  end
  return [] if tally.empty?

  ## drop leading cells until one holds more than two snippets; that
  ## cell is the reference point for measuring distances
  ordered = tally.sort
  anchor = hits = nil
  loop do
    anchor, hits = ordered.shift
    break if hits.nil? || hits > 2
  end

  ## sum up the distances between successive well-populated cells
  strong = 0
  span = 0
  ranked = []
  ordered.each do |cell, n|
    ranked.push [n, cell]
    next unless n > 2
    strong += 1
    span += cell - anchor
    anchor = cell
  end

  wanted = hints[:count]
  if wanted && ranked.size >= wanted
    best = ranked.sort[-wanted..-1].map { |_n, cell| cell * grid }
    return best.sort
  end

  limit = hints[:width]
  if limit.is_a?(String)
    dividend, divisor = limit.split '/', 2
    limit = span * dividend.to_f / divisor.to_f
  end
  ## default: 90% of the (integer) average distance between candidates
  limit ||= (strong.nonzero? ? span / strong * 0.9 : span)

  ## select probable columns
  accepted = -limit
  picks = []
  ordered = tally.sort
  origin, = ordered.first
  ordered.each_with_index do |(cell, n), i|
    rel = cell - origin
    ## look ahead for the next cell holding more than three snippets
    j = i + 1
    ahead = ahead_n = nil
    loop do
      ahead, ahead_n = ordered[j]
      j += 1
      break if ahead_n.nil? || ahead_n > 3
    end
    ahead -= origin if ahead
    ## needs more than one snippet and enough distance to the last pick
    next unless n > 1 && (rel - accepted) > limit
    ## a stronger cell close by takes precedence over this one
    next if ahead_n.to_i > n && (ahead - rel) < limit
    accepted = rel
    picks.push rel + origin
  end
  picks.map { |cell| cell * grid }.sort
end
613
# Sorts +text_snippets+ in place and replays them to
# +callback_handler+, emitting column and colspan events in between.
#
# If the handler asks for column identification, the positions come
# from #identify_columns (with the handler's width and count as
# hints). At the start of every line a column event is sent; further
# column boundaries a snippet has passed yield either a column event
# or, when the previous non-empty snippet reaches across the boundary,
# a colspan event.
def join_snippets(text_snippets, callback_handler)
  text_snippets.sort!
  columns = []
  if callback_handler.identify_columns?
    columns = identify_columns(text_snippets,
                               :width => callback_handler.column_width,
                               :count => callback_handler.column_count)
    # throw away the first column - the left media edge takes its place
    columns.shift
  end
  pending = []
  boundary = nil
  each_pair(text_snippets) do |prev_state, state|
    state.fire_early_callbacks(prev_state, callback_handler)
    # are we on a new line?
    if !prev_state || !state.same_line(prev_state)
      pending = columns.dup
      boundary = pending.shift
      prev_state = nil
      callback_handler.send_column
    end
    prev_right = prev_state && prev_state.right_edge.to_i
    while boundary && state.x.to_i >= boundary.to_i
      if prev_right && prev_right > boundary.to_i && !prev_state.empty?
        callback_handler.send_colspan
      else
        callback_handler.send_column
      end
      boundary = pending.shift
    end
    state.send_content(prev_state, callback_handler)
  end
end
646
+ end
647
## A PDF stream object. It decodes its raw data on demand and scans
## page content streams for text objects, image placements and
## horizontal rules.
class Stream < PdfObject
  num = "([0-9.-]+)\\s*"
  dm_str = "#{num}#{num}#{num}#{num}#{num}#{num}cm\\b"
  xobj = '(/\S+)\s*(\bDo\b)'
  ## alternatives, in this order: a six-operand 'cm' operator, a lone
  ## q or Q, a "/Name Do" invocation, a BI ... ID ... EI inline image
  @@nontext_scan_pattern = %r!(?:#{dm_str})|(\b[qQ]\b)|#{xobj}|\bBI\b(.*?)\bID\b(.*?)\b(EI)\b!mn
  ## two numbers followed by an 'l' or 'm' operator
  @@hr_scan_pattern = /#{num}#{num}(\b[lm]\b)/mn
  BT_PATTERN = /\bBT\b(?!(\\[()]|[^(\\])*\))/mn
  ET_PATTERN = /\bET\b(?!(\\[()]|[^(\\])*\))/mn
  ## matches a slice that ends in an ET which follows an opening
  ## parenthesis without a closing one in between - presumably an ET
  ## inside a string literal
  FAIL_PTRN = /\((\\[()]|[^)])*\bET\b\s*$/mn
  # Appends +decoded_stream+ to the already decoded data.
  def append(decoded_stream)
    (@decoded_stream ||= '') << decoded_stream
  end
  def decoded_stream=(decoded_stream)
    @decoded_stream = decoded_stream
  end
  # Decoded stream data; decoded from the raw stream on first access.
  def decoded_stream
    @decoded_stream ||= decode_raw_stream
  end
  # Scans +dm_src+ for 'm' and 'l' operators and pushes a
  # HorizontalRule onto +result+ for every 'l' whose y equals that of
  # the preceding point while its x differs.
  def extract_horizontal_rules(dm_src, dmatrix, result)
    last_x = 0
    last_y = 0
    dm_src.scan(@@hr_scan_pattern) { |matches|
      case matches.last.to_s[-1]
      when ?l
        x = matches[0].to_f
        y = matches[1].to_f
        if(x != last_x && y == last_y)
          hr = HorizontalRule.new(x, y, dmatrix)
          hr.current_page, hr.text_state = @page, @text_state
          result.push(hr)
        end
        last_x = x
        last_y = y
      when ?m
        last_x = matches[0].to_f
        last_y = matches[1].to_f
      end
    }
  end
  # Scans +dm_src+ for operators outside of text objects: q/Q push and
  # pop the current matrix on +stack+, 'cm' multiplies it, 'Do' and
  # inline images push an ImagePlacement onto +result+.
  #
  # Returns the matrix in effect after the scan.
  def extract_nontext_objects(dm_src, dmatrix, stack, result)
    dm_src.scan(@@nontext_scan_pattern) { |matches|
      matches = matches.compact
      case matches.last
      when 'q'
        stack.push(dmatrix)
      when 'Q'
        dmatrix = stack.pop
      when 'Do'
        ## an XObject is placed at the position of the last result
        x, y = (txt = result.last) ? [txt.x, txt.y] : [0, 0]
        ip = ImagePlacement.new(matches[-2], x, y, dmatrix)
        ip.current_page, ip.text_state = @page, @text_state
        result.push ip
      when 'EI'
        attrs, data, _ = matches
        im = InlineImage.new attrs, data.strip
        ip = ImagePlacement.new im, 0, 0, dmatrix
        ip.current_page, ip.text_state = @page, @text_state
        result.push ip
      else
        mmatrix = Matrix[[matches[0].to_f, matches[1].to_f,0],
                         [matches[2].to_f, matches[3].to_f,0],
                         [matches[4].to_f, matches[5].to_f,1]]
        dmatrix = dmatrix * mmatrix
      end
    }
    dmatrix
  end
  # Splits the decoded stream into BT ... ET text objects and the
  # material between them. Remembers +page+ and +text_state+ for the
  # objects created on the way.
  #
  # Returns an Array holding the results of Text#scan mixed with the
  # image placements and horizontal rules found in between. A stream
  # without any ET operator yields an empty Array.
  def extract_text_objects(page, text_state)
    @page, @text_state = page, text_state
    stack = []
    result = []
    startpoint = decoded_stream.index(BT_PATTERN)
    endpoint = decoded_stream.index(ET_PATTERN)
    ## skip ETs rejected by FAIL_PTRN; the nil-check keeps streams
    ## without a usable ET from raising on nil + 2
    while endpoint && FAIL_PTRN.match(decoded_stream[0..(endpoint+2)])
      endpoint = decoded_stream.index(ET_PATTERN, endpoint.next)
    end
    unless(startpoint && endpoint && (startpoint < endpoint))
      startpoint = 0
    end
    rotation = (page && Math::PI * page.attributes[:rotate].to_f / 180) || 0
    dmatrix = Matrix[[Math.cos(rotation),Math.sin(rotation),0],
                     [Math.sin(rotation),-Math.cos(rotation),0],
                     [0,0,1]]

    dm_src = decoded_stream[0...startpoint]
    while(endpoint && startpoint)
      ### pick out the bits in between Text that are relevant to
      ### text positioning (such as the device-transformation-matrix)
      ### NOTE: as far as I understand, the device matrix should
      ###       not be used to position text. However it is used
      ###       by some PDF-Creators and therefore we have to include
      ###       it in our calculations.
      dmatrix = extract_nontext_objects(dm_src, dmatrix, stack, result)
      extract_horizontal_rules(dm_src, dmatrix, result)
      tsrc = decoded_stream[startpoint..(endpoint+2)]
      while FAIL_PTRN.match(tsrc)
        next_end = decoded_stream.index(ET_PATTERN, endpoint + 2)
        if next_end.nil?
          ## no further ET: the text object runs to the end of the
          ## stream and nothing can follow it
          tsrc = decoded_stream[startpoint..-1]
          endpoint = decoded_stream.length
          break
        end
        endpoint = next_end
        tsrc = decoded_stream[startpoint..(endpoint+2)]
      end
      text = Text.new(tsrc, @target_encoding, dmatrix)
      text.current_page = page
      text.text_state = text_state
      result.concat text.scan
      startpoint = decoded_stream.index(BT_PATTERN, endpoint)
      if(startpoint)
        dm_src = decoded_stream[endpoint...startpoint]
        endpoint = decoded_stream.index(ET_PATTERN, startpoint)
      end
    end
    result
  end
  # The undecoded data between the 'stream' and 'endstream' keywords of
  # the object source.
  def raw_stream
    ## Array#to_s concatenates on Ruby 1.8 only, #join does so on
    ## every version
    @raw_stream ||= @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).join
  end
  # Decrypts the raw stream if a decoder is set and applies the
  # filters named in the :filter attribute in order. A filter that
  # fails or is not implemented is reported with a warning and the
  # data is passed on as it was before that filter.
  def decode_raw_stream
    @decrypted_stream = raw_stream
    unless(@decoder.nil?)
      @decrypted_stream = @decoder.decrypt(self)
    end
    stream = @decrypted_stream
    [@attributes[:filter]].flatten.compact.each { |filter|
      begin
        stream = case filter
                 when "/FlateDecode"
                   flate_decode stream
                 when "/LZWDecode"
                   lzw_decode stream
                 else
                   raise "Unimplemented filter: #{filter}"
                 end
      rescue StandardError => err
        warn "'#{err.message}' when filtering with #{filter}"
      end
    }
    stream
  end
  def flate_decode(data)
    Zlib::Inflate.inflate(data)
  end
  # Decodes LZW data. The data is cut to the :length attribute if
  # present; :earlychange is taken from :decodeparms and defaults to 1.
  def lzw_decode(data)
    ## loaded lazily: only needed for LZW-encoded streams
    require 'rpdf2txt/lzw'
    earlychange = (parm = @attributes[:decodeparms]) && parm[:earlychange]
    if length = @attributes[:length]
      data = data[0, length.to_i]
    end
    LZW.decode data, (earlychange || 1).to_i
  end
  # Builds a CMap from the source of this object.
  def to_cmap
    CMap.new(@src, @target_encoding)
  end
end
798
# A Stream subclass that adds no behaviour of its own here; it only
# provides a distinct class for such objects.
class ObjStream < Stream; end
800
## An image stream that can be turned into a Magick::Image.
class Image < Stream
  ## PDF colorspace name => channel layout passed to RMagick
  COLORMAPS = {
    '/DeviceRGB'  => 'RGB',
    '/DeviceGray' => 'I',
    '/DeviceCMYK' => 'CMYK',
  }
  # Builds the Magick::Image for this stream from its :width, :height,
  # :bitspercomponent, :mask and :colorspace attributes. The result is
  # kept and returned again on later calls.
  def image
    ## loaded lazily: only needed when an image is actually requested
    require 'RMagick'
    return @image if @image
    width = @attributes[:width].to_i
    height = @attributes[:height].to_i
    bits = @attributes[:bitspercomponent].to_i
    mask = @attributes[:mask]
    grades = 2 ** bits - 1
    space, base, hival, lookup = @attributes[:colorspace]
    hival = hival.to_i
    layout = COLORMAPS[space] || COLORMAPS[base] || 'RGB'
    channels = layout.length
    samples = extract_pixels(decoded_stream, bits)
    if '/Indexed' == space
      ## FIXME: this works for some images, but seems to be wrong
      #         according to the Documentation
      if mask.is_a?(Array) && (samples.size - 1) > height * width
        masked = (mask[0].to_i)..(mask[1].to_i)
        samples.delete_if { |idx| masked.include? idx }
      end
      # for indexed images the third colorspace entry correctly
      # describes the depth of the resulting pixels, whereas
      # bitspercomponent may not be accurate
      grades = hival
      palette = extract_colormap(lookup, hival)
      ## replace every index by its entry of the palette
      expanded = Array.new(samples.size * channels)
      samples.each_with_index do |idx, n|
        expanded[n * channels, channels] = palette[idx * channels, channels]
      end
      samples = expanded
    end
    ## this seems to be undocumented: PNG-images need to be decoded.
    #  we can detect this by the additional Byte per Row:
    expected = height * width * channels
    if samples.size == (width * channels + 1) * height
      samples = idat_decode samples, width, channels
    elsif samples.size > expected
      samples = samples[0, expected]
    end
    ## scale the samples unless they already use RMagick's quantum range
    if grades != (2 ** Magick::QuantumDepth - 1)
      scale = grades.to_f
      samples.collect! { |px| px / scale }
    end
    @image = Magick::Image.constitute(width, height, layout, samples)
  end
  # Reverses the per-row filters (None, Sub, Up, Average, Paeth) of
  # PNG-style data. Every row of +data+ starts with one filter byte
  # followed by width * colors sample bytes. +data+ is consumed.
  #
  # Returns the unfiltered bytes of all rows as one Array; raises
  # ArgumentError for an unknown filter byte.
  def idat_decode(data, width, colors)
    byte_width = width * colors
    scanline_length = byte_width + 1 # for filter

    decoded = []
    row = 0
    until data.empty?
      line = data.slice!(0, scanline_length)
      filter = line.shift
      first_row = row == 0
      case filter
      when 0 # None
      when 1 # Sub
        line.each_index do |i|
          left = i < colors ? 0 : line[i - colors]
          line[i] = (line[i] + left) % 256
        end
      when 2 # Up
        line.each_index do |i|
          ## negative index: the same byte of the previous row
          upper = first_row ? 0 : decoded[i - byte_width]
          line[i] = (upper + line[i]) % 256
        end
      when 3 # Average
        line.each_index do |i|
          upper = first_row ? 0 : decoded[i - byte_width]
          left = i < colors ? 0 : line[i - colors]
          line[i] = (line[i] + ((left + upper)/2).floor) % 256
        end
      when 4 # Paeth
        line.each_index do |i|
          left = i < colors ? 0 : line[i - colors]
          if first_row
            upper = upper_left = 0
          else
            above = i - byte_width
            upper = decoded[above]
            upper_left = i < colors ? 0 : decoded[above - colors]
          end
          predicted = paeth(left, upper, upper_left)
          line[i] = (line[i] + predicted) % 256
        end
      else
        raise ArgumentError, "Invalid filter algorithm #{filter}"
      end

      decoded.concat line
      row += 1
    end
    decoded
  end
  private
  # Returns the palette entries: the bytes of a Stream, each and-ed
  # with +mask+, an Array as it is, and [] for anything else.
  def extract_colormap(index, mask)
    case index
    when Stream
      index.decoded_stream.unpack('C*').collect { |int| int & mask }
    when Array
      index
    else
      []
    end
  end
  # Splits +stream+ into integers of +depth+ bits each; trailing bits
  # that do not fill a whole sample are dropped.
  def extract_pixels(stream, depth)
    return stream.unpack('C*') if depth == 8
    bits, = stream.unpack('B*')
    bits.scan(/.{#{depth}}/n).collect { |chunk| chunk.to_i(2) }
  end
  # Paeth predictor: returns whichever of a (left), b (above) and
  # c (upper left) is closest to a + b - c, preferring a, then b.
  def paeth(a, b, c)
    estimate = a + b - c
    dist_a = (estimate - a).abs
    dist_b = (estimate - b).abs
    dist_c = (estimate - c).abs

    if dist_a <= dist_b && dist_a <= dist_c
      a
    elsif dist_b <= dist_c
      b
    else
      c
    end
  end
end
941
## An image embedded in a content stream. Its attributes may use
## abbreviated keys and values, which are expanded after parsing.
class InlineImage < Image
  ## abbreviated attribute key => full attribute key
  ATTR_ABBREVIATIONS = {
    :bpc => :bitspercomponent,
    :cs  => :colorspace,
    :d   => :decode,
    :dp  => :decodeparms,
    :f   => :filter,
    :h   => :height,
    :im  => :imagemask,
    :i   => :interpolate,
    :w   => :width,
  }
  ## abbreviated colorspace and filter names => full names
  OTHER_ABBREVIATIONS = {
    '/G'    => '/DeviceGray',
    '/RGB'  => '/DeviceRGB',
    '/CMYK' => '/DeviceCMYK',
    '/I'    => '/Indexed',
    '/AHx'  => '/ASCIIHexDecode',
    '/A85'  => '/ASCII85Decode',
    '/LZW'  => '/LZWDecode',
    '/Fl'   => '/FlateDecode',
    '/RL'   => '/RunLengthDecode',
    '/CCF'  => '/CCITTFaxDecode',
    '/DCT'  => '/DCTDecode',
  }
  # attrs:: the attribute source without the enclosing << >>
  # data::  the image data; it is used as raw stream as it is
  def initialize(attrs, data)
    super('<<' + attrs + '>>')
    @raw_stream = data
  end
  # Parses the attributes and then moves every value stored under an
  # abbreviated key to its full key, expanding an abbreviated value on
  # the way. Abbreviated keys without a (truthy) value are ignored.
  def parse_attributes
    super
    ATTR_ABBREVIATIONS.each do |abbr, key|
      value = @attributes.delete(abbr)
      next unless value
      @attributes.store(key, OTHER_ABBREVIATIONS.fetch(value, value))
    end
  end
end
975
## A character map stream. The bfchar and bfrange sections of the
## decoded stream are parsed into #map, a Hash of Integer character
## codes to Integer target values.
class CMap < Stream
  attr_accessor :map
  def initialize(*args)
    @map = {}
    super
    parse_cmap()
  end
  # Translates +txt+ with the map.
  #
  # An Integer is looked up directly (nil if it is not mapped). Any
  # other input is split into bytes; every byte is replaced by its
  # mapped value, or kept if it is not mapped, and the result is
  # packed as UTF-8. Without a map +txt+ is returned as it is.
  def to_utf8(txt)
    if(@map.nil?)
      txt
    elsif(txt.is_a?(Integer))
      @map[txt]
    else
      txt.unpack('C*').collect { |byte|
        @map.fetch(byte, byte) }.pack('U*')
    end
  end
  private
  #bfchar definition
  def add_to_map_bfchar(ast)
    ast.compact!
    ast.each { |child|
      #convert in to decimal values
      @map.store(_hexvalue(child.source), _hexvalue(child.target))
    }
    @map
  end
  #bfrange definition see page 457 of the pdf manual
  def add_to_map_bfrange(ast)
    ast.compact!
    start_range = ast.start.value.to_s.hex
    end_range = ast.stop.value.to_s.hex
    if(ast.children_names.include?('explicit'))
      ## one explicit target per code of the range
      explicit = ast.explicit
      start_range.upto(end_range) { |char|
        @map.store(char, _hexvalue(explicit.shift))
      }
    else
      ## consecutive targets, starting at the given offset
      offset = _hexvalue(ast.offset)
      start_range.upto(end_range) { |char|
        @map.store(char, offset)
        offset+=1
      }
    end
    @map
  end
  # Source of the first bfchar section, or nil if there is none.
  def extract_bfchar
    extract_sections('beginbfchar', 'endbfchar').first
  end
  # Source of the first bfrange section, or nil if there is none.
  def extract_bfrange
    extract_sections('beginbfrange', 'endbfrange').first
  end
  # Collects the source of every section of the decoded stream that is
  # enclosed by +begin_keyword+ and +end_keyword+, in order of
  # appearance. A section starts 12 characters after the start of its
  # begin keyword and ends right before its end keyword. A begin
  # keyword without a following end keyword ends the search.
  def extract_sections(begin_keyword, end_keyword)
    src = decoded_stream
    sections = []
    pos = 0
    while(start = src.index(begin_keyword, pos))
      stop = src.index(end_keyword, start)
      break if stop.nil?
      sections.push(src[(start + 12)...stop])
      pos = stop + end_keyword.length
    end
    sections
  end
  def _hexvalue(ast)
    ast.value.to_s.to_i(16)
  end
  # Fills the map from all bfchar sections, then from all bfrange
  # sections of the decoded stream.
  def parse_cmap
    extract_sections('beginbfchar', 'endbfchar').each { |src|
      ast = Rpdf2txt.cmap_parser.parse(src)
      add_to_map_bfchar(ast)
    }
    extract_sections('beginbfrange', 'endbfrange').each { |src|
      ast = Rpdf2txt.cmap_range_parser.parse(src)
      ast.each { |node|
        add_to_map_bfrange(node)
      }
    }
  end
end
1053
## An array of object references, resolved against the object
## catalogue when the tree is built.
class ReferenceArray < TreeNode
  # Collects the catalogue entries of all references; references
  # without an entry in +object_catalogue+ are left out.
  def build_tree(object_catalogue, parent=nil)
    @contents = []
    @references.each do |id|
      entry = object_catalogue[id]
      @contents.push(entry) if entry
    end
    super
  end
  # Appends the decoded data of every resolved object to
  # +concat_stream+ and returns +concat_stream+.
  def build_stream(concat_stream)
    @contents.each do |part|
      concat_stream.append(part.decoded_stream)
    end
    concat_stream
  end
  # Parses the source from the first '[' to the last ']' and keeps the
  # object ids found in it.
  def parse_attributes
    first = @src.index('[')
    last = @src.rindex(']')
    ast = _parse_attributes(@src[first..last])
    ast.compact!
    @references = extract_oids(extract_attributes(ast))
  end
  def root?
    false
  end
end
1077
## An array object whose parsed elements are available through #at
## and #each.
class PdfArray < TreeNode
  # Resets the contents to an empty Array before the tree is built.
  def build_tree(object_catalogue, parent=nil)
    @contents = []
    super
  end
  # The element at position +idx+.
  def at(idx)
    @contents.at(idx)
  end
  # Yields every element to the given block.
  def each(&block)
    @contents.each(&block)
  end
  # Parses the source from the first '[' to the last ']' and keeps the
  # extracted attributes as contents.
  def parse_attributes
    first = @src.index('[')
    last = @src.rindex(']')
    ast = _parse_attributes(@src[first..last])
    ast.compact!
    @contents = extract_attributes(ast)
  end
  def root?
    false
  end
end
1098
## A dictionary object whose parsed entries are readable as #contents.
class PdfHash < TreeNode
  attr_reader :contents
  # Resets the contents to an empty Hash before the tree is built.
  def build_tree(object_catalogue, parent=nil)
    @contents = {}
    super
  end
  # Parses the source from the first '<<' to the last '>' and keeps
  # the extracted attributes as contents.
  def parse_attributes
    first = @src.index('<<')
    last = @src.rindex('>')
    ast = _parse_attributes(@src[first..last])
    ast.compact!
    @contents = extract_attributes(ast)
  end
  def root?
    false
  end
end
1114
+ end