rpdf2txt 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. data/History.txt +5 -0
  2. data/LICENCE +515 -0
  3. data/Manifest.txt +126 -0
  4. data/README.txt +30 -0
  5. data/Rakefile +24 -0
  6. data/bin/rpdf2txt +58 -0
  7. data/config.save +12 -0
  8. data/install.rb +1098 -0
  9. data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
  10. data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
  11. data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
  12. data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
  13. data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
  14. data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
  15. data/lib/rpdf2txt-rockit/grammar.rb +644 -0
  16. data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
  17. data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
  18. data/lib/rpdf2txt-rockit/indexable.rb +53 -0
  19. data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
  20. data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
  21. data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
  22. data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
  23. data/lib/rpdf2txt-rockit/profiler.rb +168 -0
  24. data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
  25. data/lib/rpdf2txt-rockit/rockit.rb +76 -0
  26. data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
  27. data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
  28. data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
  29. data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
  30. data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
  31. data/lib/rpdf2txt-rockit/token.rb +364 -0
  32. data/lib/rpdf2txt-rockit/version.rb +3 -0
  33. data/lib/rpdf2txt/attributesparser.rb +42 -0
  34. data/lib/rpdf2txt/cmapparser.rb +65 -0
  35. data/lib/rpdf2txt/data/_cmap.grammar +11 -0
  36. data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
  37. data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
  38. data/lib/rpdf2txt/data/cmap.grammar +11 -0
  39. data/lib/rpdf2txt/data/cmap.rb +37 -0
  40. data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
  41. data/lib/rpdf2txt/data/cmap_range.rb +43 -0
  42. data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
  43. data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
  44. data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
  45. data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
  46. data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
  47. data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
  48. data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
  49. data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
  50. data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
  51. data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
  52. data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
  53. data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
  54. data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
  55. data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
  56. data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
  57. data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
  58. data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
  59. data/lib/rpdf2txt/data/pdftext.grammar +102 -0
  60. data/lib/rpdf2txt/data/pdftext.rb +146 -0
  61. data/lib/rpdf2txt/default_handler.rb +352 -0
  62. data/lib/rpdf2txt/lzw.rb +69 -0
  63. data/lib/rpdf2txt/object.rb +1114 -0
  64. data/lib/rpdf2txt/parser.rb +169 -0
  65. data/lib/rpdf2txt/symbol.rb +408 -0
  66. data/lib/rpdf2txt/text.rb +182 -0
  67. data/lib/rpdf2txt/text_state.rb +434 -0
  68. data/lib/rpdf2txt/textparser.rb +42 -0
  69. data/test/data/3392_obj +0 -0
  70. data/test/data/397_decrypted +15 -0
  71. data/test/data/450_decrypted +153 -0
  72. data/test/data/450_obj +0 -0
  73. data/test/data/452_decrypted +125 -0
  74. data/test/data/454_decrypted +108 -0
  75. data/test/data/456_decrypted +106 -0
  76. data/test/data/458_decrypted +111 -0
  77. data/test/data/458_obj +0 -0
  78. data/test/data/460_decrypted +118 -0
  79. data/test/data/460_obj +0 -0
  80. data/test/data/463_decrypted +117 -0
  81. data/test/data/465_decrypted +107 -0
  82. data/test/data/465_obj +0 -0
  83. data/test/data/90_obj +0 -0
  84. data/test/data/90_obj_comp +1 -0
  85. data/test/data/decrypted +0 -0
  86. data/test/data/encrypt_obj +0 -0
  87. data/test/data/encrypt_string +0 -0
  88. data/test/data/encrypt_string_128bit +0 -0
  89. data/test/data/encrypted_object_stream.pdf +0 -0
  90. data/test/data/firststream +1 -0
  91. data/test/data/index.pdfobj +0 -0
  92. data/test/data/index_2bit.pdfobj +0 -0
  93. data/test/data/index_masked.pdfobj +0 -0
  94. data/test/data/indexed.pdfobj +0 -0
  95. data/test/data/indexed_2bit.pdfobj +0 -0
  96. data/test/data/indexed_masked.pdfobj +0 -0
  97. data/test/data/inline.png +0 -0
  98. data/test/data/logo.png +0 -0
  99. data/test/data/lzw.pdfobj +0 -0
  100. data/test/data/lzw_index.pdfobj +0 -0
  101. data/test/data/page_tree.pdf +148 -0
  102. data/test/data/pdf_20.png +0 -0
  103. data/test/data/pdf_21.png +0 -0
  104. data/test/data/pdf_22.png +0 -0
  105. data/test/data/pdf_50.png +0 -0
  106. data/test/data/png.pdfobj +0 -0
  107. data/test/data/space_bug_stream.txt +119 -0
  108. data/test/data/stream.txt +292 -0
  109. data/test/data/stream_kerning_bug.txt +13 -0
  110. data/test/data/stream_kerning_bug2.txt +6 -0
  111. data/test/data/test.pdf +0 -0
  112. data/test/data/test.txt +8 -0
  113. data/test/data/test_text.txt +42 -0
  114. data/test/data/working_obj +0 -0
  115. data/test/data/working_obj2 +0 -0
  116. data/test/mock.rb +149 -0
  117. data/test/suite.rb +30 -0
  118. data/test/test_pdf_object.rb +1802 -0
  119. data/test/test_pdf_parser.rb +1340 -0
  120. data/test/test_pdf_text.rb +789 -0
  121. data/test/test_space_bug_05_2004.rb +87 -0
  122. data/test/test_stream.rb +194 -0
  123. data/test/test_text_state.rb +315 -0
  124. data/usage-en.txt +112 -0
  125. data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
  126. data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
  127. metadata +220 -0
@@ -0,0 +1,352 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Rpdf2txt -- PDF to Text Parser
4
+ # Copyright (C) 2004 Mike Walder, Raphael Waltert, Hannes Wyss
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com, mwalder@ywesee.com, rwaltert@ywesee.com
22
+ #
23
+ # PdfParser -- Rpdf2txt-- 14.11.2002 -- mwalder@ywesee.com, rwaltert@ywesee.com
24
+
25
+ module Rpdf2txt
26
# Lookup table from Unicode code points to single-byte character codes.
# The values agree with the character codes of Adobe's Symbol font
# (HTMLHandler switches to <font face="Symbol"> for symbol fonts).
module SymbolMap
  # this map is not complete!!
  SYMBOL_ENTITIES = {
    3    => 65,
    8804 => 163,
    8805 => 179,
    # alpha-omega (upper case)
    913  => 65,
    914  => 66,
    915  => 71,
    916  => 68,
    917  => 69,
    918  => 90,
    919  => 72,
    920  => 81,
    921  => 73,
    922  => 75,
    923  => 76,
    924  => 77,
    925  => 78,
    926  => 88,
    927  => 79,
    928  => 80,
    929  => 82,
    931  => 83,
    932  => 84,
    933  => 85,
    934  => 70,
    935  => 67,
    936  => 89,
    # capital Omega was missing from the original range
    937  => 87,
    # alpha-omega (lower case)
    945  => 97,
    946  => 98,
    947  => 103,
    948  => 100,
    949  => 101,
    950  => 122,
    951  => 104,
    952  => 113,
    953  => 105,
    954  => 107,
    955  => 108,
    956  => 109,
    957  => 110,
    958  => 120,
    959  => 111,
    960  => 112,
    961  => 114,
    # final sigma was missing from the original range
    962  => 86,
    963  => 115,
    964  => 116,
    965  => 117,
    966  => 102,
    967  => 99,
    968  => 121,
    969  => 119,
    # dot?
    8901 => 46,
    # intersection (cap)
    8745 => 199,
    # union (cup)
    8746 => 200,
    # infinity
    8734 => 165,
    # integral
    # NOTE(review): Adobe's Symbol encoding appears to place the integral
    # at 242 (166 is florin) -- confirm before relying on this entry.
    8747 => 166,
    # partial differential
    8706 => 182,
    # not equal to
    8800 => 185,
    # equal
    # NOTE(review): 186 looks like "equivalence" in the Symbol encoding,
    # the plain equals sign is 61 -- confirm.
    61   => 186,
    # almost equal to
    8776 => 187,
    # superset of
    8835 => 201,
    # superset or equal to
    8839 => 202,
    # not a subset of
    8836 => 203,
    # subset of
    8834 => 204,
    # subset or equal to
    8838 => 205,
    # element of
    8712 => 206,
    # not an element of
    8713 => 207,
    # n-ary product
    8719 => 213,
    # radic
    8730 => 214,
    # n-ary sum
    8721 => 229,
  }
end
120
# Baseline implementation of the handler callbacks. Most callbacks do
# nothing; flowing data, line breaks and page breaks are appended to the
# object returned by #out (a String unless @out was set to something else).
module DefaultHandler
  # Layout hints; the default handler provides none (returns nil).
  def column_count; end
  def column_width; end

  # Whether the caller should try to detect columns.
  def identify_columns?
    false
  end

  # Callbacks ignored by default.
  def send_image(handle); end
  def new_font(font); end
  def new_fontsize(size); end
  def send_column; end
  def send_colspan; end
  def send_hr; end
  def send_paragraph; end

  # Returns the raw @out instance variable (nil if nothing was written
  # and no output object was assigned).
  def send_eof
    @out
  end

  def send_flowing_data(data)
    out << data
  end

  def send_line_break
    out << "\n"
  end

  def send_page
    out << "\n\n"
  end

  # Lazily creates the output buffer.
  def out
    @out || (@out = "")
  end
end
158
# Handler that uses the DefaultHandler callbacks unchanged and writes to
# a caller-supplied output object.
class SimpleHandler
  include DefaultHandler
  include SymbolMap

  # io:: object that receives the output via <<; defaults to a new String.
  def initialize(io = "")
    @out = io
  end
end
165
# Handler that collects the cells of every line of a page and, on
# send_page, writes them out left-justified to common column widths.
class ColumnHandler < SimpleHandler
  # outstream:: object that receives the output via <<
  # padding::   string every new cell starts with
  def initialize(outstream="", padding=' ')
    super(outstream)
    @lines = []
    @padding = padding
    send_line_break
  end

  # Returns an Array with one width per column index.
  #
  # Pass 1 right-strips every String cell and records, per column, the
  # longest cell that is followed by a non-nil cell or sits in the last
  # column. Pass 2 records, per column, the longest cell after
  # subtracting the pass-1 widths of the nil cells that directly follow
  # it. Every line is also extended so it has an entry at the last
  # column index.
  def column_widths
    all_lengths = []
    last_column = @lines.collect { |line| line.size }.push(1).max - 1
    @lines.each { |line|
      line.each_with_index { |column, idx|
        next unless column.is_a?(String)
        column.rstrip!
        if line[idx.next] || idx == last_column
          all_lengths[idx] = [column.length, all_lengths[idx].to_i].max
        end
      }
    }
    max_lengths = []
    @lines.each { |line|
      line.each_with_index { |column, idx|
        next unless column.is_a?(String)
        length = column.length
        idx2 = idx.next
        while line[idx2].nil? && (max = all_lengths[idx2])
          length -= max
          idx2 += 1
        end
        max_lengths[idx] = [length, max_lengths[idx].to_i].max
      }
      line[last_column] ||= nil
    }
    max_lengths
  end

  def identify_columns?
    true
  end

  # Advances to the next cell (skipping as many cells as were announced
  # through send_colspan) and makes sure that cell exists.
  def send_column
    @current_column += @colspan
    @colspan = 1
    @columns[@current_column] ||= if @padding.respond_to?(:foldcase)
      # NOTE(review): +u+ is not defined in this file; presumably supplied
      # by a unicode helper library -- confirm.
      u(@padding.dup)
    else
      @padding.dup
    end
  end

  def send_colspan
    @colspan += 1
  end

  def send_hr
    @columns << :hr
  end

  def send_image(image)
    @columns << :image
  end

  # Appends data to the current cell; send_column must have been called
  # for the current line first.
  def send_flowing_data(data)
    @columns[@current_column] << data
  end

  # Starts a new, empty line.
  def send_line_break
    @columns = []
    @lines.push(@columns)
    @current_column = -1
    @colspan = 1
  end

  # Writes all collected lines to the output, then resets the line
  # buffer and lets the superclass emit its page separator.
  def send_page
    max_lengths = column_widths
    @lines.each { |line|
      line.each_with_index { |column, idx|
        next unless column
        # a cell also occupies the width of the empty cells following it
        idx2 = idx.next
        accumulated = max_lengths[idx].to_i
        while (pad = max_lengths[idx2]) && !line[idx2]
          accumulated += pad
          idx2 += 1
        end
        case column
        when :image
          @out << " #IMAGE# ".ljust(accumulated)
        when :hr
          @out << @padding.dup.ljust(accumulated, '-')
        else
          @out << column.ljust(accumulated)
        end
      }
      @out << "\n"
    }
    @lines.clear
    send_line_break
    super
  end
end
263
# Handler that does not render anything. Every callback it receives is
# written to the output as a line of Ruby source that replays the call
# on an object named @writer.
class RecordingHandler
  # out::     object receiving the generated source via << (default $stdout)
  # columns:: value reported by identify_columns?
  def initialize(out = $stdout, columns=false)
    require 'yaml'
    @columns = columns
    @out = out
    @out << "require 'yaml'\n"
  end

  def identify_columns?
    @columns
  end

  # Fonts are recorded as a YAML dump wrapped in a heredoc, followed by
  # the replayed new_font call.
  def new_font(font)
    @out << "font = YAML.load <<-EOF\n" \
            "#{font.to_yaml}\n" \
            "EOF\n" \
            "@writer.new_font(font)\n"
  end

  # Every other callback is recorded with its inspected arguments.
  def method_missing(symbol, *args, &block)
    argstr = args.map { |arg| arg.inspect }.join(', ')
    @out << "@writer.#{symbol}(#{argstr})\n"
  end
end
288
# Handler that writes the text with simple HTML markup: <b>/<i> follow
# the font's bold?/italic? flags, <font face="Symbol"> and <pre> are
# opened for fonts whose base font name matches "symbol" / "courier".
class HTMLHandler
  include DefaultHandler
  include SymbolMap

  def initialize
    super
    # tracks which tags are currently open
    @state = {
      :italic => false,
      :bold   => false,
      :font   => false,
      :pre    => false,
    }
  end

  # Emits the tags needed to switch from the current font state to the
  # given font. A nil font is ignored.
  def new_font(font)
    return if font.nil?
    toggle_tag(:bold, 'b', font.bold?)
    toggle_tag(:italic, 'i', font.italic?)
    # font/pre wrappers of the previous font are always closed first
    toggle_tag(:font, 'font', false)
    toggle_tag(:pre, 'pre', false)
    font_name = font.basefont_name
    unless /symbol/i.match(font_name).nil?
      out << "<font face=\"Symbol\">"
      @state[:font] = true
    end
    unless /courier/i.match(font_name).nil?
      out << "<pre>"
      @state[:pre] = true
    end
  end

  # Uses #out instead of @out: @out is nil until #out has been called
  # once, so a line break as the very first callback used to raise.
  def send_line_break
    out << "<br>"
  end

  def send_paragraph
    out << "<p>"
  end

  def send_page
    out << "<p>"
  end

  private

  # Opens <tag> when +wanted+ is true and it is not open yet; closes it
  # when +wanted+ is false and it is currently open.
  def toggle_tag(key, tag, wanted)
    if wanted
      unless @state[key]
        out << "<#{tag}>"
        @state[key] = true
      end
    elsif @state[key]
      out << "</#{tag}>"
      @state[key] = false
    end
  end
end
352
+ end
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env ruby
2
+ # LZW -- rpdf2txt -- 09.07.2008 -- hwyss@ywesee.com
3
+
4
+ module Rpdf2txt
5
# Decoder for LZW-compressed data with variable code length (9 to 12
# bits, most significant bit first), using the CLEAR (256) and EOD (257)
# control codes.
class LZW
  CLEAR = 256
  EOD   = 257

  # Convenience wrapper: decodes +data+ with a fresh decoder.
  def self.decode(data, early_change=1)
    new(early_change).decode(data)
  end

  # early_change:: how many entries before a power of two the code
  #                length is increased (subtracted from the boundary)
  def initialize(early_change=1)
    @early_change = early_change
    # pristine table: the 256 single-byte strings plus two placeholders
    # for the control codes
    @__dict = (0..255).collect { |num| num.chr }.push(:clear, :eod)
    init_dictionary
  end

  # Resets the dictionary and the code length to their initial state.
  def init_dictionary
    @dictionary = @__dict.dup
    @code_length = 9
    @boundary = 512 - @early_change
  end

  # Decodes +data+ (a String of bytes) and returns the decoded String.
  # Decoding stops at EOD or when fewer bits than one code remain.
  # Raises RuntimeError ('Bad compressed code: ...') for a code that is
  # neither in the dictionary nor the next entry to be defined.
  def decode(data)
    bits, = data.unpack('B*')
    result = ''
    # code decoded in the previous step; nil at the start and after
    # CLEAR, because the first code of a run defines no new entry
    old_code = nil
    while (code = get_next_code(bits)) && code != EOD
      if code == CLEAR
        init_dictionary
        old_code = nil
        next
      end
      if (string = @dictionary[code])
        update_dictionary(@dictionary[old_code] + string[0, 1]) if old_code
      elsif old_code && code == @dictionary.size
        # the code refers to the entry that is being defined right now
        string = @dictionary[old_code]
        string += string[0, 1]
        update_dictionary(string)
      else
        raise 'Bad compressed code: %s' % code
      end
      result << string
      old_code = code
    end
    result
  end

  # Removes the next code from the front of +bits+ (a String of '0'/'1'
  # characters) and returns it as an Integer. Returns nil without
  # consuming anything when fewer than a full code remains, so trailing
  # pad bits are not mistaken for a code.
  def get_next_code(bits)
    return nil if bits.size < @code_length
    bits.slice!(0, @code_length).to_i(2)
  end

  # Appends +str+ to the dictionary and widens the code length (up to
  # 12 bits) once the dictionary reaches the current boundary.
  def update_dictionary(str)
    @dictionary.push(str)
    if @dictionary.size >= @boundary && @code_length < 12
      @code_length += 1
      @boundary = (2**@code_length - @early_change)
    end
    str
  end
end
69
+ end
@@ -0,0 +1,1114 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Rpdf2txt -- PDF to Text Parser
4
+ # Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
21
+ # hwyss@ywesee.com, aschrafl@ywesee.com
22
+ #
23
+ # PdfObject -- Rpdf2txt -- 21.11.2002 -- aschrafl@ywesee.com
24
+
25
+ require 'zlib'
26
+ require 'rpdf2txt/text'
27
+ require 'rpdf2txt/attributesparser'
28
+ require 'rpdf2txt/cmapparser'
29
+ require 'rpdf2txt/symbol'
30
+ require 'md5'
31
+ require 'matrix'
32
+
33
+ module Rpdf2txt
34
# Base class of all parsed PDF objects. Keeps the raw source of the
# object and the attribute dictionary parsed from it.
class PdfObject
  attr_reader :attributes
  attr_accessor :decoder, :src
  # reader is defined explicitly below (memoized from the source)
  attr_writer :oid

  # src::             raw source of the object, or nil
  # target_encoding:: stored in @target_encoding for subclasses
  def initialize(src=nil, target_encoding='utf8')
    @attributes = {}
    @src = src
    @target_encoding = target_encoding
    parse_attributes() unless @src.nil?
  end

  # Replaces attribute values that look like indirect references
  # ("<n> <g> R") by the object stored under <n> in +object_catalogue+.
  # Arrays are resolved element by element, in place; values without a
  # catalogue entry are left untouched.
  def build_tree(object_catalogue, parent=nil)
    @attributes.each { |key, value|
      case value
      when Array
        value.collect! { |obj| catalogue_object(object_catalogue, obj) || obj }
      when String
        if (obj = catalogue_object(object_catalogue, value))
          @attributes.store(key, obj)
        end
      end
    }
  end

  # Returns the catalogue entry for a reference String, or nil if
  # +reference+ is not a reference String.
  def catalogue_object(catalogue, reference)
    if reference.is_a?(String) && (match = /^(\d+)\s+\d+\s+R/n.match(reference))
      catalogue[match[1].to_i]
    end
  end

  # Abstract: always raises RuntimeError in this class.
  def decoded_stream
    raise "abstract method decoded_stream called in #{self.class}; built from source: \n #{@src.to_s.tr("\r", "\n")}"
  end

  # Object id: the assigned value, else the leading number of the source.
  def oid
    @oid ||= extract_oid(@src)
  end

  # First whitespace-preceded number of the source.
  def revision_id
    @revision_id ||= extract_revision_id(@src)
  end

  # Returns the part of the source from the first '<<' to the last '>>'
  # at or before the first occurrence of 'stream' (or before the end of
  # the source if there is none); nil if either delimiter is missing or
  # there is no source.
  def extract_attribute_stream
    return nil if @src.nil?
    lastindex = @src.index('stream') || -1
    index = @src.index('<<')
    rindex = @src.rindex('>>', lastindex)
    @src[index..(rindex+1)] if(index && rindex)
  end

  def _parse_attributes(src)
    Rpdf2txt.attributes_parser.parse(src)
  end

  # (Re)builds @attributes from the source. Without an attribute
  # dictionary the attributes are reset to an empty Hash.
  def parse_attributes
    src = self.extract_attribute_stream
    if src.nil?
      @attributes = {}
    else
      ast = self._parse_attributes(src)
      ast.compact!
      @attributes = extract_attributes(ast)
    end
  end

  private

  def extract_oid(string)
    /^\d+/n.match(string).to_s.to_i
  end

  def extract_revision_id(string)
    /\s\d+/n.match(string).to_s.to_i
  end

  # Converts a parsed attribute tree into plain Ruby values: unescaped
  # Strings, Arrays, and Hashes keyed by downcased Symbols (slashes
  # removed from the key). Nodes of any other shape are returned as is.
  def extract_attributes(ast)
    names = ast.children_names
    if names.include?('value')
      pdf_unescape(ast.value)
    elsif names.include?('text')
      # strip the surrounding delimiters
      pdf_unescape(ast.text.value[1...-1])
    elsif names.include?('values')
      ast.values.collect { |child| extract_attributes(child) }
    elsif names.include?('pairs')
      result = {}
      ast.pairs.each { |pair|
        k, v = pair
        keystr = k.value.strip.tr('/','')
        unless keystr.empty?
          result.store(keystr.downcase.intern, extract_attributes(v))
        end
      }
      result
    else
      ast
    end
  end

  # Escapes newline and carriage return as the two-character sequences
  # \n and \r.
  # NOTE(review): because String#gsub interprets backslash sequences in
  # the replacement string, the first and last substitution replace the
  # match by itself, i.e. backslashes and parentheses are NOT escaped.
  # Left unchanged on purpose: PdfEncrypt compares the result with the
  # (unescaped) user key, so changing it would alter decryption.
  def pdf_escape(input)
    input.gsub(/\\/, '\\\\').gsub(/\n/n, '\n')\
      .gsub(/\r/n, '\r').gsub(/[()]/n, '\\&')
  end

  # Reverses the escapes \n, \r, \), \( and \\ (in this order).
  def pdf_unescape(input)
    input.gsub(/\\n/n, "\n").gsub(/\\r/n, "\r").\
      gsub(/\\\)/n, ')').gsub(/\\\(/n, '(').gsub(/\\\\/n, '\\')
  end
end
125
# Encryption dictionary of a document. Derives the RC4 keys from the
# dictionary entries (:o, :p, :r, :u, :length) and the file id, and
# decrypts object streams. No user password is supported.
class PdfEncrypt < PdfObject
  class DecryptionError < RuntimeError
  end
  PADDING = "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A"

  # RC4: returns +input+ en-/decrypted with +key+ (both byte Strings).
  def arc4(key, input)
    output = ''
    # key scheduling; the key is repeated to fill 256 bytes
    s = (0..255).to_a
    k = (key*256)[0,256].unpack('C*')
    j = 0
    (0..255).each { |x|
      j = (j + s[x] + k[x]) % 256
      s[x], s[j] = s[j], s[x]
    }
    # key stream generation
    i = j = 0
    input.each_byte { |b|
      i = (i + 1) % 256
      j = (j + s[i]) % 256
      s[i], s[j] = s[j], s[i]
      output << (b ^ s[(s[i] + s[j])%256]).chr
    }
    output
  end

  # Computes the value the :u entry must have for +encryption_key+.
  def compute_user_key encryption_key
    if revision < 3
      pdf_escape arc4(encryption_key, PADDING)
    else
      crypt = Digest::MD5.digest PADDING + file_id
      # 20 rounds, each with the key bytes xor-ed with the round number
      20.times do |xor|
        key = encryption_key.unpack('C*').collect! do |byte|
          byte ^ xor
        end.pack('C*')
        crypt = arc4(key, crypt)
      end
      pdf_escape crypt
    end
  end

  # Returns the decrypted raw stream of +pdf_object+.
  def decrypt(pdf_object)
    arc4_key = decrypt_key(pdf_object)
    stream = pdf_object.raw_stream
    arc4(arc4_key, stream)
  end

  # Per-object key: MD5 over the encryption key, the low three bytes of
  # the object id and the low two bytes of the revision id, truncated to
  # keylength + 5 bytes (at most 16).
  def decrypt_key(pdf_object)
    # 'V' packs little-endian on every platform, so no manual byte
    # reversal is needed on big-endian machines
    oid_three_bytes = [pdf_object.oid].pack('V')[0,3]
    rev_id_two_bytes = [pdf_object.revision_id].pack('V')[0,2]
    input = encryption_key << oid_three_bytes << rev_id_two_bytes
    digest = Digest::MD5.digest(input)
    digest[0,[keylength + 5,16].min]
  end

  # true on big-endian platforms. No longer used internally; kept for
  # callers.
  def big_endian?
    [1].pack('I*') == "\000\000\000\001"
  end

  # Derives the document encryption key and verifies it against the
  # stored user key. Raises DecryptionError if they do not match.
  def encryption_key
    input_string = PADDING.dup
    ## we don't support a user-password. if we did, it would have to replace
    #  the first [n..32] bytes of the padding string here.
    input_string << owner_key
    input_string << permission_flag
    input_string << file_id
    ## revision >= 4: add 0xffffffff if document metadata is not encrypted
    digest = Digest::MD5.digest(input_string)
    uk = user_key
    if revision >= 3
      50.times do digest = Digest::MD5.digest(digest[0,keylength]) end
      uk = uk[0,16]
    end
    encryption_key = digest[0,keylength]
    test_key = compute_user_key encryption_key
    if(test_key != uk)
      raise DecryptionError, "test-key did not match user-key ('#{test_key.inspect}' / '#{uk.inspect}')"
    end
    encryption_key
  end

  def file_id= (file_id)
    @file_id = file_id
  end

  # The file id converted from its hexadecimal notation to bytes.
  def file_id
    [@file_id].pack("H*")
  end

  # Key length in bytes (:length is given in bits, default 40).
  def keylength
    @keylength ||= (@attributes[:length] || 40).to_i / 8
  end

  def owner_key
    @attributes[:o].to_s
  end

  # The :p entry as four bytes, least significant byte first.
  def permission_flag
    [@attributes[:p].to_i].pack('V')
  end

  def revision
    @attributes[:r].to_i
  end

  def user_key
    @attributes[:u].to_s
  end
end
234
# Encoding dictionary of a font; gives access to the glyph names listed
# in its :differences array.
class Encoding < PdfObject
  # Hash from character code to glyph name, built once from
  # @attributes[:differences]. An all-digit entry sets the code of the
  # name that follows it; each further name gets the next code. A
  # missing :differences array yields an empty Hash.
  def differences
    @differences ||= build_differences
  end

  # Returns +txt+ with every byte that has a differences entry replaced
  # by Symbol.byte(name), where that lookup yields a value.
  def convert_symbol(txt)
    res = ''
    txt.each_byte { |byte|
      if (name = differences[byte])
        byte = Symbol.byte(name) || byte
      end
      res << byte
    }
    res
  end

  # Glyph name registered for +byte+, or nil.
  def symbol_name(byte)
    differences[byte]
  end

  private

  def build_differences
    table = {}
    offset = 0
    idx = 0
    (@attributes[:differences] || []).each { |diff|
      if /^\d+$/n.match(diff)
        offset = diff.to_i
        # restart counting: the next name belongs to exactly this code
        idx = 0
      else
        table.store(offset + idx, diff[/\w+/n])
        idx += 1
      end
    }
    table
  end
end
266
# Font dictionary. Answers questions about style (bold?, italic?,
# symbol?), encoding and character widths.
class Font < PdfObject
  attr_accessor :cmap, :descriptor, :rendering_mode, :skewed
  ENCODINGS = {
    '/Identity-H'       => 'ascii',
    '/MacRomanEncoding' => 'mac',
    '/UTF8'             => 'utf8',
    '/WinAnsiEncoding'  => 'ms-ansi',
  }
  # matches the "C <code> ; WX <width> ; N <name>" part of an AFM line
  AFM_PTRN = /^C\s*(\d+)\s*;\s*
              WX\s*(\d+)\s*;\s*
              N\s*(\w+)/xn

  def basefont_name
    @attributes[:basefont]
  end

  # Width of +char+ (code or glyph name) from the bundled AFM metrics.
  def basefont_width(char)
    basefont_widths[char]
  end

  def basefont_widths
    @basefont_widths ||= load_basefont_widths
  end

  def bold?
    (!!/bold/in.match(basefont_name.to_s)) || @rendering_mode == "2"
  end

  # Resolves references like PdfObject#build_tree and additionally
  # builds the tree of the font descriptor, if there is one.
  def build_tree(object_catalogue, parent = nil)
    super
    if (desc = @attributes[:descriptor])
      @descriptor = desc
      desc.build_tree(object_catalogue, self)
    end
  end

  # Returns the Encoding object if the font has one, else the name
  # mapped through ENCODINGS. Unknown names are reported with a warning
  # and returned unchanged.
  def encoding
    enc = @attributes[:encoding]
    if enc.is_a?(Encoding)
      enc
    else
      ENCODINGS.fetch(enc) {
        warn "unknown encoding #{enc}"
        enc
      }
    end
  end

  def italic?
    @skewed || !!/italic/in.match(basefont_name.to_s)
  end

  # Width of +char+; a one-character String is reduced to char[0] first.
  # Falls back to the width of the glyph name the Encoding object
  # assigns to +char+. Returns nil if no width is known.
  def width(char)
    if char.is_a?(String) && char.length == 1
      char = char[0]
    end
    _width(char) || named_width(char)
  end

  def widths
    @widths ||= (@attributes[:widths] || [])
  end

  def symbol?
    !!/symbol/in.match(basefont_name.to_s)
  end

  def to_unicode
    @to_unicode ||= (tu = @attributes[:tounicode]) && tu.to_cmap
  end

  private

  def first_char
    @attributes[:firstchar].to_i
  end

  # Reads data/fonts/<basefont_name>.afm next to this file, if readable,
  # and returns a Hash from character code and from glyph name to width.
  def load_basefont_widths
    widths = {}
    path = File.join(File.dirname(__FILE__), 'data', 'fonts',
                     "%s.afm" % basefont_name)
    if File.readable?(path)
      File.read(path).scan(AFM_PTRN) { |char, width, name|
        widths.store(char.to_i, width)
        widths.store(name, width)
      }
    end
    widths
  end

  def named_width(char)
    enc = @attributes[:encoding]
    if enc.is_a?(Encoding)
      _width(enc.symbol_name(char))
    end
  end

  # Looks +char+ up in the font's own widths array (Integer codes only),
  # then in the base font metrics. Returns an Integer or nil.
  def _width(char)
    width = nil
    if char.is_a?(Integer)
      idx = char - first_char
      # a negative index would silently read from the end of the array
      width = widths.at(idx) if idx >= 0
    end
    width ||= basefont_width(char)
    width.to_i if width
  end
end
353
+ # PdfObject subclass without behaviour of its own.
+ # NOTE(review): presumably the type of the object Font stores as its
+ # descriptor -- confirm against the parser that instantiates it.
+ class FontDescriptor < PdfObject
354
+ end
355
# Object of a type that has no class of its own.
class Unknown < PdfObject
  # Returns the first number that follows "obj" in the source, converted
  # with String#to_i(*args). Returns 0 if the source has no such number;
  # previously a base argument raised ArgumentError in that case.
  def to_i(*args)
    match = /obj\s*(\d+)/n.match(@src)
    match ? match[1].to_i(*args) : 0
  end
end
360
# Resource dictionary of a page. Resolves the objects listed under its
# :font and :xobject entries and offers them by key.
class Resource < PdfObject
  # src:: a Hash of ready-made attributes, the raw source String of the
  #       object, or anything else for an empty resource
  def initialize(src=nil, target_encoding='utf8')
    case src
    when Hash
      @attributes = src
    when String
      super
    else
      @attributes = {}
    end
    @fonts = {}
    @xobjects = {}
  end

  def build_tree(object_catalogue, parent=nil)
    super
    [:font, :xobject].each { |type| build_attributes(type, object_catalogue) }
  end

  def font(key)
    @fonts[key]
  end

  def xobject(key)
    @xobjects[key]
  end

  private

  # Fills @fonts / @xobjects from the attribute of the same name, which
  # may be a Hash or a PdfHash (then its contents are used).
  def build_attributes(type, object_catalogue)
    storage = instance_variable_get("@#{type}s")
    attribute = @attributes[type]
    case attribute
    when Hash
      _build_attributes(storage, attribute, object_catalogue)
    when PdfHash
      _build_attributes(storage, attribute.contents, object_catalogue)
    end
  end

  # Stores, for every key, the catalogue object whose id is the leading
  # number of the value (nil if the catalogue has none); objects that
  # are found get their tree built first.
  def _build_attributes(storage, hash, object_catalogue)
    hash.each { |key, val|
      oid = /^\d+/n.match(val).to_s.to_i
      obj = object_catalogue[oid]
      obj.build_tree(object_catalogue) if obj
      storage.store(key, obj)
    }
  end
end
402
# Trailer dictionary of a document: file id plus the object ids of the
# encryption dictionary and of the root object.
class TrailerDictionary < PdfObject
  # First run of alphanumeric characters in the first :id entry.
  def file_id
    first_id = @attributes[:id].first
    /[a-zA-Z0-9]+/n.match(first_id).to_s
  end

  def encrypt_id
    extract_oid(@attributes[:encrypt])
  end

  # Disabled alternative parser, kept for reference:
  #
  #   def parse_attributes
  #     index = @src.index('trailer')
  #     rindex = @src.rindex('startxref')
  #     unless(index && rindex)
  #       @attibutes = {}
  #     else
  #       #set correct offsets <<(begin) >>(end)
  #       src = @src[index+7..(rindex-1)]
  #       ast = Rpdf2txt.attributes_parser.parse(src)
  #       ast.compact!
  #       @attributes = extract_attributes(ast)
  #     end
  #   end

  def root_id
    extract_oid(@attributes[:root])
  end

  # Merges the attributes of another trailer dictionary into this one;
  # entries of +trailer_dict+ win.
  def update(trailer_dict)
    @attributes.update(trailer_dict.attributes)
  end

  protected

  # only other trailer dictionaries may read the attributes (see #update)
  attr_reader :attributes
end
433
# Common base of the nodes of the page tree. Enumerable; a plain node
# enumerates just itself.
class TreeNode < PdfObject
  include Enumerable
  attr_reader :parent

  # Resolves references like PdfObject#build_tree, remembers +parent+
  # and returns self.
  def build_tree(object_catalogue, parent=nil)
    super
    @parent = parent
    self
  end

  def each
    yield self
  end

  # Returns the first number of every entry of +array+ as an Integer;
  # entries without a number are dropped. nil yields an empty Array and
  # a single value is treated as a one-element list, so a missing
  # attribute or a lone reference String no longer raises.
  def extract_oids(array)
    Array(array).collect { |dirty_id|
      if (match = /\d+/on.match(dirty_id))
        match[0].to_i
      end
    }.compact
  end

  # true if the node has neither a parent object nor a :parent attribute.
  def root?
    !(@parent || @attributes[:parent])
  end
end
455
# Catalog node: entry point into the page tree referenced by its :pages
# attribute.
class CatalogNode < TreeNode
  # Looks up the pages object in the catalogue and builds its tree with
  # this node as parent, before resolving this node's own references.
  def build_tree(object_catalogue, parent=nil)
    pages_id = extract_oids(@attributes[:pages]).first
    @pages = object_catalogue[pages_id]
    @pages.build_tree(object_catalogue, self)
    super
  end

  # Enumerates whatever the pages object enumerates.
  def each(&block)
    @pages.each(&block)
  end
end
466
# Intermediate node of the page tree; owns the nodes referenced by its
# :kids attribute.
class PageNode < TreeNode
  attr_reader :kids

  # Collects the kids from the catalogue and builds their trees with
  # this node as parent. References to objects that are not in the
  # catalogue are skipped instead of raising on nil.
  def build_tree(object_catalogue, parent=nil)
    @kids = []
    extract_oids(@attributes[:kids]).each { |id|
      child = object_catalogue[id]
      next if child.nil?
      @kids.push(child)
      child.build_tree(object_catalogue, self)
    }
    super
  end

  # Enumerates everything the kids enumerate, in order.
  def each
    @kids.each { |kid|
      kid.each { |result| yield result }
    }
  end

  # The :mediabox attribute as an Array of Floats. If this node has
  # none, the media box of the parent is used when the parent offers
  # one (MediaBox is an inheritable page attribute); otherwise nil.
  def media_box
    if (mb = @attributes[:mediabox])
      mb.collect { |val| val.to_f }
    elsif @parent.respond_to?(:media_box)
      @parent.media_box
    end
  end
end
488
+ class PageLeaf < TreeNode
489
+ attr_reader :contents, :resources
490
+ def initialize(*args)
491
+ super
492
+ @text_state = TextState.new(@target_encoding)
493
+ end
494
+ def build_tree(object_catalogue, parent=nil)
495
+ @contents=[]
496
+ extract_oids(@attributes[:contents]).each{ |id|
497
+ content = object_catalogue[id]
498
+ @contents.push(content)
499
+ content.build_tree(object_catalogue, self) if content.respond_to?(:build_tree)
500
+ }
501
+ resources = @attributes[:resources]
502
+ if(resources.is_a? String)
503
+ @resources = object_catalogue[extract_oids([resources]).first]
504
+ elsif resources.is_a? Hash
505
+ @resources = Resource.new(resources)
506
+ elsif(resources.nil? && @parent)
507
+ @resources = @parent.resources
508
+ else
509
+ @resources = Resource.new()
510
+ end
511
+ @resources.build_tree(object_catalogue) if @resources.is_a? Resource
512
+ super
513
+ end
514
+ def font(key)
515
+ @resources.font(key)
516
+ end
517
+ def media_box
518
+ if(parent)
519
+ parent.media_box
520
+ end
521
+ end
522
+ def text(callback_handler)
523
+ concat_stream = Stream.new('')
524
+ if(@contents.size == 1 && @contents.first.is_a?(ReferenceArray))
525
+ @contents.first.build_stream(concat_stream)
526
+ else
527
+ @contents.each { |stream|
528
+ concat_stream.append(stream.decoded_stream)
529
+ }
530
+ end
531
+ @text_state.media_box = self.media_box
532
+ text_snippets = concat_stream.extract_text_objects(self, @text_state)
533
+ join_snippets(text_snippets, callback_handler)
534
+ end
535
+ private
536
+ def each_pair(text_snippets, &block)
537
+ text_snippets.inject(nil) { |last_text_state, text_state|
538
+ if text_state.whitespace_overlap?(last_text_state)
539
+ last_text_state
540
+ else
541
+ block.call(last_text_state, text_state)
542
+ text_state
543
+ end
544
+ }
545
+ end
546
# Guesses the x-positions at which columns start.
#
# text_snippets:: the snippets of one page
# hints::         optional Hash; :count asks for that many columns,
#                 :width is either a number or a "dividend/divisor" String
#                 that is applied to the summed column distances
#
# Returns a sorted Array of x-positions (grid index * grid width).
def identify_columns(text_snippets, hints={})
  # the narrowest positive space width on the page is the grid width
  space_widths = text_snippets.collect { |snip| snip.space_width }
  space = space_widths.select { |w| w > 0 }.min || 100.0

  # count how many snippets start in each grid cell
  positions = {}
  each_pair(text_snippets) do |last_text_state, text_state|
    next if text_state.empty?
    next if last_text_state && text_state.same_column(last_text_state) && !last_text_state.empty?
    idx = (text_state.x / space).floor
    positions[idx] = positions[idx].to_i + 1
  end
  return [] if positions.empty?

  # drop leading cells up to and including the first one seen more than twice
  sorted = positions.sort
  previous = count = nil
  loop do
    previous, count = sorted.shift
    break if count.nil? || count > 2
  end

  # sum the distances between the remaining cells seen more than twice
  total = 0
  width = 0
  counts = []
  sorted.each do |pos, hits|
    counts.push [hits, pos]
    next unless hits > 2
    total += 1
    width += (pos - previous)
    previous = pos
  end

  # an explicit column count wins: take the most frequent cells
  colcount = hints[:count]
  if colcount && counts.size >= colcount
    busiest = counts.sort[-colcount..-1]
    return busiest.collect { |_hits, pos| pos * space }.sort
  end

  cutwidth = hints[:width]
  if cutwidth.is_a?(String)
    dividend, divisor = cutwidth.split('/', 2)
    cutwidth = width * dividend.to_f / divisor.to_f
  end
  cutwidth ||= total.nonzero? ? width / total * 0.9 : width

  # select probable columns
  previous = -cutwidth
  res = []
  sorted = positions.sort
  offset = sorted.first.first
  sorted.each_with_index do |(pos, hits), idx|
    pos -= offset
    # look ahead for the next cell seen more than three times
    ndx = idx.next
    nxtpos = nxtcount = nil
    loop do
      nxtpos, nxtcount = sorted[ndx]
      ndx += 1
      break if nxtcount.nil? || nxtcount > 3
    end
    nxtpos -= offset if nxtpos
    # must be seen more than once and lie more than cutwidth past the last pick ...
    next unless hits > 1 && (pos - previous) > cutwidth
    # ... and must not be followed, within cutwidth, by a busier cell
    next if nxtcount.to_i > hits && (nxtpos - pos) < cutwidth
    previous = pos
    res.push pos + offset
  end
  res.collect { |pos| pos * space }.sort
end
613
# Sorts +text_snippets+ in place and replays them to +callback_handler+,
# emitting send_column / send_colspan notifications at line starts and at
# column boundaries. Column detection only runs when the handler's
# identify_columns? is true. Returns the value of each_pair.
def join_snippets(text_snippets, callback_handler)
  text_snippets.sort!
  columns = []
  if callback_handler.identify_columns?
    hints = {
      :width => callback_handler.column_width,
      :count => callback_handler.column_count,
    }
    columns = identify_columns(text_snippets, hints)
    # throw away the first column - the left media edge is used instead
    columns.shift
  end
  upcoming = nil
  remaining = []
  each_pair(text_snippets) do |previous, current|
    current.fire_early_callbacks(previous, callback_handler)
    same_line = previous && current.same_line(previous)
    unless same_line
      # new line: start over with the full column list
      remaining = columns.dup
      upcoming = remaining.shift
      previous = nil
      callback_handler.send_column
    end
    right = previous && previous.right_edge.to_i
    while upcoming && current.x.to_i >= upcoming.to_i
      # a non-empty predecessor reaching past the boundary spans the column
      spans = right && (right > upcoming.to_i) && !previous.empty?
      if spans
        callback_handler.send_colspan
      else
        callback_handler.send_column
      end
      upcoming = remaining.shift
    end
    current.send_content(previous, callback_handler)
  end
end
646
+ end
647
+ class Stream < PdfObject
648
# Building blocks for the content-stream scanners below.
# A numeric operand followed by optional whitespace:
number = "([0-9.-]+)\\s*"
# six numeric operands followed by the operator cm:
matrix_op = "#{number * 6}cm\\b"
# a name followed by the operator Do:
xobject_op = '(/\S+)\s*(\bDo\b)'
# Matches, in this order: a cm operation, a lone q or Q, a Do operation,
# or a BI ... ID ... EI sequence.
@@nontext_scan_pattern = %r!(?:#{matrix_op})|(\b[qQ]\b)|#{xobject_op}|\bBI\b(.*?)\bID\b(.*?)\b(EI)\b!mn
# Two numeric operands followed by the operator l or m.
@@hr_scan_pattern = /#{number}#{number}(\b[lm]\b)/mn
# BT / ET tokens that are not followed by a ")" reachable without passing
# an unescaped "(" - presumably to ignore tokens inside a string literal.
BT_PATTERN = /\bBT\b(?!(\\[()]|[^(\\])*\))/mn
ET_PATTERN = /\bET\b(?!(\\[()]|[^(\\])*\))/mn
# An input that ends in ET while a "(" opened before it has not been closed.
FAIL_PTRN = /\((\\[()]|[^)])*\bET\b\s*$/mn
656
# Appends +decoded_stream+ to the decoded content held so far (starting
# from an empty String if there is none) and returns the accumulated String.
def append(decoded_stream)
  @decoded_stream ||= ''
  @decoded_stream << decoded_stream
end
659
# Replaces the decoded content with +decoded_stream+; a later call to
# decoded_stream returns this value instead of decoding the raw stream.
def decoded_stream=(decoded_stream)
  @decoded_stream = decoded_stream
end
662
# Returns the decoded content, decoding the raw stream on first use and
# remembering the result.
def decoded_stream
  return @decoded_stream if @decoded_stream
  @decoded_stream = decode_raw_stream
end
665
# Scans +dm_src+ for "x y m" and "x y l" operations and pushes a
# HorizontalRule onto +result+ for every l whose y equals the previous
# point's y while its x differs.
#
# dm_src::  String to scan
# dmatrix:: matrix handed to each HorizontalRule
# result::  Array that receives the rules
def extract_horizontal_rules(dm_src, dmatrix, result)
  pen_x = 0
  pen_y = 0
  dm_src.scan(@@hr_scan_pattern) do |x_src, y_src, operator|
    case operator.to_s[-1]
    when ?m
      # m only moves the current point
      pen_x = x_src.to_f
      pen_y = y_src.to_f
    when ?l
      x = x_src.to_f
      y = y_src.to_f
      if x != pen_x && y == pen_y
        rule = HorizontalRule.new(x, y, dmatrix)
        rule.current_page, rule.text_state = @page, @text_state
        result.push(rule)
      end
      pen_x = x
      pen_y = y
    end
  end
end
686
# Scans +dm_src+ for the operations matched by @@nontext_scan_pattern and
# applies them in order:
#
# q::        pushes the current matrix onto +stack+
# Q::        restores the matrix from +stack+
# name Do::  pushes an ImagePlacement for +name+ onto +result+, positioned
#            at the last entry of +result+ (or 0/0 if there is none)
# BI..ID..EI:: pushes an ImagePlacement wrapping a new InlineImage
# .. cm::    multiplies the current matrix with the six operands
#
# Returns the matrix that is current after the last operation.
def extract_nontext_objects(dm_src, dmatrix, stack, result)
  dm_src.scan(@@nontext_scan_pattern) { |matches|
    matches = matches.compact
    case matches.last
    when 'q'
      stack.push(dmatrix)
    when 'Q'
      # A Q without a matching q leaves nothing to pop. Keep the current
      # matrix in that case: a nil matrix would make the next cm fail.
      dmatrix = stack.pop || dmatrix
    when 'Do'
      x, y = (txt = result.last) ? [txt.x, txt.y] : [0, 0]
      ip = ImagePlacement.new(matches[-2], x, y, dmatrix)
      ip.current_page, ip.text_state = @page, @text_state
      result.push ip
    when 'EI'
      attrs, data, _ = matches
      im = InlineImage.new attrs, data.strip
      ip = ImagePlacement.new im, 0, 0, dmatrix
      ip.current_page, ip.text_state = @page, @text_state
      result.push ip
    else
      # everything else is a cm operation: six operands a b c d e f
      mmatrix = Matrix[[matches[0].to_f, matches[1].to_f,0],
                       [matches[2].to_f, matches[3].to_f,0],
                       [matches[4].to_f, matches[5].to_f,1]]
      dmatrix = dmatrix * mmatrix
    end
  }
  dmatrix
end
714
# Splits the decoded stream into its BT ... ET sections and returns an
# Array with the snippets scanned from every section, interleaved with
# the image placements and horizontal rules found between the sections.
#
# page::       the page the stream belongs to (may be nil); its :rotate
#              attribute determines the initial matrix
# text_state:: handed to every object that is created
#
# Returns an empty Array when the stream contains no usable ET.
def extract_text_objects(page, text_state)
  @page, @text_state = page, text_state
  stack = []
  result = []
  startpoint = decoded_stream.index(BT_PATTERN)
  endpoint = decoded_stream.index(ET_PATTERN)
  # Skip every ET that FAIL_PTRN places inside an unterminated "(".
  # endpoint is nil when the stream has no (further) ET, so it has to be
  # checked before it is used in arithmetic.
  while endpoint && FAIL_PTRN.match(decoded_stream[0..(endpoint+2)])
    endpoint = decoded_stream.index(ET_PATTERN, endpoint.next)
  end
  # without an ET there is no text section to process
  return result unless endpoint
  unless(startpoint && (startpoint < endpoint))
    startpoint = 0
  end
  rotation = (page && Math::PI * page.attributes[:rotate].to_f / 180) || 0
  dmatrix = Matrix[[Math.cos(rotation),Math.sin(rotation),0],
                   [Math.sin(rotation),-Math.cos(rotation),0],
                   [0,0,1]]

  dm_src = decoded_stream[0...startpoint]
  while(endpoint && startpoint)
    ### pick out the bits in between Text that are relevant to
    ### text positioning (such as the device-transformation-matrix)
    ### NOTE: as far as I understand, the device matrix should
    ###       not be used to position text. However it is used
    ###       by some PDF-Creators and therefore we have to include
    ###       it in our calculations.
    dmatrix = extract_nontext_objects(dm_src, dmatrix, stack, result)
    extract_horizontal_rules(dm_src, dmatrix, result)
    tsrc = decoded_stream[startpoint..(endpoint+2)]
    while FAIL_PTRN.match(tsrc)
      # NOTE(review): when no further ET exists, endpoint becomes -1 and
      # the slice below ends at index 1 - confirm that dropping the
      # unterminated section is intended.
      endpoint = decoded_stream.index(ET_PATTERN, endpoint + 2) || -1
      tsrc = decoded_stream[startpoint..(endpoint+2)]
    end
    text = Text.new(tsrc, @target_encoding, dmatrix)
    text.current_page = page
    text.text_state = text_state
    result.concat text.scan
    startpoint = decoded_stream.index(BT_PATTERN, endpoint)
    if(startpoint)
      dm_src = decoded_stream[endpoint...startpoint]
      endpoint = decoded_stream.index(ET_PATTERN, startpoint)
    end
  end
  result
end
758
# Returns the bytes between the "stream" keyword (plus one or two line
# break characters) and "endstream" in @src, or an empty String when
# @src holds no such section. The result is memoised.
def raw_stream
  # join (rather than to_s) flattens the nested scan result into the
  # captured text on every Ruby version; Array#to_s yields the inspect
  # form since Ruby 1.9.
  @raw_stream ||= @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).join
end
761
# Decrypts the raw stream (when a decoder is set) and runs it through
# every filter named in @attributes[:filter], in order. A filter that
# fails or is not implemented is reported with warn and skipped, i.e. the
# data is passed on unchanged. Returns the resulting data.
def decode_raw_stream
  @decrypted_stream = raw_stream
  @decrypted_stream = @decoder.decrypt(self) unless @decoder.nil?
  filters = [@attributes[:filter]].flatten.compact
  filters.inject(@decrypted_stream) do |stream, filter|
    begin
      if "/FlateDecode" == filter
        flate_decode stream
      elsif "/LZWDecode" == filter
        lzw_decode stream
      else
        raise "Unimplemented filter: #{filter}"
      end
    rescue StandardError => err
      warn "'#{err.message}' when filtering with #{filter}"
      stream
    end
  end
end
783
# Inflates zlib-compressed +data+ and returns the uncompressed String.
# Errors raised by Zlib are not handled here.
def flate_decode(data)
  Zlib::Inflate.inflate(data)
end
786
# Decodes LZW-compressed +data+ with LZW.decode. The data is cut to
# @attributes[:length] when that attribute is set; the :earlychange entry
# of @attributes[:decodeparms] is passed along and defaults to 1.
def lzw_decode(data)
  require 'rpdf2txt/lzw'
  params = @attributes[:decodeparms]
  earlychange = params && params[:earlychange]
  declared_length = @attributes[:length]
  data = data[0, declared_length.to_i] if declared_length
  LZW.decode(data, (earlychange || 1).to_i)
end
794
# Builds a CMap from this object's source and target encoding.
def to_cmap
  CMap.new(@src, @target_encoding)
end
797
+ end
798
# A Stream subclass that adds no behaviour of its own; it only gives
# these objects a distinct class.
class ObjStream < Stream
end
800
+ class Image < Stream
801
# Maps colour space names to the map String that #image hands to
# Magick::Image.constitute. #image also uses the length of that String
# as the number of components per pixel.
COLORMAPS = {
  '/DeviceRGB'  => 'RGB',
  '/DeviceGray' => 'I',
  '/DeviceCMYK' => 'CMYK',
}
806
# Builds (once) and returns a Magick::Image from the decoded stream,
# using the :width, :height, :bitspercomponent, :mask and :colorspace
# attributes.
def image
  require 'RMagick'
  return @image if @image
  width = @attributes[:width].to_i
  height = @attributes[:height].to_i
  depth = @attributes[:bitspercomponent].to_i
  mask = @attributes[:mask]
  max_value = 2 ** depth - 1
  colorspace, basespace, hival, lookup = @attributes[:colorspace]
  hival = hival.to_i
  channel_map = COLORMAPS[colorspace] || COLORMAPS[basespace] || 'RGB'
  channels = channel_map.length
  samples = extract_pixels(decoded_stream, depth)
  if '/Indexed' == colorspace
    ## FIXME: this works for some images, but seems to be wrong
    #         according to the Documentation
    if mask.is_a?(Array) && (samples.size - 1) > height * width
      masked = (mask[0].to_i)..(mask[1].to_i)
      samples.delete_if { |idx| masked.include? idx }
    end
    # for indexed images the third colorspace entry correctly describes
    # the depth of the resulting pixels, whereas bitspercomponent may
    # not be accurate
    max_value = hival
    palette = extract_colormap(lookup, hival)
    # replace every index by its palette entry
    expanded = Array.new(samples.size * channels)
    cursor = 0
    samples.each do |idx|
      expanded[cursor, channels] = palette[idx * channels, channels]
      cursor += channels
    end
    samples = expanded
  end
  ## this seems to be undocumented: PNG-images need to be decoded.
  #  we can detect this by the additional Byte per Row:
  if samples.size == (width * channels + 1) * height
    samples = idat_decode samples, width, channels
  elsif samples.size > (height * width * channels)
    samples = samples[0, height * width * channels]
  end
  # scale the samples unless they already use the full quantum range
  if max_value != (2 ** Magick::QuantumDepth - 1)
    divisor = max_value.to_f
    samples.collect! { |px| px / divisor }
  end
  @image = Magick::Image.constitute(width, height, channel_map, samples)
end
854
# Reverses PNG row filtering.
#
# data::   Array of byte values; every row consists of one filter-type
#          byte followed by width * colors sample bytes. The Array is
#          read only, it is not modified.
# width::  number of pixels per row
# colors:: number of bytes per pixel
#
# Returns a new Array with the unfiltered sample bytes of all rows.
# Raises ArgumentError for a filter type other than 0..4.
def idat_decode(data, width, colors)
  byte_width = width * colors
  scanline_length = byte_width + 1 # the filter-type byte leads each row

  pixels = []
  data.each_slice(scanline_length).with_index do |scanline, row|
    filter = scanline.first
    row_data = scanline[1..-1]
    case filter
    when 0 # None
    when 1 # Sub
      row_data.each_with_index do |byte, index|
        # row_data is updated in place, so left is already unfiltered
        left = index < colors ? 0 : row_data[index - colors]
        row_data[index] = (byte + left) % 256
      end
    when 2 # Up
      row_data.each_with_index do |byte, index|
        # the previous row occupies the last byte_width entries of pixels
        upper = row == 0 ? 0 : pixels[index - byte_width]
        row_data[index] = (upper + byte) % 256
      end
    when 3 # Average
      row_data.each_with_index do |byte, index|
        upper = row == 0 ? 0 : pixels[index - byte_width]
        left = index < colors ? 0 : row_data[index - colors]
        row_data[index] = (byte + ((left + upper)/2).floor) % 256
      end
    when 4 # Paeth
      row_data.each_with_index do |byte, index|
        left = index < colors ? 0 : row_data[index - colors]
        if row == 0
          upper = upper_left = 0
        else
          upper_idx = index - byte_width
          upper = pixels[upper_idx]
          upper_left = index < colors ? 0 : pixels[upper_idx - colors]
        end
        row_data[index] = (byte + paeth(left, upper, upper_left)) % 256
      end
    else
      raise ArgumentError, "Invalid filter algorithm #{filter}"
    end

    pixels.concat row_data
  end
  pixels
end
907
+ private
908
# Turns the lookup table +index+ into an Array of values:
# a Stream is unpacked into its bytes, each of them ANDed with +mask+;
# an Array is returned as it is; anything else yields an empty Array.
def extract_colormap(index, mask)
  case index
  when Stream
    index.decoded_stream.unpack('C*').collect { |int| int & mask }
  when Array
    index
  else
    []
  end
end
917
# Splits the String +stream+ into sample values of +depth+ bits each.
# With a depth of 8 every byte is one sample; otherwise the bit string is
# cut into groups of +depth+ bits (an incomplete trailing group is
# dropped). Returns an Array of Integers.
def extract_pixels(stream, depth)
  return stream.unpack('C*') if 8 == depth
  bits, = stream.unpack('B*')
  bits.scan(/.{#{depth}}/n).collect { |group| group.to_i(2) }
end
930
# Paeth predictor: returns whichever of +a+ (left), +b+ (above) and +c+
# (upper left) is closest to a + b - c. Ties are resolved in the order
# a, b, c.
def paeth(a, b, c) # left, above, upper left
  estimate = a + b - c
  dist_a = (estimate - a).abs
  dist_b = (estimate - b).abs
  dist_c = (estimate - c).abs

  if dist_a <= dist_b && dist_a <= dist_c
    a
  elsif dist_b <= dist_c
    b
  else
    c
  end
end
940
+ end
941
# An Image built from the attributes and data found between BI ... ID and
# ID ... EI in a content stream. Abbreviated attribute keys and values
# are expanded to their long forms after parsing.
class InlineImage < Image
  # abbreviated attribute key => full attribute key
  ATTR_ABBREVIATIONS = {
    :bpc => :bitspercomponent,
    :cs  => :colorspace,
    :d   => :decode,
    :dp  => :decodeparms,
    :f   => :filter,
    :h   => :height,
    :im  => :imagemask,
    :i   => :interpolate,
    :w   => :width,
  }
  # abbreviated attribute value => full attribute value
  OTHER_ABBREVIATIONS = {
    '/G'    => '/DeviceGray',
    '/RGB'  => '/DeviceRGB',
    '/CMYK' => '/DeviceCMYK',
    '/I'    => '/Indexed',
    '/AHx'  => '/ASCIIHexDecode',
    '/A85'  => '/ASCII85Decode',
    '/LZW'  => '/LZWDecode',
    '/Fl'   => '/FlateDecode',
    '/RL'   => '/RunLengthDecode',
    '/CCF'  => '/CCITTFaxDecode',
    '/DCT'  => '/DCTDecode',
  }
  # attrs:: the attribute source without the enclosing "<<" and ">>"
  # data::  the image data; becomes the raw stream
  def initialize(attrs, data)
    source = "<<".dup << attrs << ">>"
    super(source)
    @raw_stream = data
  end
  # Runs the inherited parser, then moves every abbreviated key that has
  # a truthy value to its full key; a value that is itself a known
  # abbreviation is replaced by its long form.
  def parse_attributes
    super
    ATTR_ABBREVIATIONS.each do |short, long|
      value = @attributes.delete(short)
      next unless value
      @attributes.store long, OTHER_ABBREVIATIONS.fetch(value, value)
    end
  end
end
975
# A Stream holding a character map. On construction the first bfchar
# section and the first bfrange section of the decoded stream are parsed
# into #map, a Hash from source code to target code.
class CMap < Stream
  attr_accessor :map
  def initialize(*args)
    @map = {}
    super
    parse_cmap()
  end
  # Translates +txt+ through the map.
  # An Integer is looked up directly (nil when unmapped). A String is
  # translated byte by byte - unmapped bytes are kept - and packed as
  # UTF-8. Without a map +txt+ is returned unchanged.
  def to_utf8(txt)
    return txt if @map.nil?
    return @map[txt] if txt.is_a?(Integer)
    codepoints = txt.unpack('C*').collect { |byte| @map.fetch(byte, byte) }
    codepoints.pack('U*')
  end
  private
  # bfchar definition: every child maps one source code to one target code
  def add_to_map_bfchar(ast)
    ast.compact!
    ast.each do |child|
      # convert to decimal values
      source = _hexvalue(child.source)
      target = _hexvalue(child.target)
      @map.store(source, target)
    end
    @map
  end
  # bfrange definition, see page 457 of the pdf manual:
  # maps the codes start..stop either to an explicit list of targets or
  # to consecutive targets beginning at offset
  def add_to_map_bfrange(ast)
    ast.compact!
    first_code = ast.start.value.to_s.hex
    last_code = ast.stop.value.to_s.hex
    if ast.children_names.include?('explicit')
      targets = ast.explicit
      (first_code..last_code).each do |code|
        @map.store(code, _hexvalue(targets.shift))
      end
    else
      target = _hexvalue(ast.offset)
      (first_code..last_code).each do |code|
        @map.store(code, target)
        target += 1
      end
    end
    @map
  end
  # Returns the text between the first "beginbfchar" (skipping 12
  # characters from its start) and the first "endbfchar", or nil when the
  # stream has no bfchar section.
  def extract_bfchar
    src = decoded_stream
    start = src.index('beginbfchar')
    return nil if start.nil?
    stop = src.index('endbfchar')
    src[(start + 12)..(stop - 1)]
  end
  # Returns the text between the first "beginbfrange" (skipping 12
  # characters from its start) and the first "endbfrange", or nil when
  # the stream has no bfrange section.
  def extract_bfrange
    src = decoded_stream
    start = src.index('beginbfrange')
    return nil if start.nil?
    stop = src.index('endbfrange')
    src[(start + 12)..(stop - 1)]
  end
  # Interprets the value of +ast+ as a hexadecimal number.
  def _hexvalue(ast)
    ast.value.to_s.to_i(16)
  end
  # Fills the map from the bfchar and bfrange sections, if present.
  def parse_cmap
    chars = extract_bfchar
    add_to_map_bfchar(Rpdf2txt.cmap_parser.parse(chars)) if chars
    ranges = extract_bfrange
    if ranges
      Rpdf2txt.cmap_range_parser.parse(ranges).each do |node|
        add_to_map_bfrange(node)
      end
    end
  end
end
1053
# A tree node whose source is an array of object references. Its
# contents are the referenced objects that exist in the catalogue.
class ReferenceArray < TreeNode
  # Collects the referenced objects; ids missing from +object_catalogue+
  # are skipped.
  def build_tree(object_catalogue, parent=nil)
    @contents = []
    @references.each do |oid|
      next unless object_catalogue[oid]
      @contents.push(object_catalogue[oid])
    end
    super
  end
  # Appends the decoded stream of every content object to
  # +concat_stream+ and returns +concat_stream+.
  def build_stream(concat_stream)
    @contents.each do |part|
      concat_stream.append(part.decoded_stream)
    end
    concat_stream
  end
  # Parses the text from the first "[" to the last "]" of the source
  # and stores the object ids found in it.
  def parse_attributes
    first = @src.index('[')
    last = @src.rindex(']')
    ast = _parse_attributes(@src[first..last])
    ast.compact!
    @references = extract_oids(extract_attributes(ast))
  end
  def root?
    false
  end
end
1077
# A tree node whose source is an array; the parsed elements are kept in
# @contents.
class PdfArray < TreeNode
  # Resets the contents to an empty Array before delegating to super.
  def build_tree(object_catalogue, parent=nil)
    @contents = []
    super
  end
  # Returns the element at position +idx+.
  def at(idx)
    @contents.at(idx)
  end
  # Iterates over the elements.
  def each(&block)
    @contents.each(&block)
  end
  # Parses the text from the first "[" to the last "]" of the source
  # and stores the extracted values.
  def parse_attributes
    first = @src.index('[')
    last = @src.rindex(']')
    ast = _parse_attributes(@src[first..last])
    ast.compact!
    @contents = extract_attributes(ast)
  end
  def root?
    false
  end
end
1098
# A tree node whose source is a dictionary; the parsed entries are
# readable through #contents.
class PdfHash < TreeNode
  attr_reader :contents
  # Resets the contents to an empty Hash before delegating to super.
  def build_tree(object_catalogue, parent=nil)
    @contents = {}
    super
  end
  # Parses the text from the first "<<" to the last ">" of the source
  # and stores the extracted values.
  def parse_attributes
    first = @src.index('<<')
    last = @src.rindex('>')
    ast = _parse_attributes(@src[first..last])
    ast.compact!
    @contents = extract_attributes(ast)
  end
  def root?
    false
  end
end
1114
+ end