rpdf2txt 0.8.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/LICENCE +515 -0
- data/Manifest.txt +126 -0
- data/README.txt +30 -0
- data/Rakefile +24 -0
- data/bin/rpdf2txt +58 -0
- data/config.save +12 -0
- data/install.rb +1098 -0
- data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
- data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
- data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
- data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
- data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
- data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
- data/lib/rpdf2txt-rockit/grammar.rb +644 -0
- data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
- data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
- data/lib/rpdf2txt-rockit/indexable.rb +53 -0
- data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
- data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
- data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
- data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
- data/lib/rpdf2txt-rockit/profiler.rb +168 -0
- data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
- data/lib/rpdf2txt-rockit/rockit.rb +76 -0
- data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
- data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
- data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
- data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
- data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
- data/lib/rpdf2txt-rockit/token.rb +364 -0
- data/lib/rpdf2txt-rockit/version.rb +3 -0
- data/lib/rpdf2txt/attributesparser.rb +42 -0
- data/lib/rpdf2txt/cmapparser.rb +65 -0
- data/lib/rpdf2txt/data/_cmap.grammar +11 -0
- data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/cmap.grammar +11 -0
- data/lib/rpdf2txt/data/cmap.rb +37 -0
- data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/cmap_range.rb +43 -0
- data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
- data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
- data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
- data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
- data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
- data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
- data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
- data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
- data/lib/rpdf2txt/data/pdftext.grammar +102 -0
- data/lib/rpdf2txt/data/pdftext.rb +146 -0
- data/lib/rpdf2txt/default_handler.rb +352 -0
- data/lib/rpdf2txt/lzw.rb +69 -0
- data/lib/rpdf2txt/object.rb +1114 -0
- data/lib/rpdf2txt/parser.rb +169 -0
- data/lib/rpdf2txt/symbol.rb +408 -0
- data/lib/rpdf2txt/text.rb +182 -0
- data/lib/rpdf2txt/text_state.rb +434 -0
- data/lib/rpdf2txt/textparser.rb +42 -0
- data/test/data/3392_obj +0 -0
- data/test/data/397_decrypted +15 -0
- data/test/data/450_decrypted +153 -0
- data/test/data/450_obj +0 -0
- data/test/data/452_decrypted +125 -0
- data/test/data/454_decrypted +108 -0
- data/test/data/456_decrypted +106 -0
- data/test/data/458_decrypted +111 -0
- data/test/data/458_obj +0 -0
- data/test/data/460_decrypted +118 -0
- data/test/data/460_obj +0 -0
- data/test/data/463_decrypted +117 -0
- data/test/data/465_decrypted +107 -0
- data/test/data/465_obj +0 -0
- data/test/data/90_obj +0 -0
- data/test/data/90_obj_comp +1 -0
- data/test/data/decrypted +0 -0
- data/test/data/encrypt_obj +0 -0
- data/test/data/encrypt_string +0 -0
- data/test/data/encrypt_string_128bit +0 -0
- data/test/data/encrypted_object_stream.pdf +0 -0
- data/test/data/firststream +1 -0
- data/test/data/index.pdfobj +0 -0
- data/test/data/index_2bit.pdfobj +0 -0
- data/test/data/index_masked.pdfobj +0 -0
- data/test/data/indexed.pdfobj +0 -0
- data/test/data/indexed_2bit.pdfobj +0 -0
- data/test/data/indexed_masked.pdfobj +0 -0
- data/test/data/inline.png +0 -0
- data/test/data/logo.png +0 -0
- data/test/data/lzw.pdfobj +0 -0
- data/test/data/lzw_index.pdfobj +0 -0
- data/test/data/page_tree.pdf +148 -0
- data/test/data/pdf_20.png +0 -0
- data/test/data/pdf_21.png +0 -0
- data/test/data/pdf_22.png +0 -0
- data/test/data/pdf_50.png +0 -0
- data/test/data/png.pdfobj +0 -0
- data/test/data/space_bug_stream.txt +119 -0
- data/test/data/stream.txt +292 -0
- data/test/data/stream_kerning_bug.txt +13 -0
- data/test/data/stream_kerning_bug2.txt +6 -0
- data/test/data/test.pdf +0 -0
- data/test/data/test.txt +8 -0
- data/test/data/test_text.txt +42 -0
- data/test/data/working_obj +0 -0
- data/test/data/working_obj2 +0 -0
- data/test/mock.rb +149 -0
- data/test/suite.rb +30 -0
- data/test/test_pdf_object.rb +1802 -0
- data/test/test_pdf_parser.rb +1340 -0
- data/test/test_pdf_text.rb +789 -0
- data/test/test_space_bug_05_2004.rb +87 -0
- data/test/test_stream.rb +194 -0
- data/test/test_text_state.rb +315 -0
- data/usage-en.txt +112 -0
- data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
- data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
- metadata +220 -0
@@ -0,0 +1,352 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Rpdf2txt -- PDF to Text Parser
|
4
|
+
# Copyright (C) 2004 Mike Walder, Raphael Waltert, Hannes Wyss
|
5
|
+
#
|
6
|
+
# This library is free software; you can redistribute it and/or
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
8
|
+
# License as published by the Free Software Foundation; either
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
# Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with this library; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
#
|
20
|
+
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
21
|
+
# hwyss@ywesee.com, mwalder@ywesee.com, rwaltert@ywesee.com
|
22
|
+
#
|
23
|
+
# PdfParser -- Rpdf2txt-- 14.11.2002 -- mwalder@ywesee.com, rwaltert@ywesee.com
|
24
|
+
|
25
|
+
module Rpdf2txt
  # Integer-to-integer lookup table used by the text handlers.
  # NOTE(review): the keys look like Unicode code points (Greek letters,
  # math operators) and the values like byte positions in the Adobe Symbol
  # font -- confirm against the code that consults this table.
  module SymbolMap
    # this map is not complete!!
    SYMBOL_ENTITIES = {
      3    => 65,
      8804 => 163,
      8805 => 179,
      # alpha-omega (upper case)
      913 => 65,
      914 => 66,
      915 => 71,
      916 => 68,
      917 => 69,
      918 => 90,
      919 => 72,
      920 => 81,
      921 => 73,
      922 => 75,
      923 => 76,
      924 => 77,
      925 => 78,
      926 => 88,
      927 => 79,
      928 => 80,
      929 => 82,
      931 => 83,
      932 => 84,
      933 => 85,
      934 => 70,
      935 => 67,
      936 => 89,
      # alpha-omega (lower case)
      945 => 97,
      946 => 98,
      947 => 103,
      948 => 100,
      949 => 101,
      950 => 122,
      951 => 104,
      952 => 113,
      953 => 105,
      954 => 107,
      955 => 108,
      956 => 109,
      957 => 110,
      958 => 120,
      959 => 111,
      960 => 112,
      961 => 114,
      963 => 115,
      964 => 116,
      965 => 117,
      966 => 102,
      967 => 99,
      968 => 121,
      969 => 119,
      # dot?
      8901 => 46,
      # intersection (cap)
      8745 => 199,
      # union (cup)
      8746 => 200,
      # infinity
      8734 => 165,
      # integral
      8747 => 166,
      # partial differential
      8706 => 182,
      # not equal to
      8800 => 185,
      # equal
      61 => 186,
      # almost equal to
      8776 => 187,
      # superset of
      8835 => 201,
      # superset or equal to
      8839 => 202,
      # not a subset of
      8836 => 203,
      # subset of
      8834 => 204,
      # subset or equal to
      8838 => 205,
      # element of
      8712 => 206,
      # not an element of
      8713 => 207,
      # n-ary product
      8719 => 213,
      # radic
      8730 => 214,
      # n-ary sum
      8721 => 229,
    }
  end

  # Mixin with a do-nothing implementation of every handler callback.
  # Only text-producing callbacks append to the lazily created buffer
  # returned by #out; all other callbacks return nil.
  module DefaultHandler
    def column_count
    end

    def column_width
    end

    # Plain handlers do not ask for column detection.
    def identify_columns?
      false
    end

    def send_image(handle)
    end

    def new_font(font)
    end

    def new_fontsize(size)
    end

    def send_column
    end

    def send_colspan
    end

    # Returns the raw @out instance variable (nil if nothing was ever
    # written and no buffer was supplied).
    def send_eof
      @out
    end

    def send_flowing_data(data)
      out << data
    end

    def send_hr
    end

    def send_line_break
      out << "\n"
    end

    def send_paragraph
    end

    def send_page
      out << "\n\n"
    end

    # Output buffer; created as an empty String on first use.
    def out
      @out ||= ""
    end
  end

  # Handler that writes plain text to the object passed in (anything that
  # responds to <<).
  class SimpleHandler
    include DefaultHandler
    include SymbolMap

    def initialize(io="")
      @out = io
    end
  end

  # Handler that buffers a page as rows of columns and, on #send_page,
  # writes each row with its cells left-justified to a common width.
  class ColumnHandler < SimpleHandler
    # outstream:: target that receives the formatted text via <<
    # padding::   string every new cell starts with (also the seed of :hr)
    def initialize(outstream="", padding=' ')
      super(outstream)
      @lines = []
      @padding = padding
      send_line_break
    end

    # Computes the width of each column index for the buffered rows.
    # Side effects: every String cell is right-stripped in place and every
    # row is extended with nil up to the index of the last column.
    def column_widths
      last_column = @lines.collect { |line| line.size }.push(1).max - 1

      # Pass 1: widest cell per index, counting only cells that are
      # followed by another cell or that sit in the very last column.
      all_lengths = []
      @lines.each do |line|
        line.each_with_index do |column, idx|
          next unless column.is_a?(String)
          column.rstrip!
          if line[idx.next] || idx == last_column
            all_lengths[idx] = [column.length, all_lengths[idx].to_i].max
          end
        end
      end

      # Pass 2: a cell followed by empty slots may borrow their width, so
      # that width is subtracted before taking the per-index maximum.
      max_lengths = []
      @lines.each do |line|
        line.each_with_index do |column, idx|
          next unless column.is_a?(String)
          length = column.length
          idx2 = idx.next
          while line[idx2].nil? && (max = all_lengths[idx2])
            length -= max
            idx2 += 1
          end
          max_lengths[idx] = [length, max_lengths[idx].to_i].max
        end
        line[last_column] ||= nil
      end
      max_lengths
    end

    def identify_columns?
      true
    end

    # Advances to the next cell (skipping @colspan slots) and makes sure
    # it holds a fresh copy of the padding string.
    def send_column
      @current_column += @colspan
      @colspan = 1
      # NOTE(review): `u` is not defined in this file; presumably supplied
      # by a unicode library whenever Strings respond to :foldcase.
      @columns[@current_column] ||= if @padding.respond_to?(:foldcase)
                                      u(@padding.dup)
                                    else
                                      @padding.dup
                                    end
    end

    def send_colspan
      @colspan += 1
    end

    def send_hr
      @columns << :hr
    end

    def send_image(image)
      @columns << :image
    end

    def send_flowing_data(data)
      @columns[@current_column] << data
    end

    # Starts a new, empty row.
    def send_line_break
      @columns = []
      @lines.push(@columns)
      @current_column = -1
      @colspan = 1
    end

    # Flushes the buffered rows to @out, one text line per row, then
    # resets the buffer and lets the superclass emit its page separator.
    def send_page
      max_lengths = column_widths
      @lines.each do |line|
        line.each_with_index do |column, idx|
          next unless column
          idx2 = idx.next
          accumulated = max_lengths[idx].to_i
          # absorb the width of directly following empty slots
          while (pad = max_lengths[idx2]) && !line[idx2]
            accumulated += pad
            idx2 += 1
          end
          case column
          when :image
            @out << " #IMAGE# ".ljust(accumulated)
          when :hr
            @out << @padding.dup.ljust(accumulated, '-')
          else
            @out << column.ljust(accumulated)
          end
        end
        @out << "\n"
      end
      @lines.clear
      send_line_break
      super
    end
  end

  # Handler that, instead of rendering, writes Ruby source replaying every
  # callback it receives against an object called @writer.
  class RecordingHandler
    # out::     target that receives the generated source via <<
    # columns:: value reported by #identify_columns?
    def initialize(out = $stdout, columns=false)
      require 'yaml'
      @out = out
      @out << "require 'yaml'\n"
      @columns = columns
    end

    def identify_columns?
      @columns
    end

    # Fonts are serialised as YAML inside a heredoc of the generated
    # script because their #inspect output is not valid Ruby.
    def new_font(font)
      @out << "font = YAML.load <<-EOF\n" \
              "#{font.to_yaml}\n" \
              "EOF\n" \
              "@writer.new_font(font)\n"
    end

    # Every other callback is recorded as a literal call with its
    # arguments rendered through #inspect.
    def method_missing(symbol, *args, &block)
      argstr = args.collect { |arg| arg.inspect }.join(', ')
      @out << "@writer.#{symbol}(#{argstr})\n"
    end
  end

  # Handler producing simple HTML: <b>/<i> follow the font's bold?/italic?
  # flags, Symbol fonts are wrapped in <font face="Symbol"> and Courier
  # fonts in <pre>.
  # NOTE(review): text is appended unescaped by the inherited
  # send_flowing_data -- confirm callers do not expect HTML escaping.
  class HTMLHandler
    include DefaultHandler
    include SymbolMap

    def initialize
      super
      @state = {
        :italic => false,
        :bold   => false,
        :font   => false,
        :pre    => false,
      }
    end

    # Emits the open/close tags needed to move from the current state to
    # the one described by +font+. A nil font is ignored.
    def new_font(font)
      return if font.nil?
      switch_tag(:bold, "<b>", "</b>", font.bold?)
      switch_tag(:italic, "<i>", "</i>", font.italic?)
      # font/pre wrappers are always closed and re-opened as required
      switch_tag(:font, nil, "</font>", false)
      switch_tag(:pre, nil, "</pre>", false)
      font_name = font.basefont_name
      unless /symbol/i.match(font_name).nil?
        out << "<font face=\"Symbol\">"
        @state[:font] = true
      end
      unless /courier/i.match(font_name).nil?
        out << "<pre>"
        @state[:pre] = true
      end
    end

    # Goes through #out (not @out) so it also works as the very first
    # callback on a fresh handler, when @out is still nil.
    def send_line_break
      out << "<br>"
    end

    def send_paragraph
      out << "<p>"
    end

    def send_page
      out << "<p>"
    end

    private

    # Writes +open_tag+ or +close_tag+ only when @state[key] differs from
    # +wanted+, and records the new state.
    def switch_tag(key, open_tag, close_tag, wanted)
      if wanted
        unless @state[key]
          out << open_tag
          @state[key] = true
        end
      elsif @state[key]
        out << close_tag
        @state[key] = false
      end
    end
  end
end
|
data/lib/rpdf2txt/lzw.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# LZW -- rpdf2txt -- 09.07.2008 -- hwyss@ywesee.com
|
3
|
+
|
4
|
+
module Rpdf2txt
  # Decoder for LZW-compressed byte strings with variable code width
  # (9 to 12 bits), a clear-table code (256) and an end-of-data code (257).
  class LZW
    CLEAR = 256
    EOD = 257

    # Convenience wrapper: decodes +data+ with a throw-away decoder.
    def self.decode(data, early_change=1)
      new(early_change).decode(data)
    end

    # early_change:: how many entries before a power of two the code width
    #                grows (1 switches to 10 bits once 511 entries exist).
    def initialize(early_change=1)
      @early_change = early_change
      # 256 single-byte strings plus placeholders for CLEAR and EOD so the
      # first free dictionary index is 258.
      @__dict = (0..255).collect { |num| num.chr }.push(:clear, :eod)
      init_dictionary
    end

    # Resets the dictionary, the code width and the next growth boundary.
    def init_dictionary
      @dictionary = @__dict.dup
      @code_length = 9
      @boundary = 512 - @early_change
    end

    # Decodes +data+ (a String of compressed bytes) and returns the
    # decompressed String. Decoding stops at EOD or when fewer bits than
    # one full code remain. Raises RuntimeError on a code that is neither
    # in the dictionary nor the next index to be assigned.
    def decode(data)
      bits, = data.unpack('B*')
      # start from a clean table so an instance can be reused
      init_dictionary
      result = ''.dup
      # nil means "no previous string": true at the start and after CLEAR
      old_code = nil
      while (code = get_next_code(bits)) && code != EOD
        if code == CLEAR
          init_dictionary
          old_code = nil
          next
        end
        if old_code.nil?
          # the first code after a reset must be a literal byte and does
          # not create a dictionary entry
          string = @dictionary[code]
          unless string.is_a?(String)
            raise('Bad compressed code: %s' % code)
          end
          result << string
        elsif (string = @dictionary[code])
          result << string
          update_dictionary(@dictionary[old_code] + string[0, 1])
        elsif code == @dictionary.size
          # code refers to the entry that is about to be created:
          # previous string plus its own first byte
          string = @dictionary[old_code]
          string += string[0, 1]
          result << string
          update_dictionary(string)
        else
          raise('Bad compressed code: %s' % code)
        end
        old_code = code
      end
      result
    end

    # Removes the next code (@code_length bits) from the front of the bit
    # string +bits+ and returns it as an Integer. Returns nil, leaving
    # +bits+ untouched, when fewer than @code_length bits remain, so
    # byte-alignment padding is never mistaken for a code.
    def get_next_code(bits)
      return nil if bits.size < @code_length
      bits.slice!(0, @code_length).to_i(2)
    end

    # Appends +str+ to the dictionary and widens the code length (up to
    # 12 bits) once the dictionary reaches the current boundary.
    def update_dictionary(str)
      @dictionary.push(str)
      if @dictionary.size >= @boundary && @code_length < 12
        @code_length += 1
        @boundary = (2**@code_length - @early_change)
      end
      str
    end
  end
end
|
@@ -0,0 +1,1114 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Rpdf2txt -- PDF to Text Parser
|
4
|
+
# Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
|
5
|
+
#
|
6
|
+
# This library is free software; you can redistribute it and/or
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
8
|
+
# License as published by the Free Software Foundation; either
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
# Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with this library; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
#
|
20
|
+
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
21
|
+
# hwyss@ywesee.com, aschrafl@ywesee.com
|
22
|
+
#
|
23
|
+
# PdfObject -- Rpdf2txt -- 21.11.2002 -- aschrafl@ywesee.com
|
24
|
+
|
25
|
+
require 'zlib'
|
26
|
+
require 'rpdf2txt/text'
|
27
|
+
require 'rpdf2txt/attributesparser'
|
28
|
+
require 'rpdf2txt/cmapparser'
|
29
|
+
require 'rpdf2txt/symbol'
|
30
|
+
require 'md5'
|
31
|
+
require 'matrix'
|
32
|
+
|
33
|
+
module Rpdf2txt
|
34
|
+
# Base class for objects read from a PDF file. Keeps the raw source text
# of the object (@src) and the attribute dictionary parsed from it.
class PdfObject
  attr_reader :attributes
  attr_accessor :decoder, :src
  attr_writer :oid

  # src::             raw source of the object; attributes are parsed
  #                   from it immediately unless it is nil
  # target_encoding:: stored in @target_encoding; not used in this class
  def initialize(src=nil, target_encoding='utf8')
    @attributes = {}
    @src = src
    @target_encoding = target_encoding
    parse_attributes() unless @src.nil?
  end

  # Replaces attribute values that look like indirect references
  # ("12 0 R") with the object stored under that id in +object_catalogue+.
  # Arrays are resolved element by element, in place; unresolved entries
  # are left unchanged.
  def build_tree(object_catalogue, parent=nil)
    @attributes.each { |key, value|
      case value
      when Array
        value.collect! { |obj| catalogue_object(object_catalogue, obj) || obj }
      when String
        if obj = catalogue_object(object_catalogue, value)
          @attributes.store(key, obj)
        end
      end
    }
  end

  # Returns catalogue[id] when +reference+ is a String starting with
  # "<id> <generation> R", otherwise nil.
  def catalogue_object(catalogue, reference)
    if reference.is_a?(String) && (match = /^(\d+)\s+\d+\s+R/n.match(reference))
      catalogue[match[1].to_i]
    end
  end

  # Always raises RuntimeError: subclasses with a stream override this.
  def decoded_stream
    # to_s keeps the intended message even when the object has no source
    raise "abstract method decoded_stream called in #{self.class}; built from source: \n #{@src.to_s.tr("\r", "\n")}"
  end

  # Object id: the explicitly assigned value, or the leading number of
  # the source (memoized).
  def oid
    @oid ||= extract_oid(@src)
  end

  # First whitespace-preceded number of the source (memoized).
  def revision_id
    @revision_id ||= extract_revision_id(@src)
  end

  # Returns the text from the first "<<" up to and including the last
  # ">>" that starts at or before the first occurrence of "stream" (or
  # the end of the source), or nil if either delimiter is missing.
  def extract_attribute_stream
    lastindex = @src.index('stream') || -1
    index = @src.index('<<')
    rindex = @src.rindex('>>', lastindex)
    @src[index..(rindex+1)] if(index && rindex)
  end

  def _parse_attributes(src)
    Rpdf2txt.attributes_parser.parse(src)
  end

  # (Re)builds @attributes from @src; an object without a dictionary
  # ends up with an empty Hash.
  def parse_attributes
    src = self.extract_attribute_stream
    if(src.nil?)
      @attributes = {}
    else
      ast = self._parse_attributes(src)
      ast.compact!
      @attributes = extract_attributes(ast)
    end
  end

  private

  def extract_oid(string)
    /^\d+/n.match(string).to_s.to_i
  end

  def extract_revision_id(string)
    /\s\d+/n.match(string).to_s.to_i
  end

  # Converts a syntax-tree node into plain Ruby data, dispatching on the
  # node's child names: 'value' and 'text' become unescaped Strings,
  # 'values' an Array, 'pairs' a Hash keyed by downcased Symbols (keys
  # that are empty after removing "/" are skipped). Any other node is
  # returned unchanged.
  def extract_attributes(ast)
    if(ast.children_names.include?('value'))
      pdf_unescape(ast.value)
    elsif(ast.children_names.include?('text'))
      # [1...-1] drops the first and last character of the text value
      pdf_unescape(ast.text.value[1...-1])
    elsif(ast.children_names.include?('values'))
      ast.values.collect { |child| extract_attributes(child) }
    elsif(ast.children_names.include?('pairs'))
      result = {}
      ast.pairs.each { |pair|
        k, v = pair
        keystr = k.value.strip.tr('/','')
        unless(keystr.empty?)
          result.store(keystr.downcase.intern, extract_attributes(v))
        end
      }
      result
    else
      ast
    end
  end

  # NOTE(review): in gsub replacement strings '\\\\' yields one backslash
  # and '\\&' yields the match itself, so only "\n" and "\r" are actually
  # rewritten here. Left untouched because key verification elsewhere
  # compares against the output of this method -- confirm before changing.
  def pdf_escape(input)
    input.gsub(/\\/, '\\\\').gsub(/\n/n, '\n')\
      .gsub(/\r/n, '\r').gsub(/[()]/n, '\\&')
  end

  # Turns the escape sequences \n, \r, \), \( and \\ into the characters
  # they stand for, applied in that order.
  def pdf_unescape(input)
    input.gsub(/\\n/n, "\n").gsub(/\\r/n, "\r").\
      gsub(/\\\)/n, ')').gsub(/\\\(/n, '(').gsub(/\\\\/n, '\\')
  end
end
|
125
|
+
# Encryption dictionary of a PDF file. Derives the RC4 keys used to
# decrypt object streams from the dictionary's attributes (:o, :u, :p,
# :r, :length) and the file id assigned through #file_id=.
class PdfEncrypt < PdfObject
  # Raised when the derived key does not reproduce the stored user key.
  class DecryptionError < RuntimeError
  end

  # 32-byte padding string appended to / used in place of a password.
  PADDING = "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A"

  # RC4 stream cipher: returns +input+ XORed with the key stream derived
  # from +key+ (so the same call encrypts and decrypts).
  def arc4(key, input)
    output = ''
    # key schedule; the key is repeated to fill 256 bytes
    s, j, k = (0..255).to_a, 0, (key*256)[0,256].unpack('C*')
    (0..255).each { |x|
      j = (j + s[x] + k[x]) % 256
      s[x], s[j] = s[j], s[x]
    }
    i = j = 0
    input.each_byte { |b|
      i = (i + 1) % 256
      j = (j + s[i]) % 256
      s[i], s[j] = s[j], s[i]
      output << (b ^ s[(s[i] + s[j])%256]).chr
    }
    output
  end

  # Computes the value the :u entry is compared against for the given
  # +encryption_key+. For revision < 3 this is the RC4-encrypted padding;
  # otherwise the MD5 of padding + file id, RC4-encrypted 20 times with
  # the key XORed by the round number. The result is passed through
  # pdf_escape.
  def compute_user_key encryption_key
    if revision < 3
      pdf_escape arc4(encryption_key, PADDING)
    else
      crypt = Digest::MD5.digest PADDING + file_id
      20.times do |xor|
        key = encryption_key.unpack('C*').collect! do |byte|
          byte ^ xor
        end.pack('C*')
        crypt = arc4(key, crypt)
      end
      pdf_escape crypt
    end
  end

  # Returns the decrypted raw stream of +pdf_object+.
  def decrypt(pdf_object)
    arc4_key = decrypt_key(pdf_object)
    stream = pdf_object.raw_stream
    arc4(arc4_key, stream)
  end

  # Per-object key: MD5 of the encryption key followed by the low three
  # bytes of the object id and the low two bytes of the revision id,
  # truncated to min(keylength + 5, 16) bytes.
  def decrypt_key(pdf_object)
    oid = pdf_object.oid
    rev_id = pdf_object.revision_id
    # if it is a ppc we use reverse, so the low-order bytes come first
    if(self.big_endian?)
      oid_three_bytes = [oid].pack('I*').reverse[0,3]
      rev_id_two_bytes = [rev_id].pack('I*').reverse[0,2]
    else
      oid_three_bytes = [oid].pack('I*')[0,3]
      rev_id_two_bytes = [rev_id].pack('I*')[0,2]
    end
    input = encryption_key << oid_three_bytes << rev_id_two_bytes
    digest = Digest::MD5.digest(input)
    digest[0,[keylength + 5,16].min]
  end

  # True when the native integer layout puts the most significant byte
  # first (big endian: ppc, little endian: x86).
  def big_endian?
    [1].pack('I*') == "\000\000\000\001"
  end

  # Derives the document encryption key from the padding string (empty
  # user password), owner key, permission flags and file id, and checks
  # it by recomputing the user key.
  # Raises DecryptionError if that check fails.
  def encryption_key
    input_string = PADDING.dup
    ## we don't support a user-password. if we did, it would have to replace
    #  the first [n..32] bytes of the padding string here.
    input_string << owner_key
    input_string << permission_flag
    input_string << file_id
    ## revision >= 4: add 0xffffffff if document metadata is not encrypted
    digest = Digest::MD5.digest(input_string)
    uk = user_key
    if revision >= 3
      50.times do digest = Digest::MD5.digest(digest[0,keylength]) end
      # only the first 16 bytes of :u are compared for these revisions
      uk = uk[0,16]
    end
    encryption_key = digest[0,keylength]
    test_key = compute_user_key encryption_key
    if(test_key != uk)
      raise DecryptionError, "test-key did not match user-key ('#{test_key.inspect}' / '#{uk.inspect}')"
    end
    encryption_key
  end

  # Stores the file id as a hex string.
  def file_id= (file_id)
    @file_id = file_id
  end

  # The file id as raw bytes (the stored hex string packed with "H*").
  def file_id
    [@file_id].pack("H*")
  end

  # Key length in bytes: :length / 8, with :length defaulting to 40 bits.
  def keylength
    @keylength ||= (@attributes[:length] || 40).to_i / 8
  end

  def owner_key
    @attributes[:o].to_s
  end

  # The :p entry packed as a native unsigned int, low-order byte first.
  def permission_flag
    if (self.big_endian?)
      [@attributes[:p].to_i].pack('I*').reverse
    else
      [@attributes[:p].to_i].pack('I*')
    end
  end

  def revision
    @attributes[:r].to_i
  end

  def user_key
    @attributes[:u].to_s
  end
end
|
234
|
+
# Encoding dictionary of a font: maps character codes to glyph names via
# its :differences attribute.
class Encoding < PdfObject
  # Returns (and memoizes) a Hash from character code to glyph name.
  # The :differences list alternates start codes (all-digit entries) and
  # glyph names; each name is stored under the current start code plus
  # its position in the run following that code. A missing :differences
  # attribute yields an empty Hash.
  def differences
    @differences ||= begin
      table = {}
      offset = 0
      idx = 0
      (@attributes[:differences] || []).each { |diff|
        if(/^\d+$/n.match(diff))
          offset = diff.to_i
          # a new start code begins a new run of consecutive codes
          idx = 0
        else
          table.store(offset + idx, diff[/\w+/n])
          idx += 1
        end
      }
      table
    end
  end

  # Returns a copy of +txt+ in which every byte that has an entry in
  # #differences is replaced by Symbol.byte(name); bytes without an
  # entry, or whose name Symbol.byte does not resolve, are kept.
  def convert_symbol(txt)
    res = ''
    txt.each_byte { |byte|
      if(name = differences[byte])
        byte = Symbol.byte(name) || byte
      end
      res << byte
    }
    res
  end

  # Glyph name registered for +byte+, or nil.
  def symbol_name(byte)
    differences[byte]
  end
end
|
266
|
+
# Font dictionary. Answers style questions (bold?, italic?, symbol?) from
# the base font name and looks up glyph widths, first in the font's own
# :widths array and then in the AFM metrics file named after the base font.
class Font < PdfObject
  attr_accessor :cmap, :descriptor, :rendering_mode, :skewed

  # Encoding names as they appear in :encoding, mapped to the names
  # returned by #encoding.
  ENCODINGS = {
    '/Identity-H'       => 'ascii',
    '/MacRomanEncoding' => 'mac',
    '/UTF8'             => 'utf8',
    '/WinAnsiEncoding'  => 'ms-ansi',
  }

  # Matches one AFM character-metrics line, capturing character code (C),
  # width (WX) and glyph name (N).
  AFM_PTRN = /^C\s*(\d+)\s*;\s*
              WX\s*(\d+)\s*;\s*
              N\s*(\w+)/xn

  def basefont_name
    @attributes[:basefont]
  end

  # Width from the AFM table; +char+ may be a character code or a glyph
  # name. The stored value is the String captured from the AFM file.
  def basefont_width(char)
    basefont_widths[char]
  end

  def basefont_widths
    @basefont_widths ||= load_basefont_widths
  end

  # True if the base font name contains "bold" (any case) or the
  # rendering mode is "2".
  def bold?
    (!!/bold/in.match(basefont_name.to_s)) || @rendering_mode == "2"
  end

  # Resolves references like PdfObject#build_tree and additionally
  # exposes the :descriptor attribute as #descriptor, building its tree
  # with this font as parent.
  def build_tree(object_catalogue, parent = nil)
    super
    if(desc = @attributes[:descriptor])
      @descriptor = desc
      desc.build_tree(object_catalogue, self)
    end
  end

  # Returns the Encoding object when :encoding holds one, otherwise the
  # ENCODINGS entry for it. Unknown values are returned as they are,
  # after a warning.
  def encoding
    enc = @attributes[:encoding]
    if(enc.is_a?(Encoding))
      enc
    else
      ENCODINGS.fetch(enc) {
        warn "unknown encoding #{enc}"
        enc
      }
    end
  end

  # True if flagged as skewed or the base font name contains "italic".
  def italic?
    @skewed || !!/italic/in.match(basefont_name.to_s)
  end

  # Width of +char+ as an Integer, or nil if unknown. A one-character
  # String is reduced with String#[] first; the encoding's glyph name for
  # the character is tried when the direct lookup finds nothing.
  # NOTE(review): `char[0]` yields a byte Integer on Ruby 1.8 but a
  # String on 1.9+ -- confirm the Ruby version this is meant to run on.
  def width(char)
    if(char.is_a?(String) && char.length == 1)
      char = char[0]
    end
    _width(char) || named_width(char)
  end

  def widths
    @widths ||= (@attributes[:widths] || [])
  end

  # True if the base font name contains "symbol" (any case).
  def symbol?
    !!/symbol/in.match(basefont_name.to_s)
  end

  # CMap built from the :tounicode attribute (memoized); nil/false when
  # the attribute is absent.
  def to_unicode
    @to_unicode ||= (tu = @attributes[:tounicode]) && tu.to_cmap
  end

  private

  def first_char
    @attributes[:firstchar].to_i
  end

  # Reads data/fonts/<basefont name>.afm next to this file, if readable,
  # into a Hash keyed by both character code and glyph name.
  def load_basefont_widths
    widths = {}
    # basename: the font name comes from the PDF, so it must not be able
    # to address files outside the fonts directory
    file = File.basename("%s.afm" % basefont_name)
    path = File.join(File.dirname(__FILE__), 'data', 'fonts', file)
    if(File.readable?(path))
      File.read(path).scan(AFM_PTRN) { |char, width, name|
        widths.store(char.to_i, width)
        widths.store(name, width)
      }
    end
    widths
  end

  # Width looked up by the glyph name the Encoding object assigns to
  # +char+; nil when :encoding is not an Encoding object.
  def named_width(char)
    enc = @attributes[:encoding]
    if(enc.is_a?(Encoding))
      _width(enc.symbol_name(char))
    end
  end

  def _width(char)
    width = nil
    if(char.is_a? Integer)
      idx = char - first_char
      # a code below :firstchar has no entry; a negative index would
      # silently read from the end of the array
      width = widths.at(idx) if(idx >= 0)
    end
    width ||= basefont_width(char)
    width.to_i if(width)
  end
end
|
353
|
+
# Font descriptor dictionary; adds nothing to PdfObject.
class FontDescriptor < PdfObject
end
|
355
|
+
# Object of a type no other class handles.
class Unknown < PdfObject
  # Integer value of the first number following "obj" in the source,
  # or 0 when there is none. +args+ are passed on to the underlying to_i.
  def to_i(*args)
    ((match = /obj\s*(\d+)/n.match(@src)) && match[1]).to_i(*args)
  end
end
|
360
|
+
# Resource dictionary of a page: gives access to the fonts and xobjects
# it names once #build_tree has resolved them.
class Resource < PdfObject
  # src:: a Hash of ready-made attributes, a String with the object's
  #       source (parsed as in PdfObject), or anything else for an empty
  #       resource.
  def initialize(src=nil, target_encoding='utf8')
    if(src.is_a? Hash)
      @attributes = src
      # keep the encoding even though PdfObject#initialize is bypassed
      @target_encoding = target_encoding
    elsif(src.is_a? String)
      super
    else
      @attributes = {}
      @target_encoding = target_encoding
    end
    @fonts = {}
    @xobjects = {}
  end

  # Resolves references like PdfObject#build_tree, then fills the font
  # and xobject tables from the :font and :xobject attributes.
  def build_tree(object_catalogue, parent=nil)
    super
    [:font, :xobject].each { |type| build_attributes type, object_catalogue }
  end

  # Font object registered under +key+, or nil.
  def font(key)
    @fonts[key]
  end

  # XObject registered under +key+, or nil.
  def xobject(key)
    @xobjects[key]
  end

  private

  # Fills @fonts / @xobjects (chosen by +type+) from the attribute of the
  # same name, which may be a plain Hash or a PdfHash.
  def build_attributes(type, object_catalogue)
    storage = instance_variable_get "@#{type}s"
    attribute = @attributes[type]
    if attribute.is_a?(Hash)
      _build_attributes(storage, attribute, object_catalogue)
    elsif attribute.is_a?(PdfHash)
      _build_attributes(storage, attribute.contents, object_catalogue)
    end
  end

  # For every key/reference pair looks up the object whose id is the
  # leading number of the reference, builds its tree and stores it under
  # the key (nil is stored when the catalogue has no such object).
  def _build_attributes(storage, hash, object_catalogue)
    hash.each { |key, val|
      oid = /^\d+/n.match(val).to_s.to_i
      if(obj = object_catalogue[oid])
        obj.build_tree(object_catalogue)
      end
      storage.store(key, obj)
    }
  end
end
|
402
|
+
# Trailer dictionary of a PDF file: names the root object, the
# encryption dictionary and the file id.
class TrailerDictionary < PdfObject
  # First run of alphanumeric characters in the first element of :id,
  # or "" when there is no :id.
  def file_id
    /[a-zA-Z0-9]+/n.match(Array(@attributes[:id]).first.to_s).to_s
  end

  # Object id of the encryption dictionary (0 if :encrypt is absent).
  def encrypt_id
    extract_oid(@attributes[:encrypt])
  end

  # Disabled override, kept for reference:
  #
  #   def parse_attributes
  #     index = @src.index('trailer')
  #     rindex = @src.rindex('startxref')
  #     unless(index && rindex)
  #       @attibutes = {}
  #     else
  #       #set correct offsets <<(begin) >>(end)
  #       src = @src[index+7..(rindex-1)]
  #       ast = Rpdf2txt.attributes_parser.parse(src)
  #       ast.compact!
  #       @attributes = extract_attributes(ast)
  #     end
  #   end

  # Object id of the document catalog (0 if :root is absent).
  def root_id
    extract_oid(@attributes[:root])
  end

  # Merges the attributes of another trailer dictionary into this one;
  # entries of +trailer_dict+ win.
  def update(trailer_dict)
    @attributes.update(trailer_dict.attributes)
  end

  protected

  # Only other trailer dictionaries may read the attributes (see #update).
  attr_reader :attributes
end
|
433
|
+
# Common behaviour of the nodes of the page tree. Enumerable; the default
# #each yields only the node itself.
class TreeNode < PdfObject
  include Enumerable
  attr_reader :parent

  # Resolves references like PdfObject#build_tree, remembers +parent+
  # and returns self.
  def build_tree(object_catalogue, parent=nil)
    super
    @parent = parent
    self
  end

  def each
    yield self
  end

  # Returns the first number of every entry in +array+ as an Integer,
  # skipping entries without digits. nil yields an empty list and a
  # single reference is treated as a one-element list.
  def extract_oids(array)
    Array(array).collect{ |dirty_id|
      if(match = /\d+/on.match(dirty_id))
        match[0].to_i
      end
    }.compact
  end

  # True when the node has neither a parent object nor a :parent
  # attribute.
  def root?
    !(@parent || @attributes[:parent])
  end
end
|
455
|
+
# Document catalog: entry point to the page tree named by :pages.
class CatalogNode < TreeNode
  # Looks up the object referenced by :pages, builds its tree with this
  # node as parent, then resolves the node's own references.
  def build_tree(object_catalogue, parent=nil)
    # must run before super, which replaces the reference string in
    # @attributes with the resolved object
    id = extract_oids(@attributes[:pages]).first
    @pages = object_catalogue[id]
    @pages.build_tree(object_catalogue, self)
    super
  end

  # Iterates over whatever the pages node yields.
  def each(&block)
    @pages.each(&block)
  end
end
|
466
|
+
# Intermediate node of the page tree; its children are the objects
# referenced by the :kids attribute.
class PageNode < TreeNode
  attr_reader :kids

  # Resolves every id found in the :kids attribute, collects the objects in
  # @kids and builds each child's tree with this node as parent. Finishes
  # with the inherited build_tree.
  def build_tree(object_catalogue, parent=nil)
    @kids = []
    kid_ids = extract_oids(@attributes[:kids])
    kid_ids.each do |id|
      child = object_catalogue[id]
      @kids.push(child)
      child.build_tree(object_catalogue, self)
    end
    super(object_catalogue, parent)
  end

  # Yields everything each kid yields, in order.
  def each
    @kids.each do |kid|
      kid.each do |result|
        yield result
      end
    end
  end

  # Returns the :mediabox attribute with every value converted to Float,
  # or nil when the attribute is absent.
  def media_box
    box = @attributes[:mediabox]
    box.collect { |val| val.to_f } if box
  end
end
|
488
|
+
class PageLeaf < TreeNode
|
489
|
+
attr_reader :contents, :resources
|
490
|
+
# Forwards all arguments to the inherited constructor, then creates the
# TextState used by #text from @target_encoding.
def initialize(*args)
  super(*args)
  @text_state = TextState.new(@target_encoding)
end
|
494
|
+
# Resolves the page's content objects and its resources.
#
# Every id found in the :contents attribute is looked up in
# +object_catalogue+ and collected in @contents (nil when the lookup
# fails); objects responding to #build_tree are built with this page as
# parent.
#
# @resources depends on the :resources attribute:
# * String - the catalogue object it references
# * Hash   - a new Resource wrapping the hash
# * nil    - @parent.resources when @parent is set, else an empty Resource
# * other  - an empty Resource
# A Resource result is built as well. Finishes with the inherited
# build_tree.
#
# NOTE(review): the nil case reads @parent, not the +parent+ argument;
# @parent is presumably only assigned by the inherited build_tree called on
# the last line - confirm whether that is intended.
def build_tree(object_catalogue, parent=nil)
  @contents = []
  extract_oids(@attributes[:contents]).each do |id|
    content = object_catalogue[id]
    @contents.push(content)
    if content.respond_to?(:build_tree)
      content.build_tree(object_catalogue, self)
    end
  end

  resources = @attributes[:resources]
  @resources =
    case resources
    when String
      object_catalogue[extract_oids([resources]).first]
    when Hash
      Resource.new(resources)
    else
      (resources.nil? && @parent) ? @parent.resources : Resource.new()
    end
  @resources.build_tree(object_catalogue) if @resources.is_a?(Resource)

  super(object_catalogue, parent)
end
|
514
|
+
# Returns whatever the page's resources return for font +key+.
def font(key)
  @resources.font(key)
end
|
517
|
+
# Returns the page's media box as an Array of Floats, or nil.
#
# A :mediabox array stored on the page itself takes precedence. Only when
# the page has none is the parent node asked. Previously the page's own
# entry was ignored, so pages carrying their own MediaBox under a parent
# without one ended up with no media box at all.
def media_box
  own = @attributes[:mediabox]
  if own.is_a?(Array)
    own.collect { |val| val.to_f }
  elsif parent
    parent.media_box
  end
end
|
522
|
+
# Extracts the page's text and reports it to +callback_handler+.
#
# The decoded content streams are concatenated into one fresh Stream (a
# single ReferenceArray content fills the stream itself through
# #build_stream). The text state receives the page's media box, the text
# objects are extracted from the combined stream and finally passed to
# #join_snippets, whose result is returned.
def text(callback_handler)
  combined = Stream.new('')
  sole = @contents.first
  if @contents.size == 1 && sole.is_a?(ReferenceArray)
    sole.build_stream(combined)
  else
    @contents.each do |stream|
      combined.append(stream.decoded_stream)
    end
  end
  @text_state.media_box = media_box
  snippets = combined.extract_text_objects(self, @text_state)
  join_snippets(snippets, callback_handler)
end
|
535
|
+
private
|
536
|
+
# Calls +block+ with (previous, current) for every snippet, where previous
# is the last snippet that was passed on (nil at the start). A snippet
# whose #whitespace_overlap? with the previous one is true is skipped and
# does not become the new previous.
#
# Returns the last snippet that was passed on, or nil.
def each_pair(text_snippets, &block)
  previous = nil
  text_snippets.each do |current|
    next if current.whitespace_overlap?(previous)
    block.call(previous, current)
    previous = current
  end
  previous
end
|
546
|
+
# Guesses the x-positions of text columns on the page.
#
# text_snippets:: snippets responding to #space_width, #x, #empty? and
#                 #same_column
# hints::         optional :count (number of columns wanted) and :width
#                 (minimal column distance in grid units, or a "a/b" String
#                 taken as a fraction of the summed column distances)
#
# Returns a sorted Array of x-positions, [] when no snippet qualifies.
def identify_columns(text_snippets, hints={})
  # the narrowest positive space on the page serves as grid width
  widths = text_snippets.collect { |snip| snip.space_width }
  space = widths.select { |w| w > 0 }.min || 100.0

  # count the snippets that start in each grid column
  positions = {}
  each_pair(text_snippets) do |last_text_state, text_state|
    next if text_state.empty?
    next if last_text_state \
            && text_state.same_column(last_text_state) \
            && !last_text_state.empty?
    idx = (text_state.x / space).floor
    positions[idx] = positions[idx].to_i + 1
  end
  return [] if positions.empty?

  # sum the distances between candidate columns (more than 2 snippets),
  # starting from the first such candidate
  total = 0
  sorted = positions.sort
  previous = nil
  loop do
    previous, first_count = sorted.shift
    break if first_count.nil? || first_count > 2
  end
  counts = []
  width = 0
  sorted.each do |pos, cnt|
    counts.push [cnt, pos]
    if cnt > 2
      total += 1
      width += (pos - previous)
      previous = pos
    end
  end

  # an explicit column count wins: take the most populated grid columns
  colcount = hints[:count]
  if colcount && counts.size >= colcount
    busiest = counts.sort[-colcount..-1]
    return busiest.collect { |_cnt, pos| pos * space }.sort
  end

  cutwidth = hints[:width]
  if cutwidth.is_a?(String)
    dividend, divisor = cutwidth.split('/', 2)
    cutwidth = width * dividend.to_f / divisor.to_f
  end
  unless cutwidth
    cutwidth = total.nonzero? ? width / total * 0.9 : width
  end

  # select probable columns
  previous = -cutwidth
  res = []
  sorted = positions.sort
  offset, _ = sorted.first
  sorted.each_with_index do |(pos, count), idx|
    ndx = idx.next
    pos -= offset
    # find the next grid column holding more than 3 snippets
    nxtpos = nxtcount = nil
    loop do
      nxtpos, nxtcount = sorted[ndx]
      ndx += 1
      break if nxtcount.nil? || nxtcount > 3
    end
    nxtpos -= offset if nxtpos
    next unless count > 1 && (pos - previous) > cutwidth
    # skip this one if a busier column follows within cutwidth
    next if nxtcount.to_i > count && (nxtpos - pos) < cutwidth
    previous = pos
    res.push pos + offset
  end
  res.collect { |pos| pos * space }.sort
end
|
613
|
+
# Sorts +text_snippets+ in place and reports them, with column and colspan
# events in between, to +callback_handler+.
#
# When the handler asks for column detection the columns come from
# #identify_columns; the first one is dropped because the left media edge
# is used instead. A column event is sent at the start of every line.
# Returns the result of #each_pair.
def join_snippets(text_snippets, callback_handler)
  text_snippets.sort!
  columns = []
  if callback_handler.identify_columns?
    hints = {
      :width => callback_handler.column_width,
      :count => callback_handler.column_count,
    }
    columns = identify_columns(text_snippets, hints)
    columns.shift
  end
  upcoming = nil
  remaining = []
  each_pair(text_snippets) do |last_text_state, text_state|
    text_state.fire_early_callbacks(last_text_state, callback_handler)
    # a new line starts over with the full column list
    on_same_line = last_text_state && text_state.same_line(last_text_state)
    if !on_same_line
      remaining = columns.dup
      upcoming = remaining.shift
      last_text_state = nil
      callback_handler.send_column
    end
    x2 = last_text_state && last_text_state.right_edge.to_i
    # emit one event per column boundary crossed by this snippet
    while upcoming && (text_state.x.to_i >= upcoming.to_i)
      if x2 && (x2 > upcoming.to_i) && !last_text_state.empty?
        callback_handler.send_colspan
      else
        callback_handler.send_column
      end
      upcoming = remaining.shift
    end
    text_state.send_content(last_text_state, callback_handler)
  end
end
|
646
|
+
end
|
647
|
+
class Stream < PdfObject
|
648
|
+
# One numeric operand followed by optional whitespace.
num = "([0-9.-]+)\\s*"
# Six operands followed by the "cm" operator.
dm_str = "#{num * 6}cm\\b"
# A name operand followed by the "Do" operator.
xobj = '(/\S+)\s*(\bDo\b)'
# Matches, outside of text objects: a "cm" operator with its operands,
# a "q" or "Q" operator, a "Do" operator with its name, or an inline image
# (BI ... ID ... EI).
@@nontext_scan_pattern = %r!(?:#{dm_str})|(\b[qQ]\b)|#{xobj}|\bBI\b(.*?)\bID\b(.*?)\b(EI)\b!mn
# Matches two operands followed by an "l" or "m" operator.
@@hr_scan_pattern = /#{num}#{num}(\b[lm]\b)/mn
# "BT" / "ET" operators that are not followed by an unopened ")".
BT_PATTERN = /\bBT\b(?!(\\[()]|[^(\\])*\))/mn
ET_PATTERN = /\bET\b(?!(\\[()]|[^(\\])*\))/mn
# A trailing "ET" that follows a "(" without a closing ")" in between.
FAIL_PTRN = /\((\\[()]|[^)])*\bET\b\s*$/mn
|
656
|
+
# Appends +decoded_stream+ to the decoded data held so far, starting from
# an empty String when there is none yet. Returns the accumulated data.
def append(decoded_stream)
  @decoded_stream ||= ''
  @decoded_stream << decoded_stream
end
|
659
|
+
# Replaces the decoded data; #decoded_stream will return this value
# instead of decoding the raw stream.
def decoded_stream=(decoded_stream)
  @decoded_stream = decoded_stream
end
|
662
|
+
# Returns the decoded data, running #decode_raw_stream on first use and
# remembering its result.
def decoded_stream
  @decoded_stream = decode_raw_stream unless @decoded_stream
  @decoded_stream
end
|
665
|
+
# Scans +dm_src+ for "x y m" and "x y l" operators and appends a
# HorizontalRule to +result+ for every "l" whose y equals the previous
# point's y while its x differs. The previous point starts at (0, 0) and
# is updated by every match.
#
# Each rule is created with (x, y, dmatrix) and receives the current page
# and text state. Returns +dm_src+ (the result of String#scan with a
# block).
def extract_horizontal_rules(dm_src, dmatrix, result)
  last_x = 0
  last_y = 0
  dm_src.scan(@@hr_scan_pattern) do |raw_x, raw_y, operator|
    case operator.to_s[-1]
    when ?l
      x = raw_x.to_f
      y = raw_y.to_f
      if x != last_x && y == last_y
        rule = HorizontalRule.new(x, y, dmatrix)
        rule.current_page = @page
        rule.text_state = @text_state
        result.push(rule)
      end
      last_x, last_y = x, y
    when ?m
      last_x, last_y = raw_x.to_f, raw_y.to_f
    end
  end
end
|
686
|
+
# Processes the operators found in +dm_src+ (content lying between text
# objects) that matter for positioning:
#
# q::          pushes the current matrix on +stack+
# Q::          restores the matrix last pushed
# Do::         appends an ImagePlacement for the named object to +result+,
#              positioned at the x/y of the last entry of +result+
#              (0, 0 when +result+ is empty)
# BI..ID..EI:: appends an ImagePlacement wrapping a new InlineImage
# cm::         multiplies the current matrix with the given one
#
# Returns the matrix in effect after the last operator.
def extract_nontext_objects(dm_src, dmatrix, stack, result)
  dm_src.scan(@@nontext_scan_pattern) { |matches|
    matches = matches.compact
    case matches.last
    when 'q'
      stack.push(dmatrix)
    when 'Q'
      # An unbalanced Q finds an empty stack. Keep the current matrix then,
      # rather than continuing with nil and failing on the next cm.
      dmatrix = stack.pop || dmatrix
    when 'Do'
      x, y = (txt = result.last) ? [txt.x, txt.y] : [0, 0]
      ip = ImagePlacement.new(matches[-2], x, y, dmatrix)
      ip.current_page, ip.text_state = @page, @text_state
      result.push ip
    when 'EI'
      attrs, data, _ = matches
      im = InlineImage.new attrs, data.strip
      ip = ImagePlacement.new im, 0, 0, dmatrix
      ip.current_page, ip.text_state = @page, @text_state
      result.push ip
    else
      # the six operands of a cm operator
      mmatrix = Matrix[[matches[0].to_f, matches[1].to_f,0],
                       [matches[2].to_f, matches[3].to_f,0],
                       [matches[4].to_f, matches[5].to_f,1]]
      dmatrix = dmatrix * mmatrix
    end
  }
  dmatrix
end
|
714
|
+
# Splits the decoded stream into text objects (BT ... ET) and the content
# in between, and returns everything found as one Array.
#
# page::       the page the stream belongs to (may be nil); its :rotate
#              attribute determines the initial matrix
# text_state:: assigned to every Text created here
#
# The content preceding each text object is handed to
# #extract_nontext_objects and #extract_horizontal_rules; the text object
# itself is wrapped in a Text whose #scan result is appended to the
# result. Content following the last text object is not examined.
#
# Returns [] for a stream without any "ET"; the unguarded "endpoint + 2"
# used to raise NoMethodError on nil in that case.
def extract_text_objects(page, text_state)
  @page, @text_state = page, text_state
  stack = []
  result = []
  startpoint = decoded_stream.index(BT_PATTERN)
  endpoint = decoded_stream.index(ET_PATTERN)
  # skip every ET that FAIL_PTRN identifies as part of a string literal;
  # endpoint is nil when no (further) ET exists
  while endpoint && FAIL_PTRN.match(decoded_stream[0..(endpoint+2)])
    endpoint = decoded_stream.index(ET_PATTERN, endpoint.next)
  end
  unless(startpoint && endpoint && (startpoint < endpoint))
    startpoint = 0
  end
  rotation = (page && Math::PI * page.attributes[:rotate].to_f / 180) || 0
  dmatrix = Matrix[[Math.cos(rotation),Math.sin(rotation),0],
                   [Math.sin(rotation),-Math.cos(rotation),0],
                   [0,0,1]]

  dm_src = decoded_stream[0...startpoint]
  while(endpoint && startpoint)
    ### pick out the bits in between Text that are relevant to
    ### text positioning (such as the device-transformation-matrix)
    ### NOTE: as far as I understand, the device matrix should
    ###       not be used to position text. However it is used
    ###       by some PDF-Creators and therefore we have to include
    ###       it in our calculations.
    dmatrix = extract_nontext_objects(dm_src, dmatrix, stack, result)
    extract_horizontal_rules(dm_src, dmatrix, result)
    tsrc = decoded_stream[startpoint..(endpoint+2)]
    # NOTE(review): when no further ET exists endpoint becomes -1, so the
    # slice below ends at index 1 rather than at the end of the stream -
    # confirm whether "rest of the stream" was intended.
    while FAIL_PTRN.match(tsrc)
      endpoint = decoded_stream.index(ET_PATTERN, endpoint + 2) || -1
      tsrc = decoded_stream[startpoint..(endpoint+2)]
    end
    text = Text.new(tsrc, @target_encoding, dmatrix)
    text.current_page = page
    text.text_state = text_state
    result.concat text.scan
    startpoint = decoded_stream.index(BT_PATTERN, endpoint)
    if(startpoint)
      dm_src = decoded_stream[endpoint...startpoint]
      endpoint = decoded_stream.index(ET_PATTERN, startpoint)
    end
  end
  result
end
|
758
|
+
# Returns the bytes between the "stream" keyword (plus one or two line
# break characters) and the last "endstream" of the object source, or ""
# when the source holds no stream. The result is memoized.
#
# The captures are concatenated with Array#join. Array#to_s only did that
# on Ruby 1.8; later versions return the inspect form (e.g. '[["data"]]'),
# which corrupted the data.
def raw_stream
  @raw_stream ||= @src.scan(/stream[\r\n]{1,2}(.*)endstream/mn).join
end
|
761
|
+
# Decrypts (when a decoder is set) and decodes the raw stream.
#
# The :filter attribute may hold one filter name or an Array of them; they
# are applied in order. "/FlateDecode" and "/LZWDecode" are implemented.
# Any other filter, and any StandardError raised while decoding, is
# reported with +warn+ and leaves the data as it was before that filter.
#
# Returns the resulting data; the decrypted but undecoded data stays
# available in @decrypted_stream.
def decode_raw_stream
  @decrypted_stream = raw_stream
  @decrypted_stream = @decoder.decrypt(self) unless @decoder.nil?
  filters = [@attributes[:filter]].flatten.compact
  filters.inject(@decrypted_stream) do |data, filter|
    begin
      case filter
      when "/FlateDecode"
        flate_decode(data)
      when "/LZWDecode"
        lzw_decode(data)
      else
        raise "Unimplemented filter: #{filter}"
      end
    rescue StandardError => err
      warn "'#{err.message}' when filtering with #{filter}"
      data
    end
  end
end
|
783
|
+
# Returns +data+ inflated with zlib.
def flate_decode(data)
  Zlib::Inflate.inflate(data)
end
|
786
|
+
# Decodes LZW-compressed +data+ with rpdf2txt's LZW module, which is
# loaded on first use.
#
# When the :length attribute is present, +data+ is cut to that many bytes
# first. The second argument given to LZW.decode is the :earlychange entry
# of the :decodeparms attribute as an Integer, 1 when it is absent.
def lzw_decode(data)
  require 'rpdf2txt/lzw'
  parms = @attributes[:decodeparms]
  earlychange = parms && parms[:earlychange]
  length = @attributes[:length]
  data = data[0, length.to_i] if length
  LZW.decode(data, (earlychange || 1).to_i)
end
|
794
|
+
# Returns a new CMap built from this object's source and target encoding.
def to_cmap
  CMap.new(@src, @target_encoding)
end
|
797
|
+
end
|
798
|
+
# A Stream subclass without behaviour of its own; it only gives such
# streams a distinct class.
class ObjStream < Stream
end
|
800
|
+
class Image < Stream
|
801
|
+
# Maps PDF device colour space names to the channel map string that
# Image#image passes to Magick::Image.constitute. The length of the string
# is used there as the number of values per pixel.
COLORMAPS = {
  '/DeviceRGB'  => 'RGB',
  '/DeviceGray' => 'I',
  '/DeviceCMYK' => 'CMYK',
}
|
806
|
+
# Returns the stream's pixel data as a Magick::Image, built on first use
# and remembered afterwards. RMagick is required on every call.
#
# Width, height and bits per component come from the attributes. The
# :colorspace attribute is either a name or an Array of
# (name, base name, index colours, lookup table).
def image
  require 'RMagick'
  return @image if @image

  columns = @attributes[:width].to_i
  rows = @attributes[:height].to_i
  depth = @attributes[:bitspercomponent].to_i
  mask = @attributes[:mask]
  color_grades = 2 ** depth - 1
  colorspace, basespace, index_colors, index = @attributes[:colorspace]
  index_colors = index_colors.to_i
  colormap = COLORMAPS[colorspace] || COLORMAPS[basespace] || 'RGB'
  colors = colormap.length
  pixels = extract_pixels(decoded_stream, depth)

  if '/Indexed' == colorspace
    ## FIXME: this works for some images, but seems to be wrong
    #         according to the Documentation
    if mask.is_a?(Array) && (pixels.size - 1) > rows * columns
      masked = (mask[0].to_i)..(mask[1].to_i)
      pixels.delete_if { |idx| masked.include? idx }
    end
    # for indexed images, index_colors correctly describes the
    # depth of the resulting pixels, whereas bitspercomponent
    # may not be accurate
    color_grades = index_colors
    palette = extract_colormap(index, index_colors)
    # replace every palette index by its +colors+ palette values
    expanded = Array.new(pixels.size * colors)
    pos = 0
    pixels.each do |idx|
      expanded[pos, colors] = palette[idx * colors, colors]
      pos += colors
    end
    pixels = expanded
  end

  ## this seems to be undocumented: PNG-images need to be decoded.
  #  we can detect this by the additional Byte per Row:
  expected_size = rows * columns * colors
  if pixels.size == (columns * colors + 1) * rows
    pixels = idat_decode(pixels, columns, colors)
  elsif pixels.size > expected_size
    pixels = pixels[0, expected_size]
  end

  # scale to 0.0..1.0 unless the values already span the quantum range
  if color_grades != (2 ** Magick::QuantumDepth - 1)
    div = color_grades.to_f
    pixels.collect! { |px| px / div }
  end
  @image = Magick::Image.constitute(columns, rows, colormap, pixels)
end
|
854
|
+
# Reverses PNG-style scanline filtering.
#
# data::   Array of byte values; each row consists of one filter byte
#          followed by width * colors data bytes. The array is consumed
#          (emptied) by this method.
# width::  pixels per row
# colors:: values per pixel
#
# Filter bytes 0 (None), 1 (Sub), 2 (Up), 3 (Average) and 4 (Paeth) are
# handled; anything else raises ArgumentError. Returns the unfiltered
# values of all rows in one flat Array.
def idat_decode(data, width, colors)
  byte_width = width * colors
  scanline_length = byte_width + 1 # data bytes plus the filter byte

  pixels = []
  row = 0
  # value one pixel to the left in the row being decoded (0 at the start)
  left_of = lambda { |line, i| i < colors ? 0 : line[i - colors] }
  # value at the same position in the previous row (0 in the first row)
  above = lambda { |i| row == 0 ? 0 : pixels[i - byte_width] }

  until data.empty?
    scanline = data.slice!(0, scanline_length)
    filter = scanline.shift
    case filter
    when 0 # None
    when 1 # Sub
      scanline.each_index do |i|
        scanline[i] = (scanline[i] + left_of.call(scanline, i)) % 256
      end
    when 2 # Up
      scanline.each_index do |i|
        scanline[i] = (above.call(i) + scanline[i]) % 256
      end
    when 3 # Average
      scanline.each_index do |i|
        upper = above.call(i)
        left = left_of.call(scanline, i)
        scanline[i] = (scanline[i] + ((left + upper) / 2).floor) % 256
      end
    when 4 # Paeth
      scanline.each_index do |i|
        left = left_of.call(scanline, i)
        if row == 0
          upper = upper_left = 0
        else
          upper = pixels[i - byte_width]
          upper_left = i < colors ? 0 : pixels[i - byte_width - colors]
        end
        predictor = paeth(left, upper, upper_left)
        scanline[i] = (scanline[i] + predictor) % 256
      end
    else
      raise ArgumentError, "Invalid filter algorithm #{filter}"
    end

    pixels.concat scanline
    row += 1
  end
  pixels
end
|
907
|
+
private
|
908
|
+
# Returns the colour lookup table described by +index+:
# * a Stream - its decoded bytes, each ANDed with +mask+
# * an Array - the array itself
# * anything else - an empty Array
def extract_colormap(index, mask)
  case index
  when Stream
    index.decoded_stream.unpack('C*').collect { |int| int & mask }
  when Array
    index
  else
    []
  end
end
|
917
|
+
# Splits the binary String +stream+ into unsigned integers of +depth+ bits
# each (most significant bit first). A depth of 8 simply yields the bytes;
# for other depths, trailing bits that do not fill a whole value are
# dropped.
def extract_pixels(stream, depth)
  return stream.unpack('C*') if depth == 8

  bits, = stream.unpack('B*')
  bits.scan(/.{#{depth}}/n).collect { |chunk| chunk.to_i(2) }
end
|
930
|
+
# Paeth predictor: of +a+ (left), +b+ (above) and +c+ (upper left) returns
# the one closest to a + b - c. Ties are resolved in the order a, b, c.
def paeth(a, b, c) # left, above, upper left
  estimate = a + b - c
  dist_a = (estimate - a).abs
  dist_b = (estimate - b).abs
  dist_c = (estimate - c).abs

  if dist_a <= dist_b && dist_a <= dist_c
    a
  elsif dist_b <= dist_c
    b
  else
    c
  end
end
|
940
|
+
end
|
941
|
+
# An image embedded directly in a content stream (BI ... ID ... EI).
# Inline images may abbreviate attribute keys and certain names; both are
# expanded to their full form after parsing so that Image can work with
# them.
class InlineImage < Image
  # Abbreviated attribute keys and the full keys they stand for.
  ATTR_ABBREVIATIONS = {
    :bpc  => :bitspercomponent, :cs => :colorspace,
    :d    => :decode,           :dp => :decodeparms,
    :f    => :filter,           :h  => :height,
    :im   => :imagemask,        :i  => :interpolate,
    :w    => :width,
  }
  # Abbreviated colour space and filter names and their full names.
  OTHER_ABBREVIATIONS = {
    '/G'    => '/DeviceGray',
    '/RGB'  => '/DeviceRGB',
    '/CMYK' => '/DeviceCMYK',
    '/I'    => '/Indexed',
    '/AHx'  => '/ASCIIHexDecode',
    '/A85'  => '/ASCII85Decode',
    '/LZW'  => '/LZWDecode',
    '/Fl'   => '/FlateDecode',
    '/RL'   => '/RunLengthDecode',
    '/CCF'  => '/CCITTFaxDecode',
    '/DCT'  => '/DCTDecode',
  }

  # attrs:: the text between BI and ID; it is wrapped in "<<" and ">>" and
  #         handed to the inherited constructor
  # data::  the image data; used as the raw stream
  def initialize(attrs, data)
    super("<<" << attrs << ">>")
    @raw_stream = data
  end

  # Parses the attributes as usual, then moves every value stored under an
  # abbreviated key to the full key and expands abbreviated names in the
  # value.
  #
  # Names inside Array values are expanded too, e.g. a filter chain
  # [/A85 /Fl] or an indexed colour space [/I /RGB 255 ...]. They used to
  # be left abbreviated, so Image#image did not recognise '/I' as
  # '/Indexed' and Stream#decode_raw_stream did not recognise '/Fl'.
  def parse_attributes
    super
    ATTR_ABBREVIATIONS.each do |abbr, key|
      if value = @attributes.delete(abbr)
        @attributes.store key, expand_abbreviations(value)
      end
    end
  end

  private

  # Replaces +value+ by its full name when it is a known abbreviation;
  # Arrays are expanded element by element. Anything else is returned
  # unchanged.
  def expand_abbreviations(value)
    if value.is_a?(Array)
      value.collect { |item| expand_abbreviations(item) }
    else
      OTHER_ABBREVIATIONS.fetch(value, value)
    end
  end
end
|
975
|
+
# A character map stream. The first bfchar and the first bfrange section of
# the decoded stream are parsed into @map (source code => target code) when
# the object is created.
class CMap < Stream
  attr_accessor :map

  # Forwards all arguments to the inherited constructor and parses the
  # character map.
  def initialize(*args)
    @map = {}
    super(*args)
    parse_cmap
  end

  # Translates +txt+ through the map:
  # * without a map, +txt+ is returned unchanged
  # * an Integer is looked up directly (nil when unmapped)
  # * otherwise every byte of +txt+ is replaced by its mapped value
  #   (unmapped bytes stay) and the codes are packed as UTF-8
  def to_utf8(txt)
    return txt if @map.nil?
    return @map[txt] if txt.is_a?(Integer)

    codes = txt.unpack('C*').collect { |byte| @map.fetch(byte, byte) }
    codes.pack('U*')
  end

  private

  # bfchar definition: stores the hex value of every child's source under
  # the hex value of its target. Returns the map.
  def add_to_map_bfchar(ast)
    ast.compact!
    ast.each do |child|
      # convert to decimal values
      @map.store(_hexvalue(child.source), _hexvalue(child.target))
    end
    @map
  end

  # bfrange definition, see page 457 of the pdf manual. A range either
  # lists its targets explicitly or names the target of its first code,
  # from which the following codes count upwards. Returns the map.
  def add_to_map_bfrange(ast)
    ast.compact!
    start_range = ast.start.value.to_s.hex
    end_range = ast.stop.value.to_s.hex
    if ast.children_names.include?('explicit')
      explicit = ast.explicit
      (start_range..end_range).each do |char|
        @map.store(char, _hexvalue(explicit.shift))
      end
    else
      base = _hexvalue(ast.offset)
      (start_range..end_range).each do |char|
        @map.store(char, base + (char - start_range))
      end
    end
    @map
  end

  # Returns the text between the first 'beginbfchar' (skipping 12
  # characters from its start) and the first 'endbfchar', or nil when the
  # stream has no bfchar section.
  def extract_bfchar
    src = decoded_stream
    start = src.index('beginbfchar')
    return nil if start.nil?

    first = start + 12
    stop = src.index('endbfchar')
    src[first..(stop - 1)]
  end

  # Returns the text between the first 'beginbfrange' (skipping 12
  # characters from its start) and the first 'endbfrange', or nil when the
  # stream has no bfrange section.
  def extract_bfrange
    src = decoded_stream
    start = src.index('beginbfrange')
    return nil if start.nil?

    first = start + 12
    stop = src.index('endbfrange')
    src[first..(stop - 1)]
  end

  # Reads the value of an AST node as a hexadecimal Integer.
  def _hexvalue(ast)
    ast.value.to_s.to_i(16)
  end

  # Parses the bfchar and bfrange sections, if present, into the map.
  def parse_cmap
    bfchar_src = extract_bfchar
    if bfchar_src
      add_to_map_bfchar(Rpdf2txt.cmap_parser.parse(bfchar_src))
    end
    bfrange_src = extract_bfrange
    if bfrange_src
      ranges = Rpdf2txt.cmap_range_parser.parse(bfrange_src)
      ranges.each do |node|
        add_to_map_bfrange(node)
      end
    end
  end
end
|
1053
|
+
# An array object consisting of references to other objects, e.g. the
# content streams of a page.
class ReferenceArray < TreeNode
  # Collects the referenced objects that exist in +object_catalogue+ in
  # @contents, then runs the inherited build_tree.
  def build_tree(object_catalogue, parent=nil)
    @contents = []
    @references.each do |id|
      referenced = object_catalogue[id]
      @contents.push(referenced) if referenced
    end
    super(object_catalogue, parent)
  end

  # Appends the decoded data of every collected object to +concat_stream+
  # and returns +concat_stream+.
  def build_stream(concat_stream)
    @contents.each do |stream|
      concat_stream.append(stream.decoded_stream)
    end
    concat_stream
  end

  # Parses the source from its first '[' to its last ']' and keeps the
  # object ids found in the result as @references.
  def parse_attributes
    first = @src.index('[')
    last = @src.rindex(']')
    ast = _parse_attributes(@src[first..last])
    ast.compact!
    @references = extract_oids(extract_attributes(ast))
  end

  # A reference array is never the root of the tree.
  def root?
    false
  end
end
|
1077
|
+
# A plain array object; its parsed elements are kept in @contents.
class PdfArray < TreeNode
  # Resets @contents to an empty Array, then runs the inherited build_tree.
  def build_tree(object_catalogue, parent=nil)
    @contents = []
    super(object_catalogue, parent)
  end

  # Returns the element at position +idx+.
  def at(idx)
    @contents.at(idx)
  end

  # Iterates over the elements.
  def each(&block)
    @contents.each(&block)
  end

  # Parses the source from its first '[' to its last ']' and stores the
  # extracted attributes as @contents.
  def parse_attributes
    first = @src.index('[')
    last = @src.rindex(']')
    ast = _parse_attributes(@src[first..last])
    ast.compact!
    @contents = extract_attributes(ast)
  end

  # An array is never the root of the tree.
  def root?
    false
  end
end
|
1098
|
+
# A plain dictionary object; its parsed entries are kept in @contents.
class PdfHash < TreeNode
  attr_reader :contents

  # Resets @contents to an empty Hash, then runs the inherited build_tree.
  def build_tree(object_catalogue, parent=nil)
    @contents = {}
    super(object_catalogue, parent)
  end

  # Parses the source from its first '<<' to its last '>' and stores the
  # extracted attributes as @contents.
  def parse_attributes
    first = @src.index('<<')
    last = @src.rindex('>')
    ast = _parse_attributes(@src[first..last])
    ast.compact!
    @contents = extract_attributes(ast)
  end

  # A dictionary is never the root of the tree.
  def root?
    false
  end
end
|
1114
|
+
end
|