rpdf2txt 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/LICENCE +515 -0
- data/Manifest.txt +126 -0
- data/README.txt +30 -0
- data/Rakefile +24 -0
- data/bin/rpdf2txt +58 -0
- data/config.save +12 -0
- data/install.rb +1098 -0
- data/lib/rpdf2txt-rockit/base_extensions.rb +73 -0
- data/lib/rpdf2txt-rockit/bootstrap.rb +120 -0
- data/lib/rpdf2txt-rockit/bounded_lru_cache.rb +43 -0
- data/lib/rpdf2txt-rockit/conflict_resolution.rb +302 -0
- data/lib/rpdf2txt-rockit/directed_graph.rb +401 -0
- data/lib/rpdf2txt-rockit/glr_parser.rb +393 -0
- data/lib/rpdf2txt-rockit/grammar.rb +644 -0
- data/lib/rpdf2txt-rockit/graphdrawing.rb +107 -0
- data/lib/rpdf2txt-rockit/graphviz_dot.rb +63 -0
- data/lib/rpdf2txt-rockit/indexable.rb +53 -0
- data/lib/rpdf2txt-rockit/lalr_parsetable_generator.rb +144 -0
- data/lib/rpdf2txt-rockit/parse_table.rb +273 -0
- data/lib/rpdf2txt-rockit/parsetable_generation.rb +164 -0
- data/lib/rpdf2txt-rockit/parsing_ambiguities.rb +84 -0
- data/lib/rpdf2txt-rockit/profiler.rb +168 -0
- data/lib/rpdf2txt-rockit/reduce_actions_generator.rb +523 -0
- data/lib/rpdf2txt-rockit/rockit.rb +76 -0
- data/lib/rpdf2txt-rockit/rockit_grammar_ast_eval.rb +187 -0
- data/lib/rpdf2txt-rockit/rockit_grammars_parser.rb +126 -0
- data/lib/rpdf2txt-rockit/sourcecode_dumpable.rb +181 -0
- data/lib/rpdf2txt-rockit/stringscanner.rb +54 -0
- data/lib/rpdf2txt-rockit/syntax_tree.rb +452 -0
- data/lib/rpdf2txt-rockit/token.rb +364 -0
- data/lib/rpdf2txt-rockit/version.rb +3 -0
- data/lib/rpdf2txt/attributesparser.rb +42 -0
- data/lib/rpdf2txt/cmapparser.rb +65 -0
- data/lib/rpdf2txt/data/_cmap.grammar +11 -0
- data/lib/rpdf2txt/data/_cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/_pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/cmap.grammar +11 -0
- data/lib/rpdf2txt/data/cmap.rb +37 -0
- data/lib/rpdf2txt/data/cmap_range.grammar +15 -0
- data/lib/rpdf2txt/data/cmap_range.rb +43 -0
- data/lib/rpdf2txt/data/fonts/Courier-Bold.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-BoldOblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier-Oblique.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Courier.afm +342 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Bold.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-BoldOblique.afm +2827 -0
- data/lib/rpdf2txt/data/fonts/Helvetica-Oblique.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/Helvetica.afm +3051 -0
- data/lib/rpdf2txt/data/fonts/License-Adobe.txt +65 -0
- data/lib/rpdf2txt/data/fonts/Symbol.afm +213 -0
- data/lib/rpdf2txt/data/fonts/Times-Bold.afm +2588 -0
- data/lib/rpdf2txt/data/fonts/Times-BoldItalic.afm +2384 -0
- data/lib/rpdf2txt/data/fonts/Times-Italic.afm +2667 -0
- data/lib/rpdf2txt/data/fonts/Times-Roman.afm +2419 -0
- data/lib/rpdf2txt/data/fonts/ZapfDingbats.afm +225 -0
- data/lib/rpdf2txt/data/pdfattributes.grammar +32 -0
- data/lib/rpdf2txt/data/pdfattributes.rb +71 -0
- data/lib/rpdf2txt/data/pdftext.grammar +102 -0
- data/lib/rpdf2txt/data/pdftext.rb +146 -0
- data/lib/rpdf2txt/default_handler.rb +352 -0
- data/lib/rpdf2txt/lzw.rb +69 -0
- data/lib/rpdf2txt/object.rb +1114 -0
- data/lib/rpdf2txt/parser.rb +169 -0
- data/lib/rpdf2txt/symbol.rb +408 -0
- data/lib/rpdf2txt/text.rb +182 -0
- data/lib/rpdf2txt/text_state.rb +434 -0
- data/lib/rpdf2txt/textparser.rb +42 -0
- data/test/data/3392_obj +0 -0
- data/test/data/397_decrypted +15 -0
- data/test/data/450_decrypted +153 -0
- data/test/data/450_obj +0 -0
- data/test/data/452_decrypted +125 -0
- data/test/data/454_decrypted +108 -0
- data/test/data/456_decrypted +106 -0
- data/test/data/458_decrypted +111 -0
- data/test/data/458_obj +0 -0
- data/test/data/460_decrypted +118 -0
- data/test/data/460_obj +0 -0
- data/test/data/463_decrypted +117 -0
- data/test/data/465_decrypted +107 -0
- data/test/data/465_obj +0 -0
- data/test/data/90_obj +0 -0
- data/test/data/90_obj_comp +1 -0
- data/test/data/decrypted +0 -0
- data/test/data/encrypt_obj +0 -0
- data/test/data/encrypt_string +0 -0
- data/test/data/encrypt_string_128bit +0 -0
- data/test/data/encrypted_object_stream.pdf +0 -0
- data/test/data/firststream +1 -0
- data/test/data/index.pdfobj +0 -0
- data/test/data/index_2bit.pdfobj +0 -0
- data/test/data/index_masked.pdfobj +0 -0
- data/test/data/indexed.pdfobj +0 -0
- data/test/data/indexed_2bit.pdfobj +0 -0
- data/test/data/indexed_masked.pdfobj +0 -0
- data/test/data/inline.png +0 -0
- data/test/data/logo.png +0 -0
- data/test/data/lzw.pdfobj +0 -0
- data/test/data/lzw_index.pdfobj +0 -0
- data/test/data/page_tree.pdf +148 -0
- data/test/data/pdf_20.png +0 -0
- data/test/data/pdf_21.png +0 -0
- data/test/data/pdf_22.png +0 -0
- data/test/data/pdf_50.png +0 -0
- data/test/data/png.pdfobj +0 -0
- data/test/data/space_bug_stream.txt +119 -0
- data/test/data/stream.txt +292 -0
- data/test/data/stream_kerning_bug.txt +13 -0
- data/test/data/stream_kerning_bug2.txt +6 -0
- data/test/data/test.pdf +0 -0
- data/test/data/test.txt +8 -0
- data/test/data/test_text.txt +42 -0
- data/test/data/working_obj +0 -0
- data/test/data/working_obj2 +0 -0
- data/test/mock.rb +149 -0
- data/test/suite.rb +30 -0
- data/test/test_pdf_object.rb +1802 -0
- data/test/test_pdf_parser.rb +1340 -0
- data/test/test_pdf_text.rb +789 -0
- data/test/test_space_bug_05_2004.rb +87 -0
- data/test/test_stream.rb +194 -0
- data/test/test_text_state.rb +315 -0
- data/usage-en.txt +112 -0
- data/user-stories/UserStories_Rpdf2Txt.txt +34 -0
- data/user-stories/documents/swissmedicjournal/04_2004.pdf +0 -0
- metadata +220 -0
@@ -0,0 +1,352 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Rpdf2txt -- PDF to Text Parser
|
4
|
+
# Copyright (C) 2004 Mike Walder, Raphael Waltert, Hannes Wyss
|
5
|
+
#
|
6
|
+
# This library is free software; you can redistribute it and/or
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
8
|
+
# License as published by the Free Software Foundation; either
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
# Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with this library; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
#
|
20
|
+
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
21
|
+
# hwyss@ywesee.com, mwalder@ywesee.com, rwaltert@ywesee.com
|
22
|
+
#
|
23
|
+
# PdfParser -- Rpdf2txt-- 14.11.2002 -- mwalder@ywesee.com, rwaltert@ywesee.com
|
24
|
+
|
25
|
+
module Rpdf2txt
|
26
|
+
# Mixin that provides SYMBOL_ENTITIES, an Integer => Integer lookup table.
# NOTE(review): the keys look like Unicode code points (913..969 is the Greek
# range, 8804 is "less-than or equal to") and the values look like byte codes
# of the Symbol font encoding -- confirm against the code that reads this
# table, which is not part of this file.
module SymbolMap
  #this map is not complete!!
  SYMBOL_ENTITIES = {
    3 => 65,
    8804 => 163,
    8805 => 179,
    #alpha-omega
    913 => 65,
    914 => 66,
    915 => 71,
    916 => 68,
    917 => 69,
    918 => 90,
    919 => 72,
    920 => 81,
    921 => 73,
    922 => 75,
    923 => 76,
    924 => 77,
    925 => 78,
    926 => 88,
    927 => 79,
    928 => 80,
    929 => 82,
    931 => 83,
    932 => 84,
    933 => 85,
    934 => 70,
    935 => 67,
    936 => 89,
    945 => 97,
    946 => 98,
    947 => 103,
    948 => 100,
    949 => 101,
    950 => 122,
    951 => 104,
    952 => 113,
    953 => 105,
    954 => 107,
    955 => 108,
    956 => 109,
    957 => 110,
    958 => 120,
    959 => 111,
    960 => 112,
    961 => 114,
    963 => 115,
    964 => 116,
    965 => 117,
    966 => 102,
    967 => 99,
    968 => 121,
    969 => 119,
    #dot?
    8901 => 46,
    #intersection (cap)
    8745 => 199,
    #union (cup)
    8746 => 200,
    #infinity
    8734 => 165,
    #integral
    8747 => 166,
    #partial differential
    8706 => 182,
    #not equal to
    8800 => 185,
    #equal
    61 => 186,
    #almost equal to
    8776 => 187,
    #superset of
    8835 => 201,
    # superset or equal to
    8839 => 202,
    #not a subset of
    8836 => 203,
    #subset of
    8834 => 204,
    #subset or equal to
    8838 => 205,
    #element of
    8712 => 206,
    #not an element of
    8713 => 207,
    #n-ary product
    8719 => 213,
    #radic
    8730 => 214,
    #n-ary sum
    8721 => 229,
  }
end
|
120
|
+
# Default implementation of the handler callbacks. Most callbacks do nothing;
# the text callbacks append to the buffer returned by #out. Classes that
# include this module override only what they need.
module DefaultHandler
  # No column information by default (returns nil).
  def column_count
  end
  # No column width by default (returns nil).
  def column_width
  end
  # Column detection is off unless a handler overrides this.
  def identify_columns?
    false
  end
  # Images are ignored.
  def send_image(handle)
  end
  # Font changes are ignored.
  def new_font(font)
  end
  # Font size changes are ignored.
  def new_fontsize(size)
  end
  # Column starts are ignored.
  def send_column
  end
  # Column spans are ignored.
  def send_colspan
  end
  # Returns the output buffer. Goes through #out so that a handler that has
  # not written anything yet returns the (empty) buffer instead of nil.
  def send_eof
    out
  end
  # Appends +data+ to the output buffer.
  def send_flowing_data(data)
    self.out << data
  end
  # Horizontal rules are ignored.
  def send_hr
  end
  # Appends a newline to the output buffer.
  def send_line_break
    self.out << "\n"
  end
  # Paragraph breaks are ignored.
  def send_paragraph
  end
  # Appends an empty line to the output buffer.
  def send_page
    self.out << "\n\n"
  end
  # The output buffer; created as an empty String on first use when the
  # including class has not set @out itself.
  def out
    @out ||= ""
  end
end
|
158
|
+
# Handler that uses the DefaultHandler callbacks unchanged and writes into the
# object handed to the constructor.
class SimpleHandler
  include DefaultHandler
  include SymbolMap
  # io:: receiver of the output; it is stored in @out, which the
  #      DefaultHandler callbacks append to with <<. Defaults to a new String.
  def initialize(io="")
    @out = io
  end
end
|
165
|
+
# Handler that buffers the text of a page as lines of column cells and, on
# send_page, writes every line with its cells left-justified to common widths.
class ColumnHandler < SimpleHandler
  # outstream:: receiver of the formatted output (passed on to SimpleHandler)
  # padding::   string every new cell starts with; also the lead-in of a
  #             horizontal rule
  def initialize(outstream="", padding=' ')
    super(outstream)
    @lines = []
    @padding = padding
    # opens the first line buffer and resets column cursor and colspan
    send_line_break
  end
  # Computes the width to use for each column index from the buffered lines
  # and returns them as an Array. Side effects: String cells are right-stripped
  # in place and every line is extended to the length of the longest line.
  def column_widths
    all_lengths = []
    # index of the last column of the longest line (0 when there are no cells)
    last_column = @lines.collect { |line| line.size }.push(1).max - 1
    # pass 1: widest String cell per column, counting only cells that are
    # followed by a non-empty cell or that sit in the last column
    @lines.each { |line|
      line.each_with_index { |column, idx|
        if(column.is_a?(String))
          column.rstrip!
          if(line[idx.next] || idx == last_column)
            length = column.length
            all_lengths[idx] = [length, all_lengths[idx].to_i].max
          end
        end
      }
    }
    max_lengths = []
    # pass 2: a cell followed by empty (nil) cells may use their width too, so
    # the pass-1 widths of those following columns are subtracted from its
    # length before it competes for the width of its own column
    @lines.each { |line|
      line.each_with_index { |column, idx|
        if(column.is_a?(String))
          length = column.length
          idx2 = idx.next
          # rm is accumulated but not read anywhere in this method
          rm = 0
          while(line[idx2].nil? && (max = all_lengths[idx2]))
            rm += max
            length -= max
            idx2 += 1
          end
          max_lengths[idx] = [length, max_lengths[idx].to_i].max
        end
      }
      # assigning nil at last_column grows shorter lines to the common length
      line[last_column] ||= nil
    }
    max_lengths
  end
  # This handler wants column information.
  def identify_columns?
    true
  end
  # Moves the column cursor forward by the pending colspan, resets the colspan
  # to 1 and makes sure the cell at the cursor exists (a copy of the padding).
  def send_column
    @current_column += @colspan
    @colspan = 1
    # NOTE(review): u() is not defined in this file; it is only called when the
    # padding responds to :foldcase -- confirm where it comes from.
    @columns[@current_column] ||= if(@padding.respond_to?(:foldcase))
      u(@padding.dup)
    else
      @padding.dup
    end
  end
  # Widens the step the next send_column takes by one column.
  def send_colspan
    @colspan += 1
  end
  # Appends a horizontal-rule marker cell to the current line.
  def send_hr
    @columns << :hr
  end
  # Appends an image marker cell to the current line; the image is not used.
  def send_image(image)
    @columns << :image
  end
  # Appends +data+ to the cell at the column cursor.
  # NOTE(review): right after send_line_break the cursor is -1 and the line is
  # empty, so @columns[-1] is nil; this appears to rely on send_column having
  # been called first -- confirm against the caller.
  def send_flowing_data(data)
    @columns[@current_column] << data
  end
  # Starts a new, empty line buffer and resets column cursor and colspan.
  def send_line_break
    @columns = []
    @lines.push(@columns)
    @current_column = -1
    @colspan = 1
  end
  # Writes all buffered lines to @out, clears the buffer, opens a fresh line
  # and then lets the inherited send_page run.
  def send_page
    max_lengths = column_widths
    @lines.each { |line|
      line.each_with_index { |column, idx|
        if(column)
          idx2 = idx.next
          # a cell is padded to its own column width plus the widths of the
          # empty cells that directly follow it
          accumulated = max_lengths[idx].to_i
          while((pad = max_lengths[idx2]) && !line[idx2])
            accumulated += pad
            idx2 += 1
          end
          case column
          when :image
            @out << " #IMAGE# ".ljust(accumulated)
          when :hr
            @out << @padding.dup.ljust(accumulated, '-')
          else
            @out << column.ljust(accumulated)
          end
        end
      }
      @out << "\n"
    }
    @lines.clear
    send_line_break
    super
  end
end
|
263
|
+
# Handler that does not produce text but writes Ruby source to its output:
# one "@writer.<callback>(<args>)" line per callback it receives.
class RecordingHandler
  # out::     receiver of the generated source (anything supporting <<)
  # columns:: value reported by identify_columns?
  def initialize(out = $stdout, columns=false)
    require 'yaml'
    @out = out
    # the generated source starts by loading yaml, which the new_font
    # recordings use
    @out << "require 'yaml'\n"
    @columns = columns
  end
  # Returns the +columns+ value given to the constructor.
  def identify_columns?
    @columns
  end
  # Records a font change. The font is written as YAML inside a heredoc of the
  # generated source instead of via #inspect.
  # The lines between the heredoc markers are emitted text, not code.
  def new_font(font)
    @out << <<-EOS
font = YAML.load <<-EOF
#{font.to_yaml}
EOF
@writer.new_font(font)
    EOS
  end
  # Records any other callback as a call on @writer, with the arguments
  # rendered through #inspect. The block, if any, is not recorded.
  # NOTE(review): respond_to_missing? is not overridden, so respond_to? does
  # not report the callbacks handled here -- confirm no caller depends on it.
  def method_missing(symbol, *args, &block)
    argstr = args.collect { |arg| arg.inspect }.join(', ')
    @out << <<-EOS
@writer.#{symbol}(#{argstr})
    EOS
  end
end
|
288
|
+
class HTMLHandler
|
289
|
+
include DefaultHandler
|
290
|
+
include SymbolMap
|
291
|
+
# Sets up the table that tracks which tags are currently open; none is open
# at the start. new_font also reads and writes a :pre entry, which is absent
# (nil) until a courier font has been seen.
def initialize
  super
  @state = {
    :italic => false,
    :bold => false,
    :font => false,
  }
end
|
299
|
+
# Reacts to a font change by opening and closing tags on the output:
# <b>/<i> follow font.bold?/font.italic?, an open <font> or <pre> is always
# closed, and a new <font face="Symbol"> or <pre> is opened when the base font
# name contains "symbol" or "courier". A nil font is ignored.
def new_font(font)
  return if font.nil?
  # weight and slant: emit a tag only when the wanted state differs from the
  # recorded one
  [
    [:bold, font.bold?, "<b>", "</b>"],
    [:italic, font.italic?, "<i>", "</i>"]
  ].each { |flag, wanted, opening, closing|
    if(wanted)
      unless(@state[flag])
        self.out << opening
        @state[flag] = true
      end
    elsif(@state[flag])
      self.out << closing
      @state[flag] = false
    end
  }
  # face-dependent tags of the previous font are closed unconditionally
  [[:font, "</font>"], [:pre, "</pre>"]].each { |flag, closing|
    if(@state[flag])
      self.out << closing
      @state[flag] = false
    end
  }
  face = font.basefont_name
  unless(/symbol/i.match(face).nil?)
    self.out << "<font face=\"Symbol\">"
    @state[:font] = true
  end
  unless(/courier/i.match(face).nil?)
    self.out << "<pre>"
    @state[:pre] = true
  end
end
|
342
|
+
# Emits an HTML line break. Writes through #out, like the other callbacks of
# this class, because @out is not set by the constructor and is still nil
# until #out has been called once.
def send_line_break
  self.out << "<br>"
end
|
345
|
+
# Emits a paragraph tag for a paragraph break.
def send_paragraph
  self.out << "<p>"
end
|
348
|
+
# Emits a paragraph tag for a page break (same output as send_paragraph).
def send_page
  self.out << "<p>"
end
|
351
|
+
end
|
352
|
+
end
|
data/lib/rpdf2txt/lzw.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# LZW -- rpdf2txt -- 09.07.2008 -- hwyss@ywesee.com
|
3
|
+
|
4
|
+
module Rpdf2txt
|
5
|
+
# Decoder for LZW-compressed data with variable code width (9 to 12 bits),
# a clear-table code (256) and an end-of-data code (257).
class LZW
  CLEAR = 256
  EOD = 257
  # Convenience wrapper: decodes +data+ with a fresh decoder.
  def self.decode data, early_change=1
    self.new(early_change).decode data
  end
  # early_change:: how many entries before a power of two the code width
  #                grows (see init_dictionary / update_dictionary)
  def initialize(early_change=1)
    @early_change = early_change
    # pristine table: the 256 single-byte strings plus placeholders for the
    # two control codes, so that the first free code is 258
    @__dict = (0..255).collect { |num| num.chr }.push :clear, :eod
    init_dictionary
  end
  # Resets the table to the pristine state and the code width to 9 bits.
  def init_dictionary
    @dictionary = @__dict.dup
    @code_length = 9
    @boundary = 512 - @early_change
  end
  # Decodes +data+ (a String of bytes) and returns the decoded String.
  # Decoding stops at the end-of-data code or when fewer bits than one code
  # are left. Raises RuntimeError ('Bad compressed code: ...') for a code that
  # is neither in the table nor the next code to be defined.
  def decode data
    bits, = data.unpack('B*')
    result = ''
    # code decoded in the previous step; nil at the start and after a clear,
    # where the next code must not define a new table entry
    old_code = nil
    while((code = get_next_code(bits)) && code != EOD)
      if code == CLEAR
        init_dictionary
        old_code = nil
        next
      end
      if old_code.nil?
        string = @dictionary[code]
        raise 'Bad compressed code: %s' % code unless string.is_a?(String)
        result << string
      elsif string = @dictionary[code]
        result << string
        update_dictionary @dictionary[old_code] + string[0,1]
      elsif code == @dictionary.size
        # the code being defined right now: previous string plus its own
        # first character
        string = @dictionary[old_code]
        string += string[0,1]
        result << string
        update_dictionary string
      else
        raise 'Bad compressed code: %s' % code
      end
      old_code = code
    end
    result
  end
  # Removes the next code (@code_length bits) from the front of the bit
  # string +bits+ and returns it as an Integer. Returns nil, leaving +bits+
  # untouched, when fewer bits than one full code remain (end of input or
  # byte padding).
  def get_next_code bits
    return nil if bits.size < @code_length
    bits.slice!(0, @code_length).to_i(2)
  end
  # Adds +str+ as the next table entry and widens the code by one bit (up to
  # 12) once the table size reaches the current boundary. Returns +str+.
  def update_dictionary(str)
    @dictionary.push str
    if @dictionary.size >= @boundary && @code_length < 12
      @code_length += 1
      @boundary = (2**@code_length - @early_change)
    end
    str
  end
end
|
69
|
+
end
|
@@ -0,0 +1,1114 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Rpdf2txt -- PDF to Text Parser
|
4
|
+
# Copyright (C) 2003 Andreas Schrafl, Hannes Wyss
|
5
|
+
#
|
6
|
+
# This library is free software; you can redistribute it and/or
|
7
|
+
# modify it under the terms of the GNU Lesser General Public
|
8
|
+
# License as published by the Free Software Foundation; either
|
9
|
+
# version 2.1 of the License, or (at your option) any later version.
|
10
|
+
#
|
11
|
+
# This library is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14
|
+
# Lesser General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU Lesser General Public
|
17
|
+
# License along with this library; if not, write to the Free Software
|
18
|
+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
19
|
+
#
|
20
|
+
# ywesee - intellectual capital connected, Winterthurerstrasse 52, CH-8006 Zürich, Switzerland
|
21
|
+
# hwyss@ywesee.com, aschrafl@ywesee.com
|
22
|
+
#
|
23
|
+
# PdfObject -- Rpdf2txt -- 21.11.2002 -- aschrafl@ywesee.com
|
24
|
+
|
25
|
+
require 'zlib'
|
26
|
+
require 'rpdf2txt/text'
|
27
|
+
require 'rpdf2txt/attributesparser'
|
28
|
+
require 'rpdf2txt/cmapparser'
|
29
|
+
require 'rpdf2txt/symbol'
|
30
|
+
require 'md5'
|
31
|
+
require 'matrix'
|
32
|
+
|
33
|
+
module Rpdf2txt
|
34
|
+
# Base class of the PDF objects: keeps the raw source of one object and the
# attribute dictionary parsed from it.
class PdfObject
  attr_reader :attributes
  attr_accessor :decoder, :src
  # the reader for oid is defined below (it extracts the id lazily)
  attr_writer :oid
  # src::             raw source of the object, or nil
  # target_encoding:: stored in @target_encoding for subclasses
  # The attributes are parsed immediately when a source is given.
  def initialize(src=nil, target_encoding='utf8')
    @attributes = {}
    @src = src
    @target_encoding = target_encoding
    parse_attributes() unless @src.nil?
  end
  # Replaces references ("<id> <rev> R") among the attribute values, and
  # inside Array values, by the objects found under that id in
  # +object_catalogue+. Unresolved references are left as they are.
  def build_tree(object_catalogue, parent=nil)
    @attributes.each { |key, value|
      case value
      when Array
        value.collect! { |obj| catalogue_object(object_catalogue, obj) || obj }
      when String
        if obj = catalogue_object(object_catalogue, value)
          @attributes.store(key, obj)
        end
      end
    }
  end
  # Returns the catalogue entry a reference String points to, or nil when
  # +reference+ is not a reference or the catalogue has no such entry.
  def catalogue_object(catalogue, reference)
    if reference.is_a?(String) && (match = /^(\d+)\s+\d+\s+R/n.match reference)
      catalogue[match[1].to_i]
    end
  end
  # Always raises: subclasses that carry a stream provide their own version.
  def decoded_stream
    # to_s keeps the intended message intact for objects built without source
    raise "abstract method decoded_stream called in #{self.class}; built from source: \n #{@src.to_s.tr("\r", "\n")}"
  end
  # Object id: the assigned value, else the leading number of the source.
  def oid
    @oid ||= extract_oid(@src)
  end
  # Revision id: the first number in the source that follows whitespace.
  def revision_id
    @revision_id ||= extract_revision_id(@src)
  end
  # Returns the part of the source from the first '<<' to the last '>>' that
  # starts before the 'stream' keyword (or anywhere, without such a keyword);
  # nil when either marker is missing.
  def extract_attribute_stream
    lastindex = @src.index('stream') || -1
    index = @src.index('<<')
    rindex = @src.rindex('>>', lastindex)
    @src[index..(rindex+1)] if(index && rindex)
  end
  # Hands +src+ to the shared attributes parser and returns its result.
  def _parse_attributes(src)
    Rpdf2txt.attributes_parser.parse(src)
  end
  # (Re)builds @attributes from the source; an object without a dictionary
  # ends up with empty attributes.
  def parse_attributes
    src = self.extract_attribute_stream
    if(src.nil?)
      @attributes = {}
    else
      ast = self._parse_attributes(src)
      ast.compact!
      @attributes = extract_attributes(ast)
    end
  end
  private
  # Leading number of +string+ as Integer (0 when there is none).
  def extract_oid(string)
    /^\d+/n.match(string).to_s.to_i
  end
  # First whitespace-preceded number of +string+ as Integer (0 when none).
  def extract_revision_id(string)
    /\s\d+/n.match(string).to_s.to_i
  end
  # Converts a parse-tree node into plain Ruby data: 'value' and 'text' nodes
  # become unescaped Strings, 'values' nodes Arrays, 'pairs' nodes Hashes
  # keyed by the downcased name as Symbol; any other node is returned as is.
  def extract_attributes(ast)
    if(ast.children_names.include?('value'))
      pdf_unescape(ast.value)
    elsif(ast.children_names.include?('text'))
      # [1...-1] drops the first and last character of the text
      pdf_unescape(ast.text.value[1...-1])
    elsif(ast.children_names.include?('values'))
      ast.values.collect { |child| extract_attributes(child) }
    elsif(ast.children_names.include?('pairs'))
      result = {}
      ast.pairs.each { |pair|
        k, v = pair
        keystr = k.value.strip.tr('/','')
        unless(keystr.empty?)
          result.store(keystr.downcase.intern, extract_attributes(v))
        end
      }
      result
    else
      ast
    end
  end
  # Escapes newline and carriage return as the two-character sequences
  # backslash-n and backslash-r.
  # NOTE(review): in a gsub replacement String '\\\\' stands for one backslash
  # and '\\&' for the whole match, so backslashes and parentheses appear to
  # pass through unchanged -- confirm what PdfEncrypt#compute_user_key
  # expects before altering this.
  def pdf_escape(input)
    input.gsub(/\\/, '\\\\').gsub(/\n/n, '\n')\
      .gsub(/\r/n, '\r').gsub(/[()]/n, '\\&')
  end
  # Replaces the escape sequences for newline, carriage return, parentheses
  # and backslash by the characters they stand for, one kind after the other.
  def pdf_unescape(input)
    input.gsub(/\\n/n, "\n").gsub(/\\r/n, "\r").\
      gsub(/\\\)/n, ')').gsub(/\\\(/n, '(').gsub(/\\\\/n, '\\')
  end
end
|
125
|
+
# Encryption dictionary of a document: derives the document key from the
# dictionary entries and the file id, and decrypts streams of other objects.
# No user password is supported (see encryption_key).
class PdfEncrypt < PdfObject
  # Raised when the derived key does not reproduce the stored user key.
  class DecryptionError < RuntimeError
  end
  # 32-byte string that takes the place of the (unsupported) user password.
  PADDING = "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A"
  # ARC4 stream cipher: returns +input+ XORed with the keystream for +key+.
  # The same call encrypts and decrypts.
  def arc4(key, input)
    output = ''
    # s: state table, k: key bytes repeated to fill 256 positions
    s, j, k = (0..255).to_a, 0, (key*256)[0,256].unpack('C*')
    # key scheduling: shuffle the state table under control of the key
    (0..255).each { |x|
      j = (j + s[x] + k[x]) % 256
      s[x], s[j] = s[j], s[x]
    }
    i = j = 0
    # one keystream byte per input byte
    input.each_byte { |b|
      i = (i + 1) % 256
      j = (j + s[i]) % 256
      s[i], s[j] = s[j], s[i]
      output << (b ^ s[(s[i] + s[j])%256]).chr
    }
    output
  end
  # Computes the value the stored user key must equal for +encryption_key+.
  # Below revision 3 this is the encrypted padding; from revision 3 on it is
  # the MD5 of padding + file id, encrypted 20 times with the key XORed by the
  # round number (0..19). The result is passed through pdf_escape.
  def compute_user_key encryption_key
    if revision < 3
      pdf_escape arc4(encryption_key, PADDING)
    else
      crypt = Digest::MD5.digest PADDING + file_id
      20.times do |xor|
        key = encryption_key.unpack('C*').collect! do |byte|
          byte ^ xor
        end.pack('C*')
        crypt = arc4(key, crypt)
      end
      pdf_escape crypt
    end
  end
  # Returns the decrypted raw stream of +pdf_object+.
  def decrypt(pdf_object)
    arc4_key = decrypt_key(pdf_object)
    stream = pdf_object.raw_stream
    arc4(arc4_key, stream)
  end
  # Derives the per-object key: MD5 over the document key followed by the low
  # three bytes of the object id and the low two bytes of the revision id,
  # cut to keylength + 5 bytes (16 at most).
  def decrypt_key(pdf_object)
    oid = pdf_object.oid
    rev_id = pdf_object.revision_id
    # on a big-endian host the packed bytes are reversed, so that the slices
    # below take the low-order bytes in low-byte-first order
    if(self.big_endian?)
      oid_three_bytes = [oid].pack('I*').reverse[0,3]
      rev_id_two_bytes = [rev_id].pack('I*').reverse[0,2]
    else
      oid_three_bytes = [oid].pack('I*')[0,3]
      rev_id_two_bytes = [rev_id].pack('I*')[0,2]
    end
    # encryption_key returns a new String on every call, so appending to it
    # in place does not affect later calls
    input = encryption_key << oid_three_bytes << rev_id_two_bytes
    digest = Digest::MD5.digest(input)
    digest[0,[keylength + 5,16].min]
  end
  # True when a native unsigned int is packed high byte first.
  def big_endian?
    # big endian (e.g. ppc) versus little endian (e.g. x86)
    if ([1].pack('I*') == "\000\000\000\001")
      true
    else
      false
    end
  end
  # Derives the document key from padding, owner key, permission flags and
  # file id, and verifies it against the stored user key.
  # Raises DecryptionError when the verification fails.
  def encryption_key
    input_string = PADDING.dup
    ## we don't support a user-password. if we did, it would have to replace
    #  the first [n..32] bytes of the padding string here.
    input_string << owner_key
    input_string << permission_flag
    input_string << file_id
    ## revision >= 4: add 0xffffffff if document metadata is not encrypted
    digest = Digest::MD5.digest(input_string)
    uk = user_key
    if revision >= 3
      # 50 further MD5 rounds over the first keylength bytes; only the first
      # 16 bytes of the stored user key are compared
      50.times do digest = Digest::MD5.digest(digest[0,keylength]) end
      uk = uk[0,16]
    end
    # local variable, named like this method
    encryption_key = digest[0,keylength]
    test_key = compute_user_key encryption_key
    if(test_key != uk)
      raise DecryptionError, "test-key did not match user-key ('#{test_key.inspect}' / '#{uk.inspect}')"
    end
    encryption_key
  end
  # Sets the file id as a String of hex digits (see the reader).
  def file_id= (file_id)
    @file_id = file_id
  end
  # The file id as bytes: the stored hex digits packed two per byte.
  def file_id
    [@file_id].pack("H*")
  end
  # Key length in bytes, from the :length attribute (in bits, default 40).
  def keylength
    @keylength ||= (@attributes[:length] || 40).to_i / 8
  end
  # The :o attribute as String.
  def owner_key
    @attributes[:o].to_s
  end
  # The :p attribute packed as a native unsigned int, low byte first.
  def permission_flag
    if (self.big_endian?)
      [@attributes[:p].to_i].pack('I*').reverse
    else
      [@attributes[:p].to_i].pack('I*')
    end
  end
  # The :r attribute as Integer.
  def revision
    @attributes[:r].to_i
  end
  # The :u attribute as String.
  def user_key
    @attributes[:u].to_s
  end
end
|
234
|
+
class Encoding < PdfObject
|
235
|
+
# Returns (and caches) the code => glyph-name table described by the
# :differences attribute. In that list a number sets the code of the name
# that follows it; each further name gets the next code. Every new number
# restarts the counting at that number. An encoding without :differences
# yields an empty table.
def differences
  @differences ||= begin
    table = {}
    code = 0
    (@attributes[:differences] || []).each { |diff|
      if(/^\d+$/n.match(diff))
        code = diff.to_i
      else
        # the name is stored without its leading slash
        table.store(code, diff[/\w+/n])
        code += 1
      end
    }
    table
  end
end
|
252
|
+
# Returns a copy of +txt+ in which every byte that has a name in the
# differences table is replaced by the byte Symbol.byte reports for that name;
# bytes without a name, or with a name Symbol.byte does not know, are kept.
def convert_symbol(txt)
  converted = ''
  txt.each_byte { |code|
    glyph = differences[code]
    mapped = glyph ? (Symbol.byte(glyph) || code) : code
    converted << mapped
  }
  converted
end
|
262
|
+
# Returns the glyph name the differences table holds for +byte+, or nil.
def symbol_name(byte)
  differences[byte]
end
|
265
|
+
end
|
266
|
+
# Font object: answers questions about style, encoding and character widths.
class Font < PdfObject
  attr_accessor :cmap, :descriptor, :rendering_mode, :skewed
  # encoding names as they appear in the attributes => names used internally
  ENCODINGS = {
    '/Identity-H' => 'ascii',
    '/MacRomanEncoding' => 'mac',
    '/UTF8' => 'utf8',
    '/WinAnsiEncoding' => 'ms-ansi',
  }
  # one character-metrics entry of an .afm file: code, width, glyph name
  AFM_PTRN = /^C\s*(\d+)\s*;\s*
    WX\s*(\d+)\s*;\s*
    N\s*(\w+)/xn
  # The :basefont attribute.
  def basefont_name
    @attributes[:basefont]
  end
  # Width of +char+ (a code or a glyph name) in the bundled metrics, or nil.
  def basefont_width(char)
    basefont_widths[char]
  end
  # The bundled metrics of the base font, loaded on first use.
  def basefont_widths
    @basefont_widths ||= load_basefont_widths
  end
  # True when the base font name contains "bold" or the rendering mode is "2".
  def bold?
    (!!/bold/in.match(basefont_name.to_s)) || @rendering_mode == "2"
  end
  # Resolves references like PdfObject#build_tree and, when a :descriptor
  # attribute is present, keeps it in @descriptor and resolves it as well.
  # NOTE(review): attribute keys are downcased PDF names, so a /FontDescriptor
  # entry would arrive as :fontdescriptor -- confirm :descriptor is intended.
  def build_tree(object_catalogue, parent = nil)
    super
    if(desc = @attributes[:descriptor])
      @descriptor = desc
      # an unresolved reference is still a plain String here
      desc.build_tree(object_catalogue, self) if desc.respond_to?(:build_tree)
    end
  end
  # Returns the Encoding object of the font, or the internal name of a named
  # encoding. An unknown name is reported with a warning and returned as is.
  def encoding
    enc = @attributes[:encoding]
    if(enc.is_a?(Encoding))
      enc
    else
      ENCODINGS.fetch(enc) {
        warn "unknown encoding #{enc}"
        enc
      }
    end
  end
  # True when the font is marked as skewed or its name contains "italic".
  def italic?
    @skewed || !!/italic/in.match(basefont_name.to_s)
  end
  # Width of +char+ as Integer, or nil when it cannot be determined. +char+
  # may be a code, a one-character String or a glyph name.
  # NOTE(review): char[0] yields an Integer only on Ruby 1.8; on later
  # versions it is a String and is then looked up as a glyph name -- confirm
  # the Ruby versions this has to support.
  def width(char)
    if(char.is_a?(String) && char.length == 1)
      char = char[0]
    end
    _width(char) || named_width(char)
  end
  # The :widths attribute, or an empty Array.
  def widths
    @widths ||= (@attributes[:widths] || [])
  end
  # True when the base font name contains "symbol".
  def symbol?
    !!/symbol/in.match(basefont_name.to_s)
  end
  # The character map built from the :tounicode attribute, if there is one.
  def to_unicode
    @to_unicode ||= (tu = @attributes[:tounicode]) && tu.to_cmap
  end
  private
  # Code of the character the first :widths entry belongs to.
  def first_char
    @attributes[:firstchar].to_i
  end
  # Reads "<base font name>.afm" from the data/fonts directory next to this
  # file and returns a Hash of widths keyed by both code and glyph name;
  # empty when there is no readable file.
  def load_basefont_widths
    widths = {}
    path = File.join(File.dirname(__FILE__), 'data', 'fonts',
                     "%s.afm" % basefont_name)
    if(File.readable?(path))
      File.read(path).scan(AFM_PTRN) { |char, width, name|
        widths.store(char.to_i, width)
        widths.store(name, width)
      }
    end
    widths
  end
  # Width looked up through the glyph name the font's Encoding object gives
  # for +char+; nil when the encoding is not an Encoding object.
  def named_width(char)
    enc = @attributes[:encoding]
    if(enc.is_a?(Encoding))
      _width(enc.symbol_name(char))
    end
  end
  # Width of +char+ from the font's own :widths list (codes only), falling
  # back to the bundled metrics; nil when neither knows it.
  def _width(char)
    if(char.is_a? Integer)
      offset = char - first_char
      # a code below the first char must not be looked up: a negative index
      # would count from the end of the list
      width = widths.at(offset) if(offset >= 0)
    end
    width ||= basefont_width(char)
    width.to_i if(width)
  end
end
|
353
|
+
# Font descriptor object; adds no behaviour of its own to PdfObject.
class FontDescriptor < PdfObject
end
|
355
|
+
# Object of a kind that has no class of its own.
class Unknown < PdfObject
  # Returns the number that follows the "obj" keyword in the source as
  # Integer; the arguments are passed on to the conversion.
  def to_i(*args)
    found = /obj\s*(\d+)/n.match(@src)
    digits = found && found[1]
    digits.to_i(*args)
  end
end
|
360
|
+
# Resource dictionary: gives access to the fonts and xobjects it names.
class Resource < PdfObject
  # src may be a ready-made attribute Hash, the raw source of an object
  # (parsed as in PdfObject) or anything else (no attributes at all).
  def initialize(src=nil, target_encoding='utf8')
    case src
    when Hash
      @attributes = src
    when String
      super
    else
      @attributes = {}
    end
    @fonts = {}
    @xobjects = {}
  end
  # Resolves references as in PdfObject and then fills the font and xobject
  # tables from the catalogue.
  def build_tree(object_catalogue, parent=nil)
    super
    [:font, :xobject].each do |kind|
      build_attributes(kind, object_catalogue)
    end
  end
  # The font stored under +key+, or nil.
  def font(key)
    @fonts[key]
  end
  # The xobject stored under +key+, or nil.
  def xobject(key)
    @xobjects[key]
  end
  private
  # Fills @fonts / @xobjects (chosen by +type+) from the attribute of the same
  # name, which may be a Hash or a PdfHash wrapping one; anything else is
  # ignored.
  def build_attributes(type, object_catalogue)
    table = instance_variable_get "@#{type}s"
    entry = @attributes[type]
    case entry
    when Hash
      _build_attributes(table, entry, object_catalogue)
    when PdfHash
      _build_attributes(table, entry.contents, object_catalogue)
    end
  end
  # Looks up every value of +hash+ (a reference starting with the object id)
  # in the catalogue, resolves the object found and stores it in +storage+
  # under the same key; nil is stored when the catalogue has no such object.
  def _build_attributes(storage, hash, object_catalogue)
    hash.each do |name, reference|
      number = /^\d+/n.match(reference).to_s.to_i
      target = object_catalogue[number]
      target.build_tree(object_catalogue) if(target)
      storage.store(name, target)
    end
  end
end
|
402
|
+
# Trailer dictionary of a document: names the root object, the encryption
# dictionary and the file id.
class TrailerDictionary < PdfObject
  # The first run of letters and digits in the first entry of the :id
  # attribute.
  def file_id
    /[a-zA-Z0-9]+/n.match(@attributes[:id].first).to_s
  end
  # Object id the :encrypt attribute refers to (0 when there is none).
  def encrypt_id
    extract_oid(@attributes[:encrypt])
  end
=begin
  def parse_attributes
    index = @src.index('trailer')
    rindex = @src.rindex('startxref')
    unless(index && rindex)
      @attibutes = {}
    else
      #set correct offsets <<(begin) >>(end)
      src = @src[index+7..(rindex-1)]
      ast = Rpdf2txt.attributes_parser.parse(src)
      ast.compact!
      @attributes = extract_attributes(ast)
    end
  end
=end
  # Object id the :root attribute refers to (0 when there is none).
  def root_id
    extract_oid(@attributes[:root])
  end
  # Merges the attributes of +trailer_dict+ into this one; entries of
  # +trailer_dict+ win over existing ones.
  def update(trailer_dict)
    @attributes.update(trailer_dict.attributes)
  end
  protected
  # redefined here so that the reader is protected in this class; update
  # calls it on another TrailerDictionary
  attr_reader :attributes
end
|
433
|
+
class TreeNode < PdfObject
|
434
|
+
include Enumerable
|
435
|
+
attr_reader :parent
|
436
|
+
# Resolves references through the inherited build_tree, remembers +parent+
# and returns the node itself.
def build_tree(object_catalogue, parent=nil)
  super
  @parent = parent
  self
end
|
441
|
+
# Yields the node itself; subclasses with children override this.
def each
  yield self
end
|
444
|
+
# Returns the object ids found in +array+: the first number of every entry,
# as Integer; entries without a number are dropped. +array+ may also be nil
# (no ids) or a single reference String (one id).
def extract_oids(array)
  Array(array).collect { |dirty_id|
    if(match = /\d+/on.match(dirty_id))
      match[0].to_i
    end
  }.compact
end
|
451
|
+
def root?
|
452
|
+
!(@parent || @attributes[:parent])
|
453
|
+
end
|
454
|
+
end
|
455
|
+
# Tree node that delegates enumeration to the object referenced by its
# :pages attribute.
class CatalogNode < TreeNode
  # Looks up the object referenced by :pages, builds its tree with this
  # node as parent, then runs the inherited build step.
  def build_tree(object_catalogue, parent=nil)
    @pages = object_catalogue[extract_oids(@attributes[:pages]).first]
    @pages.build_tree(object_catalogue, self)
    super
  end

  # Enumerates whatever the :pages object enumerates.
  def each(&block)
    @pages.each(&block)
  end
end
|
466
|
+
# Inner node of the page tree: owns the objects referenced by :kids.
class PageNode < TreeNode
  attr_reader :kids

  # Resolves every :kids reference through +object_catalogue+, collects
  # the children in order and builds each child's tree with this node as
  # parent, then runs the inherited build step.
  def build_tree(object_catalogue, parent=nil)
    @kids = []
    extract_oids(@attributes[:kids]).each do |oid|
      kid = object_catalogue[oid]
      @kids << kid
      kid.build_tree(object_catalogue, self)
    end
    super
  end

  # Yields, child by child, every element each child enumerates.
  def each
    @kids.each do |kid|
      kid.each { |result| yield result }
    end
  end

  # The :mediabox attribute converted to floats, or +nil+ when the
  # attribute is absent.
  def media_box
    box = @attributes[:mediabox]
    box.map { |val| val.to_f } if box
  end
end
|
488
|
+
class PageLeaf < TreeNode
|
489
|
+
attr_reader :contents, :resources
|
490
|
+
# Forwards all arguments to the superclass constructor, then creates the
# TextState used by #text from @target_encoding.
def initialize(*args)
  super
  @text_state = TextState.new(@target_encoding)
end
|
494
|
+
# Resolves the page's :contents and :resources, then runs the inherited
# build step.
#
# Every :contents reference is looked up in +object_catalogue+ and kept
# in @contents; entries that respond to +build_tree+ are built with this
# page as parent. @resources becomes, depending on the :resources
# attribute: the referenced object (String), a Resource wrapping the
# inline dictionary (Hash), the parent's resources (attribute missing
# and @parent set), or an empty Resource.
def build_tree(object_catalogue, parent=nil)
  @contents = []
  extract_oids(@attributes[:contents]).each do |oid|
    stream = object_catalogue[oid]
    @contents << stream
    stream.build_tree(object_catalogue, self) if stream.respond_to?(:build_tree)
  end
  declared = @attributes[:resources]
  # NOTE(review): @parent is read here, before the `super` call below runs;
  # if it is only assigned there, the inheritance branch can only fire when
  # the tree is built a second time -- confirm.
  @resources =
    if declared.is_a?(String)
      object_catalogue[extract_oids([declared]).first]
    elsif declared.is_a?(Hash)
      Resource.new(declared)
    elsif declared.nil? && @parent
      @parent.resources
    else
      Resource.new()
    end
  @resources.build_tree(object_catalogue) if @resources.is_a?(Resource)
  super
end
|
514
|
+
# Looks up the font registered under +key+ in this page's resources.
def font(key)
  @resources.font(key)
end
|
517
|
+
# The parent's media box, or +nil+ when this page has no parent.
# NOTE(review): a :mediabox attribute on the page itself is not consulted
# here -- confirm that is intended.
def media_box
  parent ? parent.media_box : nil
end
|
522
|
+
# Extracts the page's text and reports it through +callback_handler+.
#
# All content streams are concatenated into one scratch Stream (a single
# ReferenceArray entry contributes its own streams via +build_stream+),
# the text objects are extracted from it and then handed to
# #join_snippets.
def text(callback_handler)
  combined = Stream.new('')
  sole = @contents.first
  if @contents.size == 1 && sole.is_a?(ReferenceArray)
    sole.build_stream(combined)
  else
    @contents.each { |part| combined.append(part.decoded_stream) }
  end
  @text_state.media_box = media_box
  snippets = combined.extract_text_objects(self, @text_state)
  join_snippets(snippets, callback_handler)
end
|
535
|
+
private
|
536
|
+
# Walks +text_snippets+ and calls +block+ with (previous, current) for
# every snippet that does not report a whitespace overlap with the
# previously accepted snippet. The first accepted snippet is paired with
# +nil+. Skipped snippets never become "previous".
#
# Returns the last accepted snippet, or +nil+ when none was accepted.
def each_pair(text_snippets, &block)
  accepted = nil
  text_snippets.each do |candidate|
    next if candidate.whitespace_overlap?(accepted)
    block.call(accepted, candidate)
    accepted = candidate
  end
  accepted
end
|
546
|
+
# Guesses the x positions at which columns start on the page.
#
# text_snippets:: snippets providing +space_width+, +x+, +empty?+ and
#                 +same_column+.
# hints::         optional :count (number of columns wanted) and :width
#                 (minimum column distance in grid cells, or a "a/b"
#                 String that scales the summed column distance).
#
# Returns a sorted Array of x positions, [] when no snippet qualifies.
def identify_columns(text_snippets, hints={})
  ## the narrowest positive space width on the page is the grid width
  space = text_snippets.collect { |snip|
    snip.space_width }.select { |w| w > 0 }.min || 100.0

  ## count the snippets that start a column candidate in each grid cell
  positions = {}
  each_pair(text_snippets) do |before, current|
    next if current.empty?
    next if before && current.same_column(before) && !before.empty?
    cell = (current.x / space).floor
    positions[cell] = positions[cell].to_i + 1
  end
  return [] if positions.empty?

  ## drop leading cells until one has more than two hits; it becomes the
  ## reference for the distance sum below
  sorted = positions.sort
  previous = nil
  loop do
    previous, hits = sorted.shift
    break if hits.nil? || hits > 2
  end

  ## sum the distances between the remaining well-populated cells
  total = 0
  width = 0
  counts = []
  sorted.each do |pos, hits|
    counts.push [hits, pos]
    next unless hits > 2
    total += 1
    width += pos - previous
    previous = pos
  end

  ## an explicit column count wins: take the most populated cells
  colcount = hints[:count]
  if colcount && counts.size >= colcount
    return counts.sort[-colcount..-1].collect { |_hits, pos| pos * space }.sort
  end

  cutwidth = hints[:width]
  if cutwidth.is_a?(String)
    dividend, divisor = cutwidth.split('/', 2)
    cutwidth = width * dividend.to_f / divisor.to_f
  end
  cutwidth ||= (total.nonzero?) ? width / total * 0.9 : width

  ## select probable columns
  previous = -cutwidth
  res = []
  sorted = positions.sort
  offset = sorted.first.first
  sorted.each_with_index do |(cell, hits), idx|
    pos = cell - offset
    # look ahead to the next cell with more than three hits (if any)
    ndx = idx.next
    nxtpos = nxtcount = nil
    loop do
      nxtpos, nxtcount = sorted[ndx]
      ndx += 1
      break if nxtcount.nil? || nxtcount > 3
    end
    nxtpos -= offset if nxtpos
    # far enough from the last accepted column ...
    next unless hits > 1 && (pos - previous) > cutwidth
    # ... and not overshadowed by a busier cell close behind it
    next if nxtcount.to_i > hits && (nxtpos - pos) < cutwidth
    previous = pos
    res.push pos + offset
  end
  res.collect { |pos| pos * space }.sort
end
|
613
|
+
# Sorts +text_snippets+ in place and feeds them to +callback_handler+,
# emitting column / colspan events in between when the handler asks for
# column detection.
def join_snippets(text_snippets, callback_handler)
  text_snippets.sort!
  columns = []
  if callback_handler.identify_columns?
    hints = { :width => callback_handler.column_width,
              :count => callback_handler.column_count }
    columns = identify_columns(text_snippets, hints)
    columns.shift #throw away the first column - we'll use the left media-edge
  end
  next_column = nil
  working_set = []
  each_pair(text_snippets) do |last_text_state, text_state|
    text_state.fire_early_callbacks(last_text_state, callback_handler)
    # a new line restarts the column bookkeeping
    unless last_text_state && text_state.same_line(last_text_state)
      working_set = columns.dup
      next_column = working_set.shift
      last_text_state = nil
      callback_handler.send_column
    end
    x2 = last_text_state && last_text_state.right_edge.to_i
    # emit one event for every column boundary this snippet has passed
    while next_column && (text_state.x.to_i >= next_column.to_i)
      spans = x2 && (x2 > next_column.to_i) && !last_text_state.empty?
      if spans
        callback_handler.send_colspan
      else
        callback_handler.send_column
      end
      next_column = working_set.shift
    end
    text_state.send_content(last_text_state, callback_handler)
  end
end
|
646
|
+
end
|
647
|
+
class Stream < PdfObject
|
648
|
+
# One captured numeric operand followed by optional whitespace.
number = "([0-9.-]+)\\s*"
# Six operands followed by the `cm` operator.
cm_operator = "#{number * 6}cm\\b"
# A name operand followed by the `Do` operator (both captured).
do_operator = '(/\S+)\s*(\bDo\b)'
# Matches, in this order: cm, q / Q, Do, or a BI ... ID ... EI section.
@@nontext_scan_pattern = %r!(?:#{cm_operator})|(\b[qQ]\b)|#{do_operator}|\bBI\b(.*?)\bID\b(.*?)\b(EI)\b!mn
# Two operands followed by an `l` or `m` operator.
@@hr_scan_pattern = /#{number}#{number}(\b[lm]\b)/mn
# BT / ET keywords, rejected by a negative lookahead on what follows them.
# NOTE(review): the lookahead appears meant to skip keywords inside a
# parenthesised string -- confirm.
BT_PATTERN = /\bBT\b(?!(\\[()]|[^(\\])*\))/mn
ET_PATTERN = /\bET\b(?!(\\[()]|[^(\\])*\))/mn
# Text that ends in ET preceded by "(" with no ")" in between.
FAIL_PTRN = /\((\\[()]|[^)])*\bET\b\s*$/mn
|
656
|
+
# Appends +decoded_stream+ to the decoded content, starting from an empty
# String when nothing has been decoded or appended yet. Returns the
# accumulated String.
def append(decoded_stream)
  @decoded_stream = '' unless @decoded_stream
  @decoded_stream << decoded_stream
end
|
659
|
+
# Replaces the decoded content with +decoded_stream+.
def decoded_stream=(decoded_stream)
  @decoded_stream = decoded_stream
end
|
662
|
+
# The decoded content; computed through #decode_raw_stream on first use
# and remembered afterwards.
def decoded_stream
  return @decoded_stream if @decoded_stream
  @decoded_stream = decode_raw_stream
end
|
665
|
+
# Scans +dm_src+ for `x y m` / `x y l` operators and pushes a
# HorizontalRule onto +result+ for every `l` whose y equals, and whose x
# differs from, the coordinates of the preceding `m` / `l`.
def extract_horizontal_rules(dm_src, dmatrix, result)
  last_x = 0
  last_y = 0
  dm_src.scan(@@hr_scan_pattern) do |x_src, y_src, operator|
    kind = operator.to_s[-1]
    next unless kind == ?l || kind == ?m
    x = x_src.to_f
    y = y_src.to_f
    if kind == ?l && x != last_x && y == last_y
      hr = HorizontalRule.new(x, y, dmatrix)
      hr.current_page = @page
      hr.text_state = @text_state
      result.push(hr)
    end
    # both operators move the current point
    last_x = x
    last_y = y
  end
end
|
686
|
+
# Interprets the non-text operators found in +dm_src+.
#
# dm_src::  content between two text sections.
# dmatrix:: transformation matrix in effect before +dm_src+.
# stack::   matrices saved by `q`, shared between calls.
# result::  receives an ImagePlacement for every `Do` and every inline
#           image (BI ... ID ... EI).
#
# Returns the matrix in effect after +dm_src+.
def extract_nontext_objects(dm_src, dmatrix, stack, result)
  dm_src.scan(@@nontext_scan_pattern) { |matches|
    # only the groups of the alternative that matched are non-nil
    matches = matches.compact
    case matches.last
    when 'q'
      stack.push(dmatrix)
    when 'Q'
      # An unbalanced Q (nothing saved) used to replace the matrix with
      # nil, which broke every later `cm` and placement; keep the current
      # matrix instead.
      dmatrix = stack.pop || dmatrix
    when 'Do'
      # place the object at the position of the last extracted item
      x, y = (txt = result.last) ? [txt.x, txt.y] : [0, 0]
      ip = ImagePlacement.new(matches[-2], x, y, dmatrix)
      ip.current_page, ip.text_state = @page, @text_state
      result.push ip
    when 'EI'
      attrs, data, _ = matches
      im = InlineImage.new attrs, data.strip
      ip = ImagePlacement.new im, 0, 0, dmatrix
      ip.current_page, ip.text_state = @page, @text_state
      result.push ip
    else
      # `a b c d e f cm`: multiply the six operands onto the current matrix
      mmatrix = Matrix[[matches[0].to_f, matches[1].to_f,0],
                       [matches[2].to_f, matches[3].to_f,0],
                       [matches[4].to_f, matches[5].to_f,1]]
      dmatrix = dmatrix * mmatrix
    end
  }
  dmatrix
end
|
714
|
+
# Splits the decoded stream into BT ... ET text sections and the content
# between them, and returns everything extracted from both.
#
# page::       page the stream belongs to (may be nil); its :rotate
#              attribute seeds the initial matrix.
# text_state:: text state handed to every extracted object.
#
# Returns an Array holding the results of Text#scan plus the objects
# pushed by #extract_nontext_objects and #extract_horizontal_rules.
# A stream without any ET yields an empty Array (it used to raise
# NoMethodError on +nil + 2+).
def extract_text_objects(page, text_state)
  @page, @text_state = page, text_state
  stack = []
  result = []
  startpoint = decoded_stream.index(BT_PATTERN)
  endpoint = decoded_stream.index(ET_PATTERN)
  # skip ET hits that FAIL_PTRN flags; stop when no ET is left at all
  while endpoint && FAIL_PTRN.match(decoded_stream[0..(endpoint+2)])
    endpoint = decoded_stream.index(ET_PATTERN, endpoint.next)
  end
  unless(startpoint && endpoint && (startpoint < endpoint))
    startpoint = 0
  end
  rotation = (page && Math::PI * page.attributes[:rotate].to_f / 180) || 0
  dmatrix = Matrix[[Math.cos(rotation),Math.sin(rotation),0],
                   [Math.sin(rotation),-Math.cos(rotation),0],
                   [0,0,1]]

  dm_src = decoded_stream[0...startpoint]
  while(endpoint && startpoint)
    ### pick out the bits in between Text that are relevant to
    ### text positioning (such as the device-transformation-matrix)
    ### NOTE: as far as I understand, the device matrix should
    ###       not be used to position text. However it is used
    ###       by some PDF-Creators and therefore we have to include
    ###       it in our calculations.
    dmatrix = extract_nontext_objects(dm_src, dmatrix, stack, result)
    extract_horizontal_rules(dm_src, dmatrix, result)
    tsrc = decoded_stream[startpoint..(endpoint+2)]
    while FAIL_PTRN.match(tsrc)
      # NOTE(review): with the -1 fallback the slice below ends at index 1,
      # not at the end of the stream -- confirm the intent.
      endpoint = decoded_stream.index(ET_PATTERN, endpoint + 2) || -1
      tsrc = decoded_stream[startpoint..(endpoint+2)]
    end
    text = Text.new(tsrc, @target_encoding, dmatrix)
    text.current_page = page
    text.text_state = text_state
    result.concat text.scan
    # continue with the next text section, if there is one
    startpoint = decoded_stream.index(BT_PATTERN, endpoint)
    if(startpoint)
      dm_src = decoded_stream[endpoint...startpoint]
      endpoint = decoded_stream.index(ET_PATTERN, startpoint)
    end
  end
  result
end
|
758
|
+
# The bytes between the `stream` keyword (plus one or two line-break
# characters) and the last `endstream` in @src; "" when @src contains no
# such section. The value is remembered after the first call.
#
# The capture is read directly instead of via +scan(...).to_s+, because
# Array#to_s returns the inspect form (`[["..."]]`) on Ruby >= 1.9.
def raw_stream
  @raw_stream ||= @src[/stream[\r\n]{1,2}(.*)endstream/mn, 1].to_s
end
|
761
|
+
# Decrypts (when a decoder is set) and then applies every entry of the
# :filter attribute, in order, to the raw stream.
#
# A filter that raises a StandardError -- including the error raised
# here for filters other than /FlateDecode and /LZWDecode -- is reported
# with +warn+ and skipped, so the data reaches the next filter unchanged.
#
# Returns the resulting data.
def decode_raw_stream
  # assigned first, before the decoder is handed +self+
  @decrypted_stream = raw_stream
  @decrypted_stream = @decoder.decrypt(self) unless @decoder.nil?
  filters = [@attributes[:filter]].flatten.compact
  filters.inject(@decrypted_stream) do |data, filter|
    begin
      case filter
      when "/FlateDecode"
        flate_decode(data)
      when "/LZWDecode"
        lzw_decode(data)
      else
        raise "Unimplemented filter: #{filter}"
      end
    rescue StandardError => err
      warn "'#{err.message}' when filtering with #{filter}"
      data
    end
  end
end
|
783
|
+
# Inflates zlib-compressed +data+ and returns the result.
def flate_decode(data)
  Zlib::Inflate.inflate(data)
end
|
786
|
+
# Decodes LZW-compressed +data+ with the project's LZW module.
#
# The :earlychange entry of :decodeparms is passed on (default 1). When
# the :length attribute is a plain number, +data+ is first cut to that
# many bytes. A non-numeric :length is ignored: +"12 0 R".to_i+ is 12,
# so a value written as an indirect reference used to truncate the data
# to its object number.
# NOTE(review): assumes unresolved references appear as "n g R" Strings,
# as elsewhere in this file -- confirm.
def lzw_decode(data)
  require 'rpdf2txt/lzw'
  earlychange = (parm = @attributes[:decodeparms]) && parm[:earlychange]
  length = @attributes[:length]
  if length.to_s =~ /\A\s*\d+\s*\z/
    data = data[0, length.to_i]
  end
  LZW.decode data, (earlychange || 1).to_i
end
|
794
|
+
# Builds a CMap from this object's source and target encoding.
def to_cmap
  CMap.new(@src, @target_encoding)
end
|
797
|
+
end
|
798
|
+
# Stream subclass that adds no behaviour of its own.
class ObjStream < Stream
end
|
800
|
+
class Image < Stream
|
801
|
+
COLORMAPS = {
|
802
|
+
'/DeviceRGB' => 'RGB',
|
803
|
+
'/DeviceGray' => 'I',
|
804
|
+
'/DeviceCMYK' => 'CMYK',
|
805
|
+
}
|
806
|
+
# Builds (once) and returns a Magick::Image from the decoded stream.
#
# Reads :width, :height, :bitspercomponent, :mask and :colorspace from
# the attributes. Indexed images are expanded through their colour
# table, rows that carry one extra byte are run through #idat_decode,
# surplus pixels are cut off, and the samples are scaled unless the
# colour depth already equals RMagick's quantum depth.
def image
  require 'RMagick'
  return @image if @image

  columns = @attributes[:width].to_i
  rows = @attributes[:height].to_i
  depth = @attributes[:bitspercomponent].to_i
  mask = @attributes[:mask]
  color_grades = 2 ** depth - 1
  colorspace, basespace, index_colors, index = @attributes[:colorspace]
  index_colors = index_colors.to_i
  colormap = COLORMAPS[colorspace] || COLORMAPS[basespace] || 'RGB'
  colors = colormap.length
  pixels = extract_pixels(decoded_stream, depth)

  if colorspace == '/Indexed'
    ## FIXME: this works for some images, but seems to be wrong
    #         according to the Documentation
    if mask.is_a?(Array) && (pixels.size - 1) > rows * columns
      masked = (mask[0].to_i)..(mask[1].to_i)
      pixels.delete_if { |idx| masked.include? idx }
    end
    # for indexed images, index_colors correctly describes the
    # depth of the resulting pixels, whereas bitspercomponent
    # may not be accurate
    color_grades = index_colors
    table = extract_colormap(index, index_colors)
    expanded = Array.new(pixels.size * colors)
    pixels.each_with_index do |idx, nth|
      expanded[nth * colors, colors] = table[idx * colors, colors]
    end
    pixels = expanded
  end

  ## this seems to be undocumented: PNG-images need to be decoded.
  #  we can detect this by the additional Byte per Row:
  wanted = rows * columns * colors
  if pixels.size == (columns * colors + 1) * rows
    pixels = idat_decode(pixels, columns, colors)
  elsif pixels.size > wanted
    pixels = pixels[0, wanted]
  end

  if color_grades != (2 ** Magick::QuantumDepth - 1)
    divisor = color_grades.to_f
    pixels.collect! { |px| px / divisor }
  end
  @image = Magick::Image.constitute(columns, rows, colormap, pixels)
end
|
854
|
+
# Reverses per-row filtering of the kind the inline notes label None,
# Sub, Up, Average and Paeth.
#
# data::   Array of byte values; every row is one filter byte followed by
#          width * colors sample bytes. The Array is consumed (emptied).
# width::  samples per row.
# colors:: bytes per sample.
#
# Returns a new Array with the reconstructed bytes of all rows.
# Raises ArgumentError for a filter byte other than 0..4.
def idat_decode(data, width, colors)
  scanline_length = colors * width + 1 # for filter
  byte_width = width * colors

  pixels = []
  first_row = true
  until data.empty?
    scanline = data.slice!(0, scanline_length)
    filter = scanline.shift
    case filter
    when 0 # None: bytes are stored as they are
    when 1 # Sub: add the byte one sample to the left
      scanline.each_index do |i|
        left = i < colors ? 0 : scanline[i - colors]
        scanline[i] = (scanline[i] + left) % 256
      end
    when 2 # Up: add the byte directly above
      scanline.each_index do |i|
        upper = first_row ? 0 : pixels[i - byte_width]
        scanline[i] = (upper + scanline[i]) % 256
      end
    when 3 # Average: add the mean of left and upper
      scanline.each_index do |i|
        upper = first_row ? 0 : pixels[i - byte_width]
        left = i < colors ? 0 : scanline[i - colors]
        scanline[i] = (scanline[i] + ((left + upper)/2).floor) % 256
      end
    when 4 # Paeth: add whichever neighbour #paeth selects
      scanline.each_index do |i|
        left = i < colors ? 0 : scanline[i - colors]
        if first_row
          upper = upper_left = 0
        else
          above = i - byte_width
          upper = pixels[above]
          upper_left = i < colors ? 0 : pixels[above - colors]
        end
        predicted = paeth(left, upper, upper_left)
        scanline[i] = (scanline[i] + predicted) % 256
      end
    else
      raise ArgumentError, "Invalid filter algorithm #{filter}"
    end

    pixels.concat scanline
    first_row = false
  end
  pixels
end
|
907
|
+
private
|
908
|
+
# Returns the colour table described by +index+:
# * a Stream: its decoded bytes, each AND-ed with +mask+;
# * an Array: the Array itself;
# * anything else: an empty Array.
def extract_colormap(index, mask)
  if index.is_a? Stream
    return index.decoded_stream.unpack('C*').collect { |int| int & mask }
  end
  return index if index.is_a? Array
  []
end
|
917
|
+
# Splits the byte String +stream+ into samples of +depth+ bits each.
#
# For a depth of 8 every byte is one sample. Otherwise the bit string is
# cut into consecutive groups of +depth+ bits (most significant bit
# first); trailing bits that do not fill a group are dropped.
#
# Returns an Array of Integers.
def extract_pixels(stream, depth)
  return stream.unpack('C*') if depth == 8
  bits = stream.unpack('B*').first
  bits.scan(/.{#{depth}}/n).collect { |group| group.to_i(2) }
end
|
930
|
+
# Paeth predictor: of +a+ (left), +b+ (above) and +c+ (upper left),
# returns the one closest to a + b - c. Ties are resolved in the order
# a, b, c.
def paeth(a, b, c) # left, above, upper left
  estimate = a + b - c
  dist_a, dist_b, dist_c = [a, b, c].collect { |v| (estimate - v).abs }

  if dist_a <= dist_b && dist_a <= dist_c
    a
  elsif dist_b <= dist_c
    b
  else
    c
  end
end
|
940
|
+
end
|
941
|
+
# Image defined inline in a content stream (BI ... ID ... EI). Inline
# images may abbreviate attribute keys and some names; both are expanded
# here so the Image code can work with the long forms.
class InlineImage < Image
  # abbreviated attribute key => full attribute key
  ATTR_ABBREVIATIONS = {
    :bpc => :bitspercomponent, :cs  => :colorspace,
    :d   => :decode,           :dp  => :decodeparms,
    :f   => :filter,           :h   => :height,
    :im  => :imagemask,        :i   => :interpolate,
    :w   => :width,
  }
  # abbreviated colour space / filter name => full name
  OTHER_ABBREVIATIONS = {
    '/G'    => '/DeviceGray',
    '/RGB'  => '/DeviceRGB',
    '/CMYK' => '/DeviceCMYK',
    '/I'    => '/Indexed',
    '/AHx'  => '/ASCIIHexDecode',
    '/A85'  => '/ASCII85Decode',
    '/LZW'  => '/LZWDecode',
    '/Fl'   => '/FlateDecode',
    '/RL'   => '/RunLengthDecode',
    '/CCF'  => '/CCITTFaxDecode',
    '/DCT'  => '/DCTDecode',
  }

  # attrs:: source text of the attribute entries (without the << >>).
  # data::  the image data, used as the raw stream.
  def initialize(attrs, data)
    super("<<" << attrs << ">>")
    @raw_stream = data
  end

  # Runs the inherited parser, then moves every abbreviated key to its
  # full name and expands abbreviated names in the value. Values that are
  # Arrays (e.g. an indexed colour space or a list of filters) are
  # expanded element by element; previously they were stored untouched,
  # so '/I' inside such an Array never became '/Indexed'.
  def parse_attributes
    super
    ATTR_ABBREVIATIONS.each do |abbr, key|
      if value = @attributes.delete(abbr)
        @attributes.store key, expand_abbreviations(value)
      end
    end
  end

  private

  # Replaces known abbreviations in +value+, descending into Arrays.
  def expand_abbreviations(value)
    if value.is_a?(Array)
      value.collect { |item| expand_abbreviations(item) }
    else
      OTHER_ABBREVIATIONS.fetch(value, value)
    end
  end
end
|
975
|
+
class CMap < Stream
|
976
|
+
attr_accessor :map
|
977
|
+
# Starts with an empty character map, runs the superclass constructor
# with the same arguments and then fills the map via #parse_cmap.
def initialize(*args)
  @map = {}
  super
  parse_cmap
end
|
982
|
+
# Translates +txt+ through the character map.
#
# * Without a map, +txt+ is returned unchanged.
# * An Integer is looked up directly (nil when unmapped).
# * Otherwise every byte of +txt+ is replaced by its mapped code point
#   (unmapped bytes keep their value) and the result is packed as UTF-8.
def to_utf8(txt)
  return txt if @map.nil?
  return @map[txt] if txt.is_a?(Integer)
  codepoints = txt.unpack('C*').collect { |byte| @map.fetch(byte, byte) }
  codepoints.pack('U*')
end
|
992
|
+
private
|
993
|
+
#bfchar definition
|
994
|
+
# Adds one map entry per child of +ast+: the hex value of the child's
# source maps to the hex value of its target. +ast+ is compacted in
# place first. Returns the map.
def add_to_map_bfchar(ast)
  ast.compact!
  ast.each do |pair|
    code = _hexvalue(pair.source)
    @map[code] = _hexvalue(pair.target)
  end
  @map
end
|
1002
|
+
#bfrange definition see page 457 of the pdf manual
|
1003
|
+
# Adds the entries of one bfrange node to the map.
#
# Every code from the node's start to its stop value (both hex) gets a
# target: with an 'explicit' child the targets are taken, one after the
# other, from that list (which is consumed); otherwise the targets count
# up from the node's offset value. Returns the map.
def add_to_map_bfrange(ast)
  ast.compact!
  first = ast.start.value.to_s.hex
  last = ast.stop.value.to_s.hex
  if ast.children_names.include?('explicit')
    targets = ast.explicit
    (first..last).each { |code| @map[code] = _hexvalue(targets.shift) }
  else
    base = _hexvalue(ast.offset)
    (first..last).each { |code| @map[code] = base + (code - first) }
  end
  @map
end
|
1021
|
+
# Returns the text between the first 'beginbfchar' (plus the character
# that follows the keyword) and the first 'endbfchar' of the decoded
# stream.
#
# Returns nil when either keyword is missing; a missing 'endbfchar' used
# to raise NoMethodError on +nil - 1+.
def extract_bfchar
  src = decoded_stream
  opening = src.index('beginbfchar')
  closing = src.index('endbfchar')
  return nil unless opening && closing
  # 11 characters of keyword + the one character following it
  src[(opening + 12)..(closing - 1)]
end
|
1029
|
+
# Returns the text between the first 'beginbfrange' keyword and the
# first 'endbfrange' of the decoded stream (the text starts directly
# after the 12-character keyword).
#
# Returns nil when either keyword is missing; a missing 'endbfrange'
# used to raise NoMethodError on +nil - 1+.
def extract_bfrange
  src = decoded_stream
  opening = src.index('beginbfrange')
  closing = src.index('endbfrange')
  return nil unless opening && closing
  src[(opening + 12)..(closing - 1)]
end
|
1037
|
+
# Integer value of the node's +value+, read as hexadecimal text
# (0 when the text holds no hex digits).
def _hexvalue(ast)
  digits = ast.value.to_s
  digits.to_i(16)
end
|
1040
|
+
# Fills the map from the bfchar section and then from the bfrange
# section of the decoded stream; a section that cannot be extracted is
# skipped.
def parse_cmap
  if(chars = extract_bfchar)
    add_to_map_bfchar(Rpdf2txt.cmap_parser.parse(chars))
  end
  if(ranges = extract_bfrange)
    nodes = Rpdf2txt.cmap_range_parser.parse(ranges)
    nodes.each { |node| add_to_map_bfrange(node) }
  end
end
|
1052
|
+
end
|
1053
|
+
# Array object whose entries are references to other objects, typically
# the content streams of a page.
class ReferenceArray < TreeNode
  # Collects the referenced objects that exist in +object_catalogue+
  # (unknown ids are skipped), then runs the inherited build step.
  def build_tree(object_catalogue, parent=nil)
    @contents = []
    @references.each do |oid|
      referenced = object_catalogue[oid]
      @contents << referenced if referenced
    end
    super
  end

  # Appends the decoded data of every collected object to
  # +concat_stream+ and returns +concat_stream+.
  def build_stream(concat_stream)
    @contents.each { |part| concat_stream.append(part.decoded_stream) }
    concat_stream
  end

  # Parses the text from the first '[' to the last ']' of the source and
  # keeps the object ids found in it as @references.
  def parse_attributes
    opening = @src.index('[')
    closing = @src.rindex(']')
    ast = _parse_attributes(@src[opening..closing])
    ast.compact!
    @references = extract_oids(extract_attributes(ast))
  end

  # A reference array is never the root of the tree.
  def root?
    false
  end
end
|
1077
|
+
# Array object whose parsed entries are kept as plain values.
class PdfArray < TreeNode
  # Resets the contents to an empty Array, then runs the inherited build
  # step.
  # NOTE(review): this discards what #parse_attributes stored if parsing
  # happened earlier -- confirm the call order.
  def build_tree(object_catalogue, parent=nil)
    @contents = []
    super
  end

  # Entry at position +idx+.
  def at(idx)
    @contents.at(idx)
  end

  # Enumerates the entries.
  def each(&block)
    @contents.each(&block)
  end

  # Parses the text from the first '[' to the last ']' of the source and
  # keeps the extracted values as contents.
  def parse_attributes
    opening = @src.index('[')
    closing = @src.rindex(']')
    ast = _parse_attributes(@src[opening..closing])
    ast.compact!
    @contents = extract_attributes(ast)
  end

  # An array is never the root of the tree.
  def root?
    false
  end
end
|
1098
|
+
# Dictionary object whose parsed entries are exposed through #contents.
class PdfHash < TreeNode
  attr_reader :contents

  # Resets the contents to an empty Hash, then runs the inherited build
  # step.
  # NOTE(review): this discards what #parse_attributes stored if parsing
  # happened earlier -- confirm the call order.
  def build_tree(object_catalogue, parent=nil)
    @contents = {}
    super
  end

  # Parses the text from the first '<<' to the last '>' of the source and
  # keeps the extracted values as contents.
  def parse_attributes
    opening = @src.index('<<')
    closing = @src.rindex('>')
    ast = _parse_attributes(@src[opening..closing])
    ast.compact!
    @contents = extract_attributes(ast)
  end

  # A dictionary is never the root of the tree.
  def root?
    false
  end
end
|
1114
|
+
end
|