combine_pdf 0.2.5 → 0.2.37
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/CHANGELOG.md +273 -27
- data/LICENSE.txt +2 -1
- data/README.md +69 -4
- data/lib/combine_pdf/api.rb +156 -153
- data/lib/combine_pdf/basic_writer.rb +41 -53
- data/lib/combine_pdf/decrypt.rb +238 -228
- data/lib/combine_pdf/exceptions.rb +4 -0
- data/lib/combine_pdf/filter.rb +79 -85
- data/lib/combine_pdf/fonts.rb +451 -462
- data/lib/combine_pdf/page_methods.rb +891 -946
- data/lib/combine_pdf/parser.rb +663 -531
- data/lib/combine_pdf/pdf_protected.rb +341 -126
- data/lib/combine_pdf/pdf_public.rb +492 -454
- data/lib/combine_pdf/renderer.rb +146 -141
- data/lib/combine_pdf/version.rb +1 -2
- data/lib/combine_pdf.rb +14 -18
- data/test/automated +132 -0
- data/test/console +4 -4
- data/test/named_dest +84 -0
- metadata +8 -5
- data/lib/combine_pdf/operations.rb +0 -416
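
The files listed above implement the gem's internals; they are normally driven through the documented public API (CombinePDF.load, CombinePDF.parse, <<, save). A minimal usage sketch, assuming placeholder file names, looks roughly like this:

require 'combine_pdf'

# Load an existing file from disk; CombinePDF.load reads the bytes and runs the parser over them.
pdf = CombinePDF.load('existing.pdf')

# Parse raw PDF data that is already in memory (for example, data received over the network).
overlay = CombinePDF.parse(IO.binread('stamp.pdf'))

# Stamp the overlay's first page onto every page, append a second document, then write the result.
pdf.pages.each { |page| page << overlay.pages[0] }
pdf << CombinePDF.load('appendix.pdf')
pdf.save 'combined.pdf'
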
data/lib/combine_pdf/parser.rb
CHANGED
@@ -5,535 +5,667 @@
 ## is subject to the same license.
 ########################################################
 
-[old lines 8-10 removed; their content is not shown in this diff view]
 module CombinePDF
-[old lines 12-539 removed; their content is not shown in this diff view]
+  # @!visibility private
+  # @private
+  #:nodoc: all
+
+  protected
+
+  # This is the Parser class.
+  #
+  # It takes PDF data and parses it.
+  #
+  # The information is then used to initialize a PDF object.
+  #
+  # This is an internal class. you don't need it.
+  class PDFParser
+    # @!visibility private
+
+    # the array containing all the parsed data (PDF Objects)
+    attr_reader :parsed
+    # a Float representing the PDF version of the data parsed (if exists).
+    attr_reader :version
+    # the info and root objects, as found (if found) in the PDF file.
+    #
+    # they are mainly to used to know if the file is (was) encrypted and to get more details.
+    attr_reader :info_object, :root_object, :names_object, :forms_object, :outlines_object, :metadata
+
+    attr_reader :allow_optional_content
+    # when creating a parser, it is important to set the data (String) we wish to parse.
+    #
+    # <b>the data is required and it is not possible to set the data at a later stage</b>
+    #
+    # string:: the data to be parsed, as a String object.
+    def initialize(string, options = {})
+      raise TypeError, "couldn't parse data, expecting type String" unless string.is_a? String
+      @string_to_parse = string.force_encoding(Encoding::ASCII_8BIT)
+      @literal_strings = [].dup
+      @hex_strings = [].dup
+      @streams = [].dup
+      @parsed = [].dup
+      @references = [].dup
+      @root_object = {}.dup
+      @info_object = {}.dup
+      @names_object = {}.dup
+      @outlines_object = {}.dup
+      @forms_object = {}.dup
+      @metadata = nil
+      @strings_dictionary = {}.dup # all strings are one string
+      @version = nil
+      @scanner = nil
+      @allow_optional_content = options[:allow_optional_content]
+    end
+
+    # parse the data in the new parser (the data already set through the initialize / new method)
+    def parse
+      return [] if @string_to_parse.empty?
+      return @parsed unless @parsed.empty?
+      @scanner = StringScanner.new @string_to_parse
+      @scanner.pos = 0
+      @scanner.skip(/[^%]*/) if @scanner.exist?(/%PDF/i)
+      if @scanner.scan /\%PDF\-[\d\-\.]+/
+        @version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
+        loop do
+          break unless @scanner.scan(/[^\d\r\n]+/)
+          break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/)
+          break if @scanner.eos?
+          @scanner.pos += 1
+        end
+      end
+      @parsed = _parse_
+      # puts @parsed
+
+      raise 'Unknown PDF parsing error - malformed PDF file?' unless (@parsed.select { |i| !i.is_a?(Hash) }).empty?
+
+      if @root_object == {}.freeze
+        xref_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :XRef }
+        xref_streams.each do |xref_dictionary|
+          @root_object.merge! xref_dictionary
+        end
+      end
+
+      raise 'root is unknown - cannot determine if file is Encrypted' if @root_object == {}.freeze
+
+      if @root_object[:Encrypt]
+        # change_references_to_actual_values @root_object
+        warn 'PDF is Encrypted! Attempting to decrypt - not yet fully supported.'
+        decryptor = PDFDecrypt.new @parsed, @root_object
+        decryptor.decrypt
+        # do we really need to apply to @parsed? No, there is no need.
+      end
+
+      ## search for objects streams
+      object_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm }
+      unless object_streams.empty?
+        warn 'PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects.'
+
+        object_streams.each do |o|
+          ## un-encode (using the correct filter) the object streams
+          PDFFilter.inflate_object o
+          ## extract objects from stream to top level arry @parsed
+          @scanner = StringScanner.new o[:raw_stream_content]
+          stream_data = _parse_
+          id_array = []
+          while stream_data[0].is_a? (Integer)
+            id_array << stream_data.shift
+            stream_data.shift
+          end
+          while id_array[0] && stream_data[0]
+            stream_data[0] = { indirect_without_dictionary: stream_data[0] } unless stream_data[0].is_a?(Hash)
+            stream_data[0][:indirect_reference_id] = id_array.shift
+            stream_data[0][:indirect_generation_number] = 0
+            @parsed << stream_data.shift
+          end
+        end
+      end
+
+      # serialize_objects_and_references.catalog_pages
+
+      # Benchmark.bm do |bm|
+      # bm.report("serialize") {1000.times {serialize_objects_and_references} }
+      # bm.report("serialize - old") {1000.times {old_serialize_objects_and_references} }
+      # bm.report("catalog") {1000.times {catalog_pages} }
+      # end
+
+      serialize_objects_and_references
+
+      catalog_pages
+
+      # Strings were unified, we can let them go..
+      @strings_dictionary.clear
+
+      # collect any missing objects from the forms_data
+      unless @forms_object.nil? || @forms_object.empty?
+        @forms_object[:related_objects] = (@parsed.select { |o| o[:FT] }).map! { |o| { is_reference_only: true, referenced_object: o } }
+        @forms_object[:related_objects].delete @forms_object
+      end
+
+      @info_object = @root_object[:Info] ? (@root_object[:Info][:referenced_object] || @root_object[:Info]) : false
+      if @info_object && @info_object.is_a?(Hash)
+        @parsed.delete @info_object
+        CombinePDF::PDF::PRIVATE_HASH_KEYS.each { |key| @info_object.delete key }
+        @info_object.each { |_k, v| @info_object = v[:referenced_object] if v.is_a?(Hash) && v[:referenced_object] }
+      else
+        @info_object = {}
+      end
+      # # # ## remove object streams - if they exist
+      # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :ObjStm}
+      # # # ## remove XREF dictionaries - if they exist
+      # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :XRef}
+
+      @parsed
+    end
+
+    # the actual recoursive parsing is done here.
+    #
+    # this is an internal function, but it was left exposed for posible future features.
+    def _parse_
+      out = []
+      str = ''
+      fresh = true
+      while @scanner.rest?
+        # last ||= 0
+        # out.last.tap do |o|
+        # if o.is_a?(Hash)
+        # puts "[#{@scanner.pos}] Parser has a Dictionary (#{o.class.name}) with data:"
+        # o.each do |k, v|
+        # puts " #{k}: is #{v.class.name} with data: #{v.to_s[0..4]}#{"..." if v.to_s.length > 5}"
+        # end
+        # else
+        # puts "[#{@scanner.pos}] Parser has #{o.class.name} with data: #{o.to_s[0..4]}#{"..." if o.to_s.length > 5}"
+        # end
+        # puts "next is #{@scanner.peek 8}"
+        # end unless (last == out.count) || (-1 == (last = out.count))
+        if @scanner.scan(/\[/)
+          out << _parse_
+        ##########################################
+        ## parse a Dictionary
+        ##########################################
+        elsif @scanner.scan(/<</)
+          data = _parse_
+          obj = {}
+          obj[data.shift] = data.shift while data[0]
+          out << obj
+        ##########################################
+        ## return content of array or dictionary
+        ##########################################
+        elsif @scanner.scan(/\]/) || @scanner.scan(/>>/)
+          return out
+        ##########################################
+        ## parse a Stream
+        ##########################################
+        elsif @scanner.scan(/stream[\r\n]/)
+          @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
+          # the following was dicarded because some PDF files didn't have an EOL marker as required
+          # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
+          # instead, a non-strict RegExp is used:
+          str = @scanner.scan_until(/endstream/)
+          # raise error if the stream doesn't end.
+          raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
+          # need to remove end of stream
+          if out.last.is_a? Hash
+            # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
+            out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
+          else
+            warn 'Stream not attached to dictionary!'
+            out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
+          end
+        ##########################################
+        ## parse an Object after finished
+        ##########################################
+        elsif str = @scanner.scan(/endobj/)
+          # what to do when this is an object?
+          if out.last.is_a? Hash
+            out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
+          else
+            out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
+          end
+          fresh = true
+          # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
+          out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
+          # puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last} :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Integer)
+        ##########################################
+        ## parse a Hex String
+        ##########################################
+        elsif str = @scanner.scan(/<[0-9a-fA-F]*>/)
+          # warn "Found a hex string"
+          out << unify_string([str[1..-2]].pack('H*').force_encoding(Encoding::ASCII_8BIT))
+        ##########################################
+        ## parse a Literal String
+        ##########################################
+        elsif @scanner.scan(/\(/)
+          # warn "Found a literal string"
+          str = ''.force_encoding(Encoding::ASCII_8BIT)
+          count = 1
+          while count > 0 && @scanner.rest?
+            scn = @scanner.scan_until(/[\(\)]/)
+            unless scn
+              warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
+              count = 0 # error
+              next
+            end
+
+            str += scn.to_s
+            seperator_count = 0
+            seperator_count += 1 while str[-2 - seperator_count] == '\\'
+
+            case str[-1]
+            when '('
+              ## The following solution might fail when (string ends with this sign: \\)
+              count += 1 unless seperator_count.odd?
+            when ')'
+              count -= 1 unless seperator_count.odd?
+            else
+              warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
+              count = 0 # error
+            end
+          end
+          # The PDF formatted string is: str[0..-2]
+          # now starting to convert to regular string
+          str_bytes = str.force_encoding(Encoding::ASCII_8BIT)[0..-2].bytes.to_a
+          str = []
+          until str_bytes.empty?
+            case str_bytes[0]
+            when 13 # eol - \r
+              # An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS
+              # shall be treated as a byte value of (0Ah),
+              # irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
+              str_bytes.shift
+              str_bytes.shift if str_bytes[0] == 10
+              str << 10
+            when 10 # eol - \n
+              # An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS
+              # shall be treated as a byte value of (0Ah),
+              # irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
+              str_bytes.shift
+              str_bytes.shift if str_bytes[0] == 13
+              str << 10
+            when 92 # "\\".ord == 92
+              str_bytes.shift
+              rep = str_bytes.shift
+              case rep
+              when 110 # n
+                str << 10 # new line
+              when 114 # r
+                str << 13 # CR
+              when 116 # t
+                str << 9 # tab
+              when 98 # b
+                str << 8
+              when 102 # f, form-feed
+                str << 12
+              when 48..57 # octal notation for byte?
+                rep = rep.chr
+                rep += str_bytes.shift.chr if str_bytes[0].between?(48, 57)
+                rep += str_bytes.shift.chr if str_bytes[0].between?(48, 57) && ((rep + str_bytes[0].chr).to_i <= 255)
+                str << rep.to_i
+              when 10 # new line, ignore
+                str_bytes.shift if str_bytes[0] == 13
+                true
+              when 13 # new line (or double notation for new line), ignore
+                str_bytes.shift if str_bytes[0] == 10
+                true
+              else
+                str << rep
+              end
+            else
+              str << str_bytes.shift
+            end
+          end
+          out << unify_string(str.pack('C*').force_encoding(Encoding::ASCII_8BIT))
+        ##########################################
+        ## Parse a comment
+        ##########################################
+        elsif str = @scanner.scan(/\%/)
+          # is a comment, skip until new line
+          loop do
+            # break unless @scanner.scan(/[^\d\r\n]+/)
+            break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/) || @scanner.eos? # || @scanner.scan(/[^\d]+[\r\n]+/) ||
+            @scanner.scan(/[^\d\r\n]+/) || @scanner.pos += 1
+          end
+          # puts "AFTER COMMENT: #{@scanner.peek 8}"
+        ##########################################
+        ## Parse a Name
+        ##########################################
+        # old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
+        # I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
+        # all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
+        # all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
+        elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/)
+          out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
+        ##########################################
+        ## Parse a Number
+        ##########################################
+        elsif str = @scanner.scan(/[\+\-\.\d]+/)
+          str =~ /\./ ? (out << str.to_f) : (out << str.to_i)
+        ##########################################
+        ## Parse an Object Reference
+        ##########################################
+        elsif @scanner.scan(/R/)
+          out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
+          # @references << out.last
+        ##########################################
+        ## Parse Bool - true and after false
+        ##########################################
+        elsif @scanner.scan(/true/)
+          out << true
+        elsif @scanner.scan(/false/)
+          out << false
+        ##########################################
+        ## Parse NULL - null
+        ##########################################
+        elsif @scanner.scan(/null/)
+          out << nil
+        ##########################################
+        ## XREF - check for encryption... anything else?
+        ##########################################
+        elsif @scanner.scan(/(startxref)|(xref)/)
+          ##########
+          ## get root object to check for encryption
+          @scanner.scan_until(/(trailer)|(\%EOF)/)
+          fresh = true
+          if @scanner.matched[-1] == 'r'
+            if @scanner.skip_until(/<</)
+              data = _parse_
+              @root_object ||= {}
+              @root_object[data.shift] = data.shift while data[0]
+            end
+            ##########
+            ## skip untill end of segment, maked by %%EOF
+            @scanner.skip_until(/\%\%EOF/)
+            ##########
+            ## If this was the last valid segment, ignore any trailing garbage
+            ## (issue #49 resolution)
+            break unless @scanner.exist?(/\%\%EOF/)
+
+          end
+
+        elsif @scanner.scan(/[\s]+/)
+          # Generally, do nothing
+          nil
+        elsif @scanner.scan(/obj[\s]*/)
+          # Fix wkhtmltopdf PDF authoring issue - missing 'endobj' keywords
+          unless fresh || (out[-4].nil? || out[-4].is_a?(Hash))
+            keep = []
+            keep << out.pop # .tap {|i| puts "#{i} is an ID"}
+            keep << out.pop # .tap {|i| puts "#{i} is a REF"}
+
+            if out.last.is_a? Hash
+              out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
+            else
+              out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
+            end
+            # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
+            out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
+            warn "'endobj' keyword was missing for Object ID: #{out.last[:indirect_reference_id]}, trying to auto-fix issue, but might fail."
+
+            out << keep.pop
+            out << keep.pop
+          end
+          fresh = false
+        else
+          # always advance
+          # warn "Advancing for unknown reason... #{@scanner.string[@scanner.pos - 4, 8]} ... #{@scanner.peek(4)}" unless @scanner.peek(1) =~ /[\s\n]/
+          warn 'Warning: parser advancing for unknown reason. Potential data-loss.'
+          @scanner.pos = @scanner.pos + 1
+        end
+      end
+      out
+    end
+
+    protected
+
+    # resets cataloging and pages
+    def catalog_pages(catalogs = nil, inheritance_hash = {})
+      unless catalogs
+
+        if root_object[:Root]
+          catalogs = root_object[:Root][:referenced_object] || root_object[:Root]
+        else
+          catalogs = (@parsed.select { |obj| obj[:Type] == :Catalog }).last
+        end
+
+        @parsed.delete_if { |obj| obj.nil? || obj[:Type] == :Catalog }
+        @parsed << catalogs
+
+        raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs
+      end
+      if catalogs.is_a?(Array)
+        catalogs.each { |c| catalog_pages(c, inheritance_hash) unless c.nil? }
+      elsif catalogs.is_a?(Hash)
+        if catalogs[:is_reference_only]
+          if catalogs[:referenced_object]
+            catalog_pages(catalogs[:referenced_object], inheritance_hash)
+          else
+            warn "couldn't follow reference!!! #{catalogs} not found!"
+          end
+        else
+          unless catalogs[:Type] == :Page
+            raise "Optional Content PDF files aren't supported and their pages cannot be safely extracted." if (catalogs[:AS] || catalogs[:OCProperties]) && !@allow_optional_content
+            inheritance_hash[:MediaBox] = catalogs[:MediaBox] if catalogs[:MediaBox]
+            inheritance_hash[:CropBox] = catalogs[:CropBox] if catalogs[:CropBox]
+            inheritance_hash[:Rotate] = catalogs[:Rotate] if catalogs[:Rotate]
+            if catalogs[:Resources]
+              inheritance_hash[:Resources] ||= { referenced_object: {}, is_reference_only: true }.dup
+              (inheritance_hash[:Resources][:referenced_object] || inheritance_hash[:Resources]).update((catalogs[:Resources][:referenced_object] || catalogs[:Resources]), &self.class.method(:hash_update_proc_for_old))
+            end
+            if catalogs[:ColorSpace]
+              inheritance_hash[:ColorSpace] ||= { referenced_object: {}, is_reference_only: true }.dup
+              (inheritance_hash[:ColorSpace][:referenced_object] || inheritance_hash[:ColorSpace]).update((catalogs[:ColorSpace][:referenced_object] || catalogs[:ColorSpace]), &self.class.method(:hash_update_proc_for_old))
+            end
+            # (inheritance_hash[:Resources] ||= {}).update((catalogs[:Resources][:referenced_object] || catalogs[:Resources]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Resources]
+            # (inheritance_hash[:ColorSpace] ||= {}).update((catalogs[:ColorSpace][:referenced_object] || catalogs[:ColorSpace]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:ColorSpace]
+
+            # inheritance_hash[:Order] = catalogs[:Order] if catalogs[:Order]
+            # inheritance_hash[:OCProperties] = catalogs[:OCProperties] if catalogs[:OCProperties]
+            # inheritance_hash[:AS] = catalogs[:AS] if catalogs[:AS]
+          end
+
+          case catalogs[:Type]
+          when :Page
+
+            catalogs[:MediaBox] ||= inheritance_hash[:MediaBox] if inheritance_hash[:MediaBox]
+            catalogs[:CropBox] ||= inheritance_hash[:CropBox] if inheritance_hash[:CropBox]
+            catalogs[:Rotate] ||= inheritance_hash[:Rotate] if inheritance_hash[:Rotate]
+            if inheritance_hash[:Resources]
+              catalogs[:Resources] ||= { referenced_object: {}, is_reference_only: true }.dup
+              (catalogs[:Resources][:referenced_object] || catalogs[:Resources]).update((inheritance_hash[:Resources][:referenced_object] || inheritance_hash[:Resources]), &self.class.method(:hash_update_proc_for_old))
+            end
+            if inheritance_hash[:ColorSpace]
+              catalogs[:ColorSpace] ||= { referenced_object: {}, is_reference_only: true }.dup
+              (catalogs[:ColorSpace][:referenced_object] || catalogs[:ColorSpace]).update((inheritance_hash[:ColorSpace][:referenced_object] || inheritance_hash[:ColorSpace]), &self.class.method(:hash_update_proc_for_old))
+            end
+            # (catalogs[:ColorSpace] ||= {}).update(inheritance_hash[:ColorSpace], &self.class.method(:hash_update_proc_for_old)) if inheritance_hash[:ColorSpace]
+            # catalogs[:Order] ||= inheritance_hash[:Order] if inheritance_hash[:Order]
+            # catalogs[:AS] ||= inheritance_hash[:AS] if inheritance_hash[:AS]
+            # catalogs[:OCProperties] ||= inheritance_hash[:OCProperties] if inheritance_hash[:OCProperties]
+
+            # avoide references on MediaBox, CropBox and Rotate
+            catalogs[:MediaBox] = catalogs[:MediaBox][:referenced_object][:indirect_without_dictionary] if catalogs[:MediaBox].is_a?(Hash) && catalogs[:MediaBox][:referenced_object].is_a?(Hash) && catalogs[:MediaBox][:referenced_object][:indirect_without_dictionary]
+            catalogs[:CropBox] = catalogs[:CropBox][:referenced_object][:indirect_without_dictionary] if catalogs[:CropBox].is_a?(Hash) && catalogs[:CropBox][:referenced_object].is_a?(Hash) && catalogs[:CropBox][:referenced_object][:indirect_without_dictionary]
+            catalogs[:Rotate] = catalogs[:Rotate][:referenced_object][:indirect_without_dictionary] if catalogs[:Rotate].is_a?(Hash) && catalogs[:Rotate][:referenced_object].is_a?(Hash) && catalogs[:Rotate][:referenced_object][:indirect_without_dictionary]
+
+            catalogs.instance_eval { extend Page_Methods }
+          when :Pages
+            catalog_pages(catalogs[:Kids], inheritance_hash.dup) unless catalogs[:Kids].nil?
+          when :Catalog
+            @forms_object.update((catalogs[:AcroForm][:referenced_object] || catalogs[:AcroForm]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:AcroForm]
+            @names_object.update((catalogs[:Names][:referenced_object] || catalogs[:Names]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Names]
+            @outlines_object.update((catalogs[:Outlines][:referenced_object] || catalogs[:Outlines]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Outlines]
+            if catalogs[:Dests] # convert PDF 1.1 Dests to PDF 1.2+ Dests
+              dests_arry = (@names_object[:Dests] ||= {})
+              dests_arry = ((dests_arry[:referenced_object] || dests_arry)[:Names] ||= [])
+              ((catalogs[:Dests][:referenced_object] || catalogs[:Dests])[:referenced_object] || (catalogs[:Dests][:referenced_object] || catalogs[:Dests])).each {|k,v| next if CombinePDF::PDF::PRIVATE_HASH_KEYS.include?(k); dests_arry << unify_string(k.to_s); dests_arry << v; }
+            end
+            catalog_pages(catalogs[:Pages], inheritance_hash.dup) unless catalogs[:Pages].nil?
+          end
+        end
+      end
+      self
+    end
+
+    def get_refernced_object(reference_hash = {})
+      @parsed.each do |stored_object|
+        return stored_object if stored_object.is_a?(Hash) &&
+                                reference_hash[:indirect_reference_id] == stored_object[:indirect_reference_id] &&
+                                reference_hash[:indirect_generation_number] == stored_object[:indirect_generation_number]
+        # return (stored_object[:indirect_without_dictionary] || stored_object) if stored_object.is_a?(Hash) &&
+        # reference_hash[:indirect_reference_id] == stored_object[:indirect_reference_id] &&
+        # reference_hash[:indirect_generation_number] == stored_object[:indirect_generation_number]
+      end
+      warn "didn't find reference #{reference_hash}"
+      nil
+    end
+
+    # # @private
+    # # connects references and objects, according to their reference id's.
+    # #
+    # # should be moved to the parser's workflow.
+    # #
+    # def serialize_objects_and_references_old
+    # obj_dir = {}
+    # # create a dictionary for referenced objects (no value resolution at this point)
+    # @parsed.each { |o| obj_dir[[o.delete(:indirect_reference_id), o.delete(:indirect_generation_number)]] = o }
+    # # @parsed.each {|o| obj_dir[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
+    # @references.each do |obj|
+    # obj[:referenced_object] = obj_dir[[obj[:indirect_reference_id], obj[:indirect_generation_number]]]
+    # warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj}" unless obj[:referenced_object]
+    # obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
+    # end
+    # obj_dir.clear
+    # @references.clear
+    # self
+    # end
+
+    # @private
+    # connects references and objects, according to their reference id's.
+    #
+    # Also replaces :indirect_without_dictionary objects with their actual values. Strings, Hashes and Arrays still share memory space.
+    #
+    # should be moved to the parser's workflow.
+    #
+    def serialize_objects_and_references
+      obj_dir = {}
+      # create a dictionary for referenced objects (no value resolution at this point)
+      # @parsed.each { |o| obj_dir[[o.delete(:indirect_reference_id), o.delete(:indirect_generation_number)]] = o }
+      @parsed.each { |o| obj_dir[[o[:indirect_reference_id], o[:indirect_generation_number]]] = o }
+      should_resolve = [@parsed, @root_object]
+      while should_resolve.count > 0
+        obj = should_resolve.pop
+        if obj.is_a?(Hash)
+          obj.keys.each do |k|
+            o = obj[k]
+            if o.is_a?(Hash)
+              if o[:is_reference_only]
+                if o[:indirect_reference_id].nil?
+                  o = nil
+                else
+                  o[:referenced_object] = obj_dir[[o[:indirect_reference_id], o[:indirect_generation_number]]]
+                  warn "Couldn't connect reference for #{o}" if o[:referenced_object].nil?
+                  o.delete :indirect_reference_id
+                  o.delete :indirect_generation_number
+                  o = (o[:referenced_object] && o[:referenced_object][:indirect_without_dictionary]) || o
+                end
+                obj[k] = o
+              else
+                should_resolve << o
+              end
+            elsif o.is_a?(Array)
+              should_resolve << o
+            end
+          end
+        elsif obj.is_a?(Array)
+          obj.map! do |o|
+            if o.is_a?(Hash)
+              if o[:is_reference_only]
+                if o[:indirect_reference_id].nil?
+                  o = nil
+                else
+                  o[:referenced_object] = obj_dir[[o[:indirect_reference_id], o[:indirect_generation_number]]]
+                  warn "Couldn't connect reference for #{o}" if o[:referenced_object].nil?
+                  o.delete :indirect_reference_id
+                  o.delete :indirect_generation_number
+                  o = (o[:referenced_object] && o[:referenced_object][:indirect_without_dictionary]) || o
+                end
+              else
+                should_resolve << o
+              end
+            elsif o.is_a?(Array)
+              should_resolve << o
+            end
+            o
+          end
+        end
+      end
+    end
+
+    # def serialize_objects_and_references
+    # rec_resolve = proc do |level|
+    # if level.is_a?(Hash)
+    # if level[:is_reference_only]
+    # level[:referenced_object] = get_refernced_object(level)
+    # level = (level[:referenced_object] && level[:referenced_object][:indirect_without_dictionary]) || level
+    # level.delete :indirect_reference_id
+    # level.delete :indirect_generation_number
+    # else
+    # level.keys.each do |k|
+    # level[k] = rec_resolve.call(level[k]) unless level[k].is_a?(Hash) && level[k][:indirect_reference_id] && level[k][:is_reference_only].nil?
+    # end
+    # end
+    # elsif level.is_a?(Array)
+    # level.map! { |o| rec_resolve.call(o) }
+    # end
+    # level
+    # end
+    # rec_resolve.call(@root_object)
+    # rec_resolve.call(@parsed)
+    # self
+    # end
+
+    # All Strings are one String
+    def unify_string(str)
+      @strings_dictionary[str] ||= str
+    end
+
+    # @private
+    # this method reviews a Hash and updates it by merging Hash data,
+    # preffering the old over the new.
+    def self.hash_update_proc_for_old(_key, old_data, new_data)
+      if old_data.is_a? Hash
+        old_data.merge(new_data, &method(:hash_update_proc_for_old))
+      else
+        old_data
+      end
+    end
+
+    # @private
+    # this method reviews a Hash an updates it by merging Hash data,
+    # preffering the new over the old.
+    def self.hash_update_proc_for_new(_key, old_data, new_data)
+      if old_data.is_a? Hash
+        old_data.merge(new_data, &method(:hash_update_proc_for_new))
+      else
+        new_data
+      end
+    end
+
+    # # run block of code on evey PDF object (PDF objects are class Hash)
+    # def each_object(object, limit_references = true, already_visited = {}, &block)
+    # unless limit_references
+    # already_visited[object.object_id] = true
+    # end
+    # case
+    # when object.is_a?(Array)
+    # object.each {|obj| each_object(obj, limit_references, already_visited, &block)}
+    # when object.is_a?(Hash)
+    # yield(object)
+    # unless limit_references && object[:is_reference_only]
+    # object.each do |k,v|
+    # each_object(v, limit_references, already_visited, &block) unless already_visited[v.object_id]
+    # end
+    # end
+    # end
+    # end
+  end
+end
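
To follow the data flow in the parser above: _parse_ turns each "id generation R" token sequence into a reference hash, and serialize_objects_and_references later resolves those hashes against the [id, generation] dictionary it builds from @parsed. A small illustration, derived from reading the code in this diff (the PDF fragment and variable names are only illustrative):

# For "1 0 obj << /Length 3 0 R >> endobj" and "3 0 obj 42 endobj",
# _parse_ produces hashes along these lines:
length_ref   = { is_reference_only: true, indirect_reference_id: 3, indirect_generation_number: 0 }
object_one   = { Length: length_ref, indirect_reference_id: 1, indirect_generation_number: 0 }
object_three = { indirect_without_dictionary: 42, indirect_reference_id: 3, indirect_generation_number: 0 }

# serialize_objects_and_references looks the reference up under obj_dir[[3, 0]], removes the
# id/generation keys from the reference hash and, because the target carries
# :indirect_without_dictionary, swaps the reference for the plain value:
object_one[:Length]   # => 42 after resolution
# A reference to a dictionary object (a page, for instance) would instead keep
# { is_reference_only: true, referenced_object: { ... } }, so both places share one object.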