combine_pdf 0.2.5 → 0.2.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/CHANGELOG.md +273 -27
- data/LICENSE.txt +2 -1
- data/README.md +69 -4
- data/lib/combine_pdf/api.rb +156 -153
- data/lib/combine_pdf/basic_writer.rb +41 -53
- data/lib/combine_pdf/decrypt.rb +238 -228
- data/lib/combine_pdf/exceptions.rb +4 -0
- data/lib/combine_pdf/filter.rb +79 -85
- data/lib/combine_pdf/fonts.rb +451 -462
- data/lib/combine_pdf/page_methods.rb +891 -946
- data/lib/combine_pdf/parser.rb +663 -531
- data/lib/combine_pdf/pdf_protected.rb +341 -126
- data/lib/combine_pdf/pdf_public.rb +492 -454
- data/lib/combine_pdf/renderer.rb +146 -141
- data/lib/combine_pdf/version.rb +1 -2
- data/lib/combine_pdf.rb +14 -18
- data/test/automated +132 -0
- data/test/console +4 -4
- data/test/named_dest +84 -0
- metadata +8 -5
- data/lib/combine_pdf/operations.rb +0 -416
data/lib/combine_pdf/parser.rb
CHANGED
@@ -5,535 +5,667 @@
|
|
5
5
|
## is subject to the same license.
|
6
6
|
########################################################
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
8
|
module CombinePDF
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
9
|
+
# @!visibility private
|
10
|
+
# @private
|
11
|
+
#:nodoc: all
|
12
|
+
|
13
|
+
protected
|
14
|
+
|
15
|
+
# This is the Parser class.
|
16
|
+
#
|
17
|
+
# It takes PDF data and parses it.
|
18
|
+
#
|
19
|
+
# The information is then used to initialize a PDF object.
|
20
|
+
#
|
21
|
+
# This is an internal class. you don't need it.
|
22
|
+
class PDFParser
|
23
|
+
# @!visibility private
|
24
|
+
|
25
|
+
# the array containing all the parsed data (PDF Objects)
|
26
|
+
attr_reader :parsed
|
27
|
+
# a Float representing the PDF version of the data parsed (if exists).
|
28
|
+
attr_reader :version
|
29
|
+
# the info and root objects, as found (if found) in the PDF file.
|
30
|
+
#
|
31
|
+
# they are mainly used to know if the file is (was) encrypted and to get more details.
|
32
|
+
attr_reader :info_object, :root_object, :names_object, :forms_object, :outlines_object, :metadata
|
33
|
+
|
34
|
+
attr_reader :allow_optional_content
|
35
|
+
# when creating a parser, it is important to set the data (String) we wish to parse.
|
36
|
+
#
|
37
|
+
# <b>the data is required and it is not possible to set the data at a later stage</b>
|
38
|
+
#
|
39
|
+
# string:: the data to be parsed, as a String object.
|
40
|
+
# Stores the raw PDF data and resets every piece of parser state.
# Nothing is parsed here; the work happens in #parse.
#
# string:: the data to be parsed, as a String. Its encoding is forced to binary (ASCII-8BIT).
#          An unfrozen String is re-tagged in place; a frozen String is duplicated first.
# options:: an optional Hash. Only +:allow_optional_content+ is read here and it is stored as given.
#
# Raises TypeError unless +string+ is a String.
def initialize(string, options = {})
  raise TypeError, "couldn't parse data, expecting type String" unless string.is_a? String
  # force_encoding mutates its receiver and raises on a frozen String, so copy frozen input first.
  string = string.dup if string.frozen?
  @string_to_parse = string.force_encoding(Encoding::ASCII_8BIT)
  @literal_strings = []
  @hex_strings = []
  @streams = []
  @parsed = []
  @references = []
  @root_object = {}
  @info_object = {}
  @names_object = {}
  @outlines_object = {}
  @forms_object = {}
  @metadata = nil
  @strings_dictionary = {} # all strings are one string
  @version = nil
  @scanner = nil
  @allow_optional_content = options[:allow_optional_content]
end
|
59
|
+
|
60
|
+
# parse the data in the new parser (the data already set through the initialize / new method)
|
61
|
+
# Parses the data given to #initialize and returns the Array of parsed PDF objects (Hashes).
#
# Returns an empty Array when the data is empty, and the previously parsed Array
# when parsing has already produced objects.
#
# Raises RuntimeError when the low-level parse leaves non-Hash items at the top level,
# or when no trailer / XRef dictionary could be found.
def parse
  return [] if @string_to_parse.empty?
  return @parsed unless @parsed.empty?

  @scanner = StringScanner.new @string_to_parse
  @scanner.pos = 0
  # skip anything that precedes the first '%' when a PDF header exists somewhere ahead
  @scanner.skip(/[^%]*/) if @scanner.exist?(/%PDF/i)
  if @scanner.scan(/\%PDF\-[\d\-\.]+/)
    @version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
    # step over the remainder of the header line(s) until an object or an end-of-line is next
    loop do
      break unless @scanner.scan(/[^\d\r\n]+/)
      break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/)
      break if @scanner.eos?
      @scanner.pos += 1
    end
  end
  @parsed = _parse_

  raise 'Unknown PDF parsing error - malformed PDF file?' unless @parsed.all? { |i| i.is_a?(Hash) }

  # no classic trailer was found: fall back to merging every XRef stream dictionary
  if @root_object.empty?
    xref_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :XRef }
    xref_streams.each { |xref_dictionary| @root_object.merge! xref_dictionary }
  end

  raise 'root is unknown - cannot determine if file is Encrypted' if @root_object.empty?

  if @root_object[:Encrypt]
    warn 'PDF is Encrypted! Attempting to decrypt - not yet fully supported.'
    decryptor = PDFDecrypt.new @parsed, @root_object
    decryptor.decrypt
  end

  ## search for objects streams
  object_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm }
  unless object_streams.empty?
    warn 'PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects.'

    object_streams.each do |o|
      ## un-encode (using the correct filter) the object streams
      PDFFilter.inflate_object o
      ## extract objects from stream to top level array @parsed
      @scanner = StringScanner.new o[:raw_stream_content]
      stream_data = _parse_
      # the stream opens with Integer pairs; the first of each pair is kept as an object id,
      # the second is discarded
      id_array = []
      while stream_data[0].is_a?(Integer)
        id_array << stream_data.shift
        stream_data.shift
      end
      # what remains are the objects themselves, in the same order as the ids
      while id_array[0] && stream_data[0]
        stream_data[0] = { indirect_without_dictionary: stream_data[0] } unless stream_data[0].is_a?(Hash)
        stream_data[0][:indirect_reference_id] = id_array.shift
        stream_data[0][:indirect_generation_number] = 0
        @parsed << stream_data.shift
      end
    end
  end

  serialize_objects_and_references

  catalog_pages

  # Strings were unified, we can let them go..
  @strings_dictionary.clear

  # collect any missing objects from the forms_data
  if @forms_object && !@forms_object.empty?
    @forms_object[:related_objects] = (@parsed.select { |o| o[:FT] }).map! { |o| { is_reference_only: true, referenced_object: o } }
    @forms_object[:related_objects].delete @forms_object
  end

  @info_object = @root_object[:Info] ? (@root_object[:Info][:referenced_object] || @root_object[:Info]) : false
  if @info_object.is_a?(Hash)
    @parsed.delete @info_object
    CombinePDF::PDF::PRIVATE_HASH_KEYS.each { |key| @info_object.delete key }
    # resolve each referenced value in place; the dictionary itself must stay the Info object
    @info_object.each { |k, v| @info_object[k] = v[:referenced_object] if v.is_a?(Hash) && v[:referenced_object] }
  else
    @info_object = {}
  end

  @parsed
end
|
159
|
+
|
160
|
+
# the actual recursive parsing is done here.
|
161
|
+
#
|
162
|
+
# this is an internal function, but it was left exposed for possible future features.
|
163
|
+
# Recursive tokenizer / parser working on @scanner.
#
# Consumes the scanner until its end, or until the closing "]" / ">>" of the array or
# dictionary that a recursive call was started for, and returns the collected items
# as an Array. Finished indirect objects appear in that Array as Hashes carrying
# :indirect_reference_id and :indirect_generation_number.
#
# Side effects: advances @scanner, fills @root_object from any trailer dictionary,
# and passes every string through unify_string.
#
# Raises RuntimeError when a "stream" keyword has no matching "endstream".
def _parse_
  out = []
  # true until an "obj" keyword is met; reset once an object (or an xref section) is closed
  fresh = true
  while @scanner.rest?
    if @scanner.scan(/\[/)
      ##########################################
      ## parse an Array
      ##########################################
      out << _parse_
    elsif @scanner.scan(/<</)
      ##########################################
      ## parse a Dictionary
      ##########################################
      data = _parse_
      obj = {}
      obj[data.shift] = data.shift while data[0]
      out << obj
    elsif @scanner.scan(/\]/) || @scanner.scan(/>>/)
      ##########################################
      ## return content of array or dictionary
      ##########################################
      return out
    elsif @scanner.scan(/stream[\r\n]/)
      ##########################################
      ## parse a Stream
      ##########################################
      # swallow the "\n" of a "\r\n" pair that follows the keyword
      @scanner.pos += 1 if @scanner.peek(1) == "\n" && @scanner.matched[-1] != "\n"
      # a non-strict RegExp is used, because some PDF files lack the required EOL marker before 'endstream'
      str = @scanner.scan_until(/endstream/)
      # raise error if the stream doesn't end.
      raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
      # remove the keyword and a single optional EOL that precedes it
      content = str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
      if out.last.is_a? Hash
        out.last[:raw_stream_content] = unify_string(content)
      else
        warn 'Stream not attached to dictionary!'
        out << content
      end
    elsif @scanner.scan(/endobj/)
      ##########################################
      ## parse an Object after finished
      ##########################################
      # the stack holds: id, generation, value - popped in reverse order
      if out.last.is_a? Hash
        out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
      else
        out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
      end
      fresh = true
      # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
      out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
    elsif (str = @scanner.scan(/<[0-9a-fA-F]*>/))
      ##########################################
      ## parse a Hex String
      ##########################################
      out << unify_string([str[1..-2]].pack('H*').force_encoding(Encoding::ASCII_8BIT))
    elsif @scanner.scan(/\(/)
      ##########################################
      ## parse a Literal String
      ##########################################
      str = ''.force_encoding(Encoding::ASCII_8BIT)
      # depth of unescaped parentheses; the string ends when it drops to zero
      count = 1
      while count > 0 && @scanner.rest?
        scn = @scanner.scan_until(/[\(\)]/)
        unless scn
          warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
          break # error
        end

        str << scn
        # count the backslashes right before the parenthesis; an odd count means it is escaped
        seperator_count = 0
        seperator_count += 1 while str[-2 - seperator_count] == '\\'

        case str[-1]
        when '('
          count += 1 unless seperator_count.odd?
        when ')'
          count -= 1 unless seperator_count.odd?
        else
          warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
          count = 0 # error
        end
      end
      # The PDF formatted string is: str[0..-2]
      # now starting to convert to regular string
      str_bytes = str.force_encoding(Encoding::ASCII_8BIT)[0..-2].bytes.to_a
      decoded = []
      until str_bytes.empty?
        case str_bytes[0]
        when 13 # eol - \r
          # An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS
          # shall be treated as a byte value of (0Ah),
          # irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
          str_bytes.shift
          str_bytes.shift if str_bytes[0] == 10
          decoded << 10
        when 10 # eol - \n
          str_bytes.shift
          str_bytes.shift if str_bytes[0] == 13
          decoded << 10
        when 92 # "\\".ord == 92
          str_bytes.shift
          rep = str_bytes.shift
          case rep
          when 110 # n
            decoded << 10 # new line
          when 114 # r
            decoded << 13 # CR
          when 116 # t
            decoded << 9 # tab
          when 98 # b
            decoded << 8
          when 102 # f, form-feed
            decoded << 12
          when 48..55 # octal notation for a byte: one to three digits between 0 and 7
            value = rep - 48
            2.times do
              break unless str_bytes[0] && str_bytes[0].between?(48, 55)
              value = (value << 3) + (str_bytes.shift - 48)
            end
            decoded << (value & 0xFF) # high-order overflow is ignored
          when 10 # escaped new line, ignore
            str_bytes.shift if str_bytes[0] == 13
          when 13 # escaped new line (or double notation for new line), ignore
            str_bytes.shift if str_bytes[0] == 10
          when nil
            # dangling backslash at the very end of the data - nothing to add
          else
            # any other character: the backslash is dropped and the character is kept
            decoded << rep
          end
        else
          decoded << str_bytes.shift
        end
      end
      out << unify_string(decoded.pack('C*').force_encoding(Encoding::ASCII_8BIT))
    elsif @scanner.scan(/\%/)
      ##########################################
      ## Parse a comment
      ##########################################
      # is a comment, skip until new line (or until something that looks like the start of an object)
      loop do
        break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/) || @scanner.eos?
        @scanner.scan(/[^\d\r\n]+/) || @scanner.pos += 1
      end
    elsif (str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/))
      ##########################################
      ## Parse a Name
      ##########################################
      # a name is every character that isn't white space or a delimiter; "#xx" is a hex escaped byte
      out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
    elsif (str = @scanner.scan(/[\+\-\.\d]+/))
      ##########################################
      ## Parse a Number
      ##########################################
      str =~ /\./ ? (out << str.to_f) : (out << str.to_i)
    elsif @scanner.scan(/R/)
      ##########################################
      ## Parse an Object Reference
      ##########################################
      out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
    elsif @scanner.scan(/true/)
      ##########################################
      ## Parse Bool - true and after false
      ##########################################
      out << true
    elsif @scanner.scan(/false/)
      out << false
    elsif @scanner.scan(/null/)
      ##########################################
      ## Parse NULL - null
      ##########################################
      out << nil
    elsif @scanner.scan(/(startxref)|(xref)/)
      ##########################################
      ## XREF - check for encryption... anything else?
      ##########################################
      ## get root object to check for encryption
      found = @scanner.scan_until(/(trailer)|(\%EOF)/)
      fresh = true
      # 'r' is the last letter of "trailer"; the match is nil when neither keyword follows
      if found && @scanner.matched[-1] == 'r'
        if @scanner.skip_until(/<</)
          data = _parse_
          @root_object ||= {}
          @root_object[data.shift] = data.shift while data[0]
        end
        ##########
        ## skip until end of segment, marked by %%EOF
        @scanner.skip_until(/\%\%EOF/)
        ##########
        ## If this was the last valid segment, ignore any trailing garbage
        ## (issue #49 resolution)
        break unless @scanner.exist?(/\%\%EOF/)
      end
    elsif @scanner.scan(/[\s]+/)
      # Generally, do nothing
      nil
    elsif @scanner.scan(/obj[\s]*/)
      # Fix wkhtmltopdf PDF authoring issue - missing 'endobj' keywords
      unless fresh || (out[-4].nil? || out[-4].is_a?(Hash))
        # set aside the id and generation of the object that is just starting
        keep = []
        keep << out.pop
        keep << out.pop

        # close the previous object exactly as 'endobj' would have
        if out.last.is_a? Hash
          out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
        else
          out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
        end
        # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
        out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
        warn "'endobj' keyword was missing for Object ID: #{out.last[:indirect_reference_id]}, trying to auto-fix issue, but might fail."

        out << keep.pop
        out << keep.pop
      end
      fresh = false
    else
      # always advance
      warn 'Warning: parser advancing for unknown reason. Potential data-loss.'
      @scanner.pos = @scanner.pos + 1
    end
  end
  out
end
|
416
|
+
|
417
|
+
protected
|
418
|
+
|
419
|
+
# resets cataloging and pages
|
420
|
+
# Walks the page tree, copies inheritable attributes down to every page and
# extends each :Page Hash with Page_Methods.
#
# catalogs:: the node(s) to walk: a Hash, an Array of nodes, or nil. When nil, the walk
#            starts at root_object[:Root] or, failing that, at the last :Catalog object in
#            @parsed; @parsed is then purged of nils and :Catalog objects and the chosen
#            catalog is appended to it.
# inheritance_hash:: attributes gathered from the ancestors of +catalogs+
#                    (:MediaBox, :CropBox, :Rotate, :Resources, :ColorSpace).
#
# Returns self.
#
# Raises RuntimeError when no catalog can be found, or when a non-page node carries :AS or
# :OCProperties while @allow_optional_content isn't set.
def catalog_pages(catalogs = nil, inheritance_hash = {})
  unless catalogs
    catalogs = if root_object[:Root]
                 root_object[:Root][:referenced_object] || root_object[:Root]
               else
                 (@parsed.select { |obj| obj[:Type] == :Catalog }).last
               end

    # fail before touching @parsed, so that a missing catalog never leaves a nil entry behind
    raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs

    @parsed.delete_if { |obj| obj.nil? || obj[:Type] == :Catalog }
    @parsed << catalogs
  end

  if catalogs.is_a?(Array)
    catalogs.each { |c| catalog_pages(c, inheritance_hash) unless c.nil? }
  elsif catalogs.is_a?(Hash)
    if catalogs[:is_reference_only]
      if catalogs[:referenced_object]
        catalog_pages(catalogs[:referenced_object], inheritance_hash)
      else
        warn "couldn't follow reference!!! #{catalogs} not found!"
      end
    else
      unless catalogs[:Type] == :Page
        raise "Optional Content PDF files aren't supported and their pages cannot be safely extracted." if (catalogs[:AS] || catalogs[:OCProperties]) && !@allow_optional_content
        # remember whatever this node offers for its descendants
        %i[MediaBox CropBox Rotate].each do |key|
          inheritance_hash[key] = catalogs[key] if catalogs[key]
        end
        %i[Resources ColorSpace].each do |key|
          next unless catalogs[key]
          inheritance_hash[key] ||= { referenced_object: {}, is_reference_only: true }
          inherited = inheritance_hash[key][:referenced_object] || inheritance_hash[key]
          offered = catalogs[key][:referenced_object] || catalogs[key]
          inherited.update(offered, &self.class.method(:hash_update_proc_for_old))
        end
      end

      case catalogs[:Type]
      when :Page
        # values that the page defines itself take precedence over inherited ones
        %i[MediaBox CropBox Rotate].each do |key|
          catalogs[key] ||= inheritance_hash[key] if inheritance_hash[key]
        end
        %i[Resources ColorSpace].each do |key|
          next unless inheritance_hash[key]
          catalogs[key] ||= { referenced_object: {}, is_reference_only: true }
          own = catalogs[key][:referenced_object] || catalogs[key]
          inherited = inheritance_hash[key][:referenced_object] || inheritance_hash[key]
          own.update(inherited, &self.class.method(:hash_update_proc_for_old))
        end

        # avoid references on MediaBox, CropBox and Rotate
        %i[MediaBox CropBox Rotate].each do |key|
          value = catalogs[key]
          next unless value.is_a?(Hash) && value[:referenced_object].is_a?(Hash) && value[:referenced_object][:indirect_without_dictionary]
          catalogs[key] = value[:referenced_object][:indirect_without_dictionary]
        end

        catalogs.instance_eval { extend Page_Methods }
      when :Pages
        # every branch gets its own copy, so siblings don't see each other's attributes
        catalog_pages(catalogs[:Kids], inheritance_hash.dup) unless catalogs[:Kids].nil?
      when :Catalog
        @forms_object.update((catalogs[:AcroForm][:referenced_object] || catalogs[:AcroForm]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:AcroForm]
        @names_object.update((catalogs[:Names][:referenced_object] || catalogs[:Names]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Names]
        @outlines_object.update((catalogs[:Outlines][:referenced_object] || catalogs[:Outlines]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Outlines]
        if catalogs[:Dests] # convert PDF 1.1 Dests to PDF 1.2+ Dests
          dests_arry = (@names_object[:Dests] ||= {})
          dests_arry = ((dests_arry[:referenced_object] || dests_arry)[:Names] ||= [])
          old_dests = catalogs[:Dests][:referenced_object] || catalogs[:Dests]
          old_dests = old_dests[:referenced_object] || old_dests
          old_dests.each do |k, v|
            next if CombinePDF::PDF::PRIVATE_HASH_KEYS.include?(k)
            dests_arry << unify_string(k.to_s)
            dests_arry << v
          end
        end
        catalog_pages(catalogs[:Pages], inheritance_hash.dup) unless catalogs[:Pages].nil?
      end
    end
  end
  self
end
|
507
|
+
|
508
|
+
# Looks up, among the parsed objects, the Hash whose :indirect_reference_id and
# :indirect_generation_number both equal those of +reference_hash+.
#
# reference_hash:: a Hash holding the :indirect_reference_id and
#                  :indirect_generation_number to look for.
#
# Returns the first matching object, or nil (after emitting a warning) when none matches.
def get_refernced_object(reference_hash = {})
  found = @parsed.find do |stored_object|
    stored_object.is_a?(Hash) &&
      reference_hash[:indirect_reference_id] == stored_object[:indirect_reference_id] &&
      reference_hash[:indirect_generation_number] == stored_object[:indirect_generation_number]
  end
  return found if found

  warn "didn't find reference #{reference_hash}"
  nil
end
|
520
|
+
|
521
|
+
# # @private
|
522
|
+
# # connects references and objects, according to their reference id's.
|
523
|
+
# #
|
524
|
+
# # should be moved to the parser's workflow.
|
525
|
+
# #
|
526
|
+
# def serialize_objects_and_references_old
|
527
|
+
# obj_dir = {}
|
528
|
+
# # create a dictionary for referenced objects (no value resolution at this point)
|
529
|
+
# @parsed.each { |o| obj_dir[[o.delete(:indirect_reference_id), o.delete(:indirect_generation_number)]] = o }
|
530
|
+
# # @parsed.each {|o| obj_dir[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
|
531
|
+
# @references.each do |obj|
|
532
|
+
# obj[:referenced_object] = obj_dir[[obj[:indirect_reference_id], obj[:indirect_generation_number]]]
|
533
|
+
# warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj}" unless obj[:referenced_object]
|
534
|
+
# obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
|
535
|
+
# end
|
536
|
+
# obj_dir.clear
|
537
|
+
# @references.clear
|
538
|
+
# self
|
539
|
+
# end
|
540
|
+
|
541
|
+
# @private
|
542
|
+
# connects references and objects, according to their reference id's.
|
543
|
+
#
|
544
|
+
# Also replaces :indirect_without_dictionary objects with their actual values. Strings, Hashes and Arrays still share memory space.
|
545
|
+
#
|
546
|
+
# should be moved to the parser's workflow.
|
547
|
+
#
|
548
|
+
# Implementation notes:
#
# * An index of every object in @parsed is built first, keyed by
#   [:indirect_reference_id, :indirect_generation_number].
# * @root_object and @parsed are then walked iteratively (explicit stack,
#   no recursion). Nested Arrays and non-reference Hashes are pushed on
#   the stack; reference Hashes (:is_reference_only) are resolved in place
#   and are NOT descended into - the objects they point to are reached
#   through @parsed itself.
# * A reference Hash that no longer carries an :indirect_reference_id is
#   replaced by nil.
# * The while/until loop is the last expression, so the method returns nil.
def serialize_objects_and_references
  # dictionary for referenced objects (no value resolution at this point)
  obj_dir = {}
  @parsed.each { |o| obj_dir[[o[:indirect_reference_id], o[:indirect_generation_number]]] = o }

  # LIFO work list: @root_object is popped (and therefore resolved) first.
  pending = [@parsed, @root_object]

  # Returns the value that should replace +value+ inside its container.
  # Containers that still need walking are queued on +pending+ as a side effect.
  resolve = lambda do |value|
    if value.is_a?(Array) || (value.is_a?(Hash) && !value[:is_reference_only])
      pending << value
      value
    elsif value.is_a?(Hash)
      if value[:indirect_reference_id].nil?
        # a reference without an id cannot be connected to anything
        nil
      else
        target = obj_dir[[value[:indirect_reference_id], value[:indirect_generation_number]]]
        value[:referenced_object] = target
        warn "Couldn't connect reference for #{value}" if target.nil?
        value.delete :indirect_reference_id
        value.delete :indirect_generation_number
        # objects stored without a dictionary are inlined in place of the reference
        (target && target[:indirect_without_dictionary]) || value
      end
    else
      value
    end
  end

  until pending.empty?
    container = pending.pop
    if container.is_a?(Hash)
      # iterate over a snapshot of the keys, values are rewritten in place
      container.keys.each { |k| container[k] = resolve.call(container[k]) }
    elsif container.is_a?(Array)
      container.map! { |item| resolve.call(item) }
    end
  end
end
|
602
|
+
|
603
|
+
# def serialize_objects_and_references
|
604
|
+
# rec_resolve = proc do |level|
|
605
|
+
# if level.is_a?(Hash)
|
606
|
+
# if level[:is_reference_only]
|
607
|
+
# level[:referenced_object] = get_refernced_object(level)
|
608
|
+
# level = (level[:referenced_object] && level[:referenced_object][:indirect_without_dictionary]) || level
|
609
|
+
# level.delete :indirect_reference_id
|
610
|
+
# level.delete :indirect_generation_number
|
611
|
+
# else
|
612
|
+
# level.keys.each do |k|
|
613
|
+
# level[k] = rec_resolve.call(level[k]) unless level[k].is_a?(Hash) && level[k][:indirect_reference_id] && level[k][:is_reference_only].nil?
|
614
|
+
# end
|
615
|
+
# end
|
616
|
+
# elsif level.is_a?(Array)
|
617
|
+
# level.map! { |o| rec_resolve.call(o) }
|
618
|
+
# end
|
619
|
+
# level
|
620
|
+
# end
|
621
|
+
# rec_resolve.call(@root_object)
|
622
|
+
# rec_resolve.call(@parsed)
|
623
|
+
# self
|
624
|
+
# end
|
625
|
+
|
626
|
+
# All Strings are one String
|
627
|
+
# Returns the canonical instance for +str+: the value already stored in
# @strings_dictionary under an equal key if there is one, otherwise +str+
# itself, which is then stored as the canonical instance for later calls.
def unify_string(str)
  known = @strings_dictionary[str]
  return known if known

  @strings_dictionary[str] = str
end
|
630
|
+
|
631
|
+
# @private
|
632
|
+
# this method reviews a Hash and updates it by merging Hash data,
|
633
|
+
# preferring the old over the new.
|
634
|
+
# Conflict resolver meant to be passed as the block of Hash#merge / Hash#update.
#
# _key::     the conflicting key (unused).
# old_data:: the value already present in the receiver.
# new_data:: the value coming from the hash being merged in.
#
# When both values are Hashes they are merged recursively with this same
# resolver (a new Hash is returned, +old_data+ is not mutated). In every
# other case - including a Hash on one side only, which would otherwise make
# Hash#merge raise a TypeError - the old value is kept.
def self.hash_update_proc_for_old(_key, old_data, new_data)
  if old_data.is_a?(Hash) && new_data.is_a?(Hash)
    old_data.merge(new_data, &method(:hash_update_proc_for_old))
  else
    old_data
  end
end
|
641
|
+
|
642
|
+
# @private
|
643
|
+
# this method reviews a Hash and updates it by merging Hash data,
|
644
|
+
# preferring the new over the old.
|
645
|
+
# Conflict resolver meant to be passed as the block of Hash#merge / Hash#update.
#
# _key::     the conflicting key (unused).
# old_data:: the value already present in the receiver.
# new_data:: the value coming from the hash being merged in.
#
# When both values are Hashes they are merged recursively with this same
# resolver (a new Hash is returned, +old_data+ is not mutated). In every
# other case - including a Hash on one side only, which would otherwise make
# Hash#merge raise a TypeError - the new value replaces the old one.
def self.hash_update_proc_for_new(_key, old_data, new_data)
  if old_data.is_a?(Hash) && new_data.is_a?(Hash)
    old_data.merge(new_data, &method(:hash_update_proc_for_new))
  else
    new_data
  end
end
|
652
|
+
|
653
|
+
# # run block of code on every PDF object (PDF objects are class Hash)
|
654
|
+
# def each_object(object, limit_references = true, already_visited = {}, &block)
|
655
|
+
# unless limit_references
|
656
|
+
# already_visited[object.object_id] = true
|
657
|
+
# end
|
658
|
+
# case
|
659
|
+
# when object.is_a?(Array)
|
660
|
+
# object.each {|obj| each_object(obj, limit_references, already_visited, &block)}
|
661
|
+
# when object.is_a?(Hash)
|
662
|
+
# yield(object)
|
663
|
+
# unless limit_references && object[:is_reference_only]
|
664
|
+
# object.each do |k,v|
|
665
|
+
# each_object(v, limit_references, already_visited, &block) unless already_visited[v.object_id]
|
666
|
+
# end
|
667
|
+
# end
|
668
|
+
# end
|
669
|
+
# end
|
670
|
+
end
|
671
|
+
end
|