combine_pdf 0.2.5 → 0.2.37

Sign up to get free protection for your applications and to get access to all the features.
@@ -5,535 +5,667 @@
5
5
  ## is subject to the same license.
6
6
  ########################################################
7
7
 
8
-
9
-
10
-
11
8
  module CombinePDF
12
-
13
-
14
- # @!visibility private
15
- # @private
16
- #:nodoc: all
17
-
18
- protected
19
-
20
- # This is the Parser class.
21
- #
22
- # It takes PDF data and parses it.
23
- #
24
- # The information is then used to initialize a PDF object.
25
- #
26
- # This is an internal class. you don't need it.
27
- class PDFParser
28
-
29
- # @!visibility private
30
-
31
-
32
- # the array containing all the parsed data (PDF Objects)
33
- attr_reader :parsed
34
- # a Float representing the PDF version of the data parsed (if exists).
35
- attr_reader :version
36
- # the info and root objects, as found (if found) in the PDF file.
37
- #
38
- # they are mainly used to know if the file is (was) encrypted and to get more details.
39
- attr_reader :info_object, :root_object
40
-
41
- # when creating a parser, it is important to set the data (String) we wish to parse.
42
- #
43
- # <b>the data is required and it is not possible to set the data at a later stage</b>
44
- #
45
- # string:: the data to be parsed, as a String object.
46
- def initialize (string)
47
- raise TypeError, "couldn't parse data, expecting type String" unless string.is_a? String
48
- @string_to_parse = string.force_encoding(Encoding::ASCII_8BIT)
49
- @literal_strings = []
50
- @hex_strings = []
51
- @streams = []
52
- @parsed = []
53
- @references = []
54
- @root_object = {}
55
- @info_object = {}
56
- @version = nil
57
- @scanner = nil
58
- end
59
-
60
- # parse the data in the new parser (the data already set through the initialize / new method)
61
- def parse
62
- return [] if @string_to_parse.empty?
63
- return @parsed unless @parsed.empty?
64
- @scanner = StringScanner.new @string_to_parse
65
- @scanner.pos = 0
66
- if @scanner.scan /\%PDF\-[\d\-\.]+/
67
- @version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
68
- end
69
-
70
- @parsed = _parse_
71
-
72
- if @root_object == {}
73
- xref_streams = @parsed.select {|obj| obj.is_a?(Hash) && obj[:Type] == :XRef}
74
- xref_streams.each do |xref_dictionary|
75
- @root_object.merge! xref_dictionary
76
- end
77
- end
78
- raise "root is unknown - cannot determine if file is Encrypted" if @root_object == {}
79
-
80
- if @root_object[:Encrypt]
81
- change_references_to_actual_values @root_object
82
- warn "PDF is Encrypted! Attempting to decrypt - not yet fully supported."
83
- decryptor = PDFDecrypt.new @parsed, @root_object
84
- decryptor.decrypt
85
- #do we really need to apply to @parsed? No, there is no need.
86
- end
87
-
88
- ## search for objects streams
89
- object_streams = @parsed.select {|obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm}
90
- unless object_streams.empty?
91
- warn "PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects."
92
-
93
- object_streams.each do |o|
94
- ## un-encode (using the correct filter) the object streams
95
- PDFFilter.inflate_object o
96
- ## extract objects from stream to top level array @parsed
97
- @scanner = StringScanner.new o[:raw_stream_content]
98
- stream_data = _parse_
99
- id_array = []
100
- while stream_data[0].is_a? Fixnum
101
- id_array << stream_data.shift
102
- stream_data.shift
103
- end
104
- while id_array[0] && stream_data[0]
105
- stream_data[0] = {indirect_without_dictionary: stream_data[0]} unless stream_data[0].is_a?(Hash)
106
- stream_data[0][:indirect_reference_id] = id_array.shift
107
- stream_data[0][:indirect_generation_number] = 0
108
- @parsed << stream_data.shift
109
- end
110
- end
111
- end
112
-
113
-
114
- # serialize_objects_and_references.catalog_pages
115
-
116
- # Benchmark.bm do |bm|
117
- # bm.report("serialize") {1000.times {serialize_objects_and_references} }
118
- # bm.report("serialize - old") {1000.times {old_serialize_objects_and_references} }
119
- # bm.report("catalog") {1000.times {catalog_pages} }
120
- # end
121
-
122
- serialize_objects_and_references.catalog_pages
123
-
124
- @info_object = @root_object[:Info] ? (@root_object[:Info][:referenced_object] || @root_object[:Info]) : false
125
- if @info_object && @info_object.is_a?(Hash)
126
- @parsed.delete @info_object
127
- CombinePDF::PDF::PRIVATE_HASH_KEYS.each {|key| @info_object.delete key}
128
- @info_object.each {|k, v| @info_object = v[:referenced_object] if v.is_a?(Hash) && v[:referenced_object]}
129
- else
130
- @info_object = {}
131
- end
132
- # # # ## remove object streams - if they exist
133
- # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :ObjStm}
134
- # # # ## remove XREF dictionaries - if they exist
135
- # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :XRef}
136
-
137
- @parsed
138
- end
139
-
140
- # the actual recursive parsing is done here.
141
- #
142
- # this is an internal function, but it was left exposed for possible future features.
143
- def _parse_
144
- out = []
145
- str = ''
146
- while @scanner.rest? do
147
- case
148
- ##########################################
149
- ## parse an Array
150
- ##########################################
151
- when @scanner.scan(/\[/)
152
- out << _parse_
153
- ##########################################
154
- ## parse a Dictionary
155
- ##########################################
156
- when @scanner.scan(/<</)
157
- data = _parse_
158
- obj = {}
159
- obj[data.shift] = data.shift while data[0]
160
- out << obj
161
- ##########################################
162
- ## return content of array or dictionary
163
- ##########################################
164
- when @scanner.scan(/\]/), @scanner.scan(/>>/)
165
- return out
166
- ##########################################
167
- ## parse a Stream
168
- ##########################################
169
- when @scanner.scan(/stream[\r]?[\n]/)
170
- # the following was discarded because some PDF files didn't have an EOL marker as required
171
- # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
172
- # instead, a non-strict RegExp is used:
173
- str = @scanner.scan_until(/endstream/)
174
- # raise error if the stream doesn't end.
175
- raise "Parsing Error: PDF file error - a stream object wasn't properly colsed using 'endstream'!" unless str
176
- # need to remove end of stream
177
- if out.last.is_a? Hash
178
- out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
179
- # out.last[:raw_stream_content] = str.gsub(/[\n\r]?[\n\r]?endstream/, "")
180
- else
181
- warn "Stream not attached to dictionary!"
182
- out << str[0...-10].force_encoding(Encoding::ASCII_8BIT)
183
- end
184
- ##########################################
185
- ## parse an Object after finished
186
- ##########################################
187
- when str = @scanner.scan(/endobj/)
188
- #what to do when this is an object?
189
- if out.last.is_a? Hash
190
- out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
191
- else
192
- out << {indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
193
- end
194
- ##########################################
195
- ## parse a Hex String
196
- ##########################################
197
- when str = @scanner.scan(/<[0-9a-fA-F]+>/)
198
- # warn "Found a hex string"
199
- out << [str[1..-2]].pack('H*')
200
- ##########################################
201
- ## parse a Literal String
202
- ##########################################
203
- when @scanner.scan(/\(/)
204
- # warn "Found a literal string"
205
- str = ''.force_encoding(Encoding::ASCII_8BIT)
206
- count = 1
207
- while count > 0 && @scanner.rest? do
208
- str += @scanner.scan_until(/[\(\)]/).to_s
209
- seperator_count = 0
210
- seperator_count += 1 while str[-2-seperator_count] == "\\"
211
-
212
- case str[-1]
213
- when '('
214
- ## The following solution fails when (string ends with this sign: \\)
215
-
216
- count += 1 unless seperator_count.odd?
217
- when ')'
218
- count -= 1 unless seperator_count.odd?
219
- else
220
- warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
221
- count = 0 # error
222
- end
223
- end
224
- # The PDF formatted string is: str[0..-2]
225
- # now starting to convert to regular string
226
- str_bytes = str.force_encoding(Encoding::ASCII_8BIT)[0..-2].bytes.to_a
227
- str = []
228
- until str_bytes.empty?
229
- case str_bytes[0]
230
- when 13 # eol - \r
231
- # An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS
232
- # shall be treated as a byte value of (0Ah),
233
- # irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
234
- str_bytes.shift
235
- str_bytes.shift if str_bytes[0] == 10
236
- str << 10
237
- when 10 # eol - \n
238
- # An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS
239
- # shall be treated as a byte value of (0Ah),
240
- # irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
241
- str_bytes.shift
242
- str_bytes.shift if str_bytes[0] == 13
243
- str << 10
244
- when 92 # "\\".ord == 92
245
- str_bytes.shift
246
- rep = str_bytes.shift
247
- case rep
248
- when 110 #n
249
- str << 10 #new line
250
- when 114 #r
251
- str << 13 # CR
252
- when 116 #t
253
- str << 9 #tab
254
- when 98 #b
255
- str << 8
256
- when 102 #f
257
- str << 255
258
- when 48..57 #octal notation for byte?
259
- rep = rep.chr
260
- rep += str_bytes.shift.chr if str_bytes[0].between?(48,57)
261
- rep += str_bytes.shift.chr if str_bytes[0].between?(48,57) && ((rep + str_bytes[0].chr).to_i <= 255)
262
- str << rep.to_i
263
- when 10 # new line, ignore
264
- str_bytes.shift if str_bytes[0] == 13
265
- true
266
- when 13 # new line (or double notation for new line), ignore
267
- str_bytes.shift if str_bytes[0] == 10
268
- true
269
- else
270
- str << rep
271
- end
272
- else
273
- str << str_bytes.shift
274
- end
275
- end
276
- out << str.pack('C*').force_encoding(Encoding::ASCII_8BIT)
277
- ##########################################
278
- ## Parse a comment
279
- ##########################################
280
- when str = @scanner.scan(/\%/)
281
- #is a comment, skip until new line
282
- @scanner.skip_until /[\n\r]+/
283
- ##########################################
284
- ## Parse a Name
285
- ##########################################
286
- # old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
287
- # I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
288
- # all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
289
- # all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
290
- when str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+/)
291
- out << ( str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) {|a| a[1..2].hex.chr } ).to_sym
292
- ##########################################
293
- ## Parse a Number
294
- ##########################################
295
- when str = @scanner.scan(/[\+\-\.\d]+/)
296
- str.match(/\./) ? (out << str.to_f) : (out << str.to_i)
297
- ##########################################
298
- ## Parse an Object Reference
299
- ##########################################
300
- when @scanner.scan(/R/)
301
- out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
302
- @references << out.last
303
- ##########################################
304
- ## Parse Bool - true and after false
305
- ##########################################
306
- when @scanner.scan(/true/)
307
- out << true
308
- when @scanner.scan(/false/)
309
- out << false
310
- ##########################################
311
- ## Parse NULL - null
312
- ##########################################
313
- when @scanner.scan(/null/)
314
- out << nil
315
- ##########################################
316
- ## XREF - check for encryption... anything else?
317
- ##########################################
318
- when @scanner.scan(/xref/)
319
- ##########
320
- ## get root object to check for encryption
321
- @scanner.scan_until(/(trailer)|(\%EOF)/)
322
-
323
- if @scanner.matched[-1] == 'r'
324
- if @scanner.skip_until(/<</)
325
- data = _parse_
326
- @root_object = {}
327
- @root_object[data.shift] = data.shift while data[0]
328
- end
329
- ##########
330
- ## skip until end of segment, marked by %%EOF
331
- @scanner.skip_until(/\%\%EOF/)
332
- end
333
-
334
- when @scanner.scan(/[\s]+/)
335
- # Generally, do nothing
336
- nil
337
- when @scanner.scan(/obj[\s]*/)
338
- # Fix wkhtmltopdf PDF authoring issue - missing 'endobj' keywords
339
- unless out[-4].nil? || out[-4].is_a?(Hash)
340
- keep = []
341
- keep << out.pop
342
- keep << out.pop
343
-
344
- if out.last.is_a? Hash
345
- out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
346
- else
347
- out << {indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
348
- end
349
- warn "'endobj' keyword was missing for Object ID: #{out.last[:indirect_reference_id]}, trying to auto-fix issue, but might fail."
350
-
351
- out << keep.pop
352
- out << keep.pop
353
- end
354
- else
355
- # always advance
356
- # warn "Advnacing for unknown reason..."
357
- @scanner.pos = @scanner.pos + 1
358
- end
359
- end
360
- out
361
- end
362
-
363
- protected
364
-
365
-
366
-
367
- # resets cataloging and pages
368
- def catalog_pages(catalogs = nil, secure_injection = false, inheritance_hash = {})
369
- unless catalogs
370
-
371
- if root_object[:Root]
372
- catalogs = root_object[:Root][:referenced_object] || root_object[:Root]
373
- else
374
- catalogs = (@parsed.select {|obj| obj[:Type] == :Catalog}).last
375
- end
376
- @parsed.delete_if {|obj| obj[:Type] == :Catalog}
377
- @parsed << catalogs
378
-
379
- raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs
380
- end
381
- case
382
- when catalogs.is_a?(Array)
383
- catalogs.each {|c| catalog_pages(c, secure_injection, inheritance_hash ) unless c.nil?}
384
- when catalogs.is_a?(Hash)
385
- if catalogs[:is_reference_only]
386
- if catalogs[:referenced_object]
387
- catalog_pages(catalogs[:referenced_object], secure_injection, inheritance_hash)
388
- else
389
- warn "couldn't follow reference!!! #{catalogs} not found!"
390
- end
391
- else
392
- unless catalogs[:Type] == :Page
393
- raise "Optional Content PDF files aren't supported and their pages cannot be safely extracted." if catalogs[:AS] || catalogs[:OCProperties]
394
- inheritance_hash[:MediaBox] = catalogs[:MediaBox] if catalogs[:MediaBox]
395
- inheritance_hash[:CropBox] = catalogs[:CropBox] if catalogs[:CropBox]
396
- inheritance_hash[:Rotate] = catalogs[:Rotate] if catalogs[:Rotate]
397
- (inheritance_hash[:Resources] ||= {}).update( (catalogs[:Resources][:referenced_object] || catalogs[:Resources]), &self.class.method(:hash_update_proc_for_new) ) if catalogs[:Resources]
398
- (inheritance_hash[:ColorSpace] ||= {}).update( (catalogs[:ColorSpace][:referenced_object] || catalogs[:ColorSpace]), &self.class.method(:hash_update_proc_for_new) ) if catalogs[:ColorSpace]
399
-
400
- # inheritance_hash[:Order] = catalogs[:Order] if catalogs[:Order]
401
- # inheritance_hash[:OCProperties] = catalogs[:OCProperties] if catalogs[:OCProperties]
402
- # inheritance_hash[:AS] = catalogs[:AS] if catalogs[:AS]
403
- end
404
-
405
- case catalogs[:Type]
406
- when :Page
407
-
408
- catalogs[:MediaBox] ||= inheritance_hash[:MediaBox] if inheritance_hash[:MediaBox]
409
- catalogs[:CropBox] ||= inheritance_hash[:CropBox] if inheritance_hash[:CropBox]
410
- catalogs[:Rotate] ||= inheritance_hash[:Rotate] if inheritance_hash[:Rotate]
411
- (catalogs[:Resources] ||= {}).update( inheritance_hash[:Resources], &( self.class.method(:hash_update_proc_for_old) ) ) if inheritance_hash[:Resources]
412
- (catalogs[:ColorSpace] ||= {}).update( inheritance_hash[:ColorSpace], &( self.class.method(:hash_update_proc_for_old) ) ) if inheritance_hash[:ColorSpace]
413
- # catalogs[:Order] ||= inheritance_hash[:Order] if inheritance_hash[:Order]
414
- # catalogs[:AS] ||= inheritance_hash[:AS] if inheritance_hash[:AS]
415
- # catalogs[:OCProperties] ||= inheritance_hash[:OCProperties] if inheritance_hash[:OCProperties]
416
-
417
-
418
- # avoid references on MediaBox, CropBox and Rotate
419
- catalogs[:MediaBox] = catalogs[:MediaBox][:referenced_object][:indirect_without_dictionary] if catalogs[:MediaBox].is_a?(Hash) && catalogs[:MediaBox][:referenced_object].is_a?(Hash) && catalogs[:MediaBox][:referenced_object][:indirect_without_dictionary]
420
- catalogs[:CropBox] = catalogs[:CropBox][:referenced_object][:indirect_without_dictionary] if catalogs[:CropBox].is_a?(Hash) && catalogs[:CropBox][:referenced_object].is_a?(Hash) && catalogs[:CropBox][:referenced_object][:indirect_without_dictionary]
421
- catalogs[:Rotate] = catalogs[:Rotate][:referenced_object][:indirect_without_dictionary] if catalogs[:Rotate].is_a?(Hash) && catalogs[:Rotate][:referenced_object].is_a?(Hash) && catalogs[:Rotate][:referenced_object][:indirect_without_dictionary]
422
-
423
- catalogs.instance_eval {extend Page_Methods}
424
- catalogs.secure_injection = secure_injection
425
- when :Pages
426
- catalog_pages(catalogs[:Kids], secure_injection, inheritance_hash.dup ) unless catalogs[:Kids].nil?
427
- when :Catalog
428
- catalog_pages(catalogs[:Pages], secure_injection, inheritance_hash.dup ) unless catalogs[:Pages].nil?
429
- end
430
- end
431
- end
432
- self
433
- end
434
-
435
- # fails!
436
- def change_references_to_actual_values(hash_with_references = {})
437
- hash_with_references.each do |k,v|
438
- if v.is_a?(Hash) && v[:is_reference_only]
439
- hash_with_references[k] = get_refernced_object(v)
440
- hash_with_references[k] = hash_with_references[k][:indirect_without_dictionary] if hash_with_references[k].is_a?(Hash) && hash_with_references[k][:indirect_without_dictionary]
441
- warn "Couldn't connect all values from references - didn't find reference #{hash_with_references}!!!" if hash_with_references[k] == nil
442
- hash_with_references[k] = v unless hash_with_references[k]
443
- end
444
- end
445
- hash_with_references
446
- end
447
-
448
- def get_refernced_object(reference_hash = {})
449
- @parsed.each do |stored_object|
450
- return stored_object if ( stored_object.is_a?(Hash) &&
451
- reference_hash[:indirect_reference_id] == stored_object[:indirect_reference_id] &&
452
- reference_hash[:indirect_generation_number] == stored_object[:indirect_generation_number] )
453
- end
454
- warn "didn't find reference #{reference_hash}"
455
- nil
456
- end
457
-
458
- # @private
459
- # connects references and objects, according to their reference id's.
460
- #
461
- # should be moved to the parser's workflow.
462
- #
463
- def serialize_objects_and_references
464
- obj_dir = {}
465
- @parsed.each {|o| obj_dir[ [ o.delete(:indirect_reference_id), o.delete(:indirect_generation_number) ] ] = o }
466
- # @parsed.each {|o| obj_dir[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
467
- @references.each do |obj|
468
- obj[:referenced_object] = obj_dir[ [obj[:indirect_reference_id], obj[:indirect_generation_number] ] ]
469
- warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj.to_s}" unless obj[:referenced_object]
470
- obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
471
- end
472
- self
473
- # rescue => e
474
- # puts (@parsed.select {|o| !o.is_a?(Hash)})
475
- # puts (@parsed)
476
- # puts (@references)
477
- # raise e
478
- end
479
-
480
- # @private
481
- # this method reviews a Hash and updates it by merging Hash data,
482
- # preferring the old over the new.
483
- def self.hash_update_proc_for_old key, old_data, new_data
484
- if old_data.is_a? Hash
485
- old_data.merge( new_data, &self.method(:hash_update_proc_for_old) )
486
- else
487
- old_data
488
- end
489
- end
490
- # @private
491
- # this method reviews a Hash and updates it by merging Hash data,
492
- # preferring the new over the old.
493
- def self.hash_update_proc_for_new key, old_data, new_data
494
- if old_data.is_a? Hash
495
- old_data.merge( new_data, &self.method(:hash_update_proc_for_new) )
496
- else
497
- new_data
498
- end
499
- end
500
-
501
- # # @private
502
- # # connects references and objects, according to their reference id's.
503
- # #
504
- # # should be moved to the parser's workflow.
505
- # #
506
- # def old_serialize_objects_and_references(object = nil)
507
- # objects_reference_hash = {}
508
- # # @parsed.each {|o| objects_reference_hash[ [ o.delete(:indirect_reference_id), o.delete(:indirect_generation_number) ] ] = o }
509
- # @parsed.each {|o| objects_reference_hash[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
510
- # each_object(@parsed) do |obj|
511
- # if obj[:is_reference_only]
512
- # obj[:referenced_object] = objects_reference_hash[ [obj[:indirect_reference_id], obj[:indirect_generation_number] ] ]
513
- # warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj.to_s}" unless obj[:referenced_object]
514
- # # obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
515
- # end
516
- # end
517
- # self
518
- # end
519
-
520
- # # run block of code on evey PDF object (PDF objects are class Hash)
521
- # def each_object(object, limit_references = true, already_visited = {}, &block)
522
- # unless limit_references
523
- # already_visited[object.object_id] = true
524
- # end
525
- # case
526
- # when object.is_a?(Array)
527
- # object.each {|obj| each_object(obj, limit_references, already_visited, &block)}
528
- # when object.is_a?(Hash)
529
- # yield(object)
530
- # unless limit_references && object[:is_reference_only]
531
- # object.each do |k,v|
532
- # each_object(v, limit_references, already_visited, &block) unless already_visited[v.object_id]
533
- # end
534
- # end
535
- # end
536
- # end
537
-
538
- end
539
- end
9
+ # @!visibility private
10
+ # @private
11
+ #:nodoc: all
12
+
13
+ protected
14
+
15
+ # This is the Parser class.
16
+ #
17
+ # It takes PDF data and parses it.
18
+ #
19
+ # The information is then used to initialize a PDF object.
20
+ #
21
+ # This is an internal class. you don't need it.
22
+ class PDFParser
23
+ # @!visibility private
24
+
25
+ # the array containing all the parsed data (PDF Objects)
26
+ attr_reader :parsed
27
+ # a Float representing the PDF version of the data parsed (if exists).
28
+ attr_reader :version
29
+ # the info and root objects, as found (if found) in the PDF file.
30
+ #
31
+ # they are mainly used to know if the file is (was) encrypted and to get more details.
32
+ attr_reader :info_object, :root_object, :names_object, :forms_object, :outlines_object, :metadata
33
+
34
+ attr_reader :allow_optional_content
35
+ # when creating a parser, it is important to set the data (String) we wish to parse.
36
+ #
37
+ # <b>the data is required and it is not possible to set the data at a later stage</b>
38
+ #
39
+ # string:: the data to be parsed, as a String object.
40
+ def initialize(string, options = {})
41
+ raise TypeError, "couldn't parse data, expecting type String" unless string.is_a? String
42
+ @string_to_parse = string.force_encoding(Encoding::ASCII_8BIT)
43
+ @literal_strings = [].dup
44
+ @hex_strings = [].dup
45
+ @streams = [].dup
46
+ @parsed = [].dup
47
+ @references = [].dup
48
+ @root_object = {}.dup
49
+ @info_object = {}.dup
50
+ @names_object = {}.dup
51
+ @outlines_object = {}.dup
52
+ @forms_object = {}.dup
53
+ @metadata = nil
54
+ @strings_dictionary = {}.dup # all strings are one string
55
+ @version = nil
56
+ @scanner = nil
57
+ @allow_optional_content = options[:allow_optional_content]
58
+ end
59
+
60
+ # parse the data in the new parser (the data already set through the initialize / new method)
61
+ def parse
62
+ return [] if @string_to_parse.empty?
63
+ return @parsed unless @parsed.empty?
64
+ @scanner = StringScanner.new @string_to_parse
65
+ @scanner.pos = 0
66
+ @scanner.skip(/[^%]*/) if @scanner.exist?(/%PDF/i)
67
+ if @scanner.scan /\%PDF\-[\d\-\.]+/
68
+ @version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
69
+ loop do
70
+ break unless @scanner.scan(/[^\d\r\n]+/)
71
+ break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/)
72
+ break if @scanner.eos?
73
+ @scanner.pos += 1
74
+ end
75
+ end
76
+ @parsed = _parse_
77
+ # puts @parsed
78
+
79
+ raise 'Unknown PDF parsing error - malformed PDF file?' unless (@parsed.select { |i| !i.is_a?(Hash) }).empty?
80
+
81
+ if @root_object == {}.freeze
82
+ xref_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :XRef }
83
+ xref_streams.each do |xref_dictionary|
84
+ @root_object.merge! xref_dictionary
85
+ end
86
+ end
87
+
88
+ raise 'root is unknown - cannot determine if file is Encrypted' if @root_object == {}.freeze
89
+
90
+ if @root_object[:Encrypt]
91
+ # change_references_to_actual_values @root_object
92
+ warn 'PDF is Encrypted! Attempting to decrypt - not yet fully supported.'
93
+ decryptor = PDFDecrypt.new @parsed, @root_object
94
+ decryptor.decrypt
95
+ # do we really need to apply to @parsed? No, there is no need.
96
+ end
97
+
98
+ ## search for objects streams
99
+ object_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm }
100
+ unless object_streams.empty?
101
+ warn 'PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects.'
102
+
103
+ object_streams.each do |o|
104
+ ## un-encode (using the correct filter) the object streams
105
+ PDFFilter.inflate_object o
106
+ ## extract objects from stream to top level array @parsed
107
+ @scanner = StringScanner.new o[:raw_stream_content]
108
+ stream_data = _parse_
109
+ id_array = []
110
+ while stream_data[0].is_a? (Integer)
111
+ id_array << stream_data.shift
112
+ stream_data.shift
113
+ end
114
+ while id_array[0] && stream_data[0]
115
+ stream_data[0] = { indirect_without_dictionary: stream_data[0] } unless stream_data[0].is_a?(Hash)
116
+ stream_data[0][:indirect_reference_id] = id_array.shift
117
+ stream_data[0][:indirect_generation_number] = 0
118
+ @parsed << stream_data.shift
119
+ end
120
+ end
121
+ end
122
+
123
+ # serialize_objects_and_references.catalog_pages
124
+
125
+ # Benchmark.bm do |bm|
126
+ # bm.report("serialize") {1000.times {serialize_objects_and_references} }
127
+ # bm.report("serialize - old") {1000.times {old_serialize_objects_and_references} }
128
+ # bm.report("catalog") {1000.times {catalog_pages} }
129
+ # end
130
+
131
+ serialize_objects_and_references
132
+
133
+ catalog_pages
134
+
135
+ # Strings were unified, we can let them go..
136
+ @strings_dictionary.clear
137
+
138
+ # collect any missing objects from the forms_data
139
+ unless @forms_object.nil? || @forms_object.empty?
140
+ @forms_object[:related_objects] = (@parsed.select { |o| o[:FT] }).map! { |o| { is_reference_only: true, referenced_object: o } }
141
+ @forms_object[:related_objects].delete @forms_object
142
+ end
143
+
144
+ @info_object = @root_object[:Info] ? (@root_object[:Info][:referenced_object] || @root_object[:Info]) : false
145
+ if @info_object && @info_object.is_a?(Hash)
146
+ @parsed.delete @info_object
147
+ CombinePDF::PDF::PRIVATE_HASH_KEYS.each { |key| @info_object.delete key }
148
+ @info_object.each { |_k, v| @info_object = v[:referenced_object] if v.is_a?(Hash) && v[:referenced_object] }
149
+ else
150
+ @info_object = {}
151
+ end
152
+ # # # ## remove object streams - if they exist
153
+ # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :ObjStm}
154
+ # # # ## remove XREF dictionaries - if they exist
155
+ # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :XRef}
156
+
157
+ @parsed
158
+ end
159
+
160
+ # the actual recursive parsing is done here.
161
+ #
162
+ # this is an internal function, but it was left exposed for possible future features.
163
+ def _parse_
164
+ out = []
165
+ str = ''
166
+ fresh = true
167
+ while @scanner.rest?
168
+ # last ||= 0
169
+ # out.last.tap do |o|
170
+ # if o.is_a?(Hash)
171
+ # puts "[#{@scanner.pos}] Parser has a Dictionary (#{o.class.name}) with data:"
172
+ # o.each do |k, v|
173
+ # puts " #{k}: is #{v.class.name} with data: #{v.to_s[0..4]}#{"..." if v.to_s.length > 5}"
174
+ # end
175
+ # else
176
+ # puts "[#{@scanner.pos}] Parser has #{o.class.name} with data: #{o.to_s[0..4]}#{"..." if o.to_s.length > 5}"
177
+ # end
178
+ # puts "next is #{@scanner.peek 8}"
179
+ # end unless (last == out.count) || (-1 == (last = out.count))
180
+ if @scanner.scan(/\[/)
181
+ out << _parse_
182
+ ##########################################
183
+ ## parse a Dictionary
184
+ ##########################################
185
+ elsif @scanner.scan(/<</)
186
+ data = _parse_
187
+ obj = {}
188
+ obj[data.shift] = data.shift while data[0]
189
+ out << obj
190
+ ##########################################
191
+ ## return content of array or dictionary
192
+ ##########################################
193
+ elsif @scanner.scan(/\]/) || @scanner.scan(/>>/)
194
+ return out
195
+ ##########################################
196
+ ## parse a Stream
197
+ ##########################################
198
+ elsif @scanner.scan(/stream[\r\n]/)
199
+ @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
200
+ # the following was discarded because some PDF files didn't have an EOL marker as required
201
+ # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
202
+ # instead, a non-strict RegExp is used:
203
+ str = @scanner.scan_until(/endstream/)
204
+ # raise error if the stream doesn't end.
205
+ raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
206
+ # need to remove end of stream
207
+ if out.last.is_a? Hash
208
+ # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
209
+ out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
210
+ else
211
+ warn 'Stream not attached to dictionary!'
212
+ out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
213
+ end
214
+ ##########################################
215
+ ## parse an Object after finished
216
+ ##########################################
217
+ elsif str = @scanner.scan(/endobj/)
218
+ # what to do when this is an object?
219
+ if out.last.is_a? Hash
220
+ out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
221
+ else
222
+ out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
223
+ end
224
+ fresh = true
225
+ # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
226
+ out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
227
+ # puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last} :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Integer)
228
+ ##########################################
229
+ ## parse a Hex String
230
+ ##########################################
231
+ elsif str = @scanner.scan(/<[0-9a-fA-F]*>/)
232
+ # warn "Found a hex string"
233
+ out << unify_string([str[1..-2]].pack('H*').force_encoding(Encoding::ASCII_8BIT))
234
+ ##########################################
235
+ ## parse a Literal String
236
+ ##########################################
237
+ elsif @scanner.scan(/\(/)
238
+ # warn "Found a literal string"
239
+ str = ''.force_encoding(Encoding::ASCII_8BIT)
240
+ count = 1
241
+ while count > 0 && @scanner.rest?
242
+ scn = @scanner.scan_until(/[\(\)]/)
243
+ unless scn
244
+ warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
245
+ count = 0 # error
246
+ next
247
+ end
248
+
249
+ str += scn.to_s
250
+ seperator_count = 0
251
+ seperator_count += 1 while str[-2 - seperator_count] == '\\'
252
+
253
+ case str[-1]
254
+ when '('
255
+ ## The following solution might fail when (string ends with this sign: \\)
256
+ count += 1 unless seperator_count.odd?
257
+ when ')'
258
+ count -= 1 unless seperator_count.odd?
259
+ else
260
+ warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
261
+ count = 0 # error
262
+ end
263
+ end
264
+ # The PDF formatted string is: str[0..-2]
265
+ # now starting to convert to regular string
266
+ str_bytes = str.force_encoding(Encoding::ASCII_8BIT)[0..-2].bytes.to_a
267
+ str = []
268
+ until str_bytes.empty?
269
+ case str_bytes[0]
270
+ when 13 # eol - \r
271
+ # An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS
272
+ # shall be treated as a byte value of (0Ah),
273
+ # irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
274
+ str_bytes.shift
275
+ str_bytes.shift if str_bytes[0] == 10
276
+ str << 10
277
+ when 10 # eol - \n
278
+ # An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS
279
+ # shall be treated as a byte value of (0Ah),
280
+ # irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
281
+ str_bytes.shift
282
+ str_bytes.shift if str_bytes[0] == 13
283
+ str << 10
284
+ when 92 # "\\".ord == 92
285
+ str_bytes.shift
286
+ rep = str_bytes.shift
287
+ case rep
288
+ when 110 # n
289
+ str << 10 # new line
290
+ when 114 # r
291
+ str << 13 # CR
292
+ when 116 # t
293
+ str << 9 # tab
294
+ when 98 # b
295
+ str << 8
296
+ when 102 # f, form-feed
297
+ str << 12
298
+ when 48..57 # octal notation for byte?
299
+ rep = rep.chr
300
+ rep += str_bytes.shift.chr if str_bytes[0].between?(48, 57)
301
+ rep += str_bytes.shift.chr if str_bytes[0].between?(48, 57) && ((rep + str_bytes[0].chr).to_i <= 255)
302
+ str << rep.to_i
303
+ when 10 # new line, ignore
304
+ str_bytes.shift if str_bytes[0] == 13
305
+ true
306
+ when 13 # new line (or double notation for new line), ignore
307
+ str_bytes.shift if str_bytes[0] == 10
308
+ true
309
+ else
310
+ str << rep
311
+ end
312
+ else
313
+ str << str_bytes.shift
314
+ end
315
+ end
316
+ out << unify_string(str.pack('C*').force_encoding(Encoding::ASCII_8BIT))
317
+ ##########################################
318
+ ## Parse a comment
319
+ ##########################################
320
+ elsif str = @scanner.scan(/\%/)
321
+ # is a comment, skip until new line
322
+ loop do
323
+ # break unless @scanner.scan(/[^\d\r\n]+/)
324
+ break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/) || @scanner.eos? # || @scanner.scan(/[^\d]+[\r\n]+/) ||
325
+ @scanner.scan(/[^\d\r\n]+/) || @scanner.pos += 1
326
+ end
327
+ # puts "AFTER COMMENT: #{@scanner.peek 8}"
328
+ ##########################################
329
+ ## Parse a Name
330
+ ##########################################
331
+ # old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
332
+ # I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
333
+ # all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
334
+ # all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
335
+ elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/)
336
+ out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
337
+ ##########################################
338
+ ## Parse a Number
339
+ ##########################################
340
+ elsif str = @scanner.scan(/[\+\-\.\d]+/)
341
+ str =~ /\./ ? (out << str.to_f) : (out << str.to_i)
342
+ ##########################################
343
+ ## Parse an Object Reference
344
+ ##########################################
345
+ elsif @scanner.scan(/R/)
346
+ out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
347
+ # @references << out.last
348
+ ##########################################
349
+ ## Parse Bool - true and after false
350
+ ##########################################
351
+ elsif @scanner.scan(/true/)
352
+ out << true
353
+ elsif @scanner.scan(/false/)
354
+ out << false
355
+ ##########################################
356
+ ## Parse NULL - null
357
+ ##########################################
358
+ elsif @scanner.scan(/null/)
359
+ out << nil
360
+ ##########################################
361
+ ## XREF - check for encryption... anything else?
362
+ ##########################################
363
+ elsif @scanner.scan(/(startxref)|(xref)/)
364
+ ##########
365
+ ## get root object to check for encryption
366
+ @scanner.scan_until(/(trailer)|(\%EOF)/)
367
+ fresh = true
368
+ if @scanner.matched[-1] == 'r'
369
+ if @scanner.skip_until(/<</)
370
+ data = _parse_
371
+ @root_object ||= {}
372
+ @root_object[data.shift] = data.shift while data[0]
373
+ end
374
+ ##########
375
+ ## skip untill end of segment, maked by %%EOF
376
+ @scanner.skip_until(/\%\%EOF/)
377
+ ##########
378
+ ## If this was the last valid segment, ignore any trailing garbage
379
+ ## (issue #49 resolution)
380
+ break unless @scanner.exist?(/\%\%EOF/)
381
+
382
+ end
383
+
384
+ elsif @scanner.scan(/[\s]+/)
385
+ # Generally, do nothing
386
+ nil
387
+ elsif @scanner.scan(/obj[\s]*/)
388
+ # Fix wkhtmltopdf PDF authoring issue - missing 'endobj' keywords
389
+ unless fresh || (out[-4].nil? || out[-4].is_a?(Hash))
390
+ keep = []
391
+ keep << out.pop # .tap {|i| puts "#{i} is an ID"}
392
+ keep << out.pop # .tap {|i| puts "#{i} is a REF"}
393
+
394
+ if out.last.is_a? Hash
395
+ out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
396
+ else
397
+ out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
398
+ end
399
+ # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
400
+ out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
401
+ warn "'endobj' keyword was missing for Object ID: #{out.last[:indirect_reference_id]}, trying to auto-fix issue, but might fail."
402
+
403
+ out << keep.pop
404
+ out << keep.pop
405
+ end
406
+ fresh = false
407
+ else
408
+ # always advance
409
+ # warn "Advancing for unknown reason... #{@scanner.string[@scanner.pos - 4, 8]} ... #{@scanner.peek(4)}" unless @scanner.peek(1) =~ /[\s\n]/
410
+ warn 'Warning: parser advancing for unknown reason. Potential data-loss.'
411
+ @scanner.pos = @scanner.pos + 1
412
+ end
413
+ end
414
+ out
415
+ end
416
+
417
+ protected
418
+
419
+ # resets cataloging and pages
420
# Walks the catalog / page tree, pushing inheritable attributes down to every
# :Page Hash and collecting the catalog-level dictionaries.
#
# catalogs:: the node(s) to process - a Hash, an Array of Hashes, or +nil+ to
#            start from the document's catalog (taken from root_object[:Root]
#            or, failing that, the last :Catalog object found in @parsed).
# inheritance_hash:: the attributes collected from the ancestors of +catalogs+
#                    (MediaBox, CropBox, Rotate, Resources, ColorSpace).
#
# Side effects: when started without +catalogs+, every :Catalog object and
# every +nil+ is removed from @parsed and the chosen catalog is re-appended;
# each :Page Hash is extended with Page_Methods; @forms_object, @names_object
# and @outlines_object are updated from the :Catalog Hash.
#
# Returns +self+.
#
# Raises a RuntimeError when no catalog can be found, or when Optional
# Content is present and @allow_optional_content isn't set.
#
# NOTE(review): +inheritance_hash.dup+ is a shallow copy, so the nested
# :Resources / :ColorSpace Hashes are shared between sibling branches and are
# updated in place - confirm this sharing is intended.
def catalog_pages(catalogs = nil, inheritance_hash = {})
  unless catalogs
    if root_object[:Root]
      catalogs = root_object[:Root][:referenced_object] || root_object[:Root]
    else
      # nil entries are skipped here, just as the clean-up below tolerates them.
      catalogs = @parsed.select { |obj| obj && obj[:Type] == :Catalog }.last
    end
    # fail before touching @parsed, so a failed lookup never appends nil to it.
    raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs

    @parsed.delete_if { |obj| obj.nil? || obj[:Type] == :Catalog }
    @parsed << catalogs
  end

  if catalogs.is_a?(Array)
    catalogs.each { |c| catalog_pages(c, inheritance_hash) unless c.nil? }
  elsif catalogs.is_a?(Hash)
    if catalogs[:is_reference_only]
      if catalogs[:referenced_object]
        catalog_pages(catalogs[:referenced_object], inheritance_hash)
      else
        warn "couldn't follow reference!!! #{catalogs} not found!"
      end
    else
      direct_keys = [:MediaBox, :CropBox, :Rotate] # inherited by plain assignment
      merged_keys = [:Resources, :ColorSpace] # inherited by merging dictionaries

      unless catalogs[:Type] == :Page
        raise "Optional Content PDF files aren't supported and their pages cannot be safely extracted." if (catalogs[:AS] || catalogs[:OCProperties]) && !@allow_optional_content
        # collect whatever this node offers to its descendants.
        direct_keys.each { |key| inheritance_hash[key] = catalogs[key] if catalogs[key] }
        merged_keys.each { |key| merge_inheritable_dictionary(inheritance_hash, catalogs, key) }
      end

      case catalogs[:Type]
      when :Page
        # the page's own values win over the inherited ones.
        direct_keys.each { |key| catalogs[key] ||= inheritance_hash[key] if inheritance_hash[key] }
        merged_keys.each { |key| merge_inheritable_dictionary(catalogs, inheritance_hash, key) }

        # avoid references on MediaBox, CropBox and Rotate
        direct_keys.each do |key|
          value = catalogs[key]
          next unless value.is_a?(Hash) && value[:referenced_object].is_a?(Hash)
          direct_value = value[:referenced_object][:indirect_without_dictionary]
          catalogs[key] = direct_value if direct_value
        end

        catalogs.extend Page_Methods
      when :Pages
        catalog_pages(catalogs[:Kids], inheritance_hash.dup) unless catalogs[:Kids].nil?
      when :Catalog
        @forms_object.update((catalogs[:AcroForm][:referenced_object] || catalogs[:AcroForm]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:AcroForm]
        @names_object.update((catalogs[:Names][:referenced_object] || catalogs[:Names]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Names]
        @outlines_object.update((catalogs[:Outlines][:referenced_object] || catalogs[:Outlines]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Outlines]
        if catalogs[:Dests] # convert PDF 1.1 Dests to PDF 1.2+ Dests
          dests_arry = (@names_object[:Dests] ||= {})
          dests_arry = ((dests_arry[:referenced_object] || dests_arry)[:Names] ||= [])
          # the Dests entry may be wrapped in up to two reference layers.
          legacy_dests = catalogs[:Dests][:referenced_object] || catalogs[:Dests]
          legacy_dests = legacy_dests[:referenced_object] || legacy_dests
          legacy_dests.each do |k, v|
            next if CombinePDF::PDF::PRIVATE_HASH_KEYS.include?(k)
            dests_arry << unify_string(k.to_s)
            dests_arry << v
          end
        end
        catalog_pages(catalogs[:Pages], inheritance_hash.dup) unless catalogs[:Pages].nil?
      end
    end
  end
  self
end

# Merges the dictionary stored under +key+ in +source+ into the one stored
# under +key+ in +target+ (creating an empty reference-style wrapper in
# +target+ when it has none), keeping the values already in +target+ whenever
# both sides define the same entry. Does nothing when +source+ has no +key+.
def merge_inheritable_dictionary(target, source, key)
  return unless source[key]
  target[key] ||= { referenced_object: {}, is_reference_only: true }
  (target[key][:referenced_object] || target[key]).update(
    (source[key][:referenced_object] || source[key]),
    &self.class.method(:hash_update_proc_for_old)
  )
end
507
+
508
# Looks up, among the objects in @parsed, the first Hash whose
# :indirect_reference_id and :indirect_generation_number both equal those of
# +reference_hash+.
#
# reference_hash:: a Hash holding the two identifying keys.
#
# Returns the matching Hash; when nothing matches, a warning is printed and
# +nil+ is returned.
def get_refernced_object(reference_hash = {})
  found = @parsed.find do |candidate|
    next false unless candidate.is_a?(Hash)
    next false unless reference_hash[:indirect_reference_id] == candidate[:indirect_reference_id]
    reference_hash[:indirect_generation_number] == candidate[:indirect_generation_number]
  end
  return found if found

  warn "didn't find reference #{reference_hash}"
  nil
end
520
+
521
+ # # @private
522
+ # # connects references and objects, according to their reference id's.
523
+ # #
524
+ # # should be moved to the parser's workflow.
525
+ # #
526
+ # def serialize_objects_and_references_old
527
+ # obj_dir = {}
528
+ # # create a dictionary for referenced objects (no value resolution at this point)
529
+ # @parsed.each { |o| obj_dir[[o.delete(:indirect_reference_id), o.delete(:indirect_generation_number)]] = o }
530
+ # # @parsed.each {|o| obj_dir[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
531
+ # @references.each do |obj|
532
+ # obj[:referenced_object] = obj_dir[[obj[:indirect_reference_id], obj[:indirect_generation_number]]]
533
+ # warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj}" unless obj[:referenced_object]
534
+ # obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
535
+ # end
536
+ # obj_dir.clear
537
+ # @references.clear
538
+ # self
539
+ # end
540
+
541
+ # @private
542
+ # connects references and objects, according to their reference id's.
543
+ #
544
+ # Also replaces :indirect_without_dictionary objects with their actual values. Strings, Hashes and Arrays still share memory space.
545
+ #
546
+ # should be moved to the parser's workflow.
547
+ #
548
# Connects every reference stub (a Hash flagged with :is_reference_only) found
# inside @parsed and @root_object to the object it points at.
#
# The objects in @parsed are first indexed by their
# [:indirect_reference_id, :indirect_generation_number] pair; entries that
# aren't Hashes are left out of the index. The data is then walked
# iteratively (an explicit stack, no recursion) and each stub is handed to
# #connect_reference, whose result replaces the stub inside its container.
#
# Reference stubs are not descended into; other Hashes and Arrays are.
def serialize_objects_and_references
  obj_dir = {}
  # create a dictionary for referenced objects (no value resolution at this point)
  @parsed.each do |o|
    next unless o.is_a?(Hash)
    obj_dir[[o[:indirect_reference_id], o[:indirect_generation_number]]] = o
  end

  should_resolve = [@parsed, @root_object]
  until should_resolve.empty?
    obj = should_resolve.pop
    if obj.is_a?(Hash)
      # iterate over a snapshot of the keys, values are replaced while walking.
      obj.keys.each do |k|
        o = obj[k]
        if o.is_a?(Hash)
          if o[:is_reference_only]
            obj[k] = connect_reference(o, obj_dir)
          else
            should_resolve << o
          end
        elsif o.is_a?(Array)
          should_resolve << o
        end
      end
    elsif obj.is_a?(Array)
      obj.map! do |o|
        if o.is_a?(Hash)
          if o[:is_reference_only]
            o = connect_reference(o, obj_dir)
          else
            should_resolve << o
          end
        elsif o.is_a?(Array)
          should_resolve << o
        end
        o
      end
    end
  end
end

# Links a single reference stub to its target.
#
# reference:: the stub Hash (flagged with :is_reference_only).
# obj_dir:: the index of objects, keyed by [reference id, generation number].
#
# Returns the value that should take the stub's place:
# * +nil+ when the stub carries no :indirect_reference_id;
# * the target's :indirect_without_dictionary value when it has a truthy one;
# * otherwise the stub itself, with :referenced_object set (+nil+, plus a
#   warning, when the target is missing) and both identifying keys removed.
def connect_reference(reference, obj_dir)
  return nil if reference[:indirect_reference_id].nil?

  target = obj_dir[[reference[:indirect_reference_id], reference[:indirect_generation_number]]]
  reference[:referenced_object] = target
  warn "Couldn't connect reference for #{reference}" if target.nil?
  reference.delete :indirect_reference_id
  reference.delete :indirect_generation_number
  (target && target[:indirect_without_dictionary]) || reference
end
602
+
603
+ # def serialize_objects_and_references
604
+ # rec_resolve = proc do |level|
605
+ # if level.is_a?(Hash)
606
+ # if level[:is_reference_only]
607
+ # level[:referenced_object] = get_refernced_object(level)
608
+ # level = (level[:referenced_object] && level[:referenced_object][:indirect_without_dictionary]) || level
609
+ # level.delete :indirect_reference_id
610
+ # level.delete :indirect_generation_number
611
+ # else
612
+ # level.keys.each do |k|
613
+ # level[k] = rec_resolve.call(level[k]) unless level[k].is_a?(Hash) && level[k][:indirect_reference_id] && level[k][:is_reference_only].nil?
614
+ # end
615
+ # end
616
+ # elsif level.is_a?(Array)
617
+ # level.map! { |o| rec_resolve.call(o) }
618
+ # end
619
+ # level
620
+ # end
621
+ # rec_resolve.call(@root_object)
622
+ # rec_resolve.call(@parsed)
623
+ # self
624
+ # end
625
+
626
+ # All Strings are one String
627
# Returns the instance already stored in @strings_dictionary for +str+, or
# stores +str+ itself and returns it when no truthy entry exists yet.
def unify_string(str)
  known = @strings_dictionary[str]
  return known if known

  @strings_dictionary[str] = str
end
630
+
631
+ # @private
632
+ # this method reviews a Hash and updates it by merging Hash data,
633
+ # preferring the old over the new.
634
# Conflict resolver for Hash#merge / Hash#update that keeps the old value.
#
# _key:: the conflicting key (unused).
# old_data:: the value already stored.
# new_data:: the incoming value.
#
# When both values are Hashes they are merged recursively (into a new Hash)
# with the same rule. In every other case - including an old Hash facing a
# non-Hash value, which Hash#merge would reject with a TypeError - the old
# value is returned as is.
def self.hash_update_proc_for_old(_key, old_data, new_data)
  return old_data unless old_data.is_a?(Hash) && new_data.is_a?(Hash)

  old_data.merge(new_data, &method(:hash_update_proc_for_old))
end
641
+
642
+ # @private
643
+ # this method reviews a Hash and updates it by merging Hash data,
644
+ # preferring the new over the old.
645
# Conflict resolver for Hash#merge / Hash#update that takes the new value.
#
# _key:: the conflicting key (unused).
# old_data:: the value already stored.
# new_data:: the incoming value.
#
# When both values are Hashes they are merged recursively (into a new Hash)
# with the same rule. In every other case - including an old Hash facing a
# non-Hash value, which Hash#merge would reject with a TypeError - the new
# value is returned as is.
def self.hash_update_proc_for_new(_key, old_data, new_data)
  return new_data unless old_data.is_a?(Hash) && new_data.is_a?(Hash)

  old_data.merge(new_data, &method(:hash_update_proc_for_new))
end
652
+
653
+ # # run block of code on evey PDF object (PDF objects are class Hash)
654
+ # def each_object(object, limit_references = true, already_visited = {}, &block)
655
+ # unless limit_references
656
+ # already_visited[object.object_id] = true
657
+ # end
658
+ # case
659
+ # when object.is_a?(Array)
660
+ # object.each {|obj| each_object(obj, limit_references, already_visited, &block)}
661
+ # when object.is_a?(Hash)
662
+ # yield(object)
663
+ # unless limit_references && object[:is_reference_only]
664
+ # object.each do |k,v|
665
+ # each_object(v, limit_references, already_visited, &block) unless already_visited[v.object_id]
666
+ # end
667
+ # end
668
+ # end
669
+ # end
670
+ end
671
+ end