combine_pdf 0.2.5 → 0.2.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,535 +5,667 @@
5
5
  ## is subject to the same license.
6
6
  ########################################################
7
7
 
8
-
9
-
10
-
11
8
  module CombinePDF
12
-
13
-
14
- # @!visibility private
15
- # @private
16
- #:nodoc: all
17
-
18
- protected
19
-
20
- # This is the Parser class.
21
- #
22
- # It takes PDF data and parses it.
23
- #
24
- # The information is then used to initialize a PDF object.
25
- #
26
- # This is an internal class. you don't need it.
27
- class PDFParser
28
-
29
- # @!visibility private
30
-
31
-
32
- # the array containing all the parsed data (PDF Objects)
33
- attr_reader :parsed
34
- # a Float representing the PDF version of the data parsed (if exists).
35
- attr_reader :version
36
- # the info and root objects, as found (if found) in the PDF file.
37
- #
38
- # they are mainly to used to know if the file is (was) encrypted and to get more details.
39
- attr_reader :info_object, :root_object
40
-
41
- # when creating a parser, it is important to set the data (String) we wish to parse.
42
- #
43
- # <b>the data is required and it is not possible to set the data at a later stage</b>
44
- #
45
- # string:: the data to be parsed, as a String object.
46
- def initialize (string)
47
- raise TypeError, "couldn't parse data, expecting type String" unless string.is_a? String
48
- @string_to_parse = string.force_encoding(Encoding::ASCII_8BIT)
49
- @literal_strings = []
50
- @hex_strings = []
51
- @streams = []
52
- @parsed = []
53
- @references = []
54
- @root_object = {}
55
- @info_object = {}
56
- @version = nil
57
- @scanner = nil
58
- end
59
-
60
- # parse the data in the new parser (the data already set through the initialize / new method)
61
- def parse
62
- return [] if @string_to_parse.empty?
63
- return @parsed unless @parsed.empty?
64
- @scanner = StringScanner.new @string_to_parse
65
- @scanner.pos = 0
66
- if @scanner.scan /\%PDF\-[\d\-\.]+/
67
- @version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
68
- end
69
-
70
- @parsed = _parse_
71
-
72
- if @root_object == {}
73
- xref_streams = @parsed.select {|obj| obj.is_a?(Hash) && obj[:Type] == :XRef}
74
- xref_streams.each do |xref_dictionary|
75
- @root_object.merge! xref_dictionary
76
- end
77
- end
78
- raise "root is unknown - cannot determine if file is Encrypted" if @root_object == {}
79
-
80
- if @root_object[:Encrypt]
81
- change_references_to_actual_values @root_object
82
- warn "PDF is Encrypted! Attempting to decrypt - not yet fully supported."
83
- decryptor = PDFDecrypt.new @parsed, @root_object
84
- decryptor.decrypt
85
- #do we really need to apply to @parsed? No, there is no need.
86
- end
87
-
88
- ## search for objects streams
89
- object_streams = @parsed.select {|obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm}
90
- unless object_streams.empty?
91
- warn "PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects."
92
-
93
- object_streams.each do |o|
94
- ## un-encode (using the correct filter) the object streams
95
- PDFFilter.inflate_object o
96
- ## extract objects from stream to top level arry @parsed
97
- @scanner = StringScanner.new o[:raw_stream_content]
98
- stream_data = _parse_
99
- id_array = []
100
- while stream_data[0].is_a? Fixnum
101
- id_array << stream_data.shift
102
- stream_data.shift
103
- end
104
- while id_array[0] && stream_data[0]
105
- stream_data[0] = {indirect_without_dictionary: stream_data[0]} unless stream_data[0].is_a?(Hash)
106
- stream_data[0][:indirect_reference_id] = id_array.shift
107
- stream_data[0][:indirect_generation_number] = 0
108
- @parsed << stream_data.shift
109
- end
110
- end
111
- end
112
-
113
-
114
- # serialize_objects_and_references.catalog_pages
115
-
116
- # Benchmark.bm do |bm|
117
- # bm.report("serialize") {1000.times {serialize_objects_and_references} }
118
- # bm.report("serialize - old") {1000.times {old_serialize_objects_and_references} }
119
- # bm.report("catalog") {1000.times {catalog_pages} }
120
- # end
121
-
122
- serialize_objects_and_references.catalog_pages
123
-
124
- @info_object = @root_object[:Info] ? (@root_object[:Info][:referenced_object] || @root_object[:Info]) : false
125
- if @info_object && @info_object.is_a?(Hash)
126
- @parsed.delete @info_object
127
- CombinePDF::PDF::PRIVATE_HASH_KEYS.each {|key| @info_object.delete key}
128
- @info_object.each {|k, v| @info_object = v[:referenced_object] if v.is_a?(Hash) && v[:referenced_object]}
129
- else
130
- @info_object = {}
131
- end
132
- # # # ## remove object streams - if they exist
133
- # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :ObjStm}
134
- # # # ## remove XREF dictionaries - if they exist
135
- # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :XRef}
136
-
137
- @parsed
138
- end
139
-
140
- # the actual recoursive parsing is done here.
141
- #
142
- # this is an internal function, but it was left exposed for posible future features.
143
- def _parse_
144
- out = []
145
- str = ''
146
- while @scanner.rest? do
147
- case
148
- ##########################################
149
- ## parse an Array
150
- ##########################################
151
- when @scanner.scan(/\[/)
152
- out << _parse_
153
- ##########################################
154
- ## parse a Dictionary
155
- ##########################################
156
- when @scanner.scan(/<</)
157
- data = _parse_
158
- obj = {}
159
- obj[data.shift] = data.shift while data[0]
160
- out << obj
161
- ##########################################
162
- ## return content of array or dictionary
163
- ##########################################
164
- when @scanner.scan(/\]/), @scanner.scan(/>>/)
165
- return out
166
- ##########################################
167
- ## parse a Stream
168
- ##########################################
169
- when @scanner.scan(/stream[\r]?[\n]/)
170
- # the following was dicarded because some PDF files didn't have an EOL marker as required
171
- # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
172
- # instead, a non-strict RegExp is used:
173
- str = @scanner.scan_until(/endstream/)
174
- # raise error if the stream doesn't end.
175
- raise "Parsing Error: PDF file error - a stream object wasn't properly colsed using 'endstream'!" unless str
176
- # need to remove end of stream
177
- if out.last.is_a? Hash
178
- out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
179
- # out.last[:raw_stream_content] = str.gsub(/[\n\r]?[\n\r]?endstream/, "")
180
- else
181
- warn "Stream not attached to dictionary!"
182
- out << str[0...-10].force_encoding(Encoding::ASCII_8BIT)
183
- end
184
- ##########################################
185
- ## parse an Object after finished
186
- ##########################################
187
- when str = @scanner.scan(/endobj/)
188
- #what to do when this is an object?
189
- if out.last.is_a? Hash
190
- out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
191
- else
192
- out << {indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
193
- end
194
- ##########################################
195
- ## parse a Hex String
196
- ##########################################
197
- when str = @scanner.scan(/<[0-9a-fA-F]+>/)
198
- # warn "Found a hex string"
199
- out << [str[1..-2]].pack('H*')
200
- ##########################################
201
- ## parse a Literal String
202
- ##########################################
203
- when @scanner.scan(/\(/)
204
- # warn "Found a literal string"
205
- str = ''.force_encoding(Encoding::ASCII_8BIT)
206
- count = 1
207
- while count > 0 && @scanner.rest? do
208
- str += @scanner.scan_until(/[\(\)]/).to_s
209
- seperator_count = 0
210
- seperator_count += 1 while str[-2-seperator_count] == "\\"
211
-
212
- case str[-1]
213
- when '('
214
- ## The following solution fails when (string ends with this sign: \\)
215
-
216
- count += 1 unless seperator_count.odd?
217
- when ')'
218
- count -= 1 unless seperator_count.odd?
219
- else
220
- warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
221
- count = 0 # error
222
- end
223
- end
224
- # The PDF formatted string is: str[0..-2]
225
- # now starting to convert to regular string
226
- str_bytes = str.force_encoding(Encoding::ASCII_8BIT)[0..-2].bytes.to_a
227
- str = []
228
- until str_bytes.empty?
229
- case str_bytes[0]
230
- when 13 # eol - \r
231
- # An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS
232
- # shall be treated as a byte value of (0Ah),
233
- # irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
234
- str_bytes.shift
235
- str_bytes.shift if str_bytes[0] == 10
236
- str << 10
237
- when 10 # eol - \n
238
- # An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS
239
- # shall be treated as a byte value of (0Ah),
240
- # irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
241
- str_bytes.shift
242
- str_bytes.shift if str_bytes[0] == 13
243
- str << 10
244
- when 92 # "\\".ord == 92
245
- str_bytes.shift
246
- rep = str_bytes.shift
247
- case rep
248
- when 110 #n
249
- str << 10 #new line
250
- when 114 #r
251
- str << 13 # CR
252
- when 116 #t
253
- str << 9 #tab
254
- when 98 #b
255
- str << 8
256
- when 102 #f
257
- str << 255
258
- when 48..57 #octal notation for byte?
259
- rep = rep.chr
260
- rep += str_bytes.shift.chr if str_bytes[0].between?(48,57)
261
- rep += str_bytes.shift.chr if str_bytes[0].between?(48,57) && ((rep + str_bytes[0].chr).to_i <= 255)
262
- str << rep.to_i
263
- when 10 # new line, ignore
264
- str_bytes.shift if str_bytes[0] == 13
265
- true
266
- when 13 # new line (or double notation for new line), ignore
267
- str_bytes.shift if str_bytes[0] == 10
268
- true
269
- else
270
- str << rep
271
- end
272
- else
273
- str << str_bytes.shift
274
- end
275
- end
276
- out << str.pack('C*').force_encoding(Encoding::ASCII_8BIT)
277
- ##########################################
278
- ## Parse a comment
279
- ##########################################
280
- when str = @scanner.scan(/\%/)
281
- #is a comment, skip until new line
282
- @scanner.skip_until /[\n\r]+/
283
- ##########################################
284
- ## Parse a Name
285
- ##########################################
286
- # old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
287
- # I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
288
- # all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
289
- # all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
290
- when str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+/)
291
- out << ( str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) {|a| a[1..2].hex.chr } ).to_sym
292
- ##########################################
293
- ## Parse a Number
294
- ##########################################
295
- when str = @scanner.scan(/[\+\-\.\d]+/)
296
- str.match(/\./) ? (out << str.to_f) : (out << str.to_i)
297
- ##########################################
298
- ## Parse an Object Reference
299
- ##########################################
300
- when @scanner.scan(/R/)
301
- out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
302
- @references << out.last
303
- ##########################################
304
- ## Parse Bool - true and after false
305
- ##########################################
306
- when @scanner.scan(/true/)
307
- out << true
308
- when @scanner.scan(/false/)
309
- out << false
310
- ##########################################
311
- ## Parse NULL - null
312
- ##########################################
313
- when @scanner.scan(/null/)
314
- out << nil
315
- ##########################################
316
- ## XREF - check for encryption... anything else?
317
- ##########################################
318
- when @scanner.scan(/xref/)
319
- ##########
320
- ## get root object to check for encryption
321
- @scanner.scan_until(/(trailer)|(\%EOF)/)
322
-
323
- if @scanner.matched[-1] == 'r'
324
- if @scanner.skip_until(/<</)
325
- data = _parse_
326
- @root_object = {}
327
- @root_object[data.shift] = data.shift while data[0]
328
- end
329
- ##########
330
- ## skip untill end of segment, maked by %%EOF
331
- @scanner.skip_until(/\%\%EOF/)
332
- end
333
-
334
- when @scanner.scan(/[\s]+/)
335
- # Generally, do nothing
336
- nil
337
- when @scanner.scan(/obj[\s]*/)
338
- # Fix wkhtmltopdf PDF authoring issue - missing 'endobj' keywords
339
- unless out[-4].nil? || out[-4].is_a?(Hash)
340
- keep = []
341
- keep << out.pop
342
- keep << out.pop
343
-
344
- if out.last.is_a? Hash
345
- out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
346
- else
347
- out << {indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
348
- end
349
- warn "'endobj' keyword was missing for Object ID: #{out.last[:indirect_reference_id]}, trying to auto-fix issue, but might fail."
350
-
351
- out << keep.pop
352
- out << keep.pop
353
- end
354
- else
355
- # always advance
356
- # warn "Advnacing for unknown reason..."
357
- @scanner.pos = @scanner.pos + 1
358
- end
359
- end
360
- out
361
- end
362
-
363
- protected
364
-
365
-
366
-
367
- # resets cataloging and pages
368
- def catalog_pages(catalogs = nil, secure_injection = false, inheritance_hash = {})
369
- unless catalogs
370
-
371
- if root_object[:Root]
372
- catalogs = root_object[:Root][:referenced_object] || root_object[:Root]
373
- else
374
- catalogs = (@parsed.select {|obj| obj[:Type] == :Catalog}).last
375
- end
376
- @parsed.delete_if {|obj| obj[:Type] == :Catalog}
377
- @parsed << catalogs
378
-
379
- raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs
380
- end
381
- case
382
- when catalogs.is_a?(Array)
383
- catalogs.each {|c| catalog_pages(c, secure_injection, inheritance_hash ) unless c.nil?}
384
- when catalogs.is_a?(Hash)
385
- if catalogs[:is_reference_only]
386
- if catalogs[:referenced_object]
387
- catalog_pages(catalogs[:referenced_object], secure_injection, inheritance_hash)
388
- else
389
- warn "couldn't follow reference!!! #{catalogs} not found!"
390
- end
391
- else
392
- unless catalogs[:Type] == :Page
393
- raise "Optional Content PDF files aren't supported and their pages cannot be safely extracted." if catalogs[:AS] || catalogs[:OCProperties]
394
- inheritance_hash[:MediaBox] = catalogs[:MediaBox] if catalogs[:MediaBox]
395
- inheritance_hash[:CropBox] = catalogs[:CropBox] if catalogs[:CropBox]
396
- inheritance_hash[:Rotate] = catalogs[:Rotate] if catalogs[:Rotate]
397
- (inheritance_hash[:Resources] ||= {}).update( (catalogs[:Resources][:referenced_object] || catalogs[:Resources]), &self.class.method(:hash_update_proc_for_new) ) if catalogs[:Resources]
398
- (inheritance_hash[:ColorSpace] ||= {}).update( (catalogs[:ColorSpace][:referenced_object] || catalogs[:ColorSpace]), &self.class.method(:hash_update_proc_for_new) ) if catalogs[:ColorSpace]
399
-
400
- # inheritance_hash[:Order] = catalogs[:Order] if catalogs[:Order]
401
- # inheritance_hash[:OCProperties] = catalogs[:OCProperties] if catalogs[:OCProperties]
402
- # inheritance_hash[:AS] = catalogs[:AS] if catalogs[:AS]
403
- end
404
-
405
- case catalogs[:Type]
406
- when :Page
407
-
408
- catalogs[:MediaBox] ||= inheritance_hash[:MediaBox] if inheritance_hash[:MediaBox]
409
- catalogs[:CropBox] ||= inheritance_hash[:CropBox] if inheritance_hash[:CropBox]
410
- catalogs[:Rotate] ||= inheritance_hash[:Rotate] if inheritance_hash[:Rotate]
411
- (catalogs[:Resources] ||= {}).update( inheritance_hash[:Resources], &( self.class.method(:hash_update_proc_for_old) ) ) if inheritance_hash[:Resources]
412
- (catalogs[:ColorSpace] ||= {}).update( inheritance_hash[:ColorSpace], &( self.class.method(:hash_update_proc_for_old) ) ) if inheritance_hash[:ColorSpace]
413
- # catalogs[:Order] ||= inheritance_hash[:Order] if inheritance_hash[:Order]
414
- # catalogs[:AS] ||= inheritance_hash[:AS] if inheritance_hash[:AS]
415
- # catalogs[:OCProperties] ||= inheritance_hash[:OCProperties] if inheritance_hash[:OCProperties]
416
-
417
-
418
- # avoide references on MediaBox, CropBox and Rotate
419
- catalogs[:MediaBox] = catalogs[:MediaBox][:referenced_object][:indirect_without_dictionary] if catalogs[:MediaBox].is_a?(Hash) && catalogs[:MediaBox][:referenced_object].is_a?(Hash) && catalogs[:MediaBox][:referenced_object][:indirect_without_dictionary]
420
- catalogs[:CropBox] = catalogs[:CropBox][:referenced_object][:indirect_without_dictionary] if catalogs[:CropBox].is_a?(Hash) && catalogs[:CropBox][:referenced_object].is_a?(Hash) && catalogs[:CropBox][:referenced_object][:indirect_without_dictionary]
421
- catalogs[:Rotate] = catalogs[:Rotate][:referenced_object][:indirect_without_dictionary] if catalogs[:Rotate].is_a?(Hash) && catalogs[:Rotate][:referenced_object].is_a?(Hash) && catalogs[:Rotate][:referenced_object][:indirect_without_dictionary]
422
-
423
- catalogs.instance_eval {extend Page_Methods}
424
- catalogs.secure_injection = secure_injection
425
- when :Pages
426
- catalog_pages(catalogs[:Kids], secure_injection, inheritance_hash.dup ) unless catalogs[:Kids].nil?
427
- when :Catalog
428
- catalog_pages(catalogs[:Pages], secure_injection, inheritance_hash.dup ) unless catalogs[:Pages].nil?
429
- end
430
- end
431
- end
432
- self
433
- end
434
-
435
- # fails!
436
- def change_references_to_actual_values(hash_with_references = {})
437
- hash_with_references.each do |k,v|
438
- if v.is_a?(Hash) && v[:is_reference_only]
439
- hash_with_references[k] = get_refernced_object(v)
440
- hash_with_references[k] = hash_with_references[k][:indirect_without_dictionary] if hash_with_references[k].is_a?(Hash) && hash_with_references[k][:indirect_without_dictionary]
441
- warn "Couldn't connect all values from references - didn't find reference #{hash_with_references}!!!" if hash_with_references[k] == nil
442
- hash_with_references[k] = v unless hash_with_references[k]
443
- end
444
- end
445
- hash_with_references
446
- end
447
-
448
- def get_refernced_object(reference_hash = {})
449
- @parsed.each do |stored_object|
450
- return stored_object if ( stored_object.is_a?(Hash) &&
451
- reference_hash[:indirect_reference_id] == stored_object[:indirect_reference_id] &&
452
- reference_hash[:indirect_generation_number] == stored_object[:indirect_generation_number] )
453
- end
454
- warn "didn't find reference #{reference_hash}"
455
- nil
456
- end
457
-
458
- # @private
459
- # connects references and objects, according to their reference id's.
460
- #
461
- # should be moved to the parser's workflow.
462
- #
463
- def serialize_objects_and_references
464
- obj_dir = {}
465
- @parsed.each {|o| obj_dir[ [ o.delete(:indirect_reference_id), o.delete(:indirect_generation_number) ] ] = o }
466
- # @parsed.each {|o| obj_dir[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
467
- @references.each do |obj|
468
- obj[:referenced_object] = obj_dir[ [obj[:indirect_reference_id], obj[:indirect_generation_number] ] ]
469
- warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj.to_s}" unless obj[:referenced_object]
470
- obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
471
- end
472
- self
473
- # rescue => e
474
- # puts (@parsed.select {|o| !o.is_a?(Hash)})
475
- # puts (@parsed)
476
- # puts (@references)
477
- # raise e
478
- end
479
-
480
- # @private
481
- # this method reviews a Hash and updates it by merging Hash data,
482
- # preffering the old over the new.
483
- def self.hash_update_proc_for_old key, old_data, new_data
484
- if old_data.is_a? Hash
485
- old_data.merge( new_data, &self.method(:hash_update_proc_for_old) )
486
- else
487
- old_data
488
- end
489
- end
490
- # @private
491
- # this method reviews a Hash an updates it by merging Hash data,
492
- # preffering the new over the old.
493
- def self.hash_update_proc_for_new key, old_data, new_data
494
- if old_data.is_a? Hash
495
- old_data.merge( new_data, &self.method(:hash_update_proc_for_new) )
496
- else
497
- new_data
498
- end
499
- end
500
-
501
- # # @private
502
- # # connects references and objects, according to their reference id's.
503
- # #
504
- # # should be moved to the parser's workflow.
505
- # #
506
- # def old_serialize_objects_and_references(object = nil)
507
- # objects_reference_hash = {}
508
- # # @parsed.each {|o| objects_reference_hash[ [ o.delete(:indirect_reference_id), o.delete(:indirect_generation_number) ] ] = o }
509
- # @parsed.each {|o| objects_reference_hash[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
510
- # each_object(@parsed) do |obj|
511
- # if obj[:is_reference_only]
512
- # obj[:referenced_object] = objects_reference_hash[ [obj[:indirect_reference_id], obj[:indirect_generation_number] ] ]
513
- # warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj.to_s}" unless obj[:referenced_object]
514
- # # obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
515
- # end
516
- # end
517
- # self
518
- # end
519
-
520
- # # run block of code on evey PDF object (PDF objects are class Hash)
521
- # def each_object(object, limit_references = true, already_visited = {}, &block)
522
- # unless limit_references
523
- # already_visited[object.object_id] = true
524
- # end
525
- # case
526
- # when object.is_a?(Array)
527
- # object.each {|obj| each_object(obj, limit_references, already_visited, &block)}
528
- # when object.is_a?(Hash)
529
- # yield(object)
530
- # unless limit_references && object[:is_reference_only]
531
- # object.each do |k,v|
532
- # each_object(v, limit_references, already_visited, &block) unless already_visited[v.object_id]
533
- # end
534
- # end
535
- # end
536
- # end
537
-
538
- end
539
- end
9
+ # @!visibility private
10
+ # @private
11
+ #:nodoc: all
12
+
13
+ protected
14
+
15
+ # This is the Parser class.
16
+ #
17
+ # It takes PDF data and parses it.
18
+ #
19
+ # The information is then used to initialize a PDF object.
20
+ #
21
+ # This is an internal class. you don't need it.
22
+ class PDFParser
23
+ # @!visibility private
24
+
25
+ # the array containing all the parsed data (PDF Objects)
26
+ attr_reader :parsed
27
+ # a Float representing the PDF version of the data parsed (if exists).
28
+ attr_reader :version
29
+ # the info and root objects, as found (if found) in the PDF file.
30
+ #
31
+ # they are mainly used to know if the file is (was) encrypted and to get more details.
32
+ attr_reader :info_object, :root_object, :names_object, :forms_object, :outlines_object, :metadata
33
+
34
+ attr_reader :allow_optional_content
35
+ # when creating a parser, it is important to set the data (String) we wish to parse.
36
+ #
37
+ # <b>the data is required and it is not possible to set the data at a later stage</b>
38
+ #
39
+ # string:: the data to be parsed, as a String object.
40
+ def initialize(string, options = {})
41
+ raise TypeError, "couldn't parse data, expecting type String" unless string.is_a? String
42
+ @string_to_parse = string.force_encoding(Encoding::ASCII_8BIT)
43
+ @literal_strings = [].dup
44
+ @hex_strings = [].dup
45
+ @streams = [].dup
46
+ @parsed = [].dup
47
+ @references = [].dup
48
+ @root_object = {}.dup
49
+ @info_object = {}.dup
50
+ @names_object = {}.dup
51
+ @outlines_object = {}.dup
52
+ @forms_object = {}.dup
53
+ @metadata = nil
54
+ @strings_dictionary = {}.dup # all strings are one string
55
+ @version = nil
56
+ @scanner = nil
57
+ @allow_optional_content = options[:allow_optional_content]
58
+ end
59
+
60
+ # parse the data in the new parser (the data already set through the initialize / new method)
61
+ def parse
62
+ return [] if @string_to_parse.empty?
63
+ return @parsed unless @parsed.empty?
64
+ @scanner = StringScanner.new @string_to_parse
65
+ @scanner.pos = 0
66
+ @scanner.skip(/[^%]*/) if @scanner.exist?(/%PDF/i)
67
+ if @scanner.scan /\%PDF\-[\d\-\.]+/
68
+ @version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
69
+ loop do
70
+ break unless @scanner.scan(/[^\d\r\n]+/)
71
+ break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/)
72
+ break if @scanner.eos?
73
+ @scanner.pos += 1
74
+ end
75
+ end
76
+ @parsed = _parse_
77
+ # puts @parsed
78
+
79
+ raise 'Unknown PDF parsing error - malformed PDF file?' unless (@parsed.select { |i| !i.is_a?(Hash) }).empty?
80
+
81
+ if @root_object == {}.freeze
82
+ xref_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :XRef }
83
+ xref_streams.each do |xref_dictionary|
84
+ @root_object.merge! xref_dictionary
85
+ end
86
+ end
87
+
88
+ raise 'root is unknown - cannot determine if file is Encrypted' if @root_object == {}.freeze
89
+
90
+ if @root_object[:Encrypt]
91
+ # change_references_to_actual_values @root_object
92
+ warn 'PDF is Encrypted! Attempting to decrypt - not yet fully supported.'
93
+ decryptor = PDFDecrypt.new @parsed, @root_object
94
+ decryptor.decrypt
95
+ # do we really need to apply to @parsed? No, there is no need.
96
+ end
97
+
98
+ ## search for objects streams
99
+ object_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm }
100
+ unless object_streams.empty?
101
+ warn 'PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects.'
102
+
103
+ object_streams.each do |o|
104
+ ## un-encode (using the correct filter) the object streams
105
+ PDFFilter.inflate_object o
106
+ ## extract objects from stream to top level array @parsed
107
+ @scanner = StringScanner.new o[:raw_stream_content]
108
+ stream_data = _parse_
109
+ id_array = []
110
+ while stream_data[0].is_a? (Integer)
111
+ id_array << stream_data.shift
112
+ stream_data.shift
113
+ end
114
+ while id_array[0] && stream_data[0]
115
+ stream_data[0] = { indirect_without_dictionary: stream_data[0] } unless stream_data[0].is_a?(Hash)
116
+ stream_data[0][:indirect_reference_id] = id_array.shift
117
+ stream_data[0][:indirect_generation_number] = 0
118
+ @parsed << stream_data.shift
119
+ end
120
+ end
121
+ end
122
+
123
+ # serialize_objects_and_references.catalog_pages
124
+
125
+ # Benchmark.bm do |bm|
126
+ # bm.report("serialize") {1000.times {serialize_objects_and_references} }
127
+ # bm.report("serialize - old") {1000.times {old_serialize_objects_and_references} }
128
+ # bm.report("catalog") {1000.times {catalog_pages} }
129
+ # end
130
+
131
+ serialize_objects_and_references
132
+
133
+ catalog_pages
134
+
135
+ # Strings were unified, we can let them go..
136
+ @strings_dictionary.clear
137
+
138
+ # collect any missing objects from the forms_data
139
+ unless @forms_object.nil? || @forms_object.empty?
140
+ @forms_object[:related_objects] = (@parsed.select { |o| o[:FT] }).map! { |o| { is_reference_only: true, referenced_object: o } }
141
+ @forms_object[:related_objects].delete @forms_object
142
+ end
143
+
144
+ @info_object = @root_object[:Info] ? (@root_object[:Info][:referenced_object] || @root_object[:Info]) : false
145
+ if @info_object && @info_object.is_a?(Hash)
146
+ @parsed.delete @info_object
147
+ CombinePDF::PDF::PRIVATE_HASH_KEYS.each { |key| @info_object.delete key }
148
+ @info_object.each { |_k, v| @info_object = v[:referenced_object] if v.is_a?(Hash) && v[:referenced_object] }
149
+ else
150
+ @info_object = {}
151
+ end
152
+ # # # ## remove object streams - if they exist
153
+ # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :ObjStm}
154
+ # # # ## remove XREF dictionaries - if they exist
155
+ # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :XRef}
156
+
157
+ @parsed
158
+ end
159
+
160
+ # the actual recursive parsing is done here.
161
+ #
162
+ # this is an internal function, but it was left exposed for possible future features.
163
+ def _parse_
164
+ out = []
165
+ str = ''
166
+ fresh = true
167
+ while @scanner.rest?
168
+ # last ||= 0
169
+ # out.last.tap do |o|
170
+ # if o.is_a?(Hash)
171
+ # puts "[#{@scanner.pos}] Parser has a Dictionary (#{o.class.name}) with data:"
172
+ # o.each do |k, v|
173
+ # puts " #{k}: is #{v.class.name} with data: #{v.to_s[0..4]}#{"..." if v.to_s.length > 5}"
174
+ # end
175
+ # else
176
+ # puts "[#{@scanner.pos}] Parser has #{o.class.name} with data: #{o.to_s[0..4]}#{"..." if o.to_s.length > 5}"
177
+ # end
178
+ # puts "next is #{@scanner.peek 8}"
179
+ # end unless (last == out.count) || (-1 == (last = out.count))
180
+ if @scanner.scan(/\[/)
181
+ out << _parse_
182
+ ##########################################
183
+ ## parse a Dictionary
184
+ ##########################################
185
+ elsif @scanner.scan(/<</)
186
+ data = _parse_
187
+ obj = {}
188
+ obj[data.shift] = data.shift while data[0]
189
+ out << obj
190
+ ##########################################
191
+ ## return content of array or dictionary
192
+ ##########################################
193
+ elsif @scanner.scan(/\]/) || @scanner.scan(/>>/)
194
+ return out
195
+ ##########################################
196
+ ## parse a Stream
197
+ ##########################################
198
+ elsif @scanner.scan(/stream[\r\n]/)
199
+ @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
200
+ # the following was discarded because some PDF files didn't have an EOL marker as required
201
+ # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
202
+ # instead, a non-strict RegExp is used:
203
+ str = @scanner.scan_until(/endstream/)
204
+ # raise error if the stream doesn't end.
205
+ raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
206
+ # need to remove end of stream
207
+ if out.last.is_a? Hash
208
+ # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
209
+ out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
210
+ else
211
+ warn 'Stream not attached to dictionary!'
212
+ out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
213
+ end
214
+ ##########################################
215
+ ## parse an Object after finished
216
+ ##########################################
217
+ elsif str = @scanner.scan(/endobj/)
218
+ # what to do when this is an object?
219
+ if out.last.is_a? Hash
220
+ out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
221
+ else
222
+ out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
223
+ end
224
+ fresh = true
225
+ # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
226
+ out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
227
+ # puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last} :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Integer)
228
+ ##########################################
229
+ ## parse a Hex String
230
+ ##########################################
231
+ elsif str = @scanner.scan(/<[0-9a-fA-F]*>/)
232
+ # warn "Found a hex string"
233
+ out << unify_string([str[1..-2]].pack('H*').force_encoding(Encoding::ASCII_8BIT))
234
+ ##########################################
235
+ ## parse a Literal String
236
+ ##########################################
237
+ elsif @scanner.scan(/\(/)
238
+ # warn "Found a literal string"
239
+ str = ''.force_encoding(Encoding::ASCII_8BIT)
240
+ count = 1
241
+ while count > 0 && @scanner.rest?
242
+ scn = @scanner.scan_until(/[\(\)]/)
243
+ unless scn
244
+ warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
245
+ count = 0 # error
246
+ next
247
+ end
248
+
249
+ str += scn.to_s
250
+ seperator_count = 0
251
+ seperator_count += 1 while str[-2 - seperator_count] == '\\'
252
+
253
+ case str[-1]
254
+ when '('
255
+ ## The following solution might fail when (string ends with this sign: \\)
256
+ count += 1 unless seperator_count.odd?
257
+ when ')'
258
+ count -= 1 unless seperator_count.odd?
259
+ else
260
+ warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
261
+ count = 0 # error
262
+ end
263
+ end
264
+ # The PDF formatted string is: str[0..-2]
265
+ # now starting to convert to regular string
266
+ str_bytes = str.force_encoding(Encoding::ASCII_8BIT)[0..-2].bytes.to_a
267
+ str = []
268
+ until str_bytes.empty?
269
+ case str_bytes[0]
270
+ when 13 # eol - \r
271
+ # An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS
272
+ # shall be treated as a byte value of (0Ah),
273
+ # irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
274
+ str_bytes.shift
275
+ str_bytes.shift if str_bytes[0] == 10
276
+ str << 10
277
+ when 10 # eol - \n
278
+ # An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS
279
+ # shall be treated as a byte value of (0Ah),
280
+ # irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
281
+ str_bytes.shift
282
+ str_bytes.shift if str_bytes[0] == 13
283
+ str << 10
284
+ when 92 # "\\".ord == 92
285
+ str_bytes.shift
286
+ rep = str_bytes.shift
287
+ case rep
288
+ when 110 # n
289
+ str << 10 # new line
290
+ when 114 # r
291
+ str << 13 # CR
292
+ when 116 # t
293
+ str << 9 # tab
294
+ when 98 # b
295
+ str << 8
296
+ when 102 # f, form-feed
297
+ str << 12
298
+ when 48..57 # octal notation for byte?
299
+ rep = rep.chr
300
+ rep += str_bytes.shift.chr if str_bytes[0].between?(48, 57)
301
+ rep += str_bytes.shift.chr if str_bytes[0].between?(48, 57) && ((rep + str_bytes[0].chr).to_i <= 255)
302
+ str << rep.to_i
303
+ when 10 # new line, ignore
304
+ str_bytes.shift if str_bytes[0] == 13
305
+ true
306
+ when 13 # new line (or double notation for new line), ignore
307
+ str_bytes.shift if str_bytes[0] == 10
308
+ true
309
+ else
310
+ str << rep
311
+ end
312
+ else
313
+ str << str_bytes.shift
314
+ end
315
+ end
316
+ out << unify_string(str.pack('C*').force_encoding(Encoding::ASCII_8BIT))
317
+ ##########################################
318
+ ## Parse a comment
319
+ ##########################################
320
+ elsif str = @scanner.scan(/\%/)
321
+ # is a comment, skip until new line
322
+ loop do
323
+ # break unless @scanner.scan(/[^\d\r\n]+/)
324
+ break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/) || @scanner.eos? # || @scanner.scan(/[^\d]+[\r\n]+/) ||
325
+ @scanner.scan(/[^\d\r\n]+/) || @scanner.pos += 1
326
+ end
327
+ # puts "AFTER COMMENT: #{@scanner.peek 8}"
328
+ ##########################################
329
+ ## Parse a Name
330
+ ##########################################
331
+ # old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
332
+ # I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
333
+ # all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
334
+ # all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
335
+ elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/)
336
+ out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
337
+ ##########################################
338
+ ## Parse a Number
339
+ ##########################################
340
+ elsif str = @scanner.scan(/[\+\-\.\d]+/)
341
+ str =~ /\./ ? (out << str.to_f) : (out << str.to_i)
342
+ ##########################################
343
+ ## Parse an Object Reference
344
+ ##########################################
345
+ elsif @scanner.scan(/R/)
346
+ out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
347
+ # @references << out.last
348
+ ##########################################
349
+ ## Parse Bool - true and after false
350
+ ##########################################
351
+ elsif @scanner.scan(/true/)
352
+ out << true
353
+ elsif @scanner.scan(/false/)
354
+ out << false
355
+ ##########################################
356
+ ## Parse NULL - null
357
+ ##########################################
358
+ elsif @scanner.scan(/null/)
359
+ out << nil
360
+ ##########################################
361
+ ## XREF - check for encryption... anything else?
362
+ ##########################################
363
+ elsif @scanner.scan(/(startxref)|(xref)/)
364
+ ##########
365
+ ## get root object to check for encryption
366
+ @scanner.scan_until(/(trailer)|(\%EOF)/)
367
+ fresh = true
368
+ if @scanner.matched[-1] == 'r'
369
+ if @scanner.skip_until(/<</)
370
+ data = _parse_
371
+ @root_object ||= {}
372
+ @root_object[data.shift] = data.shift while data[0]
373
+ end
374
+ ##########
375
+ ## skip untill end of segment, maked by %%EOF
376
+ @scanner.skip_until(/\%\%EOF/)
377
+ ##########
378
+ ## If this was the last valid segment, ignore any trailing garbage
379
+ ## (issue #49 resolution)
380
+ break unless @scanner.exist?(/\%\%EOF/)
381
+
382
+ end
383
+
384
+ elsif @scanner.scan(/[\s]+/)
385
+ # Generally, do nothing
386
+ nil
387
+ elsif @scanner.scan(/obj[\s]*/)
388
+ # Fix wkhtmltopdf PDF authoring issue - missing 'endobj' keywords
389
+ unless fresh || (out[-4].nil? || out[-4].is_a?(Hash))
390
+ keep = []
391
+ keep << out.pop # .tap {|i| puts "#{i} is an ID"}
392
+ keep << out.pop # .tap {|i| puts "#{i} is a REF"}
393
+
394
+ if out.last.is_a? Hash
395
+ out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
396
+ else
397
+ out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
398
+ end
399
+ # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
400
+ out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
401
+ warn "'endobj' keyword was missing for Object ID: #{out.last[:indirect_reference_id]}, trying to auto-fix issue, but might fail."
402
+
403
+ out << keep.pop
404
+ out << keep.pop
405
+ end
406
+ fresh = false
407
+ else
408
+ # always advance
409
+ # warn "Advancing for unknown reason... #{@scanner.string[@scanner.pos - 4, 8]} ... #{@scanner.peek(4)}" unless @scanner.peek(1) =~ /[\s\n]/
410
+ warn 'Warning: parser advancing for unknown reason. Potential data-loss.'
411
+ @scanner.pos = @scanner.pos + 1
412
+ end
413
+ end
414
+ out
415
+ end
416
+
417
+ protected
418
+
419
# Walks the document's page tree, pushing inheritable attributes down to the
# pages and collecting catalog-level dictionaries.
#
# catalogs:: the node(s) to process - +nil+ (locate the document catalog),
#            an Array of nodes, or a single Hash node (a reference wrapper or
#            a Catalog / Pages / Page dictionary).
# inheritance_hash:: attributes gathered from ancestor nodes (:MediaBox,
#                    :CropBox, :Rotate, :Resources, :ColorSpace).
#
# When called without +catalogs+ the catalog is taken from the trailer's
# :Root entry or, failing that, from the last :Catalog object in @parsed;
# every other :Catalog (and any +nil+) is dropped from @parsed and the chosen
# catalog is appended.
#
# Returns +self+.
#
# Raises RuntimeError when no catalog can be found, or when the document uses
# Optional Content and @allow_optional_content isn't set.
def catalog_pages(catalogs = nil, inheritance_hash = {})
  unless catalogs
    if root_object[:Root]
      catalogs = root_object[:Root][:referenced_object] || root_object[:Root]
    else
      # @parsed may hold non-Hash leftovers, skip them instead of crashing.
      catalogs = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :Catalog }.last
    end

    # fail before touching @parsed, so a failed lookup doesn't leave a nil behind.
    raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs

    @parsed.delete_if { |obj| obj.nil? || (obj.is_a?(Hash) && obj[:Type] == :Catalog) }
    @parsed << catalogs
  end

  if catalogs.is_a?(Array)
    catalogs.each { |c| catalog_pages(c, inheritance_hash) unless c.nil? }
  elsif catalogs.is_a?(Hash)
    if catalogs[:is_reference_only]
      if catalogs[:referenced_object]
        catalog_pages(catalogs[:referenced_object], inheritance_hash)
      else
        warn "couldn't follow reference!!! #{catalogs} not found!"
      end
    else
      inheritable_values = [:MediaBox, :CropBox, :Rotate]
      inheritable_dictionaries = [:Resources, :ColorSpace]
      # merges +source+ into +target+ (looking through reference wrappers),
      # keeping whatever +target+ already defines.
      merge_missing = lambda do |target, source|
        (target[:referenced_object] || target).update((source[:referenced_object] || source), &self.class.method(:hash_update_proc_for_old))
      end

      unless catalogs[:Type] == :Page
        raise "Optional Content PDF files aren't supported and their pages cannot be safely extracted." if (catalogs[:AS] || catalogs[:OCProperties]) && !@allow_optional_content

        # collect whatever this node offers to its descendants.
        inheritable_values.each { |key| inheritance_hash[key] = catalogs[key] if catalogs[key] }
        inheritable_dictionaries.each do |key|
          next unless catalogs[key]
          inheritance_hash[key] ||= { referenced_object: {}, is_reference_only: true }
          merge_missing.call(inheritance_hash[key], catalogs[key])
        end
      end

      case catalogs[:Type]
      when :Page
        # apply inherited attributes, the page's own values take precedence.
        inheritable_values.each { |key| catalogs[key] ||= inheritance_hash[key] if inheritance_hash[key] }
        inheritable_dictionaries.each do |key|
          next unless inheritance_hash[key]
          catalogs[key] ||= { referenced_object: {}, is_reference_only: true }
          merge_missing.call(catalogs[key], inheritance_hash[key])
        end

        # avoid references on MediaBox, CropBox and Rotate
        inheritable_values.each do |key|
          value = catalogs[key]
          next unless value.is_a?(Hash) && value[:referenced_object].is_a?(Hash)
          direct = value[:referenced_object][:indirect_without_dictionary]
          catalogs[key] = direct if direct
        end

        catalogs.extend(Page_Methods)
      when :Pages
        # NOTE(review): dup is shallow, so the nested :Resources / :ColorSpace
        # hashes are shared (and mutated) across sibling branches - confirm this is intended.
        catalog_pages(catalogs[:Kids], inheritance_hash.dup) unless catalogs[:Kids].nil?
      when :Catalog
        @forms_object.update((catalogs[:AcroForm][:referenced_object] || catalogs[:AcroForm]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:AcroForm]
        @names_object.update((catalogs[:Names][:referenced_object] || catalogs[:Names]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Names]
        @outlines_object.update((catalogs[:Outlines][:referenced_object] || catalogs[:Outlines]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Outlines]
        if catalogs[:Dests] # convert PDF 1.1 Dests to PDF 1.2+ Dests
          dests_arry = (@names_object[:Dests] ||= {})
          dests_arry = ((dests_arry[:referenced_object] || dests_arry)[:Names] ||= [])
          # the dictionary may be wrapped in up to two reference layers.
          dests = catalogs[:Dests][:referenced_object] || catalogs[:Dests]
          dests = dests[:referenced_object] || dests
          dests.each do |k, v|
            next if CombinePDF::PDF::PRIVATE_HASH_KEYS.include?(k)
            dests_arry << unify_string(k.to_s)
            dests_arry << v
          end
        end
        catalog_pages(catalogs[:Pages], inheritance_hash.dup) unless catalogs[:Pages].nil?
      end
    end
  end
  self
end
507
+
508
# Searches @parsed for the object a reference points to.
#
# reference_hash:: a Hash carrying :indirect_reference_id and
#                  :indirect_generation_number.
#
# Returns the first Hash in @parsed whose id and generation number both equal
# those of +reference_hash+. When nothing matches, a warning is printed and
# +nil+ is returned.
def get_refernced_object(reference_hash = {})
  found = @parsed.find do |candidate|
    next false unless candidate.is_a?(Hash)
    [:indirect_reference_id, :indirect_generation_number].all? do |key|
      reference_hash[key] == candidate[key]
    end
  end
  return found if found

  warn "didn't find reference #{reference_hash}"
  nil
end
520
+
521
+ # # @private
522
+ # # connects references and objects, according to their reference id's.
523
+ # #
524
+ # # should be moved to the parser's workflow.
525
+ # #
526
+ # def serialize_objects_and_references_old
527
+ # obj_dir = {}
528
+ # # create a dictionary for referenced objects (no value resolution at this point)
529
+ # @parsed.each { |o| obj_dir[[o.delete(:indirect_reference_id), o.delete(:indirect_generation_number)]] = o }
530
+ # # @parsed.each {|o| obj_dir[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
531
+ # @references.each do |obj|
532
+ # obj[:referenced_object] = obj_dir[[obj[:indirect_reference_id], obj[:indirect_generation_number]]]
533
+ # warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj}" unless obj[:referenced_object]
534
+ # obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
535
+ # end
536
+ # obj_dir.clear
537
+ # @references.clear
538
+ # self
539
+ # end
540
+
541
# @private
# connects references and objects, according to their reference id's.
#
# Every Hash flagged :is_reference_only that is reachable from @parsed or
# @root_object gets a :referenced_object entry pointing at the matching parsed
# object, and loses its :indirect_reference_id / :indirect_generation_number keys.
#
# Also replaces references to :indirect_without_dictionary objects with their
# actual values. Strings, Hashes and Arrays still share memory space.
#
# A reference that carries no :indirect_reference_id is replaced with +nil+.
# A reference that can't be matched is kept (with a +nil+ :referenced_object)
# and a warning is printed.
#
# The traversal is iterative (explicit stack) and modifies the data in place.
# Returns +nil+.
#
# should be moved to the parser's workflow.
#
def serialize_objects_and_references
  # create a dictionary for referenced objects (no value resolution at this point)
  obj_dir = {}
  @parsed.each do |o|
    # only Hashes can carry an id, anything else in @parsed can't be referenced.
    next unless o.is_a?(Hash)
    obj_dir[[o[:indirect_reference_id], o[:indirect_generation_number]]] = o
  end

  # resolves a single reference Hash, returning the value that should take its place.
  resolve_reference = lambda do |ref|
    next nil if ref[:indirect_reference_id].nil?

    ref[:referenced_object] = obj_dir[[ref[:indirect_reference_id], ref[:indirect_generation_number]]]
    # the warning is issued before the ids are removed, so it shows which reference failed.
    warn "Couldn't connect reference for #{ref}" if ref[:referenced_object].nil?
    ref.delete :indirect_reference_id
    ref.delete :indirect_generation_number
    (ref[:referenced_object] && ref[:referenced_object][:indirect_without_dictionary]) || ref
  end

  should_resolve = [@parsed, @root_object]
  until should_resolve.empty?
    obj = should_resolve.pop
    if obj.is_a?(Hash)
      # iterate over a snapshot of the keys, values are replaced while iterating.
      obj.keys.each do |k|
        o = obj[k]
        if o.is_a?(Hash)
          if o[:is_reference_only]
            obj[k] = resolve_reference.call(o)
          else
            should_resolve << o
          end
        elsif o.is_a?(Array)
          should_resolve << o
        end
      end
    elsif obj.is_a?(Array)
      obj.map! do |o|
        if o.is_a?(Hash)
          if o[:is_reference_only]
            o = resolve_reference.call(o)
          else
            should_resolve << o
          end
        elsif o.is_a?(Array)
          should_resolve << o
        end
        o
      end
    end
  end
end
602
+
603
+ # def serialize_objects_and_references
604
+ # rec_resolve = proc do |level|
605
+ # if level.is_a?(Hash)
606
+ # if level[:is_reference_only]
607
+ # level[:referenced_object] = get_refernced_object(level)
608
+ # level = (level[:referenced_object] && level[:referenced_object][:indirect_without_dictionary]) || level
609
+ # level.delete :indirect_reference_id
610
+ # level.delete :indirect_generation_number
611
+ # else
612
+ # level.keys.each do |k|
613
+ # level[k] = rec_resolve.call(level[k]) unless level[k].is_a?(Hash) && level[k][:indirect_reference_id] && level[k][:is_reference_only].nil?
614
+ # end
615
+ # end
616
+ # elsif level.is_a?(Array)
617
+ # level.map! { |o| rec_resolve.call(o) }
618
+ # end
619
+ # level
620
+ # end
621
+ # rec_resolve.call(@root_object)
622
+ # rec_resolve.call(@parsed)
623
+ # self
624
+ # end
625
+
626
# All Strings are one String
#
# Returns the String already stored in @strings_dictionary under a key equal
# to +str+. When there is none, +str+ itself is stored and returned.
def unify_string(str)
  existing = @strings_dictionary[str]
  return existing if existing

  @strings_dictionary[str] = str
end
630
+
631
# @private
# this method reviews a Hash and updates it by merging Hash data,
# preferring the old over the new.
#
# It has the signature of a Hash#merge / Hash#update conflict block.
#
# _key:: the conflicting key (unused).
# old_data:: the value already present.
# new_data:: the incoming value.
#
# When both values are Hashes they are merged recursively into a new Hash
# (neither input is modified). In any other case - including a Hash facing a
# non-Hash, which can't be merged - the old value wins.
def self.hash_update_proc_for_old(_key, old_data, new_data)
  if old_data.is_a?(Hash) && new_data.is_a?(Hash)
    old_data.merge(new_data, &method(:hash_update_proc_for_old))
  else
    old_data
  end
end
641
+
642
# @private
# this method reviews a Hash and updates it by merging Hash data,
# preferring the new over the old.
#
# It has the signature of a Hash#merge / Hash#update conflict block.
#
# _key:: the conflicting key (unused).
# old_data:: the value already present.
# new_data:: the incoming value.
#
# When both values are Hashes they are merged recursively into a new Hash
# (neither input is modified). In any other case - including a Hash facing a
# non-Hash, which can't be merged - the new value wins.
def self.hash_update_proc_for_new(_key, old_data, new_data)
  if old_data.is_a?(Hash) && new_data.is_a?(Hash)
    old_data.merge(new_data, &method(:hash_update_proc_for_new))
  else
    new_data
  end
end
652
+
653
+ # # run block of code on every PDF object (PDF objects are class Hash)
654
+ # def each_object(object, limit_references = true, already_visited = {}, &block)
655
+ # unless limit_references
656
+ # already_visited[object.object_id] = true
657
+ # end
658
+ # case
659
+ # when object.is_a?(Array)
660
+ # object.each {|obj| each_object(obj, limit_references, already_visited, &block)}
661
+ # when object.is_a?(Hash)
662
+ # yield(object)
663
+ # unless limit_references && object[:is_reference_only]
664
+ # object.each do |k,v|
665
+ # each_object(v, limit_references, already_visited, &block) unless already_visited[v.object_id]
666
+ # end
667
+ # end
668
+ # end
669
+ # end
670
+ end
671
+ end