combine_pdf 0.2.21 → 0.2.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,583 +5,654 @@
5
5
  ## is subject to the same license.
6
6
  ########################################################
7
7
 
8
-
9
-
10
-
11
8
  module CombinePDF
12
-
13
-
14
- # @!visibility private
15
- # @private
16
- #:nodoc: all
17
-
18
- protected
19
-
20
- # This is the Parser class.
21
- #
22
- # It takes PDF data and parses it.
23
- #
24
- # The information is then used to initialize a PDF object.
25
- #
26
- # This is an internal class. you don't need it.
27
- class PDFParser
28
-
29
- # @!visibility private
30
-
31
-
32
- # the array containing all the parsed data (PDF Objects)
33
- attr_reader :parsed
34
- # a Float representing the PDF version of the data parsed (if exists).
35
- attr_reader :version
36
- # the info and root objects, as found (if found) in the PDF file.
37
- #
38
- # they are mainly to used to know if the file is (was) encrypted and to get more details.
39
- attr_reader :info_object, :root_object, :names_object, :forms_object
40
-
41
- # when creating a parser, it is important to set the data (String) we wish to parse.
42
- #
43
- # <b>the data is required and it is not possible to set the data at a later stage</b>
44
- #
45
- # string:: the data to be parsed, as a String object.
46
- def initialize (string)
47
- raise TypeError, "couldn't parse data, expecting type String" unless string.is_a? String
48
- @string_to_parse = string.force_encoding(Encoding::ASCII_8BIT)
49
- @literal_strings = []
50
- @hex_strings = []
51
- @streams = []
52
- @parsed = []
53
- @references = []
54
- @root_object = {}
55
- @info_object = {}
56
- @names_object = {}
57
- @forms_object = {}
58
- @strings_dictionary = {} # all strings are one string
59
- @version = nil
60
- @scanner = nil
61
- end
62
-
63
- # parse the data in the new parser (the data already set through the initialize / new method)
64
- def parse
65
- return [] if @string_to_parse.empty?
66
- return @parsed unless @parsed.empty?
67
- @scanner = StringScanner.new @string_to_parse
68
- @scanner.pos = 0
69
- if @scanner.scan /\%PDF\-[\d\-\.]+/
70
- @version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
71
- loop do
72
- break unless @scanner.scan(/[^\d\r\n]+/)
73
- break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/)
74
- break if @scanner.eos?
75
- @scanner.pos += 1
76
- end
77
- end
78
- @parsed = _parse_
79
- # puts @parsed
80
-
81
- raise "Unknown PDF parsing error - maleformed PDF file?" unless (@parsed.select {|i| !i.is_a?(Hash)}).empty?
82
-
83
- if @root_object == {}
84
- xref_streams = @parsed.select {|obj| obj.is_a?(Hash) && obj[:Type] == :XRef}
85
- xref_streams.each do |xref_dictionary|
86
- @root_object.merge! xref_dictionary
87
- end
88
- end
89
- raise "root is unknown - cannot determine if file is Encrypted" if @root_object == {}
90
-
91
- if @root_object[:Encrypt]
92
- change_references_to_actual_values @root_object
93
- warn "PDF is Encrypted! Attempting to decrypt - not yet fully supported."
94
- decryptor = PDFDecrypt.new @parsed, @root_object
95
- decryptor.decrypt
96
- #do we really need to apply to @parsed? No, there is no need.
97
- end
98
-
99
- ## search for objects streams
100
- object_streams = @parsed.select {|obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm}
101
- unless object_streams.empty?
102
- warn "PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects."
103
-
104
- object_streams.each do |o|
105
- ## un-encode (using the correct filter) the object streams
106
- PDFFilter.inflate_object o
107
- ## extract objects from stream to top level arry @parsed
108
- @scanner = StringScanner.new o[:raw_stream_content]
109
- stream_data = _parse_
110
- id_array = []
111
- while stream_data[0].is_a? Fixnum
112
- id_array << stream_data.shift
113
- stream_data.shift
114
- end
115
- while id_array[0] && stream_data[0]
116
- stream_data[0] = {indirect_without_dictionary: stream_data[0]} unless stream_data[0].is_a?(Hash)
117
- stream_data[0][:indirect_reference_id] = id_array.shift
118
- stream_data[0][:indirect_generation_number] = 0
119
- @parsed << stream_data.shift
120
- end
121
- end
122
- end
123
-
124
- # Strings were unified, we can let them go..
125
- @strings_dictionary.clear
126
-
127
- # serialize_objects_and_references.catalog_pages
128
-
129
- # Benchmark.bm do |bm|
130
- # bm.report("serialize") {1000.times {serialize_objects_and_references} }
131
- # bm.report("serialize - old") {1000.times {old_serialize_objects_and_references} }
132
- # bm.report("catalog") {1000.times {catalog_pages} }
133
- # end
134
-
135
- serialize_objects_and_references.catalog_pages
136
-
137
- @info_object = @root_object[:Info] ? (@root_object[:Info][:referenced_object] || @root_object[:Info]) : false
138
- if @info_object && @info_object.is_a?(Hash)
139
- @parsed.delete @info_object
140
- CombinePDF::PDF::PRIVATE_HASH_KEYS.each {|key| @info_object.delete key}
141
- @info_object.each {|k, v| @info_object = v[:referenced_object] if v.is_a?(Hash) && v[:referenced_object]}
142
- else
143
- @info_object = {}
144
- end
145
- # # # ## remove object streams - if they exist
146
- # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :ObjStm}
147
- # # # ## remove XREF dictionaries - if they exist
148
- # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :XRef}
149
-
150
- @parsed
151
- end
152
-
153
- # the actual recoursive parsing is done here.
154
- #
155
- # this is an internal function, but it was left exposed for posible future features.
156
- def _parse_
157
- out = []
158
- str = ''
159
- fresh = true
160
- while @scanner.rest? do
161
- # last ||= 0
162
- # out.last.tap do |o|
163
- # if o.is_a?(Hash)
164
- # puts "[#{@scanner.pos}] Parser has a Dictionary (#{o.class.name}) with data:"
165
- # o.each do |k, v|
166
- # puts " #{k}: is #{v.class.name} with data: #{v.to_s[0..4]}#{"..." if v.to_s.length > 5}"
167
- # end
168
- # else
169
- # puts "[#{@scanner.pos}] Parser has #{o.class.name} with data: #{o.to_s[0..4]}#{"..." if o.to_s.length > 5}"
170
- # end
171
- # puts "next is #{@scanner.peek 8}"
172
- # end unless (last == out.count) || (-1 == (last = out.count))
173
- case
174
- ##########################################
175
- ## parse an Array
176
- ##########################################
177
- when @scanner.scan(/\[/)
178
- out << _parse_
179
- ##########################################
180
- ## parse a Dictionary
181
- ##########################################
182
- when @scanner.scan(/<</)
183
- data = _parse_
184
- obj = {}
185
- obj[data.shift] = data.shift while data[0]
186
- out << obj
187
- ##########################################
188
- ## return content of array or dictionary
189
- ##########################################
190
- when @scanner.scan(/\]/), @scanner.scan(/>>/)
191
- return out
192
- ##########################################
193
- ## parse a Stream
194
- ##########################################
195
- when @scanner.scan(/stream[\r\n]/)
196
- @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
197
- # the following was dicarded because some PDF files didn't have an EOL marker as required
198
- # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
199
- # instead, a non-strict RegExp is used:
200
- str = @scanner.scan_until(/endstream/)
201
- # raise error if the stream doesn't end.
202
- raise "Parsing Error: PDF file error - a stream object wasn't properly colsed using 'endstream'!" unless str
203
- # need to remove end of stream
204
- if out.last.is_a? Hash
205
- # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
206
- out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, "").force_encoding(Encoding::ASCII_8BIT)
207
- else
208
- warn "Stream not attached to dictionary!"
209
- out << str.sub(/(\r\n|\n|\r)?endstream\z/, "").force_encoding(Encoding::ASCII_8BIT)
210
- end
211
- ##########################################
212
- ## parse an Object after finished
213
- ##########################################
214
- when str = @scanner.scan(/endobj/)
215
- #what to do when this is an object?
216
- if out.last.is_a? Hash
217
- out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
218
- else
219
- out << {indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
220
- end
221
- fresh = true
222
- # puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last} :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Fixnum)
223
- ##########################################
224
- ## parse a Hex String
225
- ##########################################
226
- when str = @scanner.scan(/<[0-9a-fA-F]+>/)
227
- # warn "Found a hex string"
228
- out << unify_string([str[1..-2]].pack('H*').force_encoding(Encoding::ASCII_8BIT))
229
- ##########################################
230
- ## parse a Literal String
231
- ##########################################
232
- when @scanner.scan(/\(/)
233
- # warn "Found a literal string"
234
- str = ''.force_encoding(Encoding::ASCII_8BIT)
235
- count = 1
236
- while count > 0 && @scanner.rest? do
237
- scn = @scanner.scan_until(/[\(\)]/)
238
- unless scn
239
- warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
240
- count = 0 # error
241
- next
242
- end
243
-
244
- str += scn.to_s
245
- seperator_count = 0
246
- seperator_count += 1 while str[-2-seperator_count] == "\\"
247
-
248
- case str[-1]
249
- when '('
250
- ## The following solution might fail when (string ends with this sign: \\)
251
- count += 1 unless seperator_count.odd?
252
- when ')'
253
- count -= 1 unless seperator_count.odd?
254
- else
255
- warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
256
- count = 0 # error
257
- end
258
- end
259
- # The PDF formatted string is: str[0..-2]
260
- # now starting to convert to regular string
261
- str_bytes = str.force_encoding(Encoding::ASCII_8BIT)[0..-2].bytes.to_a
262
- str = []
263
- until str_bytes.empty?
264
- case str_bytes[0]
265
- when 13 # eol - \r
266
- # An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS
267
- # shall be treated as a byte value of (0Ah),
268
- # irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
269
- str_bytes.shift
270
- str_bytes.shift if str_bytes[0] == 10
271
- str << 10
272
- when 10 # eol - \n
273
- # An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS
274
- # shall be treated as a byte value of (0Ah),
275
- # irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
276
- str_bytes.shift
277
- str_bytes.shift if str_bytes[0] == 13
278
- str << 10
279
- when 92 # "\\".ord == 92
280
- str_bytes.shift
281
- rep = str_bytes.shift
282
- case rep
283
- when 110 #n
284
- str << 10 #new line
285
- when 114 #r
286
- str << 13 # CR
287
- when 116 #t
288
- str << 9 #tab
289
- when 98 #b
290
- str << 8
291
- when 102 #f
292
- str << 255
293
- when 48..57 #octal notation for byte?
294
- rep = rep.chr
295
- rep += str_bytes.shift.chr if str_bytes[0].between?(48,57)
296
- rep += str_bytes.shift.chr if str_bytes[0].between?(48,57) && ((rep + str_bytes[0].chr).to_i <= 255)
297
- str << rep.to_i
298
- when 10 # new line, ignore
299
- str_bytes.shift if str_bytes[0] == 13
300
- true
301
- when 13 # new line (or double notation for new line), ignore
302
- str_bytes.shift if str_bytes[0] == 10
303
- true
304
- else
305
- str << rep
306
- end
307
- else
308
- str << str_bytes.shift
309
- end
310
- end
311
- out << unify_string(str.pack('C*').force_encoding(Encoding::ASCII_8BIT))
312
- ##########################################
313
- ## Parse a comment
314
- ##########################################
315
- when str = @scanner.scan(/\%/)
316
- #is a comment, skip until new line
317
- loop do
318
- # break unless @scanner.scan(/[^\d\r\n]+/)
319
- break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/) || @scanner.eos? # || @scanner.scan(/[^\d]+[\r\n]+/) ||
320
- @scanner.scan(/[^\d\r\n]+/) || @scanner.pos += 1
321
- end
322
- # puts "AFTER COMMENT: #{@scanner.peek 8}"
323
- ##########################################
324
- ## Parse a Name
325
- ##########################################
326
- # old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
327
- # I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
328
- # all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
329
- # all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
330
- when str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+/)
331
- out << ( str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) {|a| a[1..2].hex.chr } ).to_sym
332
- ##########################################
333
- ## Parse a Number
334
- ##########################################
335
- when str = @scanner.scan(/[\+\-\.\d]+/)
336
- str.match(/\./) ? (out << str.to_f) : (out << str.to_i)
337
- ##########################################
338
- ## Parse an Object Reference
339
- ##########################################
340
- when @scanner.scan(/R/)
341
- out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
342
- @references << out.last
343
- ##########################################
344
- ## Parse Bool - true and after false
345
- ##########################################
346
- when @scanner.scan(/true/)
347
- out << true
348
- when @scanner.scan(/false/)
349
- out << false
350
- ##########################################
351
- ## Parse NULL - null
352
- ##########################################
353
- when @scanner.scan(/null/)
354
- out << nil
355
- ##########################################
356
- ## XREF - check for encryption... anything else?
357
- ##########################################
358
- when @scanner.scan(/xref/)
359
- ##########
360
- ## get root object to check for encryption
361
- @scanner.scan_until(/(trailer)|(\%EOF)/)
362
- fresh = true
363
- if @scanner.matched[-1] == 'r'
364
- if @scanner.skip_until(/<</)
365
- data = _parse_
366
- @root_object ||= {}
367
- @root_object[data.shift] = data.shift while data[0]
368
- end
369
- ##########
370
- ## skip untill end of segment, maked by %%EOF
371
- @scanner.skip_until(/\%\%EOF/)
372
- ##########
373
- ## If this was the last valid segment, ignore any trailing garbage
374
- ## (issue #49 resolution)
375
- break unless @scanner.exist?(/\%\%EOF/)
376
-
377
- end
378
-
379
- when @scanner.scan(/[\s]+/)
380
- # Generally, do nothing
381
- nil
382
- when @scanner.scan(/obj[\s]*/)
383
- # Fix wkhtmltopdf PDF authoring issue - missing 'endobj' keywords
384
- unless fresh || (out[-4].nil? || out[-4].is_a?(Hash))
385
- keep = []
386
- keep << out.pop # .tap {|i| puts "#{i} is an ID"}
387
- keep << out.pop # .tap {|i| puts "#{i} is a REF"}
388
-
389
- if out.last.is_a? Hash
390
- out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
391
- else
392
- out << {indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
393
- end
394
- warn "'endobj' keyword was missing for Object ID: #{out.last[:indirect_reference_id]}, trying to auto-fix issue, but might fail."
395
-
396
- out << keep.pop
397
- out << keep.pop
398
- end
399
- fresh = false
400
- else
401
- # always advance
402
- # warn "Advnacing for unknown reason... #{@scanner.peek(4)}" unless @scanner.peek(1) =~ /[\s\n]/
403
- warn "Warning: parser advnacing for unknown reason. Potential data-loss."
404
- @scanner.pos = @scanner.pos + 1
405
- end
406
- end
407
- out
408
- end
409
-
410
- protected
411
-
412
-
413
-
414
- # resets cataloging and pages
415
- def catalog_pages(catalogs = nil, inheritance_hash = {})
416
- unless catalogs
417
-
418
- if root_object[:Root]
419
- catalogs = root_object[:Root][:referenced_object] || root_object[:Root]
420
- else
421
- catalogs = (@parsed.select {|obj| obj[:Type] == :Catalog}).last
422
- end
423
- @parsed.delete_if {|obj| obj[:Type] == :Catalog}
424
- @parsed << catalogs
425
-
426
- raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs
427
- end
428
- case
429
- when catalogs.is_a?(Array)
430
- catalogs.each {|c| catalog_pages(c, inheritance_hash ) unless c.nil?}
431
- when catalogs.is_a?(Hash)
432
- if catalogs[:is_reference_only]
433
- if catalogs[:referenced_object]
434
- catalog_pages(catalogs[:referenced_object], inheritance_hash)
435
- else
436
- warn "couldn't follow reference!!! #{catalogs} not found!"
437
- end
438
- else
439
- unless catalogs[:Type] == :Page
440
- raise "Optional Content PDF files aren't supported and their pages cannot be safely extracted." if catalogs[:AS] || catalogs[:OCProperties]
441
- inheritance_hash[:MediaBox] = catalogs[:MediaBox] if catalogs[:MediaBox]
442
- inheritance_hash[:CropBox] = catalogs[:CropBox] if catalogs[:CropBox]
443
- inheritance_hash[:Rotate] = catalogs[:Rotate] if catalogs[:Rotate]
444
- (inheritance_hash[:Resources] ||= {}).update( (catalogs[:Resources][:referenced_object] || catalogs[:Resources]), &self.class.method(:hash_update_proc_for_new) ) if catalogs[:Resources]
445
- (inheritance_hash[:ColorSpace] ||= {}).update( (catalogs[:ColorSpace][:referenced_object] || catalogs[:ColorSpace]), &self.class.method(:hash_update_proc_for_new) ) if catalogs[:ColorSpace]
446
-
447
- # inheritance_hash[:Order] = catalogs[:Order] if catalogs[:Order]
448
- # inheritance_hash[:OCProperties] = catalogs[:OCProperties] if catalogs[:OCProperties]
449
- # inheritance_hash[:AS] = catalogs[:AS] if catalogs[:AS]
450
- end
451
-
452
- case catalogs[:Type]
453
- when :Page
454
-
455
- catalogs[:MediaBox] ||= inheritance_hash[:MediaBox] if inheritance_hash[:MediaBox]
456
- catalogs[:CropBox] ||= inheritance_hash[:CropBox] if inheritance_hash[:CropBox]
457
- catalogs[:Rotate] ||= inheritance_hash[:Rotate] if inheritance_hash[:Rotate]
458
- (catalogs[:Resources] ||= {}).update( inheritance_hash[:Resources], &( self.class.method(:hash_update_proc_for_old) ) ) if inheritance_hash[:Resources]
459
- (catalogs[:ColorSpace] ||= {}).update( inheritance_hash[:ColorSpace], &( self.class.method(:hash_update_proc_for_old) ) ) if inheritance_hash[:ColorSpace]
460
- # catalogs[:Order] ||= inheritance_hash[:Order] if inheritance_hash[:Order]
461
- # catalogs[:AS] ||= inheritance_hash[:AS] if inheritance_hash[:AS]
462
- # catalogs[:OCProperties] ||= inheritance_hash[:OCProperties] if inheritance_hash[:OCProperties]
463
-
464
-
465
- # avoide references on MediaBox, CropBox and Rotate
466
- catalogs[:MediaBox] = catalogs[:MediaBox][:referenced_object][:indirect_without_dictionary] if catalogs[:MediaBox].is_a?(Hash) && catalogs[:MediaBox][:referenced_object].is_a?(Hash) && catalogs[:MediaBox][:referenced_object][:indirect_without_dictionary]
467
- catalogs[:CropBox] = catalogs[:CropBox][:referenced_object][:indirect_without_dictionary] if catalogs[:CropBox].is_a?(Hash) && catalogs[:CropBox][:referenced_object].is_a?(Hash) && catalogs[:CropBox][:referenced_object][:indirect_without_dictionary]
468
- catalogs[:Rotate] = catalogs[:Rotate][:referenced_object][:indirect_without_dictionary] if catalogs[:Rotate].is_a?(Hash) && catalogs[:Rotate][:referenced_object].is_a?(Hash) && catalogs[:Rotate][:referenced_object][:indirect_without_dictionary]
469
-
470
- catalogs.instance_eval {extend Page_Methods}
471
- when :Pages
472
- catalog_pages(catalogs[:Kids], inheritance_hash.dup ) unless catalogs[:Kids].nil?
473
- when :Catalog
474
- @forms_object.update( (catalogs[:AcroForm][:referenced_object] || catalogs[:AcroForm]), &self.class.method(:hash_update_proc_for_new) ) if catalogs[:AcroForm]
475
- @names_object.update( (catalogs[:Names][:referenced_object] || catalogs[:Names]), &self.class.method(:hash_update_proc_for_new) ) if catalogs[:Names]
476
- catalog_pages(catalogs[:Pages], inheritance_hash.dup ) unless catalogs[:Pages].nil?
477
- end
478
- end
479
- end
480
- self
481
- end
482
-
483
- # fails!
484
- def change_references_to_actual_values(hash_with_references = {})
485
- hash_with_references.each do |k,v|
486
- if v.is_a?(Hash) && v[:is_reference_only]
487
- hash_with_references[k] = get_refernced_object(v)
488
- hash_with_references[k] = hash_with_references[k][:indirect_without_dictionary] if hash_with_references[k].is_a?(Hash) && hash_with_references[k][:indirect_without_dictionary]
489
- warn "Couldn't connect all values from references - didn't find reference #{hash_with_references}!!!" if hash_with_references[k] == nil
490
- hash_with_references[k] = v unless hash_with_references[k]
491
- end
492
- end
493
- hash_with_references
494
- end
495
-
496
- def get_refernced_object(reference_hash = {})
497
- @parsed.each do |stored_object|
498
- return stored_object if ( stored_object.is_a?(Hash) &&
499
- reference_hash[:indirect_reference_id] == stored_object[:indirect_reference_id] &&
500
- reference_hash[:indirect_generation_number] == stored_object[:indirect_generation_number] )
501
- end
502
- warn "didn't find reference #{reference_hash}"
503
- nil
504
- end
505
-
506
- # @private
507
- # connects references and objects, according to their reference id's.
508
- #
509
- # should be moved to the parser's workflow.
510
- #
511
- def serialize_objects_and_references
512
- obj_dir = {}
513
- @parsed.each {|o| obj_dir[ [ o.delete(:indirect_reference_id), o.delete(:indirect_generation_number) ] ] = o }
514
- # @parsed.each {|o| obj_dir[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
515
- @references.each do |obj|
516
- obj[:referenced_object] = obj_dir[ [obj[:indirect_reference_id], obj[:indirect_generation_number] ] ]
517
- warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj.to_s}" unless obj[:referenced_object]
518
- obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
519
- end
520
- self
521
- end
522
-
523
- # All Strings are one String
524
- def unify_string str
525
- @strings_dictionary[str] ||= str
526
- end
527
-
528
- # @private
529
- # this method reviews a Hash and updates it by merging Hash data,
530
- # preffering the old over the new.
531
- def self.hash_update_proc_for_old key, old_data, new_data
532
- if old_data.is_a? Hash
533
- old_data.merge( new_data, &self.method(:hash_update_proc_for_old) )
534
- else
535
- old_data
536
- end
537
- end
538
- # @private
539
- # this method reviews a Hash an updates it by merging Hash data,
540
- # preffering the new over the old.
541
- def self.hash_update_proc_for_new key, old_data, new_data
542
- if old_data.is_a? Hash
543
- old_data.merge( new_data, &self.method(:hash_update_proc_for_new) )
544
- else
545
- new_data
546
- end
547
- end
548
-
549
- # # @private
550
- # # connects references and objects, according to their reference id's.
551
- # #
552
- # # should be moved to the parser's workflow.
553
- # #
554
- # def old_serialize_objects_and_references(object = nil)
555
- # objects_reference_hash = {}
556
- # # @parsed.each {|o| objects_reference_hash[ [ o.delete(:indirect_reference_id), o.delete(:indirect_generation_number) ] ] = o }
557
- # @parsed.each {|o| objects_reference_hash[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
558
- # each_object(@parsed) do |obj|
559
- # if obj[:is_reference_only]
560
- # obj[:referenced_object] = objects_reference_hash[ [obj[:indirect_reference_id], obj[:indirect_generation_number] ] ]
561
- # warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj.to_s}" unless obj[:referenced_object]
562
- # # obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
563
- # end
564
- # end
565
- # self
566
- # end
567
-
568
- # # run block of code on evey PDF object (PDF objects are class Hash)
569
- # def each_object(object, limit_references = true, already_visited = {}, &block)
570
- # unless limit_references
571
- # already_visited[object.object_id] = true
572
- # end
573
- # case
574
- # when object.is_a?(Array)
575
- # object.each {|obj| each_object(obj, limit_references, already_visited, &block)}
576
- # when object.is_a?(Hash)
577
- # yield(object)
578
- # unless limit_references && object[:is_reference_only]
579
- # object.each do |k,v|
580
- # each_object(v, limit_references, already_visited, &block) unless already_visited[v.object_id]
581
- # end
582
- # end
583
- # end
584
- # end
585
-
586
- end
9
+ # @!visibility private
10
+ # @private
11
+ #:nodoc: all
12
+
13
+ protected
14
+
15
+ # This is the Parser class.
16
+ #
17
+ # It takes PDF data and parses it.
18
+ #
19
+ # The information is then used to initialize a PDF object.
20
+ #
21
+ # This is an internal class. you don't need it.
22
+ class PDFParser
23
+ # @!visibility private
24
+
25
+ # the array containing all the parsed data (PDF Objects)
26
+ attr_reader :parsed
27
+ # a Float representing the PDF version of the data parsed (if exists).
28
+ attr_reader :version
29
+ # the info and root objects, as found (if found) in the PDF file.
30
+ #
31
+ # they are mainly to used to know if the file is (was) encrypted and to get more details.
32
+ attr_reader :info_object, :root_object, :names_object, :forms_object, :outlines_object
33
+
34
+ attr_reader :allow_optional_content
35
+ # when creating a parser, it is important to set the data (String) we wish to parse.
36
+ #
37
+ # <b>the data is required and it is not possible to set the data at a later stage</b>
38
+ #
39
+ # string:: the data to be parsed, as a String object.
40
+ def initialize(string, options = {})
41
+ raise TypeError, "couldn't parse data, expecting type String" unless string.is_a? String
42
+ @string_to_parse = string.force_encoding(Encoding::ASCII_8BIT)
43
+ @literal_strings = [].dup
44
+ @hex_strings = [].dup
45
+ @streams = [].dup
46
+ @parsed = [].dup
47
+ @references = [].dup
48
+ @root_object = {}.dup
49
+ @info_object = {}.dup
50
+ @names_object = {}.dup
51
+ @outlines_object = {}.dup
52
+ @forms_object = {}.dup
53
+ @strings_dictionary = {}.dup # all strings are one string
54
+ @version = nil
55
+ @scanner = nil
56
+ @allow_optional_content = options[:allow_optional_content]
57
+ end
58
+
59
+ # parse the data in the new parser (the data already set through the initialize / new method)
60
+ def parse
61
+ return [] if @string_to_parse.empty?
62
+ return @parsed unless @parsed.empty?
63
+ @scanner = StringScanner.new @string_to_parse
64
+ @scanner.pos = 0
65
+ if @scanner.scan /\%PDF\-[\d\-\.]+/
66
+ @version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
67
+ loop do
68
+ break unless @scanner.scan(/[^\d\r\n]+/)
69
+ break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/)
70
+ break if @scanner.eos?
71
+ @scanner.pos += 1
72
+ end
73
+ end
74
+ @parsed = _parse_
75
+ # puts @parsed
76
+
77
+ raise 'Unknown PDF parsing error - maleformed PDF file?' unless (@parsed.select { |i| !i.is_a?(Hash) }).empty?
78
+
79
+ if @root_object == {}
80
+ xref_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :XRef }
81
+ xref_streams.each do |xref_dictionary|
82
+ @root_object.merge! xref_dictionary
83
+ end
84
+ end
85
+ raise 'root is unknown - cannot determine if file is Encrypted' if @root_object == {}
86
+
87
+ if @root_object[:Encrypt]
88
+ # change_references_to_actual_values @root_object
89
+ warn 'PDF is Encrypted! Attempting to decrypt - not yet fully supported.'
90
+ decryptor = PDFDecrypt.new @parsed, @root_object
91
+ decryptor.decrypt
92
+ # do we really need to apply to @parsed? No, there is no need.
93
+ end
94
+
95
+ ## search for objects streams
96
+ object_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm }
97
+ unless object_streams.empty?
98
+ warn 'PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects.'
99
+
100
+ object_streams.each do |o|
101
+ ## un-encode (using the correct filter) the object streams
102
+ PDFFilter.inflate_object o
103
+ ## extract objects from stream to top level arry @parsed
104
+ @scanner = StringScanner.new o[:raw_stream_content]
105
+ stream_data = _parse_
106
+ id_array = []
107
+ while stream_data[0].is_a? Fixnum
108
+ id_array << stream_data.shift
109
+ stream_data.shift
110
+ end
111
+ while id_array[0] && stream_data[0]
112
+ stream_data[0] = { indirect_without_dictionary: stream_data[0] } unless stream_data[0].is_a?(Hash)
113
+ stream_data[0][:indirect_reference_id] = id_array.shift
114
+ stream_data[0][:indirect_generation_number] = 0
115
+ @parsed << stream_data.shift
116
+ end
117
+ end
118
+ end
119
+
120
+ # Strings were unified, we can let them go..
121
+ @strings_dictionary.clear
122
+
123
+ # serialize_objects_and_references.catalog_pages
124
+
125
+ # Benchmark.bm do |bm|
126
+ # bm.report("serialize") {1000.times {serialize_objects_and_references} }
127
+ # bm.report("serialize - old") {1000.times {old_serialize_objects_and_references} }
128
+ # bm.report("catalog") {1000.times {catalog_pages} }
129
+ # end
130
+
131
+ serialize_objects_and_references
132
+
133
+ catalog_pages
134
+
135
+ # collect any missing objects from the forms_data
136
+ unless @forms_object.nil? || @forms_object.empty?
137
+ @forms_object[:related_objects] = (@parsed.select { |o| o[:FT] }).map! { |o| { is_reference_only: true, referenced_object: o } }
138
+ @forms_object[:related_objects].delete @forms_object
139
+ end
140
+
141
+ @info_object = @root_object[:Info] ? (@root_object[:Info][:referenced_object] || @root_object[:Info]) : false
142
+ if @info_object && @info_object.is_a?(Hash)
143
+ @parsed.delete @info_object
144
+ CombinePDF::PDF::PRIVATE_HASH_KEYS.each { |key| @info_object.delete key }
145
+ @info_object.each { |_k, v| @info_object = v[:referenced_object] if v.is_a?(Hash) && v[:referenced_object] }
146
+ else
147
+ @info_object = {}
148
+ end
149
+ # # # ## remove object streams - if they exist
150
+ # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :ObjStm}
151
+ # # # ## remove XREF dictionaries - if they exist
152
+ # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :XRef}
153
+
154
+ @parsed
155
+ end
156
+
157
+ # the actual recoursive parsing is done here.
158
+ #
159
+ # this is an internal function, but it was left exposed for posible future features.
160
# Tokenizes the data held by @scanner and returns the parsed values as an Array.
#
# The method calls itself for every nested Array (`[`) or Dictionary (`<<`);
# a nested call returns as soon as it meets the matching `]` or `>>`.
# The top-level call runs until the scanner is exhausted (or until the last
# `%%EOF` marker following a trailer was consumed).
#
# Produced values:
# * Arrays                -> Array
# * Dictionaries          -> Hash (a following stream is stored under :raw_stream_content)
# * Names                 -> Symbol
# * Numbers               -> Integer / Float
# * Strings (literal/hex) -> binary String (passed through #unify_string)
# * `true` / `false` / `null` -> true / false / nil
# * `id gen R`            -> { is_reference_only: true, indirect_generation_number:, indirect_reference_id: }
# * `id gen obj … endobj` -> the object's Hash merged with its id and generation,
#                            or a Hash holding the value under :indirect_without_dictionary
#
# Side effect: entries of every trailer dictionary met after an `xref`
# keyword are copied into @root_object.
#
# Raises a RuntimeError when a stream is not closed with `endstream`.
def _parse_
  out = []
  # +fresh+ is true while no `obj` keyword was seen since the last completed
  # object / xref section; it drives the missing-`endobj` auto-fix below.
  fresh = true
  while @scanner.rest?
    ##########################################
    ## parse an Array
    ##########################################
    if @scanner.scan(/\[/)
      out << _parse_
    ##########################################
    ## parse a Dictionary
    ##########################################
    elsif @scanner.scan(/<</)
      data = _parse_
      obj = {}
      # the nested call returned a flat [key, value, key, value, ...] list
      obj[data.shift] = data.shift while data[0]
      out << obj
    ##########################################
    ## return content of array or dictionary
    ##########################################
    elsif @scanner.scan(/\]/) || @scanner.scan(/>>/)
      return out
    ##########################################
    ## parse a Stream
    ##########################################
    elsif @scanner.scan(/stream[\r\n]/)
      # the keyword may be followed by "\r\n": skip the "\n" left behind after a "\r"
      @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
      # a strict /(\r\n|\r|\n)endstream/ was discarded because some PDF files
      # don't have the required EOL marker; a non-strict RegExp is used instead.
      str = @scanner.scan_until(/endstream/)
      # raise error if the stream doesn't end.
      raise "Parsing Error: PDF file error - a stream object wasn't properly colsed using 'endstream'!" unless str
      # remove the `endstream` keyword and (at most) one EOL marker preceding it
      if out.last.is_a? Hash
        out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
      else
        warn 'Stream not attached to dictionary!'
        out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
      end
    ##########################################
    ## parse an Object after finished
    ##########################################
    elsif @scanner.scan(/endobj/)
      # the stack holds: id, generation, value. The pops run left to right.
      if out.last.is_a? Hash
        out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
      else
        out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
      end
      fresh = true
    ##########################################
    ## parse a Hex String
    ##########################################
    # White space inside a hex string is ignored; an odd number of digits is
    # padded with a trailing zero nibble by `pack('H*')`.
    elsif (str = @scanner.scan(/<[0-9a-fA-F\s]*>/))
      out << unify_string([str[1..-2].gsub(/\s+/, '')].pack('H*').force_encoding(Encoding::ASCII_8BIT))
    ##########################################
    ## parse a Literal String
    ##########################################
    elsif @scanner.scan(/\(/)
      str = String.new.force_encoding(Encoding::ASCII_8BIT)
      count = 1 # nesting depth of unescaped parentheses
      while count > 0 && @scanner.rest?
        scn = @scanner.scan_until(/[\(\)]/)
        unless scn
          warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
          count = 0 # error
          next
        end

        str << scn
        # count the backslashes right before the parenthesis: an odd count escapes it
        seperator_count = 0
        seperator_count += 1 while str[-2 - seperator_count] == '\\'

        case str[-1]
        when '('
          count += 1 unless seperator_count.odd?
        when ')'
          count -= 1 unless seperator_count.odd?
        else
          warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
          count = 0 # error
        end
      end
      # The PDF formatted string is: str[0..-2] (the closing parenthesis is dropped)
      # now starting to convert to regular string
      str_bytes = str.force_encoding(Encoding::ASCII_8BIT)[0..-2].bytes.to_a
      str = []
      until str_bytes.empty?
        case str_bytes[0]
        when 13 # eol - \r
          # An unescaped end-of-line marker (CR, LF or both) is read as a single LF (0Ah).
          str_bytes.shift
          str_bytes.shift if str_bytes[0] == 10
          str << 10
        when 10 # eol - \n
          str_bytes.shift
          str_bytes.shift if str_bytes[0] == 13
          str << 10
        when 92 # "\\".ord == 92
          str_bytes.shift
          rep = str_bytes.shift
          case rep
          when nil
            # dangling backslash at the very end of the data - nothing to escape
            nil
          when 110 # n
            str << 10 # new line
          when 114 # r
            str << 13 # CR
          when 116 # t
            str << 9 # tab
          when 98 # b
            str << 8
          when 102 # f, form-feed
            str << 12
          when 48..55 # octal notation: one to three octal digits (0-7)
            code = rep - 48
            2.times do
              digit = str_bytes[0]
              break unless digit && digit.between?(48, 55)
              code = (code << 3) + (str_bytes.shift - 48)
            end
            # high-order overflow is ignored, so the value always fits a byte
            str << (code & 0xFF)
          when 10 # escaped new line, ignore
            str_bytes.shift if str_bytes[0] == 13
          when 13 # escaped new line (or double notation for new line), ignore
            str_bytes.shift if str_bytes[0] == 10
          else
            # any other escaped character stands for itself
            str << rep
          end
        else
          str << str_bytes.shift
        end
      end
      out << unify_string(str.pack('C*').force_encoding(Encoding::ASCII_8BIT))
    ##########################################
    ## Parse a comment
    ##########################################
    elsif @scanner.scan(/\%/)
      # is a comment, skip until new line (or until an object header glued to the comment)
      loop do
        break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/) || @scanner.eos?
        @scanner.scan(/[^\d\r\n]+/) || @scanner.pos += 1
      end
    ##########################################
    ## Parse a Name
    ##########################################
    # a name is a `/` followed by any characters that aren't white space or delimiters
    elsif (str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/))
      # `#xx` sequences are hex-encoded characters
      out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
    ##########################################
    ## Parse a Number
    ##########################################
    elsif (str = @scanner.scan(/[\+\-\.\d]+/))
      str =~ /\./ ? (out << str.to_f) : (out << str.to_i)
    ##########################################
    ## Parse an Object Reference
    ##########################################
    elsif @scanner.scan(/R/)
      # the two numbers preceding `R` are the id and the generation
      out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
    ##########################################
    ## Parse Bool - true and after false
    ##########################################
    elsif @scanner.scan(/true/)
      out << true
    elsif @scanner.scan(/false/)
      out << false
    ##########################################
    ## Parse NULL - null
    ##########################################
    elsif @scanner.scan(/null/)
      out << nil
    ##########################################
    ## XREF - check for encryption... anything else?
    ##########################################
    elsif @scanner.scan(/xref/)
      fresh = true
      ##########
      ## skip the cross reference table itself
      unless @scanner.scan_until(/(trailer)|(\%EOF)/)
        # an xref section followed by neither a trailer nor an EOF marker:
        # the remaining data is an unterminated table, nothing left to parse.
        @scanner.terminate
        break
      end
      if @scanner.matched[-1] == 'r'
        ##########
        ## get root object to check for encryption
        if @scanner.skip_until(/<</)
          data = _parse_
          @root_object ||= {}
          @root_object[data.shift] = data.shift while data[0]
        end
        ##########
        ## skip until end of segment, marked by %%EOF
        @scanner.skip_until(/\%\%EOF/)
        ##########
        ## If this was the last valid segment, ignore any trailing garbage
        ## (issue #49 resolution)
        break unless @scanner.exist?(/\%\%EOF/)
      end
    elsif @scanner.scan(/[\s]+/)
      # Generally, do nothing
      nil
    elsif @scanner.scan(/obj[\s]*/)
      # Fix wkhtmltopdf PDF authoring issue - missing 'endobj' keywords:
      # a new `obj` keyword while the previous object is still open closes the previous one.
      unless fresh || (out[-4].nil? || out[-4].is_a?(Hash))
        keep = []
        keep << out.pop # generation number of the new object
        keep << out.pop # id of the new object

        if out.last.is_a? Hash
          out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
        else
          out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
        end
        warn "'endobj' keyword was missing for Object ID: #{out.last[:indirect_reference_id]}, trying to auto-fix issue, but might fail."

        out << keep.pop
        out << keep.pop
      end
      fresh = false
    else
      # always advance, so unknown data can never stall the loop
      warn 'Warning: parser advancing for unknown reason. Potential data-loss.'
      @scanner.pos = @scanner.pos + 1
    end
  end
  out
end
409
+
410
+ protected
411
+
412
+ # resets cataloging and pages
413
# Walks the page tree, pushes inheritable attributes down to every Page and
# collects the catalog's AcroForm, Names and Outlines dictionaries.
#
# catalogs:: the node(s) to process - a Hash, an Array of nodes or nil.
#            When nil, the catalog is taken from the trailer's :Root entry or,
#            failing that, from the last :Catalog object in @parsed; @parsed is
#            then left with that single catalog, placed last.
# inheritance_hash:: attributes collected from the ancestors of +catalogs+
#                    (:MediaBox, :CropBox, :Rotate, :Resources, :ColorSpace).
#
# Every Page Hash met is extended with Page_Methods.
#
# Returns self.
#
# Raises a RuntimeError when no catalog exists or when the file uses Optional
# Content (:AS / :OCProperties) and @allow_optional_content isn't set.
def catalog_pages(catalogs = nil, inheritance_hash = {})
  unless catalogs
    if root_object[:Root]
      catalogs = root_object[:Root][:referenced_object] || root_object[:Root]
    else
      catalogs = (@parsed.select { |obj| obj[:Type] == :Catalog }).last
    end
    # fail before touching @parsed, so an error never leaves a nil entry behind
    raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs

    @parsed.delete_if { |obj| obj[:Type] == :Catalog }
    @parsed << catalogs
  end

  if catalogs.is_a?(Array)
    catalogs.each { |c| catalog_pages(c, inheritance_hash) unless c.nil? }
  elsif catalogs.is_a?(Hash)
    if catalogs[:is_reference_only]
      if catalogs[:referenced_object]
        catalog_pages(catalogs[:referenced_object], inheritance_hash)
      else
        warn "couldn't follow reference!!! #{catalogs} not found!"
      end
    else
      unless catalogs[:Type] == :Page
        raise "Optional Content PDF files aren't supported and their pages cannot be safely extracted." if (catalogs[:AS] || catalogs[:OCProperties]) && !@allow_optional_content

        # plain values: the node's own value replaces the inherited one
        [:MediaBox, :CropBox, :Rotate].each do |key|
          inheritance_hash[key] = catalogs[key] if catalogs[key]
        end
        # dictionaries: merged into the inherited dictionary, existing entries win.
        # NOTE(review): callers pass a shallow `dup` of inheritance_hash, so these
        # nested dictionaries are shared (and updated in place) between sibling branches.
        [:Resources, :ColorSpace].each do |key|
          next unless catalogs[key]
          inheritance_hash[key] ||= { referenced_object: {}, is_reference_only: true }
          (inheritance_hash[key][:referenced_object] || inheritance_hash[key]).update((catalogs[key][:referenced_object] || catalogs[key]), &self.class.method(:hash_update_proc_for_old))
        end
      end

      case catalogs[:Type]
      when :Page
        # the page's own values are kept, inherited values only fill the gaps
        [:MediaBox, :CropBox, :Rotate].each do |key|
          catalogs[key] ||= inheritance_hash[key] if inheritance_hash[key]
        end
        [:Resources, :ColorSpace].each do |key|
          next unless inheritance_hash[key]
          catalogs[key] ||= { referenced_object: {}, is_reference_only: true }
          (catalogs[key][:referenced_object] || catalogs[key]).update((inheritance_hash[key][:referenced_object] || inheritance_hash[key]), &self.class.method(:hash_update_proc_for_old))
        end

        # avoid references on MediaBox, CropBox and Rotate: store the actual value
        [:MediaBox, :CropBox, :Rotate].each do |key|
          value = catalogs[key]
          next unless value.is_a?(Hash) && value[:referenced_object].is_a?(Hash) && value[:referenced_object][:indirect_without_dictionary]
          catalogs[key] = value[:referenced_object][:indirect_without_dictionary]
        end

        catalogs.extend Page_Methods
      when :Pages
        catalog_pages(catalogs[:Kids], inheritance_hash.dup) unless catalogs[:Kids].nil?
      when :Catalog
        @forms_object.update((catalogs[:AcroForm][:referenced_object] || catalogs[:AcroForm]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:AcroForm]
        @names_object.update((catalogs[:Names][:referenced_object] || catalogs[:Names]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Names]
        @outlines_object.update((catalogs[:Outlines][:referenced_object] || catalogs[:Outlines]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Outlines]
        catalog_pages(catalogs[:Pages], inheritance_hash.dup) unless catalogs[:Pages].nil?
      end
    end
  end
  self
end
494
+
495
# Looks up the parsed object a reference points to.
#
# reference_hash:: a Hash holding :indirect_reference_id and :indirect_generation_number.
#
# Returns the first Hash in @parsed carrying the same id and generation.
# When nothing matches, a warning is printed and nil is returned.
def get_refernced_object(reference_hash = {})
  match = @parsed.find do |candidate|
    candidate.is_a?(Hash) &&
      reference_hash[:indirect_reference_id] == candidate[:indirect_reference_id] &&
      reference_hash[:indirect_generation_number] == candidate[:indirect_generation_number]
  end
  return match if match

  warn "didn't find reference #{reference_hash}"
  nil
end
507
+
508
+ # # @private
509
+ # # connects references and objects, according to their reference id's.
510
+ # #
511
+ # # should be moved to the parser's workflow.
512
+ # #
513
+ # def serialize_objects_and_references_old
514
+ # obj_dir = {}
515
+ # # create a dictionary for referenced objects (no value resolution at this point)
516
+ # @parsed.each { |o| obj_dir[[o.delete(:indirect_reference_id), o.delete(:indirect_generation_number)]] = o }
517
+ # # @parsed.each {|o| obj_dir[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
518
+ # @references.each do |obj|
519
+ # obj[:referenced_object] = obj_dir[[obj[:indirect_reference_id], obj[:indirect_generation_number]]]
520
+ # warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj}" unless obj[:referenced_object]
521
+ # obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
522
+ # end
523
+ # obj_dir.clear
524
+ # @references.clear
525
+ # self
526
+ # end
527
+
528
+ # @private
529
+ # connects references and objects, according to their reference id's.
530
+ #
531
+ # Also replaces :indirect_without_dictionary objects with their actual values. Strings, Hashes and Arrays still share memory space.
532
+ #
533
+ # should be moved to the parser's workflow.
534
+ #
535
# Connects every reference found in @parsed and @root_object to the object it
# points at.
#
# A reference Hash (:is_reference_only) gets its target stored under
# :referenced_object and loses its :indirect_reference_id and
# :indirect_generation_number keys. When the target only wraps a plain value
# (:indirect_without_dictionary), the reference is replaced by that value.
# A reference without an id is replaced by nil.
#
# The walk is iterative (an explicit stack), so deeply nested data cannot
# exhaust the call stack. Reference Hashes are never descended into.
def serialize_objects_and_references
  # directory of the parsed objects, keyed by [id, generation]
  directory = {}
  @parsed.each { |item| directory[[item[:indirect_reference_id], item[:indirect_generation_number]]] = item }

  # resolves a single reference Hash and returns the value that should replace it
  link = lambda do |ref|
    if ref[:indirect_reference_id].nil?
      nil
    else
      ref[:referenced_object] = directory[[ref[:indirect_reference_id], ref[:indirect_generation_number]]]
      warn "Couldn't connect reference for #{ref}" if ref[:referenced_object].nil?
      ref.delete :indirect_reference_id
      ref.delete :indirect_generation_number
      (ref[:referenced_object] && ref[:referenced_object][:indirect_without_dictionary]) || ref
    end
  end

  pending = [@parsed, @root_object]
  until pending.empty?
    container = pending.pop
    case container
    when Hash
      container.keys.each do |key|
        value = container[key]
        case value
        when Hash
          if value[:is_reference_only]
            container[key] = link.call(value)
          else
            pending << value
          end
        when Array
          pending << value
        end
      end
    when Array
      container.map! do |value|
        case value
        when Hash
          if value[:is_reference_only]
            link.call(value)
          else
            pending << value
            value
          end
        when Array
          pending << value
          value
        else
          value
        end
      end
    end
  end
end
589
+
590
+ # def serialize_objects_and_references
591
+ # rec_resolve = proc do |level|
592
+ # if level.is_a?(Hash)
593
+ # if level[:is_reference_only]
594
+ # level[:referenced_object] = get_refernced_object(level)
595
+ # level = (level[:referenced_object] && level[:referenced_object][:indirect_without_dictionary]) || level
596
+ # level.delete :indirect_reference_id
597
+ # level.delete :indirect_generation_number
598
+ # else
599
+ # level.keys.each do |k|
600
+ # level[k] = rec_resolve.call(level[k]) unless level[k].is_a?(Hash) && level[k][:indirect_reference_id] && level[k][:is_reference_only].nil?
601
+ # end
602
+ # end
603
+ # elsif level.is_a?(Array)
604
+ # level.map! { |o| rec_resolve.call(o) }
605
+ # end
606
+ # level
607
+ # end
608
+ # rec_resolve.call(@root_object)
609
+ # rec_resolve.call(@parsed)
610
+ # self
611
+ # end
612
+
613
+ # All Strings are one String
614
# Returns the String object already stored in @strings_dictionary for the
# content of +str+. On the first encounter +str+ itself is stored and returned,
# so equal strings end up sharing a single object.
def unify_string(str)
  known = @strings_dictionary[str]
  return known if known

  @strings_dictionary[str] = str
end
617
+
618
+ # @private
619
+ # this method reviews a Hash and updates it by merging Hash data,
620
+ # preferring the old over the new.
621
# Conflict resolver meant to be passed as the block of Hash#merge / Hash#update.
#
# _key:: the conflicting key (unused).
# old_data:: the value already stored.
# new_data:: the incoming value.
#
# Returns +old_data+, except when both values are Hashes: these are merged
# recursively with this same rule (a new Hash is returned).
# A Hash is never merged with a non-Hash value - the old value is simply kept.
def self.hash_update_proc_for_old(_key, old_data, new_data)
  return old_data unless old_data.is_a?(Hash) && new_data.is_a?(Hash)

  old_data.merge(new_data, &method(:hash_update_proc_for_old))
end
628
+
629
+ # @private
630
+ # this method reviews a Hash and updates it by merging Hash data,
631
+ # preferring the new over the old.
632
# Conflict resolver meant to be passed as the block of Hash#merge / Hash#update.
#
# _key:: the conflicting key (unused).
# old_data:: the value already stored.
# new_data:: the incoming value.
#
# Returns +new_data+, except when both values are Hashes: these are merged
# recursively with this same rule (a new Hash is returned).
# A Hash is never merged with a non-Hash value - the new value simply replaces it.
def self.hash_update_proc_for_new(_key, old_data, new_data)
  return new_data unless old_data.is_a?(Hash) && new_data.is_a?(Hash)

  old_data.merge(new_data, &method(:hash_update_proc_for_new))
end
639
+
640
+ # # run block of code on every PDF object (PDF objects are class Hash)
641
+ # def each_object(object, limit_references = true, already_visited = {}, &block)
642
+ # unless limit_references
643
+ # already_visited[object.object_id] = true
644
+ # end
645
+ # case
646
+ # when object.is_a?(Array)
647
+ # object.each {|obj| each_object(obj, limit_references, already_visited, &block)}
648
+ # when object.is_a?(Hash)
649
+ # yield(object)
650
+ # unless limit_references && object[:is_reference_only]
651
+ # object.each do |k,v|
652
+ # each_object(v, limit_references, already_visited, &block) unless already_visited[v.object_id]
653
+ # end
654
+ # end
655
+ # end
656
+ # end
657
+ end
587
658
  end