combine_pdf 0.2.21 → 0.2.27

Sign up to get free protection for your applications and to get access to all the features.
@@ -5,583 +5,654 @@
5
5
  ## is subject to the same license.
6
6
  ########################################################
7
7
 
8
-
9
-
10
-
11
8
  module CombinePDF
12
-
13
-
14
- # @!visibility private
15
- # @private
16
- #:nodoc: all
17
-
18
- protected
19
-
20
- # This is the Parser class.
21
- #
22
- # It takes PDF data and parses it.
23
- #
24
- # The information is then used to initialize a PDF object.
25
- #
26
- # This is an internal class. you don't need it.
27
- class PDFParser
28
-
29
- # @!visibility private
30
-
31
-
32
- # the array containing all the parsed data (PDF Objects)
33
- attr_reader :parsed
34
- # a Float representing the PDF version of the data parsed (if exists).
35
- attr_reader :version
36
- # the info and root objects, as found (if found) in the PDF file.
37
- #
38
- # they are mainly used to know if the file is (was) encrypted and to get more details.
39
- attr_reader :info_object, :root_object, :names_object, :forms_object
40
-
41
- # when creating a parser, it is important to set the data (String) we wish to parse.
42
- #
43
- # <b>the data is required and it is not possible to set the data at a later stage</b>
44
- #
45
- # string:: the data to be parsed, as a String object.
46
# Creates a parser for the given PDF data. Nothing is parsed until #parse is called.
#
# The data is re-tagged as binary (ASCII-8BIT). An unfrozen String is
# re-tagged in place (no copy is made); a frozen String is copied first,
# because String#force_encoding mutates its receiver and would otherwise
# raise a FrozenError.
#
# @param string [String] the data to be parsed.
# @raise [TypeError] if +string+ is not a String.
def initialize(string)
  raise TypeError, "couldn't parse data, expecting type String" unless string.is_a? String
  string = string.dup if string.frozen?
  @string_to_parse = string.force_encoding(Encoding::ASCII_8BIT)
  @literal_strings = []
  @hex_strings = []
  @streams = []
  @parsed = []
  @references = []
  @root_object = {}
  @info_object = {}
  @names_object = {}
  @forms_object = {}
  @strings_dictionary = {} # all strings are one string
  @version = nil
  @scanner = nil
end
62
-
63
- # parse the data in the new parser (the data already set through the initialize / new method)
64
# Parses the data given to the constructor and returns the Array of parsed
# PDF objects (Hashes). The result is cached in @parsed: a second call
# returns it without parsing again.
#
# Steps, in order: read the "%PDF-x.y" header into @version, tokenize the
# body with #_parse_, locate the root (trailer) dictionary, hand encrypted
# files to PDFDecrypt, unpack object streams into top-level objects,
# connect references to objects, catalog the pages and extract the Info
# dictionary.
#
# @return [Array<Hash>] the parsed objects ([] when the data is empty).
# @raise [RuntimeError] when a top-level item isn't a Hash, or when no root
#   dictionary could be found.
def parse
  return [] if @string_to_parse.empty?
  return @parsed unless @parsed.empty?

  @scanner = StringScanner.new @string_to_parse
  @scanner.pos = 0
  if @scanner.scan(/\%PDF\-[\d\-\.]+/)
    @version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
    # skip whatever follows the version on the header line, stopping at an
    # end of line or at the first "N N obj <<" object header.
    loop do
      break unless @scanner.scan(/[^\d\r\n]+/)
      break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/)
      break if @scanner.eos?
      @scanner.pos += 1
    end
  end
  @parsed = _parse_

  raise "Unknown PDF parsing error - maleformed PDF file?" unless @parsed.all? { |i| i.is_a?(Hash) }

  # no classic trailer was found - fall back to the XRef stream dictionaries.
  if @root_object == {}
    xref_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :XRef }
    xref_streams.each { |xref_dictionary| @root_object.merge! xref_dictionary }
  end
  raise "root is unknown - cannot determine if file is Encrypted" if @root_object == {}

  if @root_object[:Encrypt]
    change_references_to_actual_values @root_object
    warn "PDF is Encrypted! Attempting to decrypt - not yet fully supported."
    decryptor = PDFDecrypt.new @parsed, @root_object
    decryptor.decrypt
    # do we really need to apply to @parsed? No, there is no need.
  end

  ## search for objects streams
  object_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm }
  unless object_streams.empty?
    warn "PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects."

    object_streams.each do |o|
      ## un-encode (using the correct filter) the object streams
      PDFFilter.inflate_object o
      ## extract objects from stream to top level array @parsed
      @scanner = StringScanner.new o[:raw_stream_content]
      stream_data = _parse_
      # the parsed stream starts with integer pairs; the first number of
      # each pair is kept as an object id and the second one is dropped.
      # `Integer` is used because `Fixnum` no longer exists in Ruby >= 3.2.
      id_array = []
      while stream_data[0].is_a? Integer
        id_array << stream_data.shift
        stream_data.shift
      end
      while id_array[0] && stream_data[0]
        stream_data[0] = { indirect_without_dictionary: stream_data[0] } unless stream_data[0].is_a?(Hash)
        stream_data[0][:indirect_reference_id] = id_array.shift
        stream_data[0][:indirect_generation_number] = 0
        @parsed << stream_data.shift
      end
    end
  end

  # Strings were unified, we can let them go..
  @strings_dictionary.clear

  # two statements instead of a chained call on the returned `self`.
  serialize_objects_and_references
  catalog_pages

  @info_object = @root_object[:Info] ? (@root_object[:Info][:referenced_object] || @root_object[:Info]) : false
  if @info_object && @info_object.is_a?(Hash)
    @parsed.delete @info_object
    CombinePDF::PDF::PRIVATE_HASH_KEYS.each { |key| @info_object.delete key }
    # NOTE(review): this re-points @info_object at the :referenced_object of
    # one of its own values while iterating - confirm this is intended.
    @info_object.each { |_k, v| @info_object = v[:referenced_object] if v.is_a?(Hash) && v[:referenced_object] }
  else
    @info_object = {}
  end

  @parsed
end
152
-
153
- # the actual recursive parsing is done here.
154
- #
155
- # this is an internal function, but it was left exposed for possible future features.
156
# Tokenizes whatever is left in @scanner and returns the parsed items.
#
# The method calls itself for arrays ("[ ... ]") and dictionaries
# ("<< ... >>"); a nested call returns as soon as it consumes a closing
# "]" or ">>". Indirect objects ("N G obj ... endobj") are folded into a
# single Hash carrying :indirect_reference_id / :indirect_generation_number,
# references ("N G R") become Hashes flagged :is_reference_only (also
# collected in @references) and the trailer dictionary is merged into
# @root_object.
#
# @return [Array] the parsed items, in the order they were found.
# @raise [RuntimeError] if a stream is never closed with 'endstream'.
def _parse_
  out = []
  # false while an 'obj' keyword is still waiting for its 'endobj'.
  fresh = true
  while @scanner.rest?
    case
    ##########################################
    ## parse an Array
    ##########################################
    when @scanner.scan(/\[/)
      out << _parse_
    ##########################################
    ## parse a Dictionary
    ##########################################
    when @scanner.scan(/<</)
      data = _parse_
      obj = {}
      obj[data.shift] = data.shift while data[0]
      out << obj
    ##########################################
    ## return content of array or dictionary
    ##########################################
    when @scanner.scan(/\]/), @scanner.scan(/>>/)
      return out
    ##########################################
    ## parse a Stream
    ##########################################
    when @scanner.scan(/stream[\r\n]/)
      # "stream\r\n": the keyword match consumed the "\r", skip the "\n" too.
      @scanner.pos += 1 if @scanner.peek(1) == "\n" && @scanner.matched[-1] != "\n"
      # a non-strict RegExp is used because some PDF files don't have the
      # EOL marker that is required before 'endstream'.
      str = @scanner.scan_until(/endstream/)
      raise "Parsing Error: PDF file error - a stream object wasn't properly colsed using 'endstream'!" unless str
      content = str.sub(/(\r\n|\n|\r)?endstream\z/, "").force_encoding(Encoding::ASCII_8BIT)
      if out.last.is_a? Hash
        out.last[:raw_stream_content] = unify_string content
      else
        warn "Stream not attached to dictionary!"
        out << content
      end
    ##########################################
    ## parse an Object after finished
    ##########################################
    when @scanner.scan(/endobj/)
      # the three topmost items are: id, generation, content.
      if out.last.is_a? Hash
        out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
      else
        out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
      end
      fresh = true
    ##########################################
    ## parse a Hex String
    ##########################################
    when str = @scanner.scan(/<[0-9a-fA-F]+>/)
      out << unify_string([str[1..-2]].pack('H*').force_encoding(Encoding::ASCII_8BIT))
    ##########################################
    ## parse a Literal String
    ##########################################
    when @scanner.scan(/\(/)
      str = String.new # an empty binary String
      count = 1 # depth of unescaped, still open parentheses
      while count > 0 && @scanner.rest?
        scn = @scanner.scan_until(/[\(\)]/)
        unless scn
          warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
          count = 0 # error
          next
        end

        str << scn
        # a parenthesis is escaped when preceded by an odd number of backslashes.
        seperator_count = 0
        seperator_count += 1 while str[-2 - seperator_count] == "\\"

        case str[-1]
        when '('
          count += 1 unless seperator_count.odd?
        when ')'
          count -= 1 unless seperator_count.odd?
        else
          warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
          count = 0 # error
        end
      end
      # The PDF formatted string is: str[0..-2]
      out << unify_string(_unescape_literal_string_(str.force_encoding(Encoding::ASCII_8BIT)[0..-2]))
    ##########################################
    ## Parse a comment
    ##########################################
    when @scanner.scan(/\%/)
      # is a comment, skip until new line (or until an object header).
      loop do
        break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/) || @scanner.eos?
        @scanner.scan(/[^\d\r\n]+/) || @scanner.pos += 1
      end
    ##########################################
    ## Parse a Name
    ##########################################
    # all characters that aren't white space or special
    when str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+/)
      out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
    ##########################################
    ## Parse a Number
    ##########################################
    when str = @scanner.scan(/[\+\-\.\d]+/)
      out << (str.include?('.') ? str.to_f : str.to_i)
    ##########################################
    ## Parse an Object Reference
    ##########################################
    when @scanner.scan(/R/)
      out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
      @references << out.last
    ##########################################
    ## Parse Bool - true and after false
    ##########################################
    when @scanner.scan(/true/)
      out << true
    when @scanner.scan(/false/)
      out << false
    ##########################################
    ## Parse NULL - null
    ##########################################
    when @scanner.scan(/null/)
      out << nil
    ##########################################
    ## XREF - check for encryption... anything else?
    ##########################################
    when @scanner.scan(/xref/)
      ## get root object to check for encryption
      @scanner.scan_until(/(trailer)|(\%EOF)/)
      fresh = true
      # `matched` is nil when neither marker was found.
      if @scanner.matched && @scanner.matched[-1] == 'r'
        if @scanner.skip_until(/<</)
          data = _parse_
          @root_object ||= {}
          @root_object[data.shift] = data.shift while data[0]
        end
        ## skip until the end of the segment, marked by %%EOF
        @scanner.skip_until(/\%\%EOF/)
        ## If this was the last valid segment, ignore any trailing garbage
        ## (issue #49 resolution)
        break unless @scanner.exist?(/\%\%EOF/)
      end
    when @scanner.scan(/[\s]+/)
      # Generally, do nothing
      nil
    when @scanner.scan(/obj[\s]*/)
      # Fix wkhtmltopdf PDF authoring issue - missing 'endobj' keywords:
      # the previous object is closed here, keeping the id and generation
      # that belong to the object that is just starting.
      unless fresh || (out[-4].nil? || out[-4].is_a?(Hash))
        keep = []
        keep << out.pop
        keep << out.pop

        if out.last.is_a? Hash
          out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
        else
          out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
        end
        warn "'endobj' keyword was missing for Object ID: #{out.last[:indirect_reference_id]}, trying to auto-fix issue, but might fail."

        out << keep.pop
        out << keep.pop
      end
      fresh = false
    else
      # always advance
      warn "Warning: parser advnacing for unknown reason. Potential data-loss."
      @scanner.pos = @scanner.pos + 1
    end
  end
  out
end

# @private
# Converts the raw content of a PDF literal string (without the enclosing
# parentheses) to the bytes it stands for and returns them as a binary String.
#
# An unescaped end-of-line marker becomes a single LF (0x0A), an escaped
# end-of-line marker is dropped, "\ddd" is read as one to three OCTAL digits
# (only the low 8 bits are kept), "\f" is a form feed (0x0C), and a backslash
# before any other character is ignored.
def _unescape_literal_string_(raw)
  bytes = raw.bytes.to_a
  result = []
  until bytes.empty?
    case bytes[0]
    when 13 # eol - \r (optionally followed by \n)
      bytes.shift
      bytes.shift if bytes[0] == 10
      result << 10
    when 10 # eol - \n (optionally followed by \r)
      bytes.shift
      bytes.shift if bytes[0] == 13
      result << 10
    when 92 # "\\".ord == 92
      bytes.shift
      rep = bytes.shift
      case rep
      when nil # a dangling backslash at the very end, nothing to escape
        nil
      when 110 # n
        result << 10 # new line
      when 114 # r
        result << 13 # CR
      when 116 # t
        result << 9 # tab
      when 98 # b
        result << 8 # backspace
      when 102 # f
        result << 12 # form feed
      when 48..55 # octal notation for a byte: up to three digits, 0-7
        value = rep - 48
        2.times do
          break unless bytes[0] && bytes[0].between?(48, 55)
          value = (value << 3) + (bytes.shift - 48)
        end
        result << (value & 0xFF)
      when 10 # escaped new line, ignore
        bytes.shift if bytes[0] == 13
      when 13 # escaped new line (or double notation for new line), ignore
        bytes.shift if bytes[0] == 10
      else
        result << rep
      end
    else
      result << bytes.shift
    end
  end
  result.pack('C*').force_encoding(Encoding::ASCII_8BIT)
end
private :_unescape_literal_string_
409
-
410
- protected
411
-
412
-
413
-
414
- # resets cataloging and pages
415
# Walks the page tree and prepares every page found.
#
# Called without arguments, the walk starts at the catalog named by
# root_object[:Root] (or, failing that, at the last parsed object of type
# :Catalog); the catalog is then moved to the end of @parsed. On the way
# down, MediaBox, CropBox, Rotate, Resources and ColorSpace values found on
# non-page nodes are collected in +inheritance_hash+ and applied to every
# :Page that doesn't define them itself. Each :Page Hash is extended with
# Page_Methods. The catalog's AcroForm and Names dictionaries are merged
# into @forms_object and @names_object.
#
# @param catalogs [Hash, Array, nil] the node (or nodes) to process.
# @param inheritance_hash [Hash] the values inherited from parent nodes.
# @return [self]
# @raise [RuntimeError] when no catalog can be found, or when a non-page
#   node carries :AS or :OCProperties (Optional Content).
def catalog_pages(catalogs = nil, inheritance_hash = {})
  unless catalogs
    if root_object[:Root]
      catalogs = root_object[:Root][:referenced_object] || root_object[:Root]
    else
      catalogs = (@parsed.select { |obj| obj[:Type] == :Catalog }).last
    end
    # fail before touching @parsed, so a nil is never appended to it.
    raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs

    @parsed.delete_if { |obj| obj[:Type] == :Catalog }
    @parsed << catalogs
  end

  case catalogs
  when Array
    catalogs.each { |c| catalog_pages(c, inheritance_hash) unless c.nil? }
  when Hash
    if catalogs[:is_reference_only]
      if catalogs[:referenced_object]
        catalog_pages(catalogs[:referenced_object], inheritance_hash)
      else
        warn "couldn't follow reference!!! #{catalogs} not found!"
      end
    else
      unless catalogs[:Type] == :Page
        raise "Optional Content PDF files aren't supported and their pages cannot be safely extracted." if catalogs[:AS] || catalogs[:OCProperties]
        [:MediaBox, :CropBox, :Rotate].each do |key|
          inheritance_hash[key] = catalogs[key] if catalogs[key]
        end
        [:Resources, :ColorSpace].each do |key|
          next unless catalogs[key]
          (inheritance_hash[key] ||= {}).update((catalogs[key][:referenced_object] || catalogs[key]), &self.class.method(:hash_update_proc_for_new))
        end
      end

      case catalogs[:Type]
      when :Page
        # the page's own values win over the inherited ones.
        [:MediaBox, :CropBox, :Rotate].each do |key|
          catalogs[key] ||= inheritance_hash[key] if inheritance_hash[key]
        end
        [:Resources, :ColorSpace].each do |key|
          next unless inheritance_hash[key]
          (catalogs[key] ||= {}).update(inheritance_hash[key], &self.class.method(:hash_update_proc_for_old))
        end

        # avoid references on MediaBox, CropBox and Rotate
        [:MediaBox, :CropBox, :Rotate].each do |key|
          value = catalogs[key]
          next unless value.is_a?(Hash) && value[:referenced_object].is_a?(Hash) && value[:referenced_object][:indirect_without_dictionary]
          catalogs[key] = value[:referenced_object][:indirect_without_dictionary]
        end

        catalogs.extend Page_Methods
      when :Pages
        catalog_pages(catalogs[:Kids], inheritance_hash.dup) unless catalogs[:Kids].nil?
      when :Catalog
        @forms_object.update((catalogs[:AcroForm][:referenced_object] || catalogs[:AcroForm]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:AcroForm]
        @names_object.update((catalogs[:Names][:referenced_object] || catalogs[:Names]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Names]
        catalog_pages(catalogs[:Pages], inheritance_hash.dup) unless catalogs[:Pages].nil?
      end
    end
  end
  self
end
482
-
483
- # fails!
484
# Replaces, in place, every top-level value of the Hash that is a reference
# (a Hash flagged :is_reference_only) with the object it points to, as found
# by #get_refernced_object. When the object found only wraps a value (it has
# a truthy :indirect_without_dictionary), the wrapped value is used instead.
# A reference that cannot be resolved is reported and left untouched.
#
# @param hash_with_references [Hash] the Hash to update.
# @return [Hash] the same Hash that was given.
def change_references_to_actual_values(hash_with_references = {})
  hash_with_references.each do |k, v|
    next unless v.is_a?(Hash) && v[:is_reference_only]

    resolved = get_refernced_object(v)
    resolved = resolved[:indirect_without_dictionary] if resolved.is_a?(Hash) && resolved[:indirect_without_dictionary]
    if resolved.nil?
      # report the reference that failed, not the whole Hash being updated.
      warn "Couldn't connect all values from references - didn't find reference #{v}!!!"
      resolved = v
    end
    # only the value of an existing key is replaced, which is safe while iterating.
    hash_with_references[k] = resolved
  end
  hash_with_references
end
495
-
496
# Looks up, in @parsed, the first Hash whose :indirect_reference_id and
# :indirect_generation_number both equal those of the given reference.
#
# @param reference_hash [Hash] the reference to look up.
# @return [Hash, nil] the matching object, or nil (after a warning) when
#   there is none.
def get_refernced_object(reference_hash = {})
  found = @parsed.find do |candidate|
    candidate.is_a?(Hash) &&
      reference_hash[:indirect_reference_id] == candidate[:indirect_reference_id] &&
      reference_hash[:indirect_generation_number] == candidate[:indirect_generation_number]
  end
  return found if found

  warn "didn't find reference #{reference_hash}"
  nil
end
505
-
506
- # @private
507
- # connects references and objects, according to their reference id's.
508
- #
509
- # should be moved to the parser's workflow.
510
- #
511
# Connects every reference collected in @references to the object it names.
#
# The id and generation number are REMOVED from each object in @parsed and
# used as the key of a lookup table; every reference then receives the
# matching object under :referenced_object (nil, with a warning, when there
# is no match) and loses its own id and generation number.
#
# @return [self]
def serialize_objects_and_references
  directory = @parsed.each_with_object({}) do |object, table|
    id = object.delete(:indirect_reference_id)
    generation = object.delete(:indirect_generation_number)
    table[[id, generation]] = object
  end

  @references.each do |reference|
    target = directory[[reference[:indirect_reference_id], reference[:indirect_generation_number]]]
    reference[:referenced_object] = target
    warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{reference.to_s}" unless target
    reference.delete(:indirect_reference_id)
    reference.delete(:indirect_generation_number)
  end
  self
end
522
-
523
- # All Strings are one String
524
# Returns the String object already stored in @strings_dictionary for this
# content, storing (and returning) the given object when there is none yet,
# so equal strings end up being one and the same object.
def unify_string(str)
  known = @strings_dictionary[str]
  return known if known

  @strings_dictionary[str] = str
end
527
-
528
- # @private
529
- # this method reviews a Hash and updates it by merging Hash data,
530
- # preferring the old over the new.
531
# Conflict resolver for Hash#update / Hash#merge that prefers the OLD data.
#
# When both values are Hashes they are merged recursively (keys missing
# from the old Hash are added, conflicts resolved the same way). In every
# other case - including an old Hash facing a non-Hash value, which used
# to raise a TypeError - the old value is kept.
#
# @param key [Object] the conflicting key (unused).
# @param old_data [Object] the value already in the Hash.
# @param new_data [Object] the incoming value.
# @return [Object] the value to store.
def self.hash_update_proc_for_old(key, old_data, new_data)
  if old_data.is_a?(Hash) && new_data.is_a?(Hash)
    old_data.merge(new_data, &method(:hash_update_proc_for_old))
  else
    old_data
  end
end
538
- # @private
539
- # this method reviews a Hash and updates it by merging Hash data,
540
- # preferring the new over the old.
541
# Conflict resolver for Hash#update / Hash#merge that prefers the NEW data.
#
# When both values are Hashes they are merged recursively (conflicts
# resolved the same way). In every other case - including an old Hash
# facing a non-Hash value, which used to raise a TypeError - the new value
# replaces the old one.
#
# @param key [Object] the conflicting key (unused).
# @param old_data [Object] the value already in the Hash.
# @param new_data [Object] the incoming value.
# @return [Object] the value to store.
def self.hash_update_proc_for_new(key, old_data, new_data)
  if old_data.is_a?(Hash) && new_data.is_a?(Hash)
    old_data.merge(new_data, &method(:hash_update_proc_for_new))
  else
    new_data
  end
end
548
-
549
- # # @private
550
- # # connects references and objects, according to their reference id's.
551
- # #
552
- # # should be moved to the parser's workflow.
553
- # #
554
- # def old_serialize_objects_and_references(object = nil)
555
- # objects_reference_hash = {}
556
- # # @parsed.each {|o| objects_reference_hash[ [ o.delete(:indirect_reference_id), o.delete(:indirect_generation_number) ] ] = o }
557
- # @parsed.each {|o| objects_reference_hash[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
558
- # each_object(@parsed) do |obj|
559
- # if obj[:is_reference_only]
560
- # obj[:referenced_object] = objects_reference_hash[ [obj[:indirect_reference_id], obj[:indirect_generation_number] ] ]
561
- # warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj.to_s}" unless obj[:referenced_object]
562
- # # obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
563
- # end
564
- # end
565
- # self
566
- # end
567
-
568
- # # run block of code on evey PDF object (PDF objects are class Hash)
569
- # def each_object(object, limit_references = true, already_visited = {}, &block)
570
- # unless limit_references
571
- # already_visited[object.object_id] = true
572
- # end
573
- # case
574
- # when object.is_a?(Array)
575
- # object.each {|obj| each_object(obj, limit_references, already_visited, &block)}
576
- # when object.is_a?(Hash)
577
- # yield(object)
578
- # unless limit_references && object[:is_reference_only]
579
- # object.each do |k,v|
580
- # each_object(v, limit_references, already_visited, &block) unless already_visited[v.object_id]
581
- # end
582
- # end
583
- # end
584
- # end
585
-
586
- end
9
+ # @!visibility private
10
+ # @private
11
+ #:nodoc: all
12
+
13
+ protected
14
+
15
+ # This is the Parser class.
16
+ #
17
+ # It takes PDF data and parses it.
18
+ #
19
+ # The information is then used to initialize a PDF object.
20
+ #
21
+ # This is an internal class. you don't need it.
22
+ class PDFParser
23
+ # @!visibility private
24
+
25
+ # the array containing all the parsed data (PDF Objects)
26
+ attr_reader :parsed
27
+ # a Float representing the PDF version of the data parsed (if exists).
28
+ attr_reader :version
29
+ # the info and root objects, as found (if found) in the PDF file.
30
+ #
31
+ # they are mainly used to know if the file is (was) encrypted and to get more details.
32
+ attr_reader :info_object, :root_object, :names_object, :forms_object, :outlines_object
33
+
34
+ attr_reader :allow_optional_content
35
+ # when creating a parser, it is important to set the data (String) we wish to parse.
36
+ #
37
+ # <b>the data is required and it is not possible to set the data at a later stage</b>
38
+ #
39
+ # string:: the data to be parsed, as a String object.
40
# Creates a parser for the given PDF data. Nothing is parsed until #parse is called.
#
# The data is re-tagged as binary (ASCII-8BIT). An unfrozen String is
# re-tagged in place (no copy is made); a frozen String is copied first,
# because String#force_encoding mutates its receiver and would otherwise
# raise a FrozenError.
#
# @param string [String] the data to be parsed.
# @param options [Hash] only :allow_optional_content is read here; its
#   value is stored as given and exposed through #allow_optional_content.
# @raise [TypeError] if +string+ is not a String.
def initialize(string, options = {})
  raise TypeError, "couldn't parse data, expecting type String" unless string.is_a? String
  string = string.dup if string.frozen?
  @string_to_parse = string.force_encoding(Encoding::ASCII_8BIT)
  # every literal below already evaluates to a brand new object.
  @literal_strings = []
  @hex_strings = []
  @streams = []
  @parsed = []
  @references = []
  @root_object = {}
  @info_object = {}
  @names_object = {}
  @outlines_object = {}
  @forms_object = {}
  @strings_dictionary = {} # all strings are one string
  @version = nil
  @scanner = nil
  @allow_optional_content = options[:allow_optional_content]
end
58
+
59
+ # parse the data in the new parser (the data already set through the initialize / new method)
60
# Parses the data given to the constructor and returns the Array of parsed
# PDF objects (Hashes). The result is cached in @parsed: a second call
# returns it without parsing again.
#
# Steps, in order: read the "%PDF-x.y" header into @version, tokenize the
# body with #_parse_, locate the root (trailer) dictionary, hand encrypted
# files to PDFDecrypt, unpack object streams into top-level objects,
# connect references to objects, catalog the pages, collect the form field
# objects and extract the Info dictionary.
#
# @return [Array<Hash>] the parsed objects ([] when the data is empty).
# @raise [RuntimeError] when a top-level item isn't a Hash, or when no root
#   dictionary could be found.
def parse
  return [] if @string_to_parse.empty?
  return @parsed unless @parsed.empty?

  @scanner = StringScanner.new @string_to_parse
  @scanner.pos = 0
  if @scanner.scan(/\%PDF\-[\d\-\.]+/)
    @version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
    # skip whatever follows the version on the header line, stopping at an
    # end of line or at the first "N N obj <<" object header.
    loop do
      break unless @scanner.scan(/[^\d\r\n]+/)
      break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/)
      break if @scanner.eos?
      @scanner.pos += 1
    end
  end
  @parsed = _parse_

  raise 'Unknown PDF parsing error - maleformed PDF file?' unless @parsed.all? { |i| i.is_a?(Hash) }

  # no classic trailer was found - fall back to the XRef stream dictionaries.
  if @root_object == {}
    xref_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :XRef }
    xref_streams.each { |xref_dictionary| @root_object.merge! xref_dictionary }
  end
  raise 'root is unknown - cannot determine if file is Encrypted' if @root_object == {}

  if @root_object[:Encrypt]
    # change_references_to_actual_values @root_object
    warn 'PDF is Encrypted! Attempting to decrypt - not yet fully supported.'
    decryptor = PDFDecrypt.new @parsed, @root_object
    decryptor.decrypt
    # do we really need to apply to @parsed? No, there is no need.
  end

  ## search for objects streams
  object_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm }
  unless object_streams.empty?
    warn 'PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects.'

    object_streams.each do |o|
      ## un-encode (using the correct filter) the object streams
      PDFFilter.inflate_object o
      ## extract objects from stream to top level array @parsed
      @scanner = StringScanner.new o[:raw_stream_content]
      stream_data = _parse_
      # the parsed stream starts with integer pairs; the first number of
      # each pair is kept as an object id and the second one is dropped.
      # `Integer` is used because `Fixnum` no longer exists in Ruby >= 3.2.
      id_array = []
      while stream_data[0].is_a? Integer
        id_array << stream_data.shift
        stream_data.shift
      end
      while id_array[0] && stream_data[0]
        stream_data[0] = { indirect_without_dictionary: stream_data[0] } unless stream_data[0].is_a?(Hash)
        stream_data[0][:indirect_reference_id] = id_array.shift
        stream_data[0][:indirect_generation_number] = 0
        @parsed << stream_data.shift
      end
    end
  end

  # Strings were unified, we can let them go..
  @strings_dictionary.clear

  serialize_objects_and_references

  catalog_pages

  # collect any missing objects from the forms_data: every parsed object
  # with an :FT entry is wrapped in a reference Hash.
  unless @forms_object.nil? || @forms_object.empty?
    @forms_object[:related_objects] = (@parsed.select { |o| o[:FT] }).map! { |o| { is_reference_only: true, referenced_object: o } }
    # NOTE(review): the list holds reference wrappers, so this delete can
    # only remove a wrapper that is == @forms_object - confirm the intent.
    @forms_object[:related_objects].delete @forms_object
  end

  @info_object = @root_object[:Info] ? (@root_object[:Info][:referenced_object] || @root_object[:Info]) : false
  if @info_object && @info_object.is_a?(Hash)
    @parsed.delete @info_object
    CombinePDF::PDF::PRIVATE_HASH_KEYS.each { |key| @info_object.delete key }
    # NOTE(review): this re-points @info_object at the :referenced_object of
    # one of its own values while iterating - confirm this is intended.
    @info_object.each { |_k, v| @info_object = v[:referenced_object] if v.is_a?(Hash) && v[:referenced_object] }
  else
    @info_object = {}
  end

  @parsed
end
156
+
157
+ # the actual recursive parsing is done here.
158
+ #
159
+ # this is an internal function, but it was left exposed for possible future features.
160
# Tokenizes the data held by @scanner and returns the parsed values as an Array.
#
# The method calls itself for every array (`[`) and dictionary (`<<`) it
# meets; the nested call returns as soon as it consumes the matching `]` or
# `>>`, handing the collected items back to the caller.
#
# Side effects: advances @scanner, fills @root_object from any trailer
# dictionary found after an `xref` keyword, and de-duplicates strings
# through #unify_string.
#
# Raises RuntimeError when a stream is not terminated by `endstream`.
def _parse_
  out = []
  str = ''
  # true until an `obj` keyword is seen without a matching `endobj`
  fresh = true
  while @scanner.rest?
    if @scanner.scan(/\[/)
      out << _parse_
    ##########################################
    ## parse a Dictionary
    ##########################################
    elsif @scanner.scan(/<</)
      data = _parse_
      obj = {}
      # the flat item list alternates key, value, key, value...
      obj[data.shift] = data.shift while data[0]
      out << obj
    ##########################################
    ## return content of array or dictionary
    ##########################################
    elsif @scanner.scan(/\]/) || @scanner.scan(/>>/)
      return out
    ##########################################
    ## parse a Stream
    ##########################################
    elsif @scanner.scan(/stream[\r\n]/)
      # consume the "\n" of a "\r\n" end-of-line pair
      @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
      # a strict /(\r\n|\r|\n)endstream/ was discarded because some PDF files
      # don't have the EOL marker before `endstream`.
      str = @scanner.scan_until(/endstream/)
      # raise error if the stream doesn't end.
      raise "Parsing Error: PDF file error - a stream object wasn't properly colsed using 'endstream'!" unless str
      # need to remove end of stream
      if out.last.is_a? Hash
        out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
      else
        warn 'Stream not attached to dictionary!'
        out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
      end
    ##########################################
    ## parse an Object after finished
    ##########################################
    elsif str = @scanner.scan(/endobj/)
      # the stack holds: id, generation, value. Fold them into one Hash.
      if out.last.is_a? Hash
        out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
      else
        out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
      end
      fresh = true
    ##########################################
    ## parse a Hex String
    ##########################################
    elsif str = @scanner.scan(/<[0-9a-fA-F]*>/)
      out << unify_string([str[1..-2]].pack('H*').force_encoding(Encoding::ASCII_8BIT))
    ##########################################
    ## parse a Literal String
    ##########################################
    elsif @scanner.scan(/\(/)
      str = String.new # empty binary (ASCII-8BIT) buffer
      count = 1 # depth of unescaped, still open parentheses
      while count > 0 && @scanner.rest?
        scn = @scanner.scan_until(/[\(\)]/)
        unless scn
          warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
          count = 0 # error
          next
        end

        str += scn.to_s
        # count the backslashes directly preceding the parenthesis:
        # an odd number means the parenthesis itself is escaped.
        seperator_count = 0
        seperator_count += 1 while str[-2 - seperator_count] == '\\'

        case str[-1]
        when '('
          count += 1 unless seperator_count.odd?
        when ')'
          count -= 1 unless seperator_count.odd?
        else
          warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
          count = 0 # error
        end
      end
      # The PDF formatted string is: str[0..-2]
      # now starting to convert to regular string
      str_bytes = str.force_encoding(Encoding::ASCII_8BIT)[0..-2].bytes.to_a
      str = []
      until str_bytes.empty?
        case str_bytes[0]
        when 13 # eol - \r
          # An unescaped end-of-line marker inside a literal string is read
          # as a single LINE FEED (0Ah), whatever its original form.
          str_bytes.shift
          str_bytes.shift if str_bytes[0] == 10
          str << 10
        when 10 # eol - \n
          str_bytes.shift
          str_bytes.shift if str_bytes[0] == 13
          str << 10
        when 92 # "\\".ord == 92
          str_bytes.shift
          rep = str_bytes.shift
          case rep
          when 110 # n
            str << 10 # new line
          when 114 # r
            str << 13 # CR
          when 116 # t
            str << 9 # tab
          when 98 # b
            str << 8
          when 102 # f, form-feed
            str << 12
          when 48..55 # \ddd - one to three OCTAL digits
            value = rep - 48
            2.times do
              break unless str_bytes[0] && str_bytes[0].between?(48, 55)
              value = (value << 3) + (str_bytes.shift - 48)
            end
            # high-order overflow is ignored, keeping the value a single byte
            str << (value & 0xFF)
          when 10 # new line, ignore
            str_bytes.shift if str_bytes[0] == 13
            true
          when 13 # new line (or double notation for new line), ignore
            str_bytes.shift if str_bytes[0] == 10
            true
          else
            # any other escaped character stands for itself; `rep` is nil
            # when the data ended right after the backslash.
            str << rep unless rep.nil?
          end
        else
          str << str_bytes.shift
        end
      end
      out << unify_string(str.pack('C*').force_encoding(Encoding::ASCII_8BIT))
    ##########################################
    ## Parse a comment
    ##########################################
    elsif str = @scanner.scan(/\%/)
      # is a comment, skip until new line (or until an object header shows up)
      loop do
        break if @scanner.check(/([\d]+[\s]+[\d]+[\s]+obj[\n\r\s]+\<\<)|([\n\r]+)/) || @scanner.eos?
        @scanner.scan(/[^\d\r\n]+/) || @scanner.pos += 1
      end
    ##########################################
    ## Parse a Name
    ##########################################
    # matches every character that isn't white space or a PDF delimiter
    elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/)
      # "#xx" sequences inside a name are hex-encoded bytes
      out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
    ##########################################
    ## Parse a Number
    ##########################################
    elsif str = @scanner.scan(/[\+\-\.\d]+/)
      str =~ /\./ ? (out << str.to_f) : (out << str.to_i)
    ##########################################
    ## Parse an Object Reference
    ##########################################
    elsif @scanner.scan(/R/)
      out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
    ##########################################
    ## Parse Bool - true and after false
    ##########################################
    elsif @scanner.scan(/true/)
      out << true
    elsif @scanner.scan(/false/)
      out << false
    ##########################################
    ## Parse NULL - null
    ##########################################
    elsif @scanner.scan(/null/)
      out << nil
    ##########################################
    ## XREF - check for encryption... anything else?
    ##########################################
    elsif @scanner.scan(/xref/)
      ##########
      ## get root object to check for encryption
      @scanner.scan_until(/(trailer)|(\%EOF)/)
      fresh = true
      # `matched` is nil when neither keyword follows; a match ending with
      # 'r' means `trailer` was found (rather than `%EOF`).
      if @scanner.matched && @scanner.matched[-1] == 'r'
        if @scanner.skip_until(/<</)
          data = _parse_
          @root_object ||= {}
          @root_object[data.shift] = data.shift while data[0]
        end
        ##########
        ## skip until end of segment, marked by %%EOF
        @scanner.skip_until(/\%\%EOF/)
        ##########
        ## If this was the last valid segment, ignore any trailing garbage
        ## (issue #49 resolution)
        break unless @scanner.exist?(/\%\%EOF/)

      end

    elsif @scanner.scan(/[\s]+/)
      # Generally, do nothing
      nil
    elsif @scanner.scan(/obj[\s]*/)
      # Fix wkhtmltopdf PDF authoring issue - missing 'endobj' keywords
      unless fresh || (out[-4].nil? || out[-4].is_a?(Hash))
        # set aside the new object's generation number and id...
        keep = []
        keep << out.pop
        keep << out.pop

        # ...close the previous object as `endobj` would have...
        if out.last.is_a? Hash
          out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
        else
          out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
        end
        warn "'endobj' keyword was missing for Object ID: #{out.last[:indirect_reference_id]}, trying to auto-fix issue, but might fail."

        # ...and put the new object's header back on the stack.
        out << keep.pop
        out << keep.pop
      end
      fresh = false
    else
      # always advance
      warn 'Warning: parser advancing for unknown reason. Potential data-loss.'
      @scanner.pos = @scanner.pos + 1
    end
  end
  out
end
409
+
410
+ protected
411
+
412
+ # resets cataloging and pages
413
# Walks the catalog / page tree, copying inheritable attributes down to each
# page and extending every :Page dictionary with Page_Methods.
#
# catalogs:: the node (Hash), list of nodes (Array) or nil. When nil, the
#            walk starts from root_object[:Root], falling back to the last
#            :Catalog object in @parsed; the chosen catalog is then made the
#            only :Catalog entry in @parsed.
# inheritance_hash:: attributes collected from ancestor nodes
#            (:MediaBox, :CropBox, :Rotate, :Resources, :ColorSpace).
#
# Returns self. Raises RuntimeError when no catalog can be found, or when an
# Optional Content node is met and @allow_optional_content isn't set.
def catalog_pages(catalogs = nil, inheritance_hash = {})
  unless catalogs
    if root_object[:Root]
      catalogs = root_object[:Root][:referenced_object] || root_object[:Root]
    else
      catalogs = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :Catalog }.last
    end
    # fail before touching @parsed, so no nil entry is left behind
    raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs

    @parsed.delete_if { |obj| obj.is_a?(Hash) && obj[:Type] == :Catalog }
    @parsed << catalogs
  end
  if catalogs.is_a?(Array)
    catalogs.each { |c| catalog_pages(c, inheritance_hash) unless c.nil? }
  elsif catalogs.is_a?(Hash)
    if catalogs[:is_reference_only]
      if catalogs[:referenced_object]
        catalog_pages(catalogs[:referenced_object], inheritance_hash)
      else
        warn "couldn't follow reference!!! #{catalogs} not found!"
      end
    else
      unless catalogs[:Type] == :Page
        raise "Optional Content PDF files aren't supported and their pages cannot be safely extracted." if (catalogs[:AS] || catalogs[:OCProperties]) && !@allow_optional_content
        # simple attributes: the closest ancestor's value replaces older ones
        [:MediaBox, :CropBox, :Rotate].each do |key|
          inheritance_hash[key] = catalogs[key] if catalogs[key]
        end
        # dictionary attributes: merged, keeping values already collected
        [:Resources, :ColorSpace].each do |key|
          next unless catalogs[key]
          inheritance_hash[key] ||= { referenced_object: {}, is_reference_only: true }
          (inheritance_hash[key][:referenced_object] || inheritance_hash[key]).update((catalogs[key][:referenced_object] || catalogs[key]), &self.class.method(:hash_update_proc_for_old))
        end
      end

      case catalogs[:Type]
      when :Page
        # the page's own values win over inherited ones
        [:MediaBox, :CropBox, :Rotate].each do |key|
          catalogs[key] ||= inheritance_hash[key] if inheritance_hash[key]
        end
        [:Resources, :ColorSpace].each do |key|
          next unless inheritance_hash[key]
          catalogs[key] ||= { referenced_object: {}, is_reference_only: true }
          (catalogs[key][:referenced_object] || catalogs[key]).update((inheritance_hash[key][:referenced_object] || inheritance_hash[key]), &self.class.method(:hash_update_proc_for_old))
        end

        # avoid references on MediaBox, CropBox and Rotate
        [:MediaBox, :CropBox, :Rotate].each do |key|
          next unless catalogs[key].is_a?(Hash) && catalogs[key][:referenced_object].is_a?(Hash)
          direct_value = catalogs[key][:referenced_object][:indirect_without_dictionary]
          catalogs[key] = direct_value if direct_value
        end

        catalogs.instance_eval { extend Page_Methods }
      when :Pages
        # children get a copy, so siblings don't see each other's additions
        catalog_pages(catalogs[:Kids], inheritance_hash.dup) unless catalogs[:Kids].nil?
      when :Catalog
        @forms_object.update((catalogs[:AcroForm][:referenced_object] || catalogs[:AcroForm]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:AcroForm]
        @names_object.update((catalogs[:Names][:referenced_object] || catalogs[:Names]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Names]
        @outlines_object.update((catalogs[:Outlines][:referenced_object] || catalogs[:Outlines]), &self.class.method(:hash_update_proc_for_new)) if catalogs[:Outlines]
        catalog_pages(catalogs[:Pages], inheritance_hash.dup) unless catalogs[:Pages].nil?
      end
    end
  end
  self
end
494
+
495
# Looks up the stored object a reference points at.
#
# reference_hash:: a Hash carrying :indirect_reference_id and
#                  :indirect_generation_number.
#
# Returns the first Hash in @parsed whose id and generation number both equal
# those of the reference; otherwise prints a warning and returns nil.
def get_refernced_object(reference_hash = {})
  wanted_id = reference_hash[:indirect_reference_id]
  wanted_generation = reference_hash[:indirect_generation_number]
  match = @parsed.find do |candidate|
    candidate.is_a?(Hash) &&
      wanted_id == candidate[:indirect_reference_id] &&
      wanted_generation == candidate[:indirect_generation_number]
  end
  return match if match

  warn "didn't find reference #{reference_hash}"
  nil
end
507
+
508
+ # # @private
509
+ # # connects references and objects, according to their reference id's.
510
+ # #
511
+ # # should be moved to the parser's workflow.
512
+ # #
513
+ # def serialize_objects_and_references_old
514
+ # obj_dir = {}
515
+ # # create a dictionary for referenced objects (no value resolution at this point)
516
+ # @parsed.each { |o| obj_dir[[o.delete(:indirect_reference_id), o.delete(:indirect_generation_number)]] = o }
517
+ # # @parsed.each {|o| obj_dir[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
518
+ # @references.each do |obj|
519
+ # obj[:referenced_object] = obj_dir[[obj[:indirect_reference_id], obj[:indirect_generation_number]]]
520
+ # warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj}" unless obj[:referenced_object]
521
+ # obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
522
+ # end
523
+ # obj_dir.clear
524
+ # @references.clear
525
+ # self
526
+ # end
527
+
528
+ # @private
529
+ # connects references and objects, according to their reference id's.
530
+ #
531
+ # Also replaces :indirect_without_dictionary objects with their actual values. Strings, Hashes and Arrays still share memory space.
532
+ #
533
+ # should be moved to the parser's workflow.
534
+ #
535
# Connects every reference found under @parsed and @root_object to the object
# it points at, matching on [id, generation number].
#
# A reference Hash (one with :is_reference_only) gets a :referenced_object
# entry and loses its id / generation keys. When the target carries an
# :indirect_without_dictionary value, the reference is replaced by that
# value; a reference without an id is replaced by nil.
#
# Works iteratively with an explicit stack instead of recursion.
# Returns nil.
def serialize_objects_and_references
  directory = {}
  # index the stored objects (no value resolution at this point)
  @parsed.each { |item| directory[[item[:indirect_reference_id], item[:indirect_generation_number]]] = item }

  # resolves one reference Hash and returns the value that should replace it
  connect = lambda do |ref|
    next nil if ref[:indirect_reference_id].nil?

    ref[:referenced_object] = directory[[ref[:indirect_reference_id], ref[:indirect_generation_number]]]
    warn "Couldn't connect reference for #{ref}" if ref[:referenced_object].nil?
    ref.delete :indirect_reference_id
    ref.delete :indirect_generation_number
    (ref[:referenced_object] && ref[:referenced_object][:indirect_without_dictionary]) || ref
  end

  pending = [@parsed, @root_object]
  until pending.empty?
    container = pending.pop
    case container
    when Hash
      container.keys.each do |key|
        value = container[key]
        case value
        when Hash
          if value[:is_reference_only]
            container[key] = connect.call(value)
          else
            pending << value
          end
        when Array
          pending << value
        end
      end
    when Array
      container.map! do |value|
        case value
        when Hash
          if value[:is_reference_only]
            connect.call(value)
          else
            pending << value
            value
          end
        when Array
          pending << value
          value
        else
          value
        end
      end
    end
  end
end
589
+
590
+ # def serialize_objects_and_references
591
+ # rec_resolve = proc do |level|
592
+ # if level.is_a?(Hash)
593
+ # if level[:is_reference_only]
594
+ # level[:referenced_object] = get_refernced_object(level)
595
+ # level = (level[:referenced_object] && level[:referenced_object][:indirect_without_dictionary]) || level
596
+ # level.delete :indirect_reference_id
597
+ # level.delete :indirect_generation_number
598
+ # else
599
+ # level.keys.each do |k|
600
+ # level[k] = rec_resolve.call(level[k]) unless level[k].is_a?(Hash) && level[k][:indirect_reference_id] && level[k][:is_reference_only].nil?
601
+ # end
602
+ # end
603
+ # elsif level.is_a?(Array)
604
+ # level.map! { |o| rec_resolve.call(o) }
605
+ # end
606
+ # level
607
+ # end
608
+ # rec_resolve.call(@root_object)
609
+ # rec_resolve.call(@parsed)
610
+ # self
611
+ # end
612
+
613
+ # All Strings are one String
614
# Returns the String instance already stored in @strings_dictionary under an
# equal key, storing (and returning) +str+ itself when there is none.
def unify_string(str)
  known = @strings_dictionary[str]
  return known if known

  @strings_dictionary[str] = str
end
617
+
618
+ # @private
619
+ # this method reviews a Hash and updates it by merging Hash data,
620
+ # preferring the old over the new.
621
# Conflict resolver for Hash#merge / Hash#update that prefers the old value.
#
# When both values are Hashes they are merged recursively (old entries win);
# in every other case the old value is returned untouched. Checking the new
# value as well keeps Hash#merge from raising a TypeError when a Hash
# collides with a non-Hash value.
def self.hash_update_proc_for_old(_key, old_data, new_data)
  return old_data unless old_data.is_a?(Hash) && new_data.is_a?(Hash)

  old_data.merge(new_data, &method(:hash_update_proc_for_old))
end
628
+
629
+ # @private
630
+ # this method reviews a Hash and updates it by merging Hash data,
631
+ # preferring the new over the old.
632
# Conflict resolver for Hash#merge / Hash#update that prefers the new value.
#
# When both values are Hashes they are merged recursively (new entries win);
# in every other case the new value is returned. Checking the new value as
# well keeps Hash#merge from raising a TypeError when a Hash collides with a
# non-Hash value.
def self.hash_update_proc_for_new(_key, old_data, new_data)
  return new_data unless old_data.is_a?(Hash) && new_data.is_a?(Hash)

  old_data.merge(new_data, &method(:hash_update_proc_for_new))
end
639
+
640
+ # # run block of code on evey PDF object (PDF objects are class Hash)
641
+ # def each_object(object, limit_references = true, already_visited = {}, &block)
642
+ # unless limit_references
643
+ # already_visited[object.object_id] = true
644
+ # end
645
+ # case
646
+ # when object.is_a?(Array)
647
+ # object.each {|obj| each_object(obj, limit_references, already_visited, &block)}
648
+ # when object.is_a?(Hash)
649
+ # yield(object)
650
+ # unless limit_references && object[:is_reference_only]
651
+ # object.each do |k,v|
652
+ # each_object(v, limit_references, already_visited, &block) unless already_visited[v.object_id]
653
+ # end
654
+ # end
655
+ # end
656
+ # end
657
+ end
587
658
  end