combine_pdf 0.1.23 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -6,15 +6,6 @@ module CombinePDF
6
6
  ################################################################
7
7
 
8
8
 
9
- # lists the Hash keys used for PDF objects
10
- #
11
- # the CombinePDF library doesn't use special classes for its objects (PDFPage class, PDFStream class or anything like that).
12
- #
13
- # there is only one PDF class which represents the whole of the PDF file.
14
- #
15
- # this Hash lists the private Hash keys that the CombinePDF library uses to
16
- # differentiate between complex PDF objects.
17
- PRIVATE_HASH_KEYS = [:indirect_reference_id, :indirect_generation_number, :raw_stream_content, :is_reference_only, :referenced_object, :indirect_without_dictionary]
18
9
 
19
10
  # holds a simple content stream that starts a PDF graphic state container - used for wrapping malformed PDF content streams.
20
11
  CONTENT_CONTAINER_START = { is_reference_only: true , referenced_object: {indirect_reference_id: 0, raw_stream_content: 'q'} }
@@ -225,7 +216,7 @@ module CombinePDF
225
216
  hash_with_references[k] = v unless hash_with_references[k]
226
217
  end
227
218
  end
228
- hash_with_references
219
+ hash_with_references
229
220
  end
230
221
  def change_connected_references_to_actual_values(hash_with_references = {})
231
222
  if hash_with_references.is_a?(Hash)
@@ -10,8 +10,16 @@
10
10
 
11
11
  module CombinePDF
12
12
 
13
- # This module injects methods into existing page objects
13
+ # This module injects page editing methods into existing page objects and the PDFWriter objects.
14
14
  module Page_Methods
15
+ include Renderer
16
+
17
+ # holds the string that starts a PDF graphic state container - used for wrapping malformed PDF content streams.
18
+ CONTENT_CONTAINER_START = 'q'
19
+ # holds the string that ends one PDF graphic state container and starts the next - used for wrapping malformed PDF content streams.
20
+ CONTENT_CONTAINER_MIDDLE = "Q\nq"
21
+ # holds the string that ends a PDF graphic state container - used for wrapping malformed PDF content streams.
22
+ CONTENT_CONTAINER_END = 'Q'
15
23
 
16
24
  # accessor (getter) for the secure_injection setting
17
25
  def secure_injection
@@ -21,17 +29,66 @@ module CombinePDF
21
29
  def secure_injection= safe
22
30
  @secure_injection = safe
23
31
  end
32
+ # sets secure_injection to `true` and returns self, allowing for chaining methods
33
+ def make_secure
34
+ @secure_injection = true
35
+ self
36
+ end
37
+ # sets secure_injection to `false` and returns self, allowing for chaining methods
38
+ def make_unsecure
39
+ @secure_injection = false
40
+ self
41
+ end
24
42
 
25
43
  # the injection method
26
44
  def << obj
27
- obj = secure_injection ? PDFOperations.copy_and_secure_for_injection(obj) : PDFOperations.create_deep_copy(obj)
28
- PDFOperations.inject_to_page self, obj
29
- # should add new referenced objects to the main PDF objects array,
30
- # but isn't done because the container is unknown.
31
- # This should be resolved once the container is rendered and references are renewed.
32
- # holder.add_referenced self
45
+ inject_page obj, true
46
+ end
47
+ def >> obj
48
+ inject_page obj, false
49
+ end
50
+ def inject_page obj, top = true
51
+
52
+ raise TypeError, "couldn't inject data, expecting a PDF page (Hash type)" unless obj.is_a?(Page_Methods)
53
+
54
+ obj = obj.copy #obj.copy(secure_injection)
55
+
56
+ # following the reference chain and assigning a pointer to the correct Resources object.
57
+ # (assignments of Strings, Arrays and Hashes are pointers in Ruby, unless the .dup method is called)
58
+
59
+ # injecting each of the values in the injected Page
60
+ res = resources
61
+ obj.resources.each do |key, new_val|
62
+ unless PDF::PRIVATE_HASH_KEYS.include? key # keep CombinePDF structual data intact.
63
+ if res[key].nil?
64
+ res[key] = new_val
65
+ elsif res[key].is_a?(Hash) && new_val.is_a?(Hash)
66
+ new_val.update resources[key] # make sure the old values are respected
67
+ res[key].update new_val # transfer old and new values to the injected page
68
+ end #Do nothing if array - ot is the PROC array, which is an issue
69
+ end
70
+ end
71
+ resources[:ProcSet] = [:PDF, :Text, :ImageB, :ImageC, :ImageI] # this was recommended by the ISO. 32000-1:2008
72
+
73
+ if top # if this is a stamp (overlay)
74
+ insert_content CONTENT_CONTAINER_START, 0
75
+ insert_content CONTENT_CONTAINER_MIDDLE
76
+ obj[:Contents].each {|c| insert_content c }
77
+ insert_content CONTENT_CONTAINER_END
78
+ else #if this was a watermark (underlay? would be lost if the page was scanned, as white might not be transparent)
79
+ old_contents = self[:Contents]
80
+ self[:Contents] = []
81
+ insert_content CONTENT_CONTAINER_START
82
+ obj[:Contents].each {|c| insert_content c }
83
+ insert_content CONTENT_CONTAINER_MIDDLE
84
+ old_contents.each { |c| insert_content c }
85
+ insert_content CONTENT_CONTAINER_END
86
+ end
87
+ init_contents
88
+
33
89
  self
34
90
  end
91
+
35
92
  # accessor (setter) for the :MediaBox element of the page
36
93
  # dimensions:: an Array consisting of four numbers (can be floats) setting the size of the media box.
37
94
  def mediabox=(dimensions = [0.0, 0.0, 612.0, 792.0])
@@ -40,7 +97,7 @@ module CombinePDF
40
97
 
41
98
  # accessor (getter) for the :MediaBox element of the page
42
99
  def mediabox
43
- self[:MediaBox].is_a?(Array) ? self[:MediaBox] : self[:MediaBox][:referenced_object]
100
+ actual_object self[:MediaBox]
44
101
  end
45
102
 
46
103
  # accessor (setter) for the :CropBox element of the page
@@ -51,7 +108,7 @@ module CombinePDF
51
108
 
52
109
  # accessor (getter) for the :CropBox element of the page
53
110
  def cropbox
54
- (self[:CropBox].is_a?(Array) || self[:CropBox].nil?) ? self[:CropBox] : self[:CropBox][:referenced_object]
111
+ actual_object self[:CropBox]
55
112
  end
56
113
 
57
114
  # get page size
@@ -149,7 +206,7 @@ module CombinePDF
149
206
  box_graphic_state[:LC], box_graphic_state[:LJ] = 2, 1
150
207
  end
151
208
  box_graphic_state = graphic_state box_graphic_state # adds the graphic state to Resources and gets the reference
152
- box_stream << "#{PDFOperations._object_to_pdf box_graphic_state} gs\n"
209
+ box_stream << "#{object_to_pdf box_graphic_state} gs\n"
153
210
 
154
211
  # the following line was removed for Acrobat Reader compatibility
155
212
  # box_stream << "DeviceRGB CS\nDeviceRGB cs\n"
@@ -243,7 +300,7 @@ module CombinePDF
243
300
  text_stream << "q\n"
244
301
  text_stream << "#{options[:ctm].join ' '} cm\n" if options[:ctm]
245
302
  text_graphic_state = graphic_state({ca: options[:opacity], CA: options[:opacity], LW: options[:stroke_width].to_f, LC: 2, LJ: 1, LD: 0 })
246
- text_stream << "#{PDFOperations._object_to_pdf text_graphic_state} gs\n"
303
+ text_stream << "#{object_to_pdf text_graphic_state} gs\n"
247
304
 
248
305
  # the following line was removed for Acrobat Reader compatibility
249
306
  # text_stream << "DeviceRGB CS\nDeviceRGB cs\n"
@@ -266,9 +323,9 @@ module CombinePDF
266
323
  end
267
324
  # format text object(s)
268
325
  # text_stream << "#{options[:font_color].join(' ')} rg\n" # sets the color state
269
- encode(text, fonts).each do |encoded|
326
+ encode_text(text, fonts).each do |encoded|
270
327
  text_stream << "BT\n" # the Begine Text marker
271
- text_stream << PDFOperations._format_name_to_pdf(set_font encoded[0]) # Set font name
328
+ text_stream << format_name_to_pdf(set_font encoded[0]) # Set font name
272
329
  text_stream << " #{font_size.round 3} Tf\n" # set font size and add font operator
273
330
  text_stream << "#{x.round 4} #{y.round 4} Td\n" # set location for text object
274
331
  text_stream << ( encoded[1] ) # insert the encoded string to the stream
@@ -335,9 +392,9 @@ module CombinePDF
335
392
  ctm.push( ( (x*c).abs - x*c + (y*s).abs + y*s )/2 , ( (x*s).abs - x*s + (y*c).abs - y*c )/2 )
336
393
 
337
394
  # insert the rotation stream into the current content stream
338
- insert_object "q\n#{ctm.join ' '} cm\n", 0
395
+ insert_content "q\n#{ctm.join ' '} cm\n", 0
339
396
  # close the rotation stream
340
- insert_object PDFOperations.create_deep_copy(CONTENT_CONTAINER_END)
397
+ insert_content CONTENT_CONTAINER_END
341
398
  # reset the mediabox and cropbox values - THIS IS ONLY FOR ORIENTATION CHANGE...
342
399
  if ((self[:Rotate].to_f / 90)%2) != 0
343
400
  self[:MediaBox] = self[:MediaBox].values_at(1,0,3,2)
@@ -488,6 +545,29 @@ module CombinePDF
488
545
  self
489
546
  end
490
547
 
548
+ # since only the Content streams are modified (Resource hashes are created anew),
549
+ # it should be safe (and a lot faster) to create a deep copy only for the content hashes and streams.
550
+ def copy(secure = false)
551
+ delete :Parent
552
+ prep_content_array
553
+ page_copy = self.clone
554
+ page_copy[:Contents] = page_copy[:Contents].map do |obj|
555
+ obj = obj.dup
556
+ obj[:referenced_object] = obj[:referenced_object].dup if obj[:referenced_object]
557
+ obj[:referenced_object][:raw_stream_content] = obj[:referenced_object][:raw_stream_content].dup if obj[:referenced_object] && obj[:referenced_object][:raw_stream_content]
558
+ obj
559
+ end
560
+ if page_copy[:Resources]
561
+ page_copy[:Resources] = page_copy[:Resources].dup
562
+ page_copy[:Resources][:referenced_object] = page_copy[:Resources][:referenced_object].dup if page_copy[:Resources][:referenced_object]
563
+ page_res = page_copy.resources
564
+ page_res.each do |k, v|
565
+ page_res[k] = v.dup if v.is_a?(Array) || v.is_a?(Hash)
566
+ v[:referenced_object] = v[:referenced_object].dup if v.is_a?(Hash) && v[:referenced_object]
567
+ end
568
+ end
569
+ return page_copy.instance_exec(secure) { |s| secure_for_copy if s ; init_contents; self }
570
+ end
491
571
 
492
572
  ###################################
493
573
  # protected methods
@@ -501,13 +581,14 @@ module CombinePDF
501
581
  end
502
582
  #initializes the content stream in case it was not initialized before
503
583
  def init_contents
584
+ self[:Contents].delete({ is_reference_only: true , referenced_object: {indirect_reference_id: 0, raw_stream_content: ''} })
504
585
  # wrap content streams
505
- insert_object 'q', 0
506
- insert_object 'Q'
586
+ insert_content 'q', 0
587
+ insert_content 'Q'
507
588
 
508
589
  # Prep content
509
590
  @contents = ''
510
- insert_object @contents
591
+ insert_content @contents
511
592
  @contents
512
593
  end
513
594
 
@@ -516,16 +597,21 @@ module CombinePDF
516
597
  # accepts:
517
598
  # object:: can be a string or a hash object
518
599
  # location:: can be any numeral related to the position in the :Contents array. defaults to -1 == insert at the end.
519
- def insert_object object, location = -1
600
+ def insert_content object, location = -1
520
601
  object = { is_reference_only: true , referenced_object: {indirect_reference_id: 0, raw_stream_content: object} } if object.is_a?(String)
521
602
  raise TypeError, "expected a String or Hash object." unless object.is_a?(Hash)
522
- unless self[:Contents].is_a?(Array)
523
- self[:Contents] = [ self[:Contents] ].compact
524
- end
603
+ prep_content_array
525
604
  self[:Contents].insert location, object
526
605
  self
527
606
  end
528
607
 
608
+ def prep_content_array
609
+ return self if self[:Contents].is_a?(Array)
610
+ self[:Contents] = self[:Contents][:referenced_object] if self[:Contents].is_a?(Hash) && self[:Contents][:referenced_object] && self[:Contents][:referenced_object].is_a?(Array)
611
+ self[:Contents] = [ self[:Contents] ].compact
612
+ self
613
+ end
614
+
529
615
  #returns the basic font name used internally
530
616
  def base_font_name
531
617
  @base_font_name ||= "Writer" + SecureRandom.hex(7) + "PDF"
@@ -587,7 +673,7 @@ module CombinePDF
587
673
  end
588
674
 
589
675
  # encodes the text in an array of [:font_name, <PDFHexString>] for use in textbox
590
- def encode text, fonts
676
+ def encode_text text, fonts
591
677
  # text must be a unicode string and fonts must be an array.
592
678
  # this is an internal method, don't perform tests.
593
679
  fonts_array = []
@@ -651,6 +737,181 @@ module CombinePDF
651
737
  end
652
738
  out.join.strip
653
739
  end
740
+
741
+
742
+ # copy_and_secure_for_injection(page)
743
+ # - page is a page in the pages array, i.e.
744
+ # pdf.pages[0]
745
+ # takes a page object and:
746
+ #
747
+ # makes a deep copy of the page (Ruby defaults to pointers, so this will copy the memory).
748
+ #
749
+ # then it will rewrite the content stream with renamed resources, so as to avoid name conflicts.
750
+ def secure_for_copy
751
+ # initiate dictionary from old names to new names
752
+ names_dictionary = {}
753
+
754
+ # travel every dictionary to pick up names (keys), change them and add them to the dictionary
755
+ self[:Resources].each do |k,v|
756
+ if v.is_a?(Hash)
757
+ new_dictionary = {}
758
+ new_name = "Combine" + SecureRandom.hex(7) + "PDF"
759
+ i = 1
760
+ v.each do |old_key, value|
761
+ new_key = (new_name + i.to_s).to_sym
762
+ names_dictionary[old_key] = new_key
763
+ new_dictionary[new_key] = value
764
+ i += 1
765
+ end
766
+ self[:Resources][k] = new_dictionary
767
+ end
768
+ end
769
+
770
+ # now that we have replaced the names in the resources dictionaries,
771
+ # it is time to replace the names inside the stream
772
+ # we will need to make sure we have access to the stream injected
773
+ # we will use PDFFilter.inflate_object
774
+ self[:Contents].each do |c|
775
+ stream = actual_object(c)
776
+ PDFFilter.inflate_object stream
777
+ names_dictionary.each do |old_key, new_key|
778
+ stream[:raw_stream_content].gsub! object_to_pdf(old_key), object_to_pdf(new_key) ##### PRAY(!) that the parsed datawill be correctly reproduced!
779
+ end
780
+ # # # the following code isn't needed now that we wrap both the existing and incoming content streams.
781
+ # # patch back to PDF defaults, for OCRed PDF files.
782
+ # stream[:raw_stream_content] = "q\n0 0 0 rg\n0 0 0 RG\n0 Tr\n1 0 0 1 0 0 cm\n%s\nQ\n" % stream[:raw_stream_content]
783
+ end
784
+ self
785
+ end
786
+
787
+
788
+
789
+ # ################
790
+ # ##
791
+
792
+ # def inject_to_page page = {Type: :Page, MediaBox: [0,0,612.0,792.0], Resources: {}, Contents: []}, stream = nil, top = true
793
+ # # make sure both the page reciving the new data and the injected page are of the correct data type.
794
+ # return false unless page.is_a?(Hash) && stream.is_a?(Hash)
795
+
796
+ # # following the reference chain and assigning a pointer to the correct Resouces object.
797
+ # # (assignments of Strings, Arrays and Hashes are pointers in Ruby, unless the .dup method is called)
798
+ # page[:Resources] ||= {}
799
+ # original_resources = page[:Resources]
800
+ # if original_resources[:is_reference_only]
801
+ # original_resources = original_resources[:referenced_object]
802
+ # raise "Couldn't tap into resources dictionary, as it is a reference and isn't linked." unless original_resources
803
+ # end
804
+ # original_contents = page[:Contents]
805
+ # original_contents = [original_contents] unless original_contents.is_a? Array
806
+
807
+ # stream[:Resources] ||= {}
808
+ # stream_resources = stream[:Resources]
809
+ # if stream_resources[:is_reference_only]
810
+ # stream_resources = stream_resources[:referenced_object]
811
+ # raise "Couldn't tap into resources dictionary, as it is a reference and isn't linked." unless stream_resources
812
+ # end
813
+ # stream_contents = stream[:Contents]
814
+ # stream_contents = [stream_contents] unless stream_contents.is_a? Array
815
+
816
+ # # collect keys as objects - this is to make sure that
817
+ # # we are working on the actual resource data, rather then references
818
+ # flatten_resources_dictionaries stream_resources
819
+ # flatten_resources_dictionaries original_resources
820
+
821
+ # # injecting each of the values in the injected Page
822
+ # stream_resources.each do |key, new_val|
823
+ # unless PRIVATE_HASH_KEYS.include? key # keep CombinePDF structual data intact.
824
+ # if original_resources[key].nil?
825
+ # original_resources[key] = new_val
826
+ # elsif original_resources[key].is_a?(Hash) && new_val.is_a?(Hash)
827
+ # new_val.update original_resources[key] # make sure the old values are respected
828
+ # original_resources[key].update new_val # transfer old and new values to the injected page
829
+ # end #Do nothing if array - ot is the PROC array, which is an issue
830
+ # end
831
+ # end
832
+ # original_resources[:ProcSet] = [:PDF, :Text, :ImageB, :ImageC, :ImageI] # this was recommended by the ISO. 32000-1:2008
833
+
834
+ # if top # if this is a stamp (overlay)
835
+ # page[:Contents] = original_contents
836
+ # page[:Contents].unshift create_deep_copy(CONTENT_CONTAINER_START)
837
+ # page[:Contents].push create_deep_copy(CONTENT_CONTAINER_MIDDLE)
838
+ # page[:Contents].push *stream_contents
839
+ # page[:Contents].push create_deep_copy(CONTENT_CONTAINER_END)
840
+ # else #if this was a watermark (underlay? would be lost if the page was scanned, as white might not be transparent)
841
+ # page[:Contents] = stream_contents
842
+ # page[:Contents].unshift create_deep_copy(CONTENT_CONTAINER_START)
843
+ # page[:Contents].push create_deep_copy(CONTENT_CONTAINER_MIDDLE)
844
+ # page[:Contents].push *original_contents
845
+ # page[:Contents].push create_deep_copy(CONTENT_CONTAINER_END)
846
+ # end
847
+
848
+ # page
849
+ # end
850
+ # # copy_and_secure_for_injection(page)
851
+ # # - page is a page in the pages array, i.e.
852
+ # # pdf.pages[0]
853
+ # # takes a page object and:
854
+ # #
855
+ # # makes a deep copy of the page (Ruby defaults to pointers, so this will copy the memory).
856
+ # #
857
+ # # then it will rewrite the content stream with renamed resources, so as to avoid name conflicts.
858
+ # def copy_and_secure_for_injection(page)
859
+ # # copy page
860
+ # new_page = create_deep_copy page
861
+
862
+ # # initiate dictionary from old names to new names
863
+ # names_dictionary = {}
864
+
865
+ # # itirate through all keys that are name objects and give them new names (add to dic)
866
+ # # this should be done for every dictionary in :Resources
867
+ # # this is a few steps stage:
868
+
869
+ # # 1. get resources object
870
+ # resources = new_page[:Resources]
871
+ # if resources[:is_reference_only]
872
+ # resources = resources[:referenced_object]
873
+ # raise "Couldn't tap into resources dictionary, as it is a reference and isn't linked." unless resources
874
+ # end
875
+
876
+ # # 2. establich direct access to dictionaries and remove reference values
877
+ # flatten_resources_dictionaries resources
878
+
879
+ # # 3. travel every dictionary to pick up names (keys), change them and add them to the dictionary
880
+ # resources.each do |k,v|
881
+ # if v.is_a?(Hash)
882
+ # new_dictionary = {}
883
+ # new_name = "Combine" + SecureRandom.hex(7) + "PDF"
884
+ # i = 1
885
+ # v.each do |old_key, value|
886
+ # new_key = (new_name + i.to_s).to_sym
887
+ # names_dictionary[old_key] = new_key
888
+ # new_dictionary[new_key] = value
889
+ # i += 1
890
+ # end
891
+ # resources[k] = new_dictionary
892
+ # end
893
+ # end
894
+
895
+ # # now that we have replaced the names in the resources dictionaries,
896
+ # # it is time to replace the names inside the stream
897
+ # # we will need to make sure we have access to the stream injected
898
+ # # we will user PDFFilter.inflate_object
899
+ # (new_page[:Contents].is_a?(Array) ? new_page[:Contents] : [new_page[:Contents] ]).each do |c|
900
+ # stream = c[:referenced_object]
901
+ # PDFFilter.inflate_object stream
902
+ # names_dictionary.each do |old_key, new_key|
903
+ # stream[:raw_stream_content].gsub! _object_to_pdf(old_key), _object_to_pdf(new_key) ##### PRAY(!) that the parsed datawill be correctly reproduced!
904
+ # end
905
+ # # patch back to PDF defaults, for OCRed PDF files.
906
+ # # stream[:raw_stream_content] = "q\nq\nq\nDeviceRGB CS\nDeviceRGB cs\n0 0 0 rg\n0 0 0 RG\n0 Tr\n%s\nQ\nQ\nQ\n" % stream[:raw_stream_content]
907
+ # # the following was removed for Acrobat Reader compatability: DeviceRGB CS\nDeviceRGB cs\n
908
+ # stream[:raw_stream_content] = "q\nq\nq\n0 0 0 rg\n0 0 0 RG\n0 Tr\n1 0 0 1 0 0 cm\n%s\nQ\nQ\nQ\n" % stream[:raw_stream_content]
909
+ # end
910
+
911
+ # new_page
912
+ # end
913
+
914
+
654
915
  end
655
916
 
656
917
  end
@@ -44,12 +44,13 @@ module CombinePDF
44
44
  #
45
45
  # string:: the data to be parsed, as a String object.
46
46
  def initialize (string)
47
- raise TypeError, "couldn't parse and data, expecting type String" unless string.is_a? String
47
+ raise TypeError, "couldn't parse data, expecting type String" unless string.is_a? String
48
48
  @string_to_parse = string.force_encoding(Encoding::ASCII_8BIT)
49
49
  @literal_strings = []
50
50
  @hex_strings = []
51
51
  @streams = []
52
52
  @parsed = []
53
+ @references = []
53
54
  @root_object = {}
54
55
  @info_object = {}
55
56
  @version = nil
@@ -58,6 +59,7 @@ module CombinePDF
58
59
 
59
60
  # parse the data in the new parser (the data already set through the initialize / new method)
60
61
  def parse
62
+ return [] if @string_to_parse.empty?
61
63
  return @parsed unless @parsed.empty?
62
64
  @scanner = StringScanner.new @string_to_parse
63
65
  @scanner.pos = 0
@@ -76,8 +78,8 @@ module CombinePDF
76
78
  raise "root is unknown - cannot determine if file is Encrypted" if @root_object == {}
77
79
 
78
80
  if @root_object[:Encrypt]
79
- PDFOperations.change_references_to_actual_values @parsed, @root_object
80
- warn "PDF is Encrypted! Attempting to unencrypt - not yet fully supported."
81
+ change_references_to_actual_values @root_object
82
+ warn "PDF is Encrypted! Attempting to decrypt - not yet fully supported."
81
83
  decryptor = PDFDecrypt.new @parsed, @root_object
82
84
  decryptor.decrypt
83
85
  #do we really need to apply to @parsed? No, there is no need.
@@ -106,21 +108,32 @@ module CombinePDF
106
108
  @parsed << stream_data.shift
107
109
  end
108
110
  end
109
- # ## remove object streams
110
- @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :ObjStm}
111
- # ## remove XREF dictionaries
112
- @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :XRef}
113
111
  end
114
112
 
115
- PDFOperations.change_references_to_actual_values @parsed, @root_object
116
- @info_object = @root_object[:Info]
113
+
114
+ # serialize_objects_and_references.catalog_pages
115
+
116
+ # Benchmark.bm do |bm|
117
+ # bm.report("serialize") {1000.times {serialize_objects_and_references} }
118
+ # bm.report("serialize - old") {1000.times {old_serialize_objects_and_references} }
119
+ # bm.report("catalog") {1000.times {catalog_pages} }
120
+ # end
121
+
122
+ serialize_objects_and_references.catalog_pages
123
+
124
+ @info_object = @root_object[:Info] ? (@root_object[:Info][:referenced_object] || @root_object[:Info]) : false
117
125
  if @info_object && @info_object.is_a?(Hash)
118
126
  @parsed.delete @info_object
119
- PDFOperations.change_references_to_actual_values @parsed, @info_object
120
- PRIVATE_HASH_KEYS.each {|key| @info_object.delete key}
127
+ CombinePDF::PDF::PRIVATE_HASH_KEYS.each {|key| @info_object.delete key}
128
+ @info_object.each {|k, v| @info_object = v[:referenced_object] if v.is_a?(Hash) && v[:referenced_object]}
121
129
  else
122
130
  @info_object = {}
123
131
  end
132
+ # # # ## remove object streams - if they exist
133
+ # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :ObjStm}
134
+ # # # ## remove XREF dictionaries - if they exist
135
+ # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :XRef}
136
+
124
137
  @parsed
125
138
  end
126
139
 
@@ -189,7 +202,7 @@ module CombinePDF
189
202
  ##########################################
190
203
  when @scanner.scan(/\(/)
191
204
  # warn "Found a literal string"
192
- str = ''
205
+ str = ''.force_encoding(Encoding::ASCII_8BIT)
193
206
  count = 1
194
207
  while count > 0 && @scanner.rest? do
195
208
  str += @scanner.scan_until(/[\(\)]/).to_s
@@ -209,8 +222,8 @@ module CombinePDF
209
222
  end
210
223
  end
211
224
  # The PDF formatted string is: str[0..-2]
212
- # now staring to convert to regular string
213
- str_bytes = str[0..-2].bytes.to_a
225
+ # now starting to convert to regular string
226
+ str_bytes = str.force_encoding(Encoding::ASCII_8BIT)[0..-2].bytes.to_a
214
227
  str = []
215
228
  until str_bytes.empty?
216
229
  case str_bytes[0]
@@ -260,7 +273,7 @@ module CombinePDF
260
273
  str << str_bytes.shift
261
274
  end
262
275
  end
263
- out << str.pack('C*')
276
+ out << str.pack('C*').force_encoding(Encoding::ASCII_8BIT)
264
277
  ##########################################
265
278
  ## Parse a comment
266
279
  ##########################################
@@ -286,6 +299,7 @@ module CombinePDF
286
299
  ##########################################
287
300
  when @scanner.scan(/R/)
288
301
  out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
302
+ @references << out.last
289
303
  ##########################################
290
304
  ## Parse Bool - true and after false
291
305
  ##########################################
@@ -329,5 +343,176 @@ module CombinePDF
329
343
  end
330
344
  out
331
345
  end
346
+
347
+ protected
348
+
349
+
350
+
351
+ # resets cataloging and pages
352
+ def catalog_pages(catalogs = nil, secure_injection = true, inheritance_hash = {})
353
+ unless catalogs
354
+
355
+ if root_object[:Root]
356
+ catalogs = root_object[:Root][:referenced_object] || root_object[:Root]
357
+ else
358
+ catalogs = (@parsed.select {|obj| obj[:Type] == :Catalog}).last
359
+ end
360
+ @parsed.delete_if {|obj| obj[:Type] == :Catalog}
361
+ @parsed << catalogs
362
+
363
+ raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs
364
+ end
365
+ case
366
+ when catalogs.is_a?(Array)
367
+ catalogs.each {|c| catalog_pages(c, secure_injection, inheritance_hash ) unless c.nil?}
368
+ when catalogs.is_a?(Hash)
369
+ if catalogs[:is_reference_only]
370
+ if catalogs[:referenced_object]
371
+ catalog_pages(catalogs[:referenced_object], secure_injection, inheritance_hash)
372
+ else
373
+ warn "couldn't follow reference!!! #{catalogs} not found!"
374
+ end
375
+ else
376
+ unless catalogs[:Type] == :Page
377
+ raise "Optional Content PDF files aren't supported and their pages cannot be safely extracted." if catalogs[:AS] || catalogs[:OCProperties]
378
+ inheritance_hash[:MediaBox] = catalogs[:MediaBox] if catalogs[:MediaBox]
379
+ inheritance_hash[:CropBox] = catalogs[:CropBox] if catalogs[:CropBox]
380
+ inheritance_hash[:Rotate] = catalogs[:Rotate] if catalogs[:Rotate]
381
+ (inheritance_hash[:Resources] ||= {}).update( (catalogs[:Resources][:referenced_object] || catalogs[:Resources]), &self.class.method(:hash_update_proc_for_new) ) if catalogs[:Resources]
382
+ (inheritance_hash[:ColorSpace] ||= {}).update( (catalogs[:ColorSpace][:referenced_object] || catalogs[:ColorSpace]), &self.class.method(:hash_update_proc_for_new) ) if catalogs[:ColorSpace]
383
+
384
+ # inheritance_hash[:Order] = catalogs[:Order] if catalogs[:Order]
385
+ # inheritance_hash[:OCProperties] = catalogs[:OCProperties] if catalogs[:OCProperties]
386
+ # inheritance_hash[:AS] = catalogs[:AS] if catalogs[:AS]
387
+ end
388
+
389
+ case catalogs[:Type]
390
+ when :Page
391
+
392
+ catalogs[:MediaBox] ||= inheritance_hash[:MediaBox] if inheritance_hash[:MediaBox]
393
+ catalogs[:CropBox] ||= inheritance_hash[:CropBox] if inheritance_hash[:CropBox]
394
+ catalogs[:Rotate] ||= inheritance_hash[:Rotate] if inheritance_hash[:Rotate]
395
+ (catalogs[:Resources] ||= {}).update( inheritance_hash[:Resources], &( self.class.method(:hash_update_proc_for_old) ) ) if inheritance_hash[:Resources]
396
+ (catalogs[:ColorSpace] ||= {}).update( inheritance_hash[:ColorSpace], &( self.class.method(:hash_update_proc_for_old) ) ) if inheritance_hash[:ColorSpace]
397
+ # catalogs[:Order] ||= inheritance_hash[:Order] if inheritance_hash[:Order]
398
+ # catalogs[:AS] ||= inheritance_hash[:AS] if inheritance_hash[:AS]
399
+ # catalogs[:OCProperties] ||= inheritance_hash[:OCProperties] if inheritance_hash[:OCProperties]
400
+
401
+
402
+ # avoid references on MediaBox, CropBox and Rotate
403
+ catalogs[:MediaBox] = catalogs[:MediaBox][:referenced_object][:indirect_without_dictionary] if catalogs[:MediaBox].is_a?(Hash) && catalogs[:MediaBox][:referenced_object].is_a?(Hash) && catalogs[:MediaBox][:referenced_object][:indirect_without_dictionary]
404
+ catalogs[:CropBox] = catalogs[:CropBox][:referenced_object][:indirect_without_dictionary] if catalogs[:CropBox].is_a?(Hash) && catalogs[:CropBox][:referenced_object].is_a?(Hash) && catalogs[:CropBox][:referenced_object][:indirect_without_dictionary]
405
+ catalogs[:Rotate] = catalogs[:Rotate][:referenced_object][:indirect_without_dictionary] if catalogs[:Rotate].is_a?(Hash) && catalogs[:Rotate][:referenced_object].is_a?(Hash) && catalogs[:Rotate][:referenced_object][:indirect_without_dictionary]
406
+
407
+ catalogs.instance_eval {extend Page_Methods}
408
+ catalogs.secure_injection = secure_injection
409
+ when :Pages
410
+ catalog_pages(catalogs[:Kids], secure_injection, inheritance_hash.dup ) unless catalogs[:Kids].nil?
411
+ when :Catalog
412
+ catalog_pages(catalogs[:Pages], secure_injection, inheritance_hash.dup ) unless catalogs[:Pages].nil?
413
+ end
414
+ end
415
+ end
416
+ self
417
+ end
418
+
419
+ # fails!
420
+ def change_references_to_actual_values(hash_with_references = {})
421
+ hash_with_references.each do |k,v|
422
+ if v.is_a?(Hash) && v[:is_reference_only]
423
+ hash_with_references[k] = get_refernced_object(v)
424
+ hash_with_references[k] = hash_with_references[k][:indirect_without_dictionary] if hash_with_references[k].is_a?(Hash) && hash_with_references[k][:indirect_without_dictionary]
425
+ warn "Couldn't connect all values from references - didn't find reference #{hash_with_references}!!!" if hash_with_references[k] == nil
426
+ hash_with_references[k] = v unless hash_with_references[k]
427
+ end
428
+ end
429
+ hash_with_references
430
+ end
431
+
432
+ def get_refernced_object(reference_hash = {})
433
+ @parsed.each do |stored_object|
434
+ return stored_object if ( stored_object.is_a?(Hash) &&
435
+ reference_hash[:indirect_reference_id] == stored_object[:indirect_reference_id] &&
436
+ reference_hash[:indirect_generation_number] == stored_object[:indirect_generation_number] )
437
+ end
438
+ warn "didn't find reference #{reference_hash}"
439
+ nil
440
+ end
441
+
442
+ # @private
443
+ # connects references and objects, according to their reference id's.
444
+ #
445
+ # should be moved to the parser's workflow.
446
+ #
447
+ def serialize_objects_and_references
448
+ obj_dir = {}
449
+ @parsed.each {|o| obj_dir[ [ o.delete(:indirect_reference_id), o.delete(:indirect_generation_number) ] ] = o }
450
+ # @parsed.each {|o| obj_dir[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
451
+ @references.each do |obj|
452
+ obj[:referenced_object] = obj_dir[ [obj[:indirect_reference_id], obj[:indirect_generation_number] ] ]
453
+ warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj.to_s}" unless obj[:referenced_object]
454
+ obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
455
+ end
456
+ self
457
+ end
458
+
459
+ # @private
460
+ # this method reviews a Hash and updates it by merging Hash data,
461
+ # preferring the old over the new.
462
+ def self.hash_update_proc_for_old key, old_data, new_data
463
+ if old_data.is_a? Hash
464
+ old_data.merge( new_data, &self.method(:hash_update_proc_for_old) )
465
+ else
466
+ old_data
467
+ end
468
+ end
469
+ # @private
470
+ # this method reviews a Hash and updates it by merging Hash data,
471
+ # preferring the new over the old.
472
+ def self.hash_update_proc_for_new key, old_data, new_data
473
+ if old_data.is_a? Hash
474
+ old_data.merge( new_data, &self.method(:hash_update_proc_for_new) )
475
+ else
476
+ new_data
477
+ end
478
+ end
479
+
480
+ # # @private
481
+ # # connects references and objects, according to their reference id's.
482
+ # #
483
+ # # should be moved to the parser's workflow.
484
+ # #
485
+ # def old_serialize_objects_and_references(object = nil)
486
+ # objects_reference_hash = {}
487
+ # # @parsed.each {|o| objects_reference_hash[ [ o.delete(:indirect_reference_id), o.delete(:indirect_generation_number) ] ] = o }
488
+ # @parsed.each {|o| objects_reference_hash[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
489
+ # each_object(@parsed) do |obj|
490
+ # if obj[:is_reference_only]
491
+ # obj[:referenced_object] = objects_reference_hash[ [obj[:indirect_reference_id], obj[:indirect_generation_number] ] ]
492
+ # warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj.to_s}" unless obj[:referenced_object]
493
+ # # obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
494
+ # end
495
+ # end
496
+ # self
497
+ # end
498
+
499
+ # # run block of code on evey PDF object (PDF objects are class Hash)
500
+ # def each_object(object, limit_references = true, already_visited = {}, &block)
501
+ # unless limit_references
502
+ # already_visited[object.object_id] = true
503
+ # end
504
+ # case
505
+ # when object.is_a?(Array)
506
+ # object.each {|obj| each_object(obj, limit_references, already_visited, &block)}
507
+ # when object.is_a?(Hash)
508
+ # yield(object)
509
+ # unless limit_references && object[:is_reference_only]
510
+ # object.each do |k,v|
511
+ # each_object(v, limit_references, already_visited, &block) unless already_visited[v.object_id]
512
+ # end
513
+ # end
514
+ # end
515
+ # end
516
+
332
517
  end
333
518
  end