combine_pdf 0.1.23 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,15 +6,6 @@ module CombinePDF
6
6
  ################################################################
7
7
 
8
8
 
9
- # lists the Hash keys used for PDF objects
10
- #
11
- # the CombinePDF library doesn't use special classes for its objects (PDFPage class, PDFStream class or anything like that).
12
- #
13
- # there is only one PDF class which represents the whole of the PDF file.
14
- #
15
- # this Hash lists the private Hash keys that the CombinePDF library uses to
16
- # differentiate between complex PDF objects.
17
- PRIVATE_HASH_KEYS = [:indirect_reference_id, :indirect_generation_number, :raw_stream_content, :is_reference_only, :referenced_object, :indirect_without_dictionary]
18
9
 
19
10
  # holds a simple content stream that starts a PDF graphic state container - used for wrapping malformed PDF content streams.
20
11
  CONTENT_CONTAINER_START = { is_reference_only: true , referenced_object: {indirect_reference_id: 0, raw_stream_content: 'q'} }
@@ -225,7 +216,7 @@ module CombinePDF
225
216
  hash_with_references[k] = v unless hash_with_references[k]
226
217
  end
227
218
  end
228
- hash_with_references
219
+ hash_with_references
229
220
  end
230
221
  def change_connected_references_to_actual_values(hash_with_references = {})
231
222
  if hash_with_references.is_a?(Hash)
@@ -10,8 +10,16 @@
10
10
 
11
11
  module CombinePDF
12
12
 
13
- # This module injects methods into existing page objects
13
+ # This module injects page editing methods into existing page objects and the PDFWriter objects.
14
14
  module Page_Methods
15
+ include Renderer
16
+
17
+ # holds the string that starts a PDF graphic state container - used for wrapping malformed PDF content streams.
18
+ CONTENT_CONTAINER_START = 'q'
19
+ # holds the string that ends one PDF graphic state container and starts the next - used for wrapping malformed PDF content streams.
20
+ CONTENT_CONTAINER_MIDDLE = "Q\nq"
21
+ # holds the string that ends a PDF graphic state container - used for wrapping malformed PDF content streams.
22
+ CONTENT_CONTAINER_END = 'Q'
15
23
 
16
24
  # accessor (getter) for the secure_injection setting
17
25
  def secure_injection
@@ -21,17 +29,66 @@ module CombinePDF
21
29
  def secure_injection= safe
22
30
  @secure_injection = safe
23
31
  end
32
+ # sets secure_injection to `true` and returns self, allowing for chaining methods
33
+ def make_secure
34
+ @secure_injection = true
35
+ self
36
+ end
37
+ # sets secure_injection to `false` and returns self, allowing for chaining methods
38
+ def make_unsecure
39
+ @secure_injection = false
40
+ self
41
+ end
24
42
 
25
43
  # the injection method
26
44
  def << obj
27
- obj = secure_injection ? PDFOperations.copy_and_secure_for_injection(obj) : PDFOperations.create_deep_copy(obj)
28
- PDFOperations.inject_to_page self, obj
29
- # should add new referenced objects to the main PDF objects array,
30
- # but isn't done because the container is unknown.
31
- # This should be resolved once the container is rendered and references are renewed.
32
- # holder.add_referenced self
45
+ inject_page obj, true
46
+ end
47
+ def >> obj
48
+ inject_page obj, false
49
+ end
50
+ def inject_page obj, top = true
51
+
52
+ raise TypeError, "couldn't inject data, expecting a PDF page (Hash type)" unless obj.is_a?(Page_Methods)
53
+
54
+ obj = obj.copy #obj.copy(secure_injection)
55
+
56
+ # following the reference chain and assigning a pointer to the correct Resources object.
57
+ # (assignments of Strings, Arrays and Hashes are pointers in Ruby, unless the .dup method is called)
58
+
59
+ # injecting each of the values in the injected Page
60
+ res = resources
61
+ obj.resources.each do |key, new_val|
62
+ unless PDF::PRIVATE_HASH_KEYS.include? key # keep CombinePDF structual data intact.
63
+ if res[key].nil?
64
+ res[key] = new_val
65
+ elsif res[key].is_a?(Hash) && new_val.is_a?(Hash)
66
+ new_val.update resources[key] # make sure the old values are respected
67
+ res[key].update new_val # transfer old and new values to the injected page
68
+ end #Do nothing if array - ot is the PROC array, which is an issue
69
+ end
70
+ end
71
+ resources[:ProcSet] = [:PDF, :Text, :ImageB, :ImageC, :ImageI] # this was recommended by the ISO. 32000-1:2008
72
+
73
+ if top # if this is a stamp (overlay)
74
+ insert_content CONTENT_CONTAINER_START, 0
75
+ insert_content CONTENT_CONTAINER_MIDDLE
76
+ obj[:Contents].each {|c| insert_content c }
77
+ insert_content CONTENT_CONTAINER_END
78
+ else #if this was a watermark (underlay? would be lost if the page was scanned, as white might not be transparent)
79
+ old_contents = self[:Contents]
80
+ self[:Contents] = []
81
+ insert_content CONTENT_CONTAINER_START
82
+ obj[:Contents].each {|c| insert_content c }
83
+ insert_content CONTENT_CONTAINER_MIDDLE
84
+ old_contents.each { |c| insert_content c }
85
+ insert_content CONTENT_CONTAINER_END
86
+ end
87
+ init_contents
88
+
33
89
  self
34
90
  end
91
+
35
92
  # accessor (setter) for the :MediaBox element of the page
36
93
  # dimensions:: an Array consisting of four numbers (can be floats) setting the size of the media box.
37
94
  def mediabox=(dimensions = [0.0, 0.0, 612.0, 792.0])
@@ -40,7 +97,7 @@ module CombinePDF
40
97
 
41
98
  # accessor (getter) for the :MediaBox element of the page
42
99
  def mediabox
43
- self[:MediaBox].is_a?(Array) ? self[:MediaBox] : self[:MediaBox][:referenced_object]
100
+ actual_object self[:MediaBox]
44
101
  end
45
102
 
46
103
  # accessor (setter) for the :CropBox element of the page
@@ -51,7 +108,7 @@ module CombinePDF
51
108
 
52
109
  # accessor (getter) for the :CropBox element of the page
53
110
  def cropbox
54
- (self[:CropBox].is_a?(Array) || self[:CropBox].nil?) ? self[:CropBox] : self[:CropBox][:referenced_object]
111
+ actual_object self[:CropBox]
55
112
  end
56
113
 
57
114
  # get page size
@@ -149,7 +206,7 @@ module CombinePDF
149
206
  box_graphic_state[:LC], box_graphic_state[:LJ] = 2, 1
150
207
  end
151
208
  box_graphic_state = graphic_state box_graphic_state # adds the graphic state to Resources and gets the reference
152
- box_stream << "#{PDFOperations._object_to_pdf box_graphic_state} gs\n"
209
+ box_stream << "#{object_to_pdf box_graphic_state} gs\n"
153
210
 
154
211
  # the following line was removed for Acrobat Reader compatibility
155
212
  # box_stream << "DeviceRGB CS\nDeviceRGB cs\n"
@@ -243,7 +300,7 @@ module CombinePDF
243
300
  text_stream << "q\n"
244
301
  text_stream << "#{options[:ctm].join ' '} cm\n" if options[:ctm]
245
302
  text_graphic_state = graphic_state({ca: options[:opacity], CA: options[:opacity], LW: options[:stroke_width].to_f, LC: 2, LJ: 1, LD: 0 })
246
- text_stream << "#{PDFOperations._object_to_pdf text_graphic_state} gs\n"
303
+ text_stream << "#{object_to_pdf text_graphic_state} gs\n"
247
304
 
248
305
  # the following line was removed for Acrobat Reader compatibility
249
306
  # text_stream << "DeviceRGB CS\nDeviceRGB cs\n"
@@ -266,9 +323,9 @@ module CombinePDF
266
323
  end
267
324
  # format text object(s)
268
325
  # text_stream << "#{options[:font_color].join(' ')} rg\n" # sets the color state
269
- encode(text, fonts).each do |encoded|
326
+ encode_text(text, fonts).each do |encoded|
270
327
  text_stream << "BT\n" # the Begine Text marker
271
- text_stream << PDFOperations._format_name_to_pdf(set_font encoded[0]) # Set font name
328
+ text_stream << format_name_to_pdf(set_font encoded[0]) # Set font name
272
329
  text_stream << " #{font_size.round 3} Tf\n" # set font size and add font operator
273
330
  text_stream << "#{x.round 4} #{y.round 4} Td\n" # set location for text object
274
331
  text_stream << ( encoded[1] ) # insert the encoded string to the stream
@@ -335,9 +392,9 @@ module CombinePDF
335
392
  ctm.push( ( (x*c).abs - x*c + (y*s).abs + y*s )/2 , ( (x*s).abs - x*s + (y*c).abs - y*c )/2 )
336
393
 
337
394
  # insert the rotation stream into the current content stream
338
- insert_object "q\n#{ctm.join ' '} cm\n", 0
395
+ insert_content "q\n#{ctm.join ' '} cm\n", 0
339
396
  # close the rotation stream
340
- insert_object PDFOperations.create_deep_copy(CONTENT_CONTAINER_END)
397
+ insert_content CONTENT_CONTAINER_END
341
398
  # reset the mediabox and cropbox values - THIS IS ONLY FOR ORIENTATION CHANGE...
342
399
  if ((self[:Rotate].to_f / 90)%2) != 0
343
400
  self[:MediaBox] = self[:MediaBox].values_at(1,0,3,2)
@@ -488,6 +545,29 @@ module CombinePDF
488
545
  self
489
546
  end
490
547
 
548
+ # since only the Content streams are modified (Resource hashes are created anew),
549
+ # it should be safe (and a lot faster) to create a deep copy only for the content hashes and streams.
550
+ def copy(secure = false)
551
+ delete :Parent
552
+ prep_content_array
553
+ page_copy = self.clone
554
+ page_copy[:Contents] = page_copy[:Contents].map do |obj|
555
+ obj = obj.dup
556
+ obj[:referenced_object] = obj[:referenced_object].dup if obj[:referenced_object]
557
+ obj[:referenced_object][:raw_stream_content] = obj[:referenced_object][:raw_stream_content].dup if obj[:referenced_object] && obj[:referenced_object][:raw_stream_content]
558
+ obj
559
+ end
560
+ if page_copy[:Resources]
561
+ page_copy[:Resources] = page_copy[:Resources].dup
562
+ page_copy[:Resources][:referenced_object] = page_copy[:Resources][:referenced_object].dup if page_copy[:Resources][:referenced_object]
563
+ page_res = page_copy.resources
564
+ page_res.each do |k, v|
565
+ page_res[k] = v.dup if v.is_a?(Array) || v.is_a?(Hash)
566
+ v[:referenced_object] = v[:referenced_object].dup if v.is_a?(Hash) && v[:referenced_object]
567
+ end
568
+ end
569
+ return page_copy.instance_exec(secure) { |s| secure_for_copy if s ; init_contents; self }
570
+ end
491
571
 
492
572
  ###################################
493
573
  # protected methods
@@ -501,13 +581,14 @@ module CombinePDF
501
581
  end
502
582
  #initializes the content stream in case it was not initialized before
503
583
  def init_contents
584
+ self[:Contents].delete({ is_reference_only: true , referenced_object: {indirect_reference_id: 0, raw_stream_content: ''} })
504
585
  # wrap content streams
505
- insert_object 'q', 0
506
- insert_object 'Q'
586
+ insert_content 'q', 0
587
+ insert_content 'Q'
507
588
 
508
589
  # Prep content
509
590
  @contents = ''
510
- insert_object @contents
591
+ insert_content @contents
511
592
  @contents
512
593
  end
513
594
 
@@ -516,16 +597,21 @@ module CombinePDF
516
597
  # accepts:
517
598
  # object:: can be a string or a hash object
518
599
  # location:: can be any numeral related to the position in the :Contents array. defaults to -1 == insert at the end.
519
- def insert_object object, location = -1
600
+ def insert_content object, location = -1
520
601
  object = { is_reference_only: true , referenced_object: {indirect_reference_id: 0, raw_stream_content: object} } if object.is_a?(String)
521
602
  raise TypeError, "expected a String or Hash object." unless object.is_a?(Hash)
522
- unless self[:Contents].is_a?(Array)
523
- self[:Contents] = [ self[:Contents] ].compact
524
- end
603
+ prep_content_array
525
604
  self[:Contents].insert location, object
526
605
  self
527
606
  end
528
607
 
608
+ def prep_content_array
609
+ return self if self[:Contents].is_a?(Array)
610
+ self[:Contents] = self[:Contents][:referenced_object] if self[:Contents].is_a?(Hash) && self[:Contents][:referenced_object] && self[:Contents][:referenced_object].is_a?(Array)
611
+ self[:Contents] = [ self[:Contents] ].compact
612
+ self
613
+ end
614
+
529
615
  #returns the basic font name used internally
530
616
  def base_font_name
531
617
  @base_font_name ||= "Writer" + SecureRandom.hex(7) + "PDF"
@@ -587,7 +673,7 @@ module CombinePDF
587
673
  end
588
674
 
589
675
  # encodes the text in an array of [:font_name, <PDFHexString>] for use in textbox
590
- def encode text, fonts
676
+ def encode_text text, fonts
591
677
  # text must be a unicode string and fonts must be an array.
592
678
  # this is an internal method, don't perform tests.
593
679
  fonts_array = []
@@ -651,6 +737,181 @@ module CombinePDF
651
737
  end
652
738
  out.join.strip
653
739
  end
740
+
741
+
742
+ # secure_for_copy
743
+ # - called on a page object (self), such as a page in the pages array, i.e.
744
+ # pdf.pages[0]
745
+ # takes the page object and:
746
+ #
747
+ # gives new, randomly generated names to the keys inside each of the page's Resources dictionaries (no copy is made here - the copy method calls this on the already copied page).
748
+ #
749
+ # then it will rewrite the content streams with the renamed resources, so as to avoid name conflicts.
750
+ def secure_for_copy
751
+ # initiate dictionary from old names to new names
752
+ names_dictionary = {}
753
+
754
+ # travel every dictionary to pick up names (keys), change them and add them to the dictionary
755
+ self[:Resources].each do |k,v|
756
+ if v.is_a?(Hash)
757
+ new_dictionary = {}
758
+ new_name = "Combine" + SecureRandom.hex(7) + "PDF"
759
+ i = 1
760
+ v.each do |old_key, value|
761
+ new_key = (new_name + i.to_s).to_sym
762
+ names_dictionary[old_key] = new_key
763
+ new_dictionary[new_key] = value
764
+ i += 1
765
+ end
766
+ self[:Resources][k] = new_dictionary
767
+ end
768
+ end
769
+
770
+ # now that we have replaced the names in the resources dictionaries,
771
+ # it is time to replace the names inside the stream
772
+ # we will need to make sure we have access to the stream injected
773
+ # we will use PDFFilter.inflate_object
774
+ self[:Contents].each do |c|
775
+ stream = actual_object(c)
776
+ PDFFilter.inflate_object stream
777
+ names_dictionary.each do |old_key, new_key|
778
+ stream[:raw_stream_content].gsub! object_to_pdf(old_key), object_to_pdf(new_key) ##### PRAY(!) that the parsed datawill be correctly reproduced!
779
+ end
780
+ # # # the following code isn't needed now that we wrap both the existing and incoming content streams.
781
+ # # patch back to PDF defaults, for OCRed PDF files.
782
+ # stream[:raw_stream_content] = "q\n0 0 0 rg\n0 0 0 RG\n0 Tr\n1 0 0 1 0 0 cm\n%s\nQ\n" % stream[:raw_stream_content]
783
+ end
784
+ self
785
+ end
786
+
787
+
788
+
789
+ # ################
790
+ # ##
791
+
792
+ # def inject_to_page page = {Type: :Page, MediaBox: [0,0,612.0,792.0], Resources: {}, Contents: []}, stream = nil, top = true
793
+ # # make sure both the page reciving the new data and the injected page are of the correct data type.
794
+ # return false unless page.is_a?(Hash) && stream.is_a?(Hash)
795
+
796
+ # # following the reference chain and assigning a pointer to the correct Resouces object.
797
+ # # (assignments of Strings, Arrays and Hashes are pointers in Ruby, unless the .dup method is called)
798
+ # page[:Resources] ||= {}
799
+ # original_resources = page[:Resources]
800
+ # if original_resources[:is_reference_only]
801
+ # original_resources = original_resources[:referenced_object]
802
+ # raise "Couldn't tap into resources dictionary, as it is a reference and isn't linked." unless original_resources
803
+ # end
804
+ # original_contents = page[:Contents]
805
+ # original_contents = [original_contents] unless original_contents.is_a? Array
806
+
807
+ # stream[:Resources] ||= {}
808
+ # stream_resources = stream[:Resources]
809
+ # if stream_resources[:is_reference_only]
810
+ # stream_resources = stream_resources[:referenced_object]
811
+ # raise "Couldn't tap into resources dictionary, as it is a reference and isn't linked." unless stream_resources
812
+ # end
813
+ # stream_contents = stream[:Contents]
814
+ # stream_contents = [stream_contents] unless stream_contents.is_a? Array
815
+
816
+ # # collect keys as objects - this is to make sure that
817
+ # # we are working on the actual resource data, rather then references
818
+ # flatten_resources_dictionaries stream_resources
819
+ # flatten_resources_dictionaries original_resources
820
+
821
+ # # injecting each of the values in the injected Page
822
+ # stream_resources.each do |key, new_val|
823
+ # unless PRIVATE_HASH_KEYS.include? key # keep CombinePDF structual data intact.
824
+ # if original_resources[key].nil?
825
+ # original_resources[key] = new_val
826
+ # elsif original_resources[key].is_a?(Hash) && new_val.is_a?(Hash)
827
+ # new_val.update original_resources[key] # make sure the old values are respected
828
+ # original_resources[key].update new_val # transfer old and new values to the injected page
829
+ # end #Do nothing if array - ot is the PROC array, which is an issue
830
+ # end
831
+ # end
832
+ # original_resources[:ProcSet] = [:PDF, :Text, :ImageB, :ImageC, :ImageI] # this was recommended by the ISO. 32000-1:2008
833
+
834
+ # if top # if this is a stamp (overlay)
835
+ # page[:Contents] = original_contents
836
+ # page[:Contents].unshift create_deep_copy(CONTENT_CONTAINER_START)
837
+ # page[:Contents].push create_deep_copy(CONTENT_CONTAINER_MIDDLE)
838
+ # page[:Contents].push *stream_contents
839
+ # page[:Contents].push create_deep_copy(CONTENT_CONTAINER_END)
840
+ # else #if this was a watermark (underlay? would be lost if the page was scanned, as white might not be transparent)
841
+ # page[:Contents] = stream_contents
842
+ # page[:Contents].unshift create_deep_copy(CONTENT_CONTAINER_START)
843
+ # page[:Contents].push create_deep_copy(CONTENT_CONTAINER_MIDDLE)
844
+ # page[:Contents].push *original_contents
845
+ # page[:Contents].push create_deep_copy(CONTENT_CONTAINER_END)
846
+ # end
847
+
848
+ # page
849
+ # end
850
+ # # copy_and_secure_for_injection(page)
851
+ # # - page is a page in the pages array, i.e.
852
+ # # pdf.pages[0]
853
+ # # takes a page object and:
854
+ # #
855
+ # # makes a deep copy of the page (Ruby defaults to pointers, so this will copy the memory).
856
+ # #
857
+ # # then it will rewrite the content stream with renamed resources, so as to avoid name conflicts.
858
+ # def copy_and_secure_for_injection(page)
859
+ # # copy page
860
+ # new_page = create_deep_copy page
861
+
862
+ # # initiate dictionary from old names to new names
863
+ # names_dictionary = {}
864
+
865
+ # # itirate through all keys that are name objects and give them new names (add to dic)
866
+ # # this should be done for every dictionary in :Resources
867
+ # # this is a few steps stage:
868
+
869
+ # # 1. get resources object
870
+ # resources = new_page[:Resources]
871
+ # if resources[:is_reference_only]
872
+ # resources = resources[:referenced_object]
873
+ # raise "Couldn't tap into resources dictionary, as it is a reference and isn't linked." unless resources
874
+ # end
875
+
876
+ # # 2. establich direct access to dictionaries and remove reference values
877
+ # flatten_resources_dictionaries resources
878
+
879
+ # # 3. travel every dictionary to pick up names (keys), change them and add them to the dictionary
880
+ # resources.each do |k,v|
881
+ # if v.is_a?(Hash)
882
+ # new_dictionary = {}
883
+ # new_name = "Combine" + SecureRandom.hex(7) + "PDF"
884
+ # i = 1
885
+ # v.each do |old_key, value|
886
+ # new_key = (new_name + i.to_s).to_sym
887
+ # names_dictionary[old_key] = new_key
888
+ # new_dictionary[new_key] = value
889
+ # i += 1
890
+ # end
891
+ # resources[k] = new_dictionary
892
+ # end
893
+ # end
894
+
895
+ # # now that we have replaced the names in the resources dictionaries,
896
+ # # it is time to replace the names inside the stream
897
+ # # we will need to make sure we have access to the stream injected
898
+ # # we will user PDFFilter.inflate_object
899
+ # (new_page[:Contents].is_a?(Array) ? new_page[:Contents] : [new_page[:Contents] ]).each do |c|
900
+ # stream = c[:referenced_object]
901
+ # PDFFilter.inflate_object stream
902
+ # names_dictionary.each do |old_key, new_key|
903
+ # stream[:raw_stream_content].gsub! _object_to_pdf(old_key), _object_to_pdf(new_key) ##### PRAY(!) that the parsed datawill be correctly reproduced!
904
+ # end
905
+ # # patch back to PDF defaults, for OCRed PDF files.
906
+ # # stream[:raw_stream_content] = "q\nq\nq\nDeviceRGB CS\nDeviceRGB cs\n0 0 0 rg\n0 0 0 RG\n0 Tr\n%s\nQ\nQ\nQ\n" % stream[:raw_stream_content]
907
+ # # the following was removed for Acrobat Reader compatability: DeviceRGB CS\nDeviceRGB cs\n
908
+ # stream[:raw_stream_content] = "q\nq\nq\n0 0 0 rg\n0 0 0 RG\n0 Tr\n1 0 0 1 0 0 cm\n%s\nQ\nQ\nQ\n" % stream[:raw_stream_content]
909
+ # end
910
+
911
+ # new_page
912
+ # end
913
+
914
+
654
915
  end
655
916
 
656
917
  end
@@ -44,12 +44,13 @@ module CombinePDF
44
44
  #
45
45
  # string:: the data to be parsed, as a String object.
46
46
  def initialize (string)
47
- raise TypeError, "couldn't parse and data, expecting type String" unless string.is_a? String
47
+ raise TypeError, "couldn't parse data, expecting type String" unless string.is_a? String
48
48
  @string_to_parse = string.force_encoding(Encoding::ASCII_8BIT)
49
49
  @literal_strings = []
50
50
  @hex_strings = []
51
51
  @streams = []
52
52
  @parsed = []
53
+ @references = []
53
54
  @root_object = {}
54
55
  @info_object = {}
55
56
  @version = nil
@@ -58,6 +59,7 @@ module CombinePDF
58
59
 
59
60
  # parse the data in the new parser (the data already set through the initialize / new method)
60
61
  def parse
62
+ return [] if @string_to_parse.empty?
61
63
  return @parsed unless @parsed.empty?
62
64
  @scanner = StringScanner.new @string_to_parse
63
65
  @scanner.pos = 0
@@ -76,8 +78,8 @@ module CombinePDF
76
78
  raise "root is unknown - cannot determine if file is Encrypted" if @root_object == {}
77
79
 
78
80
  if @root_object[:Encrypt]
79
- PDFOperations.change_references_to_actual_values @parsed, @root_object
80
- warn "PDF is Encrypted! Attempting to unencrypt - not yet fully supported."
81
+ change_references_to_actual_values @root_object
82
+ warn "PDF is Encrypted! Attempting to decrypt - not yet fully supported."
81
83
  decryptor = PDFDecrypt.new @parsed, @root_object
82
84
  decryptor.decrypt
83
85
  #do we really need to apply to @parsed? No, there is no need.
@@ -106,21 +108,32 @@ module CombinePDF
106
108
  @parsed << stream_data.shift
107
109
  end
108
110
  end
109
- # ## remove object streams
110
- @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :ObjStm}
111
- # ## remove XREF dictionaries
112
- @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :XRef}
113
111
  end
114
112
 
115
- PDFOperations.change_references_to_actual_values @parsed, @root_object
116
- @info_object = @root_object[:Info]
113
+
114
+ # serialize_objects_and_references.catalog_pages
115
+
116
+ # Benchmark.bm do |bm|
117
+ # bm.report("serialize") {1000.times {serialize_objects_and_references} }
118
+ # bm.report("serialize - old") {1000.times {old_serialize_objects_and_references} }
119
+ # bm.report("catalog") {1000.times {catalog_pages} }
120
+ # end
121
+
122
+ serialize_objects_and_references.catalog_pages
123
+
124
+ @info_object = @root_object[:Info] ? (@root_object[:Info][:referenced_object] || @root_object[:Info]) : false
117
125
  if @info_object && @info_object.is_a?(Hash)
118
126
  @parsed.delete @info_object
119
- PDFOperations.change_references_to_actual_values @parsed, @info_object
120
- PRIVATE_HASH_KEYS.each {|key| @info_object.delete key}
127
+ CombinePDF::PDF::PRIVATE_HASH_KEYS.each {|key| @info_object.delete key}
128
+ @info_object.each {|k, v| @info_object = v[:referenced_object] if v.is_a?(Hash) && v[:referenced_object]}
121
129
  else
122
130
  @info_object = {}
123
131
  end
132
+ # # # ## remove object streams - if they exist
133
+ # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :ObjStm}
134
+ # # # ## remove XREF dictionaries - if they exist
135
+ # @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :XRef}
136
+
124
137
  @parsed
125
138
  end
126
139
 
@@ -189,7 +202,7 @@ module CombinePDF
189
202
  ##########################################
190
203
  when @scanner.scan(/\(/)
191
204
  # warn "Found a literal string"
192
- str = ''
205
+ str = ''.force_encoding(Encoding::ASCII_8BIT)
193
206
  count = 1
194
207
  while count > 0 && @scanner.rest? do
195
208
  str += @scanner.scan_until(/[\(\)]/).to_s
@@ -209,8 +222,8 @@ module CombinePDF
209
222
  end
210
223
  end
211
224
  # The PDF formatted string is: str[0..-2]
212
- # now staring to convert to regular string
213
- str_bytes = str[0..-2].bytes.to_a
225
+ # now starting to convert to regular string
226
+ str_bytes = str.force_encoding(Encoding::ASCII_8BIT)[0..-2].bytes.to_a
214
227
  str = []
215
228
  until str_bytes.empty?
216
229
  case str_bytes[0]
@@ -260,7 +273,7 @@ module CombinePDF
260
273
  str << str_bytes.shift
261
274
  end
262
275
  end
263
- out << str.pack('C*')
276
+ out << str.pack('C*').force_encoding(Encoding::ASCII_8BIT)
264
277
  ##########################################
265
278
  ## Parse a comment
266
279
  ##########################################
@@ -286,6 +299,7 @@ module CombinePDF
286
299
  ##########################################
287
300
  when @scanner.scan(/R/)
288
301
  out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
302
+ @references << out.last
289
303
  ##########################################
290
304
  ## Parse Bool - true and after false
291
305
  ##########################################
@@ -329,5 +343,176 @@ module CombinePDF
329
343
  end
330
344
  out
331
345
  end
346
+
347
+ protected
348
+
349
+
350
+
351
+ # resets cataloging and pages
352
+ def catalog_pages(catalogs = nil, secure_injection = true, inheritance_hash = {})
353
+ unless catalogs
354
+
355
+ if root_object[:Root]
356
+ catalogs = root_object[:Root][:referenced_object] || root_object[:Root]
357
+ else
358
+ catalogs = (@parsed.select {|obj| obj[:Type] == :Catalog}).last
359
+ end
360
+ @parsed.delete_if {|obj| obj[:Type] == :Catalog}
361
+ @parsed << catalogs
362
+
363
+ raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs
364
+ end
365
+ case
366
+ when catalogs.is_a?(Array)
367
+ catalogs.each {|c| catalog_pages(c, secure_injection, inheritance_hash ) unless c.nil?}
368
+ when catalogs.is_a?(Hash)
369
+ if catalogs[:is_reference_only]
370
+ if catalogs[:referenced_object]
371
+ catalog_pages(catalogs[:referenced_object], secure_injection, inheritance_hash)
372
+ else
373
+ warn "couldn't follow reference!!! #{catalogs} not found!"
374
+ end
375
+ else
376
+ unless catalogs[:Type] == :Page
377
+ raise "Optional Content PDF files aren't supported and their pages cannot be safely extracted." if catalogs[:AS] || catalogs[:OCProperties]
378
+ inheritance_hash[:MediaBox] = catalogs[:MediaBox] if catalogs[:MediaBox]
379
+ inheritance_hash[:CropBox] = catalogs[:CropBox] if catalogs[:CropBox]
380
+ inheritance_hash[:Rotate] = catalogs[:Rotate] if catalogs[:Rotate]
381
+ (inheritance_hash[:Resources] ||= {}).update( (catalogs[:Resources][:referenced_object] || catalogs[:Resources]), &self.class.method(:hash_update_proc_for_new) ) if catalogs[:Resources]
382
+ (inheritance_hash[:ColorSpace] ||= {}).update( (catalogs[:ColorSpace][:referenced_object] || catalogs[:ColorSpace]), &self.class.method(:hash_update_proc_for_new) ) if catalogs[:ColorSpace]
383
+
384
+ # inheritance_hash[:Order] = catalogs[:Order] if catalogs[:Order]
385
+ # inheritance_hash[:OCProperties] = catalogs[:OCProperties] if catalogs[:OCProperties]
386
+ # inheritance_hash[:AS] = catalogs[:AS] if catalogs[:AS]
387
+ end
388
+
389
+ case catalogs[:Type]
390
+ when :Page
391
+
392
+ catalogs[:MediaBox] ||= inheritance_hash[:MediaBox] if inheritance_hash[:MediaBox]
393
+ catalogs[:CropBox] ||= inheritance_hash[:CropBox] if inheritance_hash[:CropBox]
394
+ catalogs[:Rotate] ||= inheritance_hash[:Rotate] if inheritance_hash[:Rotate]
395
+ (catalogs[:Resources] ||= {}).update( inheritance_hash[:Resources], &( self.class.method(:hash_update_proc_for_old) ) ) if inheritance_hash[:Resources]
396
+ (catalogs[:ColorSpace] ||= {}).update( inheritance_hash[:ColorSpace], &( self.class.method(:hash_update_proc_for_old) ) ) if inheritance_hash[:ColorSpace]
397
+ # catalogs[:Order] ||= inheritance_hash[:Order] if inheritance_hash[:Order]
398
+ # catalogs[:AS] ||= inheritance_hash[:AS] if inheritance_hash[:AS]
399
+ # catalogs[:OCProperties] ||= inheritance_hash[:OCProperties] if inheritance_hash[:OCProperties]
400
+
401
+
402
+ # avoid references on MediaBox, CropBox and Rotate
403
+ catalogs[:MediaBox] = catalogs[:MediaBox][:referenced_object][:indirect_without_dictionary] if catalogs[:MediaBox].is_a?(Hash) && catalogs[:MediaBox][:referenced_object].is_a?(Hash) && catalogs[:MediaBox][:referenced_object][:indirect_without_dictionary]
404
+ catalogs[:CropBox] = catalogs[:CropBox][:referenced_object][:indirect_without_dictionary] if catalogs[:CropBox].is_a?(Hash) && catalogs[:CropBox][:referenced_object].is_a?(Hash) && catalogs[:CropBox][:referenced_object][:indirect_without_dictionary]
405
+ catalogs[:Rotate] = catalogs[:Rotate][:referenced_object][:indirect_without_dictionary] if catalogs[:Rotate].is_a?(Hash) && catalogs[:Rotate][:referenced_object].is_a?(Hash) && catalogs[:Rotate][:referenced_object][:indirect_without_dictionary]
406
+
407
+ catalogs.instance_eval {extend Page_Methods}
408
+ catalogs.secure_injection = secure_injection
409
+ when :Pages
410
+ catalog_pages(catalogs[:Kids], secure_injection, inheritance_hash.dup ) unless catalogs[:Kids].nil?
411
+ when :Catalog
412
+ catalog_pages(catalogs[:Pages], secure_injection, inheritance_hash.dup ) unless catalogs[:Pages].nil?
413
+ end
414
+ end
415
+ end
416
+ self
417
+ end
418
+
419
+ # fails!
420
+ def change_references_to_actual_values(hash_with_references = {})
421
+ hash_with_references.each do |k,v|
422
+ if v.is_a?(Hash) && v[:is_reference_only]
423
+ hash_with_references[k] = get_refernced_object(v)
424
+ hash_with_references[k] = hash_with_references[k][:indirect_without_dictionary] if hash_with_references[k].is_a?(Hash) && hash_with_references[k][:indirect_without_dictionary]
425
+ warn "Couldn't connect all values from references - didn't find reference #{hash_with_references}!!!" if hash_with_references[k] == nil
426
+ hash_with_references[k] = v unless hash_with_references[k]
427
+ end
428
+ end
429
+ hash_with_references
430
+ end
431
+
432
+ def get_refernced_object(reference_hash = {})
433
+ @parsed.each do |stored_object|
434
+ return stored_object if ( stored_object.is_a?(Hash) &&
435
+ reference_hash[:indirect_reference_id] == stored_object[:indirect_reference_id] &&
436
+ reference_hash[:indirect_generation_number] == stored_object[:indirect_generation_number] )
437
+ end
438
+ warn "didn't find reference #{reference_hash}"
439
+ nil
440
+ end
441
+
442
+ # @private
443
+ # connects references and objects, according to their reference id's.
444
+ #
445
+ # should be moved to the parser's workflow.
446
+ #
447
+ def serialize_objects_and_references
448
+ obj_dir = {}
449
+ @parsed.each {|o| obj_dir[ [ o.delete(:indirect_reference_id), o.delete(:indirect_generation_number) ] ] = o }
450
+ # @parsed.each {|o| obj_dir[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
451
+ @references.each do |obj|
452
+ obj[:referenced_object] = obj_dir[ [obj[:indirect_reference_id], obj[:indirect_generation_number] ] ]
453
+ warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj.to_s}" unless obj[:referenced_object]
454
+ obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
455
+ end
456
+ self
457
+ end
458
+
459
+ # @private
460
+ # this method reviews a Hash and updates it by merging Hash data,
461
+ # preferring the old over the new.
462
+ def self.hash_update_proc_for_old key, old_data, new_data
463
+ if old_data.is_a? Hash
464
+ old_data.merge( new_data, &self.method(:hash_update_proc_for_old) )
465
+ else
466
+ old_data
467
+ end
468
+ end
469
+ # @private
470
+ # this method reviews a Hash and updates it by merging Hash data,
471
+ # preferring the new over the old.
472
+ def self.hash_update_proc_for_new key, old_data, new_data
473
+ if old_data.is_a? Hash
474
+ old_data.merge( new_data, &self.method(:hash_update_proc_for_new) )
475
+ else
476
+ new_data
477
+ end
478
+ end
479
+
480
+ # # @private
481
+ # # connects references and objects, according to their reference id's.
482
+ # #
483
+ # # should be moved to the parser's workflow.
484
+ # #
485
+ # def old_serialize_objects_and_references(object = nil)
486
+ # objects_reference_hash = {}
487
+ # # @parsed.each {|o| objects_reference_hash[ [ o.delete(:indirect_reference_id), o.delete(:indirect_generation_number) ] ] = o }
488
+ # @parsed.each {|o| objects_reference_hash[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
489
+ # each_object(@parsed) do |obj|
490
+ # if obj[:is_reference_only]
491
+ # obj[:referenced_object] = objects_reference_hash[ [obj[:indirect_reference_id], obj[:indirect_generation_number] ] ]
492
+ # warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj.to_s}" unless obj[:referenced_object]
493
+ # # obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
494
+ # end
495
+ # end
496
+ # self
497
+ # end
498
+
499
+ # # run block of code on evey PDF object (PDF objects are class Hash)
500
+ # def each_object(object, limit_references = true, already_visited = {}, &block)
501
+ # unless limit_references
502
+ # already_visited[object.object_id] = true
503
+ # end
504
+ # case
505
+ # when object.is_a?(Array)
506
+ # object.each {|obj| each_object(obj, limit_references, already_visited, &block)}
507
+ # when object.is_a?(Hash)
508
+ # yield(object)
509
+ # unless limit_references && object[:is_reference_only]
510
+ # object.each do |k,v|
511
+ # each_object(v, limit_references, already_visited, &block) unless already_visited[v.object_id]
512
+ # end
513
+ # end
514
+ # end
515
+ # end
516
+
332
517
  end
333
518
  end