combine_pdf 0.1.23 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +8 -8
- data/combine_pdf.gemspec +1 -1
- data/lib/combine_pdf.rb +22 -19
- data/lib/combine_pdf/{combine_pdf_methods.rb → api.rb} +16 -83
- data/lib/combine_pdf/basic_writer.rb +62 -0
- data/lib/combine_pdf/{combine_pdf_decrypt.rb → decrypt.rb} +35 -2
- data/lib/combine_pdf/{combine_pdf_filter.rb → filter.rb} +0 -0
- data/lib/combine_pdf/{combine_pdf_fonts.rb → fonts.rb} +19 -8
- data/lib/combine_pdf/{combine_pdf_operations.rb → operations.rb} +1 -10
- data/lib/combine_pdf/{combine_pdf_page.rb → page_methods.rb} +284 -23
- data/lib/combine_pdf/{combine_pdf_parser.rb → parser.rb} +200 -15
- data/lib/combine_pdf/pdf_protected.rb +141 -0
- data/lib/combine_pdf/pdf_public.rb +402 -0
- data/lib/combine_pdf/renderer.rb +168 -0
- data/lib/combine_pdf/version.rb +1 -1
- metadata +15 -13
- data/lib/combine_pdf/combine_pdf_basic_writer.rb +0 -451
- data/lib/combine_pdf/combine_pdf_pdf.rb +0 -724
@@ -6,15 +6,6 @@ module CombinePDF
|
|
6
6
|
################################################################
|
7
7
|
|
8
8
|
|
9
|
-
# lists the Hash keys used for PDF objects
|
10
|
-
#
|
11
|
-
# the CombinePDF library doesn't use special classes for its objects (PDFPage class, PDFStream class or anything like that).
|
12
|
-
#
|
13
|
-
# there is only one PDF class which represents the whole of the PDF file.
|
14
|
-
#
|
15
|
-
# this Hash lists the private Hash keys that the CombinePDF library uses to
|
16
|
-
# differentiate between complex PDF objects.
|
17
|
-
PRIVATE_HASH_KEYS = [:indirect_reference_id, :indirect_generation_number, :raw_stream_content, :is_reference_only, :referenced_object, :indirect_without_dictionary]
|
18
9
|
|
19
10
|
# holds a simple content stream that starts a PDF graphic state container - used for wrapping malformed PDF content streams.
|
20
11
|
CONTENT_CONTAINER_START = { is_reference_only: true , referenced_object: {indirect_reference_id: 0, raw_stream_content: 'q'} }
|
@@ -225,7 +216,7 @@ module CombinePDF
|
|
225
216
|
hash_with_references[k] = v unless hash_with_references[k]
|
226
217
|
end
|
227
218
|
end
|
228
|
-
hash_with_references
|
219
|
+
hash_with_references
|
229
220
|
end
|
230
221
|
def change_connected_references_to_actual_values(hash_with_references = {})
|
231
222
|
if hash_with_references.is_a?(Hash)
|
@@ -10,8 +10,16 @@
|
|
10
10
|
|
11
11
|
module CombinePDF
|
12
12
|
|
13
|
-
# This module injects methods into existing page objects
|
13
|
+
# This module injects page editing methods into existing page objects and the PDFWriter objects.
|
14
14
|
module Page_Methods
|
15
|
+
include Renderer
|
16
|
+
|
17
|
+
# holds the string that starts a PDF graphic state container - used for wrapping malformed PDF content streams.
|
18
|
+
CONTENT_CONTAINER_START = 'q'
|
19
|
+
# holds the string that ends one PDF graphic state container and starts the next - used for wrapping malformed PDF content streams.
|
20
|
+
CONTENT_CONTAINER_MIDDLE = "Q\nq"
|
21
|
+
# holds the string that ends a PDF graphic state container - used for wrapping malformed PDF content streams.
|
22
|
+
CONTENT_CONTAINER_END = 'Q'
|
15
23
|
|
16
24
|
# accessor (getter) for the secure_injection setting
|
17
25
|
def secure_injection
|
@@ -21,17 +29,66 @@ module CombinePDF
|
|
21
29
|
def secure_injection= safe
|
22
30
|
@secure_injection = safe
|
23
31
|
end
|
32
|
+
# sets secure_injection to `true` and returns self, allowing for chaining methods
|
33
|
+
def make_secure
|
34
|
+
@secure_injection = true
|
35
|
+
self
|
36
|
+
end
|
37
|
+
# sets secure_injection to `false` and returns self, allowing for chaining methods
|
38
|
+
def make_unsecure
|
39
|
+
@secure_injection = false
|
40
|
+
self
|
41
|
+
end
|
24
42
|
|
25
43
|
# the injection method
|
26
44
|
def << obj
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
45
|
+
inject_page obj, true
|
46
|
+
end
|
47
|
+
def >> obj
|
48
|
+
inject_page obj, false
|
49
|
+
end
|
50
|
+
def inject_page obj, top = true
|
51
|
+
|
52
|
+
raise TypeError, "couldn't inject data, expecting a PDF page (Hash type)" unless obj.is_a?(Page_Methods)
|
53
|
+
|
54
|
+
obj = obj.copy #obj.copy(secure_injection)
|
55
|
+
|
56
|
+
# following the reference chain and assigning a pointer to the correct Resources object.
|
57
|
+
# (assignments of Strings, Arrays and Hashes are pointers in Ruby, unless the .dup method is called)
|
58
|
+
|
59
|
+
# injecting each of the values in the injected Page
|
60
|
+
res = resources
|
61
|
+
obj.resources.each do |key, new_val|
|
62
|
+
unless PDF::PRIVATE_HASH_KEYS.include? key # keep CombinePDF structual data intact.
|
63
|
+
if res[key].nil?
|
64
|
+
res[key] = new_val
|
65
|
+
elsif res[key].is_a?(Hash) && new_val.is_a?(Hash)
|
66
|
+
new_val.update resources[key] # make sure the old values are respected
|
67
|
+
res[key].update new_val # transfer old and new values to the injected page
|
68
|
+
end #Do nothing if array - ot is the PROC array, which is an issue
|
69
|
+
end
|
70
|
+
end
|
71
|
+
resources[:ProcSet] = [:PDF, :Text, :ImageB, :ImageC, :ImageI] # this was recommended by the ISO. 32000-1:2008
|
72
|
+
|
73
|
+
if top # if this is a stamp (overlay)
|
74
|
+
insert_content CONTENT_CONTAINER_START, 0
|
75
|
+
insert_content CONTENT_CONTAINER_MIDDLE
|
76
|
+
obj[:Contents].each {|c| insert_content c }
|
77
|
+
insert_content CONTENT_CONTAINER_END
|
78
|
+
else #if this was a watermark (underlay? would be lost if the page was scanned, as white might not be transparent)
|
79
|
+
old_contents = self[:Contents]
|
80
|
+
self[:Contents] = []
|
81
|
+
insert_content CONTENT_CONTAINER_START
|
82
|
+
obj[:Contents].each {|c| insert_content c }
|
83
|
+
insert_content CONTENT_CONTAINER_MIDDLE
|
84
|
+
old_contents.each { |c| insert_content c }
|
85
|
+
insert_content CONTENT_CONTAINER_END
|
86
|
+
end
|
87
|
+
init_contents
|
88
|
+
|
33
89
|
self
|
34
90
|
end
|
91
|
+
|
35
92
|
# accessor (setter) for the :MediaBox element of the page
|
36
93
|
# dimensions:: an Array consisting of four numbers (can be floats) setting the size of the media box.
|
37
94
|
def mediabox=(dimensions = [0.0, 0.0, 612.0, 792.0])
|
@@ -40,7 +97,7 @@ module CombinePDF
|
|
40
97
|
|
41
98
|
# accessor (getter) for the :MediaBox element of the page
|
42
99
|
def mediabox
|
43
|
-
|
100
|
+
actual_object self[:MediaBox]
|
44
101
|
end
|
45
102
|
|
46
103
|
# accessor (setter) for the :CropBox element of the page
|
@@ -51,7 +108,7 @@ module CombinePDF
|
|
51
108
|
|
52
109
|
# accessor (getter) for the :CropBox element of the page
|
53
110
|
def cropbox
|
54
|
-
|
111
|
+
actual_object self[:CropBox]
|
55
112
|
end
|
56
113
|
|
57
114
|
# get page size
|
@@ -149,7 +206,7 @@ module CombinePDF
|
|
149
206
|
box_graphic_state[:LC], box_graphic_state[:LJ] = 2, 1
|
150
207
|
end
|
151
208
|
box_graphic_state = graphic_state box_graphic_state # adds the graphic state to Resources and gets the reference
|
152
|
-
box_stream << "#{
|
209
|
+
box_stream << "#{object_to_pdf box_graphic_state} gs\n"
|
153
210
|
|
154
211
|
# the following line was removed for Acrobat Reader compatibility
|
155
212
|
# box_stream << "DeviceRGB CS\nDeviceRGB cs\n"
|
@@ -243,7 +300,7 @@ module CombinePDF
|
|
243
300
|
text_stream << "q\n"
|
244
301
|
text_stream << "#{options[:ctm].join ' '} cm\n" if options[:ctm]
|
245
302
|
text_graphic_state = graphic_state({ca: options[:opacity], CA: options[:opacity], LW: options[:stroke_width].to_f, LC: 2, LJ: 1, LD: 0 })
|
246
|
-
text_stream << "#{
|
303
|
+
text_stream << "#{object_to_pdf text_graphic_state} gs\n"
|
247
304
|
|
248
305
|
# the following line was removed for Acrobat Reader compatibility
|
249
306
|
# text_stream << "DeviceRGB CS\nDeviceRGB cs\n"
|
@@ -266,9 +323,9 @@ module CombinePDF
|
|
266
323
|
end
|
267
324
|
# format text object(s)
|
268
325
|
# text_stream << "#{options[:font_color].join(' ')} rg\n" # sets the color state
|
269
|
-
|
326
|
+
encode_text(text, fonts).each do |encoded|
|
270
327
|
text_stream << "BT\n" # the Begine Text marker
|
271
|
-
text_stream <<
|
328
|
+
text_stream << format_name_to_pdf(set_font encoded[0]) # Set font name
|
272
329
|
text_stream << " #{font_size.round 3} Tf\n" # set font size and add font operator
|
273
330
|
text_stream << "#{x.round 4} #{y.round 4} Td\n" # set location for text object
|
274
331
|
text_stream << ( encoded[1] ) # insert the encoded string to the stream
|
@@ -335,9 +392,9 @@ module CombinePDF
|
|
335
392
|
ctm.push( ( (x*c).abs - x*c + (y*s).abs + y*s )/2 , ( (x*s).abs - x*s + (y*c).abs - y*c )/2 )
|
336
393
|
|
337
394
|
# insert the rotation stream into the current content stream
|
338
|
-
|
395
|
+
insert_content "q\n#{ctm.join ' '} cm\n", 0
|
339
396
|
# close the rotation stream
|
340
|
-
|
397
|
+
insert_content CONTENT_CONTAINER_END
|
341
398
|
# reset the mediabox and cropbox values - THIS IS ONLY FOR ORIENTATION CHANGE...
|
342
399
|
if ((self[:Rotate].to_f / 90)%2) != 0
|
343
400
|
self[:MediaBox] = self[:MediaBox].values_at(1,0,3,2)
|
@@ -488,6 +545,29 @@ module CombinePDF
|
|
488
545
|
self
|
489
546
|
end
|
490
547
|
|
548
|
+
# since only the Content streams are modified (Resource hashes are created anew),
|
549
|
+
# it should be safe (and a lot faster) to create a deep copy only for the content hashes and streams.
|
550
|
+
def copy(secure = false)
|
551
|
+
delete :Parent
|
552
|
+
prep_content_array
|
553
|
+
page_copy = self.clone
|
554
|
+
page_copy[:Contents] = page_copy[:Contents].map do |obj|
|
555
|
+
obj = obj.dup
|
556
|
+
obj[:referenced_object] = obj[:referenced_object].dup if obj[:referenced_object]
|
557
|
+
obj[:referenced_object][:raw_stream_content] = obj[:referenced_object][:raw_stream_content].dup if obj[:referenced_object] && obj[:referenced_object][:raw_stream_content]
|
558
|
+
obj
|
559
|
+
end
|
560
|
+
if page_copy[:Resources]
|
561
|
+
page_copy[:Resources] = page_copy[:Resources].dup
|
562
|
+
page_copy[:Resources][:referenced_object] = page_copy[:Resources][:referenced_object].dup if page_copy[:Resources][:referenced_object]
|
563
|
+
page_res = page_copy.resources
|
564
|
+
page_res.each do |k, v|
|
565
|
+
page_res[k] = v.dup if v.is_a?(Array) || v.is_a?(Hash)
|
566
|
+
v[:referenced_object] = v[:referenced_object].dup if v.is_a?(Hash) && v[:referenced_object]
|
567
|
+
end
|
568
|
+
end
|
569
|
+
return page_copy.instance_exec(secure) { |s| secure_for_copy if s ; init_contents; self }
|
570
|
+
end
|
491
571
|
|
492
572
|
###################################
|
493
573
|
# protected methods
|
@@ -501,13 +581,14 @@ module CombinePDF
|
|
501
581
|
end
|
502
582
|
#initializes the content stream in case it was not initialized before
|
503
583
|
def init_contents
|
584
|
+
self[:Contents].delete({ is_reference_only: true , referenced_object: {indirect_reference_id: 0, raw_stream_content: ''} })
|
504
585
|
# wrap content streams
|
505
|
-
|
506
|
-
|
586
|
+
insert_content 'q', 0
|
587
|
+
insert_content 'Q'
|
507
588
|
|
508
589
|
# Prep content
|
509
590
|
@contents = ''
|
510
|
-
|
591
|
+
insert_content @contents
|
511
592
|
@contents
|
512
593
|
end
|
513
594
|
|
@@ -516,16 +597,21 @@ module CombinePDF
|
|
516
597
|
# accepts:
|
517
598
|
# object:: can be a string or a hash object
|
518
599
|
# location:: can be any integer giving the position in the :Contents array. defaults to -1 == insert at the end.
|
519
|
-
def
|
600
|
+
def insert_content object, location = -1
|
520
601
|
object = { is_reference_only: true , referenced_object: {indirect_reference_id: 0, raw_stream_content: object} } if object.is_a?(String)
|
521
602
|
raise TypeError, "expected a String or Hash object." unless object.is_a?(Hash)
|
522
|
-
|
523
|
-
self[:Contents] = [ self[:Contents] ].compact
|
524
|
-
end
|
603
|
+
prep_content_array
|
525
604
|
self[:Contents].insert location, object
|
526
605
|
self
|
527
606
|
end
|
528
607
|
|
608
|
+
def prep_content_array
|
609
|
+
return self if self[:Contents].is_a?(Array)
|
610
|
+
self[:Contents] = self[:Contents][:referenced_object] if self[:Contents].is_a?(Hash) && self[:Contents][:referenced_object] && self[:Contents][:referenced_object].is_a?(Array)
|
611
|
+
self[:Contents] = [ self[:Contents] ].compact
|
612
|
+
self
|
613
|
+
end
|
614
|
+
|
529
615
|
#returns the basic font name used internally
|
530
616
|
def base_font_name
|
531
617
|
@base_font_name ||= "Writer" + SecureRandom.hex(7) + "PDF"
|
@@ -587,7 +673,7 @@ module CombinePDF
|
|
587
673
|
end
|
588
674
|
|
589
675
|
# encodes the text in an array of [:font_name, <PDFHexString>] for use in textbox
|
590
|
-
def
|
676
|
+
def encode_text text, fonts
|
591
677
|
# text must be a unicode string and fonts must be an array.
|
592
678
|
# this is an internal method, don't perform tests.
|
593
679
|
fonts_array = []
|
@@ -651,6 +737,181 @@ module CombinePDF
|
|
651
737
|
end
|
652
738
|
out.join.strip
|
653
739
|
end
|
740
|
+
|
741
|
+
|
742
|
+
# copy_and_secure_for_injection(page)
|
743
|
+
# - page is a page in the pages array, i.e.
|
744
|
+
# pdf.pages[0]
|
745
|
+
# takes a page object and:
|
746
|
+
#
|
747
|
+
# makes a deep copy of the page (Ruby defaults to pointers, so this will copy the memory).
|
748
|
+
#
|
749
|
+
# then it will rewrite the content stream with renamed resources, so as to avoid name conflicts.
|
750
|
+
def secure_for_copy
|
751
|
+
# initiate dictionary from old names to new names
|
752
|
+
names_dictionary = {}
|
753
|
+
|
754
|
+
# travel every dictionary to pick up names (keys), change them and add them to the dictionary
|
755
|
+
self[:Resources].each do |k,v|
|
756
|
+
if v.is_a?(Hash)
|
757
|
+
new_dictionary = {}
|
758
|
+
new_name = "Combine" + SecureRandom.hex(7) + "PDF"
|
759
|
+
i = 1
|
760
|
+
v.each do |old_key, value|
|
761
|
+
new_key = (new_name + i.to_s).to_sym
|
762
|
+
names_dictionary[old_key] = new_key
|
763
|
+
new_dictionary[new_key] = value
|
764
|
+
i += 1
|
765
|
+
end
|
766
|
+
self[:Resources][k] = new_dictionary
|
767
|
+
end
|
768
|
+
end
|
769
|
+
|
770
|
+
# now that we have replaced the names in the resources dictionaries,
|
771
|
+
# it is time to replace the names inside the stream
|
772
|
+
# we will need to make sure we have access to the stream injected
|
773
|
+
# we will use PDFFilter.inflate_object
|
774
|
+
self[:Contents].each do |c|
|
775
|
+
stream = actual_object(c)
|
776
|
+
PDFFilter.inflate_object stream
|
777
|
+
names_dictionary.each do |old_key, new_key|
|
778
|
+
stream[:raw_stream_content].gsub! object_to_pdf(old_key), object_to_pdf(new_key) ##### PRAY(!) that the parsed datawill be correctly reproduced!
|
779
|
+
end
|
780
|
+
# # # the following code isn't needed now that we wrap both the existing and incoming content streams.
|
781
|
+
# # patch back to PDF defaults, for OCRed PDF files.
|
782
|
+
# stream[:raw_stream_content] = "q\n0 0 0 rg\n0 0 0 RG\n0 Tr\n1 0 0 1 0 0 cm\n%s\nQ\n" % stream[:raw_stream_content]
|
783
|
+
end
|
784
|
+
self
|
785
|
+
end
|
786
|
+
|
787
|
+
|
788
|
+
|
789
|
+
# ################
|
790
|
+
# ##
|
791
|
+
|
792
|
+
# def inject_to_page page = {Type: :Page, MediaBox: [0,0,612.0,792.0], Resources: {}, Contents: []}, stream = nil, top = true
|
793
|
+
# # make sure both the page reciving the new data and the injected page are of the correct data type.
|
794
|
+
# return false unless page.is_a?(Hash) && stream.is_a?(Hash)
|
795
|
+
|
796
|
+
# # following the reference chain and assigning a pointer to the correct Resouces object.
|
797
|
+
# # (assignments of Strings, Arrays and Hashes are pointers in Ruby, unless the .dup method is called)
|
798
|
+
# page[:Resources] ||= {}
|
799
|
+
# original_resources = page[:Resources]
|
800
|
+
# if original_resources[:is_reference_only]
|
801
|
+
# original_resources = original_resources[:referenced_object]
|
802
|
+
# raise "Couldn't tap into resources dictionary, as it is a reference and isn't linked." unless original_resources
|
803
|
+
# end
|
804
|
+
# original_contents = page[:Contents]
|
805
|
+
# original_contents = [original_contents] unless original_contents.is_a? Array
|
806
|
+
|
807
|
+
# stream[:Resources] ||= {}
|
808
|
+
# stream_resources = stream[:Resources]
|
809
|
+
# if stream_resources[:is_reference_only]
|
810
|
+
# stream_resources = stream_resources[:referenced_object]
|
811
|
+
# raise "Couldn't tap into resources dictionary, as it is a reference and isn't linked." unless stream_resources
|
812
|
+
# end
|
813
|
+
# stream_contents = stream[:Contents]
|
814
|
+
# stream_contents = [stream_contents] unless stream_contents.is_a? Array
|
815
|
+
|
816
|
+
# # collect keys as objects - this is to make sure that
|
817
|
+
# # we are working on the actual resource data, rather then references
|
818
|
+
# flatten_resources_dictionaries stream_resources
|
819
|
+
# flatten_resources_dictionaries original_resources
|
820
|
+
|
821
|
+
# # injecting each of the values in the injected Page
|
822
|
+
# stream_resources.each do |key, new_val|
|
823
|
+
# unless PRIVATE_HASH_KEYS.include? key # keep CombinePDF structual data intact.
|
824
|
+
# if original_resources[key].nil?
|
825
|
+
# original_resources[key] = new_val
|
826
|
+
# elsif original_resources[key].is_a?(Hash) && new_val.is_a?(Hash)
|
827
|
+
# new_val.update original_resources[key] # make sure the old values are respected
|
828
|
+
# original_resources[key].update new_val # transfer old and new values to the injected page
|
829
|
+
# end #Do nothing if array - ot is the PROC array, which is an issue
|
830
|
+
# end
|
831
|
+
# end
|
832
|
+
# original_resources[:ProcSet] = [:PDF, :Text, :ImageB, :ImageC, :ImageI] # this was recommended by the ISO. 32000-1:2008
|
833
|
+
|
834
|
+
# if top # if this is a stamp (overlay)
|
835
|
+
# page[:Contents] = original_contents
|
836
|
+
# page[:Contents].unshift create_deep_copy(CONTENT_CONTAINER_START)
|
837
|
+
# page[:Contents].push create_deep_copy(CONTENT_CONTAINER_MIDDLE)
|
838
|
+
# page[:Contents].push *stream_contents
|
839
|
+
# page[:Contents].push create_deep_copy(CONTENT_CONTAINER_END)
|
840
|
+
# else #if this was a watermark (underlay? would be lost if the page was scanned, as white might not be transparent)
|
841
|
+
# page[:Contents] = stream_contents
|
842
|
+
# page[:Contents].unshift create_deep_copy(CONTENT_CONTAINER_START)
|
843
|
+
# page[:Contents].push create_deep_copy(CONTENT_CONTAINER_MIDDLE)
|
844
|
+
# page[:Contents].push *original_contents
|
845
|
+
# page[:Contents].push create_deep_copy(CONTENT_CONTAINER_END)
|
846
|
+
# end
|
847
|
+
|
848
|
+
# page
|
849
|
+
# end
|
850
|
+
# # copy_and_secure_for_injection(page)
|
851
|
+
# # - page is a page in the pages array, i.e.
|
852
|
+
# # pdf.pages[0]
|
853
|
+
# # takes a page object and:
|
854
|
+
# #
|
855
|
+
# # makes a deep copy of the page (Ruby defaults to pointers, so this will copy the memory).
|
856
|
+
# #
|
857
|
+
# # then it will rewrite the content stream with renamed resources, so as to avoid name conflicts.
|
858
|
+
# def copy_and_secure_for_injection(page)
|
859
|
+
# # copy page
|
860
|
+
# new_page = create_deep_copy page
|
861
|
+
|
862
|
+
# # initiate dictionary from old names to new names
|
863
|
+
# names_dictionary = {}
|
864
|
+
|
865
|
+
# # itirate through all keys that are name objects and give them new names (add to dic)
|
866
|
+
# # this should be done for every dictionary in :Resources
|
867
|
+
# # this is a few steps stage:
|
868
|
+
|
869
|
+
# # 1. get resources object
|
870
|
+
# resources = new_page[:Resources]
|
871
|
+
# if resources[:is_reference_only]
|
872
|
+
# resources = resources[:referenced_object]
|
873
|
+
# raise "Couldn't tap into resources dictionary, as it is a reference and isn't linked." unless resources
|
874
|
+
# end
|
875
|
+
|
876
|
+
# # 2. establich direct access to dictionaries and remove reference values
|
877
|
+
# flatten_resources_dictionaries resources
|
878
|
+
|
879
|
+
# # 3. travel every dictionary to pick up names (keys), change them and add them to the dictionary
|
880
|
+
# resources.each do |k,v|
|
881
|
+
# if v.is_a?(Hash)
|
882
|
+
# new_dictionary = {}
|
883
|
+
# new_name = "Combine" + SecureRandom.hex(7) + "PDF"
|
884
|
+
# i = 1
|
885
|
+
# v.each do |old_key, value|
|
886
|
+
# new_key = (new_name + i.to_s).to_sym
|
887
|
+
# names_dictionary[old_key] = new_key
|
888
|
+
# new_dictionary[new_key] = value
|
889
|
+
# i += 1
|
890
|
+
# end
|
891
|
+
# resources[k] = new_dictionary
|
892
|
+
# end
|
893
|
+
# end
|
894
|
+
|
895
|
+
# # now that we have replaced the names in the resources dictionaries,
|
896
|
+
# # it is time to replace the names inside the stream
|
897
|
+
# # we will need to make sure we have access to the stream injected
|
898
|
+
# # we will user PDFFilter.inflate_object
|
899
|
+
# (new_page[:Contents].is_a?(Array) ? new_page[:Contents] : [new_page[:Contents] ]).each do |c|
|
900
|
+
# stream = c[:referenced_object]
|
901
|
+
# PDFFilter.inflate_object stream
|
902
|
+
# names_dictionary.each do |old_key, new_key|
|
903
|
+
# stream[:raw_stream_content].gsub! _object_to_pdf(old_key), _object_to_pdf(new_key) ##### PRAY(!) that the parsed datawill be correctly reproduced!
|
904
|
+
# end
|
905
|
+
# # patch back to PDF defaults, for OCRed PDF files.
|
906
|
+
# # stream[:raw_stream_content] = "q\nq\nq\nDeviceRGB CS\nDeviceRGB cs\n0 0 0 rg\n0 0 0 RG\n0 Tr\n%s\nQ\nQ\nQ\n" % stream[:raw_stream_content]
|
907
|
+
# # the following was removed for Acrobat Reader compatability: DeviceRGB CS\nDeviceRGB cs\n
|
908
|
+
# stream[:raw_stream_content] = "q\nq\nq\n0 0 0 rg\n0 0 0 RG\n0 Tr\n1 0 0 1 0 0 cm\n%s\nQ\nQ\nQ\n" % stream[:raw_stream_content]
|
909
|
+
# end
|
910
|
+
|
911
|
+
# new_page
|
912
|
+
# end
|
913
|
+
|
914
|
+
|
654
915
|
end
|
655
916
|
|
656
917
|
end
|
@@ -44,12 +44,13 @@ module CombinePDF
|
|
44
44
|
#
|
45
45
|
# string:: the data to be parsed, as a String object.
|
46
46
|
def initialize (string)
|
47
|
-
raise TypeError, "couldn't parse
|
47
|
+
raise TypeError, "couldn't parse data, expecting type String" unless string.is_a? String
|
48
48
|
@string_to_parse = string.force_encoding(Encoding::ASCII_8BIT)
|
49
49
|
@literal_strings = []
|
50
50
|
@hex_strings = []
|
51
51
|
@streams = []
|
52
52
|
@parsed = []
|
53
|
+
@references = []
|
53
54
|
@root_object = {}
|
54
55
|
@info_object = {}
|
55
56
|
@version = nil
|
@@ -58,6 +59,7 @@ module CombinePDF
|
|
58
59
|
|
59
60
|
# parse the data in the new parser (the data already set through the initialize / new method)
|
60
61
|
def parse
|
62
|
+
return [] if @string_to_parse.empty?
|
61
63
|
return @parsed unless @parsed.empty?
|
62
64
|
@scanner = StringScanner.new @string_to_parse
|
63
65
|
@scanner.pos = 0
|
@@ -76,8 +78,8 @@ module CombinePDF
|
|
76
78
|
raise "root is unknown - cannot determine if file is Encrypted" if @root_object == {}
|
77
79
|
|
78
80
|
if @root_object[:Encrypt]
|
79
|
-
|
80
|
-
warn "PDF is Encrypted! Attempting to
|
81
|
+
change_references_to_actual_values @root_object
|
82
|
+
warn "PDF is Encrypted! Attempting to decrypt - not yet fully supported."
|
81
83
|
decryptor = PDFDecrypt.new @parsed, @root_object
|
82
84
|
decryptor.decrypt
|
83
85
|
#do we really need to apply to @parsed? No, there is no need.
|
@@ -106,21 +108,32 @@ module CombinePDF
|
|
106
108
|
@parsed << stream_data.shift
|
107
109
|
end
|
108
110
|
end
|
109
|
-
# ## remove object streams
|
110
|
-
@parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :ObjStm}
|
111
|
-
# ## remove XREF dictionaries
|
112
|
-
@parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :XRef}
|
113
111
|
end
|
114
112
|
|
115
|
-
|
116
|
-
|
113
|
+
|
114
|
+
# serialize_objects_and_references.catalog_pages
|
115
|
+
|
116
|
+
# Benchmark.bm do |bm|
|
117
|
+
# bm.report("serialize") {1000.times {serialize_objects_and_references} }
|
118
|
+
# bm.report("serialize - old") {1000.times {old_serialize_objects_and_references} }
|
119
|
+
# bm.report("catalog") {1000.times {catalog_pages} }
|
120
|
+
# end
|
121
|
+
|
122
|
+
serialize_objects_and_references.catalog_pages
|
123
|
+
|
124
|
+
@info_object = @root_object[:Info] ? (@root_object[:Info][:referenced_object] || @root_object[:Info]) : false
|
117
125
|
if @info_object && @info_object.is_a?(Hash)
|
118
126
|
@parsed.delete @info_object
|
119
|
-
|
120
|
-
|
127
|
+
CombinePDF::PDF::PRIVATE_HASH_KEYS.each {|key| @info_object.delete key}
|
128
|
+
@info_object.each {|k, v| @info_object = v[:referenced_object] if v.is_a?(Hash) && v[:referenced_object]}
|
121
129
|
else
|
122
130
|
@info_object = {}
|
123
131
|
end
|
132
|
+
# # # ## remove object streams - if they exist
|
133
|
+
# @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :ObjStm}
|
134
|
+
# # # ## remove XREF dictionaries - if they exist
|
135
|
+
# @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :XRef}
|
136
|
+
|
124
137
|
@parsed
|
125
138
|
end
|
126
139
|
|
@@ -189,7 +202,7 @@ module CombinePDF
|
|
189
202
|
##########################################
|
190
203
|
when @scanner.scan(/\(/)
|
191
204
|
# warn "Found a literal string"
|
192
|
-
str = ''
|
205
|
+
str = ''.force_encoding(Encoding::ASCII_8BIT)
|
193
206
|
count = 1
|
194
207
|
while count > 0 && @scanner.rest? do
|
195
208
|
str += @scanner.scan_until(/[\(\)]/).to_s
|
@@ -209,8 +222,8 @@ module CombinePDF
|
|
209
222
|
end
|
210
223
|
end
|
211
224
|
# The PDF formatted string is: str[0..-2]
|
212
|
-
# now
|
213
|
-
str_bytes = str[0..-2].bytes.to_a
|
225
|
+
# now starting to convert to regular string
|
226
|
+
str_bytes = str.force_encoding(Encoding::ASCII_8BIT)[0..-2].bytes.to_a
|
214
227
|
str = []
|
215
228
|
until str_bytes.empty?
|
216
229
|
case str_bytes[0]
|
@@ -260,7 +273,7 @@ module CombinePDF
|
|
260
273
|
str << str_bytes.shift
|
261
274
|
end
|
262
275
|
end
|
263
|
-
out << str.pack('C*')
|
276
|
+
out << str.pack('C*').force_encoding(Encoding::ASCII_8BIT)
|
264
277
|
##########################################
|
265
278
|
## Parse a comment
|
266
279
|
##########################################
|
@@ -286,6 +299,7 @@ module CombinePDF
|
|
286
299
|
##########################################
|
287
300
|
when @scanner.scan(/R/)
|
288
301
|
out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
|
302
|
+
@references << out.last
|
289
303
|
##########################################
|
290
304
|
## Parse Bool - true and after false
|
291
305
|
##########################################
|
@@ -329,5 +343,176 @@ module CombinePDF
|
|
329
343
|
end
|
330
344
|
out
|
331
345
|
end
|
346
|
+
|
347
|
+
protected
|
348
|
+
|
349
|
+
|
350
|
+
|
351
|
+
# resets cataloging and pages
|
352
|
+
def catalog_pages(catalogs = nil, secure_injection = true, inheritance_hash = {})
|
353
|
+
unless catalogs
|
354
|
+
|
355
|
+
if root_object[:Root]
|
356
|
+
catalogs = root_object[:Root][:referenced_object] || root_object[:Root]
|
357
|
+
else
|
358
|
+
catalogs = (@parsed.select {|obj| obj[:Type] == :Catalog}).last
|
359
|
+
end
|
360
|
+
@parsed.delete_if {|obj| obj[:Type] == :Catalog}
|
361
|
+
@parsed << catalogs
|
362
|
+
|
363
|
+
raise "Unknown error - parsed data doesn't contain a cataloged object!" unless catalogs
|
364
|
+
end
|
365
|
+
case
|
366
|
+
when catalogs.is_a?(Array)
|
367
|
+
catalogs.each {|c| catalog_pages(c, secure_injection, inheritance_hash ) unless c.nil?}
|
368
|
+
when catalogs.is_a?(Hash)
|
369
|
+
if catalogs[:is_reference_only]
|
370
|
+
if catalogs[:referenced_object]
|
371
|
+
catalog_pages(catalogs[:referenced_object], secure_injection, inheritance_hash)
|
372
|
+
else
|
373
|
+
warn "couldn't follow reference!!! #{catalogs} not found!"
|
374
|
+
end
|
375
|
+
else
|
376
|
+
unless catalogs[:Type] == :Page
|
377
|
+
raise "Optional Content PDF files aren't supported and their pages cannot be safely extracted." if catalogs[:AS] || catalogs[:OCProperties]
|
378
|
+
inheritance_hash[:MediaBox] = catalogs[:MediaBox] if catalogs[:MediaBox]
|
379
|
+
inheritance_hash[:CropBox] = catalogs[:CropBox] if catalogs[:CropBox]
|
380
|
+
inheritance_hash[:Rotate] = catalogs[:Rotate] if catalogs[:Rotate]
|
381
|
+
(inheritance_hash[:Resources] ||= {}).update( (catalogs[:Resources][:referenced_object] || catalogs[:Resources]), &self.class.method(:hash_update_proc_for_new) ) if catalogs[:Resources]
|
382
|
+
(inheritance_hash[:ColorSpace] ||= {}).update( (catalogs[:ColorSpace][:referenced_object] || catalogs[:ColorSpace]), &self.class.method(:hash_update_proc_for_new) ) if catalogs[:ColorSpace]
|
383
|
+
|
384
|
+
# inheritance_hash[:Order] = catalogs[:Order] if catalogs[:Order]
|
385
|
+
# inheritance_hash[:OCProperties] = catalogs[:OCProperties] if catalogs[:OCProperties]
|
386
|
+
# inheritance_hash[:AS] = catalogs[:AS] if catalogs[:AS]
|
387
|
+
end
|
388
|
+
|
389
|
+
case catalogs[:Type]
|
390
|
+
when :Page
|
391
|
+
|
392
|
+
catalogs[:MediaBox] ||= inheritance_hash[:MediaBox] if inheritance_hash[:MediaBox]
|
393
|
+
catalogs[:CropBox] ||= inheritance_hash[:CropBox] if inheritance_hash[:CropBox]
|
394
|
+
catalogs[:Rotate] ||= inheritance_hash[:Rotate] if inheritance_hash[:Rotate]
|
395
|
+
(catalogs[:Resources] ||= {}).update( inheritance_hash[:Resources], &( self.class.method(:hash_update_proc_for_old) ) ) if inheritance_hash[:Resources]
|
396
|
+
(catalogs[:ColorSpace] ||= {}).update( inheritance_hash[:ColorSpace], &( self.class.method(:hash_update_proc_for_old) ) ) if inheritance_hash[:ColorSpace]
|
397
|
+
# catalogs[:Order] ||= inheritance_hash[:Order] if inheritance_hash[:Order]
|
398
|
+
# catalogs[:AS] ||= inheritance_hash[:AS] if inheritance_hash[:AS]
|
399
|
+
# catalogs[:OCProperties] ||= inheritance_hash[:OCProperties] if inheritance_hash[:OCProperties]
|
400
|
+
|
401
|
+
|
402
|
+
# avoid references on MediaBox, CropBox and Rotate
|
403
|
+
catalogs[:MediaBox] = catalogs[:MediaBox][:referenced_object][:indirect_without_dictionary] if catalogs[:MediaBox].is_a?(Hash) && catalogs[:MediaBox][:referenced_object].is_a?(Hash) && catalogs[:MediaBox][:referenced_object][:indirect_without_dictionary]
|
404
|
+
catalogs[:CropBox] = catalogs[:CropBox][:referenced_object][:indirect_without_dictionary] if catalogs[:CropBox].is_a?(Hash) && catalogs[:CropBox][:referenced_object].is_a?(Hash) && catalogs[:CropBox][:referenced_object][:indirect_without_dictionary]
|
405
|
+
catalogs[:Rotate] = catalogs[:Rotate][:referenced_object][:indirect_without_dictionary] if catalogs[:Rotate].is_a?(Hash) && catalogs[:Rotate][:referenced_object].is_a?(Hash) && catalogs[:Rotate][:referenced_object][:indirect_without_dictionary]
|
406
|
+
|
407
|
+
catalogs.instance_eval {extend Page_Methods}
|
408
|
+
catalogs.secure_injection = secure_injection
|
409
|
+
when :Pages
|
410
|
+
catalog_pages(catalogs[:Kids], secure_injection, inheritance_hash.dup ) unless catalogs[:Kids].nil?
|
411
|
+
when :Catalog
|
412
|
+
catalog_pages(catalogs[:Pages], secure_injection, inheritance_hash.dup ) unless catalogs[:Pages].nil?
|
413
|
+
end
|
414
|
+
end
|
415
|
+
end
|
416
|
+
self
|
417
|
+
end
|
418
|
+
|
419
|
+
# fails!
|
420
|
+
# Replaces, in place, every value of +hash_with_references+ that is a
# reference placeholder (a Hash with a truthy +:is_reference_only+ entry)
# with the object returned by #get_refernced_object for that placeholder.
#
# When the resolved object is a Hash holding a truthy
# +:indirect_without_dictionary+ entry, that entry's value is stored instead
# of the wrapping Hash. A placeholder that cannot be resolved is left
# untouched and a warning naming that placeholder is printed.
#
# NOTE(review): the source marks this method with a bare "fails!" remark and
# gives no reason - confirm the method works for your input before relying on it.
#
# @param hash_with_references [Hash] the Hash to update (mutated in place).
# @return [Hash] the very same Hash object that was passed in.
def change_references_to_actual_values(hash_with_references = {})
  hash_with_references.each do |key, value|
    next unless value.is_a?(Hash) && value[:is_reference_only]

    resolved = get_refernced_object(value)
    if resolved.is_a?(Hash) && resolved[:indirect_without_dictionary]
      resolved = resolved[:indirect_without_dictionary]
    end

    # Report the reference that failed (not the whole container Hash), so the
    # missing object's id can actually be read from the warning.
    warn "Couldn't connect all values from references - didn't find reference #{value}!!!" if resolved.nil?

    # Only an existing key is reassigned, which is allowed while iterating;
    # an unresolved placeholder simply keeps its original value.
    hash_with_references[key] = resolved if resolved
  end
  hash_with_references
end
|
431
|
+
|
432
|
+
# Looks up, among the objects held in +@parsed+, the first Hash whose
# +:indirect_reference_id+ and +:indirect_generation_number+ both equal the
# ones carried by +reference_hash+. Entries of +@parsed+ that are not Hashes
# are skipped.
#
# A reference that carries no +:indirect_reference_id+ never matches: without
# this guard, +nil == nil+ would pair it with the first stored Hash that has
# no ids of its own, returning an unrelated object.
#
# @param reference_hash [Hash] the reference placeholder to resolve.
# @return [Hash, nil] the matching stored object, or +nil+ (after printing a
#   warning) when nothing matches.
def get_refernced_object(reference_hash = {})
  wanted_id = reference_hash[:indirect_reference_id]
  wanted_generation = reference_hash[:indirect_generation_number]

  found = nil
  unless wanted_id.nil?
    found = @parsed.find do |stored_object|
      stored_object.is_a?(Hash) &&
        wanted_id == stored_object[:indirect_reference_id] &&
        wanted_generation == stored_object[:indirect_generation_number]
    end
  end

  warn "didn't find reference #{reference_hash}" unless found
  found
end
|
441
|
+
|
442
|
+
# @private
|
443
|
+
# connects references and objects, according to their reference id's.
|
444
|
+
#
|
445
|
+
# should be moved to the parser's workflow.
|
446
|
+
#
|
447
|
+
# Links every reference placeholder in +@references+ to the parsed object it
# points at.
#
# First, each object in +@parsed+ has its +:indirect_reference_id+ and
# +:indirect_generation_number+ entries removed and is indexed under the
# +[id, generation]+ pair. Then each reference gets a +:referenced_object+
# entry holding the indexed object (or +nil+, with a warning, when none is
# indexed under its pair) and finally loses its own id / generation entries.
#
# NOTE(review): assumes every element of +@parsed+ and +@references+ is a
# Hash - confirm against the parser code that fills them.
#
# @return [self]
def serialize_objects_and_references
  directory = @parsed.each_with_object({}) do |parsed_object, index|
    ref_id = parsed_object.delete(:indirect_reference_id)
    generation = parsed_object.delete(:indirect_generation_number)
    index[[ref_id, generation]] = parsed_object
  end

  @references.each do |reference|
    lookup_key = reference.values_at(:indirect_reference_id, :indirect_generation_number)
    reference[:referenced_object] = directory[lookup_key]
    # the ids are removed only after the warning, so the message still shows them
    unless reference[:referenced_object]
      warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{reference}"
    end
    reference.delete(:indirect_reference_id)
    reference.delete(:indirect_generation_number)
  end

  self
end
|
458
|
+
|
459
|
+
# @private
|
460
|
+
# this method reviews a Hash and updates it by merging Hash data,
|
461
|
+
# preferring the old over the new.
|
462
|
+
# Conflict resolver meant to be passed as the block of Hash#merge / Hash#update
# (it is used that way, via +method(...)+, elsewhere in this file): when both
# sides hold a value for +key+, the OLD value wins.
#
# If the old value is a Hash and the new value is hash-like, the two are
# merged recursively with this same rule, so keys present only in the new
# data are still added. If the old value is a Hash but the new value is not
# hash-like, the old value is kept as is (merging would raise a TypeError).
#
# @param key [Object] the conflicting key (unused, required by the merge block signature).
# @param old_data [Object] the value already stored.
# @param new_data [Object] the incoming value.
# @return [Object] the value to store for +key+.
def self.hash_update_proc_for_old(key, old_data, new_data)
  return old_data unless old_data.is_a?(Hash) && new_data.respond_to?(:to_hash)

  old_data.merge(new_data, &method(:hash_update_proc_for_old))
end
|
469
|
+
# @private
|
470
|
+
# this method reviews a Hash and updates it by merging Hash data,
|
471
|
+
# preferring the new over the old.
|
472
|
+
# Conflict resolver meant to be passed as the block of Hash#merge / Hash#update:
# when both sides hold a value for +key+, the NEW value wins.
#
# If the old value is a Hash and the new value is hash-like, the two are
# merged recursively with this same rule, so keys present only in the old
# data survive. In every other case the new value replaces the old one; this
# includes an old Hash meeting a new value that is not hash-like, where
# merging would raise a TypeError.
#
# @param key [Object] the conflicting key (unused, required by the merge block signature).
# @param old_data [Object] the value already stored.
# @param new_data [Object] the incoming value.
# @return [Object] the value to store for +key+.
def self.hash_update_proc_for_new(key, old_data, new_data)
  return new_data unless old_data.is_a?(Hash) && new_data.respond_to?(:to_hash)

  old_data.merge(new_data, &method(:hash_update_proc_for_new))
end
|
479
|
+
|
480
|
+
# # @private
|
481
|
+
# # connects references and objects, according to their reference id's.
|
482
|
+
# #
|
483
|
+
# # should be moved to the parser's workflow.
|
484
|
+
# #
|
485
|
+
# def old_serialize_objects_and_references(object = nil)
|
486
|
+
# objects_reference_hash = {}
|
487
|
+
# # @parsed.each {|o| objects_reference_hash[ [ o.delete(:indirect_reference_id), o.delete(:indirect_generation_number) ] ] = o }
|
488
|
+
# @parsed.each {|o| objects_reference_hash[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
|
489
|
+
# each_object(@parsed) do |obj|
|
490
|
+
# if obj[:is_reference_only]
|
491
|
+
# obj[:referenced_object] = objects_reference_hash[ [obj[:indirect_reference_id], obj[:indirect_generation_number] ] ]
|
492
|
+
# warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj.to_s}" unless obj[:referenced_object]
|
493
|
+
# # obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
|
494
|
+
# end
|
495
|
+
# end
|
496
|
+
# self
|
497
|
+
# end
|
498
|
+
|
499
|
+
# # run block of code on every PDF object (PDF objects are class Hash)
|
500
|
+
# def each_object(object, limit_references = true, already_visited = {}, &block)
|
501
|
+
# unless limit_references
|
502
|
+
# already_visited[object.object_id] = true
|
503
|
+
# end
|
504
|
+
# case
|
505
|
+
# when object.is_a?(Array)
|
506
|
+
# object.each {|obj| each_object(obj, limit_references, already_visited, &block)}
|
507
|
+
# when object.is_a?(Hash)
|
508
|
+
# yield(object)
|
509
|
+
# unless limit_references && object[:is_reference_only]
|
510
|
+
# object.each do |k,v|
|
511
|
+
# each_object(v, limit_references, already_visited, &block) unless already_visited[v.object_id]
|
512
|
+
# end
|
513
|
+
# end
|
514
|
+
# end
|
515
|
+
# end
|
516
|
+
|
332
517
|
end
|
333
518
|
end
|