combine_pdf 0.2.6 → 0.2.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 63c084e190f8c5f47aaa3db6cd32064a7e7dcfb2
4
- data.tar.gz: dbd7e1efe7b372abae318839eb80658913664b29
3
+ metadata.gz: e1379a60fa97dde419575868f841a676254f61cd
4
+ data.tar.gz: e55e8f95eaefa828499d393147a13ace92fbd70b
5
5
  SHA512:
6
- metadata.gz: a374e9be84a6397294a09b1fc76ae0a346ee9bce8abf1d8750742417494ecacf2b49ed843c060c13716046121d5c1ca3b570bf7fc18c828a99ce5cb857a043fc
7
- data.tar.gz: bf343d5749131896f4fe1dc2882bd62eb0696eb20c9021048e0b1d8f15837b580a0b50a4db083e752a3e73aa5bcbc88909f78c5d46b7f5ddb1e38cb264919e17
6
+ metadata.gz: 2cd478b21fe7634e7abb4552fa1b0770465c7e097b00c7ba842d19e243a529667e8b48fdeb7bae64117e30419e9d7fd7a9aeca39d35f342910df4656f887f6c6
7
+ data.tar.gz: 72d49c89926cb106d133fb17746ec2374225c2d5e5e72c8a4f1c38779d9f6abc188a82c1aa610ebb15ce8adf6da9419ae6ef175dd3b3fb84e6bc8cb938cd9680
@@ -2,6 +2,16 @@
2
2
 
3
3
  ***
4
4
 
5
+ Change log v.0.2.7
6
+
7
+ **Fix**: Fixed an issue where a malformed PDF String could cause the parser to hang.
8
+
9
+ **Update**: Inner PDF links (links to pages within the PDF file) will now be preserved when importing a whole PDF (although Outlines, for now, are discarded and their related links will be discarded as well). If the same destination page is inserted more than once, the first version will be preferred.
10
+
11
+ **Deprecation Warning**: the `Page_Methods#secure_injection`, `Page_Methods#make_unsecure` and `Page_Methods#make_secure` methods are deprecated. Use `Page_Methods#copy(true)` for safeguarding against font/resource conflicts when "stamping" one PDF page over another.
12
+
13
+ ***
14
+
5
15
  Change log v.0.2.6
6
16
 
7
17
  **fixed**: Hasan Iskandar fixed issue #30 - Output file cannot be saved from Adobe Reader with "Save As optimizes for Fast Web View" preference enabled. Thank you Hasan.
@@ -4,6 +4,7 @@ require 'zlib'
4
4
  require 'securerandom'
5
5
  require 'strscan'
6
6
  require 'matrix'
7
+ require 'set'
7
8
 
8
9
  #require the RC4 Gem
9
10
  require 'rc4'
@@ -23,19 +23,23 @@ module CombinePDF
23
23
 
24
24
  # accessor (getter) for the secure_injection setting
25
25
  def secure_injection
26
+ warn "**Deprecation Warning**: the `Page_Methods#secure_injection`, `Page_Methods#make_unsecure` and `Page_Methods#make_secure` methods are deprecated. Use `Page_Methods#copy(true)` for safeguarding against font/resource conflicts when 'stamping' one PDF page over another."
26
27
  @secure_injection
27
28
  end
28
29
  # accessor (setter) for the secure_injection setting
29
30
  def secure_injection= safe
31
+ warn "**Deprecation Warning**: the `Page_Methods#secure_injection`, `Page_Methods#make_unsecure` and `Page_Methods#make_secure` methods are deprecated. Use `Page_Methods#copy(true)` for safeguarding against font/resource conflicts when 'stamping' one PDF page over another."
30
32
  @secure_injection = safe
31
33
  end
32
34
  # sets secure_injection to `true` and returns self, allowing for chaining methods
33
35
  def make_secure
36
+ warn "**Deprecation Warning**: the `Page_Methods#secure_injection`, `Page_Methods#make_unsecure` and `Page_Methods#make_secure` methods are deprecated. Use `Page_Methods#copy(true)` for safeguarding against font/resource conflicts when 'stamping' one PDF page over another."
34
37
  @secure_injection = true
35
38
  self
36
39
  end
37
40
  # sets secure_injection to `false` and returns self, allowing for chaining methods
38
41
  def make_unsecure
42
+ warn "**Deprecation Warning**: the `Page_Methods#secure_injection`, `Page_Methods#make_unsecure` and `Page_Methods#make_secure` methods are deprecated. Use `Page_Methods#copy(true)` for safeguarding against font/resource conflicts when 'stamping' one PDF page over another."
39
43
  @secure_injection = false
40
44
  self
41
45
  end
@@ -36,7 +36,7 @@ module CombinePDF
36
36
  # the info and root objects, as found (if found) in the PDF file.
37
37
  #
38
38
  # they are mainly used to know if the file is (was) encrypted and to get more details.
39
- attr_reader :info_object, :root_object
39
+ attr_reader :info_object, :root_object, :names_object
40
40
 
41
41
  # when creating a parser, it is important to set the data (String) we wish to parse.
42
42
  #
@@ -53,6 +53,8 @@ module CombinePDF
53
53
  @references = []
54
54
  @root_object = {}
55
55
  @info_object = {}
56
+ @names_object = {}
57
+ @strings_dictionary = {} # all strings are one string
56
58
  @version = nil
57
59
  @scanner = nil
58
60
  end
@@ -113,6 +115,9 @@ module CombinePDF
113
115
  end
114
116
  end
115
117
 
118
+ # Strings were unified, we can let them go..
119
+ @strings_dictionary.clear
120
+
116
121
 
117
122
  # serialize_objects_and_references.catalog_pages
118
123
 
@@ -179,10 +184,10 @@ module CombinePDF
179
184
  # need to remove end of stream
180
185
  if out.last.is_a? Hash
181
186
  # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
182
- out.last[:raw_stream_content] = str.gsub(/[\n\r]?[\n\r]endstream\z/, "")
187
+ out.last[:raw_stream_content] = unify_string str.sub(/[\n\r]?[\n\r]endstream\z/, "").force_encoding(Encoding::ASCII_8BIT)
183
188
  else
184
189
  warn "Stream not attached to dictionary!"
185
- out << str[0...-10].force_encoding(Encoding::ASCII_8BIT)
190
+ out << str.sub(/[\n\r]?[\n\r]endstream\z/, "").force_encoding(Encoding::ASCII_8BIT)
186
191
  end
187
192
  ##########################################
188
193
  ## parse an Object after finished
@@ -199,7 +204,7 @@ module CombinePDF
199
204
  ##########################################
200
205
  when str = @scanner.scan(/<[0-9a-fA-F]+>/)
201
206
  # warn "Found a hex string"
202
- out << [str[1..-2]].pack('H*')
207
+ out << unify_string([str[1..-2]].pack('H*').force_encoding(Encoding::ASCII_8BIT))
203
208
  ##########################################
204
209
  ## parse a Literal String
205
210
  ##########################################
@@ -208,14 +213,20 @@ module CombinePDF
208
213
  str = ''.force_encoding(Encoding::ASCII_8BIT)
209
214
  count = 1
210
215
  while count > 0 && @scanner.rest? do
211
- str += @scanner.scan_until(/[\(\)]/).to_s
216
+ scn = @scanner.scan_until(/[\(\)]/)
217
+ unless scn
218
+ warn "Unknown error parsing string at #{@scanner.pos} for string: #{str}!"
219
+ count = 0 # error
220
+ next
221
+ end
222
+
223
+ str += scn.to_s
212
224
  seperator_count = 0
213
225
  seperator_count += 1 while str[-2-seperator_count] == "\\"
214
226
 
215
227
  case str[-1]
216
228
  when '('
217
- ## The following solution fails when (string ends with this sign: \\)
218
-
229
+ ## The following solution might fail when (string ends with this sign: \\)
219
230
  count += 1 unless seperator_count.odd?
220
231
  when ')'
221
232
  count -= 1 unless seperator_count.odd?
@@ -276,7 +287,7 @@ module CombinePDF
276
287
  str << str_bytes.shift
277
288
  end
278
289
  end
279
- out << str.pack('C*').force_encoding(Encoding::ASCII_8BIT)
290
+ out << unify_string(str.pack('C*').force_encoding(Encoding::ASCII_8BIT))
280
291
  ##########################################
281
292
  ## Parse a comment
282
293
  ##########################################
@@ -368,7 +379,7 @@ module CombinePDF
368
379
 
369
380
 
370
381
  # resets cataloging and pages
371
- def catalog_pages(catalogs = nil, secure_injection = false, inheritance_hash = {})
382
+ def catalog_pages(catalogs = nil, inheritance_hash = {})
372
383
  unless catalogs
373
384
 
374
385
  if root_object[:Root]
@@ -383,11 +394,11 @@ module CombinePDF
383
394
  end
384
395
  case
385
396
  when catalogs.is_a?(Array)
386
- catalogs.each {|c| catalog_pages(c, secure_injection, inheritance_hash ) unless c.nil?}
397
+ catalogs.each {|c| catalog_pages(c, inheritance_hash ) unless c.nil?}
387
398
  when catalogs.is_a?(Hash)
388
399
  if catalogs[:is_reference_only]
389
400
  if catalogs[:referenced_object]
390
- catalog_pages(catalogs[:referenced_object], secure_injection, inheritance_hash)
401
+ catalog_pages(catalogs[:referenced_object], inheritance_hash)
391
402
  else
392
403
  warn "couldn't follow reference!!! #{catalogs} not found!"
393
404
  end
@@ -424,11 +435,11 @@ module CombinePDF
424
435
  catalogs[:Rotate] = catalogs[:Rotate][:referenced_object][:indirect_without_dictionary] if catalogs[:Rotate].is_a?(Hash) && catalogs[:Rotate][:referenced_object].is_a?(Hash) && catalogs[:Rotate][:referenced_object][:indirect_without_dictionary]
425
436
 
426
437
  catalogs.instance_eval {extend Page_Methods}
427
- catalogs.secure_injection = secure_injection
428
438
  when :Pages
429
- catalog_pages(catalogs[:Kids], secure_injection, inheritance_hash.dup ) unless catalogs[:Kids].nil?
439
+ catalog_pages(catalogs[:Kids], inheritance_hash.dup ) unless catalogs[:Kids].nil?
430
440
  when :Catalog
431
- catalog_pages(catalogs[:Pages], secure_injection, inheritance_hash.dup ) unless catalogs[:Pages].nil?
441
+ @names_object.update( (catalogs[:Names][:referenced_object] || catalogs[:Names]), &self.class.method(:hash_update_proc_for_new) ) if catalogs[:Names]
442
+ catalog_pages(catalogs[:Pages], inheritance_hash.dup ) unless catalogs[:Pages].nil?
432
443
  end
433
444
  end
434
445
  end
@@ -473,11 +484,11 @@ module CombinePDF
473
484
  obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
474
485
  end
475
486
  self
476
- # rescue => e
477
- # puts (@parsed.select {|o| !o.is_a?(Hash)})
478
- # puts (@parsed)
479
- # puts (@references)
480
- # raise e
487
+ end
488
+
489
+ # All Strings are one String
490
+ def unify_string str
491
+ @strings_dictionary[str] ||= str
481
492
  end
482
493
 
483
494
  # @private
@@ -25,15 +25,15 @@ module CombinePDF
25
25
  # this function adds the references contained in "object", but DOESN'T add the object itself.
26
26
  #
27
27
  # this is used for internal operations, such as injecting data using the << operator.
28
- def add_referenced(object)
28
+ def add_referenced(object, dup_pages = true)
29
29
  # add references but not root
30
30
  case
31
31
  when object.is_a?(Array)
32
- object.each {|it| add_referenced(it)}
32
+ object.each {|it| add_referenced(it, dup_pages)}
33
33
  return true
34
34
  when object.is_a?(Hash)
35
35
  # first if statement is actually a workaround for a bug in Acrobat Reader, regarding duplicate pages.
36
- if object[:is_reference_only] && object[:referenced_object] && object[:referenced_object].is_a?(Hash) && object[:referenced_object][:Type] == :Page
36
+ if dup_pages && object[:is_reference_only] && object[:referenced_object] && object[:referenced_object].is_a?(Hash) && object[:referenced_object][:Type] == :Page
37
37
  if @objects.find_index object[:referenced_object]
38
38
  @objects << (object[:referenced_object] = object[:referenced_object].dup)
39
39
  else
@@ -49,6 +49,8 @@ module CombinePDF
49
49
  # stop this path, there is no need to run over the Hash's keys and values
50
50
  return true
51
51
  else
52
+ # stop if page propagation is false
53
+ return true if !dup_pages && object[:referenced_object][:Type] == :Page
52
54
  # @objects.include? object[:referenced_object] is bound to be false
53
55
  # the object wasn't found - add it to the @objects array
54
56
  @objects << object[:referenced_object]
@@ -56,8 +58,8 @@ module CombinePDF
56
58
 
57
59
  end
58
60
  object.each do |k, v|
59
- add_referenced(v) unless k == :Parent
60
- end
61
+ add_referenced(v, dup_pages) unless k == :Parent
62
+ end
61
63
  else
62
64
  return false
63
65
  end
@@ -83,8 +85,10 @@ module CombinePDF
83
85
  # build new Pages object
84
86
  pages_object = {Type: :Pages, Count: page_list.length, Kids: page_list.map {|p| {referenced_object: p, is_reference_only: true} } }
85
87
 
88
+ # rebuild/rename the names dictionary
89
+ rebuild_names
86
90
  # build new Catalog object
87
- catalog_object = {Type: :Catalog, Pages: {referenced_object: pages_object, is_reference_only: true} }
91
+ catalog_object = {Type: :Catalog, Pages: {referenced_object: pages_object, is_reference_only: true}, Names: {referenced_object: @names, is_reference_only: true} }
88
92
  catalog_object[:ViewerPreferences] = @viewer_preferences unless @viewer_preferences.empty?
89
93
 
90
94
  # point old Pages pointers to new Pages object
@@ -103,17 +107,22 @@ module CombinePDF
103
107
  catalog_object
104
108
  end
105
109
 
110
+ def names_object
111
+ @names
112
+ end
113
+
106
114
  # @private
107
115
  # this is an alternative to the rebuild_catalog catalog method
108
116
  # this method is used by the to_pdf method, for streamlining the PDF output.
109
117
  # there is no point is calling the method before preparing the output.
110
118
  def rebuild_catalog_and_objects
111
119
  catalog = rebuild_catalog
112
- @objects = []
120
+ @objects.clear
113
121
  @objects << @info
114
122
  add_referenced @info
115
123
  @objects << catalog
116
- add_referenced catalog
124
+ add_referenced catalog[:Pages]
125
+ add_referenced catalog[:Names], false
117
126
  catalog
118
127
  end
119
128
 
@@ -138,6 +147,70 @@ module CombinePDF
138
147
  @objects.each {|obj| obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)}
139
148
  end
140
149
 
150
+ def rebuild_names name_tree = nil, base = "CombinePDF_0000000"
151
+ if name_tree
152
+ dic = []
153
+ case name_tree
154
+ when Array
155
+ if name_tree[0].is_a? String
156
+ (name_tree.length/2).times do |i|
157
+ dic << (name_tree[i*2].clear << base.next!)
158
+ dic << name_tree[(i*2) + 1]
159
+ end
160
+ else
161
+ name_tree.each {|kid| dic.concat rebuild_names(kid, base) }
162
+ end
163
+ when Hash
164
+ if name_tree[:Kids]
165
+ dic.concat rebuild_names(name_tree[:Kids], base)
166
+ elsif name_tree[:Names]
167
+ dic.concat rebuild_names(name_tree[:Names], base)
168
+ elsif name_tree[:referenced_object]
169
+ dic.concat rebuild_names(name_tree[:referenced_object], base)
170
+ end
171
+ end
172
+ return dic
173
+ end
174
+ @names.keys.each do |k|
175
+ @names[k] = {referenced_object: { Names: rebuild_names(@names[k], base) } , is_reference_only: true} unless k == :Type
176
+ end
177
+ end
178
+
179
+ # @private
180
+ # this method reviews a Hash and updates it by merging Hash data,
181
+ # preferring the new over the old.
182
+ def self.hash_merge_new_no_page key, old_data, new_data
183
+ if old_data.is_a? Hash
184
+ return old_data if old_data[:Type] == :Page
185
+ old_data.merge( new_data, &( @hash_merge_new_no_page_proc ||= self.method(:hash_merge_new_no_page) ) )
186
+ elsif old_data.is_a? Array
187
+ old_data + new_data
188
+ else
189
+ new_data
190
+ end
191
+ end
192
+
193
+
194
+ private
195
+
196
+ def renaming_dictionary object = nil, dictionary = {}
197
+ object ||= @names
198
+ case object
199
+ when Array
200
+ object.length.times {|i| object[i].is_a?(String) ? (dictionary[object[i]] = (dictionary.last || "Random_0001").next) : renaming_dictionary(object[i], dictionary) }
201
+ when Hash
202
+ object.values.each {|v| renaming_dictionary v, dictionary }
203
+ end
204
+ end
205
+
206
+ def rename_object object, dictionary
207
+ case object
208
+ when Array
209
+ object.length.times {|i| }
210
+ when Hash
211
+ end
212
+ end
213
+
141
214
  end
142
215
  end
143
216
 
@@ -103,6 +103,7 @@ module CombinePDF
103
103
  # set data from parser
104
104
  @version = parser.version if parser.version.is_a? Float
105
105
  @info = parser.info_object || {}
106
+ @names = parser.names_object || {}
106
107
 
107
108
  # general globals
108
109
  @set_start_id = 1
@@ -293,6 +294,7 @@ module CombinePDF
293
294
  if data.is_a? PDF
294
295
  @version = [@version, data.version].max
295
296
  pages_to_add = data.pages
297
+ @names.update data.names_object, &self.class.method(:hash_merge_new_no_page)
296
298
  elsif data.is_a?(Array) && (data.select {|o| !(o.is_a?(Hash) && o[:Type] == :Page) } ).empty?
297
299
  pages_to_add = data
298
300
  elsif data.is_a?(Hash) && data[:Type] == :Page
@@ -1,4 +1,4 @@
1
1
  module CombinePDF
2
- VERSION = "0.2.6"
2
+ VERSION = "0.2.7"
3
3
  end
4
4
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: combine_pdf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.2.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Boaz Segev
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-08 00:00:00.000000000 Z
11
+ date: 2015-10-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-rc4
@@ -74,7 +74,6 @@ files:
74
74
  - lib/combine_pdf/decrypt.rb
75
75
  - lib/combine_pdf/filter.rb
76
76
  - lib/combine_pdf/fonts.rb
77
- - lib/combine_pdf/operations.rb
78
77
  - lib/combine_pdf/page_methods.rb
79
78
  - lib/combine_pdf/parser.rb
80
79
  - lib/combine_pdf/pdf_protected.rb
@@ -108,3 +107,4 @@ specification_version: 4
108
107
  summary: Combine, stamp and watermark PDF files in pure Ruby.
109
108
  test_files:
110
109
  - test/console
110
+ has_rdoc:
@@ -1,416 +0,0 @@
1
- module CombinePDF
2
-
3
- ################################################################
4
- ## These are common functions, used within the different classes
5
- ## These functions aren't open to the public.
6
- ################################################################
7
-
8
-
9
-
10
- # holds a simple content stream that starts a PDF graphic state container - used for wrapping malformed PDF content streams.
11
- CONTENT_CONTAINER_START = { is_reference_only: true , referenced_object: {indirect_reference_id: 0, raw_stream_content: 'q'} }
12
- # holds a simple content stream that ends a PDF graphic state container - used for wrapping malformed PDF content streams.
13
- CONTENT_CONTAINER_MIDDLE = { is_reference_only: true , referenced_object: {indirect_reference_id: 0, raw_stream_content: "Q\nq"} }
14
- # holds a simple content stream that ends a PDF graphic state container - used for wrapping malformed PDF content streams.
15
- CONTENT_CONTAINER_END = { is_reference_only: true , referenced_object: {indirect_reference_id: 0, raw_stream_content: 'Q'} }
16
-
17
- # @private
18
- # @!visibility private
19
- #:nodoc: all
20
-
21
- protected
22
-
23
- # @!visibility private
24
-
25
- # This is an internal class. you don't need it.
26
- module PDFOperations
27
-
28
- module_function
29
-
30
- # @!visibility private
31
-
32
- def inject_to_page page = {Type: :Page, MediaBox: [0,0,612.0,792.0], Resources: {}, Contents: []}, stream = nil, top = true
33
- # make sure both the page reciving the new data and the injected page are of the correct data type.
34
- return false unless page.is_a?(Hash) && stream.is_a?(Hash)
35
-
36
- # following the reference chain and assigning a pointer to the correct Resouces object.
37
- # (assignments of Strings, Arrays and Hashes are pointers in Ruby, unless the .dup method is called)
38
- page[:Resources] ||= {}
39
- original_resources = page[:Resources]
40
- if original_resources[:is_reference_only]
41
- original_resources = original_resources[:referenced_object]
42
- raise "Couldn't tap into resources dictionary, as it is a reference and isn't linked." unless original_resources
43
- end
44
- original_contents = page[:Contents]
45
- original_contents = [original_contents] unless original_contents.is_a? Array
46
-
47
- stream[:Resources] ||= {}
48
- stream_resources = stream[:Resources]
49
- if stream_resources[:is_reference_only]
50
- stream_resources = stream_resources[:referenced_object]
51
- raise "Couldn't tap into resources dictionary, as it is a reference and isn't linked." unless stream_resources
52
- end
53
- stream_contents = stream[:Contents]
54
- stream_contents = [stream_contents] unless stream_contents.is_a? Array
55
-
56
- # collect keys as objects - this is to make sure that
57
- # we are working on the actual resource data, rather then references
58
- flatten_resources_dictionaries stream_resources
59
- flatten_resources_dictionaries original_resources
60
-
61
- # injecting each of the values in the injected Page
62
- stream_resources.each do |key, new_val|
63
- unless PRIVATE_HASH_KEYS.include? key # keep CombinePDF structual data intact.
64
- if original_resources[key].nil?
65
- original_resources[key] = new_val
66
- elsif original_resources[key].is_a?(Hash) && new_val.is_a?(Hash)
67
- new_val.update original_resources[key] # make sure the old values are respected
68
- original_resources[key].update new_val # transfer old and new values to the injected page
69
- end #Do nothing if array - ot is the PROC array, which is an issue
70
- end
71
- end
72
- original_resources[:ProcSet] = [:PDF, :Text, :ImageB, :ImageC, :ImageI] # this was recommended by the ISO. 32000-1:2008
73
-
74
- if top # if this is a stamp (overlay)
75
- page[:Contents] = original_contents
76
- page[:Contents].unshift create_deep_copy(CONTENT_CONTAINER_START)
77
- page[:Contents].push create_deep_copy(CONTENT_CONTAINER_MIDDLE)
78
- page[:Contents].push *stream_contents
79
- page[:Contents].push create_deep_copy(CONTENT_CONTAINER_END)
80
- else #if this was a watermark (underlay? would be lost if the page was scanned, as white might not be transparent)
81
- page[:Contents] = stream_contents
82
- page[:Contents].unshift create_deep_copy(CONTENT_CONTAINER_START)
83
- page[:Contents].push create_deep_copy(CONTENT_CONTAINER_MIDDLE)
84
- page[:Contents].push *original_contents
85
- page[:Contents].push create_deep_copy(CONTENT_CONTAINER_END)
86
- end
87
-
88
- page
89
- end
90
- # copy_and_secure_for_injection(page)
91
- # - page is a page in the pages array, i.e.
92
- # pdf.pages[0]
93
- # takes a page object and:
94
- #
95
- # makes a deep copy of the page (Ruby defaults to pointers, so this will copy the memory).
96
- #
97
- # then it will rewrite the content stream with renamed resources, so as to avoid name conflicts.
98
- def copy_and_secure_for_injection(page)
99
- # copy page
100
- new_page = create_deep_copy page
101
-
102
- # initiate dictionary from old names to new names
103
- names_dictionary = {}
104
-
105
- # itirate through all keys that are name objects and give them new names (add to dic)
106
- # this should be done for every dictionary in :Resources
107
- # this is a few steps stage:
108
-
109
- # 1. get resources object
110
- resources = new_page[:Resources]
111
- if resources[:is_reference_only]
112
- resources = resources[:referenced_object]
113
- raise "Couldn't tap into resources dictionary, as it is a reference and isn't linked." unless resources
114
- end
115
-
116
- # 2. establich direct access to dictionaries and remove reference values
117
- flatten_resources_dictionaries resources
118
-
119
- # 3. travel every dictionary to pick up names (keys), change them and add them to the dictionary
120
- resources.each do |k,v|
121
- if v.is_a?(Hash)
122
- new_dictionary = {}
123
- new_name = "Combine" + SecureRandom.hex(7) + "PDF"
124
- i = 1
125
- v.each do |old_key, value|
126
- new_key = (new_name + i.to_s).to_sym
127
- names_dictionary[old_key] = new_key
128
- new_dictionary[new_key] = value
129
- i += 1
130
- end
131
- resources[k] = new_dictionary
132
- end
133
- end
134
-
135
- # now that we have replaced the names in the resources dictionaries,
136
- # it is time to replace the names inside the stream
137
- # we will need to make sure we have access to the stream injected
138
- # we will user PDFFilter.inflate_object
139
- (new_page[:Contents].is_a?(Array) ? new_page[:Contents] : [new_page[:Contents] ]).each do |c|
140
- stream = c[:referenced_object]
141
- PDFFilter.inflate_object stream
142
- names_dictionary.each do |old_key, new_key|
143
- stream[:raw_stream_content].gsub! _object_to_pdf(old_key), _object_to_pdf(new_key) ##### PRAY(!) that the parsed datawill be correctly reproduced!
144
- end
145
- # patch back to PDF defaults, for OCRed PDF files.
146
- # stream[:raw_stream_content] = "q\nq\nq\nDeviceRGB CS\nDeviceRGB cs\n0 0 0 rg\n0 0 0 RG\n0 Tr\n%s\nQ\nQ\nQ\n" % stream[:raw_stream_content]
147
- # the following was removed for Acrobat Reader compatability: DeviceRGB CS\nDeviceRGB cs\n
148
- stream[:raw_stream_content] = "q\nq\nq\n0 0 0 rg\n0 0 0 RG\n0 Tr\n1 0 0 1 0 0 cm\n%s\nQ\nQ\nQ\n" % stream[:raw_stream_content]
149
- end
150
-
151
- new_page
152
- end
153
- def flatten_resources_dictionaries(resources)
154
- resources.each do |k,v|
155
- if v.is_a?(Hash) && v[:is_reference_only]
156
- if v[:referenced_object]
157
- resources[k] = resources[k][:referenced_object].dup
158
- resources[k].delete(:indirect_reference_id)
159
- resources[k].delete(:indirect_generation_number)
160
- elsif v[:indirect_without_dictionary]
161
- resources[k] = resources[k][:indirect_without_dictionary]
162
- end
163
- end
164
- end
165
- end
166
-
167
- # returns the PDF Object Hash holding the acutal data (if exists) or the original hash (if it wasn't a reference)
168
- #
169
- # works only AFTER references have been connected.
170
- def get_referenced object
171
- object[:referenced_object] || object
172
- end
173
-
174
-
175
- # Ruby normally assigns pointes.
176
- # noramlly:
177
- # a = [1,2,3] # => [1,2,3]
178
- # b = a # => [1,2,3]
179
- # a << 4 # => [1,2,3,4]
180
- # b # => [1,2,3,4]
181
- # This method makes sure that the memory is copied instead of a pointer assigned.
182
- # this works using recursion, so that arrays and hashes within arrays and hashes are also copied and not pointed to.
183
- # One needs to be careful of infinit loops using this function.
184
- def create_deep_copy object
185
- if object.is_a?(Array)
186
- return object.map { |e| create_deep_copy e }
187
- elsif object.is_a?(Hash)
188
- return {}.tap {|out| object.each {|k,v| out[create_deep_copy(k)] = create_deep_copy(v) unless k == :Parent} }
189
- elsif object.is_a?(String)
190
- return object.dup
191
- else
192
- return object # objects that aren't Strings, Arrays or Hashes (such as Symbols and Fixnums) won't be edited inplace.
193
- end
194
- end
195
- # removes id and generation number values, for better comparrison
196
- # and avoiding object duplication
197
- # objects:: one or more objects in a PDF file/page.
198
- def remove_old_ids objects
199
- _each_object(objects) {|obj| obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)}
200
- end
201
- def get_refernced_object(objects_array = [], reference_hash = {})
202
- objects_array.each do |stored_object|
203
- return stored_object if ( stored_object.is_a?(Hash) &&
204
- reference_hash[:indirect_reference_id] == stored_object[:indirect_reference_id] &&
205
- reference_hash[:indirect_generation_number] == stored_object[:indirect_generation_number] )
206
- end
207
- warn "didn't find reference #{reference_hash}"
208
- nil
209
- end
210
- def change_references_to_actual_values(objects_array = [], hash_with_references = {})
211
- hash_with_references.each do |k,v|
212
- if v.is_a?(Hash) && v[:is_reference_only]
213
- hash_with_references[k] = PDFOperations.get_refernced_object( objects_array, v)
214
- hash_with_references[k] = hash_with_references[k][:indirect_without_dictionary] if hash_with_references[k].is_a?(Hash) && hash_with_references[k][:indirect_without_dictionary]
215
- warn "Couldn't connect all values from references - didn't find reference #{hash_with_references}!!!" if hash_with_references[k] == nil
216
- hash_with_references[k] = v unless hash_with_references[k]
217
- end
218
- end
219
- hash_with_references
220
- end
221
- def change_connected_references_to_actual_values(hash_with_references = {})
222
- if hash_with_references.is_a?(Hash)
223
- hash_with_references.each do |k,v|
224
- if v.is_a?(Hash) && v[:is_reference_only]
225
- if v[:indirect_without_dictionary]
226
- hash_with_references[k] = v[:indirect_without_dictionary]
227
- elsif v[:referenced_object]
228
- hash_with_references[k] = v[:referenced_object]
229
- else
230
- raise "Cannot change references to values, as they are disconnected!"
231
- end
232
- end
233
- end
234
- hash_with_references.each {|k, v| change_connected_references_to_actual_values(v) if v.is_a?(Hash) || v.is_a?(Array)}
235
- elsif hash_with_references.is_a?(Array)
236
- hash_with_references.each {|item| change_connected_references_to_actual_values(item) if item.is_a?(Hash) || item.is_a?(Array)}
237
- end
238
- hash_with_references
239
- end
240
- def connect_references_and_actual_values(objects_array = [], hash_with_references = {})
241
- ret = true
242
- hash_with_references.each do |k,v|
243
- if v.is_a?(Hash) && v[:is_reference_only]
244
- ref_obj = PDFOperations.get_refernced_object( objects_array, v)
245
- hash_with_references[k] = ref_obj[:indirect_without_dictionary] if ref_obj.is_a?(Hash) && ref_obj[:indirect_without_dictionary]
246
- ret = false
247
- end
248
- end
249
- ret
250
- end
251
-
252
-
253
- def _each_object(object, limit_references = true, first_call = true, &block)
254
- # #####################
255
- # ## v.1.2 needs optimazation
256
- # case
257
- # when object.is_a?(Array)
258
- # object.each {|obj| _each_object(obj, limit_references, &block)}
259
- # when object.is_a?(Hash)
260
- # yield(object)
261
- # object.each do |k,v|
262
- # unless (limit_references && k == :referenced_object)
263
- # unless k == :Parent
264
- # _each_object(v, limit_references, &block)
265
- # end
266
- # end
267
- # end
268
- # end
269
- #####################
270
- ## v.2.1 needs optimazation
271
- ## version 2.1 is slightly faster then v.1.2
272
- @already_visited = [] if first_call
273
- unless limit_references
274
- @already_visited << object.object_id
275
- end
276
- case
277
- when object.is_a?(Array)
278
- object.each {|obj| _each_object(obj, limit_references, false, &block)}
279
- when object.is_a?(Hash)
280
- yield(object)
281
- unless limit_references && object[:is_reference_only]
282
- object.each do |k,v|
283
- _each_object(v, limit_references, false, &block) unless @already_visited.include? v.object_id
284
- end
285
- end
286
- end
287
- end
288
-
289
-
290
-
291
- # Formats an object into PDF format. This is used my the PDF object to format the PDF file and it is used in the secure injection which is still being developed.
292
- def _object_to_pdf object
293
- case
294
- when object.nil?
295
- return "null"
296
- when object.is_a?(String)
297
- return _format_string_to_pdf object
298
- when object.is_a?(Symbol)
299
- return _format_name_to_pdf object
300
- when object.is_a?(Array)
301
- return _format_array_to_pdf object
302
- when object.is_a?(Fixnum), object.is_a?(Float), object.is_a?(TrueClass), object.is_a?(FalseClass)
303
- return object.to_s + " "
304
- when object.is_a?(Hash)
305
- return _format_hash_to_pdf object
306
- else
307
- return ''
308
- end
309
- end
310
-
311
- def _format_string_to_pdf(object)
312
- if @string_output == :literal #if format is set to Literal
313
- #### can be better...
314
- replacement_hash = {
315
- "\x0A" => "\\n",
316
- "\x0D" => "\\r",
317
- "\x09" => "\\t",
318
- "\x08" => "\\b",
319
- "\xFF" => "\\f",
320
- "\x28" => "\\(",
321
- "\x29" => "\\)",
322
- "\x5C" => "\\\\"
323
- }
324
- 32.times {|i| replacement_hash[i.chr] ||= "\\#{i}"}
325
- (256-128).times {|i| replacement_hash[(i + 127).chr] ||= "\\#{i+127}"}
326
- ("(" + ([].tap {|out| object.bytes.each {|byte| replacement_hash[ byte.chr ] ? (replacement_hash[ byte.chr ].bytes.each {|b| out << b}) : out << byte } }).pack('C*') + ")").force_encoding(Encoding::ASCII_8BIT)
327
- else
328
- # A hexadecimal string shall be written as a sequence of hexadecimal digits (0–9 and either A–F or a–f)
329
- # encoded as ASCII characters and enclosed within angle brackets (using LESS-THAN SIGN (3Ch) and GREATER- THAN SIGN (3Eh)).
330
- ("<" + object.unpack('H*')[0] + ">").force_encoding(Encoding::ASCII_8BIT)
331
- end
332
- end
333
# Formats a Ruby Symbol (anything responding to +to_s+) as a PDF name object.
#
# The name is introduced by a SOLIDUS (/), which is not part of the name
# itself. Each byte of the name is then written as follows:
# * bytes 0-15 become "#0" followed by one hexadecimal digit;
# * bytes 16-32, bytes 127 and above, the NUMBER SIGN (#) and the delimiter
#   characters % ( ) / < > [ ] { } become "#" followed by their hexadecimal
#   code;
# * every other byte is written as itself.
def _format_name_to_pdf(object)
  escaped = object.to_s.each_byte.map do |code|
    if code < 16
      format('#0%x', code)
    elsif code <= 32 || code >= 127 || [35, 37, 40, 41, 47, 60, 62, 91, 93, 123, 125].include?(code)
      format('#%x', code)
    else
      code.chr
    end
  end
  '/' + escaped.join
end
355
# Formats a Ruby Array as a PDF array object, returned as a binary
# (ASCII-8BIT) String.
#
# An array is written as a sequence of objects enclosed in SQUARE BRACKETS.
# Each item is rendered with _object_to_pdf and the items are separated by a
# single space, e.g. [549 3.14 false (Ralph) /SomeName]
def _format_array_to_pdf(object)
  rendered = object.map { |element| _object_to_pdf(element) }
  "[#{rendered.join(' ')}]".force_encoding(Encoding::ASCII_8BIT)
end
361
-
362
# Formats a Ruby Hash as PDF source, returned as a binary (ASCII-8BIT) String.
# The Hash may describe a reference, an indirect object, a dictionary or a
# stream; the control keys read below decide which form is written.
#
# * :is_reference_only - only "<id> <generation> R" is written. When
#   :referenced_object is a Hash, its id and generation number are first
#   copied onto this Hash; missing values default to 0.
# * :indirect_reference_id - the output is wrapped in "<id> <generation> obj"
#   and "endobj".
# * :indirect_without_dictionary - with an indirect id, this value alone is
#   rendered as the object's content.
# * :raw_stream_content - :Length is set to the content's byte size and the
#   content is appended between "stream" and "endstream".
#
# All remaining pairs are written as a dictionary enclosed in double angle
# brackets (<<...>>); keys listed in PRIVATE_HASH_KEYS are skipped.
#
# Note that the Hash is updated in place (reference ids, generation number
# and :Length).
def _format_hash_to_pdf(object)
  binary = Encoding::ASCII_8BIT

  # A pure reference: sync the ids with the referenced object and stop here.
  if object[:is_reference_only]
    target = object[:referenced_object]
    if target.is_a?(Hash)
      object[:indirect_reference_id] = target[:indirect_reference_id]
      object[:indirect_generation_number] = target[:indirect_generation_number]
    end
    object[:indirect_reference_id] ||= 0
    object[:indirect_generation_number] ||= 0
    return "#{object[:indirect_reference_id]} #{object[:indirect_generation_number]} R".force_encoding(binary)
  end

  pieces = []

  # Indirect objects open with an "obj" header line.
  if object[:indirect_reference_id]
    object[:indirect_generation_number] ||= 0
    pieces << "#{object[:indirect_reference_id]} #{object[:indirect_generation_number]} obj\n".force_encoding(binary)
    bare_content = object[:indirect_without_dictionary]
    if bare_content
      pieces << _object_to_pdf(bare_content) << "\nendobj\n"
      return pieces.join.force_encoding(binary)
    end
  end

  # The stream length must be corrected before the dictionary is written,
  # so that the :Length entry is part of the output.
  stream = object[:raw_stream_content]
  object[:Length] = stream.bytesize if stream

  pieces << "<<\n".force_encoding(binary)
  object.each do |key, value|
    next if PRIVATE_HASH_KEYS.include? key

    pieces << "#{_object_to_pdf key} #{_object_to_pdf value}\n".force_encoding(binary)
  end
  pieces << ">>".force_encoding(binary)
  pieces << "\nstream\n#{stream}\nendstream".force_encoding(binary) if stream
  pieces << "\nendobj\n" if object[:indirect_reference_id]
  pieces.join.force_encoding(binary)
end
403
- end
404
- end
405
-
406
- #########################################################
407
- # this file is part of the CombinePDF library and the code
408
- # is subject to the same license (MIT).
409
- #########################################################
410
- # PDF object types cross reference:
411
- # Indirect objects, references, dictionaries and streams are Hash
412
- # arrays are Array
413
- # strings are String
414
- # names are Symbols (String.to_sym)
415
- # numbers are Fixnum or Float
416
- # boolean are TrueClass or FalseClass