combine_pdf 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -50,7 +50,14 @@ module CombinePDF
50
50
  # prepare PNG group
51
51
  end
52
52
  else
53
- object[:raw_stream_content] = Zlib::Inflate.inflate object[:raw_stream_content]
53
+ inflator = Zlib::Inflate.new
54
+
55
+ object[:raw_stream_content] = inflator.inflate object[:raw_stream_content]
56
+ begin
57
+ inflator.finish
58
+ rescue
59
+ end
60
+ inflator.close
54
61
  object[:Length] = object[:raw_stream_content].bytesize
55
62
  end
56
63
  when nil
@@ -0,0 +1,381 @@
1
+ module CombinePDF
2
+
3
+ #:nodoc: all
4
+ ################################################################
5
+ ## These are common functions, used within the different classes
6
+ ## These functions aren't open to the public.
7
+ ################################################################
8
+ #@private
9
# Keys that CombinePDF stores on object Hashes for its own bookkeeping.
# Code in this file skips them when merging resources and when writing dictionaries.
PRIVATE_HASH_KEYS = [:indirect_reference_id, :indirect_generation_number, :raw_stream_content, :is_reference_only, :referenced_object, :indirect_without_dictionary]
#@private
# Maps the byte that follows a backslash inside a PDF literal string
# to the byte value that the escape sequence stands for.
LITERAL_STRING_REPLACEMENT_HASH = {
  110 => 10, # "\\n".bytes = [92, 110]  "\n".ord = 10
  114 => 13, # r - carriage return
  116 => 9,  # t - horizontal tab
  98 => 8,   # b - backspace
  102 => 12, # f - form feed is 0x0C (it was wrongly mapped to 255)
  40 => 40,  # (
  41 => 41,  # )
  92 => 92   # \
}
21
+ #@private
22
+ #:nodoc: all
23
+ module PDFOperations
24
+ module_function
25
# Merges the resources and contents of +stream+ (a page-like Hash) into +page+.
#
# page:: the Hash that receives the data; it is modified in place.
# stream:: the Hash whose :Resources and :Contents are injected.
# top:: when true the injected contents are appended after the page's own contents (stamp),
#       otherwise they are placed before them (watermark).
#
# Returns false when either argument is not a Hash, otherwise returns +page+.
# Raises a RuntimeError when a :Resources entry is a reference without a :referenced_object.
def inject_to_page page = {Type: :Page, MediaBox: [0,0,612.0,792.0], Resources: {}, Contents: []}, stream = nil, top = true
  # both the receiving page and the injected page must be Hashes.
  return false unless page.is_a?(Hash) && stream.is_a?(Hash)

  # follows a reference (if any) to the actual resources dictionary.
  # The returned Hash is the live object, not a copy, so changes are seen by its owner.
  linked_resources = lambda do |owner|
    dictionary = owner[:Resources]
    if dictionary[:is_reference_only]
      dictionary = dictionary[:referenced_object]
      raise "Couldn't tap into resources dictionary, as it is a reference and isn't linked." unless dictionary
    end
    dictionary
  end
  # wraps a single contents object in an Array; Arrays are passed through untouched.
  as_list = ->(contents) { contents.is_a?(Array) ? contents : [contents] }

  page_resources = linked_resources.call(page)
  page_contents = as_list.call(page[:Contents])
  injected_resources = linked_resources.call(stream)
  injected_contents = as_list.call(stream[:Contents])

  # work on the resource data itself rather than on references to it.
  flatten_resources_dictionaries injected_resources
  flatten_resources_dictionaries page_resources

  injected_resources.each do |key, injected_value|
    next if PRIVATE_HASH_KEYS.include?(key) # keep CombinePDF structural data intact.
    existing = page_resources[key]
    if existing.nil?
      page_resources[key] = injected_value
    elsif existing.is_a?(Hash) && injected_value.is_a?(Hash)
      injected_value.update existing # entries already on the page win over injected ones
      existing.update injected_value # then the page dictionary receives the merged result
    end
    # anything else (i.e. the ProcSet Array) is left alone and set explicitly below.
  end
  page_resources[:ProcSet] = [:PDF, :Text, :ImageB, :ImageC, :ImageI] # this was recommended by the ISO. 32000-1:2008

  # stamp: page contents first; watermark: injected contents first.
  leading, trailing = top ? [page_contents, injected_contents] : [injected_contents, page_contents]
  page[:Contents] = leading
  page[:Contents].push(*trailing)

  page
end
75
+ # copy_and_secure_for_injection(page)
76
+ # - page is a page in the pages array, i.e.
77
+ # pdf.pages[0]
78
+ # takes a page object and:
79
+ #
80
+ # makes a deep copy of the page (Ruby defaults to pointers, so this will copy the memory).
81
+ #
82
+ # then it will rewrite the content stream with renamed resources, so as to avoid name conflicts.
83
# Returns a deep copy of +page+ in which every name inside the resource dictionaries
# was replaced by a random "CombinePDF..." name, with the content streams edited to match.
#
# Raises a RuntimeError when :Resources is a reference without a :referenced_object.
def copy_and_secure_for_injection(page)
  duplicate = create_deep_copy page

  # 1. get the resources object, following the reference if there is one
  resources = duplicate[:Resources]
  if resources[:is_reference_only]
    resources = resources[:referenced_object]
    raise "Couldn't tap into resources dictionary, as it is a reference and isn't linked." unless resources
  end

  # 2. establish direct access to the dictionaries and remove reference values
  flatten_resources_dictionaries resources

  # 3. give every name in every resource dictionary a fresh name, remembering old => new
  renames = {}
  resources.each do |category, entries|
    next unless entries.is_a?(Hash)
    resources[category] = entries.each_with_object({}) do |(old_name, value), renamed|
      fresh_name = ("CombinePDF" + SecureRandom.urlsafe_base64(9)).to_sym
      renames[old_name] = fresh_name
      renamed[fresh_name] = value
    end
  end

  # 4. replace the old names inside the (inflated) content streams.
  contents = duplicate[:Contents]
  contents = [contents] unless contents.is_a?(Array)
  contents.each do |content|
    stream = content[:referenced_object]
    PDFFilter.inflate_object stream
    renames.each do |old_name, fresh_name|
      # NOTE(review): this is a plain substring replacement on the raw stream, so an old name
      # that is a prefix of another name (/F1 vs /F10) would also be rewritten - confirm inputs.
      stream[:raw_stream_content].gsub! _object_to_pdf(old_name), _object_to_pdf(fresh_name)
    end
  end

  duplicate
end
131
# Replaces, in place, every reference stored directly in +resources+ with the data it points to.
#
# A linked reference is replaced by a shallow copy of its :referenced_object, stripped of
# the indirect id / generation keys; otherwise its :indirect_without_dictionary value is used.
# References carrying neither are left untouched. Returns +resources+.
def flatten_resources_dictionaries(resources)
  resources.each do |name, entry|
    next unless entry.is_a?(Hash) && entry[:is_reference_only]
    linked = entry[:referenced_object]
    if linked
      direct = linked.dup
      direct.delete(:indirect_reference_id)
      direct.delete(:indirect_generation_number)
      resources[name] = direct
    elsif entry[:indirect_without_dictionary]
      resources[name] = entry[:indirect_without_dictionary]
    end
  end
end
144
+
145
+
146
+ # Ruby normally assigns pointers.
147
+ # normally:
148
+ # a = [1,2,3] # => [1,2,3]
149
+ # b = a # => [1,2,3]
150
+ # a << 4 # => [1,2,3,4]
151
+ # b # => [1,2,3,4]
152
+ # This method makes sure that the memory is copied instead of a pointer assigned.
153
+ # this works using recursion, so that arrays and hashes within arrays and hashes are also copied and not pointed to.
154
+ # One needs to be careful of infinite loops when using this function.
155
# Recursively copies Arrays, Hashes and Strings; any other object is returned as is.
# Hash entries stored under the :Parent key are left out of the copy.
def create_deep_copy object
  case object
  when Array
    object.map { |element| create_deep_copy element }
  when Hash
    object.each_with_object({}) do |(key, value), copy|
      next if key == :Parent
      copy[create_deep_copy(key)] = create_deep_copy(value)
    end
  when String
    object.dup
  else
    object # Symbols, numbers and the like are handed back untouched.
  end
end
166
# Looks up, in +objects_array+, the first Hash whose indirect id and generation number
# both equal those of +reference_hash+.
#
# Returns that Hash, or nil (after printing a warning) when nothing matches.
def get_refernced_object(objects_array = [], reference_hash = {})
  match = objects_array.find do |candidate|
    candidate.is_a?(Hash) &&
      reference_hash[:indirect_reference_id] == candidate[:indirect_reference_id] &&
      reference_hash[:indirect_generation_number] == candidate[:indirect_generation_number]
  end
  return match if match
  warn "didn't find reference #{reference_hash}"
  nil
end
175
# Replaces, in place, every reference stored directly in +hash_with_references+ with the
# object found for it in +objects_array+ (or with that object's :indirect_without_dictionary
# value, when it has one). A reference that cannot be found is reported with a warning and
# kept as it was. Returns +hash_with_references+.
def change_references_to_actual_values(objects_array = [], hash_with_references = {})
  hash_with_references.each do |key, value|
    next unless value.is_a?(Hash) && value[:is_reference_only]
    resolved = PDFOperations.get_refernced_object(objects_array, value)
    resolved = resolved[:indirect_without_dictionary] if resolved.is_a?(Hash) && resolved[:indirect_without_dictionary]
    # store the outcome first, so a failed lookup shows up as nil in the warning below.
    hash_with_references[key] = resolved
    warn "Couldn't connect all values from references - didn't find reference #{hash_with_references}!!!" if resolved.nil?
    hash_with_references[key] = value unless resolved
  end
  hash_with_references
end
186
# Walks a Hash / Array structure and replaces, in place, every reference Hash found as a
# Hash value with its :indirect_without_dictionary value or, failing that, its :referenced_object.
#
# Returns the argument it was given.
# Raises a RuntimeError when a reference carries neither of the two.
def change_connected_references_to_actual_values(hash_with_references = {})
  case hash_with_references
  when Hash
    # first pass: swap the references held directly by this Hash.
    hash_with_references.each do |key, value|
      next unless value.is_a?(Hash) && value[:is_reference_only]
      actual = value[:indirect_without_dictionary] || value[:referenced_object]
      raise "Cannot change references to values, as they are disconnected!" unless actual
      hash_with_references[key] = actual
    end
    # second pass: descend into the (possibly just replaced) containers.
    hash_with_references.each_value do |value|
      change_connected_references_to_actual_values(value) if value.is_a?(Hash) || value.is_a?(Array)
    end
  when Array
    hash_with_references.each do |item|
      change_connected_references_to_actual_values(item) if item.is_a?(Hash) || item.is_a?(Array)
    end
  end
  hash_with_references
end
205
# Looks up every reference stored directly in +hash_with_references+ and, when the object
# found carries an :indirect_without_dictionary value, stores that value in the reference's place.
#
# Returns true when the Hash held no references at all, false otherwise.
def connect_references_and_actual_values(objects_array = [], hash_with_references = {})
  reference_free = true
  hash_with_references.each do |key, value|
    next unless value.is_a?(Hash) && value[:is_reference_only]
    found = PDFOperations.get_refernced_object(objects_array, value)
    inline_value = found.is_a?(Hash) && found[:indirect_without_dictionary]
    hash_with_references[key] = inline_value if inline_value
    reference_free = false
  end
  reference_free
end
216
+
217
+
218
# Yields every Hash found in +object+, descending through nested Arrays and Hashes.
#
# limit_references:: when true (default), the content of a reference Hash is not entered.
#                    When false, everything is entered and the object ids of visited objects
#                    are recorded in @already_visited, so a Hash value seen before is skipped.
# first_call:: internal flag; true resets @already_visited.
#
# (an earlier, slightly slower, version of this walk skipped :referenced_object and :Parent keys instead.)
def _each_object(object, limit_references = true, first_call = true, &block)
  @already_visited = [] if first_call
  @already_visited << object.object_id unless limit_references
  case object
  when Array
    object.each { |element| _each_object(element, limit_references, false, &block) }
  when Hash
    yield(object)
    unless limit_references && object[:is_reference_only]
      object.each do |_key, value|
        next if @already_visited.include?(value.object_id)
        _each_object(value, limit_references, false, &block)
      end
    end
  end
end
253
+
254
+
255
+
256
+ # Formats an object into PDF format. This is used by the PDF object to format the PDF file, and it is used in the secure injection, which is still being developed.
257
# Dispatches on the Ruby class of +object+ and returns its PDF text form:
# nil => "null", String / Symbol / Array / Hash => the matching _format_*_to_pdf helper,
# numbers and booleans => their to_s followed by a space, anything else => ''.
def _object_to_pdf object
  case object
  when NilClass
    "null"
  when String
    _format_string_to_pdf object
  when Symbol
    _format_name_to_pdf object
  when Array
    _format_array_to_pdf object
  when Integer, Float, TrueClass, FalseClass
    # Integer instead of Fixnum: Fixnum is gone from current Rubies (NameError), and it
    # never matched big integers. Integer is its parent class, so older Rubies still work.
    object.to_s + " "
  when Hash
    _format_hash_to_pdf object
  else
    ''
  end
end
275
+
276
# Formats a String as a PDF string object and returns it as a binary (ASCII-8BIT) String.
#
# When @string_output is :literal the result is a literal string "(...)": the bytes for
# \n \r \t \b \f ( ) and \ use their named escapes, every other byte below 32 or above 126
# is written as a three digit octal escape (\ddd), and the rest is copied as is.
# Otherwise the result is a hexadecimal string "<...>".
def _format_string_to_pdf(object)
  if @string_output == :literal #if format is set to Literal
    named_escapes = {
      0x0A => "\\n",
      0x0D => "\\r",
      0x09 => "\\t",
      0x08 => "\\b",
      0x0C => "\\f", # form feed is 0x0C (it used to be keyed on 0xFF)
      0x28 => "\\(",
      0x29 => "\\)",
      0x5C => "\\\\"
    }
    escaped = object.bytes.map do |byte|
      if named_escapes.key?(byte)
        named_escapes[byte]
      elsif byte < 32 || byte > 126
        # PDF reads \ddd as OCTAL; three digits keep a following digit from being absorbed.
        format("\\%03o", byte)
      else
        byte.chr
      end
    end
    ("(" + escaped.join + ")").force_encoding(Encoding::ASCII_8BIT)
  else
    # A hexadecimal string shall be written as a sequence of hexadecimal digits (0-9 and either A-F or a-f)
    # encoded as ASCII characters and enclosed within angle brackets (using LESS-THAN SIGN (3Ch) and GREATER-THAN SIGN (3Eh)).
    ("<" + object.unpack('H*')[0] + ">").force_encoding(Encoding::ASCII_8BIT)
  end
end
298
# Formats +object+ (converted with to_s) as a PDF name: a SOLIDUS (/) followed by the
# bytes of the name.
#
# From the standard: a NUMBER SIGN (#) in a name is written as its 2-digit hexadecimal
# code preceded by a NUMBER SIGN, regular characters may be written as themselves, and any
# character that is not a regular character is written as # plus its 2-digit hexadecimal code.
# Here every byte up to 32 (space), every byte from 127 up, the NUMBER SIGN and the delimiters
# % ( ) / < > [ ] { } are written in the #xx form (lowercase hex); all other bytes are raw.
def _format_name_to_pdf(object)
  # # % ( ) / < > [ ] { }
  always_escaped = [35, 37, 40, 41, 47, 60, 62, 91, 93, 123, 125]
  pieces = object.to_s.each_byte.map do |byte|
    if byte <= 32 || byte >= 127 || always_escaped.include?(byte)
      format('#%02x', byte)
    else
      byte.chr
    end
  end
  "/" + pieces.join
end
320
# Formats an Array as a PDF array and returns it as a binary (ASCII-8BIT) String.
# An array shall be written as a sequence of objects enclosed in SQUARE BRACKETS
# (LEFT SQUARE BRACKET (5Bh) and RIGHT SQUARE BRACKET (5Dh)); here each item is
# formatted with _object_to_pdf and the items are separated by a single space.
# EXAMPLE [549 3.14 false (Ralph) /SomeName]
def _format_array_to_pdf(object)
  formatted_items = object.map { |element| _object_to_pdf(element) }
  body = formatted_items.join(' ')
  ("[" + body + "]").force_encoding(Encoding::ASCII_8BIT)
end
326
+
327
# Formats a Hash as PDF and returns it as a binary (ASCII-8BIT) String. Depending on the
# Hash this is:
# - "id generation R" when it is a reference (:is_reference_only);
# - "id generation obj ... endobj" around a bare value (:indirect_without_dictionary);
# - a "<< ... >>" dictionary, followed by the stream when :raw_stream_content is present and
#   wrapped in "obj ... endobj" when the Hash has an :indirect_reference_id.
#
# The Hash is modified: missing id / generation numbers default to 0, a reference copies
# both from its :referenced_object, and :Length is set to the byte size of the stream.
def _format_hash_to_pdf(object)
  binary = ->(text) { text.force_encoding(Encoding::ASCII_8BIT) }

  # a reference only needs its numbers updated (if it is linked) and written.
  if object[:is_reference_only]
    target = object[:referenced_object]
    if target.is_a?(Hash)
      object[:indirect_reference_id] = target[:indirect_reference_id]
      object[:indirect_generation_number] = target[:indirect_generation_number]
    end
    object[:indirect_reference_id] ||= 0
    object[:indirect_generation_number] ||= 0
    return binary.call("#{object[:indirect_reference_id]} #{object[:indirect_generation_number]} R")
  end

  parts = []
  # if the object is indirect...
  if object[:indirect_reference_id]
    object[:indirect_generation_number] ||= 0
    parts << binary.call("#{object[:indirect_reference_id]} #{object[:indirect_generation_number]} obj\n")
    bare_value = object[:indirect_without_dictionary]
    if bare_value
      parts << _object_to_pdf(bare_value) << "\nendobj\n"
      return binary.call(parts.join)
    end
  end

  # correct stream length, if the object is a stream.
  stream_data = object[:raw_stream_content]
  object[:Length] = stream_data.bytesize if stream_data

  # A dictionary shall be written as a sequence of key-value pairs enclosed in double angle
  # brackets (<<...>>) (using LESS-THAN SIGNs (3Ch) and GREATER-THAN SIGNs (3Eh)).
  parts << binary.call("<<\n")
  object.each do |key, value|
    next if PRIVATE_HASH_KEYS.include?(key)
    parts << binary.call("#{_object_to_pdf key} #{_object_to_pdf value}\n")
  end
  parts << binary.call(">>")
  parts << binary.call("\nstream\n#{stream_data}\nendstream") if stream_data
  parts << "\nendobj\n" if object[:indirect_reference_id]
  binary.call(parts.join)
end
368
+ end
369
+ end
370
+
371
+ #########################################################
372
+ # this file is part of the CombinePDF library and the code
373
+ # is subject to the same license (GPLv3).
374
+ #########################################################
375
+ # PDF object types cross reference:
376
+ # Indirect objects, references, dictionaries and streams are Hash
377
+ # arrays are Array
378
+ # strings are String
379
+ # names are Symbols (String.to_sym)
380
+ # numbers are Fixnum or Float
381
+ # boolean are TrueClass or FalseClass
@@ -9,6 +9,7 @@
9
9
 
10
10
 
11
11
 
12
+
12
13
  module CombinePDF
13
14
  #######################################################
14
15
  # PDF class is the PDF object that can save itself to
@@ -80,7 +81,7 @@ module CombinePDF
80
81
  # use this to export the PDF file without saving to disk (such as sending through HTTP ect').
81
82
  def to_pdf
82
83
  #reset version if not specified
83
- @version = 1.3 if @version == 0
84
+ @version = 1.5 if @version.to_f == 0.0
84
85
  #set creation date for merged file
85
86
  @info[:CreationDate] = Time.now.strftime "D:%Y%m%d%H%M%S%:::z'00"
86
87
  #rebuild resources if needed
@@ -163,7 +164,8 @@ module CombinePDF
163
164
  catalogs.define_singleton_method("<<".to_sym) do |obj|
164
165
  obj = PDFOperations.copy_and_secure_for_injection obj
165
166
  PDFOperations.inject_to_page self, obj
166
- holder.add_referenced obj
167
+ holder.add_referenced self # add new referenced objects
168
+ self
167
169
  end
168
170
  page_list << catalogs
169
171
  when :Pages
@@ -190,7 +192,7 @@ module CombinePDF
190
192
  ## how should we add data to PDF?
191
193
  ## and how to handles imported pages?
192
194
  case
193
- when (obj.is_a?(PDF))
195
+ when (obj.is_a?(PDF))
194
196
  @version = [@version, obj.version].max
195
197
 
196
198
  obj.renumber_object_ids @set_start_id + @objects.length
@@ -198,7 +200,7 @@ module CombinePDF
198
200
  @objects.push(*obj.objects)
199
201
  # rebuild_catalog
200
202
  @need_to_rebuild_resources = true
201
- when (obj.is_a?(Hash) && obj[:Type] == :Page), (obj.is_a?(Array) && (obj.reject {|i| i.is_a?(Hash) && i[:Type] == :Page}).empty?)
203
+ when (obj.is_a?(Hash) && obj[:Type] == :Page), (obj.is_a?(Array) && (obj.reject {|i| i.is_a?(Hash) && i[:Type] == :Page}).empty?)
202
204
  # set obj paramater to array if it's only one page
203
205
  obj = [obj] if obj.is_a?(Hash)
204
206
  # add page(s) to objects
@@ -214,7 +216,32 @@ module CombinePDF
214
216
  @need_to_rebuild_resources = true
215
217
  else
216
218
  warn "Shouldn't add objects to the file if they are not top-level indirect PDF objects."
219
+ retrun false # return false, which will also stop any chaining.
217
220
  end
221
+ return self #return self object for injection chaining (pdf << page << page << page)
222
+ end
223
+
224
+ # get the title for the pdf
225
+ # The title is stored in the information dictionary and isn't required
226
# Reads the value stored under :Title in the information dictionary (@info).
def title
  @info[:Title]
end
229
+ # set the title for the pdf
230
+ # The title is stored in the information dictionary and isn't required
231
+ # new_title:: a string that is the new title value.
232
# Stores +new_title+ under :Title in the information dictionary (@info).
# new_title:: the value to store (nil by default).
def title=(new_title = nil)
  @info[:Title] = new_title
end
235
+ # get the author value for the pdf
236
+ # The author is stored in the information dictionary and isn't required
237
# Reads the value stored under :Author in the information dictionary (@info).
def author
  @info[:Author]
end
240
+ # set the author for the pdf
241
+ # The author is stored in the information dictionary and isn't required
242
+ # new_author:: a string that is the new author value.
243
# Stores +new_author+ under :Author in the information dictionary (@info).
# new_author:: the value to store (nil by default).
def author=(new_author = nil)
  @info[:Author] = new_author
end
219
246
  end
220
247
  class PDF #:nodoc: all