combine_pdf 0.2.37 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cac27b28f3653156374b1ea4a429676625ba0c9f
4
- data.tar.gz: 8ce9f60a9bdcbd763a72461703c51845dbab0f2c
3
+ metadata.gz: 3663c5f5602eeed30aba5405fc0503ab9a865432
4
+ data.tar.gz: f6e07e2fbb180065146c32a440f29348fb2a2808
5
5
  SHA512:
6
- metadata.gz: 78aa47281a6f9fa5723a99ed9ce666479999348b69a19778e02eb2144e1c507d4a62ac23a07e4b46079193d14fecf77cbb9dac06591ac2354e92046ba0ba5d20
7
- data.tar.gz: 2b92948efba5ab031a46865416b13b1aecb892a1763c0d45190598313e6e0139907ece00c129349060ff030c7b38531d1d848f9696168d5fec499a4c65121db8
6
+ metadata.gz: 835236c99911009df5112cc92c3a042e6fe2dcd634e9b189f9977aef9f9f42ae33ac9865e94d08e8d26b6ff0328d7bf84ccc0dc858897c2c0ceae10c3c80c944
7
+ data.tar.gz: f11df3aa2c055a17be86c83766ea85f536afc5dcf0b299cd0e5be48344724909619ea54c8b52b7a9997907ec9d5c585d21f9ba81eeb17a74a77234aba2a79994
@@ -2,7 +2,19 @@
2
2
 
3
3
  ***
4
4
 
5
- #### Change log v.0.2.37 (Release Candidate)
5
+ #### Change log v.1.0.0
6
+
7
+ **Fix**: Fixed a possible issue with string corruption... it might have only existed in the development version, I'm not sure, but it's fixed anyway.
8
+
9
+ **Fix** (degrade): Fixed an issue related to deeply nested objects causing unreasonable slowdowns. The issue was resolved by degrading the PDF optimization process to review only objects with `stream` data instead of reviewing every object. This means more duplicate objects might be observed when similar PDF files are merged.
10
+
11
+ **Fix**: Fixed an issue related to form data where font information was lost during the PDF optimization process.
12
+
13
+ **Fix**: Fixed issue #108 by adding support for PDFs that have spaces and missing zeros in their hex encoded strings. Credit to @emmanuelmillionaer.
14
+
15
+ ***
16
+
17
+ #### Change log v.0.2.37
6
18
 
7
19
  **Fix**: Fixed `Page_Methods#textbox` default `:x`,`:y` to allow for non-zero/cropped page origin. Credit to @donnguyen for exposing the issue.
8
20
 
@@ -138,7 +138,7 @@ end
138
138
  # arrays are Array
139
139
  # strings are String
140
140
  # names are Symbols (String.to_sym)
141
- # numbers are Fixnum or Float
141
+ # numbers are Integer or Float (Numeric)
142
142
  # boolean are TrueClass or FalseClass
143
143
 
144
144
  ## test performance with:
@@ -107,7 +107,7 @@ module CombinePDF
107
107
  @scanner = StringScanner.new o[:raw_stream_content]
108
108
  stream_data = _parse_
109
109
  id_array = []
110
- while stream_data[0].is_a? (Integer)
110
+ while stream_data[0].is_a? (Numeric)
111
111
  id_array << stream_data.shift
112
112
  stream_data.shift
113
113
  end
@@ -180,57 +180,34 @@ module CombinePDF
180
180
  if @scanner.scan(/\[/)
181
181
  out << _parse_
182
182
  ##########################################
183
- ## parse a Dictionary
184
- ##########################################
185
- elsif @scanner.scan(/<</)
186
- data = _parse_
187
- obj = {}
188
- obj[data.shift] = data.shift while data[0]
189
- out << obj
190
- ##########################################
191
- ## return content of array or dictionary
192
- ##########################################
193
- elsif @scanner.scan(/\]/) || @scanner.scan(/>>/)
194
- return out
195
- ##########################################
196
- ## parse a Stream
183
+ ## Parse a Name
197
184
  ##########################################
198
- elsif @scanner.scan(/stream[\r\n]/)
199
- @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
200
- # the following was dicarded because some PDF files didn't have an EOL marker as required
201
- # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
202
- # instead, a non-strict RegExp is used:
203
- str = @scanner.scan_until(/endstream/)
204
- # raise error if the stream doesn't end.
205
- raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
206
- # need to remove end of stream
207
- if out.last.is_a? Hash
208
- # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
209
- out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
210
- else
211
- warn 'Stream not attached to dictionary!'
212
- out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
213
- end
185
+ # old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
186
+ # I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
187
+ # all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
188
+ # all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
189
+ elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/)
190
+ out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
214
191
  ##########################################
215
- ## parse an Object after finished
192
+ ## Parse a Number
216
193
  ##########################################
217
- elsif str = @scanner.scan(/endobj/)
218
- # what to do when this is an object?
219
- if out.last.is_a? Hash
220
- out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
221
- else
222
- out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
223
- end
224
- fresh = true
225
- # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
226
- out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
227
- # puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last} :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Integer)
194
+ elsif str = @scanner.scan(/[\+\-\.\d]+/)
195
+ str =~ /\./ ? (out << str.to_f) : (out << str.to_i)
228
196
  ##########################################
229
197
  ## parse a Hex String
230
198
  ##########################################
231
- elsif str = @scanner.scan(/<[0-9a-fA-F]*>/)
199
+ elsif str = @scanner.scan(/\<[0-9a-fA-F]*\>/)
232
200
  # warn "Found a hex string"
233
- out << unify_string([str[1..-2]].pack('H*').force_encoding(Encoding::ASCII_8BIT))
201
+ str = str.slice(1..-2).force_encoding(Encoding::ASCII_8BIT)
202
+ # str = "0#{str}" if str.length.odd?
203
+ out << unify_string([str].pack('H*').force_encoding(Encoding::ASCII_8BIT))
204
+ ##########################################
205
+ ## parse a space delimited Hex String
206
+ ##########################################
207
+ elsif str = @scanner.scan(/\<[0-9a-fA-F\s]*\>/)
208
+ # warn "Found a space separated hex string"
209
+ str = str.force_encoding(Encoding::ASCII_8BIT).split(/\s/).map! {|b| b.length.odd? ? "0#{b}" : b}
210
+ out << unify_string(str.pack('H*' * str.length).force_encoding(Encoding::ASCII_8BIT))
234
211
  ##########################################
235
212
  ## parse a Literal String
236
213
  ##########################################
@@ -315,6 +292,52 @@ module CombinePDF
315
292
  end
316
293
  out << unify_string(str.pack('C*').force_encoding(Encoding::ASCII_8BIT))
317
294
  ##########################################
295
+ ## parse a Dictionary
296
+ ##########################################
297
+ elsif @scanner.scan(/<</)
298
+ data = _parse_
299
+ obj = {}
300
+ obj[data.shift] = data.shift while data[0]
301
+ out << obj
302
+ ##########################################
303
+ ## return content of array or dictionary
304
+ ##########################################
305
+ elsif @scanner.scan(/\]/) || @scanner.scan(/>>/)
306
+ return out
307
+ ##########################################
308
+ ## parse a Stream
309
+ ##########################################
310
+ elsif @scanner.scan(/stream[\r\n]/)
311
+ @scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
312
+ # the following was dicarded because some PDF files didn't have an EOL marker as required
313
+ # str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
314
+ # instead, a non-strict RegExp is used:
315
+ str = @scanner.scan_until(/endstream/)
316
+ # raise error if the stream doesn't end.
317
+ raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
318
+ # need to remove end of stream
319
+ if out.last.is_a? Hash
320
+ # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
321
+ out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
322
+ else
323
+ warn 'Stream not attached to dictionary!'
324
+ out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
325
+ end
326
+ ##########################################
327
+ ## parse an Object after finished
328
+ ##########################################
329
+ elsif str = @scanner.scan(/endobj/)
330
+ # what to do when this is an object?
331
+ if out.last.is_a? Hash
332
+ out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
333
+ else
334
+ out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
335
+ end
336
+ fresh = true
337
+ # fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
338
+ out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
339
+ # puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last} :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Numeric)
340
+ ##########################################
318
341
  ## Parse a comment
319
342
  ##########################################
320
343
  elsif str = @scanner.scan(/\%/)
@@ -326,20 +349,6 @@ module CombinePDF
326
349
  end
327
350
  # puts "AFTER COMMENT: #{@scanner.peek 8}"
328
351
  ##########################################
329
- ## Parse a Name
330
- ##########################################
331
- # old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
332
- # I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
333
- # all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
334
- # all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
335
- elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/)
336
- out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
337
- ##########################################
338
- ## Parse a Number
339
- ##########################################
340
- elsif str = @scanner.scan(/[\+\-\.\d]+/)
341
- str =~ /\./ ? (out << str.to_f) : (out << str.to_i)
342
- ##########################################
343
352
  ## Parse an Object Reference
344
353
  ##########################################
345
354
  elsif @scanner.scan(/R/)
@@ -562,7 +571,7 @@ module CombinePDF
562
571
  o = nil
563
572
  else
564
573
  o[:referenced_object] = obj_dir[[o[:indirect_reference_id], o[:indirect_generation_number]]]
565
- warn "Couldn't connect reference for #{o}" if o[:referenced_object].nil?
574
+ warn "Couldn't connect reference for #{o}" if o[:referenced_object].nil? && (o[:indirect_reference_id] + o[:indirect_generation_number] != 0)
566
575
  o.delete :indirect_reference_id
567
576
  o.delete :indirect_generation_number
568
577
  o = (o[:referenced_object] && o[:referenced_object][:indirect_without_dictionary]) || o
@@ -33,12 +33,14 @@ module CombinePDF
33
33
  if obj.is_a?(Hash)
34
34
  referenced = obj[:referenced_object]
35
35
  if referenced && referenced.any?
36
- tmp = resolved[referenced.object_id] || existing[referenced]
36
+ # tmp = resolved[referenced.object_id] || existing[referenced]
37
+ tmp = resolved[referenced.object_id] || (referenced[:raw_stream_content] && existing[referenced[:raw_stream_content]])
37
38
  if tmp
38
39
  obj[:referenced_object] = tmp
39
40
  else
40
41
  resolved[obj.object_id] = referenced
41
- existing[referenced] = referenced
42
+ # existing[referenced] = referenced
43
+ existing[referenced[:raw_stream_content]] = referenced
42
44
  should_resolve << referenced
43
45
  @objects << referenced
44
46
  end
@@ -150,7 +152,8 @@ module CombinePDF
150
152
  catalog = rebuild_catalog
151
153
  page_objects = catalog[:Pages][:referenced_object][:Kids].map { |e| @objects << e[:referenced_object]; e[:referenced_object] }
152
154
  # adds every referenced object to the @objects (root), addition is performed as pointers rather than copies
153
- add_referenced([page_objects, @forms_data, @names, @outlines, @info])
155
+ # add_referenced([page_objects, @forms_data, @names, @outlines, @info])
156
+ add_referenced(@objects.dup)
154
157
  catalog
155
158
  end
156
159
 
@@ -163,7 +166,7 @@ module CombinePDF
163
166
  def renumber_object_ids(start = nil)
164
167
  @set_start_id = start || @set_start_id
165
168
  start = @set_start_id
166
- history = {}
169
+ # history = {}
167
170
  @objects.each do |obj|
168
171
  obj[:indirect_reference_id] = start
169
172
  start += 1
@@ -191,7 +194,7 @@ module CombinePDF
191
194
  if pos[0].is_a? String
192
195
  (pos.length / 2).times do |i|
193
196
  dic << (pos[i * 2].clear << base.next!)
194
- pos[(i * 2) + 1][0] = {is_reference_only: true, referenced_object: pages[pos[(i * 2) + 1][0]]} if(pos[(i * 2) + 1].is_a?(Array) && pos[(i * 2) + 1][0].is_a?(Integer))
197
+ pos[(i * 2) + 1][0] = {is_reference_only: true, referenced_object: pages[pos[(i * 2) + 1][0]]} if(pos[(i * 2) + 1].is_a?(Array) && pos[(i * 2) + 1][0].is_a?(Numeric))
195
198
  dic << (pos[(i * 2) + 1].is_a?(Array) ? { is_reference_only: true, referenced_object: { indirect_without_dictionary: pos[(i * 2) + 1] } } : pos[(i * 2) + 1])
196
199
  # dic << pos[(i * 2) + 1]
197
200
  end
@@ -225,12 +228,13 @@ module CombinePDF
225
228
  # preferring the new over the old.
226
229
  def self.hash_merge_new_no_page(_key, old_data, new_data)
227
230
  return old_data unless new_data
231
+ return new_data unless old_data
228
232
  if old_data.is_a?(Hash) && new_data.is_a?(Hash)
229
233
  return old_data if (old_data[:Type] == :Page)
230
234
  old_data.merge(new_data, &(@hash_merge_new_no_page_proc ||= method(:hash_merge_new_no_page)))
231
235
  elsif old_data.is_a? Array
232
- new_data = [new_data] unless new_data.is_a? Array
233
- old_data + new_data
236
+ return old_data + new_data if new_data.is_a?(Array)
237
+ return old_data.dup << new_data
234
238
  elsif new_data.is_a? Array
235
239
  new_data + [old_data]
236
240
  else
@@ -100,7 +100,7 @@ module CombinePDF
100
100
  @names = parser.names_object || {}
101
101
  @forms_data = parser.forms_object || {}
102
102
  @outlines = parser.outlines_object || {}
103
- # rebuild the catalo, to fix wkhtmltopdf's use of static page numbers
103
+ # rebuild the catalog, to fix wkhtmltopdf's use of static page numbers
104
104
  rebuild_catalog
105
105
 
106
106
  # general globals
@@ -20,7 +20,7 @@ module CombinePDF
20
20
  return format_name_to_pdf object
21
21
  elsif object.is_a?(Array)
22
22
  return format_array_to_pdf object
23
- elsif object.is_a?(Integer) || object.is_a?(Float) || object.is_a?(TrueClass) || object.is_a?(FalseClass)
23
+ elsif object.is_a?(Numeric) || object.is_a?(TrueClass) || object.is_a?(FalseClass)
24
24
  return object.to_s
25
25
  elsif object.is_a?(Hash)
26
26
  return format_hash_to_pdf object
@@ -1,3 +1,3 @@
1
1
  module CombinePDF
2
- VERSION = '0.2.37'.freeze
2
+ VERSION = '1.0.0'.freeze
3
3
  end
@@ -14,6 +14,7 @@ require 'bundler/setup'
14
14
 
15
15
  pdf = CombinePDF.load "./Ruby/test\ pdfs/filled_form.pdf"
16
16
  pdf.save '01_check_radio_buttuns.pdf'
17
+ pdf = CombinePDF.load "./Ruby/test\ pdfs/filled_form.pdf"
17
18
  pdf << CombinePDF.load("./Ruby/test\ pdfs/empty_form.pdf")
18
19
  pdf << CombinePDF.load("./Ruby/test\ pdfs/filled_form.pdf")
19
20
  pdf.save '02_check_form_unification_middle_is_empty.pdf'
@@ -51,7 +52,16 @@ pdf.save '06_check_links_to_second_copy.pdf'
51
52
  lists = %w(./Ruby/test\ pdfs/outlines/self_merge_err.pdf ./Ruby/test\ pdfs/outlines/big_toc.pdf ./Ruby/test\ pdfs/outlines/bigger_toc.pdf ./Ruby/test\ pdfs/outlines/named_dest_no_toc.pdf ./Ruby/test\ pdfs/outlines/named_dest_no_toc2.pdf ./Ruby/test\ pdfs/outlines/named_dest.pdf ./Ruby/test\ pdfs/outlines/named_dest2.pdf)
52
53
 
53
54
  i = 0
54
- lists.each { |n| CombinePDF.load(n).save("07_#{(i += 1)}_#{n.split('/')[-1]}"); (CombinePDF.load(n) << CombinePDF.load(n)).save("07_#{i}x2_#{n.split('/')[-1]}") }
55
+ lists.each do |n|
56
+ # puts "loading #{n}"
57
+ pdf = CombinePDF.load(n)
58
+ # puts "saving 07_#{(i += 1)}_#{n.split('/')[-1]}"
59
+ pdf.save("07_#{(i += 1)}_#{n.split('/')[-1]}")
60
+ # puts "loading #{n}X2"
61
+ pdf = CombinePDF.load(n) << CombinePDF.load(n)
62
+ # puts "saving 07_#{i}x2_#{n.split('/')[-1]}"
63
+ pdf.save("07_#{i}x2_#{n.split('/')[-1]}")
64
+ end
55
65
  pdf = CombinePDF.new
56
66
  lists.each { |n| pdf << CombinePDF.load(n) }
57
67
  pdf.save('07_named destinations.pdf')
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: combine_pdf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.37
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Boaz Segev
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-05-10 00:00:00.000000000 Z
11
+ date: 2017-05-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-rc4
@@ -104,7 +104,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
104
104
  version: '0'
105
105
  requirements: []
106
106
  rubyforge_project:
107
- rubygems_version: 2.6.8
107
+ rubygems_version: 2.6.11
108
108
  signing_key:
109
109
  specification_version: 4
110
110
  summary: Combine, stamp and watermark PDF files in pure Ruby.