combine_pdf 0.2.37 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -1
- data/lib/combine_pdf.rb +1 -1
- data/lib/combine_pdf/parser.rb +69 -60
- data/lib/combine_pdf/pdf_protected.rb +11 -7
- data/lib/combine_pdf/pdf_public.rb +1 -1
- data/lib/combine_pdf/renderer.rb +1 -1
- data/lib/combine_pdf/version.rb +1 -1
- data/test/automated +11 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3663c5f5602eeed30aba5405fc0503ab9a865432
|
4
|
+
data.tar.gz: f6e07e2fbb180065146c32a440f29348fb2a2808
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 835236c99911009df5112cc92c3a042e6fe2dcd634e9b189f9977aef9f9f42ae33ac9865e94d08e8d26b6ff0328d7bf84ccc0dc858897c2c0ceae10c3c80c944
|
7
|
+
data.tar.gz: f11df3aa2c055a17be86c83766ea85f536afc5dcf0b299cd0e5be48344724909619ea54c8b52b7a9997907ec9d5c585d21f9ba81eeb17a74a77234aba2a79994
|
data/CHANGELOG.md
CHANGED
@@ -2,7 +2,19 @@
|
|
2
2
|
|
3
3
|
***
|
4
4
|
|
5
|
-
#### Change log v.0.2.37
|
5
|
+
#### Change log v.1.0.0
|
6
|
+
|
7
|
+
**Fix**: Fixed a possible issue with string corruption... it might have only existed in the development version, I'm not sure, but it's fixed anyway.
|
8
|
+
|
9
|
+
**Fix** (degrade): Fixed an issue related to deeply nested objects causing unreasonable slowdowns. The issue was resolved by degrading the PDF optimization process to review only objects with `stream` data instead of reviewing every object. This means more duplicate objects might be observed when similar PDF files are merged.
|
10
|
+
|
11
|
+
**Fix**: Fixed an issue related to form data where font information was lost during the PDF optimization process.
|
12
|
+
|
13
|
+
**Fix**: Fixed issue #108 by adding support for PDFs that have spaces and missing zeros in their hex encoded strings. Credit to @emmanuelmillionaer.
|
14
|
+
|
15
|
+
***
|
16
|
+
|
17
|
+
#### Change log v.0.2.37
|
6
18
|
|
7
19
|
**Fix**: Fixed `Page_Methods#textbox` default `:x`,`:y` to allow for non-zero/cropped page origin. Credit to @donnguyen for exposing the issue.
|
8
20
|
|
data/lib/combine_pdf.rb
CHANGED
data/lib/combine_pdf/parser.rb
CHANGED
@@ -107,7 +107,7 @@ module CombinePDF
|
|
107
107
|
@scanner = StringScanner.new o[:raw_stream_content]
|
108
108
|
stream_data = _parse_
|
109
109
|
id_array = []
|
110
|
-
while stream_data[0].is_a? (
|
110
|
+
while stream_data[0].is_a? (Numeric)
|
111
111
|
id_array << stream_data.shift
|
112
112
|
stream_data.shift
|
113
113
|
end
|
@@ -180,57 +180,34 @@ module CombinePDF
|
|
180
180
|
if @scanner.scan(/\[/)
|
181
181
|
out << _parse_
|
182
182
|
##########################################
|
183
|
-
##
|
184
|
-
##########################################
|
185
|
-
elsif @scanner.scan(/<</)
|
186
|
-
data = _parse_
|
187
|
-
obj = {}
|
188
|
-
obj[data.shift] = data.shift while data[0]
|
189
|
-
out << obj
|
190
|
-
##########################################
|
191
|
-
## return content of array or dictionary
|
192
|
-
##########################################
|
193
|
-
elsif @scanner.scan(/\]/) || @scanner.scan(/>>/)
|
194
|
-
return out
|
195
|
-
##########################################
|
196
|
-
## parse a Stream
|
183
|
+
## Parse a Name
|
197
184
|
##########################################
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
# raise error if the stream doesn't end.
|
205
|
-
raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
|
206
|
-
# need to remove end of stream
|
207
|
-
if out.last.is_a? Hash
|
208
|
-
# out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
|
209
|
-
out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
|
210
|
-
else
|
211
|
-
warn 'Stream not attached to dictionary!'
|
212
|
-
out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
|
213
|
-
end
|
185
|
+
# old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
|
186
|
+
# I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
|
187
|
+
# all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
|
188
|
+
# all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
|
189
|
+
elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/)
|
190
|
+
out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
|
214
191
|
##########################################
|
215
|
-
##
|
192
|
+
## Parse a Number
|
216
193
|
##########################################
|
217
|
-
elsif str = @scanner.scan(/
|
218
|
-
|
219
|
-
if out.last.is_a? Hash
|
220
|
-
out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
|
221
|
-
else
|
222
|
-
out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
|
223
|
-
end
|
224
|
-
fresh = true
|
225
|
-
# fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
|
226
|
-
out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
|
227
|
-
# puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last} :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Integer)
|
194
|
+
elsif str = @scanner.scan(/[\+\-\.\d]+/)
|
195
|
+
str =~ /\./ ? (out << str.to_f) : (out << str.to_i)
|
228
196
|
##########################################
|
229
197
|
## parse a Hex String
|
230
198
|
##########################################
|
231
|
-
elsif str = @scanner.scan(
|
199
|
+
elsif str = @scanner.scan(/\<[0-9a-fA-F]*\>/)
|
232
200
|
# warn "Found a hex string"
|
233
|
-
|
201
|
+
str = str.slice(1..-2).force_encoding(Encoding::ASCII_8BIT)
|
202
|
+
# str = "0#{str}" if str.length.odd?
|
203
|
+
out << unify_string([str].pack('H*').force_encoding(Encoding::ASCII_8BIT))
|
204
|
+
##########################################
|
205
|
+
## parse a space delimited Hex String
|
206
|
+
##########################################
|
207
|
+
elsif str = @scanner.scan(/\<[0-9a-fA-F\s]*\>/)
|
208
|
+
# warn "Found a space seperated hex string"
|
209
|
+
str = str.force_encoding(Encoding::ASCII_8BIT).split(/\s/).map! {|b| b.length.odd? ? "0#{b}" : b}
|
210
|
+
out << unify_string(str.pack('H*' * str.length).force_encoding(Encoding::ASCII_8BIT))
|
234
211
|
##########################################
|
235
212
|
## parse a Literal String
|
236
213
|
##########################################
|
@@ -315,6 +292,52 @@ module CombinePDF
|
|
315
292
|
end
|
316
293
|
out << unify_string(str.pack('C*').force_encoding(Encoding::ASCII_8BIT))
|
317
294
|
##########################################
|
295
|
+
## parse a Dictionary
|
296
|
+
##########################################
|
297
|
+
elsif @scanner.scan(/<</)
|
298
|
+
data = _parse_
|
299
|
+
obj = {}
|
300
|
+
obj[data.shift] = data.shift while data[0]
|
301
|
+
out << obj
|
302
|
+
##########################################
|
303
|
+
## return content of array or dictionary
|
304
|
+
##########################################
|
305
|
+
elsif @scanner.scan(/\]/) || @scanner.scan(/>>/)
|
306
|
+
return out
|
307
|
+
##########################################
|
308
|
+
## parse a Stream
|
309
|
+
##########################################
|
310
|
+
elsif @scanner.scan(/stream[\r\n]/)
|
311
|
+
@scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
|
312
|
+
# the following was dicarded because some PDF files didn't have an EOL marker as required
|
313
|
+
# str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
|
314
|
+
# instead, a non-strict RegExp is used:
|
315
|
+
str = @scanner.scan_until(/endstream/)
|
316
|
+
# raise error if the stream doesn't end.
|
317
|
+
raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
|
318
|
+
# need to remove end of stream
|
319
|
+
if out.last.is_a? Hash
|
320
|
+
# out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
|
321
|
+
out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
|
322
|
+
else
|
323
|
+
warn 'Stream not attached to dictionary!'
|
324
|
+
out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
|
325
|
+
end
|
326
|
+
##########################################
|
327
|
+
## parse an Object after finished
|
328
|
+
##########################################
|
329
|
+
elsif str = @scanner.scan(/endobj/)
|
330
|
+
# what to do when this is an object?
|
331
|
+
if out.last.is_a? Hash
|
332
|
+
out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
|
333
|
+
else
|
334
|
+
out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
|
335
|
+
end
|
336
|
+
fresh = true
|
337
|
+
# fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
|
338
|
+
out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
|
339
|
+
# puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last} :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Numeric)
|
340
|
+
##########################################
|
318
341
|
## Parse a comment
|
319
342
|
##########################################
|
320
343
|
elsif str = @scanner.scan(/\%/)
|
@@ -326,20 +349,6 @@ module CombinePDF
|
|
326
349
|
end
|
327
350
|
# puts "AFTER COMMENT: #{@scanner.peek 8}"
|
328
351
|
##########################################
|
329
|
-
## Parse a Name
|
330
|
-
##########################################
|
331
|
-
# old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
|
332
|
-
# I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
|
333
|
-
# all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
|
334
|
-
# all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
|
335
|
-
elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/)
|
336
|
-
out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
|
337
|
-
##########################################
|
338
|
-
## Parse a Number
|
339
|
-
##########################################
|
340
|
-
elsif str = @scanner.scan(/[\+\-\.\d]+/)
|
341
|
-
str =~ /\./ ? (out << str.to_f) : (out << str.to_i)
|
342
|
-
##########################################
|
343
352
|
## Parse an Object Reference
|
344
353
|
##########################################
|
345
354
|
elsif @scanner.scan(/R/)
|
@@ -562,7 +571,7 @@ module CombinePDF
|
|
562
571
|
o = nil
|
563
572
|
else
|
564
573
|
o[:referenced_object] = obj_dir[[o[:indirect_reference_id], o[:indirect_generation_number]]]
|
565
|
-
warn "Couldn't connect reference for #{o}" if o[:referenced_object].nil?
|
574
|
+
warn "Couldn't connect reference for #{o}" if o[:referenced_object].nil? && (o[:indirect_reference_id] + o[:indirect_generation_number] != 0)
|
566
575
|
o.delete :indirect_reference_id
|
567
576
|
o.delete :indirect_generation_number
|
568
577
|
o = (o[:referenced_object] && o[:referenced_object][:indirect_without_dictionary]) || o
|
@@ -33,12 +33,14 @@ module CombinePDF
|
|
33
33
|
if obj.is_a?(Hash)
|
34
34
|
referenced = obj[:referenced_object]
|
35
35
|
if referenced && referenced.any?
|
36
|
-
tmp = resolved[referenced.object_id] || existing[referenced]
|
36
|
+
# tmp = resolved[referenced.object_id] || existing[referenced]
|
37
|
+
tmp = resolved[referenced.object_id] || (referenced[:raw_stream_content] && existing[referenced[:raw_stream_content]])
|
37
38
|
if tmp
|
38
39
|
obj[:referenced_object] = tmp
|
39
40
|
else
|
40
41
|
resolved[obj.object_id] = referenced
|
41
|
-
existing[referenced] = referenced
|
42
|
+
# existing[referenced] = referenced
|
43
|
+
existing[referenced[:raw_stream_content]] = referenced
|
42
44
|
should_resolve << referenced
|
43
45
|
@objects << referenced
|
44
46
|
end
|
@@ -150,7 +152,8 @@ module CombinePDF
|
|
150
152
|
catalog = rebuild_catalog
|
151
153
|
page_objects = catalog[:Pages][:referenced_object][:Kids].map { |e| @objects << e[:referenced_object]; e[:referenced_object] }
|
152
154
|
# adds every referenced object to the @objects (root), addition is performed as pointers rather then copies
|
153
|
-
add_referenced([page_objects, @forms_data, @names, @outlines, @info])
|
155
|
+
# add_referenced([page_objects, @forms_data, @names, @outlines, @info])
|
156
|
+
add_referenced(@objects.dup)
|
154
157
|
catalog
|
155
158
|
end
|
156
159
|
|
@@ -163,7 +166,7 @@ module CombinePDF
|
|
163
166
|
def renumber_object_ids(start = nil)
|
164
167
|
@set_start_id = start || @set_start_id
|
165
168
|
start = @set_start_id
|
166
|
-
history = {}
|
169
|
+
# history = {}
|
167
170
|
@objects.each do |obj|
|
168
171
|
obj[:indirect_reference_id] = start
|
169
172
|
start += 1
|
@@ -191,7 +194,7 @@ module CombinePDF
|
|
191
194
|
if pos[0].is_a? String
|
192
195
|
(pos.length / 2).times do |i|
|
193
196
|
dic << (pos[i * 2].clear << base.next!)
|
194
|
-
pos[(i * 2) + 1][0] = {is_reference_only: true, referenced_object: pages[pos[(i * 2) + 1][0]]} if(pos[(i * 2) + 1].is_a?(Array) && pos[(i * 2) + 1][0].is_a?(
|
197
|
+
pos[(i * 2) + 1][0] = {is_reference_only: true, referenced_object: pages[pos[(i * 2) + 1][0]]} if(pos[(i * 2) + 1].is_a?(Array) && pos[(i * 2) + 1][0].is_a?(Numeric))
|
195
198
|
dic << (pos[(i * 2) + 1].is_a?(Array) ? { is_reference_only: true, referenced_object: { indirect_without_dictionary: pos[(i * 2) + 1] } } : pos[(i * 2) + 1])
|
196
199
|
# dic << pos[(i * 2) + 1]
|
197
200
|
end
|
@@ -225,12 +228,13 @@ module CombinePDF
|
|
225
228
|
# preffering the new over the old.
|
226
229
|
def self.hash_merge_new_no_page(_key, old_data, new_data)
|
227
230
|
return old_data unless new_data
|
231
|
+
return new_data unless old_data
|
228
232
|
if old_data.is_a?(Hash) && new_data.is_a?(Hash)
|
229
233
|
return old_data if (old_data[:Type] == :Page)
|
230
234
|
old_data.merge(new_data, &(@hash_merge_new_no_page_proc ||= method(:hash_merge_new_no_page)))
|
231
235
|
elsif old_data.is_a? Array
|
232
|
-
|
233
|
-
old_data
|
236
|
+
return old_data + new_data if new_data.is_a?(Array)
|
237
|
+
return old_data.dup << new_data
|
234
238
|
elsif new_data.is_a? Array
|
235
239
|
new_data + [old_data]
|
236
240
|
else
|
@@ -100,7 +100,7 @@ module CombinePDF
|
|
100
100
|
@names = parser.names_object || {}
|
101
101
|
@forms_data = parser.forms_object || {}
|
102
102
|
@outlines = parser.outlines_object || {}
|
103
|
-
# rebuild the
|
103
|
+
# rebuild the catalog, to fix wkhtmltopdf's use of static page numbers
|
104
104
|
rebuild_catalog
|
105
105
|
|
106
106
|
# general globals
|
data/lib/combine_pdf/renderer.rb
CHANGED
@@ -20,7 +20,7 @@ module CombinePDF
|
|
20
20
|
return format_name_to_pdf object
|
21
21
|
elsif object.is_a?(Array)
|
22
22
|
return format_array_to_pdf object
|
23
|
-
elsif object.is_a?(
|
23
|
+
elsif object.is_a?(Numeric) || object.is_a?(TrueClass) || object.is_a?(FalseClass)
|
24
24
|
return object.to_s
|
25
25
|
elsif object.is_a?(Hash)
|
26
26
|
return format_hash_to_pdf object
|
data/lib/combine_pdf/version.rb
CHANGED
data/test/automated
CHANGED
@@ -14,6 +14,7 @@ require 'bundler/setup'
|
|
14
14
|
|
15
15
|
pdf = CombinePDF.load "./Ruby/test\ pdfs/filled_form.pdf"
|
16
16
|
pdf.save '01_check_radio_buttuns.pdf'
|
17
|
+
pdf = CombinePDF.load "./Ruby/test\ pdfs/filled_form.pdf"
|
17
18
|
pdf << CombinePDF.load("./Ruby/test\ pdfs/empty_form.pdf")
|
18
19
|
pdf << CombinePDF.load("./Ruby/test\ pdfs/filled_form.pdf")
|
19
20
|
pdf.save '02_check_form_unification_middle_is_empty.pdf'
|
@@ -51,7 +52,16 @@ pdf.save '06_check_links_to_second_copy.pdf'
|
|
51
52
|
lists = %w(./Ruby/test\ pdfs/outlines/self_merge_err.pdf ./Ruby/test\ pdfs/outlines/big_toc.pdf ./Ruby/test\ pdfs/outlines/bigger_toc.pdf ./Ruby/test\ pdfs/outlines/named_dest_no_toc.pdf ./Ruby/test\ pdfs/outlines/named_dest_no_toc2.pdf ./Ruby/test\ pdfs/outlines/named_dest.pdf ./Ruby/test\ pdfs/outlines/named_dest2.pdf)
|
52
53
|
|
53
54
|
i = 0
|
54
|
-
lists.each
|
55
|
+
lists.each do |n|
|
56
|
+
# puts "loading #{n}"
|
57
|
+
pdf = CombinePDF.load(n)
|
58
|
+
# puts "saving 07_#{(i += 1)}_#{n.split('/')[-1]}"
|
59
|
+
pdf.save("07_#{(i += 1)}_#{n.split('/')[-1]}")
|
60
|
+
# puts "loading #{n}X2"
|
61
|
+
pdf = CombinePDF.load(n) << CombinePDF.load(n)
|
62
|
+
# puts "saving 07_#{i}x2_#{n.split('/')[-1]}"
|
63
|
+
pdf.save("07_#{i}x2_#{n.split('/')[-1]}")
|
64
|
+
end
|
55
65
|
pdf = CombinePDF.new
|
56
66
|
lists.each { |n| pdf << CombinePDF.load(n) }
|
57
67
|
pdf.save('07_named destinations.pdf')
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: combine_pdf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.37
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Boaz Segev
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-05-
|
11
|
+
date: 2017-05-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-rc4
|
@@ -104,7 +104,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
104
104
|
version: '0'
|
105
105
|
requirements: []
|
106
106
|
rubyforge_project:
|
107
|
-
rubygems_version: 2.6.
|
107
|
+
rubygems_version: 2.6.11
|
108
108
|
signing_key:
|
109
109
|
specification_version: 4
|
110
110
|
summary: Combine, stamp and watermark PDF files in pure Ruby.
|