combine_pdf 0.2.37 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -1
- data/lib/combine_pdf.rb +1 -1
- data/lib/combine_pdf/parser.rb +69 -60
- data/lib/combine_pdf/pdf_protected.rb +11 -7
- data/lib/combine_pdf/pdf_public.rb +1 -1
- data/lib/combine_pdf/renderer.rb +1 -1
- data/lib/combine_pdf/version.rb +1 -1
- data/test/automated +11 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3663c5f5602eeed30aba5405fc0503ab9a865432
|
4
|
+
data.tar.gz: f6e07e2fbb180065146c32a440f29348fb2a2808
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 835236c99911009df5112cc92c3a042e6fe2dcd634e9b189f9977aef9f9f42ae33ac9865e94d08e8d26b6ff0328d7bf84ccc0dc858897c2c0ceae10c3c80c944
|
7
|
+
data.tar.gz: f11df3aa2c055a17be86c83766ea85f536afc5dcf0b299cd0e5be48344724909619ea54c8b52b7a9997907ec9d5c585d21f9ba81eeb17a74a77234aba2a79994
|
data/CHANGELOG.md
CHANGED
@@ -2,7 +2,19 @@
|
|
2
2
|
|
3
3
|
***
|
4
4
|
|
5
|
-
#### Change log v.0.2.37
|
5
|
+
#### Change log v.1.0.0
|
6
|
+
|
7
|
+
**Fix**: Fixed a possible issue with string corruption... it might have only existed in the development version, I'm not sure, but it's fixed anyway.
|
8
|
+
|
9
|
+
**Fix** (degrade): Fixed an issue related to deeply nested objects causing unreasonable slowdowns. The issue was resolved by degrading the PDF optimization process to review only objects with `stream` data instead of reviewing every object. This means more duplicate objects might be observed when similar PDF files are merged.
|
10
|
+
|
11
|
+
**Fix**: Fixed an issue related to form data where font information was lost during the PDF optimization process.
|
12
|
+
|
13
|
+
**Fix**: Fixed issue #108 by adding support for PDFs that have spaces and missing zeros in their hex encoded strings. Credit to @emmanuelmillionaer.
|
14
|
+
|
15
|
+
***
|
16
|
+
|
17
|
+
#### Change log v.0.2.37
|
6
18
|
|
7
19
|
**Fix**: Fixed `Page_Methods#textbox` default `:x`,`:y` to allow for non-zero/cropped page origin. Credit to @donnguyen for exposing the issue.
|
8
20
|
|
data/lib/combine_pdf.rb
CHANGED
data/lib/combine_pdf/parser.rb
CHANGED
@@ -107,7 +107,7 @@ module CombinePDF
|
|
107
107
|
@scanner = StringScanner.new o[:raw_stream_content]
|
108
108
|
stream_data = _parse_
|
109
109
|
id_array = []
|
110
|
-
while stream_data[0].is_a? (
|
110
|
+
while stream_data[0].is_a? (Numeric)
|
111
111
|
id_array << stream_data.shift
|
112
112
|
stream_data.shift
|
113
113
|
end
|
@@ -180,57 +180,34 @@ module CombinePDF
|
|
180
180
|
if @scanner.scan(/\[/)
|
181
181
|
out << _parse_
|
182
182
|
##########################################
|
183
|
-
##
|
184
|
-
##########################################
|
185
|
-
elsif @scanner.scan(/<</)
|
186
|
-
data = _parse_
|
187
|
-
obj = {}
|
188
|
-
obj[data.shift] = data.shift while data[0]
|
189
|
-
out << obj
|
190
|
-
##########################################
|
191
|
-
## return content of array or dictionary
|
192
|
-
##########################################
|
193
|
-
elsif @scanner.scan(/\]/) || @scanner.scan(/>>/)
|
194
|
-
return out
|
195
|
-
##########################################
|
196
|
-
## parse a Stream
|
183
|
+
## Parse a Name
|
197
184
|
##########################################
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
# raise error if the stream doesn't end.
|
205
|
-
raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
|
206
|
-
# need to remove end of stream
|
207
|
-
if out.last.is_a? Hash
|
208
|
-
# out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
|
209
|
-
out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
|
210
|
-
else
|
211
|
-
warn 'Stream not attached to dictionary!'
|
212
|
-
out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
|
213
|
-
end
|
185
|
+
# old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
|
186
|
+
# I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
|
187
|
+
# all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
|
188
|
+
# all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
|
189
|
+
elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/)
|
190
|
+
out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
|
214
191
|
##########################################
|
215
|
-
##
|
192
|
+
## Parse a Number
|
216
193
|
##########################################
|
217
|
-
elsif str = @scanner.scan(/
|
218
|
-
|
219
|
-
if out.last.is_a? Hash
|
220
|
-
out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
|
221
|
-
else
|
222
|
-
out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
|
223
|
-
end
|
224
|
-
fresh = true
|
225
|
-
# fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
|
226
|
-
out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
|
227
|
-
# puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last} :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Integer)
|
194
|
+
elsif str = @scanner.scan(/[\+\-\.\d]+/)
|
195
|
+
str =~ /\./ ? (out << str.to_f) : (out << str.to_i)
|
228
196
|
##########################################
|
229
197
|
## parse a Hex String
|
230
198
|
##########################################
|
231
|
-
elsif str = @scanner.scan(
|
199
|
+
elsif str = @scanner.scan(/\<[0-9a-fA-F]*\>/)
|
232
200
|
# warn "Found a hex string"
|
233
|
-
|
201
|
+
str = str.slice(1..-2).force_encoding(Encoding::ASCII_8BIT)
|
202
|
+
# str = "0#{str}" if str.length.odd?
|
203
|
+
out << unify_string([str].pack('H*').force_encoding(Encoding::ASCII_8BIT))
|
204
|
+
##########################################
|
205
|
+
## parse a space delimited Hex String
|
206
|
+
##########################################
|
207
|
+
elsif str = @scanner.scan(/\<[0-9a-fA-F\s]*\>/)
|
208
|
+
# warn "Found a space seperated hex string"
|
209
|
+
str = str.force_encoding(Encoding::ASCII_8BIT).split(/\s/).map! {|b| b.length.odd? ? "0#{b}" : b}
|
210
|
+
out << unify_string(str.pack('H*' * str.length).force_encoding(Encoding::ASCII_8BIT))
|
234
211
|
##########################################
|
235
212
|
## parse a Literal String
|
236
213
|
##########################################
|
@@ -315,6 +292,52 @@ module CombinePDF
|
|
315
292
|
end
|
316
293
|
out << unify_string(str.pack('C*').force_encoding(Encoding::ASCII_8BIT))
|
317
294
|
##########################################
|
295
|
+
## parse a Dictionary
|
296
|
+
##########################################
|
297
|
+
elsif @scanner.scan(/<</)
|
298
|
+
data = _parse_
|
299
|
+
obj = {}
|
300
|
+
obj[data.shift] = data.shift while data[0]
|
301
|
+
out << obj
|
302
|
+
##########################################
|
303
|
+
## return content of array or dictionary
|
304
|
+
##########################################
|
305
|
+
elsif @scanner.scan(/\]/) || @scanner.scan(/>>/)
|
306
|
+
return out
|
307
|
+
##########################################
|
308
|
+
## parse a Stream
|
309
|
+
##########################################
|
310
|
+
elsif @scanner.scan(/stream[\r\n]/)
|
311
|
+
@scanner.pos += 1 if @scanner.peek(1) == "\n".freeze && @scanner.matched[-1] != "\n".freeze
|
312
|
+
# the following was dicarded because some PDF files didn't have an EOL marker as required
|
313
|
+
# str = @scanner.scan_until(/(\r\n|\r|\n)endstream/)
|
314
|
+
# instead, a non-strict RegExp is used:
|
315
|
+
str = @scanner.scan_until(/endstream/)
|
316
|
+
# raise error if the stream doesn't end.
|
317
|
+
raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
|
318
|
+
# need to remove end of stream
|
319
|
+
if out.last.is_a? Hash
|
320
|
+
# out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
|
321
|
+
out.last[:raw_stream_content] = unify_string str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
|
322
|
+
else
|
323
|
+
warn 'Stream not attached to dictionary!'
|
324
|
+
out << str.sub(/(\r\n|\n|\r)?endstream\z/, '').force_encoding(Encoding::ASCII_8BIT)
|
325
|
+
end
|
326
|
+
##########################################
|
327
|
+
## parse an Object after finished
|
328
|
+
##########################################
|
329
|
+
elsif str = @scanner.scan(/endobj/)
|
330
|
+
# what to do when this is an object?
|
331
|
+
if out.last.is_a? Hash
|
332
|
+
out << out.pop.merge(indirect_generation_number: out.pop, indirect_reference_id: out.pop)
|
333
|
+
else
|
334
|
+
out << { indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop }
|
335
|
+
end
|
336
|
+
fresh = true
|
337
|
+
# fix wkhtmltopdf use of PDF 1.1 Dest using symbols instead of strings
|
338
|
+
out.last[:Dest] = unify_string(out.last[:Dest].to_s) if out.last[:Dest] && out.last[:Dest].is_a?(Symbol)
|
339
|
+
# puts "!!!!!!!!! Error with :indirect_reference_id\n\nObject #{out.last} :indirect_reference_id = #{out.last[:indirect_reference_id]}" unless out.last[:indirect_reference_id].is_a?(Numeric)
|
340
|
+
##########################################
|
318
341
|
## Parse a comment
|
319
342
|
##########################################
|
320
343
|
elsif str = @scanner.scan(/\%/)
|
@@ -326,20 +349,6 @@ module CombinePDF
|
|
326
349
|
end
|
327
350
|
# puts "AFTER COMMENT: #{@scanner.peek 8}"
|
328
351
|
##########################################
|
329
|
-
## Parse a Name
|
330
|
-
##########################################
|
331
|
-
# old, probably working version: when str = @scanner.scan(/\/[\#\w\d\.\+\-\\\?\,]+/)
|
332
|
-
# I don't know how to write the /[\x21-\x7e___subtract_certain_hex_values_here____]+/
|
333
|
-
# all allowed regular caracters between ! and ~ : /[\x21-\x24\x26\x27\x2a-\x2e\x30-\x3b\x3d\x3f-\x5a\x5c\x5e-\x7a\x7c\x7e]+
|
334
|
-
# all characters that aren't white space or special: /[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+
|
335
|
-
elsif str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]*/)
|
336
|
-
out << (str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) { |a| a[1..2].hex.chr }).to_sym
|
337
|
-
##########################################
|
338
|
-
## Parse a Number
|
339
|
-
##########################################
|
340
|
-
elsif str = @scanner.scan(/[\+\-\.\d]+/)
|
341
|
-
str =~ /\./ ? (out << str.to_f) : (out << str.to_i)
|
342
|
-
##########################################
|
343
352
|
## Parse an Object Reference
|
344
353
|
##########################################
|
345
354
|
elsif @scanner.scan(/R/)
|
@@ -562,7 +571,7 @@ module CombinePDF
|
|
562
571
|
o = nil
|
563
572
|
else
|
564
573
|
o[:referenced_object] = obj_dir[[o[:indirect_reference_id], o[:indirect_generation_number]]]
|
565
|
-
warn "Couldn't connect reference for #{o}" if o[:referenced_object].nil?
|
574
|
+
warn "Couldn't connect reference for #{o}" if o[:referenced_object].nil? && (o[:indirect_reference_id] + o[:indirect_generation_number] != 0)
|
566
575
|
o.delete :indirect_reference_id
|
567
576
|
o.delete :indirect_generation_number
|
568
577
|
o = (o[:referenced_object] && o[:referenced_object][:indirect_without_dictionary]) || o
|
@@ -33,12 +33,14 @@ module CombinePDF
|
|
33
33
|
if obj.is_a?(Hash)
|
34
34
|
referenced = obj[:referenced_object]
|
35
35
|
if referenced && referenced.any?
|
36
|
-
tmp = resolved[referenced.object_id] || existing[referenced]
|
36
|
+
# tmp = resolved[referenced.object_id] || existing[referenced]
|
37
|
+
tmp = resolved[referenced.object_id] || (referenced[:raw_stream_content] && existing[referenced[:raw_stream_content]])
|
37
38
|
if tmp
|
38
39
|
obj[:referenced_object] = tmp
|
39
40
|
else
|
40
41
|
resolved[obj.object_id] = referenced
|
41
|
-
existing[referenced] = referenced
|
42
|
+
# existing[referenced] = referenced
|
43
|
+
existing[referenced[:raw_stream_content]] = referenced
|
42
44
|
should_resolve << referenced
|
43
45
|
@objects << referenced
|
44
46
|
end
|
@@ -150,7 +152,8 @@ module CombinePDF
|
|
150
152
|
catalog = rebuild_catalog
|
151
153
|
page_objects = catalog[:Pages][:referenced_object][:Kids].map { |e| @objects << e[:referenced_object]; e[:referenced_object] }
|
152
154
|
# adds every referenced object to the @objects (root), addition is performed as pointers rather then copies
|
153
|
-
add_referenced([page_objects, @forms_data, @names, @outlines, @info])
|
155
|
+
# add_referenced([page_objects, @forms_data, @names, @outlines, @info])
|
156
|
+
add_referenced(@objects.dup)
|
154
157
|
catalog
|
155
158
|
end
|
156
159
|
|
@@ -163,7 +166,7 @@ module CombinePDF
|
|
163
166
|
def renumber_object_ids(start = nil)
|
164
167
|
@set_start_id = start || @set_start_id
|
165
168
|
start = @set_start_id
|
166
|
-
history = {}
|
169
|
+
# history = {}
|
167
170
|
@objects.each do |obj|
|
168
171
|
obj[:indirect_reference_id] = start
|
169
172
|
start += 1
|
@@ -191,7 +194,7 @@ module CombinePDF
|
|
191
194
|
if pos[0].is_a? String
|
192
195
|
(pos.length / 2).times do |i|
|
193
196
|
dic << (pos[i * 2].clear << base.next!)
|
194
|
-
pos[(i * 2) + 1][0] = {is_reference_only: true, referenced_object: pages[pos[(i * 2) + 1][0]]} if(pos[(i * 2) + 1].is_a?(Array) && pos[(i * 2) + 1][0].is_a?(
|
197
|
+
pos[(i * 2) + 1][0] = {is_reference_only: true, referenced_object: pages[pos[(i * 2) + 1][0]]} if(pos[(i * 2) + 1].is_a?(Array) && pos[(i * 2) + 1][0].is_a?(Numeric))
|
195
198
|
dic << (pos[(i * 2) + 1].is_a?(Array) ? { is_reference_only: true, referenced_object: { indirect_without_dictionary: pos[(i * 2) + 1] } } : pos[(i * 2) + 1])
|
196
199
|
# dic << pos[(i * 2) + 1]
|
197
200
|
end
|
@@ -225,12 +228,13 @@ module CombinePDF
|
|
225
228
|
# preffering the new over the old.
|
226
229
|
def self.hash_merge_new_no_page(_key, old_data, new_data)
|
227
230
|
return old_data unless new_data
|
231
|
+
return new_data unless old_data
|
228
232
|
if old_data.is_a?(Hash) && new_data.is_a?(Hash)
|
229
233
|
return old_data if (old_data[:Type] == :Page)
|
230
234
|
old_data.merge(new_data, &(@hash_merge_new_no_page_proc ||= method(:hash_merge_new_no_page)))
|
231
235
|
elsif old_data.is_a? Array
|
232
|
-
|
233
|
-
old_data
|
236
|
+
return old_data + new_data if new_data.is_a?(Array)
|
237
|
+
return old_data.dup << new_data
|
234
238
|
elsif new_data.is_a? Array
|
235
239
|
new_data + [old_data]
|
236
240
|
else
|
@@ -100,7 +100,7 @@ module CombinePDF
|
|
100
100
|
@names = parser.names_object || {}
|
101
101
|
@forms_data = parser.forms_object || {}
|
102
102
|
@outlines = parser.outlines_object || {}
|
103
|
-
# rebuild the
|
103
|
+
# rebuild the catalog, to fix wkhtmltopdf's use of static page numbers
|
104
104
|
rebuild_catalog
|
105
105
|
|
106
106
|
# general globals
|
data/lib/combine_pdf/renderer.rb
CHANGED
@@ -20,7 +20,7 @@ module CombinePDF
|
|
20
20
|
return format_name_to_pdf object
|
21
21
|
elsif object.is_a?(Array)
|
22
22
|
return format_array_to_pdf object
|
23
|
-
elsif object.is_a?(
|
23
|
+
elsif object.is_a?(Numeric) || object.is_a?(TrueClass) || object.is_a?(FalseClass)
|
24
24
|
return object.to_s
|
25
25
|
elsif object.is_a?(Hash)
|
26
26
|
return format_hash_to_pdf object
|
data/lib/combine_pdf/version.rb
CHANGED
data/test/automated
CHANGED
@@ -14,6 +14,7 @@ require 'bundler/setup'
|
|
14
14
|
|
15
15
|
pdf = CombinePDF.load "./Ruby/test\ pdfs/filled_form.pdf"
|
16
16
|
pdf.save '01_check_radio_buttuns.pdf'
|
17
|
+
pdf = CombinePDF.load "./Ruby/test\ pdfs/filled_form.pdf"
|
17
18
|
pdf << CombinePDF.load("./Ruby/test\ pdfs/empty_form.pdf")
|
18
19
|
pdf << CombinePDF.load("./Ruby/test\ pdfs/filled_form.pdf")
|
19
20
|
pdf.save '02_check_form_unification_middle_is_empty.pdf'
|
@@ -51,7 +52,16 @@ pdf.save '06_check_links_to_second_copy.pdf'
|
|
51
52
|
lists = %w(./Ruby/test\ pdfs/outlines/self_merge_err.pdf ./Ruby/test\ pdfs/outlines/big_toc.pdf ./Ruby/test\ pdfs/outlines/bigger_toc.pdf ./Ruby/test\ pdfs/outlines/named_dest_no_toc.pdf ./Ruby/test\ pdfs/outlines/named_dest_no_toc2.pdf ./Ruby/test\ pdfs/outlines/named_dest.pdf ./Ruby/test\ pdfs/outlines/named_dest2.pdf)
|
52
53
|
|
53
54
|
i = 0
|
54
|
-
lists.each
|
55
|
+
lists.each do |n|
|
56
|
+
# puts "loading #{n}"
|
57
|
+
pdf = CombinePDF.load(n)
|
58
|
+
# puts "saving 07_#{(i += 1)}_#{n.split('/')[-1]}"
|
59
|
+
pdf.save("07_#{(i += 1)}_#{n.split('/')[-1]}")
|
60
|
+
# puts "loading #{n}X2"
|
61
|
+
pdf = CombinePDF.load(n) << CombinePDF.load(n)
|
62
|
+
# puts "saving 07_#{i}x2_#{n.split('/')[-1]}"
|
63
|
+
pdf.save("07_#{i}x2_#{n.split('/')[-1]}")
|
64
|
+
end
|
55
65
|
pdf = CombinePDF.new
|
56
66
|
lists.each { |n| pdf << CombinePDF.load(n) }
|
57
67
|
pdf.save('07_named destinations.pdf')
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: combine_pdf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.37
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Boaz Segev
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-05-
|
11
|
+
date: 2017-05-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-rc4
|
@@ -104,7 +104,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
104
104
|
version: '0'
|
105
105
|
requirements: []
|
106
106
|
rubyforge_project:
|
107
|
-
rubygems_version: 2.6.
|
107
|
+
rubygems_version: 2.6.11
|
108
108
|
signing_key:
|
109
109
|
specification_version: 4
|
110
110
|
summary: Combine, stamp and watermark PDF files in pure Ruby.
|