combine_pdf 1.0.3 → 1.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/combine_pdf/parser.rb +62 -44
- data/lib/combine_pdf/version.rb +1 -1
- data/test/automated +4 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a262d8592dbe90e0a4930aebadda8866b731a586
|
4
|
+
data.tar.gz: fdf9a2877028b673f55d23741f8c8f59af647a4f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 763aa425d24ef58b83717123f4ef7c962d2ecd083a39275df136950872e286af49d0efbc8f8d2a2f4cf0967439848c151e587073497a8ff1a7fd9b0f9ef42d7f
|
7
|
+
data.tar.gz: 1c86313e09d88a07e4a2ee43fd53ce445b784770140a6892854d8f50d9dff9140c5fd993ae4d50b54ffa12f0cea6fb706a1026e18304185ccdedae3ceee0fbcb
|
data/lib/combine_pdf/parser.rb
CHANGED
@@ -52,6 +52,7 @@ module CombinePDF
|
|
52
52
|
@forms_object = {}.dup
|
53
53
|
@metadata = nil
|
54
54
|
@strings_dictionary = {}.dup # all strings are one string
|
55
|
+
@resolution_hash = {}.dup
|
55
56
|
@version = nil
|
56
57
|
@scanner = nil
|
57
58
|
@allow_optional_content = options[:allow_optional_content]
|
@@ -95,18 +96,20 @@ module CombinePDF
|
|
95
96
|
# do we really need to apply to @parsed? No, there is no need.
|
96
97
|
end
|
97
98
|
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
99
|
+
# search for objects streams and replace them "in-place"
|
100
|
+
# the inplace resolution prevents versioning errors
|
101
|
+
while (true)
|
102
|
+
found_object_streams = false
|
103
|
+
@parsed.length.times do |i|
|
104
|
+
o = @parsed[i]
|
105
|
+
next unless o.is_a?(Hash) && o[:Type] == :ObjStm
|
104
106
|
## un-encode (using the correct filter) the object streams
|
105
107
|
PDFFilter.inflate_object o
|
106
|
-
## extract objects from stream
|
108
|
+
## extract objects from stream
|
107
109
|
@scanner = StringScanner.new o[:raw_stream_content]
|
108
110
|
stream_data = _parse_
|
109
111
|
id_array = []
|
112
|
+
collection = [nil]
|
110
113
|
while stream_data[0].is_a? (Numeric)
|
111
114
|
id_array << stream_data.shift
|
112
115
|
stream_data.shift
|
@@ -115,11 +118,42 @@ module CombinePDF
|
|
115
118
|
stream_data[0] = { indirect_without_dictionary: stream_data[0] } unless stream_data[0].is_a?(Hash)
|
116
119
|
stream_data[0][:indirect_reference_id] = id_array.shift
|
117
120
|
stream_data[0][:indirect_generation_number] = 0
|
118
|
-
|
121
|
+
collection << (stream_data.shift)
|
119
122
|
end
|
123
|
+
# place new objects right after this one (removing this one as well)
|
124
|
+
@parsed[i] = collection
|
125
|
+
found_object_streams = true
|
120
126
|
end
|
127
|
+
break unless found_object_streams
|
128
|
+
@parsed.flatten!
|
129
|
+
@parsed.compact!
|
121
130
|
end
|
122
131
|
|
132
|
+
#
|
133
|
+
# object_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm }
|
134
|
+
# unless object_streams.empty?
|
135
|
+
# warn 'PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects.'
|
136
|
+
#
|
137
|
+
# object_streams.each do |o|
|
138
|
+
# ## un-encode (using the correct filter) the object streams
|
139
|
+
# PDFFilter.inflate_object o
|
140
|
+
# ## extract objects from stream to top level arry @parsed
|
141
|
+
# @scanner = StringScanner.new o[:raw_stream_content]
|
142
|
+
# stream_data = _parse_
|
143
|
+
# id_array = []
|
144
|
+
# while stream_data[0].is_a? (Numeric)
|
145
|
+
# id_array << stream_data.shift
|
146
|
+
# stream_data.shift
|
147
|
+
# end
|
148
|
+
# while id_array[0] && stream_data[0]
|
149
|
+
# stream_data[0] = { indirect_without_dictionary: stream_data[0] } unless stream_data[0].is_a?(Hash)
|
150
|
+
# stream_data[0][:indirect_reference_id] = id_array.shift
|
151
|
+
# stream_data[0][:indirect_generation_number] = 0
|
152
|
+
# @parsed << stream_data.shift
|
153
|
+
# end
|
154
|
+
# end
|
155
|
+
# end
|
156
|
+
|
123
157
|
# serialize_objects_and_references.catalog_pages
|
124
158
|
|
125
159
|
# Benchmark.bm do |bm|
|
@@ -149,6 +183,9 @@ module CombinePDF
|
|
149
183
|
else
|
150
184
|
@info_object = {}
|
151
185
|
end
|
186
|
+
|
187
|
+
# we can clear the resolution hash now
|
188
|
+
@resolution_hash.clear if @resolution_hash
|
152
189
|
# # # ## remove object streams - if they exist
|
153
190
|
# @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :ObjStm}
|
154
191
|
# # # ## remove XREF dictionaries - if they exist
|
@@ -377,7 +414,7 @@ module CombinePDF
|
|
377
414
|
if @scanner.matched[-1] == 'r'
|
378
415
|
if @scanner.skip_until(/<</)
|
379
416
|
data = _parse_
|
380
|
-
@root_object ||= {}
|
417
|
+
(@root_object ||= {}).clear
|
381
418
|
@root_object[data.shift] = data.shift while data[0]
|
382
419
|
end
|
383
420
|
##########
|
@@ -514,39 +551,6 @@ module CombinePDF
|
|
514
551
|
self
|
515
552
|
end
|
516
553
|
|
517
|
-
def get_refernced_object(reference_hash = {})
|
518
|
-
@parsed.each do |stored_object|
|
519
|
-
return stored_object if stored_object.is_a?(Hash) &&
|
520
|
-
reference_hash[:indirect_reference_id] == stored_object[:indirect_reference_id] &&
|
521
|
-
reference_hash[:indirect_generation_number] == stored_object[:indirect_generation_number]
|
522
|
-
# return (stored_object[:indirect_without_dictionary] || stored_object) if stored_object.is_a?(Hash) &&
|
523
|
-
# reference_hash[:indirect_reference_id] == stored_object[:indirect_reference_id] &&
|
524
|
-
# reference_hash[:indirect_generation_number] == stored_object[:indirect_generation_number]
|
525
|
-
end
|
526
|
-
warn "didn't find reference #{reference_hash}"
|
527
|
-
nil
|
528
|
-
end
|
529
|
-
|
530
|
-
# # @private
|
531
|
-
# # connects references and objects, according to their reference id's.
|
532
|
-
# #
|
533
|
-
# # should be moved to the parser's workflow.
|
534
|
-
# #
|
535
|
-
# def serialize_objects_and_references_old
|
536
|
-
# obj_dir = {}
|
537
|
-
# # create a dictionary for referenced objects (no value resolution at this point)
|
538
|
-
# @parsed.each { |o| obj_dir[[o.delete(:indirect_reference_id), o.delete(:indirect_generation_number)]] = o }
|
539
|
-
# # @parsed.each {|o| obj_dir[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
|
540
|
-
# @references.each do |obj|
|
541
|
-
# obj[:referenced_object] = obj_dir[[obj[:indirect_reference_id], obj[:indirect_generation_number]]]
|
542
|
-
# warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj}" unless obj[:referenced_object]
|
543
|
-
# obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
|
544
|
-
# end
|
545
|
-
# obj_dir.clear
|
546
|
-
# @references.clear
|
547
|
-
# self
|
548
|
-
# end
|
549
|
-
|
550
554
|
# @private
|
551
555
|
# connects references and objects, according to their reference id's.
|
552
556
|
#
|
@@ -556,9 +560,23 @@ module CombinePDF
|
|
556
560
|
#
|
557
561
|
def serialize_objects_and_references
|
558
562
|
obj_dir = {}
|
563
|
+
objid_cache = {}
|
559
564
|
# create a dictionary for referenced objects (no value resolution at this point)
|
560
|
-
#
|
561
|
-
@parsed.
|
565
|
+
# at the same time, delete duplicates and old versions when objects have multiple versions
|
566
|
+
@parsed.uniq!
|
567
|
+
@parsed.length.times do |i|
|
568
|
+
o = @parsed[i]
|
569
|
+
objid_cache[o.object_id] = i
|
570
|
+
tmp_key = [o[:indirect_reference_id], o[:indirect_generation_number]]
|
571
|
+
if tmp_found = obj_dir[tmp_key]
|
572
|
+
tmp_found.clear
|
573
|
+
@parsed[objid_cache[tmp_found.object_id]] = nil
|
574
|
+
end
|
575
|
+
obj_dir[tmp_key] = o
|
576
|
+
end
|
577
|
+
@parsed.compact!
|
578
|
+
objid_cache.clear
|
579
|
+
|
562
580
|
should_resolve = [@parsed, @root_object]
|
563
581
|
while should_resolve.count > 0
|
564
582
|
obj = should_resolve.pop
|
data/lib/combine_pdf/version.rb
CHANGED
data/test/automated
CHANGED
@@ -19,6 +19,10 @@ pdf << CombinePDF.load("./Ruby/test\ pdfs/empty_form.pdf")
|
|
19
19
|
pdf << CombinePDF.load("./Ruby/test\ pdfs/filled_form.pdf")
|
20
20
|
pdf.save '02_check_form_unification_middle_is_empty.pdf'
|
21
21
|
|
22
|
+
pdf = CombinePDF.load "./Ruby/test\ pdfs/check_form_data__objstreams_w_versions.pdf"
|
23
|
+
pdf.save '02_01_check_form_data_ordering_issue.pdf'
|
24
|
+
|
25
|
+
|
22
26
|
pdf = CombinePDF.load './Ruby/test pdfs/share-font-background.pdf'
|
23
27
|
pdf2 = CombinePDF.load './Ruby/test pdfs/share-font-foreground.pdf'
|
24
28
|
i = 0
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: combine_pdf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.3
|
4
|
+
version: 1.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Boaz Segev
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-07-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-rc4
|