combine_pdf 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/combine_pdf/parser.rb +62 -44
- data/lib/combine_pdf/version.rb +1 -1
- data/test/automated +4 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a262d8592dbe90e0a4930aebadda8866b731a586
|
4
|
+
data.tar.gz: fdf9a2877028b673f55d23741f8c8f59af647a4f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 763aa425d24ef58b83717123f4ef7c962d2ecd083a39275df136950872e286af49d0efbc8f8d2a2f4cf0967439848c151e587073497a8ff1a7fd9b0f9ef42d7f
|
7
|
+
data.tar.gz: 1c86313e09d88a07e4a2ee43fd53ce445b784770140a6892854d8f50d9dff9140c5fd993ae4d50b54ffa12f0cea6fb706a1026e18304185ccdedae3ceee0fbcb
|
data/lib/combine_pdf/parser.rb
CHANGED
@@ -52,6 +52,7 @@ module CombinePDF
|
|
52
52
|
@forms_object = {}.dup
|
53
53
|
@metadata = nil
|
54
54
|
@strings_dictionary = {}.dup # all strings are one string
|
55
|
+
@resolution_hash = {}.dup
|
55
56
|
@version = nil
|
56
57
|
@scanner = nil
|
57
58
|
@allow_optional_content = options[:allow_optional_content]
|
@@ -95,18 +96,20 @@ module CombinePDF
|
|
95
96
|
# do we really need to apply to @parsed? No, there is no need.
|
96
97
|
end
|
97
98
|
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
99
|
+
# search for objects streams and replace them "in-place"
|
100
|
+
# the inplace resolution prevents versioning errors
|
101
|
+
while (true)
|
102
|
+
found_object_streams = false
|
103
|
+
@parsed.length.times do |i|
|
104
|
+
o = @parsed[i]
|
105
|
+
next unless o.is_a?(Hash) && o[:Type] == :ObjStm
|
104
106
|
## un-encode (using the correct filter) the object streams
|
105
107
|
PDFFilter.inflate_object o
|
106
|
-
## extract objects from stream
|
108
|
+
## extract objects from stream
|
107
109
|
@scanner = StringScanner.new o[:raw_stream_content]
|
108
110
|
stream_data = _parse_
|
109
111
|
id_array = []
|
112
|
+
collection = [nil]
|
110
113
|
while stream_data[0].is_a? (Numeric)
|
111
114
|
id_array << stream_data.shift
|
112
115
|
stream_data.shift
|
@@ -115,11 +118,42 @@ module CombinePDF
|
|
115
118
|
stream_data[0] = { indirect_without_dictionary: stream_data[0] } unless stream_data[0].is_a?(Hash)
|
116
119
|
stream_data[0][:indirect_reference_id] = id_array.shift
|
117
120
|
stream_data[0][:indirect_generation_number] = 0
|
118
|
-
|
121
|
+
collection << (stream_data.shift)
|
119
122
|
end
|
123
|
+
# place new objects right after this one (removing this one as well)
|
124
|
+
@parsed[i] = collection
|
125
|
+
found_object_streams = true
|
120
126
|
end
|
127
|
+
break unless found_object_streams
|
128
|
+
@parsed.flatten!
|
129
|
+
@parsed.compact!
|
121
130
|
end
|
122
131
|
|
132
|
+
#
|
133
|
+
# object_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm }
|
134
|
+
# unless object_streams.empty?
|
135
|
+
# warn 'PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects.'
|
136
|
+
#
|
137
|
+
# object_streams.each do |o|
|
138
|
+
# ## un-encode (using the correct filter) the object streams
|
139
|
+
# PDFFilter.inflate_object o
|
140
|
+
# ## extract objects from stream to top level arry @parsed
|
141
|
+
# @scanner = StringScanner.new o[:raw_stream_content]
|
142
|
+
# stream_data = _parse_
|
143
|
+
# id_array = []
|
144
|
+
# while stream_data[0].is_a? (Numeric)
|
145
|
+
# id_array << stream_data.shift
|
146
|
+
# stream_data.shift
|
147
|
+
# end
|
148
|
+
# while id_array[0] && stream_data[0]
|
149
|
+
# stream_data[0] = { indirect_without_dictionary: stream_data[0] } unless stream_data[0].is_a?(Hash)
|
150
|
+
# stream_data[0][:indirect_reference_id] = id_array.shift
|
151
|
+
# stream_data[0][:indirect_generation_number] = 0
|
152
|
+
# @parsed << stream_data.shift
|
153
|
+
# end
|
154
|
+
# end
|
155
|
+
# end
|
156
|
+
|
123
157
|
# serialize_objects_and_references.catalog_pages
|
124
158
|
|
125
159
|
# Benchmark.bm do |bm|
|
@@ -149,6 +183,9 @@ module CombinePDF
|
|
149
183
|
else
|
150
184
|
@info_object = {}
|
151
185
|
end
|
186
|
+
|
187
|
+
# we can clear the resolution hash now
|
188
|
+
@resolution_hash.clear if @resolution_hash
|
152
189
|
# # # ## remove object streams - if they exist
|
153
190
|
# @parsed.reject! {|obj| object_streams << obj if obj.is_a?(Hash) && obj[:Type] == :ObjStm}
|
154
191
|
# # # ## remove XREF dictionaries - if they exist
|
@@ -377,7 +414,7 @@ module CombinePDF
|
|
377
414
|
if @scanner.matched[-1] == 'r'
|
378
415
|
if @scanner.skip_until(/<</)
|
379
416
|
data = _parse_
|
380
|
-
@root_object ||= {}
|
417
|
+
(@root_object ||= {}).clear
|
381
418
|
@root_object[data.shift] = data.shift while data[0]
|
382
419
|
end
|
383
420
|
##########
|
@@ -514,39 +551,6 @@ module CombinePDF
|
|
514
551
|
self
|
515
552
|
end
|
516
553
|
|
517
|
-
def get_refernced_object(reference_hash = {})
|
518
|
-
@parsed.each do |stored_object|
|
519
|
-
return stored_object if stored_object.is_a?(Hash) &&
|
520
|
-
reference_hash[:indirect_reference_id] == stored_object[:indirect_reference_id] &&
|
521
|
-
reference_hash[:indirect_generation_number] == stored_object[:indirect_generation_number]
|
522
|
-
# return (stored_object[:indirect_without_dictionary] || stored_object) if stored_object.is_a?(Hash) &&
|
523
|
-
# reference_hash[:indirect_reference_id] == stored_object[:indirect_reference_id] &&
|
524
|
-
# reference_hash[:indirect_generation_number] == stored_object[:indirect_generation_number]
|
525
|
-
end
|
526
|
-
warn "didn't find reference #{reference_hash}"
|
527
|
-
nil
|
528
|
-
end
|
529
|
-
|
530
|
-
# # @private
|
531
|
-
# # connects references and objects, according to their reference id's.
|
532
|
-
# #
|
533
|
-
# # should be moved to the parser's workflow.
|
534
|
-
# #
|
535
|
-
# def serialize_objects_and_references_old
|
536
|
-
# obj_dir = {}
|
537
|
-
# # create a dictionary for referenced objects (no value resolution at this point)
|
538
|
-
# @parsed.each { |o| obj_dir[[o.delete(:indirect_reference_id), o.delete(:indirect_generation_number)]] = o }
|
539
|
-
# # @parsed.each {|o| obj_dir[ [ o.[](:indirect_reference_id), o.[](:indirect_generation_number) ] ] = o }
|
540
|
-
# @references.each do |obj|
|
541
|
-
# obj[:referenced_object] = obj_dir[[obj[:indirect_reference_id], obj[:indirect_generation_number]]]
|
542
|
-
# warn "couldn't connect a reference!!! could be a null or removed (empty) object, Silent error!!!\n Object raising issue: #{obj}" unless obj[:referenced_object]
|
543
|
-
# obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)
|
544
|
-
# end
|
545
|
-
# obj_dir.clear
|
546
|
-
# @references.clear
|
547
|
-
# self
|
548
|
-
# end
|
549
|
-
|
550
554
|
# @private
|
551
555
|
# connects references and objects, according to their reference id's.
|
552
556
|
#
|
@@ -556,9 +560,23 @@ module CombinePDF
|
|
556
560
|
#
|
557
561
|
def serialize_objects_and_references
|
558
562
|
obj_dir = {}
|
563
|
+
objid_cache = {}
|
559
564
|
# create a dictionary for referenced objects (no value resolution at this point)
|
560
|
-
#
|
561
|
-
@parsed.
|
565
|
+
# at the same time, delete duplicates and old versions when objects have multiple versions
|
566
|
+
@parsed.uniq!
|
567
|
+
@parsed.length.times do |i|
|
568
|
+
o = @parsed[i]
|
569
|
+
objid_cache[o.object_id] = i
|
570
|
+
tmp_key = [o[:indirect_reference_id], o[:indirect_generation_number]]
|
571
|
+
if tmp_found = obj_dir[tmp_key]
|
572
|
+
tmp_found.clear
|
573
|
+
@parsed[objid_cache[tmp_found.object_id]] = nil
|
574
|
+
end
|
575
|
+
obj_dir[tmp_key] = o
|
576
|
+
end
|
577
|
+
@parsed.compact!
|
578
|
+
objid_cache.clear
|
579
|
+
|
562
580
|
should_resolve = [@parsed, @root_object]
|
563
581
|
while should_resolve.count > 0
|
564
582
|
obj = should_resolve.pop
|
data/lib/combine_pdf/version.rb
CHANGED
data/test/automated
CHANGED
@@ -19,6 +19,10 @@ pdf << CombinePDF.load("./Ruby/test\ pdfs/empty_form.pdf")
|
|
19
19
|
pdf << CombinePDF.load("./Ruby/test\ pdfs/filled_form.pdf")
|
20
20
|
pdf.save '02_check_form_unification_middle_is_empty.pdf'
|
21
21
|
|
22
|
+
pdf = CombinePDF.load "./Ruby/test\ pdfs/check_form_data__objstreams_w_versions.pdf"
|
23
|
+
pdf.save '02_01_check_form_data_ordering_issue.pdf'
|
24
|
+
|
25
|
+
|
22
26
|
pdf = CombinePDF.load './Ruby/test pdfs/share-font-background.pdf'
|
23
27
|
pdf2 = CombinePDF.load './Ruby/test pdfs/share-font-foreground.pdf'
|
24
28
|
i = 0
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: combine_pdf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.3
|
4
|
+
version: 1.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Boaz Segev
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-07-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-rc4
|