combine_pdf 0.2.30 → 0.2.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f81f9412da41045468ecaa6e10104fc9062eee8d
4
- data.tar.gz: 23410127dcabe19c6b9ddee352752f9b7bd0abb6
3
+ metadata.gz: 63b0c324e1bf003b0c0fc963eb2071b6c4672c18
4
+ data.tar.gz: 34c200edda06074773888c098b9d4f9a6479d752
5
5
  SHA512:
6
- metadata.gz: 03fdcce50faf9045930e435cbdf4d31e4ed96419f6e594796df89ab7c6e0891568467d5c1b632338c5e33fe17f3e68fe14c1c7c6c199e6d6d8a33ec47e438a46
7
- data.tar.gz: 24442ecac5ee2ed427de851eb5de4081ab8bb4a8625cdcaa2c9d4a84da86c79feaf3b02928ebee22462b2963a94ea9683fde5a47a3e6a35400d9ace2f3b13489
6
+ metadata.gz: ebf1cd2a7c1077f71d6f41037f0ad341c06e6bbb305bfa030833956feb0bb80576e2eaa5fd7324000ff95834f8c125cafc40a782ac97c16c7e3c7e7c02166794
7
+ data.tar.gz: a4cc257441939fbc0dd59dffe2d3faf11ade63b89fa2637bee8b54df7ea9e31a2ec5612d366ec9208856d750deac8fe10f1ddcaab6f9bf21996e6aa52c775e90
@@ -2,9 +2,21 @@
2
2
 
3
3
  ***
4
4
 
5
+ Change log v.0.2.31
6
+
7
+ **Broke**: Broke the fix for issue #65 so that Radio buttons data might be lost... working on a fix.
8
+
9
+ **Fix**: Fixed issue #82 (reintroduction of issue #19 due to core engine rewrite) related to a workaround for an issue with AcrobatReader. Credit to @gyuchang for testing and helping with the fix.
10
+
11
+ **Merge**: Merged pull request #80, fixing an issue with byte decoding. Credit to @gyuchang for the PR.
12
+
13
+ **Performance**: Improved performance for the reference and duplicate object resolution. Credit to @gyuchang for pointing out some optimization options.
14
+
15
+ ***
16
+
5
17
  Change log v.0.2.30
6
18
 
7
- **Fix**: Fixed an issue where HTTP artifacts before the beginning of a PDF file / string would prevent the PDF from being parsed. This fixes issue #78 reported by @robvitaro.
19
+ **Fix**: Fixed an issue where HTTP artifacts before the beginning of a PDF file / string would prevent the PDF from being parsed. This should fix issue #78 reported by @robvitaro.
8
20
 
9
21
  ***
10
22
 
@@ -200,7 +200,7 @@ module CombinePDF
200
200
  # instead, a non-strict RegExp is used:
201
201
  str = @scanner.scan_until(/endstream/)
202
202
  # raise error if the stream doesn't end.
203
- raise "Parsing Error: PDF file error - a stream object wasn't properly colsed using 'endstream'!" unless str
203
+ raise "Parsing Error: PDF file error - a stream object wasn't properly closed using 'endstream'!" unless str
204
204
  # need to remove end of stream
205
205
  if out.last.is_a? Hash
206
206
  # out.last[:raw_stream_content] = str[0...-10] #cuts only one EON char (\n or \r)
@@ -19,84 +19,42 @@ module CombinePDF
19
19
  # this function adds the references contained in `@objects`.
20
20
  #
21
21
  # this is used for internal operations, such as injecting data using the << operator.
22
- def add_referenced
22
+ def add_referenced(should_resolve = [])
23
23
  # add references but not root
24
- should_resolve = @objects.dup
25
24
  dup_pages = nil
26
- resolved = [].to_set
25
+ # an existing object map
26
+ resolved = {}.dup
27
+ existing = {}.dup
28
+ @objects.each { |obj| existing[obj] = obj }
29
+ # loop until should_resolve is empty
27
30
  while should_resolve.any?
28
31
  obj = should_resolve.pop
32
+ next if resolved[obj.object_id] # the object exists
29
33
  if obj.is_a?(Hash)
30
- next if resolved.include? obj.object_id
31
- resolved << obj.object_id
32
- if obj[:referenced_object]
33
- tmp = @objects.find_index(obj[:referenced_object])
34
+ referenced = obj[:referenced_object]
35
+ if referenced && referenced.any?
36
+ tmp = resolved[referenced.object_id] || existing[referenced]
34
37
  if tmp
35
- tmp = @objects[tmp]
36
38
  obj[:referenced_object] = tmp
37
39
  else
38
- tmp = obj[:referenced_object]
39
- should_resolve << tmp
40
- @objects << tmp
40
+ resolved[obj.object_id] = referenced
41
+ existing[referenced] = referenced
42
+ should_resolve << referenced
43
+ @objects << referenced
41
44
  end
42
45
  else
43
- obj.keys.each { |k| should_resolve << obj[k] unless k == :Parent || resolved.include?(obj[k].object_id) || !obj[k].is_a?(Enumerable) }
46
+ resolved[obj.object_id] = obj
47
+ obj.keys.each { |k| should_resolve << obj[k] unless !obj[k].is_a?(Enumerable) || resolved[obj[k].object_id] }
44
48
  end
45
49
  elsif obj.is_a?(Array)
46
- next if resolved.include? obj.object_id
47
- resolved << obj.object_id
50
+ resolved[obj.object_id] = obj
48
51
  should_resolve.concat obj
49
52
  end
50
53
  end
51
54
  resolved.clear
55
+ existing.clear
52
56
  end
53
57
 
54
- # # @private
55
- # # Some PDF objects contain references to other PDF objects.
56
- # #
57
- # # this function adds the references contained in "object", but DOESN'T add the object itself.
58
- # #
59
- # # this is used for internal operations, such as injectng data using the << operator.
60
- # def add_referenced(object, dup_pages = true)
61
- # # add references but not root
62
- # if object.is_a?(Array)
63
- # object.each { |it| add_referenced(it, dup_pages) }
64
- # return true
65
- # elsif object.is_a?(Hash)
66
- # # first if statement is actually a workaround for a bug in Acrobat Reader, regarding duplicate pages.
67
- # if dup_pages && object[:is_reference_only] && object[:referenced_object] && object[:referenced_object].is_a?(Hash) && object[:referenced_object][:Type] == :Page
68
- # if @objects.find_index object[:referenced_object]
69
- # @objects << (object[:referenced_object] = object[:referenced_object].dup)
70
- # else
71
- # @objects << object[:referenced_object]
72
- # end
73
- # elsif object[:is_reference_only] && object[:referenced_object]
74
- # found_at = @objects.find_index object[:referenced_object]
75
- # if found_at
76
- # # if the objects are equal, they might still be different objects!
77
- # # so, we need to make sure they are the same object for the pointers to effect id numbering
78
- # # and formatting operations.
79
- # object[:referenced_object] = @objects[found_at]
80
- # # stop this path, there is no need to run over the Hash's keys and values
81
- # return true
82
- # else
83
- # # stop if page propegation is false
84
- # return true if !dup_pages && object[:referenced_object][:Type] == :Page
85
- # # @objects.include? object[:referenced_object] is bound to be false
86
- # # the object wasn't found - add it to the @objects array
87
- # @objects << object[:referenced_object]
88
- # end
89
- #
90
- # end
91
- # object.each do |k, v|
92
- # add_referenced(v, dup_pages) unless RECORSIVE_PROTECTION[k]
93
- # end
94
- # else
95
- # return false
96
- # end
97
- # true
98
- # end
99
-
100
58
  # @private
101
59
  def rebuild_catalog(*with_pages)
102
60
  # # build page list v.1 Slow but WORKS
@@ -113,38 +71,62 @@ module CombinePDF
113
71
  # add pages to catalog, if requested
114
72
  page_list.concat(with_pages) unless with_pages.empty?
115
73
 
74
+ # duplicate any non-unique pages - This is a special case to resolve Adobe Acrobat Reader issues (see issues #19 and #81)
75
+ uniqueness = {}.dup
76
+ page_list.each { |page| page = page.dup if uniqueness[page.object_id]; uniqueness[page.object_id] = page }
77
+ page_list.clear
78
+ page_list = uniqueness.values
79
+ uniqueness.clear
80
+
116
81
  # build new Pages object
117
- pages_object = { Type: :Pages, Count: page_list.length, Kids: page_list.map { |p| { referenced_object: p, is_reference_only: true } } }
82
+ page_object_kids = [].dup
83
+ pages_object = { Type: :Pages, Count: page_list.length, Kids: page_object_kids }
84
+ pages_object_reference = { referenced_object: pages_object, is_reference_only: true }
85
+ page_list.each { |pg| pg[:Parent] = pages_object_reference; page_object_kids << ({ referenced_object: pg, is_reference_only: true }) }
118
86
 
119
87
  # rebuild/rename the names dictionary
120
88
  rebuild_names
121
89
  # build new Catalog object
122
90
  catalog_object = { Type: :Catalog,
123
- Pages: { referenced_object: pages_object, is_reference_only: true },
124
- Names: { referenced_object: @names, is_reference_only: true },
125
- Outlines: { referenced_object: @outlines, is_reference_only: true } }
91
+ Pages: { referenced_object: pages_object, is_reference_only: true } }
92
+ # pages_object[:Parent] = { referenced_object: catalog_object, is_reference_only: true } # causes AcrobatReader to fail
126
93
  catalog_object[:ViewerPreferences] = @viewer_preferences unless @viewer_preferences.empty?
127
94
 
128
- # rebuild/rename the forms dictionary
129
- if @forms_data.nil? || @forms_data.empty?
130
- @forms_data = nil
131
- else
132
- @forms_data = { referenced_object: (@forms_data[:referenced_object] || @forms_data), is_reference_only: true }
133
- catalog_object[:AcroForm] = @forms_data
134
- end
135
-
136
95
  # point old Pages pointers to new Pages object
137
96
  ## first point known pages objects - enough?
138
97
  pages.each { |p| p[:Parent] = { referenced_object: pages_object, is_reference_only: true } }
139
98
  ## or should we, go over structure? (fails)
140
99
  # each_object {|obj| obj[:Parent][:referenced_object] = pages_object if obj.is_a?(Hash) && obj[:Parent].is_a?(Hash) && obj[:Parent][:referenced_object] && obj[:Parent][:referenced_object][:Type] == :Pages}
141
100
 
142
- # remove old catalog and pages objects
143
- @objects.reject! { |obj| obj.is_a?(Hash) && (obj[:Type] == :Catalog || obj[:Type] == :Pages) }
101
+ # # remove old catalog and pages objects
102
+ # @objects.reject! { |obj| obj.is_a?(Hash) && (obj[:Type] == :Catalog || obj[:Type] == :Pages) }
103
+ # remove old objects list and trees
104
+ @objects.clear
144
105
 
145
106
  # inject new catalog and pages objects
146
- @objects << pages_object
107
+ @objects << @info if @info
147
108
  @objects << catalog_object
109
+ @objects << pages_object
110
+
111
+ # rebuild/rename the forms dictionary
112
+ if @forms_data.nil? || @forms_data.empty?
113
+ @forms_data = nil
114
+ else
115
+ @forms_data = { referenced_object: (@forms_data[:referenced_object] || @forms_data), is_reference_only: true }
116
+ catalog_object[:AcroForm] = @forms_data
117
+ @objects << @forms_data[:referenced_object]
118
+ end
119
+
120
+ # add the names dictionary
121
+ if @names && @names.length > 1
122
+ @objects << @names
123
+ catalog_object[:Names] = { referenced_object: @names, is_reference_only: true }
124
+ end
125
+ # add the outlines dictionary
126
+ if @outlines && @outlines.any?
127
+ @objects << @outlines
128
+ catalog_object[:Outlines] = { referenced_object: @outlines, is_reference_only: true }
129
+ end
148
130
 
149
131
  catalog_object
150
132
  end
@@ -166,26 +148,9 @@ module CombinePDF
166
148
  # there is no point in calling the method before preparing the output.
167
149
  def rebuild_catalog_and_objects
168
150
  catalog = rebuild_catalog
169
- @objects.clear
170
- @objects << @info
171
- @objects << catalog
172
- # fix Acrobat Reader issue with page reference uniqueness (must be unique or older Acrobat Reader fails)
173
- catalog[:Pages][:referenced_object][:Kids].each do |page|
174
- tmp = page[:referenced_object]
175
- tmp = page[:referenced_object] = tmp.dup if @objects.include? tmp
176
- @objects << tmp
177
- end
151
+ page_objects = catalog[:Pages][:referenced_object][:Kids].map { |e| @objects << e[:referenced_object]; e[:referenced_object] }
178
152
  # adds every referenced object to the @objects (root), addition is performed as pointers rather than copies
179
- # puts (Benchmark.measure do
180
- add_referenced
181
- # end)
182
- # @objects << @info
183
- # add_referenced @info
184
- # add_referenced catalog
185
- # add_referenced catalog[:Pages]
186
- # add_referenced catalog[:Names], false
187
- # add_referenced catalog[:Outlines], false
188
- # add_referenced catalog[:AcroForm], false
153
+ add_referenced([page_objects, @forms_data, @names, @outlines, @info])
189
154
  catalog
190
155
  end
191
156
 
@@ -304,9 +269,9 @@ module CombinePDF
304
269
  # parent - the outline base node of the resulting merged outline
305
270
  # FIXME implement the possibility to insert somewhere in the middle of the outline
306
271
  prev = nil
307
- pos = first = actual_object(((position != 0) ? old_data : new_data)[:First])
308
- last = actual_object(((position != 0) ? new_data : old_data)[:Last])
309
- median = { is_reference_only: true, referenced_object: actual_object(((position != 0) ? new_data : old_data)[:First]) }
272
+ pos = first = actual_object((position.nonzero? ? old_data : new_data)[:First])
273
+ last = actual_object((position.nonzero? ? new_data : old_data)[:Last])
274
+ median = { is_reference_only: true, referenced_object: actual_object((position.nonzero? ? new_data : old_data)[:First]) }
310
275
  old_data[:First] = { is_reference_only: true, referenced_object: first }
311
276
  old_data[:Last] = { is_reference_only: true, referenced_object: last }
312
277
  parent = { is_reference_only: true, referenced_object: old_data }
@@ -21,7 +21,7 @@ module CombinePDF
21
21
  elsif object.is_a?(Array)
22
22
  return format_array_to_pdf object
23
23
  elsif object.is_a?(Fixnum) || object.is_a?(Float) || object.is_a?(TrueClass) || object.is_a?(FalseClass)
24
- return object.to_s + ' '
24
+ return object.to_s
25
25
  elsif object.is_a?(Hash)
26
26
  return format_hash_to_pdf object
27
27
  else
@@ -33,12 +33,12 @@ module CombinePDF
33
33
  "\x0D" => '\\r',
34
34
  "\x09" => '\\t',
35
35
  "\x08" => '\\b',
36
- "\xFF" => '\\f',
36
+ "\x0C" => '\\f', # form-feed (\f) == 0x0C
37
37
  "\x28" => '\\(',
38
38
  "\x29" => '\\)',
39
39
  "\x5C" => '\\\\' }.dup
40
40
  32.times { |i| STRING_REPLACEMENT_HASH[i.chr] ||= "\\#{i}" }
41
- (256 - 128).times { |i| STRING_REPLACEMENT_HASH[(i + 127).chr] ||= "\\#{i + 127}" }
41
+ (256 - 127).times { |i| STRING_REPLACEMENT_HASH[(i + 127).chr] ||= "\\#{i + 127}" }
42
42
 
43
43
  def format_string_to_pdf(object)
44
44
  # object.force_encoding(Encoding::ASCII_8BIT)
@@ -1,3 +1,3 @@
1
1
  module CombinePDF
2
- VERSION = '0.2.30'.freeze
2
+ VERSION = '0.2.31'.freeze
3
3
  end
@@ -28,15 +28,16 @@ pdf = CombinePDF.load './Ruby/test pdfs/names_go_haywire_0.pdf'
28
28
  pdf << CombinePDF.load('./Ruby/test pdfs/names_go_haywire_1.pdf')
29
29
  pdf.save '04_check_view_and_names_reference.pdf'
30
30
 
31
- str = IO.binread './Ruby/test pdfs/outlines/self_merge_err.pdf'
31
+ pdf = CombinePDF.load('./Ruby/test pdfs/outlines/self_merge_err.pdf')
32
+ pdf.save '05_x1_scribus_test.pdf'
32
33
  pdf = CombinePDF.load('./Ruby/test pdfs/outlines/self_merge_err.pdf')
33
34
  pdf << CombinePDF.load('./Ruby/test pdfs/outlines/self_merge_err.pdf')
34
- pdf.save '05_scribus_test.pdf'
35
+ pdf.save '05_x2_scribus_test.pdf'
35
36
  # pdf = CombinePDF.load "./Ruby/test pdfs/named_dest.pdf";nil
36
37
  # pdf.save '05_check_named_dest_links.pdf' # this will take a while
37
38
  # pdf = CombinePDF.load "./Ruby/test pdfs/named_dest.pdf";nil
38
- # pdf << CombinePDF.load("./Ruby/test pdfs/named_dest.pdf");nil
39
- # pdf.save '05_1_check_named_dest_links.pdf' # never ends... :-(
39
+ pdf << CombinePDF.load('./Ruby/test pdfs/named_dest.pdf'); nil
40
+ pdf.save '05_1_timeless_check_named_dest_links.pdf' # never ends... :-(
40
41
 
41
42
  pdf = CombinePDF.load './Ruby/test pdfs/outline_small.pdf'
42
43
  pdf << CombinePDF.load('./Ruby/test pdfs/outline_small.pdf')
@@ -55,6 +56,17 @@ CombinePDF.load("./Ruby/test\ pdfs/Scribus-unknown_err2.pdf").save '08_2-unknown
55
56
  CombinePDF.load("./Ruby/test\ pdfs/Scribus-unknown_err3.pdf").save '08_3-unknown-err-empty-str.pdf'
56
57
 
57
58
  CombinePDF.load("/Users/2Be/Ruby/test\ pdfs/nil_object.pdf").save('09_nil_in_parsed_array.pdf')
59
+
60
+ require 'prawn'
61
+ IO.binwrite '10_prawn.pdf', (Prawn::Document.new { text 'Hello World!' }).render
62
+ page = CombinePDF.parse((Prawn::Document.new { text 'Hello World!' }).render)
63
+ pdf = CombinePDF.new
64
+ pdf << page
65
+ pdf.save '10_parsed_from_prawn.pdf'
66
+ pdf = CombinePDF.new
67
+ pdf << page << page
68
+ pdf.save('10_AcrobatReader_is_unique_page.pdf')
69
+
58
70
  # unify = [
59
71
  # "./Ruby/test\ pdfs/AESv2\ encrypted.pdf",
60
72
  # "./Ruby/test\ pdfs/data-in-comment.pdf",
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: combine_pdf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.30
4
+ version: 0.2.31
5
5
  platform: ruby
6
6
  authors:
7
7
  - Boaz Segev
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-07-27 00:00:00.000000000 Z
11
+ date: 2016-08-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-rc4
@@ -111,4 +111,3 @@ test_files:
111
111
  - test/automated
112
112
  - test/console
113
113
  - test/named_dest
114
- has_rdoc: