combine_pdf 0.2.5 → 0.2.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,137 +5,352 @@
5
5
  ## is subject to the same license.
6
6
  ########################################################
7
7
 
8
+ module CombinePDF
9
+ class PDF
10
+ protected
8
11
 
12
+ include Renderer
9
13
 
14
+ # RECORSIVE_PROTECTION = { Parent: true, Last: true}.freeze
10
15
 
16
# @private
# Walks the object graph reachable from `should_resolve` and makes sure every
# object that is pointed to through a `:referenced_object` entry is listed in
# `@objects`.
#
# When a referenced object is equal to one that is already known, the
# reference is re-pointed at the known instance instead of adding a copy.
#
# should_resolve:: an Array used as the work stack; it is consumed (popped)
#                  while the graph is walked.
#
# Returns the (emptied) lookup Hash used for value-based matching.
def add_referenced(should_resolve = [])
  # objects already walked, keyed by object_id
  visited = {}
  # objects already listed in @objects, keyed by value (Hash equality)
  known = {}
  @objects.each { |item| known[item] = item }

  until should_resolve.none?
    current = should_resolve.pop
    next if visited[current.object_id] # already handled

    case current
    when Hash
      target = current[:referenced_object]
      if target && target.any?
        match = visited[target.object_id] || known[target]
        if match
          # reuse the instance that is already part of the document
          current[:referenced_object] = match
        else
          visited[current.object_id] = target
          known[target] = target
          should_resolve << target
          @objects << target
        end
      else
        visited[current.object_id] = current
        current.each_value do |value|
          next unless value.is_a?(Enumerable)
          next if visited[value.object_id]
          should_resolve << value
        end
      end
    when Array
      visited[current.object_id] = current
      should_resolve.concat(current)
    end
  end

  visited.clear
  known.clear
end
11
57
 
58
+ # @private
59
+ def rebuild_catalog(*with_pages)
60
+ # # build page list v.1 Slow but WORKS
61
+ # # Benchmark testing value: 26.708394
62
+ # old_catalogs = @objects.select {|obj| obj.is_a?(Hash) && obj[:Type] == :Catalog}
63
+ # old_catalogs ||= []
64
+ # page_list = []
65
+ # PDFOperations._each_object(old_catalogs,false) { |p| page_list << p if p.is_a?(Hash) && p[:Type] == :Page }
12
66
 
13
- module CombinePDF
67
+ # build page list v.2 faster, better, and works
68
+ # Benchmark testing value: 0.215114
69
+ page_list = pages
14
70
 
71
+ # add pages to catalog, if requested
72
+ page_list.concat(with_pages) unless with_pages.empty?
15
73
 
16
- class PDF
17
-
18
- protected
19
-
20
- include Renderer
21
-
22
- # @private
23
- # Some PDF objects contain references to other PDF objects.
24
- #
25
- # this function adds the references contained in "object", but DOESN'T add the object itself.
26
- #
27
- # this is used for internal operations, such as injectng data using the << operator.
28
- def add_referenced(object)
29
- # add references but not root
30
- case
31
- when object.is_a?(Array)
32
- object.each {|it| add_referenced(it)}
33
- return true
34
- when object.is_a?(Hash)
35
- # first if statement is actually a workaround for a bug in Acrobat Reader, regarding duplicate pages.
36
- if object[:is_reference_only] && object[:referenced_object] && object[:referenced_object].is_a?(Hash) && object[:referenced_object][:Type] == :Page
37
- if @objects.find_index object[:referenced_object]
38
- @objects << (object[:referenced_object] = object[:referenced_object].dup)
39
- else
40
- @objects << object[:referenced_object]
41
- end
42
- elsif object[:is_reference_only] && object[:referenced_object]
43
- found_at = @objects.find_index object[:referenced_object]
44
- if found_at
45
- #if the objects are equal, they might still be different objects!
46
- # so, we need to make sure they are the same object for the pointers to effect id numbering
47
- # and formatting operations.
48
- object[:referenced_object] = @objects[found_at]
49
- # stop this path, there is no need to run over the Hash's keys and values
50
- return true
51
- else
52
- # @objects.include? object[:referenced_object] is bound to be false
53
- # the object wasn't found - add it to the @objects array
54
- @objects << object[:referenced_object]
55
- end
56
-
57
- end
58
- object.each do |k, v|
59
- add_referenced(v) unless k == :Parent
60
- end
61
- else
62
- return false
63
- end
64
- true
65
- end
66
-
67
- # @private
68
- def rebuild_catalog(*with_pages)
69
- # # build page list v.1 Slow but WORKS
70
- # # Benchmark testing value: 26.708394
71
- # old_catalogs = @objects.select {|obj| obj.is_a?(Hash) && obj[:Type] == :Catalog}
72
- # old_catalogs ||= []
73
- # page_list = []
74
- # PDFOperations._each_object(old_catalogs,false) { |p| page_list << p if p.is_a?(Hash) && p[:Type] == :Page }
75
-
76
- # build page list v.2 faster, better, and works
77
- # Benchmark testing value: 0.215114
78
- page_list = pages
79
-
80
- # add pages to catalog, if requested
81
- page_list.push(*with_pages) unless with_pages.empty?
82
-
83
- # build new Pages object
84
- pages_object = {Type: :Pages, Count: page_list.length, Kids: page_list.map {|p| {referenced_object: p, is_reference_only: true} } }
85
-
86
- # build new Catalog object
87
- catalog_object = {Type: :Catalog, Pages: {referenced_object: pages_object, is_reference_only: true} }
88
- catalog_object[:ViewerPreferences] = @viewer_preferences unless @viewer_preferences.empty?
89
-
90
- # point old Pages pointers to new Pages object
91
- ## first point known pages objects - enough?
92
- pages.each {|p| p[:Parent] = { referenced_object: pages_object, is_reference_only: true} }
93
- ## or should we, go over structure? (fails)
94
- # each_object {|obj| obj[:Parent][:referenced_object] = pages_object if obj.is_a?(Hash) && obj[:Parent].is_a?(Hash) && obj[:Parent][:referenced_object] && obj[:Parent][:referenced_object][:Type] == :Pages}
95
-
96
- # remove old catalog and pages objects
97
- @objects.reject! {|obj| obj.is_a?(Hash) && (obj[:Type] == :Catalog || obj[:Type] == :Pages) }
98
-
99
- # inject new catalog and pages objects
100
- @objects << pages_object
101
- @objects << catalog_object
102
-
103
- catalog_object
104
- end
105
-
106
- # @private
107
- # this is an alternative to the rebuild_catalog catalog method
108
- # this method is used by the to_pdf method, for streamlining the PDF output.
109
- # there is no point is calling the method before preparing the output.
110
- def rebuild_catalog_and_objects
111
- catalog = rebuild_catalog
112
- @objects = []
113
- @objects << catalog
114
- add_referenced catalog
115
- catalog
116
- end
117
-
118
- def get_existing_catalogs
119
- (@objects.select {|obj| obj.is_a?(Hash) && obj[:Type] == :Catalog}) || (@objects.select {|obj| obj.is_a?(Hash) && obj[:Type] == :Page})
120
- end
121
-
122
-
123
-
124
- # end
125
- # @private
126
- def renumber_object_ids(start = nil)
127
- @set_start_id = start || @set_start_id
128
- start = @set_start_id
129
- history = {}
130
- @objects.each do |obj|
131
- obj[:indirect_reference_id] = start
132
- start += 1
133
- end
134
- end
135
- def remove_old_ids
136
- @objects.each {|obj| obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)}
137
- end
138
-
139
- end
140
- end
74
+ # duplicate any non-unique pages - This is a special case to resolve Adobe Acrobat Reader issues (see issues #19 and #81)
75
+ uniqueness = {}.dup
76
+ page_list.each { |page| page = page[:referenced_object] || page; page = page.dup if uniqueness[page.object_id]; uniqueness[page.object_id] = page }
77
+ page_list.clear
78
+ page_list = uniqueness.values
79
+ uniqueness.clear
80
+
81
+ # build new Pages object
82
+ page_object_kids = [].dup
83
+ pages_object = { Type: :Pages, Count: page_list.length, Kids: page_object_kids }
84
+ pages_object_reference = { referenced_object: pages_object, is_reference_only: true }
85
+ page_list.each { |pg| pg[:Parent] = pages_object_reference; page_object_kids << ({ referenced_object: pg, is_reference_only: true }) }
86
+
87
+ # rebuild/rename the names dictionary
88
+ rebuild_names
89
+ # build new Catalog object
90
+ catalog_object = { Type: :Catalog,
91
+ Pages: { referenced_object: pages_object, is_reference_only: true } }
92
+ # pages_object[:Parent] = { referenced_object: catalog_object, is_reference_only: true } # causes AcrobatReader to fail
93
+ catalog_object[:ViewerPreferences] = @viewer_preferences unless @viewer_preferences.empty?
94
+
95
+ # point old Pages pointers to new Pages object
96
+ ## first point known pages objects - enough?
97
+ pages.each { |p| p[:Parent] = { referenced_object: pages_object, is_reference_only: true } }
98
+ ## or should we, go over structure? (fails)
99
+ # each_object {|obj| obj[:Parent][:referenced_object] = pages_object if obj.is_a?(Hash) && obj[:Parent].is_a?(Hash) && obj[:Parent][:referenced_object] && obj[:Parent][:referenced_object][:Type] == :Pages}
100
+
101
+ # # remove old catalog and pages objects
102
+ # @objects.reject! { |obj| obj.is_a?(Hash) && (obj[:Type] == :Catalog || obj[:Type] == :Pages) }
103
+ # remove old objects list and trees
104
+ @objects.clear
105
+
106
+ # inject new catalog and pages objects
107
+ @objects << @info if @info
108
+ @objects << catalog_object
109
+ @objects << pages_object
110
+
111
+ # rebuild/rename the forms dictionary
112
+ if @forms_data.nil? || @forms_data.empty?
113
+ @forms_data = nil
114
+ else
115
+ @forms_data = { referenced_object: (@forms_data[:referenced_object] || @forms_data), is_reference_only: true }
116
+ catalog_object[:AcroForm] = @forms_data
117
+ @objects << @forms_data[:referenced_object]
118
+ end
119
+
120
+ # add the names dictionary
121
+ if @names && @names.length > 1
122
+ @objects << @names
123
+ catalog_object[:Names] = { referenced_object: @names, is_reference_only: true }
124
+ end
125
+ # add the outlines dictionary
126
+ if @outlines && @outlines.any?
127
+ @objects << @outlines
128
+ catalog_object[:Outlines] = { referenced_object: @outlines, is_reference_only: true }
129
+ end
130
+
131
+ catalog_object
132
+ end
133
+
134
# Reader for the names dictionary currently held in `@names`
# (nil when none has been set).
def names_object
  @names
end
137
+
138
# Reader for the outlines dictionary currently held in `@outlines`
# (nil when none has been set).
def outlines_object
  @outlines
end
141
+ # def forms_data
142
+ # @forms_data
143
+ # end
141
144
 
145
# @private
# Alternative to calling `rebuild_catalog` directly: rebuilds the catalog and
# then repopulates `@objects` with every page and everything the pages, forms,
# names, outlines and info objects reference.
#
# Intended for use right before the PDF output is produced.
#
# Returns the catalog object produced by `rebuild_catalog`.
def rebuild_catalog_and_objects
  catalog = rebuild_catalog
  kids = catalog[:Pages][:referenced_object][:Kids]
  page_objects = kids.map { |kid| kid[:referenced_object] }
  @objects.concat(page_objects)
  # referenced objects are added as pointers (the same instances), not copies
  add_referenced([page_objects, @forms_data, @names, @outlines, @info])
  catalog
end
156
+
157
# Returns the Hash objects in `@objects` whose `:Type` is `:Catalog`.
# When no catalog exists, falls back to the objects whose `:Type` is `:Page`.
#
# The previous implementation joined two `select` calls with `||`; since
# `select` always returns an Array (truthy even when empty) the page fallback
# could never run.
def get_existing_catalogs
  hashes = @objects.select { |obj| obj.is_a?(Hash) }
  catalogs = hashes.select { |obj| obj[:Type] == :Catalog }
  return catalogs unless catalogs.empty?

  hashes.select { |obj| obj[:Type] == :Page }
end
160
+
161
+ # end
162
# @private
# Assigns consecutive `:indirect_reference_id` values to every entry of
# `@objects`, in order.
#
# start:: first id to use; when omitted the previously stored
#         `@set_start_id` is reused. The value used is stored back in
#         `@set_start_id`.
#
# Returns `@objects`.
def renumber_object_ids(start = nil)
  @set_start_id = start || @set_start_id
  first_id = @set_start_id
  @objects.each_with_index { |object, offset| object[:indirect_reference_id] = first_id + offset }
end
172
+
173
# Strips the `:indirect_reference_id` and `:indirect_generation_number`
# entries from every object in `@objects`.
#
# Returns `@objects`.
def remove_old_ids
  @objects.each do |object|
    object.delete(:indirect_reference_id)
    object.delete(:indirect_generation_number)
  end
end
176
+
177
# Keys of the names dictionary that may hold a name tree.
POSSIBLE_NAME_TREES = [:Dests, :AP, :Pages, :IDS, :Templates, :URLS, :JavaScript, :EmbeddedFiles, :AlternatePresentations, :Renditions].to_set.freeze

# Rebuilds (and renames) name trees.
#
# With a `name_tree`: flattens the tree's `:Kids` / `:Names` entries into one
# flat `:Names` array and returns it wrapped in a reference Hash. Every name
# String is renamed in place to the next value of `base` (`base` itself is
# advanced with `next!`, so consecutive calls sharing the same `base` keep
# producing unique names). Array destinations are wrapped as
# `indirect_without_dictionary` references; an Integer in the first slot of
# such an array is replaced by a reference to `pages[n]`.
# Returns nil when `name_tree` is not a Hash.
#
# Without a `name_tree`: rebuilds every tree listed in POSSIBLE_NAME_TREES
# that exists in `@names`, clears the old data and replaces `@names` with the
# new dictionary, which is also the return value.
#
# name_tree:: a name tree Hash (or a reference to one), or nil.
# base::      the String used as the rename counter.
def rebuild_names(name_tree = nil, base = 'CombinePDF_0000000')
  # `next!` mutates the counter; never try that on a frozen String
  base = base.dup if base.frozen?
  if name_tree
    return nil unless name_tree.is_a?(Hash)
    name_tree = name_tree[:referenced_object] || name_tree
    dic = []
    # map a names tree and return a valid name tree. Do not recurse.
    should_resolve = [name_tree[:Kids], name_tree[:Names]]
    resolved = [].to_set
    while should_resolve.any?
      pos = should_resolve.pop
      if pos.is_a? Array
        next if resolved.include?(pos.object_id)
        if pos[0].is_a? String
          # a leaf: alternating [name, destination, name, destination, ...]
          (pos.length / 2).times do |i|
            name = pos[i * 2]
            dest = pos[(i * 2) + 1]
            # rename in place, so other holders of this String see the new name
            dic << (name.clear << base.next!)
            dest[0] = { is_reference_only: true, referenced_object: pages[dest[0]] } if dest.is_a?(Array) && dest[0].is_a?(Integer)
            dic << (dest.is_a?(Array) ? { is_reference_only: true, referenced_object: { indirect_without_dictionary: dest } } : dest)
          end
        else
          should_resolve.concat pos
        end
      elsif pos.is_a? Hash
        pos = pos[:referenced_object] || pos
        next if resolved.include?(pos.object_id)
        should_resolve << pos[:Kids] if pos[:Kids]
        should_resolve << pos[:Names] if pos[:Names]
      end
      resolved << pos.object_id
    end
    return { referenced_object: { Names: dic }, is_reference_only: true }
  end

  # Work on the actual dictionary (not on a reference wrapper) and tolerate a
  # missing one. The former `@names ||= @names[:referenced_object]` did
  # neither: it was a no-op when @names was set and raised when it was nil.
  @names = @names[:referenced_object] || @names if @names
  @names ||= {}

  new_names = { Type: :Names }
  POSSIBLE_NAME_TREES.each do |ntree|
    next unless @names[ntree]
    new_names[ntree] = rebuild_names(@names[ntree], base)
    @names[ntree].clear
  end
  @names.clear
  @names = new_names
end
222
+
223
# @private
# Conflict resolver used when merging Hash data; prefers the new data over
# the old.
#
# * no new data            -> the old data is kept
# * two Hashes             -> merged recursively (a Hash whose :Type is :Page
#                             is kept untouched)
# * old data is an Array   -> the new data is appended to it
# * new data is an Array   -> the old data is appended to it
# * anything else          -> the new data wins
def self.hash_merge_new_no_page(_key, old_data, new_data)
  return old_data unless new_data

  if old_data.is_a?(Hash) && new_data.is_a?(Hash)
    return old_data if old_data[:Type] == :Page
    @hash_merge_new_no_page_proc ||= method(:hash_merge_new_no_page)
    return old_data.merge(new_data, &@hash_merge_new_no_page_proc)
  end

  if old_data.is_a?(Array)
    addition = new_data.is_a?(Array) ? new_data : [new_data]
    return old_data + addition
  end

  new_data.is_a?(Array) ? new_data + [old_data] : new_data
end
240
+
241
# Merges two outlines by attaching one to the start or the end of the other.
#
# old_data:: the main outline; it is updated in place and is the one used in
#            the resulting PDF.
# new_data:: the outline to be attached.
# position:: an Integer insert position. Only "zero" (new outline goes first)
#            and "non-zero" (new outline is appended) are distinguished.
#
# An outline base node (tree base) has :Type, :Count, :First, :Last.
# Nodes reachable through :First / :Last may carry these node pointers:
#   :First / :Last (only when the node has a subtree), :Parent, :Prev, :Next
# and this non-pointer data:
#   :Title - the title displayed in the PDF outline
#   :Count - number of nodes in the node's subtree (0 if none)
#   :Dest  - link destination (if the node links to something)
#
# Both arguments are passed through `actual_object` first (defined elsewhere
# in this class; presumably it unwraps reference Hashes - confirm there).
#
# A nil `new_data` leaves `old_data` untouched and returns it; a nil
# `old_data` returns `new_data`, since there is nothing to update in place.
def merge_outlines(old_data, new_data, position)
  old_data = actual_object(old_data)
  new_data = actual_object(new_data)
  # nothing to merge / nothing to merge into
  return old_data if new_data.nil?
  return new_data if old_data.nil?

  if old_data.empty? || old_data[:First].nil?
    # old_data is the actual object, so updating it in place is all it takes
    old_data.update new_data
  elsif new_data.empty? || new_data[:First].nil?
    old_data
  else
    new_data = new_data.dup # avoid corrupting the source outline
    # number of outline nodes after the merge
    old_data[:Count] = old_data[:Count].to_i + new_data[:Count].to_i
    # Only start / end insertion is supported for now.
    # FIXME implement the possibility to insert somewhere in the middle of the outline
    #
    # head_source - outline whose nodes come first after the merge
    # tail_source - outline whose nodes are attached behind them
    head_source, tail_source = position.nonzero? ? [old_data, new_data] : [new_data, old_data]
    # first  - first node of the merged top level
    # last   - last node of the merged top level
    # median - reference to the first node of the attached part
    # parent - reference to the merged outline's base node
    pos = first = actual_object(head_source[:First])
    last = actual_object(tail_source[:Last])
    median = { is_reference_only: true, referenced_object: actual_object(tail_source[:First]) }
    old_data[:First] = { is_reference_only: true, referenced_object: first }
    old_data[:Last] = { is_reference_only: true, referenced_object: last }
    parent = { is_reference_only: true, referenced_object: old_data }
    prev = nil
    while pos
      # re-parent every top level node; once the second outline is linked in,
      # the walk continues over its nodes and re-parents them as well
      pos[:Parent] = parent if pos[:Parent]
      if pos[:Next].nil?
        # End of the current list: link the attached part behind `pos`, the
        # same way one appends to a doubly linked list. The back pointer must
        # name `pos` (the tail), not the node that came before it.
        median[:referenced_object][:Prev] = { is_reference_only: true, referenced_object: pos } if median
        pos[:Next] = median
        # the walk goes on over the attached nodes; clearing median makes sure
        # they are attached only once
        median = nil
      end
      # top level nodes only - subtrees are not entered
      prev = pos
      pos = actual_object(pos[:Next])
    end
    # the last node must not have a :Next and the first node no :Prev
    prev.delete :Next
    actual_object(old_data[:First]).delete :Prev
  end
end
308
+
309
# Debugging helper: writes an outline to a file with basic indentation,
# replacing raw stream contents with "RAW STREAM"
# (the substitution doesn't always work well for big streams).
#
# outline:: the outline Hash (its `to_s` output is what gets formatted).
# file::    the target file name, e.g. "filename.filetype".
#
# Every "{" starts a new line and increases the indentation, every "}"
# decreases it, and line breaks inside the text are re-indented to the
# current depth.
#
# Returns the number of bytes written.
def print_outline_to_file(outline, file)
  # accept both the `:key=>"..."` and the Ruby 3.4+ `key: "..."` inspect formats
  stream_pattern = /(?::raw_stream_content=\>|raw_stream_content: )"(?:(?!"}).)*+"\}\}/
  subbed = outline.to_s.gsub(stream_pattern, ':raw_stream_content=> RAW STREAM}}')
  depth = 0
  formatted = ''.dup
  subbed.each_char do |c|
    case c
    when '{'
      formatted << "\n" << "\t" * depth << c
      depth += 1
    when '}'
      depth = [depth - 1, 0].max
      formatted << c << "\n" << "\t" * depth
    when "\n" # double quotes: '\n' would be a backslash followed by "n"
      formatted << c << "\t" * depth
    else
      formatted << c
    end
  end
  formatted << "\n" * 10
  File.write(file, formatted)
end
335
+
336
+ private
337
+
338
# Collects every String found in a nested Array / Hash structure and maps it
# to a generated replacement name ("Random_0002", "Random_0003", ...).
#
# object::     the structure to scan; defaults to `@names`.
# dictionary:: the Hash the mapping is collected in (original => new name).
#
# A String that is already mapped keeps its name, so generated names stay
# unique. nil entries are skipped instead of restarting the scan at `@names`.
#
# Returns the dictionary.
def renaming_dictionary(object = nil, dictionary = {})
  object ||= @names
  case object
  when Array
    object.each do |item|
      if item.is_a?(String)
        next if dictionary.key?(item)
        # Hash has no #last; the newest entry is the last of the ordered values
        dictionary[item] = (dictionary.values.last || 'Random_0001').next
      elsif !item.nil?
        renaming_dictionary(item, dictionary)
      end
    end
  when Hash
    object.each_value { |value| renaming_dictionary(value, dictionary) unless value.nil? }
  end
  dictionary
end
347
+
348
# Placeholder: no renaming is performed yet.
#
# Returns the Array's length when `object` is an Array and nil for anything
# else; `_dictionary` is currently unused.
def rename_object(object, _dictionary)
  return object.length if object.is_a?(Array)

  nil
end
355
+ end
356
+ end