combine_pdf 0.2.5 → 0.2.37

Sign up to get free protection for your applications and to get access to all the features.
@@ -5,137 +5,352 @@
5
5
  ## is subject to the same license.
6
6
  ########################################################
7
7
 
8
+ module CombinePDF
9
+ class PDF
10
+ protected
8
11
 
12
+ include Renderer
9
13
 
14
+ # RECORSIVE_PROTECTION = { Parent: true, Last: true}.freeze
10
15
 
16
# @private
# Some PDF objects contain references to other PDF objects.
#
# Walks the object graph reachable from `should_resolve` and appends every
# referenced object that is not yet known to `@objects`. When a reference
# points at an object that is equal to one already collected, the reference is
# re-pointed at the collected instance so both references share one object.
#
# This is used for internal operations, such as injecting data using the <<
# operator.
#
# should_resolve:: an Array used as the work list (Hashes, Arrays or nested
#                  mixes of both). NOTE: the array is consumed (emptied) in
#                  place by this method.
#
# Returns the (cleared) lookup Hash, exactly like the previous implementation;
# the value carries no information.
def add_referenced(should_resolve = [])
  # containers already visited during this walk, keyed by object_id
  resolved = {}
  # value-equality lookup of every object that is already part of @objects
  existing = {}
  @objects.each { |obj| existing[obj] = obj }
  # iterate over an explicit work list instead of recursing
  while should_resolve.any?
    obj = should_resolve.pop
    next if resolved[obj.object_id] # already visited
    if obj.is_a?(Hash)
      referenced = obj[:referenced_object]
      if referenced && referenced.any?
        # `obj` is a reference wrapper with a non-empty target
        known = resolved[referenced.object_id] || existing[referenced]
        if known
          # reuse the instance that was collected earlier
          obj[:referenced_object] = known
        else
          resolved[obj.object_id] = referenced
          existing[referenced] = referenced
          should_resolve << referenced
          @objects << referenced
        end
      else
        # a plain dictionary: queue every unvisited container value
        resolved[obj.object_id] = obj
        obj.each_value do |value|
          should_resolve << value if value.is_a?(Enumerable) && !resolved[value.object_id]
        end
      end
    elsif obj.is_a?(Array)
      resolved[obj.object_id] = obj
      should_resolve.concat obj
    end
  end
  resolved.clear
  existing.clear
end
11
57
 
58
+ # @private
59
+ def rebuild_catalog(*with_pages)
60
+ # # build page list v.1 Slow but WORKS
61
+ # # Benchmark testing value: 26.708394
62
+ # old_catalogs = @objects.select {|obj| obj.is_a?(Hash) && obj[:Type] == :Catalog}
63
+ # old_catalogs ||= []
64
+ # page_list = []
65
+ # PDFOperations._each_object(old_catalogs,false) { |p| page_list << p if p.is_a?(Hash) && p[:Type] == :Page }
12
66
 
13
- module CombinePDF
67
+ # build page list v.2 faster, better, and works
68
+ # Benchmark testing value: 0.215114
69
+ page_list = pages
14
70
 
71
+ # add pages to catalog, if requested
72
+ page_list.concat(with_pages) unless with_pages.empty?
15
73
 
16
- class PDF
17
-
18
- protected
19
-
20
- include Renderer
21
-
22
- # @private
23
- # Some PDF objects contain references to other PDF objects.
24
- #
25
- # this function adds the references contained in "object", but DOESN'T add the object itself.
26
- #
27
- # this is used for internal operations, such as injectng data using the << operator.
28
- def add_referenced(object)
29
- # add references but not root
30
- case
31
- when object.is_a?(Array)
32
- object.each {|it| add_referenced(it)}
33
- return true
34
- when object.is_a?(Hash)
35
- # first if statement is actually a workaround for a bug in Acrobat Reader, regarding duplicate pages.
36
- if object[:is_reference_only] && object[:referenced_object] && object[:referenced_object].is_a?(Hash) && object[:referenced_object][:Type] == :Page
37
- if @objects.find_index object[:referenced_object]
38
- @objects << (object[:referenced_object] = object[:referenced_object].dup)
39
- else
40
- @objects << object[:referenced_object]
41
- end
42
- elsif object[:is_reference_only] && object[:referenced_object]
43
- found_at = @objects.find_index object[:referenced_object]
44
- if found_at
45
- #if the objects are equal, they might still be different objects!
46
- # so, we need to make sure they are the same object for the pointers to effect id numbering
47
- # and formatting operations.
48
- object[:referenced_object] = @objects[found_at]
49
- # stop this path, there is no need to run over the Hash's keys and values
50
- return true
51
- else
52
- # @objects.include? object[:referenced_object] is bound to be false
53
- # the object wasn't found - add it to the @objects array
54
- @objects << object[:referenced_object]
55
- end
56
-
57
- end
58
- object.each do |k, v|
59
- add_referenced(v) unless k == :Parent
60
- end
61
- else
62
- return false
63
- end
64
- true
65
- end
66
-
67
- # @private
68
- def rebuild_catalog(*with_pages)
69
- # # build page list v.1 Slow but WORKS
70
- # # Benchmark testing value: 26.708394
71
- # old_catalogs = @objects.select {|obj| obj.is_a?(Hash) && obj[:Type] == :Catalog}
72
- # old_catalogs ||= []
73
- # page_list = []
74
- # PDFOperations._each_object(old_catalogs,false) { |p| page_list << p if p.is_a?(Hash) && p[:Type] == :Page }
75
-
76
- # build page list v.2 faster, better, and works
77
- # Benchmark testing value: 0.215114
78
- page_list = pages
79
-
80
- # add pages to catalog, if requested
81
- page_list.push(*with_pages) unless with_pages.empty?
82
-
83
- # build new Pages object
84
- pages_object = {Type: :Pages, Count: page_list.length, Kids: page_list.map {|p| {referenced_object: p, is_reference_only: true} } }
85
-
86
- # build new Catalog object
87
- catalog_object = {Type: :Catalog, Pages: {referenced_object: pages_object, is_reference_only: true} }
88
- catalog_object[:ViewerPreferences] = @viewer_preferences unless @viewer_preferences.empty?
89
-
90
- # point old Pages pointers to new Pages object
91
- ## first point known pages objects - enough?
92
- pages.each {|p| p[:Parent] = { referenced_object: pages_object, is_reference_only: true} }
93
- ## or should we, go over structure? (fails)
94
- # each_object {|obj| obj[:Parent][:referenced_object] = pages_object if obj.is_a?(Hash) && obj[:Parent].is_a?(Hash) && obj[:Parent][:referenced_object] && obj[:Parent][:referenced_object][:Type] == :Pages}
95
-
96
- # remove old catalog and pages objects
97
- @objects.reject! {|obj| obj.is_a?(Hash) && (obj[:Type] == :Catalog || obj[:Type] == :Pages) }
98
-
99
- # inject new catalog and pages objects
100
- @objects << pages_object
101
- @objects << catalog_object
102
-
103
- catalog_object
104
- end
105
-
106
- # @private
107
- # this is an alternative to the rebuild_catalog catalog method
108
- # this method is used by the to_pdf method, for streamlining the PDF output.
109
- # there is no point is calling the method before preparing the output.
110
- def rebuild_catalog_and_objects
111
- catalog = rebuild_catalog
112
- @objects = []
113
- @objects << catalog
114
- add_referenced catalog
115
- catalog
116
- end
117
-
118
- def get_existing_catalogs
119
- (@objects.select {|obj| obj.is_a?(Hash) && obj[:Type] == :Catalog}) || (@objects.select {|obj| obj.is_a?(Hash) && obj[:Type] == :Page})
120
- end
121
-
122
-
123
-
124
- # end
125
- # @private
126
- def renumber_object_ids(start = nil)
127
- @set_start_id = start || @set_start_id
128
- start = @set_start_id
129
- history = {}
130
- @objects.each do |obj|
131
- obj[:indirect_reference_id] = start
132
- start += 1
133
- end
134
- end
135
- def remove_old_ids
136
- @objects.each {|obj| obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number)}
137
- end
138
-
139
- end
140
- end
74
+ # duplicate any non-unique pages - This is a special case to resolve Adobe Acrobat Reader issues (see issues #19 and #81)
75
+ uniqueness = {}.dup
76
+ page_list.each { |page| page = page[:referenced_object] || page; page = page.dup if uniqueness[page.object_id]; uniqueness[page.object_id] = page }
77
+ page_list.clear
78
+ page_list = uniqueness.values
79
+ uniqueness.clear
80
+
81
+ # build new Pages object
82
+ page_object_kids = [].dup
83
+ pages_object = { Type: :Pages, Count: page_list.length, Kids: page_object_kids }
84
+ pages_object_reference = { referenced_object: pages_object, is_reference_only: true }
85
+ page_list.each { |pg| pg[:Parent] = pages_object_reference; page_object_kids << ({ referenced_object: pg, is_reference_only: true }) }
86
+
87
+ # rebuild/rename the names dictionary
88
+ rebuild_names
89
+ # build new Catalog object
90
+ catalog_object = { Type: :Catalog,
91
+ Pages: { referenced_object: pages_object, is_reference_only: true } }
92
+ # pages_object[:Parent] = { referenced_object: catalog_object, is_reference_only: true } # causes AcrobatReader to fail
93
+ catalog_object[:ViewerPreferences] = @viewer_preferences unless @viewer_preferences.empty?
94
+
95
+ # point old Pages pointers to new Pages object
96
+ ## first point known pages objects - enough?
97
+ pages.each { |p| p[:Parent] = { referenced_object: pages_object, is_reference_only: true } }
98
+ ## or should we, go over structure? (fails)
99
+ # each_object {|obj| obj[:Parent][:referenced_object] = pages_object if obj.is_a?(Hash) && obj[:Parent].is_a?(Hash) && obj[:Parent][:referenced_object] && obj[:Parent][:referenced_object][:Type] == :Pages}
100
+
101
+ # # remove old catalog and pages objects
102
+ # @objects.reject! { |obj| obj.is_a?(Hash) && (obj[:Type] == :Catalog || obj[:Type] == :Pages) }
103
+ # remove old objects list and trees
104
+ @objects.clear
105
+
106
+ # inject new catalog and pages objects
107
+ @objects << @info if @info
108
+ @objects << catalog_object
109
+ @objects << pages_object
110
+
111
+ # rebuild/rename the forms dictionary
112
+ if @forms_data.nil? || @forms_data.empty?
113
+ @forms_data = nil
114
+ else
115
+ @forms_data = { referenced_object: (@forms_data[:referenced_object] || @forms_data), is_reference_only: true }
116
+ catalog_object[:AcroForm] = @forms_data
117
+ @objects << @forms_data[:referenced_object]
118
+ end
119
+
120
+ # add the names dictionary
121
+ if @names && @names.length > 1
122
+ @objects << @names
123
+ catalog_object[:Names] = { referenced_object: @names, is_reference_only: true }
124
+ end
125
+ # add the outlines dictionary
126
+ if @outlines && @outlines.any?
127
+ @objects << @outlines
128
+ catalog_object[:Outlines] = { referenced_object: @outlines, is_reference_only: true }
129
+ end
130
+
131
+ catalog_object
132
+ end
133
+
134
+ def names_object
135
+ @names
136
+ end
137
+
138
+ def outlines_object
139
+ @outlines
140
+ end
141
+ # def forms_data
142
+ # @forms_data
143
+ # end
141
144
 
145
+ # @private
146
+ # this is an alternative to the rebuild_catalog method
147
+ # this method is used by the to_pdf method, for streamlining the PDF output.
148
+ # there is no point in calling the method before preparing the output.
149
+ def rebuild_catalog_and_objects
150
+ catalog = rebuild_catalog
151
+ page_objects = catalog[:Pages][:referenced_object][:Kids].map { |e| @objects << e[:referenced_object]; e[:referenced_object] }
152
+ # adds every referenced object to the @objects (root), addition is performed as pointers rather than copies
153
+ add_referenced([page_objects, @forms_data, @names, @outlines, @info])
154
+ catalog
155
+ end
156
+
157
# Returns the Catalog dictionaries found in `@objects`; when there are none,
# falls back to the Page dictionaries found in `@objects`.
#
# `select` always returns an Array (truthy even when empty), so the fallback
# has to test for emptiness explicitly - a plain `||` never reaches it.
def get_existing_catalogs
  catalogs = @objects.select { |obj| obj.is_a?(Hash) && obj[:Type] == :Catalog }
  return catalogs unless catalogs.empty?

  @objects.select { |obj| obj.is_a?(Hash) && obj[:Type] == :Page }
end
160
+
161
+ # end
162
+ # @private
163
# Assigns consecutive `:indirect_reference_id` values to every object in
# `@objects`, in order.
#
# start:: the first id to assign. When omitted, the previously stored
#         `@set_start_id` is reused; when given, it is stored in
#         `@set_start_id` for later calls.
#
# Returns `@objects`.
def renumber_object_ids(start = nil)
  @set_start_id = start || @set_start_id
  first_id = @set_start_id
  @objects.each_with_index do |obj, offset|
    obj[:indirect_reference_id] = first_id + offset
  end
end
172
+
173
+ def remove_old_ids
174
+ @objects.each { |obj| obj.delete(:indirect_reference_id); obj.delete(:indirect_generation_number) }
175
+ end
176
+
177
POSSIBLE_NAME_TREES = [:Dests, :AP, :Pages, :IDS, :Templates, :URLS, :JavaScript, :EmbeddedFiles, :AlternatePresentations, :Renditions].to_set.freeze

# Rebuilds the names dictionary, giving every name a fresh, sequential name.
#
# Called without a `name_tree`, the method rebuilds `@names`: each entry listed
# in POSSIBLE_NAME_TREES is flattened into a single-level name tree, the old
# trees are cleared and `@names` is replaced by (and the method returns) the
# new `{ Type: :Names, ... }` dictionary.
#
# Called with a `name_tree`, the method flattens that one tree (following
# `:Kids` and `:Names` iteratively, without recursion) and returns
# `{ referenced_object: { Names: [...] }, is_reference_only: true }`, or nil
# when `name_tree` isn't a Hash.
#
# Name strings are rewritten IN PLACE (`clear` + append), so any other object
# holding the same String instance sees the new name as well.
#
# base:: the name counter; it is advanced in place with `next!` and shared
#        between the trees so that names stay unique across all of them.
def rebuild_names(name_tree = nil, base = 'CombinePDF_0000000')
  # `next!` mutates the counter; never attempt that on a frozen String.
  base = base.dup if base.frozen?
  if name_tree
    return nil unless name_tree.is_a?(Hash)
    name_tree = name_tree[:referenced_object] || name_tree
    dic = []
    # map a names tree and return a valid name tree. Do not recurse.
    should_resolve = [name_tree[:Kids], name_tree[:Names]]
    resolved = [].to_set
    while should_resolve.any?
      pos = should_resolve.pop
      if pos.is_a? Array
        next if resolved.include?(pos.object_id)
        if pos[0].is_a? String
          # a flat [name, value, name, value, ...] list
          (pos.length / 2).times do |i|
            value = pos[(i * 2) + 1]
            dic << (pos[i * 2].clear << base.next!)
            # an Integer in the first slot is treated as an index into `pages`
            if value.is_a?(Array) && value[0].is_a?(Integer)
              value[0] = { is_reference_only: true, referenced_object: pages[value[0]] }
            end
            dic << (value.is_a?(Array) ? { is_reference_only: true, referenced_object: { indirect_without_dictionary: value } } : value)
          end
        else
          should_resolve.concat pos
        end
      elsif pos.is_a? Hash
        pos = pos[:referenced_object] || pos
        next if resolved.include?(pos.object_id)
        should_resolve << pos[:Kids] if pos[:Kids]
        should_resolve << pos[:Names] if pos[:Names]
      end
      resolved << pos.object_id
    end
    return { referenced_object: { Names: dic }, is_reference_only: true }
  end
  # Work on the actual dictionary when @names is a reference wrapper, and
  # tolerate a missing dictionary (the previous `@names ||= @names[...]` did
  # neither: it was a no-op for a Hash and raised for nil).
  current = @names.is_a?(Hash) ? (@names[:referenced_object] || @names) : {}
  new_names = { Type: :Names }
  POSSIBLE_NAME_TREES.each do |ntree|
    next unless current[ntree]
    new_names[ntree] = rebuild_names(current[ntree], base)
    current[ntree].clear
  end
  current.clear
  @names = new_names
end
222
+
223
+ # @private
224
+ # this method reviews a Hash and updates it by merging Hash data,
225
+ # preferring the new over the old.
226
+ def self.hash_merge_new_no_page(_key, old_data, new_data)
227
+ return old_data unless new_data
228
+ if old_data.is_a?(Hash) && new_data.is_a?(Hash)
229
+ return old_data if (old_data[:Type] == :Page)
230
+ old_data.merge(new_data, &(@hash_merge_new_no_page_proc ||= method(:hash_merge_new_no_page)))
231
+ elsif old_data.is_a? Array
232
+ new_data = [new_data] unless new_data.is_a? Array
233
+ old_data + new_data
234
+ elsif new_data.is_a? Array
235
+ new_data + [old_data]
236
+ else
237
+ new_data
238
+ end
239
+ end
240
+
241
# Merges 2 outlines by appending one to the end or start of the other.
#
# old_data - the main outline, which is also the one that will be used in the resulting PDF.
#            It is updated in place.
# new_data - the outline to be merged in.
# position - an integer representing the position where a PDF is being inserted.
#            This method only differentiates between inserted at the beginning (0), or not.
#            Not at the beginning means the new outline is added to the end of the original outline.
#
# An outline base node (tree base) has :Type, :Count, :First, :Last
# Every node within the outline base node's :First or :Last can also have the following pointers to other nodes:
#   :First or :Last (only if the node has a subtree / subsection)
#   :Parent (the node's parent)
#   :Prev, :Next (previous and next node)
# Non-node-pointer data in these nodes:
#   :Title - the node's title displayed in the PDF outline
#   :Count - number of nodes in its subtree (0 if no subtree)
#   :Dest - node link destination (if the node is linking to something)
#
# Returns the merged outline (`old_data`), or `new_data` when there is no
# `old_data` object to update.
def merge_outlines(old_data, new_data, position)
  old_data = actual_object(old_data)
  new_data = actual_object(new_data)
  if old_data.nil? || old_data.empty? || old_data[:First].nil?
    # there is nothing to update in place when old_data doesn't exist
    return new_data if old_data.nil?
    # old_data is a reference to the actual object,
    # so if we update old_data, we're done, no need to take any further action
    old_data.update(new_data) if new_data
    return old_data
  end
  return old_data if new_data.nil? || new_data.empty? || new_data[:First].nil?

  new_data = new_data.dup # avoid corrupting the caller's base node
  # number of outline nodes, after the merge
  old_data[:Count] = old_data[:Count].to_i + new_data[:Count].to_i
  # Only the start / end insert positions are supported for now:
  #   head - the outline whose nodes come first after the merge
  #   tail - the outline whose nodes are appended after the head's nodes
  # FIXME implement the possibility to insert somewhere in the middle of the outline
  head, tail = position.nonzero? ? [old_data, new_data] : [new_data, old_data]
  prev = nil
  # first  - the first node of the merged outline
  # last   - the last node of the merged outline
  # median - the first node of the appended part
  # (all three are read before old_data's :First / :Last are overwritten)
  pos = first = actual_object(head[:First])
  last = actual_object(tail[:Last])
  median = { is_reference_only: true, referenced_object: actual_object(tail[:First]) }
  old_data[:First] = { is_reference_only: true, referenced_object: first }
  old_data[:Last] = { is_reference_only: true, referenced_object: last }
  # parent - the outline base node of the resulting merged outline
  parent = { is_reference_only: true, referenced_object: old_data }
  while pos
    # walk the top-level nodes and update :Parent as we go; once the tail is
    # appended, the loop keeps walking and updates the appended nodes as well.
    pos[:Parent] = parent if pos[:Parent]
    # connect the two outlines:
    # a node without :Next is the end of the list, which is where the tail
    # gets appended, the same way you would append to a two-way linked list.
    if pos[:Next].nil?
      # the appended head's :Prev must point at the node it is appended to
      # (`pos`), not at the node that was visited before it.
      median[:referenced_object][:Prev] = { is_reference_only: true, referenced_object: pos } if median
      pos[:Next] = median
      # median becomes nil because the loop keeps going after the appending
      # is done and the tail must not be appended a second time.
      median = nil
    end
    # iterate over the outline's main nodes (this does not go into subtrees)
    # while remembering the previously visited node
    prev = pos
    pos = actual_object(pos[:Next])
  end
  # make sure the last node has no :Next and the first node has no :Prev
  prev.delete :Next if prev
  actual_object(old_data[:First]).delete :Prev
  old_data
end
308
+
309
# Prints the whole outline hash to a file,
# with basic indentation and replacing raw streams with "RAW STREAM"
# (subbing doesn't always work that great for big streams).
#
# outline - the outline hash (anything responding to `to_s`)
# file    - "filename.filetype" string; the file is overwritten
#
# Every '{' starts a new line indented by the current nesting depth, every '}'
# is followed by a new line, and every newline in the text is followed by the
# current indentation.
#
# Returns the value of the final `write` call.
def print_outline_to_file(outline, file)
  outline_subbed_str = outline.to_s.gsub(/\:raw_stream_content=\>"(?:(?!"}).)*+"\}\}/, ':raw_stream_content=> RAW STREAM}}')
  brace_cnt = 0
  formatted_outline_str = ''.dup # mutable buffer
  outline_subbed_str.each_char do |c|
    case c
    when '{'
      formatted_outline_str << "\n" << "\t" * brace_cnt << c
      brace_cnt += 1
    when '}'
      brace_cnt -= 1
      brace_cnt = 0 if brace_cnt < 0 # unbalanced braces must not produce negative indentation
      formatted_outline_str << c << "\n" << "\t" * brace_cnt
    when "\n"
      # must be a double-quoted newline: '\n' is a backslash followed by "n"
      # and could never equal a single character
      formatted_outline_str << c << "\t" * brace_cnt
    else
      formatted_outline_str << c
    end
  end
  formatted_outline_str << "\n" * 10
  File.open(file, 'w') { |io| io.write(formatted_outline_str) }
end
335
+
336
+ private
337
+
338
# Builds a renaming dictionary for every String found inside `object`.
#
# Arrays and Hash values are walked recursively; each distinct String is mapped
# to the successor (`String#next`) of the most recently generated name,
# starting after 'Random_0001'. A String that already has an entry keeps it.
#
# object::     the structure to scan; defaults to `@names`.
# dictionary:: the Hash to fill (old name => new name).
#
# Returns `dictionary`.
def renaming_dictionary(object = nil, dictionary = {})
  object ||= @names
  case object
  when Array
    object.each do |item|
      if item.is_a?(String)
        # Hash has no `last`; the newest name is the last inserted value
        dictionary[item] ||= (dictionary.values.last || 'Random_0001').next
      elsif item.is_a?(Array) || item.is_a?(Hash)
        # only descend into containers: a nil item would otherwise fall back
        # to @names again and recurse without end
        renaming_dictionary(item, dictionary)
      end
    end
  when Hash
    object.each_value do |value|
      renaming_dictionary(value, dictionary) if value.is_a?(Array) || value.is_a?(Hash)
    end
  end
  dictionary
end
347
+
348
+ def rename_object(object, _dictionary)
349
+ case object
350
+ when Array
351
+ object.length.times { |i| }
352
+ when Hash
353
+ end
354
+ end
355
+ end
356
+ end