combine_pdf 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,396 @@
1
+ # -*- encoding : utf-8 -*-
2
+ ########################################################
3
+ ## Thoughts from reading the ISO 32000-1:2008
4
+ ## this file is part of the CombinePDF library and the code
5
+ ## is subject to the same license.
6
+ ########################################################
7
+ module CombinePDF
8
+ ########################################################
9
+ ## PDF class is the PDF object that can save itself to
10
+ ## a file and that can be used as a container for a full
11
+ ## PDF file data, including the version, etc.
12
+ ########################################################
13
+ class PDF
14
+ attr_reader :objects, :info
15
+ attr_accessor :string_output
16
+ attr_accessor :version
17
# Builds a PDF container from one of three kinds of input:
# - a PDFParser: objects come from #parse, the version is taken when it is a
#   Float, and the info dictionary from #info_object (or an empty Hash);
# - an Array of objects, optionally followed by a Float version;
# - one or more Hashes, in which case the whole argument list becomes the
#   object list.
# Any other input leaves the PDF empty. References are then connected to their
# objects and the output defaults are set.
def initialize(*args)
  # defaults, used when the input does not supply a value
  @objects = []
  @version = 0
  @info = {}

  source = args[0]
  case source
  when PDFParser
    @objects = source.parse
    @version = source.version if source.version.is_a? Float
    @info = source.info_object || {}
  when Array
    # object initialization: the second argument may carry the version
    @objects = source
    @version = args[1] if args[1].is_a? Float
  when Hash
    @objects = args
  end

  # connect references with the objects they point at
  serialize_objects_and_references

  # general settings
  @string_output = :literal
  @need_to_rebuild_resources = false
  @set_start_id = 1

  # stamp the producer and drop dates inherited from the source file
  @info[:Producer] = "Ruby CombinePDF Library by Boaz Segev"
  @info.delete :CreationDate
  @info.delete :ModDate
  warn "finished to initialize PDF object."
end
44
+
45
# Formats the data as a PDF file and returns a binary (ASCII-8BIT) String
# holding the file content.
# This method is used by save(file_name) to write the content to a file; call
# it directly to export the PDF without touching the disk (e.g. to send it
# over HTTP).
#
# Side effects: sets the version to 1.3 when none was set, stamps
# @info[:CreationDate], rebuilds the catalog (which replaces and renumbers
# @objects) and strips the PRIVATE_HASH_KEYS entries from @info.
def to_pdf
  # reset version if not specified
  @version = 1.3 if @version == 0
  # set creation date for the merged file
  @info[:CreationDate] = Time.now.strftime "D:%Y%m%d%H%M%S%:::z'00"
  # rebuild resources if needed
  rebuild_resources if @need_to_rebuild_resources
  catalog = rebuild_catalog_and_objects

  warn "Formatting PDF output"

  out = []
  xref = []
  indirect_object_count = 1 # the first object is the null object
  # write head (version and binary marker)
  out << "%PDF-#{@version}\n%\x00\x00\x00\x00".force_encoding(Encoding::ASCII_8BIT)

  # Collect objects and record their offsets for the xref table.
  # Offsets are BYTE offsets; the +1 accounts for the "\n" that join inserts
  # after every element of `out`.
  loc = out.last.bytesize + 1
  @objects.each do |o|
    indirect_object_count += 1
    xref << loc
    out << PDFOperations._object_to_pdf(o)
    loc += out.last.bytesize + 1
  end

  warn "Building XREF"
  # the xref table starts right after the last object
  xref_location = loc
  # Every xref entry must be exactly 20 bytes including its end-of-line
  # marker, hence the " \n" (space + line feed) ending.
  xref_table = "xref\n0 #{indirect_object_count}\n0000000000 65535 f \n"
  xref.each { |offset| xref_table << format("%010d 00000 n \n", offset) }
  out << (xref_table << "trailer")

  # A freshly built catalog may carry no generation number; default it to 0 so
  # the /Root reference is always well formed.
  out << "<<\n/Root #{catalog[:indirect_reference_id]} #{catalog[:indirect_generation_number] || 0} R"
  out << "/Size #{indirect_object_count}"
  if @info.is_a?(Hash)
    # make sure the dictionary is rendered inline, without stream
    PRIVATE_HASH_KEYS.each { |key| @info.delete key }
    out << "/Info #{PDFOperations._object_to_pdf @info}"
  end
  out << ">>\nstartxref\n#{xref_location}\n%%EOF"
  out.join("\n").force_encoding(Encoding::ASCII_8BIT)
end
91
+
92
# Saves the PDF to a file.
# save(file_name)
# - file_name is a string or path object for the output.
# Notice! if the file exists, it WILL be overwritten.
# Returns whatever File.binwrite returns (the number of bytes written).
def save(file_name)
  # File.binwrite (rather than IO.binwrite) always treats file_name as a path;
  # when called on IO itself a name starting with "|" may be run as a command.
  File.binwrite(file_name, to_pdf)
end
99
# Returns, in catalog order, all the pages reachable from the given catalog(s).
# If no catalog is passed, the existing Catalog object(s) in @objects are used.
#
# catalogs may be nil, an Array of objects, or a single Hash (a reference, a
# Catalog, a Pages node or a Page).
#
# Every Page hash returned also receives a singleton `<<` method that injects
# another object into that page and registers the injected object's
# dependencies with this PDF.
def pages(catalogs = nil)
  catalogs = @objects.select { |obj| obj.is_a?(Hash) && obj[:Type] == :Catalog } if catalogs.nil?
  page_list = []
  case catalogs
  when Array
    catalogs.each { |c| page_list.concat(pages(c)) unless c.nil? }
  when Hash
    if catalogs[:is_reference_only]
      # Connect a dangling reference to the object it names. Store the object
      # itself (never a page list) so the reference stays usable elsewhere.
      catalogs[:referenced_object] ||= PDFOperations.get_refernced_object(@objects, catalogs)
      if catalogs[:referenced_object]
        page_list.concat(pages(catalogs[:referenced_object]))
      else
        warn "couldn't follow reference!!! #{catalogs} not found!"
      end
    else
      case catalogs[:Type]
      when :Page
        holder = self
        catalogs.define_singleton_method(:<<) do |obj|
          obj = PDFOperations.copy_and_secure_for_injection obj
          PDFOperations.inject_to_page self, obj
          holder.add_referenced obj
        end
        page_list << catalogs
      when :Pages
        page_list.concat(pages(catalogs[:Kids])) unless catalogs[:Kids].nil?
      when :Catalog
        page_list.concat(pages(catalogs[:Pages])) unless catalogs[:Pages].nil?
      end
    end
  end
  page_list
end
138
+
139
# Returns every Page object that each_object yields, regardless of order and
# whether or not the page is listed in a catalog.
# Could be used for finding "lost" pages, but is otherwise of little use.
def all_pages
  found = []
  each_object do |candidate|
    found << candidate if candidate.is_a?(Hash) && candidate[:Type] == :Page
  end
  found
end
147
+
148
# Adds pages or CombinePDF objects at the end of the file (merge).
# for example:
#   pdf = CombinePDF.new "first_file.pdf"
#   pdf << CombinePDF.new("second_file.pdf")
#   pdf.save "both_files_merged.pdf"
#
# obj may be:
# - another PDF: its objects are renumbered and appended;
# - a Page hash, or an Array containing only Page hashes: the pages and the
#   objects they reference are appended and the catalog is rebuilt with them;
# - a top-level indirect object (a Hash with an :indirect_reference_id and no
#   :referenced_object): appended as is.
# Anything else is rejected with a warning.
def <<(obj)
  case
  when obj.is_a?(PDF)
    @version = [@version, obj.version].max

    obj.renumber_object_ids @set_start_id + @objects.length

    @objects.push(*obj.objects)
    @need_to_rebuild_resources = true
  when (obj.is_a?(Hash) && obj[:Type] == :Page), (obj.is_a?(Array) && obj.all? { |i| i.is_a?(Hash) && i[:Type] == :Page })
    # normalize a single page to a one-element list
    new_pages = obj.is_a?(Hash) ? [obj] : obj
    # add page(s) to objects
    @objects.push(*new_pages)
    # add page dependencies to objects
    add_referenced(new_pages)
    # add page(s) to the catalog; rebuild_catalog takes the pages as separate
    # arguments, so the list must be splatted
    rebuild_catalog(*new_pages)
    @need_to_rebuild_resources = true
  when obj.is_a?(Hash) && obj[:indirect_reference_id] && obj[:referenced_object].nil?
    # only let top level indirect objects into the PDF tree.
    @objects << obj
    @need_to_rebuild_resources = true
  else
    warn "Shouldn't add objects to the file if they are not top-level indirect PDF objects."
  end
end
184
+
185
# Connects every reference to the object it points at: each object yielded by
# each_object that is flagged :is_reference_only gets its :referenced_object
# set to the top-level object carrying the same
# [indirect_reference_id, indirect_generation_number] pair.
# A reference with no matching object is left set to nil and a warning is
# printed.
#
# The `object` parameter is accepted but not used.
#
# Implementation note ("version 4"): a lookup table is built once, instead of
# searching @objects for every reference. The original author benchmarked this
# at roughly 0.98 sec per 1000 runs against roughly 3.57 sec for the search
# based versions.
def serialize_objects_and_references(object = nil)
  warn "connecting objects with their references (serialize_objects_and_references)."

  lookup = @objects.each_with_object({}) do |top_level, table|
    table[[top_level[:indirect_reference_id], top_level[:indirect_generation_number]]] = top_level
  end

  each_object do |node|
    next unless node[:is_reference_only]
    node[:referenced_object] = lookup[[node[:indirect_reference_id], node[:indirect_generation_number]]]
    warn "couldn't connect a reference!!! could be a null object, Silent error!!!" unless node[:referenced_object]
  end
end
226
# Assigns consecutive :indirect_reference_id values to the top-level indirect
# objects (see all_indirect_object), in their current order.
#
# start - first ID to assign for this call. When omitted, numbering starts at
#         @set_start_id. Passing start does not change an @set_start_id that
#         is already set.
# Returns nil.
def renumber_object_ids(start = nil)
  warn "Resetting Object Reference IDs"
  @set_start_id ||= start
  # honor an explicit start; fall back to the stored default otherwise
  next_id = start || @set_start_id
  all_indirect_object.each do |obj|
    obj[:indirect_reference_id] = next_id
    next_id += 1
  end
  warn "Finished serializing IDs"
end
237
+
238
# Collects reference objects (objects flagged :is_reference_only) from the
# objects yielded by each_object.
# With no arguments every reference is returned; otherwise only the references
# that compare_reference_values matches against the given ID and generation
# number are returned.
def references(indirect_reference_id = nil, indirect_generation_number = nil)
  wanted = {indirect_reference_id: indirect_reference_id, indirect_generation_number: indirect_generation_number}
  take_all = indirect_reference_id.nil? && indirect_generation_number.nil?
  found = []
  each_object do |node|
    next unless node[:is_reference_only]
    found << node if take_all || compare_reference_values(wanted, node)
  end
  found
end
252
# Returns a new Array with the entries of @objects that are Hashes and have no
# :is_reference_only value (i.e. objects rather than references to objects).
def all_indirect_object
  @objects.select do |entry|
    entry.is_a?(Hash) && entry[:is_reference_only].nil?
  end
end
255
# Sorts @objects in place by :indirect_reference_id.
# Two entries are ordered by ID only when both are Hashes that carry an ID and
# are not references; any other pair is treated as equal.
# Returns @objects.
def sort_objects_by_id
  has_own_id = lambda do |o|
    o.is_a?(Hash) && o[:indirect_reference_id] && o[:is_reference_only].nil?
  end
  @objects.sort! do |a, b|
    # The comparison result must be the block's value. A `return` here would
    # leave the method on the first comparison without sorting anything.
    if has_own_id.call(a) && has_own_id.call(b)
      a[:indirect_reference_id] <=> b[:indirect_reference_id]
    else
      0
    end
  end
end
263
+
264
# Walks `object` (an Array or a Hash) and appends every object it references,
# directly or indirectly, to @objects.
# :Parent entries are never followed, so the walk does not climb back up to
# the page tree root.
#
# An object counts as already registered when @objects holds an equal (==)
# object. In that case the reference is re-pointed at the registered object, so
# the reference and the object list always agree on one object.
def add_referenced(object)
  case object
  when Array
    object.each { |item| add_referenced(item) }
  when Hash
    if object[:is_reference_only] && object[:referenced_object]
      target = object[:referenced_object]
      known_at = @objects.find_index(target)
      if known_at
        # equal objects may still be different instances; point at the
        # registered one so the reference never names an unlisted object
        object[:referenced_object] = @objects[known_at]
      else
        @objects << target
        target.each { |key, value| add_referenced(value) unless key == :Parent }
      end
    else
      object.each { |key, value| add_referenced(value) unless key == :Parent }
    end
  end
end
284
# Replaces the Catalog and Pages objects in @objects with a single new Catalog
# pointing at a single new, flat Pages object.
#
# with_pages - extra Page hashes to append after the pages already cataloged;
#              may be given as separate arguments or as Array(s) of pages.
#
# Every listed page (existing and added) gets its :Parent re-pointed at the new
# Pages object. Returns the new Catalog hash.
def rebuild_catalog(*with_pages)
  warn "Re-Building Catalog"

  # pages currently reachable through the catalog(s) ...
  page_list = pages
  # ... followed by the requested additions (accepts nested arrays)
  page_list.push(*with_pages.flatten)

  # build new Pages object
  pages_object = {Type: :Pages, Count: page_list.length, Kids: page_list.map { |p| {referenced_object: p, is_reference_only: true} } }

  # build new Catalog object
  catalog_object = {Type: :Catalog, Pages: {referenced_object: pages_object, is_reference_only: true} }

  # Point every listed page at the new Pages object. This includes the added
  # pages, whose old parent is about to be dropped or was never part of this
  # file.
  page_list.each { |page| page[:Parent] = {referenced_object: pages_object, is_reference_only: true} }

  # remove old catalog and pages objects
  @objects.reject! { |obj| obj.is_a?(Hash) && (obj[:Type] == :Catalog || obj[:Type] == :Pages) }

  # inject new catalog and pages objects
  @objects << pages_object
  @objects << catalog_object

  catalog_object
end
324
# An alternative to calling rebuild_catalog on its own: rebuilds the catalog,
# then resets @objects to hold only the catalog and the objects reachable from
# it, and finally renumbers the object IDs.
# to_pdf uses this method to streamline the PDF output.
# Returns the new Catalog hash.
def rebuild_catalog_and_objects
  fresh_catalog = rebuild_catalog
  @objects = [fresh_catalog]
  add_referenced fresh_catalog
  renumber_object_ids
  fresh_catalog
end
334
+
335
# Placeholder for resource de-duplication. Rebuilding is currently disabled:
# the method only prints a notice and returns true. It does not modify
# @objects and does not clear @need_to_rebuild_resources.
#
# The implementation that used to follow the early return was unreachable and
# has been removed. It sketched this approach: treat every top-level object
# other than Catalog / Pages / Page as a resource, keep one copy of each group
# of resources that are equal once the reference bookkeeping keys are ignored,
# and re-point references from the dropped copies to the kept one.
def rebuild_resources
  warn "Resources re-building disabled as it isn't worth the price in peformance as of yet."
  true
end
375
+
376
# Runs the given block through PDFOperations._each_object over @objects and
# returns whatever that call returns.
# NOTE(review): callers in this class treat every yielded value as a Hash and
# expect nested objects to be visited too - confirm against
# PDFOperations._each_object.
def each_object(&block)
  PDFOperations._each_object(@objects, &block)
end
380
# Returns true if the reference belongs to the object, i.e. both sides carry
# the same :indirect_reference_id and :indirect_generation_number.
# Either argument may be a reference that is already connected; in that case
# the IDs of its :referenced_object are the ones compared.
def compare_reference_values(obj, ref)
  left = obj[:referenced_object] || obj
  right = ref[:referenced_object] || ref
  left[:indirect_reference_id] == right[:indirect_reference_id] &&
    left[:indirect_generation_number] == right[:indirect_generation_number]
end
392
+
393
+
394
+ end
395
+ end
396
+
metadata ADDED
@@ -0,0 +1,66 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: combine_pdf
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Boaz Segev
8
+ - Masters of the open source community
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-09-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: ruby-rc4
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ">="
19
+ - !ruby/object:Gem::Version
20
+ version: '0'
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ version: '0'
28
+ description: A nifty gem, in pure Ruby, to parse PDF files and combine (merge) them
29
+ with other PDF files, watermark them or stamp them (all using the PDF file format).
30
+ email: bsegev@gmail.com
31
+ executables: []
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - lib/combine_pdf.rb
36
+ - lib/combine_pdf/combine_pdf_basic_writer.rb
37
+ - lib/combine_pdf/combine_pdf_decrypt.rb
38
+ - lib/combine_pdf/combine_pdf_filter.rb
39
+ - lib/combine_pdf/combine_pdf_parser.rb
40
+ - lib/combine_pdf/combine_pdf_pdf.rb
41
+ homepage: https://github.com/boazsegev/combine_pdf
42
+ licenses:
43
+ - GPLv3
44
+ metadata: {}
45
+ post_install_message:
46
+ rdoc_options: []
47
+ require_paths:
48
+ - lib
49
+ required_ruby_version: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 1.9.2
54
+ required_rubygems_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: '0'
59
+ requirements: []
60
+ rubyforge_project:
61
+ rubygems_version: 2.2.2
62
+ signing_key:
63
+ specification_version: 4
64
+ summary: Combine, stamp and watermark PDF files in pure Ruby.
65
+ test_files: []
66
+ has_rdoc: