combine_pdf 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,396 @@
1
+ # -*- encoding : utf-8 -*-
2
+ ########################################################
3
+ ## Thoughts from reading the ISO 32000-1:2008
4
+ ## this file is part of the CombinePDF library and the code
5
+ ## is subject to the same license.
6
+ ########################################################
7
module CombinePDF
  ########################################################
  ## PDF is the in-memory container for a whole PDF document:
  ## the list of top-level (indirect) objects, the document
  ## information dictionary and the PDF version. It can merge
  ## other PDF objects / pages into itself and render itself
  ## to a binary string or to a file.
  ##
  ## Objects are plain Hashes. The keys used by this class are:
  ##   :indirect_reference_id / :indirect_generation_number - object identity
  ##   :is_reference_only   - the Hash is a reference ("N G R"), not an object
  ##   :referenced_object   - the object a reference Hash points to
  ########################################################
  class PDF
    # objects - Array of the top-level objects held by this document.
    # info    - the document information dictionary (a Hash).
    attr_reader :objects, :info
    # string_output - set to :literal by initialize; not read inside this class.
    attr_accessor :string_output
    # version - the PDF version (0 until known; to_pdf falls back to 1.3).
    attr_accessor :version

    # Builds a PDF container. The first argument selects the source:
    # - a PDFParser: its parsed objects, version (if a Float) and info dictionary are used.
    # - an Array: used as the objects list; a Float second argument sets the version.
    # - a Hash: all the arguments together are used as the objects list.
    # Anything else leaves the document empty.
    def initialize(*args)
      # defaults, used when no recognised argument is given
      @objects = []
      @version = 0
      @info = {}
      source = args[0]
      if source.is_a? PDFParser
        @objects = source.parse
        @version = source.version if source.version.is_a? Float
        @info = source.info_object || {}
      elsif source.is_a? Array
        # object initialization
        @objects = source
        @version = args[1] if args[1].is_a? Float
      elsif source.is_a? Hash
        @objects = args
      end
      # connect every reference Hash with the object it points to
      serialize_objects_and_references
      # general settings
      @string_output = :literal
      @need_to_rebuild_resources = false
      @set_start_id = 1
      @info[:Producer] = "Ruby CombinePDF Library by Boaz Segev"
      # the dates of the source file do not describe the merged file
      @info.delete :CreationDate
      @info.delete :ModDate
      warn "finished to initialize PDF object."
    end

    # Formats the data as a PDF file and returns it as a binary (ASCII-8BIT) String.
    # This is what save(file_name) writes to disk; call it directly to export the
    # PDF without touching the disk (e.g. to send it over HTTP).
    #
    # Side effects: sets the version to 1.3 if it is still 0, stamps
    # @info[:CreationDate], rebuilds the catalog, replaces @objects with the
    # objects reachable from the new catalog and renumbers them.
    def to_pdf
      # reset version if not specified
      @version = 1.3 if @version == 0
      # set creation date for the merged file
      @info[:CreationDate] = Time.now.strftime "D:%Y%m%d%H%M%S%:::z'00"
      # rebuild resources if needed
      rebuild_resources if @need_to_rebuild_resources
      catalog = rebuild_catalog_and_objects

      warn "Formatting PDF output"

      out = []
      xref = []
      indirect_object_count = 1 # the first object is the null object
      # write head (version and binary marker)
      out << "%PDF-#{@version.to_s}\n%\x00\x00\x00\x00".force_encoding(Encoding::ASCII_8BIT)

      # collect objects and record their byte offsets for the xref table.
      # Offsets are counted in BYTES (bytesize); the +1 is the "\n" that
      # the final join inserts after every element of `out`.
      loc = 0
      out.each { |line| loc += line.bytesize + 1 }
      @objects.each do |o|
        indirect_object_count += 1
        xref << loc
        out << PDFOperations._object_to_pdf(o)
        loc += out.last.bytesize + 1
      end
      warn "Building XREF"
      # the xref table starts right after the last object
      xref_location = loc
      xref_section = "xref\n\r0 #{indirect_object_count}\n\r0000000000 65535 f \n\r"
      xref.each { |offset| xref_section << ("%010d 00000 n \n\r" % offset) }
      xref_section << "trailer"
      out << xref_section
      # A freshly built catalog has no generation number; render it as 0, which
      # is also the generation hard-coded for every entry of the xref table above.
      out << "<<\n/Root #{catalog[:indirect_reference_id]} #{catalog[:indirect_generation_number].to_i} R"
      out << "/Size #{indirect_object_count}"
      if @info.is_a?(Hash)
        PRIVATE_HASH_KEYS.each { |key| @info.delete key } # make sure the dictionary is rendered inline, without stream
        out << "/Info #{PDFOperations._object_to_pdf @info}"
      end
      out << ">>\nstartxref\n#{xref_location}\n%%EOF"
      out.join("\n").force_encoding(Encoding::ASCII_8BIT)
    end

    # Saves the PDF to a file.
    # - file_name is a String or path object for the output.
    # Notice! if the file exists, it WILL be overwritten.
    def save(file_name)
      IO.binwrite file_name, to_pdf
    end

    # Returns the Page objects registered in the catalog, in catalog order.
    # If no catalog is passed, the existing Catalog object(s) are looked up in
    # @objects. The argument may also be an Array of nodes, a reference Hash,
    # or a :Catalog / :Pages / :Page Hash - the method recurses through them.
    #
    # Every returned page Hash gets a singleton `<<` method that injects
    # another object into that page (through PDFOperations).
    def pages(catalogs = nil)
      page_list = []
      catalogs = @objects.select { |obj| obj.is_a?(Hash) && obj[:Type] == :Catalog } if catalogs.nil?
      case catalogs
      when Array
        catalogs.each { |c| page_list.push(*pages(c)) unless c.nil? }
      when Hash
        if catalogs[:is_reference_only]
          # Connect a dangling reference to the OBJECT it points to (not to a
          # page list), so the reference stays usable by the rest of the class.
          catalogs[:referenced_object] ||= PDFOperations.get_refernced_object(@objects, catalogs)
          if catalogs[:referenced_object]
            page_list.push(*pages(catalogs[:referenced_object]))
          else
            warn "couldn't follow reference!!! #{catalogs} not found!"
          end
        else
          case catalogs[:Type]
          when :Page
            holder = self
            catalogs.define_singleton_method(:<<) do |obj|
              obj = PDFOperations.copy_and_secure_for_injection obj
              PDFOperations.inject_to_page self, obj
              holder.add_referenced obj
            end
            page_list << catalogs
          when :Pages
            page_list.push(*pages(catalogs[:Kids])) unless catalogs[:Kids].nil?
          when :Catalog
            page_list.push(*pages(catalogs[:Pages])) unless catalogs[:Pages].nil?
          end
        end
      end
      page_list
    end

    # Returns all the Page objects - regardless of order and even if they are
    # not cataloged. Could be used for finding "lost" pages.
    def all_pages
      [].tap { |found| each_object { |obj| found << obj if obj.is_a?(Hash) && obj[:Type] == :Page } }
    end

    # Adds pages or a whole PDF object at the end of the file (merge).
    # for example:
    #   pdf = CombinePDF.new "first_file.pdf"
    #   pdf << CombinePDF.new("second_file.pdf")
    #   pdf.save "both_files_merged.pdf"
    #
    # Accepted arguments: a PDF, a :Page Hash, an Array of :Page Hashes, or a
    # top-level indirect object Hash. Anything else is refused with a warning.
    def <<(obj)
      case
      when obj.is_a?(PDF)
        @version = [@version, obj.version].max
        # move the incoming ids out of the range used by this file
        obj.renumber_object_ids @set_start_id + @objects.length
        @objects.push(*obj.objects)
        @need_to_rebuild_resources = true
      when (obj.is_a?(Hash) && obj[:Type] == :Page), (obj.is_a?(Array) && obj.all? { |i| i.is_a?(Hash) && i[:Type] == :Page })
        # treat a single page as a list of one page
        obj = [obj] if obj.is_a?(Hash)
        # add page(s) to objects
        @objects.push(*obj)
        # add page dependencies to objects
        add_referenced(obj)
        # add page(s) to the Catalog - one argument per page
        rebuild_catalog(*obj)
        @need_to_rebuild_resources = true
      when (obj.is_a?(Hash) && obj[:indirect_reference_id] && obj[:referenced_object].nil?)
        # only let top level indirect objects into the PDF tree.
        @objects << obj
        @need_to_rebuild_resources = true
      else
        warn "Shouldn't add objects to the file if they are not top-level indirect PDF objects."
      end
    end

    # Connects every reference Hash (is_reference_only) with the top-level
    # object that has the same id and generation number.
    #
    # The lookup goes through a Hash keyed by [id, generation]. According to
    # the original author's benchmark, resolving each reference with
    # PDFOperations.get_refernced_object instead took 3.57 sec per 1000 runs
    # against 0.98 sec for this version.
    #
    # The `object` parameter is accepted but not used.
    def serialize_objects_and_references(object = nil)
      warn "connecting objects with their references (serialize_objects_and_references)."

      objects_reference_hash = {}
      @objects.each { |o| objects_reference_hash[[o[:indirect_reference_id], o[:indirect_generation_number]]] = o }
      each_object do |obj|
        if obj[:is_reference_only]
          obj[:referenced_object] = objects_reference_hash[[obj[:indirect_reference_id], obj[:indirect_generation_number]]]
          warn "couldn't connect a reference!!! could be a null object, Silent error!!!" unless obj[:referenced_object]
        end
      end
    end

    # Renumbers the ids of all the top-level objects sequentially.
    # - start: the first id to use for this call. When omitted, the document's
    #   default start id is used (1, as set by initialize).
    # An explicit start is remembered as the default only if no default was
    # set yet, so a later to_pdf still numbers the objects from the default.
    def renumber_object_ids(start = nil)
      warn "Resetting Object Reference IDs"
      @set_start_id ||= start || 1
      next_id = start || @set_start_id
      all_indirect_object.each do |obj|
        obj[:indirect_reference_id] = next_id
        next_id += 1
      end
      warn "Finished serializing IDs"
    end

    # Returns the reference Hashes found in the document.
    # With no arguments every reference is returned; otherwise only the
    # references matching the given id and generation number.
    def references(indirect_reference_id = nil, indirect_generation_number = nil)
      ref = {indirect_reference_id: indirect_reference_id, indirect_generation_number: indirect_generation_number}
      match_all = indirect_reference_id.nil? && indirect_generation_number.nil?
      out = []
      each_object do |obj|
        next unless obj[:is_reference_only]
        out << obj if match_all || compare_reference_values(ref, obj)
      end
      out
    end

    # Returns the top-level objects that are real objects (Hashes that are
    # not reference-only).
    def all_indirect_object
      @objects.select { |obj| obj.is_a?(Hash) && obj[:is_reference_only].nil? }
    end

    # Sorts @objects in place by object id. Pairs in which either side is not
    # an indirect object with an id are treated as equal.
    def sort_objects_by_id
      @objects.sort! do |a, b|
        # the block's value is the comparison result; a `return` here would
        # leave the whole method before anything gets sorted.
        if indirect_with_id?(a) && indirect_with_id?(b)
          a[:indirect_reference_id] <=> b[:indirect_reference_id]
        else
          0
        end
      end
    end

    # Adds to @objects every object reachable from `object` through
    # references. :Parent links are never followed, so adding a page does not
    # pull in its whole page tree.
    def add_referenced(object)
      case object
      when Array
        object.each { |item| add_referenced(item) }
      when Hash
        target = object[:referenced_object]
        if object[:is_reference_only] && target
          # Compare by identity: two distinct objects with equal content are
          # both referenced and must both be written to the file.
          unless @objects.any? { |known| known.equal?(target) }
            @objects << target
            target.each { |k, v| add_referenced(v) unless k == :Parent }
          end
        else
          object.each { |k, v| add_referenced(v) unless k == :Parent }
        end
      end
    end

    # Replaces the existing Catalog and Pages objects with a new, flat page
    # tree holding the currently cataloged pages followed by `with_pages`
    # (pages, or Arrays of pages). Returns the new Catalog object.
    def rebuild_catalog(*with_pages)
      warn "Re-Building Catalog"

      # According to the original author's benchmark, walking the catalog with
      # `pages` measured 0.215114 against 26.708394 for scanning every object.
      page_list = pages

      # add pages to catalog, if requested (accepts pages or arrays of pages)
      page_list.push(*with_pages.flatten) unless with_pages.empty?

      # build new Pages object
      pages_object = {Type: :Pages, Count: page_list.length, Kids: page_list.map { |p| {referenced_object: p, is_reference_only: true} }}

      # build new Catalog object
      catalog_object = {Type: :Catalog, Pages: {referenced_object: pages_object, is_reference_only: true}}

      # point every page (old and newly added) to the new Pages object
      page_list.each { |p| p[:Parent] = {referenced_object: pages_object, is_reference_only: true} }

      # remove old catalog and pages objects
      @objects.reject! { |obj| obj.is_a?(Hash) && (obj[:Type] == :Catalog || obj[:Type] == :Pages) }

      # inject new catalog and pages objects
      @objects << pages_object
      @objects << catalog_object

      catalog_object
    end

    # Rebuilds the catalog and then rebuilds @objects from scratch so that it
    # holds only the catalog and whatever is reachable from it, renumbered.
    # Used by to_pdf. Returns the new Catalog object.
    def rebuild_catalog_and_objects
      catalog = rebuild_catalog
      @objects = []
      @objects << catalog
      add_referenced catalog
      renumber_object_ids
      catalog
    end

    # Meant to merge duplicate resources. Currently DISABLED: the method
    # warns and returns true before doing any work.
    # NOTE(review): everything after `return true` is unreachable and is kept
    # only as the disabled implementation.
    def rebuild_resources
      warn "Resources re-building disabled as it isn't worth the price in peformance as of yet."

      return true

      warn "Re-Building Resources"
      @need_to_rebuild_resources = false
      # what are resources?
      # anything at the top level of the file except catalogs, page lists (Pages) and pages...
      not_resources = [:Catalog, :Pages, :Page]
      # get old resources list
      old_resources = @objects.select { |obj| obj.is_a?(Hash) && !not_resources.include?(obj[:Type]) }
      # collect all unique resources while ignoring double values and resetting references
      # also ignore inner values (cannot use PRIVATE_HASH_KEYS because of stream and other issues)
      ignore_keys = [:indirect_reference_id, :indirect_generation_number, :is_reference_only, :referenced_object]
      new_resources = []
      all_references = references
      old_resources.each do |old_r|
        add = true
        new_resources.each do |new_r|
          if ([].tap { |out| old_r.each { |k, v| out << true unless ((!ignore_keys.include?(k)) && new_r[k] == v) } }.empty?)
            all_references.each { |ref| ref[:referenced_object] = new_r if ref[:referenced_object].object_id == old_r.object_id }
            add = false
          end
        end
        new_resources << old_r if add
      end
      # remove old resources
      @objects.reject! { |obj| old_resources.include?(obj) }
      # insert new resources
      @objects.push(*new_resources)
    end

    # Runs the block on every object (Hash) of the document, as enumerated by
    # PDFOperations._each_object.
    def each_object(&block)
      PDFOperations._each_object(@objects, &block)
    end

    # Returns true if `obj` and `ref` designate the same indirect object, i.e.
    # they have the same id and generation number. Either argument may be a
    # reference, in which case the object it is connected to is compared.
    def compare_reference_values(obj, ref)
      left = obj[:referenced_object] || obj
      right = ref[:referenced_object] || ref
      left[:indirect_reference_id] == right[:indirect_reference_id] &&
        left[:indirect_generation_number] == right[:indirect_generation_number]
    end

    private

    # True for a real (non reference-only) Hash object that carries an id.
    def indirect_with_id?(candidate)
      candidate.is_a?(Hash) && candidate[:indirect_reference_id] && candidate[:is_reference_only].nil?
    end
  end
end
396
+
metadata ADDED
@@ -0,0 +1,66 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: combine_pdf
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Boaz Segev
8
+ - Masters of the open source community
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-09-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: ruby-rc4
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ">="
19
+ - !ruby/object:Gem::Version
20
+ version: '0'
21
+ type: :runtime
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ version: '0'
28
+ description: A nifty gem, in pure Ruby, to parse PDF files and combine (merge) them
29
+ with other PDF files, watermark them or stamp them (all using the PDF file format).
30
+ email: bsegev@gmail.com
31
+ executables: []
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - lib/combine_pdf.rb
36
+ - lib/combine_pdf/combine_pdf_basic_writer.rb
37
+ - lib/combine_pdf/combine_pdf_decrypt.rb
38
+ - lib/combine_pdf/combine_pdf_filter.rb
39
+ - lib/combine_pdf/combine_pdf_parser.rb
40
+ - lib/combine_pdf/combine_pdf_pdf.rb
41
+ homepage: https://github.com/boazsegev/combine_pdf
42
+ licenses:
43
+ - GPLv3
44
+ metadata: {}
45
+ post_install_message:
46
+ rdoc_options: []
47
+ require_paths:
48
+ - lib
49
+ required_ruby_version: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 1.9.2
54
+ required_rubygems_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: '0'
59
+ requirements: []
60
+ rubyforge_project:
61
+ rubygems_version: 2.2.2
62
+ signing_key:
63
+ specification_version: 4
64
+ summary: Combine, stamp and watermark PDF files in pure Ruby.
65
+ test_files: []
66
+ has_rdoc: