combine_pdf 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/combine_pdf.rb +33 -24
- data/lib/combine_pdf/combine_pdf_basic_writer.rb +4 -4
- data/lib/combine_pdf/combine_pdf_decrypt.rb +1 -2
- data/lib/combine_pdf/combine_pdf_operations.rb +2 -0
- data/lib/combine_pdf/combine_pdf_parser.rb +0 -6
- data/lib/combine_pdf/combine_pdf_pdf.rb +4 -60
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7d910d296f84bd5493a258317eace2f9cabf7d4b
|
4
|
+
data.tar.gz: 67efe73d182be27f9ad597e48cff83c3147ab37d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8924f5bb53e5f8e00c224f360e343688b9902d1c2e7b59c319e07cc66ce763b08651619be64217d1af4a7e1029344091dcb106ffbc7ac41a3e496c5a0e0f7bbc
|
7
|
+
data.tar.gz: d2162982205d6c89a6ace341d8d2a3e123caff712d7c8dd3e9220ec4ce4dc67dfa2759e4429a07bc2c28d66600be00c5b6221d26c11bb34501d4fdb42092aba5
|
data/lib/combine_pdf.rb
CHANGED
@@ -5,6 +5,10 @@ require 'zlib'
|
|
5
5
|
require 'securerandom'
|
6
6
|
require 'strscan'
|
7
7
|
|
8
|
+
#require the RC4 Gem
|
9
|
+
require 'rc4'
|
10
|
+
|
11
|
+
|
8
12
|
load "combine_pdf/combine_pdf_operations.rb"
|
9
13
|
load "combine_pdf/combine_pdf_basic_writer.rb"
|
10
14
|
load "combine_pdf/combine_pdf_decrypt.rb"
|
@@ -13,26 +17,6 @@ load "combine_pdf/combine_pdf_filter.rb"
|
|
13
17
|
load "combine_pdf/combine_pdf_parser.rb"
|
14
18
|
load "combine_pdf/combine_pdf_pdf.rb"
|
15
19
|
|
16
|
-
# # will be removed one font support and font library is completed.
|
17
|
-
# require "combine_pdf/font_metrics/courier-bold_metrics.rb"
|
18
|
-
# require "combine_pdf/font_metrics/courier-boldoblique_metrics.rb"
|
19
|
-
# require "combine_pdf/font_metrics/courier-oblique_metrics.rb"
|
20
|
-
# require "combine_pdf/font_metrics/courier_metrics.rb"
|
21
|
-
# require "combine_pdf/font_metrics/helvetica-bold_metrics.rb"
|
22
|
-
# require "combine_pdf/font_metrics/helvetica-boldoblique_metrics.rb"
|
23
|
-
# require "combine_pdf/font_metrics/helvetica-oblique_metrics.rb"
|
24
|
-
# require "combine_pdf/font_metrics/helvetica_metrics.rb"
|
25
|
-
# require "combine_pdf/font_metrics/symbol_metrics.rb"
|
26
|
-
# require "combine_pdf/font_metrics/times-bold_metrics.rb"
|
27
|
-
# require "combine_pdf/font_metrics/times-bolditalic_metrics.rb"
|
28
|
-
# require "combine_pdf/font_metrics/times-italic_metrics.rb"
|
29
|
-
# require "combine_pdf/font_metrics/times-roman_metrics.rb"
|
30
|
-
# require "combine_pdf/font_metrics/zapfdingbats_metrics.rb"
|
31
|
-
# require "combine_pdf/font_metrics/metrics_dictionary.rb"
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
20
|
|
37
21
|
# This is a pure ruby library to combine/merge, stamp/overlay and number PDF files - as well as to create tables (meant for indexing combined files).
|
38
22
|
#
|
@@ -314,9 +298,9 @@ end
|
|
314
298
|
# numbers are Fixnum or Float
|
315
299
|
# boolean are TrueClass or FalseClass
|
316
300
|
|
317
|
-
##
|
318
|
-
## puts Benchmark.measure { pdf = CombinePDF.new(
|
319
|
-
## demo: file_name = "
|
301
|
+
## test performance with:
|
302
|
+
## puts Benchmark.measure { pdf = CombinePDF.new(file); pdf.save "test.pdf" } # PDFEditor.new_pdf
|
303
|
+
## demo: file_name = "~/Ruby/pdfs/encrypted.pdf"; pdf=0; puts Benchmark.measure { pdf = CombinePDF.new(file_name); pdf.save "test.pdf" }
|
320
304
|
## at the moment... my code is terribly slow for larger files... :(
|
321
305
|
## The file saving is solved (I hope)... but file loading is an issue.
|
322
306
|
## pdf.each_object {|obj| puts "Stream length: #{obj[:raw_stream_content].length} was registered as #{obj[:Length].is_a?(Hash)? obj[:Length][:referenced_object][:indirect_without_dictionary] : obj[:Length]}" if obj[:raw_stream_content] }
|
@@ -324,5 +308,30 @@ end
|
|
324
308
|
## puts Benchmark.measure { 1000.times { (CombinePDF::PDFOperations.get_refernced_object pdf.objects, {indirect_reference_id: 100, indirect_generation_number:0}).object_id } }
|
325
309
|
## puts Benchmark.measure { 1000.times { (pdf.objects.select {|o| o[:indirect_reference_id]== 100 && o[:indirect_generation_number] == 0})[0].object_id } }
|
326
310
|
## puts Benchmark.measure { {}.tap {|out| pdf.objects.each {|o| out[ [o[:indirect_reference_id], o[:indirect_generation_number] ] ] = o }} }
|
327
|
-
|
311
|
+
##
|
312
|
+
#### local test for CombinePDF
|
313
|
+
## file = "/Users/2Be/Ruby/pdfs/encrypted.pdf"
|
314
|
+
## puts Benchmark.measure { 1000.times { pdf = CombinePDF.new(file); pdf.save "test.pdf" } }
|
315
|
+
### gives : 2.540000 0.140000 2.680000 ( 2.696524)
|
316
|
+
## puts Benchmark.measure { pdf = CombinePDF.new() ; 1000.times { pdf << CombinePDF.new(file) } ; pdf.save "test.pdf" }
|
317
|
+
### gives: 11.770000 0.090000 11.860000 ( 11.879411) #why the difference? NOT the object reference rebuilding...
|
318
|
+
### file size: 7Kb success
|
319
|
+
###### gives: 7.440000 0.100000 7.540000 ( 7.536460) (!!!) with draft file size 8kb
|
320
|
+
##
|
321
|
+
#### local test by pdftk
|
322
|
+
## pdftk_path = '/Users/2Be/Ruby/pdfs/pdftk_lib/bin/pdftk'
|
323
|
+
## file_array = []
|
324
|
+
## 1000.times { file_array << file }
|
325
|
+
## puts Benchmark.measure { system ( pdftk_path + " '" + file_array.join("' '") + "' input_pw '' output 'test.pdf'" ) }
|
326
|
+
### gives: 0.000000 0.000000 3.250000 ( 3.244724)
|
327
|
+
### FAILS with no output, unwilling to decrypt.
|
328
|
+
###### gives: 0.000000 0.000000 2.640000 ( 2.661801) with draft file size 1.3MB (!!)
|
329
|
+
#### local test by pyton
|
330
|
+
## pyton_path = '/Users/2Be/Ruby/pdfs/pdftk_lib/join.py'
|
331
|
+
## file_array = []
|
332
|
+
## 1000.times { file_array << file }
|
333
|
+
## puts Benchmark.measure { system ( pyton_path + " -o 'test.pdf' '#{file_array.join "' '"}' " ) }
|
334
|
+
### gives 0.000000 0.000000 1.010000 ( 1.147135)
|
335
|
+
### file merge FAILS with 1,000 empty pages (undecrypted)
|
336
|
+
####### gives: 0.000000 0.000000 1.770000 ( 1.775513) with draft. file size 4.9MB (!!!)
|
328
337
|
|
@@ -133,7 +133,7 @@ module CombinePDF
|
|
133
133
|
y = options[:y]
|
134
134
|
|
135
135
|
# set graphic state for the box
|
136
|
-
box_stream << "q\
|
136
|
+
box_stream << "q\n"
|
137
137
|
box_graphic_state = { ca: options[:opacity], CA: options[:opacity], LW: options[:border_width], LC: 0, LJ: 0, LD: 0 }
|
138
138
|
if options[:box_radius] != 0 # if the text box has rounded corners
|
139
139
|
box_graphic_state[:LC], box_graphic_state[:LJ] = 2, 1
|
@@ -191,7 +191,7 @@ module CombinePDF
|
|
191
191
|
end
|
192
192
|
|
193
193
|
# exit graphic state for the box
|
194
|
-
box_stream << "Q\
|
194
|
+
box_stream << "Q\n"
|
195
195
|
end
|
196
196
|
contents << box_stream
|
197
197
|
|
@@ -227,7 +227,7 @@ module CombinePDF
|
|
227
227
|
end
|
228
228
|
|
229
229
|
# set graphic state for text
|
230
|
-
text_stream << "q\
|
230
|
+
text_stream << "q\n"
|
231
231
|
text_graphic_state = graphic_state({ca: options[:opacity], CA: options[:opacity], LW: options[:stroke_width].to_f, LC: 2, LJ: 1, LD: 0})
|
232
232
|
text_stream << "#{PDFOperations._object_to_pdf text_graphic_state} gs\n"
|
233
233
|
text_stream << "DeviceRGB CS\nDeviceRGB cs\n"
|
@@ -261,7 +261,7 @@ module CombinePDF
|
|
261
261
|
y -= encoded[3]/1000*font_size #update text starting point
|
262
262
|
end
|
263
263
|
# exit graphic state for text
|
264
|
-
text_stream << "Q\
|
264
|
+
text_stream << "Q\n"
|
265
265
|
end
|
266
266
|
contents << text_stream
|
267
267
|
|
@@ -43,7 +43,6 @@ module CombinePDF
|
|
43
43
|
@key = set_general_key
|
44
44
|
case @encryption_dictionary[:V]
|
45
45
|
when 1,2
|
46
|
-
warn "trying to decrypt with RC4."
|
47
46
|
# raise_encrypted_error
|
48
47
|
_perform_decrypt_proc_ @objects, self.method(:decrypt_RC4)
|
49
48
|
else
|
@@ -109,7 +108,7 @@ module CombinePDF
|
|
109
108
|
# (0..2).each { |e| object_key << (encrypted_id >> e*8 & 0xFF ) }
|
110
109
|
# (0..1).each { |e| object_key << (encrypted_generation >> e*8 & 0xFF ) }
|
111
110
|
key_length = object_key.length < 16 ? object_key.length : 16
|
112
|
-
rc4 = RC4.new( Digest::MD5.digest(object_key)[(0...key_length)] )
|
111
|
+
rc4 = ::RC4.new( Digest::MD5.digest(object_key)[(0...key_length)] )
|
113
112
|
rc4.decrypt(encrypted)
|
114
113
|
end
|
115
114
|
def decrypt_AES(encrypted, encrypted_id, encrypted_generation, encrypted_filter)
|
@@ -37,6 +37,7 @@ module CombinePDF
|
|
37
37
|
|
38
38
|
# following the reference chain and assigning a pointer to the correct Resources object.
|
39
39
|
# (assignments of Strings, Arrays and Hashes are pointers in Ruby, unless the .dup method is called)
|
40
|
+
page[:Resources] ||= {}
|
40
41
|
original_resources = page[:Resources]
|
41
42
|
if original_resources[:is_reference_only]
|
42
43
|
original_resources = original_resources[:referenced_object]
|
@@ -45,6 +46,7 @@ module CombinePDF
|
|
45
46
|
original_contents = page[:Contents]
|
46
47
|
original_contents = [original_contents] unless original_contents.is_a? Array
|
47
48
|
|
49
|
+
stream[:Resources] ||= {}
|
48
50
|
stream_resources = stream[:Resources]
|
49
51
|
if stream_resources[:is_reference_only]
|
50
52
|
stream_resources = stream_resources[:referenced_object]
|
@@ -65,7 +65,6 @@ module CombinePDF
|
|
65
65
|
@version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
|
66
66
|
end
|
67
67
|
|
68
|
-
warn "Starting to parse PDF data."
|
69
68
|
@parsed = _parse_
|
70
69
|
|
71
70
|
if @root_object == {}
|
@@ -75,7 +74,6 @@ module CombinePDF
|
|
75
74
|
end
|
76
75
|
end
|
77
76
|
raise "root is unknown - cannot determine if file is Encrypted" if @root_object == {}
|
78
|
-
warn "Injecting actual values into root object: #{@root_object}."
|
79
77
|
PDFOperations.change_references_to_actual_values @parsed, @root_object
|
80
78
|
|
81
79
|
if @root_object[:Encrypt]
|
@@ -91,7 +89,6 @@ module CombinePDF
|
|
91
89
|
warn "PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects."
|
92
90
|
|
93
91
|
object_streams.each do |o|
|
94
|
-
warn "Attempting #{o.select {|k,v| k != :raw_stream_content}}"
|
95
92
|
## un-encode (using the correct filter) the object streams
|
96
93
|
PDFFilter.inflate_object o
|
97
94
|
## extract objects from stream to top level array @parsed
|
@@ -123,7 +120,6 @@ module CombinePDF
|
|
123
120
|
else
|
124
121
|
@info_object = {}
|
125
122
|
end
|
126
|
-
warn "setting parsed collection and returning collection."
|
127
123
|
@parsed
|
128
124
|
end
|
129
125
|
|
@@ -133,7 +129,6 @@ module CombinePDF
|
|
133
129
|
def _parse_
|
134
130
|
out = []
|
135
131
|
str = ''
|
136
|
-
# warn "Scaning for objects, starting at #{@scanner.pos}: #{@scanner.peek(10)}"
|
137
132
|
while @scanner.rest? do
|
138
133
|
case
|
139
134
|
##########################################
|
@@ -171,7 +166,6 @@ module CombinePDF
|
|
171
166
|
## parse an Object after finished
|
172
167
|
##########################################
|
173
168
|
when str = @scanner.scan(/endobj/)
|
174
|
-
# warn "Proccessing Object"
|
175
169
|
#what to do when this is an object?
|
176
170
|
if out.last.is_a? Hash
|
177
171
|
out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
|
@@ -115,12 +115,10 @@ module CombinePDF
|
|
115
115
|
end
|
116
116
|
# general globals
|
117
117
|
@string_output = :literal
|
118
|
-
@need_to_rebuild_resources = false
|
119
118
|
@set_start_id = 1
|
120
119
|
@info[:Producer] = "Ruby CombinePDF Library by Boaz Segev"
|
121
120
|
@info.delete :CreationDate
|
122
121
|
@info.delete :ModDate
|
123
|
-
warn "finished to initialize PDF object."
|
124
122
|
end
|
125
123
|
|
126
124
|
# Formats the data to PDF formats and returns a binary string that represents the PDF file content.
|
@@ -133,17 +131,11 @@ module CombinePDF
|
|
133
131
|
@version = 1.5 if @version.to_f == 0.0
|
134
132
|
#set creation date for merged file
|
135
133
|
@info[:CreationDate] = Time.now.strftime "D:%Y%m%d%H%M%S%:::z'00"
|
136
|
-
#rebuild resources if needed
|
137
|
-
if @need_to_rebuild_resources
|
138
|
-
rebuild_resources
|
139
|
-
end
|
140
134
|
#rebuild_catalog
|
141
135
|
catalog = rebuild_catalog_and_objects
|
142
136
|
# add ID and generation numbers to objects
|
143
137
|
renumber_object_ids
|
144
138
|
|
145
|
-
warn "Formatting PDF output"
|
146
|
-
|
147
139
|
out = []
|
148
140
|
xref = []
|
149
141
|
indirect_object_count = 1 #the first object is the null object
|
@@ -159,7 +151,6 @@ module CombinePDF
|
|
159
151
|
out << PDFOperations._object_to_pdf(o)
|
160
152
|
loc += out.last.length + 1
|
161
153
|
end
|
162
|
-
warn "Building XREF"
|
163
154
|
xref_location = 0
|
164
155
|
out.each { |line| xref_location += line.bytes.length + 1}
|
165
156
|
out << "xref\n\r0 #{(indirect_object_count).to_s}\n\r0000000000 65535 f \n\r"
|
@@ -275,7 +266,7 @@ module CombinePDF
|
|
275
266
|
fonts_array
|
276
267
|
end
|
277
268
|
|
278
|
-
# add the pages (or file) to the PDF (combine/merge) and
|
269
|
+
# add the pages (or file) to the PDF (combine/merge) and RETURNS SELF, for nesting.
|
279
270
|
# for example:
|
280
271
|
#
|
281
272
|
# pdf = CombinePDF.new "first_file.pdf"
|
@@ -290,17 +281,14 @@ module CombinePDF
|
|
290
281
|
## and how to handle imported pages?
|
291
282
|
if data.is_a?(PDF)
|
292
283
|
@version = [@version, data.version].max
|
293
|
-
|
294
|
-
@need_to_rebuild_resources = true
|
295
|
-
|
296
284
|
@objects.push(*data.objects)
|
297
285
|
# rebuild_catalog
|
298
|
-
return
|
286
|
+
return self
|
299
287
|
end
|
300
288
|
insert -1, data
|
301
289
|
end
|
302
290
|
|
303
|
-
# add the pages (or file) to the BEGINNING of the PDF (combine/merge) and
|
291
|
+
# add the pages (or file) to the BEGINNING of the PDF (combine/merge) and RETURNS SELF for nesting operators.
|
304
292
|
# for example:
|
305
293
|
#
|
306
294
|
# pdf = CombinePDF.new "second_file.pdf"
|
@@ -311,6 +299,7 @@ module CombinePDF
|
|
311
299
|
# data:: is PDF page (Hash), and Array of PDF pages or a parsed PDF object to be added.
|
312
300
|
def >> (data)
|
313
301
|
insert 0, data
|
302
|
+
self
|
314
303
|
end
|
315
304
|
|
316
305
|
# add PDF pages (or PDF files) into a specific location.
|
@@ -519,8 +508,6 @@ module CombinePDF
|
|
519
508
|
end
|
520
509
|
# @private
|
521
510
|
def serialize_objects_and_references(object = nil)
|
522
|
-
warn "connecting objects with their references (serialize_objects_and_references)."
|
523
|
-
|
524
511
|
# # Version 3.5 injects indirect objects if they aren't dictionaries.
|
525
512
|
# # benchmark 1000.times was 3.568246 sec for pdf = CombinePDF.new "/Users/2Be/Desktop/מוצגים/20121002\ הודעת\ הערעור.pdf" }
|
526
513
|
# # puts Benchmark.measure { 1000.times {pdf.serialize_objects_and_references} }
|
@@ -653,49 +640,6 @@ module CombinePDF
|
|
653
640
|
catalog
|
654
641
|
end
|
655
642
|
|
656
|
-
# @private
|
657
|
-
# disabled, don't use. simpley returns true.
|
658
|
-
def rebuild_resources
|
659
|
-
|
660
|
-
warn "Resources re-building disabled as it isn't worth the price in peformance as of yet."
|
661
|
-
|
662
|
-
return true
|
663
|
-
|
664
|
-
warn "Re-Building Resources"
|
665
|
-
@need_to_rebuild_resources = false
|
666
|
-
# what are resources?
|
667
|
-
# anything at the top level of the file exept catalogs, page lists (Pages) and pages...
|
668
|
-
not_resources = [:Catalog, :Pages, :Page]
|
669
|
-
# get old resources list
|
670
|
-
old_resources = @objects.select {|obj| obj.is_a?(Hash) && !not_resources.include?(obj[:Type])}
|
671
|
-
# collect all unique resources while ignoring double values and resetting references
|
672
|
-
# also ignore inner values (canot use PRIVATE_HASH_KEYS because of stream and other issues)
|
673
|
-
ignore_keys = [:indirect_reference_id, :indirect_generation_number, :is_reference_only, :referenced_object]
|
674
|
-
new_resources = []
|
675
|
-
all_references = references
|
676
|
-
old_resources.each do |old_r|
|
677
|
-
add = true
|
678
|
-
new_resources.each do |new_r|
|
679
|
-
# ## v.1.0 - slower
|
680
|
-
# if (old_r.reject {|k,v| ignore_keys.include?(k) }) == (new_r.reject {|k,v| ignore_keys.include?(k)})
|
681
|
-
# all_references.each {|ref| ref[:referenced_object] = new_r if ref[:referenced_object].object_id == old_r.object_id } # fails, but doesn't assume all references are connected: compare_reference_values(old_r, ref) }
|
682
|
-
# add = false
|
683
|
-
# end
|
684
|
-
## v.1.1 - faster, doesn't build two hashes (but iterates one)
|
685
|
-
if ( [].tap {|out| old_r.each {|k,v| out << true unless ((!ignore_keys.include?(k)) && new_r[k] == v) } } .empty?)
|
686
|
-
all_references.each {|ref| ref[:referenced_object] = new_r if ref[:referenced_object].object_id == old_r.object_id } # fails, but doesn't assume all references are connected: compare_reference_values(old_r, ref) }
|
687
|
-
add = false
|
688
|
-
end
|
689
|
-
end
|
690
|
-
new_resources << old_r if add
|
691
|
-
end
|
692
|
-
# remove old resources
|
693
|
-
@objects.reject! {|obj| old_resources.include?(obj)}
|
694
|
-
# insert new resources
|
695
|
-
@objects.push *new_resources
|
696
|
-
# rebuild stream lengths?
|
697
|
-
end
|
698
|
-
|
699
643
|
# @private
|
700
644
|
# the function returns true if the reference belongs to the object
|
701
645
|
def compare_reference_values(obj, ref)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: combine_pdf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Boaz Segev
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-09-
|
12
|
+
date: 2014-09-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ruby-rc4
|
@@ -26,8 +26,8 @@ dependencies:
|
|
26
26
|
- !ruby/object:Gem::Version
|
27
27
|
version: 0.1.5
|
28
28
|
description: A nifty gem, in pure Ruby, to parse PDF files and combine (merge) them
|
29
|
-
with other PDF files, number the pages, watermark them or stamp them
|
30
|
-
the PDF file format).
|
29
|
+
with other PDF files, number the pages, watermark them or stamp them, create tables
|
30
|
+
or basic text objects etc` (all using the PDF file format).
|
31
31
|
email: bsegev@gmail.com
|
32
32
|
executables: []
|
33
33
|
extensions: []
|