combine_pdf 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/combine_pdf.rb +33 -24
- data/lib/combine_pdf/combine_pdf_basic_writer.rb +4 -4
- data/lib/combine_pdf/combine_pdf_decrypt.rb +1 -2
- data/lib/combine_pdf/combine_pdf_operations.rb +2 -0
- data/lib/combine_pdf/combine_pdf_parser.rb +0 -6
- data/lib/combine_pdf/combine_pdf_pdf.rb +4 -60
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7d910d296f84bd5493a258317eace2f9cabf7d4b
|
4
|
+
data.tar.gz: 67efe73d182be27f9ad597e48cff83c3147ab37d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8924f5bb53e5f8e00c224f360e343688b9902d1c2e7b59c319e07cc66ce763b08651619be64217d1af4a7e1029344091dcb106ffbc7ac41a3e496c5a0e0f7bbc
|
7
|
+
data.tar.gz: d2162982205d6c89a6ace341d8d2a3e123caff712d7c8dd3e9220ec4ce4dc67dfa2759e4429a07bc2c28d66600be00c5b6221d26c11bb34501d4fdb42092aba5
|
data/lib/combine_pdf.rb
CHANGED
@@ -5,6 +5,10 @@ require 'zlib'
|
|
5
5
|
require 'securerandom'
|
6
6
|
require 'strscan'
|
7
7
|
|
8
|
+
#require the RC4 Gem
|
9
|
+
require 'rc4'
|
10
|
+
|
11
|
+
|
8
12
|
load "combine_pdf/combine_pdf_operations.rb"
|
9
13
|
load "combine_pdf/combine_pdf_basic_writer.rb"
|
10
14
|
load "combine_pdf/combine_pdf_decrypt.rb"
|
@@ -13,26 +17,6 @@ load "combine_pdf/combine_pdf_filter.rb"
|
|
13
17
|
load "combine_pdf/combine_pdf_parser.rb"
|
14
18
|
load "combine_pdf/combine_pdf_pdf.rb"
|
15
19
|
|
16
|
-
# # will be removed one font support and font library is completed.
|
17
|
-
# require "combine_pdf/font_metrics/courier-bold_metrics.rb"
|
18
|
-
# require "combine_pdf/font_metrics/courier-boldoblique_metrics.rb"
|
19
|
-
# require "combine_pdf/font_metrics/courier-oblique_metrics.rb"
|
20
|
-
# require "combine_pdf/font_metrics/courier_metrics.rb"
|
21
|
-
# require "combine_pdf/font_metrics/helvetica-bold_metrics.rb"
|
22
|
-
# require "combine_pdf/font_metrics/helvetica-boldoblique_metrics.rb"
|
23
|
-
# require "combine_pdf/font_metrics/helvetica-oblique_metrics.rb"
|
24
|
-
# require "combine_pdf/font_metrics/helvetica_metrics.rb"
|
25
|
-
# require "combine_pdf/font_metrics/symbol_metrics.rb"
|
26
|
-
# require "combine_pdf/font_metrics/times-bold_metrics.rb"
|
27
|
-
# require "combine_pdf/font_metrics/times-bolditalic_metrics.rb"
|
28
|
-
# require "combine_pdf/font_metrics/times-italic_metrics.rb"
|
29
|
-
# require "combine_pdf/font_metrics/times-roman_metrics.rb"
|
30
|
-
# require "combine_pdf/font_metrics/zapfdingbats_metrics.rb"
|
31
|
-
# require "combine_pdf/font_metrics/metrics_dictionary.rb"
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
20
|
|
37
21
|
# This is a pure Ruby library to combine/merge, stamp/overlay and number PDF files - as well as to create tables (meant for indexing combined files).
|
38
22
|
#
|
@@ -314,9 +298,9 @@ end
|
|
314
298
|
# numbers are Fixnum or Float
|
315
299
|
# boolean are TrueClass or FalseClass
|
316
300
|
|
317
|
-
##
|
318
|
-
## puts Benchmark.measure { pdf = CombinePDF.new(
|
319
|
-
## demo: file_name = "
|
301
|
+
## test performance with:
|
302
|
+
## puts Benchmark.measure { pdf = CombinePDF.new(file); pdf.save "test.pdf" } # PDFEditor.new_pdf
|
303
|
+
## demo: file_name = "~/Ruby/pdfs/encrypted.pdf"; pdf=0; puts Benchmark.measure { pdf = CombinePDF.new(file_name); pdf.save "test.pdf" }
|
320
304
|
## at the moment... my code is terribly slow for larger files... :(
|
321
305
|
## The file saving is solved (I hope)... but file loading is an issue.
|
322
306
|
## pdf.each_object {|obj| puts "Stream length: #{obj[:raw_stream_content].length} was registered as #{obj[:Length].is_a?(Hash)? obj[:Length][:referenced_object][:indirect_without_dictionary] : obj[:Length]}" if obj[:raw_stream_content] }
|
@@ -324,5 +308,30 @@ end
|
|
324
308
|
## puts Benchmark.measure { 1000.times { (CombinePDF::PDFOperations.get_refernced_object pdf.objects, {indirect_reference_id: 100, indirect_generation_number:0}).object_id } }
|
325
309
|
## puts Benchmark.measure { 1000.times { (pdf.objects.select {|o| o[:indirect_reference_id]== 100 && o[:indirect_generation_number] == 0})[0].object_id } }
|
326
310
|
## puts Benchmark.measure { {}.tap {|out| pdf.objects.each {|o| out[ [o[:indirect_reference_id], o[:indirect_generation_number] ] ] = o }} }
|
327
|
-
|
311
|
+
##
|
312
|
+
#### local test for CombinePDF
|
313
|
+
## file = "/Users/2Be/Ruby/pdfs/encrypted.pdf"
|
314
|
+
## puts Benchmark.measure { 1000.times { pdf = CombinePDF.new(file); pdf.save "test.pdf" } }
|
315
|
+
### gives : 2.540000 0.140000 2.680000 ( 2.696524)
|
316
|
+
## puts Benchmark.measure { pdf = CombinePDF.new() ; 1000.times { pdf << CombinePDF.new(file) } ; pdf.save "test.pdf" }
|
317
|
+
### gives: 11.770000 0.090000 11.860000 ( 11.879411) #why the difference? NOT the object reference rebuilding...
|
318
|
+
### file size: 7Kb success
|
319
|
+
###### gives: 7.440000 0.100000 7.540000 ( 7.536460) (!!!) with draft file size 8kb
|
320
|
+
##
|
321
|
+
#### local test by pdftk
|
322
|
+
## pdftk_path = '/Users/2Be/Ruby/pdfs/pdftk_lib/bin/pdftk'
|
323
|
+
## file_array = []
|
324
|
+
## 1000.times { file_array << file }
|
325
|
+
## puts Benchmark.measure { system ( pdftk_path + " '" + file_array.join("' '") + "' input_pw '' output 'test.pdf'" ) }
|
326
|
+
### gives: 0.000000 0.000000 3.250000 ( 3.244724)
|
327
|
+
### FAILS with no output, unwilling to decrypt.
|
328
|
+
###### gives: 0.000000 0.000000 2.640000 ( 2.661801) with draft file size 1.3MB (!!)
|
329
|
+
#### local test by python
|
330
|
+
## pyton_path = '/Users/2Be/Ruby/pdfs/pdftk_lib/join.py'
|
331
|
+
## file_array = []
|
332
|
+
## 1000.times { file_array << file }
|
333
|
+
## puts Benchmark.measure { system ( pyton_path + " -o 'test.pdf' '#{file_array.join "' '"}' " ) }
|
334
|
+
### gives 0.000000 0.000000 1.010000 ( 1.147135)
|
335
|
+
### file merge FAILS with 1,000 empty pages (undecrypted)
|
336
|
+
####### gives: 0.000000 0.000000 1.770000 ( 1.775513) with draft. file size 4.9MB (!!!)
|
328
337
|
|
@@ -133,7 +133,7 @@ module CombinePDF
|
|
133
133
|
y = options[:y]
|
134
134
|
|
135
135
|
# set graphic state for the box
|
136
|
-
box_stream << "q\
|
136
|
+
box_stream << "q\n"
|
137
137
|
box_graphic_state = { ca: options[:opacity], CA: options[:opacity], LW: options[:border_width], LC: 0, LJ: 0, LD: 0 }
|
138
138
|
if options[:box_radius] != 0 # if the text box has rounded corners
|
139
139
|
box_graphic_state[:LC], box_graphic_state[:LJ] = 2, 1
|
@@ -191,7 +191,7 @@ module CombinePDF
|
|
191
191
|
end
|
192
192
|
|
193
193
|
# exit graphic state for the box
|
194
|
-
box_stream << "Q\
|
194
|
+
box_stream << "Q\n"
|
195
195
|
end
|
196
196
|
contents << box_stream
|
197
197
|
|
@@ -227,7 +227,7 @@ module CombinePDF
|
|
227
227
|
end
|
228
228
|
|
229
229
|
# set graphic state for text
|
230
|
-
text_stream << "q\
|
230
|
+
text_stream << "q\n"
|
231
231
|
text_graphic_state = graphic_state({ca: options[:opacity], CA: options[:opacity], LW: options[:stroke_width].to_f, LC: 2, LJ: 1, LD: 0})
|
232
232
|
text_stream << "#{PDFOperations._object_to_pdf text_graphic_state} gs\n"
|
233
233
|
text_stream << "DeviceRGB CS\nDeviceRGB cs\n"
|
@@ -261,7 +261,7 @@ module CombinePDF
|
|
261
261
|
y -= encoded[3]/1000*font_size #update text starting point
|
262
262
|
end
|
263
263
|
# exit graphic state for text
|
264
|
-
text_stream << "Q\
|
264
|
+
text_stream << "Q\n"
|
265
265
|
end
|
266
266
|
contents << text_stream
|
267
267
|
|
@@ -43,7 +43,6 @@ module CombinePDF
|
|
43
43
|
@key = set_general_key
|
44
44
|
case @encryption_dictionary[:V]
|
45
45
|
when 1,2
|
46
|
-
warn "trying to decrypt with RC4."
|
47
46
|
# raise_encrypted_error
|
48
47
|
_perform_decrypt_proc_ @objects, self.method(:decrypt_RC4)
|
49
48
|
else
|
@@ -109,7 +108,7 @@ module CombinePDF
|
|
109
108
|
# (0..2).each { |e| object_key << (encrypted_id >> e*8 & 0xFF ) }
|
110
109
|
# (0..1).each { |e| object_key << (encrypted_generation >> e*8 & 0xFF ) }
|
111
110
|
key_length = object_key.length < 16 ? object_key.length : 16
|
112
|
-
rc4 = RC4.new( Digest::MD5.digest(object_key)[(0...key_length)] )
|
111
|
+
rc4 = ::RC4.new( Digest::MD5.digest(object_key)[(0...key_length)] )
|
113
112
|
rc4.decrypt(encrypted)
|
114
113
|
end
|
115
114
|
def decrypt_AES(encrypted, encrypted_id, encrypted_generation, encrypted_filter)
|
@@ -37,6 +37,7 @@ module CombinePDF
|
|
37
37
|
|
38
38
|
# following the reference chain and assigning a pointer to the correct Resources object.
|
39
39
|
# (assignments of Strings, Arrays and Hashes are pointers in Ruby, unless the .dup method is called)
|
40
|
+
page[:Resources] ||= {}
|
40
41
|
original_resources = page[:Resources]
|
41
42
|
if original_resources[:is_reference_only]
|
42
43
|
original_resources = original_resources[:referenced_object]
|
@@ -45,6 +46,7 @@ module CombinePDF
|
|
45
46
|
original_contents = page[:Contents]
|
46
47
|
original_contents = [original_contents] unless original_contents.is_a? Array
|
47
48
|
|
49
|
+
stream[:Resources] ||= {}
|
48
50
|
stream_resources = stream[:Resources]
|
49
51
|
if stream_resources[:is_reference_only]
|
50
52
|
stream_resources = stream_resources[:referenced_object]
|
@@ -65,7 +65,6 @@ module CombinePDF
|
|
65
65
|
@version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
|
66
66
|
end
|
67
67
|
|
68
|
-
warn "Starting to parse PDF data."
|
69
68
|
@parsed = _parse_
|
70
69
|
|
71
70
|
if @root_object == {}
|
@@ -75,7 +74,6 @@ module CombinePDF
|
|
75
74
|
end
|
76
75
|
end
|
77
76
|
raise "root is unknown - cannot determine if file is Encrypted" if @root_object == {}
|
78
|
-
warn "Injecting actual values into root object: #{@root_object}."
|
79
77
|
PDFOperations.change_references_to_actual_values @parsed, @root_object
|
80
78
|
|
81
79
|
if @root_object[:Encrypt]
|
@@ -91,7 +89,6 @@ module CombinePDF
|
|
91
89
|
warn "PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects."
|
92
90
|
|
93
91
|
object_streams.each do |o|
|
94
|
-
warn "Attempting #{o.select {|k,v| k != :raw_stream_content}}"
|
95
92
|
## un-encode (using the correct filter) the object streams
|
96
93
|
PDFFilter.inflate_object o
|
97
94
|
## extract objects from stream to top level array @parsed
|
@@ -123,7 +120,6 @@ module CombinePDF
|
|
123
120
|
else
|
124
121
|
@info_object = {}
|
125
122
|
end
|
126
|
-
warn "setting parsed collection and returning collection."
|
127
123
|
@parsed
|
128
124
|
end
|
129
125
|
|
@@ -133,7 +129,6 @@ module CombinePDF
|
|
133
129
|
def _parse_
|
134
130
|
out = []
|
135
131
|
str = ''
|
136
|
-
# warn "Scaning for objects, starting at #{@scanner.pos}: #{@scanner.peek(10)}"
|
137
132
|
while @scanner.rest? do
|
138
133
|
case
|
139
134
|
##########################################
|
@@ -171,7 +166,6 @@ module CombinePDF
|
|
171
166
|
## parse an Object after finished
|
172
167
|
##########################################
|
173
168
|
when str = @scanner.scan(/endobj/)
|
174
|
-
# warn "Proccessing Object"
|
175
169
|
#what to do when this is an object?
|
176
170
|
if out.last.is_a? Hash
|
177
171
|
out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
|
@@ -115,12 +115,10 @@ module CombinePDF
|
|
115
115
|
end
|
116
116
|
# general globals
|
117
117
|
@string_output = :literal
|
118
|
-
@need_to_rebuild_resources = false
|
119
118
|
@set_start_id = 1
|
120
119
|
@info[:Producer] = "Ruby CombinePDF Library by Boaz Segev"
|
121
120
|
@info.delete :CreationDate
|
122
121
|
@info.delete :ModDate
|
123
|
-
warn "finished to initialize PDF object."
|
124
122
|
end
|
125
123
|
|
126
124
|
# Formats the data to PDF formats and returns a binary string that represents the PDF file content.
|
@@ -133,17 +131,11 @@ module CombinePDF
|
|
133
131
|
@version = 1.5 if @version.to_f == 0.0
|
134
132
|
#set creation date for merged file
|
135
133
|
@info[:CreationDate] = Time.now.strftime "D:%Y%m%d%H%M%S%:::z'00"
|
136
|
-
#rebuild resources if needed
|
137
|
-
if @need_to_rebuild_resources
|
138
|
-
rebuild_resources
|
139
|
-
end
|
140
134
|
#rebuild_catalog
|
141
135
|
catalog = rebuild_catalog_and_objects
|
142
136
|
# add ID and generation numbers to objects
|
143
137
|
renumber_object_ids
|
144
138
|
|
145
|
-
warn "Formatting PDF output"
|
146
|
-
|
147
139
|
out = []
|
148
140
|
xref = []
|
149
141
|
indirect_object_count = 1 #the first object is the null object
|
@@ -159,7 +151,6 @@ module CombinePDF
|
|
159
151
|
out << PDFOperations._object_to_pdf(o)
|
160
152
|
loc += out.last.length + 1
|
161
153
|
end
|
162
|
-
warn "Building XREF"
|
163
154
|
xref_location = 0
|
164
155
|
out.each { |line| xref_location += line.bytes.length + 1}
|
165
156
|
out << "xref\n\r0 #{(indirect_object_count).to_s}\n\r0000000000 65535 f \n\r"
|
@@ -275,7 +266,7 @@ module CombinePDF
|
|
275
266
|
fonts_array
|
276
267
|
end
|
277
268
|
|
278
|
-
# add the pages (or file) to the PDF (combine/merge) and
|
269
|
+
# add the pages (or file) to the PDF (combine/merge) and RETURNS SELF, for nesting.
|
279
270
|
# for example:
|
280
271
|
#
|
281
272
|
# pdf = CombinePDF.new "first_file.pdf"
|
@@ -290,17 +281,14 @@ module CombinePDF
|
|
290
281
|
## and how to handles imported pages?
|
291
282
|
if data.is_a?(PDF)
|
292
283
|
@version = [@version, data.version].max
|
293
|
-
|
294
|
-
@need_to_rebuild_resources = true
|
295
|
-
|
296
284
|
@objects.push(*data.objects)
|
297
285
|
# rebuild_catalog
|
298
|
-
return
|
286
|
+
return self
|
299
287
|
end
|
300
288
|
insert -1, data
|
301
289
|
end
|
302
290
|
|
303
|
-
# add the pages (or file) to the BEGINNING of the PDF (combine/merge) and
|
291
|
+
# add the pages (or file) to the BEGINNING of the PDF (combine/merge) and RETURNS SELF for nesting operators.
|
304
292
|
# for example:
|
305
293
|
#
|
306
294
|
# pdf = CombinePDF.new "second_file.pdf"
|
@@ -311,6 +299,7 @@ module CombinePDF
|
|
311
299
|
# data:: is PDF page (Hash), and Array of PDF pages or a parsed PDF object to be added.
|
312
300
|
def >> (data)
|
313
301
|
insert 0, data
|
302
|
+
self
|
314
303
|
end
|
315
304
|
|
316
305
|
# add PDF pages (or PDF files) into a specific location.
|
@@ -519,8 +508,6 @@ module CombinePDF
|
|
519
508
|
end
|
520
509
|
# @private
|
521
510
|
def serialize_objects_and_references(object = nil)
|
522
|
-
warn "connecting objects with their references (serialize_objects_and_references)."
|
523
|
-
|
524
511
|
# # Version 3.5 injects indirect objects if they aren't dictionaries.
|
525
512
|
# # benchmark 1000.times was 3.568246 sec for pdf = CombinePDF.new "/Users/2Be/Desktop/מוצגים/20121002\ הודעת\ הערעור.pdf" }
|
526
513
|
# # puts Benchmark.measure { 1000.times {pdf.serialize_objects_and_references} }
|
@@ -653,49 +640,6 @@ module CombinePDF
|
|
653
640
|
catalog
|
654
641
|
end
|
655
642
|
|
656
|
-
# @private
|
657
|
-
# disabled, don't use. simpley returns true.
|
658
|
-
def rebuild_resources
|
659
|
-
|
660
|
-
warn "Resources re-building disabled as it isn't worth the price in peformance as of yet."
|
661
|
-
|
662
|
-
return true
|
663
|
-
|
664
|
-
warn "Re-Building Resources"
|
665
|
-
@need_to_rebuild_resources = false
|
666
|
-
# what are resources?
|
667
|
-
# anything at the top level of the file exept catalogs, page lists (Pages) and pages...
|
668
|
-
not_resources = [:Catalog, :Pages, :Page]
|
669
|
-
# get old resources list
|
670
|
-
old_resources = @objects.select {|obj| obj.is_a?(Hash) && !not_resources.include?(obj[:Type])}
|
671
|
-
# collect all unique resources while ignoring double values and resetting references
|
672
|
-
# also ignore inner values (canot use PRIVATE_HASH_KEYS because of stream and other issues)
|
673
|
-
ignore_keys = [:indirect_reference_id, :indirect_generation_number, :is_reference_only, :referenced_object]
|
674
|
-
new_resources = []
|
675
|
-
all_references = references
|
676
|
-
old_resources.each do |old_r|
|
677
|
-
add = true
|
678
|
-
new_resources.each do |new_r|
|
679
|
-
# ## v.1.0 - slower
|
680
|
-
# if (old_r.reject {|k,v| ignore_keys.include?(k) }) == (new_r.reject {|k,v| ignore_keys.include?(k)})
|
681
|
-
# all_references.each {|ref| ref[:referenced_object] = new_r if ref[:referenced_object].object_id == old_r.object_id } # fails, but doesn't assume all references are connected: compare_reference_values(old_r, ref) }
|
682
|
-
# add = false
|
683
|
-
# end
|
684
|
-
## v.1.1 - faster, doesn't build two hashes (but iterates one)
|
685
|
-
if ( [].tap {|out| old_r.each {|k,v| out << true unless ((!ignore_keys.include?(k)) && new_r[k] == v) } } .empty?)
|
686
|
-
all_references.each {|ref| ref[:referenced_object] = new_r if ref[:referenced_object].object_id == old_r.object_id } # fails, but doesn't assume all references are connected: compare_reference_values(old_r, ref) }
|
687
|
-
add = false
|
688
|
-
end
|
689
|
-
end
|
690
|
-
new_resources << old_r if add
|
691
|
-
end
|
692
|
-
# remove old resources
|
693
|
-
@objects.reject! {|obj| old_resources.include?(obj)}
|
694
|
-
# insert new resources
|
695
|
-
@objects.push *new_resources
|
696
|
-
# rebuild stream lengths?
|
697
|
-
end
|
698
|
-
|
699
643
|
# @private
|
700
644
|
# the function returns true if the reference belongs to the object
|
701
645
|
def compare_reference_values(obj, ref)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: combine_pdf
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Boaz Segev
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-09-
|
12
|
+
date: 2014-09-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: ruby-rc4
|
@@ -26,8 +26,8 @@ dependencies:
|
|
26
26
|
- !ruby/object:Gem::Version
|
27
27
|
version: 0.1.5
|
28
28
|
description: A nifty gem, in pure Ruby, to parse PDF files and combine (merge) them
|
29
|
-
with other PDF files, number the pages, watermark them or stamp them
|
30
|
-
the PDF file format).
|
29
|
+
with other PDF files, number the pages, watermark them or stamp them, create tables
|
30
|
+
or basic text objects etc` (all using the PDF file format).
|
31
31
|
email: bsegev@gmail.com
|
32
32
|
executables: []
|
33
33
|
extensions: []
|