combine_pdf 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/combine_pdf.rb +467 -0
- data/lib/combine_pdf/combine_pdf_basic_writer.rb +110 -0
- data/lib/combine_pdf/combine_pdf_decrypt.rb +198 -0
- data/lib/combine_pdf/combine_pdf_filter.rb +72 -0
- data/lib/combine_pdf/combine_pdf_parser.rb +315 -0
- data/lib/combine_pdf/combine_pdf_pdf.rb +396 -0
- metadata +66 -0
@@ -0,0 +1,110 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
########################################################
|
3
|
+
## Thoughts from reading the ISO 32000-1:2008
|
4
|
+
## this file is part of the CombinePDF library and the code
|
5
|
+
## is subject to the same license.
|
6
|
+
########################################################
|
7
|
+
|
8
|
+
module CombinePDF
|
9
|
+
|
10
|
+
class PDFWriter
|
11
|
+
|
12
|
+
# Creates an empty writer.
#
# media_box - the page box as four numbers; stored unchanged in @media_box.
#             When omitted, [0.0, 0.0, 612.0, 792.0] is used.
def initialize(media_box = [0.0, 0.0, 612.0, 792.0])
  @media_box = media_box
  # starts out empty; nothing in this class fills it yet
  @content_stream = Hash.new
end
|
16
|
+
|
17
|
+
########################################################
|
18
|
+
## textbox
|
19
|
+
## - font_name: :font_name
|
20
|
+
## The PostScript names of 14 Type 1 fonts, known as the standard 14 fonts, are as follows:
|
21
|
+
## Times-Roman, Helvetica, Courier, Symbol, Times-Bold, Helvetica-Bold, Courier-Bold, ZapfDingbats, Times-Italic, Helvetica-Oblique, Courier-Oblique, Times-BoldItalic, Helvetica-BoldOblique, Courier-BoldOblique
|
22
|
+
## - text_color: [R, G, B]
|
23
|
+
## an array with three floats, each in a value between 0 to 1.
|
24
|
+
## First value is Red, second Green and last is Blue (RGB color system)
|
25
|
+
# Builds the PDF text-object operators ("BT" ... "ET") that write +text+.
#
# text - the String to write.
# args - a Hash overriding any of the defaults listed in +options+ below,
#        for example font_name:, font_size:, text_color: ([R, G, B], three
#        floats between 0 and 1), opacity:, x: and y:.
#
# Returns the text stream as a String. The font and resources dictionaries
# are assembled here but are not stored or returned yet.
def add_text_box(text, args = {})
  options = {
    text_alignment: :center,
    text_color: [1,1,1],
    # text_stroke: nil,
    font_name: :Helvetica,
    font_type: :Type1,
    font_object: nil,
    font_size: 12,
    border_color: nil,
    border_width: nil,
    border_radius: nil,
    background_color: nil,
    opacity: 1,
    x: 0,
    y: 0,
    length: -1,
    height: -1,
  }
  # caller-supplied values win over the defaults
  options.update args

  # create font object
  font_object = { Type: :Font, Subtype: options[:font_type], BaseFont: options[:font_name] }
  supplied_font = options[:font_object]
  if supplied_font.is_a?(Hash) && supplied_font[:indirect_reference_id] && supplied_font[:indirect_generation_number] && (supplied_font[:is_reference_only] != true)
    # NOTE(review): this wraps the font built above, not the supplied one - confirm intent.
    font_object = { is_reference_only: true, referenced_object: font_object }
  end

  # create resources object
  font_name = "MyFont#{rand(99)}".to_sym
  resources_object = { Resources: { Font: { font_name => font_object } } }

  # x,y are calculated from the bottom left; each unit (1) is 1/72 Inch
  x = options[:x]
  y = options[:y]
  red, green, blue = options[:text_color]

  # create text stream
  text_stream = ""
  text_stream << "BT\n" # the Begin Text marker
  text_stream << PDFOperations._format_name_to_pdf(font_name) # Set font name
  text_stream << " #{options[:font_size].to_f} Tf\n" # set font size and add font operator
  text_stream << "#{red} #{green} #{blue} rg\n" # sets the color state
  text_stream << " #{options[:opacity].to_f} ca\n" # set opacity (alpha) for graphic state.
  text_stream << "#{x} #{y} Td\n" # set location for text object
  text_stream << PDFOperations._format_string_to_pdf(text) # insert the string in PDF format
  text_stream << " Tj\n ET\n" # the Text object operator and the End Text marker
end
|
70
|
+
|
71
|
+
########################################################
|
72
|
+
## add_content_to_pages(pages = [], location = :above)
|
73
|
+
## pages - a page hash or an array of pages
|
74
|
+
## location - :above to place content over existing content or :below to place content under existing content
|
75
|
+
# Walks +pages+ and (eventually) adds this writer's content to each page.
#
# pages    - a page Hash or an Array of pages (arrays are walked recursively).
# location - :above or :below; only passed along for now.
#
# Returns the Array when given one, otherwise nil. Adding the content to a
# page Hash is not implemented yet.
def add_content_to_pages(pages = [], location = :above)
  case pages
  when Array
    pages.each { |page| add_content_to_pages(page, location) }
  when Hash
    #####
    ## add content stream to page
    nil
  end
end
|
83
|
+
########################################################
|
84
|
+
## make_into_page()
|
85
|
+
## takes no arguments and returns the contents stream within a page (to be added as an independent page to the PDF object)
|
86
|
+
# Returns a new page Hash. For now it only carries the :Type entry.
def make_into_page
  page = {}
  page[:Type] = :Page
  page
end
|
89
|
+
|
90
|
+
########################################################
|
91
|
+
## to_pdf()
|
92
|
+
## prints out the content stream as raw PDF
|
93
|
+
## file_name - the name of the file to which to save the data (will be overwritten).
|
94
|
+
## if file_name is given, save to file.
|
95
|
+
# Wraps the page produced by make_into_page in a new PDF object.
#
# file_name - when given, the PDF is saved to that file (via PDF#save).
#
# Returns the result of PDF#save when a file name is given, otherwise the
# result of PDF#to_pdf.
def to_pdf(file_name = nil)
  document = PDF.new
  document << make_into_page
  return document.save(file_name) if file_name
  document.to_pdf
end
|
104
|
+
|
105
|
+
end
|
106
|
+
|
107
|
+
end
|
108
|
+
|
109
|
+
|
110
|
+
|
@@ -0,0 +1,198 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
########################################################
|
3
|
+
## Thoughts from reading the ISO 32000-1:2008
|
4
|
+
## this file is part of the CombinePDF library and the code
|
5
|
+
## is subject to the same license.
|
6
|
+
########################################################
|
7
|
+
|
8
|
+
module CombinePDF
|
9
|
+
class PDFDecrypt
|
10
|
+
|
11
|
+
# Prepares a decryptor for a parsed PDF.
#
# objects         - the parsed objects; kept in @objects.
# root_doctionary - the root dictionary; its :Encrypt entry is required.
#
# Raises a RuntimeError when the root dictionary has no :Encrypt entry.
def initialize(objects = [], root_doctionary = {})
  @objects = objects
  @encryption_dictionary = root_doctionary[:Encrypt]
  unless @encryption_dictionary
    raise "Cannot decrypt an encrypted file without an encryption dictionary!"
  end
  @root_doctionary = root_doctionary
  # the 32 padding bytes used to fill up (or replace) the password
  @padding_key = ["28BF4E5E4E758A4164004E56FFFA01082E2E00B6D0683E802F0CA9FE6453697A"].pack('H*').bytes.to_a
  @key_crypt_first_iv_store = nil
  @encryption_iv = nil
  PDFOperations.change_references_to_actual_values @objects, @encryption_dictionary
end
|
24
|
+
# Computes the general (file) encryption key and stores it in @key.
#
# password - the password String; it is cut to 32 bytes and filled up with
#            @padding_key when shorter (an empty password is all padding).
#
# Reads :O, :P, :R, :Length and :EncryptMetadata from @encryption_dictionary
# and the first :ID entry of @root_doctionary.
#
# Returns the key as a binary String: 5 bytes for revision 2, Length/8 bytes
# (Length defaults to 40 bits) for revision 3 and up.
def set_general_key(password = "")
  revision = @encryption_dictionary[:R].to_i
  # Length is given in bits; the key length is needed in bytes
  key_bytes = (@encryption_dictionary[:Length] || 40).to_i / 8

  # 1) make sure the initial key is 32 byte long (if no password, uses padding).
  key = (password.bytes.to_a[0..32] + @padding_key)[0..31].pack('C*').force_encoding(Encoding::ASCII_8BIT)
  # 2) add the value of the encryption dictionary's O entry
  key << @encryption_dictionary[:O].to_s.dup.force_encoding(Encoding::ASCII_8BIT)
  # 3) Convert the integer value of the P entry to a 32-bit unsigned binary number
  # and pass these bytes low-order byte first ('V' is little-endian on every platform)
  key << [@encryption_dictionary[:P]].pack('V')
  # 4) Pass the first element of the file's file identifier array
  # (the value of the ID entry in the document's trailer dictionary)
  key << @root_doctionary[:ID][0].to_s.dup.force_encoding(Encoding::ASCII_8BIT)
  # 4(a) (Security handlers of revision 4 or greater)
  # if document metadata is not being encrypted, add 4 bytes with the value 0xFFFFFFFF.
  # Nothing is added when the metadata is encrypted (the default).
  if revision >= 4 && @encryption_dictionary[:EncryptMetadata] == false
    key << [0xFF, 0xFF, 0xFF, 0xFF].pack('C*')
  end
  # 5) pass everything as a MD5 hash
  key = Digest::MD5.digest(key)
  if revision >= 3
    # 5(a) (Security handlers of revision 3 or greater) Do the following 50 times:
    # pass the first n bytes of the previous MD5 output into a new MD5 hash.
    50.times { key = Digest::MD5.digest(key[0...key_bytes]) }
    # 6) the key is the first n bytes of the final hash
    @key = key[0...key_bytes]
  else
    # 6) n shall always be 5 for security handlers of revision 2
    @key = key[0...5]
  end
  @key
end
|
67
|
+
# Decrypts every object in @objects in place.
#
# Only the :Standard filter with :V 1 or 2 (RC4) is handled; anything else
# ends in raise_encrypted_error.
#
# Returns @objects.
def decrypt
  dictionary = @encryption_dictionary
  raise_encrypted_error dictionary unless dictionary[:Filter] == :Standard
  @key = set_general_key
  if [1, 2].include?(dictionary[:V])
    warn "trying to decrypt with RC4."
    _perform_decrypt_proc_ @objects, method(:decrypt_RC4)
  else
    raise_encrypted_error
  end
  #rebuild stream lengths?
  @objects
end
|
81
|
+
# The "no encryption" handler: hands the data back untouched.
#
# It takes the same arguments as the other decrypt_* methods so it can be
# used interchangeably with them; only +encrypted+ is used.
#
# Returns +encrypted+ itself.
def decrypt_none(encrypted, encrypted_id, encrypted_generation, encrypted_filter)
  encrypted
end
|
84
|
+
# Decrypts +encrypted+ with RC4, using a key derived for one object.
#
# encrypted            - the encrypted String.
# encrypted_id         - the object number of the containing indirect object.
# encrypted_generation - the generation number of that object.
# encrypted_filter     - unused here.
#
# The object key is the MD5 digest of @key followed by the low-order 3 bytes
# of the object number and the low-order 2 bytes of the generation number,
# cut to at most 16 bytes.
#
# Returns the decrypted String.
def decrypt_RC4(encrypted, encrypted_id, encrypted_generation, encrypted_filter)
  object_key = @key.dup
  # 'V' is little-endian on every platform, so the slices below are always
  # the low-order bytes (the native-endian 'i' is not on big-endian hosts)
  object_key << [encrypted_id].pack('V')[0, 3]
  object_key << [encrypted_generation].pack('V')[0, 2]
  key_length = [object_key.length, 16].min
  rc4 = RC4.new(Digest::MD5.digest(object_key)[0, key_length])
  rc4.decrypt(encrypted)
end
|
95
|
+
# Decrypts +encrypted+ with AES-128 in CBC mode, using a key derived for one
# object.
#
# encrypted            - the encrypted String; its first 16 bytes are the
#                        initialization vector, the rest is the cipher text.
# encrypted_id         - the object number of the containing indirect object.
# encrypted_generation - the generation number of that object.
# encrypted_filter     - unused here.
#
# The object key is the MD5 digest of @key, the low-order 3 bytes of the
# object number, the low-order 2 bytes of the generation number and the
# bytes "sAlT", cut to at most 16 bytes.
#
# Returns the decrypted String (empty when there is no cipher text after
# the initialization vector).
# Raises ArgumentError / OpenSSL::Cipher::CipherError when the derived key is
# shorter than 16 bytes or the data cannot be decrypted.
def decrypt_AES(encrypted, encrypted_id, encrypted_generation, encrypted_filter)
  # nothing to decrypt when the data holds no more than the initialization vector
  return "" if encrypted.bytesize <= 16

  object_key = @key.dup
  object_key << [encrypted_id].pack('V')[0, 3]
  object_key << [encrypted_generation].pack('V')[0, 2]
  object_key << "sAlT"
  key_length = [@key.length + 5, 16].min

  cipher = OpenSSL::Cipher.new("aes-128-cbc")
  cipher.decrypt
  cipher.key = Digest::MD5.digest(object_key)[0, key_length]
  cipher.iv = encrypted[0, 16]
  # the default padding handling stays on so the pad bytes are stripped
  cipher.update(encrypted[16..-1]) + cipher.final
end
|
113
|
+
# Recursively runs +decrypt_proc+ over every String and stream found in
# +object+, replacing the values in place.
#
# object               - an Array, Hash, String or anything else (returned as is).
# decrypt_proc         - anything answering #call(data, id, generation, filter).
# encrypted_id,
# encrypted_generation,
# encrypted_filter     - the values of the closest enclosing object that set
#                        them; a Hash fills in whichever are still nil from
#                        its own :indirect_reference_id,
#                        :indirect_generation_number and :Filter entries.
#
# Stream data (:raw_stream_content) is cut to the stream's :Length before it
# is decrypted; when :Length is missing or not a usable number the whole
# content is used. Hash keys are never decrypted.
#
# Returns the (mutated) Array or Hash, the decrypted String, or the object.
def _perform_decrypt_proc_(object, decrypt_proc, encrypted_id = nil, encrypted_generation = nil, encrypted_filter = nil)
  case object
  when Array
    object.map! { |item| _perform_decrypt_proc_(item, decrypt_proc, encrypted_id, encrypted_generation, encrypted_filter) }
  when Hash
    encrypted_id ||= object[:indirect_reference_id]
    encrypted_generation ||= object[:indirect_generation_number]
    encrypted_filter ||= object[:Filter]
    if object[:raw_stream_content]
      stream_length = object[:Length]
      if stream_length.is_a?(Hash) && stream_length[:is_reference_only]
        referenced = PDFOperations.get_refernced_object(@objects, stream_length)
        stream_length = referenced[:indirect_without_dictionary] if referenced.is_a?(Hash)
      end
      actual_length = object[:raw_stream_content].length
      # fall back on the real length when no usable Length is registered
      stream_length = actual_length unless stream_length.is_a?(Integer) && stream_length >= 0
      warn "Stream registeded length was #{object[:Length].to_s} and the actual length was #{actual_length}." if actual_length < stream_length
      length = [stream_length, actual_length].min
      object[:raw_stream_content] = decrypt_proc.call(object[:raw_stream_content][0...length], encrypted_id, encrypted_generation, encrypted_filter)
    end
    # assumes no decrypting is ever performed on keys
    object.each do |k, v|
      next if k == :raw_stream_content
      next unless v.is_a?(Hash) || v.is_a?(Array) || v.is_a?(String)
      object[k] = _perform_decrypt_proc_(v, decrypt_proc, encrypted_id, encrypted_generation, encrypted_filter)
    end
  when String
    decrypt_proc.call(object, encrypted_id, encrypted_generation, encrypted_filter)
  else
    object
  end
end
|
139
|
+
|
140
|
+
# Warns with a dump of +object+ (or of @encryption_dictionary when no object
# is given), one comma separated part per line, and then gives up.
#
# Always raises a RuntimeError ("File is encrypted - not supported.").
def raise_encrypted_error(object = nil)
  subject = object || @encryption_dictionary.to_s.split(',').join("\n")
  listing = subject.to_s.split(',').join("\n")
  warn "Data raising exception:\n #{listing}"
  raise "File is encrypted - not supported."
end
|
145
|
+
end
|
146
|
+
#####################################################
|
147
|
+
## The following isn't my code!!!!
|
148
|
+
## It is subject to a different license and copyright.
|
149
|
+
## This was the code for the RC4 Gem,
|
150
|
+
## ... I had a bad internet connection so I ended up
|
151
|
+
## copying it from the web page I had in my cache.
|
152
|
+
## This wonderful work was done by Caige Nichols.
|
153
|
+
#####################################################
|
154
|
+
|
155
|
+
# A plain RC4 stream cipher. One instance keeps its key stream state, so
# consecutive calls continue the same stream.
class RC4
  # str - the key as a String; its bytes are repeated to fill 256 key bytes.
  #
  # Raises SyntaxError when the key is an empty String.
  def initialize(str)
    raise SyntaxError, "RC4: Key supplied is blank" if str.eql?('')

    @q1 = 0
    @q2 = 0
    # repeat the key bytes until exactly 256 are collected
    @key = str.bytes.cycle.first(256)
    # key scheduling: shuffle the identity permutation with the key
    @s = (0..255).to_a
    j = 0
    256.times do |i|
      j = (j + @s[i] + @key[i]) % 256
      @s[i], @s[j] = @s[j], @s[i]
    end
  end

  # Returns the encrypted copy of +text+ (the argument is not modified).
  def encrypt!(text)
    process text
  end

  # Returns the encrypted copy of +text+, working on a duplicate.
  def encrypt(text)
    process text.dup
  end

  # RC4 is symmetric: decrypting is the same operation as encrypting.
  alias_method :decrypt, :encrypt

  private

  # XORs every byte of +text+ with the next key stream byte.
  def process(text)
    text.each_byte.map { |byte| byte ^ round }.pack("C*")
  end

  # Advances the cipher state and returns the next key stream byte.
  def round
    @q1 = (@q1 + 1) % 256
    @q2 = (@q2 + @s[@q1]) % 256
    @s[@q1], @s[@q2] = @s[@q2], @s[@q1]
    @s[(@s[@q1] + @s[@q2]) % 256]
  end
end
|
196
|
+
|
197
|
+
end
|
198
|
+
|
@@ -0,0 +1,72 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
########################################################
|
3
|
+
## Thoughts from reading the ISO 32000-1:2008
|
4
|
+
## this file is part of the CombinePDF library and the code
|
5
|
+
## is subject to the same license.
|
6
|
+
########################################################
|
7
|
+
|
8
|
+
module CombinePDF
|
9
|
+
|
10
|
+
# Stream filter helpers. Only decoding of FlateDecode streams without a
# predictor is supported.
module PDFFilter
  module_function

  # Encoding is not implemented.
  #
  # Returns false, always.
  def deflate_object(object = nil)
    false
  end

  # Decodes the stream held by +object+ in place.
  #
  # object - a stream Hash; :Filter (a Symbol, an Array or a resolved
  #          reference) names the filters, :DecodeParms their parameters and
  #          :raw_stream_content holds the data.
  # filter - unused; kept for compatibility.
  #
  # On success :raw_stream_content and :Length are replaced with the decoded
  # data and its size, and :Filter is removed. Filters are consumed one by
  # one, so after a failure :Filter lists the filters still to be applied.
  #
  # Returns true on success, false when +object+ is not a Hash or when an
  # unsupported filter is met.
  # Raises a RuntimeError for FlateDecode parameters that cannot be handled
  # (a Predictor above 1, or parameters that are not a plain Hash).
  def inflate_object(object = nil, filter = :none)
    return false unless object.is_a?(Hash)

    filter_array = object[:Filter]
    if filter_array.is_a?(Hash) && filter_array[:is_reference_only]
      filter_array = filter_array[:referenced_object]
    end
    filter_array = [filter_array] if filter_array.is_a?(Symbol)
    filter_array = [] if filter_array.nil?

    params_array = object[:DecodeParms]
    if params_array.is_a?(Hash) && params_array[:is_reference_only]
      params_array = params_array[:referenced_object]
    end
    params_array = [params_array] unless params_array.is_a?(Array)

    while filter_array[0]
      case filter_array[0]
      when :FlateDecode
        params = params_array[0]
        # A Predictor of 1 (also the value when it is absent) means no
        # prediction, so the other parameters do not affect decoding.
        plain = params.nil? || (params.is_a?(Hash) && !params[:is_reference_only] && params[:Predictor].to_i <= 1)
        raise_unsupported_error params unless plain
        object[:raw_stream_content] = Zlib::Inflate.inflate object[:raw_stream_content]
        object[:Length] = object[:raw_stream_content].bytesize
      else
        return false
      end
      params_array.shift
      filter_array.shift
    end
    object.delete(:Filter)
    true
  end

  # Raises a RuntimeError naming the unsupported +object+.
  def raise_unsupported_error(object = {})
    raise "Filter #{object} unsupported. couldn't deflate object"
  end

end
|
68
|
+
|
69
|
+
end
|
70
|
+
|
71
|
+
|
72
|
+
|
@@ -0,0 +1,315 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
########################################################
|
3
|
+
## Thoughts from reading the ISO 32000-1:2008
|
4
|
+
## this file is part of the CombinePDF library and the code
|
5
|
+
## is subject to the same license.
|
6
|
+
########################################################
|
7
|
+
module CombinePDF
|
8
|
+
|
9
|
+
########################################################
|
10
|
+
## This is the Parser class.
|
11
|
+
## It takes PDF data and parses it, returning an array
|
12
|
+
## of data.
|
13
|
+
## That array can be used to initialize a PDF object.
|
14
|
+
## The Parser class doesn't involve itself with the
|
15
|
+
## file version.
|
16
|
+
########################################################
|
17
|
+
|
18
|
+
class PDFParser
|
19
|
+
# LITERAL_STRING_REPLACEMENT_HASH = {
|
20
|
+
# 110 => 10, # "\\n".bytes = [92, 110] "\n".ord = 10
|
21
|
+
# 114 => 13, #r
|
22
|
+
# 116 => 9, #t
|
23
|
+
# 98 => 8, #b
|
24
|
+
# 102 => 255, #f
|
25
|
+
# 40 => 40, #(
|
26
|
+
# 41 => 41, #)
|
27
|
+
# 92 => 92 #\
|
28
|
+
# }
|
29
|
+
attr_reader :parsed, :version, :info_object, :root_object
|
30
|
+
# Prepares a parser for raw PDF data; nothing is parsed until #parse is called.
#
# string - the PDF data as a String. It is treated as binary data: an
#          unfrozen String has its encoding changed in place, a frozen one
#          is duplicated first.
#
# Raises TypeError when +string+ is not a String.
def initialize(string)
  raise TypeError, "couldn't parse and data, expecting type String" unless string.is_a? String
  # a frozen String cannot have its encoding changed, so work on a copy
  string = string.dup if string.frozen?
  @string_to_parse = string.force_encoding(Encoding::ASCII_8BIT)
  @literal_strings = []
  @hex_strings = []
  @streams = []
  @parsed = []
  @root_object = {}
  @info_object = {}
  @version = nil
  @scanner = nil
end
|
42
|
+
|
43
|
+
# Parses the PDF data given to the constructor.
#
# The header version (when there is one) is stored in @version, the trailer
# (or, failing that, the merged XRef stream dictionaries) in @root_object and
# the document information dictionary in @info_object. Encrypted files are
# passed through PDFDecrypt and object streams are unpacked into the top
# level collection.
#
# Returns the Array of parsed objects; a second call returns the same Array
# without parsing again.
# Raises a RuntimeError when no root (trailer / XRef) dictionary is found.
def parse
  return @parsed unless @parsed.empty?
  @scanner = StringScanner.new @string_to_parse
  @scanner.pos = 0
  if @scanner.scan(/\%PDF\-[\d\-\.]+/)
    @version = @scanner.matched.scan(/[\d\.]+/)[0].to_f
  end

  warn "Starting to parse PDF data."
  @parsed = _parse_

  if @root_object == {}
    xref_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :XRef }
    xref_streams.each { |xref_dictionary| @root_object.merge! xref_dictionary }
  end
  raise "root is unknown - cannot determine if file is Encrypted" if @root_object == {}
  warn "Injecting actual values into root object: #{@root_object}."
  PDFOperations.change_references_to_actual_values @parsed, @root_object

  if @root_object[:Encrypt]
    warn "PDF is Encrypted! Attempting to unencrypt - not yet fully supported."
    decryptor = PDFDecrypt.new @parsed, @root_object
    decryptor.decrypt
    #do we really need to apply to @parsed? No, there is no need.
  end

  # Object streams exist from PDF 1.5 on. When the header was not found the
  # version is unknown, so look for them anyway instead of comparing nil.
  if @version.nil? || @version >= 1.5
    ## search for objects streams
    object_streams = @parsed.select { |obj| obj.is_a?(Hash) && obj[:Type] == :ObjStm }
    unless object_streams.empty?
      warn "PDF 1.5 Object streams found - they are not fully supported! attempting to extract objects."

      object_streams.each do |o|
        warn "Attempting #{o.select {|k,v| k != :raw_stream_content}}"
        ## un-encode (using the correct filter) the object streams
        PDFFilter.inflate_object o
        ## extract objects from stream to top level array @parsed
        @scanner = StringScanner.new o[:raw_stream_content]
        stream_data = _parse_
        # the stream starts with pairs of numbers: an object id and an offset
        id_array = []
        while stream_data[0].is_a? Integer
          id_array << stream_data.shift
          stream_data.shift
        end
        # the objects follow, in the same order as the ids
        while stream_data[0].is_a? Hash
          stream_data[0][:indirect_reference_id] = id_array.shift
          stream_data[0][:indirect_generation_number] = 0
          @parsed << stream_data.shift
        end
      end
      ## remove object streams and XREF dictionaries
      @parsed.reject! { |obj| obj.is_a?(Hash) && (obj[:Type] == :ObjStm || obj[:Type] == :XRef) }
    end
  end
  PDFOperations.change_references_to_actual_values @parsed, @root_object
  @info_object = @root_object[:Info]
  if @info_object && @info_object.is_a?(Hash)
    @parsed.delete @info_object
    PDFOperations.change_references_to_actual_values @parsed, @info_object
    PRIVATE_HASH_KEYS.each { |key| @info_object.delete key }
  else
    @info_object = {}
  end
  warn "setting parsed collection and returning collection."
  @parsed
end
|
112
|
+
|
113
|
+
protected
|
114
|
+
|
115
|
+
# Reads PDF objects from @scanner, starting at its current position.
#
# Arrays and dictionaries are parsed recursively: the method returns as soon
# as it meets a closing "]" or ">>", or when the data ends. A trailer
# dictionary found after "xref" is stored in @root_object.
#
# Returns an Array of the parsed values: numbers, Strings, Symbols (names),
# true / false / nil, Arrays and Hashes. Indirect objects, references and
# streams are Hashes carrying :indirect_reference_id,
# :indirect_generation_number, :is_reference_only, :raw_stream_content or
# :indirect_without_dictionary entries.
def _parse_
  out = []
  while @scanner.rest?
    case
    ##########################################
    ## parse an Array
    ##########################################
    when @scanner.scan(/\[/)
      out << _parse_
    ##########################################
    ## parse a Dictionary
    ##########################################
    when @scanner.scan(/<</)
      data = _parse_
      obj = {}
      obj[data.shift] = data.shift while data[0]
      out << obj
    ##########################################
    ## return content of array or dictionary
    ##########################################
    when @scanner.scan(/\]/), @scanner.scan(/>>/)
      return out
    ##########################################
    ## parse a Stream
    ##########################################
    when @scanner.scan(/stream[\r]?[\n]/)
      str = @scanner.scan_until(/endstream/)
      if str
        # removes "endstream" and only one EOL char (\n or \r) before it
        str = str[0...-10]
      else
        warn "Stream is missing the endstream keyword, using the remaining data!"
        str = @scanner.rest
        @scanner.terminate
      end
      if out.last.is_a? Hash
        out.last[:raw_stream_content] = str
      else
        warn "Stream not attached to dictionary!"
        out << str.force_encoding(Encoding::ASCII_8BIT)
      end
    ##########################################
    ## parse an Object after finished
    ##########################################
    when @scanner.scan(/endobj/)
      # the values are popped in reverse order: content, generation, id
      if out.last.is_a? Hash
        out << out.pop.merge({indirect_generation_number: out.pop, indirect_reference_id: out.pop})
      else
        out << {indirect_without_dictionary: out.pop, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
      end
    ##########################################
    ## parse a Hex String
    ##########################################
    when str = @scanner.scan(/<[0-9a-fA-F\s]*>/)
      # white space between the digits is ignored; a missing last digit counts as 0
      out << [str[1..-2].gsub(/\s+/, '')].pack('H*')
    ##########################################
    ## parse a Literal String
    ##########################################
    when @scanner.scan(/\(/)
      str = String.new
      count = 1 # the number of parentheses still open
      while count > 0 && @scanner.rest?
        chunk = @scanner.scan_until(/[\(\)]/)
        unless chunk
          # no closing parenthesis left: stop instead of looping forever
          warn "Unknown error parsing string at #{@scanner.pos}!"
          break
        end
        str << chunk
        # a parenthesis after an odd number of backslashes is escaped
        seperator_count = 0
        seperator_count += 1 while str[-2-seperator_count] == "\\"
        next if seperator_count.odd?
        str[-1] == '(' ? count += 1 : count -= 1
      end
      # The PDF formatted string is: str[0..-2]
      out << _unescape_literal_string_(str[0..-2])
    ##########################################
    ## Parse a comment
    ##########################################
    when @scanner.scan(/\%/)
      #is a comment, skip until new line
      @scanner.skip_until(/[\n\r]+/)
    ##########################################
    ## Parse a Name
    ##########################################
    # a name is a slash followed by characters that aren't white space or special
    when str = @scanner.scan(/\/[^\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+/)
      out << ( str[1..-1].gsub(/\#[0-9a-fA-F]{2}/) {|a| a[1..2].hex.chr } ).to_sym
    ##########################################
    ## Parse a Number
    ##########################################
    when str = @scanner.scan(/[\+\-\.\d]+/)
      str.match(/\./) ? (out << str.to_f) : (out << str.to_i)
    ##########################################
    ## Parse an Object Reference
    ##########################################
    when @scanner.scan(/R/)
      out << { is_reference_only: true, indirect_generation_number: out.pop, indirect_reference_id: out.pop}
    ##########################################
    ## Parse Bool - true and after false
    ##########################################
    when @scanner.scan(/true/)
      out << true
    when @scanner.scan(/false/)
      out << false
    ##########################################
    ## Parse NULL - null
    ##########################################
    when @scanner.scan(/null/)
      out << nil
    ##########################################
    ## XREF - check for encryption... anything else?
    ##########################################
    when @scanner.scan(/xref/)
      ##########
      ## get root object to check for encryption
      if @scanner.scan_until(/(trailer)|(\%EOF)/) && @scanner.matched[-1] == 'r'
        if @scanner.skip_until(/<</)
          data = _parse_
          @root_object = {}
          @root_object[data.shift] = data.shift while data[0]
        end
        ##########
        ## skip until end of segment, marked by %%EOF
        @scanner.skip_until(/\%\%EOF/)
      end
    when @scanner.scan(/[\s]+/) , @scanner.scan(/obj[\s]*/)
      # white space or the obj keyword, do nothing
      nil
    else
      # always advance
      @scanner.pos = @scanner.pos + 1
    end
  end
  out
end

# Converts the raw content of a PDF literal string (without the enclosing
# parentheses) to the bytes it stands for.
#
# Returns a binary String.
def _unescape_literal_string_(raw)
  str_bytes = raw.bytes.to_a
  str = []
  until str_bytes.empty?
    case str_bytes[0]
    when 13 # eol - \r
      # An end-of-line marker appearing within a literal string without a preceding REVERSE SOLIDUS
      # shall be treated as a byte value of (0Ah),
      # irrespective of whether the end-of-line marker was a CARRIAGE RETURN (0Dh), a LINE FEED (0Ah), or both.
      str_bytes.shift
      str_bytes.shift if str_bytes[0] == 10
      str << 10
    when 10 # eol - \n
      str_bytes.shift
      str_bytes.shift if str_bytes[0] == 13
      str << 10
    when 92 # "\\".ord == 92
      str_bytes.shift
      rep = str_bytes.shift
      case rep
      when nil # a backslash at the very end, nothing to escape
        nil
      when 110 #n
        str << 10 #new line
      when 114 #r
        str << 13 # CR
      when 116 #t
        str << 9 #tab
      when 98 #b
        str << 8
      when 102 #f
        str << 12 # form feed
      when 48..55 # one to three octal digits stating a character code
        code = rep - 48
        2.times do
          break unless str_bytes[0] && str_bytes[0].between?(48, 55)
          code = (code << 3) + (str_bytes.shift - 48)
        end
        str << (code & 0xFF) # high-order overflow is ignored
      when 10 # new line, ignore
        str_bytes.shift if str_bytes[0] == 13
      when 13 # new line (or double notation for new line), ignore
        str_bytes.shift if str_bytes[0] == 10
      else
        # any other character: the backslash is dropped
        str << rep
      end
    else
      str << str_bytes.shift
    end
  end
  str.pack('C*')
end
|
314
|
+
end
|
315
|
+
end
|