origamindee 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +89 -0
- data/COPYING.LESSER +165 -0
- data/README.md +131 -0
- data/bin/config/pdfcop.conf.yml +236 -0
- data/bin/pdf2pdfa +87 -0
- data/bin/pdf2ruby +333 -0
- data/bin/pdfcop +476 -0
- data/bin/pdfdecompress +97 -0
- data/bin/pdfdecrypt +91 -0
- data/bin/pdfencrypt +113 -0
- data/bin/pdfexplode +223 -0
- data/bin/pdfextract +277 -0
- data/bin/pdfmetadata +143 -0
- data/bin/pdfsh +12 -0
- data/bin/shell/console.rb +128 -0
- data/bin/shell/hexdump.rb +59 -0
- data/bin/shell/irbrc +69 -0
- data/examples/README.md +34 -0
- data/examples/attachments/attachment.rb +38 -0
- data/examples/attachments/nested_document.rb +51 -0
- data/examples/encryption/encryption.rb +28 -0
- data/examples/events/events.rb +72 -0
- data/examples/flash/flash.rb +37 -0
- data/examples/flash/helloworld.swf +0 -0
- data/examples/forms/javascript.rb +54 -0
- data/examples/forms/xfa.rb +115 -0
- data/examples/javascript/hello_world.rb +22 -0
- data/examples/javascript/js_emulation.rb +54 -0
- data/examples/loop/goto.rb +32 -0
- data/examples/loop/named.rb +33 -0
- data/examples/signature/signature.rb +65 -0
- data/examples/uri/javascript.rb +56 -0
- data/examples/uri/open-uri.rb +21 -0
- data/examples/uri/submitform.rb +47 -0
- data/lib/origami/3d.rb +364 -0
- data/lib/origami/acroform.rb +321 -0
- data/lib/origami/actions.rb +318 -0
- data/lib/origami/annotations.rb +711 -0
- data/lib/origami/array.rb +242 -0
- data/lib/origami/boolean.rb +90 -0
- data/lib/origami/catalog.rb +418 -0
- data/lib/origami/collections.rb +144 -0
- data/lib/origami/compound.rb +161 -0
- data/lib/origami/destinations.rb +252 -0
- data/lib/origami/dictionary.rb +192 -0
- data/lib/origami/encryption.rb +1084 -0
- data/lib/origami/extensions/fdf.rb +347 -0
- data/lib/origami/extensions/ppklite.rb +422 -0
- data/lib/origami/filespec.rb +197 -0
- data/lib/origami/filters/ascii.rb +211 -0
- data/lib/origami/filters/ccitt/tables.rb +267 -0
- data/lib/origami/filters/ccitt.rb +357 -0
- data/lib/origami/filters/crypt.rb +38 -0
- data/lib/origami/filters/dct.rb +54 -0
- data/lib/origami/filters/flate.rb +69 -0
- data/lib/origami/filters/jbig2.rb +57 -0
- data/lib/origami/filters/jpx.rb +47 -0
- data/lib/origami/filters/lzw.rb +170 -0
- data/lib/origami/filters/predictors.rb +292 -0
- data/lib/origami/filters/runlength.rb +129 -0
- data/lib/origami/filters.rb +364 -0
- data/lib/origami/font.rb +196 -0
- data/lib/origami/functions.rb +79 -0
- data/lib/origami/graphics/colors.rb +230 -0
- data/lib/origami/graphics/instruction.rb +98 -0
- data/lib/origami/graphics/path.rb +182 -0
- data/lib/origami/graphics/patterns.rb +174 -0
- data/lib/origami/graphics/render.rb +62 -0
- data/lib/origami/graphics/state.rb +149 -0
- data/lib/origami/graphics/text.rb +225 -0
- data/lib/origami/graphics/xobject.rb +918 -0
- data/lib/origami/graphics.rb +38 -0
- data/lib/origami/header.rb +75 -0
- data/lib/origami/javascript.rb +713 -0
- data/lib/origami/linearization.rb +330 -0
- data/lib/origami/metadata.rb +172 -0
- data/lib/origami/name.rb +135 -0
- data/lib/origami/null.rb +65 -0
- data/lib/origami/numeric.rb +181 -0
- data/lib/origami/obfuscation.rb +245 -0
- data/lib/origami/object.rb +760 -0
- data/lib/origami/optionalcontent.rb +183 -0
- data/lib/origami/outline.rb +54 -0
- data/lib/origami/outputintents.rb +85 -0
- data/lib/origami/page.rb +722 -0
- data/lib/origami/parser.rb +269 -0
- data/lib/origami/parsers/fdf.rb +56 -0
- data/lib/origami/parsers/pdf/lazy.rb +176 -0
- data/lib/origami/parsers/pdf/linear.rb +122 -0
- data/lib/origami/parsers/pdf.rb +118 -0
- data/lib/origami/parsers/ppklite.rb +57 -0
- data/lib/origami/pdf.rb +1108 -0
- data/lib/origami/reference.rb +134 -0
- data/lib/origami/signature.rb +702 -0
- data/lib/origami/stream.rb +705 -0
- data/lib/origami/string.rb +444 -0
- data/lib/origami/template/patterns.rb +56 -0
- data/lib/origami/template/widgets.rb +151 -0
- data/lib/origami/trailer.rb +190 -0
- data/lib/origami/tree.rb +62 -0
- data/lib/origami/version.rb +23 -0
- data/lib/origami/webcapture.rb +100 -0
- data/lib/origami/xfa/config.rb +453 -0
- data/lib/origami/xfa/connectionset.rb +146 -0
- data/lib/origami/xfa/datasets.rb +49 -0
- data/lib/origami/xfa/localeset.rb +42 -0
- data/lib/origami/xfa/package.rb +59 -0
- data/lib/origami/xfa/pdf.rb +73 -0
- data/lib/origami/xfa/signature.rb +42 -0
- data/lib/origami/xfa/sourceset.rb +43 -0
- data/lib/origami/xfa/stylesheet.rb +44 -0
- data/lib/origami/xfa/template.rb +1691 -0
- data/lib/origami/xfa/xdc.rb +42 -0
- data/lib/origami/xfa/xfa.rb +146 -0
- data/lib/origami/xfa/xfdf.rb +43 -0
- data/lib/origami/xfa/xmpmeta.rb +43 -0
- data/lib/origami/xfa.rb +62 -0
- data/lib/origami/xreftable.rb +557 -0
- data/lib/origami.rb +47 -0
- data/test/dataset/calc.pdf +85 -0
- data/test/dataset/crypto.pdf +36 -0
- data/test/dataset/empty.pdf +49 -0
- data/test/test_actions.rb +27 -0
- data/test/test_annotations.rb +68 -0
- data/test/test_forms.rb +30 -0
- data/test/test_native_types.rb +83 -0
- data/test/test_object_tree.rb +33 -0
- data/test/test_pages.rb +60 -0
- data/test/test_pdf.rb +20 -0
- data/test/test_pdf_attachment.rb +34 -0
- data/test/test_pdf_create.rb +24 -0
- data/test/test_pdf_encrypt.rb +102 -0
- data/test/test_pdf_parse.rb +134 -0
- data/test/test_pdf_parse_lazy.rb +69 -0
- data/test/test_pdf_sign.rb +97 -0
- data/test/test_streams.rb +184 -0
- data/test/test_xrefs.rb +67 -0
- metadata +280 -0
data/bin/pdfencrypt
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
=begin
|
4
|
+
|
5
|
+
= Info
|
6
|
+
Encrypts a PDF document.
|
7
|
+
|
8
|
+
= License
|
9
|
+
Copyright (C) 2016 Guillaume Delugré.
|
10
|
+
|
11
|
+
Origami is free software: you can redistribute it and/or modify
|
12
|
+
it under the terms of the GNU Lesser General Public License as published by
|
13
|
+
the Free Software Foundation, either version 3 of the License, or
|
14
|
+
(at your option) any later version.
|
15
|
+
|
16
|
+
Origami is distributed in the hope that it will be useful,
|
17
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
18
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
19
|
+
GNU Lesser General Public License for more details.
|
20
|
+
|
21
|
+
You should have received a copy of the GNU Lesser General Public License
|
22
|
+
along with Origami. If not, see <http://www.gnu.org/licenses/>.
|
23
|
+
|
24
|
+
=end
|
25
|
+
|
26
|
+
begin
|
27
|
+
require 'origami'
|
28
|
+
rescue LoadError
|
29
|
+
$: << File.join(__dir__, '../lib')
|
30
|
+
require 'origami'
|
31
|
+
end
|
32
|
+
include Origami
|
33
|
+
|
34
|
+
require 'optparse'
|
35
|
+
|
36
|
+
# Command-line option parser for pdfencrypt.
#
# OptParser.parse(args) returns a Hash with the keys :output, :password,
# :cipher, :key_size and :hardened, and removes the recognized switches from
# +args+ so that only positional arguments remain.
class OptParser
  BANNER = <<USAGE
Usage: #{$0} [<PDF-file>] [-p <password>] [-c <cipher>] [-s <key-size>] [--hardened] [-o <output-file>]
Encrypts a PDF document. Supports RC4 40 to 128 bits, AES128, AES256.
Bug reports or feature requests at: http://github.com/gdelugre/origami

Options:
USAGE

  # Builds the OptionParser that stores every recognized switch in +options+.
  def self.parser(options)
    OptionParser.new do |opts|
      opts.banner = BANNER

      opts.on("-o", "--output FILE", "Output PDF file (stdout by default)") do |o|
        options[:output] = o
      end

      opts.on("-p", "--password PASSWORD", "Password of the document") do |p|
        options[:password] = p
      end

      opts.on("-c", "--cipher CIPHER", "Cipher used to encrypt the document (Default: AES)") do |c|
        options[:cipher] = c
      end

      # Integer coercion makes optparse reject a non-numeric size with
      # OptionParser::InvalidArgument instead of silently turning it into 0.
      opts.on("-s", "--key-size KEYSIZE", Integer, "Key size in bits (Default: 256)") do |s|
        options[:key_size] = s
      end

      opts.on("--hardened", "Use stronger key validation scheme (only AES-256)") do
        options[:hardened] = true
      end

      opts.on_tail("-h", "--help", "Show this message") do
        puts opts
        exit
      end
    end
  end

  # Parses +args+ in place and returns the options merged over the defaults.
  # Raises OptionParser::ParseError subclasses on invalid command lines.
  def self.parse(args)
    options = {
      output: STDOUT,
      password: '',
      cipher: 'aes',
      key_size: 256,
      hardened: false
    }

    parser(options).parse!(args)

    options
  end
end
|
91
|
+
|
92
|
+
# Script body: read the document, encrypt it with the requested settings and
# save the result. Any StandardError aborts with "<class>: <message>".
begin
  @options = OptParser.parse(ARGV)

  # The first remaining argument is the input file; fall back to stdin.
  target = ARGV.empty? ? STDIN : ARGV.shift

  read_options = { verbosity: Parser::VERBOSE_QUIET }
  document = PDF.read(target, read_options)

  # The same password is used as both user and owner password.
  password = @options[:password]
  document.encrypt(
    user_passwd: password,
    owner_passwd: password,
    cipher: @options[:cipher],
    key_size: @options[:key_size],
    hardened: @options[:hardened]
  )

  document.save(@options[:output], noindent: true)
rescue => error
  abort "#{error.class}: #{error.message}"
end
|
data/bin/pdfexplode
ADDED
@@ -0,0 +1,223 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
=begin
|
4
|
+
|
5
|
+
= Info
|
6
|
+
Explodes a PDF into separate documents.
|
7
|
+
|
8
|
+
= License
|
9
|
+
Copyright (C) 2016 Guillaume Delugré.
|
10
|
+
|
11
|
+
Origami is free software: you can redistribute it and/or modify
|
12
|
+
it under the terms of the GNU Lesser General Public License as published by
|
13
|
+
the Free Software Foundation, either version 3 of the License, or
|
14
|
+
(at your option) any later version.
|
15
|
+
|
16
|
+
Origami is distributed in the hope that it will be useful,
|
17
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
18
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
19
|
+
GNU Lesser General Public License for more details.
|
20
|
+
|
21
|
+
You should have received a copy of the GNU Lesser General Public License
|
22
|
+
along with Origami. If not, see <http://www.gnu.org/licenses/>.
|
23
|
+
|
24
|
+
=end
|
25
|
+
|
26
|
+
begin
|
27
|
+
require 'origami'
|
28
|
+
rescue LoadError
|
29
|
+
$: << File.join(__dir__, '../lib')
|
30
|
+
require 'origami'
|
31
|
+
end
|
32
|
+
include Origami
|
33
|
+
|
34
|
+
require 'optparse'
|
35
|
+
require 'rexml/document'
|
36
|
+
|
37
|
+
# Command-line option parser for pdfexplode.
#
# OptParser.parse(args) returns a Hash with :page_range (a 0-based Range,
# where an end of -1 means "up to the last page"), :split_by and, when given,
# :output_dir. Recognized switches are removed from +args+.
class OptParser
  BANNER = <<USAGE
Usage: #{$0} <PDF-file> [-r <range>] [-t pages|rsrc] [-d <output-directory>]
Explodes a document into separate documents.
Bug reports or feature requests at: http://github.com/gdelugre/origami

Options:
USAGE

  # Converts a 1-based page specification ("5", "1-3", "2-", "-3", "-") into
  # a 0-based Range. A missing lower bound means the first page, a missing
  # upper bound means the last page.
  def self.page_range(spec)
    unless spec.include?('-')
      page = spec.to_i
      return Range.new(page - 1, page - 1)
    end

    # A limit of 2 keeps the trailing empty field of "2-", so both open
    # bounds show up as empty strings rather than as 0 after to_i.
    lower, upper = spec.split('-', 2)
    from = lower.empty? ? 1 : lower.to_i
    to = upper.empty? ? 0 : upper.to_i

    Range.new(from - 1, to - 1)
  end
  private_class_method :page_range

  # Builds the OptionParser that stores every recognized switch in +options+.
  def self.parser(options)
    OptionParser.new do |opts|
      opts.banner = BANNER

      opts.on("-d", "--output-dir DIR", "Output directory.") do |d|
        options[:output_dir] = d
      end

      opts.on("-r", "--range PAGES", "Page range (e.g: 2-, 1-3, 5). Default to '-'.") do |r|
        options[:page_range] = page_range(r)
      end

      opts.on("-t", "--type TYPE", "Split by type. Can be 'pages' or 'rsrc'. Default to 'pages'.") do |t|
        options[:split_by] = t
      end

      opts.on_tail("-h", "--help", "Show this message.") do
        puts opts
        exit
      end
    end
  end

  # Parses +args+ in place and returns the options merged over the defaults
  # (every page, split by pages).
  def self.parse(args)
    options = {
      page_range: (0..-1),
      split_by: 'pages'
    }

    parser(options).parse!(args)

    options
  end
end
|
91
|
+
|
92
|
+
begin
|
93
|
+
@options = OptParser.parse(ARGV)
|
94
|
+
|
95
|
+
# The input file is mandatory for pdfexplode.
abort "Error: No filename was specified. #{$0} --help for details." if ARGV.empty?
target = ARGV.shift

# Default output directory: the input path with its ".pdf" suffix replaced by
# ".explode", located next to the input file.
if @options[:output_dir].nil?
  stem = File.basename(target, '.pdf')
  @options[:output_dir] = File.join(File.dirname(target), stem) + '.explode'
end

Origami::OPTIONS[:ignore_bad_references] = true

OUTPUT_DIR = @options[:output_dir]
Dir.mkdir(OUTPUT_DIR) unless File.directory?(OUTPUT_DIR)
|
108
|
+
|
109
|
+
# Writes one reduced copy of +page+ to +output_file+.
#
# The page copy is yielded to the caller's block, which installs the reduced
# resource dictionary on it. Afterwards every content-stream line mentioning
# one of +dropped_names+ is removed and the copy is appended to the new
# document.
def write_reduced_page(output_file, page, dropped_names)
  # Names are matched literally (Regexp.union escapes them) anywhere in a line.
  mention = Regexp.union(dropped_names.map(&:to_s))

  PDF.write(output_file) do |pdf|
    reduced = page.copy
    yield reduced

    reduced.each_content_stream do |stream|
      stream.data = stream.data.lines.reject { |line| line =~ mention }.join
    end

    STDERR.puts "Creating #{output_file}..."
    pdf.append_page(reduced)
  end
end

# Explodes +page+ into several single-page documents for one resource type.
#
# For the resource category +type+ this writes into OUTPUT_DIR:
# * page_<n>_keeponly_<type>.pdf        - only resources of that type are kept
# * page_<n>_excluded_<type>.pdf        - all resources of that type are removed
# * page_<n>_keeponly_<type>_<name>.pdf - only the resource +name+ is kept
# * page_<n>_excluded_<type>_<name>.pdf - only the resource +name+ is removed
#
# n::    page number used in the generated file names
# page:: page object; must respond to resources, copy and each_content_stream
# type:: resource category key (the script passes Resources::FONT and friends)
#
# Nothing is written when the page has no resource of the given type.
def split_by_rsrc(n, page, type)
  all_rsrc = page.resources
  type_rsrc = page.resources(type)

  # Nothing to split for this category.
  return if type_rsrc.empty?

  other_rsrc = all_rsrc.keys - type_rsrc.keys

  # Keep only the specified resource type.
  output_file = File.join(OUTPUT_DIR, "page_#{n}_keeponly_#{type}.pdf")
  write_reduced_page(output_file, page, other_rsrc) do |reduced|
    # New resource dictionary with only matching resources.
    reduced.Resources = Resources.new(type => type_rsrc)
  end

  # Remove the whole specified resource type.
  output_file = File.join(OUTPUT_DIR, "page_#{n}_excluded_#{type}.pdf")
  write_reduced_page(output_file, page, type_rsrc.keys) do |reduced|
    # Work on a copy so the source page's dictionary is left untouched.
    reduced.Resources = reduced.Resources.copy
    reduced.Resources.delete(type)
  end

  # Now treat each resource object of that type separately.
  type_rsrc.each_pair do |name, rsrc|
    anyother_rsrc = all_rsrc.keys - [name]

    # Keep only the specified resource object.
    output_file = File.join(OUTPUT_DIR, "page_#{n}_keeponly_#{type}_#{name}.pdf")
    write_reduced_page(output_file, page, anyother_rsrc) do |reduced|
      reduced.Resources = Resources.new(type => { name => rsrc })
    end

    # Remove only the specified resource object.
    output_file = File.join(OUTPUT_DIR, "page_#{n}_excluded_#{type}_#{name}.pdf")
    write_reduced_page(output_file, page, [name]) do |reduced|
      # Copy both dictionary levels before deleting so nothing is shared
      # with the source page.
      reduced.Resources = reduced.Resources.copy
      reduced.Resources[type] = reduced.Resources.send(type).copy
      reduced.Resources[type].delete(name)
    end
  end
end
|
187
|
+
|
188
|
+
# Parse the input document quietly.
read_options = { verbosity: Parser::VERBOSE_QUIET }
pdf = PDF.read(target, read_options)

# Page numbers used in file names are 1-based; the range itself is 0-based.
selection = @options[:page_range]
first_number = selection.first + 1

pdf.pages[selection].each.with_index(first_number) do |page, number|
  mode = @options[:split_by]

  if mode == 'pages'
    # One single-page document per selected page.
    output_file = File.join(OUTPUT_DIR, "page_#{number}.pdf")
    PDF.write(output_file) do |doc|
      STDERR.puts "Creating #{output_file}..."
      doc.append_page(page)
    end
  elsif mode == 'rsrc'
    # One set of reduced documents per resource category.
    categories = [
      Resources::EXTGSTATE,
      Resources::COLORSPACE,
      Resources::PATTERN,
      Resources::SHADING,
      Resources::XOBJECT,
      Resources::FONT,
      Resources::PROPERTIES
    ]
    categories.each { |type| split_by_rsrc(number, page, type) }
  else
    raise ArgumentError, "Unknown split option: #{@options[:split_by]}"
  end
end
|
220
|
+
|
221
|
+
rescue
|
222
|
+
abort "#{$!.class}: #{$!.message} #{$!.backtrace.join($/)}"
|
223
|
+
end
|
data/bin/pdfextract
ADDED
@@ -0,0 +1,277 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
=begin
|
4
|
+
|
5
|
+
= Info
|
6
|
+
Extracts valuable data from a PDF document. Can extract:
|
7
|
+
- decoded streams
|
8
|
+
- JavaScript
|
9
|
+
- file attachments
|
10
|
+
|
11
|
+
= License
|
12
|
+
Copyright (C) 2016 Guillaume Delugré.
|
13
|
+
|
14
|
+
Origami is free software: you can redistribute it and/or modify
|
15
|
+
it under the terms of the GNU Lesser General Public License as published by
|
16
|
+
the Free Software Foundation, either version 3 of the License, or
|
17
|
+
(at your option) any later version.
|
18
|
+
|
19
|
+
Origami is distributed in the hope that it will be useful,
|
20
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
21
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
22
|
+
GNU Lesser General Public License for more details.
|
23
|
+
|
24
|
+
You should have received a copy of the GNU Lesser General Public License
|
25
|
+
along with Origami. If not, see <http://www.gnu.org/licenses/>.
|
26
|
+
|
27
|
+
=end
|
28
|
+
|
29
|
+
begin
|
30
|
+
require 'origami'
|
31
|
+
rescue LoadError
|
32
|
+
$: << File.join(__dir__, '../lib')
|
33
|
+
require 'origami'
|
34
|
+
end
|
35
|
+
include Origami
|
36
|
+
|
37
|
+
require 'optparse'
|
38
|
+
require 'rexml/document'
|
39
|
+
|
40
|
+
# Command-line option parser for pdfextract.
#
# OptParser.parse(args) returns a Hash holding only the switches that were
# given: :output_dir (String) and the boolean selectors :streams,
# :attachments, :fonts, :javascript, :metadata and :images. Recognized
# switches are removed from +args+.
class OptParser
  BANNER = <<USAGE
Usage: #{$0} <PDF-file> [-afijms] [-d <output-directory>]
Extracts various data out of a document (streams, scripts, images, fonts, metadata, attachments).
Bug reports or feature requests at: http://github.com/gdelugre/origami

Options:
USAGE

  # Boolean selectors: short switch, long switch, help text, option key.
  # The order is the order in which they appear in the help output.
  SELECTORS = [
    ["-s", "--streams", "Extracts all decoded streams", :streams],
    ["-a", "--attachments", "Extracts file attachments", :attachments],
    ["-f", "--fonts", "Extracts embedded font files", :fonts],
    ["-j", "--js", "Extracts JavaScript scripts", :javascript],
    ["-m", "--metadata", "Extracts metadata streams", :metadata],
    ["-i", "--images", "Extracts embedded images", :images]
  ].freeze

  # Builds the OptionParser that stores every recognized switch in +options+.
  def self.parser(options)
    OptionParser.new do |opts|
      opts.banner = BANNER

      opts.on("-d", "--output-dir DIR", "Output directory") do |d|
        options[:output_dir] = d
      end

      SELECTORS.each do |short, long, description, key|
        opts.on(short, long, description) do
          options[key] = true
        end
      end

      opts.on_tail("-h", "--help", "Show this message") do
        puts opts
        exit
      end
    end
  end

  # Parses +args+ in place and returns the collected options (no defaults).
  def self.parse(args)
    options = {}

    parser(options).parse!(args)

    options
  end
end
|
96
|
+
|
97
|
+
# Script body: dump the selected kinds of data (decoded streams, JavaScript,
# attachments, fonts, metadata streams, images) from the input document into
# per-kind sub-directories of the output directory. Any StandardError that is
# not handled locally prints its backtrace and aborts.
begin
  @options = OptParser.parse(ARGV)

  abort "Error: No filename was specified. #{$0} --help for details." if ARGV.empty?
  target = ARGV.shift

  # Without an explicit selection, extract everything except metadata streams.
  unless %i[streams javascript attachments fonts metadata images].any? { |opt| @options[opt] }
    %i[streams javascript fonts attachments images].each { |opt| @options[opt] = true }
  end

  # Default output directory: "<input basename without .pdf>.dump" in the
  # current directory.
  if @options[:output_dir].nil?
    @options[:output_dir] = "#{File.basename(target, '.pdf')}.dump"
  end

  # Force data extraction, even for invalid FlateDecode streams.
  Origami::OPTIONS[:ignore_zlib_errors] = true
  Origami::OPTIONS[:ignore_png_errors] = true

  OUTPUT_DIR = @options[:output_dir]

  # Creates +path+ when it does not exist yet and returns it.
  ensure_dir = lambda do |path|
    Dir.mkdir(path) unless File.directory?(path)
    path
  end
  ensure_dir.call(OUTPUT_DIR)

  params = { verbosity: Parser::VERBOSE_QUIET }
  pdf = PDF.read(target, params)

  if @options[:streams]
    nstreams = 0
    stream_dir = ensure_dir.call(File.join(OUTPUT_DIR, "streams"))

    pdf.each_object.select { |obj| obj.is_a?(Stream) }.each do |stream|
      stream_file = File.join(stream_dir, "stream_#{stream.reference.refno}.dmp")
      begin
        File.binwrite(stream_file, stream.data)
      rescue
        # A stream that cannot be decoded is reported and skipped.
        STDERR.puts "Cannot decode stream #{stream.reference}: #{$!.message}"
        next
      end

      nstreams += 1
    end

    puts "Extracted #{nstreams} PDF streams to '#{stream_dir}'."
  end

  if @options[:javascript]
    nscripts = 0
    js_dir = ensure_dir.call(File.join(OUTPUT_DIR, "scripts"))

    pdf.ls(/^JS$/).each do |script|
      script_file = File.join(js_dir, "script_#{script.hash}.js")
      script_data = script.is_a?(Stream) ? script.data : script.value

      File.binwrite(script_file, script_data)
      nscripts += 1
    end

    # Also checking for presence of JavaScript in XML forms.
    if pdf.form? && pdf.Catalog.AcroForm.has_key?(:XFA)
      xfa = pdf.Catalog.AcroForm.XFA

      xml =
        case xfa
        when Array
          # Only odd-indexed entries are concatenated (presumably the array
          # alternates packet names and packet streams -- TODO confirm).
          packets = ""
          xfa.each_with_index do |packet, index|
            packets << packet.solve.data if index.odd?
          end
          packets
        when Stream
          xfa.data
        else
          # Keep going with the other extractions instead of crashing on an
          # unexpected XFA entry.
          STDERR.puts "Warning: Malformed XFA dictionary, skipping XFA scripts."
          nil
        end

      if xml
        xfadoc = REXML::Document.new(xml)
        REXML::XPath.match(xfadoc, "//script").each do |script|
          script_file = File.join(js_dir, "script_#{script.hash}.js")
          File.binwrite(script_file, script.text)
          nscripts += 1
        end
      end
    end

    puts "Extracted #{nscripts} scripts to '#{js_dir}'."
  end

  if @options[:attachments]
    nattach = 0
    attachments_dir = ensure_dir.call(File.join(OUTPUT_DIR, "attachments"))

    pdf.each_attachment do |name, attachment|
      # Neutralize path separators and NUL bytes before building a file name.
      safe_name = name.to_utf8.tr("\/\x00", "_")
      attached_file = File.join(attachments_dir, "attached_#{File.basename(safe_name)}")

      if attachment && attachment.EF && attachment.EF.F.is_a?(Stream)
        File.binwrite(attached_file, attachment.EF.F.data)
        nattach += 1
      end
    end

    puts "Extracted #{nattach} attachments to '#{attachments_dir}'."
  end

  if @options[:fonts]
    nfonts = 0
    fonts_dir = ensure_dir.call(File.join(OUTPUT_DIR, "fonts"))

    pdf.each_object.select { |obj| obj.is_a?(Stream) }.each do |stream|
      # A stream counts as a font file when a FontDescriptor is among its xrefs.
      font = stream.xrefs.find { |obj| obj.is_a?(FontDescriptor) }
      next unless font

      font_file = File.join(fonts_dir, File.basename(font.FontName.value.to_s))
      File.binwrite(font_file, stream.data)
      nfonts += 1
    end

    puts "Extracted #{nfonts} fonts to '#{fonts_dir}'."
  end

  if @options[:metadata]
    nmeta = 0
    metadata_dir = ensure_dir.call(File.join(OUTPUT_DIR, "metadata"))

    pdf.each_object.select { |obj| obj.is_a?(MetadataStream) }.each do |stream|
      metadata_file = File.join(metadata_dir, "metadata_#{stream.reference.refno}.xml")
      File.binwrite(metadata_file, stream.data)
      nmeta += 1
    end

    puts "Extracted #{nmeta} metadata streams to '#{metadata_dir}'."
  end

  if @options[:images]
    nimages = 0
    image_dir = ensure_dir.call(File.join(OUTPUT_DIR, "images"))

    pdf.each_object.select { |obj| obj.is_a?(Graphics::ImageXObject) }.each do |stream|
      begin
        ext, image_data = stream.to_image_file
        image_file = File.join(image_dir, "image_#{stream.reference.refno}.#{ext}")

        if ext != 'png' && stream.ColorSpace == Graphics::Color::Space::DEVICE_CMYK
          STDERR.puts "Warning: file '#{image_file}' is intended to be viewed in CMYK color space."
        end

        File.binwrite(image_file, image_data)
        nimages += 1
      rescue
        # An image that cannot be converted is reported and skipped.
        STDERR.puts "Unable to decode image (stream #{stream.reference.refno}). #{$!.message}"
        STDERR.puts $!.backtrace.join($/)
      end
    end

    puts "Extracted #{nimages} images to '#{image_dir}'."
  end

rescue
  STDERR.puts $!.backtrace.join($/)
  abort "#{$!.class}: #{$!.message}"
end
|