origami 1.2.7 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +66 -0
- data/README.md +112 -0
- data/bin/config/pdfcop.conf.yml +232 -233
- data/bin/gui/about.rb +27 -37
- data/bin/gui/config.rb +108 -117
- data/bin/gui/file.rb +416 -365
- data/bin/gui/gtkhex.rb +1138 -1153
- data/bin/gui/hexview.rb +55 -57
- data/bin/gui/imgview.rb +48 -51
- data/bin/gui/menu.rb +388 -386
- data/bin/gui/properties.rb +114 -130
- data/bin/gui/signing.rb +571 -617
- data/bin/gui/textview.rb +77 -95
- data/bin/gui/treeview.rb +382 -387
- data/bin/gui/walker.rb +227 -232
- data/bin/gui/xrefs.rb +56 -60
- data/bin/pdf2pdfa +53 -57
- data/bin/pdf2ruby +212 -228
- data/bin/pdfcop +338 -348
- data/bin/pdfdecompress +58 -65
- data/bin/pdfdecrypt +56 -60
- data/bin/pdfencrypt +75 -80
- data/bin/pdfexplode +185 -182
- data/bin/pdfextract +201 -218
- data/bin/pdfmetadata +83 -82
- data/bin/pdfsh +4 -5
- data/bin/pdfwalker +1 -2
- data/bin/shell/.irbrc +45 -82
- data/bin/shell/console.rb +105 -130
- data/bin/shell/hexdump.rb +40 -64
- data/examples/README.md +34 -0
- data/examples/attachments/attachment.rb +38 -0
- data/examples/attachments/nested_document.rb +51 -0
- data/examples/encryption/encryption.rb +28 -0
- data/{samples/actions/triggerevents/trigger.rb → examples/events/events.rb} +13 -16
- data/examples/flash/flash.rb +37 -0
- data/{samples → examples}/flash/helloworld.swf +0 -0
- data/examples/forms/javascript.rb +54 -0
- data/examples/forms/xfa.rb +115 -0
- data/examples/javascript/hello_world.rb +22 -0
- data/examples/javascript/js_emulation.rb +54 -0
- data/examples/loop/goto.rb +32 -0
- data/examples/loop/named.rb +33 -0
- data/examples/signature/signature.rb +65 -0
- data/examples/uri/javascript.rb +56 -0
- data/examples/uri/open-uri.rb +21 -0
- data/examples/uri/submitform.rb +47 -0
- data/lib/origami.rb +29 -42
- data/lib/origami/3d.rb +350 -225
- data/lib/origami/acroform.rb +262 -288
- data/lib/origami/actions.rb +268 -288
- data/lib/origami/annotations.rb +697 -722
- data/lib/origami/array.rb +258 -184
- data/lib/origami/boolean.rb +74 -84
- data/lib/origami/catalog.rb +397 -434
- data/lib/origami/collections.rb +144 -0
- data/lib/origami/destinations.rb +233 -194
- data/lib/origami/dictionary.rb +253 -232
- data/lib/origami/encryption.rb +1274 -1243
- data/lib/origami/export.rb +232 -268
- data/lib/origami/extensions/fdf.rb +307 -220
- data/lib/origami/extensions/ppklite.rb +368 -435
- data/lib/origami/filespec.rb +197 -0
- data/lib/origami/filters.rb +301 -295
- data/lib/origami/filters/ascii.rb +177 -180
- data/lib/origami/filters/ccitt.rb +528 -535
- data/lib/origami/filters/crypt.rb +26 -35
- data/lib/origami/filters/dct.rb +46 -52
- data/lib/origami/filters/flate.rb +95 -94
- data/lib/origami/filters/jbig2.rb +49 -55
- data/lib/origami/filters/jpx.rb +38 -44
- data/lib/origami/filters/lzw.rb +189 -183
- data/lib/origami/filters/predictors.rb +221 -235
- data/lib/origami/filters/runlength.rb +103 -104
- data/lib/origami/font.rb +173 -186
- data/lib/origami/functions.rb +67 -81
- data/lib/origami/graphics.rb +25 -21
- data/lib/origami/graphics/colors.rb +178 -187
- data/lib/origami/graphics/instruction.rb +79 -85
- data/lib/origami/graphics/path.rb +142 -148
- data/lib/origami/graphics/patterns.rb +160 -167
- data/lib/origami/graphics/render.rb +43 -50
- data/lib/origami/graphics/state.rb +138 -153
- data/lib/origami/graphics/text.rb +188 -205
- data/lib/origami/graphics/xobject.rb +819 -815
- data/lib/origami/header.rb +63 -78
- data/lib/origami/javascript.rb +596 -597
- data/lib/origami/linearization.rb +285 -290
- data/lib/origami/metadata.rb +139 -148
- data/lib/origami/name.rb +112 -148
- data/lib/origami/null.rb +53 -62
- data/lib/origami/numeric.rb +162 -175
- data/lib/origami/obfuscation.rb +186 -174
- data/lib/origami/object.rb +593 -573
- data/lib/origami/outline.rb +42 -47
- data/lib/origami/outputintents.rb +73 -82
- data/lib/origami/page.rb +703 -592
- data/lib/origami/parser.rb +238 -290
- data/lib/origami/parsers/fdf.rb +41 -33
- data/lib/origami/parsers/pdf.rb +75 -95
- data/lib/origami/parsers/pdf/lazy.rb +137 -0
- data/lib/origami/parsers/pdf/linear.rb +64 -66
- data/lib/origami/parsers/ppklite.rb +34 -70
- data/lib/origami/pdf.rb +1030 -1005
- data/lib/origami/reference.rb +102 -102
- data/lib/origami/signature.rb +591 -609
- data/lib/origami/stream.rb +668 -551
- data/lib/origami/string.rb +397 -373
- data/lib/origami/template/patterns.rb +56 -0
- data/lib/origami/template/widgets.rb +151 -0
- data/lib/origami/trailer.rb +144 -158
- data/lib/origami/tree.rb +62 -0
- data/lib/origami/version.rb +23 -0
- data/lib/origami/webcapture.rb +88 -79
- data/lib/origami/xfa.rb +2863 -2882
- data/lib/origami/xreftable.rb +472 -384
- data/test/dataset/calc.pdf +85 -0
- data/test/dataset/crypto.pdf +82 -0
- data/test/dataset/empty.pdf +49 -0
- data/test/test_actions.rb +27 -0
- data/test/test_annotations.rb +90 -0
- data/test/test_pages.rb +31 -0
- data/test/test_pdf.rb +16 -0
- data/test/test_pdf_attachment.rb +34 -0
- data/test/test_pdf_create.rb +24 -0
- data/test/test_pdf_encrypt.rb +95 -0
- data/test/test_pdf_parse.rb +96 -0
- data/test/test_pdf_sign.rb +58 -0
- data/test/test_streams.rb +182 -0
- data/test/test_xrefs.rb +67 -0
- metadata +88 -58
- data/README +0 -67
- data/bin/pdf2graph +0 -121
- data/bin/pdfcocoon +0 -104
- data/lib/origami/file.rb +0 -233
- data/samples/README.txt +0 -45
- data/samples/actions/launch/calc.rb +0 -87
- data/samples/actions/launch/winparams.rb +0 -22
- data/samples/actions/loop/loopgoto.rb +0 -24
- data/samples/actions/loop/loopnamed.rb +0 -21
- data/samples/actions/named/named.rb +0 -31
- data/samples/actions/samba/smbrelay.rb +0 -26
- data/samples/actions/webbug/submitform.js +0 -26
- data/samples/actions/webbug/webbug-browser.rb +0 -68
- data/samples/actions/webbug/webbug-js.rb +0 -67
- data/samples/actions/webbug/webbug-reader.rb +0 -90
- data/samples/attachments/attach.rb +0 -40
- data/samples/attachments/attached.txt +0 -1
- data/samples/crypto/crypto.rb +0 -28
- data/samples/digsig/signed.rb +0 -46
- data/samples/exploits/cve-2008-2992-utilprintf.rb +0 -87
- data/samples/exploits/cve-2009-0927-geticon.rb +0 -65
- data/samples/exploits/exploit_customdictopen.rb +0 -55
- data/samples/exploits/getannots.rb +0 -69
- data/samples/flash/flash.rb +0 -31
- data/samples/javascript/attached.txt +0 -1
- data/samples/javascript/js.rb +0 -52
- data/templates/patterns.rb +0 -66
- data/templates/widgets.rb +0 -173
- data/templates/xdp.rb +0 -92
- data/test/ts_pdf.rb +0 -50
data/bin/pdfexplode
CHANGED
@@ -1,34 +1,33 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
=begin
|
4
4
|
|
5
|
-
=
|
6
|
-
|
5
|
+
= Info
|
6
|
+
Explodes a PDF into separate documents.
|
7
7
|
|
8
|
-
=
|
9
|
-
|
10
|
-
= License:
|
11
|
-
Origami is free software: you can redistribute it and/or modify
|
12
|
-
it under the terms of the GNU Lesser General Public License as published by
|
13
|
-
the Free Software Foundation, either version 3 of the License, or
|
14
|
-
(at your option) any later version.
|
8
|
+
= License
|
9
|
+
Copyright (C) 2016 Guillaume Delugré.
|
15
10
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
11
|
+
Origami is free software: you can redistribute it and/or modify
|
12
|
+
it under the terms of the GNU Lesser General Public License as published by
|
13
|
+
the Free Software Foundation, either version 3 of the License, or
|
14
|
+
(at your option) any later version.
|
20
15
|
|
21
|
-
|
22
|
-
|
16
|
+
Origami is distributed in the hope that it will be useful,
|
17
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
18
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
19
|
+
GNU Lesser General Public License for more details.
|
20
|
+
|
21
|
+
You should have received a copy of the GNU Lesser General Public License
|
22
|
+
along with Origami. If not, see <http://www.gnu.org/licenses/>.
|
23
23
|
|
24
24
|
=end
|
25
25
|
|
26
26
|
begin
|
27
|
-
|
27
|
+
require 'origami'
|
28
28
|
rescue LoadError
|
29
|
-
|
30
|
-
|
31
|
-
require 'origami'
|
29
|
+
$: << File.join(__dir__, '../lib')
|
30
|
+
require 'origami'
|
32
31
|
end
|
33
32
|
include Origami
|
34
33
|
|
@@ -36,185 +35,189 @@ require 'optparse'
|
|
36
35
|
require 'rexml/document'
|
37
36
|
|
38
37
|
class OptParser
|
39
|
-
|
38
|
+
BANNER = <<USAGE
|
40
39
|
Usage: #{$0} <PDF-file> [-r <range>] [-t pages|rsrc] [-d <output-directory>]
|
41
40
|
Explodes a document into separate documents.
|
42
|
-
Bug reports or feature requests at: http://
|
41
|
+
Bug reports or feature requests at: http://github.com/gdelugre/origami
|
43
42
|
|
44
43
|
Options:
|
45
44
|
USAGE
|
46
45
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
46
|
+
def self.parser(options)
|
47
|
+
OptionParser.new do |opts|
|
48
|
+
opts.banner = BANNER
|
49
|
+
|
50
|
+
opts.on("-d", "--output-dir DIR", "Output directory.") do |d|
|
51
|
+
options[:output_dir] = d
|
52
|
+
end
|
53
|
+
|
54
|
+
opts.on("-r", "--range PAGES", "Page range (e.g: 2-, 1-3, 5). Default to '-'.") do |r|
|
55
|
+
range =
|
56
|
+
if r.index('-').nil?
|
57
|
+
page = r.to_i
|
58
|
+
Range.new(page-1, page-1)
|
59
|
+
else
|
60
|
+
from, to = r.split('-').map{|bound| bound.to_i}
|
61
|
+
from ||= 1
|
62
|
+
to ||= 0
|
63
|
+
Range.new(from-1, to-1)
|
64
|
+
end
|
65
|
+
options[:page_range] = range
|
66
|
+
end
|
67
|
+
|
68
|
+
opts.on("-t", "--type TYPE", "Split by type. Can be 'pages' or 'rsrc'. Default to 'pages'.") do |t|
|
69
|
+
options[:split_by] = t
|
70
|
+
end
|
71
|
+
|
72
|
+
opts.on_tail("-h", "--help", "Show this message.") do
|
73
|
+
puts opts
|
74
|
+
exit
|
75
|
+
end
|
76
|
+
end
|
77
77
|
end
|
78
|
-
end
|
79
78
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
79
|
+
def self.parse(args)
|
80
|
+
options =
|
81
|
+
{
|
82
|
+
page_range: (0..-1),
|
83
|
+
split_by: 'pages'
|
84
|
+
}
|
86
85
|
|
87
|
-
|
86
|
+
self.parser(options).parse!(args)
|
88
87
|
|
89
|
-
|
90
|
-
|
88
|
+
options
|
89
|
+
end
|
91
90
|
end
|
92
91
|
|
93
92
|
begin
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
93
|
+
@options = OptParser.parse(ARGV)
|
94
|
+
|
95
|
+
if ARGV.empty?
|
96
|
+
abort "Error: No filename was specified. #{$0} --help for details."
|
97
|
+
else
|
98
|
+
target = ARGV.shift
|
99
|
+
end
|
100
|
+
|
101
|
+
if @options[:output_dir].nil?
|
102
|
+
@options[:output_dir] = "#{File.join(File.dirname(target), File.basename(target,'.pdf'))}.explode"
|
103
|
+
end
|
104
|
+
|
105
|
+
Origami::OPTIONS[:ignore_bad_references] = true
|
106
|
+
OUTPUT_DIR = @options[:output_dir]
|
107
|
+
Dir::mkdir(OUTPUT_DIR) unless File.directory?(OUTPUT_DIR)
|
108
|
+
|
109
|
+
def split_by_rsrc(n, page, type)
|
110
|
+
all_rsrc = page.resources
|
111
|
+
type_rsrc = page.resources(type)
|
112
|
+
other_rsrc = all_rsrc.keys - type_rsrc.keys
|
113
|
+
|
114
|
+
unless type_rsrc.empty?
|
115
|
+
# Keep only specified resource type.
|
116
|
+
output_file = File.join(OUTPUT_DIR, "page_#{n}_keeponly_#{type}.pdf")
|
117
|
+
PDF.write(output_file) do |pdf|
|
118
|
+
reduced = page.copy
|
119
|
+
# New resource dictionary with only matching resources.
|
120
|
+
reduced.Resources = Resources.new(type => type_rsrc)
|
121
|
+
# Remove mention of other resources.
|
122
|
+
reduced.each_content_stream do |stream|
|
123
|
+
stream.data = stream.data.lines.
|
124
|
+
delete_if {|line| other_rsrc.any?{|rsrc| line =~ /#{rsrc}/}}.join
|
125
|
+
end
|
126
|
+
|
127
|
+
STDERR.puts "Creating #{output_file}..."
|
128
|
+
pdf.append_page(reduced)
|
129
|
+
end
|
130
|
+
|
131
|
+
# Remove all specified resource type.
|
132
|
+
output_file = File.join(OUTPUT_DIR, "page_#{n}_excluded_#{type}.pdf")
|
133
|
+
PDF.write(output_file) do |pdf|
|
134
|
+
reduced = page.copy
|
135
|
+
# New resource dictionary with no resource of specified type.
|
136
|
+
reduced.Resources = reduced.Resources.copy
|
137
|
+
reduced.Resources.delete(type)
|
138
|
+
# Remove mentions of this resource type.
|
139
|
+
reduced.each_content_stream do |stream|
|
140
|
+
stream.data = stream.data.lines.
|
141
|
+
delete_if {|line| type_rsrc.keys.any?{|rsrc| line =~ /#{rsrc}/}}.join
|
142
|
+
end
|
143
|
+
|
144
|
+
STDERR.puts "Creating #{output_file}..."
|
145
|
+
pdf.append_page(reduced)
|
146
|
+
end
|
147
|
+
|
148
|
+
# Now treating each resource object separately.
|
149
|
+
type_rsrc.each_pair do |name, rsrc|
|
150
|
+
anyother_rsrc = all_rsrc.keys - [ name ]
|
151
|
+
# Keep only specified resource object.
|
152
|
+
output_file = File.join(OUTPUT_DIR, "page_#{n}_keeponly_#{type}_#{name}.pdf")
|
153
|
+
PDF.write(output_file) do |pdf|
|
154
|
+
reduced = page.copy
|
155
|
+
# New resource dictionary with only specified resource object.
|
156
|
+
reduced.Resources = Resources.new(type => {name => rsrc})
|
157
|
+
# Remove mention of all other resources.
|
158
|
+
reduced.each_content_stream do |stream|
|
159
|
+
stream.data = stream.data.lines.
|
160
|
+
delete_if {|line| anyother_rsrc.any?{|rsrc| line =~ /#{rsrc}/}}.join
|
161
|
+
end
|
162
|
+
|
163
|
+
STDERR.puts "Creating #{output_file}..."
|
164
|
+
pdf.append_page(reduced)
|
165
|
+
end
|
166
|
+
|
167
|
+
# Remove only specified resource object.
|
168
|
+
output_file = File.join(OUTPUT_DIR, "page_#{n}_excluded_#{type}_#{name}.pdf")
|
169
|
+
PDF.write(output_file) do |pdf|
|
170
|
+
reduced = page.copy
|
171
|
+
# New resource dictionary without the specified resource object.
|
172
|
+
reduced.Resources = reduced.Resources.copy
|
173
|
+
reduced.Resources[type] = reduced.Resources.send(type).copy
|
174
|
+
reduced.Resources[type].delete(name)
|
175
|
+
# Remove mention of this resource only.
|
176
|
+
reduced.each_content_stream do |stream|
|
177
|
+
stream.data = stream.data.lines.
|
178
|
+
delete_if {|line| line =~ /#{name}/}.join
|
179
|
+
end
|
180
|
+
|
181
|
+
STDERR.puts "Creating #{output_file}..."
|
182
|
+
pdf.append_page(reduced)
|
183
|
+
end
|
184
|
+
end
|
177
185
|
end
|
178
|
-
end
|
179
186
|
end
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
187
|
+
|
188
|
+
params =
|
189
|
+
{
|
190
|
+
verbosity: Parser::VERBOSE_QUIET,
|
191
|
+
}
|
192
|
+
pdf = PDF.read(target, params)
|
193
|
+
|
194
|
+
i = @options[:page_range].first + 1
|
195
|
+
pdf.pages[@options[:page_range]].each do |page|
|
196
|
+
case @options[:split_by]
|
197
|
+
when 'pages'
|
198
|
+
output_file = File.join(OUTPUT_DIR, "page_#{i}.pdf")
|
199
|
+
PDF.write(output_file) do |pdf|
|
200
|
+
STDERR.puts "Creating #{output_file}..."
|
201
|
+
pdf.append_page(page)
|
202
|
+
end
|
203
|
+
|
204
|
+
when 'rsrc'
|
205
|
+
[ Resources::EXTGSTATE,
|
206
|
+
Resources::COLORSPACE,
|
207
|
+
Resources::PATTERN,
|
208
|
+
Resources::SHADING,
|
209
|
+
Resources::XOBJECT,
|
210
|
+
Resources::FONT,
|
211
|
+
Resources::PROPERTIES
|
212
|
+
].each { |type| split_by_rsrc(i, page, type) }
|
213
|
+
|
214
|
+
else
|
215
|
+
raise ArgumentError, "Unknown split option: #{@options[:split_by]}"
|
196
216
|
end
|
197
217
|
|
198
|
-
|
199
|
-
[ Resources::EXTGSTATE,
|
200
|
-
Resources::COLORSPACE,
|
201
|
-
Resources::PATTERN,
|
202
|
-
Resources::SHADING,
|
203
|
-
Resources::XOBJECT,
|
204
|
-
Resources::FONT,
|
205
|
-
Resources::PROPERTIES
|
206
|
-
].each { |type| split_by_rsrc(i, page, type) }
|
207
|
-
|
208
|
-
else
|
209
|
-
raise ArgumentError, "Unknown split option: #{@options[:split_by]}"
|
218
|
+
i += 1
|
210
219
|
end
|
211
220
|
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
rescue SystemExit
|
216
|
-
rescue Exception => e
|
217
|
-
STDERR.puts "#{e.class}: #{e.message} #{e.backtrace}"
|
218
|
-
exit 1
|
221
|
+
rescue
|
222
|
+
abort "#{$!.class}: #{$!.message} #{$!.backtrace.join($/)}"
|
219
223
|
end
|
220
|
-
|
data/bin/pdfextract
CHANGED
@@ -1,38 +1,36 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
=begin
|
4
4
|
|
5
|
-
=
|
6
|
-
|
5
|
+
= Info
|
6
|
+
Extracts valuable data from a PDF document. Can extract:
|
7
|
+
- decoded streams
|
8
|
+
- JavaScript
|
9
|
+
- file attachments
|
7
10
|
|
8
|
-
=
|
9
|
-
|
10
|
-
- decoded streams
|
11
|
-
- JavaScript
|
12
|
-
- file attachments
|
11
|
+
= License
|
12
|
+
Copyright (C) 2016 Guillaume Delugré.
|
13
13
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
(at your option) any later version.
|
14
|
+
Origami is free software: you can redistribute it and/or modify
|
15
|
+
it under the terms of the GNU Lesser General Public License as published by
|
16
|
+
the Free Software Foundation, either version 3 of the License, or
|
17
|
+
(at your option) any later version.
|
19
18
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
19
|
+
Origami is distributed in the hope that it will be useful,
|
20
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
21
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
22
|
+
GNU Lesser General Public License for more details.
|
24
23
|
|
25
|
-
|
26
|
-
|
24
|
+
You should have received a copy of the GNU Lesser General Public License
|
25
|
+
along with Origami. If not, see <http://www.gnu.org/licenses/>.
|
27
26
|
|
28
27
|
=end
|
29
28
|
|
30
29
|
begin
|
31
|
-
|
30
|
+
require 'origami'
|
32
31
|
rescue LoadError
|
33
|
-
|
34
|
-
|
35
|
-
require 'origami'
|
32
|
+
$: << File.join(__dir__, '../lib')
|
33
|
+
require 'origami'
|
36
34
|
end
|
37
35
|
include Origami
|
38
36
|
|
@@ -40,246 +38,231 @@ require 'optparse'
|
|
40
38
|
require 'rexml/document'
|
41
39
|
|
42
40
|
class OptParser
|
43
|
-
|
41
|
+
BANNER = <<USAGE
|
44
42
|
Usage: #{$0} <PDF-file> [-afjms] [-d <output-directory>]
|
45
43
|
Extracts various data out of a document (streams, scripts, images, fonts, metadata, attachments).
|
46
|
-
Bug reports or feature requests at: http://
|
44
|
+
Bug reports or feature requests at: http://github.com/gdelugre/origami
|
47
45
|
|
48
46
|
Options:
|
49
47
|
USAGE
|
50
48
|
|
51
|
-
|
52
|
-
|
53
|
-
|
49
|
+
def self.parser(options)
|
50
|
+
OptionParser.new do |opts|
|
51
|
+
opts.banner = BANNER
|
54
52
|
|
55
|
-
|
56
|
-
|
57
|
-
|
53
|
+
opts.on("-d", "--output-dir DIR", "Output directory") do |d|
|
54
|
+
options[:output_dir] = d
|
55
|
+
end
|
58
56
|
|
59
|
-
|
60
|
-
|
61
|
-
|
57
|
+
opts.on("-s", "--streams", "Extracts all decoded streams") do
|
58
|
+
options[:streams] = true
|
59
|
+
end
|
62
60
|
|
63
|
-
|
64
|
-
|
65
|
-
|
61
|
+
opts.on("-a", "--attachments", "Extracts file attachments") do
|
62
|
+
options[:attachments] = true
|
63
|
+
end
|
66
64
|
|
67
|
-
|
68
|
-
|
69
|
-
|
65
|
+
opts.on("-f", "--fonts", "Extracts embedded font files") do
|
66
|
+
options[:fonts] = true
|
67
|
+
end
|
70
68
|
|
71
|
-
|
72
|
-
|
73
|
-
|
69
|
+
opts.on("-j", "--js", "Extracts JavaScript scripts") do
|
70
|
+
options[:javascript] = true
|
71
|
+
end
|
74
72
|
|
75
|
-
|
76
|
-
|
77
|
-
|
73
|
+
opts.on("-m", "--metadata", "Extracts metadata streams") do
|
74
|
+
options[:metadata] = true
|
75
|
+
end
|
78
76
|
|
79
|
-
|
80
|
-
|
81
|
-
|
77
|
+
opts.on("-i", "--images", "Extracts embedded images") do
|
78
|
+
options[:images] = true
|
79
|
+
end
|
82
80
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
81
|
+
opts.on_tail("-h", "--help", "Show this message") do
|
82
|
+
puts opts
|
83
|
+
exit
|
84
|
+
end
|
85
|
+
end
|
87
86
|
end
|
88
|
-
end
|
89
87
|
|
90
|
-
|
91
|
-
|
92
|
-
{
|
93
|
-
}
|
88
|
+
def self.parse(args)
|
89
|
+
options = {}
|
94
90
|
|
95
|
-
|
91
|
+
self.parser(options).parse!(args)
|
96
92
|
|
97
|
-
|
98
|
-
|
93
|
+
options
|
94
|
+
end
|
99
95
|
end
|
100
96
|
|
101
97
|
begin
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
target = ARGV.shift
|
109
|
-
end
|
110
|
-
|
111
|
-
unless [:streams,:javascript,:attachments,:fonts,:metadata,:images].any? {|opt| @options[opt]}
|
112
|
-
@options[:streams] =
|
113
|
-
@options[:javascript] =
|
114
|
-
@options[:fonts] =
|
115
|
-
@options[:attachments] =
|
116
|
-
@options[:images] = true
|
117
|
-
end
|
118
|
-
|
119
|
-
if @options[:output_dir].nil?
|
120
|
-
@options[:output_dir] = "#{File.basename(target, '.pdf')}.dump"
|
121
|
-
end
|
122
|
-
|
123
|
-
# Force data extraction, even for invalid FlateDecode streams.
|
124
|
-
Origami::OPTIONS[:ignore_zlib_errors] = true
|
125
|
-
|
126
|
-
OUTPUT_DIR = @options[:output_dir]
|
127
|
-
Dir::mkdir(OUTPUT_DIR) unless File.directory?(OUTPUT_DIR)
|
128
|
-
|
129
|
-
params =
|
130
|
-
{
|
131
|
-
:verbosity => Parser::VERBOSE_QUIET,
|
132
|
-
}
|
133
|
-
pdf = PDF.read(target, params)
|
134
|
-
|
135
|
-
if @options[:streams]
|
136
|
-
nstreams = 0
|
137
|
-
Dir::mkdir("#{OUTPUT_DIR}/streams") unless File.directory?("#{OUTPUT_DIR}/streams")
|
138
|
-
|
139
|
-
pdf.root_objects.find_all{|obj| obj.is_a?(Stream)}.each do |stream|
|
140
|
-
stream_file = "#{OUTPUT_DIR}/streams/stream_#{stream.reference.refno}.dmp"
|
141
|
-
File.open(stream_file, "wb") do |fd|
|
142
|
-
fd.write(stream.data)
|
143
|
-
end
|
144
|
-
nstreams += 1
|
98
|
+
@options = OptParser.parse(ARGV)
|
99
|
+
|
100
|
+
if ARGV.empty?
|
101
|
+
abort "Error: No filename was specified. #{$0} --help for details."
|
102
|
+
else
|
103
|
+
target = ARGV.shift
|
145
104
|
end
|
146
105
|
|
147
|
-
|
148
|
-
|
106
|
+
unless %i[streams javascript attachments fonts metadata images].any? {|opt| @options[opt]}
|
107
|
+
@options[:streams] =
|
108
|
+
@options[:javascript] =
|
109
|
+
@options[:fonts] =
|
110
|
+
@options[:attachments] =
|
111
|
+
@options[:images] = true
|
112
|
+
end
|
149
113
|
|
150
|
-
|
151
|
-
|
152
|
-
|
114
|
+
if @options[:output_dir].nil?
|
115
|
+
@options[:output_dir] = "#{File.basename(target, '.pdf')}.dump"
|
116
|
+
end
|
153
117
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
118
|
+
# Force data extraction, even for invalid FlateDecode streams.
|
119
|
+
Origami::OPTIONS[:ignore_zlib_errors] = true
|
120
|
+
Origami::OPTIONS[:ignore_png_errors] = true
|
121
|
+
|
122
|
+
OUTPUT_DIR = @options[:output_dir]
|
123
|
+
Dir::mkdir(OUTPUT_DIR) unless File.directory?(OUTPUT_DIR)
|
124
|
+
|
125
|
+
params =
|
126
|
+
{
|
127
|
+
verbosity: Parser::VERBOSE_QUIET,
|
128
|
+
}
|
129
|
+
pdf = PDF.read(target, params)
|
130
|
+
|
131
|
+
if @options[:streams]
|
132
|
+
nstreams = 0
|
133
|
+
stream_dir = File.join(OUTPUT_DIR, "streams")
|
134
|
+
Dir::mkdir(stream_dir) unless File.directory?(stream_dir)
|
135
|
+
|
136
|
+
pdf.root_objects.find_all{|obj| obj.is_a?(Stream)}.each do |stream|
|
137
|
+
stream_file = File.join(stream_dir, "stream_#{stream.reference.refno}.dmp")
|
138
|
+
File.binwrite(stream_file, stream.data)
|
139
|
+
nstreams += 1
|
140
|
+
end
|
141
|
+
|
142
|
+
puts "Extracted #{nstreams} PDF streams to '#{stream_dir}'."
|
167
143
|
end
|
168
144
|
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
145
|
+
if @options[:javascript]
|
146
|
+
nscripts = 0
|
147
|
+
js_dir = File.join(OUTPUT_DIR, "scripts")
|
148
|
+
Dir::mkdir(js_dir) unless File.directory?(js_dir)
|
149
|
+
|
150
|
+
pdf.ls(/^JS$/).each do |script|
|
151
|
+
script_file = File.join(js_dir, "script_#{script.hash}.js")
|
152
|
+
script_data =
|
153
|
+
case script
|
154
|
+
when Stream then script.data
|
155
|
+
else script.value
|
156
|
+
end
|
157
|
+
|
158
|
+
File.binwrite(script_file, script_data)
|
159
|
+
nscripts += 1
|
160
|
+
end
|
161
|
+
|
162
|
+
# Also checking for presence of JavaScript in XML forms.
|
163
|
+
if pdf.form? and pdf.Catalog.AcroForm.has_key?(:XFA)
|
164
|
+
xfa = pdf.Catalog.AcroForm[:XFA].solve
|
165
|
+
|
166
|
+
case xfa
|
167
|
+
when Array then
|
168
|
+
xml = ""
|
169
|
+
i = 0
|
170
|
+
xfa.each do |packet|
|
171
|
+
if i % 2 == 1
|
172
|
+
xml << packet.solve.data
|
173
|
+
end
|
174
|
+
|
175
|
+
i = i + 1
|
176
|
+
end
|
177
|
+
when Stream then
|
178
|
+
xml = xfa.data
|
179
|
+
else
|
180
|
+
reject("Malformed XFA dictionary")
|
180
181
|
end
|
181
182
|
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
end
|
189
|
-
|
190
|
-
xfadoc = REXML::Document.new(xml)
|
191
|
-
REXML::XPath.match(xfadoc, "//script").each do |script|
|
192
|
-
script_file = "#{OUTPUT_DIR}/script_#{script.hash}.js"
|
193
|
-
File.open(script_file, "wb") do |fd|
|
194
|
-
fd.write(script.text)
|
183
|
+
xfadoc = REXML::Document.new(xml)
|
184
|
+
REXML::XPath.match(xfadoc, "//script").each do |script|
|
185
|
+
script_file = File.join(js_dir, "script_#{script.hash}.js")
|
186
|
+
File.binwrite(script_file, script.text)
|
187
|
+
nscripts += 1
|
188
|
+
end
|
195
189
|
end
|
196
|
-
|
197
|
-
|
190
|
+
|
191
|
+
puts "Extracted #{nscripts} scripts to '#{js_dir}'."
|
198
192
|
end
|
199
193
|
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
fd.write(f.data)
|
194
|
+
if @options[:attachments]
|
195
|
+
nattach = 0
|
196
|
+
attachments_dir = File.join(OUTPUT_DIR, "attachments")
|
197
|
+
Dir::mkdir(attachments_dir) unless File.directory?(attachments_dir)
|
198
|
+
|
199
|
+
pdf.each_attachment do |name, attachment|
|
200
|
+
attached_file = File.join(attachments_dir, "attached_#{File.basename(name)}")
|
201
|
+
spec = attachment.solve
|
202
|
+
if spec and spec.EF and f = spec.EF.F and f.is_a?(Stream)
|
203
|
+
File.binwrite(attached_file, f.data)
|
204
|
+
nattach += 1
|
205
|
+
end
|
213
206
|
end
|
214
|
-
|
215
|
-
|
207
|
+
|
208
|
+
puts "Extracted #{nattach} attachments to '#{attachments_dir}'."
|
216
209
|
end
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
fd.write(stream.data)
|
210
|
+
|
211
|
+
if @options[:fonts]
|
212
|
+
nfonts = 0
|
213
|
+
fonts_dir = File.join(OUTPUT_DIR, "fonts")
|
214
|
+
Dir::mkdir(fonts_dir) unless File.directory?(fonts_dir)
|
215
|
+
|
216
|
+
pdf.root_objects.find_all{|obj| obj.is_a?(Stream)}.each do |stream|
|
217
|
+
font = stream.xrefs.find{|obj| obj.is_a?(FontDescriptor)}
|
218
|
+
if font
|
219
|
+
font_file = File.join(fonts_dir, File.basename(font.FontName.value.to_s))
|
220
|
+
File.binwrite(font_file, stream.data)
|
221
|
+
nfonts += 1
|
222
|
+
end
|
231
223
|
end
|
232
|
-
|
233
|
-
|
224
|
+
|
225
|
+
puts "Extracted #{nfonts} fonts to '#{fonts_dir}'."
|
234
226
|
end
|
235
227
|
|
236
|
-
|
237
|
-
|
228
|
+
if @options[:metadata]
|
229
|
+
nmeta = 0
|
230
|
+
metadata_dir = File.join(OUTPUT_DIR, "metadata")
|
231
|
+
Dir::mkdir(metadata_dir) unless File.directory?(metadata_dir)
|
238
232
|
|
239
|
-
|
240
|
-
|
241
|
-
|
233
|
+
pdf.root_objects.find_all{|obj| obj.is_a?(MetadataStream)}.each do |stream|
|
234
|
+
metadata_file = File.join(metadata_dir, "metadata_#{stream.reference.refno}.xml")
|
235
|
+
File.binwrite(metadata_file, stream.data)
|
236
|
+
nmeta += 1
|
237
|
+
end
|
242
238
|
|
243
|
-
|
244
|
-
metadata_file = "#{OUTPUT_DIR}/metadata/metadata_#{stream.reference.refno}.xml"
|
245
|
-
File.open(metadata_file, "wb") do |fd|
|
246
|
-
fd.write(stream.data)
|
247
|
-
end
|
248
|
-
nmeta += 1
|
239
|
+
puts "Extracted #{nmeta} metadata streams to '#{metadata_dir}'."
|
249
240
|
end
|
250
241
|
|
251
|
-
|
252
|
-
|
242
|
+
if @options[:images]
|
243
|
+
nimages = 0
|
244
|
+
image_dir = File.join(OUTPUT_DIR, "images")
|
245
|
+
Dir::mkdir(image_dir) unless File.directory?(image_dir)
|
253
246
|
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
pdf.root_objects.find_all{|obj| obj.is_a?(Graphics::ImageXObject)}.each do |stream|
|
259
|
-
begin
|
260
|
-
ext, image_data = stream.to_image_file
|
261
|
-
image_file = "#{OUTPUT_DIR}/images/image_#{stream.reference.refno}.#{ext}"
|
247
|
+
pdf.root_objects.find_all{|obj| obj.is_a?(Graphics::ImageXObject)}.each do |stream|
|
248
|
+
begin
|
249
|
+
ext, image_data = stream.to_image_file
|
250
|
+
image_file = File.join(image_dir, "image_#{stream.reference.refno}.#{ext}")
|
262
251
|
|
263
|
-
|
264
|
-
|
265
|
-
|
252
|
+
if ext != 'png' and stream.ColorSpace == Graphics::Color::Space::DEVICE_CMYK
|
253
|
+
STDERR.puts "Warning: file '#{image_file}' is intended to be viewed in CMYK color space."
|
254
|
+
end
|
266
255
|
|
267
|
-
|
268
|
-
|
256
|
+
File.binwrite(image_file, image_data)
|
257
|
+
nimages += 1
|
258
|
+
rescue
|
259
|
+
STDERR.puts "Unable to decode image (stream #{stream.reference.refno}). #{$!.message}"
|
260
|
+
end
|
269
261
|
end
|
270
|
-
nimages += 1
|
271
262
|
|
272
|
-
|
273
|
-
STDERR.puts "Unable to decode image (stream #{stream.reference.refno}). #{e.message}"
|
274
|
-
end
|
263
|
+
puts "Extracted #{nimages} images to '#{image_dir}'."
|
275
264
|
end
|
276
|
-
|
277
|
-
puts "Extracted #{nimages} images to '#{OUTPUT_DIR}/images'."
|
278
|
-
end
|
279
|
-
|
280
|
-
rescue SystemExit
|
281
|
-
rescue Exception => e
|
282
|
-
STDERR.puts "#{e.class}: #{e.message}"
|
283
|
-
exit 1
|
284
|
-
end
|
285
265
|
|
266
|
+
rescue
|
267
|
+
abort "#{$!.class}: #{$!.message}"
|
268
|
+
end
|