origami 1.2.7 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +66 -0
- data/README.md +112 -0
- data/bin/config/pdfcop.conf.yml +232 -233
- data/bin/gui/about.rb +27 -37
- data/bin/gui/config.rb +108 -117
- data/bin/gui/file.rb +416 -365
- data/bin/gui/gtkhex.rb +1138 -1153
- data/bin/gui/hexview.rb +55 -57
- data/bin/gui/imgview.rb +48 -51
- data/bin/gui/menu.rb +388 -386
- data/bin/gui/properties.rb +114 -130
- data/bin/gui/signing.rb +571 -617
- data/bin/gui/textview.rb +77 -95
- data/bin/gui/treeview.rb +382 -387
- data/bin/gui/walker.rb +227 -232
- data/bin/gui/xrefs.rb +56 -60
- data/bin/pdf2pdfa +53 -57
- data/bin/pdf2ruby +212 -228
- data/bin/pdfcop +338 -348
- data/bin/pdfdecompress +58 -65
- data/bin/pdfdecrypt +56 -60
- data/bin/pdfencrypt +75 -80
- data/bin/pdfexplode +185 -182
- data/bin/pdfextract +201 -218
- data/bin/pdfmetadata +83 -82
- data/bin/pdfsh +4 -5
- data/bin/pdfwalker +1 -2
- data/bin/shell/.irbrc +45 -82
- data/bin/shell/console.rb +105 -130
- data/bin/shell/hexdump.rb +40 -64
- data/examples/README.md +34 -0
- data/examples/attachments/attachment.rb +38 -0
- data/examples/attachments/nested_document.rb +51 -0
- data/examples/encryption/encryption.rb +28 -0
- data/{samples/actions/triggerevents/trigger.rb → examples/events/events.rb} +13 -16
- data/examples/flash/flash.rb +37 -0
- data/{samples → examples}/flash/helloworld.swf +0 -0
- data/examples/forms/javascript.rb +54 -0
- data/examples/forms/xfa.rb +115 -0
- data/examples/javascript/hello_world.rb +22 -0
- data/examples/javascript/js_emulation.rb +54 -0
- data/examples/loop/goto.rb +32 -0
- data/examples/loop/named.rb +33 -0
- data/examples/signature/signature.rb +65 -0
- data/examples/uri/javascript.rb +56 -0
- data/examples/uri/open-uri.rb +21 -0
- data/examples/uri/submitform.rb +47 -0
- data/lib/origami.rb +29 -42
- data/lib/origami/3d.rb +350 -225
- data/lib/origami/acroform.rb +262 -288
- data/lib/origami/actions.rb +268 -288
- data/lib/origami/annotations.rb +697 -722
- data/lib/origami/array.rb +258 -184
- data/lib/origami/boolean.rb +74 -84
- data/lib/origami/catalog.rb +397 -434
- data/lib/origami/collections.rb +144 -0
- data/lib/origami/destinations.rb +233 -194
- data/lib/origami/dictionary.rb +253 -232
- data/lib/origami/encryption.rb +1274 -1243
- data/lib/origami/export.rb +232 -268
- data/lib/origami/extensions/fdf.rb +307 -220
- data/lib/origami/extensions/ppklite.rb +368 -435
- data/lib/origami/filespec.rb +197 -0
- data/lib/origami/filters.rb +301 -295
- data/lib/origami/filters/ascii.rb +177 -180
- data/lib/origami/filters/ccitt.rb +528 -535
- data/lib/origami/filters/crypt.rb +26 -35
- data/lib/origami/filters/dct.rb +46 -52
- data/lib/origami/filters/flate.rb +95 -94
- data/lib/origami/filters/jbig2.rb +49 -55
- data/lib/origami/filters/jpx.rb +38 -44
- data/lib/origami/filters/lzw.rb +189 -183
- data/lib/origami/filters/predictors.rb +221 -235
- data/lib/origami/filters/runlength.rb +103 -104
- data/lib/origami/font.rb +173 -186
- data/lib/origami/functions.rb +67 -81
- data/lib/origami/graphics.rb +25 -21
- data/lib/origami/graphics/colors.rb +178 -187
- data/lib/origami/graphics/instruction.rb +79 -85
- data/lib/origami/graphics/path.rb +142 -148
- data/lib/origami/graphics/patterns.rb +160 -167
- data/lib/origami/graphics/render.rb +43 -50
- data/lib/origami/graphics/state.rb +138 -153
- data/lib/origami/graphics/text.rb +188 -205
- data/lib/origami/graphics/xobject.rb +819 -815
- data/lib/origami/header.rb +63 -78
- data/lib/origami/javascript.rb +596 -597
- data/lib/origami/linearization.rb +285 -290
- data/lib/origami/metadata.rb +139 -148
- data/lib/origami/name.rb +112 -148
- data/lib/origami/null.rb +53 -62
- data/lib/origami/numeric.rb +162 -175
- data/lib/origami/obfuscation.rb +186 -174
- data/lib/origami/object.rb +593 -573
- data/lib/origami/outline.rb +42 -47
- data/lib/origami/outputintents.rb +73 -82
- data/lib/origami/page.rb +703 -592
- data/lib/origami/parser.rb +238 -290
- data/lib/origami/parsers/fdf.rb +41 -33
- data/lib/origami/parsers/pdf.rb +75 -95
- data/lib/origami/parsers/pdf/lazy.rb +137 -0
- data/lib/origami/parsers/pdf/linear.rb +64 -66
- data/lib/origami/parsers/ppklite.rb +34 -70
- data/lib/origami/pdf.rb +1030 -1005
- data/lib/origami/reference.rb +102 -102
- data/lib/origami/signature.rb +591 -609
- data/lib/origami/stream.rb +668 -551
- data/lib/origami/string.rb +397 -373
- data/lib/origami/template/patterns.rb +56 -0
- data/lib/origami/template/widgets.rb +151 -0
- data/lib/origami/trailer.rb +144 -158
- data/lib/origami/tree.rb +62 -0
- data/lib/origami/version.rb +23 -0
- data/lib/origami/webcapture.rb +88 -79
- data/lib/origami/xfa.rb +2863 -2882
- data/lib/origami/xreftable.rb +472 -384
- data/test/dataset/calc.pdf +85 -0
- data/test/dataset/crypto.pdf +82 -0
- data/test/dataset/empty.pdf +49 -0
- data/test/test_actions.rb +27 -0
- data/test/test_annotations.rb +90 -0
- data/test/test_pages.rb +31 -0
- data/test/test_pdf.rb +16 -0
- data/test/test_pdf_attachment.rb +34 -0
- data/test/test_pdf_create.rb +24 -0
- data/test/test_pdf_encrypt.rb +95 -0
- data/test/test_pdf_parse.rb +96 -0
- data/test/test_pdf_sign.rb +58 -0
- data/test/test_streams.rb +182 -0
- data/test/test_xrefs.rb +67 -0
- metadata +88 -58
- data/README +0 -67
- data/bin/pdf2graph +0 -121
- data/bin/pdfcocoon +0 -104
- data/lib/origami/file.rb +0 -233
- data/samples/README.txt +0 -45
- data/samples/actions/launch/calc.rb +0 -87
- data/samples/actions/launch/winparams.rb +0 -22
- data/samples/actions/loop/loopgoto.rb +0 -24
- data/samples/actions/loop/loopnamed.rb +0 -21
- data/samples/actions/named/named.rb +0 -31
- data/samples/actions/samba/smbrelay.rb +0 -26
- data/samples/actions/webbug/submitform.js +0 -26
- data/samples/actions/webbug/webbug-browser.rb +0 -68
- data/samples/actions/webbug/webbug-js.rb +0 -67
- data/samples/actions/webbug/webbug-reader.rb +0 -90
- data/samples/attachments/attach.rb +0 -40
- data/samples/attachments/attached.txt +0 -1
- data/samples/crypto/crypto.rb +0 -28
- data/samples/digsig/signed.rb +0 -46
- data/samples/exploits/cve-2008-2992-utilprintf.rb +0 -87
- data/samples/exploits/cve-2009-0927-geticon.rb +0 -65
- data/samples/exploits/exploit_customdictopen.rb +0 -55
- data/samples/exploits/getannots.rb +0 -69
- data/samples/flash/flash.rb +0 -31
- data/samples/javascript/attached.txt +0 -1
- data/samples/javascript/js.rb +0 -52
- data/templates/patterns.rb +0 -66
- data/templates/widgets.rb +0 -173
- data/templates/xdp.rb +0 -92
- data/test/ts_pdf.rb +0 -50
data/bin/pdfexplode
CHANGED
|
@@ -1,34 +1,33 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
2
|
|
|
3
3
|
=begin
|
|
4
4
|
|
|
5
|
-
=
|
|
6
|
-
|
|
5
|
+
= Info
|
|
6
|
+
Explodes a PDF into separate documents.
|
|
7
7
|
|
|
8
|
-
=
|
|
9
|
-
|
|
10
|
-
= License:
|
|
11
|
-
Origami is free software: you can redistribute it and/or modify
|
|
12
|
-
it under the terms of the GNU Lesser General Public License as published by
|
|
13
|
-
the Free Software Foundation, either version 3 of the License, or
|
|
14
|
-
(at your option) any later version.
|
|
8
|
+
= License
|
|
9
|
+
Copyright (C) 2016 Guillaume Delugré.
|
|
15
10
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
11
|
+
Origami is free software: you can redistribute it and/or modify
|
|
12
|
+
it under the terms of the GNU Lesser General Public License as published by
|
|
13
|
+
the Free Software Foundation, either version 3 of the License, or
|
|
14
|
+
(at your option) any later version.
|
|
20
15
|
|
|
21
|
-
|
|
22
|
-
|
|
16
|
+
Origami is distributed in the hope that it will be useful,
|
|
17
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
18
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
19
|
+
GNU Lesser General Public License for more details.
|
|
20
|
+
|
|
21
|
+
You should have received a copy of the GNU Lesser General Public License
|
|
22
|
+
along with Origami. If not, see <http://www.gnu.org/licenses/>.
|
|
23
23
|
|
|
24
24
|
=end
|
|
25
25
|
|
|
26
26
|
begin
|
|
27
|
-
|
|
27
|
+
require 'origami'
|
|
28
28
|
rescue LoadError
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
require 'origami'
|
|
29
|
+
$: << File.join(__dir__, '../lib')
|
|
30
|
+
require 'origami'
|
|
32
31
|
end
|
|
33
32
|
include Origami
|
|
34
33
|
|
|
@@ -36,185 +35,189 @@ require 'optparse'
|
|
|
36
35
|
require 'rexml/document'
|
|
37
36
|
|
|
38
37
|
class OptParser
|
|
39
|
-
|
|
38
|
+
BANNER = <<USAGE
|
|
40
39
|
Usage: #{$0} <PDF-file> [-r <range>] [-t pages|rsrc] [-d <output-directory>]
|
|
41
40
|
Explodes a document into separate documents.
|
|
42
|
-
Bug reports or feature requests at: http://
|
|
41
|
+
Bug reports or feature requests at: http://github.com/gdelugre/origami
|
|
43
42
|
|
|
44
43
|
Options:
|
|
45
44
|
USAGE
|
|
46
45
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
46
|
+
def self.parser(options)
|
|
47
|
+
OptionParser.new do |opts|
|
|
48
|
+
opts.banner = BANNER
|
|
49
|
+
|
|
50
|
+
opts.on("-d", "--output-dir DIR", "Output directory.") do |d|
|
|
51
|
+
options[:output_dir] = d
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
opts.on("-r", "--range PAGES", "Page range (e.g: 2-, 1-3, 5). Default to '-'.") do |r|
|
|
55
|
+
range =
|
|
56
|
+
if r.index('-').nil?
|
|
57
|
+
page = r.to_i
|
|
58
|
+
Range.new(page-1, page-1)
|
|
59
|
+
else
|
|
60
|
+
from, to = r.split('-').map{|bound| bound.to_i}
|
|
61
|
+
from ||= 1
|
|
62
|
+
to ||= 0
|
|
63
|
+
Range.new(from-1, to-1)
|
|
64
|
+
end
|
|
65
|
+
options[:page_range] = range
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
opts.on("-t", "--type TYPE", "Split by type. Can be 'pages' or 'rsrc'. Default to 'pages'.") do |t|
|
|
69
|
+
options[:split_by] = t
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
opts.on_tail("-h", "--help", "Show this message.") do
|
|
73
|
+
puts opts
|
|
74
|
+
exit
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
77
|
end
|
|
78
|
-
end
|
|
79
78
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
79
|
+
def self.parse(args)
|
|
80
|
+
options =
|
|
81
|
+
{
|
|
82
|
+
page_range: (0..-1),
|
|
83
|
+
split_by: 'pages'
|
|
84
|
+
}
|
|
86
85
|
|
|
87
|
-
|
|
86
|
+
self.parser(options).parse!(args)
|
|
88
87
|
|
|
89
|
-
|
|
90
|
-
|
|
88
|
+
options
|
|
89
|
+
end
|
|
91
90
|
end
|
|
92
91
|
|
|
93
92
|
begin
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
93
|
+
@options = OptParser.parse(ARGV)
|
|
94
|
+
|
|
95
|
+
if ARGV.empty?
|
|
96
|
+
abort "Error: No filename was specified. #{$0} --help for details."
|
|
97
|
+
else
|
|
98
|
+
target = ARGV.shift
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
if @options[:output_dir].nil?
|
|
102
|
+
@options[:output_dir] = "#{File.join(File.dirname(target), File.basename(target,'.pdf'))}.explode"
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
Origami::OPTIONS[:ignore_bad_references] = true
|
|
106
|
+
OUTPUT_DIR = @options[:output_dir]
|
|
107
|
+
Dir::mkdir(OUTPUT_DIR) unless File.directory?(OUTPUT_DIR)
|
|
108
|
+
|
|
109
|
+
def split_by_rsrc(n, page, type)
|
|
110
|
+
all_rsrc = page.resources
|
|
111
|
+
type_rsrc = page.resources(type)
|
|
112
|
+
other_rsrc = all_rsrc.keys - type_rsrc.keys
|
|
113
|
+
|
|
114
|
+
unless type_rsrc.empty?
|
|
115
|
+
# Keep only specified resource type.
|
|
116
|
+
output_file = File.join(OUTPUT_DIR, "page_#{n}_keeponly_#{type}.pdf")
|
|
117
|
+
PDF.write(output_file) do |pdf|
|
|
118
|
+
reduced = page.copy
|
|
119
|
+
# New resource dictionary with only matching resources.
|
|
120
|
+
reduced.Resources = Resources.new(type => type_rsrc)
|
|
121
|
+
# Remove mention of other resources.
|
|
122
|
+
reduced.each_content_stream do |stream|
|
|
123
|
+
stream.data = stream.data.lines.
|
|
124
|
+
delete_if {|line| other_rsrc.any?{|rsrc| line =~ /#{rsrc}/}}.join
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
STDERR.puts "Creating #{output_file}..."
|
|
128
|
+
pdf.append_page(reduced)
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
# Remove all specified resource type.
|
|
132
|
+
output_file = File.join(OUTPUT_DIR, "page_#{n}_excluded_#{type}.pdf")
|
|
133
|
+
PDF.write(output_file) do |pdf|
|
|
134
|
+
reduced = page.copy
|
|
135
|
+
# New resource dictionary with no resource of specified type.
|
|
136
|
+
reduced.Resources = reduced.Resources.copy
|
|
137
|
+
reduced.Resources.delete(type)
|
|
138
|
+
# Remove mention of this resource type.
|
|
139
|
+
reduced.each_content_stream do |stream|
|
|
140
|
+
stream.data = stream.data.lines.
|
|
141
|
+
delete_if {|line| type_rsrc.keys.any?{|rsrc| line =~ /#{rsrc}/}}.join
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
STDERR.puts "Creating #{output_file}..."
|
|
145
|
+
pdf.append_page(reduced)
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
# Now treating each resource object separately.
|
|
149
|
+
type_rsrc.each_pair do |name, rsrc|
|
|
150
|
+
anyother_rsrc = all_rsrc.keys - [ name ]
|
|
151
|
+
# Keep only specified resource object.
|
|
152
|
+
output_file = File.join(OUTPUT_DIR, "page_#{n}_keeponly_#{type}_#{name}.pdf")
|
|
153
|
+
PDF.write(output_file) do |pdf|
|
|
154
|
+
reduced = page.copy
|
|
155
|
+
# New resource dictionary with only specified resource object.
|
|
156
|
+
reduced.Resources = Resources.new(type => {name => rsrc})
|
|
157
|
+
# Remove mention of all other resources.
|
|
158
|
+
reduced.each_content_stream do |stream|
|
|
159
|
+
stream.data = stream.data.lines.
|
|
160
|
+
delete_if {|line| anyother_rsrc.any?{|rsrc| line =~ /#{rsrc}/}}.join
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
STDERR.puts "Creating #{output_file}..."
|
|
164
|
+
pdf.append_page(reduced)
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
# Remove only specified resource object.
|
|
168
|
+
output_file = File.join(OUTPUT_DIR, "page_#{n}_excluded_#{type}_#{name}.pdf")
|
|
169
|
+
PDF.write(output_file) do |pdf|
|
|
170
|
+
reduced = page.copy
|
|
171
|
+
# New resource dictionary without the specified resource object.
|
|
172
|
+
reduced.Resources = reduced.Resources.copy
|
|
173
|
+
reduced.Resources[type] = reduced.Resources.send(type).copy
|
|
174
|
+
reduced.Resources[type].delete(name)
|
|
175
|
+
# Remove mention of this resource only.
|
|
176
|
+
reduced.each_content_stream do |stream|
|
|
177
|
+
stream.data = stream.data.lines.
|
|
178
|
+
delete_if {|line| line =~ /#{name}/}.join
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
STDERR.puts "Creating #{output_file}..."
|
|
182
|
+
pdf.append_page(reduced)
|
|
183
|
+
end
|
|
184
|
+
end
|
|
177
185
|
end
|
|
178
|
-
end
|
|
179
186
|
end
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
187
|
+
|
|
188
|
+
params =
|
|
189
|
+
{
|
|
190
|
+
verbosity: Parser::VERBOSE_QUIET,
|
|
191
|
+
}
|
|
192
|
+
pdf = PDF.read(target, params)
|
|
193
|
+
|
|
194
|
+
i = @options[:page_range].first + 1
|
|
195
|
+
pdf.pages[@options[:page_range]].each do |page|
|
|
196
|
+
case @options[:split_by]
|
|
197
|
+
when 'pages'
|
|
198
|
+
output_file = File.join(OUTPUT_DIR, "page_#{i}.pdf")
|
|
199
|
+
PDF.write(output_file) do |pdf|
|
|
200
|
+
STDERR.puts "Creating #{output_file}..."
|
|
201
|
+
pdf.append_page(page)
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
when 'rsrc'
|
|
205
|
+
[ Resources::EXTGSTATE,
|
|
206
|
+
Resources::COLORSPACE,
|
|
207
|
+
Resources::PATTERN,
|
|
208
|
+
Resources::SHADING,
|
|
209
|
+
Resources::XOBJECT,
|
|
210
|
+
Resources::FONT,
|
|
211
|
+
Resources::PROPERTIES
|
|
212
|
+
].each { |type| split_by_rsrc(i, page, type) }
|
|
213
|
+
|
|
214
|
+
else
|
|
215
|
+
raise ArgumentError, "Unknown split option: #{@options[:split_by]}"
|
|
196
216
|
end
|
|
197
217
|
|
|
198
|
-
|
|
199
|
-
[ Resources::EXTGSTATE,
|
|
200
|
-
Resources::COLORSPACE,
|
|
201
|
-
Resources::PATTERN,
|
|
202
|
-
Resources::SHADING,
|
|
203
|
-
Resources::XOBJECT,
|
|
204
|
-
Resources::FONT,
|
|
205
|
-
Resources::PROPERTIES
|
|
206
|
-
].each { |type| split_by_rsrc(i, page, type) }
|
|
207
|
-
|
|
208
|
-
else
|
|
209
|
-
raise ArgumentError, "Unknown split option: #{@options[:split_by]}"
|
|
218
|
+
i += 1
|
|
210
219
|
end
|
|
211
220
|
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
rescue SystemExit
|
|
216
|
-
rescue Exception => e
|
|
217
|
-
STDERR.puts "#{e.class}: #{e.message} #{e.backtrace}"
|
|
218
|
-
exit 1
|
|
221
|
+
rescue
|
|
222
|
+
abort "#{$!.class}: #{$!.message} #{$!.backtrace.join($/)}"
|
|
219
223
|
end
|
|
220
|
-
|
data/bin/pdfextract
CHANGED
|
@@ -1,38 +1,36 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
2
|
|
|
3
3
|
=begin
|
|
4
4
|
|
|
5
|
-
=
|
|
6
|
-
|
|
5
|
+
= Info
|
|
6
|
+
Extracts valuable data from a PDF document. Can extract:
|
|
7
|
+
- decoded streams
|
|
8
|
+
- JavaScript
|
|
9
|
+
- file attachments
|
|
7
10
|
|
|
8
|
-
=
|
|
9
|
-
|
|
10
|
-
- decoded streams
|
|
11
|
-
- JavaScript
|
|
12
|
-
- file attachments
|
|
11
|
+
= License
|
|
12
|
+
Copyright (C) 2016 Guillaume Delugré.
|
|
13
13
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
(at your option) any later version.
|
|
14
|
+
Origami is free software: you can redistribute it and/or modify
|
|
15
|
+
it under the terms of the GNU Lesser General Public License as published by
|
|
16
|
+
the Free Software Foundation, either version 3 of the License, or
|
|
17
|
+
(at your option) any later version.
|
|
19
18
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
19
|
+
Origami is distributed in the hope that it will be useful,
|
|
20
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
21
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
22
|
+
GNU Lesser General Public License for more details.
|
|
24
23
|
|
|
25
|
-
|
|
26
|
-
|
|
24
|
+
You should have received a copy of the GNU Lesser General Public License
|
|
25
|
+
along with Origami. If not, see <http://www.gnu.org/licenses/>.
|
|
27
26
|
|
|
28
27
|
=end
|
|
29
28
|
|
|
30
29
|
begin
|
|
31
|
-
|
|
30
|
+
require 'origami'
|
|
32
31
|
rescue LoadError
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
require 'origami'
|
|
32
|
+
$: << File.join(__dir__, '../lib')
|
|
33
|
+
require 'origami'
|
|
36
34
|
end
|
|
37
35
|
include Origami
|
|
38
36
|
|
|
@@ -40,246 +38,231 @@ require 'optparse'
|
|
|
40
38
|
require 'rexml/document'
|
|
41
39
|
|
|
42
40
|
class OptParser
|
|
43
|
-
|
|
41
|
+
BANNER = <<USAGE
|
|
44
42
|
Usage: #{$0} <PDF-file> [-afjms] [-d <output-directory>]
|
|
45
43
|
Extracts various data out of a document (streams, scripts, images, fonts, metadata, attachments).
|
|
46
|
-
Bug reports or feature requests at: http://
|
|
44
|
+
Bug reports or feature requests at: http://github.com/gdelugre/origami
|
|
47
45
|
|
|
48
46
|
Options:
|
|
49
47
|
USAGE
|
|
50
48
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
49
|
+
def self.parser(options)
|
|
50
|
+
OptionParser.new do |opts|
|
|
51
|
+
opts.banner = BANNER
|
|
54
52
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
53
|
+
opts.on("-d", "--output-dir DIR", "Output directory") do |d|
|
|
54
|
+
options[:output_dir] = d
|
|
55
|
+
end
|
|
58
56
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
57
|
+
opts.on("-s", "--streams", "Extracts all decoded streams") do
|
|
58
|
+
options[:streams] = true
|
|
59
|
+
end
|
|
62
60
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
61
|
+
opts.on("-a", "--attachments", "Extracts file attachments") do
|
|
62
|
+
options[:attachments] = true
|
|
63
|
+
end
|
|
66
64
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
65
|
+
opts.on("-f", "--fonts", "Extracts embedded font files") do
|
|
66
|
+
options[:fonts] = true
|
|
67
|
+
end
|
|
70
68
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
69
|
+
opts.on("-j", "--js", "Extracts JavaScript scripts") do
|
|
70
|
+
options[:javascript] = true
|
|
71
|
+
end
|
|
74
72
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
73
|
+
opts.on("-m", "--metadata", "Extracts metadata streams") do
|
|
74
|
+
options[:metadata] = true
|
|
75
|
+
end
|
|
78
76
|
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
77
|
+
opts.on("-i", "--images", "Extracts embedded images") do
|
|
78
|
+
options[:images] = true
|
|
79
|
+
end
|
|
82
80
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
81
|
+
opts.on_tail("-h", "--help", "Show this message") do
|
|
82
|
+
puts opts
|
|
83
|
+
exit
|
|
84
|
+
end
|
|
85
|
+
end
|
|
87
86
|
end
|
|
88
|
-
end
|
|
89
87
|
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
{
|
|
93
|
-
}
|
|
88
|
+
def self.parse(args)
|
|
89
|
+
options = {}
|
|
94
90
|
|
|
95
|
-
|
|
91
|
+
self.parser(options).parse!(args)
|
|
96
92
|
|
|
97
|
-
|
|
98
|
-
|
|
93
|
+
options
|
|
94
|
+
end
|
|
99
95
|
end
|
|
100
96
|
|
|
101
97
|
begin
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
target = ARGV.shift
|
|
109
|
-
end
|
|
110
|
-
|
|
111
|
-
unless [:streams,:javascript,:attachments,:fonts,:metadata,:images].any? {|opt| @options[opt]}
|
|
112
|
-
@options[:streams] =
|
|
113
|
-
@options[:javascript] =
|
|
114
|
-
@options[:fonts] =
|
|
115
|
-
@options[:attachments] =
|
|
116
|
-
@options[:images] = true
|
|
117
|
-
end
|
|
118
|
-
|
|
119
|
-
if @options[:output_dir].nil?
|
|
120
|
-
@options[:output_dir] = "#{File.basename(target, '.pdf')}.dump"
|
|
121
|
-
end
|
|
122
|
-
|
|
123
|
-
# Force data extraction, even for invalid FlateDecode streams.
|
|
124
|
-
Origami::OPTIONS[:ignore_zlib_errors] = true
|
|
125
|
-
|
|
126
|
-
OUTPUT_DIR = @options[:output_dir]
|
|
127
|
-
Dir::mkdir(OUTPUT_DIR) unless File.directory?(OUTPUT_DIR)
|
|
128
|
-
|
|
129
|
-
params =
|
|
130
|
-
{
|
|
131
|
-
:verbosity => Parser::VERBOSE_QUIET,
|
|
132
|
-
}
|
|
133
|
-
pdf = PDF.read(target, params)
|
|
134
|
-
|
|
135
|
-
if @options[:streams]
|
|
136
|
-
nstreams = 0
|
|
137
|
-
Dir::mkdir("#{OUTPUT_DIR}/streams") unless File.directory?("#{OUTPUT_DIR}/streams")
|
|
138
|
-
|
|
139
|
-
pdf.root_objects.find_all{|obj| obj.is_a?(Stream)}.each do |stream|
|
|
140
|
-
stream_file = "#{OUTPUT_DIR}/streams/stream_#{stream.reference.refno}.dmp"
|
|
141
|
-
File.open(stream_file, "wb") do |fd|
|
|
142
|
-
fd.write(stream.data)
|
|
143
|
-
end
|
|
144
|
-
nstreams += 1
|
|
98
|
+
@options = OptParser.parse(ARGV)
|
|
99
|
+
|
|
100
|
+
if ARGV.empty?
|
|
101
|
+
abort "Error: No filename was specified. #{$0} --help for details."
|
|
102
|
+
else
|
|
103
|
+
target = ARGV.shift
|
|
145
104
|
end
|
|
146
105
|
|
|
147
|
-
|
|
148
|
-
|
|
106
|
+
unless %i[streams javascript attachments fonts metadata images].any? {|opt| @options[opt]}
|
|
107
|
+
@options[:streams] =
|
|
108
|
+
@options[:javascript] =
|
|
109
|
+
@options[:fonts] =
|
|
110
|
+
@options[:attachments] =
|
|
111
|
+
@options[:images] = true
|
|
112
|
+
end
|
|
149
113
|
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
114
|
+
if @options[:output_dir].nil?
|
|
115
|
+
@options[:output_dir] = "#{File.basename(target, '.pdf')}.dump"
|
|
116
|
+
end
|
|
153
117
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
118
|
+
# Force data extraction, even for invalid FlateDecode streams.
|
|
119
|
+
Origami::OPTIONS[:ignore_zlib_errors] = true
|
|
120
|
+
Origami::OPTIONS[:ignore_png_errors] = true
|
|
121
|
+
|
|
122
|
+
OUTPUT_DIR = @options[:output_dir]
|
|
123
|
+
Dir::mkdir(OUTPUT_DIR) unless File.directory?(OUTPUT_DIR)
|
|
124
|
+
|
|
125
|
+
params =
|
|
126
|
+
{
|
|
127
|
+
verbosity: Parser::VERBOSE_QUIET,
|
|
128
|
+
}
|
|
129
|
+
pdf = PDF.read(target, params)
|
|
130
|
+
|
|
131
|
+
if @options[:streams]
|
|
132
|
+
nstreams = 0
|
|
133
|
+
stream_dir = File.join(OUTPUT_DIR, "streams")
|
|
134
|
+
Dir::mkdir(stream_dir) unless File.directory?(stream_dir)
|
|
135
|
+
|
|
136
|
+
pdf.root_objects.find_all{|obj| obj.is_a?(Stream)}.each do |stream|
|
|
137
|
+
stream_file = File.join(stream_dir, "stream_#{stream.reference.refno}.dmp")
|
|
138
|
+
File.binwrite(stream_file, stream.data)
|
|
139
|
+
nstreams += 1
|
|
140
|
+
end
|
|
141
|
+
|
|
142
|
+
puts "Extracted #{nstreams} PDF streams to '#{stream_dir}'."
|
|
167
143
|
end
|
|
168
144
|
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
145
|
+
if @options[:javascript]
|
|
146
|
+
nscripts = 0
|
|
147
|
+
js_dir = File.join(OUTPUT_DIR, "scripts")
|
|
148
|
+
Dir::mkdir(js_dir) unless File.directory?(js_dir)
|
|
149
|
+
|
|
150
|
+
pdf.ls(/^JS$/).each do |script|
|
|
151
|
+
script_file = File.join(js_dir, "script_#{script.hash}.js")
|
|
152
|
+
script_data =
|
|
153
|
+
case script
|
|
154
|
+
when Stream then script.data
|
|
155
|
+
else script.value
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
File.binwrite(script_file, script_data)
|
|
159
|
+
nscripts += 1
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
# Also checking for presence of JavaScript in XML forms.
|
|
163
|
+
if pdf.form? and pdf.Catalog.AcroForm.has_key?(:XFA)
|
|
164
|
+
xfa = pdf.Catalog.AcroForm[:XFA].solve
|
|
165
|
+
|
|
166
|
+
case xfa
|
|
167
|
+
when Array then
|
|
168
|
+
xml = ""
|
|
169
|
+
i = 0
|
|
170
|
+
xfa.each do |packet|
|
|
171
|
+
if i % 2 == 1
|
|
172
|
+
xml << packet.solve.data
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
i = i + 1
|
|
176
|
+
end
|
|
177
|
+
when Stream then
|
|
178
|
+
xml = xfa.data
|
|
179
|
+
else
|
|
180
|
+
reject("Malformed XFA dictionary")
|
|
180
181
|
end
|
|
181
182
|
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
end
|
|
189
|
-
|
|
190
|
-
xfadoc = REXML::Document.new(xml)
|
|
191
|
-
REXML::XPath.match(xfadoc, "//script").each do |script|
|
|
192
|
-
script_file = "#{OUTPUT_DIR}/script_#{script.hash}.js"
|
|
193
|
-
File.open(script_file, "wb") do |fd|
|
|
194
|
-
fd.write(script.text)
|
|
183
|
+
xfadoc = REXML::Document.new(xml)
|
|
184
|
+
REXML::XPath.match(xfadoc, "//script").each do |script|
|
|
185
|
+
script_file = File.join(js_dir, "script_#{script.hash}.js")
|
|
186
|
+
File.binwrite(script_file, script.text)
|
|
187
|
+
nscripts += 1
|
|
188
|
+
end
|
|
195
189
|
end
|
|
196
|
-
|
|
197
|
-
|
|
190
|
+
|
|
191
|
+
puts "Extracted #{nscripts} scripts to '#{js_dir}'."
|
|
198
192
|
end
|
|
199
193
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
fd.write(f.data)
|
|
194
|
+
if @options[:attachments]
|
|
195
|
+
nattach = 0
|
|
196
|
+
attachments_dir = File.join(OUTPUT_DIR, "attachments")
|
|
197
|
+
Dir::mkdir(attachments_dir) unless File.directory?(attachments_dir)
|
|
198
|
+
|
|
199
|
+
pdf.each_attachment do |name, attachment|
|
|
200
|
+
attached_file = File.join(attachments_dir, "attached_#{File.basename(name)}")
|
|
201
|
+
spec = attachment.solve
|
|
202
|
+
if spec and spec.EF and f = spec.EF.F and f.is_a?(Stream)
|
|
203
|
+
File.binwrite(attached_file, f.data)
|
|
204
|
+
nattach += 1
|
|
205
|
+
end
|
|
213
206
|
end
|
|
214
|
-
|
|
215
|
-
|
|
207
|
+
|
|
208
|
+
puts "Extracted #{nattach} attachments to '#{attachments_dir}'."
|
|
216
209
|
end
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
fd.write(stream.data)
|
|
210
|
+
|
|
211
|
+
if @options[:fonts]
|
|
212
|
+
nfonts = 0
|
|
213
|
+
fonts_dir = File.join(OUTPUT_DIR, "fonts")
|
|
214
|
+
Dir::mkdir(fonts_dir) unless File.directory?(fonts_dir)
|
|
215
|
+
|
|
216
|
+
pdf.root_objects.find_all{|obj| obj.is_a?(Stream)}.each do |stream|
|
|
217
|
+
font = stream.xrefs.find{|obj| obj.is_a?(FontDescriptor)}
|
|
218
|
+
if font
|
|
219
|
+
font_file = File.join(fonts_dir, File.basename(font.FontName.value.to_s))
|
|
220
|
+
File.binwrite(font_file, stream.data)
|
|
221
|
+
nfonts += 1
|
|
222
|
+
end
|
|
231
223
|
end
|
|
232
|
-
|
|
233
|
-
|
|
224
|
+
|
|
225
|
+
puts "Extracted #{nfonts} fonts to '#{fonts_dir}'."
|
|
234
226
|
end
|
|
235
227
|
|
|
236
|
-
|
|
237
|
-
|
|
228
|
+
if @options[:metadata]
|
|
229
|
+
nmeta = 0
|
|
230
|
+
metadata_dir = File.join(OUTPUT_DIR, "metadata")
|
|
231
|
+
Dir::mkdir(metadata_dir) unless File.directory?(metadata_dir)
|
|
238
232
|
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
233
|
+
pdf.root_objects.find_all{|obj| obj.is_a?(MetadataStream)}.each do |stream|
|
|
234
|
+
metadata_file = File.join(metadata_dir, "metadata_#{stream.reference.refno}.xml")
|
|
235
|
+
File.binwrite(metadata_file, stream.data)
|
|
236
|
+
nmeta += 1
|
|
237
|
+
end
|
|
242
238
|
|
|
243
|
-
|
|
244
|
-
metadata_file = "#{OUTPUT_DIR}/metadata/metadata_#{stream.reference.refno}.xml"
|
|
245
|
-
File.open(metadata_file, "wb") do |fd|
|
|
246
|
-
fd.write(stream.data)
|
|
247
|
-
end
|
|
248
|
-
nmeta += 1
|
|
239
|
+
puts "Extracted #{nmeta} metadata streams to '#{metadata_dir}'."
|
|
249
240
|
end
|
|
250
241
|
|
|
251
|
-
|
|
252
|
-
|
|
242
|
+
if @options[:images]
|
|
243
|
+
nimages = 0
|
|
244
|
+
image_dir = File.join(OUTPUT_DIR, "images")
|
|
245
|
+
Dir::mkdir(image_dir) unless File.directory?(image_dir)
|
|
253
246
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
pdf.root_objects.find_all{|obj| obj.is_a?(Graphics::ImageXObject)}.each do |stream|
|
|
259
|
-
begin
|
|
260
|
-
ext, image_data = stream.to_image_file
|
|
261
|
-
image_file = "#{OUTPUT_DIR}/images/image_#{stream.reference.refno}.#{ext}"
|
|
247
|
+
pdf.root_objects.find_all{|obj| obj.is_a?(Graphics::ImageXObject)}.each do |stream|
|
|
248
|
+
begin
|
|
249
|
+
ext, image_data = stream.to_image_file
|
|
250
|
+
image_file = File.join(image_dir, "image_#{stream.reference.refno}.#{ext}")
|
|
262
251
|
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
252
|
+
if ext != 'png' and stream.ColorSpace == Graphics::Color::Space::DEVICE_CMYK
|
|
253
|
+
STDERR.puts "Warning: file '#{image_file}' is intended to be viewed in CMYK color space."
|
|
254
|
+
end
|
|
266
255
|
|
|
267
|
-
|
|
268
|
-
|
|
256
|
+
File.binwrite(image_file, image_data)
|
|
257
|
+
nimages += 1
|
|
258
|
+
rescue
|
|
259
|
+
STDERR.puts "Unable to decode image (stream #{stream.reference.refno}). #{$!.message}"
|
|
260
|
+
end
|
|
269
261
|
end
|
|
270
|
-
nimages += 1
|
|
271
262
|
|
|
272
|
-
|
|
273
|
-
STDERR.puts "Unable to decode image (stream #{stream.reference.refno}). #{e.message}"
|
|
274
|
-
end
|
|
263
|
+
puts "Extracted #{nimages} images to '#{image_dir}'."
|
|
275
264
|
end
|
|
276
|
-
|
|
277
|
-
puts "Extracted #{nimages} images to '#{OUTPUT_DIR}/images'."
|
|
278
|
-
end
|
|
279
|
-
|
|
280
|
-
rescue SystemExit
|
|
281
|
-
rescue Exception => e
|
|
282
|
-
STDERR.puts "#{e.class}: #{e.message}"
|
|
283
|
-
exit 1
|
|
284
|
-
end
|
|
285
265
|
|
|
266
|
+
rescue
|
|
267
|
+
abort "#{$!.class}: #{$!.message}"
|
|
268
|
+
end
|