metaclean 1.0.2 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +134 -47
- data/bin/metaclean +1 -22
- data/lib/metaclean/cli.rb +42 -92
- data/lib/metaclean/display.rb +59 -40
- data/lib/metaclean/exiftool.rb +70 -89
- data/lib/metaclean/ffmpeg.rb +84 -0
- data/lib/metaclean/mat2.rb +43 -40
- data/lib/metaclean/qpdf.rb +29 -25
- data/lib/metaclean/runner.rb +317 -168
- data/lib/metaclean/strategy.rb +118 -39
- data/lib/metaclean/version.rb +1 -3
- data/lib/metaclean.rb +75 -26
- metadata +11 -8
data/lib/metaclean/display.rb
CHANGED
|
@@ -1,16 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
#
|
|
4
|
-
#
|
|
5
|
-
# tables, the before/after diff. Keeping presentation in one module means
|
|
6
|
-
# the rest of the codebase stays focused on logic.
|
|
7
|
-
#
|
|
8
|
-
# ANSI escape sequences:
|
|
9
|
-
# "\e[31m" turns the terminal text red.
|
|
10
|
-
# "\e[0m" resets all styling.
|
|
11
|
-
# A modern terminal interprets these; if you redirect to a file, they show
|
|
12
|
-
# up as garbage — that's why we check `tty?` before emitting them.
|
|
13
|
-
# ───────────────────────────────────────────────────────────────────────────
|
|
3
|
+
# All terminal output lives here: ANSI colors, headers, tables, the
|
|
4
|
+
# before/after diff. Colors are gated on `tty?` (see color?).
|
|
14
5
|
|
|
15
6
|
module Metaclean
|
|
16
7
|
module Display
|
|
@@ -21,7 +12,6 @@ module Metaclean
|
|
|
21
12
|
red: "\e[31m",
|
|
22
13
|
green: "\e[32m",
|
|
23
14
|
yellow: "\e[33m",
|
|
24
|
-
blue: "\e[34m",
|
|
25
15
|
magenta: "\e[35m",
|
|
26
16
|
cyan: "\e[36m",
|
|
27
17
|
gray: "\e[90m"
|
|
@@ -33,35 +23,51 @@ module Metaclean
|
|
|
33
23
|
# Excluding these makes the diff focus on what actually got stripped.
|
|
34
24
|
NON_METADATA_GROUPS = %w[System File ExifTool Composite].freeze
|
|
35
25
|
|
|
26
|
+
# ASCII wordmark shown at the top of --help / --version. Printed by `banner`
|
|
27
|
+
# (see there for why it's colored line-by-line).
|
|
28
|
+
LOGO = <<~ART
|
|
29
|
+
███╗ ███╗███████╗████████╗ █████╗ ██████╗██╗ ███████╗ █████╗ ███╗ ██╗
|
|
30
|
+
████╗ ████║██╔════╝╚══██╔══╝██╔══██╗██╔════╝██║ ██╔════╝██╔══██╗████╗ ██║
|
|
31
|
+
██╔████╔██║█████╗ ██║ ███████║██║ ██║ █████╗ ███████║██╔██╗ ██║
|
|
32
|
+
██║╚██╔╝██║██╔══╝ ██║ ██╔══██║██║ ██║ ██╔══╝ ██╔══██║██║╚██╗██║
|
|
33
|
+
██║ ╚═╝ ██║███████╗ ██║ ██║ ██║╚██████╗███████╗███████╗██║ ██║██║ ╚████║
|
|
34
|
+
╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═╝ ╚═╝ ╚═════╝╚══════╝╚══════╝╚═╝ ╚═╝╚═╝ ╚═══╝
|
|
35
|
+
ART
|
|
36
|
+
|
|
36
37
|
module_function
|
|
37
38
|
|
|
38
39
|
# Decides whether to emit ANSI color codes. Colors are wrong when:
|
|
39
40
|
# * stdout is a pipe/file (not a terminal) — `tty?` is false there
|
|
40
41
|
# * NO_COLOR env var is set (de-facto convention, see no-color.org)
|
|
41
|
-
# * we're on classic Windows cmd.exe (modern Windows Terminal is fine,
|
|
42
|
-
# but to be safe we require an explicit FORCE_COLOR opt-in there)
|
|
43
42
|
def color?
|
|
44
43
|
return @color if defined?(@color)
|
|
45
44
|
|
|
46
45
|
# Per https://no-color.org: disable only when NO_COLOR is set to a
|
|
47
46
|
# non-empty value. An unset or empty NO_COLOR leaves colors on.
|
|
48
47
|
no_color = ENV['NO_COLOR'].to_s
|
|
49
|
-
@color = $stdout.tty? && no_color.empty?
|
|
48
|
+
@color = $stdout.tty? && no_color.empty?
|
|
50
49
|
@color = true if ENV['FORCE_COLOR']
|
|
51
50
|
@color
|
|
52
51
|
end
|
|
53
52
|
|
|
54
|
-
#
|
|
55
|
-
# plain if colors are disabled. The reset code at the end stops the
|
|
56
|
-
# color from bleeding into following output.
|
|
53
|
+
# Wrap text in a color, or pass it through plain when colors are off.
|
|
57
54
|
def c(text, color)
|
|
58
|
-
|
|
55
|
+
text = printable(text)
|
|
56
|
+
return text unless color?
|
|
59
57
|
|
|
60
58
|
"#{COLORS[color]}#{text}#{COLORS[:reset]}"
|
|
61
59
|
end
|
|
62
60
|
|
|
63
|
-
#
|
|
64
|
-
#
|
|
61
|
+
# Red ASCII wordmark (matches Ruby's brand color) + one-line tagline for
|
|
62
|
+
# --help / --version. Colored line-by-line on purpose: `c` runs text through
|
|
63
|
+
# `printable`, which turns control chars (including the heredoc's newlines)
|
|
64
|
+
# into spaces — so coloring the whole block at once would collapse the logo
|
|
65
|
+
# onto one line.
|
|
66
|
+
def banner
|
|
67
|
+
LOGO.each_line { |line| puts c(line.chomp, :red) }
|
|
68
|
+
puts c(' strip EXIF · IPTC · XMP · GPS · ID3 — leave the file clean', :gray)
|
|
69
|
+
end
|
|
70
|
+
|
|
65
71
|
def header(text)
|
|
66
72
|
puts
|
|
67
73
|
puts c('━' * 64, :gray)
|
|
@@ -82,23 +88,19 @@ module Metaclean
|
|
|
82
88
|
# `only_embedded:` filters out the System/File/etc. noise.
|
|
83
89
|
def metadata_table(meta, only_embedded: false)
|
|
84
90
|
rows = meta.reject { |k, _| k == 'SourceFile' }
|
|
85
|
-
rows = rows.
|
|
91
|
+
rows = rows.select { |k, _| embedded_key?(k) } if only_embedded
|
|
86
92
|
|
|
87
93
|
if rows.empty?
|
|
88
94
|
info(only_embedded ? '(no embedded metadata)' : '(no metadata)')
|
|
89
95
|
return
|
|
90
96
|
end
|
|
91
97
|
|
|
92
|
-
#
|
|
93
|
-
# result. Here we group all "GPS:*" tags together, all "EXIF:*" together,
|
|
94
|
-
# etc., then print each group as a labeled sub-table.
|
|
98
|
+
# Group "GPS:*", "EXIF:*", … each into its own labeled sub-table.
|
|
95
99
|
grouped = rows.group_by { |k, _| group_of(k) }
|
|
96
100
|
grouped.sort_by { |g, _| g.to_s }.each do |group, pairs|
|
|
97
101
|
puts c(" [#{group}]", :magenta)
|
|
98
102
|
pairs.sort_by { |k, _| k.to_s }.each do |k, v|
|
|
99
103
|
tag = k.to_s.split(':', 2).last
|
|
100
|
-
# `format` (alias of sprintf) does column alignment: %-38s = left-
|
|
101
|
-
# aligned, padded to 38 chars.
|
|
102
104
|
line = format(' %-38s %s', truncate(tag, 38), truncate(format_value(v), 60))
|
|
103
105
|
puts c(line, :dim)
|
|
104
106
|
end
|
|
@@ -109,16 +111,12 @@ module Metaclean
|
|
|
109
111
|
# sections: removed, changed, still-present. This is the "before/after"
|
|
110
112
|
# the user asked for.
|
|
111
113
|
def diff(before, after)
|
|
112
|
-
keys = (before.keys + after.keys).uniq
|
|
113
|
-
.reject { |k| k == 'SourceFile' }
|
|
114
|
-
.reject { |k| NON_METADATA_GROUPS.include?(group_of(k)) }
|
|
114
|
+
keys = (before.keys + after.keys).uniq.select { |k| embedded_key?(k) }
|
|
115
115
|
|
|
116
116
|
removed = []
|
|
117
117
|
changed = []
|
|
118
118
|
kept = []
|
|
119
119
|
|
|
120
|
-
# Classifying each key into one of three buckets keeps the rest of
|
|
121
|
-
# the method simple and testable.
|
|
122
120
|
keys.sort.each do |k|
|
|
123
121
|
b = before[k]
|
|
124
122
|
a = after[k]
|
|
@@ -161,22 +159,46 @@ module Metaclean
|
|
|
161
159
|
end
|
|
162
160
|
end
|
|
163
161
|
|
|
164
|
-
#
|
|
165
|
-
# the result at 2 elements, so a value containing ":" doesn't break it.
|
|
162
|
+
# Group name out of a "Group:Tag" key: the text before the first ":" (split is capped at 2 parts, so any later ":" is left alone).
|
|
166
163
|
def group_of(key)
|
|
167
164
|
key.to_s.split(':', 2).first.to_s
|
|
168
165
|
end
|
|
169
166
|
|
|
167
|
+
# True when `key` names real embedded metadata: not the SourceFile
|
|
168
|
+
# bookkeeping key, and not one of the System/File/ExifTool/Composite
|
|
169
|
+
# groups that describe the file rather than its embedded tags. Single
|
|
170
|
+
# source of truth for "is this a tag we actually stripped?" — shared by
|
|
171
|
+
# the table, diff, count, removed-count and privacy-residual checks.
|
|
172
|
+
def embedded_key?(key)
|
|
173
|
+
key != 'SourceFile' && !NON_METADATA_GROUPS.include?(group_of(key))
|
|
174
|
+
end
|
|
175
|
+
|
|
170
176
|
# Make any value safe to print on a single line. Hashes/Arrays get
|
|
171
177
|
# `inspect` (shows their structure); whitespace runs in strings collapse to single
|
|
172
178
|
# spaces so a multiline tag value doesn't wreck the table.
|
|
173
179
|
def format_value(v)
|
|
174
180
|
case v
|
|
175
|
-
when Hash, Array then v.inspect
|
|
176
|
-
else
|
|
181
|
+
when Hash, Array then printable(v.inspect)
|
|
182
|
+
else
|
|
183
|
+
# Guard the regexp gsub against invalid-encoding tag values — gsub raises
|
|
184
|
+
# ArgumentError on them. Exiftool.read already scrubs; this is belt-and-
|
|
185
|
+
# suspenders so the display layer can never crash the run on hostile bytes.
|
|
186
|
+
s = printable(v)
|
|
187
|
+
s.gsub(/\s+/, ' ')
|
|
177
188
|
end
|
|
178
189
|
end
|
|
179
190
|
|
|
191
|
+
# Render untrusted filenames/metadata as terminal text, not terminal control.
|
|
192
|
+
# Exif/Office/PDF metadata can contain ANSI/OSC escape bytes; printing those
|
|
193
|
+
# raw can recolor output, rewrite a terminal title, or worse. We keep the
|
|
194
|
+
# content readable by replacing C0/DEL and C1 control chars with spaces
|
|
195
|
+
# (C1, U+0080–U+009F, holds the 8-bit CSI/OSC introducers some terminals honor).
|
|
196
|
+
def printable(text)
|
|
197
|
+
s = text.to_s
|
|
198
|
+
s = s.scrub unless s.valid_encoding?
|
|
199
|
+
s.gsub(/[[:cntrl:]]/, ' ')
|
|
200
|
+
end
|
|
201
|
+
|
|
180
202
|
# Truncate to N chars with a single-character ellipsis. We use "…"
|
|
181
203
|
# (one Unicode char) instead of "..." so the truncation doesn't itself
|
|
182
204
|
# spill over the budget.
|
|
@@ -188,10 +210,7 @@ module Metaclean
|
|
|
188
210
|
# How many "real" embedded tags are there? Used for the
|
|
189
211
|
# "Before (24 embedded tags) → After (0)" summary line.
|
|
190
212
|
def count_embedded(meta)
|
|
191
|
-
meta.keys
|
|
192
|
-
.reject { |k| k == 'SourceFile' }
|
|
193
|
-
.reject { |k| NON_METADATA_GROUPS.include?(group_of(k)) }
|
|
194
|
-
.size
|
|
213
|
+
meta.keys.count { |k| embedded_key?(k) }
|
|
195
214
|
end
|
|
196
215
|
end
|
|
197
216
|
end
|
data/lib/metaclean/exiftool.rb
CHANGED
|
@@ -1,140 +1,121 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
#
|
|
4
|
-
# A thin Ruby wrapper around the external `exiftool` binary.
|
|
3
|
+
# Thin wrapper around the external `exiftool` binary.
|
|
5
4
|
#
|
|
6
|
-
#
|
|
7
|
-
#
|
|
8
|
-
#
|
|
9
|
-
#
|
|
10
|
-
#
|
|
11
|
-
#
|
|
5
|
+
# Open3.capture3 with multiple args bypasses the shell, so a filename like
|
|
6
|
+
# `cat; rm -rf /` is one argument, not a command. That's not the whole story:
|
|
7
|
+
# exiftool still parses its own arguments, so a filename beginning with "-"
|
|
8
|
+
# (e.g. `-config`) would be read as an option. Every path goes through
|
|
9
|
+
# `Metaclean.safe_path`, which prefixes a leading dash with "./" so it's
|
|
10
|
+
# always seen as a filename.
|
|
12
11
|
|
|
13
12
|
require 'open3'
|
|
14
13
|
require 'json'
|
|
15
14
|
|
|
16
15
|
module Metaclean
|
|
17
|
-
# `module Exiftool` (vs `class`) because we want module-level methods like
|
|
18
|
-
# `Exiftool.read(path)` — there's no state to carry per instance.
|
|
19
16
|
module Exiftool
|
|
20
|
-
# `module_function` makes every method below act like a "static" method
|
|
21
|
-
# on the module *and* a private instance method (rarely used). It saves
|
|
22
|
-
# writing `def self.read` for every method.
|
|
23
17
|
module_function
|
|
24
18
|
|
|
25
|
-
#
|
|
26
|
-
#
|
|
27
|
-
#
|
|
28
|
-
# `defined?(@available)` is safer than `@available.nil?` because the
|
|
29
|
-
# cached value could legitimately be `false` — we want to skip the
|
|
30
|
-
# re-check in that case too.
|
|
19
|
+
# True if `exiftool` is on PATH. Memoized so repeated checks don't re-spawn
|
|
20
|
+
# it (defined? not nil? — the cached value can legitimately be false).
|
|
31
21
|
def available?
|
|
32
22
|
return @available if defined?(@available)
|
|
33
23
|
|
|
34
|
-
|
|
24
|
+
out, _err, status = Open3.capture3('exiftool', '-ver')
|
|
35
25
|
@available = status.success?
|
|
26
|
+
# Stash the version off the same call so `version` need not re-spawn.
|
|
27
|
+
@version = @available ? out.strip : nil
|
|
28
|
+
@available
|
|
36
29
|
rescue Errno::ENOENT
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
@available = false
|
|
30
|
+
@version = nil
|
|
31
|
+
@available = false # exiftool not on PATH
|
|
40
32
|
end
|
|
41
33
|
|
|
42
34
|
# Returns the version string, or nil if exiftool is missing/broken.
|
|
35
|
+
# Captured by `available?`, so this never re-runs the binary.
|
|
43
36
|
def version
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
out, _err, status = Open3.capture3('exiftool', '-ver')
|
|
47
|
-
status.success? ? out.strip : nil
|
|
48
|
-
rescue Errno::ENOENT
|
|
49
|
-
nil
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
# Hard-fail with a helpful install hint. Called from `read`/`strip!` before
|
|
53
|
-
# any work, so users see one clear message instead of a low-level Errno.
|
|
54
|
-
# The `<<~MSG ... MSG` is a "squiggly heredoc": leading indentation is
|
|
55
|
-
# stripped automatically, so the output is left-aligned.
|
|
56
|
-
def ensure_available!
|
|
57
|
-
return if available?
|
|
58
|
-
|
|
59
|
-
raise ExiftoolMissing, <<~MSG
|
|
60
|
-
ExifTool is not installed or not on PATH.
|
|
61
|
-
|
|
62
|
-
Install:
|
|
63
|
-
macOS: brew install exiftool
|
|
64
|
-
Debian: sudo apt install libimage-exiftool-perl
|
|
65
|
-
Fedora: sudo dnf install perl-Image-ExifTool
|
|
66
|
-
Arch: sudo pacman -S perl-image-exiftool
|
|
67
|
-
Windows: scoop install exiftool (or download exiftool.org)
|
|
68
|
-
MSG
|
|
37
|
+
available? ? @version : nil
|
|
69
38
|
end
|
|
70
39
|
|
|
71
40
|
# Reads metadata from a file and returns a flat Hash of "Group:Tag" => value.
|
|
72
41
|
#
|
|
73
42
|
# ExifTool flag glossary:
|
|
74
43
|
# -j JSON output (machine-parseable)
|
|
75
|
-
# -G1 Include the family-1 group name
|
|
44
|
+
# -G1 Include the family-1 group name. NB: with -G1 mainstream EXIF
|
|
45
|
+
# tags appear under "IFD0"/"ExifIFD"/"IFD1", not "EXIF" (that's
|
|
46
|
+
# the family-0 name); GPS/IPTC/XMP-dc keep those group names.
|
|
76
47
|
# -a Allow duplicate tags (some formats have several with same name)
|
|
77
48
|
# -u Include unknown/unidentified tags
|
|
78
49
|
# -s Short tag names (no descriptions)
|
|
79
50
|
# -n Numeric values (no human formatting like "1/100 sec")
|
|
80
51
|
# -api largefilesupport=1 Allow files >4 GB
|
|
81
52
|
def read(path)
|
|
82
|
-
ensure_available!
|
|
83
53
|
out, err, status = Open3.capture3(
|
|
84
|
-
'exiftool', '-j', '-G1', '-a', '-u', '-s', '-n', '-api', 'largefilesupport=1',
|
|
54
|
+
'exiftool', '-j', '-G1', '-a', '-u', '-s', '-n', '-api', 'largefilesupport=1',
|
|
55
|
+
Metaclean.safe_path(path)
|
|
85
56
|
)
|
|
86
57
|
raise Error, "ExifTool read failed: #{err.strip}" unless status.success?
|
|
87
58
|
|
|
88
59
|
# ExifTool's JSON output is an array (one entry per file). We always
|
|
89
60
|
# pass one file, so we take the first element. `|| {}` handles the
|
|
90
|
-
# edge case where exiftool returns an empty array.
|
|
61
|
+
# edge case where exiftool returns an empty array. A non-array shape is
|
|
62
|
+
# unexpected — bail with a clear error instead of crashing later on
|
|
63
|
+
# `.first` being applied to a Hash/scalar.
|
|
91
64
|
data = JSON.parse(out)
|
|
92
|
-
data.
|
|
65
|
+
raise Error, 'Unexpected ExifTool output (expected a JSON array)' unless data.is_a?(Array)
|
|
66
|
+
|
|
67
|
+
scrub_encoding(data.first || {})
|
|
93
68
|
rescue JSON::ParserError => e
|
|
94
69
|
raise Error, "Could not parse ExifTool output: #{e.message}"
|
|
95
70
|
end
|
|
96
71
|
|
|
97
|
-
#
|
|
98
|
-
#
|
|
99
|
-
#
|
|
100
|
-
#
|
|
101
|
-
#
|
|
102
|
-
#
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
ensure_available!
|
|
110
|
-
|
|
111
|
-
preserving = keep_orientation || keep_color_profile
|
|
112
|
-
args = ['exiftool', '-all=']
|
|
113
|
-
|
|
114
|
-
# `-tagsFromFile @` says "copy tags from the same file you're writing
|
|
115
|
-
# to". That sounds redundant, but combined with `-all=` running first,
|
|
116
|
-
# it means "delete everything, then re-add only the listed tags".
|
|
117
|
-
if preserving
|
|
118
|
-
args.concat(['-tagsFromFile', '@'])
|
|
119
|
-
args << '-Orientation' if keep_orientation
|
|
120
|
-
args << '-ICC_Profile' if keep_color_profile
|
|
72
|
+
# ExifTool labels its -j output UTF-8, but binary/odd tag values (UserComment,
|
|
73
|
+
# MakerNotes fragments, corrupt or hostile files) can carry invalid bytes. A
|
|
74
|
+
# later gsub (Display.format_value) raises on an invalid-encoding String and
|
|
75
|
+
# would crash the whole run, so replace bad bytes up front. This hash is only
|
|
76
|
+
# used for display/diff/residual checks — the actual strip operates on the
|
|
77
|
+
# file via the tools — so scrubbing is safe.
|
|
78
|
+
def scrub_encoding(obj)
|
|
79
|
+
case obj
|
|
80
|
+
when String then obj.valid_encoding? ? obj : obj.scrub
|
|
81
|
+
when Array then obj.map { |e| scrub_encoding(e) }
|
|
82
|
+
when Hash then obj.transform_values { |v| scrub_encoding(v) }
|
|
83
|
+
else obj
|
|
121
84
|
end
|
|
122
|
-
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
# ExifTool can READ many formats it cannot WRITE, and mat2 owns the strip for
|
|
88
|
+
# them: the ZIP-based documents (docx/xlsx/pptx/odt/ods/odp/odg/odf/epub) and
|
|
89
|
+
# the RIFF containers (avi/wav). ExifTool announces the inability with one of
|
|
90
|
+
# a few phrasings — "writing of X files is not yet supported", "does not yet
|
|
91
|
+
# support writing of …", or "Can't currently write RIFF … files" — so we
|
|
92
|
+
# match all of them. strip! returns :unsupported in these cases so the runner
|
|
93
|
+
# treats it as a soft skip (mat2 does the actual strip), NOT a pipeline
|
|
94
|
+
# failure that would wrongly pin an already-clean file at :unverified. This is
|
|
95
|
+
# safe because the post-strip residual re-read still gates the :cleaned status.
|
|
96
|
+
WRITE_UNSUPPORTED_RE = /not yet support|can't currently write|writing of .* files/i
|
|
97
|
+
|
|
98
|
+
# Removes every removable tag, in place. Returns true on success,
|
|
99
|
+
# :unsupported when ExifTool cannot write the format, and raises on failure.
|
|
100
|
+
#
|
|
101
|
+
# `-all=` sets every tag to empty, which deletes them. `-overwrite_original`
|
|
102
|
+
# makes ExifTool replace the file directly instead of writing `file_original`
|
|
103
|
+
# next to it. `-api largefilesupport=1` lets files larger than 4 GB through.
|
|
104
|
+
def strip!(path, also_delete: [])
|
|
105
|
+
# `-all=` clears metadata, but for TIFF/DNG ExifTool refuses to delete the
|
|
106
|
+
# IFD0 directory and leaves its tags (Artist, Software, …) behind. So we
|
|
107
|
+
# ALSO delete the known privacy tags by name and clear the GPS group: both
|
|
108
|
+
# are no-ops where `-all=` already removed them (e.g. JPEG), but they make
|
|
109
|
+
# the strip complete AND lossless (no re-encode) for IFD0-preserving formats.
|
|
110
|
+
args = ['exiftool', '-all=', '-gps:all=']
|
|
111
|
+
also_delete.each { |tag| args << "-#{tag}=" }
|
|
112
|
+
args.concat(['-overwrite_original', '-q', '-q', '-api', 'largefilesupport=1', Metaclean.safe_path(path)])
|
|
123
113
|
|
|
124
114
|
_out, err, status = Open3.capture3(*args)
|
|
125
115
|
return true if status.success?
|
|
116
|
+
return :unsupported if err.match?(WRITE_UNSUPPORTED_RE)
|
|
126
117
|
|
|
127
|
-
|
|
128
|
-
# full strip — but only if we *were* preserving, otherwise the retry
|
|
129
|
-
# would be identical to the failed attempt.
|
|
130
|
-
raise Error, "ExifTool strip failed: #{err.strip}" unless preserving
|
|
131
|
-
|
|
132
|
-
_out2, err2, status2 = Open3.capture3(
|
|
133
|
-
'exiftool', '-all=', '-overwrite_original', '-q', '-q', path.to_s
|
|
134
|
-
)
|
|
135
|
-
return true if status2.success?
|
|
136
|
-
|
|
137
|
-
raise Error, "ExifTool strip failed: #{err2.strip.empty? ? err.strip : err2.strip}"
|
|
118
|
+
raise Error, "ExifTool strip failed: #{err.strip}"
|
|
138
119
|
end
|
|
139
120
|
end
|
|
140
121
|
end
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Wrapper around the external `ffmpeg` binary.
|
|
4
|
+
#
|
|
5
|
+
# ffmpeg is used for ONE job the other tools can't do: the Matroska containers
|
|
6
|
+
# (mkv/webm). ExifTool is read-only for Matroska, and mat2 has no Matroska
|
|
7
|
+
# parser, so without ffmpeg those formats can't be cleaned at all. ffmpeg
|
|
8
|
+
# rewrites the container while copying every stream verbatim (`-c copy`) — no
|
|
9
|
+
# re-encode, so the audio/video is bit-identical and only the metadata is gone.
|
|
10
|
+
#
|
|
11
|
+
# Like mat2, ffmpeg can't edit in place: it muxes to a new file. We write to a
|
|
12
|
+
# SecureRandom-named sibling and move it back over the source on success.
|
|
13
|
+
|
|
14
|
+
require 'open3'
|
|
15
|
+
require 'fileutils'
|
|
16
|
+
require 'securerandom'
|
|
17
|
+
|
|
18
|
+
module Metaclean
|
|
19
|
+
module Ffmpeg
|
|
20
|
+
module_function
|
|
21
|
+
|
|
22
|
+
# Memoized PATH check (same pattern as the other wrappers).
|
|
23
|
+
def available?
|
|
24
|
+
return @available if defined?(@available)
|
|
25
|
+
|
|
26
|
+
out, _err, status = Open3.capture3('ffmpeg', '-version')
|
|
27
|
+
@available = status.success?
|
|
28
|
+
# First line is "ffmpeg version 7.1.1 Copyright ..."; grab the 3rd token.
|
|
29
|
+
@version = @available ? out.lines.first.to_s.split[2] : nil
|
|
30
|
+
@available
|
|
31
|
+
rescue Errno::ENOENT
|
|
32
|
+
@version = nil
|
|
33
|
+
@available = false
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def version
|
|
37
|
+
available? ? @version : nil
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
# Strips all metadata from `path` in place, losslessly. Returns true on
|
|
41
|
+
# success, raises Metaclean::Error on failure.
|
|
42
|
+
#
|
|
43
|
+
# -map 0 keep every stream (video, audio, subtitles)
|
|
44
|
+
# -map_metadata -1 drop global/container metadata
|
|
45
|
+
# -map_chapters -1 drop chapter markers (they can carry titles)
|
|
46
|
+
# -c copy remux without re-encoding — bit-identical streams
|
|
47
|
+
def strip!(path)
|
|
48
|
+
raise Error, 'ffmpeg not available' unless available?
|
|
49
|
+
|
|
50
|
+
tmp = tmp_path_for(path)
|
|
51
|
+
# Clear any stale temp from an earlier crashed run before muxing.
|
|
52
|
+
File.delete(tmp) if File.exist?(tmp)
|
|
53
|
+
|
|
54
|
+
_out, err, status = Open3.capture3(
|
|
55
|
+
'ffmpeg', '-y', '-v', 'error', '-nostdin', '-i', file_url(path),
|
|
56
|
+
'-map', '0', '-map_metadata', '-1', '-map_chapters', '-1', '-c', 'copy',
|
|
57
|
+
file_url(tmp)
|
|
58
|
+
)
|
|
59
|
+
# ffmpeg can exit 0 yet write nothing on some odd inputs, so require the
|
|
60
|
+
# output to actually exist before we trust it and move it into place.
|
|
61
|
+
raise Error, "ffmpeg failed: #{err.strip}" unless status.success? && File.exist?(tmp)
|
|
62
|
+
|
|
63
|
+
FileUtils.mv(tmp, path)
|
|
64
|
+
true
|
|
65
|
+
ensure
|
|
66
|
+
# Interrupt-safety: drop the temp if we were killed between mux and rename.
|
|
67
|
+
# On the success path it's already moved, so this is a no-op.
|
|
68
|
+
File.delete(tmp) if tmp && File.exist?(tmp)
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
# Sibling temp with the SAME extension (ffmpeg picks the muxer from it) in
|
|
72
|
+
# the SAME directory (so the final rename is an atomic same-fs move). The
|
|
73
|
+
# ".metaclean.tmp." marker means Runner#skip? ignores any stray leftover.
|
|
74
|
+
def tmp_path_for(path)
|
|
75
|
+
dir = File.dirname(path)
|
|
76
|
+
ext = File.extname(path)
|
|
77
|
+
File.join(dir, "#{Metaclean::TMP_MARKER}ff.#{Process.pid}.#{SecureRandom.hex(8)}#{ext}")
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
def file_url(path)
|
|
81
|
+
"file:#{File.expand_path(path)}"
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
end
|
data/lib/metaclean/mat2.rb
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
# ───────────────────────────────────────────────────────────────────────────
|
|
4
3
|
# Wrapper around the external `mat2` (Metadata Anonymisation Toolkit 2).
|
|
5
4
|
#
|
|
6
5
|
# mat2 is stricter than ExifTool on certain formats (DOCX/PDF/PNG): instead
|
|
@@ -10,7 +9,6 @@
|
|
|
10
9
|
# mat2's CLI quirk: it does NOT overwrite the original. It writes a new file
|
|
11
10
|
# named `<name>.cleaned.<ext>` next to it. We adapt by renaming that result
|
|
12
11
|
# back over the source after a successful run.
|
|
13
|
-
# ───────────────────────────────────────────────────────────────────────────
|
|
14
12
|
|
|
15
13
|
require 'open3'
|
|
16
14
|
require 'fileutils'
|
|
@@ -20,17 +18,22 @@ module Metaclean
|
|
|
20
18
|
# File extensions we know mat2 can handle. Keep this list conservative —
|
|
21
19
|
# if mat2 doesn't actually support an extension, the call will fail
|
|
22
20
|
# gracefully via UNSUPPORTED_RE below, but we'd rather not even try.
|
|
21
|
+
# Deliberately ABSENT: Matroska (mkv/webm) — mat2 has no parser for it; ffmpeg
|
|
22
|
+
# owns those (Strategy::FFMPEG_FORMATS). QuickTime/MP4-audio (mov/m4a) — mat2
|
|
23
|
+
# can't write them and ExifTool already cleans them, so listing them only
|
|
24
|
+
# caused a wasted mat2 spawn that always soft-skipped. WMV (ASF) IS here on
|
|
25
|
+
# purpose: mat2 CAN write it but ExifTool can't, so mat2 is the only tool that
|
|
26
|
+
# cleans .wmv — dropping it would make every .wmv permanently :failed.
|
|
23
27
|
SUPPORTED_EXTS = %w[
|
|
24
28
|
pdf png jpg jpeg tif tiff gif bmp svg webp
|
|
25
|
-
mp3 flac ogg opus wav
|
|
26
|
-
mp4 avi
|
|
29
|
+
mp3 flac ogg opus wav
|
|
30
|
+
mp4 avi wmv
|
|
27
31
|
docx xlsx pptx odt ods odp odg odf epub
|
|
28
32
|
zip torrent
|
|
29
33
|
].freeze
|
|
30
34
|
|
|
31
|
-
#
|
|
32
|
-
#
|
|
33
|
-
# `i` flag = case-insensitive.
|
|
35
|
+
# Matches the messages mat2 prints when it can't handle a file — lets us
|
|
36
|
+
# distinguish a soft skip from a real error.
|
|
34
37
|
UNSUPPORTED_RE = /(not supported|isn't supported|cannot be cleaned|unsupported file)/i.freeze
|
|
35
38
|
|
|
36
39
|
module_function
|
|
@@ -39,21 +42,20 @@ module Metaclean
|
|
|
39
42
|
def available?
|
|
40
43
|
return @available if defined?(@available)
|
|
41
44
|
|
|
42
|
-
|
|
45
|
+
out, _err, status = Open3.capture3('mat2', '--version')
|
|
43
46
|
@available = status.success?
|
|
47
|
+
# `mat2 --version` prints "mat2 0.14.0" — `.split.last` grabs the
|
|
48
|
+
# version number regardless of whatever prefix appears. Captured here
|
|
49
|
+
# so `version` reuses it instead of re-spawning the binary.
|
|
50
|
+
@version = @available ? out.strip.split.last : nil
|
|
51
|
+
@available
|
|
44
52
|
rescue Errno::ENOENT
|
|
53
|
+
@version = nil
|
|
45
54
|
@available = false
|
|
46
55
|
end
|
|
47
56
|
|
|
48
57
|
def version
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
out, _err, status = Open3.capture3('mat2', '--version')
|
|
52
|
-
# `mat2 --version` prints "mat2 0.14.0" — `.split.last` grabs the
|
|
53
|
-
# version number regardless of whatever prefix appears.
|
|
54
|
-
status.success? ? out.strip.split.last : nil
|
|
55
|
-
rescue Errno::ENOENT
|
|
56
|
-
nil
|
|
58
|
+
available? ? @version : nil
|
|
57
59
|
end
|
|
58
60
|
|
|
59
61
|
# Quick check before we even try mat2 on a file. Used by Strategy to
|
|
@@ -61,7 +63,7 @@ module Metaclean
|
|
|
61
63
|
def supports?(path)
|
|
62
64
|
return false unless available?
|
|
63
65
|
|
|
64
|
-
SUPPORTED_EXTS.include?(
|
|
66
|
+
SUPPORTED_EXTS.include?(Metaclean.ext_of(path))
|
|
65
67
|
end
|
|
66
68
|
|
|
67
69
|
# Strips metadata from `path` in place. Returns:
|
|
@@ -76,38 +78,39 @@ module Metaclean
|
|
|
76
78
|
raise Error, 'mat2 not available' unless available?
|
|
77
79
|
|
|
78
80
|
cleaned = cleaned_path_for(path)
|
|
81
|
+
safe = Metaclean.safe_path(path)
|
|
79
82
|
|
|
80
83
|
# Defensive: if a stale `<name>.cleaned.<ext>` exists from an earlier
|
|
81
84
|
# crashed run, remove it so we don't accidentally use old data.
|
|
82
85
|
File.delete(cleaned) if File.exist?(cleaned)
|
|
83
86
|
|
|
84
|
-
out, err, status = Open3.capture3('mat2',
|
|
85
|
-
combined = "#{out}\n#{err}"
|
|
87
|
+
out, err, status = Open3.capture3('mat2', safe)
|
|
86
88
|
|
|
87
|
-
#
|
|
88
|
-
#
|
|
89
|
-
#
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
89
|
+
# Success path first. mat2 only creates `<name>.cleaned.<ext>` when it
|
|
90
|
+
# actually stripped something; no file after exit 0 means there was
|
|
91
|
+
# nothing to remove. We check exit status BEFORE the "unsupported"
|
|
92
|
+
# message so a successful run that merely warns about one embedded
|
|
93
|
+
# stream isn't misreported as a soft skip.
|
|
94
|
+
if status.success?
|
|
95
|
+
return :no_metadata unless File.exist?(cleaned)
|
|
94
96
|
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
# `err.strip.empty? ? out.strip : err.strip` picks whichever stream
|
|
98
|
-
# has actual content — some tools log to stdout, others to stderr.
|
|
99
|
-
raise Error, "mat2 failed: #{err.strip.empty? ? out.strip : err.strip}"
|
|
97
|
+
FileUtils.mv(cleaned, safe)
|
|
98
|
+
return true
|
|
100
99
|
end
|
|
101
100
|
|
|
102
|
-
#
|
|
103
|
-
#
|
|
104
|
-
|
|
105
|
-
if
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
101
|
+
# Failure path. A "not supported" message means a soft skip we report
|
|
102
|
+
# so the runner can continue with the next tool, not a hard error.
|
|
103
|
+
combined = "#{out}\n#{err}"
|
|
104
|
+
return :unsupported if combined.match?(UNSUPPORTED_RE)
|
|
105
|
+
|
|
106
|
+
# `err.strip.empty? ? out.strip : err.strip` picks whichever stream
|
|
107
|
+
# has actual content — some tools log to stdout, others to stderr.
|
|
108
|
+
raise Error, "mat2 failed: #{err.strip.empty? ? out.strip : err.strip}"
|
|
109
|
+
ensure
|
|
110
|
+
# Interrupt-safety: if we were killed (Ctrl-C) between mat2 writing
|
|
111
|
+
# `<name>.cleaned.<ext>` and the rename, don't leave the orphan behind.
|
|
112
|
+
# On the success path it's already moved, so this is a no-op.
|
|
113
|
+
File.delete(cleaned) if cleaned && File.exist?(cleaned)
|
|
111
114
|
end
|
|
112
115
|
|
|
113
116
|
# Builds the path mat2 will write to: `name.cleaned.ext`.
|