metaclean 1.0.2 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # ───────────────────────────────────────────────────────────────────────────
4
- # Anything that prints to the terminal lives here: ANSI colors, headers,
5
- # tables, the before/after diff. Keeping presentation in one module means
6
- # the rest of the codebase stays focused on logic.
7
- #
8
- # ANSI escape sequences:
9
- # "\e[31m" turns the terminal text red.
10
- # "\e[0m" resets all styling.
11
- # A modern terminal interprets these; if you redirect to a file, they show
12
- # up as garbage — that's why we check `tty?` before emitting them.
13
- # ───────────────────────────────────────────────────────────────────────────
3
+ # All terminal output lives here: ANSI colors, headers, tables, the
4
+ # before/after diff. Colors are gated on `tty?` (see color?).
14
5
 
15
6
  module Metaclean
16
7
  module Display
@@ -21,7 +12,6 @@ module Metaclean
21
12
  red: "\e[31m",
22
13
  green: "\e[32m",
23
14
  yellow: "\e[33m",
24
- blue: "\e[34m",
25
15
  magenta: "\e[35m",
26
16
  cyan: "\e[36m",
27
17
  gray: "\e[90m"
@@ -33,35 +23,51 @@ module Metaclean
33
23
  # Excluding these makes the diff focus on what actually got stripped.
34
24
  NON_METADATA_GROUPS = %w[System File ExifTool Composite].freeze
35
25
 
26
+ # ASCII wordmark shown at the top of --help / --version. Printed by `banner`
27
+ # (see there for why it's colored line-by-line).
28
+ LOGO = <<~ART
29
+ ███╗ ███╗███████╗████████╗ █████╗ ██████╗██╗ ███████╗ █████╗ ███╗ ██╗
30
+ ████╗ ████║██╔════╝╚══██╔══╝██╔══██╗██╔════╝██║ ██╔════╝██╔══██╗████╗ ██║
31
+ ██╔████╔██║█████╗ ██║ ███████║██║ ██║ █████╗ ███████║██╔██╗ ██║
32
+ ██║╚██╔╝██║██╔══╝ ██║ ██╔══██║██║ ██║ ██╔══╝ ██╔══██║██║╚██╗██║
33
+ ██║ ╚═╝ ██║███████╗ ██║ ██║ ██║╚██████╗███████╗███████╗██║ ██║██║ ╚████║
34
+ ╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═╝ ╚═╝ ╚═════╝╚══════╝╚══════╝╚═╝ ╚═╝╚═╝ ╚═══╝
35
+ ART
36
+
36
37
  module_function
37
38
 
38
39
  # Decides whether to emit ANSI color codes. Colors are wrong when:
39
40
  # * stdout is a pipe/file (not a terminal) — `tty?` is false there
40
41
  # * NO_COLOR env var is set (de-facto convention, see no-color.org)
41
- # * we're on classic Windows cmd.exe (modern Windows Terminal is fine,
42
- # but to be safe we require an explicit FORCE_COLOR opt-in there)
43
42
  def color?
44
43
  return @color if defined?(@color)
45
44
 
46
45
  # Per https://no-color.org: disable only when NO_COLOR is set to a
47
46
  # non-empty value. An unset or empty NO_COLOR leaves colors on.
48
47
  no_color = ENV['NO_COLOR'].to_s
49
- @color = $stdout.tty? && no_color.empty? && !Gem.win_platform?
48
+ @color = $stdout.tty? && no_color.empty?
50
49
  @color = true if ENV['FORCE_COLOR']
51
50
  @color
52
51
  end
53
52
 
54
- # `c` for "color". Wraps text in the requested color, or returns it
55
- # plain if colors are disabled. The reset code at the end stops the
56
- # color from bleeding into following output.
53
+ # Wrap text in a color, or pass it through plain when colors are off.
57
54
  def c(text, color)
58
- return text.to_s unless color?
55
+ text = printable(text)
56
+ return text unless color?
59
57
 
60
58
  "#{COLORS[color]}#{text}#{COLORS[:reset]}"
61
59
  end
62
60
 
63
- # Visual section markers used throughout the runner's output. Keeping
64
- # them here means a single change updates the look everywhere.
61
+ # Red ASCII wordmark (matches Ruby's brand color) + one-line tagline for
62
+ # --help / --version. Colored line-by-line on purpose: `c` runs text through
63
+ # `printable`, which turns control chars (including the heredoc's newlines)
64
+ # into spaces — so coloring the whole block at once would collapse the logo
65
+ # onto one line.
66
+ def banner
67
+ LOGO.each_line { |line| puts c(line.chomp, :red) }
68
+ puts c(' strip EXIF · IPTC · XMP · GPS · ID3 — leave the file clean', :gray)
69
+ end
70
+
65
71
  def header(text)
66
72
  puts
67
73
  puts c('━' * 64, :gray)
@@ -82,23 +88,19 @@ module Metaclean
82
88
  # `only_embedded:` filters out the System/File/etc. noise.
83
89
  def metadata_table(meta, only_embedded: false)
84
90
  rows = meta.reject { |k, _| k == 'SourceFile' }
85
- rows = rows.reject { |k, _| NON_METADATA_GROUPS.include?(group_of(k)) } if only_embedded
91
+ rows = rows.select { |k, _| embedded_key?(k) } if only_embedded
86
92
 
87
93
  if rows.empty?
88
94
  info(only_embedded ? '(no embedded metadata)' : '(no metadata)')
89
95
  return
90
96
  end
91
97
 
92
- # `group_by` partitions an Enumerable into a Hash keyed by the block's
93
- # result. Here we group all "GPS:*" tags together, all "EXIF:*" together,
94
- # etc., then print each group as a labeled sub-table.
98
+ # Group the rows by tag group ("GPS:*", "EXIF:*", etc.), each into its own labeled sub-table.
95
99
  grouped = rows.group_by { |k, _| group_of(k) }
96
100
  grouped.sort_by { |g, _| g.to_s }.each do |group, pairs|
97
101
  puts c(" [#{group}]", :magenta)
98
102
  pairs.sort_by { |k, _| k.to_s }.each do |k, v|
99
103
  tag = k.to_s.split(':', 2).last
100
- # `format` (alias of sprintf) does column alignment: %-38s = left-
101
- # aligned, padded to 38 chars.
102
104
  line = format(' %-38s %s', truncate(tag, 38), truncate(format_value(v), 60))
103
105
  puts c(line, :dim)
104
106
  end
@@ -109,16 +111,12 @@ module Metaclean
109
111
  # sections: removed, changed, still-present. This is the "before/after"
110
112
  # the user asked for.
111
113
  def diff(before, after)
112
- keys = (before.keys + after.keys).uniq
113
- .reject { |k| k == 'SourceFile' }
114
- .reject { |k| NON_METADATA_GROUPS.include?(group_of(k)) }
114
+ keys = (before.keys + after.keys).uniq.select { |k| embedded_key?(k) }
115
115
 
116
116
  removed = []
117
117
  changed = []
118
118
  kept = []
119
119
 
120
- # Classifying each key into one of three buckets keeps the rest of
121
- # the method simple and testable.
122
120
  keys.sort.each do |k|
123
121
  b = before[k]
124
122
  a = after[k]
@@ -161,22 +159,46 @@ module Metaclean
161
159
  end
162
160
  end
163
161
 
164
- # Pull the group name out of "Group:Tag". The `2` argument to split caps
165
- # the result at 2 elements, so a value containing ":" doesn't break it.
162
+ # Extract the group name from "Group:Tag" (split is capped at 2 parts, so a later ":" in the key is safe).
166
163
  def group_of(key)
167
164
  key.to_s.split(':', 2).first.to_s
168
165
  end
169
166
 
167
+ # True when `key` names real embedded metadata: not the SourceFile
168
+ # bookkeeping key, and not one of the System/File/ExifTool/Composite
169
+ # groups that describe the file rather than its embedded tags. Single
170
+ # source of truth for "is this a tag we actually stripped?" — shared by
171
+ # the table, diff, count, removed-count and privacy-residual checks.
172
+ def embedded_key?(key)
173
+ key != 'SourceFile' && !NON_METADATA_GROUPS.include?(group_of(key))
174
+ end
175
+
170
176
  # Make any value safe to print on a single line. Hashes/Arrays get
171
177
  # `inspect` (shows their structure); strings are collapsed to single
172
178
  # spaces so a multiline tag value doesn't wreck the table.
173
179
  def format_value(v)
174
180
  case v
175
- when Hash, Array then v.inspect
176
- else v.to_s.gsub(/\s+/, ' ')
181
+ when Hash, Array then printable(v.inspect)
182
+ else
183
+ # Guard the regexp gsub against invalid-encoding tag values — gsub raises
184
+ # ArgumentError on them. Exiftool.read already scrubs; this is belt-and-
185
+ # suspenders so the display layer can never crash the run on hostile bytes.
186
+ s = printable(v)
187
+ s.gsub(/\s+/, ' ')
177
188
  end
178
189
  end
179
190
 
191
+ # Render untrusted filenames/metadata as terminal text, not terminal control.
192
+ # Exif/Office/PDF metadata can contain ANSI/OSC escape bytes; printing those
193
+ # raw can recolor output, rewrite a terminal title, or worse. We keep the
194
+ # content readable by replacing C0/DEL and C1 control chars with spaces
195
+ # (C1, U+0080–U+009F, holds the 8-bit CSI/OSC introducers some terminals honor).
196
+ def printable(text)
197
+ s = text.to_s
198
+ s = s.scrub unless s.valid_encoding?
199
+ s.gsub(/[[:cntrl:]]/, ' ')
200
+ end
201
+
180
202
  # Truncate to N chars with a single-character ellipsis. We use "…"
181
203
  # (one Unicode char) instead of "..." so the truncation doesn't itself
182
204
  # spill over the budget.
@@ -188,10 +210,7 @@ module Metaclean
188
210
  # How many "real" embedded tags are there? Used for the
189
211
  # "Before (24 embedded tags) → After (0)" summary line.
190
212
  def count_embedded(meta)
191
- meta.keys
192
- .reject { |k| k == 'SourceFile' }
193
- .reject { |k| NON_METADATA_GROUPS.include?(group_of(k)) }
194
- .size
213
+ meta.keys.count { |k| embedded_key?(k) }
195
214
  end
196
215
  end
197
216
  end
@@ -1,140 +1,121 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # ───────────────────────────────────────────────────────────────────────────
4
- # A thin Ruby wrapper around the external `exiftool` binary.
3
+ # Thin wrapper around the external `exiftool` binary.
5
4
  #
6
- # We use `Open3.capture3` instead of backticks or `system()` because:
7
- # 1. It returns stdout, stderr, and the process status separately.
8
- # 2. When called with multiple arguments, it bypasses the shell entirely
9
- # so a filename like `cat; rm -rf /` is treated as ONE filename, not
10
- # a shell command. This is the standard way to safely shell out in Ruby.
11
- # ───────────────────────────────────────────────────────────────────────────
5
+ # Open3.capture3 with multiple args bypasses the shell, so a filename like
6
+ # `cat; rm -rf /` is one argument, not a command. That's not the whole story:
7
+ # exiftool still parses its own arguments, so a filename beginning with "-"
8
+ # (e.g. `-config`) would be read as an option. Every path goes through
9
+ # `Metaclean.safe_path`, which prefixes a leading dash with "./" so it's
10
+ # always seen as a filename.
12
11
 
13
12
  require 'open3'
14
13
  require 'json'
15
14
 
16
15
  module Metaclean
17
- # `module Exiftool` (vs `class`) because we want module-level methods like
18
- # `Exiftool.read(path)` — there's no state to carry per instance.
19
16
  module Exiftool
20
- # `module_function` makes every method below act like a "static" method
21
- # on the module *and* a private instance method (rarely used). It saves
22
- # writing `def self.read` for every method.
23
17
  module_function
24
18
 
25
- # Returns true if `exiftool` is on PATH. The result is memoized in `@available`
26
- # so repeated checks don't re-spawn the process.
27
- #
28
- # `defined?(@available)` is safer than `@available.nil?` because the
29
- # cached value could legitimately be `false` — we want to skip the
30
- # re-check in that case too.
19
+ # True if `exiftool` is on PATH. Memoized so repeated checks don't re-spawn
20
+ # it (checked with defined?, not nil?, because the cached value can legitimately be false).
31
21
  def available?
32
22
  return @available if defined?(@available)
33
23
 
34
- _out, _err, status = Open3.capture3('exiftool', '-ver')
24
+ out, _err, status = Open3.capture3('exiftool', '-ver')
35
25
  @available = status.success?
26
+ # Stash the version off the same call so `version` need not re-spawn.
27
+ @version = @available ? out.strip : nil
28
+ @available
36
29
  rescue Errno::ENOENT
37
- # `Errno::ENOENT` ("no such file or directory") is what Open3 raises
38
- # when the executable can't be found. We treat that as "not available".
39
- @available = false
30
+ @version = nil
31
+ @available = false # exiftool not on PATH
40
32
  end
41
33
 
42
34
  # Returns the version string, or nil if exiftool is missing/broken.
35
+ # Captured by `available?`, so this never re-runs the binary.
43
36
  def version
44
- return nil unless available?
45
-
46
- out, _err, status = Open3.capture3('exiftool', '-ver')
47
- status.success? ? out.strip : nil
48
- rescue Errno::ENOENT
49
- nil
50
- end
51
-
52
- # Hard-fail with a helpful install hint. Called from `read`/`strip!` before
53
- # any work, so users see one clear message instead of a low-level Errno.
54
- # The `<<~MSG ... MSG` is a "squiggly heredoc": leading indentation is
55
- # stripped automatically, so the output is left-aligned.
56
- def ensure_available!
57
- return if available?
58
-
59
- raise ExiftoolMissing, <<~MSG
60
- ExifTool is not installed or not on PATH.
61
-
62
- Install:
63
- macOS: brew install exiftool
64
- Debian: sudo apt install libimage-exiftool-perl
65
- Fedora: sudo dnf install perl-Image-ExifTool
66
- Arch: sudo pacman -S perl-image-exiftool
67
- Windows: scoop install exiftool (or download exiftool.org)
68
- MSG
37
+ available? ? @version : nil
69
38
  end
70
39
 
71
40
  # Reads metadata from a file and returns a flat Hash of "Group:Tag" => value.
72
41
  #
73
42
  # ExifTool flag glossary:
74
43
  # -j JSON output (machine-parseable)
75
- # -G1 Include the family-1 group name (e.g. "EXIF", "GPS", "IPTC")
44
+ # -G1 Include the family-1 group name. NB: with -G1 mainstream EXIF
45
+ # tags appear under "IFD0"/"ExifIFD"/"IFD1", not "EXIF" (that's
46
+ # the family-0 name); GPS/IPTC/XMP-dc keep those group names.
76
47
  # -a Allow duplicate tags (some formats have several with same name)
77
48
  # -u Include unknown/unidentified tags
78
49
  # -s Short tag names (no descriptions)
79
50
  # -n Numeric values (no human formatting like "1/100 sec")
80
51
  # -api largefilesupport=1 Allow files >4 GB
81
52
  def read(path)
82
- ensure_available!
83
53
  out, err, status = Open3.capture3(
84
- 'exiftool', '-j', '-G1', '-a', '-u', '-s', '-n', '-api', 'largefilesupport=1', path.to_s
54
+ 'exiftool', '-j', '-G1', '-a', '-u', '-s', '-n', '-api', 'largefilesupport=1',
55
+ Metaclean.safe_path(path)
85
56
  )
86
57
  raise Error, "ExifTool read failed: #{err.strip}" unless status.success?
87
58
 
88
59
  # ExifTool's JSON output is an array (one entry per file). We always
89
60
  # pass one file, so we take the first element. `|| {}` handles the
90
- # edge case where exiftool returns an empty array.
61
+ # edge case where exiftool returns an empty array. A non-array shape is
62
+ # unexpected — bail with a clear error instead of crashing later on
63
+ # `.first` returning a Hash/scalar.
91
64
  data = JSON.parse(out)
92
- data.first || {}
65
+ raise Error, 'Unexpected ExifTool output (expected a JSON array)' unless data.is_a?(Array)
66
+
67
+ scrub_encoding(data.first || {})
93
68
  rescue JSON::ParserError => e
94
69
  raise Error, "Could not parse ExifTool output: #{e.message}"
95
70
  end
96
71
 
97
- # Removes every removable tag, in place. Returns true on success.
98
- #
99
- # `-all=` is the magic incantation: it sets every tag to nothing (= empty),
100
- # which deletes them. `-overwrite_original` makes ExifTool replace the
101
- # file directly instead of writing `file_original` next to it.
102
- #
103
- # The optional `keep_*` flags are useful because:
104
- # * Orientation tells viewers how to rotate phone photos. Removing it
105
- # can show the picture sideways.
106
- # * ICC profile tells viewers which color space the image is in.
107
- # Removing it can shift colors.
108
- def strip!(path, keep_orientation: false, keep_color_profile: false)
109
- ensure_available!
110
-
111
- preserving = keep_orientation || keep_color_profile
112
- args = ['exiftool', '-all=']
113
-
114
- # `-tagsFromFile @` says "copy tags from the same file you're writing
115
- # to". That sounds redundant, but combined with `-all=` running first,
116
- # it means "delete everything, then re-add only the listed tags".
117
- if preserving
118
- args.concat(['-tagsFromFile', '@'])
119
- args << '-Orientation' if keep_orientation
120
- args << '-ICC_Profile' if keep_color_profile
72
+ # ExifTool labels its -j output UTF-8, but binary/odd tag values (UserComment,
73
+ # MakerNotes fragments, corrupt or hostile files) can carry invalid bytes. A
74
+ # later gsub (Display.format_value) raises on an invalid-encoding String and
75
+ # would crash the whole run, so replace bad bytes up front. This hash is only
76
+ # used for display/diff/residual checks — the actual strip operates on the
77
+ # file via the tools — so scrubbing is safe.
78
+ def scrub_encoding(obj)
79
+ case obj
80
+ when String then obj.valid_encoding? ? obj : obj.scrub
81
+ when Array then obj.map { |e| scrub_encoding(e) }
82
+ when Hash then obj.transform_values { |v| scrub_encoding(v) }
83
+ else obj
121
84
  end
122
- args.concat(['-overwrite_original', '-q', '-q', '-api', 'largefilesupport=1', path.to_s])
85
+ end
86
+
87
+ # ExifTool can READ many formats it cannot WRITE, and mat2 owns the strip for
88
+ # them: the ZIP-based documents (docx/xlsx/pptx/odt/ods/odp/odg/odf/epub) and
89
+ # the RIFF containers (avi/wav). ExifTool announces the inability with one of
90
+ # a few phrasings — "writing of X files is not yet supported", "does not yet
91
+ # support writing of …", or "Can't currently write RIFF … files" — so we
92
+ # match all of them. strip! returns :unsupported in these cases so the runner
93
+ # treats it as a soft skip (mat2 does the actual strip), NOT a pipeline
94
+ # failure that would wrongly pin an already-clean file at :unverified. This is
95
+ # safe because the post-strip residual re-read still gates the :cleaned status.
96
+ WRITE_UNSUPPORTED_RE = /not yet support|can't currently write|writing of .* files/i
97
+
98
+ # Removes every removable tag, in place. Returns true on success,
99
+ # :unsupported when ExifTool cannot write the format, and raises on failure.
100
+ #
101
+ # `-all=` sets every tag to empty, which deletes them. `-overwrite_original`
102
+ # makes ExifTool replace the file directly instead of writing `file_original`
103
+ # next to it. `-api largefilesupport=1` lets files larger than 4 GB through.
104
+ def strip!(path, also_delete: [])
105
+ # `-all=` clears metadata, but for TIFF/DNG ExifTool refuses to delete the
106
+ # IFD0 directory and leaves its tags (Artist, Software, …) behind. So we
107
+ # ALSO delete the known privacy tags by name and clear the GPS group: both
108
+ # are no-ops where `-all=` already removed them (e.g. JPEG), but they make
109
+ # the strip complete AND lossless (no re-encode) for IFD0-preserving formats.
110
+ args = ['exiftool', '-all=', '-gps:all=']
111
+ also_delete.each { |tag| args << "-#{tag}=" }
112
+ args.concat(['-overwrite_original', '-q', '-q', '-api', 'largefilesupport=1', Metaclean.safe_path(path)])
123
113
 
124
114
  _out, err, status = Open3.capture3(*args)
125
115
  return true if status.success?
116
+ return :unsupported if err.match?(WRITE_UNSUPPORTED_RE)
126
117
 
127
- # Some minimal/odd files reject the preserve-pass. Fall back to a plain
128
- # full strip — but only if we *were* preserving, otherwise the retry
129
- # would be identical to the failed attempt.
130
- raise Error, "ExifTool strip failed: #{err.strip}" unless preserving
131
-
132
- _out2, err2, status2 = Open3.capture3(
133
- 'exiftool', '-all=', '-overwrite_original', '-q', '-q', path.to_s
134
- )
135
- return true if status2.success?
136
-
137
- raise Error, "ExifTool strip failed: #{err2.strip.empty? ? err.strip : err2.strip}"
118
+ raise Error, "ExifTool strip failed: #{err.strip}"
138
119
  end
139
120
  end
140
121
  end
@@ -0,0 +1,84 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Wrapper around the external `ffmpeg` binary.
4
+ #
5
+ # ffmpeg is used for ONE job the other tools can't do: the Matroska containers
6
+ # (mkv/webm). ExifTool is read-only for Matroska, and mat2 has no Matroska
7
+ # parser, so without ffmpeg those formats can't be cleaned at all. ffmpeg
8
+ # rewrites the container while copying every stream verbatim (`-c copy`) — no
9
+ # re-encode, so the audio/video is bit-identical and only the metadata is gone.
10
+ #
11
+ # Like mat2, ffmpeg can't edit in place: it muxes to a new file. We write to a
12
+ # SecureRandom-named sibling and move it back over the source on success.
13
+
14
+ require 'open3'
15
+ require 'fileutils'
16
+ require 'securerandom'
17
+
18
+ module Metaclean
19
+ module Ffmpeg
20
+ module_function
21
+
22
+ # Memoized PATH check (same pattern as the other wrappers).
23
+ def available?
24
+ return @available if defined?(@available)
25
+
26
+ out, _err, status = Open3.capture3('ffmpeg', '-version')
27
+ @available = status.success?
28
+ # First line is "ffmpeg version 7.1.1 Copyright ..."; grab the 3rd token.
29
+ @version = @available ? out.lines.first.to_s.split[2] : nil
30
+ @available
31
+ rescue Errno::ENOENT
32
+ @version = nil
33
+ @available = false
34
+ end
35
+
36
+ def version
37
+ available? ? @version : nil
38
+ end
39
+
40
+ # Strips all metadata from `path` in place, losslessly. Returns true on
41
+ # success, raises Metaclean::Error on failure.
42
+ #
43
+ # -map 0 keep every stream (video, audio, subtitles)
44
+ # -map_metadata -1 drop global/container metadata
45
+ # -map_chapters -1 drop chapter markers (they can carry titles)
46
+ # -c copy remux without re-encoding — bit-identical streams
47
+ def strip!(path)
48
+ raise Error, 'ffmpeg not available' unless available?
49
+
50
+ tmp = tmp_path_for(path)
51
+ # Clear any stale temp from an earlier crashed run before muxing.
52
+ File.delete(tmp) if File.exist?(tmp)
53
+
54
+ _out, err, status = Open3.capture3(
55
+ 'ffmpeg', '-y', '-v', 'error', '-nostdin', '-i', file_url(path),
56
+ '-map', '0', '-map_metadata', '-1', '-map_chapters', '-1', '-c', 'copy',
57
+ file_url(tmp)
58
+ )
59
+ # ffmpeg can exit 0 yet write nothing on some odd inputs, so require the
60
+ # output to actually exist before we trust it and move it into place.
61
+ raise Error, "ffmpeg failed: #{err.strip}" unless status.success? && File.exist?(tmp)
62
+
63
+ FileUtils.mv(tmp, path)
64
+ true
65
+ ensure
66
+ # Interrupt-safety: drop the temp if we were killed between mux and rename.
67
+ # On the success path it's already moved, so this is a no-op.
68
+ File.delete(tmp) if tmp && File.exist?(tmp)
69
+ end
70
+
71
+ # Sibling temp with the SAME extension (ffmpeg picks the muxer from it) in
72
+ # the SAME directory (so the final rename is an atomic same-fs move). The
73
+ # ".metaclean.tmp." marker means Runner#skip? ignores any stray leftover.
74
+ def tmp_path_for(path)
75
+ dir = File.dirname(path)
76
+ ext = File.extname(path)
77
+ File.join(dir, "#{Metaclean::TMP_MARKER}ff.#{Process.pid}.#{SecureRandom.hex(8)}#{ext}")
78
+ end
79
+
80
+ def file_url(path)
81
+ "file:#{File.expand_path(path)}"
82
+ end
83
+ end
84
+ end
@@ -1,6 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # ───────────────────────────────────────────────────────────────────────────
4
3
  # Wrapper around the external `mat2` (Metadata Anonymisation Toolkit 2).
5
4
  #
6
5
  # mat2 is stricter than ExifTool on certain formats (DOCX/PDF/PNG): instead
@@ -10,7 +9,6 @@
10
9
  # mat2's CLI quirk: it does NOT overwrite the original. It writes a new file
11
10
  # named `<name>.cleaned.<ext>` next to it. We adapt by renaming that result
12
11
  # back over the source after a successful run.
13
- # ───────────────────────────────────────────────────────────────────────────
14
12
 
15
13
  require 'open3'
16
14
  require 'fileutils'
@@ -20,17 +18,22 @@ module Metaclean
20
18
  # File extensions we know mat2 can handle. Keep this list conservative —
21
19
  # if mat2 doesn't actually support an extension, the call will fail
22
20
  # gracefully via UNSUPPORTED_RE below, but we'd rather not even try.
21
+ # Deliberately ABSENT: Matroska (mkv/webm) — mat2 has no parser for it; ffmpeg
22
+ # owns those (Strategy::FFMPEG_FORMATS). QuickTime/MP4-audio (mov/m4a) — mat2
23
+ # can't write them and ExifTool already cleans them, so listing them only
24
+ # caused a wasted mat2 spawn that always soft-skipped. WMV (ASF) IS here on
25
+ # purpose: mat2 CAN write it but ExifTool can't, so mat2 is the only tool that
26
+ # cleans .wmv — dropping it would make every .wmv permanently :failed.
23
27
  SUPPORTED_EXTS = %w[
24
28
  pdf png jpg jpeg tif tiff gif bmp svg webp
25
- mp3 flac ogg opus wav m4a
26
- mp4 avi mkv mov wmv webm
29
+ mp3 flac ogg opus wav
30
+ mp4 avi wmv
27
31
  docx xlsx pptx odt ods odp odg odf epub
28
32
  zip torrent
29
33
  ].freeze
30
34
 
31
- # Regex matching the messages mat2 prints when it can't handle a file.
32
- # We use this to distinguish "soft skip" from a real error.
33
- # `i` flag = case-insensitive.
35
+ # Matches the messages mat2 prints when it can't handle a file — lets us
36
+ # distinguish a soft skip from a real error.
34
37
  UNSUPPORTED_RE = /(not supported|isn't supported|cannot be cleaned|unsupported file)/i.freeze
35
38
 
36
39
  module_function
@@ -39,21 +42,20 @@ module Metaclean
39
42
  def available?
40
43
  return @available if defined?(@available)
41
44
 
42
- _out, _err, status = Open3.capture3('mat2', '--version')
45
+ out, _err, status = Open3.capture3('mat2', '--version')
43
46
  @available = status.success?
47
+ # `mat2 --version` prints "mat2 0.14.0" — `.split.last` grabs the
48
+ # version number regardless of whatever prefix appears. Captured here
49
+ # so `version` reuses it instead of re-spawning the binary.
50
+ @version = @available ? out.strip.split.last : nil
51
+ @available
44
52
  rescue Errno::ENOENT
53
+ @version = nil
45
54
  @available = false
46
55
  end
47
56
 
48
57
  def version
49
- return nil unless available?
50
-
51
- out, _err, status = Open3.capture3('mat2', '--version')
52
- # `mat2 --version` prints "mat2 0.14.0" — `.split.last` grabs the
53
- # version number regardless of whatever prefix appears.
54
- status.success? ? out.strip.split.last : nil
55
- rescue Errno::ENOENT
56
- nil
58
+ available? ? @version : nil
57
59
  end
58
60
 
59
61
  # Quick check before we even try mat2 on a file. Used by Strategy to
@@ -61,7 +63,7 @@ module Metaclean
61
63
  def supports?(path)
62
64
  return false unless available?
63
65
 
64
- SUPPORTED_EXTS.include?(File.extname(path).downcase.delete('.'))
66
+ SUPPORTED_EXTS.include?(Metaclean.ext_of(path))
65
67
  end
66
68
 
67
69
  # Strips metadata from `path` in place. Returns:
@@ -76,38 +78,39 @@ module Metaclean
76
78
  raise Error, 'mat2 not available' unless available?
77
79
 
78
80
  cleaned = cleaned_path_for(path)
81
+ safe = Metaclean.safe_path(path)
79
82
 
80
83
  # Defensive: if a stale `<name>.cleaned.<ext>` exists from an earlier
81
84
  # crashed run, remove it so we don't accidentally use old data.
82
85
  File.delete(cleaned) if File.exist?(cleaned)
83
86
 
84
- out, err, status = Open3.capture3('mat2', path.to_s)
85
- combined = "#{out}\n#{err}"
87
+ out, err, status = Open3.capture3('mat2', safe)
86
88
 
87
- # Soft skip mat2 itself told us it can't process this file.
88
- # Defensive: if mat2 still wrote a partial `<name>.cleaned.<ext>`,
89
- # remove it so a later run doesn't pick up stale output.
90
- if combined.match?(UNSUPPORTED_RE)
91
- File.delete(cleaned) if File.exist?(cleaned)
92
- return :unsupported
93
- end
89
+ # Success path first. mat2 only creates `<name>.cleaned.<ext>` when it
90
+ # actually stripped something; no file after exit 0 means there was
91
+ # nothing to remove. We check exit status BEFORE the "unsupported"
92
+ # message so a successful run that merely warns about one embedded
93
+ # stream isn't misreported as a soft skip.
94
+ if status.success?
95
+ return :no_metadata unless File.exist?(cleaned)
94
96
 
95
- unless status.success?
96
- File.delete(cleaned) if File.exist?(cleaned)
97
- # `err.strip.empty? ? out.strip : err.strip` picks whichever stream
98
- # has actual content — some tools log to stdout, others to stderr.
99
- raise Error, "mat2 failed: #{err.strip.empty? ? out.strip : err.strip}"
97
+ FileUtils.mv(cleaned, safe)
98
+ return true
100
99
  end
101
100
 
102
- # mat2 only creates `<name>.cleaned.<ext>` when it actually stripped
103
- # something. If the file didn't exist after a successful run, there
104
- # was nothing to remove.
105
- if File.exist?(cleaned)
106
- FileUtils.mv(cleaned, path.to_s)
107
- true
108
- else
109
- :no_metadata
110
- end
101
+ # Failure path. A "not supported" message means a soft skip we report
102
+ # so the runner can continue with the next tool, not a hard error.
103
+ combined = "#{out}\n#{err}"
104
+ return :unsupported if combined.match?(UNSUPPORTED_RE)
105
+
106
+ # `err.strip.empty? ? out.strip : err.strip` picks whichever stream
107
+ # has actual content — some tools log to stdout, others to stderr.
108
+ raise Error, "mat2 failed: #{err.strip.empty? ? out.strip : err.strip}"
109
+ ensure
110
+ # Interrupt-safety: if we were killed (Ctrl-C) between mat2 writing
111
+ # `<name>.cleaned.<ext>` and the rename, don't leave the orphan behind.
112
+ # On the success path it's already moved, so this is a no-op.
113
+ File.delete(cleaned) if cleaned && File.exist?(cleaned)
111
114
  end
112
115
 
113
116
  # Builds the path mat2 will write to: `name.cleaned.ext`.