metaclean 1.0.2 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +134 -47
- data/bin/metaclean +1 -22
- data/lib/metaclean/cli.rb +42 -92
- data/lib/metaclean/display.rb +59 -40
- data/lib/metaclean/exiftool.rb +70 -89
- data/lib/metaclean/ffmpeg.rb +84 -0
- data/lib/metaclean/mat2.rb +43 -40
- data/lib/metaclean/qpdf.rb +29 -25
- data/lib/metaclean/runner.rb +317 -168
- data/lib/metaclean/strategy.rb +118 -39
- data/lib/metaclean/version.rb +1 -3
- data/lib/metaclean.rb +75 -26
- metadata +11 -8
data/lib/metaclean/runner.rb
CHANGED
|
@@ -1,56 +1,51 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
# ───────────────────────────────────────────────────────────────────────────
|
|
4
3
|
# The orchestrator. Given a list of paths and parsed CLI options, this class:
|
|
5
4
|
#
|
|
6
|
-
# 1. Expands paths into a flat list of files (handling directories
|
|
7
|
-
# recursion
|
|
5
|
+
# 1. Expands paths into a flat list of files (handling directories and
|
|
6
|
+
# recursion; symlinks are skipped).
|
|
8
7
|
# 2. Asks the user for confirmation (unless --force).
|
|
9
8
|
# 3. For each file, runs the strategy pipeline (mat2 / exiftool / qpdf)
|
|
10
9
|
# using the "atomic write" pattern so a crash never leaves a
|
|
11
10
|
# half-cleaned file.
|
|
12
11
|
# 4. Prints a before/after diff and a final summary.
|
|
13
|
-
# ───────────────────────────────────────────────────────────────────────────
|
|
14
12
|
|
|
15
13
|
require 'fileutils'
|
|
16
|
-
require '
|
|
17
|
-
require 'set'
|
|
18
|
-
require 'tmpdir'
|
|
14
|
+
require 'securerandom'
|
|
19
15
|
|
|
20
16
|
module Metaclean
|
|
21
17
|
class Runner
|
|
22
|
-
# Constructor — just stashes the options Hash. The CLI builds it.
|
|
23
18
|
def initialize(options)
|
|
24
19
|
@options = options
|
|
25
20
|
end
|
|
26
21
|
|
|
27
|
-
# ─────────────────────────────────────────────────────────────────
|
|
28
22
|
# Public entry points: one for `--inspect`, one for the cleaning flow.
|
|
29
|
-
# ─────────────────────────────────────────────────────────────────
|
|
30
23
|
|
|
31
24
|
def inspect_paths(paths)
|
|
32
25
|
files = expand_files(paths)
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
if @options[:format] == :json
|
|
37
|
-
out = files.map { |f| { file: f, metadata: Exiftool.read(f) } }
|
|
38
|
-
puts JSON.pretty_generate(out)
|
|
39
|
-
return
|
|
26
|
+
if files.empty?
|
|
27
|
+
Display.warning('No files to inspect.')
|
|
28
|
+
exit 1
|
|
40
29
|
end
|
|
41
|
-
|
|
42
|
-
# Human output: pretty header + grouped table per file.
|
|
43
30
|
files.each do |file|
|
|
44
31
|
Display.header "📄 #{file}"
|
|
45
32
|
meta = Exiftool.read(file)
|
|
46
33
|
Display.section "Metadata (#{Display.count_embedded(meta)} embedded tags)"
|
|
47
34
|
Display.metadata_table(meta)
|
|
35
|
+
rescue Error, SystemCallError => e
|
|
36
|
+
# One unreadable/odd file shouldn't abort inspecting the rest — mirrors
|
|
37
|
+
# the per-file rescue in the clean batch.
|
|
38
|
+
warn Display.error("#{file}: #{e.message}")
|
|
48
39
|
end
|
|
49
40
|
end
|
|
50
41
|
|
|
51
42
|
def clean_paths(paths)
|
|
52
43
|
files = expand_files(paths)
|
|
53
|
-
|
|
44
|
+
# See inspect_paths: nothing to act on is a non-zero condition, not success.
|
|
45
|
+
if files.empty?
|
|
46
|
+
Display.warning('No files to process.')
|
|
47
|
+
exit 1
|
|
48
|
+
end
|
|
54
49
|
|
|
55
50
|
announce_tools
|
|
56
51
|
|
|
@@ -59,79 +54,72 @@ module Metaclean
|
|
|
59
54
|
unless @options[:force] || @options[:dry_run]
|
|
60
55
|
action = @options[:in_place] ? 'OVERWRITE' : 'create cleaned copies of'
|
|
61
56
|
puts Display.c("About to #{action} #{files.size} file(s).", :yellow)
|
|
62
|
-
if @options[:in_place]
|
|
57
|
+
if @options[:in_place]
|
|
63
58
|
puts Display.c('Backups will be saved alongside as <file>.bak.', :gray)
|
|
64
59
|
end
|
|
65
60
|
print Display.c('Proceed? [y/N] ', :bold)
|
|
66
|
-
|
|
67
|
-
# (e.g. user hit Ctrl-D), the chain short-circuits to nil.
|
|
68
|
-
ans = $stdin.gets&.strip&.downcase
|
|
61
|
+
ans = $stdin.gets&.strip&.downcase # gets → nil on Ctrl-D
|
|
69
62
|
return Display.warning('Aborted.') unless %w[y yes].include?(ans)
|
|
70
63
|
end
|
|
71
64
|
|
|
72
|
-
summary = { cleaned: 0, failed: 0, removed_total: 0, residual_files: 0 }
|
|
65
|
+
summary = { cleaned: 0, unverified: 0, failed: 0, removed_total: 0, residual_files: 0 }
|
|
73
66
|
|
|
74
|
-
#
|
|
75
|
-
# to `clean_one` so it can render "[3/47]" in batch mode.
|
|
67
|
+
# index/total let clean_one render "[3/47]" in batch mode.
|
|
76
68
|
files.each_with_index do |file, idx|
|
|
77
69
|
result = clean_one(file, index: idx + 1, total: files.size)
|
|
78
70
|
summary[result[:status]] += 1
|
|
79
71
|
summary[:removed_total] += result[:removed].to_i
|
|
80
72
|
summary[:residual_files] += 1 if result[:residual].to_i.positive?
|
|
81
|
-
rescue Error => e
|
|
82
|
-
#
|
|
83
|
-
#
|
|
84
|
-
#
|
|
73
|
+
rescue Error, SystemCallError => e
|
|
74
|
+
# One bad file shouldn't abort the whole batch. SystemCallError
|
|
75
|
+
# (Errno::*: disk full, permission denied, read-only fs) is a SIBLING
|
|
76
|
+
# of our Error, not a subclass, so it must be named explicitly or it
|
|
77
|
+
# would escape this rescue and crash the run with a raw backtrace.
|
|
85
78
|
warn Display.error("#{file}: #{e.message}")
|
|
86
79
|
summary[:failed] += 1
|
|
87
80
|
end
|
|
88
81
|
|
|
89
82
|
print_summary(summary)
|
|
90
83
|
|
|
91
|
-
# Non-zero exit
|
|
92
|
-
exit 1 if
|
|
93
|
-
exit 1 if summary[:failed].positive?
|
|
84
|
+
# Non-zero exit so CI/scripts can detect a failed or not-fully-verified file.
|
|
85
|
+
exit 1 if summary[:failed].positive? || summary[:unverified].positive?
|
|
94
86
|
end
|
|
95
87
|
|
|
96
88
|
private
|
|
97
89
|
|
|
98
|
-
# ─────────────────────────────────────────────────────────────────
|
|
99
90
|
# Output helpers
|
|
100
|
-
# ─────────────────────────────────────────────────────────────────
|
|
101
91
|
|
|
102
92
|
def announce_tools
|
|
103
93
|
have = []
|
|
104
|
-
have << "exiftool #{Exiftool.version}"
|
|
105
|
-
have << "mat2 #{Mat2.version}"
|
|
106
|
-
have << "qpdf #{Qpdf.version
|
|
94
|
+
have << "exiftool #{Exiftool.version}" if Exiftool.available?
|
|
95
|
+
have << "mat2 #{Mat2.version}" if Mat2.available?
|
|
96
|
+
have << "qpdf #{Qpdf.version}" if Qpdf.available?
|
|
97
|
+
have << "ffmpeg #{Ffmpeg.version}" if Ffmpeg.available?
|
|
107
98
|
Display.info "Tools detected: #{have.join(', ')}"
|
|
108
99
|
Display.info '(dry-run — no files will be modified)' if @options[:dry_run]
|
|
109
100
|
end
|
|
110
101
|
|
|
111
|
-
#
|
|
112
|
-
# Cleaning a single file — the heart of the program.
|
|
113
|
-
# ─────────────────────────────────────────────────────────────────
|
|
114
|
-
|
|
102
|
+
# Cleaning a single file.
|
|
115
103
|
def clean_one(file, index:, total:)
|
|
116
104
|
prefix = total > 1 ? "[#{index}/#{total}] " : ''
|
|
117
105
|
Display.header "#{prefix}📄 #{file}"
|
|
118
106
|
|
|
119
107
|
# Read the "before" metadata FIRST — once we start cleaning, this is
|
|
120
108
|
# gone forever and we'd have nothing to diff against.
|
|
121
|
-
before =
|
|
109
|
+
before = read_metadata(file)
|
|
122
110
|
Display.section "Before (#{Display.count_embedded(before)} embedded tags)"
|
|
123
111
|
Display.metadata_table(before, only_embedded: true)
|
|
124
112
|
|
|
125
|
-
# Ask the strategy module which tools to run
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
113
|
+
# Ask the strategy module which tools to run for this file type.
|
|
114
|
+
tools = Strategy.tools_for(file)
|
|
115
|
+
# Warn when the stricter tool for a document format won't run: ExifTool
|
|
116
|
+
# alone leaves (and can't fully verify) document-internal metadata.
|
|
117
|
+
if Strategy.mat2_essential?(file) && !tools.include?(:mat2)
|
|
118
|
+
Display.warning 'mat2 will not run for this format — document-internal metadata may remain and cannot be verified.'
|
|
131
119
|
end
|
|
132
120
|
Display.info "Pipeline: #{tools.join(' → ')}"
|
|
133
121
|
|
|
134
|
-
#
|
|
122
|
+
# Atomic write setup:
|
|
135
123
|
# `final_path` = where the cleaned file will end up.
|
|
136
124
|
# `staging` = a temp file we mutate. After all tools succeed, we
|
|
137
125
|
# rename staging → final_path. If anything goes wrong
|
|
@@ -140,22 +128,32 @@ module Metaclean
|
|
|
140
128
|
final_path = resolve_final_path(file)
|
|
141
129
|
staging = staging_path_for(final_path)
|
|
142
130
|
|
|
143
|
-
FileUtils.cp(file, staging)
|
|
144
131
|
tool_results = []
|
|
145
132
|
begin
|
|
133
|
+
# TOCTOU guard: the path was a regular file at discovery (expand_files), but
|
|
134
|
+
# it could have been swapped for a symlink in the window since. Re-check
|
|
135
|
+
# right before we read/copy/back it up, so we never copy — or take a .bak —
|
|
136
|
+
# THROUGH a link pointing outside the intended scope. Bails to :failed.
|
|
137
|
+
raise Error, "#{file} became a symlink since discovery — refusing to clean it" if File.symlink?(file)
|
|
138
|
+
|
|
139
|
+
# The staging copy lives INSIDE the begin so the ensure below cleans up a
|
|
140
|
+
# partial temp if cp is interrupted (Ctrl-C) or fails mid-copy (disk full,
|
|
141
|
+
# read-only fs). cp only ever reads the original, so the source is intact
|
|
142
|
+
# regardless.
|
|
143
|
+
copy_file_exclusive(file, staging)
|
|
146
144
|
tools.each do |tool|
|
|
147
145
|
tool_results << run_tool(tool, staging)
|
|
148
146
|
end
|
|
149
147
|
|
|
150
148
|
# Re-read metadata of the cleaned staging file for the diff.
|
|
151
|
-
after =
|
|
149
|
+
after = read_metadata(staging)
|
|
152
150
|
Display.section "After (#{Display.count_embedded(after)} embedded tags)"
|
|
153
151
|
Display.metadata_table(after, only_embedded: true)
|
|
154
152
|
|
|
155
153
|
Display.section 'Diff'
|
|
156
154
|
Display.diff(before, after)
|
|
157
155
|
|
|
158
|
-
#
|
|
156
|
+
# Anything privacy-relevant that survived the strip.
|
|
159
157
|
residual = Strategy.privacy_residual(after)
|
|
160
158
|
if residual.any?
|
|
161
159
|
Display.warning "Privacy-relevant tags still present (#{residual.size}):"
|
|
@@ -166,14 +164,42 @@ module Metaclean
|
|
|
166
164
|
if @options[:dry_run]
|
|
167
165
|
File.delete(staging) if File.exist?(staging)
|
|
168
166
|
Display.info '(dry-run: nothing was written)'
|
|
169
|
-
return finalize_result(tool_results, before, after, residual)
|
|
167
|
+
return finalize_result(tool_results, before, after, residual, file: file)
|
|
170
168
|
end
|
|
171
169
|
|
|
172
|
-
#
|
|
173
|
-
|
|
174
|
-
|
|
170
|
+
# Never write output unless the file is genuinely clean: at least one
|
|
171
|
+
# tool ran AND no privacy-relevant tag survived. Otherwise the staging
|
|
172
|
+
# file — committed as a "_clean" copy or an in-place overwrite — would
|
|
173
|
+
# not actually be clean, the exact false-clean this tool exists to
|
|
174
|
+
# prevent. Bail to :failed and let the ensure block delete staging,
|
|
175
|
+
# leaving the original untouched.
|
|
176
|
+
unless cleaned?(tool_results, residual)
|
|
177
|
+
reason = tools_succeeded?(tool_results) ? 'Privacy-relevant tags survived' : 'All tools failed'
|
|
178
|
+
Display.warning "#{reason} — not writing output."
|
|
179
|
+
return finalize_result(tool_results, before, after, residual, file: file)
|
|
180
|
+
end
|
|
175
181
|
|
|
176
|
-
|
|
182
|
+
# Preserve the original's permission bits onto the cleaned output. cp and
|
|
183
|
+
# the tools' temp renames otherwise leave it at the umask default, which
|
|
184
|
+
# could widen a locked-down 0600 file to 0644 — a leak for a privacy tool.
|
|
185
|
+
File.chmod(File.stat(file).mode, staging)
|
|
186
|
+
|
|
187
|
+
# In-place clean of a hard-linked file only re-points THIS name (rename) at
|
|
188
|
+
# the freshly-cleaned inode; the file's other names still point at the
|
|
189
|
+
# original, metadata-bearing inode. This name is genuinely clean, but warn
|
|
190
|
+
# so the user knows the other links aren't covered by the run.
|
|
191
|
+
warn_if_hardlinked(file) if @options[:in_place]
|
|
192
|
+
|
|
193
|
+
# Commit: move/link staging → final_path (backing up the original in place).
|
|
194
|
+
final_path = commit!(staging, final_path)
|
|
195
|
+
result = finalize_result(tool_results, before, after, residual, file: file)
|
|
196
|
+
if result[:status] == :unverified
|
|
197
|
+
reason = tool_errored?(tool_results) ? 'a tool in the pipeline failed' : 'mat2 did not run on this format'
|
|
198
|
+
Display.warning "→ #{final_path} (unverified — #{reason})"
|
|
199
|
+
else
|
|
200
|
+
Display.success "→ #{final_path}"
|
|
201
|
+
end
|
|
202
|
+
result
|
|
177
203
|
ensure
|
|
178
204
|
# Last-resort cleanup. If `commit!` already moved the staging file,
|
|
179
205
|
# `File.exist?(staging)` is false and this is a no-op. The path-
|
|
@@ -183,16 +209,32 @@ module Metaclean
|
|
|
183
209
|
end
|
|
184
210
|
end
|
|
185
211
|
|
|
212
|
+
# Warn when an in-place target has more than one hard link: a rename only
|
|
213
|
+
# cleans the named link, leaving the others pointing at the original metadata.
|
|
214
|
+
def warn_if_hardlinked(file)
|
|
215
|
+
nlink = File.stat(file).nlink
|
|
216
|
+
return unless nlink > 1
|
|
217
|
+
|
|
218
|
+
Display.warning "#{file} has #{nlink} hard links — only this name is cleaned; " \
|
|
219
|
+
"the other #{nlink - 1} still contain the original metadata."
|
|
220
|
+
end
|
|
221
|
+
|
|
186
222
|
# Dispatches to the right wrapper module. Returns a small Hash so the
|
|
187
223
|
# caller can summarize tool-by-tool success/failure.
|
|
188
224
|
def run_tool(tool, path)
|
|
189
225
|
case tool
|
|
190
226
|
when :exiftool
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
227
|
+
# :unsupported means ExifTool can read but not write this format (a
|
|
228
|
+
# ZIP-based document mat2 owns) — a soft skip, NOT a pipeline failure.
|
|
229
|
+
# Pass the privacy tag names so TIFF/DNG IFD0 tags `-all=` won't drop
|
|
230
|
+
# still get deleted (losslessly).
|
|
231
|
+
if Exiftool.strip!(path, also_delete: Strategy::PRIVACY_TAGS) == :unsupported
|
|
232
|
+
Display.info ' · exiftool (read-only for this format, skipped)'
|
|
233
|
+
{ tool: :exiftool, ok: false, skipped: true, note: :unsupported }
|
|
234
|
+
else
|
|
235
|
+
Display.info ' ✓ exiftool'
|
|
236
|
+
{ tool: :exiftool, ok: true }
|
|
237
|
+
end
|
|
196
238
|
when :mat2
|
|
197
239
|
result = Mat2.strip!(path)
|
|
198
240
|
# mat2 returns either `true` (success) or a symbol indicating a
|
|
@@ -214,63 +256,183 @@ module Metaclean
|
|
|
214
256
|
Qpdf.rebuild!(path)
|
|
215
257
|
Display.info ' ✓ qpdf'
|
|
216
258
|
{ tool: :qpdf, ok: true }
|
|
259
|
+
when :ffmpeg
|
|
260
|
+
# Matroska remux. A failure raises and is caught below (→ not written).
|
|
261
|
+
Ffmpeg.strip!(path)
|
|
262
|
+
Display.info ' ✓ ffmpeg'
|
|
263
|
+
{ tool: :ffmpeg, ok: true }
|
|
217
264
|
end
|
|
218
|
-
rescue Error => e
|
|
265
|
+
rescue Error, SystemCallError => e
|
|
219
266
|
# One tool failing shouldn't abort the pipeline — we want to keep
|
|
220
267
|
# trying with the others. The `finalize_result` step decides whether
|
|
221
|
-
# the overall file counts as cleaned or failed.
|
|
222
|
-
|
|
268
|
+
# the overall file counts as cleaned or failed. `SystemCallError`
|
|
269
|
+
# (Errno::*) covers a tool wrapper's internal FileUtils.mv/File.delete
|
|
270
|
+
# raising on permission/quota/disk errors — without it those would
|
|
271
|
+
# escape and crash the batch.
|
|
272
|
+
# Collapse whitespace and bound the length: some tools (notably mat2) dump a
|
|
273
|
+
# multi-line Python traceback on failure, which would otherwise flood the
|
|
274
|
+
# diff. One readable line is enough — re-run the tool directly to debug.
|
|
275
|
+
msg = Display.truncate(e.message.gsub(/\s+/, ' ').strip, 200)
|
|
276
|
+
Display.warning " ✗ #{tool}: #{msg} — continuing"
|
|
223
277
|
{ tool: tool, ok: false, error: e.message }
|
|
224
278
|
end
|
|
225
279
|
|
|
226
|
-
|
|
280
|
+
# :cleaned needs ALL of: a tool genuinely ran, no privacy residual survived,
|
|
281
|
+
# no pipeline tool errored, AND — for a format where mat2 owns coverage
|
|
282
|
+
# ExifTool can't re-read (Office/PDF doc internals) — mat2 actually ran. A
|
|
283
|
+
# tool that errored, or an absent mat2 on a document format, means the
|
|
284
|
+
# pipeline didn't fully complete and the residual check is partly blind, so
|
|
285
|
+
# the result is :unverified, not a confident :cleaned. `file` is needed only
|
|
286
|
+
# for that mat2-coverage check.
|
|
287
|
+
def finalize_result(tool_results, before, after, residual, file: nil)
|
|
227
288
|
removed = removed_embedded_count(before, after)
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
{ status: status,
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
289
|
+
status = if !cleaned?(tool_results, residual)
|
|
290
|
+
:failed
|
|
291
|
+
elsif !tool_errored?(tool_results) && !mat2_coverage_gap?(tool_results, file)
|
|
292
|
+
:cleaned
|
|
293
|
+
else
|
|
294
|
+
:unverified
|
|
295
|
+
end
|
|
296
|
+
{ status: status, removed: removed, residual: residual.size }
|
|
297
|
+
end
|
|
298
|
+
|
|
299
|
+
# mat2 is essential for this format (Office/PDF internals ExifTool can't
|
|
300
|
+
# strip or fully re-read) but did NOT actually run and strip — absent,
|
|
301
|
+
# unsupported soft-skip, or errored. The residual check can't confirm the
|
|
302
|
+
# clean, so don't report a confident :cleaned.
|
|
303
|
+
def mat2_coverage_gap?(tool_results, file)
|
|
304
|
+
return false unless file && Strategy.mat2_essential?(file)
|
|
305
|
+
|
|
306
|
+
tool_results.none? { |r| r[:tool] == :mat2 && r[:ok] && !r[:skipped] }
|
|
307
|
+
end
|
|
308
|
+
|
|
309
|
+
# A file is genuinely cleaned only when at least one tool actually ran
|
|
310
|
+
# (not just a mat2 :unsupported soft-skip) AND no privacy-relevant tag
|
|
311
|
+
# survived. Both the commit gate and the final status use this ONE
|
|
312
|
+
# predicate, so they can never disagree — we never write a "_clean" copy
|
|
313
|
+
# (or overwrite an original in place) and then report it :failed. Silently
|
|
314
|
+
# marking a file clean while sensitive metadata is still present is the
|
|
315
|
+
# worst possible outcome for a privacy tool.
|
|
316
|
+
def cleaned?(tool_results, residual)
|
|
317
|
+
tools_succeeded?(tool_results) && residual.empty?
|
|
318
|
+
end
|
|
319
|
+
|
|
320
|
+
# Did at least one tool genuinely run (not a mat2 :unsupported soft-skip)?
|
|
321
|
+
def tools_succeeded?(tool_results)
|
|
322
|
+
tool_results.any? { |r| r[:ok] && !r[:skipped] }
|
|
323
|
+
end
|
|
324
|
+
|
|
325
|
+
# Did a tool that was meant to run error out (not a mat2 :unsupported
|
|
326
|
+
# soft-skip)? Even with an empty residual that means the pipeline didn't
|
|
327
|
+
# fully complete, so the clean can't be reported as a confident :cleaned.
|
|
328
|
+
def tool_errored?(tool_results)
|
|
329
|
+
tool_results.any? { |r| !r[:ok] && !r[:skipped] }
|
|
330
|
+
end
|
|
331
|
+
|
|
332
|
+
# Read metadata for the before/after diff. ensure_tools! guarantees exiftool
|
|
333
|
+
# is present before any run.
|
|
334
|
+
def read_metadata(path)
|
|
335
|
+
Exiftool.read(path)
|
|
239
336
|
end
|
|
240
337
|
|
|
241
338
|
def removed_embedded_count(before, after)
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
339
|
+
before.keys.count { |key| Display.embedded_key?(key) && !after.key?(key) }
|
|
340
|
+
end
|
|
341
|
+
|
|
342
|
+
# Path helpers — figuring out where to stage and where to commit.
|
|
246
343
|
|
|
247
|
-
|
|
344
|
+
def commit!(staging, final_path)
|
|
345
|
+
committed = false
|
|
346
|
+
# Make a backup of the original BEFORE we overwrite it. The order matters:
|
|
347
|
+
# if the rename below fails, the backup still exists.
|
|
348
|
+
if @options[:in_place]
|
|
349
|
+
backup = copy_with_collision_safe_name(final_path, "#{final_path}.bak")
|
|
350
|
+
# File.rename, NOT FileUtils.mv. staging is in the same dir as final_path
|
|
351
|
+
# (staging_path_for), so this is always an atomic same-fs swap. FileUtils.mv
|
|
352
|
+
# would, on EPERM (e.g. a sticky /tmp file not owned by us) or EXDEV, fall
|
|
353
|
+
# back to a TRUNCATING copy of the original — and an interrupt mid-copy
|
|
354
|
+
# would corrupt the original while the rescue below deleted the only backup.
|
|
355
|
+
# File.rename raises BEFORE touching final_path, so the rescue's
|
|
356
|
+
# "staging still exists ⇒ original intact" assumption always holds.
|
|
357
|
+
File.rename(staging, final_path)
|
|
358
|
+
committed = true
|
|
359
|
+
return final_path
|
|
248
360
|
end
|
|
361
|
+
|
|
362
|
+
link_with_collision_safe_name(staging, final_path)
|
|
363
|
+
rescue SystemCallError, Interrupt
|
|
364
|
+
# The rename failed (disk full, read-only fs, cross-device) OR the user hit
|
|
365
|
+
# Ctrl-C in the window after the .bak was written but before the mv. Either
|
|
366
|
+
# way the original is untouched, so the .bak is a redundant copy of it —
|
|
367
|
+
# remove it instead of leaving a stray file behind, then re-raise (a
|
|
368
|
+
# SystemCallError is reported per-file as failed; an Interrupt propagates to
|
|
369
|
+
# the CLI's exit-130 handler).
|
|
370
|
+
File.delete(backup) if backup && !committed && File.exist?(staging) && File.exist?(backup)
|
|
371
|
+
raise
|
|
249
372
|
end
|
|
250
373
|
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
374
|
+
def link_with_collision_safe_name(staging, preferred)
|
|
375
|
+
target = preferred
|
|
376
|
+
loop do
|
|
377
|
+
File.link(staging, target)
|
|
378
|
+
File.delete(staging)
|
|
379
|
+
return target
|
|
380
|
+
rescue Errno::EEXIST
|
|
381
|
+
target = collision_safe(target)
|
|
382
|
+
rescue Errno::EACCES, Errno::EPERM, Errno::ENOTSUP, NotImplementedError
|
|
383
|
+
# The filesystem can't hard-link (Linux returns EPERM; macOS/BSD on
|
|
384
|
+
# exFAT/FAT/SMB returns ENOTSUP/EOPNOTSUPP — same Errno class). Fall back
|
|
385
|
+
# to a plain exclusive copy so removable/network drives still clean.
|
|
386
|
+
target = copy_with_collision_safe_name(staging, target)
|
|
387
|
+
File.delete(staging)
|
|
388
|
+
return target
|
|
389
|
+
end
|
|
390
|
+
end
|
|
391
|
+
|
|
392
|
+
def copy_with_collision_safe_name(src, preferred)
|
|
393
|
+
target = preferred
|
|
394
|
+
loop do
|
|
395
|
+
copy_file_exclusive(src, target, preserve: true)
|
|
396
|
+
return target
|
|
397
|
+
rescue Errno::EEXIST
|
|
398
|
+
target = collision_safe(target)
|
|
265
399
|
end
|
|
266
|
-
|
|
400
|
+
end
|
|
401
|
+
|
|
402
|
+
def copy_file_exclusive(src, dest, preserve: false)
|
|
403
|
+
src_stat = File.lstat(src)
|
|
404
|
+
raise Error, "#{src} is a symlink — refusing to copy it" if src_stat.symlink?
|
|
405
|
+
|
|
406
|
+
mode = src_stat.mode & 0o7777
|
|
407
|
+
created = false
|
|
408
|
+
File.open(dest, File::WRONLY | File::CREAT | File::EXCL, mode) do |out|
|
|
409
|
+
created = true
|
|
410
|
+
File.open(src, 'rb') do |input|
|
|
411
|
+
opened = input.stat
|
|
412
|
+
unless opened.dev == src_stat.dev && opened.ino == src_stat.ino
|
|
413
|
+
raise Error, "#{src} changed while opening — refusing to copy it"
|
|
414
|
+
end
|
|
415
|
+
|
|
416
|
+
IO.copy_stream(input, out)
|
|
417
|
+
end
|
|
418
|
+
end
|
|
419
|
+
return unless preserve
|
|
420
|
+
|
|
421
|
+
# Best-effort: the bytes are already fully copied, so a failed mode/timestamp
|
|
422
|
+
# restore (e.g. utime/chmod on a FAT/exFAT mount) must NOT discard an
|
|
423
|
+
# otherwise-complete file by falling into the delete-on-error rescue below.
|
|
424
|
+
begin
|
|
425
|
+
File.chmod(mode, dest)
|
|
426
|
+
File.utime(src_stat.atime, src_stat.mtime, dest)
|
|
427
|
+
rescue SystemCallError
|
|
428
|
+
nil
|
|
429
|
+
end
|
|
430
|
+
rescue StandardError, Interrupt
|
|
431
|
+
File.delete(dest) if created && dest && File.exist?(dest)
|
|
432
|
+
raise
|
|
267
433
|
end
|
|
268
434
|
|
|
269
435
|
def resolve_final_path(file)
|
|
270
|
-
# When following a symlink with --in-place, we want to overwrite the
|
|
271
|
-
# *target* of the link, not replace the link itself with a regular
|
|
272
|
-
# file. `realpath` resolves through the link.
|
|
273
|
-
return File.realpath(file) if @options[:in_place] && File.symlink?(file)
|
|
274
436
|
return file if @options[:in_place]
|
|
275
437
|
|
|
276
438
|
# Default: write `<name>_clean.<ext>` next to the original. If it
|
|
@@ -281,7 +443,7 @@ module Metaclean
|
|
|
281
443
|
def build_clean_path(file)
|
|
282
444
|
ext = File.extname(file)
|
|
283
445
|
base = File.basename(file, ext)
|
|
284
|
-
File.join(File.dirname(file), "#{base}
|
|
446
|
+
File.join(File.dirname(file), "#{base}#{Metaclean::CLEAN_SUFFIX}#{ext}")
|
|
285
447
|
end
|
|
286
448
|
|
|
287
449
|
# Staging path lives in the same directory as the destination so that
|
|
@@ -290,13 +452,15 @@ module Metaclean
|
|
|
290
452
|
# The original extension is preserved as the LAST segment so tools like
|
|
291
453
|
# mat2 — which dispatch on file extension — see the real type.
|
|
292
454
|
def staging_path_for(final_path)
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
455
|
+
dir = File.dirname(final_path)
|
|
456
|
+
ext = File.extname(final_path)
|
|
457
|
+
# SecureRandom (not rand) makes the staging name unpredictable, so a
|
|
458
|
+
# hostile process in the same directory can't pre-create it as a symlink
|
|
459
|
+
# that `FileUtils.cp` would copy the (still-sensitive) original through.
|
|
460
|
+
File.join(dir, "#{Metaclean::TMP_MARKER}#{Process.pid}.#{SecureRandom.hex(8)}#{ext}")
|
|
296
461
|
end
|
|
297
462
|
|
|
298
|
-
# If `path` is taken,
|
|
299
|
-
# one. `loop do … end` runs forever; we `return` out of it.
|
|
463
|
+
# If `path` is taken, try `path_1`, `path_2`, … until one is free.
|
|
300
464
|
def collision_safe(path)
|
|
301
465
|
return path unless File.exist?(path)
|
|
302
466
|
|
|
@@ -312,19 +476,12 @@ module Metaclean
|
|
|
312
476
|
end
|
|
313
477
|
end
|
|
314
478
|
|
|
315
|
-
# Translates the on/off CLI flags into a "prefer" hash that Strategy
|
|
316
|
-
# understands. Keeping this as one method makes the wiring obvious.
|
|
317
|
-
def tool_prefs
|
|
318
|
-
{
|
|
319
|
-
mat2: !@options[:no_mat2] && !@options[:exiftool_only],
|
|
320
|
-
qpdf: !@options[:no_qpdf] && !@options[:exiftool_only],
|
|
321
|
-
exiftool: !@options[:no_exiftool]
|
|
322
|
-
}
|
|
323
|
-
end
|
|
324
|
-
|
|
325
479
|
def print_summary(summary)
|
|
326
480
|
Display.header 'Summary'
|
|
327
481
|
Display.success "Cleaned: #{summary[:cleaned]} file(s)"
|
|
482
|
+
if summary[:unverified].positive?
|
|
483
|
+
Display.warning "Unverified (clean could not be confirmed): #{summary[:unverified]} file(s)"
|
|
484
|
+
end
|
|
328
485
|
puts Display.error("Failed: #{summary[:failed]}") if summary[:failed].positive?
|
|
329
486
|
Display.info "Total embedded tags removed: #{summary[:removed_total]}"
|
|
330
487
|
if summary[:residual_files].positive?
|
|
@@ -332,18 +489,16 @@ module Metaclean
|
|
|
332
489
|
end
|
|
333
490
|
end
|
|
334
491
|
|
|
335
|
-
# ─────────────────────────────────────────────────────────────────
|
|
336
492
|
# File discovery — turning the user's paths into a flat list.
|
|
337
|
-
# ─────────────────────────────────────────────────────────────────
|
|
338
493
|
|
|
339
494
|
def expand_files(paths)
|
|
340
495
|
explicit = []
|
|
341
496
|
discovered = []
|
|
342
497
|
paths.each do |p|
|
|
343
|
-
# Symlinks are skipped
|
|
344
|
-
#
|
|
345
|
-
if File.symlink?(p)
|
|
346
|
-
Display.warning "Skipping symlink: #{p}
|
|
498
|
+
# Symlinks are always skipped — avoids cleaning something through a link
|
|
499
|
+
# that points outside the intended scope.
|
|
500
|
+
if File.symlink?(p)
|
|
501
|
+
Display.warning "Skipping symlink: #{p}"
|
|
347
502
|
next
|
|
348
503
|
end
|
|
349
504
|
if File.directory?(p)
|
|
@@ -358,9 +513,7 @@ module Metaclean
|
|
|
358
513
|
end
|
|
359
514
|
end
|
|
360
515
|
discovered.reject! { |f| skip?(f) }
|
|
361
|
-
|
|
362
|
-
result.select! { |f| type_allowed?(f) } if @options[:types]
|
|
363
|
-
dedupe_by_realpath(result)
|
|
516
|
+
dedupe_by_realpath(explicit + discovered)
|
|
364
517
|
end
|
|
365
518
|
|
|
366
519
|
# Same file via two different paths (or via symlink + direct path) should
|
|
@@ -369,11 +522,7 @@ module Metaclean
|
|
|
369
522
|
def dedupe_by_realpath(paths)
|
|
370
523
|
seen = {}
|
|
371
524
|
paths.each_with_object([]) do |p, acc|
|
|
372
|
-
key =
|
|
373
|
-
File.realpath(p)
|
|
374
|
-
rescue StandardError
|
|
375
|
-
p
|
|
376
|
-
end
|
|
525
|
+
key = safe_realpath(p)
|
|
377
526
|
next if seen[key]
|
|
378
527
|
|
|
379
528
|
seen[key] = true
|
|
@@ -381,50 +530,54 @@ module Metaclean
|
|
|
381
530
|
end
|
|
382
531
|
end
|
|
383
532
|
|
|
533
|
+
# File.realpath, falling back to the raw path when it can't resolve
|
|
534
|
+
# (broken symlink, permission denied) instead of raising.
|
|
535
|
+
def safe_realpath(path)
|
|
536
|
+
File.realpath(path)
|
|
537
|
+
rescue StandardError
|
|
538
|
+
path
|
|
539
|
+
end
|
|
540
|
+
|
|
384
541
|
def collect_dir(dir, out)
|
|
385
542
|
if @options[:recursive]
|
|
386
|
-
walk_recursive(dir, out
|
|
543
|
+
walk_recursive(dir, out)
|
|
387
544
|
else
|
|
388
|
-
# Non-recursive: just the immediate children of `dir`.
|
|
389
|
-
Dir.glob(
|
|
390
|
-
|
|
545
|
+
# Non-recursive: just the immediate children of `dir`. Use Dir.children,
|
|
546
|
+
# NOT Dir.glob("#{dir}/*") — glob interprets the WHOLE pattern, so a
|
|
547
|
+
# directory name containing glob metacharacters (e.g. "Holiday [2024]")
|
|
548
|
+
# matches nothing and the entire folder is silently skipped. Dir.children
|
|
549
|
+
# surfaces dotfiles too; skip? filters them later, same as walk_recursive.
|
|
550
|
+
Dir.children(dir).each do |entry|
|
|
551
|
+
sub = File.join(dir, entry)
|
|
552
|
+
next if File.symlink?(sub)
|
|
391
553
|
|
|
392
554
|
out << sub if File.file?(sub)
|
|
393
555
|
end
|
|
394
556
|
end
|
|
557
|
+
rescue SystemCallError => e
|
|
558
|
+
# Any Errno (EACCES/ENOENT/ENOTDIR from a dir replaced mid-scan, EIO, …):
|
|
559
|
+
# warn and skip this directory so one bad entry doesn't abort discovery of
|
|
560
|
+
# the rest of the batch.
|
|
561
|
+
Display.warning "Skipping #{dir}: #{e.message}"
|
|
395
562
|
end
|
|
396
563
|
|
|
397
|
-
# Manual recursive walker.
|
|
398
|
-
#
|
|
399
|
-
|
|
400
|
-
# eventually points at one of its ancestors.
|
|
401
|
-
def walk_recursive(dir, out, visited)
|
|
402
|
-
real = begin
|
|
403
|
-
File.realpath(dir)
|
|
404
|
-
rescue StandardError
|
|
405
|
-
dir
|
|
406
|
-
end
|
|
407
|
-
return if visited.include?(real)
|
|
408
|
-
|
|
409
|
-
visited << real
|
|
410
|
-
|
|
564
|
+
# Manual recursive walker. Symlinks are always skipped (never followed), so
|
|
565
|
+
# the real directory tree is acyclic and no loop-guard is needed.
|
|
566
|
+
def walk_recursive(dir, out)
|
|
411
567
|
Dir.each_child(dir) do |entry|
|
|
412
568
|
sub = File.join(dir, entry)
|
|
413
|
-
if File.symlink?(sub)
|
|
414
|
-
next unless @options[:follow_symlinks]
|
|
569
|
+
next if File.symlink?(sub)
|
|
415
570
|
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
elsif File.file?(sub)
|
|
419
|
-
out << sub
|
|
420
|
-
end
|
|
421
|
-
elsif File.directory?(sub)
|
|
422
|
-
walk_recursive(sub, out, visited)
|
|
571
|
+
if File.directory?(sub)
|
|
572
|
+
walk_recursive(sub, out)
|
|
423
573
|
elsif File.file?(sub)
|
|
424
574
|
out << sub
|
|
425
575
|
end
|
|
426
576
|
end
|
|
427
|
-
rescue
|
|
577
|
+
rescue SystemCallError => e
|
|
578
|
+
# Any Errno (EACCES/ENOENT/ENOTDIR from a dir replaced mid-scan, EIO, …):
|
|
579
|
+
# warn and skip this directory so one bad entry doesn't abort discovery of
|
|
580
|
+
# the rest of the batch.
|
|
428
581
|
Display.warning "Skipping #{dir}: #{e.message}"
|
|
429
582
|
end
|
|
430
583
|
|
|
@@ -437,15 +590,11 @@ module Metaclean
|
|
|
437
590
|
base = File.basename(file)
|
|
438
591
|
return true if base.start_with?('.')
|
|
439
592
|
return true if base.end_with?('.bak')
|
|
440
|
-
return true if base =~
|
|
441
|
-
|
|
593
|
+
return true if base =~ Metaclean::CLEAN_OUTPUT_RE
|
|
594
|
+
# Matches our staging temps regardless of the pid/random suffix format.
|
|
595
|
+
return true if base.include?(Metaclean::TMP_MARKER)
|
|
442
596
|
|
|
443
597
|
false
|
|
444
598
|
end
|
|
445
|
-
|
|
446
|
-
def type_allowed?(file)
|
|
447
|
-
ext = File.extname(file).downcase.delete('.')
|
|
448
|
-
@options[:types].include?(ext)
|
|
449
|
-
end
|
|
450
599
|
end
|
|
451
600
|
end
|