metaclean 1.0.2 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,56 +1,51 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # ───────────────────────────────────────────────────────────────────────────
4
3
  # The orchestrator. Given a list of paths and parsed CLI options, this class:
5
4
  #
6
- # 1. Expands paths into a flat list of files (handling directories,
7
- # recursion, symlinks, type filters).
5
+ # 1. Expands paths into a flat list of files (handling directories and
6
+ # recursion; symlinks are skipped).
8
7
  # 2. Asks the user for confirmation (unless --force).
9
8
  # 3. For each file, runs the strategy pipeline (mat2 / exiftool / qpdf)
10
9
  # using the "atomic write" pattern so a crash never leaves a
11
10
  # half-cleaned file.
12
11
  # 4. Prints a before/after diff and a final summary.
13
- # ───────────────────────────────────────────────────────────────────────────
14
12
 
15
13
  require 'fileutils'
16
- require 'json'
17
- require 'set'
18
- require 'tmpdir'
14
+ require 'securerandom'
19
15
 
20
16
  module Metaclean
21
17
  class Runner
22
- # Constructor — just stashes the options Hash. The CLI builds it.
23
18
  def initialize(options)
24
19
  @options = options
25
20
  end
26
21
 
27
- # ─────────────────────────────────────────────────────────────────
28
22
  # Public entry points: one for `--inspect`, one for the cleaning flow.
29
- # ─────────────────────────────────────────────────────────────────
30
23
 
31
24
  def inspect_paths(paths)
32
25
  files = expand_files(paths)
33
- return Display.warning('No files to inspect.') if files.empty?
34
-
35
- # `--json`: machine output, no colors, suitable for piping.
36
- if @options[:format] == :json
37
- out = files.map { |f| { file: f, metadata: Exiftool.read(f) } }
38
- puts JSON.pretty_generate(out)
39
- return
26
+ if files.empty?
27
+ Display.warning('No files to inspect.')
28
+ exit 1
40
29
  end
41
-
42
- # Human output: pretty header + grouped table per file.
43
30
  files.each do |file|
44
31
  Display.header "📄 #{file}"
45
32
  meta = Exiftool.read(file)
46
33
  Display.section "Metadata (#{Display.count_embedded(meta)} embedded tags)"
47
34
  Display.metadata_table(meta)
35
+ rescue Error, SystemCallError => e
36
+ # One unreadable/odd file shouldn't abort inspecting the rest — mirrors
37
+ # the per-file rescue in the clean batch.
38
+ warn Display.error("#{file}: #{e.message}")
48
39
  end
49
40
  end
50
41
 
51
42
  def clean_paths(paths)
52
43
  files = expand_files(paths)
53
- return Display.warning('No files to process.') if files.empty?
44
+ # See inspect_paths: nothing to act on is a non-zero condition, not success.
45
+ if files.empty?
46
+ Display.warning('No files to process.')
47
+ exit 1
48
+ end
54
49
 
55
50
  announce_tools
56
51
 
@@ -59,79 +54,72 @@ module Metaclean
59
54
  unless @options[:force] || @options[:dry_run]
60
55
  action = @options[:in_place] ? 'OVERWRITE' : 'create cleaned copies of'
61
56
  puts Display.c("About to #{action} #{files.size} file(s).", :yellow)
62
- if @options[:in_place] && !@options[:no_backup]
57
+ if @options[:in_place]
63
58
  puts Display.c('Backups will be saved alongside as <file>.bak.', :gray)
64
59
  end
65
60
  print Display.c('Proceed? [y/N] ', :bold)
66
- # `&.` is the safe-navigation operator: if `gets` returns nil
67
- # (e.g. user hit Ctrl-D), the chain short-circuits to nil.
68
- ans = $stdin.gets&.strip&.downcase
61
+ ans = $stdin.gets&.strip&.downcase # gets nil on Ctrl-D
69
62
  return Display.warning('Aborted.') unless %w[y yes].include?(ans)
70
63
  end
71
64
 
72
- summary = { cleaned: 0, failed: 0, removed_total: 0, residual_files: 0 }
65
+ summary = { cleaned: 0, unverified: 0, failed: 0, removed_total: 0, residual_files: 0 }
73
66
 
74
- # `each_with_index` gives us the file AND its position. We pass both
75
- # to `clean_one` so it can render "[3/47]" in batch mode.
67
+ # index/total let clean_one render "[3/47]" in batch mode.
76
68
  files.each_with_index do |file, idx|
77
69
  result = clean_one(file, index: idx + 1, total: files.size)
78
70
  summary[result[:status]] += 1
79
71
  summary[:removed_total] += result[:removed].to_i
80
72
  summary[:residual_files] += 1 if result[:residual].to_i.positive?
81
- rescue Error => e
82
- # Block-level rescue (Ruby 2.5+). Catches errors from `clean_one`
83
- # without aborting the whole batch one bad file shouldn't stop
84
- # the next 99 from being cleaned.
73
+ rescue Error, SystemCallError => e
74
+ # One bad file shouldn't abort the whole batch. SystemCallError
75
+ # (Errno::*: disk full, permission denied, read-only fs) is a SIBLING
76
+ # of our Error, not a subclass, so it must be named explicitly or it
77
+ # would escape this rescue and crash the run with a raw backtrace.
85
78
  warn Display.error("#{file}: #{e.message}")
86
79
  summary[:failed] += 1
87
80
  end
88
81
 
89
82
  print_summary(summary)
90
83
 
91
- # Non-zero exit code so CI pipelines can detect failures.
92
- exit 1 if @options[:strict_verify] && summary[:residual_files].positive?
93
- exit 1 if summary[:failed].positive?
84
+ # Non-zero exit so CI/scripts can detect a failed or not-fully-verified file.
85
+ exit 1 if summary[:failed].positive? || summary[:unverified].positive?
94
86
  end
95
87
 
96
88
  private
97
89
 
98
- # ─────────────────────────────────────────────────────────────────
99
90
  # Output helpers
100
- # ─────────────────────────────────────────────────────────────────
101
91
 
102
92
  def announce_tools
103
93
  have = []
104
- have << "exiftool #{Exiftool.version}" if Exiftool.available?
105
- have << "mat2 #{Mat2.version}" if Mat2.available?
106
- have << "qpdf #{Qpdf.version&.split&.last}" if Qpdf.available?
94
+ have << "exiftool #{Exiftool.version}" if Exiftool.available?
95
+ have << "mat2 #{Mat2.version}" if Mat2.available?
96
+ have << "qpdf #{Qpdf.version}" if Qpdf.available?
97
+ have << "ffmpeg #{Ffmpeg.version}" if Ffmpeg.available?
107
98
  Display.info "Tools detected: #{have.join(', ')}"
108
99
  Display.info '(dry-run — no files will be modified)' if @options[:dry_run]
109
100
  end
110
101
 
111
- # ─────────────────────────────────────────────────────────────────
112
- # Cleaning a single file — the heart of the program.
113
- # ─────────────────────────────────────────────────────────────────
114
-
102
+ # Cleaning a single file.
115
103
  def clean_one(file, index:, total:)
116
104
  prefix = total > 1 ? "[#{index}/#{total}] " : ''
117
105
  Display.header "#{prefix}📄 #{file}"
118
106
 
119
107
  # Read the "before" metadata FIRST — once we start cleaning, this is
120
108
  # gone forever and we'd have nothing to diff against.
121
- before = Exiftool.read(file)
109
+ before = read_metadata(file)
122
110
  Display.section "Before (#{Display.count_embedded(before)} embedded tags)"
123
111
  Display.metadata_table(before, only_embedded: true)
124
112
 
125
- # Ask the strategy module which tools to run. If everything's
126
- # disabled (user passed all --no-* flags), bail out gracefully.
127
- tools = Strategy.tools_for(file, prefer: tool_prefs)
128
- if tools.empty?
129
- Display.warning 'No applicable tools — skipping.'
130
- return { status: :failed, removed: 0, residual: 0 }
113
+ # Ask the strategy module which tools to run for this file type.
114
+ tools = Strategy.tools_for(file)
115
+ # Warn when the stricter tool for a document format won't run: ExifTool
116
+ # alone leaves (and can't fully verify) document-internal metadata.
117
+ if Strategy.mat2_essential?(file) && !tools.include?(:mat2)
118
+ Display.warning 'mat2 will not run for this format — document-internal metadata may remain and cannot be verified.'
131
119
  end
132
120
  Display.info "Pipeline: #{tools.join(' → ')}"
133
121
 
134
- # ── Atomic write setup ────────────────────────────────────────
122
+ # Atomic write setup:
135
123
  # `final_path` = where the cleaned file will end up.
136
124
  # `staging` = a temp file we mutate. After all tools succeed, we
137
125
  # rename staging → final_path. If anything goes wrong
@@ -140,22 +128,32 @@ module Metaclean
140
128
  final_path = resolve_final_path(file)
141
129
  staging = staging_path_for(final_path)
142
130
 
143
- FileUtils.cp(file, staging)
144
131
  tool_results = []
145
132
  begin
133
+ # TOCTOU guard: the path was a regular file at discovery (expand_files), but
134
+ # it could have been swapped for a symlink in the window since. Re-check
135
+ # right before we read/copy/back it up, so we never copy — or take a .bak —
136
+ # THROUGH a link pointing outside the intended scope. Bails to :failed.
137
+ raise Error, "#{file} became a symlink since discovery — refusing to clean it" if File.symlink?(file)
138
+
139
+ # The staging copy lives INSIDE the begin so the ensure below cleans up a
140
+ # partial temp if cp is interrupted (Ctrl-C) or fails mid-copy (disk full,
141
+ # read-only fs). cp only ever reads the original, so the source is intact
142
+ # regardless.
143
+ copy_file_exclusive(file, staging)
146
144
  tools.each do |tool|
147
145
  tool_results << run_tool(tool, staging)
148
146
  end
149
147
 
150
148
  # Re-read metadata of the cleaned staging file for the diff.
151
- after = Exiftool.read(staging)
149
+ after = read_metadata(staging)
152
150
  Display.section "After (#{Display.count_embedded(after)} embedded tags)"
153
151
  Display.metadata_table(after, only_embedded: true)
154
152
 
155
153
  Display.section 'Diff'
156
154
  Display.diff(before, after)
157
155
 
158
- # Loud warning if anything privacy-relevant survived.
156
+ # Anything privacy-relevant that survived the strip.
159
157
  residual = Strategy.privacy_residual(after)
160
158
  if residual.any?
161
159
  Display.warning "Privacy-relevant tags still present (#{residual.size}):"
@@ -166,14 +164,42 @@ module Metaclean
166
164
  if @options[:dry_run]
167
165
  File.delete(staging) if File.exist?(staging)
168
166
  Display.info '(dry-run: nothing was written)'
169
- return finalize_result(tool_results, before, after, residual)
167
+ return finalize_result(tool_results, before, after, residual, file: file)
170
168
  end
171
169
 
172
- # Commit: rename staging final_path (and back up original if needed).
173
- commit!(file, staging, final_path)
174
- Display.success " #{final_path}"
170
+ # Never write output unless the file is genuinely clean: at least one
171
+ # tool ran AND no privacy-relevant tag survived. Otherwise the staging
172
+ # file — committed as a "_clean" copy or an in-place overwrite — would
173
+ # not actually be clean, the exact false-clean this tool exists to
174
+ # prevent. Bail to :failed and let the ensure block delete staging,
175
+ # leaving the original untouched.
176
+ unless cleaned?(tool_results, residual)
177
+ reason = tools_succeeded?(tool_results) ? 'Privacy-relevant tags survived' : 'All tools failed'
178
+ Display.warning "#{reason} — not writing output."
179
+ return finalize_result(tool_results, before, after, residual, file: file)
180
+ end
175
181
 
176
- finalize_result(tool_results, before, after, residual)
182
+ # Preserve the original's permission bits onto the cleaned output. cp and
183
+ # the tools' temp renames otherwise leave it at the umask default, which
184
+ # could widen a locked-down 0600 file to 0644 — a leak for a privacy tool.
185
+ File.chmod(File.stat(file).mode, staging)
186
+
187
+ # In-place clean of a hard-linked file only re-points THIS name (rename) at
188
+ # the freshly-cleaned inode; the file's other names still point at the
189
+ # original, metadata-bearing inode. This name is genuinely clean, but warn
190
+ # so the user knows the other links aren't covered by the run.
191
+ warn_if_hardlinked(file) if @options[:in_place]
192
+
193
+ # Commit: move/link staging → final_path (backing up the original in place).
194
+ final_path = commit!(staging, final_path)
195
+ result = finalize_result(tool_results, before, after, residual, file: file)
196
+ if result[:status] == :unverified
197
+ reason = tool_errored?(tool_results) ? 'a tool in the pipeline failed' : 'mat2 did not run on this format'
198
+ Display.warning "→ #{final_path} (unverified — #{reason})"
199
+ else
200
+ Display.success "→ #{final_path}"
201
+ end
202
+ result
177
203
  ensure
178
204
  # Last-resort cleanup. If `commit!` already moved the staging file,
179
205
  # `File.exist?(staging)` is false and this is a no-op. The path-
@@ -183,16 +209,32 @@ module Metaclean
183
209
  end
184
210
  end
185
211
 
212
+ # Warn when an in-place target has more than one hard link: a rename only
213
+ # cleans the named link, leaving the others pointing at the original metadata.
214
+ def warn_if_hardlinked(file)
215
+ nlink = File.stat(file).nlink
216
+ return unless nlink > 1
217
+
218
+ Display.warning "#{file} has #{nlink} hard links — only this name is cleaned; " \
219
+ "the other #{nlink - 1} still contain the original metadata."
220
+ end
221
+
186
222
  # Dispatches to the right wrapper module. Returns a small Hash so the
187
223
  # caller can summarize tool-by-tool success/failure.
188
224
  def run_tool(tool, path)
189
225
  case tool
190
226
  when :exiftool
191
- Exiftool.strip!(path,
192
- keep_orientation: @options[:keep_orientation],
193
- keep_color_profile: @options[:keep_color_profile])
194
- Display.info " ✓ exiftool"
195
- { tool: :exiftool, ok: true }
227
+ # :unsupported means ExifTool can read but not write this format (a
228
+ # ZIP-based document mat2 owns) — a soft skip, NOT a pipeline failure.
229
+ # Pass the privacy tag names so TIFF/DNG IFD0 tags `-all=` won't drop
230
+ # still get deleted (losslessly).
231
+ if Exiftool.strip!(path, also_delete: Strategy::PRIVACY_TAGS) == :unsupported
232
+ Display.info ' · exiftool (read-only for this format, skipped)'
233
+ { tool: :exiftool, ok: false, skipped: true, note: :unsupported }
234
+ else
235
+ Display.info ' ✓ exiftool'
236
+ { tool: :exiftool, ok: true }
237
+ end
196
238
  when :mat2
197
239
  result = Mat2.strip!(path)
198
240
  # mat2 returns either `true` (success) or a symbol indicating a
@@ -214,63 +256,183 @@ module Metaclean
214
256
  Qpdf.rebuild!(path)
215
257
  Display.info ' ✓ qpdf'
216
258
  { tool: :qpdf, ok: true }
259
+ when :ffmpeg
260
+ # Matroska remux. A failure raises and is caught below (→ not written).
261
+ Ffmpeg.strip!(path)
262
+ Display.info ' ✓ ffmpeg'
263
+ { tool: :ffmpeg, ok: true }
217
264
  end
218
- rescue Error => e
265
+ rescue Error, SystemCallError => e
219
266
  # One tool failing shouldn't abort the pipeline — we want to keep
220
267
  # trying with the others. The `finalize_result` step decides whether
221
- # the overall file counts as cleaned or failed.
222
- Display.warning " ✗ #{tool}: #{e.message} continuing"
268
+ # the overall file counts as cleaned or failed. `SystemCallError`
269
+ # (Errno::*) covers a tool wrapper's internal FileUtils.mv/File.delete
270
+ # raising on permission/quota/disk errors — without it those would
271
+ # escape and crash the batch.
272
+ # Collapse whitespace and bound the length: some tools (notably mat2) dump a
273
+ # multi-line Python traceback on failure, which would otherwise flood the
274
+ # diff. One readable line is enough — re-run the tool directly to debug.
275
+ msg = Display.truncate(e.message.gsub(/\s+/, ' ').strip, 200)
276
+ Display.warning " ✗ #{tool}: #{msg} — continuing"
223
277
  { tool: tool, ok: false, error: e.message }
224
278
  end
225
279
 
226
- def finalize_result(tool_results, before, after, residual)
280
+ # :cleaned needs ALL of: a tool genuinely ran, no privacy residual survived,
281
+ # no pipeline tool errored, AND — for a format where mat2 owns coverage
282
+ # ExifTool can't re-read (Office/PDF doc internals) — mat2 actually ran. A
283
+ # tool that errored, or an absent mat2 on a document format, means the
284
+ # pipeline didn't fully complete and the residual check is partly blind, so
285
+ # the result is :unverified, not a confident :cleaned. `file` is needed only
286
+ # for that mat2-coverage check.
287
+ def finalize_result(tool_results, before, after, residual, file: nil)
227
288
  removed = removed_embedded_count(before, after)
228
- # A file only counts as "cleaned" if at least one tool actually ran
229
- # successfully (i.e. wasn't skipped as unsupported) AND no privacy-
230
- # relevant tags survived. Anything else is a failure — silently
231
- # marking a file clean when sensitive metadata is still present is
232
- # the worst possible outcome for a privacy tool.
233
- ran_ok = tool_results.any? { |r| r[:ok] && !r[:skipped] }
234
- status = ran_ok && residual.empty? ? :cleaned : :failed
235
- { status: status,
236
- removed: removed,
237
- residual: residual.size,
238
- tools: tool_results }
289
+ status = if !cleaned?(tool_results, residual)
290
+ :failed
291
+ elsif !tool_errored?(tool_results) && !mat2_coverage_gap?(tool_results, file)
292
+ :cleaned
293
+ else
294
+ :unverified
295
+ end
296
+ { status: status, removed: removed, residual: residual.size }
297
+ end
298
+
299
+ # mat2 is essential for this format (Office/PDF internals ExifTool can't
300
+ # strip or fully re-read) but did NOT actually run and strip — absent,
301
+ # unsupported soft-skip, or errored. The residual check can't confirm the
302
+ # clean, so don't report a confident :cleaned.
303
+ def mat2_coverage_gap?(tool_results, file)
304
+ return false unless file && Strategy.mat2_essential?(file)
305
+
306
+ tool_results.none? { |r| r[:tool] == :mat2 && r[:ok] && !r[:skipped] }
307
+ end
308
+
309
+ # A file is genuinely cleaned only when at least one tool actually ran
310
+ # (not just a mat2 :unsupported soft-skip) AND no privacy-relevant tag
311
+ # survived. Both the commit gate and the final status use this ONE
312
+ # predicate, so they can never disagree — we never write a "_clean" copy
313
+ # (or overwrite an original in place) and then report it :failed. Silently
314
+ # marking a file clean while sensitive metadata is still present is the
315
+ # worst possible outcome for a privacy tool.
316
+ def cleaned?(tool_results, residual)
317
+ tools_succeeded?(tool_results) && residual.empty?
318
+ end
319
+
320
+ # Did at least one tool genuinely run (not a mat2 :unsupported soft-skip)?
321
+ def tools_succeeded?(tool_results)
322
+ tool_results.any? { |r| r[:ok] && !r[:skipped] }
323
+ end
324
+
325
+ # Did a tool that was meant to run error out (not a mat2 :unsupported
326
+ # soft-skip)? Even with an empty residual that means the pipeline didn't
327
+ # fully complete, so the clean can't be reported as a confident :cleaned.
328
+ def tool_errored?(tool_results)
329
+ tool_results.any? { |r| !r[:ok] && !r[:skipped] }
330
+ end
331
+
332
+ # Read metadata for the before/after diff. ensure_tools! guarantees exiftool
333
+ # is present before any run.
334
+ def read_metadata(path)
335
+ Exiftool.read(path)
239
336
  end
240
337
 
241
338
  def removed_embedded_count(before, after)
242
- after_keys = after.keys.to_set
243
- before.keys.count do |key|
244
- next false if key == 'SourceFile'
245
- next false if Display::NON_METADATA_GROUPS.include?(Display.group_of(key))
339
+ before.keys.count { |key| Display.embedded_key?(key) && !after.key?(key) }
340
+ end
341
+
342
+ # Path helpers — figuring out where to stage and where to commit.
246
343
 
247
- !after_keys.include?(key)
344
+ def commit!(staging, final_path)
345
+ committed = false
346
+ # Make a backup of the original BEFORE we overwrite it. The order matters:
347
+ # if the rename below fails, the backup still exists.
348
+ if @options[:in_place]
349
+ backup = copy_with_collision_safe_name(final_path, "#{final_path}.bak")
350
+ # File.rename, NOT FileUtils.mv. staging is in the same dir as final_path
351
+ # (staging_path_for), so this is always an atomic same-fs swap. FileUtils.mv
352
+ # would, on EPERM (e.g. a sticky /tmp file not owned by us) or EXDEV, fall
353
+ # back to a TRUNCATING copy of the original — and an interrupt mid-copy
354
+ # would corrupt the original while the rescue below deleted the only backup.
355
+ # File.rename raises BEFORE touching final_path, so the rescue's
356
+ # "staging still exists ⇒ original intact" assumption always holds.
357
+ File.rename(staging, final_path)
358
+ committed = true
359
+ return final_path
248
360
  end
361
+
362
+ link_with_collision_safe_name(staging, final_path)
363
+ rescue SystemCallError, Interrupt
364
+ # The rename failed (disk full, read-only fs, cross-device) OR the user hit
365
+ # Ctrl-C in the window after the .bak was written but before the rename. Either
366
+ # way the original is untouched, so the .bak is a redundant copy of it —
367
+ # remove it instead of leaving a stray file behind, then re-raise (a
368
+ # SystemCallError is reported per-file as failed; an Interrupt propagates to
369
+ # the CLI's exit-130 handler).
370
+ File.delete(backup) if backup && !committed && File.exist?(staging) && File.exist?(backup)
371
+ raise
249
372
  end
250
373
 
251
- # ─────────────────────────────────────────────────────────────────
252
- # Path helpers — figuring out where to stage and where to commit.
253
- # ─────────────────────────────────────────────────────────────────
254
-
255
- def commit!(source, staging, final_path)
256
- # Make a backup of the original BEFORE we overwrite it. The order
257
- # matters: if the rename below fails, the backup still exists.
258
- # When source is a symlink, place the backup next to the *target*
259
- # (which is what --in-place actually overwrites) — putting the .bak
260
- # next to the link is confusing during recovery.
261
- if @options[:in_place] && !@options[:no_backup]
262
- backup_target = File.symlink?(source) ? File.realpath(source) : source
263
- backup = collision_safe("#{backup_target}.bak")
264
- FileUtils.cp(backup_target, backup)
374
+ def link_with_collision_safe_name(staging, preferred)
375
+ target = preferred
376
+ loop do
377
+ File.link(staging, target)
378
+ File.delete(staging)
379
+ return target
380
+ rescue Errno::EEXIST
381
+ target = collision_safe(target)
382
+ rescue Errno::EACCES, Errno::EPERM, Errno::ENOTSUP, NotImplementedError
383
+ # The filesystem can't hard-link (Linux returns EPERM; macOS/BSD on
384
+ # exFAT/FAT/SMB returns ENOTSUP/EOPNOTSUPP — same Errno class). Fall back
385
+ # to a plain exclusive copy so removable/network drives still clean.
386
+ target = copy_with_collision_safe_name(staging, target)
387
+ File.delete(staging)
388
+ return target
389
+ end
390
+ end
391
+
392
+ def copy_with_collision_safe_name(src, preferred)
393
+ target = preferred
394
+ loop do
395
+ copy_file_exclusive(src, target, preserve: true)
396
+ return target
397
+ rescue Errno::EEXIST
398
+ target = collision_safe(target)
265
399
  end
266
- FileUtils.mv(staging, final_path)
400
+ end
401
+
402
+ def copy_file_exclusive(src, dest, preserve: false)
403
+ src_stat = File.lstat(src)
404
+ raise Error, "#{src} is a symlink — refusing to copy it" if src_stat.symlink?
405
+
406
+ mode = src_stat.mode & 0o7777
407
+ created = false
408
+ File.open(dest, File::WRONLY | File::CREAT | File::EXCL, mode) do |out|
409
+ created = true
410
+ File.open(src, 'rb') do |input|
411
+ opened = input.stat
412
+ unless opened.dev == src_stat.dev && opened.ino == src_stat.ino
413
+ raise Error, "#{src} changed while opening — refusing to copy it"
414
+ end
415
+
416
+ IO.copy_stream(input, out)
417
+ end
418
+ end
419
+ return unless preserve
420
+
421
+ # Best-effort: the bytes are already fully copied, so a failed mode/timestamp
422
+ # restore (e.g. utime/chmod on a FAT/exFAT mount) must NOT discard an
423
+ # otherwise-complete file by falling into the delete-on-error rescue below.
424
+ begin
425
+ File.chmod(mode, dest)
426
+ File.utime(src_stat.atime, src_stat.mtime, dest)
427
+ rescue SystemCallError
428
+ nil
429
+ end
430
+ rescue StandardError, Interrupt
431
+ File.delete(dest) if created && dest && File.exist?(dest)
432
+ raise
267
433
  end
268
434
 
269
435
  def resolve_final_path(file)
270
- # When following a symlink with --in-place, we want to overwrite the
271
- # *target* of the link, not replace the link itself with a regular
272
- # file. `realpath` resolves through the link.
273
- return File.realpath(file) if @options[:in_place] && File.symlink?(file)
274
436
  return file if @options[:in_place]
275
437
 
276
438
  # Default: write `<name>_clean.<ext>` next to the original. If it
@@ -281,7 +443,7 @@ module Metaclean
281
443
  def build_clean_path(file)
282
444
  ext = File.extname(file)
283
445
  base = File.basename(file, ext)
284
- File.join(File.dirname(file), "#{base}_clean#{ext}")
446
+ File.join(File.dirname(file), "#{base}#{Metaclean::CLEAN_SUFFIX}#{ext}")
285
447
  end
286
448
 
287
449
  # Staging path lives in the same directory as the destination so that
@@ -290,13 +452,15 @@ module Metaclean
290
452
  # The original extension is preserved as the LAST segment so tools like
291
453
  # mat2 — which dispatch on file extension — see the real type.
292
454
  def staging_path_for(final_path)
293
- ext = File.extname(final_path)
294
- base = ext.empty? ? final_path : final_path[0...-ext.length]
295
- "#{base}.metaclean.tmp.#{Process.pid}.#{rand(1_000_000)}#{ext}"
455
+ dir = File.dirname(final_path)
456
+ ext = File.extname(final_path)
457
+ # SecureRandom (not rand) makes the staging name unpredictable, so a
458
+ # hostile process in the same directory can't pre-create it as a symlink
459
+ # that the staging copy would write the (still-sensitive) original through.
460
+ File.join(dir, "#{Metaclean::TMP_MARKER}#{Process.pid}.#{SecureRandom.hex(8)}#{ext}")
296
461
  end
297
462
 
298
- # If `path` is taken, return `path_1`, `path_2`, … until we find a free
299
- # one. `loop do … end` runs forever; we `return` out of it.
463
+ # If `path` is taken, try `path_1`, `path_2`, … until one is free.
300
464
  def collision_safe(path)
301
465
  return path unless File.exist?(path)
302
466
 
@@ -312,19 +476,12 @@ module Metaclean
312
476
  end
313
477
  end
314
478
 
315
- # Translates the on/off CLI flags into a "prefer" hash that Strategy
316
- # understands. Keeping this as one method makes the wiring obvious.
317
- def tool_prefs
318
- {
319
- mat2: !@options[:no_mat2] && !@options[:exiftool_only],
320
- qpdf: !@options[:no_qpdf] && !@options[:exiftool_only],
321
- exiftool: !@options[:no_exiftool]
322
- }
323
- end
324
-
325
479
  def print_summary(summary)
326
480
  Display.header 'Summary'
327
481
  Display.success "Cleaned: #{summary[:cleaned]} file(s)"
482
+ if summary[:unverified].positive?
483
+ Display.warning "Unverified (clean could not be confirmed): #{summary[:unverified]} file(s)"
484
+ end
328
485
  puts Display.error("Failed: #{summary[:failed]}") if summary[:failed].positive?
329
486
  Display.info "Total embedded tags removed: #{summary[:removed_total]}"
330
487
  if summary[:residual_files].positive?
@@ -332,18 +489,16 @@ module Metaclean
332
489
  end
333
490
  end
334
491
 
335
- # ─────────────────────────────────────────────────────────────────
336
492
  # File discovery — turning the user's paths into a flat list.
337
- # ─────────────────────────────────────────────────────────────────
338
493
 
339
494
  def expand_files(paths)
340
495
  explicit = []
341
496
  discovered = []
342
497
  paths.each do |p|
343
- # Symlinks are skipped by default. This avoids accidentally cleaning
344
- # something through a link that points outside the intended scope.
345
- if File.symlink?(p) && !@options[:follow_symlinks]
346
- Display.warning "Skipping symlink: #{p} (use --follow-symlinks to include)"
498
+ # Symlinks are always skipped — this avoids cleaning something through a link
499
+ # that points outside the intended scope.
500
+ if File.symlink?(p)
501
+ Display.warning "Skipping symlink: #{p}"
347
502
  next
348
503
  end
349
504
  if File.directory?(p)
@@ -358,9 +513,7 @@ module Metaclean
358
513
  end
359
514
  end
360
515
  discovered.reject! { |f| skip?(f) }
361
- result = explicit + discovered
362
- result.select! { |f| type_allowed?(f) } if @options[:types]
363
- dedupe_by_realpath(result)
516
+ dedupe_by_realpath(explicit + discovered)
364
517
  end
365
518
 
366
519
  # Same file via two different paths (or via symlink + direct path) should
@@ -369,11 +522,7 @@ module Metaclean
369
522
  def dedupe_by_realpath(paths)
370
523
  seen = {}
371
524
  paths.each_with_object([]) do |p, acc|
372
- key = begin
373
- File.realpath(p)
374
- rescue StandardError
375
- p
376
- end
525
+ key = safe_realpath(p)
377
526
  next if seen[key]
378
527
 
379
528
  seen[key] = true
@@ -381,50 +530,54 @@ module Metaclean
381
530
  end
382
531
  end
383
532
 
533
+ # File.realpath, falling back to the raw path when it can't resolve
534
+ # (broken symlink, permission denied) instead of raising.
535
+ def safe_realpath(path)
536
+ File.realpath(path)
537
+ rescue StandardError
538
+ path
539
+ end
540
+
384
541
  def collect_dir(dir, out)
385
542
  if @options[:recursive]
386
- walk_recursive(dir, out, Set.new)
543
+ walk_recursive(dir, out)
387
544
  else
388
- # Non-recursive: just the immediate children of `dir`.
389
- Dir.glob(File.join(dir, '*')).each do |sub|
390
- next if File.symlink?(sub) && !@options[:follow_symlinks]
545
+ # Non-recursive: just the immediate children of `dir`. Use Dir.children,
546
+ # NOT Dir.glob("#{dir}/*") — glob interprets the WHOLE pattern, so a
547
+ # directory name containing glob metacharacters (e.g. "Holiday [2024]")
548
+ # matches nothing and the entire folder is silently skipped. Dir.children
549
+ # surfaces dotfiles too; skip? filters them later, same as walk_recursive.
550
+ Dir.children(dir).each do |entry|
551
+ sub = File.join(dir, entry)
552
+ next if File.symlink?(sub)
391
553
 
392
554
  out << sub if File.file?(sub)
393
555
  end
394
556
  end
557
+ rescue SystemCallError => e
558
+ # Any Errno (EACCES/ENOENT/ENOTDIR from a dir replaced mid-scan, EIO, …):
559
+ # warn and skip this directory so one bad entry doesn't abort discovery of
560
+ # the rest of the batch.
561
+ Display.warning "Skipping #{dir}: #{e.message}"
395
562
  end
396
563
 
397
- # Manual recursive walker. We don't use `Find.find` because it never
398
- # descends into symlinked directories, even when --follow-symlinks is on.
399
- # `visited` tracks realpaths so we don't infinite-loop on a symlink that
400
- # eventually points at one of its ancestors.
401
- def walk_recursive(dir, out, visited)
402
- real = begin
403
- File.realpath(dir)
404
- rescue StandardError
405
- dir
406
- end
407
- return if visited.include?(real)
408
-
409
- visited << real
410
-
564
+ # Manual recursive walker. Symlinks are always skipped (never followed), so
565
+ # the real directory tree is acyclic and no loop-guard is needed.
566
+ def walk_recursive(dir, out)
411
567
  Dir.each_child(dir) do |entry|
412
568
  sub = File.join(dir, entry)
413
- if File.symlink?(sub)
414
- next unless @options[:follow_symlinks]
569
+ next if File.symlink?(sub)
415
570
 
416
- if File.directory?(sub)
417
- walk_recursive(sub, out, visited)
418
- elsif File.file?(sub)
419
- out << sub
420
- end
421
- elsif File.directory?(sub)
422
- walk_recursive(sub, out, visited)
571
+ if File.directory?(sub)
572
+ walk_recursive(sub, out)
423
573
  elsif File.file?(sub)
424
574
  out << sub
425
575
  end
426
576
  end
427
- rescue Errno::EACCES, Errno::ENOENT => e
577
+ rescue SystemCallError => e
578
+ # Any Errno (EACCES/ENOENT/ENOTDIR from a dir replaced mid-scan, EIO, …):
579
+ # warn and skip this directory so one bad entry doesn't abort discovery of
580
+ # the rest of the batch.
428
581
  Display.warning "Skipping #{dir}: #{e.message}"
429
582
  end
430
583
 
@@ -437,15 +590,11 @@ module Metaclean
437
590
  base = File.basename(file)
438
591
  return true if base.start_with?('.')
439
592
  return true if base.end_with?('.bak')
440
- return true if base =~ /_clean(_\d+)?\.[^.]+\z/
441
- return true if base =~ /\.metaclean\.tmp\.\d+\.\d+/
593
+ return true if base =~ Metaclean::CLEAN_OUTPUT_RE
594
+ # Matches our staging temps regardless of the pid/random suffix format.
595
+ return true if base.include?(Metaclean::TMP_MARKER)
442
596
 
443
597
  false
444
598
  end
445
-
446
- def type_allowed?(file)
447
- ext = File.extname(file).downcase.delete('.')
448
- @options[:types].include?(ext)
449
- end
450
599
  end
451
600
  end