metaclean 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +132 -0
- data/bin/metaclean +29 -0
- data/lib/metaclean/cli.rb +169 -0
- data/lib/metaclean/display.rb +197 -0
- data/lib/metaclean/exiftool.rb +140 -0
- data/lib/metaclean/mat2.rb +123 -0
- data/lib/metaclean/qpdf.rb +75 -0
- data/lib/metaclean/runner.rb +451 -0
- data/lib/metaclean/strategy.rb +96 -0
- data/lib/metaclean/version.rb +11 -0
- data/lib/metaclean.rb +33 -0
- metadata +61 -0
|
@@ -0,0 +1,451 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# ───────────────────────────────────────────────────────────────────────────
|
|
4
|
+
# The orchestrator. Given a list of paths and parsed CLI options, this class:
|
|
5
|
+
#
|
|
6
|
+
# 1. Expands paths into a flat list of files (handling directories,
|
|
7
|
+
# recursion, symlinks, type filters).
|
|
8
|
+
# 2. Asks the user for confirmation (unless --force).
|
|
9
|
+
# 3. For each file, runs the strategy pipeline (mat2 / exiftool / qpdf)
|
|
10
|
+
# using the "atomic write" pattern so a crash never leaves a
|
|
11
|
+
# half-cleaned file.
|
|
12
|
+
# 4. Prints a before/after diff and a final summary.
|
|
13
|
+
# ───────────────────────────────────────────────────────────────────────────
|
|
14
|
+
|
|
15
|
+
require 'fileutils'
|
|
16
|
+
require 'json'
|
|
17
|
+
require 'set'
|
|
18
|
+
require 'tmpdir'
|
|
19
|
+
|
|
20
|
+
module Metaclean
|
|
21
|
+
class Runner
|
|
22
|
+
# Constructor — just stashes the options Hash. The CLI builds it.
|
|
23
|
+
# Builds a runner around the option Hash assembled by the CLI. The Hash is
# stored as-is (no copy, no validation) and consulted by every later step.
#
# @param options [Hash] parsed CLI flags (e.g. :force, :dry_run, :in_place)
def initialize(options)
  @options = options
end
|
|
26
|
+
|
|
27
|
+
# ─────────────────────────────────────────────────────────────────
|
|
28
|
+
# Public entry points: one for `--inspect`, one for the cleaning flow.
|
|
29
|
+
# ─────────────────────────────────────────────────────────────────
|
|
30
|
+
|
|
31
|
+
# Read-only entry point behind `--inspect`: shows the metadata of every file
# reachable from `paths` without modifying anything.
#
# @param paths [Array<String>] files and/or directories from the CLI
# @return [Object, nil] whatever `Display.warning` returns when nothing was
#   found, nil after JSON output, otherwise the list of inspected files
def inspect_paths(paths)
  targets = expand_files(paths)
  return Display.warning('No files to inspect.') if targets.empty?

  if @options[:format] == :json
    # Machine-readable mode: one JSON array, no colors, safe to pipe.
    report = targets.map do |path|
      { file: path, metadata: Exiftool.read(path) }
    end
    puts JSON.pretty_generate(report)
  else
    # Human-readable mode: a header plus a grouped table for each file.
    targets.each do |path|
      Display.header "📄 #{path}"
      tags = Exiftool.read(path)
      Display.section "Metadata (#{Display.count_embedded(tags)} embedded tags)"
      Display.metadata_table(tags)
    end
  end
end
|
|
50
|
+
|
|
51
|
+
# Cleaning entry point: processes every file reachable from `paths` and
# prints a final summary.
#
# Flow: expand the paths, announce the detected tools, ask for confirmation
# (skipped with --force or --dry-run), run `clean_one` per file, print totals.
#
# @param paths [Array<String>] files and/or directories from the CLI
# @return [Object, nil] the result of `Display.warning` when there is nothing
#   to do or the user declines; nil otherwise
#
# Side effect: calls `exit 1` when any file failed, or when --strict-verify
# is set and at least one file kept privacy-relevant tags.
def clean_paths(paths)
  files = expand_files(paths)
  return Display.warning('No files to process.') if files.empty?

  announce_tools

  # Confirmation prompt — skipped for --force and --dry-run (since
  # dry-run never modifies anything anyway).
  unless @options[:force] || @options[:dry_run]
    action = @options[:in_place] ? 'OVERWRITE' : 'create cleaned copies of'
    puts Display.c("About to #{action} #{files.size} file(s).", :yellow)
    if @options[:in_place] && !@options[:no_backup]
      puts Display.c('Backups will be saved alongside as <file>.bak.', :gray)
    end
    print Display.c('Proceed? [y/N] ', :bold)
    # `gets` returns nil on EOF (e.g. Ctrl-D); safe navigation turns that
    # into a nil answer, which is treated as "no".
    ans = $stdin.gets&.strip&.downcase
    return Display.warning('Aborted.') unless %w[y yes].include?(ans)
  end

  summary = { cleaned: 0, failed: 0, removed_total: 0, residual_files: 0 }

  # The index is passed along so `clean_one` can render "[3/47]".
  files.each_with_index do |file, idx|
    result = clean_one(file, index: idx + 1, total: files.size)
    summary[result[:status]] += 1
    summary[:removed_total] += result[:removed].to_i
    summary[:residual_files] += 1 if result[:residual].to_i.positive?
  rescue Error, SystemCallError => e
    # One bad file must not stop the rest of the batch. Besides our own
    # errors this also covers OS-level failures (Errno::EACCES,
    # Errno::ENOENT, Errno::ENOSPC, …) raised while copying or renaming,
    # which would otherwise abort the whole run.
    warn Display.error("#{file}: #{e.message}")
    summary[:failed] += 1
  end

  print_summary(summary)

  # Non-zero exit code so CI pipelines can detect failures.
  exit 1 if @options[:strict_verify] && summary[:residual_files].positive?
  exit 1 if summary[:failed].positive?
end
|
|
95
|
+
|
|
96
|
+
private
|
|
97
|
+
|
|
98
|
+
# ─────────────────────────────────────────────────────────────────
|
|
99
|
+
# Output helpers
|
|
100
|
+
# ─────────────────────────────────────────────────────────────────
|
|
101
|
+
|
|
102
|
+
# Prints one line listing the external tools that are available (with their
# versions) and, in dry-run mode, a reminder that nothing will be written.
#
# @return [Object, nil] result of the last `Display.info` call, or nil when
#   not in dry-run mode
def announce_tools
  probes = [
    [Exiftool, -> { "exiftool #{Exiftool.version}" }],
    [Mat2,     -> { "mat2 #{Mat2.version}" }],
    # Only the last whitespace-separated word of qpdf's version string is shown.
    [Qpdf,     -> { "qpdf #{Qpdf.version&.split&.last}" }]
  ]
  # A tool's version is only queried once it reported itself available.
  detected = probes.filter_map { |tool, label| label.call if tool.available? }

  Display.info "Tools detected: #{detected.join(', ')}"
  return unless @options[:dry_run]

  Display.info '(dry-run — no files will be modified)'
end
|
|
110
|
+
|
|
111
|
+
# ─────────────────────────────────────────────────────────────────
|
|
112
|
+
# Cleaning a single file — the heart of the program.
|
|
113
|
+
# ─────────────────────────────────────────────────────────────────
|
|
114
|
+
|
|
115
|
+
# Cleans a single file through the strategy pipeline and reports the outcome.
#
# The file is copied to a staging path next to its destination, every tool
# works on that copy, and only afterwards is the copy moved into place. The
# original is therefore never left half-cleaned.
#
# Nothing is written when --dry-run is set, or when no tool ran successfully
# (the staging copy would just be the original under a "clean" name).
#
# @param file [String] path of the file to clean
# @param index [Integer] 1-based position in the batch (for the "[i/n]" prefix)
# @param total [Integer] number of files in the batch
# @return [Hash] :status (:cleaned or :failed), :removed, :residual, and
#   :tools when the pipeline ran
def clean_one(file, index:, total:)
  prefix = total > 1 ? "[#{index}/#{total}] " : ''
  Display.header "#{prefix}📄 #{file}"

  # Read the "before" metadata first — it is needed for the diff later.
  before = Exiftool.read(file)
  Display.section "Before (#{Display.count_embedded(before)} embedded tags)"
  Display.metadata_table(before, only_embedded: true)

  # Ask the strategy module which tools to run. If it returns none (for
  # example the user disabled everything), bail out gracefully.
  tools = Strategy.tools_for(file, prefer: tool_prefs)
  if tools.empty?
    Display.warning 'No applicable tools — skipping.'
    return { status: :failed, removed: 0, residual: 0 }
  end
  Display.info "Pipeline: #{tools.join(' → ')}"

  # `final_path` is where the cleaned file ends up; `staging` is the
  # temporary copy the tools mutate.
  final_path = resolve_final_path(file)
  staging = staging_path_for(final_path)

  begin
    # The copy happens inside the begin block so that a copy interrupted
    # half-way (disk full, permission error) is still removed by `ensure`.
    FileUtils.cp(file, staging)
    tool_results = tools.map { |tool| run_tool(tool, staging) }

    # Re-read metadata of the cleaned staging file for the diff.
    after = Exiftool.read(staging)
    Display.section "After (#{Display.count_embedded(after)} embedded tags)"
    Display.metadata_table(after, only_embedded: true)

    Display.section 'Diff'
    Display.diff(before, after)

    # Loud warning if anything privacy-relevant survived.
    residual = Strategy.privacy_residual(after)
    if residual.any?
      Display.warning "Privacy-relevant tags still present (#{residual.size}):"
      residual.each { |k, v| puts " #{Display.c(k, :yellow)} = #{Display.truncate(Display.format_value(v), 60)}" }
    end

    # Dry-run path: discard the staging file and return without committing.
    if @options[:dry_run]
      File.delete(staging) if File.exist?(staging)
      Display.info '(dry-run: nothing was written)'
      return finalize_result(tool_results, before, after, residual)
    end

    # If every tool failed or was skipped, the staging file is an untouched
    # copy. Committing it would create a "*_clean" file (or a backup plus an
    # identical overwrite) that still carries all of its metadata.
    unless tool_results.any? { |r| r[:ok] && !r[:skipped] }
      Display.warning 'No tool ran successfully — nothing was written.'
      return finalize_result(tool_results, before, after, residual)
    end

    # Commit: rename staging → final_path (and back up original if needed).
    commit!(file, staging, final_path)
    Display.success "→ #{final_path}"

    finalize_result(tool_results, before, after, residual)
  ensure
    # Last-resort cleanup. After a successful commit the staging file no
    # longer exists, so this is a no-op. The path comparison guards against
    # deleting the final file should staging and final ever coincide.
    File.delete(staging) if File.exist?(staging) && File.expand_path(staging) != File.expand_path(final_path)
  end
end
|
|
185
|
+
|
|
186
|
+
# Dispatches to the right wrapper module. Returns a small Hash so the
|
|
187
|
+
# caller can summarize tool-by-tool success/failure.
|
|
188
|
+
# Runs one tool of the pipeline against the staging file.
#
# @param tool [Symbol] :exiftool, :mat2 or :qpdf
# @param path [String] staging file the tool works on
# @return [Hash] always carries :tool and :ok; may add :skipped, :note or
#   :error. An `Error` raised by a tool is turned into an `ok: false` result
#   rather than propagated, so the remaining tools still get their turn.
def run_tool(tool, path)
  case tool
  when :exiftool
    Exiftool.strip!(path,
                    keep_orientation: @options[:keep_orientation],
                    keep_color_profile: @options[:keep_color_profile])
    Display.info " ✓ exiftool"
    { tool: :exiftool, ok: true }
  when :mat2
    result = Mat2.strip!(path)
    # The return value of Mat2.strip! is matched against two soft-skip
    # symbols; anything else is treated as success. `:unsupported` means
    # the tool did not actually run, so it must not count as a successful
    # pass — otherwise a file could be reported as cleaned while metadata
    # is still embedded.
    case result
    when :unsupported
      Display.info ' · mat2 (unsupported file type, skipped)'
      { tool: :mat2, ok: false, skipped: true, note: result }
    when :no_metadata
      Display.info ' · mat2 (no metadata to strip)'
      { tool: :mat2, ok: true, note: result }
    else
      Display.info ' ✓ mat2'
      { tool: :mat2, ok: true, note: result }
    end
  when :qpdf
    Qpdf.rebuild!(path)
    Display.info ' ✓ qpdf'
    { tool: :qpdf, ok: true }
  else
    # Without this branch an unrecognized symbol made the method return nil,
    # which later crashed the result aggregation on `r[:ok]`.
    Display.warning " ✗ #{tool}: unknown tool — continuing"
    { tool: tool, ok: false, error: 'unknown tool' }
  end
rescue Error => e
  # One tool failing shouldn't abort the pipeline — the caller decides
  # from the collected results whether the file counts as cleaned.
  Display.warning " ✗ #{tool}: #{e.message} — continuing"
  { tool: tool, ok: false, error: e.message }
end
|
|
225
|
+
|
|
226
|
+
# Condenses the per-tool results and the before/after metadata into the
# result Hash for one file.
#
# A file is :cleaned only when at least one tool genuinely ran (ok and not
# skipped) and no privacy-relevant tag survived; everything else is :failed,
# because reporting a file as clean while sensitive metadata remains is the
# worst outcome for a privacy tool.
#
# @param tool_results [Array<Hash>] results returned by `run_tool`
# @param before [Hash] metadata read before cleaning
# @param after [Hash] metadata read after cleaning
# @param residual [Hash] privacy-relevant entries still present
# @return [Hash] { status:, removed:, residual:, tools: }
def finalize_result(tool_results, before, after, residual)
  any_real_pass = tool_results.any? { |outcome| outcome[:ok] && !outcome[:skipped] }
  verdict = if any_real_pass && residual.empty?
              :cleaned
            else
              :failed
            end

  {
    status: verdict,
    removed: removed_embedded_count(before, after),
    residual: residual.size,
    tools: tool_results
  }
end
|
|
240
|
+
|
|
241
|
+
# Counts the embedded tags that were present before cleaning and are gone
# afterwards. 'SourceFile' and keys whose group is listed in
# Display::NON_METADATA_GROUPS are never counted.
#
# @param before [Hash] metadata read before cleaning
# @param after [Hash] metadata read after cleaning
# @return [Integer] number of removed embedded tags
def removed_embedded_count(before, after)
  before.each_key.count do |tag|
    tag != 'SourceFile' &&
      !Display::NON_METADATA_GROUPS.include?(Display.group_of(tag)) &&
      !after.key?(tag)
  end
end
|
|
250
|
+
|
|
251
|
+
# ─────────────────────────────────────────────────────────────────
|
|
252
|
+
# Path helpers — figuring out where to stage and where to commit.
|
|
253
|
+
# ─────────────────────────────────────────────────────────────────
|
|
254
|
+
|
|
255
|
+
# Moves the finished staging file to its destination, taking a backup of the
# original first when running --in-place without --no-backup.
#
# The backup is made before the move so it still exists if the move fails.
# For a symlinked source the backup sits next to the link's target, since
# that is the file --in-place overwrites.
#
# @param source [String] the path the user asked to clean
# @param staging [String] cleaned temporary file
# @param final_path [String] destination of the cleaned file
def commit!(source, staging, final_path)
  backup_wanted = @options[:in_place] && !@options[:no_backup]
  if backup_wanted
    original = source
    original = File.realpath(source) if File.symlink?(source)
    FileUtils.cp(original, collision_safe("#{original}.bak"))
  end

  FileUtils.mv(staging, final_path)
end
|
|
268
|
+
|
|
269
|
+
# Decides where the cleaned version of `file` will be written.
#
# Without --in-place the result is a free `<name>_clean.<ext>` path next to
# the original. With --in-place it is the file itself, or — for a symlink —
# the link's target, so the link is not replaced by a regular file.
#
# @param file [String] path of the file being cleaned
# @return [String] destination path
def resolve_final_path(file)
  return collision_safe(build_clean_path(file)) unless @options[:in_place]

  File.symlink?(file) ? File.realpath(file) : file
end
|
|
280
|
+
|
|
281
|
+
# Derives the default output name by inserting "_clean" before the
# extension, e.g. "dir/photo.jpg" → "dir/photo_clean.jpg".
#
# @param file [String] original path
# @return [String] sibling path carrying the "_clean" suffix
def build_clean_path(file)
  folder, name = File.split(file)
  suffix = File.extname(name)
  File.join(folder, "#{File.basename(name, suffix)}_clean#{suffix}")
end
|
|
286
|
+
|
|
287
|
+
# Staging path lives in the same directory as the destination so that
|
|
288
|
+
# `File.rename`/`FileUtils.mv` is an atomic same-filesystem operation.
|
|
289
|
+
# PID + random number prevent collisions between simultaneous runs.
|
|
290
|
+
# The original extension is preserved as the LAST segment so tools like
|
|
291
|
+
# mat2 — which dispatch on file extension — see the real type.
|
|
292
|
+
# Builds the temporary path the tools work on. It sits beside `final_path`,
# embeds the process id and a random number, and keeps the original
# extension as its last segment.
#
# @param final_path [String] destination of the cleaned file
# @return [String] staging path
def staging_path_for(final_path)
  suffix = File.extname(final_path)
  stem = final_path.delete_suffix(suffix)
  marker = [Process.pid, rand(1_000_000)].join('.')
  "#{stem}.metaclean.tmp.#{marker}#{suffix}"
end
|
|
297
|
+
|
|
298
|
+
# If `path` is taken, return `path_1`, `path_2`, … until we find a free
|
|
299
|
+
# one. `loop do … end` runs forever; we `return` out of it.
|
|
300
|
+
# Returns `path` when nothing exists there yet; otherwise the first of
# `<base>_1<ext>`, `<base>_2<ext>`, … that is still free.
#
# @param path [String] desired path
# @return [String] a path that did not exist at the time of the check
def collision_safe(path)
  return path unless File.exist?(path)

  suffix = File.extname(path)
  stem = File.basename(path, suffix)
  folder = File.dirname(path)

  # Endless counter; `find` stops at the first unused name.
  (1..).lazy
       .map { |n| File.join(folder, "#{stem}_#{n}#{suffix}") }
       .find { |candidate| !File.exist?(candidate) }
end
|
|
314
|
+
|
|
315
|
+
# Translates the on/off CLI flags into a "prefer" hash that Strategy
|
|
316
|
+
# understands. Keeping this as one method makes the wiring obvious.
|
|
317
|
+
# Converts the opt-out CLI flags into the `prefer:` Hash handed to
# Strategy.tools_for. --exiftool-only switches off both mat2 and qpdf.
#
# @return [Hash{Symbol=>Boolean}] keys :mat2, :qpdf, :exiftool
def tool_prefs
  helpers_allowed = !@options[:exiftool_only]

  {
    mat2: helpers_allowed && !@options[:no_mat2],
    qpdf: helpers_allowed && !@options[:no_qpdf],
    exiftool: !@options[:no_exiftool]
  }
end
|
|
324
|
+
|
|
325
|
+
# Prints the end-of-run totals. The "Failed" and "privacy residual" lines
# only appear when their counters are above zero.
#
# @param summary [Hash] counters :cleaned, :failed, :removed_total,
#   :residual_files
def print_summary(summary)
  cleaned, failed, removed, leftover =
    summary.values_at(:cleaned, :failed, :removed_total, :residual_files)

  Display.header 'Summary'
  Display.success "Cleaned: #{cleaned} file(s)"
  puts Display.error("Failed: #{failed}") if failed.positive?
  Display.info "Total embedded tags removed: #{removed}"
  return unless leftover.positive?

  Display.warning "Files with privacy residual: #{leftover}"
end
|
|
334
|
+
|
|
335
|
+
# ─────────────────────────────────────────────────────────────────
|
|
336
|
+
# File discovery — turning the user's paths into a flat list.
|
|
337
|
+
# ─────────────────────────────────────────────────────────────────
|
|
338
|
+
|
|
339
|
+
# Turns the user's path arguments into the flat, de-duplicated list of files
# to work on.
#
# Symlinked arguments are skipped unless --follow-symlinks is set.
# Directories are scanned via `collect_dir`, and only those scanned files
# pass through `skip?`; a file named explicitly is always kept. The --types
# filter, when present, applies to both groups.
#
# @param paths [Array<String>] files and/or directories from the CLI
# @return [Array<String>] explicit files first, then discovered ones
def expand_files(paths)
  named = []
  scanned = []

  paths.each do |path|
    if File.symlink?(path) && !@options[:follow_symlinks]
      # Avoids cleaning something through a link that points outside the
      # intended scope.
      Display.warning "Skipping symlink: #{path} (use --follow-symlinks to include)"
    elsif File.directory?(path)
      collect_dir(path, scanned)
    elsif File.file?(path)
      named << path
    else
      Display.warning "Not found: #{path}"
    end
  end

  candidates = named + scanned.reject { |found| skip?(found) }
  candidates = candidates.select { |entry| type_allowed?(entry) } if @options[:types]
  dedupe_by_realpath(candidates)
end
|
|
365
|
+
|
|
366
|
+
# Same file via two different paths (or via symlink + direct path) should
|
|
367
|
+
# be cleaned once. Comparing by realpath catches both cases. If realpath
|
|
368
|
+
# raises (broken symlink, permission denied), fall back to the raw path.
|
|
369
|
+
# Drops entries that resolve to a file already in the list (same file via
# two spellings, or via a symlink and directly). The first spelling wins and
# order is preserved. When the real path cannot be resolved, the raw path is
# used as the identity instead.
#
# @param paths [Array<String>] candidate file paths
# @return [Array<String>] new array without duplicates
def dedupe_by_realpath(paths)
  paths.uniq do |candidate|
    File.realpath(candidate)
  rescue StandardError
    candidate
  end
end
|
|
383
|
+
|
|
384
|
+
# Appends the files found in `dir` to `out`.
#
# With --recursive the whole tree is walked via `walk_recursive`; otherwise
# only the immediate, non-hidden children are considered. Symlinks are
# ignored unless --follow-symlinks is set.
#
# @param dir [String] directory to scan
# @param out [Array<String>] accumulator that receives the file paths
def collect_dir(dir, out)
  return walk_recursive(dir, out, Set.new) if @options[:recursive]

  # `base:` keeps the directory name out of the glob pattern. Building the
  # pattern with File.join(dir, '*') made characters such as `[`, `{`, `*`
  # or `?` in the directory name act as wildcards, so a folder like
  # "pics [2020]" silently produced no files.
  children = Dir.glob('*', base: dir).map { |entry| File.join(dir, entry) }
  children.each do |sub|
    next if File.symlink?(sub) && !@options[:follow_symlinks]

    out << sub if File.file?(sub)
  end
end
|
|
396
|
+
|
|
397
|
+
# Manual recursive walker. We don't use `Find.find` because it never
|
|
398
|
+
# descends into symlinked directories, even when --follow-symlinks is on.
|
|
399
|
+
# `visited` tracks realpaths so we don't infinite-loop on a symlink that
|
|
400
|
+
# eventually points at one of its ancestors.
|
|
401
|
+
# Depth-first walk that appends every regular file below `dir` to `out`.
#
# `visited` holds the real paths of directories already entered, which stops
# the walk from looping when a followed symlink points back at an ancestor.
# Symlinked entries are ignored unless --follow-symlinks is set. Directories
# that cannot be read, or that vanish mid-walk, are reported and skipped.
#
# @param dir [String] directory to walk
# @param out [Array<String>] accumulator for file paths
# @param visited [Set<String>] real paths of directories seen so far
def walk_recursive(dir, out, visited)
  canonical = begin
    File.realpath(dir)
  rescue StandardError
    dir
  end
  # Set#add? returns nil when the element was already present.
  return unless visited.add?(canonical)

  Dir.each_child(dir) do |name|
    child = File.join(dir, name)
    next if File.symlink?(child) && !@options[:follow_symlinks]

    if File.directory?(child)
      walk_recursive(child, out, visited)
    elsif File.file?(child)
      out << child
    end
  end
rescue Errno::EACCES, Errno::ENOENT => e
  Display.warning "Skipping #{dir}: #{e.message}"
end
|
|
430
|
+
|
|
431
|
+
# Files we never touch when DISCOVERED via directory scanning. This is
|
|
432
|
+
# NOT applied to explicit CLI arguments — if the user typed
|
|
433
|
+
# `metaclean .hidden.jpg`, they meant it. Hidden files (dot-prefixed)
|
|
434
|
+
# might be system metadata; .bak/_clean/.metaclean.tmp.* are our own
|
|
435
|
+
# outputs, so skipping them prevents loops on re-runs.
|
|
436
|
+
# Tells whether a file found by directory scanning should be left alone:
# hidden (dot-prefixed) files, our backups (`*.bak`), our cleaned copies
# (`*_clean`, `*_clean_<n>`, with or without an extension) and our staging
# files (`*.metaclean.tmp.<pid>.<n>*`).
#
# The extension in the `_clean` pattern is optional because an input without
# an extension ("README") yields an output without one ("README_clean");
# requiring an extension let such outputs be cleaned again on later scans.
#
# @param file [String] path of a discovered file
# @return [Boolean] true when the file must be skipped
def skip?(file)
  base = File.basename(file)

  base.start_with?('.') ||
    base.end_with?('.bak') ||
    base.match?(/_clean(_\d+)?(\.[^.]+)?\z/) ||
    base.match?(/\.metaclean\.tmp\.\d+\.\d+/)
end
|
|
445
|
+
|
|
446
|
+
# Checks the file's extension (lower-cased, without the dot) against the
# list given in @options[:types].
#
# @param file [String] path to test
# @return [Boolean] true when the extension is in the list
def type_allowed?(file)
  wanted = @options[:types]
  wanted.include?(File.extname(file).delete_prefix('.').downcase)
end
|
|
450
|
+
end
|
|
451
|
+
end
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# ───────────────────────────────────────────────────────────────────────────
|
|
4
|
+
# The "policy" module: which tools to run for which file, and what counts as
|
|
5
|
+
# privacy-relevant if it survives a clean.
|
|
6
|
+
#
|
|
7
|
+
# Keeping this logic in its own file means the runner doesn't need to know
|
|
8
|
+
# about formats — it just asks Strategy.tools_for(path) and runs whatever
|
|
9
|
+
# comes back.
|
|
10
|
+
# ───────────────────────────────────────────────────────────────────────────
|
|
11
|
+
|
|
12
|
+
module Metaclean
|
|
13
|
+
module Strategy
  # Tag groups that almost always carry personally identifying information.
  # Any surviving tag from one of these groups is reported to the user.
  PRIVACY_GROUPS = %w[GPS MakerNotes XMP-dc XMP-photoshop IPTC ICC-header].freeze

  # Tag names that are flagged regardless of the group they appear under
  # (e.g. "EXIF:Artist" is caught by the name "Artist").
  PRIVACY_TAGS = %w[
    Artist Author Creator Copyright Rights
    By-line By-lineTitle Credit Source Contact OwnerName
    CameraOwnerName SerialNumber InternalSerialNumber LensSerialNumber
    Software HostComputer ProcessingSoftware
    ImageDescription UserComment
    LastModifiedBy LastSavedBy LastAuthor
  ].freeze

  # Extensions for which mat2 runs ahead of ExifTool when it is usable.
  MAT2_PREFERRED = %w[
    pdf docx xlsx pptx odt ods odp odg epub png svg
    mp4 avi mkv mov webm
  ].freeze

  module_function

  # Picks the ordered list of tools to run on `path`.
  #
  # `prefer` carries the user's opt-outs: a tool is dropped only when its
  # entry is exactly `false`; a missing or `true` entry keeps it.
  #
  # @param path [String] file to be cleaned (only its extension is inspected
  #   here; the path is also passed to Mat2.supports?)
  # @param prefer [Hash{Symbol=>Boolean}] optional :mat2, :exiftool, :qpdf
  # @return [Array<Symbol>] subset of :mat2, :exiftool, :qpdf in run order
  def tools_for(path, prefer: {})
    kind = File.extname(path).downcase.delete('.')
    wanted = ->(tool) { prefer[tool] != false }

    picks =
      if kind == 'pdf'
        # PDFs: mat2 first, then exiftool, then a qpdf rebuild.
        [
          (:mat2 if wanted.call(:mat2) && Mat2.available?),
          (:exiftool if wanted.call(:exiftool)),
          (:qpdf if wanted.call(:qpdf) && Qpdf.available?)
        ]
      elsif MAT2_PREFERRED.include?(kind) && wanted.call(:mat2) && Mat2.available?
        # Office documents and the listed image/video containers: mat2 leads.
        [:mat2, (:exiftool if wanted.call(:exiftool))]
      else
        # Everything else: ExifTool first, mat2 only if it supports the file.
        [
          (:exiftool if wanted.call(:exiftool)),
          (:mat2 if wanted.call(:mat2) && Mat2.supports?(path))
        ]
      end

    picks.compact
  end

  # Filters metadata (as read after cleaning) down to the entries that still
  # look privacy-relevant.
  #
  # Keys of the form "Group:Tag" match on either the group or the tag name;
  # keys without a colon are treated as a bare tag name. 'SourceFile' and
  # groups listed in Display::NON_METADATA_GROUPS are ignored.
  #
  # @param meta [Hash] tag key => value
  # @return [Hash] the subset of `meta` that is considered sensitive
  def privacy_residual(meta)
    meta.select do |key, _value|
      next false if key == 'SourceFile'

      # Split on the first ":" only, so the tag part may itself contain colons.
      group, tag = key.to_s.split(':', 2)
      next false if Display::NON_METADATA_GROUPS.include?(group)

      if tag
        PRIVACY_GROUPS.include?(group) || PRIVACY_TAGS.include?(tag)
      else
        PRIVACY_TAGS.include?(group.to_s)
      end
    end
  end
end
|
|
96
|
+
end
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# ───────────────────────────────────────────────────────────────────────────
|
|
4
|
+
# Single source of truth for the program's version.
|
|
5
|
+
# Both the gemspec and `metaclean --version` read from here, so we only have
|
|
6
|
+
# one place to bump.
|
|
7
|
+
# ───────────────────────────────────────────────────────────────────────────
|
|
8
|
+
|
|
9
|
+
module Metaclean
|
|
10
|
+
VERSION = '1.0.2'
|
|
11
|
+
end
|
data/lib/metaclean.rb
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# ───────────────────────────────────────────────────────────────────────────
|
|
4
|
+
# lib/metaclean.rb — the library's "front door".
|
|
5
|
+
#
|
|
6
|
+
# In Ruby, a module is a namespace. We put everything inside `Metaclean::*`
|
|
7
|
+
# so we don't pollute the global namespace and so it's obvious where each
|
|
8
|
+
# piece belongs.
|
|
9
|
+
#
|
|
10
|
+
# The `require` order matters: a file can only reference constants from
|
|
11
|
+
# files already loaded. We load the smallest pieces first, then the bigger
|
|
12
|
+
# ones that depend on them.
|
|
13
|
+
# ───────────────────────────────────────────────────────────────────────────
|
|
14
|
+
|
|
15
|
+
require 'metaclean/version' # just defines VERSION
|
|
16
|
+
require 'metaclean/display' # ANSI colors and formatters (no deps)
|
|
17
|
+
require 'metaclean/exiftool' # ExifTool wrapper
|
|
18
|
+
require 'metaclean/mat2' # mat2 wrapper
|
|
19
|
+
require 'metaclean/qpdf' # qpdf wrapper
|
|
20
|
+
require 'metaclean/strategy' # picks which tools run for each file type
|
|
21
|
+
require 'metaclean/runner' # orchestrates a clean across many files
|
|
22
|
+
require 'metaclean/cli' # parses ARGV and calls Runner
|
|
23
|
+
|
|
24
|
+
module Metaclean
|
|
25
|
+
# Custom exception classes. Inheriting from StandardError lets callers do
|
|
26
|
+
# `rescue Metaclean::Error` to catch any of our errors without accidentally
|
|
27
|
+
# catching things like NoMemoryError or SystemExit.
|
|
28
|
+
class Error < StandardError; end
|
|
29
|
+
|
|
30
|
+
# A more specific error so the CLI can show a tailored install hint when
|
|
31
|
+
# ExifTool itself is missing.
|
|
32
|
+
class ExiftoolMissing < Error; end
|
|
33
|
+
end
|
metadata
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: metaclean
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 1.0.2
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- 26zl
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies: []
|
|
12
|
+
description: |
|
|
13
|
+
metaclean is a small Ruby CLI that wraps ExifTool, mat2 and qpdf to strip
|
|
14
|
+
removable embedded tags (EXIF, IPTC, XMP, GPS, MakerNotes, ID3, document
|
|
15
|
+
properties, etc.) from images, audio, video, PDFs and Office documents —
|
|
16
|
+
and shows a before/after diff of what was removed.
|
|
17
|
+
executables:
|
|
18
|
+
- metaclean
|
|
19
|
+
extensions: []
|
|
20
|
+
extra_rdoc_files: []
|
|
21
|
+
files:
|
|
22
|
+
- LICENSE
|
|
23
|
+
- README.md
|
|
24
|
+
- bin/metaclean
|
|
25
|
+
- lib/metaclean.rb
|
|
26
|
+
- lib/metaclean/cli.rb
|
|
27
|
+
- lib/metaclean/display.rb
|
|
28
|
+
- lib/metaclean/exiftool.rb
|
|
29
|
+
- lib/metaclean/mat2.rb
|
|
30
|
+
- lib/metaclean/qpdf.rb
|
|
31
|
+
- lib/metaclean/runner.rb
|
|
32
|
+
- lib/metaclean/strategy.rb
|
|
33
|
+
- lib/metaclean/version.rb
|
|
34
|
+
homepage: https://github.com/26zl/metaclean
|
|
35
|
+
licenses:
|
|
36
|
+
- MIT
|
|
37
|
+
metadata:
|
|
38
|
+
allowed_push_host: https://rubygems.org
|
|
39
|
+
bug_tracker_uri: https://github.com/26zl/metaclean/issues
|
|
40
|
+
changelog_uri: https://github.com/26zl/metaclean/releases
|
|
41
|
+
source_code_uri: https://github.com/26zl/metaclean
|
|
42
|
+
rubygems_mfa_required: 'true'
|
|
43
|
+
rdoc_options: []
|
|
44
|
+
require_paths:
|
|
45
|
+
- lib
|
|
46
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
47
|
+
requirements:
|
|
48
|
+
- - ">="
|
|
49
|
+
- !ruby/object:Gem::Version
|
|
50
|
+
version: '3.2'
|
|
51
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
52
|
+
requirements:
|
|
53
|
+
- - ">="
|
|
54
|
+
- !ruby/object:Gem::Version
|
|
55
|
+
version: '0'
|
|
56
|
+
requirements:
|
|
57
|
+
- ExifTool (https://exiftool.org) on PATH
|
|
58
|
+
rubygems_version: 3.7.2
|
|
59
|
+
specification_version: 4
|
|
60
|
+
summary: Cross-platform CLI that strips file metadata with ExifTool, mat2 and qpdf.
|
|
61
|
+
test_files: []
|