metaclean 4.0.1 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +13 -4
- data/lib/metaclean/exiftool.rb +2 -2
- data/lib/metaclean/ffmpeg.rb +1 -1
- data/lib/metaclean/mat2.rb +1 -1
- data/lib/metaclean/qpdf.rb +1 -1
- data/lib/metaclean/runner.rb +30 -4
- data/lib/metaclean/strategy.rb +1 -1
- data/lib/metaclean/version.rb +1 -1
- data/lib/metaclean.rb +76 -0
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 29c5b4aad77d530c77d0f4fbae6bbb1811dd6c801719e67999a1da802522a06a
|
|
4
|
+
data.tar.gz: cbeb0bc1eaf21557b922641fe255b1542dfc776e8833d20ae3fc1c74c9d378a5
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b33eebe165d0d484d8fd19c9ba8bcaa28a2767b5597c5a08df946fa329256fb23694b9df35507007d4301998b49ca2bf7599eae1f11b0979b1408ee46a71ee74
|
|
7
|
+
data.tar.gz: 48a238f59f367c4bd905455cc44bfe12863b02a73bda64b22391d528ef8523d513e5d668992a8d337fe593448ee6552746ca41cf1766fcac6dabbd540e36ba85
|
data/README.md
CHANGED
|
@@ -31,7 +31,9 @@ It wraps four battle-tested tools and routes each file to the right one:
|
|
|
31
31
|
- **Verification-first:** it re-reads the cleaned file and refuses to write a
|
|
32
32
|
result when known privacy metadata survives.
|
|
33
33
|
- **Safer defaults:** it writes `*_clean` copies by default; `--in-place` keeps a
|
|
34
|
-
`.bak` and asks for confirmation unless `--force` is set.
|
|
34
|
+
`.bak` and asks for confirmation unless `--force` is set. Note the `.bak` is the
|
|
35
|
+
**original, with all its metadata** — delete or move the `.bak` files before
|
|
36
|
+
sharing an in-place-cleaned folder.
|
|
35
37
|
- **Lossless routing:** it avoids mat2 paths that recompress JPEG/WebP or
|
|
36
38
|
downconvert TIFF, and uses ffmpeg stream-copy for Matroska.
|
|
37
39
|
- **Batch-friendly:** failed or unverified files exit non-zero, so scripts and CI
|
|
@@ -150,7 +152,7 @@ metaclean --dry-run photo.jpg
|
|
|
150
152
|
| --- | --- |
|
|
151
153
|
| `--inspect` | Read-only — print metadata, never write |
|
|
152
154
|
| `--dry-run` | Simulate cleaning, show diff, write nothing |
|
|
153
|
-
| `-i`, `--in-place` | Overwrite originals (keeps a `<file>.bak`) |
|
|
155
|
+
| `-i`, `--in-place` | Overwrite originals (keeps a `<file>.bak` — the **original, metadata intact**; remove it before sharing) |
|
|
154
156
|
| `-r`, `--recursive` | Recurse into directories |
|
|
155
157
|
| `-f`, `--force` | Skip the confirmation prompt |
|
|
156
158
|
| `-h`, `--help` | Show usage and exit |
|
|
@@ -166,8 +168,8 @@ secret**; the one-time prerequisite is registering this gem's Trusted Publisher
|
|
|
166
168
|
on rubygems.org.
|
|
167
169
|
|
|
168
170
|
```bash
|
|
169
|
-
git tag v3.0.0
|
|
170
|
-
git push origin v3.0.0
|
|
171
|
+
git tag v4.0.1
|
|
172
|
+
git push origin v4.0.1
|
|
171
173
|
```
|
|
172
174
|
|
|
173
175
|
## Safety
|
|
@@ -177,6 +179,13 @@ git push origin v3.0.0
|
|
|
177
179
|
- `--in-place` writes atomically: the file is built in a temp file and
|
|
178
180
|
renamed into place, so a crash mid-run cannot leave a half-written original.
|
|
179
181
|
- Symlinks are always skipped — metaclean never cleans through a link.
|
|
182
|
+
- Folder and recursive (`-r`) scans deliberately skip **hidden files** (dot-prefixed,
|
|
183
|
+
e.g. `.secret.jpg`) and metaclean's own outputs (`*_clean.*`, `*.bak`), so a
|
|
184
|
+
directory run does not clean every file in the tree. To clean a hidden file,
|
|
185
|
+
name it explicitly: `metaclean .secret.jpg`.
|
|
186
|
+
- A `<file>.bak` left by `--in-place` is the **untouched original** and still
|
|
187
|
+
contains all its metadata. It is reported in the run, but it is yours to delete
|
|
188
|
+
or move before sharing the folder.
|
|
180
189
|
- Filename collisions (`photo_clean.jpg` already exists, `.bak` already
|
|
181
190
|
exists) are resolved with `_1`, `_2`, … suffixes, including late collisions
|
|
182
191
|
that appear while a file is being cleaned.
|
data/lib/metaclean/exiftool.rb
CHANGED
|
@@ -50,7 +50,7 @@ module Metaclean
|
|
|
50
50
|
# -n Numeric values (no human formatting like "1/100 sec")
|
|
51
51
|
# -api largefilesupport=1 Allow files >4 GB
|
|
52
52
|
def read(path)
|
|
53
|
-
out, err, status =
|
|
53
|
+
out, err, status = Metaclean.capture3(
|
|
54
54
|
'exiftool', '-j', '-G1', '-a', '-u', '-s', '-n', '-api', 'largefilesupport=1',
|
|
55
55
|
Metaclean.safe_path(path)
|
|
56
56
|
)
|
|
@@ -111,7 +111,7 @@ module Metaclean
|
|
|
111
111
|
also_delete.each { |tag| args << "-#{tag}=" }
|
|
112
112
|
args.concat(['-overwrite_original', '-q', '-q', '-api', 'largefilesupport=1', Metaclean.safe_path(path)])
|
|
113
113
|
|
|
114
|
-
_out, err, status =
|
|
114
|
+
_out, err, status = Metaclean.capture3(*args)
|
|
115
115
|
return true if status.success?
|
|
116
116
|
return :unsupported if err.match?(WRITE_UNSUPPORTED_RE)
|
|
117
117
|
|
data/lib/metaclean/ffmpeg.rb
CHANGED
|
@@ -51,7 +51,7 @@ module Metaclean
|
|
|
51
51
|
# Clear any stale temp from an earlier crashed run before muxing.
|
|
52
52
|
File.delete(tmp) if File.exist?(tmp)
|
|
53
53
|
|
|
54
|
-
_out, err, status =
|
|
54
|
+
_out, err, status = Metaclean.capture3(
|
|
55
55
|
'ffmpeg', '-y', '-v', 'error', '-nostdin', '-i', file_url(path),
|
|
56
56
|
'-map', '0', '-map_metadata', '-1', '-map_chapters', '-1', '-c', 'copy',
|
|
57
57
|
file_url(tmp)
|
data/lib/metaclean/mat2.rb
CHANGED
|
@@ -84,7 +84,7 @@ module Metaclean
|
|
|
84
84
|
# crashed run, remove it so we don't accidentally use old data.
|
|
85
85
|
File.delete(cleaned) if File.exist?(cleaned)
|
|
86
86
|
|
|
87
|
-
out, err, status =
|
|
87
|
+
out, err, status = Metaclean.capture3('mat2', safe)
|
|
88
88
|
|
|
89
89
|
# Success path first. mat2 only creates `<name>.cleaned.<ext>` when it
|
|
90
90
|
# actually stripped something; no file after exit 0 means there was
|
data/lib/metaclean/qpdf.rb
CHANGED
|
@@ -52,7 +52,7 @@ module Metaclean
|
|
|
52
52
|
src = Metaclean.safe_path(path)
|
|
53
53
|
tmp = tmp_path_for(path)
|
|
54
54
|
|
|
55
|
-
_out, err, status =
|
|
55
|
+
_out, err, status = Metaclean.capture3(
|
|
56
56
|
'qpdf', '--linearize', '--object-streams=generate',
|
|
57
57
|
'--remove-unreferenced-resources=yes', src, Metaclean.safe_path(tmp)
|
|
58
58
|
)
|
data/lib/metaclean/runner.rb
CHANGED
|
@@ -17,6 +17,10 @@ module Metaclean
|
|
|
17
17
|
class Runner
|
|
18
18
|
def initialize(options)
|
|
19
19
|
@options = options
|
|
20
|
+
# Paths that couldn't be read during discovery (missing arg, unreadable
|
|
21
|
+
# directory). Tracked so a partially-scanned batch exits non-zero instead
|
|
22
|
+
# of letting automation mistake "some files cleaned" for "everything done".
|
|
23
|
+
@scan_errors = 0
|
|
20
24
|
end
|
|
21
25
|
|
|
22
26
|
# Public entry points: one for `--inspect`, one for the cleaning flow.
|
|
@@ -27,6 +31,7 @@ module Metaclean
|
|
|
27
31
|
Display.warning('No files to inspect.')
|
|
28
32
|
exit 1
|
|
29
33
|
end
|
|
34
|
+
failed = 0
|
|
30
35
|
files.each do |file|
|
|
31
36
|
Display.header "📄 #{file}"
|
|
32
37
|
meta = Exiftool.read(file)
|
|
@@ -36,7 +41,13 @@ module Metaclean
|
|
|
36
41
|
# One unreadable/odd file shouldn't abort inspecting the rest — mirrors
|
|
37
42
|
# the per-file rescue in the clean batch.
|
|
38
43
|
warn Display.error("#{file}: #{e.message}")
|
|
44
|
+
failed += 1
|
|
39
45
|
end
|
|
46
|
+
|
|
47
|
+
# A file we couldn't read is a non-zero condition: a script using --inspect
|
|
48
|
+
# as a gate must not mistake "couldn't read it" for "no metadata". Discovery
|
|
49
|
+
# errors (missing paths / unreadable dirs) count too.
|
|
50
|
+
exit 1 if failed.positive? || @scan_errors.positive?
|
|
40
51
|
end
|
|
41
52
|
|
|
42
53
|
def clean_paths(paths)
|
|
@@ -55,11 +66,19 @@ module Metaclean
|
|
|
55
66
|
action = @options[:in_place] ? 'OVERWRITE' : 'create cleaned copies of'
|
|
56
67
|
puts Display.c("About to #{action} #{files.size} file(s).", :yellow)
|
|
57
68
|
if @options[:in_place]
|
|
58
|
-
|
|
69
|
+
# The .bak IS the original — metadata intact. Say so plainly: a user who
|
|
70
|
+
# shares the "cleaned" folder would otherwise leak it via the backup.
|
|
71
|
+
puts Display.c('Each <file>.bak is the ORIGINAL, with all metadata still in it — ' \
|
|
72
|
+
'delete or move the .bak files before sharing the folder.', :yellow)
|
|
59
73
|
end
|
|
60
74
|
print Display.c('Proceed? [y/N] ', :bold)
|
|
61
75
|
ans = $stdin.gets&.strip&.downcase # gets → nil on Ctrl-D
|
|
62
|
-
|
|
76
|
+
# Abort (no/blank/EOF) is a non-zero exit, not silent success — a
|
|
77
|
+
# non-interactive caller must not read "Aborted." as "files were cleaned".
|
|
78
|
+
unless %w[y yes].include?(ans)
|
|
79
|
+
Display.warning('Aborted.')
|
|
80
|
+
exit 1
|
|
81
|
+
end
|
|
63
82
|
end
|
|
64
83
|
|
|
65
84
|
summary = { cleaned: 0, unverified: 0, failed: 0, removed_total: 0, residual_files: 0 }
|
|
@@ -81,8 +100,9 @@ module Metaclean
|
|
|
81
100
|
|
|
82
101
|
print_summary(summary)
|
|
83
102
|
|
|
84
|
-
# Non-zero exit so CI/scripts can detect a failed or not-fully-verified file
|
|
85
|
-
|
|
103
|
+
# Non-zero exit so CI/scripts can detect a failed or not-fully-verified file,
|
|
104
|
+
# OR a batch that was never fully discovered (a path/dir we couldn't read).
|
|
105
|
+
exit 1 if summary[:failed].positive? || summary[:unverified].positive? || @scan_errors.positive?
|
|
86
106
|
end
|
|
87
107
|
|
|
88
108
|
private
|
|
@@ -487,6 +507,9 @@ module Metaclean
|
|
|
487
507
|
if summary[:residual_files].positive?
|
|
488
508
|
Display.warning "Files with privacy residual: #{summary[:residual_files]}"
|
|
489
509
|
end
|
|
510
|
+
if @scan_errors.positive?
|
|
511
|
+
Display.warning "Paths skipped during discovery (not found or unreadable): #{@scan_errors}"
|
|
512
|
+
end
|
|
490
513
|
end
|
|
491
514
|
|
|
492
515
|
# File discovery — turning the user's paths into a flat list.
|
|
@@ -510,6 +533,7 @@ module Metaclean
|
|
|
510
533
|
explicit << p
|
|
511
534
|
else
|
|
512
535
|
Display.warning "Not found: #{p}"
|
|
536
|
+
@scan_errors += 1
|
|
513
537
|
end
|
|
514
538
|
end
|
|
515
539
|
discovered.reject! { |f| skip?(f) }
|
|
@@ -559,6 +583,7 @@ module Metaclean
|
|
|
559
583
|
# warn and skip this directory so one bad entry doesn't abort discovery of
|
|
560
584
|
# the rest of the batch.
|
|
561
585
|
Display.warning "Skipping #{dir}: #{e.message}"
|
|
586
|
+
@scan_errors += 1
|
|
562
587
|
end
|
|
563
588
|
|
|
564
589
|
# Manual recursive walker. Symlinks are always skipped (never followed), so
|
|
@@ -579,6 +604,7 @@ module Metaclean
|
|
|
579
604
|
# warn and skip this directory so one bad entry doesn't abort discovery of
|
|
580
605
|
# the rest of the batch.
|
|
581
606
|
Display.warning "Skipping #{dir}: #{e.message}"
|
|
607
|
+
@scan_errors += 1
|
|
582
608
|
end
|
|
583
609
|
|
|
584
610
|
# Files we never touch when DISCOVERED via directory scanning. This is
|
data/lib/metaclean/strategy.rb
CHANGED
|
@@ -76,7 +76,7 @@ module Metaclean
|
|
|
76
76
|
|
|
77
77
|
# Returns an ordered list of tool symbols (e.g. `[:mat2, :exiftool, :qpdf]`)
|
|
78
78
|
# to run on `path`. The runner executes them in order; if one fails or
|
|
79
|
-
# is skipped, the next still runs. The
|
|
79
|
+
# is skipped, the next still runs. The tools are always used together
|
|
80
80
|
# for maximum coverage — there is no per-tool opt-out; a tool that isn't
|
|
81
81
|
# installed is simply left out (the `.available?`/`.supports?` checks).
|
|
82
82
|
def tools_for(path)
|
data/lib/metaclean/version.rb
CHANGED
data/lib/metaclean.rb
CHANGED
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
# Library entry point. require order matters: dependencies before dependents.
|
|
4
4
|
|
|
5
|
+
require 'open3'
|
|
6
|
+
|
|
5
7
|
require 'metaclean/version'
|
|
6
8
|
require 'metaclean/display'
|
|
7
9
|
require 'metaclean/exiftool'
|
|
@@ -31,6 +33,80 @@ module Metaclean
|
|
|
31
33
|
s.start_with?('-') ? File.join('.', s) : s
|
|
32
34
|
end
|
|
33
35
|
|
|
36
|
+
# External tools can hang, or run away producing endless output, on a corrupt
|
|
37
|
+
# or hostile file. Every OPERATIONAL shell-out (read/strip/rebuild) goes through
|
|
38
|
+
# this instead of Open3.capture3 so one bad file is bounded on BOTH axes — by
|
|
39
|
+
# wall-clock (COMMAND_TIMEOUT) and by captured bytes (MAX_OUTPUT_BYTES) — rather
|
|
40
|
+
# than hanging or exhausting memory and taking the whole batch with it. The
|
|
41
|
+
# quick availability probes (`-ver`/`--version`) stay on plain Open3.capture3: fixed
|
|
42
|
+
# args, no file input, nothing to hang on.
|
|
43
|
+
COMMAND_TIMEOUT = 120 # seconds
|
|
44
|
+
# Per stream (stdout AND stderr). Far above any legitimate output from the
|
|
45
|
+
# tools' invocations here (metadata JSON / `-q` strips / `-v error` muxes), so
|
|
46
|
+
# tripping it means a runaway, not a real result.
|
|
47
|
+
MAX_OUTPUT_BYTES = 64 * 1024 * 1024
|
|
48
|
+
READ_CHUNK = 64 * 1024
|
|
49
|
+
|
|
50
|
+
# Drop-in replacement for Open3.capture3 that returns the same [out, err,
|
|
51
|
+
# status] triple but kills the command (and anything it spawned) if it runs
|
|
52
|
+
# past `timeout` OR floods more than `max_output` bytes on either stream.
|
|
53
|
+
def self.capture3(*cmd, timeout: COMMAND_TIMEOUT, max_output: MAX_OUTPUT_BYTES)
|
|
54
|
+
Open3.popen3(*cmd, pgroup: true) do |stdin, stdout, stderr, wait_thr|
|
|
55
|
+
stdin.close
|
|
56
|
+
# Drain both pipes concurrently: a tool that fills one pipe buffer would
|
|
57
|
+
# otherwise block forever before exiting, and `join` below would never see
|
|
58
|
+
# it finish even though it isn't actually hung.
|
|
59
|
+
out_t = read_capped(stdout, max_output, wait_thr)
|
|
60
|
+
err_t = read_capped(stderr, max_output, wait_thr)
|
|
61
|
+
|
|
62
|
+
if wait_thr.join(timeout).nil?
|
|
63
|
+
kill_group(wait_thr)
|
|
64
|
+
out_t.join(2)
|
|
65
|
+
err_t.join(2)
|
|
66
|
+
raise Error, "#{cmd.first} timed out after #{timeout}s"
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
out, out_over = out_t.value
|
|
70
|
+
err, err_over = err_t.value
|
|
71
|
+
raise Error, "#{cmd.first} exceeded the #{max_output}-byte output limit" if out_over || err_over
|
|
72
|
+
|
|
73
|
+
[out, err, wait_thr.value]
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
# Read an IO into a String in a thread, but stop accumulating once it passes
|
|
78
|
+
# `limit` bytes — and kill the command then, so a flooding stream is cut off
|
|
79
|
+
# promptly instead of waiting out the full timeout. After the cap is hit it
|
|
80
|
+
# keeps draining (discarding) so the dying child isn't blocked on a full pipe.
|
|
81
|
+
# Returns [string, overflowed?].
|
|
82
|
+
def self.read_capped(io, limit, wait_thr)
|
|
83
|
+
Thread.new do
|
|
84
|
+
buf = +''
|
|
85
|
+
over = false
|
|
86
|
+
while (chunk = io.read(READ_CHUNK))
|
|
87
|
+
next if over # past the cap: drain & discard so the child can exit
|
|
88
|
+
|
|
89
|
+
buf << chunk
|
|
90
|
+
next unless buf.bytesize > limit
|
|
91
|
+
|
|
92
|
+
over = true
|
|
93
|
+
buf = buf.byteslice(0, limit)
|
|
94
|
+
kill_group(wait_thr)
|
|
95
|
+
end
|
|
96
|
+
[buf, over]
|
|
97
|
+
end
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
# SIGTERM the child's whole process group — pgroup:true made the child the
|
|
101
|
+
# group leader, so any helpers it forked are signalled too — escalating to
|
|
102
|
+
# SIGKILL if it ignores TERM. The minus-prefixed signal name targets the group.
|
|
103
|
+
def self.kill_group(wait_thr)
|
|
104
|
+
Process.kill('-TERM', wait_thr.pid)
|
|
105
|
+
Process.kill('-KILL', wait_thr.pid) unless wait_thr.join(2)
|
|
106
|
+
rescue Errno::ESRCH, Errno::EPERM
|
|
107
|
+
nil # already gone, or not permitted to signal it — nothing more to do
|
|
108
|
+
end
|
|
109
|
+
|
|
34
110
|
# Lower-cased, dot-stripped extension used for FORMAT ROUTING decisions
|
|
35
111
|
# (Strategy#tools_for, Strategy#mat2_essential?, Mat2.supports?). One
|
|
36
112
|
# definition so every routing path normalizes the extension identically —
|