metaclean 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +132 -0
- data/bin/metaclean +29 -0
- data/lib/metaclean/cli.rb +169 -0
- data/lib/metaclean/display.rb +197 -0
- data/lib/metaclean/exiftool.rb +140 -0
- data/lib/metaclean/mat2.rb +123 -0
- data/lib/metaclean/qpdf.rb +75 -0
- data/lib/metaclean/runner.rb +451 -0
- data/lib/metaclean/strategy.rb +96 -0
- data/lib/metaclean/version.rb +11 -0
- data/lib/metaclean.rb +33 -0
- metadata +61 -0
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# ───────────────────────────────────────────────────────────────────────────
|
|
4
|
+
# A thin Ruby wrapper around the external `exiftool` binary.
|
|
5
|
+
#
|
|
6
|
+
# We use `Open3.capture3` instead of backticks or `system()` because:
|
|
7
|
+
# 1. It returns stdout, stderr, and the process status separately.
|
|
8
|
+
# 2. When called with multiple arguments, it bypasses the shell entirely
|
|
9
|
+
# — so a filename like `cat; rm -rf /` is treated as ONE filename, not
|
|
10
|
+
# a shell command. This is the standard way to safely shell out in Ruby.
|
|
11
|
+
# ───────────────────────────────────────────────────────────────────────────
|
|
12
|
+
|
|
13
|
+
require 'open3'
|
|
14
|
+
require 'json'
|
|
15
|
+
|
|
16
|
+
module Metaclean
  # Stateless wrapper around the external `exiftool` binary.
  #
  # Every method below is exposed as a module-level function through
  # `module_function` (for example `Exiftool.read(path)`); there is no
  # per-instance state. Subprocesses are started with `Open3.capture3` and an
  # argument list, so no shell is involved.
  module Exiftool
    module_function

    # Reports whether `exiftool -ver` can be run successfully.
    #
    # The answer is cached in `@available`. `defined?` is used instead of a
    # truthiness check so that a cached `false` is honoured as well and the
    # probe is not repeated.
    #
    # @return [Boolean]
    def available?
      return @available if defined?(@available)

      _out, _err, status = Open3.capture3('exiftool', '-ver')
      @available = status.success?
    rescue Errno::ENOENT
      # Raised by Open3 when the executable cannot be found.
      @available = false
    end

    # Returns the stripped output of `exiftool -ver`.
    #
    # @return [String, nil] nil when exiftool is unavailable or the call fails
    def version
      return nil unless available?

      out, _err, status = Open3.capture3('exiftool', '-ver')
      status.success? ? out.strip : nil
    rescue Errno::ENOENT
      nil
    end

    # Guard used by `read` and `strip!` before doing any work, so the caller
    # gets one readable message with install hints.
    #
    # @raise [ExiftoolMissing] when `available?` is false
    # @return [nil]
    def ensure_available!
      return if available?

      raise ExiftoolMissing, <<~MSG
        ExifTool is not installed or not on PATH.

        Install:
        macOS: brew install exiftool
        Debian: sudo apt install libimage-exiftool-perl
        Fedora: sudo dnf install perl-Image-ExifTool
        Arch: sudo pacman -S perl-image-exiftool
        Windows: scoop install exiftool (or download exiftool.org)
      MSG
    end

    # Reads the metadata of one file.
    #
    # ExifTool flags used:
    #   -j    JSON output
    #   -G1   prefix tags with their family-1 group name
    #   -a    allow duplicate tags
    #   -u    include unknown tags
    #   -s    short tag names
    #   -n    numeric values, no human formatting
    #   -api largefilesupport=1   allow files larger than 4 GB
    #
    # @param path [#to_s] file to inspect
    # @return [Hash] first entry of ExifTool's JSON array, or `{}` if the
    #   array is empty
    # @raise [ExiftoolMissing] when exiftool is not available
    # @raise [Error] when exiftool exits unsuccessfully or prints invalid JSON
    def read(path)
      ensure_available!
      out, err, status = Open3.capture3(
        'exiftool', '-j', '-G1', '-a', '-u', '-s', '-n', '-api', 'largefilesupport=1', path_argument(path)
      )
      raise Error, "ExifTool read failed: #{err.strip}" unless status.success?

      # One file in, so at most one entry out.
      JSON.parse(out).first || {}
    rescue JSON::ParserError => e
      raise Error, "Could not parse ExifTool output: #{e.message}"
    end

    # Removes every removable tag from `path`, in place.
    #
    # `-all=` clears every tag and `-overwrite_original` stops ExifTool from
    # leaving a `*_original` backup next to the file. When a `keep_*` flag is
    # set, `-tagsFromFile @` copies the listed tags back from the same file
    # after the wipe.
    #
    # If a preserving pass fails, a plain full strip is attempted once. The
    # retry uses the same trailing options as the first pass (including
    # `-api largefilesupport=1`), so it can handle the same files.
    #
    # @param path [#to_s] file to clean
    # @param keep_orientation [Boolean] keep the Orientation tag
    # @param keep_color_profile [Boolean] keep the ICC profile
    # @return [true] on success
    # @raise [ExiftoolMissing] when exiftool is not available
    # @raise [Error] when every attempt fails
    def strip!(path, keep_orientation: false, keep_color_profile: false)
      ensure_available!

      kept_tags = []
      kept_tags << '-Orientation' if keep_orientation
      kept_tags << '-ICC_Profile' if keep_color_profile

      wipe = ['exiftool', '-all=']
      tail = ['-overwrite_original', '-q', '-q', '-api', 'largefilesupport=1', path_argument(path)]
      restore = kept_tags.empty? ? [] : ['-tagsFromFile', '@', *kept_tags]

      _out, err, status = Open3.capture3(*wipe, *restore, *tail)
      return true if status.success?

      # Without preserved tags the retry would be identical to the failed call.
      raise Error, "ExifTool strip failed: #{err.strip}" if kept_tags.empty?

      _out, retry_err, retry_status = Open3.capture3(*wipe, *tail)
      return true if retry_status.success?

      # Prefer the retry's message; fall back to the first one if it was silent.
      raise Error, "ExifTool strip failed: #{retry_err.strip.empty? ? err.strip : retry_err.strip}"
    end

    # Turns `path` into a command-line argument that cannot be mistaken for an
    # option: a name starting with `-` gets a `./` prefix, which refers to the
    # same file.
    #
    # @param path [#to_s]
    # @return [String]
    def path_argument(path)
      text = path.to_s
      text.start_with?('-') ? "./#{text}" : text
    end
    private_class_method :path_argument
  end
end
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# ───────────────────────────────────────────────────────────────────────────
|
|
4
|
+
# Wrapper around the external `mat2` (Metadata Anonymisation Toolkit 2).
|
|
5
|
+
#
|
|
6
|
+
# mat2 is stricter than ExifTool on certain formats (DOCX/PDF/PNG): instead
|
|
7
|
+
# of blacklisting known tags, it rebuilds the file from scratch keeping only
|
|
8
|
+
# the bytes it understands. The trade-off is that mat2 supports fewer formats.
|
|
9
|
+
#
|
|
10
|
+
# mat2's CLI quirk: it does NOT overwrite the original. It writes a new file
|
|
11
|
+
# named `<name>.cleaned.<ext>` next to it. We adapt by renaming that result
|
|
12
|
+
# back over the source after a successful run.
|
|
13
|
+
# ───────────────────────────────────────────────────────────────────────────
|
|
14
|
+
|
|
15
|
+
require 'open3'
|
|
16
|
+
require 'fileutils'
|
|
17
|
+
|
|
18
|
+
module Metaclean
  # Wrapper around the external `mat2` command.
  #
  # mat2 does not overwrite its input: it writes `<name>.cleaned.<ext>` next
  # to it. `strip!` moves that result back over the source after a successful
  # run. All methods are module-level functions (`module_function`).
  module Mat2
    # Lower-case extensions (without the dot) that `supports?` accepts.
    SUPPORTED_EXTS = %w[
      pdf png jpg jpeg tif tiff gif bmp svg webp
      mp3 flac ogg opus wav m4a
      mp4 avi mkv mov wmv webm
      docx xlsx pptx odt ods odp odg odf epub
      zip torrent
    ].freeze

    # Case-insensitive pattern for the messages that `strip!` treats as a
    # soft "cannot handle this file" answer rather than a hard error.
    UNSUPPORTED_RE = /(not supported|isn't supported|cannot be cleaned|unsupported file)/i.freeze

    module_function

    # Reports whether `mat2 --version` can be run successfully. The answer is
    # cached in `@available`; `defined?` keeps a cached `false` valid too.
    #
    # @return [Boolean]
    def available?
      return @available if defined?(@available)

      _out, _err, status = Open3.capture3('mat2', '--version')
      @available = status.success?
    rescue Errno::ENOENT
      # Raised by Open3 when the executable cannot be found.
      @available = false
    end

    # Returns the last whitespace-separated word of `mat2 --version`
    # (for "mat2 0.14.0" that is "0.14.0").
    #
    # @return [String, nil] nil when mat2 is unavailable or the call fails
    def version
      return nil unless available?

      out, _err, status = Open3.capture3('mat2', '--version')
      status.success? ? out.strip.split.last : nil
    rescue Errno::ENOENT
      nil
    end

    # Cheap pre-check: mat2 must be available and the file's extension
    # (case-insensitive) must be listed in SUPPORTED_EXTS.
    #
    # @param path [String] file path; only its extension is inspected
    # @return [Boolean]
    def supports?(path)
      return false unless available?

      SUPPORTED_EXTS.include?(File.extname(path).downcase.delete('.'))
    end

    # Strips metadata from `path` in place.
    #
    # @param path [#to_s] file to clean
    # @return [true] when mat2 produced a cleaned file and it replaced `path`
    # @return [:no_metadata] when mat2 succeeded without producing a cleaned file
    # @return [:unsupported] when mat2's output matches UNSUPPORTED_RE
    # @raise [Error] when mat2 is unavailable or exits unsuccessfully
    def strip!(path)
      raise Error, 'mat2 not available' unless available?

      cleaned = cleaned_path_for(path)

      # A leftover from an earlier interrupted run must not be mistaken for
      # this run's output.
      discard(cleaned)

      out, err, status = Open3.capture3('mat2', path_argument(path))

      # Soft skip: the message is checked on both streams, before the exit
      # status, and any partial output is removed.
      if "#{out}\n#{err}".match?(UNSUPPORTED_RE)
        discard(cleaned)
        return :unsupported
      end

      unless status.success?
        discard(cleaned)
        # Report whichever stream actually carries text.
        raise Error, "mat2 failed: #{err.strip.empty? ? out.strip : err.strip}"
      end

      # No cleaned file after a successful run is reported as "nothing to remove".
      return :no_metadata unless File.exist?(cleaned)

      FileUtils.mv(cleaned, path.to_s)
      true
    end

    # Builds the sibling path `<stem>.cleaned<ext>` for `path` using
    # File.dirname/basename/join rather than string concatenation.
    #
    # @param path [String]
    # @return [String]
    def cleaned_path_for(path)
      ext = File.extname(path)
      File.join(File.dirname(path), "#{File.basename(path, ext)}.cleaned#{ext}")
    end

    # Deletes `file` if it exists.
    #
    # @param file [String]
    # @return [void]
    def discard(file)
      File.delete(file) if File.exist?(file)
    end
    private_class_method :discard

    # Turns `path` into a command-line argument that cannot be mistaken for an
    # option: a name starting with `-` gets a `./` prefix, which refers to the
    # same file (and to the same `.cleaned` sibling).
    #
    # @param path [#to_s]
    # @return [String]
    def path_argument(path)
      text = path.to_s
      text.start_with?('-') ? "./#{text}" : text
    end
    private_class_method :path_argument
  end
end
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# ───────────────────────────────────────────────────────────────────────────
|
|
4
|
+
# Wrapper around `qpdf` — a PDF structural cleaner.
|
|
5
|
+
#
|
|
6
|
+
# Why qpdf in addition to mat2/ExifTool? PDFs can carry metadata in places
|
|
7
|
+
# those two don't always reach: orphaned objects, unused image streams,
|
|
8
|
+
# old revisions kept in the file. qpdf rebuilds the PDF from scratch using
|
|
9
|
+
# only the objects actually referenced by the document. That's a great
|
|
10
|
+
# final pass after the other tools have stripped the obvious metadata.
|
|
11
|
+
# ───────────────────────────────────────────────────────────────────────────
|
|
12
|
+
|
|
13
|
+
require 'open3'
|
|
14
|
+
require 'fileutils'
|
|
15
|
+
|
|
16
|
+
module Metaclean
  # Wrapper around the external `qpdf` command, used to rewrite a PDF so that
  # unreferenced resources are dropped. All methods are module-level
  # functions (`module_function`).
  module Qpdf
    module_function

    # Reports whether `qpdf --version` can be run successfully. The answer is
    # cached in `@available`; `defined?` keeps a cached `false` valid too.
    #
    # @return [Boolean]
    def available?
      return @available if defined?(@available)

      _out, _err, status = Open3.capture3('qpdf', '--version')
      @available = status.success?
    rescue Errno::ENOENT
      # Raised by Open3 when the executable cannot be found.
      @available = false
    end

    # Returns the first line of `qpdf --version`, stripped.
    #
    # @return [String, nil] nil when qpdf is unavailable or the call fails
    def version
      return nil unless available?

      out, _err, status = Open3.capture3('qpdf', '--version')
      status.success? ? out.lines.first.to_s.strip : nil
    rescue Errno::ENOENT
      nil
    end

    # Rebuilds the PDF at `path` in place.
    #
    # qpdf flags used:
    #   --linearize                          optimize for streaming/web
    #   --object-streams=generate            bundle objects into object streams
    #   --remove-unreferenced-resources=yes  drop unused content
    #
    # qpdf writes to a temporary sibling file (the process id is part of its
    # name to avoid collisions between concurrent runs), which is then moved
    # over the original. Exit status 3 is accepted as "succeeded with
    # warnings". The temporary file is removed on every failure path,
    # including a failed move.
    #
    # @param path [#to_s] PDF to rebuild
    # @return [true] on success
    # @raise [Error] when qpdf is unavailable, fails, or leaves no output file
    def rebuild!(path)
      raise Error, 'qpdf not available' unless available?

      source = path.to_s
      tmp = "#{source}.qpdf.tmp.#{Process.pid}"

      _out, err, status = Open3.capture3(
        'qpdf', '--linearize', '--object-streams=generate',
        '--remove-unreferenced-resources=yes', path_argument(source), path_argument(tmp)
      )

      finished = status.success? || status.exitstatus == 3
      raise Error, "qpdf failed: #{err.strip}" unless finished

      # An accepted exit status without an output file would otherwise surface
      # as a low-level Errno::ENOENT from the move below.
      unless File.exist?(tmp)
        raise Error, "qpdf failed: #{err.strip.empty? ? 'no output file was produced' : err.strip}"
      end

      FileUtils.mv(tmp, source)
      true
    ensure
      # After a successful move the temp file is gone and this is a no-op.
      FileUtils.rm_f(tmp) if tmp
    end

    # Turns `path` into a command-line argument that cannot be mistaken for an
    # option: a name starting with `-` gets a `./` prefix, which refers to the
    # same file.
    #
    # @param path [#to_s]
    # @return [String]
    def path_argument(path)
      text = path.to_s
      text.start_with?('-') ? "./#{text}" : text
    end
    private_class_method :path_argument
  end
end
|