metaclean 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,140 @@
1
+ # frozen_string_literal: true
2
+
3
+ # ───────────────────────────────────────────────────────────────────────────
4
+ # A thin Ruby wrapper around the external `exiftool` binary.
5
+ #
6
+ # We use `Open3.capture3` instead of backticks or `system()` because:
7
+ # 1. It returns stdout, stderr, and the process status separately.
8
+ # 2. When called with multiple arguments, it bypasses the shell entirely
9
+ # — so a filename like `cat; rm -rf /` is treated as ONE filename, not
10
+ # a shell command. This is the standard way to safely shell out in Ruby.
11
+ # ───────────────────────────────────────────────────────────────────────────
12
+
13
+ require 'open3'
14
+ require 'json'
15
+
16
module Metaclean
  # Stateless wrapper around the external `exiftool` binary.
  #
  # A `module` (not a `class`) because there is nothing to carry per instance:
  # callers use `Exiftool.read(path)` / `Exiftool.strip!(path)` directly.
  # Every subprocess is started with `Open3.capture3` and a multi-element
  # argument list, so no shell is involved.
  module Exiftool
    # Lets ExifTool handle files larger than 4 GB. Kept in one place so the
    # read call, the strip call and the fallback strip call all pass it.
    LARGE_FILE_ARGS = %w[-api largefilesupport=1].freeze

    # `module_function` exposes every method below as `Exiftool.name` (and as
    # a private instance method), which saves writing `def self.` each time.
    module_function

    # Reports whether `exiftool` can be launched.
    #
    # The answer is memoized in `@available`. `defined?` is used rather than a
    # nil/truthiness check because a cached `false` must also short-circuit.
    #
    # @return [Boolean]
    def available?
      return @available if defined?(@available)

      _out, _err, status = Open3.capture3('exiftool', '-ver')
      @available = status.success?
    rescue Errno::ENOENT, Errno::EACCES
      # ENOENT: no such executable. EACCES: the OS refused to execute it.
      # Either way the tool cannot be used, so report "not available" instead
      # of leaking a low-level Errno to the caller.
      @available = false
    end

    # @return [String, nil] the output of `exiftool -ver` with surrounding
    #   whitespace removed, or nil when exiftool is missing or exits non-zero.
    def version
      return nil unless available?

      out, _err, status = Open3.capture3('exiftool', '-ver')
      status.success? ? out.strip : nil
    rescue Errno::ENOENT, Errno::EACCES
      nil
    end

    # Fails hard, with install hints, when exiftool cannot be used. `read` and
    # `strip!` call this first so users get one clear message.
    #
    # @raise [ExiftoolMissing] when `available?` is false
    # @return [nil]
    def ensure_available!
      return if available?

      # Squiggly heredoc: the common leading indentation is stripped.
      raise ExiftoolMissing, <<~MSG
        ExifTool is not installed or not on PATH.

        Install:
        macOS: brew install exiftool
        Debian: sudo apt install libimage-exiftool-perl
        Fedora: sudo dnf install perl-Image-ExifTool
        Arch: sudo pacman -S perl-image-exiftool
        Windows: scoop install exiftool (or download exiftool.org)
      MSG
    end

    # Reads the metadata of one file.
    #
    # ExifTool flags used:
    #   -j    JSON output (machine-parseable)
    #   -G1   include the family-1 group name (e.g. "EXIF", "GPS", "IPTC")
    #   -a    allow duplicate tags
    #   -u    include unknown/unidentified tags
    #   -s    short tag names (no descriptions)
    #   -n    numeric values (no human formatting like "1/100 sec")
    #   -api largefilesupport=1   allow files >4 GB
    #
    # @param path [#to_s] file to inspect
    # @return [Hash] the first element of ExifTool's JSON array, or `{}` when
    #   the array is empty
    # @raise [ExiftoolMissing] when exiftool is unavailable
    # @raise [Error] when exiftool exits non-zero or prints unparseable JSON
    def read(path)
      ensure_available!

      out, err, status = Open3.capture3(
        'exiftool', '-j', '-G1', '-a', '-u', '-s', '-n', *LARGE_FILE_ARGS, file_arg(path)
      )
      raise Error, "ExifTool read failed: #{err.strip}" unless status.success?

      # One file in, so at most one entry out; `|| {}` covers an empty array.
      JSON.parse(out).first || {}
    rescue JSON::ParserError => e
      raise Error, "Could not parse ExifTool output: #{e.message}"
    end

    # Removes every removable tag from `path`, in place.
    #
    # `-all=` clears every tag; `-overwrite_original` makes ExifTool replace
    # the file instead of leaving a `file_original` copy next to it. When a
    # `keep_*` flag is set, `-tagsFromFile @` copies the listed tags back from
    # the same file after the wipe, i.e. "delete everything, then re-add only
    # these".
    #
    # If that preserving pass fails, a plain full strip is tried once. The
    # retry is skipped when nothing was being preserved, because it would be
    # identical to the attempt that just failed.
    #
    # @param path [#to_s] file to clean
    # @param keep_orientation [Boolean] keep the Orientation tag
    # @param keep_color_profile [Boolean] keep the ICC profile
    # @return [true] on success
    # @raise [ExiftoolMissing] when exiftool is unavailable
    # @raise [Error] when the strip (and the fallback, if any) fails
    def strip!(path, keep_orientation: false, keep_color_profile: false)
      ensure_available!

      kept_tags = []
      kept_tags << '-Orientation' if keep_orientation
      kept_tags << '-ICC_Profile' if keep_color_profile

      # Shared by both attempts so the fallback keeps large-file support.
      tail = ['-overwrite_original', '-q', '-q', *LARGE_FILE_ARGS, file_arg(path)]

      first_try = ['exiftool', '-all=']
      first_try.push('-tagsFromFile', '@', *kept_tags) unless kept_tags.empty?

      _out, err, status = Open3.capture3(*first_try, *tail)
      return true if status.success?
      raise Error, "ExifTool strip failed: #{err.strip}" if kept_tags.empty?

      _out, retry_err, retry_status = Open3.capture3('exiftool', '-all=', *tail)
      return true if retry_status.success?

      # Prefer the fallback's message; use the first attempt's if it was silent.
      raise Error, "ExifTool strip failed: #{retry_err.strip.empty? ? err.strip : retry_err.strip}"
    end

    # Turns `path` into a string that is safe to pass as a file argument.
    #
    # Bypassing the shell does not stop the tool itself from reading a leading
    # `-` as an option, so a relative name such as `-x.jpg` is rewritten to
    # `./-x.jpg`. All other paths are returned unchanged.
    #
    # @param path [#to_s]
    # @return [String]
    def file_arg(path)
      name = path.to_s
      name.start_with?('-') ? File.join('.', name) : name
    end
    private_class_method :file_arg
  end
end
@@ -0,0 +1,123 @@
1
+ # frozen_string_literal: true
2
+
3
+ # ───────────────────────────────────────────────────────────────────────────
4
+ # Wrapper around the external `mat2` (Metadata Anonymisation Toolkit 2).
5
+ #
6
+ # mat2 is stricter than ExifTool on certain formats (DOCX/PDF/PNG): instead
7
+ # of blacklisting known tags, it rebuilds the file from scratch keeping only
8
+ # the bytes it understands. The trade-off is that mat2 supports fewer formats.
9
+ #
10
+ # mat2's CLI quirk: it does NOT overwrite the original. It writes a new file
11
+ # named `<name>.cleaned.<ext>` next to it. We adapt by renaming that result
12
+ # back over the source after a successful run.
13
+ # ───────────────────────────────────────────────────────────────────────────
14
+
15
+ require 'open3'
16
+ require 'fileutils'
17
+
18
module Metaclean
  # Stateless wrapper around the external `mat2` binary.
  #
  # mat2 does not overwrite its input: it writes `<name>.cleaned.<ext>` next
  # to it. `strip!` adapts by moving that result back over the source after a
  # successful run.
  module Mat2
    # File extensions we are willing to hand to mat2. Deliberately
    # conservative: an unsupported file would still be caught by
    # UNSUPPORTED_RE, but it is cheaper not to try at all.
    SUPPORTED_EXTS = %w[
      pdf png jpg jpeg tif tiff gif bmp svg webp
      mp3 flac ogg opus wav m4a
      mp4 avi mkv mov wmv webm
      docx xlsx pptx odt ods odp odg odf epub
      zip torrent
    ].freeze

    # Case-insensitive match for the messages mat2 prints when it cannot
    # handle a file; separates a "soft skip" from a real failure.
    UNSUPPORTED_RE = /(not supported|isn't supported|cannot be cleaned|unsupported file)/i.freeze

    module_function

    # Reports whether `mat2` can be launched. Memoized in `@available`;
    # `defined?` is used so that a cached `false` is honoured too.
    #
    # @return [Boolean]
    def available?
      return @available if defined?(@available)

      _out, _err, status = Open3.capture3('mat2', '--version')
      @available = status.success?
    rescue Errno::ENOENT, Errno::EACCES
      # Missing binary, or one the OS refuses to execute: unusable either way.
      @available = false
    end

    # @return [String, nil] the last whitespace-separated token of
    #   `mat2 --version` (so any prefix before the number is ignored), or nil
    #   when mat2 is missing or exits non-zero.
    def version
      return nil unless available?

      out, _err, status = Open3.capture3('mat2', '--version')
      status.success? ? out.strip.split.last : nil
    rescue Errno::ENOENT, Errno::EACCES
      nil
    end

    # Cheap pre-check: is mat2 usable and is the extension on our list?
    #
    # @param path [String, #to_path]
    # @return [Boolean]
    def supports?(path)
      return false unless available?

      SUPPORTED_EXTS.include?(File.extname(path).downcase.delete('.'))
    end

    # Strips metadata from `path` in place.
    #
    # Symbols are returned for the non-fatal outcomes so a caller can report
    # a skip and carry on instead of rescuing.
    #
    # @param path [#to_s] file to clean
    # @return [true] mat2 produced a cleaned file and it replaced `path`
    # @return [:no_metadata] mat2 succeeded but produced no cleaned file
    # @return [:unsupported] mat2's output matched UNSUPPORTED_RE
    # @raise [Error] when mat2 is unavailable or exits non-zero
    def strip!(path)
      raise Error, 'mat2 not available' unless available?

      target = file_arg(path)
      cleaned = cleaned_path_for(target)

      # A leftover from an earlier crashed run must go first, otherwise it
      # could be mistaken for this run's output further down.
      discard(cleaned)

      out, err, status = Open3.capture3('mat2', target)

      # Soft skip: mat2 itself said it cannot process the file. Remove any
      # partial output so a later run does not pick it up.
      if "#{out}\n#{err}".match?(UNSUPPORTED_RE)
        discard(cleaned)
        return :unsupported
      end

      unless status.success?
        discard(cleaned)
        # Report whichever stream actually carries the message.
        raise Error, "mat2 failed: #{err.strip.empty? ? out.strip : err.strip}"
      end

      # No cleaned file after a successful run is reported as "nothing to strip".
      return :no_metadata unless File.exist?(cleaned)

      FileUtils.mv(cleaned, target)
      true
    end

    # Builds the sibling path `<dir>/<stem>.cleaned<ext>` for `path`.
    # Uses File.dirname/basename/join rather than string concatenation.
    #
    # @param path [String, #to_path]
    # @return [String]
    def cleaned_path_for(path)
      suffix = File.extname(path)
      File.join(File.dirname(path), "#{File.basename(path, suffix)}.cleaned#{suffix}")
    end

    # Turns `path` into a string that is safe to pass as a file argument: a
    # relative name starting with `-` (which the tool would read as an option)
    # becomes `./-name`; anything else is returned unchanged.
    #
    # @param path [#to_s]
    # @return [String]
    def file_arg(path)
      name = path.to_s
      name.start_with?('-') ? File.join('.', name) : name
    end

    # Deletes `file` when it exists. Errors from the delete itself propagate.
    #
    # @param file [String]
    # @return [void]
    def discard(file)
      File.delete(file) if File.exist?(file)
    end
    private_class_method :file_arg, :discard
  end
end
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
+ # ───────────────────────────────────────────────────────────────────────────
4
+ # Wrapper around `qpdf` — a PDF structural cleaner.
5
+ #
6
+ # Why qpdf in addition to mat2/ExifTool? PDFs can carry metadata in places
7
+ # those two don't always reach: orphaned objects, unused image streams,
8
+ # old revisions kept in the file. qpdf rebuilds the PDF from scratch using
9
+ # only the objects actually referenced by the document. That's a great
10
+ # final pass after the other tools have stripped the obvious metadata.
11
+ # ───────────────────────────────────────────────────────────────────────────
12
+
13
+ require 'open3'
14
+ require 'fileutils'
15
+
16
module Metaclean
  # Stateless wrapper around the external `qpdf` binary, used to rebuild a
  # PDF so that only objects the document still references are kept.
  module Qpdf
    module_function

    # Reports whether `qpdf` can be launched. Memoized in `@available`;
    # `defined?` is used so that a cached `false` is honoured too.
    #
    # @return [Boolean]
    def available?
      return @available if defined?(@available)

      _out, _err, status = Open3.capture3('qpdf', '--version')
      @available = status.success?
    rescue Errno::ENOENT, Errno::EACCES
      # Missing binary, or one the OS refuses to execute: unusable either way.
      @available = false
    end

    # @return [String, nil] the first line of `qpdf --version`, stripped, or
    #   nil when qpdf is missing or exits non-zero.
    def version
      return nil unless available?

      out, _err, status = Open3.capture3('qpdf', '--version')
      # The output is multi-line; only the leading line is wanted.
      status.success? ? out.lines.first.to_s.strip : nil
    rescue Errno::ENOENT, Errno::EACCES
      nil
    end

    # Rebuilds the PDF at `path` in place.
    #
    # qpdf flags used:
    #   --linearize                          optimize for streaming/web
    #   --object-streams=generate            bundle objects efficiently
    #   --remove-unreferenced-resources=yes  drop unused content (the
    #                                        privacy-relevant part)
    #
    # qpdf is asked to write to a temp file beside the source, which is then
    # moved over the original, so the source is only replaced once a complete
    # output exists. The temp file is removed on every failure path.
    #
    # @param path [#to_s] PDF to rebuild
    # @return [true] on success (including qpdf's "succeeded with warnings")
    # @raise [Error] when qpdf is unavailable or reports a failure
    def rebuild!(path)
      raise Error, 'qpdf not available' unless available?

      source = file_arg(path)
      # The PID keeps concurrent metaclean processes from sharing a temp name.
      tmp = "#{source}.qpdf.tmp.#{Process.pid}"

      _out, err, status = Open3.capture3(
        'qpdf', '--linearize', '--object-streams=generate',
        '--remove-unreferenced-resources=yes', source, tmp
      )

      # Exit code 3 is qpdf's "succeeded with warnings": treat it as success.
      raise Error, "qpdf failed: #{err.strip}" unless status.success? || status.exitstatus == 3

      FileUtils.mv(tmp, source)
      true
    ensure
      # After a successful move there is nothing left to remove; otherwise this
      # clears partial output even when the spawn or the move itself raised.
      # `tmp` is still nil when the availability guard fired.
      FileUtils.rm_f(tmp) if tmp
    end

    # Turns `path` into a string that is safe to pass as a file argument: a
    # relative name starting with `-` (which the tool would read as an option)
    # becomes `./-name`; anything else is returned unchanged.
    #
    # @param path [#to_s]
    # @return [String]
    def file_arg(path)
      name = path.to_s
      name.start_with?('-') ? File.join('.', name) : name
    end
    private_class_method :file_arg
  end
end