rosetta-squint 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""rosetta_squint — point at an image (path or bytes), get the same phash
|
|
2
|
+
hex string as every other rosetta-squint port for the same input.
|
|
3
|
+
|
|
4
|
+
This is the Python implementation of the cross-language perceptual-hash
|
|
5
|
+
convenience API. It depends on `rosetta_squint_hash` (which re-exports
|
|
6
|
+
upstream `imagehash` + adds `whash_db4_robust`) and uses PIL/Pillow for
|
|
7
|
+
decoding most formats. HEIC is decoded via a ctypes wrapper around
|
|
8
|
+
system libheif so that output matches the 5 native ports (which all FFI
|
|
9
|
+
to the same system libheif).
|
|
10
|
+
|
|
11
|
+
Each public function comes in two flavors:
|
|
12
|
+
- `phash(path, ...)` — accept a file path (str or Path)
|
|
13
|
+
- `phash_bytes(bytes, ...)` — accept raw image bytes in memory
|
|
14
|
+
|
|
15
|
+
API matches the same names in the non-Python rosetta-squint ports
|
|
16
|
+
(`phash`, `dhash`, `average_hash`, `whash_haar`, `colorhash`,
|
|
17
|
+
`crop_resistant_hash`, plus the extensions `whash_db4`, `whash_db4_robust`,
|
|
18
|
+
`phash_simple`, `dhash_vertical`).
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from ._impl import (
|
|
24
|
+
# Path-based entries
|
|
25
|
+
average_hash,
|
|
26
|
+
colorhash,
|
|
27
|
+
crop_resistant_hash,
|
|
28
|
+
dhash,
|
|
29
|
+
dhash_vertical,
|
|
30
|
+
phash,
|
|
31
|
+
phash_simple,
|
|
32
|
+
whash_db4,
|
|
33
|
+
whash_db4_robust,
|
|
34
|
+
whash_haar,
|
|
35
|
+
# Bytes-based entries
|
|
36
|
+
average_hash_bytes,
|
|
37
|
+
colorhash_bytes,
|
|
38
|
+
crop_resistant_hash_bytes,
|
|
39
|
+
dhash_bytes,
|
|
40
|
+
dhash_vertical_bytes,
|
|
41
|
+
phash_bytes,
|
|
42
|
+
phash_simple_bytes,
|
|
43
|
+
whash_db4_bytes,
|
|
44
|
+
whash_db4_robust_bytes,
|
|
45
|
+
whash_haar_bytes,
|
|
46
|
+
# Decode helpers
|
|
47
|
+
decode_bytes,
|
|
48
|
+
decode_file,
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
# Re-export hash types so callers don't need to import rosetta_squint_hash
|
|
52
|
+
# separately.
|
|
53
|
+
from rosetta_squint_hash import (
|
|
54
|
+
ImageHash,
|
|
55
|
+
ImageMultiHash,
|
|
56
|
+
hex_to_flathash,
|
|
57
|
+
hex_to_hash,
|
|
58
|
+
hex_to_multihash,
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
# Must match the version in the distribution metadata (this wheel ships as
# rosetta-squint 1.0.0); the previous value "0.1.0" was stale.
__version__ = "1.0.0"
|
|
62
|
+
|
|
63
|
+
# Names exported by ``from rosetta_squint import *``.
__all__ = [
    # Hash value types and hex parsers imported from rosetta_squint_hash.
    "ImageHash",
    "ImageMultiHash",
    "hex_to_flathash",
    "hex_to_hash",
    "hex_to_multihash",
    # Decode helpers imported from ._impl.
    "decode_file",
    "decode_bytes",
    # Hash functions imported from ._impl: each algorithm is listed as its
    # path-based name followed by its ``*_bytes`` counterpart.
    "average_hash",
    "average_hash_bytes",
    "colorhash",
    "colorhash_bytes",
    "crop_resistant_hash",
    "crop_resistant_hash_bytes",
    "dhash",
    "dhash_bytes",
    "dhash_vertical",
    "dhash_vertical_bytes",
    "phash",
    "phash_bytes",
    "phash_simple",
    "phash_simple_bytes",
    "whash_db4",
    "whash_db4_bytes",
    "whash_db4_robust",
    "whash_db4_robust_bytes",
    "whash_haar",
    "whash_haar_bytes",
    # Package version string.
    "__version__",
]
|
rosetta_squint/_impl.py
ADDED
|
@@ -0,0 +1,462 @@
|
|
|
1
|
+
"""Implementation of the rosetta_squint convenience API.
|
|
2
|
+
|
|
3
|
+
For most formats we use PIL.Image.open() because that's what upstream
|
|
4
|
+
`imagehash` itself uses, so we match imagehash's behavior exactly. For
|
|
5
|
+
HEIC specifically, we decode via a ctypes wrapper around system libheif
|
|
6
|
+
(NOT pillow-heif, which bundles libheif 1.21.2 and diverges ±1 px from
|
|
7
|
+
the system libheif 1.17.6 that the 5 native ports link to).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import ctypes
|
|
13
|
+
import ctypes.util
|
|
14
|
+
import io
|
|
15
|
+
import os
|
|
16
|
+
import stat
|
|
17
|
+
import sys
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Union
|
|
20
|
+
|
|
21
|
+
import imagehash
|
|
22
|
+
import rosetta_squint_hash as rih
|
|
23
|
+
from PIL import Image
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Size ceiling, in bytes, for the path-based decode entry points. Together
# with the regular-file check it lets oversized inputs and non-regular files
# (e.g., /dev/zero, named pipes, character devices) be refused BEFORE their
# bytes are read. Callers that genuinely need to process larger images
# should decode via rosetta-squint-decode directly after explicit validation.
MAX_FILE_SIZE = 256 * (1 << 20)  # 256 MiB
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _load_libheif_xplat() -> ctypes.CDLL:
|
|
34
|
+
"""Cross-platform libheif loader.
|
|
35
|
+
|
|
36
|
+
Linux: libheif.so.1
|
|
37
|
+
macOS: libheif.dylib (Homebrew unversioned) or libheif.1.dylib
|
|
38
|
+
Windows: libheif.dll / libheif-1.dll
|
|
39
|
+
Other: ctypes.util.find_library fallback
|
|
40
|
+
|
|
41
|
+
Raises OSError with a clear message if no candidate loads.
|
|
42
|
+
"""
|
|
43
|
+
if sys.platform == "darwin":
|
|
44
|
+
candidates = ["libheif.dylib", "libheif.1.dylib"]
|
|
45
|
+
elif sys.platform == "win32":
|
|
46
|
+
candidates = ["libheif.dll", "libheif-1.dll"]
|
|
47
|
+
else:
|
|
48
|
+
candidates = ["libheif.so.1", "libheif.so"]
|
|
49
|
+
# ctypes.util.find_library lets us pick up homebrew/macports/other paths
|
|
50
|
+
found = ctypes.util.find_library("heif")
|
|
51
|
+
if found:
|
|
52
|
+
candidates.append(found)
|
|
53
|
+
for name in candidates:
|
|
54
|
+
try:
|
|
55
|
+
return ctypes.CDLL(name)
|
|
56
|
+
except OSError:
|
|
57
|
+
continue
|
|
58
|
+
raise OSError(
|
|
59
|
+
f"libheif not found. Tried: {', '.join(candidates)}. "
|
|
60
|
+
f"Install via your package manager (apt install libheif-dev, "
|
|
61
|
+
f"brew install libheif, etc.)."
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
# Type alias covering a filesystem path (str / Path) or an in-memory byte
# buffer (bytes / bytearray / memoryview).
PathOrBytes = Union[str, Path, bytes, bytearray, memoryview]
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class HeifError(ctypes.Structure):
    """ctypes layout for C ``struct heif_error``.

    The C struct is ``{ int code; int subcode; const char* message; }`` and
    libheif's fallible functions return it by value. Its size is two ints
    plus one pointer, so it depends on the platform's pointer width.

    Using this structure as a function's ``restype`` keeps all three fields
    available. Declaring ``restype = ctypes.c_int64`` instead reads only 8
    bytes and reinterprets them as one integer, which loses the message
    pointer and hides the subcode.
    """

    _fields_ = [
        ("code", ctypes.c_int),  # 0 is treated as success by _check_heif
        ("subcode", ctypes.c_int),
        ("message", ctypes.c_char_p),  # read back as bytes, or None when NULL
    ]
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _check_heif(err: "HeifError", op: str) -> None:
|
|
83
|
+
"""Raise RuntimeError when a libheif error struct indicates failure."""
|
|
84
|
+
if err.code != 0:
|
|
85
|
+
msg = err.message.decode("utf-8", "replace") if err.message else ""
|
|
86
|
+
raise RuntimeError(
|
|
87
|
+
f"libheif {op} failed: code={err.code} subcode={err.subcode} msg={msg}"
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# ─── Decode helpers ──────────────────────────────────────────────────────────
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _is_heic(path_or_first_bytes) -> bool:
|
|
95
|
+
"""Detect HEIC by ftyp box brand at offset 4..12 with brand in the
|
|
96
|
+
HEIC-family set. Mirrors what the 5 native ports do."""
|
|
97
|
+
if isinstance(path_or_first_bytes, (bytes, bytearray, memoryview)):
|
|
98
|
+
b = bytes(path_or_first_bytes[:12])
|
|
99
|
+
else:
|
|
100
|
+
with open(path_or_first_bytes, "rb") as f:
|
|
101
|
+
b = f.read(12)
|
|
102
|
+
if len(b) < 12 or b[4:8] != b"ftyp":
|
|
103
|
+
return False
|
|
104
|
+
brand = b[8:12]
|
|
105
|
+
return brand in (b"heic", b"heix", b"mif1", b"msf1", b"hevc", b"hevx")
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _decode_heic_via_system_libheif(data: bytes) -> Image.Image:
    """Decode HEIC bytes using ctypes around system libheif so the result
    matches the 5 native ports (which all link to system libheif via
    FFI). pillow-heif would bundle libheif 1.21.2 and diverge from
    system libheif 1.17.6 by ±1 px on lossy fixtures.

    Args:
        data: The complete HEIC file contents. The buffer is handed to
            libheif through the ``..._without_copy`` entry point, so it has
            to stay alive until decoding finishes (it does: it is this
            function's argument).

    Returns:
        A PIL image in mode ``"RGBA"`` when the primary image reports an
        alpha channel, otherwise ``"RGB"``.

    Raises:
        OSError: libheif could not be loaded.
        RuntimeError: A libheif call reported an error or returned a null
            context/handle/image/plane, the reported dimensions are not
            positive, or the reported stride is smaller than one pixel row.
    """
    lib = _load_libheif_xplat()
    # Declare the C prototypes. Pointers are passed as c_void_p, and the
    # functions that return `struct heif_error` by value use HeifError as
    # restype so that code, subcode and message are all read.
    lib.heif_context_alloc.restype = ctypes.c_void_p
    lib.heif_context_free.argtypes = [ctypes.c_void_p]
    lib.heif_context_read_from_memory_without_copy.argtypes = [
        ctypes.c_void_p,
        ctypes.c_char_p,
        ctypes.c_size_t,
        ctypes.c_void_p,
    ]
    lib.heif_context_read_from_memory_without_copy.restype = HeifError
    lib.heif_context_get_primary_image_handle.argtypes = [
        ctypes.c_void_p,
        ctypes.POINTER(ctypes.c_void_p),
    ]
    lib.heif_context_get_primary_image_handle.restype = HeifError
    lib.heif_image_handle_release.argtypes = [ctypes.c_void_p]
    lib.heif_image_handle_get_width.argtypes = [ctypes.c_void_p]
    lib.heif_image_handle_get_width.restype = ctypes.c_int
    lib.heif_image_handle_get_height.argtypes = [ctypes.c_void_p]
    lib.heif_image_handle_get_height.restype = ctypes.c_int
    lib.heif_image_handle_has_alpha_channel.argtypes = [ctypes.c_void_p]
    lib.heif_image_handle_has_alpha_channel.restype = ctypes.c_int
    lib.heif_decode_image.argtypes = [
        ctypes.c_void_p,
        ctypes.POINTER(ctypes.c_void_p),
        ctypes.c_int,
        ctypes.c_int,
        ctypes.c_void_p,
    ]
    lib.heif_decode_image.restype = HeifError
    lib.heif_image_release.argtypes = [ctypes.c_void_p]
    lib.heif_image_get_plane_readonly.argtypes = [
        ctypes.c_void_p,
        ctypes.c_int,
        ctypes.POINTER(ctypes.c_int),
    ]
    lib.heif_image_get_plane_readonly.restype = ctypes.c_void_p

    # Integer values passed for libheif's colorspace / chroma / channel
    # parameters.
    # NOTE(review): presumably copied from the enums in heif.h — confirm
    # against the libheif version being targeted.
    HEIF_COLORSPACE_RGB = 1
    HEIF_CHROMA_INTERLEAVED_RGB = 10
    HEIF_CHROMA_INTERLEAVED_RGBA = 11
    HEIF_CHANNEL_INTERLEAVED = 10

    ctx = lib.heif_context_alloc()
    if not ctx:
        raise RuntimeError("heif_context_alloc failed")
    # Three nested try/finally blocks release the image, then the handle,
    # then the context, on every exit path.
    try:
        err = lib.heif_context_read_from_memory_without_copy(
            ctx, data, len(data), None
        )
        _check_heif(err, "heif_context_read_from_memory_without_copy")
        handle_ref = ctypes.c_void_p()
        err = lib.heif_context_get_primary_image_handle(
            ctx, ctypes.byref(handle_ref)
        )
        # Register the cleanup BEFORE checking the error so that any
        # partial handle libheif may have written into handle_ref is
        # released even if _check_heif raises.
        try:
            _check_heif(err, "heif_context_get_primary_image_handle")
            if not handle_ref.value:
                raise RuntimeError("heif_context_get_primary_image_handle failed")
            handle = handle_ref.value
            width = lib.heif_image_handle_get_width(handle)
            height = lib.heif_image_handle_get_height(handle)
            if width <= 0 or height <= 0:
                raise RuntimeError(
                    f"invalid HEIC dimensions {width}x{height}"
                )
            # Ask libheif for interleaved RGB, or RGBA when the image has
            # alpha; mode and channels describe that same layout to PIL.
            has_alpha = lib.heif_image_handle_has_alpha_channel(handle) != 0
            chroma = (
                HEIF_CHROMA_INTERLEAVED_RGBA
                if has_alpha
                else HEIF_CHROMA_INTERLEAVED_RGB
            )
            mode = "RGBA" if has_alpha else "RGB"
            channels = 4 if has_alpha else 3

            img_ref = ctypes.c_void_p()
            err = lib.heif_decode_image(
                handle, ctypes.byref(img_ref), HEIF_COLORSPACE_RGB, chroma, None
            )
            # Same pattern: register cleanup BEFORE the error check so the
            # image is released even if _check_heif raises.
            try:
                _check_heif(err, "heif_decode_image")
                if not img_ref.value:
                    raise RuntimeError("heif_decode_image failed")
                img = img_ref.value
                stride = ctypes.c_int(0)
                plane = lib.heif_image_get_plane_readonly(
                    img, HEIF_CHANNEL_INTERLEAVED, ctypes.byref(stride)
                )
                if not plane:
                    raise RuntimeError("null plane")
                # A stride shorter than one packed row would make the row
                # copies below read past the end of each row.
                row_bytes = width * channels
                if stride.value < row_bytes:
                    raise RuntimeError(
                        f"libheif returned stride {stride.value} smaller than "
                        f"required {row_bytes} (width={width} channels={channels})"
                    )
                # Copy row by row, taking row_bytes from each stride-sized
                # row, so PIL receives tightly packed pixel data.
                # NOTE(review): width/height come from the handle, not from
                # the decoded image — confirm they always agree, otherwise
                # these reads could run past the plane.
                pixels = bytearray()
                for y in range(height):
                    row = ctypes.string_at(plane + y * stride.value, row_bytes)
                    pixels.extend(row)
                # bytes(pixels) is a Python-owned copy, made before the
                # finally blocks release the libheif image.
                return Image.frombytes(mode, (width, height), bytes(pixels))
            finally:
                if img_ref.value:
                    lib.heif_image_release(img_ref.value)
        finally:
            if handle_ref.value:
                lib.heif_image_handle_release(handle_ref.value)
    finally:
        lib.heif_context_free(ctx)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _read_path_safely(path: Union[str, Path]) -> bytes:
|
|
230
|
+
"""Open `path`, validate stat against the same fd that we read from
|
|
231
|
+
(closing a TOCTOU window), enforce symlink + regular-file + size
|
|
232
|
+
guards, and return the bytes. Mirrors the fd-based stat-and-read
|
|
233
|
+
pattern used in the other 5 squint ports.
|
|
234
|
+
|
|
235
|
+
A naive `os.stat(path)` then `open(path).read()` has a classic TOCTOU
|
|
236
|
+
window: an attacker with write access along the path can swap the
|
|
237
|
+
target between the two syscalls (file → symlink to /dev/zero, file →
|
|
238
|
+
larger file, regular file → FIFO). Holding the same fd across stat
|
|
239
|
+
and read defeats this — fstat reports the inode the read will actually
|
|
240
|
+
consume.
|
|
241
|
+
|
|
242
|
+
The open itself uses ``O_NOFOLLOW`` so that a symlink at ``path``
|
|
243
|
+
causes the open to fail rather than silently resolving to whatever
|
|
244
|
+
the symlink currently points at — closing a separate TOCTOU window
|
|
245
|
+
on the symlink target itself. Callers who genuinely want symlink
|
|
246
|
+
resolution must do it explicitly (e.g. ``Path(path).resolve()``)
|
|
247
|
+
before calling this function. Windows has no ``O_NOFOLLOW`` flag;
|
|
248
|
+
fall back to an ``lstat`` check.
|
|
249
|
+
"""
|
|
250
|
+
if sys.platform == "win32":
|
|
251
|
+
# Windows: pre-check with os.lstat. There's a narrow race
|
|
252
|
+
# between the lstat and the open below, but Windows lacks
|
|
253
|
+
# O_NOFOLLOW and the alternative (reparse-point flags) would
|
|
254
|
+
# require ctypes wrapping of CreateFileW.
|
|
255
|
+
try:
|
|
256
|
+
st_link = os.lstat(path)
|
|
257
|
+
except OSError as e:
|
|
258
|
+
raise OSError(e.errno, f"lstat failed for {path}: {e.strerror}") from e
|
|
259
|
+
if stat.S_ISLNK(st_link.st_mode):
|
|
260
|
+
raise ValueError(f"symlink not allowed: {path}")
|
|
261
|
+
fd = os.open(path, os.O_RDONLY | os.O_BINARY) # type: ignore[attr-defined]
|
|
262
|
+
else:
|
|
263
|
+
try:
|
|
264
|
+
fd = os.open(path, os.O_RDONLY | os.O_NOFOLLOW)
|
|
265
|
+
except OSError as e:
|
|
266
|
+
# ELOOP (40 on Linux, 62 on macOS) is what O_NOFOLLOW raises
|
|
267
|
+
# when the final path component is a symlink. Translate to a
|
|
268
|
+
# clearer error so callers can distinguish symlink rejection
|
|
269
|
+
# from a generic "not a regular file" or I/O error.
|
|
270
|
+
import errno as _errno
|
|
271
|
+
if e.errno == _errno.ELOOP:
|
|
272
|
+
raise ValueError(f"symlink not allowed: {path}") from e
|
|
273
|
+
raise
|
|
274
|
+
# Wrap the bare fd in a Python file object so close-on-GC is automatic.
|
|
275
|
+
# If `os.fdopen` itself raises (e.g., MemoryError on a stressed system),
|
|
276
|
+
# the fd would leak — close it explicitly to be safe.
|
|
277
|
+
try:
|
|
278
|
+
f = os.fdopen(fd, "rb")
|
|
279
|
+
except BaseException:
|
|
280
|
+
os.close(fd)
|
|
281
|
+
raise
|
|
282
|
+
try:
|
|
283
|
+
st = os.fstat(f.fileno())
|
|
284
|
+
if not stat.S_ISREG(st.st_mode):
|
|
285
|
+
raise RuntimeError(f"not a regular file: {path}")
|
|
286
|
+
if st.st_size > MAX_FILE_SIZE:
|
|
287
|
+
raise RuntimeError(
|
|
288
|
+
f"input file too large: {st.st_size} bytes "
|
|
289
|
+
f"(max {MAX_FILE_SIZE} bytes / 256 MiB). For images above this "
|
|
290
|
+
f"threshold, decode via rosetta-squint-decode directly after "
|
|
291
|
+
f"explicit validation."
|
|
292
|
+
)
|
|
293
|
+
# Read up to MAX_FILE_SIZE+1 so we detect "file grew between fstat
|
|
294
|
+
# and read" (e.g. concurrent writer appending). The +1 absence is
|
|
295
|
+
# the contract: if we got more than MAX_FILE_SIZE bytes, reject.
|
|
296
|
+
data = f.read(MAX_FILE_SIZE + 1)
|
|
297
|
+
if len(data) > MAX_FILE_SIZE:
|
|
298
|
+
raise RuntimeError(
|
|
299
|
+
f"input file too large: {len(data)} bytes "
|
|
300
|
+
f"(max {MAX_FILE_SIZE} bytes / 256 MiB). For images above this "
|
|
301
|
+
f"threshold, decode via rosetta-squint-decode directly after "
|
|
302
|
+
f"explicit validation."
|
|
303
|
+
)
|
|
304
|
+
return data
|
|
305
|
+
finally:
|
|
306
|
+
f.close()
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def decode_file(path: Union[str, Path]) -> Image.Image:
    """Decode the image file at ``path`` into a PIL.Image ready for hashing.

    The bytes are obtained through ``_read_path_safely``, which refuses
    symlinks (``O_NOFOLLOW`` on POSIX / ``lstat`` on Windows), non-regular
    files (FIFOs, /dev/zero, character devices, etc.) and files larger than
    MAX_FILE_SIZE. The regular-file and size checks run against the same fd
    as the read, closing the obvious TOCTOU window. Callers who genuinely
    want symlink resolution must do it explicitly (e.g.
    ``Path(path).resolve()``) before calling this function.

    HEIC content goes through the system-libheif ctypes wrapper; everything
    else goes through ``PIL.Image.open``.
    """
    raw = _read_path_safely(path)
    if not _is_heic(raw):
        return Image.open(io.BytesIO(raw))
    return _decode_heic_via_system_libheif(raw)
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def decode_bytes(data: bytes) -> Image.Image:
    """Decode raw image bytes into a PIL.Image.

    HEIC bytes use the ctypes wrapper around system libheif; everything
    else goes through ``PIL.Image.open``.

    Args:
        data: The encoded image as ``bytes``, ``bytearray`` or
            ``memoryview``. The latter two are copied into ``bytes`` first.

    Returns:
        The decoded image.

    Raises:
        TypeError: ``data`` is not bytes-like. This is checked up front
            because ``_is_heic`` treats any non-bytes argument as something
            to ``open()``: a ``str`` would open that file and an ``int``
            would read from that file descriptor, both bypassing the guards
            of the path-based API.
    """
    if not isinstance(data, (bytes, bytearray, memoryview)):
        raise TypeError(
            "decode_bytes expects a bytes-like object (bytes, bytearray or "
            f"memoryview), got {type(data).__name__}"
        )
    if not isinstance(data, bytes):
        data = bytes(data)
    if _is_heic(data):
        return _decode_heic_via_system_libheif(data)
    return Image.open(io.BytesIO(data))
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
# ─── Convenience hash functions ──────────────────────────────────────────────
|
|
340
|
+
# Each algorithm gets a (path, size) and a (_bytes, size) variant.
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def average_hash(path: Union[str, Path], hash_size: int = 8) -> imagehash.ImageHash:
    """Decode the image file at ``path`` and return its average hash.

    ``hash_size`` is forwarded unchanged to ``rih.average_hash``.
    """
    image = decode_file(path)
    return rih.average_hash(image, hash_size=hash_size)
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def average_hash_bytes(data: bytes, hash_size: int = 8) -> imagehash.ImageHash:
    """Decode in-memory image bytes and return their average hash.

    ``hash_size`` is forwarded unchanged to ``rih.average_hash``.
    """
    image = decode_bytes(data)
    return rih.average_hash(image, hash_size=hash_size)
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def phash(
    path: Union[str, Path],
    hash_size: int = 8,
    highfreq_factor: int = 4,
) -> imagehash.ImageHash:
    """Decode the image file at ``path`` and return its perceptual hash.

    ``hash_size`` and ``highfreq_factor`` are forwarded unchanged to
    ``rih.phash``.
    """
    image = decode_file(path)
    options = {"hash_size": hash_size, "highfreq_factor": highfreq_factor}
    return rih.phash(image, **options)
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def phash_bytes(
    data: bytes, hash_size: int = 8, highfreq_factor: int = 4
) -> imagehash.ImageHash:
    """Decode in-memory image bytes and return their perceptual hash.

    ``hash_size`` and ``highfreq_factor`` are forwarded unchanged to
    ``rih.phash``.
    """
    image = decode_bytes(data)
    options = {"hash_size": hash_size, "highfreq_factor": highfreq_factor}
    return rih.phash(image, **options)
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def phash_simple(
    path: Union[str, Path],
    hash_size: int = 8,
    highfreq_factor: int = 4,
) -> imagehash.ImageHash:
    """Decode the image file at ``path`` and hash it with ``rih.phash_simple``.

    ``hash_size`` and ``highfreq_factor`` are forwarded unchanged.
    """
    image = decode_file(path)
    options = {"hash_size": hash_size, "highfreq_factor": highfreq_factor}
    return rih.phash_simple(image, **options)
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def phash_simple_bytes(
    data: bytes, hash_size: int = 8, highfreq_factor: int = 4
) -> imagehash.ImageHash:
    """Decode in-memory image bytes and hash them with ``rih.phash_simple``.

    ``hash_size`` and ``highfreq_factor`` are forwarded unchanged.
    """
    image = decode_bytes(data)
    options = {"hash_size": hash_size, "highfreq_factor": highfreq_factor}
    return rih.phash_simple(image, **options)
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def dhash(path: Union[str, Path], hash_size: int = 8) -> imagehash.ImageHash:
    """Decode the image file at ``path`` and return its difference hash.

    ``hash_size`` is forwarded unchanged to ``rih.dhash``.
    """
    image = decode_file(path)
    return rih.dhash(image, hash_size=hash_size)
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def dhash_bytes(data: bytes, hash_size: int = 8) -> imagehash.ImageHash:
    """Decode in-memory image bytes and return their difference hash.

    ``hash_size`` is forwarded unchanged to ``rih.dhash``.
    """
    image = decode_bytes(data)
    return rih.dhash(image, hash_size=hash_size)
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def dhash_vertical(
    path: Union[str, Path], hash_size: int = 8
) -> imagehash.ImageHash:
    """Decode the image file at ``path`` and hash it with ``rih.dhash_vertical``.

    ``hash_size`` is forwarded unchanged.
    """
    image = decode_file(path)
    return rih.dhash_vertical(image, hash_size=hash_size)
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def dhash_vertical_bytes(data: bytes, hash_size: int = 8) -> imagehash.ImageHash:
    """Decode in-memory image bytes and hash them with ``rih.dhash_vertical``.

    ``hash_size`` is forwarded unchanged.
    """
    image = decode_bytes(data)
    return rih.dhash_vertical(image, hash_size=hash_size)
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def whash_haar(
    path: Union[str, Path], hash_size: int = 8
) -> imagehash.ImageHash:
    """Decode the image file at ``path`` and return its Haar wavelet hash.

    Calls ``rih.whash`` with ``mode="haar"`` and ``remove_max_haar_ll=True``;
    ``hash_size`` is forwarded unchanged.
    """
    image = decode_file(path)
    haar_options = {"mode": "haar", "remove_max_haar_ll": True}
    return rih.whash(image, hash_size=hash_size, **haar_options)
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def whash_haar_bytes(data: bytes, hash_size: int = 8) -> imagehash.ImageHash:
    """Decode in-memory image bytes and return their Haar wavelet hash.

    Calls ``rih.whash`` with ``mode="haar"`` and ``remove_max_haar_ll=True``;
    ``hash_size`` is forwarded unchanged.
    """
    image = decode_bytes(data)
    haar_options = {"mode": "haar", "remove_max_haar_ll": True}
    return rih.whash(image, hash_size=hash_size, **haar_options)
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
def whash_db4(path: Union[str, Path], hash_size: int = 8) -> imagehash.ImageHash:
    """Decode the image file at ``path`` and hash it with ``rih.whash_db4``.

    ``hash_size`` is forwarded unchanged.
    """
    image = decode_file(path)
    # Deliberately rih.whash_db4, the port-local snap-applying override, and
    # NOT rih.whash(mode='db4'): the latter forwards to upstream imagehash
    # without the snap-to-threshold tie-break. See spec/SPEC.md §"Threshold
    # tie-break".
    return rih.whash_db4(image, hash_size=hash_size)
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def whash_db4_bytes(data: bytes, hash_size: int = 8) -> imagehash.ImageHash:
    """Decode in-memory image bytes and hash them with ``rih.whash_db4``.

    ``hash_size`` is forwarded unchanged.
    """
    image = decode_bytes(data)
    return rih.whash_db4(image, hash_size=hash_size)
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
def whash_db4_robust(
    path: Union[str, Path], hash_size: int = 8
) -> imagehash.ImageHash:
    """Decode the image file at ``path`` and hash it with ``rih.whash_db4_robust``.

    ``hash_size`` is forwarded unchanged.
    """
    image = decode_file(path)
    return rih.whash_db4_robust(image, hash_size=hash_size)
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def whash_db4_robust_bytes(
    data: bytes, hash_size: int = 8
) -> imagehash.ImageHash:
    """Decode in-memory image bytes and hash them with ``rih.whash_db4_robust``.

    ``hash_size`` is forwarded unchanged.
    """
    image = decode_bytes(data)
    return rih.whash_db4_robust(image, hash_size=hash_size)
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
def colorhash(path: Union[str, Path], binbits: int = 3) -> imagehash.ImageHash:
    """Decode the image file at ``path`` and return its color hash.

    ``binbits`` is forwarded unchanged to ``rih.colorhash``.
    """
    image = decode_file(path)
    return rih.colorhash(image, binbits=binbits)
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
def colorhash_bytes(data: bytes, binbits: int = 3) -> imagehash.ImageHash:
    """Decode in-memory image bytes and return their color hash.

    ``binbits`` is forwarded unchanged to ``rih.colorhash``.
    """
    image = decode_bytes(data)
    return rih.colorhash(image, binbits=binbits)
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
def crop_resistant_hash(path: Union[str, Path]) -> imagehash.ImageMultiHash:
    """Decode the image file at ``path`` and return its crop-resistant hash.

    ``rih.crop_resistant_hash`` is called with the decoded image only, so
    its own defaults apply to every other parameter.
    """
    image = decode_file(path)
    return rih.crop_resistant_hash(image)
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
def crop_resistant_hash_bytes(data: bytes) -> imagehash.ImageMultiHash:
    """Decode in-memory image bytes and return their crop-resistant hash.

    ``rih.crop_resistant_hash`` is called with the decoded image only, so
    its own defaults apply to every other parameter.
    """
    image = decode_bytes(data)
    return rih.crop_resistant_hash(image)
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rosetta-squint
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Cross-language byte-exact perceptual image hashing — decode + hash in one call
|
|
5
|
+
Author-email: Will Metcalf <william.metcalf@gmail.com>
|
|
6
|
+
License: BSD-2-Clause
|
|
7
|
+
Project-URL: Homepage, https://github.com/wmetcalf/rosetta-squint
|
|
8
|
+
Project-URL: Repository, https://github.com/wmetcalf/rosetta-squint
|
|
9
|
+
Project-URL: Issues, https://github.com/wmetcalf/rosetta-squint/issues
|
|
10
|
+
Project-URL: Changelog, https://github.com/wmetcalf/rosetta-squint/blob/main/CHANGELOG.md
|
|
11
|
+
Requires-Python: >=3.9
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: rosetta-squint-hash<2.0,>=1.0.0
|
|
14
|
+
Requires-Dist: Pillow==12.2.*
|
|
15
|
+
Provides-Extra: test
|
|
16
|
+
Requires-Dist: pytest>=7; extra == "test"
|
|
17
|
+
|
|
18
|
+
# rosetta_squint — Python convenience API
|
|
19
|
+
|
|
20
|
+
Point at an image file or pass in raw image bytes; get back the same perceptual hash hex string that every other `rosetta-squint` port produces for the same input.
|
|
21
|
+
|
|
22
|
+
## Install
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install -e ../../hash/python # rosetta-squint-hash (wrapper around imagehash)
|
|
26
|
+
pip install -e . # this package
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
(Not on PyPI yet — both are local.)
|
|
30
|
+
|
|
31
|
+
## Usage
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
import rosetta_squint as rs
|
|
35
|
+
|
|
36
|
+
# Path on disk
|
|
37
|
+
h = rs.phash("photo.jpg", 8)
|
|
38
|
+
print(h) # "c3f8a1b27d0e4f96"
|
|
39
|
+
|
|
40
|
+
# Raw image bytes (from an HTTP response, a database BLOB, a multipart upload)
|
|
41
|
+
with open("photo.jpg", "rb") as f:
|
|
42
|
+
h = rs.phash_bytes(f.read(), 8)
|
|
43
|
+
|
|
44
|
+
# Every algorithm available has both flavors:
|
|
45
|
+
rs.average_hash(path, 8) # rs.average_hash_bytes(bytes, 8)
|
|
46
|
+
rs.phash(path, 8) # rs.phash_bytes(bytes, 8)
|
|
47
|
+
rs.phash_simple(path, 8) # rs.phash_simple_bytes(bytes, 8)
|
|
48
|
+
rs.dhash(path, 8) # rs.dhash_bytes(bytes, 8)
|
|
49
|
+
rs.dhash_vertical(path, 8) # rs.dhash_vertical_bytes(bytes, 8)
|
|
50
|
+
rs.whash_haar(path, 8) # rs.whash_haar_bytes(bytes, 8)
|
|
51
|
+
rs.whash_db4(path, 8) # rs.whash_db4_bytes(bytes, 8)
|
|
52
|
+
rs.whash_db4_robust(path, 8) # rs.whash_db4_robust_bytes(bytes, 8) — cross-port-stable
|
|
53
|
+
rs.colorhash(path, 3) # rs.colorhash_bytes(bytes, 3) — takes binbits
|
|
54
|
+
rs.crop_resistant_hash(path) # rs.crop_resistant_hash_bytes(bytes) — no size, returns ImageMultiHash
|
|
55
|
+
|
|
56
|
+
# Hex round-trips:
|
|
57
|
+
restored = rs.hex_to_hash("c3f8a1b27d0e4f96")
|
|
58
|
+
restored = rs.hex_to_flathash("...", hashsize=3)
|
|
59
|
+
restored = rs.hex_to_multihash("hex1,hex2,hex3")
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Cross-port equivalence
|
|
63
|
+
|
|
64
|
+
The output of `rs.phash("photo.jpg", 8)` is the same hex string as you'd get from the Rust, Go, Java, JS, or Swift `rosetta-squint` ports given the same byte input. **Verified live for `imagehash.png` at size 8: `ba8c84536bd3c366` across Python, Go, Java, JS, Swift.**
|
|
65
|
+
|
|
66
|
+
## Decode strategy
|
|
67
|
+
|
|
68
|
+
| Format | Decoder | Why |
|
|
69
|
+
|---|---|---|
|
|
70
|
+
| BMP, PNG, GIF, JPEG, WebP, TIFF | PIL/Pillow (system) | The canonical Python decoders; the goldens used to validate the 5 native ports were generated by PIL itself, so output matches by construction. |
|
|
71
|
+
| HEIC | ctypes wrapper around system `libheif.so.1` | pillow-heif bundles libheif 1.21.2 in its wheel; the 5 native ports link to system libheif 1.17.6. The wrapper avoids the ±1 px divergence. |
|
|
72
|
+
|
|
73
|
+
If you already have a `PIL.Image.Image` (from `PIL.Image.open(...)`, a thumbnailer, etc.), use the `rosetta_squint_hash` lower-level API directly — the squint layer's only job is the decode step:
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
import rosetta_squint_hash as rih
|
|
77
|
+
from PIL import Image
|
|
78
|
+
img = Image.open("photo.jpg")
|
|
79
|
+
h = rih.phash(img, hash_size=8)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Dependencies
|
|
83
|
+
|
|
84
|
+
- `rosetta_squint_hash` (which re-exports `imagehash==4.3.2` + adds `whash_db4_robust`)
|
|
85
|
+
- `Pillow==12.2.*`
|
|
86
|
+
|
|
87
|
+
Tight pins are intentional. See [`../../hash/python/README.md`](../../hash/python/README.md) under "Version policy" for the upgrade workflow.
|
|
88
|
+
|
|
89
|
+
## Testing
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
pytest
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Tests verify (1) path/bytes parity for every algorithm, (2) chain consistency between `rs.phash(path)` and `imagehash.phash(rs.decode_file(path))`, (3) cross-port byte-exact equality with Go/Java/JS for `imagehash.png`.
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
rosetta_squint/__init__.py,sha256=WS7jtOcUan-SQlotpcN2dGDyZgG58e6-u8wiRiXVg5A,2391
|
|
2
|
+
rosetta_squint/_impl.py,sha256=OI97L1meX-oDcx5uYIz2C7GIPvnOQxKCoeErH5k5vRk,17858
|
|
3
|
+
rosetta_squint-1.0.0.dist-info/METADATA,sha256=zrNzNFx7Z_MCdxrZltPjegom3TjOKJ7rSXzzzxW_EmY,4026
|
|
4
|
+
rosetta_squint-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
5
|
+
rosetta_squint-1.0.0.dist-info/top_level.txt,sha256=esQHc93VdXUXumPzwgdtqIqxhKO80WG_bzYg94LPNt4,15
|
|
6
|
+
rosetta_squint-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
rosetta_squint
|