PyPI - Perception - Versions diffs - 0.7.5__tar.gz → 0.7.7__tar.gz - Mend

Perception 0.7.5__tar.gz → 0.7.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

{perception-0.7.5 → perception-0.7.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: Perception
-Version: 0.7.5
+Version: 0.7.7
 Summary: Perception provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use.
 License: Apache-2.0
 Author: Thorn

perception-0.7.7/perception/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from importlib import metadata
+__version__ = metadata.version("perception")

{perception-0.7.5 → perception-0.7.7}/perception/benchmarking/common.py RENAMED Viewed

@@ -3,12 +3,10 @@ import logging
 import os
 import shutil
 import tempfile
-import typing
 import uuid
 import warnings
 import zipfile
 from abc import ABC
-from typing import Optional
 import matplotlib.pyplot as plt
 import numpy as np
@@ -101,7 +99,7 @@ def compute_threshold_precision_recall(pos, neg, precision_threshold=99.9):
 class Filterable(ABC):
     _df: pd.DataFrame
-    expected_columns: typing.List
+    expected_columns: list
     def __init__(self, df):
         assert sorted(df.columns) == sorted(
@@ -135,7 +133,7 @@ class Saveable(Filterable):
     def load(
         cls,
         path_to_zip_or_directory: str,
-        storage_dir: Optional[str] = None,
+        storage_dir: str | None = None,
         verify_md5=True,
     ):
         """Load a dataset from a ZIP file or directory.
@@ -311,7 +309,7 @@ class BenchmarkHashes(Filterable):
     def __init__(self, df: pd.DataFrame):
         super().__init__(df)
-        self._metrics: Optional[pd.DataFrame] = None
+        self._metrics: pd.DataFrame | None = None
     def __add__(self, other):
         return BenchmarkHashes(df=pd.concat([self._df, other._df]).drop_duplicates())
@@ -327,7 +325,7 @@ class BenchmarkHashes(Filterable):
         self._df.to_csv(filepath, index=False)
     def compute_metrics(
-        self, custom_distance_metrics: Optional[dict] = None
+        self, custom_distance_metrics: dict | None = None
     ) -> pd.DataFrame:
         if self._metrics is not None:
             return self._metrics
@@ -610,7 +608,7 @@ class BenchmarkDataset(Saveable):
     expected_columns = ["filepath", "category"]
     @classmethod
-    def from_tuples(cls, files: typing.List[typing.Tuple[str, str]]):
+    def from_tuples(cls, files: list[tuple[str, str]]):
         """Build dataset from a set of files.
         Args:

{perception-0.7.5 → perception-0.7.7}/perception/benchmarking/image.py RENAMED Viewed

@@ -1,6 +1,5 @@
 import logging
 import os
-import typing
 import uuid
 import warnings
@@ -19,7 +18,7 @@ log = logging.getLogger(__name__)
 class BenchmarkImageTransforms(BenchmarkTransforms):
     def compute_hashes(
-        self, hashers: typing.Dict[str, ImageHasher], max_workers: int = 5
+        self, hashers: dict[str, ImageHasher], max_workers: int = 5
     ) -> BenchmarkHashes:
         """Compute hashes for a series of files given some set of hashers.
@@ -86,7 +85,7 @@ class BenchmarkImageTransforms(BenchmarkTransforms):
 class BenchmarkImageDataset(BenchmarkDataset):
     def deduplicate(
         self, hasher: ImageHasher, threshold=0.001, isometric=False
-    ) -> typing.Tuple["BenchmarkImageDataset", typing.Set[typing.Tuple[str, str]]]:
+    ) -> tuple["BenchmarkImageDataset", set[tuple[str, str]]]:
         """Remove duplicate files from dataset.
         Args:
@@ -99,7 +98,7 @@ class BenchmarkImageDataset(BenchmarkDataset):
             A list where each entry is a list of files that are
             duplicates of each other. We keep only the last entry.
         """
-        pairs: typing.Set[typing.Tuple[str, str]] = set()
+        pairs: set[tuple[str, str]] = set()
         for _, group in tqdm(
             self._df.groupby(["category"]), desc="Deduplicating categories."
         ):
@@ -120,7 +119,7 @@ class BenchmarkImageDataset(BenchmarkDataset):
     def transform(
         self,
-        transforms: typing.Dict[str, imgaug.augmenters.meta.Augmenter],
+        transforms: dict[str, imgaug.augmenters.meta.Augmenter],
         storage_dir: str,
         errors: str = "raise",
     ) -> BenchmarkImageTransforms:

{perception-0.7.5 → perception-0.7.7}/perception/benchmarking/video.py RENAMED Viewed

@@ -68,7 +68,7 @@ def _process_row(row, hashers, framerates):
 class BenchmarkVideoDataset(BenchmarkDataset):
     def transform(
         self,
-        transforms: typing.Dict[str, typing.Callable],
+        transforms: dict[str, typing.Callable],
         storage_dir: str,
         errors: str = "raise",
     ):
@@ -171,7 +171,7 @@ class BenchmarkVideoTransforms(BenchmarkTransforms):
     ]
     def compute_hashes(
-        self, hashers: typing.Dict[str, VideoHasher], max_workers: int = 5
+        self, hashers: dict[str, VideoHasher], max_workers: int = 5
     ) -> BenchmarkHashes:
         """Compute hashes for a series of files given some set of hashers.

{perception-0.7.5 → perception-0.7.7}/perception/benchmarking/video_transforms.py RENAMED Viewed

@@ -1,6 +1,4 @@
 import os
-import typing
-from typing import Optional
 import cv2
 import ffmpeg
@@ -29,12 +27,12 @@ def sanitize_output_filepath(input_filepath, output_filepath, output_ext=None):
 def get_simple_transform(
-    width: typing.Union[str, int] = -1,
-    height: typing.Union[str, int] = -1,
-    pad: Optional[str] = None,
-    codec: Optional[str] = None,
-    clip_pct: Optional[typing.Tuple[float, float]] = None,
-    clip_s: Optional[typing.Tuple[float, float]] = None,
+    width: str | int = -1,
+    height: str | int = -1,
+    pad: str | None = None,
+    codec: str | None = None,
+    clip_pct: tuple[float, float] | None = None,
+    clip_s: tuple[float, float] | None = None,
     sar=None,
     fps=None,
     output_ext=None,

{perception-0.7.5 → perception-0.7.7}/perception/experimental/ann/index.py RENAMED Viewed

@@ -1,7 +1,6 @@
 import time
 import typing
 import warnings
-from typing import Optional
 import faiss
 import numpy as np
@@ -10,11 +9,15 @@ import typing_extensions
 import perception.hashers.tools as pht
-QueryInput = typing_extensions.TypedDict("QueryInput", {"id": str, "hash": str})
-QueryMatch = typing_extensions.TypedDict(
-    "QueryMatch", {"id": typing.Any, "matches": typing.List[dict]}
-)
+class QueryInput(typing_extensions.TypedDict):
+    id: str
+    hash: str
+class QueryMatch(typing_extensions.TypedDict):
+    id: typing.Any
+    matches: list[dict]
 class TuningFailure(Exception):
@@ -260,7 +263,7 @@ class ApproximateNearestNeighbors:
             s, hash_format=hash_format, dtype=self.dtype, hash_length=self.hash_length
         )
-    def vector_to_string(self, vector, hash_format="base64") -> typing.Optional[str]:
+    def vector_to_string(self, vector, hash_format="base64") -> str | None:
         """Convert a vector back to string
         Args:
@@ -272,9 +275,9 @@ class ApproximateNearestNeighbors:
     def search(
         self,
-        queries: typing.List[QueryInput],
-        threshold: Optional[int] = None,
-        threshold_func: Optional[typing.Callable[[np.ndarray], np.ndarray]] = None,
+        queries: list[QueryInput],
+        threshold: int | None = None,
+        threshold_func: typing.Callable[[np.ndarray], np.ndarray] | None = None,
         hash_format="base64",
         k=1,
     ):
@@ -318,7 +321,7 @@ class ApproximateNearestNeighbors:
             if not self.metadata_columns
             else self.query_by_id(ids=np.unique(indices[distances < thresholds]))
         )
-        matches: typing.List[QueryMatch] = []
+        matches: list[QueryMatch] = []
         for match_distances, match_ids, q, q_threshold in zip(
             distances, indices, queries, thresholds
         ):

{perception-0.7.5 → perception-0.7.7}/perception/experimental/ann/serve.py RENAMED Viewed

@@ -3,7 +3,6 @@ import functools
 import json
 import logging
 import typing
-from typing import Optional
 import aiohttp.web
 import numpy as np
@@ -96,8 +95,8 @@ def get_logger(name, log_level):
 async def serve(
     index: ApproximateNearestNeighbors,
-    default_threshold: Optional[int] = None,
-    default_threshold_func: Optional[typing.Callable[[np.ndarray], np.ndarray]] = None,
+    default_threshold: int | None = None,
+    default_threshold_func: typing.Callable[[np.ndarray], np.ndarray] | None = None,
     default_k: int = 1,
     concurrency: int = 2,
     log_level=logging.INFO,

{perception-0.7.5 → perception-0.7.7}/perception/experimental/approximate_deduplication.py RENAMED Viewed

@@ -2,7 +2,6 @@ import logging
 import math
 import os.path as op
 import typing
-from typing import Optional
 import faiss
 import networkit as nk
@@ -17,9 +16,10 @@ DEFAULT_PCT_PROBE = 0
 # For faiss training on datasets larger than 50,000 vectors, we take a random sub-sample.
 TRAIN_LARGE_SIZE: int = 50_000
-ClusterAssignment = typing_extensions.TypedDict(
-    "ClusterAssignment", {"cluster": int, "id": typing.Any}
-)
+class ClusterAssignment(typing_extensions.TypedDict):
+    cluster: int
+    id: typing.Any
 def build_index(
@@ -90,7 +90,7 @@ def compute_euclidean_pairwise_duplicates_approx(
     y_counts=None,
     pct_probe=0.1,
     use_gpu: bool = True,
-    faiss_cache_path: Optional[str] = None,
+    faiss_cache_path: str | None = None,
     show_progress: bool = False,
 ):
     """Provides the same result as perception.extensions.compute_pairwise_duplicates_simple
@@ -199,12 +199,12 @@ def compute_euclidean_pairwise_duplicates_approx(
 def pairs_to_clusters(
     ids: typing.Iterable[str],
-    pairs: typing.Iterable[typing.Tuple[str, str]],
+    pairs: typing.Iterable[tuple[str, str]],
     strictness: typing_extensions.Literal[
         "clique", "community", "component"
     ] = "clique",
     max_clique_batch_size: int = 1000,
-) -> typing.List[ClusterAssignment]:
+) -> list[ClusterAssignment]:
     """Given a list of pairs of matching files, compute sets
     of cliques where all files in a clique are connected.
     Args:
@@ -232,7 +232,7 @@ def pairs_to_clusters(
     for node_pair in node_pairs:
         graph.addEdge(node_pair[0], node_pair[1])
-    assignments: typing.List[ClusterAssignment] = []
+    assignments: list[ClusterAssignment] = []
     cluster_index = 0
     cc_query = nk.components.ConnectedComponents(graph)
     cc_query.run()

{perception-0.7.5 → perception-0.7.7}/perception/experimental/debug.py RENAMED Viewed

@@ -1,6 +1,5 @@
 import logging
 import random
-from typing import Optional
 import cv2
 import numpy as np
@@ -18,7 +17,7 @@ def vizualize_pair(
     features_2,
     ratio: float,
     match_metadata=None,
-    local_path_col: Optional[str] = None,
+    local_path_col: str | None = None,
     sanitized: bool = False,
     include_all_points=False,
     circle_size=KEYPOINT_SIZE,

{perception-0.7.5 → perception-0.7.7}/perception/experimental/local_descriptor_deduplication.py RENAMED Viewed

@@ -35,20 +35,20 @@ class Descriptors(typing_extensions.TypedDict):
     keypoints: np.ndarray
     descriptors: np.ndarray
     descriptor_count: int
-    dimensions: typing.Tuple[int, int]
+    dimensions: tuple[int, int]
     filepath: str
     hasher: str
 class MatchStats(typing_extensions.TypedDict):
-    match: typing.Optional[float]
-    min_kpBM: typing.Optional[int]
-    MAB: typing.Optional[str]
-    intersection: typing.Optional[float]
-    inliers: typing.Optional[float]
-    bounds_intersection: typing.Optional[float]
-    final_matched_a_pts: typing.Optional[typing.List[np.ndarray]]
-    final_matched_b_pts: typing.Optional[typing.List[np.ndarray]]
+    match: float | None
+    min_kpBM: int | None
+    MAB: str | None
+    intersection: float | None
+    inliers: float | None
+    bounds_intersection: float | None
+    final_matched_a_pts: list[np.ndarray] | None
+    final_matched_b_pts: list[np.ndarray] | None
 class LocalHasher(ABC):
@@ -76,7 +76,7 @@ class LocalHasher(ABC):
         self.validation_inliers = validation_inliers
         self.validation_intersection = validation_intersection
-    def compute(self, image) -> typing.Tuple[np.ndarray, np.ndarray]:
+    def compute(self, image) -> tuple[np.ndarray, np.ndarray]:
         return self.hasher.detectAndCompute(image, None)
     def validate_match(
@@ -86,7 +86,7 @@ class LocalHasher(ABC):
         minimum_match: float = DEFAULT_MATCH_PCT,
         minimum_intersection: float = DEFAULT_INTERSECTION,
         minimum_inliers: int = DEFAULT_INLIERS,
-    ) -> typing.Tuple[bool, MatchStats]:
+    ) -> tuple[bool, MatchStats]:
         """Validate the match between two sets of keypoints and descriptors. The
         validation algorithm is as follows:
@@ -307,10 +307,10 @@ def load_and_preprocess(filepath, max_size=DEFAULT_MAX_SIZE, grayscale=True):
 def generate_image_descriptors(
     filepath: str,
-    hasher: typing.Optional[LocalHasher] = None,
+    hasher: LocalHasher | None = None,
     min_features=DEFAULT_MIN_FEATURES,
     max_size=DEFAULT_MAX_SIZE,
-) -> typing.Optional[Descriptors]:
+) -> Descriptors | None:
     """Generate local descriptors for a file.
     Args:
@@ -362,7 +362,7 @@ def generate_image_descriptors(
 def build_reference_df(
     filepaths: typing.Iterable[str],
-    hasher: typing.Optional[LocalHasher] = None,
+    hasher: LocalHasher | None = None,
     min_features=DEFAULT_MIN_FEATURES,
     max_size=DEFAULT_MAX_SIZE,
     show_progress=False,
@@ -429,10 +429,10 @@ def check_hasher(df1: pd.DataFrame, df2: pd.DataFrame):
 def compute_pairs(
     match_df,
     query_df=None,
-    hasher: typing.Optional[LocalHasher] = None,
+    hasher: LocalHasher | None = None,
     pct_probe=0.1,
     use_gpu: bool = True,
-    faiss_cache_path: typing.Optional[str] = None,
+    faiss_cache_path: str | None = None,
     show_progress: bool = False,
 ):
     """Compute pairs of matching images from a reference
@@ -537,18 +537,18 @@ def deduplicate_sift_dfs(*args, **kwargs):
 def deduplicate_dfs(
     match_df: pd.DataFrame,
-    query_df: typing.Optional[pd.DataFrame] = None,
+    query_df: pd.DataFrame | None = None,
     coarse_pct_probe: float = ad.DEFAULT_PCT_PROBE,
-    max_workers: typing.Optional[int] = None,
+    max_workers: int | None = None,
     use_gpu: bool = True,
-    faiss_cache_path: typing.Optional[str] = None,
+    faiss_cache_path: str | None = None,
     verbose: bool = False,
-    hasher: typing.Optional[LocalHasher] = None,
+    hasher: LocalHasher | None = None,
     show_progress: bool = False,
-) -> typing.Union[
-    typing.List[typing.Tuple[typing.Any, typing.Any]],
-    typing.List[typing.Tuple[typing.Any, typing.Any, MatchStats]],
-]:
+) -> (
+    list[tuple[typing.Any, typing.Any]]
+    | list[tuple[typing.Any, typing.Any, MatchStats]]
+):
     """Deduplicate images within one set of images or between two sets of images:
     #. Given a dataframe (or two) of descriptors and keypoints for images.
     #. Perform a coarse, approximate search for images with common features.
@@ -606,10 +606,10 @@ def deduplicate_dfs(
     ), "Index of query_df must be unique, or it will cause wrong matches."
     LOGGER.debug("Validating candidate pairs: %d", len(candidates))
-    keep: typing.Union[
-        typing.List[typing.Tuple[typing.Any, typing.Any]],
-        typing.List[typing.Tuple[typing.Any, typing.Any, MatchStats]],
-    ] = []  # type: ignore
+    keep: (
+        list[tuple[typing.Any, typing.Any]]
+        | list[tuple[typing.Any, typing.Any, MatchStats]]
+    ) = []  # type: ignore
     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
         batch_size = 10_000
         for start in tqdm.tqdm(range(0, len(candidates), batch_size)):
@@ -638,20 +638,18 @@ def deduplicate_dfs(
 def deduplicate(
-    filepaths_or_reference_df: typing.Union[typing.Iterable[str], pd.DataFrame],
-    query_filepaths_or_df: typing.Optional[
-        typing.Union[typing.Iterable[str], pd.DataFrame]
-    ] = None,
+    filepaths_or_reference_df: typing.Iterable[str] | pd.DataFrame,
+    query_filepaths_or_df: None | (typing.Iterable[str] | pd.DataFrame) = None,
     max_features: int = DEFAULT_MAX_FEATURES,
     min_features: int = DEFAULT_MIN_FEATURES,
     max_size: int = DEFAULT_MAX_SIZE,
-    hasher: typing.Optional[LocalHasher] = None,
+    hasher: LocalHasher | None = None,
     show_progress: bool = False,
     **kwargs,
-) -> typing.Union[
-    typing.List[typing.Tuple[typing.Any, typing.Any]],
-    typing.List[typing.Tuple[typing.Any, typing.Any, MatchStats]],
-]:
+) -> (
+    list[tuple[typing.Any, typing.Any]]
+    | list[tuple[typing.Any, typing.Any, MatchStats]]
+):
     """Deduplicate images by doing the following:
     #. Unletterbox all images and resize to some maximum size, preserving
        aspect ratio.

{perception-0.7.5 → perception-0.7.7}/perception/hashers/hasher.py RENAMED Viewed

@@ -3,7 +3,6 @@ import typing
 import warnings
 from abc import ABC, abstractmethod
 from logging import warning
-from typing import Optional
 import numpy as np
 import scipy.spatial
@@ -50,7 +49,7 @@ class Hasher(ABC):
     def vector_to_string(
         self, vector: np.ndarray, hash_format: str = "base64"
-    ) -> typing.Optional[str]:
+    ) -> str | None:
         """Convert vector to hash string.
         Args:
@@ -61,8 +60,8 @@ class Hasher(ABC):
     def compute_distance(
         self,
-        hash1: typing.Union[np.ndarray, str],
-        hash2: typing.Union[np.ndarray, str],
+        hash1: np.ndarray | str,
+        hash2: np.ndarray | str,
         hash_format="base64",
     ):
         """Compute the distance between two hashes.
@@ -110,9 +109,9 @@ class Hasher(ABC):
     @typing.no_type_check
     def compute_parallel(
         self,
-        filepaths: typing.List[str],
-        progress: Optional["tqdm.tqdm"] = None,
-        progress_desc: Optional[str] = None,
+        filepaths: list[str],
+        progress: tqdm.tqdm | None = None,
+        progress_desc: str | None = None,
         max_workers: int = 5,
         isometric: bool = False,
     ):
@@ -231,9 +230,7 @@ class ImageHasher(Hasher):
     def compute(
         self, image: tools.ImageInputType, hash_format="base64"
-    ) -> typing.Union[
-        np.ndarray, typing.Optional[str], typing.List[typing.Optional[str]]
-    ]:
+    ) -> np.ndarray | str | None | list[str | None]:
         """Compute a hash from an image.
         Args:
@@ -259,10 +256,8 @@ class ImageHasher(Hasher):
     def compute_with_quality(
         self, image: tools.ImageInputType, hash_format="base64"
-    ) -> typing.Tuple[
-        typing.Union[
-            np.ndarray, typing.Optional[str], typing.List[typing.Optional[str]]
-        ],
+    ) -> tuple[
+        (np.ndarray | str | None | list[str | None]),
         int,
     ]:
         """Compute hash and hash quality from image.
@@ -287,7 +282,7 @@ class ImageHasher(Hasher):
             )
         return (self.vector_to_string(vector, hash_format=hash_format), quality)
-    def _compute_with_quality(self, image: np.ndarray) -> typing.Tuple[np.ndarray, int]:
+    def _compute_with_quality(self, image: np.ndarray) -> tuple[np.ndarray, int]:
         return self._compute(image), tools.compute_quality(image)
@@ -300,9 +295,9 @@ class VideoHasher(Hasher):
     def process_frame(
         self,
         frame: np.ndarray,
-        frame_index: typing.Optional[int],
-        frame_timestamp: typing.Optional[float],
-        state: Optional[dict] = None,
+        frame_index: int | None,
+        frame_timestamp: float | None,
+        state: dict | None = None,
     ) -> dict:
         """Called for each frame in the video. For all
         but the first frame, a state is provided recording the state from
@@ -327,7 +322,7 @@ class VideoHasher(Hasher):
     def compute_with_timestamps(
         self, filepath, errors="raise", hash_format="base64", **kwargs
     ):
-        scenes: typing.List[dict] = []
+        scenes: list[dict] = []
         hashes = self.compute(filepath, errors, hash_format, scenes, **kwargs)
         return [
             {

{perception-0.7.5 → perception-0.7.7}/perception/hashers/tools.py RENAMED Viewed

@@ -17,7 +17,6 @@ import warnings
 from collections import Counter
 from http import client
 from numbers import Number
-from typing import Optional
 from urllib import request
 import cv2
@@ -47,7 +46,7 @@ CUDA_CODECS = {
 }
 FramesWithIndexesAndTimestamps = typing.Generator[
-    typing.Tuple[np.ndarray, typing.Optional[int], typing.Optional[float]], None, None
+    tuple[np.ndarray, int | None, float | None], None, None
 ]
@@ -105,9 +104,7 @@ def get_string_length(hash_length: int, dtype: str, hash_format="hex") -> int:
     raise NotImplementedError("Unknown hash format: " + hash_format)
-def vector_to_string(
-    vector: np.ndarray, dtype: str, hash_format: str
-) -> typing.Optional[str]:
+def vector_to_string(vector: np.ndarray, dtype: str, hash_format: str) -> str | None:
     """Convert vector to hash.
     Args:
@@ -287,8 +284,8 @@ def get_common_framerates(id_rates: dict):
         min(framerates) >= 1 / factor
     ), "Framerates must be at least 1 frame per hour."
     best_frame_count = np.inf
-    best_grouping: typing.Optional[typing.List] = None
-    best_frame_rates: typing.Optional[typing.List] = None
+    best_grouping: list | None = None
+    best_frame_rates: list | None = None
     # We try every possible grouping of framerates to minimize the number
     # of frames we decode. There is likely a better way to do this,
@@ -432,7 +429,7 @@ def get_video_properties(filepath):
             raise ValueError(f"{str(out)}: {str(err)}")
         data = json.loads(out.decode("utf-8"))["streams"][0]
         numerator, denominator = tuple(map(int, data["avg_frame_rate"].split("/")[:2]))
-        avg_frame_rate: typing.Optional[fractions.Fraction]
+        avg_frame_rate: fractions.Fraction | None
         if numerator > 0 and denominator > 0:
             avg_frame_rate = fractions.Fraction(
                 numerator=numerator, denominator=denominator
@@ -450,11 +447,11 @@ def get_video_properties(filepath):
 def read_video_to_generator_ffmpeg(
     filepath,
-    frames_per_second: typing.Optional[typing.Union[str, float]] = None,
+    frames_per_second: str | float | None = None,
     errors="raise",
-    max_duration: Optional[float] = None,
-    max_size: Optional[int] = None,
-    interp: Optional[str] = None,
+    max_duration: float | None = None,
+    max_size: int | None = None,
+    interp: str | None = None,
     frame_rounding: str = "up",
     draw_timestamps=False,
     use_cuda=False,
@@ -519,7 +516,7 @@ def read_video_to_generator_ffmpeg(
             start_time,
         ) = get_video_properties(filepath)
         start_time_offset = (
-            0.0 if avg_frame_rate is None else float((1 / (2 * avg_frame_rate)))
+            0.0 if avg_frame_rate is None else float(1 / (2 * avg_frame_rate))
         )
         LOGGER.debug(
             "raw_width: %s, raw_height: %s, avg_frame_rate: %s, codec_name: %s, start_time: %s",
@@ -597,8 +594,8 @@ def read_video_to_generator_ffmpeg(
             bufsize=bufsize,
         ) as p:
             assert p.stdout is not None, "Could not launch subprocess pipe."
-            timestamp: typing.Optional[float] = 0
-            frame_index: typing.Optional[int] = 0
+            timestamp: float | None = 0
+            frame_index: int | None = 0
             while True:
                 batch = p.stdout.read(bufsize)
                 if not batch:
@@ -648,10 +645,10 @@ def read_video_to_generator_ffmpeg(
 def read_video_to_generator(
     filepath,
-    frames_per_second: typing.Optional[typing.Union[str, float]] = None,
+    frames_per_second: str | float | None = None,
     errors="raise",
-    max_duration: Optional[float] = None,
-    max_size: Optional[int] = None,
+    max_duration: float | None = None,
+    max_size: int | None = None,
 ) -> FramesWithIndexesAndTimestamps:
     """This is used by :code:`read_video` when :code:`use_ffmpeg` is False (default).
@@ -674,7 +671,7 @@ def read_video_to_generator(
     if not os.path.isfile(filepath):
         raise FileNotFoundError(f"Could not find {filepath}.")
     if not os.access(filepath, os.R_OK):
-        raise IOError(f"{filepath} is not readable")
+        raise OSError(f"{filepath} is not readable")
     cap = cv2.VideoCapture(filename=filepath, apiPreference=cv2.CAP_FFMPEG)
     try:
         # The purpose of the following block is largely to create a
@@ -702,9 +699,9 @@ def read_video_to_generator(
         seconds_between_grabbed_frames = 1 / file_frames_per_second
         grabbed_frame_count = 0
         if frames_per_second == "keyframes":
-            frame_indexes: typing.Union[
-                range, typing.List[int], typing.Iterator[int]
-            ] = _get_keyframes(filepath)
+            frame_indexes: range | list[int] | typing.Iterator[int] = _get_keyframes(
+                filepath
+            )
             # The repeat flag is used to handle the case where the
             # desired sampling rate is higher than the file's frame
             # rate. In this case, we will need to repeat frames in
@@ -723,7 +720,7 @@ def read_video_to_generator(
             scale = min(max_size / max(input_width, input_height), 1)
         else:
             scale = 1
-        target_size: typing.Optional[typing.Tuple[int, int]]
+        target_size: tuple[int, int] | None
         if scale < 1:
             target_size = (int(scale * input_width), int(scale * input_height))
         else:
@@ -780,7 +777,7 @@ def read_video_into_queue(*args, video_queue, terminate, func, **kwargs):
 def read_video(
     filepath,
-    frames_per_second: typing.Optional[typing.Union[str, float]] = None,
+    frames_per_second: str | float | None = None,
     max_queue_size=128,
     use_queue=True,
     errors="raise",
@@ -822,12 +819,12 @@ def read_video(
         generator = read_video_to_generator_ffmpeg
     else:
         generator = read_video_to_generator
-    frame_index: typing.Optional[int]
-    timestamp: typing.Optional[float]
+    frame_index: int | None
+    timestamp: float | None
     if use_queue:
-        video_queue = queue.Queue(
+        video_queue: queue.Queue[tuple[np.ndarray, int, float]] = queue.Queue(
             maxsize=max_queue_size
-        )  # type: queue.Queue[typing.Tuple[np.ndarray, int, float]]
+        )
         terminate = threading.Event()
         thread = threading.Thread(
             target=read_video_into_queue,
@@ -964,7 +961,7 @@ def compute_synchronized_video_hashes(
 def unletterbox(
     image, only_remove_black: bool = False, min_fraction_meaningful_pixels: float = 0.1
-) -> typing.Optional[typing.Tuple[typing.Tuple[int, int], typing.Tuple[int, int]]]:
+) -> tuple[tuple[int, int], tuple[int, int]] | None:
     """Return bounds of non-trivial region of image or None.
     Unletterboxing is cropping an image such that trivial edge regions

{perception-0.7.5 → perception-0.7.7}/perception/hashers/video/framewise.py RENAMED Viewed

@@ -1,5 +1,3 @@
-from typing import Optional
 import numpy as np
 from .. import tools
@@ -17,7 +15,7 @@ class FramewiseHasher(VideoHasher):
         frame_hasher: ImageHasher,
         interframe_threshold: float,
         frames_per_second: int = 15,
-        quality_threshold: Optional[float] = None,
+        quality_threshold: float | None = None,
     ):
         self.hash_length = frame_hasher.hash_length
         self.frames_per_second = frames_per_second
@@ -25,10 +23,8 @@ class FramewiseHasher(VideoHasher):
         self.distance_metric = frame_hasher.distance_metric
         if self.distance_metric == "hamming" and interframe_threshold > 1:
             raise ValueError(
-                (
-                    "Hamming distance is always between 0 and 1 but "
-                    f"`interframe_threshold` was set to {interframe_threshold}."
-                )
+                "Hamming distance is always between 0 and 1 but "
+                f"`interframe_threshold` was set to {interframe_threshold}."
             )
         self.dtype = frame_hasher.dtype
         self.interframe_threshold = interframe_threshold

{perception-0.7.5 → perception-0.7.7}/perception/hashers/video/scenes.py RENAMED Viewed

@@ -1,5 +1,4 @@
 import logging
-from typing import Optional
 import cv2
 import numpy as np
@@ -37,7 +36,7 @@ class SimpleSceneDetection(VideoHasher):
     def __init__(
         self,
-        base_hasher: Optional[VideoHasher] = None,
+        base_hasher: VideoHasher | None = None,
         interscene_threshold=None,
         min_frame_size=50,
         similarity_threshold=0.95,
@@ -131,12 +130,10 @@ class SimpleSceneDetection(VideoHasher):
         if subhash is not None and (
             self.base_hasher.returns_multiple
             or (
-                (
-                    self.interscene_threshold is None
-                    or not state["scenes"]
-                    or self.compute_distance(state["scenes"][-1]["hash"], subhash)
-                    > self.interscene_threshold
-                )
+                self.interscene_threshold is None
+                or not state["scenes"]
+                or self.compute_distance(state["scenes"][-1]["hash"], subhash)
+                > self.interscene_threshold
             )
         ):
             # Persist the scene's hash, frames, start timestamp, and end timestamp.

{perception-0.7.5 → perception-0.7.7}/perception/hashers/video/tmk.py RENAMED Viewed

@@ -1,4 +1,3 @@
-from typing import Optional
 import platform
 import warnings
@@ -17,7 +16,7 @@ class TMKL2(VideoHasher):
     def __init__(
         self,
-        frame_hasher: Optional[ImageHasher] = None,
+        frame_hasher: ImageHasher | None = None,
         frames_per_second: int = 15,
         normalization: str = "matrix",
     ):
@@ -119,23 +118,23 @@ class TMKL2(VideoHasher):
             fv_b = fv_b / norm_b
         if "freq" in normalization:
-            norm_a, norm_b = [
+            norm_a, norm_b = (
                 np.sqrt((fv**2).sum(axis=1, keepdims=True) / self.m + eps) + eps
                 for fv in [fv_a, fv_b]
-            ]
+            )
             fv_a = fv_a / norm_a
             fv_b = fv_b / norm_b
         if normalization == "matrix":
-            norm_a, norm_b = [
+            norm_a, norm_b = (
                 np.sqrt(np.sum(fv**2, axis=(1, 2)) + eps)[..., np.newaxis] + eps
                 for fv in [fv_a, fv_b]
-            ]  # (T, 1)
+            )  # (T, 1)
-        fv_a_sin, fv_b_sin = [fv[:, : self.m] for fv in [fv_a, fv_b]]  # (T, m, d)
-        fv_a_cos, fv_b_cos = [fv[:, self.m :] for fv in [fv_a, fv_b]]  # (T, m, d)
+        fv_a_sin, fv_b_sin = (fv[:, : self.m] for fv in [fv_a, fv_b])  # (T, m, d)
+        fv_a_cos, fv_b_cos = (fv[:, self.m :] for fv in [fv_a, fv_b])  # (T, m, d)
         ms = self.ms.reshape(-1, 1)  # (m, 1)
-        dot_sin_sin, dot_sin_cos, dot_cos_cos, dot_cos_sin = [
+        dot_sin_sin, dot_sin_cos, dot_cos_cos, dot_cos_sin = (
             np.sum(p, axis=2, keepdims=True)
             for p in [
                 fv_a_sin * fv_b_sin,
@@ -143,7 +142,7 @@ class TMKL2(VideoHasher):
                 fv_a_cos * fv_b_cos,
                 fv_a_cos * fv_b_sin,
             ]
-        ]  # (T, m, 1)
+        )  # (T, m, 1)
         delta = (
             ms.reshape(1, -1, 1) * offsets.reshape(1, -1) / self.T.reshape((-1, 1, 1))
         )
@@ -169,7 +168,7 @@ class TMKL1(VideoHasher):
     def __init__(
         self,
-        frame_hasher: Optional[ImageHasher] = None,
+        frame_hasher: ImageHasher | None = None,
         frames_per_second: int = 15,
         dtype="float32",
         distance_metric="cosine",

{perception-0.7.5 → perception-0.7.7}/perception/testing/__init__.py RENAMED Viewed

@@ -127,7 +127,7 @@ def test_hasher_parallelization(hasher, test_filepaths):
 def test_video_hasher_integrity(
-    hasher: hashers.VideoHasher, test_videos: typing.List[str] = DEFAULT_TEST_VIDEOS
+    hasher: hashers.VideoHasher, test_videos: list[str] = DEFAULT_TEST_VIDEOS
 ):
     test_hasher_parallelization(hasher, test_videos)
@@ -136,7 +136,7 @@ def test_image_hasher_integrity(
     hasher: hashers.ImageHasher,
     pil_opencv_threshold: float,
     transform_threshold: float,
-    test_images: typing.List[str] = DEFAULT_TEST_IMAGES,
+    test_images: list[str] = DEFAULT_TEST_IMAGES,
     opencv_hasher: bool = False,
 ):
     """Test to ensure a hasher works correctly.

{perception-0.7.5 → perception-0.7.7}/perception/tools.py RENAMED Viewed

@@ -1,11 +1,9 @@
 import base64
 import json
 import os
-import typing
 import urllib.parse
 import urllib.request
 import warnings
-from typing import Optional
 import numpy as np
 from scipy import spatial
@@ -25,9 +23,7 @@ except ImportError:
     extensions = None
-def _multiple_hashes_for_ids(
-    hashes: typing.List[typing.Tuple[str, typing.Union[str, np.ndarray]]]
-):
+def _multiple_hashes_for_ids(hashes: list[tuple[str, str | np.ndarray]]):
     """Check if a list of (hash_id, hash) tuples has more
     than one hash for a hash_id.
@@ -39,15 +35,15 @@ def _multiple_hashes_for_ids(
 def deduplicate_hashes(
-    hashes: typing.List[typing.Tuple[str, typing.Union[str, np.ndarray]]],
+    hashes: list[tuple[str, str | np.ndarray]],
     threshold: float,
     hash_format: str = "base64",
-    hasher: Optional[perception_hashers.ImageHasher] = None,
-    hash_length: Optional[int] = None,
-    hash_dtype: Optional[str] = None,
-    distance_metric: Optional[str] = None,
-    progress: Optional[tqdm] = None,
-) -> typing.List[typing.Tuple[str, str]]:
+    hasher: perception_hashers.ImageHasher | None = None,
+    hash_length: int | None = None,
+    hash_dtype: str | None = None,
+    distance_metric: str | None = None,
+    progress: tqdm | None = None,
+) -> list[tuple[str, str]]:
     """Find duplicates using a list of precomputed hashes.
     Args:
@@ -102,7 +98,7 @@ def deduplicate_hashes(
         ]
     )
     files = np.array([identifier for identifier, _ in hashes])
-    pairs: typing.List[typing.Tuple[str, str]] = []
+    pairs: list[tuple[str, str]] = []
     n_hashes = len(vectors)
     start_idx = 0
     end_idx = None
@@ -134,7 +130,7 @@ def deduplicate_hashes(
         # this so we can pass it to the compute_euclidean_pairwise_duplicates
         # function.
         if multiple_hashes_per_id:
-            counts = np.zeros(shape=len(set(hash_id for hash_id, _ in hashes))).astype(
+            counts = np.zeros(shape=len({hash_id for hash_id, _ in hashes})).astype(
                 "uint32"
             )
             previous_hash_id = None
@@ -162,11 +158,11 @@ def deduplicate_hashes(
 def deduplicate(
-    files: typing.List[str],
-    hashers: typing.List[typing.Tuple[perception_hashers.ImageHasher, float]],
+    files: list[str],
+    hashers: list[tuple[perception_hashers.ImageHasher, float]],
     isometric: bool = False,
-    progress: Optional[tqdm] = None,
-) -> typing.List[typing.Tuple[str, str]]:
+    progress: tqdm | None = None,
+) -> list[tuple[str, str]]:
     """Find duplicates in a list of files.
     Args:
@@ -187,7 +183,7 @@ def deduplicate(
             category=UserWarning,
         )
         files = list(files_dedup)
-    pairs: typing.List[typing.Tuple[str, str]] = []
+    pairs: list[tuple[str, str]] = []
     for hasher_idx, (hasher, threshold) in enumerate(hashers):
         hash_dicts = hasher.compute_parallel(
             filepaths=files,
@@ -271,12 +267,12 @@ class SaferMatcher:
     def __init__(
         self,
-        api_key: Optional[str] = None,
-        username: Optional[str] = None,
-        password: Optional[str] = None,
-        url: Optional[str] = None,
-        hasher: Optional[perception_hashers.ImageHasher] = None,
-        hasher_api_id: Optional[str] = None,
+        api_key: str | None = None,
+        username: str | None = None,
+        password: str | None = None,
+        url: str | None = None,
+        hasher: perception_hashers.ImageHasher | None = None,
+        hasher_api_id: str | None = None,
         quality_threshold: int = 90,
     ):
         if (
@@ -322,11 +318,7 @@ class SaferMatcher:
     def match(
         self,
-        images: typing.List[
-            typing.Union[
-                str, typing.Tuple[perception_hashers.tools.ImageInputType, str]
-            ]
-        ],
+        images: list[(str | tuple[perception_hashers.tools.ImageInputType, str])],
     ) -> dict:
         """Match hashes with the Safer matching service.

{perception-0.7.5 → perception-0.7.7}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "Perception"
-version = "0.7.5"
+version = "0.7.7"
 description = "Perception provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use."
 authors = ["Thorn <info@wearethorn.org>"]
 license = "Apache License 2.0"

{perception-0.7.5 → perception-0.7.7}/setup.py RENAMED Viewed

@@ -38,7 +38,7 @@ extras_require = \
 setup_kwargs = {
     'name': 'Perception',
-    'version': '0.7.5',
+    'version': '0.7.7',
     'description': 'Perception provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use.',
     'long_description': "# perception ![ci](https://github.com/thorn-oss/perception/workflows/ci/badge.svg)\n\n`perception` provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use. See [the documentation](https://perception.thorn.engineering/en/latest/) for details.\n\n## Background\n\n`perception` was initially developed at [Thorn](https://www.thorn.org) as part of our work to eliminate child sexual abuse material from the internet. For more information on the issue, check out [our CEO's TED talk](https://www.thorn.org/blog/time-is-now-eliminate-csam/).\n\n## Getting Started\n\n### Installation\n\n`pip install perception`\n\n### Hashing\n\nHashing with different functions is simple with `perception`.\n\n```python\nfrom perception import hashers\n\nfile1, file2 = 'test1.jpg', 'test2.jpg'\nhasher = hashers.PHash()\nhash1, hash2 = hasher.compute(file1), hasher.compute(file2)\ndistance = hasher.compute_distance(hash1, hash2)\n```\n\n### Examples\n\nSee below for end-to-end examples for common use cases for perceptual hashes.\n\n- [Detecting child sexual abuse material](https://perception.thorn.engineering/en/latest/examples/detecting_csam.html)\n- [Deduplicating media](https://perception.thorn.engineering/en/latest/examples/deduplication.html)\n- [Benchmarking perceptual hashes](https://perception.thorn.engineering/en/latest/examples/benchmarking.html)\n\n## Supported Hashing Algorithms\n\n`perception` currently ships with:\n\n- pHash (DCT hash) (`perception.hashers.PHash`)\n- Facebook's PDQ Hash (`perception.hashers.PDQ`)\n- dHash (difference hash) (`perception.hashers.DHash`)\n- aHash (average hash) (`perception.hashers.AverageHash`)\n- Marr-Hildreth (`perception.hashers.MarrHildreth`)\n- Color Moment (`perception.hashers.ColorMoment`)\n- Block Mean (`perception.hashers.BlockMean`)\n- wHash (wavelet hash) (`perception.hashers.WaveletHash`)\n\n## Contributing\n\nTo work on the project, 
start by doing the following.\n\n```bash\n# Install local dependencies for\n# code completion, etc.\nmake init\n\n- To do a (close to) comprehensive check before committing code, you can use `make precommit`.\n\nTo implement new features, please first file an issue proposing your change for discussion.\n\nTo report problems, please file an issue with sample code, expected results, actual results, and a complete traceback.\n\n## Alternatives\n\nThere are other packages worth checking out to see if they meet your needs for perceptual hashing. Here are some\nexamples.\n\n- [dedupe](https://github.com/dedupeio/dedupe)\n- [imagededup](https://idealo.github.io/imagededup/)\n- [ImageHash](https://github.com/JohannesBuchner/imagehash)\n- [PhotoHash](https://github.com/bunchesofdonald/photohash)\n```\n",
     'author': 'Thorn',