PyPI - Perception - Versions diffs - 0.7.6__tar.gz → 0.8.0__tar.gz - Mend

Perception 0.7.6 (tar.gz) → 0.8.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

{perception-0.7.6 → perception-0.8.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.3
 Name: Perception
-Version: 0.7.6
+Version: 0.8.0
 Summary: Perception provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use.
 License: Apache-2.0
 Author: Thorn
@@ -11,6 +11,7 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Provides-Extra: benchmarking
 Provides-Extra: experimental
 Provides-Extra: matching

{perception-0.7.6/perception/experimental → perception-0.8.0/perception/approximate_deduplication}/debug.py RENAMED Viewed

@@ -4,7 +4,7 @@ import random
 import cv2
 import numpy as np
-import perception.experimental.local_descriptor_deduplication as ldd
+import perception.local_descriptor_deduplication as ldd
 LOGGER = logging.getLogger(__name__)

{perception-0.7.6 → perception-0.8.0}/perception/hashers/__init__.py RENAMED Viewed

@@ -5,9 +5,9 @@ from .image.opencv import BlockMean, ColorMoment, MarrHildreth
 from .image.phash import PHash, PHashF, PHashU8
 from .image.wavelet import WaveletHash
 from .video.framewise import FramewiseHasher
-from .video.scenes import SimpleSceneDetection
 from .video.tmk import TMKL1, TMKL2
 __all__ = [
     "ImageHasher",
     "VideoHasher",
@@ -23,5 +23,4 @@ __all__ = [
     "TMKL2",
     "PHashU8",
     "PHashF",
-    "SimpleSceneDetection",
 ]

{perception-0.7.6 → perception-0.8.0}/perception/hashers/hasher.py RENAMED Viewed

@@ -319,21 +319,6 @@ class VideoHasher(Hasher):
             state: The state dictionary at the end of processing.
         """
-    def compute_with_timestamps(
-        self, filepath, errors="raise", hash_format="base64", **kwargs
-    ):
-        scenes: list[dict] = []
-        hashes = self.compute(filepath, errors, hash_format, scenes, **kwargs)
-        return [
-            {
-                "hash": hashes[i],
-                "start_timestamp": scene.get("start_timestamp"),
-                "end_timestamp": scene.get("end_timestamp"),
-                "frame_index": scene.get("frame_index"),
-            }
-            for i, scene in enumerate(scenes)
-        ]
     def compute(
         self,
         filepath,

perception-0.8.0/perception/hashers/video/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .framewise import FramewiseHasher
+from .tmk import TMKL1, TMKL2
+__all__ = ["FramewiseHasher", "TMKL1", "TMKL2"]

{perception-0.7.6/perception/experimental → perception-0.8.0/perception}/local_descriptor_deduplication.py RENAMED Viewed

@@ -10,7 +10,7 @@ import pandas as pd
 import tqdm
 import typing_extensions
-import perception.experimental.approximate_deduplication as ad
+import perception.approximate_deduplication as ad
 import perception.hashers.tools as pht
 LOGGER = logging.getLogger(__name__)

{perception-0.7.6 → perception-0.8.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "Perception"
-version = "0.7.6"
+version = "0.8.0"
 description = "Perception provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use."
 authors = ["Thorn <info@wearethorn.org>"]
 license = "Apache License 2.0"

{perception-0.7.6 → perception-0.8.0}/setup.py RENAMED Viewed

@@ -3,9 +3,8 @@ from setuptools import setup
 packages = \
 ['perception',
+ 'perception.approximate_deduplication',
  'perception.benchmarking',
- 'perception.experimental',
- 'perception.experimental.ann',
  'perception.hashers',
  'perception.hashers.image',
  'perception.hashers.video',
@@ -38,7 +37,7 @@ extras_require = \
 setup_kwargs = {
     'name': 'Perception',
-    'version': '0.7.6',
+    'version': '0.8.0',
     'description': 'Perception provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use.',
     'long_description': "# perception ![ci](https://github.com/thorn-oss/perception/workflows/ci/badge.svg)\n\n`perception` provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use. See [the documentation](https://perception.thorn.engineering/en/latest/) for details.\n\n## Background\n\n`perception` was initially developed at [Thorn](https://www.thorn.org) as part of our work to eliminate child sexual abuse material from the internet. For more information on the issue, check out [our CEO's TED talk](https://www.thorn.org/blog/time-is-now-eliminate-csam/).\n\n## Getting Started\n\n### Installation\n\n`pip install perception`\n\n### Hashing\n\nHashing with different functions is simple with `perception`.\n\n```python\nfrom perception import hashers\n\nfile1, file2 = 'test1.jpg', 'test2.jpg'\nhasher = hashers.PHash()\nhash1, hash2 = hasher.compute(file1), hasher.compute(file2)\ndistance = hasher.compute_distance(hash1, hash2)\n```\n\n### Examples\n\nSee below for end-to-end examples for common use cases for perceptual hashes.\n\n- [Detecting child sexual abuse material](https://perception.thorn.engineering/en/latest/examples/detecting_csam.html)\n- [Deduplicating media](https://perception.thorn.engineering/en/latest/examples/deduplication.html)\n- [Benchmarking perceptual hashes](https://perception.thorn.engineering/en/latest/examples/benchmarking.html)\n\n## Supported Hashing Algorithms\n\n`perception` currently ships with:\n\n- pHash (DCT hash) (`perception.hashers.PHash`)\n- Facebook's PDQ Hash (`perception.hashers.PDQ`)\n- dHash (difference hash) (`perception.hashers.DHash`)\n- aHash (average hash) (`perception.hashers.AverageHash`)\n- Marr-Hildreth (`perception.hashers.MarrHildreth`)\n- Color Moment (`perception.hashers.ColorMoment`)\n- Block Mean (`perception.hashers.BlockMean`)\n- wHash (wavelet hash) (`perception.hashers.WaveletHash`)\n\n## Contributing\n\nTo work on the project, 
start by doing the following.\n\n```bash\n# Install local dependencies for\n# code completion, etc.\nmake init\n\n- To do a (close to) comprehensive check before committing code, you can use `make precommit`.\n\nTo implement new features, please first file an issue proposing your change for discussion.\n\nTo report problems, please file an issue with sample code, expected results, actual results, and a complete traceback.\n\n## Alternatives\n\nThere are other packages worth checking out to see if they meet your needs for perceptual hashing. Here are some\nexamples.\n\n- [dedupe](https://github.com/dedupeio/dedupe)\n- [imagededup](https://idealo.github.io/imagededup/)\n- [ImageHash](https://github.com/JohannesBuchner/imagehash)\n- [PhotoHash](https://github.com/bunchesofdonald/photohash)\n```\n",
     'author': 'Thorn',

perception-0.7.6/perception/experimental/__init__.py DELETED Viewed

File without changes

perception-0.7.6/perception/experimental/ann/__init__.py DELETED Viewed

File without changes

perception-0.7.6/perception/hashers/video/__init__.py DELETED Viewed

@@ -1,5 +0,0 @@
-from .framewise import FramewiseHasher
-from .scenes import SimpleSceneDetection
-from .tmk import TMKL1, TMKL2
-__all__ = ["FramewiseHasher", "TMKL1", "TMKL2", "SimpleSceneDetection"]

perception-0.7.6/perception/hashers/video/scenes.py DELETED Viewed

@@ -1,238 +0,0 @@
-import logging
-import cv2
-import numpy as np
-from ...utils import flatten
-from .. import tools
-from ..hasher import VideoHasher
-from ..image.phash import PHashU8
-from .tmk import TMKL1
-logger = logging.getLogger(__name__)
-class SimpleSceneDetection(VideoHasher):
-    """The SimpleSceneDetection hasher is a wrapper around other video hashers
-    to create separate hashes for different scenes / shots in a video. It works
-    by shrinking each frame, blurring it, and doing a simple delta with the previous
-    frame. If they are different, this marks the start of a new scene. In addition,
-    this wrapper will also remove letterboxing from videos by checking for solid
-    black areas on the edges of the frame.
-    Args:
-        base_hasher: The base video hasher to use for each scene.
-        interscene_threshold: The distance threshold between sequential scenes that
-            new hashes must meet to be included (this is essentially for deduplication)
-        min_frame_size: The minimum frame size to use for computing hashes. This is
-            relevant for letterbox detection as black frames will tend to be completely
-            "cropped" and make the frame very small.
-        max_scene_length: The maximum length of a single scene.
-        similarity_threshold: The threshold for detecting whether two frames are
-            different enough to constitute a new scene.
-    """
-    returns_multiple = True
-    def __init__(
-        self,
-        base_hasher: VideoHasher | None = None,
-        interscene_threshold=None,
-        min_frame_size=50,
-        similarity_threshold=0.95,
-        max_scene_length=None,
-    ):
-        if base_hasher is None:
-            base_hasher = TMKL1(
-                frames_per_second=2,
-                frame_hasher=PHashU8(
-                    exclude_first_term=False, freq_shift=1, hash_size=12
-                ),
-                distance_metric="euclidean",
-                dtype="uint8",
-                norm=None,
-                quality_threshold=90,
-            )
-            if interscene_threshold is None:
-                interscene_threshold = 50
-        if interscene_threshold is not None and base_hasher.returns_multiple:
-            raise ValueError(
-                "Interscene thresholds not supported for hashers returning multiple hashes."
-            )
-        self.base_hasher = base_hasher
-        self.frames_per_second = base_hasher.frames_per_second
-        self.distance_metric = base_hasher.distance_metric
-        self.dtype = base_hasher.dtype
-        self.hash_length = base_hasher.hash_length
-        self.max_scene_length = max_scene_length
-        self.interscene_threshold = interscene_threshold
-        self.min_frame_size = min_frame_size
-        self.similarity_threshold = similarity_threshold
-    def compute_batches(
-        self, filepath, errors="raise", hash_format="base64", batch_size=10
-    ):
-        """Compute a hash for a video at a given filepath and
-        yield hashes in a given batch size.
-        Args:
-            filepath: Path to video file
-            errors: One of "raise", "ignore", or "warn". Passed
-                to perception.hashers.tools.read_video.
-            hash_format: The hash format to use when returning hashes.
-            batch_size: The minimum number of hashes to include in each batch.
-        """
-        def convert(scenes):
-            if hash_format == "vector":
-                return scenes
-            if self.base_hasher.returns_multiple:
-                return [
-                    (
-                        [
-                            self.vector_to_string(h, hash_format=hash_format)
-                            for h in scene["hash"]
-                        ],
-                        scene["frames"],
-                    )
-                    for scene in scenes
-                ]
-            return [
-                (
-                    self.vector_to_string(scene["hash"], hash_format=hash_format),
-                    scene["frames"],
-                )
-                for scene in scenes
-            ]
-        state = None
-        for frame, frame_index, frame_timestamp in tools.read_video(
-            filepath=filepath, frames_per_second=self.frames_per_second, errors=errors
-        ):
-            state = self.process_frame(
-                frame=frame,
-                frame_index=frame_index,
-                frame_timestamp=frame_timestamp,
-                state=state,
-                batch_mode=True,
-            )
-            if len(state["scenes"]) >= batch_size:
-                yield convert(state["scenes"])
-                state["scenes"] = []
-        assert state is not None
-        if state["substate"]:
-            self.handle_scene(state)
-        if state["scenes"]:
-            yield convert(state["scenes"])
-    def handle_scene(self, state, frame_timestamp=None, frame_index=None):
-        subhash = self.base_hasher.hash_from_final_state(state["substate"])
-        if subhash is not None and (
-            self.base_hasher.returns_multiple
-            or (
-                self.interscene_threshold is None
-                or not state["scenes"]
-                or self.compute_distance(state["scenes"][-1]["hash"], subhash)
-                > self.interscene_threshold
-            )
-        ):
-            # Persist the scene's hash, frames, start timestamp, and end timestamp.
-            # If frame_timestamp is None, we can assume we've reached the end of
-            # the video and should use the end timestamp instead
-            state["scenes"].append(
-                {
-                    "hash": subhash,
-                    "frames": state["frames"],
-                    "start_timestamp": state["start"],
-                    "end_timestamp": frame_timestamp or state.get("end"),
-                    "frame_index": state["frame_index"],
-                }
-            )
-        state["substate"] = None
-        state["bounds"] = None
-        state["frames"] = []
-        state["previous_frame"] = None
-        if frame_timestamp is not None:
-            state["start"] = frame_timestamp
-        if frame_index is not None:
-            state["frame_index"] = frame_index
-    def crop(self, frame, bounds):
-        # Check to see we have set bounds for this scene yet.
-        if not bounds:
-            # We don't have bounds, so we'll set them.
-            bounds = tools.unletterbox(frame)
-            # If the bounds come back invalid (i.e., the frame is too small)
-            # or no bounds are found (i.e., the frame is all back), we
-            # return None.
-            if (
-                bounds is None
-                or min(bounds[0][1] - bounds[0][0], bounds[1][1] - bounds[1][0])
-                < self.min_frame_size
-            ):
-                return None, None, None
-        (x1, x2), (y1, y2) = bounds
-        cropped = np.ascontiguousarray(frame[y1:y2, x1:x2])
-        current = cv2.resize(cv2.cvtColor(cropped, cv2.COLOR_RGB2GRAY), (128, 128))
-        current = cv2.blur(current, ksize=(4, 4))
-        return cropped, current, bounds
-    def process_frame(
-        self, frame, frame_index, frame_timestamp, state=None, batch_mode=False
-    ):
-        if not state:
-            state = {
-                "previous_frame": None,
-                "substate": None,
-                "start": 0,
-                "bounds": None,
-                "frames": [],
-                "scenes": [],
-                "frame_index": frame_index,
-            }
-        cropped, current, state["bounds"] = self.crop(frame, state["bounds"])
-        if cropped is None:
-            # A good crop was not found so we set the start of the scene to this
-            # point and continue on to the next frame. This will repeat until we
-            # find appropriate bounds.
-            state["start"] = frame_timestamp
-            return state
-        # Check if we have a previous frame to compare the
-        # current frame to.
-        if state["previous_frame"] is not None:
-            # Compute similarity between the previous frame and the
-            # current frame.
-            similarity = 1 - np.abs(
-                state["previous_frame"].astype("float32") - current.astype("float32")
-            ).sum() / (255 * 128**2)
-            # If the previous frame and the current one are too dissimilar, we've started
-            # a new scene and we should handle it appropriately
-            if similarity < self.similarity_threshold or (
-                self.max_scene_length is not None
-                and frame_timestamp - state["start"] > self.max_scene_length
-            ):
-                self.handle_scene(state, frame_timestamp, frame_index)
-                cropped, current, state["bounds"] = self.crop(frame, state["bounds"])
-                if cropped is None:
-                    # See comment above about invalid crops.
-                    state["start"] = frame_timestamp
-                    return state
-        state["previous_frame"] = current
-        try:
-            state["substate"] = self.base_hasher.process_frame(
-                cropped, frame_index, frame_timestamp, state=state["substate"]
-            )
-            if batch_mode:
-                state["frames"].append((frame, frame_index, frame_timestamp))
-        except Exception as e:
-            logger.warning("An error occurred while processing a frame: %s", str(e))
-        return state
-    def hash_from_final_state(self, state):
-        if state["substate"]:
-            self.handle_scene(state)
-        if not self.base_hasher.returns_multiple:
-            return [h["hash"] for h in state["scenes"]]
-        return flatten([scene["hash"] for scene in state["scenes"]])