Perception 0.7.6__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. {perception-0.7.6 → perception-0.8.0}/PKG-INFO +3 -2
  2. {perception-0.7.6/perception/experimental → perception-0.8.0/perception/approximate_deduplication}/debug.py +1 -1
  3. {perception-0.7.6 → perception-0.8.0}/perception/hashers/__init__.py +1 -2
  4. {perception-0.7.6 → perception-0.8.0}/perception/hashers/hasher.py +0 -15
  5. perception-0.8.0/perception/hashers/video/__init__.py +4 -0
  6. {perception-0.7.6/perception/experimental → perception-0.8.0/perception}/local_descriptor_deduplication.py +1 -1
  7. {perception-0.7.6 → perception-0.8.0}/pyproject.toml +1 -1
  8. {perception-0.7.6 → perception-0.8.0}/setup.py +2 -3
  9. perception-0.7.6/perception/experimental/__init__.py +0 -0
  10. perception-0.7.6/perception/experimental/ann/__init__.py +0 -0
  11. perception-0.7.6/perception/hashers/video/__init__.py +0 -5
  12. perception-0.7.6/perception/hashers/video/scenes.py +0 -238
  13. {perception-0.7.6 → perception-0.8.0}/LICENSE +0 -0
  14. {perception-0.7.6 → perception-0.8.0}/README.md +0 -0
  15. {perception-0.7.6 → perception-0.8.0}/build.py +0 -0
  16. {perception-0.7.6 → perception-0.8.0}/perception/__init__.py +0 -0
  17. /perception-0.7.6/perception/experimental/approximate_deduplication.py → /perception-0.8.0/perception/approximate_deduplication/__init__.py +0 -0
  18. {perception-0.7.6/perception/experimental/ann → perception-0.8.0/perception/approximate_deduplication}/index.py +0 -0
  19. {perception-0.7.6/perception/experimental/ann → perception-0.8.0/perception/approximate_deduplication}/serve.py +0 -0
  20. {perception-0.7.6 → perception-0.8.0}/perception/benchmarking/__init__.py +0 -0
  21. {perception-0.7.6 → perception-0.8.0}/perception/benchmarking/common.py +0 -0
  22. {perception-0.7.6 → perception-0.8.0}/perception/benchmarking/extensions.pyx +0 -0
  23. {perception-0.7.6 → perception-0.8.0}/perception/benchmarking/image.py +0 -0
  24. {perception-0.7.6 → perception-0.8.0}/perception/benchmarking/image_transforms.py +0 -0
  25. {perception-0.7.6 → perception-0.8.0}/perception/benchmarking/video.py +0 -0
  26. {perception-0.7.6 → perception-0.8.0}/perception/benchmarking/video_transforms.py +0 -0
  27. {perception-0.7.6 → perception-0.8.0}/perception/extensions.pyx +0 -0
  28. {perception-0.7.6 → perception-0.8.0}/perception/hashers/image/__init__.py +0 -0
  29. {perception-0.7.6 → perception-0.8.0}/perception/hashers/image/average.py +0 -0
  30. {perception-0.7.6 → perception-0.8.0}/perception/hashers/image/dhash.py +0 -0
  31. {perception-0.7.6 → perception-0.8.0}/perception/hashers/image/opencv.py +0 -0
  32. {perception-0.7.6 → perception-0.8.0}/perception/hashers/image/pdq.py +0 -0
  33. {perception-0.7.6 → perception-0.8.0}/perception/hashers/image/phash.py +0 -0
  34. {perception-0.7.6 → perception-0.8.0}/perception/hashers/image/wavelet.py +0 -0
  35. {perception-0.7.6 → perception-0.8.0}/perception/hashers/tools.py +0 -0
  36. {perception-0.7.6 → perception-0.8.0}/perception/hashers/video/framewise.py +0 -0
  37. {perception-0.7.6 → perception-0.8.0}/perception/hashers/video/tmk.py +0 -0
  38. {perception-0.7.6 → perception-0.8.0}/perception/py.typed +0 -0
  39. {perception-0.7.6 → perception-0.8.0}/perception/testing/__init__.py +0 -0
  40. {perception-0.7.6 → perception-0.8.0}/perception/testing/images/README.md +0 -0
  41. {perception-0.7.6 → perception-0.8.0}/perception/testing/images/image1.jpg +0 -0
  42. {perception-0.7.6 → perception-0.8.0}/perception/testing/images/image10.jpg +0 -0
  43. {perception-0.7.6 → perception-0.8.0}/perception/testing/images/image2.jpg +0 -0
  44. {perception-0.7.6 → perception-0.8.0}/perception/testing/images/image3.jpg +0 -0
  45. {perception-0.7.6 → perception-0.8.0}/perception/testing/images/image4.jpg +0 -0
  46. {perception-0.7.6 → perception-0.8.0}/perception/testing/images/image5.jpg +0 -0
  47. {perception-0.7.6 → perception-0.8.0}/perception/testing/images/image6.jpg +0 -0
  48. {perception-0.7.6 → perception-0.8.0}/perception/testing/images/image7.jpg +0 -0
  49. {perception-0.7.6 → perception-0.8.0}/perception/testing/images/image8.jpg +0 -0
  50. {perception-0.7.6 → perception-0.8.0}/perception/testing/images/image9.jpg +0 -0
  51. {perception-0.7.6 → perception-0.8.0}/perception/testing/logos/README.md +0 -0
  52. {perception-0.7.6 → perception-0.8.0}/perception/testing/logos/logoipsum.png +0 -0
  53. {perception-0.7.6 → perception-0.8.0}/perception/testing/videos/README.md +0 -0
  54. {perception-0.7.6 → perception-0.8.0}/perception/testing/videos/expected_tmk.json.gz +0 -0
  55. {perception-0.7.6 → perception-0.8.0}/perception/testing/videos/rgb.m4v +0 -0
  56. {perception-0.7.6 → perception-0.8.0}/perception/testing/videos/v1.m4v +0 -0
  57. {perception-0.7.6 → perception-0.8.0}/perception/testing/videos/v2.m4v +0 -0
  58. {perception-0.7.6 → perception-0.8.0}/perception/testing/videos/v2s.mov +0 -0
  59. {perception-0.7.6 → perception-0.8.0}/perception/tools.py +0 -0
  60. {perception-0.7.6 → perception-0.8.0}/perception/utils.py +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.3
2
2
  Name: Perception
3
- Version: 0.7.6
3
+ Version: 0.8.0
4
4
  Summary: Perception provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use.
5
5
  License: Apache-2.0
6
6
  Author: Thorn
@@ -11,6 +11,7 @@ Classifier: Programming Language :: Python :: 3
11
11
  Classifier: Programming Language :: Python :: 3.10
12
12
  Classifier: Programming Language :: Python :: 3.11
13
13
  Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
14
15
  Provides-Extra: benchmarking
15
16
  Provides-Extra: experimental
16
17
  Provides-Extra: matching
@@ -4,7 +4,7 @@ import random
4
4
  import cv2
5
5
  import numpy as np
6
6
 
7
- import perception.experimental.local_descriptor_deduplication as ldd
7
+ import perception.local_descriptor_deduplication as ldd
8
8
 
9
9
  LOGGER = logging.getLogger(__name__)
10
10
 
@@ -5,9 +5,9 @@ from .image.opencv import BlockMean, ColorMoment, MarrHildreth
5
5
  from .image.phash import PHash, PHashF, PHashU8
6
6
  from .image.wavelet import WaveletHash
7
7
  from .video.framewise import FramewiseHasher
8
- from .video.scenes import SimpleSceneDetection
9
8
  from .video.tmk import TMKL1, TMKL2
10
9
 
10
+
11
11
  __all__ = [
12
12
  "ImageHasher",
13
13
  "VideoHasher",
@@ -23,5 +23,4 @@ __all__ = [
23
23
  "TMKL2",
24
24
  "PHashU8",
25
25
  "PHashF",
26
- "SimpleSceneDetection",
27
26
  ]
@@ -319,21 +319,6 @@ class VideoHasher(Hasher):
319
319
  state: The state dictionary at the end of processing.
320
320
  """
321
321
 
322
- def compute_with_timestamps(
323
- self, filepath, errors="raise", hash_format="base64", **kwargs
324
- ):
325
- scenes: list[dict] = []
326
- hashes = self.compute(filepath, errors, hash_format, scenes, **kwargs)
327
- return [
328
- {
329
- "hash": hashes[i],
330
- "start_timestamp": scene.get("start_timestamp"),
331
- "end_timestamp": scene.get("end_timestamp"),
332
- "frame_index": scene.get("frame_index"),
333
- }
334
- for i, scene in enumerate(scenes)
335
- ]
336
-
337
322
  def compute(
338
323
  self,
339
324
  filepath,
@@ -0,0 +1,4 @@
1
+ from .framewise import FramewiseHasher
2
+ from .tmk import TMKL1, TMKL2
3
+
4
+ __all__ = ["FramewiseHasher", "TMKL1", "TMKL2"]
@@ -10,7 +10,7 @@ import pandas as pd
10
10
  import tqdm
11
11
  import typing_extensions
12
12
 
13
- import perception.experimental.approximate_deduplication as ad
13
+ import perception.approximate_deduplication as ad
14
14
  import perception.hashers.tools as pht
15
15
 
16
16
  LOGGER = logging.getLogger(__name__)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "Perception"
3
- version = "0.7.6"
3
+ version = "0.8.0"
4
4
  description = "Perception provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use."
5
5
  authors = ["Thorn <info@wearethorn.org>"]
6
6
  license = "Apache License 2.0"
@@ -3,9 +3,8 @@ from setuptools import setup
3
3
 
4
4
  packages = \
5
5
  ['perception',
6
+ 'perception.approximate_deduplication',
6
7
  'perception.benchmarking',
7
- 'perception.experimental',
8
- 'perception.experimental.ann',
9
8
  'perception.hashers',
10
9
  'perception.hashers.image',
11
10
  'perception.hashers.video',
@@ -38,7 +37,7 @@ extras_require = \
38
37
 
39
38
  setup_kwargs = {
40
39
  'name': 'Perception',
41
- 'version': '0.7.6',
40
+ 'version': '0.8.0',
42
41
  'description': 'Perception provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use.',
43
42
  'long_description': "# perception ![ci](https://github.com/thorn-oss/perception/workflows/ci/badge.svg)\n\n`perception` provides flexible, well-documented, and comprehensively tested tooling for perceptual hashing research, development, and production use. See [the documentation](https://perception.thorn.engineering/en/latest/) for details.\n\n## Background\n\n`perception` was initially developed at [Thorn](https://www.thorn.org) as part of our work to eliminate child sexual abuse material from the internet. For more information on the issue, check out [our CEO's TED talk](https://www.thorn.org/blog/time-is-now-eliminate-csam/).\n\n## Getting Started\n\n### Installation\n\n`pip install perception`\n\n### Hashing\n\nHashing with different functions is simple with `perception`.\n\n```python\nfrom perception import hashers\n\nfile1, file2 = 'test1.jpg', 'test2.jpg'\nhasher = hashers.PHash()\nhash1, hash2 = hasher.compute(file1), hasher.compute(file2)\ndistance = hasher.compute_distance(hash1, hash2)\n```\n\n### Examples\n\nSee below for end-to-end examples for common use cases for perceptual hashes.\n\n- [Detecting child sexual abuse material](https://perception.thorn.engineering/en/latest/examples/detecting_csam.html)\n- [Deduplicating media](https://perception.thorn.engineering/en/latest/examples/deduplication.html)\n- [Benchmarking perceptual hashes](https://perception.thorn.engineering/en/latest/examples/benchmarking.html)\n\n## Supported Hashing Algorithms\n\n`perception` currently ships with:\n\n- pHash (DCT hash) (`perception.hashers.PHash`)\n- Facebook's PDQ Hash (`perception.hashers.PDQ`)\n- dHash (difference hash) (`perception.hashers.DHash`)\n- aHash (average hash) (`perception.hashers.AverageHash`)\n- Marr-Hildreth (`perception.hashers.MarrHildreth`)\n- Color Moment (`perception.hashers.ColorMoment`)\n- Block Mean (`perception.hashers.BlockMean`)\n- wHash (wavelet hash) (`perception.hashers.WaveletHash`)\n\n## Contributing\n\nTo work on the project, 
start by doing the following.\n\n```bash\n# Install local dependencies for\n# code completion, etc.\nmake init\n\n- To do a (close to) comprehensive check before committing code, you can use `make precommit`.\n\nTo implement new features, please first file an issue proposing your change for discussion.\n\nTo report problems, please file an issue with sample code, expected results, actual results, and a complete traceback.\n\n## Alternatives\n\nThere are other packages worth checking out to see if they meet your needs for perceptual hashing. Here are some\nexamples.\n\n- [dedupe](https://github.com/dedupeio/dedupe)\n- [imagededup](https://idealo.github.io/imagededup/)\n- [ImageHash](https://github.com/JohannesBuchner/imagehash)\n- [PhotoHash](https://github.com/bunchesofdonald/photohash)\n```\n",
44
43
  'author': 'Thorn',
File without changes
File without changes
@@ -1,5 +0,0 @@
1
- from .framewise import FramewiseHasher
2
- from .scenes import SimpleSceneDetection
3
- from .tmk import TMKL1, TMKL2
4
-
5
- __all__ = ["FramewiseHasher", "TMKL1", "TMKL2", "SimpleSceneDetection"]
@@ -1,238 +0,0 @@
1
- import logging
2
-
3
- import cv2
4
- import numpy as np
5
-
6
- from ...utils import flatten
7
- from .. import tools
8
- from ..hasher import VideoHasher
9
- from ..image.phash import PHashU8
10
- from .tmk import TMKL1
11
-
12
- logger = logging.getLogger(__name__)
13
-
14
-
15
- class SimpleSceneDetection(VideoHasher):
16
- """The SimpleSceneDetection hasher is a wrapper around other video hashers
17
- to create separate hashes for different scenes / shots in a video. It works
18
- by shrinking each frame, blurring it, and doing a simple delta with the previous
19
- frame. If they are different, this marks the start of a new scene. In addition,
20
- this wrapper will also remove letterboxing from videos by checking for solid
21
- black areas on the edges of the frame.
22
-
23
- Args:
24
- base_hasher: The base video hasher to use for each scene.
25
- interscene_threshold: The distance threshold between sequential scenes that
26
- new hashes must meet to be included (this is essentially for deduplication)
27
- min_frame_size: The minimum frame size to use for computing hashes. This is
28
- relevant for letterbox detection as black frames will tend to be completely
29
- "cropped" and make the frame very small.
30
- max_scene_length: The maximum length of a single scene.
31
- similarity_threshold: The threshold for detecting whether two frames are
32
- different enough to constitute a new scene.
33
- """
34
-
35
- returns_multiple = True
36
-
37
- def __init__(
38
- self,
39
- base_hasher: VideoHasher | None = None,
40
- interscene_threshold=None,
41
- min_frame_size=50,
42
- similarity_threshold=0.95,
43
- max_scene_length=None,
44
- ):
45
- if base_hasher is None:
46
- base_hasher = TMKL1(
47
- frames_per_second=2,
48
- frame_hasher=PHashU8(
49
- exclude_first_term=False, freq_shift=1, hash_size=12
50
- ),
51
- distance_metric="euclidean",
52
- dtype="uint8",
53
- norm=None,
54
- quality_threshold=90,
55
- )
56
- if interscene_threshold is None:
57
- interscene_threshold = 50
58
- if interscene_threshold is not None and base_hasher.returns_multiple:
59
- raise ValueError(
60
- "Interscene thresholds not supported for hashers returning multiple hashes."
61
- )
62
- self.base_hasher = base_hasher
63
- self.frames_per_second = base_hasher.frames_per_second
64
- self.distance_metric = base_hasher.distance_metric
65
- self.dtype = base_hasher.dtype
66
- self.hash_length = base_hasher.hash_length
67
- self.max_scene_length = max_scene_length
68
- self.interscene_threshold = interscene_threshold
69
- self.min_frame_size = min_frame_size
70
- self.similarity_threshold = similarity_threshold
71
-
72
- def compute_batches(
73
- self, filepath, errors="raise", hash_format="base64", batch_size=10
74
- ):
75
- """Compute a hash for a video at a given filepath and
76
- yield hashes in a given batch size.
77
-
78
- Args:
79
- filepath: Path to video file
80
- errors: One of "raise", "ignore", or "warn". Passed
81
- to perception.hashers.tools.read_video.
82
- hash_format: The hash format to use when returning hashes.
83
- batch_size: The minimum number of hashes to include in each batch.
84
- """
85
-
86
- def convert(scenes):
87
- if hash_format == "vector":
88
- return scenes
89
- if self.base_hasher.returns_multiple:
90
- return [
91
- (
92
- [
93
- self.vector_to_string(h, hash_format=hash_format)
94
- for h in scene["hash"]
95
- ],
96
- scene["frames"],
97
- )
98
- for scene in scenes
99
- ]
100
- return [
101
- (
102
- self.vector_to_string(scene["hash"], hash_format=hash_format),
103
- scene["frames"],
104
- )
105
- for scene in scenes
106
- ]
107
-
108
- state = None
109
- for frame, frame_index, frame_timestamp in tools.read_video(
110
- filepath=filepath, frames_per_second=self.frames_per_second, errors=errors
111
- ):
112
- state = self.process_frame(
113
- frame=frame,
114
- frame_index=frame_index,
115
- frame_timestamp=frame_timestamp,
116
- state=state,
117
- batch_mode=True,
118
- )
119
- if len(state["scenes"]) >= batch_size:
120
- yield convert(state["scenes"])
121
- state["scenes"] = []
122
- assert state is not None
123
- if state["substate"]:
124
- self.handle_scene(state)
125
- if state["scenes"]:
126
- yield convert(state["scenes"])
127
-
128
- def handle_scene(self, state, frame_timestamp=None, frame_index=None):
129
- subhash = self.base_hasher.hash_from_final_state(state["substate"])
130
- if subhash is not None and (
131
- self.base_hasher.returns_multiple
132
- or (
133
- self.interscene_threshold is None
134
- or not state["scenes"]
135
- or self.compute_distance(state["scenes"][-1]["hash"], subhash)
136
- > self.interscene_threshold
137
- )
138
- ):
139
- # Persist the scene's hash, frames, start timestamp, and end timestamp.
140
- # If frame_timestamp is None, we can assume we've reached the end of
141
- # the video and should use the end timestamp instead
142
- state["scenes"].append(
143
- {
144
- "hash": subhash,
145
- "frames": state["frames"],
146
- "start_timestamp": state["start"],
147
- "end_timestamp": frame_timestamp or state.get("end"),
148
- "frame_index": state["frame_index"],
149
- }
150
- )
151
- state["substate"] = None
152
- state["bounds"] = None
153
- state["frames"] = []
154
- state["previous_frame"] = None
155
- if frame_timestamp is not None:
156
- state["start"] = frame_timestamp
157
- if frame_index is not None:
158
- state["frame_index"] = frame_index
159
-
160
- def crop(self, frame, bounds):
161
- # Check to see we have set bounds for this scene yet.
162
- if not bounds:
163
- # We don't have bounds, so we'll set them.
164
- bounds = tools.unletterbox(frame)
165
- # If the bounds come back invalid (i.e., the frame is too small)
166
- # or no bounds are found (i.e., the frame is all back), we
167
- # return None.
168
- if (
169
- bounds is None
170
- or min(bounds[0][1] - bounds[0][0], bounds[1][1] - bounds[1][0])
171
- < self.min_frame_size
172
- ):
173
- return None, None, None
174
- (x1, x2), (y1, y2) = bounds
175
- cropped = np.ascontiguousarray(frame[y1:y2, x1:x2])
176
- current = cv2.resize(cv2.cvtColor(cropped, cv2.COLOR_RGB2GRAY), (128, 128))
177
- current = cv2.blur(current, ksize=(4, 4))
178
- return cropped, current, bounds
179
-
180
- def process_frame(
181
- self, frame, frame_index, frame_timestamp, state=None, batch_mode=False
182
- ):
183
- if not state:
184
- state = {
185
- "previous_frame": None,
186
- "substate": None,
187
- "start": 0,
188
- "bounds": None,
189
- "frames": [],
190
- "scenes": [],
191
- "frame_index": frame_index,
192
- }
193
- cropped, current, state["bounds"] = self.crop(frame, state["bounds"])
194
- if cropped is None:
195
- # A good crop was not found so we set the start of the scene to this
196
- # point and continue on to the next frame. This will repeat until we
197
- # find appropriate bounds.
198
- state["start"] = frame_timestamp
199
- return state
200
-
201
- # Check if we have a previous frame to compare the
202
- # current frame to.
203
- if state["previous_frame"] is not None:
204
- # Compute similarity between the previous frame and the
205
- # current frame.
206
- similarity = 1 - np.abs(
207
- state["previous_frame"].astype("float32") - current.astype("float32")
208
- ).sum() / (255 * 128**2)
209
- # If the previous frame and the current one are too dissimilar, we've started
210
- # a new scene and we should handle it appropriately
211
- if similarity < self.similarity_threshold or (
212
- self.max_scene_length is not None
213
- and frame_timestamp - state["start"] > self.max_scene_length
214
- ):
215
- self.handle_scene(state, frame_timestamp, frame_index)
216
- cropped, current, state["bounds"] = self.crop(frame, state["bounds"])
217
- if cropped is None:
218
- # See comment above about invalid crops.
219
- state["start"] = frame_timestamp
220
- return state
221
-
222
- state["previous_frame"] = current
223
- try:
224
- state["substate"] = self.base_hasher.process_frame(
225
- cropped, frame_index, frame_timestamp, state=state["substate"]
226
- )
227
- if batch_mode:
228
- state["frames"].append((frame, frame_index, frame_timestamp))
229
- except Exception as e:
230
- logger.warning("An error occurred while processing a frame: %s", str(e))
231
- return state
232
-
233
- def hash_from_final_state(self, state):
234
- if state["substate"]:
235
- self.handle_scene(state)
236
- if not self.base_hasher.returns_multiple:
237
- return [h["hash"] for h in state["scenes"]]
238
- return flatten([scene["hash"] for scene in state["scenes"]])
File without changes
File without changes
File without changes