PyPI - Perception - Versions diffs - 0.8.3__cp313-cp313-win_amd64.whl - Mend

Perception 0.8.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

perception/__init__.py +14 -0
perception/approximate_deduplication/__init__.py +301 -0
perception/approximate_deduplication/debug.py +239 -0
perception/approximate_deduplication/index.py +433 -0
perception/approximate_deduplication/serve.py +151 -0
perception/benchmarking/__init__.py +23 -0
perception/benchmarking/common.py +653 -0
perception/benchmarking/extensions.c +31202 -0
perception/benchmarking/extensions.cp313-win_amd64.pyd +0 -0
perception/benchmarking/extensions.pyx +112 -0
perception/benchmarking/image.py +204 -0
perception/benchmarking/image_transforms.py +42 -0
perception/benchmarking/video.py +224 -0
perception/benchmarking/video_transforms.py +198 -0
perception/extensions.cp313-win_amd64.pyd +0 -0
perception/extensions.cpp +33687 -0
perception/extensions.pyx +305 -0
perception/hashers/__init__.py +33 -0
perception/hashers/hasher.py +386 -0
perception/hashers/image/__init__.py +17 -0
perception/hashers/image/average.py +35 -0
perception/hashers/image/dhash.py +30 -0
perception/hashers/image/opencv.py +63 -0
perception/hashers/image/pdq.py +34 -0
perception/hashers/image/phash.py +109 -0
perception/hashers/image/wavelet.py +59 -0
perception/hashers/tools.py +1178 -0
perception/hashers/video/__init__.py +4 -0
perception/hashers/video/framewise.py +102 -0
perception/hashers/video/tmk.py +219 -0
perception/local_descriptor_deduplication.py +708 -0
perception/py.typed +0 -0
perception/testing/__init__.py +245 -0
perception/testing/images/README.md +13 -0
perception/testing/images/image1.jpg +0 -0
perception/testing/images/image10.jpg +0 -0
perception/testing/images/image2.jpg +0 -0
perception/testing/images/image3.jpg +0 -0
perception/testing/images/image4.jpg +0 -0
perception/testing/images/image5.jpg +0 -0
perception/testing/images/image6.jpg +0 -0
perception/testing/images/image7.jpg +0 -0
perception/testing/images/image8.jpg +0 -0
perception/testing/images/image9.jpg +0 -0
perception/testing/logos/README.md +4 -0
perception/testing/logos/logoipsum.png +0 -0
perception/testing/videos/README.md +6 -0
perception/testing/videos/expected_tmk.json.gz +0 -0
perception/testing/videos/extra_channel_attached_pic.mp4 +0 -0
perception/testing/videos/extra_channel_attached_pic_audio.mp4 +0 -0
perception/testing/videos/rgb.m4v +0 -0
perception/testing/videos/v1.m4v +0 -0
perception/testing/videos/v2.m4v +0 -0
perception/testing/videos/v2s.mov +0 -0
perception/tools.py +379 -0
perception/utils.py +2 -0
perception-0.8.3.dist-info/DELVEWHEEL +1 -0
perception-0.8.3.dist-info/METADATA +115 -0
perception-0.8.3.dist-info/RECORD +62 -0
perception-0.8.3.dist-info/WHEEL +4 -0
perception-0.8.3.dist-info/licenses/LICENSE +191 -0
perception.libs/msvcp140-a4c2229bdc2a2a630acdc095b4d86008.dll +0 -0

perception/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+"""""" # start delvewheel patch
+def _delvewheel_patch_1_12_0():
+    # Imported locally so that `os` is not left behind in the package namespace.
+    import os
+    # Look for a `perception.libs` directory next to the package directory and,
+    # if it exists, register it as a DLL search directory.
+    if os.path.isdir(libs_dir := os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'perception.libs'))):
+        os.add_dll_directory(libs_dir)
+# Run the patch once at import time, then remove the helper from the namespace.
+_delvewheel_patch_1_12_0()
+del _delvewheel_patch_1_12_0
+# end delvewheel patch
+from importlib import metadata
+__version__ = metadata.version("perception")

perception/approximate_deduplication/__init__.py ADDED Viewed

@@ -0,0 +1,301 @@
+import logging
+import math
+import os.path as op
+import typing
+import faiss
+import networkit as nk
+import numpy as np
+import tqdm
+import typing_extensions
+LOGGER = logging.getLogger(__name__)
+DEFAULT_PCT_PROBE = 0
+# For faiss training on datasets larger than 50,000 vectors, we take a random sub-sample.
+TRAIN_LARGE_SIZE: int = 50_000
+class ClusterAssignment(typing_extensions.TypedDict):
+    """A single node-to-cluster assignment, as returned by pairs_to_clusters."""
+    # Integer index of the cluster the node was placed in.
+    cluster: int
+    # The caller-supplied node id (e.g., a filepath).
+    id: typing.Any
+def build_index(
+    X: np.ndarray,
+    pct_probe: float = DEFAULT_PCT_PROBE,
+    approximate: bool = True,
+    use_gpu: bool = True,
+):
+    """Build a FAISS index from an array of vectors.
+    Args:
+        X: The vectors to add to the index. They are cast to float32
+            before indexing. If None, no index is built.
+        pct_probe: The minimum fraction of nearest lists to search. If
+            the product of pct_probe and the number of lists is less
+            than 1, one list will be searched. Only used when
+            approximate is True.
+        approximate: Whether to build an approximate (IndexIVFFlat) or
+            exact (IndexFlat) index.
+        use_gpu: Whether to try building the approximate index on GPU.
+            If the GPU setup raises AttributeError (presumably a faiss
+            build without GPU support), the index is built on CPU
+            instead. Only used when approximate is True.
+    Returns:
+        The populated FAISS index, or None if X is None. An index built
+        on GPU is converted back to a CPU index before it is returned.
+    """
+    if X is None:
+        return None
+    X = X.astype("float32")
+    d = X.shape[1]
+    if approximate:
+        ntotal = X.shape[0]
+        # Number of inverted lists: the smaller of 4 * sqrt(n) and n / 39,
+        # but never fewer than one.
+        nlist = int(max(min(4 * np.sqrt(ntotal), ntotal / 39), 1))
+        quantizer = faiss.IndexFlatL2(d)
+        index = faiss.IndexIVFFlat(quantizer, d, nlist)
+        gpu = False
+        if use_gpu:
+            try:
+                res = faiss.StandardGpuResources()
+                index = faiss.index_cpu_to_gpu(res, 0, index)
+                gpu = True
+            except AttributeError:
+                LOGGER.info("Building approximate FAISS index on CPU.")
+        if X.shape[0] > TRAIN_LARGE_SIZE:
+            # Train on a random sample (without replacement) of TRAIN_LARGE_SIZE
+            # vectors or 39 points per centroid, whichever is larger.
+            # 39 points per centroid is the minimum for not getting warnings.
+            # https://github.com/facebookresearch/faiss/wiki/FAQ#can-i-ignore-warning-clustering-xxx-points-to-yyy-centroids
+            sample_size = max(39 * nlist, TRAIN_LARGE_SIZE)
+            index.train(X[np.random.choice(X.shape[0], sample_size, replace=False)])
+        else:
+            index.train(X)
+        # Add the vectors in fixed-size batches rather than in a single call.
+        batch_size = 10_000
+        for i in range(0, X.shape[0], batch_size):
+            index.add(X[i : i + batch_size])
+        if gpu:
+            index = faiss.index_gpu_to_cpu(index)
+        # Always probe at least one list, even when pct_probe is 0.
+        nprobe = max(math.ceil(pct_probe * nlist), 1)
+        faiss.ParameterSpace().set_index_parameter(index, "nprobe", nprobe)
+    else:
+        index = faiss.IndexFlat(d)
+        index.add(X)
+    return index
+def compute_euclidean_pairwise_duplicates_approx(
+    X,
+    counts,
+    threshold,
+    minimum_overlap,
+    Y=None,
+    y_counts=None,
+    pct_probe=0.1,
+    use_gpu: bool = True,
+    faiss_cache_path: str | None = None,
+    show_progress: bool = False,
+):
+    """Provides the same result as perception.extensions.compute_pairwise_duplicates_simple
+    but uses an approximate search instead of an exhaustive search, which can dramatically reduce
+    processing time.
+    Args:
+        X: An array of vectors to compute pairs for. These are the vectors
+            that are indexed.
+        counts: An array of counts of vectors for separate files in X
+            (must add up to the length of X).
+        threshold: The threshold for a match as a euclidean distance.
+        minimum_overlap: The minimum overlap between two files to qualify as a match.
+            The overlap of a (query, match) pair is the smaller of the fraction
+            of the query file's vectors that hit the match file and the fraction
+            of the match file's vectors that were hit.
+        Y: If provided, the vectors in Y are used as queries against the
+            index built from X, instead of querying X against itself.
+        y_counts: An array of counts of vectors for separate files in Y.
+            Must be provided if and only if Y is provided.
+        pct_probe: The minimum percentage of sublists to search for matches. The larger the
+            value, the more exhaustive the search.
+        use_gpu: Passed through to build_index when a new index is built.
+        faiss_cache_path: If provided load any existing faiss index from this path, and if
+            it does not exist then save the generated faiss index to the path.
+        show_progress: Whether or not to show a progress bar while computing pairs
+    Returns:
+        A de-duplicated list of pairs of matching file indexes, in no
+        particular order. If Y is None each pair is a sorted tuple of two
+        indexes into counts and self matches are excluded. Otherwise each
+        pair is (index into y_counts, index into counts).
+    """
+    assert (
+        counts.sum() == X.shape[0]
+    ), "Length of counts incompatible with vectors shape."
+    assert (Y is None) == (
+        y_counts is None
+    ), "Must provide both or neither for y, y_counts."
+    if X.dtype != "float32":
+        # Only make the copy if we have to.
+        X = X.astype("float32")
+    if Y is not None and Y.dtype != "float32":
+        # Only make the copy if we have to.
+        Y = Y.astype("float32")
+    # lookup[i] is the index of the file (position in counts) that row i of X belongs to.
+    lookup_ = []
+    for idx, count in enumerate(counts):
+        lookup_.extend([idx] * count)
+    lookup = np.array(lookup_)
+    if faiss_cache_path is not None and op.exists(faiss_cache_path):
+        LOGGER.debug("Loading cached FAISS index from %s", faiss_cache_path)
+        index = faiss.read_index(faiss_cache_path)
+        assert (
+            X.shape[0] == index.ntotal
+        ), "Cached FAISS index does not match provided X."
+    else:
+        LOGGER.debug("Building FAISS index.")
+        index = build_index(X=X, pct_probe=pct_probe, approximate=True, use_gpu=use_gpu)
+        if faiss_cache_path is not None:
+            faiss.write_index(index, faiss_cache_path)
+    LOGGER.debug("FAISS index ready, start aprox search")
+    pairs = []
+    # Only use y_counts if present.
+    if y_counts is None:
+        iterator_counts = counts
+        M = X
+    else:
+        iterator_counts = y_counts
+        M = Y
+    # Walk the query files one at a time: `end` is the exclusive end row of the
+    # file's vectors in M, `length` its vector count and `query` its file index.
+    for end, length, query in tqdm.tqdm(
+        zip(iterator_counts.cumsum(), iterator_counts, range(len(iterator_counts))),
+        total=len(iterator_counts),
+        disable=not show_progress,
+        desc="Vectors",
+    ):
+        if length == 0:
+            continue
+        Xq = M[end - length : end]
+        # The threshold is squared before searching, presumably because the
+        # L2 index compares squared distances -- TODO confirm against faiss docs.
+        lims, _, idxs = index.range_search(Xq, threshold**2)
+        lims = lims.astype("int32")
+        # Files in X that received at least one hit from this query file.
+        matched = [
+            match
+            for match in np.unique(lookup[list(set(idxs))])  # type: ignore
+            if match != query
+            or Y is not None  # Protect self matches if Y is not present.
+        ]
+        # Per matched file: which query vectors hit it, and which of its own
+        # vectors were hit.
+        query_in_match: typing.Mapping[int, set] = {m: set() for m in matched}
+        match_in_query: typing.Mapping[int, set] = {m: set() for m in matched}
+        for query_idx in range(length):
+            # idxs[lims[i]:lims[i + 1]] holds the hits for the i-th query vector.
+            for match_idx in idxs[lims[query_idx] : lims[query_idx + 1]]:
+                match = lookup[match_idx]
+                if (
+                    match == query and Y is None
+                ):  # Protect self matches if Y is not present.
+                    continue
+                match_in_query[match].add(match_idx)
+                query_in_match[match].add(query_idx)
+        for match in matched:
+            overlap = min(
+                [
+                    len(query_in_match[match]) / length,
+                    len(match_in_query[match]) / counts[match],
+                ]
+            )
+            if overlap >= minimum_overlap and overlap > 0:
+                if Y is None:
+                    # Sort so that (a, b) and (b, a) collapse to one pair below.
+                    pairs.append(tuple(sorted([query, match])))
+                else:
+                    pairs.append(tuple([query, match]))
+    return list(set(pairs))
+def pairs_to_clusters(
+    ids: typing.Iterable[str],
+    pairs: typing.Iterable[tuple[str, str]],
+    strictness: typing_extensions.Literal[
+        "clique", "community", "component"
+    ] = "clique",
+    max_clique_batch_size: int = 1000,
+) -> list[ClusterAssignment]:
+    """Given a list of pairs of matching files, group the files into
+    clusters. How tightly connected a cluster must be is set by strictness.
+    Args:
+        ids: A list of node ids (e.g., filepaths).
+        pairs: A list of pairs of node ids, each pair is assumed to have an edge
+        strictness: The level at which groups will be clustered. "component"
+            means that all clusters will be connected components. "community"
+            will select clusters of files within components that are clustered
+            together. "clique" will result in clusters where every file is
+            connected to every other file.
+        max_clique_batch_size: The maximum batch size for identifying
+            cliques. Each community is split into consecutive batches of at
+            most this many nodes and cliques are searched for within each
+            batch separately. Only used when strictness is "clique".
+    Returns:
+        A list of cluster assignments (dicts with id and cluster
+        entries).
+    Raises:
+        AssertionError: If strictness is not one of the supported values.
+        KeyError: If a pair references an id that is not in ids.
+    """
+    assert strictness in ["component", "community", "clique"], "Invalid strictness."
+    list_ids = list(ids)
+    # Graph nodes are the integer positions of the ids in list_ids.
+    id_to_node_map = {v: i for i, v in enumerate(list_ids)}
+    node_to_id_map = {v: k for k, v in id_to_node_map.items()}
+    LOGGER.debug("Building graph.")
+    graph = nk.Graph(len(list_ids))
+    # Build a set first so that repeated pairs add only one edge.
+    node_pairs = {(id_to_node_map[pair[0]], id_to_node_map[pair[1]]) for pair in pairs}
+    for node_pair in node_pairs:
+        graph.addEdge(node_pair[0], node_pair[1])
+    assignments: list[ClusterAssignment] = []
+    cluster_index = 0
+    cc_query = nk.components.ConnectedComponents(graph)
+    cc_query.run()
+    components = cc_query.getComponents()
+    for component in components:
+        LOGGER.debug("Got component with size: %s", len(component))
+        if strictness == "component":
+            assignments.extend(
+                [{"id": node_to_id_map[n], "cluster": cluster_index} for n in component]
+            )
+            cluster_index += 1
+            continue
+        # Map between node values for a connected component
+        # NOTE(review): assumes the compacted subgraph numbers its nodes in the
+        # same order as `component` -- confirm against the networkit docs.
+        component_node_map = dict(enumerate(component))
+        cc_sub_graph = nk.graphtools.subgraphFromNodes(graph, component, compact=True)
+        algo = nk.community.PLP(cc_sub_graph)
+        algo.run()
+        communities = algo.getPartition()
+        community_map = communities.subsetSizeMap()
+        for community, size in community_map.items():
+            LOGGER.debug("Got community with size: %s", size)
+            community_members = list(
+                communities.getMembers(community)
+            )  # Need to do this to do batching.
+            # Translate compacted subgraph node ids back to nodes of `graph`.
+            community_members = [component_node_map[i] for i in community_members]
+            if strictness == "community":
+                assignments.extend(
+                    [
+                        {"id": node_to_id_map[n], "cluster": cluster_index}
+                        for n in community_members
+                    ]
+                )
+                cluster_index += 1
+                continue
+            for start in range(0, len(community_members), max_clique_batch_size):
+                community_nodes = community_members[
+                    start : start + max_clique_batch_size
+                ]
+                LOGGER.debug("Creating subgraph with %s nodes.", len(community_nodes))
+                # Map between node values for a community
+                # NOTE(review): same ordering assumption as component_node_map,
+                # here relative to `community_nodes` -- confirm.
+                community_node_map = dict(enumerate(community_nodes))
+                subgraph = nk.graphtools.subgraphFromNodes(
+                    graph, community_nodes, compact=True
+                )
+                # Repeatedly take the first clique reported with maximumOnly=True,
+                # give it its own cluster and remove its nodes, until the
+                # subgraph has no nodes left.
+                while subgraph.numberOfNodes() > 0:
+                    LOGGER.debug("Subgraph size: %s", subgraph.numberOfNodes())
+                    clique = nk.clique.MaximalCliques(subgraph, maximumOnly=True)
+                    clique.run()
+                    clique_members = clique.getCliques()[0]
+                    assignments.extend(
+                        [
+                            {
+                                "id": node_to_id_map[community_node_map[n]],
+                                "cluster": cluster_index,
+                            }
+                            for n in clique_members
+                        ]
+                    )
+                    cluster_index += 1
+                    for n in clique_members:
+                        subgraph.removeNode(n)
+    return assignments

perception/approximate_deduplication/debug.py ADDED Viewed

@@ -0,0 +1,239 @@
+import logging
+import random
+import cv2
+import numpy as np
+import perception.local_descriptor_deduplication as ldd
+LOGGER = logging.getLogger(__name__)
+# Set a fixed size for drawing, we don't have the real descriptor size.
+KEYPOINT_SIZE: int = 8
+def vizualize_pair(
+    features_1,
+    features_2,
+    ratio: float,
+    match_metadata=None,
+    local_path_col: str | None = None,
+    sanitized: bool = False,
+    include_all_points=False,
+    circle_size=KEYPOINT_SIZE,
+):
+    """Given two rows from a reference df vizualize their overlap.
+    Currently recalcs overlap using cv2 default logic.
+    Args:
+        features_1: The row from a reference df for one image.
+        features_2: The row from a reference df for the other image.
+        ratio: Value for ratio test, suggest re-using value from matching.
+        match_metadata: metadata returned from matching, if None will redo brute force matching.
+        local_path_col: column in df with path to the image. If None will
+            use the index: features_1.name and features_2.name
+        sanitized: if True images themselves will not be rendered, only the points.
+        include_all_points: if True will draw all points, not just matched points.
+        circle_size: size of the circle to draw around keypoints.
+    Returns:
+        An image of the two images concatted together and matching keypoints drawn.
+    """
+    # Set a fixed size for drawing, we don't have the real descriptor size.
+    if local_path_col is not None:
+        features_1_path = features_1[local_path_col]
+        features_2_path = features_2[local_path_col]
+    else:
+        features_1_path = features_1.name
+        features_2_path = features_2.name
+    img1 = np.zeros(
+        (features_1.dimensions[1], features_1.dimensions[0], 1), dtype="uint8"
+    )
+    img2 = np.zeros(
+        (features_2.dimensions[1], features_2.dimensions[0], 1), dtype="uint8"
+    )
+    if not sanitized:
+        try:
+            img1 = ldd.load_and_preprocess(
+                features_1_path, max_size=max(features_1.dimensions), grayscale=False
+            )
+        except Exception:
+            LOGGER.warning("Failed to load image %s", features_1_path)
+        try:
+            img2 = ldd.load_and_preprocess(
+                features_2_path, max_size=max(features_2.dimensions), grayscale=False
+            )
+        except Exception:
+            LOGGER.warning("Failed to load image %s", features_2_path)
+    if match_metadata is not None:
+        img_matched = viz_match_data(
+            features_1,
+            features_2,
+            img1,
+            img2,
+            match_metadata,
+            include_all_points=include_all_points,
+            circle_size=circle_size,
+        )
+    else:
+        LOGGER.warning(
+            """No match_metadata provided, recalculating match points,
+            won't match perception match points."""
+        )
+        img_matched = viz_brute_force(features_1, features_2, img1, img2, ratio=ratio)
+    return img_matched
+def viz_match_data(
+    features_1,
+    features_2,
+    img1,
+    img2,
+    match_metadata,
+    include_all_points=False,
+    circle_size=KEYPOINT_SIZE,
+):
+    """Given match data viz matching points.
+    Args:
+        features_1: The row from a reference df for one image.
+        features_2: The row from a reference df for the other image.
+        img1: cv2 of first image
+        img2: cv2 of second image
+        match_metadata: metadata returned from matching, if None will redo
+            brute force matching.
+        include_all_points: if True will draw all points, not just matched points.
+        circle_size: size of the circle to draw around keypoints.
+    Returns:
+        cv2 img with matching keypoints drawn.
+    """
+    # NOTE: could refactor to put matches in to correct format and use: cv2.drawMatchesKnn,
+    #  but python docs on necessary class not clear.
+    # Pad img1 or img2 vertically with black pixels to match the height of the other image
+    if img1.shape[0] > img2.shape[0]:
+        img2 = np.pad(
+            img2,
+            ((0, img1.shape[0] - img2.shape[0]), (0, 0), (0, 0)),
+            mode="constant",
+            constant_values=0,
+        )
+    elif img1.shape[0] < img2.shape[0]:
+        img1 = np.pad(
+            img1,
+            ((0, img2.shape[0] - img1.shape[0]), (0, 0), (0, 0)),
+            mode="constant",
+            constant_values=0,
+        )
+    # draw two images h concat:
+    img_matched = np.concatenate((img1, img2), axis=1)
+    overlay = img_matched.copy()
+    if include_all_points:
+        # draw all points in kp_1
+        for k in features_1["keypoints"]:
+            new_color = (
+                random.randint(0, 255),
+                random.randint(0, 255),
+                random.randint(0, 255),
+            )
+            # Draw semi transparent circle
+            cv2.circle(img_matched, (int(k[0]), int(k[1])), circle_size, new_color, 1)
+        # draw all points in kp_2
+        for k in features_2["keypoints"]:
+            new_color = (
+                random.randint(0, 255),
+                random.randint(0, 255),
+                random.randint(0, 255),
+            )
+            cv2.circle(
+                img_matched,
+                (int(k[0] + features_1.dimensions[0]), int(k[1])),
+                circle_size,
+                new_color,
+                1,
+            )
+    # draw lines between matching points
+    for i in range(len(match_metadata["final_matched_b_pts"])):
+        new_color = (
+            random.randint(0, 255),
+            random.randint(0, 255),
+            random.randint(0, 255),
+        )
+        a_pt = (
+            int(match_metadata["final_matched_a_pts"][i][0]),
+            int(match_metadata["final_matched_a_pts"][i][1]),
+        )
+        b_pt = (
+            int(match_metadata["final_matched_b_pts"][i][0] + features_1.dimensions[0]),
+            int(match_metadata["final_matched_b_pts"][i][1]),
+        )
+        cv2.circle(img_matched, a_pt, circle_size, new_color, 1)
+        cv2.circle(img_matched, b_pt, circle_size, new_color, 1)
+        cv2.line(
+            img_matched,
+            a_pt,
+            b_pt,
+            new_color,
+            1,
+        )
+    # Re-overlay original image to add some transparency effect to lines and circles.
+    alpha = 0.4  # Transparency factor.
+    # Following line overlays transparent rectangle over the image
+    img_matched = cv2.addWeighted(overlay, alpha, img_matched, 1 - alpha, 0)
+    return img_matched
+def viz_brute_force(features_1, features_2, img1, img2, ratio: float):
+    """
+    Given two rows from a reference df vizualize their overlap.
+    NOTE: It redoes matching using cv2 bruteforce, so will not match the same
+        as the perception matching code.
+    Args:
+        features_1: The row from a reference df for one image.
+        features_2: The row from a reference df for the other image.
+        img1: cv2 of first image
+        img2: cv2 of second image
+        ratio: Value for ratio test, suggest re-using value from matching.
+    Returns:
+        An image of the two images concatted together and matching keypoints drawn.
+    """
+    # Convert numpy keypoints to cv2.KeyPoints
+    kp1_fixed = []
+    for k in features_1["keypoints"]:
+        kp1_fixed.append(cv2.KeyPoint(k[0], k[1], KEYPOINT_SIZE))
+    kp2_fixed = []
+    for k in features_2["keypoints"]:
+        kp2_fixed.append(cv2.KeyPoint(k[0], k[1], KEYPOINT_SIZE))
+    brute_force_matcher = cv2.BFMatcher()
+    kn_matches = brute_force_matcher.knnMatch(
+        features_1["descriptors"], features_2["descriptors"], k=2
+    )
+    # Apply ratio test
+    good = []
+    for nearest_match, next_nearest_match in kn_matches:
+        if nearest_match.distance < ratio * next_nearest_match.distance:
+            good.append([nearest_match])
+    img_matched = cv2.drawMatchesKnn(  # type: ignore[call-overload]
+        img1,
+        kp1_fixed,
+        img2,
+        kp2_fixed,
+        good,
+        None,
+        flags=cv2.DrawMatchesFlags_DRAW_RICH_KEYPOINTS,
+    )
+    return img_matched