photo-stack-finder 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. orchestrator/__init__.py +2 -2
  2. orchestrator/app.py +6 -11
  3. orchestrator/build_pipeline.py +19 -21
  4. orchestrator/orchestrator_runner.py +11 -8
  5. orchestrator/pipeline_builder.py +126 -126
  6. orchestrator/pipeline_orchestrator.py +604 -604
  7. orchestrator/review_persistence.py +162 -162
  8. orchestrator/static/orchestrator.css +76 -76
  9. orchestrator/static/orchestrator.html +11 -5
  10. orchestrator/static/orchestrator.js +3 -1
  11. overlap_metrics/__init__.py +1 -1
  12. overlap_metrics/config.py +135 -135
  13. overlap_metrics/core.py +284 -284
  14. overlap_metrics/estimators.py +292 -292
  15. overlap_metrics/metrics.py +307 -307
  16. overlap_metrics/registry.py +99 -99
  17. overlap_metrics/utils.py +104 -104
  18. photo_compare/__init__.py +1 -1
  19. photo_compare/base.py +285 -285
  20. photo_compare/config.py +225 -225
  21. photo_compare/distance.py +15 -15
  22. photo_compare/feature_methods.py +173 -173
  23. photo_compare/file_hash.py +29 -29
  24. photo_compare/hash_methods.py +99 -99
  25. photo_compare/histogram_methods.py +118 -118
  26. photo_compare/pixel_methods.py +58 -58
  27. photo_compare/structural_methods.py +104 -104
  28. photo_compare/types.py +28 -28
  29. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/METADATA +21 -22
  30. photo_stack_finder-0.1.8.dist-info/RECORD +75 -0
  31. scripts/orchestrate.py +12 -10
  32. utils/__init__.py +4 -3
  33. utils/base_pipeline_stage.py +171 -171
  34. utils/base_ports.py +176 -176
  35. utils/benchmark_utils.py +823 -823
  36. utils/channel.py +74 -74
  37. utils/comparison_gates.py +40 -21
  38. utils/compute_benchmarks.py +355 -355
  39. utils/compute_identical.py +94 -24
  40. utils/compute_indices.py +235 -235
  41. utils/compute_perceptual_hash.py +127 -127
  42. utils/compute_perceptual_match.py +240 -240
  43. utils/compute_sha_bins.py +64 -20
  44. utils/compute_template_similarity.py +1 -1
  45. utils/compute_versions.py +483 -483
  46. utils/config.py +8 -5
  47. utils/data_io.py +83 -83
  48. utils/graph_context.py +44 -44
  49. utils/logger.py +2 -2
  50. utils/models.py +2 -2
  51. utils/photo_file.py +90 -91
  52. utils/pipeline_graph.py +334 -334
  53. utils/pipeline_stage.py +408 -408
  54. utils/plot_helpers.py +123 -123
  55. utils/ports.py +136 -136
  56. utils/progress.py +415 -415
  57. utils/report_builder.py +139 -139
  58. utils/review_types.py +55 -55
  59. utils/review_utils.py +10 -19
  60. utils/sequence.py +10 -8
  61. utils/sequence_clustering.py +1 -1
  62. utils/template.py +57 -57
  63. utils/template_parsing.py +71 -0
  64. photo_stack_finder-0.1.7.dist-info/RECORD +0 -74
  65. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/WHEEL +0 -0
  66. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/entry_points.txt +0 -0
  67. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/licenses/LICENSE +0 -0
  68. {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/top_level.txt +0 -0
@@ -6,18 +6,41 @@ import random
6
6
 
7
7
  from .config import CONFIG
8
8
  from .models import IdenticalGroup, ReviewType
9
- from .photo_file import PhotoFile, pick_exemplar_from_class
9
+ from .photo_file import (
10
+ PhotoFile,
11
+ pick_exemplar_from_class,
12
+ )
10
13
  from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
11
14
  from .ports import InputPort, OutputPort
12
15
  from .review_utils import build_identical_group
16
+ from .template_parsing import INDEX_T
13
17
 
14
18
 
15
- class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[PhotoFile]]):
16
- # FIXME : Add docstring
17
- # TODO: Update to incorporate digit parsing and output template bins
19
+ class ComputeIdentical(
20
+ PipelineStage[
21
+ list[PhotoFile], # WorkItem: SHA bins
22
+ list[PhotoFile], # Accumulator: flat list of exemplars during processing
23
+ dict[str, list[tuple[INDEX_T, PhotoFile]]], # Result: template bins after finalise()
24
+ ]
25
+ ):
26
+ """Pipeline stage that detects byte-identical files and outputs template bins.
27
+
28
+ This stage consumes SHA256 bins, identifies byte-identical files within each bin,
29
+ picks exemplars from non-identical files, and bins the exemplars by filename template.
30
+
31
+ Input:
32
+ SHA256 bins (from ComputeSHABins)
33
+
34
+ Output:
35
+ Template bins: dict mapping template patterns to (index, photo) tuples
36
+ Example: {"IMG_{P0}.jpg": [("1234", photo1), ("5678", photo2)]}
37
+
38
+ Review data:
39
+ Identical photo groups for user review
40
+ """
41
+
18
42
  # Typed result field - populated after run() completes
19
- # Full tuple: (identical_classes, nonidentical_exemplars)
20
- result: list[PhotoFile]
43
+ result: dict[str, list[tuple[INDEX_T, PhotoFile]]]
21
44
 
22
45
  def __init__(self) -> None:
23
46
  """Initialize identical files detection stage."""
@@ -33,12 +56,14 @@ class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[Phot
33
56
  self.sha_bins_i: InputPort[dict[str, list[PhotoFile]]] = InputPort("sha_bins")
34
57
 
35
58
  # Create output ports
36
- # - nonidentical_o: for next stage (templates)
37
- self.nonidentical_o: OutputPort[list[PhotoFile]] = OutputPort(self, getter=lambda: self.result)
59
+ # - nonidentical_o: template bins for next stage (versions)
60
+ self.nonidentical_o: OutputPort[dict[str, list[tuple[INDEX_T, PhotoFile]]]] = OutputPort(
61
+ self, getter=lambda: self.result
62
+ )
38
63
 
39
64
  def prepare(
40
65
  self,
41
- ) -> PrepareResult[list[PhotoFile], list[PhotoFile]]:
66
+ ) -> PrepareResult[list[PhotoFile], dict[str, list[tuple[INDEX_T, PhotoFile]]]]:
42
67
  """Prepare identical file detection by splitting bins into work items.
43
68
 
44
69
  Reads SHA bins from input port and prepares work items for parallel processing.
@@ -46,7 +71,7 @@ class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[Phot
46
71
  Returns:
47
72
  Tuple of (work_items, accumulator) where:
48
73
  - work_items: List of bins with multiple photos (potential duplicates)
49
- - accumulator: nonidentical_photos
74
+ - accumulator: Empty template bins dict (populated during accumulation)
50
75
  """
51
76
  # Read SHA bins from input port
52
77
  sha_bins: dict[str, list[PhotoFile]] = self.sha_bins_i.read()
@@ -58,14 +83,30 @@ class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[Phot
58
83
 
59
84
  multiple_bins: list[list[PhotoFile]] = [b for b in sha_bins.values() if len(b) > 1]
60
85
  singleton_bins: list[list[PhotoFile]] = [b for b in sha_bins.values() if len(b) == 1]
61
- exemplars: list[PhotoFile] = [x for b in singleton_bins for x in b]
62
86
 
63
- return multiple_bins, exemplars
87
+ # Pre-bin singleton exemplars by template (optimization)
88
+ template_bins: dict[str, list[tuple[INDEX_T, PhotoFile]]] = {}
89
+ for bin in singleton_bins:
90
+ photo = bin[0]
91
+ assert "TEMPLATE" in photo.cache, "Template must be cached by SHA stage"
92
+ template_bins.setdefault(photo.template, []).append((photo.template_index, photo))
93
+
94
+ return multiple_bins, template_bins
64
95
 
65
96
  def finalise(self) -> None:
66
- self.ref_photos_final = len(self.result)
67
- self.ref_seqs_final = None
68
- # Count total photos to ensure no photos lost (invariant check)
97
+ """Update reference counts after template binning.
98
+
99
+ Template binning already happened during accumulation, so this just
100
+ computes the final reference counts.
101
+ """
102
+ # self.result is already a dict[str, list[tuple[INDEX_T, PhotoFile]]] from accumulator
103
+ template_bins = self.result
104
+
105
+ # Update reference counting
106
+ self.ref_photos_final = sum(len(photos) for photos in template_bins.values())
107
+ self.ref_seqs_final = len(template_bins) # Number of unique templates
108
+
109
+ # Existing invariant check (photos count unchanged)
69
110
  photos_final: int = (
70
111
  sum(len(cl.photos) for cl in self.identical_review_result)
71
112
  - len(self.identical_review_result)
@@ -76,6 +117,9 @@ class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[Phot
76
117
  f"ComputeIdentical started with {self.total_photos} photos and ended up with {photos_final}"
77
118
  )
78
119
 
120
+ # Shuffle review groups for variety in review UI (byte-identical groups are 100% confident)
121
+ random.shuffle(self.identical_review_result)
122
+
79
123
  @classmethod
80
124
  def stage_worker(cls, photo_list: list[PhotoFile], _args: str) -> WorkerResult[list[PhotoFile]]:
81
125
  """Process one SHA bin to find byte-identical files.
@@ -95,18 +139,39 @@ class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[Phot
95
139
  # Singleton bins are filtered out by prepare()
96
140
  assert len(photo_list) >= 2
97
141
 
142
+ # If skipping byte-identical check, trust SHA256 uniqueness
143
+ # Treat all photos in the bin as identical (same SHA = identical files)
144
+ if CONFIG.processing.SKIP_BYTE_IDENTICAL:
145
+ # Pick best exemplar using same logic as non-skip path
146
+ # (prefer higher resolution, larger file size, stable path/ID tiebreaker)
147
+ photos_dict: dict[int, PhotoFile] = {pf.id: pf for pf in photo_list}
148
+ photo_ids: set[int] = set(photos_dict.keys())
149
+ exemplar_id: int = pick_exemplar_from_class(photos_dict, photo_ids)
150
+ exemplar: PhotoFile = photos_dict[exemplar_id]
151
+
152
+ # Mark all other photos as identical to the exemplar
153
+ for photo in photo_list:
154
+ if photo.id != exemplar_id:
155
+ photo.cache["IDENTICAL"] = exemplar
156
+
157
+ # Skip building review data - no need to review when trusting SHA256
158
+ # (building review data would open every image file just to get dimensions)
159
+ # Return empty review list and single exemplar
160
+ return [build_identical_group(photo_list, exemplar_id)], [], [exemplar]
161
+
162
+ # Otherwise, perform byte-by-byte comparison (original behavior)
98
163
  groups: list[IdenticalGroup] = []
99
164
  exemplars: list[PhotoFile] = []
100
165
 
101
166
  # Build dict and set for pick_exemplar_from_class
102
- photos_dict: dict[int, PhotoFile] = {pf.id: pf for pf in photo_list}
167
+ photos_dict = {pf.id: pf for pf in photo_list}
103
168
  remaining_ids: set[int] = set(photos_dict.keys())
104
169
 
105
170
  # Process bin until empty
106
171
  while remaining_ids:
107
172
  # Pick exemplar from remaining files
108
- exemplar_id: int = pick_exemplar_from_class(photos_dict, remaining_ids)
109
- exemplar: PhotoFile = photos_dict[exemplar_id]
173
+ exemplar_id = pick_exemplar_from_class(photos_dict, remaining_ids)
174
+ exemplar = photos_dict[exemplar_id]
110
175
 
111
176
  # Create new equivalence class starting with exemplar
112
177
  eq_class: list[PhotoFile] = [exemplar]
@@ -141,18 +206,23 @@ class ComputeIdentical(PipelineStage[list[PhotoFile], list[PhotoFile], list[Phot
141
206
  groups.append(build_identical_group(eq_class, exemplar_id))
142
207
  exemplars.append(exemplar)
143
208
 
144
- # Shuffle the groups to see something more interesting at review time.
145
- random.shuffle(groups)
146
-
147
209
  return groups, [], exemplars
148
210
 
149
211
  def accumulate_results(
150
212
  self,
151
- accum: list[PhotoFile],
213
+ accum: dict[str, list[tuple[INDEX_T, PhotoFile]]],
152
214
  job: list[PhotoFile],
153
215
  ) -> None:
154
- # FIXME: Add docstring
155
- accum.extend(job)
216
+ """Accumulate exemplars from workers into template bins.
217
+
218
+ Args:
219
+ accum: Template bins dictionary being built incrementally
220
+ job: List of exemplars from one worker (flat list)
221
+ """
222
+ # Bin each exemplar by its template
223
+ for exemplar in job:
224
+ assert "TEMPLATE" in exemplar.cache, "Template must be cached by SHA stage"
225
+ accum.setdefault(exemplar.template, []).append((exemplar.template_index, exemplar))
156
226
 
157
227
  def needs_review(self) -> ReviewType:
158
228
  """This stage produces photo groups (byte-identical duplicates).
utils/compute_indices.py CHANGED
@@ -1,235 +1,235 @@
1
- """Compute similar sequences from bins created by putting the sequence in bins defined by the max two indices of the sequence."""
2
-
3
- from __future__ import annotations
4
-
5
- from itertools import combinations
6
- from typing import cast
7
-
8
- import networkx as nx
9
-
10
- from .comparison_gates import GateName, GateSequence
11
- from .config import CONFIG
12
- from .logger import get_logger
13
- from .models import ReviewType
14
- from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
15
- from .ports import InputPort, OutputPort
16
- from .sequence import (
17
- INDEX_T,
18
- PhotoSequence,
19
- count_forest_ref_photos,
20
- count_forest_ref_sequences,
21
- count_forest_total_photos,
22
- )
23
- from .sequence_clustering import cluster_similar_sequences
24
-
25
-
26
- def build_cohabitation_graph(
27
- index_bins: dict[INDEX_T, list[PhotoSequence]],
28
- ) -> list[set[PhotoSequence]]:
29
- """Build graph from index bins and find connected components.
30
-
31
- Args:
32
- index_bins: Dict mapping index pattern → list of sequences
33
-
34
- Returns:
35
- List of connected components (each component is a set of PhotoSequence objects)
36
- """
37
- # Build graph
38
- graph: nx.Graph[PhotoSequence] = nx.Graph()
39
- graph.add_nodes_from(set().union(*index_bins.values()))
40
-
41
- # Add edges where sequences share index bins
42
- # Add edges between all pairs in this bin
43
- for index_bin in index_bins.values():
44
- for seq1, seq2 in combinations(index_bin, 2):
45
- graph.add_edge(seq1, seq2)
46
-
47
- # Find connected components
48
- result = [set(c) for c in nx.connected_components(graph)]
49
-
50
- n_seqs = len(set().union(*index_bins.values()))
51
- n_result_seqs = len(set().union(*result))
52
-
53
- assert n_seqs == n_result_seqs, f"build_cohabitation_graph had {n_seqs} but only returned {n_result_seqs}"
54
-
55
- return result
56
-
57
-
58
- class ComputeIndices(
59
- PipelineStage[
60
- set[PhotoSequence], # S: component
61
- list[PhotoSequence], # T: work data
62
- tuple[list[PhotoSequence], list[PhotoSequence]], # R: accumulator
63
- ]
64
- ):
65
- def __init__(self) -> None:
66
- """Initialize the index-based grouping stage."""
67
- super().__init__(
68
- path=CONFIG.paths.forest_sequence_matches_pkl,
69
- stage_name="Index Grouping",
70
- )
71
-
72
- # Store worker argument
73
- self.args = self.stage_name # Standard args attribute for run()
74
-
75
- # Create input port for index bins
76
- self.index_bins_i: InputPort[dict[INDEX_T, list[PhotoSequence]]] = InputPort("index_bins")
77
-
78
- # Create output ports - separate ports per downstream consumer
79
- # Full tuple output (for backward compatibility or review)
80
- self.forest_bins_o: OutputPort[tuple[list[PhotoSequence], list[PhotoSequence]]] = OutputPort(
81
- self, getter=lambda: self.result
82
- )
83
-
84
- # Forest output (for ComputePerceptualHash and ComputePerceptualMatch)
85
- self.forest_o: OutputPort[list[PhotoSequence]] = OutputPort(self, getter=lambda: self.result[0])
86
-
87
- def prepare(
88
- self,
89
- ) -> PrepareResult[set[PhotoSequence], tuple[list[PhotoSequence], list[PhotoSequence]]]:
90
- """Extract index bins, build graph, and return processable components.
91
-
92
- Reads index bins from input port and prepares work items for parallel processing.
93
-
94
- Returns:
95
- Tuple of (processable_components, accumulator)
96
- """
97
- # Read index bins from input port
98
- index_bins: dict[INDEX_T, list[PhotoSequence]] = self.index_bins_i.read()
99
- # Get reference counts from upstream for UI statistics tracking
100
- all_sequences = set().union(*index_bins.values())
101
- self.ref_photos_init = self.index_bins_i.get_ref_photo_count()
102
- self.ref_seqs_init = self.index_bins_i.get_ref_sequence_count()
103
- # Count total photos for internal invariant checking (should never change)
104
- self.total_photos = sum(seq.n_photos for seq in all_sequences)
105
-
106
- n_photos = self.total_photos
107
-
108
- # Build cohabitation graph
109
- components: list[set[PhotoSequence]] = build_cohabitation_graph(index_bins)
110
- n_component_photos = sum(seq.n_photos for seq in set().union(*components))
111
- assert n_photos == n_component_photos, (
112
- f"Had {n_photos} before cohabitation graph and {n_component_photos} afterward"
113
- )
114
-
115
- # Filter components by size
116
- max_size = CONFIG.sequences.MAX_COMPONENT_SIZE
117
- processable_components: list[set[PhotoSequence]] = sorted(
118
- [c for c in components if 2 <= len(c) <= max_size],
119
- key=lambda c: -sum([s.n_ref_photos for s in c]),
120
- )
121
- skipped_components: list[set[PhotoSequence]] = [c for c in components if len(c) > max_size or len(c) < 2]
122
-
123
- # Calculate skip statistics
124
- num_singletons = sum(1 for c in skipped_components if len(c) < 2)
125
- num_oversized = sum(1 for c in skipped_components if len(c) > max_size)
126
-
127
- get_logger().info(
128
- f"Skipped {len(skipped_components)} components ({num_singletons} singletons, {num_oversized} oversized), "
129
- f"total sequences is {sum([len(c) for c in processable_components])} in {len(processable_components)} sets"
130
- )
131
-
132
- # Initialize forest with skipped sequences (pass-through)
133
- skipped_sequences = [seq for comp in skipped_components for seq in comp]
134
- forest: list[PhotoSequence] = list(skipped_sequences)
135
- bins: list[PhotoSequence] = list(skipped_sequences)
136
-
137
- new_photos = sum(seq.n_photos for seq in set().union(*processable_components)) + +sum(
138
- v.n_photos for v in forest
139
- )
140
- assert n_photos == new_photos, f"ComputeIndices.prepare had {n_photos} photos and ended up with {new_photos}"
141
-
142
- # Return work items and tuple accumulator
143
- return processable_components, (forest, bins)
144
-
145
- @classmethod
146
- def stage_worker(cls, component: set[PhotoSequence], created_by: str) -> WorkerResult[list[PhotoSequence]]:
147
- """Process one connected component to form PhotoSequence objects.
148
-
149
- Uses predicted exemplar sequence and intersection-based comparison.
150
- Builds SequenceGroup models incrementally for review.
151
-
152
- Args:
153
- component: Set of PhotoSequence objects to compare
154
- created_by: Annotation of how the similarity was detected
155
-
156
- Returns:
157
- Tuple of (identical_groups, sequence_groups, work_sequences) where:
158
- - identical_groups: Always empty list for this stage
159
- - sequence_groups: SequenceGroup models for multi-sequence groups
160
- - work_sequences: PhotoSequence objects for pipeline flow
161
- """
162
- # ASSERTION: Count input photos (atomic invariant)
163
- input_photos: int = sum(seq.n_photos for seq in component)
164
-
165
- # Use configured gate sequence instead of hardcoded method
166
- gates = GateSequence(cast(list[GateName], CONFIG.processing.COMPARISON_GATES))
167
-
168
- # Use common clustering algorithm
169
- result_classes, sequence_groups = cluster_similar_sequences(
170
- list(component),
171
- gates,
172
- created_by,
173
- )
174
-
175
- # ASSERTION: Verify all photos preserved
176
- output_photos = sum(seq.n_photos for seq in result_classes)
177
- assert output_photos == input_photos, (
178
- f"Lost photos in stage_worker: started {input_photos}, ended {output_photos}"
179
- )
180
-
181
- return [], sequence_groups, result_classes
182
-
183
- def accumulate_results(
184
- self,
185
- accum: tuple[list[PhotoSequence], list[PhotoSequence]],
186
- job: list[PhotoSequence],
187
- ) -> None:
188
- """Accumulate worker results into forest and bins.
189
-
190
- Args:
191
- accum: Tuple of (forest, bins) - both contain all sequences
192
- job: List of PhotoSequence objects from worker
193
- """
194
- forest, bins = accum
195
- forest.extend(job)
196
- bins.extend(job)
197
-
198
- def finalise(self) -> None:
199
- forest = self.result[0]
200
- self.ref_photos_final = count_forest_ref_photos(forest)
201
- self.ref_seqs_final = len(forest)
202
-
203
- # Count total photos to ensure no photos lost (invariant check)
204
- photos_final = count_forest_total_photos(forest)
205
- count_forest_ref_sequences(forest)
206
-
207
- # FIXME: Sequence count validation disabled due to test fixture limitations
208
- assert photos_final == self.total_photos, (
209
- f"Started with {self.total_photos} photos but ended up with {photos_final}"
210
- )
211
-
212
- def needs_review(self) -> ReviewType:
213
- """This stage produces sequence groups (index overlap sequences).
214
-
215
- Returns:
216
- "sequences" to indicate this stage produces reviewable sequence groups
217
- """
218
- return "sequences"
219
-
220
- def has_review_data(self) -> bool:
221
- """Check if there are any index overlap sequence groups to review.
222
-
223
- Returns:
224
- True if forest has classes (multi-sequence groups), False otherwise
225
- """
226
- # Check if stage has run
227
- if not hasattr(self, "result") or self.result is None:
228
- return False
229
-
230
- # Check if there are any classes (multi-sequence groups) in the forest
231
- forest = self.result[0]
232
- return any(seq.is_class() for seq in forest)
233
-
234
- # Typed result field - tuple of (forest, bins)
235
- result: tuple[list[PhotoSequence], list[PhotoSequence]]
1
+ """Compute similar sequences from bins created by putting the sequence in bins defined by the max two indices of the sequence."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from itertools import combinations
6
+ from typing import cast
7
+
8
+ import networkx as nx
9
+
10
+ from .comparison_gates import GateName, GateSequence
11
+ from .config import CONFIG
12
+ from .logger import get_logger
13
+ from .models import ReviewType
14
+ from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
15
+ from .ports import InputPort, OutputPort
16
+ from .sequence import (
17
+ INDEX_T,
18
+ PhotoSequence,
19
+ count_forest_ref_photos,
20
+ count_forest_ref_sequences,
21
+ count_forest_total_photos,
22
+ )
23
+ from .sequence_clustering import cluster_similar_sequences
24
+
25
+
26
+ def build_cohabitation_graph(
27
+ index_bins: dict[INDEX_T, list[PhotoSequence]],
28
+ ) -> list[set[PhotoSequence]]:
29
+ """Build graph from index bins and find connected components.
30
+
31
+ Args:
32
+ index_bins: Dict mapping index pattern → list of sequences
33
+
34
+ Returns:
35
+ List of connected components (each component is a set of PhotoSequence objects)
36
+ """
37
+ # Build graph
38
+ graph: nx.Graph[PhotoSequence] = nx.Graph()
39
+ graph.add_nodes_from(set().union(*index_bins.values()))
40
+
41
+ # Add edges where sequences share index bins
42
+ # Add edges between all pairs in this bin
43
+ for index_bin in index_bins.values():
44
+ for seq1, seq2 in combinations(index_bin, 2):
45
+ graph.add_edge(seq1, seq2)
46
+
47
+ # Find connected components
48
+ result = [set(c) for c in nx.connected_components(graph)]
49
+
50
+ n_seqs = len(set().union(*index_bins.values()))
51
+ n_result_seqs = len(set().union(*result))
52
+
53
+ assert n_seqs == n_result_seqs, f"build_cohabitation_graph had {n_seqs} but only returned {n_result_seqs}"
54
+
55
+ return result
56
+
57
+
58
+ class ComputeIndices(
59
+ PipelineStage[
60
+ set[PhotoSequence], # S: component
61
+ list[PhotoSequence], # T: work data
62
+ tuple[list[PhotoSequence], list[PhotoSequence]], # R: accumulator
63
+ ]
64
+ ):
65
+ def __init__(self) -> None:
66
+ """Initialize the index-based grouping stage."""
67
+ super().__init__(
68
+ path=CONFIG.paths.forest_sequence_matches_pkl,
69
+ stage_name="Index Grouping",
70
+ )
71
+
72
+ # Store worker argument
73
+ self.args = self.stage_name # Standard args attribute for run()
74
+
75
+ # Create input port for index bins
76
+ self.index_bins_i: InputPort[dict[INDEX_T, list[PhotoSequence]]] = InputPort("index_bins")
77
+
78
+ # Create output ports - separate ports per downstream consumer
79
+ # Full tuple output (for backward compatibility or review)
80
+ self.forest_bins_o: OutputPort[tuple[list[PhotoSequence], list[PhotoSequence]]] = OutputPort(
81
+ self, getter=lambda: self.result
82
+ )
83
+
84
+ # Forest output (for ComputePerceptualHash and ComputePerceptualMatch)
85
+ self.forest_o: OutputPort[list[PhotoSequence]] = OutputPort(self, getter=lambda: self.result[0])
86
+
87
+ def prepare(
88
+ self,
89
+ ) -> PrepareResult[set[PhotoSequence], tuple[list[PhotoSequence], list[PhotoSequence]]]:
90
+ """Extract index bins, build graph, and return processable components.
91
+
92
+ Reads index bins from input port and prepares work items for parallel processing.
93
+
94
+ Returns:
95
+ Tuple of (processable_components, accumulator)
96
+ """
97
+ # Read index bins from input port
98
+ index_bins: dict[INDEX_T, list[PhotoSequence]] = self.index_bins_i.read()
99
+ # Get reference counts from upstream for UI statistics tracking
100
+ all_sequences = set().union(*index_bins.values())
101
+ self.ref_photos_init = self.index_bins_i.get_ref_photo_count()
102
+ self.ref_seqs_init = self.index_bins_i.get_ref_sequence_count()
103
+ # Count total photos for internal invariant checking (should never change)
104
+ self.total_photos = sum(seq.n_photos for seq in all_sequences)
105
+
106
+ n_photos = self.total_photos
107
+
108
+ # Build cohabitation graph
109
+ components: list[set[PhotoSequence]] = build_cohabitation_graph(index_bins)
110
+ n_component_photos = sum(seq.n_photos for seq in set().union(*components))
111
+ assert n_photos == n_component_photos, (
112
+ f"Had {n_photos} before cohabitation graph and {n_component_photos} afterward"
113
+ )
114
+
115
+ # Filter components by size
116
+ max_size = CONFIG.sequences.MAX_COMPONENT_SIZE
117
+ processable_components: list[set[PhotoSequence]] = sorted(
118
+ [c for c in components if 2 <= len(c) <= max_size],
119
+ key=lambda c: -sum([s.n_ref_photos for s in c]),
120
+ )
121
+ skipped_components: list[set[PhotoSequence]] = [c for c in components if len(c) > max_size or len(c) < 2]
122
+
123
+ # Calculate skip statistics
124
+ num_singletons = sum(1 for c in skipped_components if len(c) < 2)
125
+ num_oversized = sum(1 for c in skipped_components if len(c) > max_size)
126
+
127
+ get_logger().info(
128
+ f"Skipped {len(skipped_components)} components ({num_singletons} singletons, {num_oversized} oversized), "
129
+ f"total sequences is {sum([len(c) for c in processable_components])} in {len(processable_components)} sets"
130
+ )
131
+
132
+ # Initialize forest with skipped sequences (pass-through)
133
+ skipped_sequences = [seq for comp in skipped_components for seq in comp]
134
+ forest: list[PhotoSequence] = list(skipped_sequences)
135
+ bins: list[PhotoSequence] = list(skipped_sequences)
136
+
137
+ new_photos = sum(seq.n_photos for seq in set().union(*processable_components)) + +sum(
138
+ v.n_photos for v in forest
139
+ )
140
+ assert n_photos == new_photos, f"ComputeIndices.prepare had {n_photos} photos and ended up with {new_photos}"
141
+
142
+ # Return work items and tuple accumulator
143
+ return processable_components, (forest, bins)
144
+
145
+ @classmethod
146
+ def stage_worker(cls, component: set[PhotoSequence], created_by: str) -> WorkerResult[list[PhotoSequence]]:
147
+ """Process one connected component to form PhotoSequence objects.
148
+
149
+ Uses predicted exemplar sequence and intersection-based comparison.
150
+ Builds SequenceGroup models incrementally for review.
151
+
152
+ Args:
153
+ component: Set of PhotoSequence objects to compare
154
+ created_by: Annotation of how the similarity was detected
155
+
156
+ Returns:
157
+ Tuple of (identical_groups, sequence_groups, work_sequences) where:
158
+ - identical_groups: Always empty list for this stage
159
+ - sequence_groups: SequenceGroup models for multi-sequence groups
160
+ - work_sequences: PhotoSequence objects for pipeline flow
161
+ """
162
+ # ASSERTION: Count input photos (atomic invariant)
163
+ input_photos: int = sum(seq.n_photos for seq in component)
164
+
165
+ # Use configured gate sequence instead of hardcoded method
166
+ gates = GateSequence(cast(list[GateName], CONFIG.processing.COMPARISON_GATES))
167
+
168
+ # Use common clustering algorithm
169
+ result_classes, sequence_groups = cluster_similar_sequences(
170
+ list(component),
171
+ gates,
172
+ created_by,
173
+ )
174
+
175
+ # ASSERTION: Verify all photos preserved
176
+ output_photos = sum(seq.n_photos for seq in result_classes)
177
+ assert output_photos == input_photos, (
178
+ f"Lost photos in stage_worker: started {input_photos}, ended {output_photos}"
179
+ )
180
+
181
+ return [], sequence_groups, result_classes
182
+
183
+ def accumulate_results(
184
+ self,
185
+ accum: tuple[list[PhotoSequence], list[PhotoSequence]],
186
+ job: list[PhotoSequence],
187
+ ) -> None:
188
+ """Accumulate worker results into forest and bins.
189
+
190
+ Args:
191
+ accum: Tuple of (forest, bins) - both contain all sequences
192
+ job: List of PhotoSequence objects from worker
193
+ """
194
+ forest, bins = accum
195
+ forest.extend(job)
196
+ bins.extend(job)
197
+
198
+ def finalise(self) -> None:
199
+ forest = self.result[0]
200
+ self.ref_photos_final = count_forest_ref_photos(forest)
201
+ self.ref_seqs_final = len(forest)
202
+
203
+ # Count total photos to ensure no photos lost (invariant check)
204
+ photos_final = count_forest_total_photos(forest)
205
+ count_forest_ref_sequences(forest)
206
+
207
+ # FIXME: Sequence count validation disabled due to test fixture limitations
208
+ assert photos_final == self.total_photos, (
209
+ f"Started with {self.total_photos} photos but ended up with {photos_final}"
210
+ )
211
+
212
+ def needs_review(self) -> ReviewType:
213
+ """This stage produces sequence groups (index overlap sequences).
214
+
215
+ Returns:
216
+ "sequences" to indicate this stage produces reviewable sequence groups
217
+ """
218
+ return "sequences"
219
+
220
+ def has_review_data(self) -> bool:
221
+ """Check if there are any index overlap sequence groups to review.
222
+
223
+ Returns:
224
+ True if forest has classes (multi-sequence groups), False otherwise
225
+ """
226
+ # Check if stage has run
227
+ if not hasattr(self, "result") or self.result is None:
228
+ return False
229
+
230
+ # Check if there are any classes (multi-sequence groups) in the forest
231
+ forest = self.result[0]
232
+ return any(seq.is_class() for seq in forest)
233
+
234
+ # Typed result field - tuple of (forest, bins)
235
+ result: tuple[list[PhotoSequence], list[PhotoSequence]]