photo-stack-finder 0.1.7-py3-none-any.whl → 0.1.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orchestrator/__init__.py +2 -2
- orchestrator/app.py +6 -11
- orchestrator/build_pipeline.py +19 -21
- orchestrator/orchestrator_runner.py +11 -8
- orchestrator/pipeline_builder.py +126 -126
- orchestrator/pipeline_orchestrator.py +604 -604
- orchestrator/review_persistence.py +162 -162
- orchestrator/static/orchestrator.css +76 -76
- orchestrator/static/orchestrator.html +11 -5
- orchestrator/static/orchestrator.js +3 -1
- overlap_metrics/__init__.py +1 -1
- overlap_metrics/config.py +135 -135
- overlap_metrics/core.py +284 -284
- overlap_metrics/estimators.py +292 -292
- overlap_metrics/metrics.py +307 -307
- overlap_metrics/registry.py +99 -99
- overlap_metrics/utils.py +104 -104
- photo_compare/__init__.py +1 -1
- photo_compare/base.py +285 -285
- photo_compare/config.py +225 -225
- photo_compare/distance.py +15 -15
- photo_compare/feature_methods.py +173 -173
- photo_compare/file_hash.py +29 -29
- photo_compare/hash_methods.py +99 -99
- photo_compare/histogram_methods.py +118 -118
- photo_compare/pixel_methods.py +58 -58
- photo_compare/structural_methods.py +104 -104
- photo_compare/types.py +28 -28
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/METADATA +21 -22
- photo_stack_finder-0.1.8.dist-info/RECORD +75 -0
- scripts/orchestrate.py +12 -10
- utils/__init__.py +4 -3
- utils/base_pipeline_stage.py +171 -171
- utils/base_ports.py +176 -176
- utils/benchmark_utils.py +823 -823
- utils/channel.py +74 -74
- utils/comparison_gates.py +40 -21
- utils/compute_benchmarks.py +355 -355
- utils/compute_identical.py +94 -24
- utils/compute_indices.py +235 -235
- utils/compute_perceptual_hash.py +127 -127
- utils/compute_perceptual_match.py +240 -240
- utils/compute_sha_bins.py +64 -20
- utils/compute_template_similarity.py +1 -1
- utils/compute_versions.py +483 -483
- utils/config.py +8 -5
- utils/data_io.py +83 -83
- utils/graph_context.py +44 -44
- utils/logger.py +2 -2
- utils/models.py +2 -2
- utils/photo_file.py +90 -91
- utils/pipeline_graph.py +334 -334
- utils/pipeline_stage.py +408 -408
- utils/plot_helpers.py +123 -123
- utils/ports.py +136 -136
- utils/progress.py +415 -415
- utils/report_builder.py +139 -139
- utils/review_types.py +55 -55
- utils/review_utils.py +10 -19
- utils/sequence.py +10 -8
- utils/sequence_clustering.py +1 -1
- utils/template.py +57 -57
- utils/template_parsing.py +71 -0
- photo_stack_finder-0.1.7.dist-info/RECORD +0 -74
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/WHEEL +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/entry_points.txt +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {photo_stack_finder-0.1.7.dist-info → photo_stack_finder-0.1.8.dist-info}/top_level.txt +0 -0
utils/compute_perceptual_match.py
CHANGED

@@ -1,240 +1,240 @@
"""Implementation of perceptual matching pipeline stage."""

from __future__ import annotations

from collections import defaultdict
from itertools import combinations
from typing import cast

import networkx as nx

from .comparison_gates import GateName, GateSequence
from .config import CONFIG
from .logger import get_logger
from .models import ReviewType, SequenceGroup
from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
from .ports import InputPort, OutputPort
from .sequence import (
    INDEX_T,
    PhotoSequence,
    count_forest_ref_photos,
    count_forest_ref_sequences,
    count_forest_total_photos,
    predict_exemplar_sequence,
)
from .sequence_clustering import cluster_similar_sequences


class ComputePerceptualMatch(
    PipelineStage[
        list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]],  # S: component
        list[PhotoSequence],  # T: merged sequences
        list[PhotoSequence],  # R: result forest
    ]
):
    def __init__(self) -> None:
        """Initialize the perceptual matching stage."""
        super().__init__(
            path=CONFIG.paths.forest_final_pkl,
            stage_name="Perceptual Matching",
        )

        # Store worker argument
        self.args = self.stage_name  # Standard args attribute for run()

        # Create input port for forest (from ComputeIndices)
        self.forest_i: InputPort[list[PhotoSequence]] = InputPort("forest")

        # Create input port for perceptual bins (from ComputePerceptualHash)
        self.perceptual_bins_i: InputPort[dict[bytes, dict[int, list[INDEX_T]]]] = InputPort("perceptual_bins")

        # Create output port for final forest
        self.final_forest_o: OutputPort[list[PhotoSequence]] = OutputPort(self, getter=lambda: self.result)

    def prepare(
        self,
    ) -> PrepareResult[list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]], list[PhotoSequence]]:
        """Extract index bins, build graph, and return processable components.

        Reads forest and bins from input ports, builds connection graph,
        and filters components by size.

        Returns:
            Tuple of (processable_components, skipped_sequences)
        """
        # Read from input ports
        forest = self.forest_i.read()
        # Get reference counts from upstream for UI statistics tracking
        self.ref_photos_init = self.forest_i.get_ref_photo_count()
        self.ref_seqs_init = self.forest_i.get_ref_sequence_count()
        # Count total photos for internal invariant checking (should never change)
        self.total_photos = sum(seq.n_photos for seq in forest)
        perceptual_bins = self.perceptual_bins_i.read()

        # Within each bin, calculate the number of connections between sequences and the best index mapping with its value
        connections: dict[tuple[int, int], list[tuple[list[INDEX_T], list[INDEX_T]]]] = defaultdict(list)
        associations: dict[int, list[tuple[INDEX_T, bytes]]] = defaultdict(list)
        k: bytes
        hbin: dict[int, list[INDEX_T]]
        for k, hbin in perceptual_bins.items():
            # label each index of the sequence with its hash
            s: int
            idces: list[INDEX_T]
            for s, idces in hbin.items():
                associations[s].extend([(idx, k) for idx in idces])
            # add the pair of index lists that are matched to the pair of sequences
            for (s1, hb1), (s2, hb2) in combinations(sorted(hbin.items()), 2):
                connections[(s1, s2)].append((hb1, hb2))

        # Form connection graph along with index mappings and get components of connected sequences
        graph: nx.Graph[int] = nx.Graph()
        graph.add_nodes_from(range(len(forest)))
        for (s1, s2), idx_pairs in connections.items():
            # If the sequences match for at least half their points then test them for equality
            if sum([min(len(idces1), len(idces2)) for idces1, idces2 in idx_pairs]) >= 0.5 * min(
                len(forest[s1].get_reference()), len(forest[s2].get_reference())
            ):
                graph.add_edge(s1, s2)

        components: list[list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]]] = [
            [(forest[i], associations[i]) for i in c] for c in nx.connected_components(graph)
        ]

        # Filter components by size
        max_size = CONFIG.sequences.MAX_COMPONENT_SIZE
        processable_components: list[list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]]] = sorted(
            [c for c in components if 2 <= len(c) <= max_size],
            key=lambda c: -sum([seq.n_ref_photos for seq, _ in c]),
        )
        skipped_components: list[list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]]] = [
            c for c in components if len(c) > max_size or len(c) < 2
        ]

        # Flatten skipped components into result sequences
        results: list[PhotoSequence] = [seq for comp in skipped_components for seq, _ in comp]

        # Calculate skip statistics
        num_singletons = sum(1 for c in skipped_components if len(c) < 2)
        num_oversized = sum(1 for c in skipped_components if len(c) > max_size)

        get_logger().info(
            f"There are {len(processable_components)} perceptual components with an average of {float(sum([len(c) for c in processable_components])) / float(len(processable_components)) if processable_components else 0} sequences"
        )
        get_logger().info(
            f"Skipped {len(skipped_components)} components ({num_singletons} singletons, {num_oversized} oversized), "
            f"total sequences is {sum([len(c) for c in processable_components])} in {len(processable_components)} sets"
        )

        n_photos_processable = sum(seq.n_photos for component in processable_components for seq, _ in component)
        n_photos_skipped = sum(seq.n_photos for seq in results)

        assert self.total_photos == n_photos_processable + n_photos_skipped, (
            f"ComputePerceptualMatch._prepare_with_bins lost photos, expected {self.total_photos}, got {n_photos_processable} + {n_photos_skipped}"
        )

        return processable_components, results

    @classmethod
    def stage_worker(
        cls,
        bin_data: list[tuple[PhotoSequence, list[tuple[INDEX_T, bytes]]]],
        created_by: str,
    ) -> WorkerResult[list[PhotoSequence]]:
        # ASSERTION: Count input photos (atomic invariant)
        input_photos: int = sum(seq.n_photos for seq, _ in bin_data)

        gates = GateSequence(cast(list[GateName], CONFIG.processing.COMPARISON_GATES))

        # Keep sequences and their hash associations
        seq_with_hashes: list[tuple[PhotoSequence, dict[INDEX_T, bytes]]] = [
            (seq, dict(hashes)) for seq, hashes in bin_data
        ]

        result_sequences: list[PhotoSequence] = []
        review_groups: list[SequenceGroup] = []

        # Iteratively find clusters of similar sequences
        while seq_with_hashes:
            # Extract just sequences for exemplar prediction
            sequences = [seq for seq, _hashes in seq_with_hashes]

            # Pick best exemplar from remaining sequences
            exemplar_seq_obj = predict_exemplar_sequence(sequences)
            exemplar_idx = sequences.index(exemplar_seq_obj)
            seq_with_hashes[exemplar_idx][1]

            remaining_with_hashes = [
                (seq, hashes) for i, (seq, hashes) in enumerate(seq_with_hashes) if i != exemplar_idx
            ]

            # Use common clustering algorithm
            cluster_results, cluster_reviews = cluster_similar_sequences(
                [exemplar_seq_obj] + [seq for seq, _ in remaining_with_hashes],
                gates,
                created_by,
            )

            result_sequences.extend(cluster_results)
            review_groups.extend(cluster_reviews)

            # Remove all processed sequences from pool (they're now in cluster_results)
            # Note: cluster_results contains NEW PhotoSequence objects, so we track input sequences instead
            input_sequences = {exemplar_seq_obj} | {seq for seq, _ in remaining_with_hashes}
            seq_with_hashes = [(seq, hashes) for seq, hashes in seq_with_hashes if seq not in input_sequences]

        # ASSERTION: Verify all photos preserved
        output_photos = sum(seq.n_photos for seq in result_sequences)
        assert output_photos == input_photos, (
            f"Lost photos in stage_worker: started {input_photos}, ended {output_photos}"
        )

        return [], review_groups, result_sequences

    def accumulate_results(
        self,
        accum: list[PhotoSequence],
        job: list[PhotoSequence],
    ) -> None:
        accum.extend(job)

    def finalise(self) -> None:
        self.ref_photos_final = count_forest_ref_photos(self.result)
        self.ref_seqs_final = len(self.result)

        # Count total photos to ensure no photos lost (invariant check)
        photos_final = count_forest_total_photos(self.result)
        seqs_final = count_forest_ref_sequences(self.result)

        if seqs_final != self.ref_seqs_init:
            get_logger().warning(
                f"Sequence count mismatch in {self.stage_name}: "
                f"started with {self.ref_seqs_init} but ended with {seqs_final}"
            )

        assert photos_final == self.total_photos, (
            f"Started with {self.total_photos} photos and ended up with {photos_final}"
        )

    def needs_review(self) -> ReviewType:
        """This stage produces sequence groups (similar photo sequences).

        Returns:
            "sequences" to indicate this stage produces reviewable sequence groups
        """
        return "sequences"

    def has_review_data(self) -> bool:
        """Check if there are any sequence groups to review.

        Returns:
            True if forest has classes (multi-sequence groups), False otherwise
        """
        # Check if stage has run
        if not hasattr(self, "result") or self.result is None:
            return False

        # Check if there are any classes (multi-sequence groups)
        return any(seq.is_class() for seq in self.result)

    # Typed result field - just the forest
    result: list[PhotoSequence]
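The prepare() step above links sequences that share perceptual-hash bins and only forwards connected components for merging, requiring the matched indices to cover at least half of the shorter reference. A minimal standalone sketch of that bin-to-graph step follows; the bins and ref_len values are illustrative stand-ins for the perceptual-bin mapping and the PhotoSequence references, not part of the package API.

from collections import defaultdict
from itertools import combinations

import networkx as nx

# hash value -> {sequence id: indices in that sequence carrying the hash}
bins = {
    b"h1": {0: [0, 1], 1: [3, 4]},
    b"h2": {0: [2], 2: [0]},
}
ref_len = {0: 4, 1: 4, 2: 6}  # length of each sequence's reference

connections = defaultdict(list)
for hbin in bins.values():
    for (s1, i1), (s2, i2) in combinations(sorted(hbin.items()), 2):
        connections[(s1, s2)].append((i1, i2))

graph = nx.Graph()
graph.add_nodes_from(ref_len)
for (s1, s2), pairs in connections.items():
    # Link two sequences only when the shared hashes cover at least half of the
    # shorter reference; connected components become candidate merge groups.
    if sum(min(len(a), len(b)) for a, b in pairs) >= 0.5 * min(ref_len[s1], ref_len[s2]):
        graph.add_edge(s1, s2)

print(list(nx.connected_components(graph)))  # [{0, 1}, {2}]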
utils/compute_sha_bins.py
CHANGED

@@ -1,26 +1,57 @@
 from __future__ import annotations
 
+import hashlib
+import io
 import mimetypes
 import os
 from collections import defaultdict
 from collections.abc import Iterator
 from pathlib import Path
 
-from
+from PIL import Image
 
 from .config import CONFIG
 from .photo_file import PhotoFile
 from .pipeline_stage import PipelineStage, PrepareResult, WorkerResult
 from .ports import OutputPort
+from .template_parsing import extract_template
+
+
+def _get_oriented_dimensions(img: Image.Image) -> tuple[int, int]:
+    """Extract dimensions from PIL Image with EXIF orientation applied.
+
+    Args:
+        img: Opened PIL Image
+
+    Returns:
+        Tuple of (width, height) with EXIF orientation applied
+    """
+    # Get EXIF orientation if present
+    orientation: int = 0
+    if hasattr(img, "_getexif") and img._getexif() is not None:
+        exif = img._getexif()
+        orientation = exif.get(274, 0)  # 274 = Orientation EXIF tag
+
+    # Get raw dimensions
+    raw_width: int = img.width
+    raw_height: int = img.height
+
+    # Apply EXIF orientation (swap dimensions for rotations 5,6,7,8)
+    # https://www.impulseadventure.com/photo/exif-orientation.html
+    if orientation in {5, 6, 7, 8}:
+        return raw_height, raw_width
+    return raw_width, raw_height
 
 
 class ComputeShaBins(PipelineStage[tuple[int, tuple[Path, str]], tuple[PhotoFile, str], dict[str, list[PhotoFile]]]):
     """Pipeline stage that walks source directory and bins photos by SHA256 hash.
 
-    Creates PhotoFile objects with
-
-
-
+    Creates PhotoFile objects with full metadata extracted from a single file read:
+    - SHA256 hash (for binning, then discarded)
+    - File size, MIME type, path
+    - Image dimensions with EXIF orientation applied
+
+    PhotoFile is a pure data container - this stage performs ALL file I/O.
     """
 
     def __init__(self, source_path: Path) -> None:

@@ -82,44 +113,57 @@ class ComputeShaBins(PipelineStage[tuple[int, tuple[Path, str]], tuple[PhotoFile
 
     @classmethod
     def stage_worker(cls, param: tuple[int, tuple[Path, str]], _args: str) -> WorkerResult[tuple[PhotoFile, str]]:
-        """Create PhotoFile with
-
-        Work function for parallel processing that takes enumerated file info
-        and returns a PhotoFile with core file properties (no image opening!).
-        SHA256 is computed and returned separately for binning.
+        """Create PhotoFile with dimensions and compute SHA256 in single file read.
 
-
-
+        Work function for parallel processing that reads file once, computes SHA256,
+        extracts dimensions with EXIF orientation, and creates PhotoFile with all
+        metadata. PhotoFile.__init__ never opens files - this is the only file I/O.
 
-        There is no exception handling in here.
+        There is no exception handling in here. All exceptions should be surfaced
+        to be dealt with by the user.
 
         Args:
            param: (photo_id, (path, mime)) tuple
           _args: Placeholder to match pattern
 
        Returns:
-            (PhotoFile with
+            (PhotoFile with all metadata, SHA256 hash) tuple
        """
        photo_id: int
        path: Path
        mime: str
        photo_id, (path, mime) = param
 
-        #
-
+        # Read file once into memory for both SHA256 and dimensions (optimization)
+        with path.open("rb") as f:
+            file_data: bytes = f.read()
 
-        #
-
+        # Compute SHA256 from in-memory data
+        sha256_hash: str = hashlib.sha256(file_data).hexdigest()
 
-        #
-
+        # Extract dimensions with EXIF orientation from in-memory data
+        with Image.open(io.BytesIO(file_data)) as img:
+            width, height = _get_oriented_dimensions(img)
+
+        # Get file size from data length (avoids separate stat call)
+        size_bytes: int = len(file_data)
+
+        # Create PhotoFile with all metadata (no file I/O in PhotoFile.__init__)
        photo = PhotoFile(
            path=path,
            mime=mime,
            size_bytes=size_bytes,
            file_id=photo_id,
+            width=width,
+            height=height,
        )
 
+        # Extract template from filename (with extension) and include full directory path
+        # This ensures files with same name in different directories have different templates
+        template_pattern, index = extract_template(path.name)
+        template_with_path = str(path.with_name(template_pattern))
+        photo.cache["TEMPLATE"] = (template_with_path, index)
+
        # Return PhotoFile and SHA256 separately (SHA256 used for binning only)
        return (
            [],
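The reworked stage_worker above derives both the SHA256 hash and the EXIF-oriented dimensions from a single read of each file. A minimal standalone sketch of that single-read pattern follows, using only hashlib and Pillow and omitting PhotoFile and the pipeline wiring; sha_and_oriented_size is a hypothetical helper name, and it uses Pillow's public getexif() rather than the _getexif() call shown in the diff.

import hashlib
import io
from pathlib import Path

from PIL import Image


def sha_and_oriented_size(path: Path) -> tuple[str, int, int]:
    data = path.read_bytes()  # single read of the file
    digest = hashlib.sha256(data).hexdigest()  # hash the in-memory bytes
    with Image.open(io.BytesIO(data)) as img:
        # EXIF tag 274 (Orientation): values 5-8 mean the stored image is rotated,
        # so width and height are swapped for display purposes.
        orientation = img.getexif().get(274, 1)
        if orientation in {5, 6, 7, 8}:
            return digest, img.height, img.width
        return digest, img.width, img.height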
utils/compute_template_similarity.py
CHANGED

@@ -323,7 +323,7 @@ class ComputeTemplateSimilarity(
 
             # Bin is too large - subdivide it
             if template_remainder in template_bins and template_remainder not in original_large_bins_seen:
-                get_logger().
+                get_logger().debug(f"Subdividing large bin '{template_remainder}' with {bin_size} sequences")
                 original_large_bins_seen.add(template_remainder)
                 total_subdivisions += 1
                 largest_input_bin = max(largest_input_bin, bin_size)