caption-flow 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. caption_flow/__init__.py +3 -3
  2. caption_flow/cli.py +937 -416
  3. caption_flow/models.py +45 -3
  4. caption_flow/monitor.py +5 -3
  5. caption_flow/orchestrator.py +186 -116
  6. caption_flow/processors/__init__.py +3 -3
  7. caption_flow/processors/base.py +8 -7
  8. caption_flow/processors/huggingface.py +440 -68
  9. caption_flow/processors/local_filesystem.py +24 -28
  10. caption_flow/processors/webdataset.py +66 -25
  11. caption_flow/storage/exporter.py +420 -339
  12. caption_flow/storage/manager.py +636 -756
  13. caption_flow/utils/__init__.py +1 -1
  14. caption_flow/utils/auth.py +1 -1
  15. caption_flow/utils/caption_utils.py +1 -1
  16. caption_flow/utils/certificates.py +15 -8
  17. caption_flow/utils/checkpoint_tracker.py +41 -19
  18. caption_flow/utils/chunk_tracker.py +200 -65
  19. caption_flow/utils/image_processor.py +9 -9
  20. caption_flow/utils/json_utils.py +37 -20
  21. caption_flow/utils/prompt_template.py +24 -16
  22. caption_flow/utils/vllm_config.py +5 -4
  23. caption_flow/viewer.py +4 -12
  24. caption_flow/workers/base.py +12 -6
  25. caption_flow/workers/caption.py +272 -91
  26. caption_flow/workers/data.py +6 -8
  27. {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/METADATA +5 -4
  28. caption_flow-0.4.0.dist-info/RECORD +33 -0
  29. caption_flow-0.3.3.dist-info/RECORD +0 -33
  30. {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/WHEEL +0 -0
  31. {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/entry_points.txt +0 -0
  32. {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
  33. {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/top_level.txt +0 -0

caption_flow/utils/__init__.py
@@ -1,4 +1,4 @@
  """Utility modules for CaptionFlow."""

- from .chunk_tracker import ChunkTracker
  from .caption_utils import CaptionUtils
+ from .chunk_tracker import ChunkTracker

caption_flow/utils/auth.py
@@ -1,7 +1,7 @@
  """Authentication management."""

- from typing import Dict, Any, Optional
  from dataclasses import dataclass
+ from typing import Any, Dict, Optional


  @dataclass

caption_flow/utils/caption_utils.py
@@ -1,6 +1,6 @@
  """Caption processing utilities from the original vLLM script."""

- from typing import List, Dict
+ from typing import Dict, List


  class CaptionUtils:

caption_flow/utils/certificates.py
@@ -1,13 +1,15 @@
  """SSL certificate management."""

+ import datetime as _datetime
  import subprocess
+ from datetime import datetime, timedelta
  from pathlib import Path
  from typing import Optional
+
  from cryptography import x509
- from cryptography.x509.oid import NameOID
  from cryptography.hazmat.primitives import hashes, serialization
  from cryptography.hazmat.primitives.asymmetric import rsa
- from datetime import datetime, timedelta
+ from cryptography.x509.oid import NameOID


  class CertificateManager:
@@ -35,8 +37,8 @@ class CertificateManager:
  .issuer_name(issuer)
  .public_key(key.public_key())
  .serial_number(x509.random_serial_number())
- .not_valid_before(datetime.utcnow())
- .not_valid_after(datetime.utcnow() + timedelta(days=365))
+ .not_valid_before(datetime.now(_datetime.UTC))
+ .not_valid_after(datetime.now(_datetime.UTC) + timedelta(days=365))
  .add_extension(
  x509.SubjectAlternativeName(
  [
@@ -71,14 +73,15 @@ class CertificateManager:
  def generate_letsencrypt(
  self, domain: str, email: str, output_dir: Optional[Path] = None, staging: bool = False
  ) -> tuple[Path, Path]:
- """
- Generate Let's Encrypt certificate.
+ """Generate Let's Encrypt certificate.

  Args:
+ ----
  domain: Domain name for certificate
  email: Email for Let's Encrypt account
  output_dir: Custom output directory (uses /etc/letsencrypt by default)
  staging: Use Let's Encrypt staging server for testing
+
  """
  cmd = [
  "certbot",
@@ -133,8 +136,12 @@ class CertificateManager:
  return {
  "subject": cert.subject.rfc4514_string(),
  "issuer": cert.issuer.rfc4514_string(),
- "not_before": cert.not_valid_before,
- "not_after": cert.not_valid_after,
+ "not_before": cert.not_valid_before_utc,
+ "not_after": cert.not_valid_after_utc,
  "serial_number": cert.serial_number,
  "is_self_signed": cert.issuer == cert.subject,
  }
+
+ def inspect_certificate(self, cert_path: Path) -> dict:
+ """Inspect a certificate (alias for get_cert_info for CLI compatibility)."""
+ return self.get_cert_info(cert_path)
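
Note: the recurring change across these files is the move from naive datetime.utcnow() to timezone-aware timestamps, and from cert.not_valid_before/not_valid_after to their _utc counterparts. A minimal sketch of the same idiom, assuming Python 3.11+ (for datetime.UTC) and a cryptography release that exposes the aware accessors:

import datetime as _datetime
from datetime import datetime, timedelta

# Aware "now" replaces the deprecated naive datetime.utcnow()
not_before = datetime.now(_datetime.UTC)
not_after = not_before + timedelta(days=365)

# On a cryptography x509.Certificate, prefer the aware accessors
#   cert.not_valid_before_utc / cert.not_valid_after_utc
# over the deprecated naive cert.not_valid_before / cert.not_valid_after.
print(not_before.isoformat(), not_after.isoformat())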

caption_flow/utils/checkpoint_tracker.py
@@ -1,13 +1,17 @@
  """Base class for checkpoint tracking with persistent state."""

+ import datetime as _datetime
  import json
  import logging
+ import os
  from abc import ABC, abstractmethod
- from pathlib import Path
- from typing import Dict, Any, Optional
+ from concurrent.futures import ThreadPoolExecutor
  from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Dict, Optional

  logger = logging.getLogger(__name__)
+ logger.setLevel(os.environ.get("CAPTIONFLOW_LOG_LEVEL", "INFO").upper())


  class CheckpointTracker(ABC):
@@ -53,34 +57,52 @@ class CheckpointTracker(ABC):
  def save(self) -> None:
  """Save checkpoint to disk atomically."""
  try:
+ # If a save is already in progress, let it finish.
+ # This prevents race conditions if save() is called rapidly.
+ if hasattr(self, "_save_future") and self._save_future and not self._save_future.done():
+ logger.warning("Previous save still in progress, skipping this save")
+ return  # don't save this time,
+ logger.info("Saving chunk tracker state...")
  # Prepare data with metadata
- data = self._serialize_state()
- data["updated_at"] = datetime.utcnow().isoformat()
+ with self.lock:
+ data = self._serialize_state()
+ data["updated_at"] = datetime.now(_datetime.UTC).isoformat()

  # Write atomically using temp file
  tmp_file = self.checkpoint_path.with_suffix(".tmp")

+ # Use an executor to run the save operation in a background thread.
+ # This makes the save call non-blocking.
+ with ThreadPoolExecutor(max_workers=1) as executor:
+ data_to_save = data.copy()
+ self._save_future = executor.submit(self._write_to_disk, data_to_save, tmp_file)
+ except Exception as e:
+ logger.error(f"Failed to submit save task: {e}", exc_info=True)
+
+ def _write_to_disk(self, data: Dict[str, Any], checkpoint_path: Optional[str] = None) -> None:
+ """Write checkpoint data to disk atomically."""
+ # Create a temporary file in the same directory as the checkpoint
+ tmp_file = (checkpoint_path or self.checkpoint_path).with_suffix(".tmp")
+ logger.debug(f"Checkpoint {tmp_file=}")
+
+ try:
+ # Ensure the parent directory exists
+ self.checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
+
  with open(tmp_file, "w") as f:
  json.dump(data, f, indent=2)

- # Ensure temp file was created
- if not tmp_file.exists():
- raise IOError(f"Failed to create temporary file: {tmp_file}")
-
- # Move atomically
+ # Atomically replace the checkpoint file
  tmp_file.replace(self.checkpoint_path)
-
  logger.debug(f"Saved checkpoint to {self.checkpoint_path}")
-
  except Exception as e:
- # logger.error(f"Error saving checkpoint: {e}", exc_info=True)
- # Try direct write as fallback
- try:
- with open(self.checkpoint_path, "w") as f:
- json.dump(data, f, indent=2)
- # logger.info("Saved checkpoint using fallback direct write")
- except Exception as fallback_error:
- logger.error(f"Fallback save also failed: {fallback_error}")
+ logger.error(f"Failed to save checkpoint atomically: {e}", exc_info=True)
+ # Try to clean up the temp file if it exists
+ if tmp_file.exists():
+ try:
+ tmp_file.unlink()
+ except:
+ pass

  def get_stats(self) -> Dict[str, Any]:
  """Get statistics about tracked items. Override for custom stats."""

caption_flow/utils/chunk_tracker.py
@@ -1,16 +1,19 @@
  """Chunk tracking using CheckpointTracker base class with memory optimization."""

- from collections import defaultdict
+ import datetime as _datetime
  import logging
- from pathlib import Path
- from typing import Set, Dict, List, Optional, Any, Tuple
+ import os
+ from collections import defaultdict
+ from dataclasses import asdict, dataclass, field
  from datetime import datetime, timedelta
- from dataclasses import dataclass, asdict, field
+ from pathlib import Path
+ from threading import Lock
+ from typing import Any, Dict, List, Optional, Set, Tuple

  from .checkpoint_tracker import CheckpointTracker

  logger = logging.getLogger(__name__)
- logger.setLevel(logging.DEBUG)
+ logger.setLevel(os.environ.get("CAPTIONFLOW_LOG_LEVEL", "INFO").upper())


  @dataclass
@@ -31,8 +34,16 @@ class ChunkState:
  assigned_to: Optional[str] = None
  assigned_at: Optional[datetime] = None

+ # Cache for expensive range calculations
+ _cached_merged_ranges: Optional[List[Tuple[int, int]]] = field(default=None, init=False)
+ _cached_unprocessed_ranges: Optional[List[Tuple[int, int]]] = field(default=None, init=False)
+ _cache_invalidated: bool = field(default=True, init=False)
+
  def add_processed_range(self, start: int, end: int):
  """Add a processed range and merge if needed."""
+ # Invalidate cache before modifying ranges
+ self._invalidate_cache()
+
  # Add new range
  self.processed_ranges.append((start, end))

@@ -57,38 +68,98 @@ class ChunkState:

  def mark_completed(self):
  """Mark chunk as completed and clear unnecessary data to save memory."""
+ self._invalidate_cache()
  self.status = "completed"
- self.completed_at = datetime.utcnow()
+ self.completed_at = datetime.now(_datetime.UTC)
  # Clear processed_ranges since we don't need them after completion
- self.processed_ranges = []
- self.assigned_to = None
- self.assigned_at = None
+ # self.processed_ranges = []
+ # self.assigned_to = None
+ # self.assigned_at = None
+
+ def _invalidate_cache(self):
+ """Invalidate cached range calculations."""
+ self._cached_merged_ranges = None
+ self._cached_unprocessed_ranges = None
+ self._cache_invalidated = True
+
+ def _get_merged_ranges(self) -> List[Tuple[int, int]]:
+ """Get merged ranges with caching."""
+ if self._cached_merged_ranges is None:
+ self._cached_merged_ranges = self._merge_ranges(self.processed_ranges)
+ return self._cached_merged_ranges

  def get_unprocessed_ranges(self) -> List[Tuple[int, int]]:
- """Get ranges that haven't been processed yet."""
+ """Get ranges of unprocessed items within the chunk (relative indices)."""
  if self.status == "completed":
  return []

  if not self.processed_ranges:
- logger.info(f"Chunk {self.chunk_id} has no processed ranges, returning full range")
+ if self._cache_invalidated:  # Only log once per invalidation
+ logger.info(f"Chunk {self.chunk_id} has no processed ranges, returning full range")
+ self._cache_invalidated = False
  return [(0, self.chunk_size - 1)]

- unprocessed = []
- current = 0
+ # Use cached result if available
+ if self._cached_unprocessed_ranges is not None:
+ return self._cached_unprocessed_ranges

- logger.info(
- f"Processing {len(self.processed_ranges)} processed ranges for chunk {self.chunk_id}"
- )
- for start, end in self.processed_ranges:
- if current < start:
- unprocessed.append((current, start - 1))
- current = max(current, end + 1)
+ # Calculate and cache unprocessed ranges
+ merged_ranges = self._get_merged_ranges()

- if current < self.chunk_size:
- unprocessed.append((current, self.chunk_size - 1))
+ unprocessed = []
+ current_pos = 0
+
+ for start, end in merged_ranges:
+ if current_pos < start:
+ unprocessed.append((current_pos, start - 1))
+ current_pos = max(current_pos, end + 1)
+
+ # Add any remaining range
+ if current_pos < self.chunk_size:
+ unprocessed.append((current_pos, self.chunk_size - 1))
+
+ # Cache the result
+ self._cached_unprocessed_ranges = unprocessed
+
+ # Log for debugging (only when cache is being computed)
+ if self._cache_invalidated:
+ if not unprocessed:
+ logger.info(
+ f"Chunk {self.chunk_id} has processed ranges {merged_ranges} covering entire chunk size {self.chunk_size}"
+ )
+ else:
+ logger.debug(f"Merged ranges for chunk {self.chunk_id}: {merged_ranges}")
+ total_processed = sum(end - start + 1 for start, end in merged_ranges)
+ total_unprocessed = sum(end - start + 1 for start, end in unprocessed)
+ logger.debug(
+ f"Chunk {self.chunk_id}: {total_processed} processed, {total_unprocessed} unprocessed"
+ )
+ self._cache_invalidated = False

  return unprocessed

+ def _merge_ranges(self, ranges: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
+ """Merge overlapping or adjacent ranges."""
+ if not ranges:
+ return []
+
+ # Sort ranges by start index, ensuring all are tuples
+ sorted_ranges = sorted([tuple(r) for r in ranges])
+ merged = [sorted_ranges[0]]
+
+ for current_start, current_end in sorted_ranges[1:]:
+ last_start, last_end = merged[-1]
+
+ # Check if ranges overlap or are adjacent
+ if current_start <= last_end + 1:
+ # Merge the ranges
+ merged[-1] = (last_start, max(last_end, current_end))
+ else:
+ # Add as new range
+ merged.append((current_start, current_end))
+
+ return merged
+
  def to_dict(self):
  """Convert to dictionary for JSON serialization."""
  d = asdict(self)
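
Note: _merge_ranges and the cached get_unprocessed_ranges are standard interval bookkeeping: sort the processed ranges, coalesce overlapping or adjacent ones, then walk the merged list to find the gaps. A standalone sketch of the same arithmetic (inclusive ranges, hypothetical chunk size of 10):

from typing import List, Tuple


def merge_ranges(ranges: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
    """Coalesce overlapping or adjacent inclusive ranges."""
    merged: List[Tuple[int, int]] = []
    for start, end in sorted(ranges):
        if merged and start <= merged[-1][1] + 1:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged


def unprocessed_ranges(processed: List[Tuple[int, int]], size: int) -> List[Tuple[int, int]]:
    """Gaps not covered by the processed ranges, as inclusive (start, end) pairs."""
    gaps, cursor = [], 0
    for start, end in merge_ranges(processed):
        if cursor < start:
            gaps.append((cursor, start - 1))
        cursor = max(cursor, end + 1)
    if cursor < size:
        gaps.append((cursor, size - 1))
    return gaps


print(merge_ranges([(0, 2), (3, 5), (8, 9)]))             # [(0, 5), (8, 9)]
print(unprocessed_ranges([(0, 2), (3, 5), (8, 9)], 10))   # [(6, 7)]
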
@@ -108,6 +179,10 @@ class ChunkState:
  # Ensure processed_ranges exists
  d.setdefault("processed_ranges", [])
  d.setdefault("processed_count", 0)
+ # Remove cache fields from dict if they exist (shouldn't be serialized)
+ d.pop("_cached_merged_ranges", None)
+ d.pop("_cached_unprocessed_ranges", None)
+ d.pop("_cache_invalidated", None)
  return cls(**d)


@@ -119,11 +194,22 @@ class ChunkTracker(CheckpointTracker):
  checkpoint_file: Path,
  max_completed_chunks_in_memory: int = 1000,
  archive_after_hours: int = 24,
+ save_batch_size: int = 10,
+ auto_save_interval: int = 60,
  ):
  self.chunks: Dict[str, ChunkState] = {}
  self.max_completed_chunks_in_memory = max_completed_chunks_in_memory
  self.archive_after_hours = archive_after_hours
  self._completed_count = 0  # Track count without storing all IDs
+ self.lock = Lock()
+
+ # Batching mechanism
+ self._dirty = False
+ self._pending_changes = 0
+ self._save_batch_size = save_batch_size
+ self._auto_save_interval = auto_save_interval
+ self._last_save = datetime.now(_datetime.UTC)
+
  super().__init__(checkpoint_file)

  def _get_default_state(self) -> Dict[str, Any]:
@@ -139,7 +225,8 @@ class ChunkTracker(CheckpointTracker):
  completed_chunks = 0
  for chunk_id, chunk_data in data.get("chunks", {}).items():
  chunk_state = ChunkState.from_dict(chunk_data)
- self.chunks[chunk_id] = chunk_state
+ with self.lock:
+ self.chunks[chunk_id] = chunk_state
  if chunk_state.status == "completed":
  completed_chunks += 1

@@ -156,12 +243,47 @@ class ChunkTracker(CheckpointTracker):
  "completed_count": self._completed_count,
  }

+ def _mark_dirty(self):
+ """Mark tracker as having pending changes."""
+ self._dirty = True
+ self._pending_changes += 1
+
+ # Auto-save based on batch size or time interval
+ now = datetime.now(_datetime.UTC)
+ time_since_last_save = (now - self._last_save).total_seconds()
+
+ if (
+ self._pending_changes >= self._save_batch_size
+ or time_since_last_save >= self._auto_save_interval
+ ):
+ self._do_save()
+
+ def _do_save(self) -> bool:
+ """Internal method to perform the actual save."""
+ super().save()  # Parent method returns None but triggers save
+ # Reset dirty state since save was initiated successfully
+ self._dirty = False
+ self._pending_changes = 0
+ self._last_save = datetime.now(_datetime.UTC)
+ return True
+
+ def save(self, force: bool = False) -> bool:
+ """Save state to checkpoint file, with batching optimization."""
+ if not force and not self._dirty:
+ return False
+ return self._do_save()
+
+ def flush(self):
+ """Force save any pending changes."""
+ if self._dirty:
+ self._do_save()
+
  def _archive_old_completed_chunks(self):
  """Remove old completed chunks from memory to prevent unbounded growth."""
  if not self.archive_after_hours:
  return

- cutoff_time = datetime.utcnow() - timedelta(hours=self.archive_after_hours)
+ cutoff_time = datetime.now(_datetime.UTC) - timedelta(hours=self.archive_after_hours)
  chunks_to_remove = []

  for chunk_id, chunk in self.chunks.items():
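
Note: _mark_dirty turns most per-chunk updates into cheap bookkeeping; an actual save only happens once save_batch_size changes accumulate or auto_save_interval seconds elapse, and flush() forces out whatever is pending. A minimal sketch of that batching policy, assuming Python 3.11+ for datetime.UTC (names are illustrative):

import datetime as _datetime
from datetime import datetime


class BatchedSaver:
    """Sketch: coalesce many small state changes into occasional checkpoint writes."""

    def __init__(self, save_batch_size: int = 10, auto_save_interval: int = 60):
        self._dirty = False
        self._pending = 0
        self._batch_size = save_batch_size
        self._interval = auto_save_interval
        self._last_save = datetime.now(_datetime.UTC)

    def mark_dirty(self) -> None:
        self._dirty = True
        self._pending += 1
        elapsed = (datetime.now(_datetime.UTC) - self._last_save).total_seconds()
        if self._pending >= self._batch_size or elapsed >= self._interval:
            self.flush()

    def flush(self) -> None:
        if not self._dirty:
            return
        self._write()  # whatever actually persists the state
        self._dirty = False
        self._pending = 0
        self._last_save = datetime.now(_datetime.UTC)

    def _write(self) -> None:
        print("checkpoint written")


saver = BatchedSaver(save_batch_size=3)
for _ in range(7):
    saver.mark_dirty()   # writes after the 3rd and 6th change
saver.flush()            # writes the remaining pending change
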
@@ -176,7 +298,7 @@ class ChunkTracker(CheckpointTracker):
  for chunk_id in chunks_to_remove:
  del self.chunks[chunk_id]
  logger.info(f"Archived {len(chunks_to_remove)} old completed chunks from memory")
- self.save()
+ self._mark_dirty()

  def _limit_completed_chunks_in_memory(self):
  """Keep only the most recent completed chunks in memory."""
@@ -194,7 +316,7 @@ class ChunkTracker(CheckpointTracker):
  del self.chunks[chunk_id]

  logger.info(f"Removed {to_remove} oldest completed chunks from memory")
- self.save()
+ self._mark_dirty()

  def add_chunk(
  self, chunk_id: str, shard_name: str, shard_url: str, start_index: int, chunk_size: int
@@ -214,7 +336,7 @@ class ChunkTracker(CheckpointTracker):
  chunk_size=chunk_size,
  status="pending",
  )
- self.save()
+ self._mark_dirty()

  # Periodically clean up old chunks
  if len(self.chunks) % 100 == 0:
@@ -229,8 +351,8 @@ class ChunkTracker(CheckpointTracker):
  chunk = self.chunks[chunk_id]
  chunk.status = "assigned"
  chunk.assigned_to = worker_id
- chunk.assigned_at = datetime.utcnow()
- self.save()
+ chunk.assigned_at = datetime.now(_datetime.UTC)
+ self._mark_dirty()

  def mark_completed(self, chunk_id: str):
  """Mark chunk as completed."""
@@ -240,7 +362,7 @@ class ChunkTracker(CheckpointTracker):
  chunk.mark_completed()  # This clears processed_ranges
  if not was_completed:
  self._completed_count += 1
- self.save()
+ self._mark_dirty()
  logger.debug(f"Chunk {chunk_id} marked as completed")

  # Check if we need to clean up
@@ -254,7 +376,7 @@ class ChunkTracker(CheckpointTracker):
  chunk.status = "pending"  # Reset to pending for retry
  chunk.assigned_to = None
  chunk.assigned_at = None
- self.save()
+ self._mark_dirty()

  def mark_pending(self, chunk_id: str):
  """Mark chunk as pending (for manual reset)."""
@@ -265,7 +387,7 @@ class ChunkTracker(CheckpointTracker):
  chunk.status = "pending"
  chunk.assigned_to = None
  chunk.assigned_at = None
- self.save()
+ self._mark_dirty()

  def release_worker_chunks(self, worker_id: str):
  """Release all chunks assigned to a worker."""
@@ -276,7 +398,8 @@ class ChunkTracker(CheckpointTracker):
  chunk.assigned_to = None
  chunk.assigned_at = None
  released_chunks.append(chunk_id)
- self.save()
+ if released_chunks:
+ self._mark_dirty()
  return released_chunks

  def get_pending_chunks(self, shard_name: Optional[str] = None) -> List[str]:
@@ -330,7 +453,7 @@ class ChunkTracker(CheckpointTracker):
  """Get summary of all shards and their chunk status."""
  shards = {}

- for chunk_id, chunk_state in self.chunks.items():
+ for _chunk_id, chunk_state in self.chunks.items():
  shard_name = chunk_state.shard_name
  if shard_name not in shards:
  shards[shard_name] = {
@@ -340,9 +463,11 @@ class ChunkTracker(CheckpointTracker):
  "assigned_chunks": 0,
  "failed_chunks": 0,
  "is_complete": True,
+ "chunks": [],
  }

  shards[shard_name]["total_chunks"] += 1
+ shards[shard_name]["chunks"].append(chunk_state)

  if chunk_state.status == "completed":
  shards[shard_name]["completed_chunks"] += 1
@@ -361,7 +486,7 @@ class ChunkTracker(CheckpointTracker):
  def get_incomplete_shards(self) -> Set[str]:
  """Get set of shard names that have incomplete chunks."""
  incomplete = set()
- for chunk_id, chunk_state in self.chunks.items():
+ for _chunk_id, chunk_state in self.chunks.items():
  if chunk_state.status != "completed":
  incomplete.add(chunk_state.shard_name)
  return incomplete
@@ -373,22 +498,21 @@ class ChunkTracker(CheckpointTracker):
  if not storage_manager.captions_path.exists():
  return

- import pyarrow as pa
- import pyarrow.parquet as pq
+ import lance

  # Check if item_index column exists
- table_metadata = pq.read_metadata(storage_manager.captions_path)
+ table_metadata = lance.dataset(storage_manager.captions_path).schema
  columns = ["job_id", "chunk_id", "item_key"]
- if "item_index" in table_metadata.schema.names:
+ if "item_index" in table_metadata.names:
  columns.append("item_index")

  # Process in batches to avoid loading entire table
  batch_size = 10000
- parquet_file = pq.ParquetFile(storage_manager.captions_path)
+ lance_dataset = lance.dataset(storage_manager.captions_path)

  chunk_indices = defaultdict(set)

- for batch in parquet_file.iter_batches(batch_size=batch_size, columns=columns):
+ for batch in lance_dataset.to_batches(batch_size=batch_size, columns=columns):
  batch_dict = batch.to_pydict()

  for i in range(len(batch_dict["chunk_id"])):
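
Note: storage scanning now goes through Lance instead of Parquet; the dataset schema replaces pq.read_metadata, and to_batches(batch_size=..., columns=...) replaces ParquetFile.iter_batches. A rough sketch of the read pattern, assuming the pylance package and a throwaway dataset (lance.write_dataset and the column layout here are assumptions for the example, not the package's real schema):

import lance
import pyarrow as pa

# Toy stand-in for the captions dataset
table = pa.table(
    {
        "job_id": ["j1", "j1", "j2"],
        "chunk_id": ["c0", "c0", "c1"],
        "item_key": ["a", "b", "c"],
        "item_index": [0, 1, 57],
    }
)
lance.write_dataset(table, "/tmp/captions_example.lance", mode="overwrite")

ds = lance.dataset("/tmp/captions_example.lance")
columns = ["job_id", "chunk_id", "item_key"]
if "item_index" in ds.schema.names:  # same column check as the diff above
    columns.append("item_index")

for batch in ds.to_batches(batch_size=10000, columns=columns):
    rows = batch.to_pydict()
    print(rows["chunk_id"], rows.get("item_index"))
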
@@ -453,11 +577,12 @@ class ChunkTracker(CheckpointTracker):
  self._process_chunk_indices(chunk_indices)

  logger.info("Sync with storage completed")
- self.save()
+ self._mark_dirty()

  def _process_chunk_indices(self, chunk_indices: Dict[str, Set[int]]):
  """Process a batch of chunk indices."""
  for chunk_id, abs_indices in chunk_indices.items():
+ logger.debug(f"Processing indices: {abs_indices} for chunk {chunk_id}")
  if chunk_id not in self.chunks:
  continue

@@ -494,39 +619,49 @@ class ChunkTracker(CheckpointTracker):
  for start_idx, end_idx in ranges:
  chunk.add_processed_range(start_idx, end_idx)

- def mark_items_processed(self, chunk_id: str, start_idx: int, end_idx: int):
- """Mark a range of items as processed within a chunk (expects ABSOLUTE indices)."""
+ def mark_items_processed(self, chunk_id: str, start_idx: int, end_idx: int) -> None:
+ """Mark a range of items as processed within a chunk."""
  if chunk_id not in self.chunks:
- logger.error(f"Unknown chunk: {chunk_id}")
+ logger.warning(f"Chunk {chunk_id} not found in tracker")
  return

- chunk = self.chunks[chunk_id]
+ chunk_state = self.chunks[chunk_id]

- # Convert absolute indices to chunk-relative
- relative_start = start_idx - chunk.start_index
- relative_end = end_idx - chunk.start_index
+ # Convert absolute indices to chunk-relative indices
+ relative_start = start_idx - chunk_state.start_index
+ relative_end = end_idx - chunk_state.start_index

- # Validate boundaries
- if relative_start < 0 or relative_end >= chunk.chunk_size:
- logger.error(
- f"Invalid indices for chunk {chunk_id}: "
- f"absolute {start_idx}-{end_idx} (relative {relative_start}-{relative_end}) "
- f"outside chunk bounds [{chunk.start_index}, {chunk.start_index + chunk.chunk_size - 1}]"
+ # Ensure indices are within chunk bounds and maintain valid range
+ relative_start = max(0, relative_start)
+ relative_end = min(chunk_state.chunk_size - 1, relative_end)
+
+ # Skip invalid ranges where start > end
+ if relative_start > relative_end:
+ logger.warning(
+ f"Invalid range for chunk {chunk_id}: start={relative_start}, end={relative_end}, skipping"
  )
  return

- # Add the relative range
- chunk.add_processed_range(relative_start, relative_end)
+ # Invalidate cache before modifying ranges
+ chunk_state._invalidate_cache()

- # If chunk is now complete, increment counter
- if chunk.status == "completed":
- self._completed_count += 1
+ # Add to processed ranges
+ chunk_state.processed_ranges.append((relative_start, relative_end))

- self.save()
- logger.debug(
- f"Marked items {start_idx}-{end_idx} as processed in chunk {chunk_id} "
- f"(relative indices: {relative_start}-{relative_end})"
- )
+ # Merge overlapping ranges
+ chunk_state.processed_ranges = chunk_state._merge_ranges(chunk_state.processed_ranges)
+
+ # logger.debug(
+ #     f"Marked items {start_idx}-{end_idx} as processed in chunk {chunk_id} (relative indices: {relative_start}-{relative_end})"
+ # )
+
+ # Check if chunk is now complete
+ if chunk_state.get_unprocessed_ranges() == []:
+ logger.info(f"Chunk {chunk_id} is now complete")
+ chunk_state.status = "completed"
+
+ # Mark as dirty, will be saved based on batching logic
+ self._mark_dirty()

  def get_chunk_with_unprocessed_items(self, chunk_id: str) -> Optional[Dict[str, Any]]:
  """Get chunk info with unprocessed item ranges."""