PyPI - caption-flow - Versions diffs - 0.3.1__tar.gz → 0.3.3__tar.gz - Mend

caption-flow 0.3.1 (tar.gz) → 0.3.3 (tar.gz)

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (38)

{caption_flow-0.3.1/src/caption_flow.egg-info → caption_flow-0.3.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: caption-flow
-Version: 0.3.1
+Version: 0.3.3
 Summary: Self-contained distributed community captioning system
 Author-email: bghira <bghira@users.github.com>
 License: MIT

{caption_flow-0.3.1 → caption_flow-0.3.3}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "caption-flow"
-version = "0.3.1"
+version = "0.3.3"
 description = "Self-contained distributed community captioning system"
 readme = "README.md"
 requires-python = ">=3.10,<3.13"

{caption_flow-0.3.1 → caption_flow-0.3.3}/src/caption_flow/__init__.py RENAMED Viewed

@@ -1,6 +1,6 @@
 """CaptionFlow - Distributed community captioning system."""
-__version__ = "0.3.1"
+__version__ = "0.3.3"
 from .orchestrator import Orchestrator
 from .workers.data import DataWorker

{caption_flow-0.3.1 → caption_flow-0.3.3}/src/caption_flow/orchestrator.py RENAMED Viewed

@@ -124,13 +124,14 @@ class Orchestrator:
         # Initialize storage
         await self.storage.initialize()
-        await self.update_unprocessed_ranges()
         # Start background tasks
         asyncio.create_task(self._heartbeat_loop())
         asyncio.create_task(self._checkpoint_loop())
         asyncio.create_task(self._stats_update_loop())
+        await self.update_unprocessed_ranges()
         # Start WebSocket server
         websocket_logger = logging.getLogger("websockets")
         websocket_logger.setLevel(logging.WARNING)

{caption_flow-0.3.1 → caption_flow-0.3.3}/src/caption_flow/processors/base.py RENAMED Viewed

@@ -14,6 +14,7 @@ class WorkUnit:
     unit_id: str  # usually, but not always, the chunk id
     chunk_id: str  # always the chunk id
     source_id: str  # the shard name
+    unit_size: int  # how many elements are in the workunit
     data: Dict[str, Any]
     metadata: Dict[str, Any] = field(default_factory=dict)
     priority: int = 0
@@ -44,6 +45,7 @@ class WorkAssignment:
                     "unit_id": u.unit_id,
                     "source_id": u.source_id,
                     "chunk_id": u.chunk_id,
+                    "unit_size": u.unit_size,
                     "data": u.data,
                     "metadata": u.metadata,
                     "priority": u.priority,
@@ -62,6 +64,7 @@ class WorkAssignment:
                 unit_id=u["unit_id"],
                 chunk_id=u["chunk_id"],
                 source_id=u["source_id"],
+                unit_size=u["unit_size"],
                 data=u["data"],
                 metadata=u.get("metadata", {}),
                 priority=u.get("priority", 0),

{caption_flow-0.3.1 → caption_flow-0.3.3}/src/caption_flow/processors/huggingface.py RENAMED Viewed

@@ -425,6 +425,7 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
             unit_id=unit_id,
             chunk_id=unit_id,
             source_id=shard_name,
+            unit_size=chunk_size,
             data={
                 "dataset_name": self.dataset_name,
                 "config": self.config,

{caption_flow-0.3.1 → caption_flow-0.3.3}/src/caption_flow/processors/local_filesystem.py RENAMED Viewed

@@ -251,6 +251,7 @@ class LocalFilesystemOrchestratorProcessor(OrchestratorProcessor):
                         unit_id=chunk_id,
                         chunk_id=chunk_id,
                         source_id="local",
+                        unit_size=chunk_state.chunk_size,
                         data={
                             "start_index": chunk_state.start_index,
                             "chunk_size": chunk_state.chunk_size,
@@ -319,6 +320,7 @@ class LocalFilesystemOrchestratorProcessor(OrchestratorProcessor):
                         unit_id=unit_id,
                         chunk_id=unit_id,
                         source_id="local",
+                        unit_size=chunk_size,
                         data={
                             "start_index": self.current_index,
                             "chunk_size": chunk_size,

{caption_flow-0.3.1 → caption_flow-0.3.3}/src/caption_flow/processors/webdataset.py RENAMED Viewed

@@ -12,6 +12,7 @@ from datetime import datetime
 from PIL import Image
 import io
+from caption_flow.models import JobId
 from caption_flow.storage import StorageManager
 from .base import OrchestratorProcessor, WorkerProcessor, ProcessorConfig, WorkUnit, WorkResult
 from ..utils import ChunkTracker
@@ -21,6 +22,7 @@ import cv2
 import numpy as np
 logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
 class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
@@ -108,52 +110,86 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
         return self.shard_info_cache[shard_idx]
     def _restore_state(self, storage: StorageManager) -> None:
-        """Restore state from chunk tracker."""
-        logger.debug("Restoring state from chunk tracker")
+        """Restore state from chunk tracker and synchronize with storage."""
+        logger.info("Restoring state from chunk tracker and synchronizing with storage")
         if not self.chunk_tracker:
             return
+        # First, update chunk tracker from storage
+        processed_job_ids = storage.get_all_processed_job_ids()
+        if processed_job_ids:
+            logger.info(
+                f"Synchronizing chunk tracker with {len(processed_job_ids)} processed items from storage"
+            )
+            self.update_from_storage(processed_job_ids)
+        # Then restore work units from chunk tracker
         shards_summary = self.chunk_tracker.get_shards_summary()
+        logger.info(f"Restoring work units from chunk tracker: {len(shards_summary)} shards")
         with self.lock:
+            restored_count = 0
             for shard_name, shard_info in shards_summary.items():
                 chunks = shard_info.get("chunks", [])
                 for chunk_state in chunks:
                     # Only add incomplete chunks
-                    if chunk_state.status != "completed":
-                        logger.debug(f"Restoring incomplete chunk {chunk_state.chunk_id}")
+                    if chunk_state.status == "completed":
+                        logger.debug(f"Skipping completed chunk {chunk_state.chunk_id}")
+                        continue
-                        # Get unprocessed ranges
-                        unprocessed_ranges = chunk_state.get_unprocessed_ranges()
-                        if not unprocessed_ranges:
-                            continue
+                    # Get unprocessed ranges
+                    unprocessed_ranges = chunk_state.get_unprocessed_ranges()
+                    if not unprocessed_ranges:
+                        logger.debug(
+                            f"Chunk {chunk_state.chunk_id} has no unprocessed ranges, marking as completed"
+                        )
+                        self.chunk_tracker.mark_completed(chunk_state.chunk_id)
+                        continue
-                        # Convert relative ranges to absolute file indices
-                        absolute_ranges = []
-                        for start, end in unprocessed_ranges:
-                            abs_start = chunk_state.start_index + start
-                            abs_end = chunk_state.start_index + end
-                            absolute_ranges.append((abs_start, abs_end))
+                    logger.info(
+                        f"Restoring chunk {chunk_state.chunk_id} with unprocessed ranges: {unprocessed_ranges}"
+                    )
-                        unit = WorkUnit(
-                            unit_id=chunk_state.chunk_id,
-                            chunk_id=chunk_state.chunk_id,
-                            source_id=shard_name,
-                            data={
-                                "shard_url": chunk_state.shard_url,
-                                "shard_name": shard_name,
-                                "start_index": chunk_state.start_index,
-                                "chunk_size": chunk_state.chunk_size,
-                                "unprocessed_ranges": absolute_ranges,
-                            },
-                            metadata={
-                                "shard_name": shard_name,
-                                "chunk_index": chunk_state.start_index // self.chunk_size,
-                            },
-                        )
+                    # Convert relative ranges to absolute file indices
+                    absolute_ranges = []
+                    for start, end in unprocessed_ranges:
+                        abs_start = chunk_state.start_index + start
+                        abs_end = chunk_state.start_index + end
+                        absolute_ranges.append((abs_start, abs_end))
+                    # Get shard index if available
+                    shard_idx = None
+                    if self.dataset:
+                        for idx in range(self.dataset.num_shards):
+                            shard_info = self._get_shard_info_cached(idx)
+                            if shard_info and shard_info["name"] == shard_name:
+                                shard_idx = idx
+                                break
+                    unit = WorkUnit(
+                        unit_id=chunk_state.chunk_id,
+                        chunk_id=chunk_state.chunk_id,
+                        source_id=shard_name,
+                        unit_size=chunk_state.chunk_size,
+                        data={
+                            "shard_url": chunk_state.shard_url,
+                            "shard_name": shard_name,
+                            "shard_idx": shard_idx,
+                            "start_index": chunk_state.start_index,
+                            "chunk_size": chunk_state.chunk_size,
+                            "unprocessed_ranges": absolute_ranges,
+                        },
+                        metadata={
+                            "shard_name": shard_name,
+                            "chunk_index": chunk_state.start_index // self.chunk_size,
+                        },
+                    )
-                        self.work_units[unit.unit_id] = unit
-                        self.pending_units.append(unit.unit_id)
+                    self.work_units[unit.unit_id] = unit
+                    self.pending_units.append(unit.unit_id)
+                    restored_count += 1
+            logger.info(f"Restored {restored_count} incomplete work units")
     def _create_units_background(self) -> None:
         """Background thread to create work units on demand."""
@@ -201,7 +237,13 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
                 # Create chunk for current position
                 chunk_size = min(self.chunk_size, shard_files - current_file_idx)
-                chunk_id = f"{shard_name}:chunk:{current_file_idx // self.chunk_size}"
+                self.current_chunk_index = current_file_idx // self.chunk_size
+                job_id_obj = JobId(
+                    shard_id=shard_name,
+                    chunk_id=self.current_chunk_index,
+                    sample_id=current_file_idx,
+                )
+                chunk_id = job_id_obj.get_chunk_str()
                 with self.lock:
                     # Skip if already exists
@@ -224,6 +266,7 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
                         unit_id=chunk_id,
                         chunk_id=chunk_id,
                         source_id=shard_name,
+                        unit_size=chunk_size,
                         data={
                             "shard_url": shard_url,
                             "shard_name": shard_name,
@@ -268,6 +311,25 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
                 unit = self.work_units.get(unit_id)
                 if unit:
+                    # Update unprocessed ranges from chunk tracker before assigning
+                    if self.chunk_tracker and unit_id in self.chunk_tracker.chunks:
+                        chunk_state = self.chunk_tracker.chunks[unit_id]
+                        relative_unprocessed = chunk_state.get_unprocessed_ranges()
+                        # Convert relative to absolute indices
+                        absolute_ranges = []
+                        for start, end in relative_unprocessed:
+                            abs_start = chunk_state.start_index + start
+                            abs_end = chunk_state.start_index + end
+                            absolute_ranges.append((abs_start, abs_end))
+                        # Update the work unit's unprocessed ranges
+                        unit.data["unprocessed_ranges"] = absolute_ranges
+                        logger.debug(
+                            f"Updated unit {unit_id} with unprocessed ranges: {absolute_ranges}"
+                        )
                     self.assigned_units[worker_id].add(unit_id)
                     assigned.append(unit)
@@ -373,26 +435,72 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
             # Group by chunk
             processed_by_chunk = defaultdict(set)
-            for job_id in processed_job_ids:
-                # Parse job_id to extract chunk and index
-                # Expected format: "shard:chunk:X:idx:Y"
-                parts = job_id.split(":")
-                if len(parts) >= 5 and parts[3] == "idx":
-                    chunk_id = ":".join(parts[:3])  # "shard:chunk:X"
-                    try:
-                        idx = int(parts[4])
-                        processed_by_chunk[chunk_id].add(idx)
-                    except ValueError:
-                        continue
+            for job_id_str in processed_job_ids:
+                try:
+                    # Use JobId to parse the job ID string
+                    job_id = JobId.from_str(job_id_str)
+                    chunk_id = job_id.get_chunk_str()
+                    sample_idx = int(job_id.sample_id)
+                    processed_by_chunk[chunk_id].add(sample_idx)
+                except ValueError as e:
+                    logger.warning(f"Invalid job ID format: {job_id_str} - {e}")
+                    continue
             # Update chunk tracker with processed items
             if self.chunk_tracker:
                 for chunk_id, indices in processed_by_chunk.items():
                     if indices:
+                        # Get or create chunk state
+                        chunk_state = self.chunk_tracker.chunks.get(chunk_id)
+                        if not chunk_state:
+                            # Parse chunk_id using JobId to get shard info
+                            try:
+                                # chunk_id format: "shard_id:chunk:chunk_idx"
+                                parts = chunk_id.split(":")
+                                if len(parts) >= 3:
+                                    shard_name = parts[0]
+                                    chunk_idx = int(parts[2])
+                                    # Infer start index from chunk index and size
+                                    start_index = chunk_idx * self.chunk_size
+                                    # Create chunk state
+                                    self.chunk_tracker.add_chunk(
+                                        chunk_id,
+                                        shard_name,
+                                        f"{shard_name}.tar",
+                                        start_index,
+                                        self.chunk_size,
+                                    )
+                                    logger.info(f"Created missing chunk state for {chunk_id}")
+                            except (ValueError, IndexError) as e:
+                                logger.error(f"Failed to create chunk state for {chunk_id}: {e}")
+                                continue
                         # Sort indices and convert to ranges
                         sorted_indices = sorted(indices)
-                        for idx in sorted_indices:
-                            self.chunk_tracker.mark_items_processed(chunk_id, idx, idx)
+                        if not sorted_indices:
+                            continue
+                        # Condense into contiguous ranges
+                        ranges = []
+                        start_range = sorted_indices[0]
+                        end_range = sorted_indices[0]
+                        for i in range(1, len(sorted_indices)):
+                            if sorted_indices[i] == end_range + 1:
+                                end_range = sorted_indices[i]
+                            else:
+                                ranges.append((start_range, end_range))
+                                start_range = sorted_indices[i]
+                                end_range = sorted_indices[i]
+                        ranges.append((start_range, end_range))
+                        # Mark each contiguous range as processed
+                        logger.info(f"Marking ranges {ranges} as processed in chunk {chunk_id}")
+                        for start_idx, end_idx in ranges:
+                            self.chunk_tracker.mark_items_processed(chunk_id, start_idx, end_idx)
+                # Save checkpoint after updating
+                self.chunk_tracker.save()
     def get_stats(self) -> Dict[str, Any]:
         """Get processor statistics."""
@@ -488,7 +596,7 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
     def process_unit(self, unit: WorkUnit, context: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
         """Process a work unit by iterating specified ranges."""
-        logger.debug(f"Processing unit: {unit.unit_id}")
+        logger.debug(f"Processing unit: {unit}")
         shard_name = unit.data["shard_name"]
         shard_idx = unit.data.get("shard_idx")
@@ -502,7 +610,11 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
             # Generate mock results for unprocessed ranges
             for start_idx, end_idx in unprocessed_ranges:
                 for idx in range(start_idx, end_idx + 1):
-                    job_id = f"{shard_name}:chunk:{chunk_index}:idx:{idx}"
+                    # Use JobId to create consistent job ID
+                    job_id = JobId.from_values(
+                        shard_id=shard_name, chunk_id=str(chunk_index), sample_id=str(idx)
+                    )
+                    job_id_str = job_id.get_sample_str()
                     yield {
                         "image": self._create_mock_image(idx),
@@ -512,10 +624,11 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
                         "metadata": {
                             "_item_index": idx,
                             "_chunk_relative_index": idx - unit.data["start_index"],
-                            "_job_id": job_id,
+                            "_job_id": job_id_str,
                             "_mock": True,
+                            "_processed_indices": processed_indices,
                         },
-                        "job_id": job_id,
+                        "job_id": job_id_str,
                     }
                     processed_indices.append(idx)
@@ -560,8 +673,11 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
                                         f"Error decoding image {entry.path} with cv2: {img_e}"
                                     )
-                            # Generate job ID compatible with chunk tracker
-                            job_id = f"{shard_name}:chunk:{chunk_index}:idx:{idx}"
+                            # Generate job ID using JobId class
+                            job_id = JobId.from_values(
+                                shard_id=shard_name, chunk_id=str(chunk_index), sample_id=str(idx)
+                            )
+                            job_id_str = job_id.get_sample_str()
                             yield {
                                 "image": image,
@@ -571,11 +687,12 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
                                 "metadata": {
                                     "_item_index": idx,
                                     "_chunk_relative_index": idx - unit.data["start_index"],
-                                    "_job_id": job_id,
+                                    "_job_id": job_id_str,
                                     "_filename": entry.path,
                                     "_file_size": entry.size,
+                                    "_processed_indices": processed_indices,
                                 },
-                                "job_id": job_id,
+                                "job_id": job_id_str,
                             }
                             processed_indices.append(idx)
@@ -605,8 +722,8 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
         result = super().prepare_result(unit, outputs, processing_time_ms)
         # Add processed indices for chunk tracker
-        if outputs and "_processed_indices" in outputs[0].get("metadata", {}):
-            result.metadata["item_indices"] = outputs[0]["metadata"]["_processed_indices"]
+        if hasattr(self, "_last_context") and "_processed_indices" in self._last_context:
+            result.metadata["item_indices"] = self._last_context["_processed_indices"]
         return result

{caption_flow-0.3.1 → caption_flow-0.3.3}/src/caption_flow/utils/chunk_tracker.py RENAMED Viewed

@@ -10,6 +10,7 @@ from dataclasses import dataclass, asdict, field
 from .checkpoint_tracker import CheckpointTracker
 logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
 @dataclass

{caption_flow-0.3.1 → caption_flow-0.3.3}/src/caption_flow/workers/caption.py RENAMED Viewed

@@ -565,7 +565,8 @@ class CaptionWorker(BaseWorker):
         batch = []
         batch_size = self.vllm_config.get("batch_size", 8)
         context = {}
+        self.items_processed = 0
+        self.items_failed = 0
         # Collect items for batching
         for item_data in self.processor.process_unit(unit, context):
             if self.should_stop_processing.is_set() or not self.connected.is_set():
@@ -604,16 +605,33 @@ class CaptionWorker(BaseWorker):
             self._process_batch(batch)
         # Notify orchestrator that unit is complete
-        if self.connected.is_set() and self.websocket:
-            try:
-                asyncio.run_coroutine_threadsafe(
-                    self.websocket.send(
-                        json.dumps({"type": "work_complete", "unit_id": unit.unit_id})
-                    ),
-                    self.main_loop,
-                ).result(timeout=5)
-            except Exception as e:
-                logger.warning(f"Could not notify work complete: {e}")
+        # Check if the number of processed items matches the expected count for the unit.
+        # The context dictionary holds the count of items yielded by the processor.
+        total_items_in_unit = unit.unit_size
+        if (
+            not self.should_stop_processing.is_set()
+            and self.connected.is_set()
+            and self.items_failed == 0
+            and self.items_processed >= total_items_in_unit
+        ):
+            if self.websocket:
+                try:
+                    asyncio.run_coroutine_threadsafe(
+                        self.websocket.send(
+                            json.dumps({"type": "work_complete", "unit_id": unit.unit_id})
+                        ),
+                        self.main_loop,
+                    ).result(timeout=5)
+                    logger.info(
+                        f"Unit {unit.unit_id} fully processed ({self.items_processed}/{total_items_in_unit}) and marked complete."
+                    )
+                except Exception as e:
+                    logger.warning(f"Could not notify work complete for unit {unit.unit_id}: {e}")
+        else:
+            logger.warning(
+                f"Processing of unit {unit.unit_id} was incomplete ({self.items_processed}/{total_items_in_unit}). Not marking as complete."
+            )
     def _process_batch(self, batch: List[ProcessingItem]):
         """Process a batch of items through all stages."""

{caption_flow-0.3.1 → caption_flow-0.3.3/src/caption_flow.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: caption-flow
-Version: 0.3.1
+Version: 0.3.3
 Summary: Self-contained distributed community captioning system
 Author-email: bghira <bghira@users.github.com>
 License: MIT