caption-flow 0.3.1.tar.gz → 0.3.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {caption_flow-0.3.1/src/caption_flow.egg-info → caption_flow-0.3.2}/PKG-INFO +1 -1
  2. {caption_flow-0.3.1 → caption_flow-0.3.2}/pyproject.toml +1 -1
  3. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/__init__.py +1 -1
  4. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/processors/base.py +3 -0
  5. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/processors/huggingface.py +1 -0
  6. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/processors/local_filesystem.py +2 -0
  7. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/processors/webdataset.py +62 -7
  8. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/chunk_tracker.py +1 -0
  9. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/workers/caption.py +29 -11
  10. {caption_flow-0.3.1 → caption_flow-0.3.2/src/caption_flow.egg-info}/PKG-INFO +1 -1
  11. {caption_flow-0.3.1 → caption_flow-0.3.2}/LICENSE +0 -0
  12. {caption_flow-0.3.1 → caption_flow-0.3.2}/README.md +0 -0
  13. {caption_flow-0.3.1 → caption_flow-0.3.2}/setup.cfg +0 -0
  14. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/cli.py +0 -0
  15. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/models.py +0 -0
  16. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/monitor.py +0 -0
  17. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/orchestrator.py +0 -0
  18. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/processors/__init__.py +0 -0
  19. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/storage/__init__.py +0 -0
  20. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/storage/exporter.py +0 -0
  21. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/storage/manager.py +0 -0
  22. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/__init__.py +0 -0
  23. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/auth.py +0 -0
  24. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/caption_utils.py +0 -0
  25. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/certificates.py +0 -0
  26. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/checkpoint_tracker.py +0 -0
  27. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/image_processor.py +0 -0
  28. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/json_utils.py +0 -0
  29. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/prompt_template.py +0 -0
  30. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/vllm_config.py +0 -0
  31. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/viewer.py +0 -0
  32. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/workers/base.py +0 -0
  33. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/workers/data.py +0 -0
  34. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow.egg-info/SOURCES.txt +0 -0
  35. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow.egg-info/dependency_links.txt +0 -0
  36. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow.egg-info/entry_points.txt +0 -0
  37. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow.egg-info/requires.txt +0 -0
  38. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow.egg-info/top_level.txt +0 -0
caption_flow-0.3.2/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: caption-flow
- Version: 0.3.1
+ Version: 0.3.2
  Summary: Self-contained distributed community captioning system
  Author-email: bghira <bghira@users.github.com>
  License: MIT
caption_flow-0.3.2/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "caption-flow"
- version = "0.3.1"
+ version = "0.3.2"
  description = "Self-contained distributed community captioning system"
  readme = "README.md"
  requires-python = ">=3.10,<3.13"
caption_flow-0.3.2/src/caption_flow/__init__.py
@@ -1,6 +1,6 @@
  """CaptionFlow - Distributed community captioning system."""

- __version__ = "0.3.1"
+ __version__ = "0.3.2"

  from .orchestrator import Orchestrator
  from .workers.data import DataWorker
caption_flow-0.3.2/src/caption_flow/processors/base.py
@@ -14,6 +14,7 @@ class WorkUnit:
  unit_id: str # usually, but not always, the chunk id
  chunk_id: str # always the chunk id
  source_id: str # the shard name
+ unit_size: int # how many elements are in the workunit
  data: Dict[str, Any]
  metadata: Dict[str, Any] = field(default_factory=dict)
  priority: int = 0
@@ -44,6 +45,7 @@ class WorkAssignment:
  "unit_id": u.unit_id,
  "source_id": u.source_id,
  "chunk_id": u.chunk_id,
+ "unit_size": u.unit_size,
  "data": u.data,
  "metadata": u.metadata,
  "priority": u.priority,
@@ -62,6 +64,7 @@ class WorkAssignment:
  unit_id=u["unit_id"],
  chunk_id=u["chunk_id"],
  source_id=u["source_id"],
+ unit_size=u["unit_size"],
  data=u["data"],
  metadata=u.get("metadata", {}),
  priority=u.get("priority", 0),
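Taken together, the base.py hunks add a required unit_size field to WorkUnit and thread it through WorkAssignment serialization in both directions. A minimal sketch of the resulting dataclass shape, limited to the fields visible in this diff:

    from dataclasses import dataclass, field
    from typing import Any, Dict

    @dataclass
    class WorkUnit:
        unit_id: str      # usually, but not always, the chunk id
        chunk_id: str     # always the chunk id
        source_id: str    # the shard name
        unit_size: int    # how many elements are in the work unit (new in 0.3.2)
        data: Dict[str, Any]
        metadata: Dict[str, Any] = field(default_factory=dict)
        priority: int = 0

Because unit_size has no default, code that constructs WorkUnit directly must now pass it, which is what the processor hunks below do.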
caption_flow-0.3.2/src/caption_flow/processors/huggingface.py
@@ -425,6 +425,7 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
  unit_id=unit_id,
  chunk_id=unit_id,
  source_id=shard_name,
+ unit_size=chunk_size,
  data={
  "dataset_name": self.dataset_name,
  "config": self.config,
caption_flow-0.3.2/src/caption_flow/processors/local_filesystem.py
@@ -251,6 +251,7 @@ class LocalFilesystemOrchestratorProcessor(OrchestratorProcessor):
  unit_id=chunk_id,
  chunk_id=chunk_id,
  source_id="local",
+ unit_size=chunk_state.chunk_size,
  data={
  "start_index": chunk_state.start_index,
  "chunk_size": chunk_state.chunk_size,
@@ -319,6 +320,7 @@ class LocalFilesystemOrchestratorProcessor(OrchestratorProcessor):
  unit_id=unit_id,
  chunk_id=unit_id,
  source_id="local",
+ unit_size=chunk_size,
  data={
  "start_index": self.current_index,
  "chunk_size": chunk_size,
caption_flow-0.3.2/src/caption_flow/processors/webdataset.py
@@ -12,6 +12,7 @@ from datetime import datetime
  from PIL import Image
  import io

+ from caption_flow.models import JobId
  from caption_flow.storage import StorageManager
  from .base import OrchestratorProcessor, WorkerProcessor, ProcessorConfig, WorkUnit, WorkResult
  from ..utils import ChunkTracker
@@ -21,6 +22,7 @@ import cv2
  import numpy as np

  logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)


  class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
@@ -114,17 +116,22 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
  return

  shards_summary = self.chunk_tracker.get_shards_summary()
+ logger.debug(f"Restoring state: {shards_summary}")

  with self.lock:
  for shard_name, shard_info in shards_summary.items():
  chunks = shard_info.get("chunks", [])
+ logger.debug(f"Existing job ids: {storage.get_all_processed_job_ids()}")
  for chunk_state in chunks:
  # Only add incomplete chunks
  if chunk_state.status != "completed":
- logger.debug(f"Restoring incomplete chunk {chunk_state.chunk_id}")
+ logger.debug(f"Restoring incomplete chunk {chunk_state}")

  # Get unprocessed ranges
  unprocessed_ranges = chunk_state.get_unprocessed_ranges()
+ logger.debug(
+ f"Chunk {chunk_state.chunk_id} unprocessed ranges: {unprocessed_ranges}"
+ )
  if not unprocessed_ranges:
  continue

@@ -139,6 +146,7 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
  unit_id=chunk_state.chunk_id,
  chunk_id=chunk_state.chunk_id,
  source_id=shard_name,
+ unit_size=chunk_state.chunk_size,
  data={
  "shard_url": chunk_state.shard_url,
  "shard_name": shard_name,
@@ -201,7 +209,13 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):

  # Create chunk for current position
  chunk_size = min(self.chunk_size, shard_files - current_file_idx)
- chunk_id = f"{shard_name}:chunk:{current_file_idx // self.chunk_size}"
+ self.current_chunk_index = current_file_idx // self.chunk_size
+ job_id_obj = JobId(
+ shard_id=shard_name,
+ chunk_id=self.current_chunk_index,
+ sample_id=current_file_idx,
+ )
+ chunk_id = job_id_obj.get_chunk_str()

  with self.lock:
  # Skip if already exists
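The JobId model itself is not part of this diff. Going only by the keyword arguments and the get_chunk_str() call above, a hypothetical sketch of the helper (field types and the exact string format are assumptions; the removed inline format string suggests the shape):

    from dataclasses import dataclass

    @dataclass
    class JobId:
        # Hypothetical reconstruction; the real class lives in caption_flow.models
        # and only its field names and get_chunk_str() are visible in this diff.
        shard_id: str
        chunk_id: int
        sample_id: int

        def get_chunk_str(self) -> str:
            # Assumed to reproduce the old f"{shard_name}:chunk:{index}" chunk ids.
            return f"{self.shard_id}:chunk:{self.chunk_id}"

    print(JobId("data-00001", 3, 3000).get_chunk_str())  # data-00001:chunk:3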
@@ -224,6 +238,7 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
  unit_id=chunk_id,
  chunk_id=chunk_id,
  source_id=shard_name,
+ unit_size=chunk_size,
  data={
  "shard_url": shard_url,
  "shard_name": shard_name,
@@ -268,6 +283,25 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
  unit = self.work_units.get(unit_id)

  if unit:
+ # Update unprocessed ranges from chunk tracker before assigning
+ if self.chunk_tracker and unit_id in self.chunk_tracker.chunks:
+ chunk_state = self.chunk_tracker.chunks[unit_id]
+ relative_unprocessed = chunk_state.get_unprocessed_ranges()
+
+ # Convert relative to absolute indices
+ absolute_ranges = []
+ for start, end in relative_unprocessed:
+ abs_start = chunk_state.start_index + start
+ abs_end = chunk_state.start_index + end
+ absolute_ranges.append((abs_start, abs_end))
+
+ # Update the work unit's unprocessed ranges
+ unit.data["unprocessed_ranges"] = absolute_ranges
+
+ logger.debug(
+ f"Updated unit {unit_id} with unprocessed ranges: {absolute_ranges}"
+ )
+
  self.assigned_units[worker_id].add(unit_id)
  assigned.append(unit)

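The chunk tracker stores unprocessed ranges relative to the chunk start, while assigned work units carry absolute sample indices; a standalone sketch of the conversion above, with made-up numbers:

    # Illustrative values only: a chunk whose first sample sits at absolute
    # index 1000 and which still has two unprocessed relative ranges.
    start_index = 1000
    relative_unprocessed = [(0, 4), (10, 12)]

    absolute_ranges = [(start_index + start, start_index + end)
                       for start, end in relative_unprocessed]
    print(absolute_ranges)  # [(1000, 1004), (1010, 1012)]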
@@ -391,8 +425,27 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
  if indices:
  # Sort indices and convert to ranges
  sorted_indices = sorted(indices)
- for idx in sorted_indices:
- self.chunk_tracker.mark_items_processed(chunk_id, idx, idx)
+ if not sorted_indices:
+ continue
+
+ # Condense into contiguous ranges
+ ranges = []
+ start_range = sorted_indices[0]
+ end_range = sorted_indices[0]
+
+ for i in range(1, len(sorted_indices)):
+ if sorted_indices[i] == end_range + 1:
+ end_range = sorted_indices[i]
+ else:
+ ranges.append((start_range, end_range))
+ start_range = sorted_indices[i]
+ end_range = sorted_indices[i]
+ ranges.append((start_range, end_range))
+
+ # Mark each contiguous range as processed
+ logger.debug(f"Marking ranges {ranges} as processed in chunk {chunk_id}")
+ for start_idx, end_idx in ranges:
+ self.chunk_tracker.mark_items_processed(chunk_id, start_idx, end_idx)

  def get_stats(self) -> Dict[str, Any]:
  """Get processor statistics."""
@@ -488,7 +541,7 @@ class WebDatasetWorkerProcessor(WorkerProcessor):

  def process_unit(self, unit: WorkUnit, context: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
  """Process a work unit by iterating specified ranges."""
- logger.debug(f"Processing unit: {unit.unit_id}")
+ logger.debug(f"Processing unit: {unit}")

  shard_name = unit.data["shard_name"]
  shard_idx = unit.data.get("shard_idx")
@@ -514,6 +567,7 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
  "_chunk_relative_index": idx - unit.data["start_index"],
  "_job_id": job_id,
  "_mock": True,
+ "_processed_indices": processed_indices,
  },
  "job_id": job_id,
  }
@@ -574,6 +628,7 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
  "_job_id": job_id,
  "_filename": entry.path,
  "_file_size": entry.size,
+ "_processed_indices": processed_indices,
  },
  "job_id": job_id,
  }
@@ -605,8 +660,8 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
  result = super().prepare_result(unit, outputs, processing_time_ms)

  # Add processed indices for chunk tracker
- if outputs and "_processed_indices" in outputs[0].get("metadata", {}):
- result.metadata["item_indices"] = outputs[0]["metadata"]["_processed_indices"]
+ if hasattr(self, "_last_context") and "_processed_indices" in self._last_context:
+ result.metadata["item_indices"] = self._last_context["_processed_indices"]

  return result

caption_flow-0.3.2/src/caption_flow/utils/chunk_tracker.py
@@ -10,6 +10,7 @@ from dataclasses import dataclass, asdict, field
  from .checkpoint_tracker import CheckpointTracker

  logger = logging.getLogger(__name__)
+ logger.setLevel(logging.DEBUG)


  @dataclass
caption_flow-0.3.2/src/caption_flow/workers/caption.py
@@ -565,7 +565,8 @@ class CaptionWorker(BaseWorker):
  batch = []
  batch_size = self.vllm_config.get("batch_size", 8)
  context = {}
-
+ self.items_processed = 0
+ self.items_failed = 0
  # Collect items for batching
  for item_data in self.processor.process_unit(unit, context):
  if self.should_stop_processing.is_set() or not self.connected.is_set():
@@ -604,16 +605,33 @@ class CaptionWorker(BaseWorker):
  self._process_batch(batch)

  # Notify orchestrator that unit is complete
- if self.connected.is_set() and self.websocket:
- try:
- asyncio.run_coroutine_threadsafe(
- self.websocket.send(
- json.dumps({"type": "work_complete", "unit_id": unit.unit_id})
- ),
- self.main_loop,
- ).result(timeout=5)
- except Exception as e:
- logger.warning(f"Could not notify work complete: {e}")
+ # Check if the number of processed items matches the expected count for the unit.
+ # The context dictionary holds the count of items yielded by the processor.
+ total_items_in_unit = unit.unit_size
+
+ if (
+ not self.should_stop_processing.is_set()
+ and self.connected.is_set()
+ and self.items_failed == 0
+ and self.items_processed >= total_items_in_unit
+ ):
+ if self.websocket:
+ try:
+ asyncio.run_coroutine_threadsafe(
+ self.websocket.send(
+ json.dumps({"type": "work_complete", "unit_id": unit.unit_id})
+ ),
+ self.main_loop,
+ ).result(timeout=5)
+ logger.info(
+ f"Unit {unit.unit_id} fully processed ({self.items_processed}/{total_items_in_unit}) and marked complete."
+ )
+ except Exception as e:
+ logger.warning(f"Could not notify work complete for unit {unit.unit_id}: {e}")
+ else:
+ logger.warning(
+ f"Processing of unit {unit.unit_id} was incomplete ({self.items_processed}/{total_items_in_unit}). Not marking as complete."
+ )

  def _process_batch(self, batch: List[ProcessingItem]):
  """Process a batch of items through all stages."""
caption_flow-0.3.2/src/caption_flow.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: caption-flow
- Version: 0.3.1
+ Version: 0.3.2
  Summary: Self-contained distributed community captioning system
  Author-email: bghira <bghira@users.github.com>
  License: MIT