PyPI - caption-flow - Versions diffs - 0.2.4__py3-none-any.whl → 0.3.2__py3-none-any.whl - Mend

caption-flow 0.2.4__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

caption_flow/__init__.py +1 -1
caption_flow/orchestrator.py +9 -9
caption_flow/processors/base.py +3 -0
caption_flow/processors/huggingface.py +637 -464
caption_flow/processors/local_filesystem.py +2 -0
caption_flow/processors/webdataset.py +438 -538
caption_flow/storage/manager.py +328 -305
caption_flow/utils/__init__.py +0 -2
caption_flow/utils/chunk_tracker.py +197 -164
caption_flow/utils/image_processor.py +19 -132
caption_flow/workers/caption.py +191 -138
{caption_flow-0.2.4.dist-info → caption_flow-0.3.2.dist-info}/METADATA +2 -1
caption_flow-0.3.2.dist-info/RECORD +33 -0
caption_flow/utils/dataset_loader.py +0 -222
caption_flow/utils/dataset_metadata_cache.py +0 -67
caption_flow/utils/job_queue.py +0 -41
caption_flow/utils/shard_processor.py +0 -119
caption_flow/utils/shard_tracker.py +0 -83
caption_flow-0.2.4.dist-info/RECORD +0 -38
{caption_flow-0.2.4.dist-info → caption_flow-0.3.2.dist-info}/WHEEL +0 -0
{caption_flow-0.2.4.dist-info → caption_flow-0.3.2.dist-info}/entry_points.txt +0 -0
{caption_flow-0.2.4.dist-info → caption_flow-0.3.2.dist-info}/licenses/LICENSE +0 -0
{caption_flow-0.2.4.dist-info → caption_flow-0.3.2.dist-info}/top_level.txt +0 -0

caption_flow/__init__.py CHANGED Viewed

@@ -1,6 +1,6 @@
 """CaptionFlow - Distributed community captioning system."""
-__version__ = "0.1.0"
+__version__ = "0.3.2"
 from .orchestrator import Orchestrator
 from .workers.data import DataWorker

caption_flow/orchestrator.py CHANGED Viewed

@@ -66,7 +66,7 @@ class Orchestrator:
         self.processor.initialize(processor_config, self.storage)
         # Processing configuration
-        self.units_per_request = config.get("units_per_request", 2)
+        self.chunks_per_request = config.get("chunks_per_request", 2)
         # Track connections
         self.workers: Dict[str, WebSocketServerProtocol] = {}
@@ -284,10 +284,10 @@ class Orchestrator:
                     self.processor.initialize(processor_config)
                     updated_sections.append("processor_config")
-            # Update units per request
-            if "units_per_request" in orchestrator_config:
-                self.units_per_request = orchestrator_config["units_per_request"]
-                updated_sections.append("units_per_request")
+            # Update chunks per request
+            if "chunks_per_request" in orchestrator_config:
+                self.chunks_per_request = orchestrator_config["chunks_per_request"]
+                updated_sections.append("chunks_per_request")
             # Update auth configuration
             if "auth" in orchestrator_config:
@@ -332,8 +332,8 @@ class Orchestrator:
         """Process message from worker."""
         msg_type = data.get("type")
-        if msg_type == "request_work":
-            count = data.get("count", self.units_per_request)
+        if msg_type == "get_work_units":
+            count = data.get("count", self.chunks_per_request)
             units = self.processor.get_work_units(count, worker_id)
             logger.debug(f"Assigning units: {[unit.chunk_id for unit in units]}")
@@ -352,7 +352,8 @@ class Orchestrator:
                 logger.debug(f"Assigned {len(units)} work units to worker {worker_id}")
             else:
-                await self.workers[worker_id].send(safe_json_dumps({"type": "no_work"}))
+                if worker_id in self.workers:
+                    await self.workers[worker_id].send(safe_json_dumps({"type": "no_work"}))
         elif msg_type == "work_complete":
             unit_id = data["unit_id"]
@@ -375,7 +376,6 @@ class Orchestrator:
         """Process results submission from worker."""
         # Extract user from worker_id
         worker_user = worker_id.rsplit("_", 1)[0] if "_" in worker_id else worker_id
         # Create work result
         _job_id = data.get("job_id")
         job_id = JobId.from_str(_job_id)

caption_flow/processors/base.py CHANGED Viewed

@@ -14,6 +14,7 @@ class WorkUnit:
     unit_id: str  # usually, but not always, the chunk id
     chunk_id: str  # always the chunk id
     source_id: str  # the shard name
+    unit_size: int  # how many elements are in the workunit
     data: Dict[str, Any]
     metadata: Dict[str, Any] = field(default_factory=dict)
     priority: int = 0
@@ -44,6 +45,7 @@ class WorkAssignment:
                     "unit_id": u.unit_id,
                     "source_id": u.source_id,
                     "chunk_id": u.chunk_id,
+                    "unit_size": u.unit_size,
                     "data": u.data,
                     "metadata": u.metadata,
                     "priority": u.priority,
@@ -62,6 +64,7 @@ class WorkAssignment:
                 unit_id=u["unit_id"],
                 chunk_id=u["chunk_id"],
                 source_id=u["source_id"],
+                unit_size=u["unit_size"],
                 data=u["data"],
                 metadata=u.get("metadata", {}),
                 priority=u.get("priority", 0),

caption-flow 0.2.4__py3-none-any.whl → 0.3.2__py3-none-any.whl

caption-flow 0.2.4__py3-none-any.whl → 0.3.2__py3-none-any.whl