caption-flow 0.2.4__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {caption_flow-0.2.4/src/caption_flow.egg-info → caption_flow-0.3.1}/PKG-INFO +2 -1
- {caption_flow-0.2.4 → caption_flow-0.3.1}/pyproject.toml +2 -1
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/__init__.py +1 -1
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/orchestrator.py +9 -9
- caption_flow-0.3.1/src/caption_flow/processors/huggingface.py +1004 -0
- caption_flow-0.3.1/src/caption_flow/processors/webdataset.py +627 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/storage/manager.py +328 -305
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/utils/__init__.py +0 -2
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/utils/chunk_tracker.py +196 -164
- caption_flow-0.3.1/src/caption_flow/utils/image_processor.py +55 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/workers/caption.py +164 -129
- {caption_flow-0.2.4 → caption_flow-0.3.1/src/caption_flow.egg-info}/PKG-INFO +2 -1
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow.egg-info/SOURCES.txt +0 -5
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow.egg-info/requires.txt +1 -0
- caption_flow-0.2.4/src/caption_flow/processors/huggingface.py +0 -832
- caption_flow-0.2.4/src/caption_flow/processors/webdataset.py +0 -782
- caption_flow-0.2.4/src/caption_flow/utils/dataset_loader.py +0 -222
- caption_flow-0.2.4/src/caption_flow/utils/dataset_metadata_cache.py +0 -67
- caption_flow-0.2.4/src/caption_flow/utils/image_processor.py +0 -168
- caption_flow-0.2.4/src/caption_flow/utils/job_queue.py +0 -41
- caption_flow-0.2.4/src/caption_flow/utils/shard_processor.py +0 -119
- caption_flow-0.2.4/src/caption_flow/utils/shard_tracker.py +0 -83
- {caption_flow-0.2.4 → caption_flow-0.3.1}/LICENSE +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/README.md +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/setup.cfg +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/cli.py +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/models.py +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/monitor.py +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/processors/__init__.py +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/processors/base.py +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/processors/local_filesystem.py +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/storage/__init__.py +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/storage/exporter.py +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/utils/auth.py +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/utils/caption_utils.py +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/utils/certificates.py +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/utils/checkpoint_tracker.py +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/utils/json_utils.py +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/utils/prompt_template.py +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/utils/vllm_config.py +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/viewer.py +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/workers/base.py +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow/workers/data.py +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow.egg-info/dependency_links.txt +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow.egg-info/entry_points.txt +0 -0
- {caption_flow-0.2.4 → caption_flow-0.3.1}/src/caption_flow.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: caption-flow
|
3
|
-
Version: 0.2.4
|
3
|
+
Version: 0.3.1
|
4
4
|
Summary: Self-contained distributed community captioning system
|
5
5
|
Author-email: bghira <bghira@users.github.com>
|
6
6
|
License: MIT
|
@@ -35,6 +35,7 @@ Requires-Dist: boto3<2.0.0,>=1.40.11
|
|
35
35
|
Requires-Dist: torchdata<0.12.0,>=0.11.0
|
36
36
|
Requires-Dist: textual<6.0.0,>=5.3.0
|
37
37
|
Requires-Dist: urwid<4.0.0,>=3.0.2
|
38
|
+
Requires-Dist: webshart<0.5.0,>=0.4.0
|
38
39
|
Provides-Extra: dev
|
39
40
|
Requires-Dist: pytest>=7.4.0; extra == "dev"
|
40
41
|
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "caption-flow"
|
3
|
-
version = "0.2.4"
|
3
|
+
version = "0.3.1"
|
4
4
|
description = "Self-contained distributed community captioning system"
|
5
5
|
readme = "README.md"
|
6
6
|
requires-python = ">=3.10,<3.13"
|
@@ -40,6 +40,7 @@ dependencies = [
|
|
40
40
|
"torchdata (>=0.11.0,<0.12.0)",
|
41
41
|
"textual (>=5.3.0,<6.0.0)",
|
42
42
|
"urwid (>=3.0.2,<4.0.0)",
|
43
|
+
"webshart (>=0.4.0,<0.5.0)",
|
43
44
|
]
|
44
45
|
|
45
46
|
[project.optional-dependencies]
|
@@ -66,7 +66,7 @@ class Orchestrator:
|
|
66
66
|
self.processor.initialize(processor_config, self.storage)
|
67
67
|
|
68
68
|
# Processing configuration
|
69
|
-
self.
|
69
|
+
self.chunks_per_request = config.get("chunks_per_request", 2)
|
70
70
|
|
71
71
|
# Track connections
|
72
72
|
self.workers: Dict[str, WebSocketServerProtocol] = {}
|
@@ -284,10 +284,10 @@ class Orchestrator:
|
|
284
284
|
self.processor.initialize(processor_config)
|
285
285
|
updated_sections.append("processor_config")
|
286
286
|
|
287
|
-
# Update
|
288
|
-
if "
|
289
|
-
self.
|
290
|
-
updated_sections.append("
|
287
|
+
# Update chunks per request
|
288
|
+
if "chunks_per_request" in orchestrator_config:
|
289
|
+
self.chunks_per_request = orchestrator_config["chunks_per_request"]
|
290
|
+
updated_sections.append("chunks_per_request")
|
291
291
|
|
292
292
|
# Update auth configuration
|
293
293
|
if "auth" in orchestrator_config:
|
@@ -332,8 +332,8 @@ class Orchestrator:
|
|
332
332
|
"""Process message from worker."""
|
333
333
|
msg_type = data.get("type")
|
334
334
|
|
335
|
-
if msg_type == "
|
336
|
-
count = data.get("count", self.
|
335
|
+
if msg_type == "get_work_units":
|
336
|
+
count = data.get("count", self.chunks_per_request)
|
337
337
|
units = self.processor.get_work_units(count, worker_id)
|
338
338
|
logger.debug(f"Assigning units: {[unit.chunk_id for unit in units]}")
|
339
339
|
|
@@ -352,7 +352,8 @@ class Orchestrator:
|
|
352
352
|
|
353
353
|
logger.debug(f"Assigned {len(units)} work units to worker {worker_id}")
|
354
354
|
else:
|
355
|
-
|
355
|
+
if worker_id in self.workers:
|
356
|
+
await self.workers[worker_id].send(safe_json_dumps({"type": "no_work"}))
|
356
357
|
|
357
358
|
elif msg_type == "work_complete":
|
358
359
|
unit_id = data["unit_id"]
|
@@ -375,7 +376,6 @@ class Orchestrator:
|
|
375
376
|
"""Process results submission from worker."""
|
376
377
|
# Extract user from worker_id
|
377
378
|
worker_user = worker_id.rsplit("_", 1)[0] if "_" in worker_id else worker_id
|
378
|
-
|
379
379
|
# Create work result
|
380
380
|
_job_id = data.get("job_id")
|
381
381
|
job_id = JobId.from_str(_job_id)
|