caption-flow 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {caption_flow-0.2.2/src/caption_flow.egg-info → caption_flow-0.2.3}/PKG-INFO +29 -27
- {caption_flow-0.2.2 → caption_flow-0.2.3}/README.md +28 -26
- {caption_flow-0.2.2 → caption_flow-0.2.3}/pyproject.toml +1 -1
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow/cli.py +1 -0
- caption_flow-0.2.3/src/caption_flow/models.py +191 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow/monitor.py +1 -1
- caption_flow-0.2.3/src/caption_flow/orchestrator.py +914 -0
- caption_flow-0.2.3/src/caption_flow/processors/__init__.py +11 -0
- caption_flow-0.2.3/src/caption_flow/processors/base.py +219 -0
- caption_flow-0.2.3/src/caption_flow/processors/huggingface.py +832 -0
- caption_flow-0.2.3/src/caption_flow/processors/local_filesystem.py +683 -0
- caption_flow-0.2.3/src/caption_flow/processors/webdataset.py +782 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow/storage.py +411 -407
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow/utils/checkpoint_tracker.py +2 -2
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow/utils/chunk_tracker.py +73 -32
- caption_flow-0.2.3/src/caption_flow/utils/dataset_loader.py +222 -0
- caption_flow-0.2.3/src/caption_flow/utils/dataset_metadata_cache.py +67 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow/utils/image_processor.py +1 -4
- caption_flow-0.2.3/src/caption_flow/utils/shard_processor.py +119 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow/utils/shard_tracker.py +1 -5
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow/workers/base.py +3 -3
- caption_flow-0.2.3/src/caption_flow/workers/caption.py +945 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3/src/caption_flow.egg-info}/PKG-INFO +29 -27
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow.egg-info/SOURCES.txt +6 -0
- caption_flow-0.2.2/src/caption_flow/models.py +0 -84
- caption_flow-0.2.2/src/caption_flow/orchestrator.py +0 -2206
- caption_flow-0.2.2/src/caption_flow/utils/dataset_loader.py +0 -462
- caption_flow-0.2.2/src/caption_flow/utils/shard_processor.py +0 -379
- caption_flow-0.2.2/src/caption_flow/workers/caption.py +0 -1321
- {caption_flow-0.2.2 → caption_flow-0.2.3}/LICENSE +0 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/setup.cfg +0 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow/__init__.py +0 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow/utils/__init__.py +0 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow/utils/auth.py +0 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow/utils/caption_utils.py +0 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow/utils/certificates.py +0 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow/utils/job_queue.py +0 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow/utils/json_utils.py +0 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow/utils/prompt_template.py +0 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow/utils/vllm_config.py +0 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow/workers/data.py +0 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow.egg-info/dependency_links.txt +0 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow.egg-info/entry_points.txt +0 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow.egg-info/requires.txt +0 -0
- {caption_flow-0.2.2 → caption_flow-0.2.3}/src/caption_flow.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: caption-flow
|
3
|
-
Version: 0.2.
|
3
|
+
Version: 0.2.3
|
4
4
|
Summary: Self-contained distributed community captioning system
|
5
5
|
Author-email: bghira <bghira@users.github.com>
|
6
6
|
License: MIT
|
@@ -69,12 +69,14 @@ pip install -e . # installs the `caption-flow` command
|
|
69
69
|
1. copy + edit the sample configs
|
70
70
|
|
71
71
|
```bash
|
72
|
-
cp orchestrator.yaml my-orchestrator.yaml
|
73
|
-
cp worker.yaml my-worker.yaml
|
74
|
-
cp monitor.yaml my-monitor.yaml # optional
|
72
|
+
cp examples/orchestrator/local_image_files.yaml my-orchestrator.yaml
|
73
|
+
cp examples/worker.yaml my-worker.yaml
|
74
|
+
cp examples/monitor.yaml my-monitor.yaml # optional terminal interface
|
75
75
|
```
|
76
76
|
|
77
|
-
set a unique shared token in both `my-orchestrator.yaml` and `my-worker.yaml` (see `auth.worker_tokens` in the orchestrator config and `worker.token` in the worker config).
|
77
|
+
set a unique shared token in both `my-orchestrator.yaml` and `my-worker.yaml` (see `auth.worker_tokens` in the orchestrator config and `worker.token` in the worker config).
|
78
|
+
|
79
|
+
if you use private hugging face datasets/models, export `HUGGINGFACE_HUB_TOKEN` before starting anything.
|
78
80
|
|
79
81
|
2. start the orchestrator
|
80
82
|
|
@@ -90,6 +92,9 @@ caption-flow worker --config my-worker.yaml --gpu-id 0
|
|
90
92
|
|
91
93
|
# your second GPU
|
92
94
|
caption-flow worker --config my-worker.yaml --gpu-id 1
|
95
|
+
|
96
|
+
# on a remote host
|
97
|
+
caption-flow worker --config my-worker.yaml --server ws://your.hostname.address:8765
|
93
98
|
```
|
94
99
|
|
95
100
|
4. (optional) start the monitor
|
@@ -98,12 +103,6 @@ caption-flow worker --config my-worker.yaml --gpu-id 1
|
|
98
103
|
caption-flow monitor --config my-monitor.yaml
|
99
104
|
```
|
100
105
|
|
101
|
-
5. (optional) scan/fix chunks on disk if you had crashes
|
102
|
-
|
103
|
-
```bash
|
104
|
-
caption-flow scan_chunks --data-dir ./caption_data --checkpoint-dir ./checkpoints --fix
|
105
|
-
```
|
106
|
-
|
107
106
|
---
|
108
107
|
|
109
108
|
## how it’s wired
|
@@ -178,7 +177,7 @@ orchestrator:
|
|
178
177
|
# key: /path/privkey.pem
|
179
178
|
|
180
179
|
dataset:
|
181
|
-
type: huggingface
|
180
|
+
type: huggingface
|
182
181
|
path: <hf-dataset-or-local-path>
|
183
182
|
name: <logical-name>
|
184
183
|
version: "1.0"
|
@@ -315,28 +314,31 @@ PRs welcome. keep it simple and fast.
|
|
315
314
|
## Storage Schema
|
316
315
|
|
317
316
|
### captions.parquet
|
317
|
+
|
318
318
|
- `job_id`: Unique job identifier
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
319
|
+
* `dataset`: Dataset name
|
320
|
+
* `shard`: Shard identifier
|
321
|
+
* `item_key`: Item within shard
|
322
|
+
* `caption`: Generated caption text
|
323
|
+
* `contributor_id`: Worker who generated it
|
324
|
+
* `timestamp`: Generation time
|
325
|
+
* `quality_score`: Optional quality metric
|
326
326
|
|
327
327
|
### jobs.parquet
|
328
|
+
|
328
329
|
- `job_id`: Unique identifier
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
330
|
+
* `dataset`: Dataset name
|
331
|
+
* `shard`: Shard identifier
|
332
|
+
* `status`: pending/processing/completed/failed
|
333
|
+
* `assigned_to`: Worker ID
|
334
|
+
* `timestamp`: Status change time
|
334
335
|
|
335
336
|
### contributors.parquet
|
337
|
+
|
336
338
|
- `contributor_id`: Unique identifier
|
337
|
-
|
338
|
-
|
339
|
-
|
339
|
+
* `name`: Display name
|
340
|
+
* `total_captions`: Lifetime count
|
341
|
+
* `trust_level`: Quality tier (0-5)
|
340
342
|
|
341
343
|
## Development
|
342
344
|
|
@@ -25,12 +25,14 @@ pip install -e . # installs the `caption-flow` command
|
|
25
25
|
1. copy + edit the sample configs
|
26
26
|
|
27
27
|
```bash
|
28
|
-
cp orchestrator.yaml my-orchestrator.yaml
|
29
|
-
cp worker.yaml my-worker.yaml
|
30
|
-
cp monitor.yaml my-monitor.yaml # optional
|
28
|
+
cp examples/orchestrator/local_image_files.yaml my-orchestrator.yaml
|
29
|
+
cp examples/worker.yaml my-worker.yaml
|
30
|
+
cp examples/monitor.yaml my-monitor.yaml # optional terminal interface
|
31
31
|
```
|
32
32
|
|
33
|
-
set a unique shared token in both `my-orchestrator.yaml` and `my-worker.yaml` (see `auth.worker_tokens` in the orchestrator config and `worker.token` in the worker config).
|
33
|
+
set a unique shared token in both `my-orchestrator.yaml` and `my-worker.yaml` (see `auth.worker_tokens` in the orchestrator config and `worker.token` in the worker config).
|
34
|
+
|
35
|
+
if you use private hugging face datasets/models, export `HUGGINGFACE_HUB_TOKEN` before starting anything.
|
34
36
|
|
35
37
|
2. start the orchestrator
|
36
38
|
|
@@ -46,6 +48,9 @@ caption-flow worker --config my-worker.yaml --gpu-id 0
|
|
46
48
|
|
47
49
|
# your second GPU
|
48
50
|
caption-flow worker --config my-worker.yaml --gpu-id 1
|
51
|
+
|
52
|
+
# on a remote host
|
53
|
+
caption-flow worker --config my-worker.yaml --server ws://your.hostname.address:8765
|
49
54
|
```
|
50
55
|
|
51
56
|
4. (optional) start the monitor
|
@@ -54,12 +59,6 @@ caption-flow worker --config my-worker.yaml --gpu-id 1
|
|
54
59
|
caption-flow monitor --config my-monitor.yaml
|
55
60
|
```
|
56
61
|
|
57
|
-
5. (optional) scan/fix chunks on disk if you had crashes
|
58
|
-
|
59
|
-
```bash
|
60
|
-
caption-flow scan_chunks --data-dir ./caption_data --checkpoint-dir ./checkpoints --fix
|
61
|
-
```
|
62
|
-
|
63
62
|
---
|
64
63
|
|
65
64
|
## how it’s wired
|
@@ -134,7 +133,7 @@ orchestrator:
|
|
134
133
|
# key: /path/privkey.pem
|
135
134
|
|
136
135
|
dataset:
|
137
|
-
type: huggingface
|
136
|
+
type: huggingface
|
138
137
|
path: <hf-dataset-or-local-path>
|
139
138
|
name: <logical-name>
|
140
139
|
version: "1.0"
|
@@ -271,28 +270,31 @@ PRs welcome. keep it simple and fast.
|
|
271
270
|
## Storage Schema
|
272
271
|
|
273
272
|
### captions.parquet
|
273
|
+
|
274
274
|
- `job_id`: Unique job identifier
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
275
|
+
* `dataset`: Dataset name
|
276
|
+
* `shard`: Shard identifier
|
277
|
+
* `item_key`: Item within shard
|
278
|
+
* `caption`: Generated caption text
|
279
|
+
* `contributor_id`: Worker who generated it
|
280
|
+
* `timestamp`: Generation time
|
281
|
+
* `quality_score`: Optional quality metric
|
282
282
|
|
283
283
|
### jobs.parquet
|
284
|
+
|
284
285
|
- `job_id`: Unique identifier
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
286
|
+
* `dataset`: Dataset name
|
287
|
+
* `shard`: Shard identifier
|
288
|
+
* `status`: pending/processing/completed/failed
|
289
|
+
* `assigned_to`: Worker ID
|
290
|
+
* `timestamp`: Status change time
|
290
291
|
|
291
292
|
### contributors.parquet
|
293
|
+
|
292
294
|
- `contributor_id`: Unique identifier
|
293
|
-
|
294
|
-
|
295
|
-
|
295
|
+
* `name`: Display name
|
296
|
+
* `total_captions`: Lifetime count
|
297
|
+
* `trust_level`: Quality tier (0-5)
|
296
298
|
|
297
299
|
## Development
|
298
300
|
|
@@ -161,6 +161,7 @@ def main(ctx, verbose: bool):
|
|
161
161
|
@click.option("--key", help="SSL key path")
|
162
162
|
@click.option("--no-ssl", is_flag=True, help="Disable SSL (development only)")
|
163
163
|
@click.option("--vllm", is_flag=True, help="Use vLLM orchestrator for WebDataset/HF datasets")
|
164
|
+
@click.option("--verbose", is_flag=True, help="Enable verbose logging")
|
164
165
|
@click.pass_context
|
165
166
|
def orchestrator(ctx, config: Optional[str], **kwargs):
|
166
167
|
"""Start the orchestrator server."""
|
@@ -0,0 +1,191 @@
|
|
1
|
+
"""Data models for CaptionFlow."""
|
2
|
+
|
3
|
+
import PIL
|
4
|
+
from dataclasses import dataclass, field
|
5
|
+
from datetime import datetime
|
6
|
+
from enum import Enum
|
7
|
+
from typing import Any, Dict, List, Optional, Tuple
|
8
|
+
from PIL import Image
|
9
|
+
|
10
|
+
|
11
|
+
class JobStatus(Enum):
    """Lifecycle state of a captioning job."""

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"

    def __str__(self) -> str:
        # Render as the bare value ("pending"), not "JobStatus.PENDING".
        return self.value

    def to_json(self) -> str:
        # Serialization hook: encode as the plain string value.
        return self.value
|
24
|
+
|
25
|
+
|
26
|
+
@dataclass
class Job:
    """A single captioning job for one item of one shard.

    ``created_at`` defaults to the moment of construction when not supplied.
    """

    job_id: str
    dataset: str
    shard: str
    item_key: str
    status: JobStatus = JobStatus.PENDING
    assigned_to: Optional[str] = None  # worker id once the job is handed out
    # Fixed: annotated Optional to match the None default.
    created_at: Optional[datetime] = None

    def __post_init__(self) -> None:
        # Stamp creation time lazily so callers may supply their own.
        # NOTE(review): utcnow() is naive and deprecated since Python 3.12;
        # consider datetime.now(timezone.utc) project-wide. Kept for
        # behavior compatibility here.
        if self.created_at is None:
            self.created_at = datetime.utcnow()
|
41
|
+
|
42
|
+
|
43
|
+
@dataclass
class JobId:
    """Structured job identifier: ``<shard>:chunk:<chunk>:idx:<sample>``."""

    shard_id: str
    chunk_id: str
    sample_id: str

    def get_shard_str(self) -> str:
        """Return the shard portion of the identifier."""
        return f"{self.shard_id}"

    def get_chunk_str(self) -> str:
        """Return the identifier up to and including the chunk component."""
        return f"{self.shard_id}:chunk:{self.chunk_id}"

    def get_sample_str(self) -> str:
        """Return the fully qualified identifier string."""
        return f"{self.shard_id}:chunk:{self.chunk_id}:idx:{self.sample_id}"

    @staticmethod
    def from_dict(job: dict) -> "JobId":
        """Build a JobId from a mapping with shard_id/chunk_id/sample_id keys."""
        return JobId(shard_id=job["shard_id"], chunk_id=job["chunk_id"], sample_id=job["sample_id"])

    @staticmethod
    def from_values(shard_id: str, chunk_id: str, sample_id: str) -> "JobId":
        """Build a JobId from its three components."""
        return JobId(shard_id=shard_id, chunk_id=chunk_id, sample_id=sample_id)

    @staticmethod
    def from_str(job_id: str) -> "JobId":
        """Parse e.g. ``data-0000:chunk:0:idx:0`` into a JobId.

        Raises:
            ValueError: if the string does not match the expected format.
        """
        parts = job_id.split(":")
        # Fixed: also verify the literal "chunk"/"idx" markers instead of
        # accepting any colon-separated 5-part string.
        if len(parts) != 5 or parts[1] != "chunk" or parts[3] != "idx":
            raise ValueError(f"Invalid job_id format: {job_id}")
        return JobId(shard_id=parts[0], chunk_id=parts[2], sample_id=parts[4])
|
73
|
+
|
74
|
+
|
75
|
+
@dataclass
class Caption:
    """Generated caption(s) with attribution and image metadata.

    At least one of ``caption`` (single) or ``captions`` (multiple) must be
    provided; ``__post_init__`` enforces this.
    """

    # Core identification / attribution
    job_id: str
    dataset: str
    shard: str
    item_key: str
    contributor_id: str
    timestamp: datetime
    caption_count: int = 1  # Number of captions generated for this item
    caption: Optional[str] = None  # single-caption form
    captions: Optional[List[str]] = None  # multi-caption form
    outputs: Dict[str, List[str]] = field(default_factory=dict)  # field name -> outputs
    quality_score: Optional[float] = None
    quality_scores: Optional[List[float]] = None

    # Image metadata
    image_width: Optional[int] = None
    image_height: Optional[int] = None
    image_format: Optional[str] = None
    file_size: Optional[int] = None
    filename: Optional[str] = None
    url: Optional[str] = None

    # Processing metadata
    caption_index: Optional[int] = None  # Which caption this is (0, 1, 2...)
    total_captions: Optional[int] = None  # Total captions for this image
    processing_time_ms: Optional[float] = None
    chunk_id: Optional[str] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        # A Caption without any caption text is meaningless; fail fast.
        if self.caption is None and self.captions is None:
            raise ValueError("At least one of 'caption' or 'captions' must be provided")
|
111
|
+
|
112
|
+
|
113
|
+
@dataclass
class Contributor:
    """Attribution record for a caption contributor."""

    contributor_id: str  # unique identifier
    name: str  # display name
    total_captions: int = 0  # lifetime caption count
    trust_level: int = 1  # quality tier (0-5, per the project README)
|
121
|
+
|
122
|
+
|
123
|
+
@dataclass
class ProcessingStage:
    """Configuration for a single processing stage in a multi-stage pipeline."""

    name: str
    model: str
    prompts: List[str]  # one inference per prompt
    output_field: str  # where this stage's outputs land
    requires: List[str] = field(default_factory=list)  # names of prerequisite stages
    sampling: Optional[Dict[str, Any]] = None  # sampling parameter overrides

    # Model-specific overrides (fall back to worker defaults when None)
    tensor_parallel_size: Optional[int] = None
    max_model_len: Optional[int] = None
    dtype: Optional[str] = None
    gpu_memory_utilization: Optional[float] = None
|
139
|
+
|
140
|
+
|
141
|
+
@dataclass
class StageResult:
    """Results produced by a single pipeline stage."""

    stage_name: str
    output_field: str
    outputs: List[str]  # multiple outputs from multiple prompts
    error: Optional[str] = None

    def is_success(self) -> bool:
        """True when no error was recorded and at least one output exists."""
        # Note: an empty-string error still counts as a failure (is None check).
        return self.error is None and bool(self.outputs)
|
152
|
+
|
153
|
+
|
154
|
+
@dataclass
class ShardChunk:
    """A chunk of a shard assigned to a worker, with its unprocessed ranges."""

    chunk_id: str
    shard_url: str
    shard_name: str
    start_index: int  # absolute index of the chunk's first item in the shard
    chunk_size: int
    # (start, end) index ranges still needing processing; empty means none known
    unprocessed_ranges: List[Tuple[int, int]] = field(default_factory=list)
|
164
|
+
|
165
|
+
|
166
|
+
@dataclass
class ProcessingItem:
    """An item currently being processed by a worker."""

    chunk_id: str
    item_key: str
    image: Image.Image  # decoded PIL image
    image_data: bytes  # original encoded bytes (for size/format metadata)
    metadata: Dict[str, Any] = field(default_factory=dict)
    # Results accumulated as each pipeline stage completes, keyed by stage name
    stage_results: Dict[str, StageResult] = field(default_factory=dict)
|
176
|
+
|
177
|
+
|
178
|
+
@dataclass
class ProcessedResult:
    """Final result for one item, carrying multi-stage outputs and image stats."""

    chunk_id: str
    shard_name: str
    item_key: str
    outputs: Dict[str, List[str]]  # field_name -> list of outputs
    image_width: int
    image_height: int
    image_format: str
    file_size: int  # encoded size in bytes
    processing_time_ms: float
    metadata: Dict[str, Any] = field(default_factory=dict)
|