caption-flow 0.2.3__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {caption_flow-0.2.3/src/caption_flow.egg-info → caption_flow-0.3.1}/PKG-INFO +45 -177
- {caption_flow-0.2.3 → caption_flow-0.3.1}/README.md +41 -176
- {caption_flow-0.2.3 → caption_flow-0.3.1}/pyproject.toml +4 -1
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/__init__.py +1 -1
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/cli.py +307 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/models.py +26 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/orchestrator.py +9 -9
- caption_flow-0.3.1/src/caption_flow/processors/huggingface.py +1004 -0
- caption_flow-0.3.1/src/caption_flow/processors/webdataset.py +627 -0
- caption_flow-0.3.1/src/caption_flow/storage/__init__.py +1 -0
- caption_flow-0.3.1/src/caption_flow/storage/exporter.py +550 -0
- caption_flow-0.2.3/src/caption_flow/storage.py → caption_flow-0.3.1/src/caption_flow/storage/manager.py +410 -303
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/utils/__init__.py +0 -2
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/utils/chunk_tracker.py +196 -164
- caption_flow-0.3.1/src/caption_flow/utils/image_processor.py +55 -0
- caption_flow-0.3.1/src/caption_flow/viewer.py +594 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/workers/caption.py +164 -129
- {caption_flow-0.2.3 → caption_flow-0.3.1/src/caption_flow.egg-info}/PKG-INFO +45 -177
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow.egg-info/SOURCES.txt +4 -6
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow.egg-info/requires.txt +3 -0
- caption_flow-0.2.3/src/caption_flow/processors/huggingface.py +0 -832
- caption_flow-0.2.3/src/caption_flow/processors/webdataset.py +0 -782
- caption_flow-0.2.3/src/caption_flow/utils/dataset_loader.py +0 -222
- caption_flow-0.2.3/src/caption_flow/utils/dataset_metadata_cache.py +0 -67
- caption_flow-0.2.3/src/caption_flow/utils/image_processor.py +0 -168
- caption_flow-0.2.3/src/caption_flow/utils/job_queue.py +0 -41
- caption_flow-0.2.3/src/caption_flow/utils/shard_processor.py +0 -119
- caption_flow-0.2.3/src/caption_flow/utils/shard_tracker.py +0 -83
- {caption_flow-0.2.3 → caption_flow-0.3.1}/LICENSE +0 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/setup.cfg +0 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/monitor.py +0 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/processors/__init__.py +0 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/processors/base.py +0 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/processors/local_filesystem.py +0 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/utils/auth.py +0 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/utils/caption_utils.py +0 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/utils/certificates.py +0 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/utils/checkpoint_tracker.py +0 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/utils/json_utils.py +0 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/utils/prompt_template.py +0 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/utils/vllm_config.py +0 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/workers/base.py +0 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow/workers/data.py +0 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow.egg-info/dependency_links.txt +0 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow.egg-info/entry_points.txt +0 -0
- {caption_flow-0.2.3 → caption_flow-0.3.1}/src/caption_flow.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: caption-flow
|
3
|
-
Version: 0.2.3
|
3
|
+
Version: 0.3.1
|
4
4
|
Summary: Self-contained distributed community captioning system
|
5
5
|
Author-email: bghira <bghira@users.github.com>
|
6
6
|
License: MIT
|
@@ -33,6 +33,9 @@ Requires-Dist: arrow<2.0.0,>=1.3.0
|
|
33
33
|
Requires-Dist: datasets<5.0.0,>=4.0.0
|
34
34
|
Requires-Dist: boto3<2.0.0,>=1.40.11
|
35
35
|
Requires-Dist: torchdata<0.12.0,>=0.11.0
|
36
|
+
Requires-Dist: textual<6.0.0,>=5.3.0
|
37
|
+
Requires-Dist: urwid<4.0.0,>=3.0.2
|
38
|
+
Requires-Dist: webshart<0.5.0,>=0.4.0
|
36
39
|
Provides-Extra: dev
|
37
40
|
Requires-Dist: pytest>=7.4.0; extra == "dev"
|
38
41
|
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
@@ -44,12 +47,13 @@ Dynamic: license-file
|
|
44
47
|
|
45
48
|
# CaptionFlow
|
46
49
|
|
47
|
-
scalable, fault-tolerant **vLLM-powered image captioning**.
|
50
|
+
scalable, fault-tolerant **vLLM-powered image captioning**.
|
51
|
+
|
52
|
+
a fast websocket-based orchestrator paired with lightweight gpu workers achieves exceptional performance for batched requests through vLLM.
|
48
53
|
|
49
54
|
* **orchestrator**: hands out work in chunked shards, collects captions, checkpoints progress, and keeps simple stats.
|
50
55
|
* **workers (vLLM)**: connect to the orchestrator, stream in image samples, batch them, and generate 1..N captions per image using prompts supplied by the orchestrator.
|
51
56
|
* **config-driven**: all components read YAML config; flags can override.
|
52
|
-
* **tui monitor (optional)**: a monitor client is wired into the CLI; ship a `monitor` module to enable it.
|
53
57
|
|
54
58
|
> no conda. just `venv` + `pip`.
|
55
59
|
|
@@ -103,6 +107,25 @@ caption-flow worker --config my-worker.yaml --server ws://your.hostname.address:
|
|
103
107
|
caption-flow monitor --config my-monitor.yaml
|
104
108
|
```
|
105
109
|
|
110
|
+
5. export the data
|
111
|
+
|
112
|
+
```bash
|
113
|
+
% caption-flow export --help
|
114
|
+
Usage: caption-flow export [OPTIONS]
|
115
|
+
|
116
|
+
Export caption data to various formats.
|
117
|
+
|
118
|
+
Options:
|
119
|
+
--format [jsonl|json|csv|txt|huggingface_hub|all] Export format (default: jsonl)
|
120
|
+
```
|
121
|
+
|
122
|
+
* **jsonl**: create JSON line file in the specified `--output` path
|
123
|
+
* **csv**: exports CSV-compatible data columns to the `--output` path containing incomplete metadata
|
124
|
+
* **json**: creates a `.json` file for each sample inside the `--output` subdirectory containing **complete** metadata; useful for webdatasets
|
125
|
+
* **txt**: creates `.txt` file for each sample inside the `--output` subdirectory containing ONLY captions
|
126
|
+
* **huggingface_hub**: creates a dataset on Hugging Face Hub, possibly `--private` and `--nsfw` where necessary
|
127
|
+
* **all**: creates all export formats in a specified `--output` directory
|
128
|
+
|
106
129
|
---
|
107
130
|
|
108
131
|
## how it’s wired
|
@@ -111,20 +134,11 @@ caption-flow monitor --config my-monitor.yaml
|
|
111
134
|
|
112
135
|
* **websocket server** (default `0.0.0.0:8765`) with three client roles: workers, data-feeders, and admin.
|
113
136
|
* **dataset control**: the orchestrator centrally defines the dataset (`huggingface` or `local`) and version/name. it chunk-slices shards and assigns work.
|
137
|
+
* **data serving to remote workers**: local files can be captioned by remote workers that don't have access to the same files, automatically.
|
114
138
|
* **vLLM config broadcast**: model, tp size, dtype, max seq len, memory targets, batching, sampling params, and **inference prompts** are all pushed to workers; workers can apply many changes without a model reload.
|
115
139
|
* **storage + checkpoints**: captions buffer to disk with periodic checkpoints. chunk state is tracked so restarts don’t double-work.
|
116
140
|
* **auth**: token lists for `worker`, `monitor`, and `admin` roles.
|
117
141
|
|
118
|
-
start flags you’ll likely use:
|
119
|
-
|
120
|
-
```text
|
121
|
-
--config PATH # yaml config for the orchestrator
|
122
|
-
--port INT, --host STR # bind controls
|
123
|
-
--data-dir PATH # overrides storage.data_dir
|
124
|
-
--cert PATH, --key PATH # enable TLS (or use --no-ssl for ws:// in dev)
|
125
|
-
--vllm # use the vLLM-style orchestrator (webdataset/hf)
|
126
|
-
```
|
127
|
-
|
128
142
|
### vLLM worker
|
129
143
|
|
130
144
|
* **one process per gpu**. select the device with `--gpu-id` (or `worker.gpu_id` in YAML).
|
@@ -132,27 +146,15 @@ start flags you’ll likely use:
|
|
132
146
|
* **resilient**: detects disconnects, abandons the current chunk cleanly, clears queues, reconnects, and resumes.
|
133
147
|
* **batched generate()**: images are resized down for consistent batching; each image can get multiple captions (one per prompt).
|
134
148
|
|
135
|
-
|
136
|
-
|
137
|
-
```text
|
138
|
-
--config PATH # yaml for the worker
|
139
|
-
--server URL # ws(s)://host:port
|
140
|
-
--token STR # must match an allowed worker token on the orchestrator
|
141
|
-
--name STR # display name
|
142
|
-
--batch-size INT # override vLLM batch size
|
143
|
-
--vllm # use the vLLM worker implementation
|
144
|
-
--gpu-id INT # which gpu to use
|
145
|
-
--precision STR, --model STR # optional overrides for dtype/model
|
146
|
-
--no-verify-ssl # accept self-signed certs in dev
|
147
|
-
```
|
148
|
-
|
149
|
-
### (optional) monitor
|
149
|
+
---
|
150
150
|
|
151
|
-
|
151
|
+
## dataset formats
|
152
152
|
|
153
|
-
|
153
|
+
* huggingface hub or local based URL list datasets that are compatible with the datasets library
|
154
|
+
* webdatasets shards containing full image data; also can be hosted on the hub
|
155
|
+
* local folder filled with images; orchestrator will serve the data to workers
|
154
156
|
|
155
|
-
## configuration
|
157
|
+
## configuration path
|
156
158
|
|
157
159
|
### config discovery order
|
158
160
|
|
@@ -166,98 +168,6 @@ for any component, the CLI looks for config in this order (first match wins):
|
|
166
168
|
6. any `$XDG_CONFIG_DIRS` entries under `caption-flow/`
|
167
169
|
7. `./examples/<component>.yaml` (fallback)
|
168
170
|
|
169
|
-
### orchestrator.yaml (highlights)
|
170
|
-
|
171
|
-
```yaml
|
172
|
-
orchestrator:
|
173
|
-
host: 0.0.0.0
|
174
|
-
port: 8765
|
175
|
-
# ssl:
|
176
|
-
# cert: /path/fullchain.pem
|
177
|
-
# key: /path/privkey.pem
|
178
|
-
|
179
|
-
dataset:
|
180
|
-
type: huggingface
|
181
|
-
path: <hf-dataset-or-local-path>
|
182
|
-
name: <logical-name>
|
183
|
-
version: "1.0"
|
184
|
-
|
185
|
-
vllm:
|
186
|
-
model: Qwen/Qwen2.5-VL-3B-Instruct
|
187
|
-
tensor_parallel_size: 1
|
188
|
-
max_model_len: 16384
|
189
|
-
dtype: float16
|
190
|
-
gpu_memory_utilization: 0.92
|
191
|
-
enforce_eager: true
|
192
|
-
disable_mm_preprocessor_cache: true
|
193
|
-
limit_mm_per_prompt: { image: 1 }
|
194
|
-
|
195
|
-
batch_size: 8
|
196
|
-
|
197
|
-
sampling:
|
198
|
-
temperature: 0.7
|
199
|
-
top_p: 0.95
|
200
|
-
max_tokens: 256
|
201
|
-
repetition_penalty: 1.05
|
202
|
-
skip_special_tokens: true
|
203
|
-
stop: ["<|end|>", "<|endoftext|>", "<|im_end|>"]
|
204
|
-
|
205
|
-
inference_prompts:
|
206
|
-
- "describe this image in detail"
|
207
|
-
- "provide a comprehensive description of the visual content"
|
208
|
-
- "what are the key elements in this image?"
|
209
|
-
|
210
|
-
storage:
|
211
|
-
data_dir: ./caption_data
|
212
|
-
checkpoint_dir: ./checkpoints
|
213
|
-
caption_buffer_size: 100
|
214
|
-
checkpoint_interval: 1000
|
215
|
-
|
216
|
-
# chunking/queueing
|
217
|
-
chunk_size: 1000
|
218
|
-
chunks_per_request: 2
|
219
|
-
chunk_buffer_multiplier: 3
|
220
|
-
min_chunk_buffer: 10
|
221
|
-
|
222
|
-
auth:
|
223
|
-
worker_tokens:
|
224
|
-
- { token: "example-worker-token", name: "Example Worker" }
|
225
|
-
monitor_tokens:
|
226
|
-
- { token: "letmein", name: "Default monitor" }
|
227
|
-
admin_tokens:
|
228
|
-
- { token: "admin-secret-2024", name: "Admin" }
|
229
|
-
```
|
230
|
-
|
231
|
-
### worker.yaml (highlights)
|
232
|
-
|
233
|
-
```yaml
|
234
|
-
worker:
|
235
|
-
server: ws://localhost:8765 # use wss:// in prod
|
236
|
-
token: example-worker-token
|
237
|
-
name: local-gpu
|
238
|
-
gpu_id: 0
|
239
|
-
vllm: true
|
240
|
-
|
241
|
-
# local queues
|
242
|
-
readahead_size: 256
|
243
|
-
inference_queue_size: 128
|
244
|
-
```
|
245
|
-
|
246
|
-
### monitor.yaml (optional)
|
247
|
-
|
248
|
-
```yaml
|
249
|
-
monitor:
|
250
|
-
server: ws://localhost:8765
|
251
|
-
token: letmein
|
252
|
-
refresh_rate: 1.0
|
253
|
-
show_contributors: true
|
254
|
-
show_quality_metrics: true
|
255
|
-
max_activity_items: 20
|
256
|
-
show_chunk_progress: true
|
257
|
-
show_worker_queues: true
|
258
|
-
show_throughput_graph: true
|
259
|
-
```
|
260
|
-
|
261
171
|
---
|
262
172
|
|
263
173
|
## tls / certificates
|
@@ -300,66 +210,24 @@ PRs welcome. keep it simple and fast.
|
|
300
210
|
```
|
301
211
|
┌─────────────┐ WebSocket ┌─────────────┐
|
302
212
|
│ Worker │◄──────────────────►│ │
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
│
|
307
|
-
|
213
|
+
│ │ │ │ ┌──────────────┐
|
214
|
+
│ │◄───────────────────│ │────►│Arrow/Parquet │
|
215
|
+
└─────────────┘ HTTP (img data) │ Orchestrator│ │ Storage │
|
216
|
+
│ │ └──────────────┘
|
217
|
+
┌─────────────┐ │ │
|
218
|
+
│ Worker │◄──────────────────►│ │
|
219
|
+
│ │ │ │
|
220
|
+
│ │◄───────────────────│ │
|
221
|
+
└─────────────┘ HTTP (img data) └─────────────┘
|
308
222
|
▲
|
309
223
|
┌─────────────┐ │
|
310
224
|
│ Monitor │◄──────────────────────────┘
|
311
225
|
└─────────────┘
|
312
226
|
```
|
313
227
|
|
314
|
-
##
|
315
|
-
|
316
|
-
### captions.parquet
|
317
|
-
|
318
|
-
- `job_id`: Unique job identifier
|
319
|
-
* `dataset`: Dataset name
|
320
|
-
* `shard`: Shard identifier
|
321
|
-
* `item_key`: Item within shard
|
322
|
-
* `caption`: Generated caption text
|
323
|
-
* `contributor_id`: Worker who generated it
|
324
|
-
* `timestamp`: Generation time
|
325
|
-
* `quality_score`: Optional quality metric
|
326
|
-
|
327
|
-
### jobs.parquet
|
328
|
-
|
329
|
-
- `job_id`: Unique identifier
|
330
|
-
* `dataset`: Dataset name
|
331
|
-
* `shard`: Shard identifier
|
332
|
-
* `status`: pending/processing/completed/failed
|
333
|
-
* `assigned_to`: Worker ID
|
334
|
-
* `timestamp`: Status change time
|
335
|
-
|
336
|
-
### contributors.parquet
|
337
|
-
|
338
|
-
- `contributor_id`: Unique identifier
|
339
|
-
* `name`: Display name
|
340
|
-
* `total_captions`: Lifetime count
|
341
|
-
* `trust_level`: Quality tier (0-5)
|
342
|
-
|
343
|
-
## Development
|
344
|
-
|
345
|
-
```bash
|
346
|
-
# Install with dev dependencies
|
347
|
-
pip install -e ".[dev]"
|
348
|
-
|
349
|
-
# Run tests
|
350
|
-
pytest
|
351
|
-
|
352
|
-
# Format code
|
353
|
-
black src/
|
354
|
-
ruff --fix src/
|
355
|
-
|
356
|
-
# Type checking
|
357
|
-
mypy src/
|
358
|
-
```
|
359
|
-
|
360
|
-
## Community Contribution
|
228
|
+
## Community Clusters
|
361
229
|
|
362
|
-
To contribute compute:
|
230
|
+
To contribute compute to a cluster:
|
363
231
|
|
364
232
|
1. Install caption-flow: `pip install caption-flow`
|
365
233
|
2. Get a worker token from the project maintainer
|
@@ -369,4 +237,4 @@ Your contributions will be tracked and attributed in the final dataset!
|
|
369
237
|
|
370
238
|
## License
|
371
239
|
|
372
|
-
|
240
|
+
AGPLv3
|
@@ -1,11 +1,12 @@
|
|
1
1
|
# CaptionFlow
|
2
2
|
|
3
|
-
scalable, fault-tolerant **vLLM-powered image captioning**.
|
3
|
+
scalable, fault-tolerant **vLLM-powered image captioning**.
|
4
|
+
|
5
|
+
a fast websocket-based orchestrator paired with lightweight gpu workers achieves exceptional performance for batched requests through vLLM.
|
4
6
|
|
5
7
|
* **orchestrator**: hands out work in chunked shards, collects captions, checkpoints progress, and keeps simple stats.
|
6
8
|
* **workers (vLLM)**: connect to the orchestrator, stream in image samples, batch them, and generate 1..N captions per image using prompts supplied by the orchestrator.
|
7
9
|
* **config-driven**: all components read YAML config; flags can override.
|
8
|
-
* **tui monitor (optional)**: a monitor client is wired into the CLI; ship a `monitor` module to enable it.
|
9
10
|
|
10
11
|
> no conda. just `venv` + `pip`.
|
11
12
|
|
@@ -59,6 +60,25 @@ caption-flow worker --config my-worker.yaml --server ws://your.hostname.address:
|
|
59
60
|
caption-flow monitor --config my-monitor.yaml
|
60
61
|
```
|
61
62
|
|
63
|
+
5. export the data
|
64
|
+
|
65
|
+
```bash
|
66
|
+
% caption-flow export --help
|
67
|
+
Usage: caption-flow export [OPTIONS]
|
68
|
+
|
69
|
+
Export caption data to various formats.
|
70
|
+
|
71
|
+
Options:
|
72
|
+
--format [jsonl|json|csv|txt|huggingface_hub|all] Export format (default: jsonl)
|
73
|
+
```
|
74
|
+
|
75
|
+
* **jsonl**: create JSON line file in the specified `--output` path
|
76
|
+
* **csv**: exports CSV-compatible data columns to the `--output` path containing incomplete metadata
|
77
|
+
* **json**: creates a `.json` file for each sample inside the `--output` subdirectory containing **complete** metadata; useful for webdatasets
|
78
|
+
* **txt**: creates `.txt` file for each sample inside the `--output` subdirectory containing ONLY captions
|
79
|
+
* **huggingface_hub**: creates a dataset on Hugging Face Hub, possibly `--private` and `--nsfw` where necessary
|
80
|
+
* **all**: creates all export formats in a specified `--output` directory
|
81
|
+
|
62
82
|
---
|
63
83
|
|
64
84
|
## how it’s wired
|
@@ -67,20 +87,11 @@ caption-flow monitor --config my-monitor.yaml
|
|
67
87
|
|
68
88
|
* **websocket server** (default `0.0.0.0:8765`) with three client roles: workers, data-feeders, and admin.
|
69
89
|
* **dataset control**: the orchestrator centrally defines the dataset (`huggingface` or `local`) and version/name. it chunk-slices shards and assigns work.
|
90
|
+
* **data serving to remote workers**: local files can be captioned by remote workers that don't have access to the same files, automatically.
|
70
91
|
* **vLLM config broadcast**: model, tp size, dtype, max seq len, memory targets, batching, sampling params, and **inference prompts** are all pushed to workers; workers can apply many changes without a model reload.
|
71
92
|
* **storage + checkpoints**: captions buffer to disk with periodic checkpoints. chunk state is tracked so restarts don’t double-work.
|
72
93
|
* **auth**: token lists for `worker`, `monitor`, and `admin` roles.
|
73
94
|
|
74
|
-
start flags you’ll likely use:
|
75
|
-
|
76
|
-
```text
|
77
|
-
--config PATH # yaml config for the orchestrator
|
78
|
-
--port INT, --host STR # bind controls
|
79
|
-
--data-dir PATH # overrides storage.data_dir
|
80
|
-
--cert PATH, --key PATH # enable TLS (or use --no-ssl for ws:// in dev)
|
81
|
-
--vllm # use the vLLM-style orchestrator (webdataset/hf)
|
82
|
-
```
|
83
|
-
|
84
95
|
### vLLM worker
|
85
96
|
|
86
97
|
* **one process per gpu**. select the device with `--gpu-id` (or `worker.gpu_id` in YAML).
|
@@ -88,27 +99,15 @@ start flags you’ll likely use:
|
|
88
99
|
* **resilient**: detects disconnects, abandons the current chunk cleanly, clears queues, reconnects, and resumes.
|
89
100
|
* **batched generate()**: images are resized down for consistent batching; each image can get multiple captions (one per prompt).
|
90
101
|
|
91
|
-
|
92
|
-
|
93
|
-
```text
|
94
|
-
--config PATH # yaml for the worker
|
95
|
-
--server URL # ws(s)://host:port
|
96
|
-
--token STR # must match an allowed worker token on the orchestrator
|
97
|
-
--name STR # display name
|
98
|
-
--batch-size INT # override vLLM batch size
|
99
|
-
--vllm # use the vLLM worker implementation
|
100
|
-
--gpu-id INT # which gpu to use
|
101
|
-
--precision STR, --model STR # optional overrides for dtype/model
|
102
|
-
--no-verify-ssl # accept self-signed certs in dev
|
103
|
-
```
|
104
|
-
|
105
|
-
### (optional) monitor
|
102
|
+
---
|
106
103
|
|
107
|
-
|
104
|
+
## dataset formats
|
108
105
|
|
109
|
-
|
106
|
+
* huggingface hub or local based URL list datasets that are compatible with the datasets library
|
107
|
+
* webdatasets shards containing full image data; also can be hosted on the hub
|
108
|
+
* local folder filled with images; orchestrator will serve the data to workers
|
110
109
|
|
111
|
-
## configuration
|
110
|
+
## configuration path
|
112
111
|
|
113
112
|
### config discovery order
|
114
113
|
|
@@ -122,98 +121,6 @@ for any component, the CLI looks for config in this order (first match wins):
|
|
122
121
|
6. any `$XDG_CONFIG_DIRS` entries under `caption-flow/`
|
123
122
|
7. `./examples/<component>.yaml` (fallback)
|
124
123
|
|
125
|
-
### orchestrator.yaml (highlights)
|
126
|
-
|
127
|
-
```yaml
|
128
|
-
orchestrator:
|
129
|
-
host: 0.0.0.0
|
130
|
-
port: 8765
|
131
|
-
# ssl:
|
132
|
-
# cert: /path/fullchain.pem
|
133
|
-
# key: /path/privkey.pem
|
134
|
-
|
135
|
-
dataset:
|
136
|
-
type: huggingface
|
137
|
-
path: <hf-dataset-or-local-path>
|
138
|
-
name: <logical-name>
|
139
|
-
version: "1.0"
|
140
|
-
|
141
|
-
vllm:
|
142
|
-
model: Qwen/Qwen2.5-VL-3B-Instruct
|
143
|
-
tensor_parallel_size: 1
|
144
|
-
max_model_len: 16384
|
145
|
-
dtype: float16
|
146
|
-
gpu_memory_utilization: 0.92
|
147
|
-
enforce_eager: true
|
148
|
-
disable_mm_preprocessor_cache: true
|
149
|
-
limit_mm_per_prompt: { image: 1 }
|
150
|
-
|
151
|
-
batch_size: 8
|
152
|
-
|
153
|
-
sampling:
|
154
|
-
temperature: 0.7
|
155
|
-
top_p: 0.95
|
156
|
-
max_tokens: 256
|
157
|
-
repetition_penalty: 1.05
|
158
|
-
skip_special_tokens: true
|
159
|
-
stop: ["<|end|>", "<|endoftext|>", "<|im_end|>"]
|
160
|
-
|
161
|
-
inference_prompts:
|
162
|
-
- "describe this image in detail"
|
163
|
-
- "provide a comprehensive description of the visual content"
|
164
|
-
- "what are the key elements in this image?"
|
165
|
-
|
166
|
-
storage:
|
167
|
-
data_dir: ./caption_data
|
168
|
-
checkpoint_dir: ./checkpoints
|
169
|
-
caption_buffer_size: 100
|
170
|
-
checkpoint_interval: 1000
|
171
|
-
|
172
|
-
# chunking/queueing
|
173
|
-
chunk_size: 1000
|
174
|
-
chunks_per_request: 2
|
175
|
-
chunk_buffer_multiplier: 3
|
176
|
-
min_chunk_buffer: 10
|
177
|
-
|
178
|
-
auth:
|
179
|
-
worker_tokens:
|
180
|
-
- { token: "example-worker-token", name: "Example Worker" }
|
181
|
-
monitor_tokens:
|
182
|
-
- { token: "letmein", name: "Default monitor" }
|
183
|
-
admin_tokens:
|
184
|
-
- { token: "admin-secret-2024", name: "Admin" }
|
185
|
-
```
|
186
|
-
|
187
|
-
### worker.yaml (highlights)
|
188
|
-
|
189
|
-
```yaml
|
190
|
-
worker:
|
191
|
-
server: ws://localhost:8765 # use wss:// in prod
|
192
|
-
token: example-worker-token
|
193
|
-
name: local-gpu
|
194
|
-
gpu_id: 0
|
195
|
-
vllm: true
|
196
|
-
|
197
|
-
# local queues
|
198
|
-
readahead_size: 256
|
199
|
-
inference_queue_size: 128
|
200
|
-
```
|
201
|
-
|
202
|
-
### monitor.yaml (optional)
|
203
|
-
|
204
|
-
```yaml
|
205
|
-
monitor:
|
206
|
-
server: ws://localhost:8765
|
207
|
-
token: letmein
|
208
|
-
refresh_rate: 1.0
|
209
|
-
show_contributors: true
|
210
|
-
show_quality_metrics: true
|
211
|
-
max_activity_items: 20
|
212
|
-
show_chunk_progress: true
|
213
|
-
show_worker_queues: true
|
214
|
-
show_throughput_graph: true
|
215
|
-
```
|
216
|
-
|
217
124
|
---
|
218
125
|
|
219
126
|
## tls / certificates
|
@@ -256,66 +163,24 @@ PRs welcome. keep it simple and fast.
|
|
256
163
|
```
|
257
164
|
┌─────────────┐ WebSocket ┌─────────────┐
|
258
165
|
│ Worker │◄──────────────────►│ │
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
│
|
263
|
-
|
166
|
+
│ │ │ │ ┌──────────────┐
|
167
|
+
│ │◄───────────────────│ │────►│Arrow/Parquet │
|
168
|
+
└─────────────┘ HTTP (img data) │ Orchestrator│ │ Storage │
|
169
|
+
│ │ └──────────────┘
|
170
|
+
┌─────────────┐ │ │
|
171
|
+
│ Worker │◄──────────────────►│ │
|
172
|
+
│ │ │ │
|
173
|
+
│ │◄───────────────────│ │
|
174
|
+
└─────────────┘ HTTP (img data) └─────────────┘
|
264
175
|
▲
|
265
176
|
┌─────────────┐ │
|
266
177
|
│ Monitor │◄──────────────────────────┘
|
267
178
|
└─────────────┘
|
268
179
|
```
|
269
180
|
|
270
|
-
##
|
271
|
-
|
272
|
-
### captions.parquet
|
273
|
-
|
274
|
-
- `job_id`: Unique job identifier
|
275
|
-
* `dataset`: Dataset name
|
276
|
-
* `shard`: Shard identifier
|
277
|
-
* `item_key`: Item within shard
|
278
|
-
* `caption`: Generated caption text
|
279
|
-
* `contributor_id`: Worker who generated it
|
280
|
-
* `timestamp`: Generation time
|
281
|
-
* `quality_score`: Optional quality metric
|
282
|
-
|
283
|
-
### jobs.parquet
|
284
|
-
|
285
|
-
- `job_id`: Unique identifier
|
286
|
-
* `dataset`: Dataset name
|
287
|
-
* `shard`: Shard identifier
|
288
|
-
* `status`: pending/processing/completed/failed
|
289
|
-
* `assigned_to`: Worker ID
|
290
|
-
* `timestamp`: Status change time
|
291
|
-
|
292
|
-
### contributors.parquet
|
293
|
-
|
294
|
-
- `contributor_id`: Unique identifier
|
295
|
-
* `name`: Display name
|
296
|
-
* `total_captions`: Lifetime count
|
297
|
-
* `trust_level`: Quality tier (0-5)
|
298
|
-
|
299
|
-
## Development
|
300
|
-
|
301
|
-
```bash
|
302
|
-
# Install with dev dependencies
|
303
|
-
pip install -e ".[dev]"
|
304
|
-
|
305
|
-
# Run tests
|
306
|
-
pytest
|
307
|
-
|
308
|
-
# Format code
|
309
|
-
black src/
|
310
|
-
ruff --fix src/
|
311
|
-
|
312
|
-
# Type checking
|
313
|
-
mypy src/
|
314
|
-
```
|
315
|
-
|
316
|
-
## Community Contribution
|
181
|
+
## Community Clusters
|
317
182
|
|
318
|
-
To contribute compute:
|
183
|
+
To contribute compute to a cluster:
|
319
184
|
|
320
185
|
1. Install caption-flow: `pip install caption-flow`
|
321
186
|
2. Get a worker token from the project maintainer
|
@@ -325,4 +190,4 @@ Your contributions will be tracked and attributed in the final dataset!
|
|
325
190
|
|
326
191
|
## License
|
327
192
|
|
328
|
-
|
193
|
+
AGPLv3
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "caption-flow"
|
3
|
-
version = "0.2.3"
|
3
|
+
version = "0.3.1"
|
4
4
|
description = "Self-contained distributed community captioning system"
|
5
5
|
readme = "README.md"
|
6
6
|
requires-python = ">=3.10,<3.13"
|
@@ -38,6 +38,9 @@ dependencies = [
|
|
38
38
|
"datasets (>=4.0.0,<5.0.0)",
|
39
39
|
"boto3 (>=1.40.11,<2.0.0)",
|
40
40
|
"torchdata (>=0.11.0,<0.12.0)",
|
41
|
+
"textual (>=5.3.0,<6.0.0)",
|
42
|
+
"urwid (>=3.0.2,<4.0.0)",
|
43
|
+
"webshart (>=0.4.0,<0.5.0)",
|
41
44
|
]
|
42
45
|
|
43
46
|
[project.optional-dependencies]
|