caption-flow 0.1.0.tar.gz → 0.2.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. caption_flow-0.2.1/PKG-INFO +370 -0
  2. caption_flow-0.2.1/README.md +326 -0
  3. {caption_flow-0.1.0 → caption_flow-0.2.1}/pyproject.toml +5 -2
  4. {caption_flow-0.1.0 → caption_flow-0.2.1}/src/caption_flow/__init__.py +3 -2
  5. {caption_flow-0.1.0 → caption_flow-0.2.1}/src/caption_flow/cli.py +65 -42
  6. {caption_flow-0.1.0 → caption_flow-0.2.1}/src/caption_flow/models.py +6 -4
  7. {caption_flow-0.1.0 → caption_flow-0.2.1}/src/caption_flow/monitor.py +13 -3
  8. caption_flow-0.2.1/src/caption_flow/orchestrator.py +2086 -0
  9. caption_flow-0.2.1/src/caption_flow/storage.py +1051 -0
  10. caption_flow-0.2.1/src/caption_flow/utils/__init__.py +6 -0
  11. {caption_flow-0.1.0 → caption_flow-0.2.1}/src/caption_flow/utils/auth.py +24 -25
  12. caption_flow-0.2.1/src/caption_flow/utils/checkpoint_tracker.py +92 -0
  13. caption_flow-0.2.1/src/caption_flow/utils/chunk_tracker.py +449 -0
  14. caption_flow-0.2.1/src/caption_flow/utils/dataset_loader.py +680 -0
  15. caption_flow-0.2.1/src/caption_flow/utils/image_processor.py +171 -0
  16. caption_flow-0.2.1/src/caption_flow/utils/prompt_template.py +137 -0
  17. caption_flow-0.2.1/src/caption_flow/utils/shard_processor.py +315 -0
  18. caption_flow-0.2.1/src/caption_flow/utils/shard_tracker.py +87 -0
  19. caption_flow-0.2.1/src/caption_flow/workers/base.py +228 -0
  20. caption_flow-0.2.1/src/caption_flow/workers/caption.py +1321 -0
  21. caption_flow-0.1.0/src/caption_flow/worker_data.py → caption_flow-0.2.1/src/caption_flow/workers/data.py +162 -234
  22. caption_flow-0.2.1/src/caption_flow.egg-info/PKG-INFO +370 -0
  23. {caption_flow-0.1.0 → caption_flow-0.2.1}/src/caption_flow.egg-info/SOURCES.txt +8 -4
  24. {caption_flow-0.1.0 → caption_flow-0.2.1}/src/caption_flow.egg-info/requires.txt +3 -0
  25. caption_flow-0.1.0/PKG-INFO +0 -427
  26. caption_flow-0.1.0/README.md +0 -386
  27. caption_flow-0.1.0/src/caption_flow/orchestrator.py +0 -1301
  28. caption_flow-0.1.0/src/caption_flow/storage.py +0 -694
  29. caption_flow-0.1.0/src/caption_flow/utils/__init__.py +0 -4
  30. caption_flow-0.1.0/src/caption_flow/utils/chunk_tracker.py +0 -365
  31. caption_flow-0.1.0/src/caption_flow/utils/dataset_loader.py +0 -186
  32. caption_flow-0.1.0/src/caption_flow/utils/image_processor.py +0 -51
  33. caption_flow-0.1.0/src/caption_flow/worker.py +0 -300
  34. caption_flow-0.1.0/src/caption_flow/worker_vllm.py +0 -1028
  35. caption_flow-0.1.0/src/caption_flow.egg-info/PKG-INFO +0 -427
  36. {caption_flow-0.1.0 → caption_flow-0.2.1}/LICENSE +0 -0
  37. {caption_flow-0.1.0 → caption_flow-0.2.1}/setup.cfg +0 -0
  38. {caption_flow-0.1.0 → caption_flow-0.2.1}/src/caption_flow/utils/caption_utils.py +0 -0
  39. {caption_flow-0.1.0 → caption_flow-0.2.1}/src/caption_flow/utils/certificates.py +0 -0
  40. {caption_flow-0.1.0 → caption_flow-0.2.1}/src/caption_flow/utils/job_queue.py +0 -0
  41. {caption_flow-0.1.0 → caption_flow-0.2.1}/src/caption_flow/utils/json_utils.py +0 -0
  42. {caption_flow-0.1.0 → caption_flow-0.2.1}/src/caption_flow/utils/vllm_config.py +0 -0
  43. {caption_flow-0.1.0 → caption_flow-0.2.1}/src/caption_flow.egg-info/dependency_links.txt +0 -0
  44. {caption_flow-0.1.0 → caption_flow-0.2.1}/src/caption_flow.egg-info/entry_points.txt +0 -0
  45. {caption_flow-0.1.0 → caption_flow-0.2.1}/src/caption_flow.egg-info/top_level.txt +0 -0
@@ -0,0 +1,370 @@
Metadata-Version: 2.4
Name: caption-flow
Version: 0.2.1
Summary: Self-contained distributed community captioning system
Author-email: bghira <bghira@users.github.com>
License: MIT
Keywords: captioning,distributed,vllm,dataset,community
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: <3.13,>=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: websockets>=12.0
Requires-Dist: pyarrow>=14.0.0
Requires-Dist: click>=8.1.0
Requires-Dist: pydantic>=2.0.0
Requires-Dist: aiofiles>=23.0.0
Requires-Dist: rich>=13.0.0
Requires-Dist: cryptography>=41.0.0
Requires-Dist: pyyaml>=6.0
Requires-Dist: certbot>=2.0.0
Requires-Dist: numpy>=1.24.0
Requires-Dist: pillow>=10.0.0
Requires-Dist: vllm<0.11.0,>=0.10.0
Requires-Dist: webdataset<2.0.0,>=1.0.2
Requires-Dist: pandas<3.0.0,>=2.3.1
Requires-Dist: arrow<2.0.0,>=1.3.0
Requires-Dist: datasets<5.0.0,>=4.0.0
Requires-Dist: boto3<2.0.0,>=1.40.11
Requires-Dist: torchdata<0.12.0,>=0.11.0
Provides-Extra: dev
Requires-Dist: pytest>=7.4.0; extra == "dev"
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
Requires-Dist: black>=23.0.0; extra == "dev"
Requires-Dist: ruff>=0.1.0; extra == "dev"
Requires-Dist: mypy>=1.5.0; extra == "dev"
Dynamic: license-file

# CaptionFlow

scalable, fault-tolerant **vLLM-powered image captioning**. this "first round" focuses on a fast websocket orchestrator plus lightweight gpu workers that batch requests through vLLM.

* **orchestrator**: hands out work in chunked shards, collects captions, checkpoints progress, and keeps simple stats.
* **workers (vLLM)**: connect to the orchestrator, stream in image samples, batch them, and generate 1..N captions per image using prompts supplied by the orchestrator.
* **config-driven**: all components read YAML config; flags can override.
* **tui monitor (optional)**: a monitor client is wired into the CLI; ship a `monitor` module to enable it.

> no conda. just `venv` + `pip`.

---

## install

```bash
python -m venv .venv
source .venv/bin/activate   # windows: .venv\Scripts\activate
pip install --upgrade pip
pip install -e .            # installs the `caption-flow` command
```

## quickstart (single box)

1. copy + edit the sample configs

```bash
cp orchestrator.yaml my-orchestrator.yaml
cp worker.yaml my-worker.yaml
cp monitor.yaml my-monitor.yaml   # optional; requires a monitor module
```

set a unique shared token in both `my-orchestrator.yaml` and `my-worker.yaml` (see `auth.worker_tokens` in the orchestrator config and `worker.token` in the worker config). if you use private hugging face datasets/models, export `HUGGINGFACE_HUB_TOKEN` before starting workers.
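
for example, the same placeholder token wired into both files (the token value here is only an example):

```yaml
# my-orchestrator.yaml
auth:
  worker_tokens:
    - { token: "example-worker-token", name: "Example Worker" }

# my-worker.yaml
worker:
  token: example-worker-token
```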

2. start the orchestrator

```bash
caption-flow orchestrator --config my-orchestrator.yaml
```

3. start one or more vLLM workers

```bash
# gpu 0 on the same host
caption-flow worker --config my-worker.yaml --gpu-id 0

# your second GPU
caption-flow worker --config my-worker.yaml --gpu-id 1
```

4. (optional) start the monitor

```bash
caption-flow monitor --config my-monitor.yaml
```

5. (optional) scan/fix chunks on disk if you had crashes

```bash
caption-flow scan_chunks --data-dir ./caption_data --checkpoint-dir ./checkpoints --fix
```

---

## how it’s wired

### orchestrator

* **websocket server** (default `0.0.0.0:8765`) with three client roles: workers, data-feeders, and admin.
* **dataset control**: the orchestrator centrally defines the dataset (`huggingface` or `local`) and version/name. it chunk-slices shards and assigns work.
* **vLLM config broadcast**: model, tensor-parallel size, dtype, max seq len, memory targets, batching, sampling params, and **inference prompts** are all pushed to workers; workers can apply many changes without a model reload (a hypothetical sketch of this payload follows the list).
* **storage + checkpoints**: captions buffer to disk with periodic checkpoints. chunk state is tracked so restarts don’t double-work.
* **auth**: token lists for `worker`, `monitor`, and `admin` roles.

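purely as an illustration of what gets pushed (a hypothetical shape, not the actual wire format; the real messages live in the orchestrator and worker modules), the broadcast carries roughly the fields named above:

```yaml
# hypothetical config-broadcast payload (illustrative only)
type: config
dataset: { type: huggingface, path: <hf-dataset-or-local-path>, version: "1.0" }
vllm:
  model: Qwen/Qwen2.5-VL-3B-Instruct
  tensor_parallel_size: 1
  max_model_len: 16384
  dtype: float16
  batch_size: 8
  sampling: { temperature: 0.7, top_p: 0.95, max_tokens: 256 }
  inference_prompts:
    - "describe this image in detail"
```
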
start flags you’ll likely use:

```text
--config PATH             # yaml config for the orchestrator
--port INT, --host STR    # bind controls
--data-dir PATH           # overrides storage.data_dir
--cert PATH, --key PATH   # enable TLS (or use --no-ssl for ws:// in dev)
--vllm                    # use the vLLM-style orchestrator (webdataset/hf)
```

### vLLM worker

* **one process per gpu**. select the device with `--gpu-id` (or `worker.gpu_id` in YAML).
* **gets its marching orders** from the orchestrator: dataset info, model, prompts, batch size, and sampling.
* **resilient**: detects disconnects, abandons the current chunk cleanly, clears queues, reconnects, and resumes.
* **batched generate()**: images are resized down for consistent batching; each image can get multiple captions (one per prompt); a rough sketch of this step follows.
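
a minimal sketch of that generation step (not the actual worker code, which lives in `workers/caption.py`), assuming vLLM's offline `LLM` API and prompt strings that already contain the model's image placeholder tokens:

```python
from vllm import LLM, SamplingParams
from PIL import Image

# values mirror the orchestrator.yaml example below
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", max_model_len=16384,
          gpu_memory_utilization=0.92, limit_mm_per_prompt={"image": 1})
sampling = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=256)

def caption_batch(images: list[Image.Image], prompts: list[str]) -> list[list[str]]:
    """Return one caption per prompt for every image in the batch."""
    requests = [
        {"prompt": prompt, "multi_modal_data": {"image": img}}
        for img in images
        for prompt in prompts
    ]
    outputs = llm.generate(requests, sampling)
    captions = [o.outputs[0].text.strip() for o in outputs]
    # regroup the flat results back into per-image lists, dropping empty generations
    n = len(prompts)
    return [[c for c in captions[i:i + n] if c] for i in range(0, len(captions), n)]
```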

start flags you’ll likely use:

```text
--config PATH                  # yaml for the worker
--server URL                   # ws(s)://host:port
--token STR                    # must match an allowed worker token on the orchestrator
--name STR                     # display name
--batch-size INT               # override vLLM batch size
--vllm                         # use the vLLM worker implementation
--gpu-id INT                   # which gpu to use
--precision STR, --model STR   # optional overrides for dtype/model
--no-verify-ssl                # accept self-signed certs in dev
```
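
for example, overriding the YAML from the command line (the server name and worker name here are placeholders):

```bash
caption-flow worker \
  --server wss://orchestrator.example.com:8765 \
  --token example-worker-token \
  --name gpu-node-1 \
  --gpu-id 0 --batch-size 16 --vllm --no-verify-ssl
```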

### (optional) monitor

* a CLI entry exists for a TUI monitor; wire in a `monitor` module to enable it. config lives in `monitor.yaml` or inside `orchestrator.yaml` under `monitor:`.

---

## configuration

### config discovery order

for any component, the CLI looks for config in this order (first match wins; a rough sketch of the lookup follows the list):

1. `--config /path/to/file.yaml`
2. `./<component>.yaml` (current directory)
3. `~/.caption-flow/<component>.yaml`
4. `$XDG_CONFIG_HOME/caption-flow/<component>.yaml`
5. `/etc/caption-flow/<component>.yaml`
6. any `$XDG_CONFIG_DIRS` entries under `caption-flow/`
7. `./examples/<component>.yaml` (fallback)

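a minimal sketch of that precedence walk (the real implementation lives in the CLI; shown here only to make the order concrete):

```python
import os
from pathlib import Path

def find_config(component: str, explicit: str | None = None) -> Path | None:
    """Return the first existing config file for a component, or None."""
    candidates = [Path(explicit)] if explicit else []
    candidates += [
        Path.cwd() / f"{component}.yaml",
        Path.home() / ".caption-flow" / f"{component}.yaml",
        Path(os.environ.get("XDG_CONFIG_HOME", Path.home() / ".config"))
        / "caption-flow" / f"{component}.yaml",
        Path("/etc/caption-flow") / f"{component}.yaml",
    ]
    for base in os.environ.get("XDG_CONFIG_DIRS", "").split(":"):
        if base:
            candidates.append(Path(base) / "caption-flow" / f"{component}.yaml")
    candidates.append(Path("examples") / f"{component}.yaml")
    return next((p for p in candidates if p.is_file()), None)
```
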
### orchestrator.yaml (highlights)

```yaml
orchestrator:
  host: 0.0.0.0
  port: 8765
  # ssl:
  #   cert: /path/fullchain.pem
  #   key: /path/privkey.pem

dataset:
  type: huggingface          # or "local"
  path: <hf-dataset-or-local-path>
  name: <logical-name>
  version: "1.0"

vllm:
  model: Qwen/Qwen2.5-VL-3B-Instruct
  tensor_parallel_size: 1
  max_model_len: 16384
  dtype: float16
  gpu_memory_utilization: 0.92
  enforce_eager: true
  disable_mm_preprocessor_cache: true
  limit_mm_per_prompt: { image: 1 }

  batch_size: 8

  sampling:
    temperature: 0.7
    top_p: 0.95
    max_tokens: 256
    repetition_penalty: 1.05
    skip_special_tokens: true
    stop: ["<|end|>", "<|endoftext|>", "<|im_end|>"]

  inference_prompts:
    - "describe this image in detail"
    - "provide a comprehensive description of the visual content"
    - "what are the key elements in this image?"

storage:
  data_dir: ./caption_data
  checkpoint_dir: ./checkpoints
  caption_buffer_size: 100
  checkpoint_interval: 1000

# chunking/queueing
chunk_size: 1000
chunks_per_request: 2
chunk_buffer_multiplier: 3
min_chunk_buffer: 10

auth:
  worker_tokens:
    - { token: "example-worker-token", name: "Example Worker" }
  monitor_tokens:
    - { token: "letmein", name: "Default monitor" }
  admin_tokens:
    - { token: "admin-secret-2024", name: "Admin" }
```

### worker.yaml (highlights)

```yaml
worker:
  server: ws://localhost:8765   # use wss:// in prod
  token: example-worker-token
  name: local-gpu
  gpu_id: 0
  vllm: true

  # local queues
  readahead_size: 256
  inference_queue_size: 128
```

### monitor.yaml (optional)

```yaml
monitor:
  server: ws://localhost:8765
  token: letmein
  refresh_rate: 1.0
  show_contributors: true
  show_quality_metrics: true
  max_activity_items: 20
  show_chunk_progress: true
  show_worker_queues: true
  show_throughput_graph: true
```

---

## tls / certificates

use the built-in helpers during development:

```bash
# self-signed certs for quick local testing
caption-flow generate_cert --self-signed --domain localhost --output-dir ./certs

# inspect any certificate file
caption-flow inspect_cert ./certs/fullchain.pem
```

then point the orchestrator at the resulting cert/key (or run `--no-ssl` for dev-only ws://).
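
for example, assuming the helper above wrote `fullchain.pem`/`privkey.pem` into `./certs`:

```bash
# orchestrator with TLS
caption-flow orchestrator --config my-orchestrator.yaml \
  --cert ./certs/fullchain.pem --key ./certs/privkey.pem

# worker connecting over wss:// to a self-signed cert
caption-flow worker --config my-worker.yaml \
  --server wss://localhost:8765 --no-verify-ssl
```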

---

## tips & notes

* **multi-gpu**: start one worker process per gpu (set `--gpu-id` or `worker.gpu_id`).
* **throughput**: tune `vllm.batch_size` in the orchestrator config (or override with `--batch-size` at worker start). higher isn’t always better; watch VRAM.
* **prompts**: add more strings under `vllm.inference_prompts` to get multiple captions per image; the worker returns only non-empty generations.
* **private HF**: if your dataset/model needs auth, export `HUGGINGFACE_HUB_TOKEN` before `caption-flow worker ...`.
* **self-signed ssl**: pass `--no-verify-ssl` to workers/monitors in dev.
* **recovery**: if you hard-crash mid-run, `caption-flow scan_chunks --fix` can reset abandoned chunks so the orchestrator can reissue them cleanly.

---

## roadmap

* hot config reload via the admin websocket path.
* dedicated data-feeder clients (separate from gpu workers) that push samples into the orchestrator.
* richer monitor TUI.

PRs welcome. keep it simple and fast.

## architecture

```
┌─────────────┐     WebSocket      ┌─────────────┐
│   Worker    │◄──────────────────►│             │
└─────────────┘                    │             │     ┌──────────────┐
                                   │ Orchestrator│────►│Arrow/Parquet │
┌─────────────┐                    │             │     │   Storage    │
│   Worker    │◄──────────────────►│             │     └──────────────┘
└─────────────┘                    └─────────────┘
                                          │
┌─────────────┐                           │
│   Monitor   │◄──────────────────────────┘
└─────────────┘
```

## Storage Schema

### captions.parquet
- `job_id`: Unique job identifier
- `dataset`: Dataset name
- `shard`: Shard identifier
- `item_key`: Item within shard
- `caption`: Generated caption text
- `contributor_id`: Worker who generated it
- `timestamp`: Generation time
- `quality_score`: Optional quality metric

### jobs.parquet
- `job_id`: Unique identifier
- `dataset`: Dataset name
- `shard`: Shard identifier
- `status`: pending/processing/completed/failed
- `assigned_to`: Worker ID
- `timestamp`: Status change time

### contributors.parquet
- `contributor_id`: Unique identifier
- `name`: Display name
- `total_captions`: Lifetime count
- `trust_level`: Quality tier (0-5)

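These are plain Parquet files, so any Arrow-compatible tool can read them. A quick look with pandas, assuming the default `storage.data_dir` and that the tables land directly under it:

```python
import pandas as pd

captions = pd.read_parquet("./caption_data/captions.parquet")
contributors = pd.read_parquet("./caption_data/contributors.parquet")

# captions per contributor, joined with display names and trust levels
summary = (
    captions.groupby("contributor_id").size().rename("captions")
    .to_frame()
    .join(contributors.set_index("contributor_id")[["name", "trust_level"]])
)
print(summary.sort_values("captions", ascending=False).head(10))
```
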
## Development

```bash
# Install with dev dependencies
pip install -e ".[dev]"

# Run tests
pytest

# Format code
black src/
ruff --fix src/

# Type checking
mypy src/
```

## Community Contribution

To contribute compute:

1. Install caption-flow: `pip install caption-flow`
2. Get a worker token from the project maintainer
3. Run: `caption-flow worker --server wss://project.domain.com:8765 --token YOUR_TOKEN`

Your contributions will be tracked and attributed in the final dataset!

## License

MIT