caption-flow 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- caption_flow/cli.py +2 -1
- caption_flow/models.py +108 -1
- caption_flow/monitor.py +1 -1
- caption_flow/orchestrator.py +423 -1595
- caption_flow/processors/__init__.py +11 -0
- caption_flow/processors/base.py +219 -0
- caption_flow/processors/huggingface.py +832 -0
- caption_flow/processors/local_filesystem.py +683 -0
- caption_flow/processors/webdataset.py +782 -0
- caption_flow/storage.py +415 -406
- caption_flow/utils/checkpoint_tracker.py +2 -2
- caption_flow/utils/chunk_tracker.py +94 -35
- caption_flow/utils/dataset_loader.py +64 -522
- caption_flow/utils/dataset_metadata_cache.py +67 -0
- caption_flow/utils/image_processor.py +1 -4
- caption_flow/utils/shard_processor.py +4 -200
- caption_flow/utils/shard_tracker.py +1 -5
- caption_flow/workers/base.py +3 -3
- caption_flow/workers/caption.py +416 -792
- {caption_flow-0.2.1.dist-info → caption_flow-0.2.3.dist-info}/METADATA +29 -27
- caption_flow-0.2.3.dist-info/RECORD +35 -0
- caption_flow-0.2.1.dist-info/RECORD +0 -29
- {caption_flow-0.2.1.dist-info → caption_flow-0.2.3.dist-info}/WHEEL +0 -0
- {caption_flow-0.2.1.dist-info → caption_flow-0.2.3.dist-info}/entry_points.txt +0 -0
- {caption_flow-0.2.1.dist-info → caption_flow-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {caption_flow-0.2.1.dist-info → caption_flow-0.2.3.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: caption-flow
|
3
|
-
Version: 0.2.1
|
3
|
+
Version: 0.2.3
|
4
4
|
Summary: Self-contained distributed community captioning system
|
5
5
|
Author-email: bghira <bghira@users.github.com>
|
6
6
|
License: MIT
|
@@ -69,12 +69,14 @@ pip install -e . # installs the `caption-flow` command
|
|
69
69
|
1. copy + edit the sample configs
|
70
70
|
|
71
71
|
```bash
|
72
|
-
cp orchestrator.yaml my-orchestrator.yaml
|
73
|
-
cp worker.yaml my-worker.yaml
|
74
|
-
cp monitor.yaml my-monitor.yaml # optional
|
72
|
+
cp examples/orchestrator/local_image_files.yaml my-orchestrator.yaml
|
73
|
+
cp examples/worker.yaml my-worker.yaml
|
74
|
+
cp examples/monitor.yaml my-monitor.yaml # optional terminal interface
|
75
75
|
```
|
76
76
|
|
77
|
-
set a unique shared token in both `my-orchestrator.yaml` and `my-worker.yaml` (see `auth.worker_tokens` in the orchestrator config and `worker.token` in the worker config).
|
77
|
+
set a unique shared token in both `my-orchestrator.yaml` and `my-worker.yaml` (see `auth.worker_tokens` in the orchestrator config and `worker.token` in the worker config).
|
78
|
+
|
79
|
+
if you use private hugging face datasets/models, export `HUGGINGFACE_HUB_TOKEN` before starting anything.
|
78
80
|
|
79
81
|
2. start the orchestrator
|
80
82
|
|
@@ -90,6 +92,9 @@ caption-flow worker --config my-worker.yaml --gpu-id 0
|
|
90
92
|
|
91
93
|
# your second GPU
|
92
94
|
caption-flow worker --config my-worker.yaml --gpu-id 1
|
95
|
+
|
96
|
+
# on a remote host
|
97
|
+
caption-flow worker --config my-worker.yaml --server ws://your.hostname.address:8765
|
93
98
|
```
|
94
99
|
|
95
100
|
4. (optional) start the monitor
|
@@ -98,12 +103,6 @@ caption-flow worker --config my-worker.yaml --gpu-id 1
|
|
98
103
|
caption-flow monitor --config my-monitor.yaml
|
99
104
|
```
|
100
105
|
|
101
|
-
5. (optional) scan/fix chunks on disk if you had crashes
|
102
|
-
|
103
|
-
```bash
|
104
|
-
caption-flow scan_chunks --data-dir ./caption_data --checkpoint-dir ./checkpoints --fix
|
105
|
-
```
|
106
|
-
|
107
106
|
---
|
108
107
|
|
109
108
|
## how it’s wired
|
@@ -178,7 +177,7 @@ orchestrator:
|
|
178
177
|
# key: /path/privkey.pem
|
179
178
|
|
180
179
|
dataset:
|
181
|
-
type: huggingface
|
180
|
+
type: huggingface
|
182
181
|
path: <hf-dataset-or-local-path>
|
183
182
|
name: <logical-name>
|
184
183
|
version: "1.0"
|
@@ -315,28 +314,31 @@ PRs welcome. keep it simple and fast.
|
|
315
314
|
## Storage Schema
|
316
315
|
|
317
316
|
### captions.parquet
|
317
|
+
|
318
318
|
- `job_id`: Unique job identifier
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
319
|
+
* `dataset`: Dataset name
|
320
|
+
* `shard`: Shard identifier
|
321
|
+
* `item_key`: Item within shard
|
322
|
+
* `caption`: Generated caption text
|
323
|
+
* `contributor_id`: Worker who generated it
|
324
|
+
* `timestamp`: Generation time
|
325
|
+
* `quality_score`: Optional quality metric
|
326
326
|
|
327
327
|
### jobs.parquet
|
328
|
+
|
328
329
|
- `job_id`: Unique identifier
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
330
|
+
* `dataset`: Dataset name
|
331
|
+
* `shard`: Shard identifier
|
332
|
+
* `status`: pending/processing/completed/failed
|
333
|
+
* `assigned_to`: Worker ID
|
334
|
+
* `timestamp`: Status change time
|
334
335
|
|
335
336
|
### contributors.parquet
|
337
|
+
|
336
338
|
- `contributor_id`: Unique identifier
|
337
|
-
|
338
|
-
|
339
|
-
|
339
|
+
* `name`: Display name
|
340
|
+
* `total_captions`: Lifetime count
|
341
|
+
* `trust_level`: Quality tier (0-5)
|
340
342
|
|
341
343
|
## Development
|
342
344
|
|
@@ -0,0 +1,35 @@
|
|
1
|
+
caption_flow/__init__.py,sha256=NLPJ25lRN7xHqncXweINDNwbt0q8lgjZ30G21zlPdRs,303
|
2
|
+
caption_flow/cli.py,sha256=qEueeJhf3DvxSBxnOp5t32p6gAnZskvIDe6cwtPA0-Y,28892
|
3
|
+
caption_flow/models.py,sha256=bpr7yMy3vPErZCQwmgOYIix489rRGbT6lVw8wxxwTkc,4931
|
4
|
+
caption_flow/monitor.py,sha256=bAt9EJqfPgT_KdbknGdCxwBRH002pRDgyUmYIj6Dyso,7885
|
5
|
+
caption_flow/orchestrator.py,sha256=ciqWghxUxk-5s6u7W3JwD7_JLSFYV57NgOwiMkxME-I,36133
|
6
|
+
caption_flow/storage.py,sha256=Wqgtsk6yZ9Kf-izeUKHLwSvPUH3xFqIbzox20QHbc64,43370
|
7
|
+
caption_flow/processors/__init__.py,sha256=hvq-OuAJWQe6hFglKe7QmkS8473k20FmxZDSxfXpCrg,423
|
8
|
+
caption_flow/processors/base.py,sha256=JlTqCHo5HRXrXMVzgle_6pNwh4HGHsF7jLF6PeSnWr0,6783
|
9
|
+
caption_flow/processors/huggingface.py,sha256=MNz9vDMtrrTOSXe9Q_kbBrQ7XBv69X6x5xD_QP9icdg,33765
|
10
|
+
caption_flow/processors/local_filesystem.py,sha256=EYmsImbkqsIU7UZL2FijL0hotKLtPOtkzfwernQDSxA,27860
|
11
|
+
caption_flow/processors/webdataset.py,sha256=xsrYx7_5FCqez30dc4hSDYfyA9A0oKqHqwt7CRc1J0c,33812
|
12
|
+
caption_flow/utils/__init__.py,sha256=F1BChVoCsj9zn1GJRBOLHET1kLW6xrAmsbzcR7hHy6Y,202
|
13
|
+
caption_flow/utils/auth.py,sha256=UrxX2n8OEEcfMD1Ey27TxGfrJFmUCpC59x-SCrQJoVE,2253
|
14
|
+
caption_flow/utils/caption_utils.py,sha256=esUMAdcCkNjRroZ0Bhxv0_yKlLtMf0XeDCTt-5k6bik,5309
|
15
|
+
caption_flow/utils/certificates.py,sha256=eu4blQZEkL9NRaY1ynQWg1asvDorRYhGRZea7STonJE,4635
|
16
|
+
caption_flow/utils/checkpoint_tracker.py,sha256=-nN5gLvXyMdKOCT2SNNL2Km6UYm2Hii9wuXeezWhwx4,3339
|
17
|
+
caption_flow/utils/chunk_tracker.py,sha256=x9UwFxpj-nMeAJ6bpKw5E09QNUqu7L0pejTlk8nxgE8,19402
|
18
|
+
caption_flow/utils/dataset_loader.py,sha256=2-SgXPGQkF4CyA3zyVYfSbZMSk4YzTsVFY0izmOZPrM,8771
|
19
|
+
caption_flow/utils/dataset_metadata_cache.py,sha256=AJ8Z1GYT0DC9_LLjxNvrePKU7ecenNZun5GhaB2gvj0,2650
|
20
|
+
caption_flow/utils/image_processor.py,sha256=7Ed92iUJ-OvjzQmAGPaULoYEqoirVHHo0lxtceWGc44,5586
|
21
|
+
caption_flow/utils/job_queue.py,sha256=itdfXcrkvGjmXn4qtpgMF63k1ufRBaejDe4V6WcxzgU,1104
|
22
|
+
caption_flow/utils/json_utils.py,sha256=IiZYn8uCM-3pYmyIbX2fmaOIyutArn67SqAyp0ggNpU,5396
|
23
|
+
caption_flow/utils/prompt_template.py,sha256=AKp0diSZqNBMwZkpiTNjw8-bbQwHStr7QZTOJ7o1dC4,4345
|
24
|
+
caption_flow/utils/shard_processor.py,sha256=_PCW5TfSHFfCc63Sn7bVzgjA625-aWzL4cWwZLjW0rQ,3935
|
25
|
+
caption_flow/utils/shard_tracker.py,sha256=1OqiueaC8WoxhY2nc03erZAc50mnQCZazATS6R14lbQ,3029
|
26
|
+
caption_flow/utils/vllm_config.py,sha256=TC7Rmjk0zRKbBXbWUXrFL4Z58hzax_-4L0pXZn09hdM,6019
|
27
|
+
caption_flow/workers/base.py,sha256=2AGWERC5hbmO-0V_A1MUbgRVvRNN3blqGPyDokvvzmM,7575
|
28
|
+
caption_flow/workers/caption.py,sha256=_uvpdoBzym1TKWKXtky7hBfj8YnG1EaJz-NRwaH2X1A,36722
|
29
|
+
caption_flow/workers/data.py,sha256=0Tg8NE0wdONeMlivYQ4nvbcfWdLuU51O7vR8_YSnJgo,14813
|
30
|
+
caption_flow-0.2.3.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
31
|
+
caption_flow-0.2.3.dist-info/METADATA,sha256=bk5Gk3eWuDH_UWXPEDKulksPc3hVHvnzm3sstLbuU-0,11914
|
32
|
+
caption_flow-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
33
|
+
caption_flow-0.2.3.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
|
34
|
+
caption_flow-0.2.3.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
|
35
|
+
caption_flow-0.2.3.dist-info/RECORD,,
|
@@ -1,29 +0,0 @@
|
|
1
|
-
caption_flow/__init__.py,sha256=NLPJ25lRN7xHqncXweINDNwbt0q8lgjZ30G21zlPdRs,303
|
2
|
-
caption_flow/cli.py,sha256=bHxx66CPsCmSieaH3pw8NZBojIIbniRTdU9mEBHMmWA,28832
|
3
|
-
caption_flow/models.py,sha256=qo6lQiO10UISbaBVr6Cs-fSW_pmjwE6kmiTmmU_l3Wk,2140
|
4
|
-
caption_flow/monitor.py,sha256=ZZCSasYLKJ-UzA3-RoAtytv-tbNA-m3h5YjlZg_vukg,7870
|
5
|
-
caption_flow/orchestrator.py,sha256=bZ8NnGdqoXSmu7Nq-_7cOSH1DLHkBT88cne0uDyPeNY,89112
|
6
|
-
caption_flow/storage.py,sha256=hC6ZHT_PHFoUVjqD5JUwy3_79oAD1e1H30neA_xsz7s,40748
|
7
|
-
caption_flow/utils/__init__.py,sha256=F1BChVoCsj9zn1GJRBOLHET1kLW6xrAmsbzcR7hHy6Y,202
|
8
|
-
caption_flow/utils/auth.py,sha256=UrxX2n8OEEcfMD1Ey27TxGfrJFmUCpC59x-SCrQJoVE,2253
|
9
|
-
caption_flow/utils/caption_utils.py,sha256=esUMAdcCkNjRroZ0Bhxv0_yKlLtMf0XeDCTt-5k6bik,5309
|
10
|
-
caption_flow/utils/certificates.py,sha256=eu4blQZEkL9NRaY1ynQWg1asvDorRYhGRZea7STonJE,4635
|
11
|
-
caption_flow/utils/checkpoint_tracker.py,sha256=8tsTFF-HcygitK92YcS-QWzeg-qRm9AuCpQoQRfC8M0,3335
|
12
|
-
caption_flow/utils/chunk_tracker.py,sha256=hKn8CN6ubErc9kuCWZMj12ZCZKxVlqXqAEocbzjfa-k,17296
|
13
|
-
caption_flow/utils/dataset_loader.py,sha256=ZplJv655ZMyUbaZC4BBiL5II18sBy4JSJhxGZtK_VmA,29107
|
14
|
-
caption_flow/utils/image_processor.py,sha256=Zl8TAv9gYPdAYat3UiTuuNdIb2fXNfZ35AxsxuovJTs,5650
|
15
|
-
caption_flow/utils/job_queue.py,sha256=itdfXcrkvGjmXn4qtpgMF63k1ufRBaejDe4V6WcxzgU,1104
|
16
|
-
caption_flow/utils/json_utils.py,sha256=IiZYn8uCM-3pYmyIbX2fmaOIyutArn67SqAyp0ggNpU,5396
|
17
|
-
caption_flow/utils/prompt_template.py,sha256=AKp0diSZqNBMwZkpiTNjw8-bbQwHStr7QZTOJ7o1dC4,4345
|
18
|
-
caption_flow/utils/shard_processor.py,sha256=CRda6M4xh4U0vwvYlzq9nJEzz4d_4yzUBosYAeBcPEA,10854
|
19
|
-
caption_flow/utils/shard_tracker.py,sha256=Wt2oE-O85F2FxSnqIocJiaYeFn00OVVjIiklZIZRGL8,3233
|
20
|
-
caption_flow/utils/vllm_config.py,sha256=TC7Rmjk0zRKbBXbWUXrFL4Z58hzax_-4L0pXZn09hdM,6019
|
21
|
-
caption_flow/workers/base.py,sha256=jPm_Xw4Lxd0cnrPs-biBqKRQKkTOJLvHLolmp0Gb1CI,7530
|
22
|
-
caption_flow/workers/caption.py,sha256=NZ9kTjk2uOoNwyyNSkB_arYk213vLr5mowHN-OjiFkk,54631
|
23
|
-
caption_flow/workers/data.py,sha256=0Tg8NE0wdONeMlivYQ4nvbcfWdLuU51O7vR8_YSnJgo,14813
|
24
|
-
caption_flow-0.2.1.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
25
|
-
caption_flow-0.2.1.dist-info/METADATA,sha256=fxNfSOqkCklb96aq3ZFU7SvRuXEBUQ11xbjkQn7Yzuo,11941
|
26
|
-
caption_flow-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
27
|
-
caption_flow-0.2.1.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
|
28
|
-
caption_flow-0.2.1.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
|
29
|
-
caption_flow-0.2.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|