caption-flow 0.2.1__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {caption_flow-0.2.1/src/caption_flow.egg-info → caption_flow-0.2.3}/PKG-INFO +29 -27
  2. {caption_flow-0.2.1 → caption_flow-0.2.3}/README.md +28 -26
  3. {caption_flow-0.2.1 → caption_flow-0.2.3}/pyproject.toml +1 -1
  4. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow/cli.py +2 -1
  5. caption_flow-0.2.3/src/caption_flow/models.py +191 -0
  6. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow/monitor.py +1 -1
  7. caption_flow-0.2.3/src/caption_flow/orchestrator.py +914 -0
  8. caption_flow-0.2.3/src/caption_flow/processors/__init__.py +11 -0
  9. caption_flow-0.2.3/src/caption_flow/processors/base.py +219 -0
  10. caption_flow-0.2.3/src/caption_flow/processors/huggingface.py +832 -0
  11. caption_flow-0.2.3/src/caption_flow/processors/local_filesystem.py +683 -0
  12. caption_flow-0.2.3/src/caption_flow/processors/webdataset.py +782 -0
  13. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow/storage.py +415 -406
  14. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow/utils/checkpoint_tracker.py +2 -2
  15. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow/utils/chunk_tracker.py +94 -35
  16. caption_flow-0.2.3/src/caption_flow/utils/dataset_loader.py +222 -0
  17. caption_flow-0.2.3/src/caption_flow/utils/dataset_metadata_cache.py +67 -0
  18. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow/utils/image_processor.py +1 -4
  19. caption_flow-0.2.3/src/caption_flow/utils/shard_processor.py +119 -0
  20. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow/utils/shard_tracker.py +1 -5
  21. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow/workers/base.py +3 -3
  22. caption_flow-0.2.3/src/caption_flow/workers/caption.py +945 -0
  23. {caption_flow-0.2.1 → caption_flow-0.2.3/src/caption_flow.egg-info}/PKG-INFO +29 -27
  24. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow.egg-info/SOURCES.txt +6 -0
  25. caption_flow-0.2.1/src/caption_flow/models.py +0 -84
  26. caption_flow-0.2.1/src/caption_flow/orchestrator.py +0 -2086
  27. caption_flow-0.2.1/src/caption_flow/utils/dataset_loader.py +0 -680
  28. caption_flow-0.2.1/src/caption_flow/utils/shard_processor.py +0 -315
  29. caption_flow-0.2.1/src/caption_flow/workers/caption.py +0 -1321
  30. {caption_flow-0.2.1 → caption_flow-0.2.3}/LICENSE +0 -0
  31. {caption_flow-0.2.1 → caption_flow-0.2.3}/setup.cfg +0 -0
  32. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow/__init__.py +0 -0
  33. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow/utils/__init__.py +0 -0
  34. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow/utils/auth.py +0 -0
  35. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow/utils/caption_utils.py +0 -0
  36. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow/utils/certificates.py +0 -0
  37. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow/utils/job_queue.py +0 -0
  38. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow/utils/json_utils.py +0 -0
  39. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow/utils/prompt_template.py +0 -0
  40. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow/utils/vllm_config.py +0 -0
  41. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow/workers/data.py +0 -0
  42. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow.egg-info/dependency_links.txt +0 -0
  43. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow.egg-info/entry_points.txt +0 -0
  44. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow.egg-info/requires.txt +0 -0
  45. {caption_flow-0.2.1 → caption_flow-0.2.3}/src/caption_flow.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: caption-flow
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Summary: Self-contained distributed community captioning system
5
5
  Author-email: bghira <bghira@users.github.com>
6
6
  License: MIT
@@ -69,12 +69,14 @@ pip install -e . # installs the `caption-flow` command
69
69
  1. copy + edit the sample configs
70
70
 
71
71
  ```bash
72
- cp orchestrator.yaml my-orchestrator.yaml
73
- cp worker.yaml my-worker.yaml
74
- cp monitor.yaml my-monitor.yaml # optional; requires a monitor module
72
+ cp examples/orchestrator/local_image_files.yaml my-orchestrator.yaml
73
+ cp examples/worker.yaml my-worker.yaml
74
+ cp examples/monitor.yaml my-monitor.yaml # optional terminal interface
75
75
  ```
76
76
 
77
- set a unique shared token in both `my-orchestrator.yaml` and `my-worker.yaml` (see `auth.worker_tokens` in the orchestrator config and `worker.token` in the worker config). if you use private hugging face datasets/models, export `HUGGINGFACE_HUB_TOKEN` before starting workers.
77
+ set a unique shared token in both `my-orchestrator.yaml` and `my-worker.yaml` (see `auth.worker_tokens` in the orchestrator config and `worker.token` in the worker config).
78
+
79
+ if you use private hugging face datasets/models, export `HUGGINGFACE_HUB_TOKEN` before starting anything.
78
80
 
79
81
  2. start the orchestrator
80
82
 
@@ -90,6 +92,9 @@ caption-flow worker --config my-worker.yaml --gpu-id 0
90
92
 
91
93
  # your second GPU
92
94
  caption-flow worker --config my-worker.yaml --gpu-id 1
95
+
96
+ # on a remote host
97
+ caption-flow worker --config my-worker.yaml --server ws://your.hostname.address:8765
93
98
  ```
94
99
 
95
100
  4. (optional) start the monitor
@@ -98,12 +103,6 @@ caption-flow worker --config my-worker.yaml --gpu-id 1
98
103
  caption-flow monitor --config my-monitor.yaml
99
104
  ```
100
105
 
101
- 5. (optional) scan/fix chunks on disk if you had crashes
102
-
103
- ```bash
104
- caption-flow scan_chunks --data-dir ./caption_data --checkpoint-dir ./checkpoints --fix
105
- ```
106
-
107
106
  ---
108
107
 
109
108
  ## how it’s wired
@@ -178,7 +177,7 @@ orchestrator:
178
177
  # key: /path/privkey.pem
179
178
 
180
179
  dataset:
181
- type: huggingface # or "local"
180
+ type: huggingface
182
181
  path: <hf-dataset-or-local-path>
183
182
  name: <logical-name>
184
183
  version: "1.0"
@@ -315,28 +314,31 @@ PRs welcome. keep it simple and fast.
315
314
  ## Storage Schema
316
315
 
317
316
  ### captions.parquet
317
+
318
318
  - `job_id`: Unique job identifier
319
- - `dataset`: Dataset name
320
- - `shard`: Shard identifier
321
- - `item_key`: Item within shard
322
- - `caption`: Generated caption text
323
- - `contributor_id`: Worker who generated it
324
- - `timestamp`: Generation time
325
- - `quality_score`: Optional quality metric
319
+ * `dataset`: Dataset name
320
+ * `shard`: Shard identifier
321
+ * `item_key`: Item within shard
322
+ * `caption`: Generated caption text
323
+ * `contributor_id`: Worker who generated it
324
+ * `timestamp`: Generation time
325
+ * `quality_score`: Optional quality metric
326
326
 
327
327
  ### jobs.parquet
328
+
328
329
  - `job_id`: Unique identifier
329
- - `dataset`: Dataset name
330
- - `shard`: Shard identifier
331
- - `status`: pending/processing/completed/failed
332
- - `assigned_to`: Worker ID
333
- - `timestamp`: Status change time
330
+ * `dataset`: Dataset name
331
+ * `shard`: Shard identifier
332
+ * `status`: pending/processing/completed/failed
333
+ * `assigned_to`: Worker ID
334
+ * `timestamp`: Status change time
334
335
 
335
336
  ### contributors.parquet
337
+
336
338
  - `contributor_id`: Unique identifier
337
- - `name`: Display name
338
- - `total_captions`: Lifetime count
339
- - `trust_level`: Quality tier (0-5)
339
+ * `name`: Display name
340
+ * `total_captions`: Lifetime count
341
+ * `trust_level`: Quality tier (0-5)
340
342
 
341
343
  ## Development
342
344
 
@@ -25,12 +25,14 @@ pip install -e . # installs the `caption-flow` command
25
25
  1. copy + edit the sample configs
26
26
 
27
27
  ```bash
28
- cp orchestrator.yaml my-orchestrator.yaml
29
- cp worker.yaml my-worker.yaml
30
- cp monitor.yaml my-monitor.yaml # optional; requires a monitor module
28
+ cp examples/orchestrator/local_image_files.yaml my-orchestrator.yaml
29
+ cp examples/worker.yaml my-worker.yaml
30
+ cp examples/monitor.yaml my-monitor.yaml # optional terminal interface
31
31
  ```
32
32
 
33
- set a unique shared token in both `my-orchestrator.yaml` and `my-worker.yaml` (see `auth.worker_tokens` in the orchestrator config and `worker.token` in the worker config). if you use private hugging face datasets/models, export `HUGGINGFACE_HUB_TOKEN` before starting workers.
33
+ set a unique shared token in both `my-orchestrator.yaml` and `my-worker.yaml` (see `auth.worker_tokens` in the orchestrator config and `worker.token` in the worker config).
34
+
35
+ if you use private hugging face datasets/models, export `HUGGINGFACE_HUB_TOKEN` before starting anything.
34
36
 
35
37
  2. start the orchestrator
36
38
 
@@ -46,6 +48,9 @@ caption-flow worker --config my-worker.yaml --gpu-id 0
46
48
 
47
49
  # your second GPU
48
50
  caption-flow worker --config my-worker.yaml --gpu-id 1
51
+
52
+ # on a remote host
53
+ caption-flow worker --config my-worker.yaml --server ws://your.hostname.address:8765
49
54
  ```
50
55
 
51
56
  4. (optional) start the monitor
@@ -54,12 +59,6 @@ caption-flow worker --config my-worker.yaml --gpu-id 1
54
59
  caption-flow monitor --config my-monitor.yaml
55
60
  ```
56
61
 
57
- 5. (optional) scan/fix chunks on disk if you had crashes
58
-
59
- ```bash
60
- caption-flow scan_chunks --data-dir ./caption_data --checkpoint-dir ./checkpoints --fix
61
- ```
62
-
63
62
  ---
64
63
 
65
64
  ## how it’s wired
@@ -134,7 +133,7 @@ orchestrator:
134
133
  # key: /path/privkey.pem
135
134
 
136
135
  dataset:
137
- type: huggingface # or "local"
136
+ type: huggingface
138
137
  path: <hf-dataset-or-local-path>
139
138
  name: <logical-name>
140
139
  version: "1.0"
@@ -271,28 +270,31 @@ PRs welcome. keep it simple and fast.
271
270
  ## Storage Schema
272
271
 
273
272
  ### captions.parquet
273
+
274
274
  - `job_id`: Unique job identifier
275
- - `dataset`: Dataset name
276
- - `shard`: Shard identifier
277
- - `item_key`: Item within shard
278
- - `caption`: Generated caption text
279
- - `contributor_id`: Worker who generated it
280
- - `timestamp`: Generation time
281
- - `quality_score`: Optional quality metric
275
+ * `dataset`: Dataset name
276
+ * `shard`: Shard identifier
277
+ * `item_key`: Item within shard
278
+ * `caption`: Generated caption text
279
+ * `contributor_id`: Worker who generated it
280
+ * `timestamp`: Generation time
281
+ * `quality_score`: Optional quality metric
282
282
 
283
283
  ### jobs.parquet
284
+
284
285
  - `job_id`: Unique identifier
285
- - `dataset`: Dataset name
286
- - `shard`: Shard identifier
287
- - `status`: pending/processing/completed/failed
288
- - `assigned_to`: Worker ID
289
- - `timestamp`: Status change time
286
+ * `dataset`: Dataset name
287
+ * `shard`: Shard identifier
288
+ * `status`: pending/processing/completed/failed
289
+ * `assigned_to`: Worker ID
290
+ * `timestamp`: Status change time
290
291
 
291
292
  ### contributors.parquet
293
+
292
294
  - `contributor_id`: Unique identifier
293
- - `name`: Display name
294
- - `total_captions`: Lifetime count
295
- - `trust_level`: Quality tier (0-5)
295
+ * `name`: Display name
296
+ * `total_captions`: Lifetime count
297
+ * `trust_level`: Quality tier (0-5)
296
298
 
297
299
  ## Development
298
300
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "caption-flow"
3
- version = "0.2.1"
3
+ version = "0.2.3"
4
4
  description = "Self-contained distributed community captioning system"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10,<3.13"
@@ -124,7 +124,7 @@ def setup_logging(verbose: bool = False):
124
124
  level = logging.DEBUG if verbose else logging.INFO
125
125
  logging.basicConfig(
126
126
  level=level,
127
- format="%(asctime)s %(message)s",
127
+ format="%(message)s",
128
128
  datefmt="[%Y-%m-%d %H:%M:%S]",
129
129
  handlers=[
130
130
  RichHandler(
@@ -161,6 +161,7 @@ def main(ctx, verbose: bool):
161
161
  @click.option("--key", help="SSL key path")
162
162
  @click.option("--no-ssl", is_flag=True, help="Disable SSL (development only)")
163
163
  @click.option("--vllm", is_flag=True, help="Use vLLM orchestrator for WebDataset/HF datasets")
164
+ @click.option("--verbose", is_flag=True, help="Enable verbose logging")
164
165
  @click.pass_context
165
166
  def orchestrator(ctx, config: Optional[str], **kwargs):
166
167
  """Start the orchestrator server."""
@@ -0,0 +1,191 @@
1
+ """Data models for CaptionFlow."""
2
+
3
+ import PIL
4
+ from dataclasses import dataclass, field
5
+ from datetime import datetime
6
+ from enum import Enum
7
+ from typing import Any, Dict, List, Optional, Tuple
8
+ from PIL import Image
9
+
10
+
11
+ class JobStatus(Enum):
12
+ """Job processing status."""
13
+
14
+ PENDING = "pending"
15
+ PROCESSING = "processing"
16
+ COMPLETED = "completed"
17
+ FAILED = "failed"
18
+
19
+ def __str__(self):
20
+ return self.value
21
+
22
+ def to_json(self):
23
+ return self.value
24
+
25
+
26
+ @dataclass
27
+ class Job:
28
+ """Captioning job."""
29
+
30
+ job_id: str
31
+ dataset: str
32
+ shard: str
33
+ item_key: str
34
+ status: JobStatus = JobStatus.PENDING
35
+ assigned_to: Optional[str] = None
36
+ created_at: datetime = None
37
+
38
+ def __post_init__(self):
39
+ if self.created_at is None:
40
+ self.created_at = datetime.utcnow()
41
+
42
+
43
+ @dataclass
44
+ class JobId:
45
+ shard_id: str
46
+ chunk_id: str
47
+ sample_id: str
48
+
49
+ def get_shard_str(self):
50
+ return f"{self.shard_id}"
51
+
52
+ def get_chunk_str(self):
53
+ return f"{self.shard_id}:chunk:{self.chunk_id}"
54
+
55
+ def get_sample_str(self):
56
+ return f"{self.shard_id}:chunk:{self.chunk_id}:idx:{self.sample_id}"
57
+
58
+ @staticmethod
59
+ def from_dict(job: dict) -> "JobId":
60
+ return JobId(shard_id=job["shard_id"], chunk_id=job["chunk_id"], sample_id=job["sample_id"])
61
+
62
+ @staticmethod
63
+ def from_values(shard_id: str, chunk_id: str, sample_id: str) -> "JobId":
64
+ return JobId(shard_id=shard_id, chunk_id=chunk_id, sample_id=sample_id)
65
+
66
+ @staticmethod
67
+ def from_str(job_id: str):
68
+ # from data-0000:chunk:0:idx:0
69
+ parts = job_id.split(":")
70
+ if len(parts) != 5:
71
+ raise ValueError(f"Invalid job_id format: {job_id}")
72
+ return JobId(shard_id=parts[0], chunk_id=parts[2], sample_id=parts[4])
73
+
74
+
75
+ @dataclass
76
+ class Caption:
77
+ """Generated caption with attribution and image metadata."""
78
+
79
+ # Core fields
80
+ job_id: str
81
+ dataset: str
82
+ shard: str
83
+ item_key: str
84
+ contributor_id: str
85
+ timestamp: datetime
86
+ caption_count: int = 1 # Number of captions generated for this item
87
+ caption: Optional[str] = None
88
+ captions: Optional[List[str]] = None
89
+ outputs: Dict[str, List[str]] = field(default_factory=dict)
90
+ quality_score: Optional[float] = None
91
+ quality_scores: Optional[List[float]] = None
92
+
93
+ # Image metadata
94
+ image_width: Optional[int] = None
95
+ image_height: Optional[int] = None
96
+ image_format: Optional[str] = None
97
+ file_size: Optional[int] = None
98
+ filename: Optional[str] = None
99
+ url: Optional[str] = None
100
+
101
+ # Processing metadata
102
+ caption_index: Optional[int] = None # Which caption this is (0, 1, 2...)
103
+ total_captions: Optional[int] = None # Total captions for this image
104
+ processing_time_ms: Optional[float] = None
105
+ chunk_id: Optional[str] = None
106
+ metadata: Dict[str, Any] = field(default_factory=dict)
107
+
108
+ def __post_init__(self):
109
+ if self.caption is None and self.captions is None:
110
+ raise ValueError("At least one of 'caption' or 'captions' must be provided")
111
+
112
+
113
+ @dataclass
114
+ class Contributor:
115
+ """Contributor information."""
116
+
117
+ contributor_id: str
118
+ name: str
119
+ total_captions: int = 0
120
+ trust_level: int = 1
121
+
122
+
123
+ @dataclass
124
+ class ProcessingStage:
125
+ """Configuration for a single processing stage."""
126
+
127
+ name: str
128
+ model: str
129
+ prompts: List[str]
130
+ output_field: str
131
+ requires: List[str] = field(default_factory=list)
132
+ sampling: Optional[Dict[str, Any]] = None
133
+
134
+ # Model-specific overrides
135
+ tensor_parallel_size: Optional[int] = None
136
+ max_model_len: Optional[int] = None
137
+ dtype: Optional[str] = None
138
+ gpu_memory_utilization: Optional[float] = None
139
+
140
+
141
+ @dataclass
142
+ class StageResult:
143
+ """Results from a single stage."""
144
+
145
+ stage_name: str
146
+ output_field: str
147
+ outputs: List[str] # Multiple outputs from multiple prompts
148
+ error: Optional[str] = None
149
+
150
+ def is_success(self) -> bool:
151
+ return self.error is None and bool(self.outputs)
152
+
153
+
154
+ @dataclass
155
+ class ShardChunk:
156
+ """Shard chunk assignment with unprocessed ranges."""
157
+
158
+ chunk_id: str
159
+ shard_url: str
160
+ shard_name: str
161
+ start_index: int
162
+ chunk_size: int
163
+ unprocessed_ranges: List[Tuple[int, int]] = field(default_factory=list)
164
+
165
+
166
+ @dataclass
167
+ class ProcessingItem:
168
+ """Item being processed."""
169
+
170
+ chunk_id: str
171
+ item_key: str
172
+ image: Image.Image
173
+ image_data: bytes
174
+ metadata: Dict[str, Any] = field(default_factory=dict)
175
+ stage_results: Dict[str, StageResult] = field(default_factory=dict) # Accumulated results
176
+
177
+
178
+ @dataclass
179
+ class ProcessedResult:
180
+ """Result with multi-stage outputs."""
181
+
182
+ chunk_id: str
183
+ shard_name: str
184
+ item_key: str
185
+ outputs: Dict[str, List[str]] # field_name -> list of outputs
186
+ image_width: int
187
+ image_height: int
188
+ image_format: str
189
+ file_size: int
190
+ processing_time_ms: float
191
+ metadata: Dict[str, Any] = field(default_factory=dict)
@@ -83,7 +83,7 @@ class Monitor:
83
83
  await self._handle_update(data)
84
84
 
85
85
  except Exception as e:
86
- logger.error(f"Connection error: {e}")
86
+ logger.error(f"Connection error: {e}", exc_info=True)
87
87
  await asyncio.sleep(5)
88
88
 
89
89
  async def _handle_update(self, data: Dict):