caption-flow 0.3.1__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {caption_flow-0.3.1/src/caption_flow.egg-info → caption_flow-0.3.2}/PKG-INFO +1 -1
- {caption_flow-0.3.1 → caption_flow-0.3.2}/pyproject.toml +1 -1
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/__init__.py +1 -1
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/processors/base.py +3 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/processors/huggingface.py +1 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/processors/local_filesystem.py +2 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/processors/webdataset.py +62 -7
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/chunk_tracker.py +1 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/workers/caption.py +29 -11
- {caption_flow-0.3.1 → caption_flow-0.3.2/src/caption_flow.egg-info}/PKG-INFO +1 -1
- {caption_flow-0.3.1 → caption_flow-0.3.2}/LICENSE +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/README.md +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/setup.cfg +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/cli.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/models.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/monitor.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/orchestrator.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/processors/__init__.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/storage/__init__.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/storage/exporter.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/storage/manager.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/__init__.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/auth.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/caption_utils.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/certificates.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/checkpoint_tracker.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/image_processor.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/json_utils.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/prompt_template.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/vllm_config.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/viewer.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/workers/base.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/workers/data.py +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow.egg-info/SOURCES.txt +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow.egg-info/dependency_links.txt +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow.egg-info/entry_points.txt +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow.egg-info/requires.txt +0 -0
- {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow.egg-info/top_level.txt +0 -0
```diff
--- caption_flow-0.3.1/src/caption_flow/processors/base.py
+++ caption_flow-0.3.2/src/caption_flow/processors/base.py
@@ -14,6 +14,7 @@ class WorkUnit:
     unit_id: str  # usually, but not always, the chunk id
     chunk_id: str  # always the chunk id
     source_id: str  # the shard name
+    unit_size: int  # how many elements are in the workunit
     data: Dict[str, Any]
     metadata: Dict[str, Any] = field(default_factory=dict)
     priority: int = 0
@@ -44,6 +45,7 @@ class WorkAssignment:
                 "unit_id": u.unit_id,
                 "source_id": u.source_id,
                 "chunk_id": u.chunk_id,
+                "unit_size": u.unit_size,
                 "data": u.data,
                 "metadata": u.metadata,
                 "priority": u.priority,
@@ -62,6 +64,7 @@ class WorkAssignment:
                 unit_id=u["unit_id"],
                 chunk_id=u["chunk_id"],
                 source_id=u["source_id"],
+                unit_size=u["unit_size"],
                 data=u["data"],
                 metadata=u.get("metadata", {}),
                 priority=u.get("priority", 0),
```
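The new `unit_size` field is threaded through the dataclass, `to_dict`, and `from_dict` so that work assignments round-trip intact between orchestrator and workers. A minimal sketch of that round trip, using a pared-down stand-in for the real classes (the field names come from the diff; everything else is assumed for illustration):

```python
from dataclasses import dataclass, field
from typing import Any, Dict

@dataclass
class WorkUnitSketch:
    # Pared-down stand-in for caption_flow's WorkUnit; the real class has more behaviour.
    unit_id: str
    chunk_id: str
    source_id: str
    unit_size: int  # new in 0.3.2: how many elements are in the work unit
    data: Dict[str, Any]
    metadata: Dict[str, Any] = field(default_factory=dict)
    priority: int = 0

def to_wire(u: WorkUnitSketch) -> Dict[str, Any]:
    return {
        "unit_id": u.unit_id,
        "chunk_id": u.chunk_id,
        "source_id": u.source_id,
        "unit_size": u.unit_size,  # must be serialized, or deserialization would raise KeyError
        "data": u.data,
        "metadata": u.metadata,
        "priority": u.priority,
    }

def from_wire(d: Dict[str, Any]) -> WorkUnitSketch:
    return WorkUnitSketch(
        unit_id=d["unit_id"],
        chunk_id=d["chunk_id"],
        source_id=d["source_id"],
        unit_size=d["unit_size"],
        data=d["data"],
        metadata=d.get("metadata", {}),
        priority=d.get("priority", 0),
    )

unit = WorkUnitSketch("shard-0:chunk:0", "shard-0:chunk:0", "shard-0", 1000, {"start_index": 0})
assert from_wire(to_wire(unit)) == unit
```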
```diff
--- caption_flow-0.3.1/src/caption_flow/processors/local_filesystem.py
+++ caption_flow-0.3.2/src/caption_flow/processors/local_filesystem.py
@@ -251,6 +251,7 @@ class LocalFilesystemOrchestratorProcessor(OrchestratorProcessor):
                     unit_id=chunk_id,
                     chunk_id=chunk_id,
                     source_id="local",
+                    unit_size=chunk_state.chunk_size,
                     data={
                         "start_index": chunk_state.start_index,
                         "chunk_size": chunk_state.chunk_size,
@@ -319,6 +320,7 @@ class LocalFilesystemOrchestratorProcessor(OrchestratorProcessor):
                 unit_id=unit_id,
                 chunk_id=unit_id,
                 source_id="local",
+                unit_size=chunk_size,
                 data={
                     "start_index": self.current_index,
                     "chunk_size": chunk_size,
```
```diff
--- caption_flow-0.3.1/src/caption_flow/processors/webdataset.py
+++ caption_flow-0.3.2/src/caption_flow/processors/webdataset.py
@@ -12,6 +12,7 @@ from datetime import datetime
 from PIL import Image
 import io
 
+from caption_flow.models import JobId
 from caption_flow.storage import StorageManager
 from .base import OrchestratorProcessor, WorkerProcessor, ProcessorConfig, WorkUnit, WorkResult
 from ..utils import ChunkTracker
@@ -21,6 +22,7 @@ import cv2
 import numpy as np
 
 logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
 
 
 class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
@@ -114,17 +116,22 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
             return
 
         shards_summary = self.chunk_tracker.get_shards_summary()
+        logger.debug(f"Restoring state: {shards_summary}")
 
         with self.lock:
             for shard_name, shard_info in shards_summary.items():
                 chunks = shard_info.get("chunks", [])
+                logger.debug(f"Existing job ids: {storage.get_all_processed_job_ids()}")
                 for chunk_state in chunks:
                     # Only add incomplete chunks
                     if chunk_state.status != "completed":
-                        logger.debug(f"Restoring incomplete chunk {chunk_state
+                        logger.debug(f"Restoring incomplete chunk {chunk_state}")
 
                         # Get unprocessed ranges
                         unprocessed_ranges = chunk_state.get_unprocessed_ranges()
+                        logger.debug(
+                            f"Chunk {chunk_state.chunk_id} unprocessed ranges: {unprocessed_ranges}"
+                        )
                         if not unprocessed_ranges:
                             continue
 
@@ -139,6 +146,7 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
                             unit_id=chunk_state.chunk_id,
                             chunk_id=chunk_state.chunk_id,
                             source_id=shard_name,
+                            unit_size=chunk_state.chunk_size,
                             data={
                                 "shard_url": chunk_state.shard_url,
                                 "shard_name": shard_name,
@@ -201,7 +209,13 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
 
             # Create chunk for current position
             chunk_size = min(self.chunk_size, shard_files - current_file_idx)
-
+            self.current_chunk_index = current_file_idx // self.chunk_size
+            job_id_obj = JobId(
+                shard_id=shard_name,
+                chunk_id=self.current_chunk_index,
+                sample_id=current_file_idx,
+            )
+            chunk_id = job_id_obj.get_chunk_str()
 
             with self.lock:
                 # Skip if already exists
```
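The orchestrator now derives chunk identifiers from a `JobId` value object (`caption_flow.models.JobId`) built from the shard name, the chunk index (`current_file_idx // self.chunk_size`), and the sample index. The exact string returned by `get_chunk_str()` is not shown in this diff, so the stand-in below only illustrates the idea; its format is an assumption:

```python
from dataclasses import dataclass

@dataclass
class JobIdSketch:
    """Illustrative stand-in for caption_flow.models.JobId; the real format may differ."""
    shard_id: str
    chunk_id: int
    sample_id: int

    def get_chunk_str(self) -> str:
        # Assumed format, for illustration only.
        return f"{self.shard_id}:chunk:{self.chunk_id}"

chunk_size = 1000
current_file_idx = 2500
current_chunk_index = current_file_idx // chunk_size  # -> 2, deterministic across restarts
job = JobIdSketch(shard_id="shard-00001", chunk_id=current_chunk_index, sample_id=current_file_idx)
print(job.get_chunk_str())  # e.g. "shard-00001:chunk:2"
```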
```diff
@@ -224,6 +238,7 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
                 unit_id=chunk_id,
                 chunk_id=chunk_id,
                 source_id=shard_name,
+                unit_size=chunk_size,
                 data={
                     "shard_url": shard_url,
                     "shard_name": shard_name,
@@ -268,6 +283,25 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
             unit = self.work_units.get(unit_id)
 
             if unit:
+                # Update unprocessed ranges from chunk tracker before assigning
+                if self.chunk_tracker and unit_id in self.chunk_tracker.chunks:
+                    chunk_state = self.chunk_tracker.chunks[unit_id]
+                    relative_unprocessed = chunk_state.get_unprocessed_ranges()
+
+                    # Convert relative to absolute indices
+                    absolute_ranges = []
+                    for start, end in relative_unprocessed:
+                        abs_start = chunk_state.start_index + start
+                        abs_end = chunk_state.start_index + end
+                        absolute_ranges.append((abs_start, abs_end))
+
+                    # Update the work unit's unprocessed ranges
+                    unit.data["unprocessed_ranges"] = absolute_ranges
+
+                    logger.debug(
+                        f"Updated unit {unit_id} with unprocessed ranges: {absolute_ranges}"
+                    )
+
                 self.assigned_units[worker_id].add(unit_id)
                 assigned.append(unit)
 
```
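Before a unit is handed to a worker, its chunk-relative unprocessed ranges are rebased onto absolute dataset indices using the chunk's `start_index`. A small sketch of that conversion with a worked example (the helper name is illustrative; the offset arithmetic mirrors the diff):

```python
from typing import List, Tuple

def to_absolute_ranges(start_index: int, relative_ranges: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
    """Rebase chunk-relative (start, end) ranges onto absolute dataset indices."""
    return [(start_index + start, start_index + end) for start, end in relative_ranges]

# A chunk beginning at absolute index 1000 with items 0-4 and 10-14 still unprocessed:
print(to_absolute_ranges(1000, [(0, 4), (10, 14)]))  # [(1000, 1004), (1010, 1014)]
```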
```diff
@@ -391,8 +425,27 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
             if indices:
                 # Sort indices and convert to ranges
                 sorted_indices = sorted(indices)
-
-
+                if not sorted_indices:
+                    continue
+
+                # Condense into contiguous ranges
+                ranges = []
+                start_range = sorted_indices[0]
+                end_range = sorted_indices[0]
+
+                for i in range(1, len(sorted_indices)):
+                    if sorted_indices[i] == end_range + 1:
+                        end_range = sorted_indices[i]
+                    else:
+                        ranges.append((start_range, end_range))
+                        start_range = sorted_indices[i]
+                        end_range = sorted_indices[i]
+                ranges.append((start_range, end_range))
+
+                # Mark each contiguous range as processed
+                logger.debug(f"Marking ranges {ranges} as processed in chunk {chunk_id}")
+                for start_idx, end_idx in ranges:
+                    self.chunk_tracker.mark_items_processed(chunk_id, start_idx, end_idx)
 
     def get_stats(self) -> Dict[str, Any]:
         """Get processor statistics."""
```
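Sorted item indices are now condensed into contiguous (start, end) ranges before being reported, so a single `mark_items_processed` call covers a whole run of consecutive items instead of one call per index. A standalone sketch of the same run-length condensing (the helper name is assumed):

```python
from typing import Iterable, List, Tuple

def condense_to_ranges(indices: Iterable[int]) -> List[Tuple[int, int]]:
    """Collapse indices into inclusive, contiguous (start, end) ranges."""
    sorted_indices = sorted(indices)
    if not sorted_indices:
        return []
    ranges: List[Tuple[int, int]] = []
    start_range = end_range = sorted_indices[0]
    for idx in sorted_indices[1:]:
        if idx == end_range + 1:
            end_range = idx  # extend the current run
        else:
            ranges.append((start_range, end_range))
            start_range = end_range = idx
    ranges.append((start_range, end_range))
    return ranges

print(condense_to_ranges([3, 4, 5, 9, 10, 12]))  # [(3, 5), (9, 10), (12, 12)]
```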
```diff
@@ -488,7 +541,7 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
 
     def process_unit(self, unit: WorkUnit, context: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
         """Process a work unit by iterating specified ranges."""
-        logger.debug(f"Processing unit: {unit
+        logger.debug(f"Processing unit: {unit}")
 
         shard_name = unit.data["shard_name"]
         shard_idx = unit.data.get("shard_idx")
@@ -514,6 +567,7 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
                         "_chunk_relative_index": idx - unit.data["start_index"],
                         "_job_id": job_id,
                         "_mock": True,
+                        "_processed_indices": processed_indices,
                     },
                     "job_id": job_id,
                 }
@@ -574,6 +628,7 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
                         "_job_id": job_id,
                         "_filename": entry.path,
                         "_file_size": entry.size,
+                        "_processed_indices": processed_indices,
                     },
                     "job_id": job_id,
                 }
@@ -605,8 +660,8 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
         result = super().prepare_result(unit, outputs, processing_time_ms)
 
         # Add processed indices for chunk tracker
-        if
-        result.metadata["item_indices"] =
+        if hasattr(self, "_last_context") and "_processed_indices" in self._last_context:
+            result.metadata["item_indices"] = self._last_context["_processed_indices"]
 
         return result
 
```
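Both the mock and real item paths now attach the shared `processed_indices` list to each item's metadata, and `prepare_result` copies that list into `result.metadata["item_indices"]` via a `_last_context` attribute, which is what the orchestrator later condenses into ranges. A rough sketch of the hand-off; how `_last_context` is populated is not shown in this diff, so the assignment below is an assumption:

```python
class WorkerSketch:
    """Illustrative only: one way _processed_indices could reach prepare_result."""

    def process_unit(self, unit, context):
        processed_indices = []
        context["_processed_indices"] = processed_indices
        self._last_context = context  # assumption: the real processor records the context it used
        for idx in range(unit["start_index"], unit["start_index"] + unit["unit_size"]):
            processed_indices.append(idx)
            yield {"metadata": {"_processed_indices": processed_indices}, "job_id": idx}

    def prepare_result(self, unit, outputs):
        metadata = {}
        if hasattr(self, "_last_context") and "_processed_indices" in self._last_context:
            metadata["item_indices"] = self._last_context["_processed_indices"]
        return {"outputs": outputs, "metadata": metadata}


worker = WorkerSketch()
items = list(worker.process_unit({"start_index": 1000, "unit_size": 3}, {}))
print(worker.prepare_result({"unit_id": "u"}, items)["metadata"]["item_indices"])  # [1000, 1001, 1002]
```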
```diff
--- caption_flow-0.3.1/src/caption_flow/workers/caption.py
+++ caption_flow-0.3.2/src/caption_flow/workers/caption.py
@@ -565,7 +565,8 @@ class CaptionWorker(BaseWorker):
         batch = []
         batch_size = self.vllm_config.get("batch_size", 8)
         context = {}
-
+        self.items_processed = 0
+        self.items_failed = 0
         # Collect items for batching
         for item_data in self.processor.process_unit(unit, context):
             if self.should_stop_processing.is_set() or not self.connected.is_set():
@@ -604,16 +605,33 @@ class CaptionWorker(BaseWorker):
         self._process_batch(batch)
 
         # Notify orchestrator that unit is complete
-        if
-
-
-
-
-
-
-
-
+        # Check if the number of processed items matches the expected count for the unit.
+        # The context dictionary holds the count of items yielded by the processor.
+        total_items_in_unit = unit.unit_size
+
+        if (
+            not self.should_stop_processing.is_set()
+            and self.connected.is_set()
+            and self.items_failed == 0
+            and self.items_processed >= total_items_in_unit
+        ):
+            if self.websocket:
+                try:
+                    asyncio.run_coroutine_threadsafe(
+                        self.websocket.send(
+                            json.dumps({"type": "work_complete", "unit_id": unit.unit_id})
+                        ),
+                        self.main_loop,
+                    ).result(timeout=5)
+                    logger.info(
+                        f"Unit {unit.unit_id} fully processed ({self.items_processed}/{total_items_in_unit}) and marked complete."
+                    )
+                except Exception as e:
+                    logger.warning(f"Could not notify work complete for unit {unit.unit_id}: {e}")
+        else:
+            logger.warning(
+                f"Processing of unit {unit.unit_id} was incomplete ({self.items_processed}/{total_items_in_unit}). Not marking as complete."
+            )
 
     def _process_batch(self, batch: List[ProcessingItem]):
         """Process a batch of items through all stages."""
```
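The completion notice is sent from the worker's processing thread into the asyncio loop that owns the websocket, using `run_coroutine_threadsafe` with a bounded wait. A minimal, self-contained sketch of that thread-to-loop pattern (names here are illustrative, not the CaptionWorker API):

```python
import asyncio
import json
import threading

async def fake_send(message: str) -> None:
    # Stand-in for websocket.send(); just prints what would go over the wire.
    print("sent:", message)

def notify_complete(loop: asyncio.AbstractEventLoop, unit_id: str) -> None:
    """Called from a worker thread: schedule the coroutine on the loop and wait up to 5s."""
    future = asyncio.run_coroutine_threadsafe(
        fake_send(json.dumps({"type": "work_complete", "unit_id": unit_id})),
        loop,
    )
    future.result(timeout=5)

async def main() -> None:
    loop = asyncio.get_running_loop()
    worker = threading.Thread(target=notify_complete, args=(loop, "shard-0:chunk:2"))
    worker.start()
    # Keep the loop alive long enough for the cross-thread call to complete.
    await asyncio.sleep(0.1)
    worker.join()

asyncio.run(main())
```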