caption-flow 0.3.2__tar.gz → 0.3.3__tar.gz
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- {caption_flow-0.3.2/src/caption_flow.egg-info → caption_flow-0.3.3}/PKG-INFO +1 -1
- {caption_flow-0.3.2 → caption_flow-0.3.3}/pyproject.toml +1 -1
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/__init__.py +1 -1
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/orchestrator.py +2 -1
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/processors/webdataset.py +117 -55
- {caption_flow-0.3.2 → caption_flow-0.3.3/src/caption_flow.egg-info}/PKG-INFO +1 -1
- {caption_flow-0.3.2 → caption_flow-0.3.3}/LICENSE +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/README.md +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/setup.cfg +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/cli.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/models.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/monitor.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/processors/__init__.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/processors/base.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/processors/huggingface.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/processors/local_filesystem.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/storage/__init__.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/storage/exporter.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/storage/manager.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/utils/__init__.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/utils/auth.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/utils/caption_utils.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/utils/certificates.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/utils/checkpoint_tracker.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/utils/chunk_tracker.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/utils/image_processor.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/utils/json_utils.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/utils/prompt_template.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/utils/vllm_config.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/viewer.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/workers/base.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/workers/caption.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow/workers/data.py +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow.egg-info/SOURCES.txt +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow.egg-info/dependency_links.txt +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow.egg-info/entry_points.txt +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow.egg-info/requires.txt +0 -0
- {caption_flow-0.3.2 → caption_flow-0.3.3}/src/caption_flow.egg-info/top_level.txt +0 -0
@@ -124,13 +124,14 @@ class Orchestrator:
|
|
124
124
|
|
125
125
|
# Initialize storage
|
126
126
|
await self.storage.initialize()
|
127
|
-
await self.update_unprocessed_ranges()
|
128
127
|
|
129
128
|
# Start background tasks
|
130
129
|
asyncio.create_task(self._heartbeat_loop())
|
131
130
|
asyncio.create_task(self._checkpoint_loop())
|
132
131
|
asyncio.create_task(self._stats_update_loop())
|
133
132
|
|
133
|
+
await self.update_unprocessed_ranges()
|
134
|
+
|
134
135
|
# Start WebSocket server
|
135
136
|
websocket_logger = logging.getLogger("websockets")
|
136
137
|
websocket_logger.setLevel(logging.WARNING)
|
@@ -110,58 +110,86 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
|
|
110
110
|
return self.shard_info_cache[shard_idx]
|
111
111
|
|
112
112
|
def _restore_state(self, storage: StorageManager) -> None:
|
113
|
-
"""Restore state from chunk tracker."""
|
114
|
-
logger.
|
113
|
+
"""Restore state from chunk tracker and synchronize with storage."""
|
114
|
+
logger.info("Restoring state from chunk tracker and synchronizing with storage")
|
115
115
|
if not self.chunk_tracker:
|
116
116
|
return
|
117
117
|
|
118
|
+
# First, update chunk tracker from storage
|
119
|
+
processed_job_ids = storage.get_all_processed_job_ids()
|
120
|
+
if processed_job_ids:
|
121
|
+
logger.info(
|
122
|
+
f"Synchronizing chunk tracker with {len(processed_job_ids)} processed items from storage"
|
123
|
+
)
|
124
|
+
self.update_from_storage(processed_job_ids)
|
125
|
+
|
126
|
+
# Then restore work units from chunk tracker
|
118
127
|
shards_summary = self.chunk_tracker.get_shards_summary()
|
119
|
-
logger.
|
128
|
+
logger.info(f"Restoring work units from chunk tracker: {len(shards_summary)} shards")
|
120
129
|
|
121
130
|
with self.lock:
|
131
|
+
restored_count = 0
|
122
132
|
for shard_name, shard_info in shards_summary.items():
|
123
133
|
chunks = shard_info.get("chunks", [])
|
124
|
-
logger.debug(f"Existing job ids: {storage.get_all_processed_job_ids()}")
|
125
134
|
for chunk_state in chunks:
|
126
135
|
# Only add incomplete chunks
|
127
|
-
if chunk_state.status
|
128
|
-
logger.debug(f"
|
136
|
+
if chunk_state.status == "completed":
|
137
|
+
logger.debug(f"Skipping completed chunk {chunk_state.chunk_id}")
|
138
|
+
continue
|
129
139
|
|
130
|
-
|
131
|
-
|
140
|
+
# Get unprocessed ranges
|
141
|
+
unprocessed_ranges = chunk_state.get_unprocessed_ranges()
|
142
|
+
if not unprocessed_ranges:
|
132
143
|
logger.debug(
|
133
|
-
f"Chunk {chunk_state.chunk_id} unprocessed ranges
|
144
|
+
f"Chunk {chunk_state.chunk_id} has no unprocessed ranges, marking as completed"
|
134
145
|
)
|
135
|
-
|
136
|
-
|
146
|
+
self.chunk_tracker.mark_completed(chunk_state.chunk_id)
|
147
|
+
continue
|
137
148
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
abs_start = chunk_state.start_index + start
|
142
|
-
abs_end = chunk_state.start_index + end
|
143
|
-
absolute_ranges.append((abs_start, abs_end))
|
149
|
+
logger.info(
|
150
|
+
f"Restoring chunk {chunk_state.chunk_id} with unprocessed ranges: {unprocessed_ranges}"
|
151
|
+
)
|
144
152
|
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
153
|
+
# Convert relative ranges to absolute file indices
|
154
|
+
absolute_ranges = []
|
155
|
+
for start, end in unprocessed_ranges:
|
156
|
+
abs_start = chunk_state.start_index + start
|
157
|
+
abs_end = chunk_state.start_index + end
|
158
|
+
absolute_ranges.append((abs_start, abs_end))
|
159
|
+
|
160
|
+
# Get shard index if available
|
161
|
+
shard_idx = None
|
162
|
+
if self.dataset:
|
163
|
+
for idx in range(self.dataset.num_shards):
|
164
|
+
shard_info = self._get_shard_info_cached(idx)
|
165
|
+
if shard_info and shard_info["name"] == shard_name:
|
166
|
+
shard_idx = idx
|
167
|
+
break
|
168
|
+
|
169
|
+
unit = WorkUnit(
|
170
|
+
unit_id=chunk_state.chunk_id,
|
171
|
+
chunk_id=chunk_state.chunk_id,
|
172
|
+
source_id=shard_name,
|
173
|
+
unit_size=chunk_state.chunk_size,
|
174
|
+
data={
|
175
|
+
"shard_url": chunk_state.shard_url,
|
176
|
+
"shard_name": shard_name,
|
177
|
+
"shard_idx": shard_idx,
|
178
|
+
"start_index": chunk_state.start_index,
|
179
|
+
"chunk_size": chunk_state.chunk_size,
|
180
|
+
"unprocessed_ranges": absolute_ranges,
|
181
|
+
},
|
182
|
+
metadata={
|
183
|
+
"shard_name": shard_name,
|
184
|
+
"chunk_index": chunk_state.start_index // self.chunk_size,
|
185
|
+
},
|
186
|
+
)
|
187
|
+
|
188
|
+
self.work_units[unit.unit_id] = unit
|
189
|
+
self.pending_units.append(unit.unit_id)
|
190
|
+
restored_count += 1
|
162
191
|
|
163
|
-
|
164
|
-
self.pending_units.append(unit.unit_id)
|
192
|
+
logger.info(f"Restored {restored_count} incomplete work units")
|
165
193
|
|
166
194
|
def _create_units_background(self) -> None:
|
167
195
|
"""Background thread to create work units on demand."""
|
@@ -407,22 +435,46 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
|
|
407
435
|
# Group by chunk
|
408
436
|
processed_by_chunk = defaultdict(set)
|
409
437
|
|
410
|
-
for
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
continue
|
438
|
+
for job_id_str in processed_job_ids:
|
439
|
+
try:
|
440
|
+
# Use JobId to parse the job ID string
|
441
|
+
job_id = JobId.from_str(job_id_str)
|
442
|
+
chunk_id = job_id.get_chunk_str()
|
443
|
+
sample_idx = int(job_id.sample_id)
|
444
|
+
processed_by_chunk[chunk_id].add(sample_idx)
|
445
|
+
except ValueError as e:
|
446
|
+
logger.warning(f"Invalid job ID format: {job_id_str} - {e}")
|
447
|
+
continue
|
421
448
|
|
422
449
|
# Update chunk tracker with processed items
|
423
450
|
if self.chunk_tracker:
|
424
451
|
for chunk_id, indices in processed_by_chunk.items():
|
425
452
|
if indices:
|
453
|
+
# Get or create chunk state
|
454
|
+
chunk_state = self.chunk_tracker.chunks.get(chunk_id)
|
455
|
+
if not chunk_state:
|
456
|
+
# Parse chunk_id using JobId to get shard info
|
457
|
+
try:
|
458
|
+
# chunk_id format: "shard_id:chunk:chunk_idx"
|
459
|
+
parts = chunk_id.split(":")
|
460
|
+
if len(parts) >= 3:
|
461
|
+
shard_name = parts[0]
|
462
|
+
chunk_idx = int(parts[2])
|
463
|
+
# Infer start index from chunk index and size
|
464
|
+
start_index = chunk_idx * self.chunk_size
|
465
|
+
# Create chunk state
|
466
|
+
self.chunk_tracker.add_chunk(
|
467
|
+
chunk_id,
|
468
|
+
shard_name,
|
469
|
+
f"{shard_name}.tar",
|
470
|
+
start_index,
|
471
|
+
self.chunk_size,
|
472
|
+
)
|
473
|
+
logger.info(f"Created missing chunk state for {chunk_id}")
|
474
|
+
except (ValueError, IndexError) as e:
|
475
|
+
logger.error(f"Failed to create chunk state for {chunk_id}: {e}")
|
476
|
+
continue
|
477
|
+
|
426
478
|
# Sort indices and convert to ranges
|
427
479
|
sorted_indices = sorted(indices)
|
428
480
|
if not sorted_indices:
|
@@ -443,10 +495,13 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
|
|
443
495
|
ranges.append((start_range, end_range))
|
444
496
|
|
445
497
|
# Mark each contiguous range as processed
|
446
|
-
logger.
|
498
|
+
logger.info(f"Marking ranges {ranges} as processed in chunk {chunk_id}")
|
447
499
|
for start_idx, end_idx in ranges:
|
448
500
|
self.chunk_tracker.mark_items_processed(chunk_id, start_idx, end_idx)
|
449
501
|
|
502
|
+
# Save checkpoint after updating
|
503
|
+
self.chunk_tracker.save()
|
504
|
+
|
450
505
|
def get_stats(self) -> Dict[str, Any]:
|
451
506
|
"""Get processor statistics."""
|
452
507
|
with self.lock:
|
@@ -555,7 +610,11 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
|
|
555
610
|
# Generate mock results for unprocessed ranges
|
556
611
|
for start_idx, end_idx in unprocessed_ranges:
|
557
612
|
for idx in range(start_idx, end_idx + 1):
|
558
|
-
|
613
|
+
# Use JobId to create consistent job ID
|
614
|
+
job_id = JobId.from_values(
|
615
|
+
shard_id=shard_name, chunk_id=str(chunk_index), sample_id=str(idx)
|
616
|
+
)
|
617
|
+
job_id_str = job_id.get_sample_str()
|
559
618
|
|
560
619
|
yield {
|
561
620
|
"image": self._create_mock_image(idx),
|
@@ -565,11 +624,11 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
|
|
565
624
|
"metadata": {
|
566
625
|
"_item_index": idx,
|
567
626
|
"_chunk_relative_index": idx - unit.data["start_index"],
|
568
|
-
"_job_id":
|
627
|
+
"_job_id": job_id_str,
|
569
628
|
"_mock": True,
|
570
629
|
"_processed_indices": processed_indices,
|
571
630
|
},
|
572
|
-
"job_id":
|
631
|
+
"job_id": job_id_str,
|
573
632
|
}
|
574
633
|
|
575
634
|
processed_indices.append(idx)
|
@@ -614,8 +673,11 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
|
|
614
673
|
f"Error decoding image {entry.path} with cv2: {img_e}"
|
615
674
|
)
|
616
675
|
|
617
|
-
# Generate job ID
|
618
|
-
job_id =
|
676
|
+
# Generate job ID using JobId class
|
677
|
+
job_id = JobId.from_values(
|
678
|
+
shard_id=shard_name, chunk_id=str(chunk_index), sample_id=str(idx)
|
679
|
+
)
|
680
|
+
job_id_str = job_id.get_sample_str()
|
619
681
|
|
620
682
|
yield {
|
621
683
|
"image": image,
|
@@ -625,12 +687,12 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
|
|
625
687
|
"metadata": {
|
626
688
|
"_item_index": idx,
|
627
689
|
"_chunk_relative_index": idx - unit.data["start_index"],
|
628
|
-
"_job_id":
|
690
|
+
"_job_id": job_id_str,
|
629
691
|
"_filename": entry.path,
|
630
692
|
"_file_size": entry.size,
|
631
693
|
"_processed_indices": processed_indices,
|
632
694
|
},
|
633
|
-
"job_id":
|
695
|
+
"job_id": job_id_str,
|
634
696
|
}
|
635
697
|
|
636
698
|
processed_indices.append(idx)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|