caption-flow 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
caption_flow/__init__.py CHANGED
@@ -1,6 +1,6 @@
1
1
  """CaptionFlow - Distributed community captioning system."""
2
2
 
3
- __version__ = "0.3.2"
3
+ __version__ = "0.3.3"
4
4
 
5
5
  from .orchestrator import Orchestrator
6
6
  from .workers.data import DataWorker
@@ -124,13 +124,14 @@ class Orchestrator:
124
124
 
125
125
  # Initialize storage
126
126
  await self.storage.initialize()
127
- await self.update_unprocessed_ranges()
128
127
 
129
128
  # Start background tasks
130
129
  asyncio.create_task(self._heartbeat_loop())
131
130
  asyncio.create_task(self._checkpoint_loop())
132
131
  asyncio.create_task(self._stats_update_loop())
133
132
 
133
+ await self.update_unprocessed_ranges()
134
+
134
135
  # Start WebSocket server
135
136
  websocket_logger = logging.getLogger("websockets")
136
137
  websocket_logger.setLevel(logging.WARNING)
@@ -110,58 +110,86 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
110
110
  return self.shard_info_cache[shard_idx]
111
111
 
112
112
  def _restore_state(self, storage: StorageManager) -> None:
113
- """Restore state from chunk tracker."""
114
- logger.debug("Restoring state from chunk tracker")
113
+ """Restore state from chunk tracker and synchronize with storage."""
114
+ logger.info("Restoring state from chunk tracker and synchronizing with storage")
115
115
  if not self.chunk_tracker:
116
116
  return
117
117
 
118
+ # First, update chunk tracker from storage
119
+ processed_job_ids = storage.get_all_processed_job_ids()
120
+ if processed_job_ids:
121
+ logger.info(
122
+ f"Synchronizing chunk tracker with {len(processed_job_ids)} processed items from storage"
123
+ )
124
+ self.update_from_storage(processed_job_ids)
125
+
126
+ # Then restore work units from chunk tracker
118
127
  shards_summary = self.chunk_tracker.get_shards_summary()
119
- logger.debug(f"Restoring state: {shards_summary}")
128
+ logger.info(f"Restoring work units from chunk tracker: {len(shards_summary)} shards")
120
129
 
121
130
  with self.lock:
131
+ restored_count = 0
122
132
  for shard_name, shard_info in shards_summary.items():
123
133
  chunks = shard_info.get("chunks", [])
124
- logger.debug(f"Existing job ids: {storage.get_all_processed_job_ids()}")
125
134
  for chunk_state in chunks:
126
135
  # Only add incomplete chunks
127
- if chunk_state.status != "completed":
128
- logger.debug(f"Restoring incomplete chunk {chunk_state}")
136
+ if chunk_state.status == "completed":
137
+ logger.debug(f"Skipping completed chunk {chunk_state.chunk_id}")
138
+ continue
129
139
 
130
- # Get unprocessed ranges
131
- unprocessed_ranges = chunk_state.get_unprocessed_ranges()
140
+ # Get unprocessed ranges
141
+ unprocessed_ranges = chunk_state.get_unprocessed_ranges()
142
+ if not unprocessed_ranges:
132
143
  logger.debug(
133
- f"Chunk {chunk_state.chunk_id} unprocessed ranges: {unprocessed_ranges}"
144
+ f"Chunk {chunk_state.chunk_id} has no unprocessed ranges, marking as completed"
134
145
  )
135
- if not unprocessed_ranges:
136
- continue
146
+ self.chunk_tracker.mark_completed(chunk_state.chunk_id)
147
+ continue
137
148
 
138
- # Convert relative ranges to absolute file indices
139
- absolute_ranges = []
140
- for start, end in unprocessed_ranges:
141
- abs_start = chunk_state.start_index + start
142
- abs_end = chunk_state.start_index + end
143
- absolute_ranges.append((abs_start, abs_end))
149
+ logger.info(
150
+ f"Restoring chunk {chunk_state.chunk_id} with unprocessed ranges: {unprocessed_ranges}"
151
+ )
144
152
 
145
- unit = WorkUnit(
146
- unit_id=chunk_state.chunk_id,
147
- chunk_id=chunk_state.chunk_id,
148
- source_id=shard_name,
149
- unit_size=chunk_state.chunk_size,
150
- data={
151
- "shard_url": chunk_state.shard_url,
152
- "shard_name": shard_name,
153
- "start_index": chunk_state.start_index,
154
- "chunk_size": chunk_state.chunk_size,
155
- "unprocessed_ranges": absolute_ranges,
156
- },
157
- metadata={
158
- "shard_name": shard_name,
159
- "chunk_index": chunk_state.start_index // self.chunk_size,
160
- },
161
- )
153
+ # Convert relative ranges to absolute file indices
154
+ absolute_ranges = []
155
+ for start, end in unprocessed_ranges:
156
+ abs_start = chunk_state.start_index + start
157
+ abs_end = chunk_state.start_index + end
158
+ absolute_ranges.append((abs_start, abs_end))
159
+
160
+ # Get shard index if available
161
+ shard_idx = None
162
+ if self.dataset:
163
+ for idx in range(self.dataset.num_shards):
164
+ shard_info = self._get_shard_info_cached(idx)
165
+ if shard_info and shard_info["name"] == shard_name:
166
+ shard_idx = idx
167
+ break
168
+
169
+ unit = WorkUnit(
170
+ unit_id=chunk_state.chunk_id,
171
+ chunk_id=chunk_state.chunk_id,
172
+ source_id=shard_name,
173
+ unit_size=chunk_state.chunk_size,
174
+ data={
175
+ "shard_url": chunk_state.shard_url,
176
+ "shard_name": shard_name,
177
+ "shard_idx": shard_idx,
178
+ "start_index": chunk_state.start_index,
179
+ "chunk_size": chunk_state.chunk_size,
180
+ "unprocessed_ranges": absolute_ranges,
181
+ },
182
+ metadata={
183
+ "shard_name": shard_name,
184
+ "chunk_index": chunk_state.start_index // self.chunk_size,
185
+ },
186
+ )
187
+
188
+ self.work_units[unit.unit_id] = unit
189
+ self.pending_units.append(unit.unit_id)
190
+ restored_count += 1
162
191
 
163
- self.work_units[unit.unit_id] = unit
164
- self.pending_units.append(unit.unit_id)
192
+ logger.info(f"Restored {restored_count} incomplete work units")
165
193
 
166
194
  def _create_units_background(self) -> None:
167
195
  """Background thread to create work units on demand."""
@@ -407,22 +435,46 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
407
435
  # Group by chunk
408
436
  processed_by_chunk = defaultdict(set)
409
437
 
410
- for job_id in processed_job_ids:
411
- # Parse job_id to extract chunk and index
412
- # Expected format: "shard:chunk:X:idx:Y"
413
- parts = job_id.split(":")
414
- if len(parts) >= 5 and parts[3] == "idx":
415
- chunk_id = ":".join(parts[:3]) # "shard:chunk:X"
416
- try:
417
- idx = int(parts[4])
418
- processed_by_chunk[chunk_id].add(idx)
419
- except ValueError:
420
- continue
438
+ for job_id_str in processed_job_ids:
439
+ try:
440
+ # Use JobId to parse the job ID string
441
+ job_id = JobId.from_str(job_id_str)
442
+ chunk_id = job_id.get_chunk_str()
443
+ sample_idx = int(job_id.sample_id)
444
+ processed_by_chunk[chunk_id].add(sample_idx)
445
+ except ValueError as e:
446
+ logger.warning(f"Invalid job ID format: {job_id_str} - {e}")
447
+ continue
421
448
 
422
449
  # Update chunk tracker with processed items
423
450
  if self.chunk_tracker:
424
451
  for chunk_id, indices in processed_by_chunk.items():
425
452
  if indices:
453
+ # Get or create chunk state
454
+ chunk_state = self.chunk_tracker.chunks.get(chunk_id)
455
+ if not chunk_state:
456
+ # Parse chunk_id using JobId to get shard info
457
+ try:
458
+ # chunk_id format: "shard_id:chunk:chunk_idx"
459
+ parts = chunk_id.split(":")
460
+ if len(parts) >= 3:
461
+ shard_name = parts[0]
462
+ chunk_idx = int(parts[2])
463
+ # Infer start index from chunk index and size
464
+ start_index = chunk_idx * self.chunk_size
465
+ # Create chunk state
466
+ self.chunk_tracker.add_chunk(
467
+ chunk_id,
468
+ shard_name,
469
+ f"{shard_name}.tar",
470
+ start_index,
471
+ self.chunk_size,
472
+ )
473
+ logger.info(f"Created missing chunk state for {chunk_id}")
474
+ except (ValueError, IndexError) as e:
475
+ logger.error(f"Failed to create chunk state for {chunk_id}: {e}")
476
+ continue
477
+
426
478
  # Sort indices and convert to ranges
427
479
  sorted_indices = sorted(indices)
428
480
  if not sorted_indices:
@@ -443,10 +495,13 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
443
495
  ranges.append((start_range, end_range))
444
496
 
445
497
  # Mark each contiguous range as processed
446
- logger.debug(f"Marking ranges {ranges} as processed in chunk {chunk_id}")
498
+ logger.info(f"Marking ranges {ranges} as processed in chunk {chunk_id}")
447
499
  for start_idx, end_idx in ranges:
448
500
  self.chunk_tracker.mark_items_processed(chunk_id, start_idx, end_idx)
449
501
 
502
+ # Save checkpoint after updating
503
+ self.chunk_tracker.save()
504
+
450
505
  def get_stats(self) -> Dict[str, Any]:
451
506
  """Get processor statistics."""
452
507
  with self.lock:
@@ -555,7 +610,11 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
555
610
  # Generate mock results for unprocessed ranges
556
611
  for start_idx, end_idx in unprocessed_ranges:
557
612
  for idx in range(start_idx, end_idx + 1):
558
- job_id = f"{shard_name}:chunk:{chunk_index}:idx:{idx}"
613
+ # Use JobId to create consistent job ID
614
+ job_id = JobId.from_values(
615
+ shard_id=shard_name, chunk_id=str(chunk_index), sample_id=str(idx)
616
+ )
617
+ job_id_str = job_id.get_sample_str()
559
618
 
560
619
  yield {
561
620
  "image": self._create_mock_image(idx),
@@ -565,11 +624,11 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
565
624
  "metadata": {
566
625
  "_item_index": idx,
567
626
  "_chunk_relative_index": idx - unit.data["start_index"],
568
- "_job_id": job_id,
627
+ "_job_id": job_id_str,
569
628
  "_mock": True,
570
629
  "_processed_indices": processed_indices,
571
630
  },
572
- "job_id": job_id,
631
+ "job_id": job_id_str,
573
632
  }
574
633
 
575
634
  processed_indices.append(idx)
@@ -614,8 +673,11 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
614
673
  f"Error decoding image {entry.path} with cv2: {img_e}"
615
674
  )
616
675
 
617
- # Generate job ID compatible with chunk tracker
618
- job_id = f"{shard_name}:chunk:{chunk_index}:idx:{idx}"
676
+ # Generate job ID using JobId class
677
+ job_id = JobId.from_values(
678
+ shard_id=shard_name, chunk_id=str(chunk_index), sample_id=str(idx)
679
+ )
680
+ job_id_str = job_id.get_sample_str()
619
681
 
620
682
  yield {
621
683
  "image": image,
@@ -625,12 +687,12 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
625
687
  "metadata": {
626
688
  "_item_index": idx,
627
689
  "_chunk_relative_index": idx - unit.data["start_index"],
628
- "_job_id": job_id,
690
+ "_job_id": job_id_str,
629
691
  "_filename": entry.path,
630
692
  "_file_size": entry.size,
631
693
  "_processed_indices": processed_indices,
632
694
  },
633
- "job_id": job_id,
695
+ "job_id": job_id_str,
634
696
  }
635
697
 
636
698
  processed_indices.append(idx)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: caption-flow
3
- Version: 0.3.2
3
+ Version: 0.3.3
4
4
  Summary: Self-contained distributed community captioning system
5
5
  Author-email: bghira <bghira@users.github.com>
6
6
  License: MIT
@@ -1,14 +1,14 @@
1
- caption_flow/__init__.py,sha256=09Vyr0RqKrKe1caUhXq9beficJkmclryjT6BNiASUxQ,303
1
+ caption_flow/__init__.py,sha256=hNewpvkdcuW2JWSuF1u0gfovBCTRPwbIDlqlvLTWYGI,303
2
2
  caption_flow/cli.py,sha256=t_cYCxJE7f5UtB3br2Es51JjO5KPsWM1JTdDXAxM_Lw,41371
3
3
  caption_flow/models.py,sha256=2n6iphTEL62xK2FFcJM6axMsaE8KwsUv5Ak_cCF-TdQ,5652
4
4
  caption_flow/monitor.py,sha256=bAt9EJqfPgT_KdbknGdCxwBRH002pRDgyUmYIj6Dyso,7885
5
- caption_flow/orchestrator.py,sha256=34gZvaW14YZ7a7LagYOO3VKKwlbuS4aw0yoP1L8gwf0,36192
5
+ caption_flow/orchestrator.py,sha256=de3AuO-0zd8w-ESfjPK9U1e8lWr6ucgE3VMX0AZSM7Q,36193
6
6
  caption_flow/viewer.py,sha256=HxO98eHR1xtivG0dEdYC2U9T_RgeRfJqqTK-37u9bNM,20471
7
7
  caption_flow/processors/__init__.py,sha256=hvq-OuAJWQe6hFglKe7QmkS8473k20FmxZDSxfXpCrg,423
8
8
  caption_flow/processors/base.py,sha256=IAEr0pqHRuSkXunvDWk1vf2IKeYQ-2YERqej9iSQm94,6931
9
9
  caption_flow/processors/huggingface.py,sha256=w0j7PRosXYyJXZ0A0Y-J6_n-aHCGVW8tbt8lcvguO_Y,41237
10
10
  caption_flow/processors/local_filesystem.py,sha256=OuNNDemy0sdtpBBC_5GbI-c1vMqp8OIz983Cq85gdb8,27964
11
- caption_flow/processors/webdataset.py,sha256=TkC6xZO6m2FcwiBQGJsSQcrshBKcLdr4edFVtnBOd3U,28999
11
+ caption_flow/processors/webdataset.py,sha256=Em-GssF27oSctG15TANwEeHIzmyNl4sTSdtX02010Lo,32144
12
12
  caption_flow/storage/__init__.py,sha256=IVnzcSCPpPuyp-QLlgJirRZ9Sb3tR0F4sfuF5u2cNMk,36
13
13
  caption_flow/storage/exporter.py,sha256=mFJqMDQ61cP-qcXe118_-oL1TUqULdQZ8LdjSTym44I,19697
14
14
  caption_flow/storage/manager.py,sha256=KPExcKPuFVQSsBnfCBdne5PO4PwN4NTfd-EJQk13OY0,47459
@@ -25,9 +25,9 @@ caption_flow/utils/vllm_config.py,sha256=TC7Rmjk0zRKbBXbWUXrFL4Z58hzax_-4L0pXZn0
25
25
  caption_flow/workers/base.py,sha256=2AGWERC5hbmO-0V_A1MUbgRVvRNN3blqGPyDokvvzmM,7575
26
26
  caption_flow/workers/caption.py,sha256=X4BEmb6C1c73hvgJDMsHtgCUlCuECtnloWSVolVpa4s,39353
27
27
  caption_flow/workers/data.py,sha256=0Tg8NE0wdONeMlivYQ4nvbcfWdLuU51O7vR8_YSnJgo,14813
28
- caption_flow-0.3.2.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
29
- caption_flow-0.3.2.dist-info/METADATA,sha256=8bHECzNi4R6_FlbHWSHMx9TDo4uTVKWWgVbqAe5cCIs,9708
30
- caption_flow-0.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
31
- caption_flow-0.3.2.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
32
- caption_flow-0.3.2.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
33
- caption_flow-0.3.2.dist-info/RECORD,,
28
+ caption_flow-0.3.3.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
29
+ caption_flow-0.3.3.dist-info/METADATA,sha256=GBf1DAFTM6a_o-6-CaIcm3k5t_gFwzDmXc4lFaOAqY8,9708
30
+ caption_flow-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
31
+ caption_flow-0.3.3.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
32
+ caption_flow-0.3.3.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
33
+ caption_flow-0.3.3.dist-info/RECORD,,