caption-flow 0.3.1.tar.gz → 0.3.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {caption_flow-0.3.1/src/caption_flow.egg-info → caption_flow-0.3.2}/PKG-INFO +1 -1
  2. {caption_flow-0.3.1 → caption_flow-0.3.2}/pyproject.toml +1 -1
  3. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/__init__.py +1 -1
  4. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/processors/base.py +3 -0
  5. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/processors/huggingface.py +1 -0
  6. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/processors/local_filesystem.py +2 -0
  7. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/processors/webdataset.py +62 -7
  8. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/chunk_tracker.py +1 -0
  9. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/workers/caption.py +29 -11
  10. {caption_flow-0.3.1 → caption_flow-0.3.2/src/caption_flow.egg-info}/PKG-INFO +1 -1
  11. {caption_flow-0.3.1 → caption_flow-0.3.2}/LICENSE +0 -0
  12. {caption_flow-0.3.1 → caption_flow-0.3.2}/README.md +0 -0
  13. {caption_flow-0.3.1 → caption_flow-0.3.2}/setup.cfg +0 -0
  14. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/cli.py +0 -0
  15. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/models.py +0 -0
  16. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/monitor.py +0 -0
  17. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/orchestrator.py +0 -0
  18. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/processors/__init__.py +0 -0
  19. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/storage/__init__.py +0 -0
  20. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/storage/exporter.py +0 -0
  21. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/storage/manager.py +0 -0
  22. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/__init__.py +0 -0
  23. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/auth.py +0 -0
  24. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/caption_utils.py +0 -0
  25. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/certificates.py +0 -0
  26. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/checkpoint_tracker.py +0 -0
  27. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/image_processor.py +0 -0
  28. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/json_utils.py +0 -0
  29. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/prompt_template.py +0 -0
  30. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/utils/vllm_config.py +0 -0
  31. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/viewer.py +0 -0
  32. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/workers/base.py +0 -0
  33. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow/workers/data.py +0 -0
  34. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow.egg-info/SOURCES.txt +0 -0
  35. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow.egg-info/dependency_links.txt +0 -0
  36. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow.egg-info/entry_points.txt +0 -0
  37. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow.egg-info/requires.txt +0 -0
  38. {caption_flow-0.3.1 → caption_flow-0.3.2}/src/caption_flow.egg-info/top_level.txt +0 -0
caption_flow-0.3.2/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: caption-flow
- Version: 0.3.1
+ Version: 0.3.2
  Summary: Self-contained distributed community captioning system
  Author-email: bghira <bghira@users.github.com>
  License: MIT
caption_flow-0.3.2/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "caption-flow"
- version = "0.3.1"
+ version = "0.3.2"
  description = "Self-contained distributed community captioning system"
  readme = "README.md"
  requires-python = ">=3.10,<3.13"
caption_flow-0.3.2/src/caption_flow/__init__.py
@@ -1,6 +1,6 @@
  """CaptionFlow - Distributed community captioning system."""

- __version__ = "0.3.1"
+ __version__ = "0.3.2"

  from .orchestrator import Orchestrator
  from .workers.data import DataWorker
caption_flow-0.3.2/src/caption_flow/processors/base.py
@@ -14,6 +14,7 @@ class WorkUnit:
  unit_id: str # usually, but not always, the chunk id
  chunk_id: str # always the chunk id
  source_id: str # the shard name
+ unit_size: int # how many elements are in the workunit
  data: Dict[str, Any]
  metadata: Dict[str, Any] = field(default_factory=dict)
  priority: int = 0
@@ -44,6 +45,7 @@ class WorkAssignment:
  "unit_id": u.unit_id,
  "source_id": u.source_id,
  "chunk_id": u.chunk_id,
+ "unit_size": u.unit_size,
  "data": u.data,
  "metadata": u.metadata,
  "priority": u.priority,
@@ -62,6 +64,7 @@ class WorkAssignment:
  unit_id=u["unit_id"],
  chunk_id=u["chunk_id"],
  source_id=u["source_id"],
+ unit_size=u["unit_size"],
  data=u["data"],
  metadata=u.get("metadata", {}),
  priority=u.get("priority", 0),
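Taken together, the base.py hunks add a required unit_size field to WorkUnit and thread it through WorkAssignment serialization in both directions. A minimal sketch of the resulting dataclass shape, limited to the fields visible in this diff:

    from dataclasses import dataclass, field
    from typing import Any, Dict

    @dataclass
    class WorkUnit:
        unit_id: str      # usually, but not always, the chunk id
        chunk_id: str     # always the chunk id
        source_id: str    # the shard name
        unit_size: int    # how many elements are in the work unit (new in 0.3.2)
        data: Dict[str, Any]
        metadata: Dict[str, Any] = field(default_factory=dict)
        priority: int = 0

Because unit_size has no default, code that constructs WorkUnit directly must now pass it, which is what the processor hunks below do.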
caption_flow-0.3.2/src/caption_flow/processors/huggingface.py
@@ -425,6 +425,7 @@ class HuggingFaceDatasetOrchestratorProcessor(OrchestratorProcessor):
  unit_id=unit_id,
  chunk_id=unit_id,
  source_id=shard_name,
+ unit_size=chunk_size,
  data={
  "dataset_name": self.dataset_name,
  "config": self.config,
caption_flow-0.3.2/src/caption_flow/processors/local_filesystem.py
@@ -251,6 +251,7 @@ class LocalFilesystemOrchestratorProcessor(OrchestratorProcessor):
  unit_id=chunk_id,
  chunk_id=chunk_id,
  source_id="local",
+ unit_size=chunk_state.chunk_size,
  data={
  "start_index": chunk_state.start_index,
  "chunk_size": chunk_state.chunk_size,
@@ -319,6 +320,7 @@ class LocalFilesystemOrchestratorProcessor(OrchestratorProcessor):
  unit_id=unit_id,
  chunk_id=unit_id,
  source_id="local",
+ unit_size=chunk_size,
  data={
  "start_index": self.current_index,
  "chunk_size": chunk_size,
caption_flow-0.3.2/src/caption_flow/processors/webdataset.py
@@ -12,6 +12,7 @@ from datetime import datetime
  from PIL import Image
  import io

+ from caption_flow.models import JobId
  from caption_flow.storage import StorageManager
  from .base import OrchestratorProcessor, WorkerProcessor, ProcessorConfig, WorkUnit, WorkResult
  from ..utils import ChunkTracker
@@ -21,6 +22,7 @@ import cv2
  import numpy as np

  logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)


  class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
@@ -114,17 +116,22 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
  return

  shards_summary = self.chunk_tracker.get_shards_summary()
+ logger.debug(f"Restoring state: {shards_summary}")

  with self.lock:
  for shard_name, shard_info in shards_summary.items():
  chunks = shard_info.get("chunks", [])
+ logger.debug(f"Existing job ids: {storage.get_all_processed_job_ids()}")
  for chunk_state in chunks:
  # Only add incomplete chunks
  if chunk_state.status != "completed":
- logger.debug(f"Restoring incomplete chunk {chunk_state.chunk_id}")
+ logger.debug(f"Restoring incomplete chunk {chunk_state}")

  # Get unprocessed ranges
  unprocessed_ranges = chunk_state.get_unprocessed_ranges()
+ logger.debug(
+ f"Chunk {chunk_state.chunk_id} unprocessed ranges: {unprocessed_ranges}"
+ )
  if not unprocessed_ranges:
  continue

@@ -139,6 +146,7 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
  unit_id=chunk_state.chunk_id,
  chunk_id=chunk_state.chunk_id,
  source_id=shard_name,
+ unit_size=chunk_state.chunk_size,
  data={
  "shard_url": chunk_state.shard_url,
  "shard_name": shard_name,
@@ -201,7 +209,13 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):

  # Create chunk for current position
  chunk_size = min(self.chunk_size, shard_files - current_file_idx)
- chunk_id = f"{shard_name}:chunk:{current_file_idx // self.chunk_size}"
+ self.current_chunk_index = current_file_idx // self.chunk_size
+ job_id_obj = JobId(
+ shard_id=shard_name,
+ chunk_id=self.current_chunk_index,
+ sample_id=current_file_idx,
+ )
+ chunk_id = job_id_obj.get_chunk_str()

  with self.lock:
  # Skip if already exists
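The JobId model itself is not part of this diff. Going only by the keyword arguments and the get_chunk_str() call above, a hypothetical sketch of the helper (field types and the exact string format are assumptions; the removed inline format string suggests the shape):

    from dataclasses import dataclass

    @dataclass
    class JobId:
        # Hypothetical reconstruction; the real class lives in caption_flow.models
        # and only its field names and get_chunk_str() are visible in this diff.
        shard_id: str
        chunk_id: int
        sample_id: int

        def get_chunk_str(self) -> str:
            # Assumed to reproduce the old f"{shard_name}:chunk:{index}" chunk ids.
            return f"{self.shard_id}:chunk:{self.chunk_id}"

    print(JobId("data-00001", 3, 3000).get_chunk_str())  # data-00001:chunk:3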
@@ -224,6 +238,7 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
  unit_id=chunk_id,
  chunk_id=chunk_id,
  source_id=shard_name,
+ unit_size=chunk_size,
  data={
  "shard_url": shard_url,
  "shard_name": shard_name,
@@ -268,6 +283,25 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
  unit = self.work_units.get(unit_id)

  if unit:
+ # Update unprocessed ranges from chunk tracker before assigning
+ if self.chunk_tracker and unit_id in self.chunk_tracker.chunks:
+ chunk_state = self.chunk_tracker.chunks[unit_id]
+ relative_unprocessed = chunk_state.get_unprocessed_ranges()
+
+ # Convert relative to absolute indices
+ absolute_ranges = []
+ for start, end in relative_unprocessed:
+ abs_start = chunk_state.start_index + start
+ abs_end = chunk_state.start_index + end
+ absolute_ranges.append((abs_start, abs_end))
+
+ # Update the work unit's unprocessed ranges
+ unit.data["unprocessed_ranges"] = absolute_ranges
+
+ logger.debug(
+ f"Updated unit {unit_id} with unprocessed ranges: {absolute_ranges}"
+ )
+
  self.assigned_units[worker_id].add(unit_id)
  assigned.append(unit)

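The chunk tracker stores unprocessed ranges relative to the chunk start, while assigned work units carry absolute sample indices; a standalone sketch of the conversion above, with made-up numbers:

    # Illustrative values only: a chunk whose first sample sits at absolute
    # index 1000 and which still has two unprocessed relative ranges.
    start_index = 1000
    relative_unprocessed = [(0, 4), (10, 12)]

    absolute_ranges = [(start_index + start, start_index + end)
                       for start, end in relative_unprocessed]
    print(absolute_ranges)  # [(1000, 1004), (1010, 1012)]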
@@ -391,8 +425,27 @@ class WebDatasetOrchestratorProcessor(OrchestratorProcessor):
  if indices:
  # Sort indices and convert to ranges
  sorted_indices = sorted(indices)
- for idx in sorted_indices:
- self.chunk_tracker.mark_items_processed(chunk_id, idx, idx)
+ if not sorted_indices:
+ continue
+
+ # Condense into contiguous ranges
+ ranges = []
+ start_range = sorted_indices[0]
+ end_range = sorted_indices[0]
+
+ for i in range(1, len(sorted_indices)):
+ if sorted_indices[i] == end_range + 1:
+ end_range = sorted_indices[i]
+ else:
+ ranges.append((start_range, end_range))
+ start_range = sorted_indices[i]
+ end_range = sorted_indices[i]
+ ranges.append((start_range, end_range))
+
+ # Mark each contiguous range as processed
+ logger.debug(f"Marking ranges {ranges} as processed in chunk {chunk_id}")
+ for start_idx, end_idx in ranges:
+ self.chunk_tracker.mark_items_processed(chunk_id, start_idx, end_idx)

  def get_stats(self) -> Dict[str, Any]:
  """Get processor statistics."""
@@ -488,7 +541,7 @@ class WebDatasetWorkerProcessor(WorkerProcessor):

  def process_unit(self, unit: WorkUnit, context: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
  """Process a work unit by iterating specified ranges."""
- logger.debug(f"Processing unit: {unit.unit_id}")
+ logger.debug(f"Processing unit: {unit}")

  shard_name = unit.data["shard_name"]
  shard_idx = unit.data.get("shard_idx")
@@ -514,6 +567,7 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
  "_chunk_relative_index": idx - unit.data["start_index"],
  "_job_id": job_id,
  "_mock": True,
+ "_processed_indices": processed_indices,
  },
  "job_id": job_id,
  }
@@ -574,6 +628,7 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
  "_job_id": job_id,
  "_filename": entry.path,
  "_file_size": entry.size,
+ "_processed_indices": processed_indices,
  },
  "job_id": job_id,
  }
@@ -605,8 +660,8 @@ class WebDatasetWorkerProcessor(WorkerProcessor):
  result = super().prepare_result(unit, outputs, processing_time_ms)

  # Add processed indices for chunk tracker
- if outputs and "_processed_indices" in outputs[0].get("metadata", {}):
- result.metadata["item_indices"] = outputs[0]["metadata"]["_processed_indices"]
+ if hasattr(self, "_last_context") and "_processed_indices" in self._last_context:
+ result.metadata["item_indices"] = self._last_context["_processed_indices"]

  return result

caption_flow-0.3.2/src/caption_flow/utils/chunk_tracker.py
@@ -10,6 +10,7 @@ from dataclasses import dataclass, asdict, field
  from .checkpoint_tracker import CheckpointTracker

  logger = logging.getLogger(__name__)
+ logger.setLevel(logging.DEBUG)


  @dataclass
caption_flow-0.3.2/src/caption_flow/workers/caption.py
@@ -565,7 +565,8 @@ class CaptionWorker(BaseWorker):
  batch = []
  batch_size = self.vllm_config.get("batch_size", 8)
  context = {}
-
+ self.items_processed = 0
+ self.items_failed = 0
  # Collect items for batching
  for item_data in self.processor.process_unit(unit, context):
  if self.should_stop_processing.is_set() or not self.connected.is_set():
@@ -604,16 +605,33 @@ class CaptionWorker(BaseWorker):
  self._process_batch(batch)

  # Notify orchestrator that unit is complete
- if self.connected.is_set() and self.websocket:
- try:
- asyncio.run_coroutine_threadsafe(
- self.websocket.send(
- json.dumps({"type": "work_complete", "unit_id": unit.unit_id})
- ),
- self.main_loop,
- ).result(timeout=5)
- except Exception as e:
- logger.warning(f"Could not notify work complete: {e}")
+ # Check if the number of processed items matches the expected count for the unit.
+ # The context dictionary holds the count of items yielded by the processor.
+ total_items_in_unit = unit.unit_size
+
+ if (
+ not self.should_stop_processing.is_set()
+ and self.connected.is_set()
+ and self.items_failed == 0
+ and self.items_processed >= total_items_in_unit
+ ):
+ if self.websocket:
+ try:
+ asyncio.run_coroutine_threadsafe(
+ self.websocket.send(
+ json.dumps({"type": "work_complete", "unit_id": unit.unit_id})
+ ),
+ self.main_loop,
+ ).result(timeout=5)
+ logger.info(
+ f"Unit {unit.unit_id} fully processed ({self.items_processed}/{total_items_in_unit}) and marked complete."
+ )
+ except Exception as e:
+ logger.warning(f"Could not notify work complete for unit {unit.unit_id}: {e}")
+ else:
+ logger.warning(
+ f"Processing of unit {unit.unit_id} was incomplete ({self.items_processed}/{total_items_in_unit}). Not marking as complete."
+ )

  def _process_batch(self, batch: List[ProcessingItem]):
  """Process a batch of items through all stages."""
caption_flow-0.3.2/src/caption_flow.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: caption-flow
- Version: 0.3.1
+ Version: 0.3.2
  Summary: Self-contained distributed community captioning system
  Author-email: bghira <bghira@users.github.com>
  License: MIT