caption-flow 0.2.4__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- caption_flow/__init__.py +1 -1
- caption_flow/orchestrator.py +9 -9
- caption_flow/processors/base.py +3 -0
- caption_flow/processors/huggingface.py +637 -464
- caption_flow/processors/local_filesystem.py +2 -0
- caption_flow/processors/webdataset.py +438 -538
- caption_flow/storage/manager.py +328 -305
- caption_flow/utils/__init__.py +0 -2
- caption_flow/utils/chunk_tracker.py +197 -164
- caption_flow/utils/image_processor.py +19 -132
- caption_flow/workers/caption.py +191 -138
- {caption_flow-0.2.4.dist-info → caption_flow-0.3.2.dist-info}/METADATA +2 -1
- caption_flow-0.3.2.dist-info/RECORD +33 -0
- caption_flow/utils/dataset_loader.py +0 -222
- caption_flow/utils/dataset_metadata_cache.py +0 -67
- caption_flow/utils/job_queue.py +0 -41
- caption_flow/utils/shard_processor.py +0 -119
- caption_flow/utils/shard_tracker.py +0 -83
- caption_flow-0.2.4.dist-info/RECORD +0 -38
- {caption_flow-0.2.4.dist-info → caption_flow-0.3.2.dist-info}/WHEEL +0 -0
- {caption_flow-0.2.4.dist-info → caption_flow-0.3.2.dist-info}/entry_points.txt +0 -0
- {caption_flow-0.2.4.dist-info → caption_flow-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {caption_flow-0.2.4.dist-info → caption_flow-0.3.2.dist-info}/top_level.txt +0 -0
@@ -251,6 +251,7 @@ class LocalFilesystemOrchestratorProcessor(OrchestratorProcessor):
|
|
251
251
|
unit_id=chunk_id,
|
252
252
|
chunk_id=chunk_id,
|
253
253
|
source_id="local",
|
254
|
+
unit_size=chunk_state.chunk_size,
|
254
255
|
data={
|
255
256
|
"start_index": chunk_state.start_index,
|
256
257
|
"chunk_size": chunk_state.chunk_size,
|
@@ -319,6 +320,7 @@ class LocalFilesystemOrchestratorProcessor(OrchestratorProcessor):
|
|
319
320
|
unit_id=unit_id,
|
320
321
|
chunk_id=unit_id,
|
321
322
|
source_id="local",
|
323
|
+
unit_size=chunk_size,
|
322
324
|
data={
|
323
325
|
"start_index": self.current_index,
|
324
326
|
"chunk_size": chunk_size,
|