caption-flow 0.3.4-py3-none-any.whl → 0.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- caption_flow/__init__.py +3 -3
- caption_flow/cli.py +934 -415
- caption_flow/models.py +45 -3
- caption_flow/monitor.py +2 -3
- caption_flow/orchestrator.py +153 -104
- caption_flow/processors/__init__.py +3 -3
- caption_flow/processors/base.py +8 -7
- caption_flow/processors/huggingface.py +439 -67
- caption_flow/processors/local_filesystem.py +24 -28
- caption_flow/processors/webdataset.py +28 -22
- caption_flow/storage/exporter.py +420 -339
- caption_flow/storage/manager.py +636 -756
- caption_flow/utils/__init__.py +1 -1
- caption_flow/utils/auth.py +1 -1
- caption_flow/utils/caption_utils.py +1 -1
- caption_flow/utils/certificates.py +15 -8
- caption_flow/utils/checkpoint_tracker.py +30 -28
- caption_flow/utils/chunk_tracker.py +153 -56
- caption_flow/utils/image_processor.py +9 -9
- caption_flow/utils/json_utils.py +37 -20
- caption_flow/utils/prompt_template.py +24 -16
- caption_flow/utils/vllm_config.py +5 -4
- caption_flow/viewer.py +4 -12
- caption_flow/workers/base.py +5 -4
- caption_flow/workers/caption.py +265 -90
- caption_flow/workers/data.py +6 -8
- {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/METADATA +5 -4
- caption_flow-0.4.0.dist-info/RECORD +33 -0
- caption_flow-0.3.4.dist-info/RECORD +0 -33
- {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/WHEEL +0 -0
- {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/entry_points.txt +0 -0
- {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/top_level.txt +0 -0
caption_flow/workers/caption.py
CHANGED
@@ -7,34 +7,34 @@ os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
 import asyncio
 import json
 import logging
-import websockets
 import time
-from dataclasses import dataclass
-from typing import Dict, Any, Optional, List, Tuple, Union
-from queue import Queue, Empty
-from threading import Thread, Event, Lock
 from collections import defaultdict, deque
+from dataclasses import dataclass
+from queue import Empty, Queue
+from threading import Event, Lock, Thread
+from typing import Any, Dict, List, Optional, Tuple, Union
 
-
+import websockets
 from huggingface_hub import get_token
+from PIL import Image
 
-from
+from ..models import ProcessingStage, StageResult
 from ..processors import (
+    HuggingFaceDatasetWorkerProcessor,
+    LocalFilesystemWorkerProcessor,
     ProcessorConfig,
+    WebDatasetWorkerProcessor,
     WorkAssignment,
-    WorkUnit,
     WorkResult,
-
-    HuggingFaceDatasetWorkerProcessor,
-    LocalFilesystemWorkerProcessor,
+    WorkUnit,
 )
-from ..utils.vllm_config import VLLMConfigManager
 from ..utils.image_processor import ImageProcessor
 from ..utils.prompt_template import PromptTemplateManager
-from ..
+from ..utils.vllm_config import VLLMConfigManager
+from .base import BaseWorker
 
 logger = logging.getLogger(__name__)
-logger.setLevel(
+logger.setLevel(os.environ.get("CAPTIONFLOW_LOG_LEVEL", "INFO").upper())
 
 
 @dataclass
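Beyond the import reordering, 0.4.0 replaces the previously hard-coded logger level with one read from the environment. A minimal sketch of the same pattern, standard library only (the variable name comes straight from the diff):

    import logging
    import os

    logger = logging.getLogger(__name__)
    # setLevel() accepts level names as strings; .upper() normalizes values like
    # "debug" so CAPTIONFLOW_LOG_LEVEL=debug and =DEBUG both work. Unknown names
    # raise ValueError at import time rather than being silently ignored.
    logger.setLevel(os.environ.get("CAPTIONFLOW_LOG_LEVEL", "INFO").upper())

So a worker can be switched to verbose output with just CAPTIONFLOW_LOG_LEVEL=DEBUG in its environment, without touching any config file.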
@@ -72,8 +72,8 @@ class MultiStageVLLMManager:
         logger.info(f"Model {model_name} already loaded, reusing instance")
         return
 
-        from
-        from
+        from transformers import AutoProcessor, AutoTokenizer
+        from vllm import LLM
 
         logger.info(f"Loading model {model_name} for stage {stage.name}")
 
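Keeping the vllm and transformers imports inside the model-loading method is the usual deferred-import pattern: the worker module stays cheap to import on machines that never actually load a model. A hedged sketch of the idea; load_tokenizer is illustrative, not the worker's actual API:

    def load_tokenizer(model_name: str):
        # Heavy dependencies are imported only when a model is actually loaded,
        # so importing caption_flow.workers.caption itself does not require
        # vLLM (or CUDA initialization) just to inspect the module.
        from transformers import AutoTokenizer

        return AutoTokenizer.from_pretrained(model_name)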
@@ -305,7 +305,7 @@ class CaptionWorker(BaseWorker):
             self.processor = LocalFilesystemWorkerProcessor()
         else:
             raise ValueError(f"Unknown processor type: {self.processor_type}")
-
+        self.processor.gpu_id = self.gpu_id
         self.processor.initialize(processor_config)
         self.dataset_path = self.processor.dataset_path
         self.units_per_request = processor_config.config.get("chunks_per_request", 1)
@@ -463,7 +463,7 @@ class CaptionWorker(BaseWorker):
         # Check if stages changed significantly
         stages_changed = len(new_stages) != len(self.stages)
         if not stages_changed:
-            for old, new in zip(self.stages, new_stages):
+            for old, new in zip(self.stages, new_stages, strict=False):
                 if (
                     old.name != new.name
                     or old.model != new.model
|
|
580
580
|
|
581
581
|
try:
|
582
582
|
# Create processing item
|
583
|
+
logger.debug(f"Processing item data: {item_data}")
|
583
584
|
item = ProcessingItem(
|
584
585
|
unit_id=unit.unit_id,
|
585
586
|
chunk_id=unit.chunk_id,
|
@@ -610,34 +611,64 @@ class CaptionWorker(BaseWorker):
         if batch and not self.should_stop_processing.is_set():
             self._process_batch(batch)
 
-        # Notify orchestrator
+        # Notify orchestrator about unit completion or failure
         # Check if the number of processed items matches the expected count for the unit.
         # The context dictionary holds the count of items yielded by the processor.
         total_items_in_unit = unit.unit_size
 
-        if (
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if not self.should_stop_processing.is_set() and self.connected.is_set():
+            if self.items_failed == 0 and self.items_processed >= total_items_in_unit:
+                # Unit completed successfully
+                if self.websocket:
+                    try:
+                        asyncio.run_coroutine_threadsafe(
+                            self.websocket.send(
+                                json.dumps({"type": "work_complete", "unit_id": unit.unit_id})
+                            ),
+                            self.main_loop,
+                        ).result(timeout=5)
+                        logger.info(
+                            f"Unit {unit.unit_id} fully processed "
+                            f"({self.items_processed}/{total_items_in_unit}) and marked complete."
+                        )
+                    except Exception as e:
+                        logger.warning(
+                            f"Could not notify work complete for unit {unit.unit_id}: {e}"
+                        )
+            else:
+                # Unit failed or was incomplete
+                if self.items_failed > 0:
+                    error_msg = (
+                        f"Processing failed for {self.items_failed} out of "
+                        f"{total_items_in_unit} items"
                     )
-
-
+                    logger.error(f"Unit {unit.unit_id} failed: {error_msg}")
+                else:
+                    error_msg = (
+                        f"Processing incomplete: {self.items_processed}/"
+                        f"{total_items_in_unit} items processed"
+                    )
+                    logger.warning(f"Unit {unit.unit_id} incomplete: {error_msg}")
+
+                if self.websocket:
+                    try:
+                        asyncio.run_coroutine_threadsafe(
+                            self.websocket.send(
+                                json.dumps(
+                                    {
+                                        "type": "work_failed",
+                                        "unit_id": unit.unit_id,
+                                        "error": error_msg,
+                                    }
+                                )
+                            ),
+                            self.main_loop,
+                        ).result(timeout=5)
+                        logger.info(f"Unit {unit.unit_id} failure reported to orchestrator")
+                    except Exception as e:
+                        logger.warning(f"Could not notify work failed for unit {unit.unit_id}: {e}")
         else:
-            logger.
-                f"Processing of unit {unit.unit_id} was incomplete ({self.items_processed}/{total_items_in_unit}). Not marking as complete."
-            )
+            logger.info(f"Unit {unit.unit_id} processing stopped due to disconnect or shutdown")
 
     def _process_batch(self, batch: List[ProcessingItem]):
         """Process a batch of items through all stages."""
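The completion and failure notifications above are sent from the worker's processing thread while the websocket lives on an asyncio event loop; asyncio.run_coroutine_threadsafe(...).result(timeout=5) is the standard bridge between the two. A self-contained sketch of the pattern, with an asyncio.Queue standing in for the websocket (names like notify are illustrative):

    import asyncio
    import json
    import threading

    async def notify(queue: asyncio.Queue, message: dict) -> None:
        # Stand-in for self.websocket.send(json.dumps(...)).
        await queue.put(json.dumps(message))

    def worker_thread(loop: asyncio.AbstractEventLoop, queue: asyncio.Queue) -> None:
        # Schedule the coroutine on the loop owned by another thread, then block
        # (briefly) until it completes, mirroring .result(timeout=5) above.
        future = asyncio.run_coroutine_threadsafe(
            notify(queue, {"type": "work_complete", "unit_id": "unit-0"}), loop
        )
        future.result(timeout=5)

    async def main() -> None:
        queue: asyncio.Queue = asyncio.Queue()
        loop = asyncio.get_running_loop()
        t = threading.Thread(target=worker_thread, args=(loop, queue))
        t.start()
        print(await queue.get())  # {"type": "work_complete", "unit_id": "unit-0"}
        t.join()

    asyncio.run(main())

The timeout keeps a dead orchestrator connection from blocking the processing thread forever; the except branches above turn that into a warning instead.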
@@ -672,6 +703,20 @@ class CaptionWorker(BaseWorker):
         except Exception as e:
             logger.error(f"Batch processing error: {e}", exc_info=True)
 
+            # Mark all items in batch as failed
+            self.items_failed += len(batch)
+
+            # Send error results for each item in the batch
+            for item in batch:
+                self.result_queue.put(
+                    {
+                        "item": item,
+                        "outputs": {},
+                        "processing_time_ms": 0.0,
+                        "error": f"Batch processing failed: {str(e)}",
+                    }
+                )
+
     def _process_batch_mock(self, batch: List[ProcessingItem]) -> List[Tuple[ProcessingItem, Dict]]:
         """Process a batch in mock mode - return dummy captions."""
         results = []
@@ -686,9 +731,9 @@ class CaptionWorker(BaseWorker):
 
             # Create mock outputs based on stage prompts
             stage_outputs = []
-            for i,
+            for i, _prompt in enumerate(stage.prompts):
                 mock_output = (
-                    f"Mock {stage_name} output {i+1} for job {item.job_id} - {item.item_key}"
+                    f"Mock {stage_name} output {i + 1} for job {item.job_id} - {item.item_key}"
                 )
                 stage_outputs.append(mock_output)
 
@@ -713,12 +758,129 @@ class CaptionWorker(BaseWorker):
 
         return results
 
+    def _validate_and_split_batch(
+        self,
+        batch: List[ProcessingItem],
+        stage: ProcessingStage,
+        processor,
+        tokenizer,
+        sampling_params,
+        max_length: int = 16384,
+    ) -> Tuple[List[ProcessingItem], List[ProcessingItem]]:
+        """Validate batch items and split into processable and too-long items."""
+        logger.debug(
+            f"Validating batch of size {len(batch)} for stage '{stage.name}' "
+            f"with max_length {max_length}"
+        )
+        processable = []
+        too_long = []
+
+        for item in batch:
+            try:
+                # Create a test prompt for this item
+                converted_img = ImageProcessor.prepare_for_inference(item)
+                template_manager = PromptTemplateManager(
+                    stage.prompts[:1]
+                )  # Test with first prompt
+
+                # Build context
+                context = item.metadata.copy()
+                for prev_stage_name, stage_result in item.stage_results.items():
+                    for i, output in enumerate(stage_result.outputs):
+                        context[f"{prev_stage_name}_output_{i}"] = output
+                    if len(stage_result.outputs) == 1:
+                        context[stage_result.output_field] = stage_result.outputs[0]
+                    else:
+                        context[stage_result.output_field] = stage_result.outputs
+                logger.debug(f"Validation context for {item.item_key}: {context}")
+
+                # Format test prompt
+                formatted_prompts = template_manager.format_all(context)
+                if not formatted_prompts:
+                    logger.warning(
+                        f"Could not format prompt for {item.item_key}, marking as too long."
+                    )
+                    too_long.append(item)
+                    continue
+
+                logger.debug(
+                    f"Formatted validation prompt for {item.item_key}: {formatted_prompts[0]}"
+                )
+
+                # Build actual vLLM input to test
+                test_req = self._build_vllm_input(
+                    converted_img, formatted_prompts[0], processor, tokenizer
+                )
+
+                # Use processor to get actual token count
+                if "prompt_token_ids" in test_req:
+                    prompt_length = len(test_req["prompt_token_ids"])
+                else:
+                    # Fallback to tokenizer
+                    prompt_length = len(tokenizer.encode(test_req.get("prompt", "")))
+
+                # Check individual prompt length (prompts are processed one by one)
+                # Use a small safety buffer to account for token estimation variations
+                safety_buffer = 50
+                if prompt_length < max_length - safety_buffer:
+                    processable.append(item)
+                    logger.debug(
+                        f"Item {item.item_key} validated: {prompt_length} tokens per prompt"
+                    )
+                else:
+                    too_long.append(item)
+                    logger.warning(
+                        f"Item {item.item_key} too long: {prompt_length} tokens "
+                        f"vs max {max_length - safety_buffer} (with safety buffer)"
+                    )
+
+            except Exception as e:
+                logger.error(f"Error validating item {item.item_key}: {e}", exc_info=True)
+                too_long.append(item)
+
+        logger.debug(
+            f"Validation complete: {len(processable)} processable, {len(too_long)} too long."
+        )
+        return processable, too_long
+
+    def _resize_image_for_tokens(
+        self, item: ProcessingItem, target_ratio: float = 0.7
+    ) -> ProcessingItem:
+        """Resize image to reduce token count."""
+        if not item.image:
+            return item
+
+        # Calculate new size
+        new_width = int(item.image.width * target_ratio)
+        new_height = int(item.image.height * target_ratio)
+
+        # Resize image
+        resized_image = item.image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+        # Create new item with resized image
+        new_item = ProcessingItem(
+            unit_id=item.unit_id,
+            job_id=item.job_id,
+            chunk_id=item.chunk_id,
+            item_key=item.item_key,
+            item_index=item.item_index,
+            image=resized_image,
+            image_data=item.image_data,  # Keep original data for metadata
+            metadata={**item.metadata, "_resized": True, "_resize_ratio": target_ratio},
+            stage_results=item.stage_results.copy(),
+        )
+
+        return new_item
+
     def _process_batch_multi_stage(
         self, batch: List[ProcessingItem], max_attempts: int = 3
     ) -> List[Tuple[ProcessingItem, Dict]]:
-        """Process a batch through all stages
+        """Process a batch through all stages with token validation."""
         results = []
 
+        # Get max model length from config
+        max_model_len = self.vllm_config.get("max_model_len", 16384)
+
         # Process each stage in order
         for stage_name in self.stage_order:
             stage = next(s for s in self.stages if s.name == stage_name)
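The two new helpers implement a validate-then-shrink policy: measure the prompt's real token count, and if it would overflow max_model_len, retry with a progressively downscaled image before failing the item. A standalone sketch of that control flow; count_tokens is a hypothetical stand-in for building the full vLLM input and counting prompt_token_ids:

    from PIL import Image

    def count_tokens(image: Image.Image) -> int:
        # Hypothetical estimator: real code builds the vLLM request and counts
        # prompt_token_ids. Many VLMs scale image tokens with pixel area.
        return (image.width * image.height) // 1000

    def fit_image(
        image: Image.Image, max_length: int = 16384, safety_buffer: int = 50
    ) -> Image.Image | None:
        # Same ladder as the worker: full size, then 70%, then 50%.
        for ratio in (1.0, 0.7, 0.5):
            candidate = image if ratio == 1.0 else image.resize(
                (int(image.width * ratio), int(image.height * ratio)),
                Image.Resampling.LANCZOS,
            )
            if count_tokens(candidate) < max_length - safety_buffer:
                return candidate
        return None  # still too long after both resizes: the item is failed

The safety buffer absorbs small differences between the estimated and actual token counts, so a prompt that validates at 16,330 tokens does not get rejected by the engine at 16,390.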
@@ -729,26 +891,68 @@ class CaptionWorker(BaseWorker):
                 stage_name, stage.model
             )
 
-            #
-
+            # Validate batch before processing
+            processable_batch, too_long_items = self._validate_and_split_batch(
+                batch, stage, processor, tokenizer, sampling_params, max_model_len
+            )
+
+            # Handle items that are too long
+            for item in too_long_items:
+                logger.warning(f"Item {item.item_key} exceeds token limit, attempting resize")
 
-
-
-            requests = []
+                # Try resizing the image
+                resized_item = self._resize_image_for_tokens(item, target_ratio=0.7)
 
-
-
+                # Re-validate
+                resized_processable, still_too_long = self._validate_and_split_batch(
+                    [resized_item], stage, processor, tokenizer, sampling_params, max_model_len
+                )
 
-
-
+                if resized_processable:
+                    processable_batch.extend(resized_processable)
+                    logger.info(f"Successfully resized {item.item_key} for processing")
+                else:
+                    # Try even smaller
+                    resized_item = self._resize_image_for_tokens(item, target_ratio=0.5)
+                    resized_processable, still_too_long = self._validate_and_split_batch(
+                        [resized_item], stage, processor, tokenizer, sampling_params, max_model_len
+                    )
 
-
+                    if resized_processable:
+                        processable_batch.extend(resized_processable)
+                        logger.info(f"Successfully resized {item.item_key} to 50% for processing")
+                    else:
+                        logger.error(f"Item {item.item_key} still too long after resize, skipping")
+                        self.items_failed += 1
+
+                        # Send error result
+                        stage_result = StageResult(
+                            stage_name=stage_name,
+                            output_field=stage.output_field,
+                            outputs=[],
+                            error="Image too large even after resizing",
+                        )
+                        item.stage_results[stage_name] = stage_result
+
+                        self.result_queue.put(
+                            {
+                                "item": item,
+                                "outputs": {},
+                                "processing_time_ms": 0.0,
+                                "error": f"Failed stage {stage_name}: token limit exceeded",
+                            }
+                        )
+
+            # Process the validated batch
+            if processable_batch:
+                # Build requests for processable items
+                requests = []
+                for item in processable_batch:
+                    converted_img = ImageProcessor.prepare_for_inference(item)
                     template_manager = PromptTemplateManager(stage.prompts)
 
                     # Build context
                     context = item.metadata.copy()
-
-                    # Add previous stage results
                     for prev_stage_name, stage_result in item.stage_results.items():
                         for i, output in enumerate(stage_result.outputs):
                             context[f"{prev_stage_name}_output_{i}"] = output
@@ -769,14 +973,7 @@ class CaptionWorker(BaseWorker):
                 outputs = llm.generate(requests, sampling_params)
 
                 # Process outputs
-
-                failed_items = []
-
-                for idx, (original_idx, item, attempt_count) in enumerate(current_batch):
-                    if self.should_stop_processing.is_set():
-                        return results
-
-                    # Extract outputs
+                for idx, item in enumerate(processable_batch):
                     base_idx = idx * len(stage.prompts)
                     stage_outputs = []
 
@@ -788,40 +985,18 @@ class CaptionWorker(BaseWorker):
                     stage_outputs.append(cleaned_output)
 
                     if stage_outputs:
-                        # Success
                         stage_result = StageResult(
                             stage_name=stage_name,
                             output_field=stage.output_field,
                             outputs=stage_outputs,
                         )
                         item.stage_results[stage_name] = stage_result
-                        successful_items.append((original_idx, item))
                     else:
-
-
-
-
-
-                        self.items_failed += 1
-                        stage_result = StageResult(
-                            stage_name=stage_name,
-                            output_field=stage.output_field,
-                            outputs=[],
-                            error=f"Failed after {max_attempts} attempts",
-                        )
-                        item.stage_results[stage_name] = stage_result
-                        self.result_queue.put(
-                            {
-                                "item": item,
-                                "outputs": {},
-                                "processing_time_ms": 0.0,
-                                "error": f"Failed stage {stage_name} after {max_attempts} attempts",
-                            }
-                        )
-
-                # Update for next iteration
-                items_to_process = failed_items
-                batch = [item for _, item in successful_items]
+                        logger.error(f"No outputs for {item.item_key} in stage {stage_name}")
+                        self.items_failed += 1
+
+            # Update batch for next stage
+            batch = processable_batch
 
             # Convert to results
             for item in batch:
caption_flow/workers/data.py
CHANGED
@@ -1,20 +1,18 @@
 """DataWorker for retrieving data from various sources and forwarding to orchestrator or storage."""
 
 import asyncio
+import io
 import json
 import logging
-import io
-import time
 from dataclasses import dataclass
 from pathlib import Path
-from
-from
-from
+from queue import Empty, Queue
+from threading import Event
+from typing import Any, AsyncIterator, Dict, Optional
 
+import boto3
 import pandas as pd
 import pyarrow.parquet as pq
-from PIL import Image
-import boto3
 from botocore.config import Config
 
 from .base import BaseWorker
@@ -179,7 +177,7 @@ class DataWorker(BaseWorker):
             try:
                 self.send_queue.put_nowait(batch)
                 batch = []
-            except:
+            except Exception:
                 # Queue full, wait
                 await asyncio.sleep(1)
 
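The except: → except Exception: change is more than style: a bare except also catches KeyboardInterrupt and SystemExit, which derive from BaseException, so a persistently full queue could have made the worker swallow Ctrl-C. A small illustration with a plain queue.Queue (the worker's actual queue type may differ):

    import queue

    q = queue.Queue(maxsize=1)
    q.put("first")

    try:
        q.put_nowait("second")  # raises queue.Full
    except Exception:
        # queue.Full subclasses Exception, so the backoff path still runs,
        # while KeyboardInterrupt/SystemExit now propagate and stop the worker.
        print("queue full, backing off")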
{caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: caption-flow
-Version: 0.3.4
+Version: 0.4.0
 Summary: Self-contained distributed community captioning system
 Author-email: bghira <bghira@users.github.com>
 License: MIT
@@ -9,10 +9,9 @@ Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Requires-Python: <3.13,>=3.10
+Requires-Python: <3.13,>=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: websockets>=12.0
@@ -35,7 +34,9 @@ Requires-Dist: boto3<2.0.0,>=1.40.11
 Requires-Dist: torchdata<0.12.0,>=0.11.0
 Requires-Dist: textual<6.0.0,>=5.3.0
 Requires-Dist: urwid<4.0.0,>=3.0.2
-Requires-Dist: webshart<0.5.0,>=0.4.
+Requires-Dist: webshart<0.5.0,>=0.4.3
+Requires-Dist: pylance<0.36.0,>=0.35.0
+Requires-Dist: duckdb<2.0.0,>=1.3.2
 Provides-Extra: dev
 Requires-Dist: pytest>=7.4.0; extra == "dev"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
caption_flow-0.4.0.dist-info/RECORD
ADDED
@@ -0,0 +1,33 @@
+caption_flow/__init__.py,sha256=IZoOP8s4lN05e6ww9M5HWVfwYOughmS_tDgG-BLajFo,303
+caption_flow/cli.py,sha256=J_rjzhYvVyfoOvKQE4PsMSa_YO58iaKk6yi7kRDUYPU,57688
+caption_flow/models.py,sha256=6-IJj_B3HAarucoLo8_PncJRnxofHuLFCsyRnmUXgRk,7063
+caption_flow/monitor.py,sha256=j5RExadSLOUujVZQMe7btMeKNlq-WbZ9bYqfikgYJ8Q,7972
+caption_flow/orchestrator.py,sha256=MWQKaAclI9rMjn7mWdvoSzl9y4b7bU_24aVr8I1YGhE,39645
+caption_flow/viewer.py,sha256=40w2Zj7GaXbK-dgqvYYdFrMzSDE_ZPWNZc6kS0OrymQ,20281
+caption_flow/processors/__init__.py,sha256=l1udEZLxAmqwFYS4-3GsRVcPT6WxnDOIk0s0UqsZsJM,423
+caption_flow/processors/base.py,sha256=Zx6kRZSqG969x8kYJ5VY2Mo5mLeWEgBCEpo8D4GjsBM,6935
+caption_flow/processors/huggingface.py,sha256=LELbCkvALoKSVf5zGOEL3f3nQG_UcRcPu0ZNZU95B3k,60222
+caption_flow/processors/local_filesystem.py,sha256=auAWxnqplEH4YJ1DWZCaFmAd03iyhNLudgt71N8O7NE,27827
+caption_flow/processors/webdataset.py,sha256=66y_7KaJBBntJqBHYKLzCXkBi9ly-TfYYaTCp_7pqTo,34206
+caption_flow/storage/__init__.py,sha256=IVnzcSCPpPuyp-QLlgJirRZ9Sb3tR0F4sfuF5u2cNMk,36
+caption_flow/storage/exporter.py,sha256=6atbxWgxSu_5qg9l8amwgkXRL1SKTZQb2yryu62yPc8,22371
+caption_flow/storage/manager.py,sha256=2jkyNl-2_B2Z7NfjCBua-Jgo7Km_JmJqMKrYsYj5uF4,41416
+caption_flow/utils/__init__.py,sha256=ULJImkcFPc8QH2Zz6TW7AeVXMFdRpvfni2MgEo_PRyY,120
+caption_flow/utils/auth.py,sha256=6HRNnWfX1j1Jh55M23crfSA1olkFGg-9kZ5Booy5wCM,2253
+caption_flow/utils/caption_utils.py,sha256=7k6GnElIAqyyzDHQd3JC3Ffr7r57sFWqS3ET7itzdoM,5309
+caption_flow/utils/certificates.py,sha256=NiHSeeZYKrf5BpAkwg5qOe-1C7-z42jZO3pjQo0N3I8,4889
+caption_flow/utils/checkpoint_tracker.py,sha256=LoCGjb30QOcMESHLF5hKVCd8X8_gWACyyq9EKLTXIn4,4613
+caption_flow/utils/chunk_tracker.py,sha256=And1krrTvpfiwG7xRxh9n6xy-_W8MSWSkcGmFSDFnB8,25460
+caption_flow/utils/image_processor.py,sha256=_dmiKXcAKxjkQ6d9V5QgoZSf_dDOL52tFMOEXa3iA24,1581
+caption_flow/utils/json_utils.py,sha256=AaGcNTToUcVYCQj2TXs2D_hxc_LeEqFquiK4CquS0U8,5537
+caption_flow/utils/prompt_template.py,sha256=mq7FPnpjp8gVCMMh4NtRf0vL_B9LDMuBkbySvACRSZM,4401
+caption_flow/utils/vllm_config.py,sha256=xFOnmniQGkUGwfTabfW6R0V01TF-_rN1UYJy0HwOvUI,6026
+caption_flow/workers/base.py,sha256=Yh_PBsL3j1kXUuIOQHqIdR69Nepfq11je23i01iWSxw,7714
+caption_flow/workers/caption.py,sha256=KnvRcZ6-Nc2JwastgqpQ8WfCw_AOzWBS-etYXEXJ6Os,47201
+caption_flow/workers/data.py,sha256=iWnTM7UgpJeFzhSTly-gHzFu5sIYUGG-XO4yRNn_MQk,14775
+caption_flow-0.4.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+caption_flow-0.4.0.dist-info/METADATA,sha256=e1sdcAeXR-nYlRZlrDtvwXBuRPb1J-_jzTzIvWevsHs,9732
+caption_flow-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+caption_flow-0.4.0.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
+caption_flow-0.4.0.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
+caption_flow-0.4.0.dist-info/RECORD,,
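Each RECORD row is path,hash,size; per the wheel binary-distribution spec, the hash field is "sha256=" followed by the urlsafe-base64 digest with trailing "=" padding stripped. A short sketch that reproduces a row for any file:

    import base64
    import hashlib
    import os

    def record_entry(path: str) -> str:
        with open(path, "rb") as f:
            digest = hashlib.sha256(f.read()).digest()
        b64 = base64.urlsafe_b64encode(digest).rstrip(b"=").decode()
        return f"{path},sha256={b64},{os.path.getsize(path)}"

    # Run against the wheel's own files, this should reproduce the rows above, e.g.
    # caption_flow/models.py,sha256=6-IJj_B3HAarucoLo8_PncJRnxofHuLFCsyRnmUXgRk,7063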
caption_flow-0.3.4.dist-info/RECORD
DELETED
@@ -1,33 +0,0 @@
-caption_flow/__init__.py,sha256=2M1VLvkVjUmTHXuJFMLnZKqVYni5A0HJfxcnjz53K7c,303
-caption_flow/cli.py,sha256=K3lML3WIYjD7OluGltHGP4N98S5w-KyhDUlQZudDQXE,41464
-caption_flow/models.py,sha256=2n6iphTEL62xK2FFcJM6axMsaE8KwsUv5Ak_cCF-TdQ,5652
-caption_flow/monitor.py,sha256=z2HakZSG799HvTJgjgG7u_MHvhq9-JL1LXzxBwP3WQc,7998
-caption_flow/orchestrator.py,sha256=3XKZXFE1Aw1kCqb_Vw9loYpkmJ5LTLyZZf9pj4k6ldA,37175
-caption_flow/viewer.py,sha256=HxO98eHR1xtivG0dEdYC2U9T_RgeRfJqqTK-37u9bNM,20471
-caption_flow/processors/__init__.py,sha256=hvq-OuAJWQe6hFglKe7QmkS8473k20FmxZDSxfXpCrg,423
-caption_flow/processors/base.py,sha256=IAEr0pqHRuSkXunvDWk1vf2IKeYQ-2YERqej9iSQm94,6931
-caption_flow/processors/huggingface.py,sha256=t_dklhmNRAyk2jISu4FqmNecjg9hfY47omOiRVkbhvA,41215
-caption_flow/processors/local_filesystem.py,sha256=OuNNDemy0sdtpBBC_5GbI-c1vMqp8OIz983Cq85gdb8,27964
-caption_flow/processors/webdataset.py,sha256=tUBCUKunqooHibTWtQ1wljuRI55Wc6M1WrI2hOZgt7g,33858
-caption_flow/storage/__init__.py,sha256=IVnzcSCPpPuyp-QLlgJirRZ9Sb3tR0F4sfuF5u2cNMk,36
-caption_flow/storage/exporter.py,sha256=mFJqMDQ61cP-qcXe118_-oL1TUqULdQZ8LdjSTym44I,19697
-caption_flow/storage/manager.py,sha256=KPExcKPuFVQSsBnfCBdne5PO4PwN4NTfd-EJQk13OY0,47459
-caption_flow/utils/__init__.py,sha256=bDcO5uR455TKCQ2hX-_XcdTnRXDBaT8Yn4jWqWzfFsE,120
-caption_flow/utils/auth.py,sha256=UrxX2n8OEEcfMD1Ey27TxGfrJFmUCpC59x-SCrQJoVE,2253
-caption_flow/utils/caption_utils.py,sha256=esUMAdcCkNjRroZ0Bhxv0_yKlLtMf0XeDCTt-5k6bik,5309
-caption_flow/utils/certificates.py,sha256=eu4blQZEkL9NRaY1ynQWg1asvDorRYhGRZea7STonJE,4635
-caption_flow/utils/checkpoint_tracker.py,sha256=nOZIIGsXTRUj09tFSnWtRgj_zoa8Og_-rutkr2GFz8Y,4417
-caption_flow/utils/chunk_tracker.py,sha256=JZIFvaHS5AYaVOzsSJKrnNlS4E3BdzV64cRkQa_65g0,21508
-caption_flow/utils/image_processor.py,sha256=wmOExkVfM7OeuLfX3AwMefsH-TxL8TNcn22gp0NmJKY,1541
-caption_flow/utils/json_utils.py,sha256=IiZYn8uCM-3pYmyIbX2fmaOIyutArn67SqAyp0ggNpU,5396
-caption_flow/utils/prompt_template.py,sha256=AKp0diSZqNBMwZkpiTNjw8-bbQwHStr7QZTOJ7o1dC4,4345
-caption_flow/utils/vllm_config.py,sha256=TC7Rmjk0zRKbBXbWUXrFL4Z58hzax_-4L0pXZn09hdM,6019
-caption_flow/workers/base.py,sha256=nEWohozFZ0Bw3_8U8xirnKLeZsGR5k69rSu4j-oDitc,7698
-caption_flow/workers/caption.py,sha256=swE4pYg4ZYAAtMxvyvlETa3wv4yKWUPXXulCAwPhPiQ,39477
-caption_flow/workers/data.py,sha256=0Tg8NE0wdONeMlivYQ4nvbcfWdLuU51O7vR8_YSnJgo,14813
-caption_flow-0.3.4.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
-caption_flow-0.3.4.dist-info/METADATA,sha256=dfB40EF_Zgz2Ux8qvdBbfLdhzY85_MUFRX-904I-qb4,9708
-caption_flow-0.3.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-caption_flow-0.3.4.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
-caption_flow-0.3.4.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
-caption_flow-0.3.4.dist-info/RECORD,,
{caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/WHEEL
File without changes
{caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/entry_points.txt
File without changes
{caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/licenses/LICENSE
File without changes
{caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/top_level.txt
File without changes