caption-flow 0.3.3-py3-none-any.whl → 0.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- caption_flow/__init__.py +3 -3
- caption_flow/cli.py +937 -416
- caption_flow/models.py +45 -3
- caption_flow/monitor.py +5 -3
- caption_flow/orchestrator.py +186 -116
- caption_flow/processors/__init__.py +3 -3
- caption_flow/processors/base.py +8 -7
- caption_flow/processors/huggingface.py +440 -68
- caption_flow/processors/local_filesystem.py +24 -28
- caption_flow/processors/webdataset.py +66 -25
- caption_flow/storage/exporter.py +420 -339
- caption_flow/storage/manager.py +636 -756
- caption_flow/utils/__init__.py +1 -1
- caption_flow/utils/auth.py +1 -1
- caption_flow/utils/caption_utils.py +1 -1
- caption_flow/utils/certificates.py +15 -8
- caption_flow/utils/checkpoint_tracker.py +41 -19
- caption_flow/utils/chunk_tracker.py +200 -65
- caption_flow/utils/image_processor.py +9 -9
- caption_flow/utils/json_utils.py +37 -20
- caption_flow/utils/prompt_template.py +24 -16
- caption_flow/utils/vllm_config.py +5 -4
- caption_flow/viewer.py +4 -12
- caption_flow/workers/base.py +12 -6
- caption_flow/workers/caption.py +272 -91
- caption_flow/workers/data.py +6 -8
- {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/METADATA +5 -4
- caption_flow-0.4.0.dist-info/RECORD +33 -0
- caption_flow-0.3.3.dist-info/RECORD +0 -33
- {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/WHEEL +0 -0
- {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/entry_points.txt +0 -0
- {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/top_level.txt +0 -0
caption_flow/workers/caption.py
CHANGED
@@ -7,34 +7,34 @@ os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
 import asyncio
 import json
 import logging
-import websockets
 import time
-from dataclasses import dataclass
-from typing import Dict, Any, Optional, List, Tuple, Union
-from queue import Queue, Empty
-from threading import Thread, Event, Lock
 from collections import defaultdict, deque
+from dataclasses import dataclass
+from queue import Empty, Queue
+from threading import Event, Lock, Thread
+from typing import Any, Dict, List, Optional, Tuple, Union
 
-
+import websockets
 from huggingface_hub import get_token
+from PIL import Image
 
-from …
+from ..models import ProcessingStage, StageResult
 from ..processors import (
+    HuggingFaceDatasetWorkerProcessor,
+    LocalFilesystemWorkerProcessor,
     ProcessorConfig,
+    WebDatasetWorkerProcessor,
     WorkAssignment,
-    WorkUnit,
     WorkResult,
-
-    HuggingFaceDatasetWorkerProcessor,
-    LocalFilesystemWorkerProcessor,
+    WorkUnit,
 )
-from ..utils.vllm_config import VLLMConfigManager
 from ..utils.image_processor import ImageProcessor
 from ..utils.prompt_template import PromptTemplateManager
-from ..…
+from ..utils.vllm_config import VLLMConfigManager
+from .base import BaseWorker
 
 logger = logging.getLogger(__name__)
-logger.setLevel(…
+logger.setLevel(os.environ.get("CAPTIONFLOW_LOG_LEVEL", "INFO").upper())
 
 
 @dataclass
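Note: the new module-level setLevel call makes worker log verbosity configurable from the environment. A minimal standalone sketch of the same pattern (the CAPTIONFLOW_LOG_LEVEL variable name is taken from the diff; everything else is illustrative):

import logging
import os

logger = logging.getLogger("caption_flow.workers.caption")
# Default to INFO; .upper() accepts lowercase values such as "debug".
logger.setLevel(os.environ.get("CAPTIONFLOW_LOG_LEVEL", "INFO").upper())

Running a worker with CAPTIONFLOW_LOG_LEVEL=DEBUG would then surface the new debug-level item logging added later in this file.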
@@ -72,8 +72,8 @@ class MultiStageVLLMManager:
             logger.info(f"Model {model_name} already loaded, reusing instance")
             return
 
-        from …
-        from …
+        from transformers import AutoProcessor, AutoTokenizer
+        from vllm import LLM
 
         logger.info(f"Loading model {model_name} for stage {stage.name}")
 
@@ -248,7 +248,13 @@ class CaptionWorker(BaseWorker):
     async def _initial_connect_for_config(self):
         """Connect initially just to get configuration."""
         logger.info(f"Connecting to {self.server_url}")
-        async with websockets.connect(…
+        async with websockets.connect(
+            self.server_url,
+            ssl=self.ssl_context,
+            ping_interval=20,
+            ping_timeout=60,
+            close_timeout=10,
+        ) as websocket:
             await websocket.send(json.dumps(self._get_auth_data()))
 
             welcome = await websocket.recv()
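Note: the connect call now spells out its keepalive budget instead of relying on library defaults: a ping every 20 s, up to 60 s to receive the pong, and 10 s for a clean close, which keeps long GPU batches from looking like a dead connection. A minimal sketch of the same pattern with the websockets client (the URL and auth payload are placeholders):

import asyncio
import json

import websockets

async def fetch_config(url: str, auth: dict) -> dict:
    # A generous ping_timeout tolerates an event loop that is slow to
    # answer pings while inference is running.
    async with websockets.connect(
        url, ping_interval=20, ping_timeout=60, close_timeout=10
    ) as ws:
        await ws.send(json.dumps(auth))
        return json.loads(await ws.recv())

# asyncio.run(fetch_config("wss://localhost:8765", {"token": "example"}))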
@@ -299,7 +305,7 @@ class CaptionWorker(BaseWorker):
             self.processor = LocalFilesystemWorkerProcessor()
         else:
             raise ValueError(f"Unknown processor type: {self.processor_type}")
-
+        self.processor.gpu_id = self.gpu_id
         self.processor.initialize(processor_config)
         self.dataset_path = self.processor.dataset_path
         self.units_per_request = processor_config.config.get("chunks_per_request", 1)
@@ -457,7 +463,7 @@ class CaptionWorker(BaseWorker):
         # Check if stages changed significantly
         stages_changed = len(new_stages) != len(self.stages)
         if not stages_changed:
-            for old, new in zip(self.stages, new_stages):
+            for old, new in zip(self.stages, new_stages, strict=False):
                 if (
                     old.name != new.name
                     or old.model != new.model
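Note: strict=False keeps zip's historical truncate-at-the-shorter-input behavior while making that choice explicit for linters; the flag exists since Python 3.10, which the new >=3.11 floor in METADATA guarantees. For illustration:

old_stages = ["caption", "tags"]
new_stages = ["caption", "tags", "safety"]

# strict=False (the default behavior) stops at the shorter input; safe here
# because a length mismatch is already detected via stages_changed above.
pairs = list(zip(old_stages, new_stages, strict=False))
# [('caption', 'caption'), ('tags', 'tags')]

# zip(old_stages, new_stages, strict=True) would raise ValueError instead.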
@@ -574,6 +580,7 @@ class CaptionWorker(BaseWorker):
 
         try:
             # Create processing item
+            logger.debug(f"Processing item data: {item_data}")
             item = ProcessingItem(
                 unit_id=unit.unit_id,
                 chunk_id=unit.chunk_id,
@@ -604,34 +611,64 @@ class CaptionWorker(BaseWorker):
             if batch and not self.should_stop_processing.is_set():
                 self._process_batch(batch)
 
-            # Notify orchestrator
+            # Notify orchestrator about unit completion or failure
             # Check if the number of processed items matches the expected count for the unit.
             # The context dictionary holds the count of items yielded by the processor.
             total_items_in_unit = unit.unit_size
 
-            if (
-                …
+            if not self.should_stop_processing.is_set() and self.connected.is_set():
+                if self.items_failed == 0 and self.items_processed >= total_items_in_unit:
+                    # Unit completed successfully
+                    if self.websocket:
+                        try:
+                            asyncio.run_coroutine_threadsafe(
+                                self.websocket.send(
+                                    json.dumps({"type": "work_complete", "unit_id": unit.unit_id})
+                                ),
+                                self.main_loop,
+                            ).result(timeout=5)
+                            logger.info(
+                                f"Unit {unit.unit_id} fully processed "
+                                f"({self.items_processed}/{total_items_in_unit}) and marked complete."
+                            )
+                        except Exception as e:
+                            logger.warning(
+                                f"Could not notify work complete for unit {unit.unit_id}: {e}"
+                            )
+                else:
+                    # Unit failed or was incomplete
+                    if self.items_failed > 0:
+                        error_msg = (
+                            f"Processing failed for {self.items_failed} out of "
+                            f"{total_items_in_unit} items"
                         )
-                …
+                        logger.error(f"Unit {unit.unit_id} failed: {error_msg}")
+                    else:
+                        error_msg = (
+                            f"Processing incomplete: {self.items_processed}/"
+                            f"{total_items_in_unit} items processed"
+                        )
+                        logger.warning(f"Unit {unit.unit_id} incomplete: {error_msg}")
+
+                    if self.websocket:
+                        try:
+                            asyncio.run_coroutine_threadsafe(
+                                self.websocket.send(
+                                    json.dumps(
+                                        {
+                                            "type": "work_failed",
+                                            "unit_id": unit.unit_id,
+                                            "error": error_msg,
+                                        }
+                                    )
+                                ),
+                                self.main_loop,
+                            ).result(timeout=5)
+                            logger.info(f"Unit {unit.unit_id} failure reported to orchestrator")
+                        except Exception as e:
+                            logger.warning(f"Could not notify work failed for unit {unit.unit_id}: {e}")
             else:
-                logger.…
-                    f"Processing of unit {unit.unit_id} was incomplete ({self.items_processed}/{total_items_in_unit}). Not marking as complete."
-                )
+                logger.info(f"Unit {unit.unit_id} processing stopped due to disconnect or shutdown")
 
     def _process_batch(self, batch: List[ProcessingItem]):
         """Process a batch of items through all stages."""
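Note: these notifications run on the inference thread, so they hop onto the worker's asyncio loop with run_coroutine_threadsafe and block for at most five seconds waiting for the send to land. A minimal sketch of that thread-to-loop bridge (the names are illustrative, not the worker's exact API):

import asyncio
import json

def notify_complete(websocket, loop: asyncio.AbstractEventLoop, unit_id: str) -> None:
    # Schedule the coroutine on the loop that owns the websocket; calling
    # websocket.send() directly from this thread would not be safe.
    future = asyncio.run_coroutine_threadsafe(
        websocket.send(json.dumps({"type": "work_complete", "unit_id": unit_id})),
        loop,
    )
    # Block this (non-async) thread until the send finishes or times out.
    future.result(timeout=5)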
@@ -666,6 +703,20 @@ class CaptionWorker(BaseWorker):
         except Exception as e:
             logger.error(f"Batch processing error: {e}", exc_info=True)
 
+            # Mark all items in batch as failed
+            self.items_failed += len(batch)
+
+            # Send error results for each item in the batch
+            for item in batch:
+                self.result_queue.put(
+                    {
+                        "item": item,
+                        "outputs": {},
+                        "processing_time_ms": 0.0,
+                        "error": f"Batch processing failed: {str(e)}",
+                    }
+                )
+
     def _process_batch_mock(self, batch: List[ProcessingItem]) -> List[Tuple[ProcessingItem, Dict]]:
         """Process a batch in mock mode - return dummy captions."""
         results = []
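Note: a batch-level exception now produces one explicit error result per item instead of letting the items vanish, so consumers can distinguish failures by the "error" key. A hypothetical drain loop over the same result shape:

from queue import Empty, Queue

def drain_results(result_queue: Queue) -> None:
    while True:
        try:
            result = result_queue.get(timeout=1.0)
        except Empty:
            break  # nothing more for now
        if result.get("error"):
            # Failed entries carry empty outputs plus a readable reason.
            print("failed:", result["error"])
        else:
            print("ok:", list(result["outputs"]))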
@@ -680,9 +731,9 @@ class CaptionWorker(BaseWorker):
 
         # Create mock outputs based on stage prompts
         stage_outputs = []
-        for i, prompt in enumerate(stage.prompts):
+        for i, _prompt in enumerate(stage.prompts):
             mock_output = (
-                f"Mock {stage_name} output {i+1} for job {item.job_id} - {item.item_key}"
+                f"Mock {stage_name} output {i + 1} for job {item.job_id} - {item.item_key}"
             )
             stage_outputs.append(mock_output)
 
@@ -707,12 +758,129 @@ class CaptionWorker(BaseWorker):
 
         return results
 
+    def _validate_and_split_batch(
+        self,
+        batch: List[ProcessingItem],
+        stage: ProcessingStage,
+        processor,
+        tokenizer,
+        sampling_params,
+        max_length: int = 16384,
+    ) -> Tuple[List[ProcessingItem], List[ProcessingItem]]:
+        """Validate batch items and split into processable and too-long items."""
+        logger.debug(
+            f"Validating batch of size {len(batch)} for stage '{stage.name}' "
+            f"with max_length {max_length}"
+        )
+        processable = []
+        too_long = []
+
+        for item in batch:
+            try:
+                # Create a test prompt for this item
+                converted_img = ImageProcessor.prepare_for_inference(item)
+                template_manager = PromptTemplateManager(
+                    stage.prompts[:1]
+                )  # Test with first prompt
+
+                # Build context
+                context = item.metadata.copy()
+                for prev_stage_name, stage_result in item.stage_results.items():
+                    for i, output in enumerate(stage_result.outputs):
+                        context[f"{prev_stage_name}_output_{i}"] = output
+                    if len(stage_result.outputs) == 1:
+                        context[stage_result.output_field] = stage_result.outputs[0]
+                    else:
+                        context[stage_result.output_field] = stage_result.outputs
+                logger.debug(f"Validation context for {item.item_key}: {context}")
+
+                # Format test prompt
+                formatted_prompts = template_manager.format_all(context)
+                if not formatted_prompts:
+                    logger.warning(
+                        f"Could not format prompt for {item.item_key}, marking as too long."
+                    )
+                    too_long.append(item)
+                    continue
+
+                logger.debug(
+                    f"Formatted validation prompt for {item.item_key}: {formatted_prompts[0]}"
+                )
+
+                # Build actual vLLM input to test
+                test_req = self._build_vllm_input(
+                    converted_img, formatted_prompts[0], processor, tokenizer
+                )
+
+                # Use processor to get actual token count
+                if "prompt_token_ids" in test_req:
+                    prompt_length = len(test_req["prompt_token_ids"])
+                else:
+                    # Fallback to tokenizer
+                    prompt_length = len(tokenizer.encode(test_req.get("prompt", "")))
+
+                # Check individual prompt length (prompts are processed one by one)
+                # Use a small safety buffer to account for token estimation variations
+                safety_buffer = 50
+                if prompt_length < max_length - safety_buffer:
+                    processable.append(item)
+                    logger.debug(
+                        f"Item {item.item_key} validated: {prompt_length} tokens per prompt"
+                    )
+                else:
+                    too_long.append(item)
+                    logger.warning(
+                        f"Item {item.item_key} too long: {prompt_length} tokens "
+                        f"vs max {max_length - safety_buffer} (with safety buffer)"
+                    )
+
+            except Exception as e:
+                logger.error(f"Error validating item {item.item_key}: {e}", exc_info=True)
+                too_long.append(item)
+
+        logger.debug(
+            f"Validation complete: {len(processable)} processable, {len(too_long)} too long."
+        )
+        return processable, too_long
+
+    def _resize_image_for_tokens(
+        self, item: ProcessingItem, target_ratio: float = 0.7
+    ) -> ProcessingItem:
+        """Resize image to reduce token count."""
+        if not item.image:
+            return item
+
+        # Calculate new size
+        new_width = int(item.image.width * target_ratio)
+        new_height = int(item.image.height * target_ratio)
+
+        # Resize image
+        resized_image = item.image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+        # Create new item with resized image
+        new_item = ProcessingItem(
+            unit_id=item.unit_id,
+            job_id=item.job_id,
+            chunk_id=item.chunk_id,
+            item_key=item.item_key,
+            item_index=item.item_index,
+            image=resized_image,
+            image_data=item.image_data,  # Keep original data for metadata
+            metadata={**item.metadata, "_resized": True, "_resize_ratio": target_ratio},
+            stage_results=item.stage_results.copy(),
+        )
+
+        return new_item
+
     def _process_batch_multi_stage(
         self, batch: List[ProcessingItem], max_attempts: int = 3
     ) -> List[Tuple[ProcessingItem, Dict]]:
-        """Process a batch through all stages…
+        """Process a batch through all stages with token validation."""
        results = []
 
+        # Get max model length from config
+        max_model_len = self.vllm_config.get("max_model_len", 16384)
+
         # Process each stage in order
         for stage_name in self.stage_order:
             stage = next(s for s in self.stages if s.name == stage_name)
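Note: _validate_and_split_batch measures each prompt the way vLLM will see it, preferring the exact prompt_token_ids over a tokenizer re-encode, and keeps a 50-token safety buffer under max_model_len. The core budgeting idea, reduced to a standalone sketch (the tokenizer is assumed to expose an encode() method, as Hugging Face tokenizers do):

def split_by_token_budget(prompts, tokenizer, max_length=16384, safety_buffer=50):
    """Partition prompts into (fits, too_long) against a token budget."""
    fits, too_long = [], []
    for prompt in prompts:
        n_tokens = len(tokenizer.encode(prompt))
        if n_tokens < max_length - safety_buffer:
            fits.append(prompt)
        else:
            too_long.append(prompt)
    return fits, too_long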
@@ -723,26 +891,68 @@ class CaptionWorker(BaseWorker):
                 stage_name, stage.model
             )
 
-            # …
-
+            # Validate batch before processing
+            processable_batch, too_long_items = self._validate_and_split_batch(
+                batch, stage, processor, tokenizer, sampling_params, max_model_len
+            )
+
+            # Handle items that are too long
+            for item in too_long_items:
+                logger.warning(f"Item {item.item_key} exceeds token limit, attempting resize")
 
-
-
-            requests = []
+                # Try resizing the image
+                resized_item = self._resize_image_for_tokens(item, target_ratio=0.7)
 
-
-
+                # Re-validate
+                resized_processable, still_too_long = self._validate_and_split_batch(
+                    [resized_item], stage, processor, tokenizer, sampling_params, max_model_len
+                )
 
-
-
+                if resized_processable:
+                    processable_batch.extend(resized_processable)
+                    logger.info(f"Successfully resized {item.item_key} for processing")
+                else:
+                    # Try even smaller
+                    resized_item = self._resize_image_for_tokens(item, target_ratio=0.5)
+                    resized_processable, still_too_long = self._validate_and_split_batch(
+                        [resized_item], stage, processor, tokenizer, sampling_params, max_model_len
+                    )
 
-
+                    if resized_processable:
+                        processable_batch.extend(resized_processable)
+                        logger.info(f"Successfully resized {item.item_key} to 50% for processing")
+                    else:
+                        logger.error(f"Item {item.item_key} still too long after resize, skipping")
+                        self.items_failed += 1
+
+                        # Send error result
+                        stage_result = StageResult(
+                            stage_name=stage_name,
+                            output_field=stage.output_field,
+                            outputs=[],
+                            error="Image too large even after resizing",
+                        )
+                        item.stage_results[stage_name] = stage_result
+
+                        self.result_queue.put(
+                            {
+                                "item": item,
+                                "outputs": {},
+                                "processing_time_ms": 0.0,
+                                "error": f"Failed stage {stage_name}: token limit exceeded",
+                            }
+                        )
+
+            # Process the validated batch
+            if processable_batch:
+                # Build requests for processable items
+                requests = []
+                for item in processable_batch:
+                    converted_img = ImageProcessor.prepare_for_inference(item)
                     template_manager = PromptTemplateManager(stage.prompts)
 
                     # Build context
                     context = item.metadata.copy()
-
-                    # Add previous stage results
                     for prev_stage_name, stage_result in item.stage_results.items():
                         for i, output in enumerate(stage_result.outputs):
                             context[f"{prev_stage_name}_output_{i}"] = output
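Note: oversized items get two shrink attempts, to 70 % and then 50 % of the original dimensions, before being failed outright; for vision models the image-token count roughly tracks image area, so each step buys real headroom. The fallback ladder as a compact sketch (resize and validate stand in for the worker's real methods):

def shrink_until_fits(item, resize, validate, ratios=(0.7, 0.5)):
    """Return a resized item that passes validation, or None if none fits."""
    for ratio in ratios:
        candidate = resize(item, target_ratio=ratio)
        if validate(candidate):
            return candidate
    return None  # caller records the failure and emits an error result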
@@ -763,14 +973,7 @@ class CaptionWorker(BaseWorker):
                 outputs = llm.generate(requests, sampling_params)
 
                 # Process outputs
-
-                failed_items = []
-
-                for idx, (original_idx, item, attempt_count) in enumerate(current_batch):
-                    if self.should_stop_processing.is_set():
-                        return results
-
-                    # Extract outputs
+                for idx, item in enumerate(processable_batch):
                     base_idx = idx * len(stage.prompts)
                     stage_outputs = []
 
|
|
782
985
|
stage_outputs.append(cleaned_output)
|
783
986
|
|
784
987
|
if stage_outputs:
|
785
|
-
# Success
|
786
988
|
stage_result = StageResult(
|
787
989
|
stage_name=stage_name,
|
788
990
|
output_field=stage.output_field,
|
789
991
|
outputs=stage_outputs,
|
790
992
|
)
|
791
993
|
item.stage_results[stage_name] = stage_result
|
792
|
-
successful_items.append((original_idx, item))
|
793
994
|
else:
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
|
798
|
-
|
799
|
-
self.items_failed += 1
|
800
|
-
stage_result = StageResult(
|
801
|
-
stage_name=stage_name,
|
802
|
-
output_field=stage.output_field,
|
803
|
-
outputs=[],
|
804
|
-
error=f"Failed after {max_attempts} attempts",
|
805
|
-
)
|
806
|
-
item.stage_results[stage_name] = stage_result
|
807
|
-
self.result_queue.put(
|
808
|
-
{
|
809
|
-
"item": item,
|
810
|
-
"outputs": {},
|
811
|
-
"processing_time_ms": 0.0,
|
812
|
-
"error": f"Failed stage {stage_name} after {max_attempts} attempts",
|
813
|
-
}
|
814
|
-
)
|
815
|
-
|
816
|
-
# Update for next iteration
|
817
|
-
items_to_process = failed_items
|
818
|
-
batch = [item for _, item in successful_items]
|
995
|
+
logger.error(f"No outputs for {item.item_key} in stage {stage_name}")
|
996
|
+
self.items_failed += 1
|
997
|
+
|
998
|
+
# Update batch for next stage
|
999
|
+
batch = processable_batch
|
819
1000
|
|
820
1001
|
# Convert to results
|
821
1002
|
for item in batch:
|
caption_flow/workers/data.py
CHANGED
@@ -1,20 +1,18 @@
 """DataWorker for retrieving data from various sources and forwarding to orchestrator or storage."""
 
 import asyncio
+import io
 import json
 import logging
-import io
-import time
 from dataclasses import dataclass
 from pathlib import Path
-from …
-from …
-from …
+from queue import Empty, Queue
+from threading import Event
+from typing import Any, AsyncIterator, Dict, Optional
 
+import boto3
 import pandas as pd
 import pyarrow.parquet as pq
-from PIL import Image
-import boto3
 from botocore.config import Config
 
 from .base import BaseWorker
@@ -179,7 +177,7 @@ class DataWorker(BaseWorker):
             try:
                 self.send_queue.put_nowait(batch)
                 batch = []
-            except:
+            except Exception:
                 # Queue full, wait
                 await asyncio.sleep(1)
 
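Note: replacing the bare except with except Exception keeps the queue-full backoff from also swallowing KeyboardInterrupt and SystemExit, which derive from BaseException rather than Exception. The distinction in a runnable sketch:

import queue

q: queue.Queue = queue.Queue(maxsize=1)
q.put("first")
try:
    q.put_nowait("second")  # raises queue.Full
except Exception:  # catches queue.Full and other ordinary errors,
    pass           # but lets KeyboardInterrupt / SystemExit propagate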
{caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: caption-flow
-Version: 0.3.3
+Version: 0.4.0
 Summary: Self-contained distributed community captioning system
 Author-email: bghira <bghira@users.github.com>
 License: MIT
@@ -9,10 +9,9 @@ Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Requires-Python: <3.13,>=3.10
+Requires-Python: <3.13,>=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: websockets>=12.0
@@ -35,7 +34,9 @@ Requires-Dist: boto3<2.0.0,>=1.40.11
 Requires-Dist: torchdata<0.12.0,>=0.11.0
 Requires-Dist: textual<6.0.0,>=5.3.0
 Requires-Dist: urwid<4.0.0,>=3.0.2
-Requires-Dist: webshart<0.5.0,>=0.4.…
+Requires-Dist: webshart<0.5.0,>=0.4.3
+Requires-Dist: pylance<0.36.0,>=0.35.0
+Requires-Dist: duckdb<2.0.0,>=1.3.2
 Provides-Extra: dev
 Requires-Dist: pytest>=7.4.0; extra == "dev"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
caption_flow-0.4.0.dist-info/RECORD
ADDED
@@ -0,0 +1,33 @@
+caption_flow/__init__.py,sha256=IZoOP8s4lN05e6ww9M5HWVfwYOughmS_tDgG-BLajFo,303
+caption_flow/cli.py,sha256=J_rjzhYvVyfoOvKQE4PsMSa_YO58iaKk6yi7kRDUYPU,57688
+caption_flow/models.py,sha256=6-IJj_B3HAarucoLo8_PncJRnxofHuLFCsyRnmUXgRk,7063
+caption_flow/monitor.py,sha256=j5RExadSLOUujVZQMe7btMeKNlq-WbZ9bYqfikgYJ8Q,7972
+caption_flow/orchestrator.py,sha256=MWQKaAclI9rMjn7mWdvoSzl9y4b7bU_24aVr8I1YGhE,39645
+caption_flow/viewer.py,sha256=40w2Zj7GaXbK-dgqvYYdFrMzSDE_ZPWNZc6kS0OrymQ,20281
+caption_flow/processors/__init__.py,sha256=l1udEZLxAmqwFYS4-3GsRVcPT6WxnDOIk0s0UqsZsJM,423
+caption_flow/processors/base.py,sha256=Zx6kRZSqG969x8kYJ5VY2Mo5mLeWEgBCEpo8D4GjsBM,6935
+caption_flow/processors/huggingface.py,sha256=LELbCkvALoKSVf5zGOEL3f3nQG_UcRcPu0ZNZU95B3k,60222
+caption_flow/processors/local_filesystem.py,sha256=auAWxnqplEH4YJ1DWZCaFmAd03iyhNLudgt71N8O7NE,27827
+caption_flow/processors/webdataset.py,sha256=66y_7KaJBBntJqBHYKLzCXkBi9ly-TfYYaTCp_7pqTo,34206
+caption_flow/storage/__init__.py,sha256=IVnzcSCPpPuyp-QLlgJirRZ9Sb3tR0F4sfuF5u2cNMk,36
+caption_flow/storage/exporter.py,sha256=6atbxWgxSu_5qg9l8amwgkXRL1SKTZQb2yryu62yPc8,22371
+caption_flow/storage/manager.py,sha256=2jkyNl-2_B2Z7NfjCBua-Jgo7Km_JmJqMKrYsYj5uF4,41416
+caption_flow/utils/__init__.py,sha256=ULJImkcFPc8QH2Zz6TW7AeVXMFdRpvfni2MgEo_PRyY,120
+caption_flow/utils/auth.py,sha256=6HRNnWfX1j1Jh55M23crfSA1olkFGg-9kZ5Booy5wCM,2253
+caption_flow/utils/caption_utils.py,sha256=7k6GnElIAqyyzDHQd3JC3Ffr7r57sFWqS3ET7itzdoM,5309
+caption_flow/utils/certificates.py,sha256=NiHSeeZYKrf5BpAkwg5qOe-1C7-z42jZO3pjQo0N3I8,4889
+caption_flow/utils/checkpoint_tracker.py,sha256=LoCGjb30QOcMESHLF5hKVCd8X8_gWACyyq9EKLTXIn4,4613
+caption_flow/utils/chunk_tracker.py,sha256=And1krrTvpfiwG7xRxh9n6xy-_W8MSWSkcGmFSDFnB8,25460
+caption_flow/utils/image_processor.py,sha256=_dmiKXcAKxjkQ6d9V5QgoZSf_dDOL52tFMOEXa3iA24,1581
+caption_flow/utils/json_utils.py,sha256=AaGcNTToUcVYCQj2TXs2D_hxc_LeEqFquiK4CquS0U8,5537
+caption_flow/utils/prompt_template.py,sha256=mq7FPnpjp8gVCMMh4NtRf0vL_B9LDMuBkbySvACRSZM,4401
+caption_flow/utils/vllm_config.py,sha256=xFOnmniQGkUGwfTabfW6R0V01TF-_rN1UYJy0HwOvUI,6026
+caption_flow/workers/base.py,sha256=Yh_PBsL3j1kXUuIOQHqIdR69Nepfq11je23i01iWSxw,7714
+caption_flow/workers/caption.py,sha256=KnvRcZ6-Nc2JwastgqpQ8WfCw_AOzWBS-etYXEXJ6Os,47201
+caption_flow/workers/data.py,sha256=iWnTM7UgpJeFzhSTly-gHzFu5sIYUGG-XO4yRNn_MQk,14775
+caption_flow-0.4.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+caption_flow-0.4.0.dist-info/METADATA,sha256=e1sdcAeXR-nYlRZlrDtvwXBuRPb1J-_jzTzIvWevsHs,9732
+caption_flow-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+caption_flow-0.4.0.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
+caption_flow-0.4.0.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
+caption_flow-0.4.0.dist-info/RECORD,,
caption_flow-0.3.3.dist-info/RECORD
REMOVED
@@ -1,33 +0,0 @@
-caption_flow/__init__.py,sha256=hNewpvkdcuW2JWSuF1u0gfovBCTRPwbIDlqlvLTWYGI,303
-caption_flow/cli.py,sha256=t_cYCxJE7f5UtB3br2Es51JjO5KPsWM1JTdDXAxM_Lw,41371
-caption_flow/models.py,sha256=2n6iphTEL62xK2FFcJM6axMsaE8KwsUv5Ak_cCF-TdQ,5652
-caption_flow/monitor.py,sha256=bAt9EJqfPgT_KdbknGdCxwBRH002pRDgyUmYIj6Dyso,7885
-caption_flow/orchestrator.py,sha256=de3AuO-0zd8w-ESfjPK9U1e8lWr6ucgE3VMX0AZSM7Q,36193
-caption_flow/viewer.py,sha256=HxO98eHR1xtivG0dEdYC2U9T_RgeRfJqqTK-37u9bNM,20471
-caption_flow/processors/__init__.py,sha256=hvq-OuAJWQe6hFglKe7QmkS8473k20FmxZDSxfXpCrg,423
-caption_flow/processors/base.py,sha256=IAEr0pqHRuSkXunvDWk1vf2IKeYQ-2YERqej9iSQm94,6931
-caption_flow/processors/huggingface.py,sha256=w0j7PRosXYyJXZ0A0Y-J6_n-aHCGVW8tbt8lcvguO_Y,41237
-caption_flow/processors/local_filesystem.py,sha256=OuNNDemy0sdtpBBC_5GbI-c1vMqp8OIz983Cq85gdb8,27964
-caption_flow/processors/webdataset.py,sha256=Em-GssF27oSctG15TANwEeHIzmyNl4sTSdtX02010Lo,32144
-caption_flow/storage/__init__.py,sha256=IVnzcSCPpPuyp-QLlgJirRZ9Sb3tR0F4sfuF5u2cNMk,36
-caption_flow/storage/exporter.py,sha256=mFJqMDQ61cP-qcXe118_-oL1TUqULdQZ8LdjSTym44I,19697
-caption_flow/storage/manager.py,sha256=KPExcKPuFVQSsBnfCBdne5PO4PwN4NTfd-EJQk13OY0,47459
-caption_flow/utils/__init__.py,sha256=bDcO5uR455TKCQ2hX-_XcdTnRXDBaT8Yn4jWqWzfFsE,120
-caption_flow/utils/auth.py,sha256=UrxX2n8OEEcfMD1Ey27TxGfrJFmUCpC59x-SCrQJoVE,2253
-caption_flow/utils/caption_utils.py,sha256=esUMAdcCkNjRroZ0Bhxv0_yKlLtMf0XeDCTt-5k6bik,5309
-caption_flow/utils/certificates.py,sha256=eu4blQZEkL9NRaY1ynQWg1asvDorRYhGRZea7STonJE,4635
-caption_flow/utils/checkpoint_tracker.py,sha256=-nN5gLvXyMdKOCT2SNNL2Km6UYm2Hii9wuXeezWhwx4,3339
-caption_flow/utils/chunk_tracker.py,sha256=HntWeINTbJmIERsW21p4q4FK8D9-4xKbZQUsj24DIqo,19975
-caption_flow/utils/image_processor.py,sha256=wmOExkVfM7OeuLfX3AwMefsH-TxL8TNcn22gp0NmJKY,1541
-caption_flow/utils/json_utils.py,sha256=IiZYn8uCM-3pYmyIbX2fmaOIyutArn67SqAyp0ggNpU,5396
-caption_flow/utils/prompt_template.py,sha256=AKp0diSZqNBMwZkpiTNjw8-bbQwHStr7QZTOJ7o1dC4,4345
-caption_flow/utils/vllm_config.py,sha256=TC7Rmjk0zRKbBXbWUXrFL4Z58hzax_-4L0pXZn09hdM,6019
-caption_flow/workers/base.py,sha256=2AGWERC5hbmO-0V_A1MUbgRVvRNN3blqGPyDokvvzmM,7575
-caption_flow/workers/caption.py,sha256=X4BEmb6C1c73hvgJDMsHtgCUlCuECtnloWSVolVpa4s,39353
-caption_flow/workers/data.py,sha256=0Tg8NE0wdONeMlivYQ4nvbcfWdLuU51O7vR8_YSnJgo,14813
-caption_flow-0.3.3.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
-caption_flow-0.3.3.dist-info/METADATA,sha256=GBf1DAFTM6a_o-6-CaIcm3k5t_gFwzDmXc4lFaOAqY8,9708
-caption_flow-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-caption_flow-0.3.3.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
-caption_flow-0.3.3.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
-caption_flow-0.3.3.dist-info/RECORD,,
{caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/WHEEL
File without changes
{caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/entry_points.txt
File without changes
{caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/licenses/LICENSE
File without changes
{caption_flow-0.3.3.dist-info → caption_flow-0.4.0.dist-info}/top_level.txt
File without changes