caption-flow 0.3.4-py3-none-any.whl → 0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. caption_flow/__init__.py +3 -3
  2. caption_flow/cli.py +934 -415
  3. caption_flow/models.py +45 -3
  4. caption_flow/monitor.py +2 -3
  5. caption_flow/orchestrator.py +153 -104
  6. caption_flow/processors/__init__.py +3 -3
  7. caption_flow/processors/base.py +8 -7
  8. caption_flow/processors/huggingface.py +439 -67
  9. caption_flow/processors/local_filesystem.py +24 -28
  10. caption_flow/processors/webdataset.py +28 -22
  11. caption_flow/storage/exporter.py +420 -339
  12. caption_flow/storage/manager.py +636 -756
  13. caption_flow/utils/__init__.py +1 -1
  14. caption_flow/utils/auth.py +1 -1
  15. caption_flow/utils/caption_utils.py +1 -1
  16. caption_flow/utils/certificates.py +15 -8
  17. caption_flow/utils/checkpoint_tracker.py +30 -28
  18. caption_flow/utils/chunk_tracker.py +153 -56
  19. caption_flow/utils/image_processor.py +9 -9
  20. caption_flow/utils/json_utils.py +37 -20
  21. caption_flow/utils/prompt_template.py +24 -16
  22. caption_flow/utils/vllm_config.py +5 -4
  23. caption_flow/viewer.py +4 -12
  24. caption_flow/workers/base.py +5 -4
  25. caption_flow/workers/caption.py +265 -90
  26. caption_flow/workers/data.py +6 -8
  27. {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/METADATA +5 -4
  28. caption_flow-0.4.0.dist-info/RECORD +33 -0
  29. caption_flow-0.3.4.dist-info/RECORD +0 -33
  30. {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/WHEEL +0 -0
  31. {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/entry_points.txt +0 -0
  32. {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
  33. {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/top_level.txt +0 -0
caption_flow/workers/caption.py
@@ -7,34 +7,34 @@ os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
  import asyncio
  import json
  import logging
- import websockets
  import time
- from dataclasses import dataclass
- from typing import Dict, Any, Optional, List, Tuple, Union
- from queue import Queue, Empty
- from threading import Thread, Event, Lock
  from collections import defaultdict, deque
+ from dataclasses import dataclass
+ from queue import Empty, Queue
+ from threading import Event, Lock, Thread
+ from typing import Any, Dict, List, Optional, Tuple, Union
 
- from PIL import Image
+ import websockets
  from huggingface_hub import get_token
+ from PIL import Image
 
- from .base import BaseWorker
+ from ..models import ProcessingStage, StageResult
  from ..processors import (
+     HuggingFaceDatasetWorkerProcessor,
+     LocalFilesystemWorkerProcessor,
      ProcessorConfig,
+     WebDatasetWorkerProcessor,
      WorkAssignment,
-     WorkUnit,
      WorkResult,
-     WebDatasetWorkerProcessor,
-     HuggingFaceDatasetWorkerProcessor,
-     LocalFilesystemWorkerProcessor,
+     WorkUnit,
  )
- from ..utils.vllm_config import VLLMConfigManager
  from ..utils.image_processor import ImageProcessor
  from ..utils.prompt_template import PromptTemplateManager
- from ..models import ProcessingStage, StageResult
+ from ..utils.vllm_config import VLLMConfigManager
+ from .base import BaseWorker
 
  logger = logging.getLogger(__name__)
- logger.setLevel(logging.INFO)
+ logger.setLevel(os.environ.get("CAPTIONFLOW_LOG_LEVEL", "INFO").upper())
 
 
  @dataclass
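The logger change above replaces the hard-coded INFO level with an environment-driven one. A minimal sketch of that pattern on its own, using the same variable name as the diff (behavior inferred from the one-liner):

    import logging
    import os

    logger = logging.getLogger("caption_flow.workers.caption")
    # Any standard level name works: DEBUG, INFO, WARNING, ERROR, CRITICAL.
    logger.setLevel(os.environ.get("CAPTIONFLOW_LOG_LEVEL", "INFO").upper())

Running a worker with CAPTIONFLOW_LOG_LEVEL=DEBUG set would then surface the new logger.debug calls introduced elsewhere in this release.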
@@ -72,8 +72,8 @@ class MultiStageVLLMManager:
              logger.info(f"Model {model_name} already loaded, reusing instance")
              return
 
-         from vllm import LLM, SamplingParams
-         from transformers import AutoTokenizer, AutoProcessor
+         from transformers import AutoProcessor, AutoTokenizer
+         from vllm import LLM
 
          logger.info(f"Loading model {model_name} for stage {stage.name}")
 
@@ -305,7 +305,7 @@ class CaptionWorker(BaseWorker):
              self.processor = LocalFilesystemWorkerProcessor()
          else:
              raise ValueError(f"Unknown processor type: {self.processor_type}")
-
+         self.processor.gpu_id = self.gpu_id
          self.processor.initialize(processor_config)
          self.dataset_path = self.processor.dataset_path
          self.units_per_request = processor_config.config.get("chunks_per_request", 1)
@@ -463,7 +463,7 @@ class CaptionWorker(BaseWorker):
          # Check if stages changed significantly
          stages_changed = len(new_stages) != len(self.stages)
          if not stages_changed:
-             for old, new in zip(self.stages, new_stages):
+             for old, new in zip(self.stages, new_stages, strict=False):
                  if (
                      old.name != new.name
                      or old.model != new.model
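The added strict=False makes zip's truncating behavior explicit (the surrounding length check already guarantees equal lengths, so this mostly documents intent). A quick illustration of the flag, independent of the package:

    old = ["caption", "refine"]
    new = ["caption", "refine", "upscale"]

    # strict=False silently stops at the shorter input ...
    assert list(zip(old, new, strict=False)) == [("caption", "caption"), ("refine", "refine")]

    # ... while strict=True raises ValueError on a length mismatch.
    try:
        list(zip(old, new, strict=True))
    except ValueError:
        pass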
@@ -580,6 +580,7 @@ class CaptionWorker(BaseWorker):
 
              try:
                  # Create processing item
+                 logger.debug(f"Processing item data: {item_data}")
                  item = ProcessingItem(
                      unit_id=unit.unit_id,
                      chunk_id=unit.chunk_id,
@@ -610,34 +611,64 @@ class CaptionWorker(BaseWorker):
          if batch and not self.should_stop_processing.is_set():
              self._process_batch(batch)
 
-         # Notify orchestrator that unit is complete
+         # Notify orchestrator about unit completion or failure
          # Check if the number of processed items matches the expected count for the unit.
          # The context dictionary holds the count of items yielded by the processor.
          total_items_in_unit = unit.unit_size
 
-         if (
-             not self.should_stop_processing.is_set()
-             and self.connected.is_set()
-             and self.items_failed == 0
-             and self.items_processed >= total_items_in_unit
-         ):
-             if self.websocket:
-                 try:
-                     asyncio.run_coroutine_threadsafe(
-                         self.websocket.send(
-                             json.dumps({"type": "work_complete", "unit_id": unit.unit_id})
-                         ),
-                         self.main_loop,
-                     ).result(timeout=5)
-                     logger.info(
-                         f"Unit {unit.unit_id} fully processed ({self.items_processed}/{total_items_in_unit}) and marked complete."
+         if not self.should_stop_processing.is_set() and self.connected.is_set():
+             if self.items_failed == 0 and self.items_processed >= total_items_in_unit:
+                 # Unit completed successfully
+                 if self.websocket:
+                     try:
+                         asyncio.run_coroutine_threadsafe(
+                             self.websocket.send(
+                                 json.dumps({"type": "work_complete", "unit_id": unit.unit_id})
+                             ),
+                             self.main_loop,
+                         ).result(timeout=5)
+                         logger.info(
+                             f"Unit {unit.unit_id} fully processed "
+                             f"({self.items_processed}/{total_items_in_unit}) and marked complete."
+                         )
+                     except Exception as e:
+                         logger.warning(
+                             f"Could not notify work complete for unit {unit.unit_id}: {e}"
+                         )
+             else:
+                 # Unit failed or was incomplete
+                 if self.items_failed > 0:
+                     error_msg = (
+                         f"Processing failed for {self.items_failed} out of "
+                         f"{total_items_in_unit} items"
                      )
-                 except Exception as e:
-                     logger.warning(f"Could not notify work complete for unit {unit.unit_id}: {e}")
+                     logger.error(f"Unit {unit.unit_id} failed: {error_msg}")
+                 else:
+                     error_msg = (
+                         f"Processing incomplete: {self.items_processed}/"
+                         f"{total_items_in_unit} items processed"
+                     )
+                     logger.warning(f"Unit {unit.unit_id} incomplete: {error_msg}")
+
+                 if self.websocket:
+                     try:
+                         asyncio.run_coroutine_threadsafe(
+                             self.websocket.send(
+                                 json.dumps(
+                                     {
+                                         "type": "work_failed",
+                                         "unit_id": unit.unit_id,
+                                         "error": error_msg,
+                                     }
+                                 )
+                             ),
+                             self.main_loop,
+                         ).result(timeout=5)
+                         logger.info(f"Unit {unit.unit_id} failure reported to orchestrator")
+                     except Exception as e:
+                         logger.warning(f"Could not notify work failed for unit {unit.unit_id}: {e}")
          else:
-             logger.warning(
-                 f"Processing of unit {unit.unit_id} was incomplete ({self.items_processed}/{total_items_in_unit}). Not marking as complete."
-             )
+             logger.info(f"Unit {unit.unit_id} processing stopped due to disconnect or shutdown")
 
      def _process_batch(self, batch: List[ProcessingItem]):
          """Process a batch of items through all stages."""
@@ -672,6 +703,20 @@ class CaptionWorker(BaseWorker):
          except Exception as e:
              logger.error(f"Batch processing error: {e}", exc_info=True)
 
+             # Mark all items in batch as failed
+             self.items_failed += len(batch)
+
+             # Send error results for each item in the batch
+             for item in batch:
+                 self.result_queue.put(
+                     {
+                         "item": item,
+                         "outputs": {},
+                         "processing_time_ms": 0.0,
+                         "error": f"Batch processing failed: {str(e)}",
+                     }
+                 )
+
      def _process_batch_mock(self, batch: List[ProcessingItem]) -> List[Tuple[ProcessingItem, Dict]]:
          """Process a batch in mock mode - return dummy captions."""
          results = []
@@ -686,9 +731,9 @@ class CaptionWorker(BaseWorker):
 
              # Create mock outputs based on stage prompts
              stage_outputs = []
-             for i, prompt in enumerate(stage.prompts):
+             for i, _prompt in enumerate(stage.prompts):
                  mock_output = (
-                     f"Mock {stage_name} output {i+1} for job {item.job_id} - {item.item_key}"
+                     f"Mock {stage_name} output {i + 1} for job {item.job_id} - {item.item_key}"
                  )
                  stage_outputs.append(mock_output)
 
@@ -713,12 +758,129 @@ class CaptionWorker(BaseWorker):
 
          return results
 
+     def _validate_and_split_batch(
+         self,
+         batch: List[ProcessingItem],
+         stage: ProcessingStage,
+         processor,
+         tokenizer,
+         sampling_params,
+         max_length: int = 16384,
+     ) -> Tuple[List[ProcessingItem], List[ProcessingItem]]:
+         """Validate batch items and split into processable and too-long items."""
+         logger.debug(
+             f"Validating batch of size {len(batch)} for stage '{stage.name}' "
+             f"with max_length {max_length}"
+         )
+         processable = []
+         too_long = []
+
+         for item in batch:
+             try:
+                 # Create a test prompt for this item
+                 converted_img = ImageProcessor.prepare_for_inference(item)
+                 template_manager = PromptTemplateManager(
+                     stage.prompts[:1]
+                 )  # Test with first prompt
+
+                 # Build context
+                 context = item.metadata.copy()
+                 for prev_stage_name, stage_result in item.stage_results.items():
+                     for i, output in enumerate(stage_result.outputs):
+                         context[f"{prev_stage_name}_output_{i}"] = output
+                     if len(stage_result.outputs) == 1:
+                         context[stage_result.output_field] = stage_result.outputs[0]
+                     else:
+                         context[stage_result.output_field] = stage_result.outputs
+                 logger.debug(f"Validation context for {item.item_key}: {context}")
+
+                 # Format test prompt
+                 formatted_prompts = template_manager.format_all(context)
+                 if not formatted_prompts:
+                     logger.warning(
+                         f"Could not format prompt for {item.item_key}, marking as too long."
+                     )
+                     too_long.append(item)
+                     continue
+
+                 logger.debug(
+                     f"Formatted validation prompt for {item.item_key}: {formatted_prompts[0]}"
+                 )
+
+                 # Build actual vLLM input to test
+                 test_req = self._build_vllm_input(
+                     converted_img, formatted_prompts[0], processor, tokenizer
+                 )
+
+                 # Use processor to get actual token count
+                 if "prompt_token_ids" in test_req:
+                     prompt_length = len(test_req["prompt_token_ids"])
+                 else:
+                     # Fallback to tokenizer
+                     prompt_length = len(tokenizer.encode(test_req.get("prompt", "")))
+
+                 # Check individual prompt length (prompts are processed one by one)
+                 # Use a small safety buffer to account for token estimation variations
+                 safety_buffer = 50
+                 if prompt_length < max_length - safety_buffer:
+                     processable.append(item)
+                     logger.debug(
+                         f"Item {item.item_key} validated: {prompt_length} tokens per prompt"
+                     )
+                 else:
+                     too_long.append(item)
+                     logger.warning(
+                         f"Item {item.item_key} too long: {prompt_length} tokens "
+                         f"vs max {max_length - safety_buffer} (with safety buffer)"
+                     )
+
+             except Exception as e:
+                 logger.error(f"Error validating item {item.item_key}: {e}", exc_info=True)
+                 too_long.append(item)
+
+         logger.debug(
+             f"Validation complete: {len(processable)} processable, {len(too_long)} too long."
+         )
+         return processable, too_long
+
+     def _resize_image_for_tokens(
+         self, item: ProcessingItem, target_ratio: float = 0.7
+     ) -> ProcessingItem:
+         """Resize image to reduce token count."""
+         if not item.image:
+             return item
+
+         # Calculate new size
+         new_width = int(item.image.width * target_ratio)
+         new_height = int(item.image.height * target_ratio)
+
+         # Resize image
+         resized_image = item.image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+
+         # Create new item with resized image
+         new_item = ProcessingItem(
+             unit_id=item.unit_id,
+             job_id=item.job_id,
+             chunk_id=item.chunk_id,
+             item_key=item.item_key,
+             item_index=item.item_index,
+             image=resized_image,
+             image_data=item.image_data,  # Keep original data for metadata
+             metadata={**item.metadata, "_resized": True, "_resize_ratio": target_ratio},
+             stage_results=item.stage_results.copy(),
+         )
+
+         return new_item
+
      def _process_batch_multi_stage(
          self, batch: List[ProcessingItem], max_attempts: int = 3
      ) -> List[Tuple[ProcessingItem, Dict]]:
-         """Process a batch through all stages sequentially."""
+         """Process a batch through all stages with token validation."""
          results = []
 
+         # Get max model length from config
+         max_model_len = self.vllm_config.get("max_model_len", 16384)
+
          # Process each stage in order
          for stage_name in self.stage_order:
              stage = next(s for s in self.stages if s.name == stage_name)
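The two new helpers implement a validate-then-shrink loop: measure the real prompt length via the built vLLM input, and if it exceeds the budget, downscale the image and re-measure. A standalone sketch of the core idea, assuming a Pillow image and the 16384/50 numbers taken from the diff:

    from PIL import Image

    MAX_MODEL_LEN = 16384
    SAFETY_BUFFER = 50  # headroom for token-count estimation drift

    def fits_budget(prompt_tokens: int, max_length: int = MAX_MODEL_LEN) -> bool:
        # Accept only prompts comfortably inside the model's context window.
        return prompt_tokens < max_length - SAFETY_BUFFER

    def shrink(image: Image.Image, ratio: float) -> Image.Image:
        # LANCZOS preserves detail reasonably well when downscaling.
        new_size = (int(image.width * ratio), int(image.height * ratio))
        return image.resize(new_size, Image.Resampling.LANCZOS)

As the next hunk shows, over-budget items are retried at a 0.7 ratio, then 0.5, and only then failed with an error result.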
@@ -729,26 +891,68 @@ class CaptionWorker(BaseWorker):
                  stage_name, stage.model
              )
 
-             # Track items for retry
-             items_to_process = [(i, item, 0) for i, item in enumerate(batch)]
+             # Validate batch before processing
+             processable_batch, too_long_items = self._validate_and_split_batch(
+                 batch, stage, processor, tokenizer, sampling_params, max_model_len
+             )
+
+             # Handle items that are too long
+             for item in too_long_items:
+                 logger.warning(f"Item {item.item_key} exceeds token limit, attempting resize")
 
-             while items_to_process:
-                 current_batch = []
-                 requests = []
+                 # Try resizing the image
+                 resized_item = self._resize_image_for_tokens(item, target_ratio=0.7)
 
-                 for idx, (original_idx, item, attempt_count) in enumerate(items_to_process):
-                     current_batch.append((original_idx, item, attempt_count))
+                 # Re-validate
+                 resized_processable, still_too_long = self._validate_and_split_batch(
+                     [resized_item], stage, processor, tokenizer, sampling_params, max_model_len
+                 )
 
-                     # Prepare image from PIL frame or bytes
-                     converted_img = ImageProcessor.prepare_for_inference(item)
+                 if resized_processable:
+                     processable_batch.extend(resized_processable)
+                     logger.info(f"Successfully resized {item.item_key} for processing")
+                 else:
+                     # Try even smaller
+                     resized_item = self._resize_image_for_tokens(item, target_ratio=0.5)
+                     resized_processable, still_too_long = self._validate_and_split_batch(
+                         [resized_item], stage, processor, tokenizer, sampling_params, max_model_len
+                     )
 
-                     # Create template manager
+                     if resized_processable:
+                         processable_batch.extend(resized_processable)
+                         logger.info(f"Successfully resized {item.item_key} to 50% for processing")
+                     else:
+                         logger.error(f"Item {item.item_key} still too long after resize, skipping")
+                         self.items_failed += 1
+
+                         # Send error result
+                         stage_result = StageResult(
+                             stage_name=stage_name,
+                             output_field=stage.output_field,
+                             outputs=[],
+                             error="Image too large even after resizing",
+                         )
+                         item.stage_results[stage_name] = stage_result
+
+                         self.result_queue.put(
+                             {
+                                 "item": item,
+                                 "outputs": {},
+                                 "processing_time_ms": 0.0,
+                                 "error": f"Failed stage {stage_name}: token limit exceeded",
+                             }
+                         )
+
+             # Process the validated batch
+             if processable_batch:
+                 # Build requests for processable items
+                 requests = []
+                 for item in processable_batch:
+                     converted_img = ImageProcessor.prepare_for_inference(item)
                      template_manager = PromptTemplateManager(stage.prompts)
 
                      # Build context
                      context = item.metadata.copy()
-
-                     # Add previous stage results
                      for prev_stage_name, stage_result in item.stage_results.items():
                          for i, output in enumerate(stage_result.outputs):
                              context[f"{prev_stage_name}_output_{i}"] = output
@@ -769,14 +973,7 @@ class CaptionWorker(BaseWorker):
                  outputs = llm.generate(requests, sampling_params)
 
                  # Process outputs
-                 successful_items = []
-                 failed_items = []
-
-                 for idx, (original_idx, item, attempt_count) in enumerate(current_batch):
-                     if self.should_stop_processing.is_set():
-                         return results
-
-                     # Extract outputs
+                 for idx, item in enumerate(processable_batch):
                      base_idx = idx * len(stage.prompts)
                      stage_outputs = []
 
@@ -788,40 +985,18 @@ class CaptionWorker(BaseWorker):
                              stage_outputs.append(cleaned_output)
 
                      if stage_outputs:
-                         # Success
                          stage_result = StageResult(
                              stage_name=stage_name,
                              output_field=stage.output_field,
                              outputs=stage_outputs,
                          )
                          item.stage_results[stage_name] = stage_result
-                         successful_items.append((original_idx, item))
                      else:
-                         # Failed - check retry
-                         if attempt_count + 1 < max_attempts:
-                             failed_items.append((original_idx, item, attempt_count + 1))
-                         else:
-                             logger.error(f"Stage {stage_name} failed for item {item.item_key}")
-                             self.items_failed += 1
-                             stage_result = StageResult(
-                                 stage_name=stage_name,
-                                 output_field=stage.output_field,
-                                 outputs=[],
-                                 error=f"Failed after {max_attempts} attempts",
-                             )
-                             item.stage_results[stage_name] = stage_result
-                             self.result_queue.put(
-                                 {
-                                     "item": item,
-                                     "outputs": {},
-                                     "processing_time_ms": 0.0,
-                                     "error": f"Failed stage {stage_name} after {max_attempts} attempts",
-                                 }
-                             )
-
-                 # Update for next iteration
-                 items_to_process = failed_items
-                 batch = [item for _, item in successful_items]
+                         logger.error(f"No outputs for {item.item_key} in stage {stage_name}")
+                         self.items_failed += 1
+
+             # Update batch for next stage
+             batch = processable_batch
 
          # Convert to results
          for item in batch:
caption_flow/workers/data.py
@@ -1,20 +1,18 @@
  """DataWorker for retrieving data from various sources and forwarding to orchestrator or storage."""
 
  import asyncio
+ import io
  import json
  import logging
- import io
- import time
  from dataclasses import dataclass
  from pathlib import Path
- from typing import Dict, Any, Optional, List, AsyncIterator
- from queue import Queue, Empty
- from threading import Thread, Event
+ from queue import Empty, Queue
+ from threading import Event
+ from typing import Any, AsyncIterator, Dict, Optional
 
+ import boto3
  import pandas as pd
  import pyarrow.parquet as pq
- from PIL import Image
- import boto3
  from botocore.config import Config
 
  from .base import BaseWorker
@@ -179,7 +177,7 @@ class DataWorker(BaseWorker):
              try:
                  self.send_queue.put_nowait(batch)
                  batch = []
-             except:
+             except Exception:
                  # Queue full, wait
                  await asyncio.sleep(1)
 
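The bare except fixed above would also have swallowed KeyboardInterrupt and SystemExit; except Exception does not. If send_queue is an asyncio.Queue (an assumption; the diff only shows put_nowait), the handler could be narrowed further:

    import asyncio

    async def enqueue_batch(send_queue: asyncio.Queue, batch: list) -> None:
        try:
            send_queue.put_nowait(batch)
        except asyncio.QueueFull:
            # Queue full: back off without blocking the event loop.
            await asyncio.sleep(1)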
{caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: caption-flow
- Version: 0.3.4
+ Version: 0.4.0
  Summary: Self-contained distributed community captioning system
  Author-email: bghira <bghira@users.github.com>
  License: MIT
@@ -9,10 +9,9 @@ Classifier: Development Status :: 4 - Beta
  Classifier: Intended Audience :: Developers
  Classifier: License :: OSI Approved :: MIT License
  Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
- Requires-Python: <3.13,>=3.10
+ Requires-Python: <3.13,>=3.11
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: websockets>=12.0
@@ -35,7 +34,9 @@ Requires-Dist: boto3<2.0.0,>=1.40.11
  Requires-Dist: torchdata<0.12.0,>=0.11.0
  Requires-Dist: textual<6.0.0,>=5.3.0
  Requires-Dist: urwid<4.0.0,>=3.0.2
- Requires-Dist: webshart<0.5.0,>=0.4.0
+ Requires-Dist: webshart<0.5.0,>=0.4.3
+ Requires-Dist: pylance<0.36.0,>=0.35.0
+ Requires-Dist: duckdb<2.0.0,>=1.3.2
  Provides-Extra: dev
  Requires-Dist: pytest>=7.4.0; extra == "dev"
  Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
caption_flow-0.4.0.dist-info/RECORD
@@ -0,0 +1,33 @@
+ caption_flow/__init__.py,sha256=IZoOP8s4lN05e6ww9M5HWVfwYOughmS_tDgG-BLajFo,303
+ caption_flow/cli.py,sha256=J_rjzhYvVyfoOvKQE4PsMSa_YO58iaKk6yi7kRDUYPU,57688
+ caption_flow/models.py,sha256=6-IJj_B3HAarucoLo8_PncJRnxofHuLFCsyRnmUXgRk,7063
+ caption_flow/monitor.py,sha256=j5RExadSLOUujVZQMe7btMeKNlq-WbZ9bYqfikgYJ8Q,7972
+ caption_flow/orchestrator.py,sha256=MWQKaAclI9rMjn7mWdvoSzl9y4b7bU_24aVr8I1YGhE,39645
+ caption_flow/viewer.py,sha256=40w2Zj7GaXbK-dgqvYYdFrMzSDE_ZPWNZc6kS0OrymQ,20281
+ caption_flow/processors/__init__.py,sha256=l1udEZLxAmqwFYS4-3GsRVcPT6WxnDOIk0s0UqsZsJM,423
+ caption_flow/processors/base.py,sha256=Zx6kRZSqG969x8kYJ5VY2Mo5mLeWEgBCEpo8D4GjsBM,6935
+ caption_flow/processors/huggingface.py,sha256=LELbCkvALoKSVf5zGOEL3f3nQG_UcRcPu0ZNZU95B3k,60222
+ caption_flow/processors/local_filesystem.py,sha256=auAWxnqplEH4YJ1DWZCaFmAd03iyhNLudgt71N8O7NE,27827
+ caption_flow/processors/webdataset.py,sha256=66y_7KaJBBntJqBHYKLzCXkBi9ly-TfYYaTCp_7pqTo,34206
+ caption_flow/storage/__init__.py,sha256=IVnzcSCPpPuyp-QLlgJirRZ9Sb3tR0F4sfuF5u2cNMk,36
+ caption_flow/storage/exporter.py,sha256=6atbxWgxSu_5qg9l8amwgkXRL1SKTZQb2yryu62yPc8,22371
+ caption_flow/storage/manager.py,sha256=2jkyNl-2_B2Z7NfjCBua-Jgo7Km_JmJqMKrYsYj5uF4,41416
+ caption_flow/utils/__init__.py,sha256=ULJImkcFPc8QH2Zz6TW7AeVXMFdRpvfni2MgEo_PRyY,120
+ caption_flow/utils/auth.py,sha256=6HRNnWfX1j1Jh55M23crfSA1olkFGg-9kZ5Booy5wCM,2253
+ caption_flow/utils/caption_utils.py,sha256=7k6GnElIAqyyzDHQd3JC3Ffr7r57sFWqS3ET7itzdoM,5309
+ caption_flow/utils/certificates.py,sha256=NiHSeeZYKrf5BpAkwg5qOe-1C7-z42jZO3pjQo0N3I8,4889
+ caption_flow/utils/checkpoint_tracker.py,sha256=LoCGjb30QOcMESHLF5hKVCd8X8_gWACyyq9EKLTXIn4,4613
+ caption_flow/utils/chunk_tracker.py,sha256=And1krrTvpfiwG7xRxh9n6xy-_W8MSWSkcGmFSDFnB8,25460
+ caption_flow/utils/image_processor.py,sha256=_dmiKXcAKxjkQ6d9V5QgoZSf_dDOL52tFMOEXa3iA24,1581
+ caption_flow/utils/json_utils.py,sha256=AaGcNTToUcVYCQj2TXs2D_hxc_LeEqFquiK4CquS0U8,5537
+ caption_flow/utils/prompt_template.py,sha256=mq7FPnpjp8gVCMMh4NtRf0vL_B9LDMuBkbySvACRSZM,4401
+ caption_flow/utils/vllm_config.py,sha256=xFOnmniQGkUGwfTabfW6R0V01TF-_rN1UYJy0HwOvUI,6026
+ caption_flow/workers/base.py,sha256=Yh_PBsL3j1kXUuIOQHqIdR69Nepfq11je23i01iWSxw,7714
+ caption_flow/workers/caption.py,sha256=KnvRcZ6-Nc2JwastgqpQ8WfCw_AOzWBS-etYXEXJ6Os,47201
+ caption_flow/workers/data.py,sha256=iWnTM7UgpJeFzhSTly-gHzFu5sIYUGG-XO4yRNn_MQk,14775
+ caption_flow-0.4.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+ caption_flow-0.4.0.dist-info/METADATA,sha256=e1sdcAeXR-nYlRZlrDtvwXBuRPb1J-_jzTzIvWevsHs,9732
+ caption_flow-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ caption_flow-0.4.0.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
+ caption_flow-0.4.0.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
+ caption_flow-0.4.0.dist-info/RECORD,,
caption_flow-0.3.4.dist-info/RECORD
@@ -1,33 +0,0 @@
- caption_flow/__init__.py,sha256=2M1VLvkVjUmTHXuJFMLnZKqVYni5A0HJfxcnjz53K7c,303
- caption_flow/cli.py,sha256=K3lML3WIYjD7OluGltHGP4N98S5w-KyhDUlQZudDQXE,41464
- caption_flow/models.py,sha256=2n6iphTEL62xK2FFcJM6axMsaE8KwsUv5Ak_cCF-TdQ,5652
- caption_flow/monitor.py,sha256=z2HakZSG799HvTJgjgG7u_MHvhq9-JL1LXzxBwP3WQc,7998
- caption_flow/orchestrator.py,sha256=3XKZXFE1Aw1kCqb_Vw9loYpkmJ5LTLyZZf9pj4k6ldA,37175
- caption_flow/viewer.py,sha256=HxO98eHR1xtivG0dEdYC2U9T_RgeRfJqqTK-37u9bNM,20471
- caption_flow/processors/__init__.py,sha256=hvq-OuAJWQe6hFglKe7QmkS8473k20FmxZDSxfXpCrg,423
- caption_flow/processors/base.py,sha256=IAEr0pqHRuSkXunvDWk1vf2IKeYQ-2YERqej9iSQm94,6931
- caption_flow/processors/huggingface.py,sha256=t_dklhmNRAyk2jISu4FqmNecjg9hfY47omOiRVkbhvA,41215
- caption_flow/processors/local_filesystem.py,sha256=OuNNDemy0sdtpBBC_5GbI-c1vMqp8OIz983Cq85gdb8,27964
- caption_flow/processors/webdataset.py,sha256=tUBCUKunqooHibTWtQ1wljuRI55Wc6M1WrI2hOZgt7g,33858
- caption_flow/storage/__init__.py,sha256=IVnzcSCPpPuyp-QLlgJirRZ9Sb3tR0F4sfuF5u2cNMk,36
- caption_flow/storage/exporter.py,sha256=mFJqMDQ61cP-qcXe118_-oL1TUqULdQZ8LdjSTym44I,19697
- caption_flow/storage/manager.py,sha256=KPExcKPuFVQSsBnfCBdne5PO4PwN4NTfd-EJQk13OY0,47459
- caption_flow/utils/__init__.py,sha256=bDcO5uR455TKCQ2hX-_XcdTnRXDBaT8Yn4jWqWzfFsE,120
- caption_flow/utils/auth.py,sha256=UrxX2n8OEEcfMD1Ey27TxGfrJFmUCpC59x-SCrQJoVE,2253
- caption_flow/utils/caption_utils.py,sha256=esUMAdcCkNjRroZ0Bhxv0_yKlLtMf0XeDCTt-5k6bik,5309
- caption_flow/utils/certificates.py,sha256=eu4blQZEkL9NRaY1ynQWg1asvDorRYhGRZea7STonJE,4635
- caption_flow/utils/checkpoint_tracker.py,sha256=nOZIIGsXTRUj09tFSnWtRgj_zoa8Og_-rutkr2GFz8Y,4417
- caption_flow/utils/chunk_tracker.py,sha256=JZIFvaHS5AYaVOzsSJKrnNlS4E3BdzV64cRkQa_65g0,21508
- caption_flow/utils/image_processor.py,sha256=wmOExkVfM7OeuLfX3AwMefsH-TxL8TNcn22gp0NmJKY,1541
- caption_flow/utils/json_utils.py,sha256=IiZYn8uCM-3pYmyIbX2fmaOIyutArn67SqAyp0ggNpU,5396
- caption_flow/utils/prompt_template.py,sha256=AKp0diSZqNBMwZkpiTNjw8-bbQwHStr7QZTOJ7o1dC4,4345
- caption_flow/utils/vllm_config.py,sha256=TC7Rmjk0zRKbBXbWUXrFL4Z58hzax_-4L0pXZn09hdM,6019
- caption_flow/workers/base.py,sha256=nEWohozFZ0Bw3_8U8xirnKLeZsGR5k69rSu4j-oDitc,7698
- caption_flow/workers/caption.py,sha256=swE4pYg4ZYAAtMxvyvlETa3wv4yKWUPXXulCAwPhPiQ,39477
- caption_flow/workers/data.py,sha256=0Tg8NE0wdONeMlivYQ4nvbcfWdLuU51O7vR8_YSnJgo,14813
- caption_flow-0.3.4.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
- caption_flow-0.3.4.dist-info/METADATA,sha256=dfB40EF_Zgz2Ux8qvdBbfLdhzY85_MUFRX-904I-qb4,9708
- caption_flow-0.3.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- caption_flow-0.3.4.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
- caption_flow-0.3.4.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
- caption_flow-0.3.4.dist-info/RECORD,,