caption-flow 0.2.3-py3-none-any.whl → 0.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -34,7 +34,7 @@ from ..utils.prompt_template import PromptTemplateManager
 from ..models import ProcessingStage, StageResult

 logger = logging.getLogger(__name__)
-# logger.setLevel(logging.DEBUG)
+logger.setLevel(logging.INFO)


 @dataclass
@@ -163,14 +163,14 @@ class CaptionWorker(BaseWorker):
     def __init__(self, config: Dict[str, Any]):
         super().__init__(config)

-        # Processor configuration - will be set from orchestrator
+        # Processor configuration
         self.processor_type = None
         self.processor: Optional[
             Union[
                 WebDatasetWorkerProcessor,
                 HuggingFaceDatasetWorkerProcessor,
                 LocalFilesystemWorkerProcessor,
-            ],
+            ]
         ] = None
         self.dataset_path: Optional[str] = None

@@ -181,12 +181,16 @@ class CaptionWorker(BaseWorker):
         self.vllm_config_manager = VLLMConfigManager()
         self.model_manager = None

+        # Mock mode flag
+        self.mock_mode = False
+
         # GPU selection
         self.gpu_id = config.get("gpu_id", 0)
         self.hf_token = get_token()

         # Image processor
-        batch_image_processing = config.get("batch_image_processing", False)
+        batch_image_processing = config.get("batch_image_processing", True)
+        logger.info(f"Using batch processing: {batch_image_processing}")
         self.image_processor = ImageProcessor() if batch_image_processing else None

         # Work processing
@@ -194,9 +198,7 @@ class CaptionWorker(BaseWorker):
         self.assigned_units = deque()
         self.current_unit: Optional[WorkUnit] = None

-        # Processing queues
-        self.readahead_queue = Queue(maxsize=256)
-        self.inference_queue = Queue(maxsize=128)
+        # Single result queue for sending back to orchestrator
         self.result_queue = Queue()

         # Processing control
@@ -230,13 +232,18 @@ class CaptionWorker(BaseWorker):
                 logger.error(f"Failed to get config: {e}")
                 await asyncio.sleep(5)

-        # Initialize vLLM once we have config
-        if self.vllm_config:
-            self._setup_vllm()
+        # Check for mock mode
+        self.mock_mode = self.vllm_config.get("mock_results", False) if self.vllm_config else False

-        # Start background threads
-        Thread(target=self._unit_processor_thread, daemon=True).start()
-        Thread(target=self._inference_thread, daemon=True).start()
+        if self.mock_mode:
+            logger.info("🎭 MOCK MODE ENABLED - No vLLM models will be loaded")
+        else:
+            # Initialize vLLM once we have config
+            if self.vllm_config:
+                self._setup_vllm()
+
+        # Start processing thread
+        Thread(target=self._processing_thread, daemon=True).start()

     async def _initial_connect_for_config(self):
         """Connect initially just to get configuration."""
@@ -269,8 +276,6 @@ class CaptionWorker(BaseWorker):
         self.assigned_units.clear()
         self.current_unit = None

-        self._clear_queue(self.readahead_queue)
-        self._clear_queue(self.inference_queue)
         self._clear_queue(self.result_queue)

         # Reset counters
@@ -297,6 +302,7 @@ class CaptionWorker(BaseWorker):

         self.processor.initialize(processor_config)
         self.dataset_path = self.processor.dataset_path
+        self.units_per_request = processor_config.config.get("chunks_per_request", 1)

         # Update vLLM config if provided
         new_vllm_config = welcome_data.get("processor_config", {}).get("vllm")
@@ -309,7 +315,7 @@ class CaptionWorker(BaseWorker):

         # Request initial work
         if self.websocket:
-            await self.websocket.send(json.dumps({"type": "request_work", "count": 2}))
+            await self.websocket.send(json.dumps({"type": "get_work_units", "count": 2}))

     async def _handle_message(self, data: Dict[str, Any]):
         """Handle message from orchestrator."""
@@ -327,7 +333,7 @@ class CaptionWorker(BaseWorker):
                 await asyncio.sleep(10)

                 if self.websocket and self.connected.is_set():
-                    await self.websocket.send(json.dumps({"type": "request_work", "count": 2}))
+                    await self.websocket.send(json.dumps({"type": "get_work_units", "count": 2}))

     def _parse_stages_config(self, vllm_config: Dict[str, Any]) -> List[ProcessingStage]:
         """Parse stages configuration from vLLM config."""
@@ -405,6 +411,7 @@ class CaptionWorker(BaseWorker):
         self.model_manager = MultiStageVLLMManager(self.gpu_id)

         # Get base config for models
+        logger.info(f"vLLM config: {self.vllm_config}")
         base_config = {
             "tensor_parallel_size": self.vllm_config.get("tensor_parallel_size", 1),
             "max_model_len": self.vllm_config.get("max_model_len", 16384),
@@ -437,6 +444,13 @@ class CaptionWorker(BaseWorker):
         if not new_config:
             return True

+        # Check if mock mode changed
+        old_mock_mode = self.mock_mode
+        self.mock_mode = new_config.get("mock_results", False)
+
+        if old_mock_mode != self.mock_mode:
+            logger.info(f"Mock mode changed from {old_mock_mode} to {self.mock_mode}")
+
         # Parse new stages
         new_stages = self._parse_stages_config(new_config)

@@ -453,35 +467,43 @@ class CaptionWorker(BaseWorker):
                 stages_changed = True
                 break

-        if stages_changed:
-            logger.info("Stage configuration changed, reloading all models")
+        if stages_changed or old_mock_mode != self.mock_mode:
+            logger.info("Configuration changed significantly")

             old_config = self.vllm_config
             self.vllm_config = new_config
             self.stages = new_stages
             self.stage_order = self._topological_sort_stages(self.stages)

-            try:
+            if not self.mock_mode:
+                try:
+                    if self.model_manager:
+                        self.model_manager.cleanup()
+                    self._setup_vllm()
+                    return True
+                except Exception as e:
+                    logger.error(f"Failed to reload vLLM: {e}")
+                    self.vllm_config = old_config
+                    return False
+            else:
+                # Clean up models if switching to mock mode
                 if self.model_manager:
                     self.model_manager.cleanup()
-                self._setup_vllm()
+                self.model_manager = None
                 return True
-            except Exception as e:
-                logger.error(f"Failed to reload vLLM: {e}")
-                self.vllm_config = old_config
-                return False
         else:
             # Just update sampling params
-            logger.info("Updating sampling parameters without model reload")
-            base_sampling = new_config.get("sampling", {})
-            for stage in self.stages:
-                self.model_manager.create_sampling_params(stage, base_sampling)
+            if not self.mock_mode:
+                logger.info("Updating sampling parameters without model reload")
+                base_sampling = new_config.get("sampling", {})
+                for stage in self.stages:
+                    self.model_manager.create_sampling_params(stage, base_sampling)
             self.vllm_config = new_config
             return True

-    def _unit_processor_thread(self):
-        """Background thread that processes work units."""
-        logger.info("Starting unit processor thread")
+    def _processing_thread(self):
+        """Main processing thread that handles work units."""
+        logger.info("Starting processing thread")

         while self.running:
             if self.should_stop_processing.is_set():
@@ -513,11 +535,13 @@ class CaptionWorker(BaseWorker):
             with self.work_lock:
                 queue_size = len(self.assigned_units)

-            if queue_size < 2 and self.websocket and self.main_loop:
+            if queue_size < self.units_per_request and self.websocket and self.main_loop:
                 try:
                     asyncio.run_coroutine_threadsafe(
                         self.websocket.send(
-                            json.dumps({"type": "request_work", "count": 2})
+                            json.dumps(
+                                {"type": "get_work_units", "count": self.units_per_request}
+                            )
                         ),
                         self.main_loop,
                     ).result(timeout=5)
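
A sketch of the refill rule above: the worker tops up only when its local queue drops below units_per_request. The values here are hypothetical; units_per_request comes from "chunks_per_request" in the processor config, per the earlier hunk:

    import json
    from collections import deque

    units_per_request = 4               # from "chunks_per_request" upstream
    assigned_units = deque(["unit-a"])  # locally tracked pending work units

    if len(assigned_units) < units_per_request:
        request = json.dumps({"type": "get_work_units", "count": units_per_request})
        print(request)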
@@ -533,16 +557,20 @@ class CaptionWorker(BaseWorker):
             self.current_unit = None

     def _process_work_unit(self, unit: WorkUnit):
-        """Process a single work unit."""
+        """Process a single work unit with batching."""
         if not self.processor:
             logger.error("Processor not initialized")
             return

-        items_processed = 0
-        context = {}  # Will store processed indices
+        batch = []
+        batch_size = self.vllm_config.get("batch_size", 8)
+        context = {}

-        # Get items from processor
+        # Collect items for batching
         for item_data in self.processor.process_unit(unit, context):
+            if self.should_stop_processing.is_set() or not self.connected.is_set():
+                break
+
             try:
                 # Create processing item
                 item = ProcessingItem(
@@ -551,35 +579,19 @@ class CaptionWorker(BaseWorker):
                     job_id=item_data["job_id"],
                     item_key=item_data["item_key"],
                     item_index=item_data["item_index"],
-                    image=item_data["image"],
+                    image=item_data.get("image", None),
                     image_data=item_data.get("image_data", b""),
                     metadata=item_data.get("metadata", {}),
                 )
+                if "_processed_indices" in item_data:
+                    context["_processed_indices"] = item_data.pop("_processed_indices", [])

-                # Add to readahead queue
-                timeout_end = time.time() + 30
-                while (
-                    self.running
-                    and not self.should_stop_processing.is_set()
-                    and self.connected.is_set()
-                ):
-                    try:
-                        self.readahead_queue.put(item, timeout=1)
-                        break
-                    except:
-                        if time.time() > timeout_end:
-                            raise TimeoutError("Queue put timeout")
-                        continue
-
-                if not self.connected.is_set() or self.should_stop_processing.is_set():
-                    break
-
-                items_processed += 1
+                batch.append(item)

-                # Batch items for inference
-                batch_size = self.vllm_config.get("batch_size", 8)
-                if self.readahead_queue.qsize() >= batch_size:
-                    self._batch_for_inference()
+                # Process batch when it reaches size
+                if len(batch) >= batch_size:
+                    self._process_batch(batch)
+                    batch = []

             except Exception as e:
                 if self.should_stop_processing.is_set():
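
The loop above follows a plain accumulate-and-flush pattern in place of the old readahead/inference queues; a self-contained sketch with stand-in names:

    def iter_items():
        # Stand-in for processor.process_unit(unit, context)
        yield from range(20)

    def process_batch(batch):
        # Stand-in for CaptionWorker._process_batch
        print(f"processing {len(batch)} items")

    batch, batch_size = [], 8
    for item in iter_items():
        batch.append(item)
        if len(batch) >= batch_size:
            process_batch(batch)
            batch = []
    if batch:  # flush the remainder, as the following hunk does
        process_batch(batch)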
@@ -587,77 +599,95 @@ class CaptionWorker(BaseWorker):
                 logger.error(f"Error processing item {item_data.get('item_key')}: {e}")
                 self.items_failed += 1

-        # Process any remaining items
-        if not self.should_stop_processing.is_set():
-            self._batch_for_inference()
-            if self.connected.is_set():
-                # Notify orchestrator that unit is complete
+        # Process remaining items in batch
+        if batch and not self.should_stop_processing.is_set():
+            self._process_batch(batch)
+
+        # Notify orchestrator that unit is complete
+        if self.connected.is_set() and self.websocket:
+            try:
                 asyncio.run_coroutine_threadsafe(
                     self.websocket.send(
                         json.dumps({"type": "work_complete", "unit_id": unit.unit_id})
                     ),
                     self.main_loop,
                 ).result(timeout=5)
+            except Exception as e:
+                logger.warning(f"Could not notify work complete: {e}")

-        logger.info(f"Unit {unit.unit_id} processed {items_processed} items")
+    def _process_batch(self, batch: List[ProcessingItem]):
+        """Process a batch of items through all stages."""
+        if not batch:
+            return

-    def _batch_for_inference(self):
-        """Batch items from readahead queue for inference."""
-        batch = []
-        batch_size = self.vllm_config.get("batch_size", 8)
+        logger.debug(f"Processing batch of {len(batch)} images")
+        start_time = time.time()

         try:
-            while len(batch) < batch_size:
-                item = self.readahead_queue.get_nowait()
-                batch.append(item)
-        except Empty:
-            pass
+            # Process batch through all stages
+            if self.mock_mode:
+                results = self._process_batch_mock(batch)
+            else:
+                results = self._process_batch_multi_stage(batch)

-        if batch:
-            self.inference_queue.put(batch)
+            # Calculate processing time
+            if results:
+                processing_time_per_item = (time.time() - start_time) * 1000 / len(batch)
+
+                for item, result_outputs in results:
+                    self.result_queue.put(
+                        {
+                            "item": item,
+                            "outputs": result_outputs,
+                            "processing_time_ms": processing_time_per_item,
+                        }
+                    )

-    def _inference_thread(self):
-        """Background thread for multi-stage vLLM inference."""
-        logger.info("Starting multi-stage inference thread")
+                logger.debug(f"Batch processing complete: {len(results)} successful")

-        while self.running:
-            try:
-                batch = self.inference_queue.get(timeout=1)
-                if not batch:
-                    continue
+        except Exception as e:
+            logger.error(f"Batch processing error: {e}", exc_info=True)

-                if self.should_stop_processing.is_set():
-                    continue
+    def _process_batch_mock(self, batch: List[ProcessingItem]) -> List[Tuple[ProcessingItem, Dict]]:
+        """Process a batch in mock mode - return dummy captions."""
+        results = []

-                logger.debug(
-                    f"Processing batch of {len(batch)} images through {len(self.stages)} stages"
-                )
-                start_time = time.time()
+        # Simulate some processing time
+        time.sleep(0.1)

-                # Process batch through all stages
-                results = self._process_batch_multi_stage(batch)
+        for item in batch:
+            # Generate mock outputs for each stage
+            for stage_name in self.stage_order:
+                stage = next(s for s in self.stages if s.name == stage_name)
+
+                # Create mock outputs based on stage prompts
+                stage_outputs = []
+                for i, prompt in enumerate(stage.prompts):
+                    mock_output = (
+                        f"Mock {stage_name} output {i+1} for job {item.job_id} - {item.item_key}"
+                    )
+                    stage_outputs.append(mock_output)

-                # Calculate processing time
-                if results:
-                    processing_time_per_item = (time.time() - start_time) * 1000 / len(batch)
+                # Store stage result
+                stage_result = StageResult(
+                    stage_name=stage_name,
+                    output_field=stage.output_field,
+                    outputs=stage_outputs,
+                )
+                item.stage_results[stage_name] = stage_result

-                    for item, result_outputs in results:
-                        self.result_queue.put(
-                            {
-                                "item": item,
-                                "outputs": result_outputs,
-                                "processing_time_ms": processing_time_per_item,
-                            }
-                        )
+            # Aggregate outputs by field
+            outputs_by_field = defaultdict(list)
+            for stage_result in item.stage_results.values():
+                outputs_by_field[stage_result.output_field].extend(stage_result.outputs)

-                logger.debug(f"Batch processing complete: {len(results)} successful")
+            results.append((item, dict(outputs_by_field)))
+            self.items_processed += 1

-            except Empty:
-                continue
-            except Exception as e:
-                if self.should_stop_processing.is_set():
-                    continue
-                logger.error(f"Inference error: {e}", exc_info=True)
+            if self.items_processed % 10 == 0:
+                logger.info(f"🎭 Mock mode: Processed {self.items_processed} items")
+
+        return results

     def _process_batch_multi_stage(
         self, batch: List[ProcessingItem], max_attempts: int = 3
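
The per-field aggregation inside _process_batch_mock reduces to the defaultdict pattern below; stage records are simplified to (output_field, outputs) tuples for this sketch rather than real StageResult objects:

    from collections import defaultdict

    stage_results = [
        ("captions", ["Mock caption output 1"]),
        ("captions", ["Mock caption output 2"]),
        ("tags", ["Mock tag output 1"]),
    ]

    outputs_by_field = defaultdict(list)
    for output_field, outputs in stage_results:
        outputs_by_field[output_field].extend(outputs)

    print(dict(outputs_by_field))
    # {'captions': ['Mock caption output 1', 'Mock caption output 2'],
    #  'tags': ['Mock tag output 1']}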
@@ -685,8 +715,8 @@ class CaptionWorker(BaseWorker):
         for idx, (original_idx, item, attempt_count) in enumerate(items_to_process):
             current_batch.append((original_idx, item, attempt_count))

-            # Prepare image
-            converted_img = ImageProcessor.prepare_for_inference(item.image)
+            # Prepare image from PIL frame or bytes
+            converted_img = ImageProcessor.prepare_for_inference(item)

             # Create template manager
             template_manager = PromptTemplateManager(stage.prompts)
@@ -832,12 +862,11 @@ class CaptionWorker(BaseWorker):
             "units_completed": self.units_completed,
             "current_unit": self._get_current_unit_id() if self.current_unit else None,
             "queue_sizes": {
-                "readahead": self.readahead_queue.qsize(),
-                "inference": self.inference_queue.qsize(),
                 "results": self.result_queue.qsize(),
             },
             "stages": len(self.stages),
             "models_loaded": len(self.model_manager.models) if self.model_manager else 0,
+            "mock_mode": self.mock_mode,
         }

  async def _create_tasks(self) -> list:
@@ -852,7 +881,7 @@ class CaptionWorker(BaseWorker):
852
881
  """Send results back to orchestrator."""
853
882
  while self.running and self.connected.is_set():
854
883
  try:
855
- # Get result
884
+ # Get result with timeout
856
885
  result_data = await asyncio.get_event_loop().run_in_executor(
857
886
  None, self.result_queue.get, True, 1
858
887
  )
@@ -863,7 +892,6 @@ class CaptionWorker(BaseWorker):
                 outputs = result_data["outputs"]

                 # Create work result
-                # logger.info(f"Processed item: {item}")
                 work_result = WorkResult(
                     unit_id=item.unit_id,
                     source_id=item.metadata.get("shard_name", "unknown"),
@@ -873,9 +901,21 @@ class CaptionWorker(BaseWorker):
                     metadata={
                         "item_key": item.item_key,
                         "item_index": item.metadata.get("_item_index"),
-                        "image_width": item.image.width,
-                        "image_height": item.image.height,
-                        "image_format": item.image.format or "unknown",
+                        "image_width": (
+                            item.image.width
+                            if item.image is not None
+                            else item.metadata.get("image_width")
+                        ),
+                        "image_height": (
+                            item.image.height
+                            if item.image is not None
+                            else item.metadata.get("image_height")
+                        ),
+                        "image_format": (
+                            item.image.format
+                            if item.image is not None
+                            else item.metadata.get("image_format", "unknown")
+                        ),
                         "file_size": len(item.image_data) if item.image_data else 0,
                         **item.metadata,
                     },
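
Since item.image may now be None (the processor can supply raw bytes without a decoded PIL image, per the earlier ProcessingItem change), the metadata block falls back as sketched here with hypothetical values:

    image = None  # e.g. only raw bytes were kept for this item
    metadata = {"image_width": 512, "image_height": 512}

    image_width = image.width if image is not None else metadata.get("image_width")
    image_height = image.height if image is not None else metadata.get("image_height")
    print(image_width, image_height)  # 512 512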
@@ -883,7 +923,7 @@ class CaptionWorker(BaseWorker):
                     error=result_data.get("error", None),
                 )

-                # Send result in format that orchestrator expects
+                # Send result
                 await self.websocket.send(
                     json.dumps(
                         {
@@ -920,9 +960,7 @@ class CaptionWorker(BaseWorker):
         self.assigned_units.clear()
         self.current_unit = None

-        # Clear queues
-        self._clear_queue(self.readahead_queue)
-        self._clear_queue(self.inference_queue)
+        # Clear result queue
         self._clear_queue(self.result_queue)

     def _clear_queue(self, queue: Queue):
@@ -935,9 +973,6 @@ class CaptionWorker(BaseWorker):

     async def _pre_shutdown(self):
         """Cleanup before shutdown."""
-        self.readahead_queue.put(None)
-        self.inference_queue.put(None)
-
         if self.image_processor:
             self.image_processor.shutdown()