PyPI - caption-flow - Versions diffs - 0.4.0__tar.gz → 0.4.1__tar.gz - Mend

caption-flow 0.4.0 (tar.gz) → 0.4.1 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58) hide show

{caption_flow-0.4.0/src/caption_flow.egg-info → caption_flow-0.4.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: caption-flow
-Version: 0.4.0
+Version: 0.4.1
 Summary: Self-contained distributed community captioning system
 Author-email: bghira <bghira@users.github.com>
 License: MIT
@@ -48,6 +48,10 @@ Dynamic: license-file
 # CaptionFlow
+<!-- [![Tests](https://github.com/bghira/CaptionFlow/workflows/tests/badge.svg)](https://github.com/bghira/CaptionFlow/actions/workflows/tests.yml) -->
+[![codecov](https://codecov.io/github/bghira/CaptionFlow/graph/badge.svg?token=PRAQPNGYAS)](https://codecov.io/github/bghira/CaptionFlow)
+[![PyPI version](https://badge.fury.io/py/caption-flow.svg)](https://badge.fury.io/py/caption-flow)
 scalable, fault-tolerant **vLLM-powered image captioning**.
 a fast websocket-based orchestrator paired with lightweight gpu workers achieves exceptional performance for batched requests through vLLM.

{caption_flow-0.4.0 → caption_flow-0.4.1}/README.md RENAMED Viewed

@@ -1,5 +1,9 @@
 # CaptionFlow
+<!-- [![Tests](https://github.com/bghira/CaptionFlow/workflows/tests/badge.svg)](https://github.com/bghira/CaptionFlow/actions/workflows/tests.yml) -->
+[![codecov](https://codecov.io/github/bghira/CaptionFlow/graph/badge.svg?token=PRAQPNGYAS)](https://codecov.io/github/bghira/CaptionFlow)
+[![PyPI version](https://badge.fury.io/py/caption-flow.svg)](https://badge.fury.io/py/caption-flow)
 scalable, fault-tolerant **vLLM-powered image captioning**.
 a fast websocket-based orchestrator paired with lightweight gpu workers achieves exceptional performance for batched requests through vLLM.
@@ -190,4 +194,4 @@ Your contributions will be tracked and attributed in the final dataset!
 ## License
-AGPLv3
+AGPLv3

{caption_flow-0.4.0 → caption_flow-0.4.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "caption-flow"
-version = "0.4.0"
+version = "0.4.1"
 description = "Self-contained distributed community captioning system"
 readme = "README.md"
 requires-python = ">=3.11,<3.13"

{caption_flow-0.4.0 → caption_flow-0.4.1}/src/caption_flow/__init__.py RENAMED Viewed

@@ -1,6 +1,6 @@
 """CaptionFlow - Distributed community captioning system."""
-__version__ = "0.4.0"
+__version__ = "0.4.1"
 from .monitor import Monitor
 from .orchestrator import Orchestrator

{caption_flow-0.4.0 → caption_flow-0.4.1}/src/caption_flow/cli.py RENAMED Viewed

@@ -1276,33 +1276,6 @@ async def _export_single_format(
                 console.print(f"  • {shard_name}: {count:,} items")
-@main.command()
-@click.option("--data-dir", default="./caption_data", help="Storage directory")
-@click.option(
-    "--format",
-    type=click.Choice(
-        ["jsonl", "json", "csv", "txt", "parquet", "lance", "huggingface_hub", "all"],
-        case_sensitive=False,
-    ),
-    default="jsonl",
-    help="Export format (default: jsonl)",
-)
-@click.option("--output", "-o", help="Output path (file for jsonl/csv, directory for json/txt)")
-@click.option("--limit", type=int, help="Limit number of rows to export")
-@click.option("--columns", help="Comma-separated list of columns to export (default: all)")
-@click.option("--export-column", default="captions", help="Column to export for txt format")
-@click.option("--filename-column", default="filename", help="Column containing filenames")
-@click.option("--shard", help="Specific shard to export (e.g., data-0001)")
-@click.option("--shards", help="Comma-separated list of shards to export")
-@click.option("--include-empty", is_flag=True, help="Include rows with empty export column")
-@click.option("--stats-only", is_flag=True, help="Show statistics without exporting")
-@click.option("--optimize", is_flag=True, help="Optimize storage before export")
-@click.option("--verbose", is_flag=True, help="Show detailed export progress")
-@click.option("--hf-dataset", help="Dataset name on HF Hub (e.g., username/dataset-name)")
-@click.option("--license", default="apache-2.0", help="License for the dataset")
-@click.option("--private", is_flag=True, help="Make HF dataset private")
-@click.option("--nsfw", is_flag=True, help="Add not-for-all-audiences tag")
-@click.option("--tags", help="Comma-separated tags for HF dataset")
 def _validate_export_setup(data_dir):
     """Validate export setup and create storage manager."""
     from .storage import StorageManager
@@ -1333,6 +1306,7 @@ async def _run_export_process(
     tags,
     stats_only,
     optimize,
+    include_empty,
 ):
     """Execute the main export process."""
     from .storage.exporter import LanceStorageExporter
@@ -1448,6 +1422,7 @@ def export(
                 tags,
                 stats_only,
                 optimize,
+                include_empty,
             )
         )
     except ExportError as e:

{caption_flow-0.4.0 → caption_flow-0.4.1}/src/caption_flow/processors/huggingface.py RENAMED Viewed

@@ -1195,7 +1195,18 @@ class HuggingFaceDatasetWorkerProcessor(WorkerProcessor):
                                 # Still extract URL if available for metadata
                                 if self.url_column and self.url_column in item:
-                                    image_url = item[self.url_column]
+                                    url_value = item[self.url_column]
+                                    if (
+                                        url_value
+                                        and str(url_value).strip()
+                                        and str(url_value).strip().lower() != "none"
+                                    ):
+                                        image_url = str(url_value).strip()
+                                    else:
+                                        logger.debug(
+                                            f"Invalid or None URL for item {global_idx}: {url_value}"
+                                        )
+                                        image_url = None
                                 # Create dummy image with metadata context
                                 image = self._create_dummy_image(
@@ -1209,7 +1220,19 @@ class HuggingFaceDatasetWorkerProcessor(WorkerProcessor):
                                 # Normal processing - load real images
                                 if self.url_column:
                                     if self.url_column in item:
-                                        image_url = item[self.url_column]
+                                        url_value = item[self.url_column]
+                                        if (
+                                            url_value
+                                            and str(url_value).strip()
+                                            and str(url_value).strip().lower() != "none"
+                                        ):
+                                            image_url = str(url_value).strip()
+                                        else:
+                                            logger.debug(
+                                                f"Skipping invalid or None URL for item {global_idx}: {url_value}"
+                                            )
+                                            continue  # Skip this item entirely
                                         try:
                                             max_retries = 3
                                             backoff_factor = 2

{caption_flow-0.4.0 → caption_flow-0.4.1}/src/caption_flow/workers/caption.py RENAMED Viewed

@@ -137,6 +137,19 @@ class MultiStageVLLMManager:
     def get_model_for_stage(self, stage_name: str, model_name: str) -> Tuple[Any, Any, Any, Any]:
         """Get model components for a stage."""
+        if model_name not in self.models:
+            raise KeyError(
+                f"Model '{model_name}' not found in loaded models. Available models: {list(self.models.keys())}"
+            )
+        if model_name not in self.processors:
+            raise KeyError(f"Processor for model '{model_name}' not found")
+        if model_name not in self.tokenizers:
+            raise KeyError(f"Tokenizer for model '{model_name}' not found")
+        if stage_name not in self.sampling_params:
+            raise KeyError(
+                f"Sampling params for stage '{stage_name}' not found. Available stages: {list(self.sampling_params.keys())}"
+            )
         return (
             self.models[model_name],
             self.processors[model_name],
@@ -489,7 +502,19 @@ class CaptionWorker(BaseWorker):
                     return True
                 except Exception as e:
                     logger.error(f"Failed to reload vLLM: {e}")
+                    # Restore previous state
                     self.vllm_config = old_config
+                    self.stages = self._parse_stages_config(old_config)
+                    self.stage_order = self._topological_sort_stages(self.stages)
+                    # Attempt to restore previous models
+                    try:
+                        self._setup_vllm()
+                    except Exception as restore_error:
+                        logger.error(f"Failed to restore previous vLLM state: {restore_error}")
+                        # Clean up broken state
+                        if self.model_manager:
+                            self.model_manager.cleanup()
+                            self.model_manager = None
                     return False
             else:
                 # Clean up models if switching to mock mode
@@ -886,10 +911,21 @@ class CaptionWorker(BaseWorker):
             stage = next(s for s in self.stages if s.name == stage_name)
             logger.debug(f"Processing batch through stage: {stage_name}")
+            # Check if model manager is properly initialized
+            if not self.model_manager:
+                logger.error("Model manager not initialized")
+                self.items_failed += len(batch)
+                return []
             # Get model components
-            llm, processor, tokenizer, sampling_params = self.model_manager.get_model_for_stage(
-                stage_name, stage.model
-            )
+            try:
+                llm, processor, tokenizer, sampling_params = self.model_manager.get_model_for_stage(
+                    stage_name, stage.model
+                )
+            except KeyError as e:
+                logger.error(f"Model not found during batch processing: {e}")
+                self.items_failed += len(batch)
+                return []
             # Validate batch before processing
             processable_batch, too_long_items = self._validate_and_split_batch(

{caption_flow-0.4.0 → caption_flow-0.4.1/src/caption_flow.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: caption-flow
-Version: 0.4.0
+Version: 0.4.1
 Summary: Self-contained distributed community captioning system
 Author-email: bghira <bghira@users.github.com>
 License: MIT
@@ -48,6 +48,10 @@ Dynamic: license-file
 # CaptionFlow
+<!-- [![Tests](https://github.com/bghira/CaptionFlow/workflows/tests/badge.svg)](https://github.com/bghira/CaptionFlow/actions/workflows/tests.yml) -->
+[![codecov](https://codecov.io/github/bghira/CaptionFlow/graph/badge.svg?token=PRAQPNGYAS)](https://codecov.io/github/bghira/CaptionFlow)
+[![PyPI version](https://badge.fury.io/py/caption-flow.svg)](https://badge.fury.io/py/caption-flow)
 scalable, fault-tolerant **vLLM-powered image captioning**.
 a fast websocket-based orchestrator paired with lightweight gpu workers achieves exceptional performance for batched requests through vLLM.

{caption_flow-0.4.0 → caption_flow-0.4.1}/tests/test_cli.py RENAMED Viewed

@@ -395,6 +395,32 @@ class TestExportCommand:
         assert result.exit_code == 0
         assert "export" in result.output.lower()
+    def test_export_command_no_duplicate_registration(self, runner):
+        """Regression test: Ensure export command is only registered once.
+        This test prevents the bug where @main.command() was incorrectly
+        applied to _validate_export_setup() causing duplicate command
+        registration and argument parsing errors.
+        """
+        # Get all registered commands
+        commands = list(main.commands.keys())
+        # Count occurrences of 'export'
+        export_count = commands.count("export")
+        # Should be exactly one export command
+        assert export_count == 1, f"Expected 1 export command, found {export_count}: {commands}"
+        # Test that export command can handle basic arguments without parsing errors
+        result = runner.invoke(main, ["export", "--help"])
+        assert result.exit_code == 0
+        assert "Got unexpected extra arguments" not in result.output
+        # Test with a data directory argument (the one that was causing issues)
+        result = runner.invoke(main, ["export", "--data-dir", "caption_data", "--stats-only"])
+        # Should not get parsing errors (though it may fail for other reasons like missing files)
+        assert "Got unexpected extra arguments" not in result.output
     @patch("caption_flow.storage.StorageManager")
     @patch("caption_flow.cli.asyncio.run")
     def test_export_stats_only(self, mock_asyncio_run, mock_storage_class, runner, tmp_path):

{caption_flow-0.4.0 → caption_flow-0.4.1}/tests/test_processors.py RENAMED Viewed

@@ -1034,5 +1034,114 @@ class TestProcessorIntegration(ProcessorTestBase):
         assert any(uid in available_ids for uid in unit_ids[1:])
+class TestHuggingFaceURLValidation:
+    """Test URL validation in HuggingFace processor."""
+    @pytest.fixture
+    def temp_dir(self):
+        """Create a temporary directory for testing."""
+        temp_dir = tempfile.mkdtemp()
+        yield Path(temp_dir)
+        shutil.rmtree(temp_dir)
+    @pytest.fixture
+    def mock_parquet_data_with_invalid_urls(self, temp_dir):
+        """Create mock parquet data with invalid URLs."""
+        # Create test data with various invalid URL scenarios
+        data = {
+            "id": [1, 2, 3, 4, 5],
+            "url": [
+                "https://example.com/valid.jpg",  # Valid URL
+                None,  # None URL
+                "",  # Empty string
+                "None",  # String "None"
+                "  ",  # Whitespace only
+            ],
+            "caption": [
+                "A valid image",
+                "Image with None URL",
+                "Image with empty URL",
+                "Image with None string URL",
+                "Image with whitespace URL",
+            ],
+        }
+        # Create parquet file
+        table = pa.table(data)
+        parquet_file = temp_dir / "test_data.parquet"
+        pq.write_table(table, parquet_file)
+        return str(parquet_file)
+    def test_url_validation_skips_invalid_urls(self):
+        """Test URL validation logic that skips invalid URLs."""
+        # Test the actual validation logic used in the processor
+        invalid_urls = [None, "", "   ", "None", "NONE", "none"]
+        valid_urls = ["https://example.com/image.jpg", "http://test.com/pic.png"]
+        # Track which URLs would be processed (not skipped)
+        processed_urls = []
+        for url_value in invalid_urls + valid_urls:
+            # This matches the exact logic from the processor
+            if url_value and str(url_value).strip() and str(url_value).strip().lower() != "none":
+                processed_urls.append(str(url_value).strip())
+        # Should only process valid URLs
+        assert len(processed_urls) == 2
+        assert "https://example.com/image.jpg" in processed_urls
+        assert "http://test.com/pic.png" in processed_urls
+    def test_url_validation_in_mock_mode(self):
+        """Test URL validation logic preserves valid URLs for metadata."""
+        test_urls = {
+            "valid": "https://example.com/valid.jpg",
+            "none": None,
+            "empty": "",
+            "none_string": "None",
+            "whitespace": "  ",
+        }
+        # Simulate extraction and validation for metadata
+        extracted_urls = {}
+        for key, url_value in test_urls.items():
+            if url_value and str(url_value).strip() and str(url_value).strip().lower() != "none":
+                extracted_urls[key] = str(url_value).strip()
+            else:
+                extracted_urls[key] = None
+        # Only valid URL should be extracted
+        assert extracted_urls["valid"] == "https://example.com/valid.jpg"
+        assert extracted_urls["none"] is None
+        assert extracted_urls["empty"] is None
+        assert extracted_urls["none_string"] is None
+        assert extracted_urls["whitespace"] is None
+    def test_url_validation_edge_cases(self):
+        """Test edge cases for URL validation."""
+        processor = HuggingFaceDatasetWorkerProcessor()
+        # Test different invalid URL values
+        test_cases = [
+            (None, False),
+            ("", False),
+            ("   ", False),
+            ("None", False),
+            ("NONE", False),
+            ("none", False),
+            ("https://valid.com/image.jpg", True),
+            ("http://valid.com/image.jpg", True),
+            ("  https://valid.com/image.jpg  ", True),  # Should be stripped
+        ]
+        for url_value, should_be_valid in test_cases:
+            # Simulate the validation logic from the processor (matches the actual code)
+            is_valid = bool(
+                url_value and str(url_value).strip() and str(url_value).strip().lower() != "none"
+            )
+            assert is_valid == should_be_valid, f"URL validation failed for: {url_value!r}"
 if __name__ == "__main__":
     pytest.main([__file__, "-v", "-s"])

{caption_flow-0.4.0 → caption_flow-0.4.1}/tests/test_worker_caption.py RENAMED Viewed

@@ -879,5 +879,194 @@ class TestCaptionWorkerProcessors:
         assert worker.processor is not None
+class TestCaptionWorkerConfigReload:
+    """Test CaptionWorker config reload functionality."""
+    @pytest.fixture
+    def worker_config(self):
+        return {
+            "name": "test_worker",
+            "token": "test_token",
+            "server": "ws://localhost:8765",
+            "server_url": "ws://localhost:8765",
+            "gpu_id": 0,
+        }
+    @pytest.fixture
+    def initial_vllm_config(self):
+        return {
+            "model": "test-model-v1",
+            "batch_size": 4,
+            "max_model_len": 16384,
+            "stages": [
+                {
+                    "name": "caption",
+                    "model": "test-model-v1",
+                    "prompts": ["describe this image"],
+                    "output_field": "captions",
+                    "requires": [],
+                }
+            ],
+        }
+    def test_config_reload_failure_restores_state(self, worker_config, initial_vllm_config):
+        """Test that config reload failure properly restores previous state."""
+        worker = CaptionWorker(worker_config)
+        # Set up initial state
+        worker.vllm_config = initial_vllm_config
+        worker.stages = worker._parse_stages_config(initial_vllm_config)
+        worker.stage_order = worker._topological_sort_stages(worker.stages)
+        worker.mock_mode = False
+        # Mock model manager with working models
+        mock_model_manager = Mock()
+        mock_model_manager.models = {"test-model-v1": "loaded_model"}
+        mock_model_manager.processors = {"test-model-v1": "loaded_processor"}
+        mock_model_manager.tokenizers = {"test-model-v1": "loaded_tokenizer"}
+        mock_model_manager.sampling_params = {"caption": "loaded_sampling"}
+        worker.model_manager = mock_model_manager
+        # New config that will cause setup failure
+        new_config = {
+            "model": "test-model-v2",
+            "batch_size": 8,
+            "stages": [
+                {
+                    "name": "caption",
+                    "model": "test-model-v2",
+                    "prompts": ["analyze this image"],
+                    "output_field": "captions",
+                    "requires": [],
+                }
+            ],
+        }
+        # Mock _setup_vllm to fail on first call (new config) but succeed on second call (restore)
+        setup_call_count = 0
+        def mock_setup_vllm():
+            nonlocal setup_call_count
+            setup_call_count += 1
+            if setup_call_count == 1:
+                raise Exception("Failed to load new model")
+            # Second call succeeds (restoration)
+            return
+        with patch.object(worker, "_setup_vllm", side_effect=mock_setup_vllm):
+            # Attempt config update
+            result = worker._handle_vllm_config_update(new_config)
+        # Should return False due to failure
+        assert result is False
+        # Should have restored original config
+        assert worker.vllm_config == initial_vllm_config
+        # Should have restored original stages
+        assert len(worker.stages) == 1
+        assert worker.stages[0].model == "test-model-v1"
+        assert worker.stages[0].prompts == ["describe this image"]
+        # Should have called _setup_vllm twice (once for new config, once for restore)
+        assert setup_call_count == 2
+        # Model manager cleanup should have been called
+        mock_model_manager.cleanup.assert_called()
+    def test_model_manager_get_model_for_stage_keyerror_handling(self):
+        """Test that get_model_for_stage provides helpful error messages."""
+        from caption_flow.workers.caption import MultiStageVLLMManager
+        manager = MultiStageVLLMManager()
+        # Test missing model
+        with pytest.raises(KeyError) as exc_info:
+            manager.get_model_for_stage("caption", "missing-model")
+        assert "Model 'missing-model' not found" in str(exc_info.value)
+        assert "Available models: []" in str(exc_info.value)
+        # Add a model but missing stage
+        manager.models["test-model"] = Mock()
+        manager.processors["test-model"] = Mock()
+        manager.tokenizers["test-model"] = Mock()
+        with pytest.raises(KeyError) as exc_info:
+            manager.get_model_for_stage("missing-stage", "test-model")
+        assert "Sampling params for stage 'missing-stage' not found" in str(exc_info.value)
+        assert "Available stages: []" in str(exc_info.value)
+    def test_process_batch_handles_missing_model_manager(self, worker_config):
+        """Test that batch processing handles missing model manager gracefully."""
+        worker = CaptionWorker(worker_config)
+        # Create a mock processing item
+        mock_image = Image.new("RGB", (100, 100), "red")
+        item = ProcessingItem(
+            unit_id="test-unit",
+            job_id="test-job",
+            chunk_id="test-chunk",
+            item_key="test-item",
+            item_index=0,
+            image=mock_image,
+            image_data=b"fake_data",
+            metadata={},
+        )
+        # Set up worker state without model manager
+        worker.vllm_config = {"max_model_len": 16384}
+        mock_stage = Mock()
+        mock_stage.name = "test-stage"
+        worker.stages = [mock_stage]
+        worker.stage_order = ["test-stage"]
+        worker.model_manager = None  # Simulate missing model manager
+        # Process batch should handle missing model manager
+        result = worker._process_batch_multi_stage([item])
+        # Should return empty results and increment failed items
+        assert result == []
+        assert worker.items_failed == 1
+    def test_process_batch_handles_model_keyerror(self, worker_config):
+        """Test that batch processing handles KeyError from get_model_for_stage."""
+        worker = CaptionWorker(worker_config)
+        # Create a mock processing item
+        mock_image = Image.new("RGB", (100, 100), "red")
+        item = ProcessingItem(
+            unit_id="test-unit",
+            job_id="test-job",
+            chunk_id="test-chunk",
+            item_key="test-item",
+            item_index=0,
+            image=mock_image,
+            image_data=b"fake_data",
+            metadata={},
+        )
+        # Set up worker state
+        worker.vllm_config = {"max_model_len": 16384}
+        # Create mock stage
+        mock_stage = Mock()
+        mock_stage.name = "test-stage"
+        mock_stage.model = "missing-model"
+        worker.stages = [mock_stage]
+        worker.stage_order = ["test-stage"]
+        # Mock model manager that raises KeyError
+        mock_model_manager = Mock()
+        mock_model_manager.get_model_for_stage.side_effect = KeyError("Model not found")
+        worker.model_manager = mock_model_manager
+        # Process batch should handle KeyError gracefully
+        result = worker._process_batch_multi_stage([item])
+        # Should return empty results and increment failed items
+        assert result == []
+        assert worker.items_failed == 1
 if __name__ == "__main__":
     pytest.main([__file__, "-v", "-s"])