PyPI - caption-flow - Versions diffs - 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl - Mend

caption-flow 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

caption_flow/__init__.py CHANGED Viewed

@@ -1,6 +1,6 @@
 """CaptionFlow - Distributed community captioning system."""
-__version__ = "0.4.0"
+__version__ = "0.4.1"
 from .monitor import Monitor
 from .orchestrator import Orchestrator

caption_flow/cli.py CHANGED Viewed

@@ -1276,33 +1276,6 @@ async def _export_single_format(
                 console.print(f"  • {shard_name}: {count:,} items")
-@main.command()
-@click.option("--data-dir", default="./caption_data", help="Storage directory")
-@click.option(
-    "--format",
-    type=click.Choice(
-        ["jsonl", "json", "csv", "txt", "parquet", "lance", "huggingface_hub", "all"],
-        case_sensitive=False,
-    ),
-    default="jsonl",
-    help="Export format (default: jsonl)",
-)
-@click.option("--output", "-o", help="Output path (file for jsonl/csv, directory for json/txt)")
-@click.option("--limit", type=int, help="Limit number of rows to export")
-@click.option("--columns", help="Comma-separated list of columns to export (default: all)")
-@click.option("--export-column", default="captions", help="Column to export for txt format")
-@click.option("--filename-column", default="filename", help="Column containing filenames")
-@click.option("--shard", help="Specific shard to export (e.g., data-0001)")
-@click.option("--shards", help="Comma-separated list of shards to export")
-@click.option("--include-empty", is_flag=True, help="Include rows with empty export column")
-@click.option("--stats-only", is_flag=True, help="Show statistics without exporting")
-@click.option("--optimize", is_flag=True, help="Optimize storage before export")
-@click.option("--verbose", is_flag=True, help="Show detailed export progress")
-@click.option("--hf-dataset", help="Dataset name on HF Hub (e.g., username/dataset-name)")
-@click.option("--license", default="apache-2.0", help="License for the dataset")
-@click.option("--private", is_flag=True, help="Make HF dataset private")
-@click.option("--nsfw", is_flag=True, help="Add not-for-all-audiences tag")
-@click.option("--tags", help="Comma-separated tags for HF dataset")
 def _validate_export_setup(data_dir):
     """Validate export setup and create storage manager."""
     from .storage import StorageManager
@@ -1333,6 +1306,7 @@ async def _run_export_process(
     tags,
     stats_only,
     optimize,
+    include_empty,
 ):
     """Execute the main export process."""
     from .storage.exporter import LanceStorageExporter
@@ -1448,6 +1422,7 @@ def export(
                 tags,
                 stats_only,
                 optimize,
+                include_empty,
             )
         )
     except ExportError as e:

caption_flow/processors/huggingface.py CHANGED Viewed

@@ -1195,7 +1195,18 @@ class HuggingFaceDatasetWorkerProcessor(WorkerProcessor):
                                 # Still extract URL if available for metadata
                                 if self.url_column and self.url_column in item:
-                                    image_url = item[self.url_column]
+                                    url_value = item[self.url_column]
+                                    if (
+                                        url_value
+                                        and str(url_value).strip()
+                                        and str(url_value).strip().lower() != "none"
+                                    ):
+                                        image_url = str(url_value).strip()
+                                    else:
+                                        logger.debug(
+                                            f"Invalid or None URL for item {global_idx}: {url_value}"
+                                        )
+                                        image_url = None
                                 # Create dummy image with metadata context
                                 image = self._create_dummy_image(
@@ -1209,7 +1220,19 @@ class HuggingFaceDatasetWorkerProcessor(WorkerProcessor):
                                 # Normal processing - load real images
                                 if self.url_column:
                                     if self.url_column in item:
-                                        image_url = item[self.url_column]
+                                        url_value = item[self.url_column]
+                                        if (
+                                            url_value
+                                            and str(url_value).strip()
+                                            and str(url_value).strip().lower() != "none"
+                                        ):
+                                            image_url = str(url_value).strip()
+                                        else:
+                                            logger.debug(
+                                                f"Skipping invalid or None URL for item {global_idx}: {url_value}"
+                                            )
+                                            continue  # Skip this item entirely
                                         try:
                                             max_retries = 3
                                             backoff_factor = 2

caption_flow/workers/caption.py CHANGED Viewed

@@ -137,6 +137,19 @@ class MultiStageVLLMManager:
     def get_model_for_stage(self, stage_name: str, model_name: str) -> Tuple[Any, Any, Any, Any]:
         """Get model components for a stage."""
+        if model_name not in self.models:
+            raise KeyError(
+                f"Model '{model_name}' not found in loaded models. Available models: {list(self.models.keys())}"
+            )
+        if model_name not in self.processors:
+            raise KeyError(f"Processor for model '{model_name}' not found")
+        if model_name not in self.tokenizers:
+            raise KeyError(f"Tokenizer for model '{model_name}' not found")
+        if stage_name not in self.sampling_params:
+            raise KeyError(
+                f"Sampling params for stage '{stage_name}' not found. Available stages: {list(self.sampling_params.keys())}"
+            )
         return (
             self.models[model_name],
             self.processors[model_name],
@@ -489,7 +502,19 @@ class CaptionWorker(BaseWorker):
                     return True
                 except Exception as e:
                     logger.error(f"Failed to reload vLLM: {e}")
+                    # Restore previous state
                     self.vllm_config = old_config
+                    self.stages = self._parse_stages_config(old_config)
+                    self.stage_order = self._topological_sort_stages(self.stages)
+                    # Attempt to restore previous models
+                    try:
+                        self._setup_vllm()
+                    except Exception as restore_error:
+                        logger.error(f"Failed to restore previous vLLM state: {restore_error}")
+                        # Clean up broken state
+                        if self.model_manager:
+                            self.model_manager.cleanup()
+                            self.model_manager = None
                     return False
             else:
                 # Clean up models if switching to mock mode
@@ -886,10 +911,21 @@ class CaptionWorker(BaseWorker):
             stage = next(s for s in self.stages if s.name == stage_name)
             logger.debug(f"Processing batch through stage: {stage_name}")
+            # Check if model manager is properly initialized
+            if not self.model_manager:
+                logger.error("Model manager not initialized")
+                self.items_failed += len(batch)
+                return []
             # Get model components
-            llm, processor, tokenizer, sampling_params = self.model_manager.get_model_for_stage(
-                stage_name, stage.model
-            )
+            try:
+                llm, processor, tokenizer, sampling_params = self.model_manager.get_model_for_stage(
+                    stage_name, stage.model
+                )
+            except KeyError as e:
+                logger.error(f"Model not found during batch processing: {e}")
+                self.items_failed += len(batch)
+                return []
             # Validate batch before processing
             processable_batch, too_long_items = self._validate_and_split_batch(

{caption_flow-0.4.0.dist-info → caption_flow-0.4.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: caption-flow
-Version: 0.4.0
+Version: 0.4.1
 Summary: Self-contained distributed community captioning system
 Author-email: bghira <bghira@users.github.com>
 License: MIT
@@ -48,6 +48,10 @@ Dynamic: license-file
 # CaptionFlow
+<!-- [![Tests](https://github.com/bghira/CaptionFlow/workflows/tests/badge.svg)](https://github.com/bghira/CaptionFlow/actions/workflows/tests.yml) -->
+[![codecov](https://codecov.io/github/bghira/CaptionFlow/graph/badge.svg?token=PRAQPNGYAS)](https://codecov.io/github/bghira/CaptionFlow)
+[![PyPI version](https://badge.fury.io/py/caption-flow.svg)](https://badge.fury.io/py/caption-flow)
 scalable, fault-tolerant **vLLM-powered image captioning**.
 a fast websocket-based orchestrator paired with lightweight gpu workers achieves exceptional performance for batched requests through vLLM.

{caption_flow-0.4.0.dist-info → caption_flow-0.4.1.dist-info}/RECORD RENAMED Viewed

@@ -1,12 +1,12 @@
-caption_flow/__init__.py,sha256=IZoOP8s4lN05e6ww9M5HWVfwYOughmS_tDgG-BLajFo,303
-caption_flow/cli.py,sha256=J_rjzhYvVyfoOvKQE4PsMSa_YO58iaKk6yi7kRDUYPU,57688
+caption_flow/__init__.py,sha256=AanaoBXNzR2j3ow-uWQQXmYpv6sUXLfLrqACm55_BMY,303
+caption_flow/cli.py,sha256=q3M6ekz70huVGD7NBqsO5xZUqMYBhLqe0ZGo85Vb69g,56072
 caption_flow/models.py,sha256=6-IJj_B3HAarucoLo8_PncJRnxofHuLFCsyRnmUXgRk,7063
 caption_flow/monitor.py,sha256=j5RExadSLOUujVZQMe7btMeKNlq-WbZ9bYqfikgYJ8Q,7972
 caption_flow/orchestrator.py,sha256=MWQKaAclI9rMjn7mWdvoSzl9y4b7bU_24aVr8I1YGhE,39645
 caption_flow/viewer.py,sha256=40w2Zj7GaXbK-dgqvYYdFrMzSDE_ZPWNZc6kS0OrymQ,20281
 caption_flow/processors/__init__.py,sha256=l1udEZLxAmqwFYS4-3GsRVcPT6WxnDOIk0s0UqsZsJM,423
 caption_flow/processors/base.py,sha256=Zx6kRZSqG969x8kYJ5VY2Mo5mLeWEgBCEpo8D4GjsBM,6935
-caption_flow/processors/huggingface.py,sha256=LELbCkvALoKSVf5zGOEL3f3nQG_UcRcPu0ZNZU95B3k,60222
+caption_flow/processors/huggingface.py,sha256=i-DZRt5nTnPN8180Yf8FKBiYPUPmxfKMEZ68CUZECWk,61603
 caption_flow/processors/local_filesystem.py,sha256=auAWxnqplEH4YJ1DWZCaFmAd03iyhNLudgt71N8O7NE,27827
 caption_flow/processors/webdataset.py,sha256=66y_7KaJBBntJqBHYKLzCXkBi9ly-TfYYaTCp_7pqTo,34206
 caption_flow/storage/__init__.py,sha256=IVnzcSCPpPuyp-QLlgJirRZ9Sb3tR0F4sfuF5u2cNMk,36
@@ -23,11 +23,11 @@ caption_flow/utils/json_utils.py,sha256=AaGcNTToUcVYCQj2TXs2D_hxc_LeEqFquiK4CquS
 caption_flow/utils/prompt_template.py,sha256=mq7FPnpjp8gVCMMh4NtRf0vL_B9LDMuBkbySvACRSZM,4401
 caption_flow/utils/vllm_config.py,sha256=xFOnmniQGkUGwfTabfW6R0V01TF-_rN1UYJy0HwOvUI,6026
 caption_flow/workers/base.py,sha256=Yh_PBsL3j1kXUuIOQHqIdR69Nepfq11je23i01iWSxw,7714
-caption_flow/workers/caption.py,sha256=KnvRcZ6-Nc2JwastgqpQ8WfCw_AOzWBS-etYXEXJ6Os,47201
+caption_flow/workers/caption.py,sha256=qph-TVMUqObRQBgriXOJtCgkWOo3qBdTg883D1TuXlw,48994
 caption_flow/workers/data.py,sha256=iWnTM7UgpJeFzhSTly-gHzFu5sIYUGG-XO4yRNn_MQk,14775
-caption_flow-0.4.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
-caption_flow-0.4.0.dist-info/METADATA,sha256=e1sdcAeXR-nYlRZlrDtvwXBuRPb1J-_jzTzIvWevsHs,9732
-caption_flow-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-caption_flow-0.4.0.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
-caption_flow-0.4.0.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
-caption_flow-0.4.0.dist-info/RECORD,,
+caption_flow-0.4.1.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+caption_flow-0.4.1.dist-info/METADATA,sha256=2mg45AYJVVZrgBzD611qFaWfNFId_3Xhl8xpwlFNrjg,10123
+caption_flow-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+caption_flow-0.4.1.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
+caption_flow-0.4.1.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
+caption_flow-0.4.1.dist-info/RECORD,,

{caption_flow-0.4.0.dist-info → caption_flow-0.4.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{caption_flow-0.4.0.dist-info → caption_flow-0.4.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{caption_flow-0.4.0.dist-info → caption_flow-0.4.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{caption_flow-0.4.0.dist-info → caption_flow-0.4.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

caption-flow 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

caption-flow 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl