caption-flow 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
caption_flow/__init__.py CHANGED
@@ -1,6 +1,6 @@
1
1
  """CaptionFlow - Distributed community captioning system."""
2
2
 
3
- __version__ = "0.4.0"
3
+ __version__ = "0.4.1"
4
4
 
5
5
  from .monitor import Monitor
6
6
  from .orchestrator import Orchestrator
caption_flow/cli.py CHANGED
@@ -1276,33 +1276,6 @@ async def _export_single_format(
1276
1276
  console.print(f" • {shard_name}: {count:,} items")
1277
1277
 
1278
1278
 
1279
- @main.command()
1280
- @click.option("--data-dir", default="./caption_data", help="Storage directory")
1281
- @click.option(
1282
- "--format",
1283
- type=click.Choice(
1284
- ["jsonl", "json", "csv", "txt", "parquet", "lance", "huggingface_hub", "all"],
1285
- case_sensitive=False,
1286
- ),
1287
- default="jsonl",
1288
- help="Export format (default: jsonl)",
1289
- )
1290
- @click.option("--output", "-o", help="Output path (file for jsonl/csv, directory for json/txt)")
1291
- @click.option("--limit", type=int, help="Limit number of rows to export")
1292
- @click.option("--columns", help="Comma-separated list of columns to export (default: all)")
1293
- @click.option("--export-column", default="captions", help="Column to export for txt format")
1294
- @click.option("--filename-column", default="filename", help="Column containing filenames")
1295
- @click.option("--shard", help="Specific shard to export (e.g., data-0001)")
1296
- @click.option("--shards", help="Comma-separated list of shards to export")
1297
- @click.option("--include-empty", is_flag=True, help="Include rows with empty export column")
1298
- @click.option("--stats-only", is_flag=True, help="Show statistics without exporting")
1299
- @click.option("--optimize", is_flag=True, help="Optimize storage before export")
1300
- @click.option("--verbose", is_flag=True, help="Show detailed export progress")
1301
- @click.option("--hf-dataset", help="Dataset name on HF Hub (e.g., username/dataset-name)")
1302
- @click.option("--license", default="apache-2.0", help="License for the dataset")
1303
- @click.option("--private", is_flag=True, help="Make HF dataset private")
1304
- @click.option("--nsfw", is_flag=True, help="Add not-for-all-audiences tag")
1305
- @click.option("--tags", help="Comma-separated tags for HF dataset")
1306
1279
  def _validate_export_setup(data_dir):
1307
1280
  """Validate export setup and create storage manager."""
1308
1281
  from .storage import StorageManager
@@ -1333,6 +1306,7 @@ async def _run_export_process(
1333
1306
  tags,
1334
1307
  stats_only,
1335
1308
  optimize,
1309
+ include_empty,
1336
1310
  ):
1337
1311
  """Execute the main export process."""
1338
1312
  from .storage.exporter import LanceStorageExporter
@@ -1448,6 +1422,7 @@ def export(
1448
1422
  tags,
1449
1423
  stats_only,
1450
1424
  optimize,
1425
+ include_empty,
1451
1426
  )
1452
1427
  )
1453
1428
  except ExportError as e:
@@ -1195,7 +1195,18 @@ class HuggingFaceDatasetWorkerProcessor(WorkerProcessor):
1195
1195
 
1196
1196
  # Still extract URL if available for metadata
1197
1197
  if self.url_column and self.url_column in item:
1198
- image_url = item[self.url_column]
1198
+ url_value = item[self.url_column]
1199
+ if (
1200
+ url_value
1201
+ and str(url_value).strip()
1202
+ and str(url_value).strip().lower() != "none"
1203
+ ):
1204
+ image_url = str(url_value).strip()
1205
+ else:
1206
+ logger.debug(
1207
+ f"Invalid or None URL for item {global_idx}: {url_value}"
1208
+ )
1209
+ image_url = None
1199
1210
 
1200
1211
  # Create dummy image with metadata context
1201
1212
  image = self._create_dummy_image(
@@ -1209,7 +1220,19 @@ class HuggingFaceDatasetWorkerProcessor(WorkerProcessor):
1209
1220
  # Normal processing - load real images
1210
1221
  if self.url_column:
1211
1222
  if self.url_column in item:
1212
- image_url = item[self.url_column]
1223
+ url_value = item[self.url_column]
1224
+ if (
1225
+ url_value
1226
+ and str(url_value).strip()
1227
+ and str(url_value).strip().lower() != "none"
1228
+ ):
1229
+ image_url = str(url_value).strip()
1230
+ else:
1231
+ logger.debug(
1232
+ f"Skipping invalid or None URL for item {global_idx}: {url_value}"
1233
+ )
1234
+ continue # Skip this item entirely
1235
+
1213
1236
  try:
1214
1237
  max_retries = 3
1215
1238
  backoff_factor = 2
@@ -137,6 +137,19 @@ class MultiStageVLLMManager:
137
137
 
138
138
  def get_model_for_stage(self, stage_name: str, model_name: str) -> Tuple[Any, Any, Any, Any]:
139
139
  """Get model components for a stage."""
140
+ if model_name not in self.models:
141
+ raise KeyError(
142
+ f"Model '{model_name}' not found in loaded models. Available models: {list(self.models.keys())}"
143
+ )
144
+ if model_name not in self.processors:
145
+ raise KeyError(f"Processor for model '{model_name}' not found")
146
+ if model_name not in self.tokenizers:
147
+ raise KeyError(f"Tokenizer for model '{model_name}' not found")
148
+ if stage_name not in self.sampling_params:
149
+ raise KeyError(
150
+ f"Sampling params for stage '{stage_name}' not found. Available stages: {list(self.sampling_params.keys())}"
151
+ )
152
+
140
153
  return (
141
154
  self.models[model_name],
142
155
  self.processors[model_name],
@@ -489,7 +502,19 @@ class CaptionWorker(BaseWorker):
489
502
  return True
490
503
  except Exception as e:
491
504
  logger.error(f"Failed to reload vLLM: {e}")
505
+ # Restore previous state
492
506
  self.vllm_config = old_config
507
+ self.stages = self._parse_stages_config(old_config)
508
+ self.stage_order = self._topological_sort_stages(self.stages)
509
+ # Attempt to restore previous models
510
+ try:
511
+ self._setup_vllm()
512
+ except Exception as restore_error:
513
+ logger.error(f"Failed to restore previous vLLM state: {restore_error}")
514
+ # Clean up broken state
515
+ if self.model_manager:
516
+ self.model_manager.cleanup()
517
+ self.model_manager = None
493
518
  return False
494
519
  else:
495
520
  # Clean up models if switching to mock mode
@@ -886,10 +911,21 @@ class CaptionWorker(BaseWorker):
886
911
  stage = next(s for s in self.stages if s.name == stage_name)
887
912
  logger.debug(f"Processing batch through stage: {stage_name}")
888
913
 
914
+ # Check if model manager is properly initialized
915
+ if not self.model_manager:
916
+ logger.error("Model manager not initialized")
917
+ self.items_failed += len(batch)
918
+ return []
919
+
889
920
  # Get model components
890
- llm, processor, tokenizer, sampling_params = self.model_manager.get_model_for_stage(
891
- stage_name, stage.model
892
- )
921
+ try:
922
+ llm, processor, tokenizer, sampling_params = self.model_manager.get_model_for_stage(
923
+ stage_name, stage.model
924
+ )
925
+ except KeyError as e:
926
+ logger.error(f"Model not found during batch processing: {e}")
927
+ self.items_failed += len(batch)
928
+ return []
893
929
 
894
930
  # Validate batch before processing
895
931
  processable_batch, too_long_items = self._validate_and_split_batch(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: caption-flow
3
- Version: 0.4.0
3
+ Version: 0.4.1
4
4
  Summary: Self-contained distributed community captioning system
5
5
  Author-email: bghira <bghira@users.github.com>
6
6
  License: MIT
@@ -48,6 +48,10 @@ Dynamic: license-file
48
48
 
49
49
  # CaptionFlow
50
50
 
51
+ <!-- [![Tests](https://github.com/bghira/CaptionFlow/workflows/tests/badge.svg)](https://github.com/bghira/CaptionFlow/actions/workflows/tests.yml) -->
52
+ [![codecov](https://codecov.io/github/bghira/CaptionFlow/graph/badge.svg?token=PRAQPNGYAS)](https://codecov.io/github/bghira/CaptionFlow)
53
+ [![PyPI version](https://badge.fury.io/py/caption-flow.svg)](https://badge.fury.io/py/caption-flow)
54
+
51
55
  scalable, fault-tolerant **vLLM-powered image captioning**.
52
56
 
53
57
  a fast websocket-based orchestrator paired with lightweight gpu workers achieves exceptional performance for batched requests through vLLM.
@@ -1,12 +1,12 @@
1
- caption_flow/__init__.py,sha256=IZoOP8s4lN05e6ww9M5HWVfwYOughmS_tDgG-BLajFo,303
2
- caption_flow/cli.py,sha256=J_rjzhYvVyfoOvKQE4PsMSa_YO58iaKk6yi7kRDUYPU,57688
1
+ caption_flow/__init__.py,sha256=AanaoBXNzR2j3ow-uWQQXmYpv6sUXLfLrqACm55_BMY,303
2
+ caption_flow/cli.py,sha256=q3M6ekz70huVGD7NBqsO5xZUqMYBhLqe0ZGo85Vb69g,56072
3
3
  caption_flow/models.py,sha256=6-IJj_B3HAarucoLo8_PncJRnxofHuLFCsyRnmUXgRk,7063
4
4
  caption_flow/monitor.py,sha256=j5RExadSLOUujVZQMe7btMeKNlq-WbZ9bYqfikgYJ8Q,7972
5
5
  caption_flow/orchestrator.py,sha256=MWQKaAclI9rMjn7mWdvoSzl9y4b7bU_24aVr8I1YGhE,39645
6
6
  caption_flow/viewer.py,sha256=40w2Zj7GaXbK-dgqvYYdFrMzSDE_ZPWNZc6kS0OrymQ,20281
7
7
  caption_flow/processors/__init__.py,sha256=l1udEZLxAmqwFYS4-3GsRVcPT6WxnDOIk0s0UqsZsJM,423
8
8
  caption_flow/processors/base.py,sha256=Zx6kRZSqG969x8kYJ5VY2Mo5mLeWEgBCEpo8D4GjsBM,6935
9
- caption_flow/processors/huggingface.py,sha256=LELbCkvALoKSVf5zGOEL3f3nQG_UcRcPu0ZNZU95B3k,60222
9
+ caption_flow/processors/huggingface.py,sha256=i-DZRt5nTnPN8180Yf8FKBiYPUPmxfKMEZ68CUZECWk,61603
10
10
  caption_flow/processors/local_filesystem.py,sha256=auAWxnqplEH4YJ1DWZCaFmAd03iyhNLudgt71N8O7NE,27827
11
11
  caption_flow/processors/webdataset.py,sha256=66y_7KaJBBntJqBHYKLzCXkBi9ly-TfYYaTCp_7pqTo,34206
12
12
  caption_flow/storage/__init__.py,sha256=IVnzcSCPpPuyp-QLlgJirRZ9Sb3tR0F4sfuF5u2cNMk,36
@@ -23,11 +23,11 @@ caption_flow/utils/json_utils.py,sha256=AaGcNTToUcVYCQj2TXs2D_hxc_LeEqFquiK4CquS
23
23
  caption_flow/utils/prompt_template.py,sha256=mq7FPnpjp8gVCMMh4NtRf0vL_B9LDMuBkbySvACRSZM,4401
24
24
  caption_flow/utils/vllm_config.py,sha256=xFOnmniQGkUGwfTabfW6R0V01TF-_rN1UYJy0HwOvUI,6026
25
25
  caption_flow/workers/base.py,sha256=Yh_PBsL3j1kXUuIOQHqIdR69Nepfq11je23i01iWSxw,7714
26
- caption_flow/workers/caption.py,sha256=KnvRcZ6-Nc2JwastgqpQ8WfCw_AOzWBS-etYXEXJ6Os,47201
26
+ caption_flow/workers/caption.py,sha256=qph-TVMUqObRQBgriXOJtCgkWOo3qBdTg883D1TuXlw,48994
27
27
  caption_flow/workers/data.py,sha256=iWnTM7UgpJeFzhSTly-gHzFu5sIYUGG-XO4yRNn_MQk,14775
28
- caption_flow-0.4.0.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
29
- caption_flow-0.4.0.dist-info/METADATA,sha256=e1sdcAeXR-nYlRZlrDtvwXBuRPb1J-_jzTzIvWevsHs,9732
30
- caption_flow-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
31
- caption_flow-0.4.0.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
32
- caption_flow-0.4.0.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
33
- caption_flow-0.4.0.dist-info/RECORD,,
28
+ caption_flow-0.4.1.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
29
+ caption_flow-0.4.1.dist-info/METADATA,sha256=2mg45AYJVVZrgBzD611qFaWfNFId_3Xhl8xpwlFNrjg,10123
30
+ caption_flow-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
31
+ caption_flow-0.4.1.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
32
+ caption_flow-0.4.1.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
33
+ caption_flow-0.4.1.dist-info/RECORD,,