caption-flow 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- caption_flow/__init__.py +1 -1
- caption_flow/cli.py +2 -27
- caption_flow/processors/huggingface.py +25 -2
- caption_flow/workers/caption.py +39 -3
- {caption_flow-0.4.0.dist-info → caption_flow-0.4.1.dist-info}/METADATA +5 -1
- {caption_flow-0.4.0.dist-info → caption_flow-0.4.1.dist-info}/RECORD +10 -10
- {caption_flow-0.4.0.dist-info → caption_flow-0.4.1.dist-info}/WHEEL +0 -0
- {caption_flow-0.4.0.dist-info → caption_flow-0.4.1.dist-info}/entry_points.txt +0 -0
- {caption_flow-0.4.0.dist-info → caption_flow-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {caption_flow-0.4.0.dist-info → caption_flow-0.4.1.dist-info}/top_level.txt +0 -0
caption_flow/__init__.py
CHANGED
caption_flow/cli.py
CHANGED
@@ -1276,33 +1276,6 @@ async def _export_single_format(
|
|
1276
1276
|
console.print(f" • {shard_name}: {count:,} items")
|
1277
1277
|
|
1278
1278
|
|
1279
|
-
@main.command()
|
1280
|
-
@click.option("--data-dir", default="./caption_data", help="Storage directory")
|
1281
|
-
@click.option(
|
1282
|
-
"--format",
|
1283
|
-
type=click.Choice(
|
1284
|
-
["jsonl", "json", "csv", "txt", "parquet", "lance", "huggingface_hub", "all"],
|
1285
|
-
case_sensitive=False,
|
1286
|
-
),
|
1287
|
-
default="jsonl",
|
1288
|
-
help="Export format (default: jsonl)",
|
1289
|
-
)
|
1290
|
-
@click.option("--output", "-o", help="Output path (file for jsonl/csv, directory for json/txt)")
|
1291
|
-
@click.option("--limit", type=int, help="Limit number of rows to export")
|
1292
|
-
@click.option("--columns", help="Comma-separated list of columns to export (default: all)")
|
1293
|
-
@click.option("--export-column", default="captions", help="Column to export for txt format")
|
1294
|
-
@click.option("--filename-column", default="filename", help="Column containing filenames")
|
1295
|
-
@click.option("--shard", help="Specific shard to export (e.g., data-0001)")
|
1296
|
-
@click.option("--shards", help="Comma-separated list of shards to export")
|
1297
|
-
@click.option("--include-empty", is_flag=True, help="Include rows with empty export column")
|
1298
|
-
@click.option("--stats-only", is_flag=True, help="Show statistics without exporting")
|
1299
|
-
@click.option("--optimize", is_flag=True, help="Optimize storage before export")
|
1300
|
-
@click.option("--verbose", is_flag=True, help="Show detailed export progress")
|
1301
|
-
@click.option("--hf-dataset", help="Dataset name on HF Hub (e.g., username/dataset-name)")
|
1302
|
-
@click.option("--license", default="apache-2.0", help="License for the dataset")
|
1303
|
-
@click.option("--private", is_flag=True, help="Make HF dataset private")
|
1304
|
-
@click.option("--nsfw", is_flag=True, help="Add not-for-all-audiences tag")
|
1305
|
-
@click.option("--tags", help="Comma-separated tags for HF dataset")
|
1306
1279
|
def _validate_export_setup(data_dir):
|
1307
1280
|
"""Validate export setup and create storage manager."""
|
1308
1281
|
from .storage import StorageManager
|
@@ -1333,6 +1306,7 @@ async def _run_export_process(
|
|
1333
1306
|
tags,
|
1334
1307
|
stats_only,
|
1335
1308
|
optimize,
|
1309
|
+
include_empty,
|
1336
1310
|
):
|
1337
1311
|
"""Execute the main export process."""
|
1338
1312
|
from .storage.exporter import LanceStorageExporter
|
@@ -1448,6 +1422,7 @@ def export(
|
|
1448
1422
|
tags,
|
1449
1423
|
stats_only,
|
1450
1424
|
optimize,
|
1425
|
+
include_empty,
|
1451
1426
|
)
|
1452
1427
|
)
|
1453
1428
|
except ExportError as e:
|
@@ -1195,7 +1195,18 @@ class HuggingFaceDatasetWorkerProcessor(WorkerProcessor):
|
|
1195
1195
|
|
1196
1196
|
# Still extract URL if available for metadata
|
1197
1197
|
if self.url_column and self.url_column in item:
|
1198
|
-
|
1198
|
+
url_value = item[self.url_column]
|
1199
|
+
if (
|
1200
|
+
url_value
|
1201
|
+
and str(url_value).strip()
|
1202
|
+
and str(url_value).strip().lower() != "none"
|
1203
|
+
):
|
1204
|
+
image_url = str(url_value).strip()
|
1205
|
+
else:
|
1206
|
+
logger.debug(
|
1207
|
+
f"Invalid or None URL for item {global_idx}: {url_value}"
|
1208
|
+
)
|
1209
|
+
image_url = None
|
1199
1210
|
|
1200
1211
|
# Create dummy image with metadata context
|
1201
1212
|
image = self._create_dummy_image(
|
@@ -1209,7 +1220,19 @@ class HuggingFaceDatasetWorkerProcessor(WorkerProcessor):
|
|
1209
1220
|
# Normal processing - load real images
|
1210
1221
|
if self.url_column:
|
1211
1222
|
if self.url_column in item:
|
1212
|
-
|
1223
|
+
url_value = item[self.url_column]
|
1224
|
+
if (
|
1225
|
+
url_value
|
1226
|
+
and str(url_value).strip()
|
1227
|
+
and str(url_value).strip().lower() != "none"
|
1228
|
+
):
|
1229
|
+
image_url = str(url_value).strip()
|
1230
|
+
else:
|
1231
|
+
logger.debug(
|
1232
|
+
f"Skipping invalid or None URL for item {global_idx}: {url_value}"
|
1233
|
+
)
|
1234
|
+
continue # Skip this item entirely
|
1235
|
+
|
1213
1236
|
try:
|
1214
1237
|
max_retries = 3
|
1215
1238
|
backoff_factor = 2
|
caption_flow/workers/caption.py
CHANGED
@@ -137,6 +137,19 @@ class MultiStageVLLMManager:
|
|
137
137
|
|
138
138
|
def get_model_for_stage(self, stage_name: str, model_name: str) -> Tuple[Any, Any, Any, Any]:
|
139
139
|
"""Get model components for a stage."""
|
140
|
+
if model_name not in self.models:
|
141
|
+
raise KeyError(
|
142
|
+
f"Model '{model_name}' not found in loaded models. Available models: {list(self.models.keys())}"
|
143
|
+
)
|
144
|
+
if model_name not in self.processors:
|
145
|
+
raise KeyError(f"Processor for model '{model_name}' not found")
|
146
|
+
if model_name not in self.tokenizers:
|
147
|
+
raise KeyError(f"Tokenizer for model '{model_name}' not found")
|
148
|
+
if stage_name not in self.sampling_params:
|
149
|
+
raise KeyError(
|
150
|
+
f"Sampling params for stage '{stage_name}' not found. Available stages: {list(self.sampling_params.keys())}"
|
151
|
+
)
|
152
|
+
|
140
153
|
return (
|
141
154
|
self.models[model_name],
|
142
155
|
self.processors[model_name],
|
@@ -489,7 +502,19 @@ class CaptionWorker(BaseWorker):
|
|
489
502
|
return True
|
490
503
|
except Exception as e:
|
491
504
|
logger.error(f"Failed to reload vLLM: {e}")
|
505
|
+
# Restore previous state
|
492
506
|
self.vllm_config = old_config
|
507
|
+
self.stages = self._parse_stages_config(old_config)
|
508
|
+
self.stage_order = self._topological_sort_stages(self.stages)
|
509
|
+
# Attempt to restore previous models
|
510
|
+
try:
|
511
|
+
self._setup_vllm()
|
512
|
+
except Exception as restore_error:
|
513
|
+
logger.error(f"Failed to restore previous vLLM state: {restore_error}")
|
514
|
+
# Clean up broken state
|
515
|
+
if self.model_manager:
|
516
|
+
self.model_manager.cleanup()
|
517
|
+
self.model_manager = None
|
493
518
|
return False
|
494
519
|
else:
|
495
520
|
# Clean up models if switching to mock mode
|
@@ -886,10 +911,21 @@ class CaptionWorker(BaseWorker):
|
|
886
911
|
stage = next(s for s in self.stages if s.name == stage_name)
|
887
912
|
logger.debug(f"Processing batch through stage: {stage_name}")
|
888
913
|
|
914
|
+
# Check if model manager is properly initialized
|
915
|
+
if not self.model_manager:
|
916
|
+
logger.error("Model manager not initialized")
|
917
|
+
self.items_failed += len(batch)
|
918
|
+
return []
|
919
|
+
|
889
920
|
# Get model components
|
890
|
-
|
891
|
-
|
892
|
-
|
921
|
+
try:
|
922
|
+
llm, processor, tokenizer, sampling_params = self.model_manager.get_model_for_stage(
|
923
|
+
stage_name, stage.model
|
924
|
+
)
|
925
|
+
except KeyError as e:
|
926
|
+
logger.error(f"Model not found during batch processing: {e}")
|
927
|
+
self.items_failed += len(batch)
|
928
|
+
return []
|
893
929
|
|
894
930
|
# Validate batch before processing
|
895
931
|
processable_batch, too_long_items = self._validate_and_split_batch(
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: caption-flow
|
3
|
-
Version: 0.4.0
|
3
|
+
Version: 0.4.1
|
4
4
|
Summary: Self-contained distributed community captioning system
|
5
5
|
Author-email: bghira <bghira@users.github.com>
|
6
6
|
License: MIT
|
@@ -48,6 +48,10 @@ Dynamic: license-file
|
|
48
48
|
|
49
49
|
# CaptionFlow
|
50
50
|
|
51
|
+
<!-- [tests](https://github.com/bghira/CaptionFlow/actions/workflows/tests.yml) -->
|
52
|
+
[codecov](https://codecov.io/github/bghira/CaptionFlow)
|
53
|
+
[PyPI version](https://badge.fury.io/py/caption-flow)
|
54
|
+
|
51
55
|
scalable, fault-tolerant **vLLM-powered image captioning**.
|
52
56
|
|
53
57
|
a fast websocket-based orchestrator paired with lightweight gpu workers achieves exceptional performance for batched requests through vLLM.
|
@@ -1,12 +1,12 @@
|
|
1
|
-
caption_flow/__init__.py,sha256=
|
2
|
-
caption_flow/cli.py,sha256=
|
1
|
+
caption_flow/__init__.py,sha256=AanaoBXNzR2j3ow-uWQQXmYpv6sUXLfLrqACm55_BMY,303
|
2
|
+
caption_flow/cli.py,sha256=q3M6ekz70huVGD7NBqsO5xZUqMYBhLqe0ZGo85Vb69g,56072
|
3
3
|
caption_flow/models.py,sha256=6-IJj_B3HAarucoLo8_PncJRnxofHuLFCsyRnmUXgRk,7063
|
4
4
|
caption_flow/monitor.py,sha256=j5RExadSLOUujVZQMe7btMeKNlq-WbZ9bYqfikgYJ8Q,7972
|
5
5
|
caption_flow/orchestrator.py,sha256=MWQKaAclI9rMjn7mWdvoSzl9y4b7bU_24aVr8I1YGhE,39645
|
6
6
|
caption_flow/viewer.py,sha256=40w2Zj7GaXbK-dgqvYYdFrMzSDE_ZPWNZc6kS0OrymQ,20281
|
7
7
|
caption_flow/processors/__init__.py,sha256=l1udEZLxAmqwFYS4-3GsRVcPT6WxnDOIk0s0UqsZsJM,423
|
8
8
|
caption_flow/processors/base.py,sha256=Zx6kRZSqG969x8kYJ5VY2Mo5mLeWEgBCEpo8D4GjsBM,6935
|
9
|
-
caption_flow/processors/huggingface.py,sha256=
|
9
|
+
caption_flow/processors/huggingface.py,sha256=i-DZRt5nTnPN8180Yf8FKBiYPUPmxfKMEZ68CUZECWk,61603
|
10
10
|
caption_flow/processors/local_filesystem.py,sha256=auAWxnqplEH4YJ1DWZCaFmAd03iyhNLudgt71N8O7NE,27827
|
11
11
|
caption_flow/processors/webdataset.py,sha256=66y_7KaJBBntJqBHYKLzCXkBi9ly-TfYYaTCp_7pqTo,34206
|
12
12
|
caption_flow/storage/__init__.py,sha256=IVnzcSCPpPuyp-QLlgJirRZ9Sb3tR0F4sfuF5u2cNMk,36
|
@@ -23,11 +23,11 @@ caption_flow/utils/json_utils.py,sha256=AaGcNTToUcVYCQj2TXs2D_hxc_LeEqFquiK4CquS
|
|
23
23
|
caption_flow/utils/prompt_template.py,sha256=mq7FPnpjp8gVCMMh4NtRf0vL_B9LDMuBkbySvACRSZM,4401
|
24
24
|
caption_flow/utils/vllm_config.py,sha256=xFOnmniQGkUGwfTabfW6R0V01TF-_rN1UYJy0HwOvUI,6026
|
25
25
|
caption_flow/workers/base.py,sha256=Yh_PBsL3j1kXUuIOQHqIdR69Nepfq11je23i01iWSxw,7714
|
26
|
-
caption_flow/workers/caption.py,sha256=
|
26
|
+
caption_flow/workers/caption.py,sha256=qph-TVMUqObRQBgriXOJtCgkWOo3qBdTg883D1TuXlw,48994
|
27
27
|
caption_flow/workers/data.py,sha256=iWnTM7UgpJeFzhSTly-gHzFu5sIYUGG-XO4yRNn_MQk,14775
|
28
|
-
caption_flow-0.4.
|
29
|
-
caption_flow-0.4.
|
30
|
-
caption_flow-0.4.
|
31
|
-
caption_flow-0.4.
|
32
|
-
caption_flow-0.4.
|
33
|
-
caption_flow-0.4.
|
28
|
+
caption_flow-0.4.1.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
29
|
+
caption_flow-0.4.1.dist-info/METADATA,sha256=2mg45AYJVVZrgBzD611qFaWfNFId_3Xhl8xpwlFNrjg,10123
|
30
|
+
caption_flow-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
31
|
+
caption_flow-0.4.1.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
|
32
|
+
caption_flow-0.4.1.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
|
33
|
+
caption_flow-0.4.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|