caption-flow 0.2.3__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: caption-flow
- Version: 0.2.3
+ Version: 0.3.1
  Summary: Self-contained distributed community captioning system
  Author-email: bghira <bghira@users.github.com>
  License: MIT
@@ -33,6 +33,9 @@ Requires-Dist: arrow<2.0.0,>=1.3.0
  Requires-Dist: datasets<5.0.0,>=4.0.0
  Requires-Dist: boto3<2.0.0,>=1.40.11
  Requires-Dist: torchdata<0.12.0,>=0.11.0
+ Requires-Dist: textual<6.0.0,>=5.3.0
+ Requires-Dist: urwid<4.0.0,>=3.0.2
+ Requires-Dist: webshart<0.5.0,>=0.4.0
  Provides-Extra: dev
  Requires-Dist: pytest>=7.4.0; extra == "dev"
  Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
@@ -44,12 +47,13 @@ Dynamic: license-file
 
  # CaptionFlow
 
- scalable, fault-tolerant **vLLM-powered image captioning**. this "first round" focuses on a fast websocket orchestrator plus lightweight gpu workers that batch requests through vLLM.
+ scalable, fault-tolerant **vLLM-powered image captioning**.
+
+ a fast websocket-based orchestrator paired with lightweight gpu workers achieves high throughput by batching requests through vLLM.
 
  * **orchestrator**: hands out work in chunked shards, collects captions, checkpoints progress, and keeps simple stats.
  * **workers (vLLM)**: connect to the orchestrator, stream in image samples, batch them, and generate 1..N captions per image using prompts supplied by the orchestrator.
  * **config-driven**: all components read YAML config; flags can override.
- * **tui monitor (optional)**: a monitor client is wired into the CLI; ship a `monitor` module to enable it.
 
  > no conda. just `venv` + `pip`.
 
@@ -103,6 +107,25 @@ caption-flow worker --config my-worker.yaml --server ws://your.hostname.address:
  caption-flow monitor --config my-monitor.yaml
  ```
 
+ 5. export the data
+
+ ```bash
+ % caption-flow export --help
+ Usage: caption-flow export [OPTIONS]
+
+   Export caption data to various formats.
+
+ Options:
+   --format [jsonl|json|csv|txt|huggingface_hub|all]  Export format (default: jsonl)
+ ```
+
+ * **jsonl**: creates a JSON Lines file at the specified `--output` path
+ * **csv**: exports CSV-compatible data columns to the `--output` path; the metadata is incomplete
+ * **json**: creates a `.json` file for each sample inside the `--output` subdirectory with **complete** metadata; useful for webdatasets
+ * **txt**: creates a `.txt` file for each sample inside the `--output` subdirectory containing ONLY captions
+ * **huggingface_hub**: creates a dataset on the Hugging Face Hub; pass `--private` and `--nsfw` where appropriate
+ * **all**: creates all export formats in the specified `--output` directory
+
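A hedged sketch of consuming a `jsonl` export in Python (the field names `item_key` and `caption` are assumptions drawn from the storage schema, not a documented contract; `load_captions` is a hypothetical helper):

```python
import json

def load_captions(path):
    """Read a JSON Lines export: one JSON object per line."""
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:  # tolerate blank lines between records
                records.append(json.loads(line))
    return records

# hypothetical usage with a file produced by `caption-flow export --format jsonl`:
# for rec in load_captions("captions.jsonl"):
#     print(rec.get("item_key"), rec.get("caption"))
```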
  ---
 
  ## how it’s wired
@@ -111,20 +134,11 @@ caption-flow monitor --config my-monitor.yaml
 
  * **websocket server** (default `0.0.0.0:8765`) with three client roles: workers, data-feeders, and admin.
  * **dataset control**: the orchestrator centrally defines the dataset (`huggingface` or `local`) and version/name. it chunk-slices shards and assigns work.
+ * **data serving to remote workers**: local files are automatically served over HTTP to remote workers that don't have access to them.
  * **vLLM config broadcast**: model, tp size, dtype, max seq len, memory targets, batching, sampling params, and **inference prompts** are all pushed to workers; workers can apply many changes without a model reload.
  * **storage + checkpoints**: captions buffer to disk with periodic checkpoints. chunk state is tracked so restarts don’t double-work.
  * **auth**: token lists for `worker`, `monitor`, and `admin` roles.
 
- start flags you’ll likely use:
-
- ```text
- --config PATH            # yaml config for the orchestrator
- --port INT, --host STR   # bind controls
- --data-dir PATH          # overrides storage.data_dir
- --cert PATH, --key PATH  # enable TLS (or use --no-ssl for ws:// in dev)
- --vllm                   # use the vLLM-style orchestrator (webdataset/hf)
- ```
-
  ### vLLM worker
 
  * **one process per gpu**. select the device with `--gpu-id` (or `worker.gpu_id` in YAML).
@@ -132,27 +146,15 @@ start flags you’ll likely use:
  * **resilient**: detects disconnects, abandons the current chunk cleanly, clears queues, reconnects, and resumes.
  * **batched generate()**: images are resized down for consistent batching; each image can get multiple captions (one per prompt).
 
- start flags you’ll likely use:
-
- ```text
- --config PATH                 # yaml for the worker
- --server URL                  # ws(s)://host:port
- --token STR                   # must match an allowed worker token on the orchestrator
- --name STR                    # display name
- --batch-size INT              # override vLLM batch size
- --vllm                        # use the vLLM worker implementation
- --gpu-id INT                  # which gpu to use
- --precision STR, --model STR  # optional overrides for dtype/model
- --no-verify-ssl               # accept self-signed certs in dev
- ```
-
- ### (optional) monitor
+ ---
 
- * a CLI entry exists for a TUI monitor; wire in a `monitor` module to enable it. config lives in `monitor.yaml` or inside `orchestrator.yaml` under `monitor:`.
+ ## dataset formats
 
- ---
+ * huggingface hub or local URL-list datasets compatible with the `datasets` library
+ * webdataset shards containing full image data; these can also be hosted on the hub
+ * a local folder of images; the orchestrator serves the data to workers
 
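For orientation, the 0.2.x example config (shown later in this diff, removed in 0.3.x) declared the dataset like this; the 0.3.x schema may differ, so treat the field names as a sketch only:

```yaml
dataset:
  type: huggingface              # or: local
  path: <hf-dataset-or-local-path>
  name: <logical-name>
  version: "1.0"
```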
- ## configuration
+ ## configuration path
 
  ### config discovery order
 
@@ -166,98 +168,6 @@ for any component, the CLI looks for config in this order (first match wins):
  6. any `$XDG_CONFIG_DIRS` entries under `caption-flow/`
  7. `./examples/<component>.yaml` (fallback)
 
- ### orchestrator.yaml (highlights)
-
- ```yaml
- orchestrator:
-   host: 0.0.0.0
-   port: 8765
-   # ssl:
-   #   cert: /path/fullchain.pem
-   #   key: /path/privkey.pem
-
- dataset:
-   type: huggingface
-   path: <hf-dataset-or-local-path>
-   name: <logical-name>
-   version: "1.0"
-
- vllm:
-   model: Qwen/Qwen2.5-VL-3B-Instruct
-   tensor_parallel_size: 1
-   max_model_len: 16384
-   dtype: float16
-   gpu_memory_utilization: 0.92
-   enforce_eager: true
-   disable_mm_preprocessor_cache: true
-   limit_mm_per_prompt: { image: 1 }
-
-   batch_size: 8
-
-   sampling:
-     temperature: 0.7
-     top_p: 0.95
-     max_tokens: 256
-     repetition_penalty: 1.05
-     skip_special_tokens: true
-     stop: ["<|end|>", "<|endoftext|>", "<|im_end|>"]
-
-   inference_prompts:
-     - "describe this image in detail"
-     - "provide a comprehensive description of the visual content"
-     - "what are the key elements in this image?"
-
- storage:
-   data_dir: ./caption_data
-   checkpoint_dir: ./checkpoints
-   caption_buffer_size: 100
-   checkpoint_interval: 1000
-
- # chunking/queueing
- chunk_size: 1000
- chunks_per_request: 2
- chunk_buffer_multiplier: 3
- min_chunk_buffer: 10
-
- auth:
-   worker_tokens:
-     - { token: "example-worker-token", name: "Example Worker" }
-   monitor_tokens:
-     - { token: "letmein", name: "Default monitor" }
-   admin_tokens:
-     - { token: "admin-secret-2024", name: "Admin" }
- ```
-
- ### worker.yaml (highlights)
-
- ```yaml
- worker:
-   server: ws://localhost:8765   # use wss:// in prod
-   token: example-worker-token
-   name: local-gpu
-   gpu_id: 0
-   vllm: true
-
-   # local queues
-   readahead_size: 256
-   inference_queue_size: 128
- ```
-
- ### monitor.yaml (optional)
-
- ```yaml
- monitor:
-   server: ws://localhost:8765
-   token: letmein
-   refresh_rate: 1.0
-   show_contributors: true
-   show_quality_metrics: true
-   max_activity_items: 20
-   show_chunk_progress: true
-   show_worker_queues: true
-   show_throughput_graph: true
- ```
-
  ---
 
  ## tls / certificates
@@ -300,66 +210,24 @@ PRs welcome. keep it simple and fast.
  ```
  ┌─────────────┐     WebSocket      ┌─────────────┐
  │   Worker    │◄──────────────────►│             │
- └─────────────┘                    │             │     ┌──────────────┐
-                                    │ Orchestrator│────►│Arrow/Parquet │
- ┌─────────────┐                    │             │     │   Storage    │
- │   Worker    │◄──────────────────►│             │     └──────────────┘
- └─────────────┘                    └─────────────┘
+ │             │                    │             │     ┌──────────────┐
+ │             │◄───────────────────│             │────►│Arrow/Parquet │
+ └─────────────┘  HTTP (img data)   │ Orchestrator│     │   Storage    │
+                                    │             │     └──────────────┘
+ ┌─────────────┐                    │             │
+ │   Worker    │◄──────────────────►│             │
+ │             │                    │             │
+ │             │◄───────────────────│             │
+ └─────────────┘  HTTP (img data)   └─────────────┘
 
  ┌─────────────┐                           │
  │   Monitor   │◄──────────────────────────┘
  └─────────────┘
  ```
 
- ## Storage Schema
-
- ### captions.parquet
-
- * `job_id`: Unique job identifier
- * `dataset`: Dataset name
- * `shard`: Shard identifier
- * `item_key`: Item within shard
- * `caption`: Generated caption text
- * `contributor_id`: Worker who generated it
- * `timestamp`: Generation time
- * `quality_score`: Optional quality metric
-
- ### jobs.parquet
-
- * `job_id`: Unique identifier
- * `dataset`: Dataset name
- * `shard`: Shard identifier
- * `status`: pending/processing/completed/failed
- * `assigned_to`: Worker ID
- * `timestamp`: Status change time
-
- ### contributors.parquet
-
- * `contributor_id`: Unique identifier
- * `name`: Display name
- * `total_captions`: Lifetime count
- * `trust_level`: Quality tier (0-5)
-
- ## Development
-
- ```bash
- # Install with dev dependencies
- pip install -e ".[dev]"
-
- # Run tests
- pytest
-
- # Format code
- black src/
- ruff --fix src/
-
- # Type checking
- mypy src/
- ```
-
- ## Community Contribution
+ ## Community Clusters
 
- To contribute compute:
+ To contribute compute to a cluster:
 
  1. Install caption-flow: `pip install caption-flow`
  2. Get a worker token from the project maintainer
@@ -369,4 +237,4 @@ Your contributions will be tracked and attributed in the final dataset!
 
  ## License
 
- MIT
+ AGPLv3
@@ -0,0 +1,33 @@
+ caption_flow/__init__.py,sha256=pL77m-1slbrkzValJF7YfHpcp3yol6iTvSyjHpjJFOA,303
+ caption_flow/cli.py,sha256=t_cYCxJE7f5UtB3br2Es51JjO5KPsWM1JTdDXAxM_Lw,41371
+ caption_flow/models.py,sha256=2n6iphTEL62xK2FFcJM6axMsaE8KwsUv5Ak_cCF-TdQ,5652
+ caption_flow/monitor.py,sha256=bAt9EJqfPgT_KdbknGdCxwBRH002pRDgyUmYIj6Dyso,7885
+ caption_flow/orchestrator.py,sha256=34gZvaW14YZ7a7LagYOO3VKKwlbuS4aw0yoP1L8gwf0,36192
+ caption_flow/viewer.py,sha256=HxO98eHR1xtivG0dEdYC2U9T_RgeRfJqqTK-37u9bNM,20471
+ caption_flow/processors/__init__.py,sha256=hvq-OuAJWQe6hFglKe7QmkS8473k20FmxZDSxfXpCrg,423
+ caption_flow/processors/base.py,sha256=JlTqCHo5HRXrXMVzgle_6pNwh4HGHsF7jLF6PeSnWr0,6783
+ caption_flow/processors/huggingface.py,sha256=Q1PNQRXZT4NzEzGKtF1A1e8K_5-hgeM4G4lz_CZYuN4,41203
+ caption_flow/processors/local_filesystem.py,sha256=EYmsImbkqsIU7UZL2FijL0hotKLtPOtkzfwernQDSxA,27860
+ caption_flow/processors/webdataset.py,sha256=1JS3TmQe-fComBKzLPUMhUHx1T0Wf7m9nFkusM7tTXI,26152
+ caption_flow/storage/__init__.py,sha256=IVnzcSCPpPuyp-QLlgJirRZ9Sb3tR0F4sfuF5u2cNMk,36
+ caption_flow/storage/exporter.py,sha256=mFJqMDQ61cP-qcXe118_-oL1TUqULdQZ8LdjSTym44I,19697
+ caption_flow/storage/manager.py,sha256=KPExcKPuFVQSsBnfCBdne5PO4PwN4NTfd-EJQk13OY0,47459
+ caption_flow/utils/__init__.py,sha256=bDcO5uR455TKCQ2hX-_XcdTnRXDBaT8Yn4jWqWzfFsE,120
+ caption_flow/utils/auth.py,sha256=UrxX2n8OEEcfMD1Ey27TxGfrJFmUCpC59x-SCrQJoVE,2253
+ caption_flow/utils/caption_utils.py,sha256=esUMAdcCkNjRroZ0Bhxv0_yKlLtMf0XeDCTt-5k6bik,5309
+ caption_flow/utils/certificates.py,sha256=eu4blQZEkL9NRaY1ynQWg1asvDorRYhGRZea7STonJE,4635
+ caption_flow/utils/checkpoint_tracker.py,sha256=-nN5gLvXyMdKOCT2SNNL2Km6UYm2Hii9wuXeezWhwx4,3339
+ caption_flow/utils/chunk_tracker.py,sha256=lyso_V-ckYUVrDmlCCsaZKF9E_sR4ipef5W6BiVAS5M,19944
+ caption_flow/utils/image_processor.py,sha256=wmOExkVfM7OeuLfX3AwMefsH-TxL8TNcn22gp0NmJKY,1541
+ caption_flow/utils/json_utils.py,sha256=IiZYn8uCM-3pYmyIbX2fmaOIyutArn67SqAyp0ggNpU,5396
+ caption_flow/utils/prompt_template.py,sha256=AKp0diSZqNBMwZkpiTNjw8-bbQwHStr7QZTOJ7o1dC4,4345
+ caption_flow/utils/vllm_config.py,sha256=TC7Rmjk0zRKbBXbWUXrFL4Z58hzax_-4L0pXZn09hdM,6019
+ caption_flow/workers/base.py,sha256=2AGWERC5hbmO-0V_A1MUbgRVvRNN3blqGPyDokvvzmM,7575
+ caption_flow/workers/caption.py,sha256=4nETqDmHgb2dVgT7_zxzr3bcrTtWSxr3FSdB811boEw,38436
+ caption_flow/workers/data.py,sha256=0Tg8NE0wdONeMlivYQ4nvbcfWdLuU51O7vR8_YSnJgo,14813
+ caption_flow-0.3.1.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+ caption_flow-0.3.1.dist-info/METADATA,sha256=Bc8LSEqMhK1rmzhyu9-P-amdGpjML_AVWorr93jrYGo,9708
+ caption_flow-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ caption_flow-0.3.1.dist-info/entry_points.txt,sha256=KnVlyrGKZj6p2zNyuEnCx4Y6jvJ4V-mcfN0lddPKTlQ,55
+ caption_flow-0.3.1.dist-info/top_level.txt,sha256=_bXpKRutqded0FQ80dCChIz26ETV7tL4d4e2E_Y1FXs,13
+ caption_flow-0.3.1.dist-info/RECORD,,
@@ -1,222 +0,0 @@
- """Dataset loading utilities for WebDataset and HuggingFace."""
-
- import asyncio
- import shlex
- import logging
- from pathlib import Path
- from typing import List, Dict, Any, Generator, Optional, Tuple
- import json
-
- import webdataset as wds
- from huggingface_hub import HfFileSystem, get_token, hf_hub_url
-
- logger = logging.getLogger(__name__)
-
-
- class DatasetLoader:
-     """Handles loading datasets from various sources."""
-
-     def __init__(
-         self,
-         dataset_path: str,
-         dataset_type: str = "huggingface",
-         split: str = "train",
-         image_column: str = "image",
-         cache_dir: Optional[Path] = None,
-     ):
-         """
-         Initialize dataset loader.
-
-         Args:
-             dataset_path: Path to dataset (HF repo, local dir, etc.)
-             dataset_type: Type of dataset ("huggingface", "webdataset", "local")
-             split: Split to use for HuggingFace datasets (default: "train")
-             image_column: Column name containing image data or URLs (default: "image")
-         """
-         self.dataset_path = dataset_path
-         self.dataset_type = dataset_type
-         self.split = split
-         self.image_column = image_column
-         self.token = get_token()
-         self.dataset_format = None  # Will be detected: "webdataset" or "huggingface_datasets"
-
-         if not self.token and dataset_type == "huggingface":
-             logger.warning("No HuggingFace token found; run `huggingface-cli login`")
-
-         # Detect the actual format if it's a HuggingFace dataset
-         if dataset_type == "huggingface":
-             self.dataset_format = self._detect_dataset_format()
-             logger.info(f"Detected dataset format: {self.dataset_format}")
-
-     def _detect_dataset_format(self) -> str:
-         """Detect whether it's WebDataset or HuggingFace datasets format."""
-         fs = HfFileSystem(token=self.token)
-
-         # Check for .tar files (WebDataset)
-         tar_files = list(fs.glob(f"hf://datasets/{self.dataset_path}/**/*.tar"))
-         if tar_files:
-             return "webdataset"
-
-         # Check for .parquet files (Huggingface Arrow DB)
-         parquet_files = list(fs.glob(f"hf://datasets/{self.dataset_path}/**/*.parquet"))
-         if parquet_files:
-             return "huggingface_datasets"
-
-         raise AssertionError(f"Could not detect dataset format for {self.dataset_path}")
-
-     def get_shard_list(self) -> List[str]:
-         """Get list of all shards in the dataset."""
-         if self.dataset_type == "huggingface":
-             if self.dataset_format == "webdataset":
-                 return self._get_hf_webdataset_shards()
-             else:
-                 logger.error(f"Unknown dataset format: {self.dataset_format}")
-                 return []
-         elif self.dataset_type == "local":
-             return self._get_local_shards()
-         else:
-             raise ValueError(f"Unknown dataset type: {self.dataset_type}")
-
-     def _get_hf_webdataset_shards(self) -> List[str]:
-         """Get shard URLs from HuggingFace WebDataset."""
-         logger.info(f"Getting WebDataset shard list from HuggingFace: {self.dataset_path}")
-
-         fs = HfFileSystem(token=self.token)
-         files = [fs.resolve_path(p) for p in fs.glob(f"hf://datasets/{self.dataset_path}/**/*.tar")]
-
-         urls = [hf_hub_url(f.repo_id, f.path_in_repo, repo_type="dataset") for f in files]
-
-         logger.info(f"Found {len(urls)} WebDataset shards")
-         return sorted(urls)
-
-     def _get_local_shards(self) -> List[str]:
-         """Get shard files from local directory."""
-         path = Path(self.dataset_path)
-         if not path.exists():
-             raise ValueError(f"Local dataset path does not exist: {path}")
-
-         shards = list(path.glob("*.tar"))
-         logger.info(f"Found {len(shards)} local shards")
-         return [str(s) for s in sorted(shards)]
-
-     def load_shard(self, shard_url: str, processed_keys: Optional[set] = None) -> wds.DataPipeline:
-         """
-         Load a single shard as a WebDataset pipeline.
-
-         Args:
-             shard_url: URL or path to the shard
-             processed_keys: Set of already processed keys to skip
-         """
-         if processed_keys is None:
-             processed_keys = set()
-
-         if self.dataset_type == "huggingface" and self.dataset_format == "webdataset":
-             # Use curl with auth token for HuggingFace
-             url_cmd = f"pipe:curl -s -L -H 'Authorization:Bearer {shlex.quote(self.token)}' {shlex.quote(shard_url)} || true"
-             ds = wds.DataPipeline(
-                 wds.SimpleShardList(url_cmd),
-                 wds.tarfile_to_samples(),
-                 wds.to_tuple("__key__", "__url__", "jpg;png;jpeg;webp;jxl"),
-                 wds.select(lambda x: x[0] not in processed_keys),
-             )
-         else:
-             # Local file access
-             ds = wds.DataPipeline(
-                 wds.SimpleShardList(shard_url),
-                 wds.tarfile_to_samples(),
-                 wds.to_tuple("__key__", "__url__", "jpg;png;jpeg;webp;jxl"),
-                 wds.select(lambda x: x[0] not in processed_keys),
-             )
-
-         return ds
-
-     def iterate_shard(
-         self,
-         shard_url: str,
-         processed_keys: Optional[set] = None,
-         unprocessed_ranges: Optional[List[Tuple[int, int]]] = None,
-     ) -> Generator[Dict[str, Any], None, None]:
-         """
-         Iterate over items in a shard, returning full sample dictionaries.
-
-         Args:
-             shard_url: URL or identifier of the shard
-             processed_keys: Set of already processed keys to skip
-             unprocessed_ranges: Specific ranges to process (for range-based processing)
-
-         Yields:
-             Dictionary containing the full WebDataset sample
-         """
-         if processed_keys is None:
-             processed_keys = set()
-
-         if self.dataset_type == "huggingface" and self.dataset_format == "webdataset":
-             # Use curl with auth token for HuggingFace
-             url_cmd = f"pipe:curl -s -L -H 'Authorization:Bearer {shlex.quote(self.token)}' {shlex.quote(shard_url)} || true"
-             ds = wds.DataPipeline(
-                 wds.SimpleShardList(url_cmd),
-                 wds.tarfile_to_samples(),
-                 wds.select(lambda x: x.get("__key__", "") not in processed_keys),
-             )
-         else:
-             # Local file access
-             ds = wds.DataPipeline(
-                 wds.SimpleShardList(shard_url),
-                 wds.tarfile_to_samples(),
-                 wds.select(lambda x: x.get("__key__", "") not in processed_keys),
-             )
-
-         # Return full samples as dictionaries
-         for sample in ds:
-             # Ensure it's a dict and has required fields
-             if isinstance(sample, dict) and "__key__" in sample:
-                 yield sample
-
-     def count_shard_items(self, shard_url: str, processed_keys: Optional[set] = None) -> int:
-         """Count items in a shard (can be slow for large shards)."""
-         count = 0
-         try:
-             for _ in self.iterate_shard(shard_url, processed_keys):
-                 count += 1
-         except Exception as e:
-             logger.error(f"Error counting shard {shard_url}: {e}")
-         return count
-
-     def get_dataset_info(self) -> Dict[str, Any]:
-         """Get information about the dataset."""
-         info = {
-             "dataset_path": self.dataset_path,
-             "dataset_type": self.dataset_type,
-             "dataset_format": self.dataset_format,
-         }
-
-         if self.dataset_format == "huggingface_datasets":
-             # Include cached metadata if available
-             if hasattr(self, "_hf_metadata"):
-                 info.update(self._hf_metadata)
-             else:
-
-                 try:
-                     # Try to get more info about the dataset
-                     dataset_info = load_dataset(
-                         self.dataset_path, split=self.split, streaming=True, token=self.token
-                     )
-                     # Get features info
-                     if hasattr(dataset_info, "features"):
-                         info["features"] = str(dataset_info.features)
-
-                     # Try to get total size (might not work for all datasets)
-                     try:
-                         # This might be expensive for large datasets
-                         total_examples = len(
-                             load_dataset(self.dataset_path, split=self.split, token=self.token)
-                         )
-                         info["total_examples"] = total_examples
-                         self._hf_total_items = total_examples
-                     except:
-                         info["total_examples"] = "unknown"
-
-                 except Exception as e:
-                     logger.error(f"Error getting dataset info: {e}")
-
-         return info
@@ -1,67 +0,0 @@
- """Dataset metadata caching for efficient HuggingFace dataset handling."""
-
- import json
- import logging
- from pathlib import Path
- from typing import Dict, Any, Optional, List
- from datetime import datetime
-
- logger = logging.getLogger(__name__)
-
-
- class DatasetMetadataCache:
-     """Caches dataset metadata to avoid repeated full iterations."""
-
-     def __init__(self, cache_dir: Path):
-         self.cache_dir = Path(cache_dir)
-         self.cache_dir.mkdir(parents=True, exist_ok=True)
-         self.cache_file = self.cache_dir / "dataset_metadata.json"
-         self.metadata: Dict[str, Any] = {}
-         self._load_cache()
-
-     def _load_cache(self):
-         """Load cached metadata from disk."""
-         if self.cache_file.exists():
-             try:
-                 with open(self.cache_file, "r") as f:
-                     self.metadata = json.load(f)
-                 logger.info(f"Loaded dataset metadata cache with {len(self.metadata)} datasets")
-             except Exception as e:
-                 logger.error(f"Failed to load metadata cache: {e}")
-                 self.metadata = {}
-
-     def _save_cache(self):
-         """Save metadata cache to disk."""
-         try:
-             with open(self.cache_file, "w") as f:
-                 json.dump(self.metadata, f, indent=2)
-             logger.debug("Saved dataset metadata cache")
-         except Exception as e:
-             logger.error(f"Failed to save metadata cache: {e}")
-
-     def get_dataset_key(self, dataset_path: str, split: str) -> str:
-         """Generate a unique key for a dataset+split combination."""
-         return f"{dataset_path}:{split}"
-
-     def get_metadata(self, dataset_path: str, split: str) -> Optional[Dict[str, Any]]:
-         """Get cached metadata for a dataset."""
-         key = self.get_dataset_key(dataset_path, split)
-         return self.metadata.get(key)
-
-     def set_metadata(self, dataset_path: str, split: str, metadata: Dict[str, Any]):
-         """Cache metadata for a dataset."""
-         key = self.get_dataset_key(dataset_path, split)
-         metadata["cached_at"] = datetime.utcnow().isoformat()
-         metadata["dataset_path"] = dataset_path
-         metadata["split"] = split
-         self.metadata[key] = metadata
-         self._save_cache()
-         logger.info(f"Cached metadata for {key}: {metadata.get('total_items', 0)} items")
-
-     def invalidate(self, dataset_path: str, split: str):
-         """Remove cached metadata for a dataset."""
-         key = self.get_dataset_key(dataset_path, split)
-         if key in self.metadata:
-             del self.metadata[key]
-             self._save_cache()
-             logger.info(f"Invalidated metadata cache for {key}")
@@ -1,41 +0,0 @@
- """Job queue management."""
-
- import asyncio
- from typing import Optional
- from collections import deque
-
- from ..models import Job
-
-
- class JobQueue:
-     """Priority job queue with backpressure."""
-
-     def __init__(self):
-         self.queue = deque()
-         self.processing = set()
-         self.lock = asyncio.Lock()
-
-     async def add(self, job: Job):
-         """Add job to queue."""
-         async with self.lock:
-             self.queue.append(job)
-
-     async def get_next(self) -> Optional[Job]:
-         """Get next available job."""
-         async with self.lock:
-             if self.queue:
-                 job = self.queue.popleft()
-                 self.processing.add(job.job_id)
-                 return job
-             return None
-
-     async def complete(self, job_id: str):
-         """Mark job as complete."""
-         async with self.lock:
-             self.processing.discard(job_id)
-
-     async def requeue(self, job: Job):
-         """Requeue a job (for failures)."""
-         async with self.lock:
-             self.processing.discard(job.job_id)
-             self.queue.appendleft(job)  # Priority requeue
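To illustrate the requeue-at-front behavior of the removed `JobQueue`, here is a condensed, self-contained copy of its logic with a stand-in `Job` namedtuple (the real `Job` lives in `caption_flow.models`); a failed job re-enters at the head of the queue and is retried before newer work:

```python
import asyncio
from collections import deque, namedtuple

# stand-in for caption_flow.models.Job (illustrative only)
Job = namedtuple("Job", ["job_id"])

class JobQueue:
    """Condensed copy of the removed queue: failures requeue at the front."""
    def __init__(self):
        self.queue = deque()
        self.processing = set()
        self.lock = asyncio.Lock()

    async def add(self, job):
        async with self.lock:
            self.queue.append(job)

    async def get_next(self):
        async with self.lock:
            if self.queue:
                job = self.queue.popleft()
                self.processing.add(job.job_id)
                return job
            return None

    async def requeue(self, job):
        async with self.lock:
            self.processing.discard(job.job_id)
            self.queue.appendleft(job)  # failed work jumps the line

async def demo():
    q = JobQueue()
    await q.add(Job("a"))
    await q.add(Job("b"))
    first = await q.get_next()  # dequeues "a"
    await q.requeue(first)      # simulate a failure: "a" returns to the front
    return [(await q.get_next()).job_id, (await q.get_next()).job_id]

order = asyncio.run(demo())  # "a" is retried before "b"
```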