caption-flow 0.3.4-py3-none-any.whl → 0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. caption_flow/__init__.py +3 -3
  2. caption_flow/cli.py +934 -415
  3. caption_flow/models.py +45 -3
  4. caption_flow/monitor.py +2 -3
  5. caption_flow/orchestrator.py +153 -104
  6. caption_flow/processors/__init__.py +3 -3
  7. caption_flow/processors/base.py +8 -7
  8. caption_flow/processors/huggingface.py +439 -67
  9. caption_flow/processors/local_filesystem.py +24 -28
  10. caption_flow/processors/webdataset.py +28 -22
  11. caption_flow/storage/exporter.py +420 -339
  12. caption_flow/storage/manager.py +636 -756
  13. caption_flow/utils/__init__.py +1 -1
  14. caption_flow/utils/auth.py +1 -1
  15. caption_flow/utils/caption_utils.py +1 -1
  16. caption_flow/utils/certificates.py +15 -8
  17. caption_flow/utils/checkpoint_tracker.py +30 -28
  18. caption_flow/utils/chunk_tracker.py +153 -56
  19. caption_flow/utils/image_processor.py +9 -9
  20. caption_flow/utils/json_utils.py +37 -20
  21. caption_flow/utils/prompt_template.py +24 -16
  22. caption_flow/utils/vllm_config.py +5 -4
  23. caption_flow/viewer.py +4 -12
  24. caption_flow/workers/base.py +5 -4
  25. caption_flow/workers/caption.py +265 -90
  26. caption_flow/workers/data.py +6 -8
  27. {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/METADATA +5 -4
  28. caption_flow-0.4.0.dist-info/RECORD +33 -0
  29. caption_flow-0.3.4.dist-info/RECORD +0 -33
  30. {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/WHEEL +0 -0
  31. {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/entry_points.txt +0 -0
  32. {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
  33. {caption_flow-0.3.4.dist-info → caption_flow-0.4.0.dist-info}/top_level.txt +0 -0
caption_flow/utils/__init__.py
@@ -1,4 +1,4 @@
  """Utility modules for CaptionFlow."""
 
- from .chunk_tracker import ChunkTracker
  from .caption_utils import CaptionUtils
+ from .chunk_tracker import ChunkTracker

caption_flow/utils/auth.py
@@ -1,7 +1,7 @@
  """Authentication management."""
 
- from typing import Dict, Any, Optional
  from dataclasses import dataclass
+ from typing import Any, Dict, Optional
 
 
  @dataclass

caption_flow/utils/caption_utils.py
@@ -1,6 +1,6 @@
  """Caption processing utilities from the original vLLM script."""
 
- from typing import List, Dict
+ from typing import Dict, List
 
 
  class CaptionUtils:

caption_flow/utils/certificates.py
@@ -1,13 +1,15 @@
  """SSL certificate management."""
 
+ import datetime as _datetime
  import subprocess
+ from datetime import datetime, timedelta
  from pathlib import Path
  from typing import Optional
+
  from cryptography import x509
- from cryptography.x509.oid import NameOID
  from cryptography.hazmat.primitives import hashes, serialization
  from cryptography.hazmat.primitives.asymmetric import rsa
- from datetime import datetime, timedelta
+ from cryptography.x509.oid import NameOID
 
 
  class CertificateManager:
@@ -35,8 +37,8 @@ class CertificateManager:
  .issuer_name(issuer)
  .public_key(key.public_key())
  .serial_number(x509.random_serial_number())
- .not_valid_before(datetime.utcnow())
- .not_valid_after(datetime.utcnow() + timedelta(days=365))
+ .not_valid_before(datetime.now(_datetime.UTC))
+ .not_valid_after(datetime.now(_datetime.UTC) + timedelta(days=365))
  .add_extension(
  x509.SubjectAlternativeName(
  [
@@ -71,14 +73,15 @@ class CertificateManager:
  def generate_letsencrypt(
  self, domain: str, email: str, output_dir: Optional[Path] = None, staging: bool = False
  ) -> tuple[Path, Path]:
- """
- Generate Let's Encrypt certificate.
+ """Generate Let's Encrypt certificate.
 
  Args:
+ ----
  domain: Domain name for certificate
  email: Email for Let's Encrypt account
  output_dir: Custom output directory (uses /etc/letsencrypt by default)
  staging: Use Let's Encrypt staging server for testing
+
  """
  cmd = [
  "certbot",
@@ -133,8 +136,12 @@ class CertificateManager:
  return {
  "subject": cert.subject.rfc4514_string(),
  "issuer": cert.issuer.rfc4514_string(),
- "not_before": cert.not_valid_before,
- "not_after": cert.not_valid_after,
+ "not_before": cert.not_valid_before_utc,
+ "not_after": cert.not_valid_after_utc,
  "serial_number": cert.serial_number,
  "is_self_signed": cert.issuer == cert.subject,
  }
+
+ def inspect_certificate(self, cert_path: Path) -> dict:
+ """Inspect a certificate (alias for get_cert_info for CLI compatibility)."""
+ return self.get_cert_info(cert_path)
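
Note: the certificate changes above replace naive datetime.utcnow() timestamps with timezone-aware ones and switch to cryptography's not_valid_before_utc / not_valid_after_utc accessors. A minimal illustrative sketch of the pattern (not part of the package; utcnow() is deprecated since Python 3.12, and datetime.UTC requires Python 3.11+):

    import datetime as _datetime
    from datetime import datetime, timedelta

    # Aware timestamps for certificate validity, mirroring the builder calls above
    not_before = datetime.now(_datetime.UTC)
    not_after = not_before + timedelta(days=365)
    assert not_before.tzinfo is not None  # unlike utcnow(), this is timezone-aware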

caption_flow/utils/checkpoint_tracker.py
@@ -1,14 +1,17 @@
  """Base class for checkpoint tracking with persistent state."""
 
+ import datetime as _datetime
  import json
  import logging
+ import os
  from abc import ABC, abstractmethod
- from pathlib import Path
- from typing import Dict, Any, Optional
- from datetime import datetime
  from concurrent.futures import ThreadPoolExecutor
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Dict, Optional
 
  logger = logging.getLogger(__name__)
+ logger.setLevel(os.environ.get("CAPTIONFLOW_LOG_LEVEL", "INFO").upper())
 
 
  class CheckpointTracker(ABC):
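
Note: several modules in this release (checkpoint_tracker, chunk_tracker, image_processor) now read their log level from the CAPTIONFLOW_LOG_LEVEL environment variable instead of hard-coding it. A small sketch of the pattern, mirroring the added lines:

    import logging
    import os

    logger = logging.getLogger(__name__)
    # Defaults to INFO when CAPTIONFLOW_LOG_LEVEL is unset; setLevel accepts level names
    logger.setLevel(os.environ.get("CAPTIONFLOW_LOG_LEVEL", "INFO").upper())

Setting CAPTIONFLOW_LOG_LEVEL=DEBUG in the worker or orchestrator environment should restore the verbose chunk-tracker logging that was previously always on.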

@@ -53,35 +56,34 @@ class CheckpointTracker(ABC):
 
  def save(self) -> None:
  """Save checkpoint to disk atomically."""
- with self.lock:
- try:
- # Prepare data with metadata
+ try:
+ # If a save is already in progress, let it finish.
+ # This prevents race conditions if save() is called rapidly.
+ if hasattr(self, "_save_future") and self._save_future and not self._save_future.done():
+ logger.warning("Previous save still in progress, skipping this save")
+ return # don't save this time,
+ logger.info("Saving chunk tracker state...")
+ # Prepare data with metadata
+ with self.lock:
  data = self._serialize_state()
- data["updated_at"] = datetime.utcnow().isoformat()
-
- # Write atomically using temp file
- tmp_file = self.checkpoint_path.with_suffix(".tmp")
- # If a save is already in progress, let it finish.
- # This prevents race conditions if save() is called rapidly.
- if (
- hasattr(self, "_save_future")
- and self._save_future
- and not self._save_future.done()
- ):
- self._save_future.result() # Wait for the previous save to complete
-
- # Use an executor to run the save operation in a background thread.
- # This makes the save call non-blocking.
- with ThreadPoolExecutor(max_workers=1) as executor:
- data_to_save = data.copy()
- self._save_future = executor.submit(self._write_to_disk, data_to_save, tmp_file)
- except Exception as e:
- logger.error(f"Failed to submit save task: {e}", exc_info=True)
+ data["updated_at"] = datetime.now(_datetime.UTC).isoformat()
+
+ # Write atomically using temp file
+ tmp_file = self.checkpoint_path.with_suffix(".tmp")
+
+ # Use an executor to run the save operation in a background thread.
+ # This makes the save call non-blocking.
+ with ThreadPoolExecutor(max_workers=1) as executor:
+ data_to_save = data.copy()
+ self._save_future = executor.submit(self._write_to_disk, data_to_save, tmp_file)
+ except Exception as e:
+ logger.error(f"Failed to submit save task: {e}", exc_info=True)
 
- def _write_to_disk(self, data: Dict[str, Any]) -> None:
+ def _write_to_disk(self, data: Dict[str, Any], checkpoint_path: Optional[str] = None) -> None:
  """Write checkpoint data to disk atomically."""
  # Create a temporary file in the same directory as the checkpoint
- tmp_file = self.checkpoint_path.with_suffix(".tmp")
+ tmp_file = (checkpoint_path or self.checkpoint_path).with_suffix(".tmp")
+ logger.debug(f"Checkpoint {tmp_file=}")
 
  try:
  # Ensure the parent directory exists
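
Note: save() is now non-blocking — it skips the call when a previous background save is still running and hands the actual write to a single-worker executor. The write itself goes to a temporary file that replaces the checkpoint in one step. A hedged sketch of that atomic-write idea (helper name and JSON layout are assumptions, not the package's exact code):

    import json
    from pathlib import Path

    def write_checkpoint_atomically(data: dict, checkpoint_path: Path) -> None:
        # Write to a sibling .tmp file, then rename over the real checkpoint;
        # the rename is atomic on POSIX filesystems, so readers never see a partial file.
        tmp_file = checkpoint_path.with_suffix(".tmp")
        tmp_file.parent.mkdir(parents=True, exist_ok=True)
        tmp_file.write_text(json.dumps(data))
        tmp_file.replace(checkpoint_path)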

caption_flow/utils/chunk_tracker.py
@@ -1,17 +1,19 @@
  """Chunk tracking using CheckpointTracker base class with memory optimization."""
 
- from collections import defaultdict
+ import datetime as _datetime
  import logging
- from pathlib import Path
- from typing import Set, Dict, List, Optional, Any, Tuple
+ import os
+ from collections import defaultdict
+ from dataclasses import asdict, dataclass, field
  from datetime import datetime, timedelta
- from dataclasses import dataclass, asdict, field
+ from pathlib import Path
+ from threading import Lock
+ from typing import Any, Dict, List, Optional, Set, Tuple
 
  from .checkpoint_tracker import CheckpointTracker
- from threading import Lock
 
  logger = logging.getLogger(__name__)
- logger.setLevel(logging.DEBUG)
+ logger.setLevel(os.environ.get("CAPTIONFLOW_LOG_LEVEL", "INFO").upper())
 
 
  @dataclass
@@ -32,8 +34,16 @@ class ChunkState:
  assigned_to: Optional[str] = None
  assigned_at: Optional[datetime] = None
 
+ # Cache for expensive range calculations
+ _cached_merged_ranges: Optional[List[Tuple[int, int]]] = field(default=None, init=False)
+ _cached_unprocessed_ranges: Optional[List[Tuple[int, int]]] = field(default=None, init=False)
+ _cache_invalidated: bool = field(default=True, init=False)
+
  def add_processed_range(self, start: int, end: int):
  """Add a processed range and merge if needed."""
+ # Invalidate cache before modifying ranges
+ self._invalidate_cache()
+
  # Add new range
  self.processed_ranges.append((start, end))
 
@@ -58,24 +68,43 @@
 
  def mark_completed(self):
  """Mark chunk as completed and clear unnecessary data to save memory."""
+ self._invalidate_cache()
  self.status = "completed"
- self.completed_at = datetime.utcnow()
+ self.completed_at = datetime.now(_datetime.UTC)
  # Clear processed_ranges since we don't need them after completion
  # self.processed_ranges = []
  # self.assigned_to = None
  # self.assigned_at = None
 
+ def _invalidate_cache(self):
+ """Invalidate cached range calculations."""
+ self._cached_merged_ranges = None
+ self._cached_unprocessed_ranges = None
+ self._cache_invalidated = True
+
+ def _get_merged_ranges(self) -> List[Tuple[int, int]]:
+ """Get merged ranges with caching."""
+ if self._cached_merged_ranges is None:
+ self._cached_merged_ranges = self._merge_ranges(self.processed_ranges)
+ return self._cached_merged_ranges
+
  def get_unprocessed_ranges(self) -> List[Tuple[int, int]]:
  """Get ranges of unprocessed items within the chunk (relative indices)."""
  if self.status == "completed":
  return []
 
  if not self.processed_ranges:
- logger.info(f"Chunk {self.chunk_id} has no processed ranges, returning full range")
+ if self._cache_invalidated: # Only log once per invalidation
+ logger.info(f"Chunk {self.chunk_id} has no processed ranges, returning full range")
+ self._cache_invalidated = False
  return [(0, self.chunk_size - 1)]
 
- # Merge ranges first to ensure no overlaps
- merged_ranges = self._merge_ranges(self.processed_ranges)
+ # Use cached result if available
+ if self._cached_unprocessed_ranges is not None:
+ return self._cached_unprocessed_ranges
+
+ # Calculate and cache unprocessed ranges
+ merged_ranges = self._get_merged_ranges()
 
  unprocessed = []
  current_pos = 0
@@ -89,17 +118,23 @@ class ChunkState:
  if current_pos < self.chunk_size:
  unprocessed.append((current_pos, self.chunk_size - 1))
 
- # Log for debugging
- if not unprocessed:
- logger.info(
- f"Chunk {self.chunk_id} has processed ranges {merged_ranges} covering entire chunk size {self.chunk_size}"
- )
- else:
- total_processed = sum(end - start + 1 for start, end in merged_ranges)
- total_unprocessed = sum(end - start + 1 for start, end in unprocessed)
- logger.debug(
- f"Chunk {self.chunk_id}: {total_processed} processed, {total_unprocessed} unprocessed"
- )
+ # Cache the result
+ self._cached_unprocessed_ranges = unprocessed
+
+ # Log for debugging (only when cache is being computed)
+ if self._cache_invalidated:
+ if not unprocessed:
+ logger.info(
+ f"Chunk {self.chunk_id} has processed ranges {merged_ranges} covering entire chunk size {self.chunk_size}"
+ )
+ else:
+ logger.debug(f"Merged ranges for chunk {self.chunk_id}: {merged_ranges}")
+ total_processed = sum(end - start + 1 for start, end in merged_ranges)
+ total_unprocessed = sum(end - start + 1 for start, end in unprocessed)
+ logger.debug(
+ f"Chunk {self.chunk_id}: {total_processed} processed, {total_unprocessed} unprocessed"
+ )
+ self._cache_invalidated = False
 
  return unprocessed
 
@@ -144,6 +179,10 @@ class ChunkState:
  # Ensure processed_ranges exists
  d.setdefault("processed_ranges", [])
  d.setdefault("processed_count", 0)
+ # Remove cache fields from dict if they exist (shouldn't be serialized)
+ d.pop("_cached_merged_ranges", None)
+ d.pop("_cached_unprocessed_ranges", None)
+ d.pop("_cache_invalidated", None)
  return cls(**d)
 
 
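Note: the new ChunkState fields cache the merged processed ranges and the derived unprocessed ranges so get_unprocessed_ranges() no longer recomputes them on every call. A standalone sketch of the underlying range arithmetic (function names here are illustrative, not the package's API):

    from typing import List, Tuple

    def merge_ranges(ranges: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
        # Sort, then fold adjacent or overlapping (start, end) pairs together
        merged: List[Tuple[int, int]] = []
        for start, end in sorted(ranges):
            if merged and start <= merged[-1][1] + 1:
                merged[-1] = (merged[-1][0], max(merged[-1][1], end))
            else:
                merged.append((start, end))
        return merged

    def unprocessed_ranges(processed: List[Tuple[int, int]], chunk_size: int) -> List[Tuple[int, int]]:
        # The gaps between merged processed ranges, relative to the chunk
        gaps, pos = [], 0
        for start, end in merge_ranges(processed):
            if pos < start:
                gaps.append((pos, start - 1))
            pos = max(pos, end + 1)
        if pos < chunk_size:
            gaps.append((pos, chunk_size - 1))
        return gaps

    # unprocessed_ranges([(0, 4), (10, 14)], 20) -> [(5, 9), (15, 19)]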

@@ -155,12 +194,22 @@ class ChunkTracker(CheckpointTracker):
  checkpoint_file: Path,
  max_completed_chunks_in_memory: int = 1000,
  archive_after_hours: int = 24,
+ save_batch_size: int = 10,
+ auto_save_interval: int = 60,
  ):
  self.chunks: Dict[str, ChunkState] = {}
  self.max_completed_chunks_in_memory = max_completed_chunks_in_memory
  self.archive_after_hours = archive_after_hours
  self._completed_count = 0 # Track count without storing all IDs
  self.lock = Lock()
+
+ # Batching mechanism
+ self._dirty = False
+ self._pending_changes = 0
+ self._save_batch_size = save_batch_size
+ self._auto_save_interval = auto_save_interval
+ self._last_save = datetime.now(_datetime.UTC)
+
  super().__init__(checkpoint_file)
 
  def _get_default_state(self) -> Dict[str, Any]:
@@ -169,17 +218,17 @@ class ChunkTracker(CheckpointTracker):
 
  def _deserialize_state(self, data: Dict[str, Any]) -> None:
  """Deserialize loaded data into instance state."""
- with self.lock:
- self.chunks = {}
- self._completed_count = data.get("completed_count", 0)
-
- # Load chunk states
- completed_chunks = 0
- for chunk_id, chunk_data in data.get("chunks", {}).items():
- chunk_state = ChunkState.from_dict(chunk_data)
+ self.chunks = {}
+ self._completed_count = data.get("completed_count", 0)
+
+ # Load chunk states
+ completed_chunks = 0
+ for chunk_id, chunk_data in data.get("chunks", {}).items():
+ chunk_state = ChunkState.from_dict(chunk_data)
+ with self.lock:
  self.chunks[chunk_id] = chunk_state
- if chunk_state.status == "completed":
- completed_chunks += 1
+ if chunk_state.status == "completed":
+ completed_chunks += 1
 
  logger.info(
  f"Loaded {len(self.chunks)} chunks from checkpoint, "
@@ -194,12 +243,47 @@ class ChunkTracker(CheckpointTracker):
  "completed_count": self._completed_count,
  }
 
+ def _mark_dirty(self):
+ """Mark tracker as having pending changes."""
+ self._dirty = True
+ self._pending_changes += 1
+
+ # Auto-save based on batch size or time interval
+ now = datetime.now(_datetime.UTC)
+ time_since_last_save = (now - self._last_save).total_seconds()
+
+ if (
+ self._pending_changes >= self._save_batch_size
+ or time_since_last_save >= self._auto_save_interval
+ ):
+ self._do_save()
+
+ def _do_save(self) -> bool:
+ """Internal method to perform the actual save."""
+ super().save() # Parent method returns None but triggers save
+ # Reset dirty state since save was initiated successfully
+ self._dirty = False
+ self._pending_changes = 0
+ self._last_save = datetime.now(_datetime.UTC)
+ return True
+
+ def save(self, force: bool = False) -> bool:
+ """Save state to checkpoint file, with batching optimization."""
+ if not force and not self._dirty:
+ return False
+ return self._do_save()
+
+ def flush(self):
+ """Force save any pending changes."""
+ if self._dirty:
+ self._do_save()
+
  def _archive_old_completed_chunks(self):
  """Remove old completed chunks from memory to prevent unbounded growth."""
  if not self.archive_after_hours:
  return
 
- cutoff_time = datetime.utcnow() - timedelta(hours=self.archive_after_hours)
+ cutoff_time = datetime.now(_datetime.UTC) - timedelta(hours=self.archive_after_hours)
  chunks_to_remove = []
 
  for chunk_id, chunk in self.chunks.items():
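
Note: ChunkTracker now batches checkpoint writes. State mutations call _mark_dirty() instead of save(); a write is only triggered once save_batch_size changes have accumulated or auto_save_interval seconds have passed, save(force=True) bypasses the check, and flush() writes out anything still pending. A hedged usage sketch (paths and chunk identifiers are made up):

    from pathlib import Path

    from caption_flow.utils.chunk_tracker import ChunkTracker

    tracker = ChunkTracker(
        Path("./checkpoints/chunks.json"),
        save_batch_size=10,       # write after this many pending changes
        auto_save_interval=60,    # ...or after this many seconds
    )
    tracker.add_chunk("shard0:0", "shard0", "https://example.org/shard0.tar", 0, 1000)
    # ...more mutations accumulate without touching disk every time...
    tracker.flush()  # force any pending changes out, e.g. on shutdown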

@@ -214,7 +298,7 @@ class ChunkTracker(CheckpointTracker):
  for chunk_id in chunks_to_remove:
  del self.chunks[chunk_id]
  logger.info(f"Archived {len(chunks_to_remove)} old completed chunks from memory")
- self.save()
+ self._mark_dirty()
 
  def _limit_completed_chunks_in_memory(self):
  """Keep only the most recent completed chunks in memory."""
@@ -232,7 +316,7 @@ class ChunkTracker(CheckpointTracker):
  del self.chunks[chunk_id]
 
  logger.info(f"Removed {to_remove} oldest completed chunks from memory")
- self.save()
+ self._mark_dirty()
 
  def add_chunk(
  self, chunk_id: str, shard_name: str, shard_url: str, start_index: int, chunk_size: int
@@ -252,7 +336,7 @@ class ChunkTracker(CheckpointTracker):
  chunk_size=chunk_size,
  status="pending",
  )
- self.save()
+ self._mark_dirty()
 
  # Periodically clean up old chunks
  if len(self.chunks) % 100 == 0:
@@ -267,8 +351,8 @@ class ChunkTracker(CheckpointTracker):
  chunk = self.chunks[chunk_id]
  chunk.status = "assigned"
  chunk.assigned_to = worker_id
- chunk.assigned_at = datetime.utcnow()
- self.save()
+ chunk.assigned_at = datetime.now(_datetime.UTC)
+ self._mark_dirty()
 
  def mark_completed(self, chunk_id: str):
  """Mark chunk as completed."""
@@ -278,7 +362,7 @@ class ChunkTracker(CheckpointTracker):
  chunk.mark_completed() # This clears processed_ranges
  if not was_completed:
  self._completed_count += 1
- self.save()
+ self._mark_dirty()
  logger.debug(f"Chunk {chunk_id} marked as completed")
 
  # Check if we need to clean up
@@ -292,7 +376,7 @@ class ChunkTracker(CheckpointTracker):
  chunk.status = "pending" # Reset to pending for retry
  chunk.assigned_to = None
  chunk.assigned_at = None
- self.save()
+ self._mark_dirty()
 
  def mark_pending(self, chunk_id: str):
  """Mark chunk as pending (for manual reset)."""
@@ -303,7 +387,7 @@ class ChunkTracker(CheckpointTracker):
  chunk.status = "pending"
  chunk.assigned_to = None
  chunk.assigned_at = None
- self.save()
+ self._mark_dirty()
 
  def release_worker_chunks(self, worker_id: str):
  """Release all chunks assigned to a worker."""
@@ -314,7 +398,8 @@ class ChunkTracker(CheckpointTracker):
  chunk.assigned_to = None
  chunk.assigned_at = None
  released_chunks.append(chunk_id)
- self.save()
+ if released_chunks:
+ self._mark_dirty()
  return released_chunks
 
  def get_pending_chunks(self, shard_name: Optional[str] = None) -> List[str]:
@@ -368,7 +453,7 @@ class ChunkTracker(CheckpointTracker):
  """Get summary of all shards and their chunk status."""
  shards = {}
 
- for chunk_id, chunk_state in self.chunks.items():
+ for _chunk_id, chunk_state in self.chunks.items():
  shard_name = chunk_state.shard_name
  if shard_name not in shards:
  shards[shard_name] = {
@@ -378,9 +463,11 @@ class ChunkTracker(CheckpointTracker):
  "assigned_chunks": 0,
  "failed_chunks": 0,
  "is_complete": True,
+ "chunks": [],
  }
 
  shards[shard_name]["total_chunks"] += 1
+ shards[shard_name]["chunks"].append(chunk_state)
 
  if chunk_state.status == "completed":
  shards[shard_name]["completed_chunks"] += 1
@@ -399,7 +486,7 @@ class ChunkTracker(CheckpointTracker):
  def get_incomplete_shards(self) -> Set[str]:
  """Get set of shard names that have incomplete chunks."""
  incomplete = set()
- for chunk_id, chunk_state in self.chunks.items():
+ for _chunk_id, chunk_state in self.chunks.items():
  if chunk_state.status != "completed":
  incomplete.add(chunk_state.shard_name)
  return incomplete
@@ -411,22 +498,21 @@ class ChunkTracker(CheckpointTracker):
  if not storage_manager.captions_path.exists():
  return
 
- import pyarrow as pa
- import pyarrow.parquet as pq
+ import lance
 
  # Check if item_index column exists
- table_metadata = pq.read_metadata(storage_manager.captions_path)
+ table_metadata = lance.dataset(storage_manager.captions_path).schema
  columns = ["job_id", "chunk_id", "item_key"]
- if "item_index" in table_metadata.schema.names:
+ if "item_index" in table_metadata.names:
  columns.append("item_index")
 
  # Process in batches to avoid loading entire table
  batch_size = 10000
- parquet_file = pq.ParquetFile(storage_manager.captions_path)
+ lance_dataset = lance.dataset(storage_manager.captions_path)
 
  chunk_indices = defaultdict(set)
 
- for batch in parquet_file.iter_batches(batch_size=batch_size, columns=columns):
+ for batch in lance_dataset.to_batches(batch_size=batch_size, columns=columns):
  batch_dict = batch.to_pydict()
 
  for i in range(len(batch_dict["chunk_id"])):
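
Note: the sync-with-storage path now reads from a Lance dataset instead of a Parquet file (the storage manager was reworked in this release). A sketch of the batched read pattern mirrored from the lines above (the dataset path is made up):

    import lance

    ds = lance.dataset("./caption_data/captions.lance")
    columns = ["job_id", "chunk_id", "item_key"]
    if "item_index" in ds.schema.names:  # only present in newer datasets
        columns.append("item_index")

    for batch in ds.to_batches(batch_size=10_000, columns=columns):
        rows = batch.to_pydict()  # pyarrow.RecordBatch -> dict of column lists
        # group item indices by chunk_id here, as _process_chunk_indices expects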

@@ -491,11 +577,12 @@ class ChunkTracker(CheckpointTracker):
  self._process_chunk_indices(chunk_indices)
 
  logger.info("Sync with storage completed")
- self.save()
+ self._mark_dirty()
 
  def _process_chunk_indices(self, chunk_indices: Dict[str, Set[int]]):
  """Process a batch of chunk indices."""
  for chunk_id, abs_indices in chunk_indices.items():
+ logger.debug(f"Processing indices: {abs_indices} for chunk {chunk_id}")
  if chunk_id not in self.chunks:
  continue
 
@@ -544,27 +631,37 @@ class ChunkTracker(CheckpointTracker):
  relative_start = start_idx - chunk_state.start_index
  relative_end = end_idx - chunk_state.start_index
 
- # Ensure indices are within chunk bounds
+ # Ensure indices are within chunk bounds and maintain valid range
  relative_start = max(0, relative_start)
  relative_end = min(chunk_state.chunk_size - 1, relative_end)
 
+ # Skip invalid ranges where start > end
+ if relative_start > relative_end:
+ logger.warning(
+ f"Invalid range for chunk {chunk_id}: start={relative_start}, end={relative_end}, skipping"
+ )
+ return
+
+ # Invalidate cache before modifying ranges
+ chunk_state._invalidate_cache()
+
  # Add to processed ranges
  chunk_state.processed_ranges.append((relative_start, relative_end))
 
  # Merge overlapping ranges
  chunk_state.processed_ranges = chunk_state._merge_ranges(chunk_state.processed_ranges)
 
- logger.debug(
- f"Marked items {start_idx}-{end_idx} as processed in chunk {chunk_id} (relative indices: {relative_start}-{relative_end})"
- )
+ # logger.debug(
+ # f"Marked items {start_idx}-{end_idx} as processed in chunk {chunk_id} (relative indices: {relative_start}-{relative_end})"
+ # )
 
  # Check if chunk is now complete
  if chunk_state.get_unprocessed_ranges() == []:
  logger.info(f"Chunk {chunk_id} is now complete")
  chunk_state.status = "completed"
 
- # Save checkpoint after updating
- self.save()
+ # Mark as dirty, will be saved based on batching logic
+ self._mark_dirty()
 
  def get_chunk_with_unprocessed_items(self, chunk_id: str) -> Optional[Dict[str, Any]]:
  """Get chunk info with unprocessed item ranges."""

caption_flow/utils/image_processor.py
@@ -1,19 +1,16 @@
  """Image preprocessing utilities."""
 
- import asyncio
  import logging
+ import os
  from concurrent.futures import ProcessPoolExecutor
  from io import BytesIO
- from pathlib import Path
- from typing import List, Any, Optional, Tuple, Union
 
- import numpy as np
- import requests
  from PIL import Image
- from ..models import ProcessingItem
 
+ from ..models import ProcessingItem
 
  logger = logging.getLogger(__name__)
+ logger.setLevel(os.environ.get("CAPTIONFLOW_LOG_LEVEL", "INFO").upper())
 
 
  class ImageProcessor:
@@ -24,22 +21,25 @@ class ImageProcessor:
 
  @staticmethod
  def prepare_for_inference(item: ProcessingItem) -> Image.Image:
- """
- Prepare image for inference.
+ """Prepare image for inference.
 
  Args:
+ ----
  image: PIL Image to prepare
 
  Returns:
+ -------
  Prepared PIL Image
+
  """
  # We used to do a lot more hand-holding here with transparency, but oh well.
+ logger.debug(f"Preparing item for inference: {item}")
 
  if item.image is not None:
  image = item.image
  item.metadata["image_width"], item.metadata["image_height"] = image.size
  item.metadata["image_format"] = image.format or "unknown"
- item.image = None
+ # item.image = None
  return image
 
  item.image = None