PyPI - classifyre-cli - Versions diffs - 0.4.22__tar.gz → 0.4.24__tar.gz - Mend

classifyre-cli 0.4.22__tar.gz → 0.4.24__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (207) hide show

{classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/.turbo/turbo-build.log RENAMED Viewed

@@ -1,3 +1,3 @@
 $ uv sync
-Resolved 262 packages in 141ms
+Resolved 262 packages in 149ms
 Checked 50 packages in 1ms

{classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: classifyre-cli
-Version: 0.4.22
+Version: 0.4.24
 Summary: Classifyre CLI — scan and classify unstructured data sources
 License: MIT
 Keywords: data,ingestion,metadata,pii,secrets,unstructured

{classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/package.json RENAMED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@classifyre/cli",
-  "version": "0.4.22",
+  "version": "0.4.24",
   "private": true,
   "scripts": {
     "build": "uv sync",

{classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "classifyre-cli"
-version = "0.4.22"
+version = "0.4.24"
 description = "Classifyre CLI — scan and classify unstructured data sources"
 readme = "README.md"
 requires-python = ">=3.12"

{classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/pipeline/detector_pipeline.py RENAMED Viewed

@@ -109,7 +109,10 @@ class DetectorPipeline:
         scan_started = datetime.now(UTC)
         ocr_enabled = self.source.ocr_enabled()
-        text_content_type = self._text_content_type_for_asset(asset.asset_type, ocr_enabled)
+        transcription_enabled = self.source.transcription_enabled()
+        text_content_type = self._text_content_type_for_asset(
+            asset.asset_type, ocr_enabled, transcription_enabled
+        )
         link_content = self._build_links_payload(asset.links)
         text_detectors = []
@@ -294,13 +297,23 @@ class DetectorPipeline:
                 page_num=page_num,
             )
             elapsed = int((time.monotonic() - t0) * 1000)
+            snippet = page_content[:120].replace("\n", "\\n") if page_content else ""
             logger.info(
-                "  %s page %d done: %d findings (%dms)",
+                "  %s page %d: %d findings in %dms — snippet: %s",
                 asset.name,
                 page_num,
                 len(page_findings),
                 elapsed,
+                snippet,
             )
+            if page_findings:
+                for f in page_findings[:5]:
+                    logger.info(
+                        "    finding: type=%s detector=%s matched=%.100s",
+                        f.finding_type,
+                        f.detector_type,
+                        f.matched_content[:100].replace("\n", " "),
+                    )
             return page_findings, page_types, page_errors, page_content, page_num
         def _collect_done() -> None:
@@ -401,13 +414,23 @@ class DetectorPipeline:
                 page_num=page_num,
             )
             elapsed = int((time.monotonic() - t0) * 1000)
+            snippet = page_content[:120].replace("\n", "\\n") if page_content else ""
             logger.info(
-                "  %s page %d done: %d findings (%dms)",
+                "  %s page %d: %d findings in %dms — snippet: %s",
                 asset.name,
                 page_num,
                 len(page_findings),
                 elapsed,
+                snippet,
             )
+            if page_findings:
+                for f in page_findings[:5]:
+                    logger.info(
+                        "    finding: type=%s detector=%s matched=%.100s",
+                        f.finding_type,
+                        f.detector_type,
+                        f.matched_content[:100].replace("\n", " "),
+                    )
             return page_findings, page_types, page_errors, page_content, page_num
         async def _collect_done_and_flush(min_findings: int = 1) -> None:
@@ -488,6 +511,12 @@ class DetectorPipeline:
                 continue
             candidate_ids.append(value)
+        logger.info(
+            "_iter_text_content_pages(%s): trying candidates %s",
+            asset.name,
+            candidate_ids,
+        )
         for candidate_id in candidate_ids:
             saw_candidate_content = False
             async for text_content in self.content_provider.fetch_text_pages(candidate_id):
@@ -499,6 +528,16 @@ class DetectorPipeline:
             if saw_candidate_content:
                 return
+            # If fetch_content_pages ran the full bytes-path extraction (even
+            # yielding 0 text, e.g. silent audio), the source already did the
+            # expensive work.  Don't re-process with another candidate ID for
+            # the same asset.
+            source = getattr(self.content_provider, "_source", None)
+            if source is not None:
+                processed: set[str] = getattr(source, "_content_pages_processed", set())
+                if candidate_id in processed:
+                    return
     async def _run_binary_detectors_for_asset(
         self,
         *,
@@ -727,6 +766,7 @@ class DetectorPipeline:
         self,
         asset_type: OutputAssetType,
         ocr_enabled: bool,
+        transcription_enabled: bool = False,
     ) -> str | None:
         mapping = {
             OutputAssetType.TXT: "text/plain",
@@ -737,6 +777,8 @@ class DetectorPipeline:
             return mapping[asset_type]
         if ocr_enabled and asset_type in {OutputAssetType.IMAGE, OutputAssetType.BINARY}:
             return "text/plain"
+        if transcription_enabled and asset_type in {OutputAssetType.AUDIO, OutputAssetType.VIDEO}:
+            return "text/plain"
         return None
     @staticmethod

{classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/pipeline/parsed_content_provider.py RENAMED Viewed

@@ -3,11 +3,14 @@
 from __future__ import annotations
 import asyncio
+import logging
 from collections.abc import AsyncGenerator
 from ..models.generated_single_asset_scan_results import DetectionResult, SingleAssetScanResults
 from ..sources.base import BaseSource
+logger = logging.getLogger(__name__)
 class ParsedContentProvider:
     """
@@ -32,11 +35,30 @@ class ParsedContentProvider:
         if saw_text:
             return
+        # If fetch_content_pages already ran the full extraction pipeline for
+        # this asset (tracked via _content_pages_processed), skip the fallback
+        # iter_asset_pages call.  Without this, an all-silence audio file would
+        # trigger a redundant second transcription pass.
+        pages_processed: set[str] | None = getattr(self._source, "_content_pages_processed", None)
+        if isinstance(pages_processed, set) and asset_id in pages_processed:
+            logger.info(
+                "fetch_text_pages(%s): source already processed, skipping fallback",
+                asset_id,
+            )
+            return
         result = await self._source.fetch_content_bytes(asset_id)
         if result is None:
+            logger.info("fetch_text_pages(%s): fetch_content_bytes returned None", asset_id)
             return
         raw_bytes, mime = result
+        logger.info(
+            "fetch_text_pages(%s): fallback iter_asset_pages path (%s, %d bytes)",
+            asset_id,
+            mime,
+            len(raw_bytes),
+        )
         pages: list[str] = await asyncio.to_thread(
             list,
             self._source.iter_asset_pages(raw_bytes, mime),

{classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/pipeline/worker_pool.py RENAMED Viewed

@@ -21,6 +21,9 @@ from concurrent.futures import ProcessPoolExecutor
 from typing import Any
 from ..models.generated_single_asset_scan_results import DetectionResult
+from ..utils.resources import get_effective_cpu_count, get_effective_memory_mb
+__all__ = ["get_effective_cpu_count", "get_effective_memory_mb"]
 logger = logging.getLogger(__name__)
@@ -130,63 +133,6 @@ def is_io_bound_detector(detector_name: str) -> bool:
     return detector_name in _IO_BOUND_DETECTORS
-def get_effective_cpu_count() -> int:
-    """Return the number of usable CPUs, respecting cgroup limits (K8s/Docker).
-    ``os.cpu_count()`` returns the *host* CPU count, which can be much larger
-    than what the container is allowed to use.  This function reads the cgroup
-    v2 ``cpu.max`` (or v1 ``cpu.cfs_quota_us``/``cpu.cfs_period_us``) to
-    determine the actual allocation.
-    """
-    try:
-        data = open("/sys/fs/cgroup/cpu.max").read().strip()
-        quota_str, period_str = data.split()
-        if quota_str != "max":
-            cpus = int(quota_str) / int(period_str)
-            if cpus >= 0.5:
-                return max(1, int(cpus))
-    except (FileNotFoundError, OSError, ValueError):
-        pass
-    try:
-        quota = int(open("/sys/fs/cgroup/cpu/cpu.cfs_quota_us").read().strip())
-        period = int(open("/sys/fs/cgroup/cpu/cpu.cfs_period_us").read().strip())
-        if quota > 0 and period > 0:
-            cpus = quota / period
-            if cpus >= 0.5:
-                return max(1, int(cpus))
-    except (FileNotFoundError, OSError, ValueError):
-        pass
-    return os.cpu_count() or 4
-def get_effective_memory_mb() -> int:
-    """Return usable memory in MB, respecting cgroup limits."""
-    try:
-        mem_bytes = int(open("/sys/fs/cgroup/memory.max").read().strip())
-        if mem_bytes < 2**50:
-            return max(256, mem_bytes // (1024 * 1024))
-    except (FileNotFoundError, OSError, ValueError):
-        pass
-    try:
-        mem_bytes = int(open("/sys/fs/cgroup/memory/memory.limit_in_bytes").read().strip())
-        if mem_bytes < 2**50:
-            return max(256, mem_bytes // (1024 * 1024))
-    except (FileNotFoundError, OSError, ValueError):
-        pass
-    try:
-        for line in open("/proc/meminfo"):
-            if line.startswith("MemTotal:"):
-                return max(256, int(line.split()[1]) // 1024)
-    except (FileNotFoundError, OSError, ValueError):
-        pass
-    return 4096
 def compute_pool_workers(override: int | None = None) -> int:
     """Compute optimal pool size from actual resource limits.

{classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/sources/object_storage/base.py RENAMED Viewed

@@ -135,6 +135,11 @@ class ObjectStorageSourceBase(BaseSource, ABC):
         # Keyed by both asset_hash and external_url for O(1) lookup from either.
         self._bytes_cache: dict[str, bytes] = {}
         self._mime_cache: dict[str, str] = {}
+        # asset_ids for which fetch_content_pages ran the full bytes path
+        # (even if it produced no text, e.g. all-silence audio).  Checked by
+        # ParsedContentProvider to skip its fallback iter_asset_pages path,
+        # which would otherwise re-run an expensive transcription a second time.
+        self._content_pages_processed: set[str] = set()
         # Child IMAGE assets queued while transforming the current object.
         self._pending_child_assets: list[SingleAssetScanResults] = []
@@ -302,6 +307,15 @@ class ObjectStorageSourceBase(BaseSource, ABC):
         return OutputAssetType.OTHER
+    @staticmethod
+    def _asset_kind_for_asset_type(asset_type: OutputAssetType) -> str:
+        mapping: dict[OutputAssetType, str] = {
+            OutputAssetType.IMAGE: "image",
+            OutputAssetType.AUDIO: "audio",
+            OutputAssetType.VIDEO: "video",
+        }
+        return mapping.get(asset_type, "file")
     def _ensure_file_processing_dependencies(self) -> None:
         if self._file_processing_deps_checked:
             return
@@ -446,7 +460,7 @@ class ObjectStorageSourceBase(BaseSource, ABC):
             created_at=ref.last_modified,
             updated_at=ref.last_modified,
             runner_id=self.runner_id,
-            **self.metadata_fields("file", asset_metadata),
+            **self.metadata_fields(self._asset_kind_for_asset_type(asset_type), asset_metadata),
         )
         self._hash_to_uri[asset_hash] = external_url
         self._object_ref_by_hash[asset_hash] = ref
@@ -549,6 +563,7 @@ class ObjectStorageSourceBase(BaseSource, ABC):
         self._object_ref_by_hash = {}
         self._bytes_cache = {}
         self._mime_cache = {}
+        self._content_pages_processed = set()
         self._pending_child_assets = []
         refs = self._list_objects()
@@ -628,26 +643,69 @@ class ObjectStorageSourceBase(BaseSource, ABC):
         raw_bytes = self._bytes_cache.get(asset_id)
         mime = self._mime_cache.get(asset_id, "")
+        logger.info(
+            "fetch_content_pages(%s): raw_bytes=%s mime=%s processed=%s",
+            asset_id,
+            f"{len(raw_bytes)} bytes" if raw_bytes is not None else "MISS",
+            mime or "MISS",
+            asset_id in self._content_pages_processed,
+        )
         if raw_bytes is not None:
             sampling = self.config.sampling
             batch_size = int(sampling.rows_per_page or 100)
             include_col_names = bool(
                 sampling.include_column_names if sampling.include_column_names is not None else True
             )
-            # Run the (potentially blocking) file parsing in a thread so pyarrow /
-            # pdfplumber can't freeze the event loop during large file iteration.
-            pages: list[str] = await asyncio.to_thread(
-                list,
-                self.iter_asset_pages(
-                    raw_bytes,
-                    mime,
-                    batch_size,
-                    include_col_names,
-                    file_name=self._file_name_for_asset_id(asset_id),
-                ),
+            file_name = self._file_name_for_asset_id(asset_id)
+            # Stream pages from a thread instead of materializing via list().
+            # For transcription this lets detectors start working on the first
+            # chunk while later chunks are still being transcribed.
+            loop = asyncio.get_running_loop()
+            queue: asyncio.Queue[str | None] = asyncio.Queue()
+            exc_info: list[BaseException | None] = [None]
+            page_count: int = 0
+            def _produce() -> None:
+                nonlocal page_count
+                try:
+                    for page in self.iter_asset_pages(
+                        raw_bytes,
+                        mime,
+                        batch_size,
+                        include_col_names,
+                        file_name=file_name,
+                    ):
+                        loop.call_soon_threadsafe(queue.put_nowait, page)
+                        page_count += 1
+                except BaseException as exc:
+                    exc_info[0] = exc
+                finally:
+                    loop.call_soon_threadsafe(queue.put_nowait, None)
+            task = loop.run_in_executor(None, _produce)
+            while True:
+                page = await queue.get()
+                if page is None:
+                    break
+                yield "", page
+            await task
+            if exc_info[0] is not None:
+                raise exc_info[0]  # type: ignore[misc]
+            logger.info(
+                "fetch_content_pages(%s): streamed %d page(s) from %s",
+                asset_id,
+                page_count,
+                file_name,
             )
-            for batch_text in pages:
-                yield "", batch_text
+            self._content_pages_processed.add(asset_id)
             return
         result = await self.fetch_content(asset_id)

{classifyre_cli-0.4.22 → classifyre_cli-0.4.24}/src/utils/file_parser.py RENAMED Viewed

@@ -690,6 +690,18 @@ def iter_file_pages(
         yield from _iter_parquet_pages(file_bytes, batch_size, include_column_names)
     elif normalized in ("text/csv", "text/tab-separated-values"):
         yield from _iter_csv_pages(file_bytes, include_column_names)
+    elif normalized.startswith(("audio/", "video/")) and enable_transcription:
+        # Stream transcript pages directly from the chunked transcription pipeline
+        # so the detector receives text as each ~10-min audio chunk completes
+        # instead of waiting for the full file and buffering the entire transcript.
+        from .transcription import iter_transcription_pages
+        yield from iter_transcription_pages(
+            file_bytes,
+            mime_type=normalized,
+            file_name=file_name,
+            segments_per_page=batch_size,
+        )
     else:
         text, error = extract_text(
             file_bytes,

classifyre_cli-0.4.24/src/utils/resources.py ADDED Viewed

@@ -0,0 +1,65 @@
+"""Cgroup-aware CPU and memory introspection.
+Shared by the detector worker pool (to size the process pool) and the
+transcription pipeline (to select the right Whisper model at runtime).
+"""
+from __future__ import annotations
+import os
+def get_effective_cpu_count() -> int:
+    """Return usable CPUs, respecting cgroup limits (K8s / Docker).
+    ``os.cpu_count()`` returns the *host* count, which is usually much larger
+    than the container's CPU quota.  This reads cgroup v2 / v1 to get the
+    actual allocation.
+    """
+    try:
+        data = open("/sys/fs/cgroup/cpu.max").read().strip()
+        quota_str, period_str = data.split()
+        if quota_str != "max":
+            cpus = int(quota_str) / int(period_str)
+            if cpus >= 0.5:
+                return max(1, int(cpus))
+    except (FileNotFoundError, OSError, ValueError):
+        pass
+    try:
+        quota = int(open("/sys/fs/cgroup/cpu/cpu.cfs_quota_us").read().strip())
+        period = int(open("/sys/fs/cgroup/cpu/cpu.cfs_period_us").read().strip())
+        if quota > 0 and period > 0:
+            cpus = quota / period
+            if cpus >= 0.5:
+                return max(1, int(cpus))
+    except (FileNotFoundError, OSError, ValueError):
+        pass
+    return os.cpu_count() or 4
+def get_effective_memory_mb() -> int:
+    """Return usable memory in MB, respecting cgroup limits (K8s / Docker)."""
+    try:
+        mem_bytes = int(open("/sys/fs/cgroup/memory.max").read().strip())
+        if mem_bytes < 2**50:
+            return max(256, mem_bytes // (1024 * 1024))
+    except (FileNotFoundError, OSError, ValueError):
+        pass
+    try:
+        mem_bytes = int(open("/sys/fs/cgroup/memory/memory.limit_in_bytes").read().strip())
+        if mem_bytes < 2**50:
+            return max(256, mem_bytes // (1024 * 1024))
+    except (FileNotFoundError, OSError, ValueError):
+        pass
+    try:
+        for line in open("/proc/meminfo"):
+            if line.startswith("MemTotal:"):
+                return max(256, int(line.split()[1]) // 1024)
+    except (FileNotFoundError, OSError, ValueError):
+        pass
+    return 4096

classifyre-cli 0.4.22__tar.gz → 0.4.24__tar.gz

classifyre-cli 0.4.22__tar.gz → 0.4.24__tar.gz