PyPI - classifyre-cli - Versions diffs - 0.4.9__tar.gz → 0.4.11__tar.gz - Mend

classifyre-cli 0.4.9__tar.gz → 0.4.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (186) hide show

{classifyre_cli-0.4.9 → classifyre_cli-0.4.11}/.turbo/turbo-build.log RENAMED Viewed

@@ -1,3 +1,3 @@
 $ uv sync
-Resolved 256 packages in 201ms
+Resolved 265 packages in 157ms
 Checked 50 packages in 1ms

{classifyre_cli-0.4.9 → classifyre_cli-0.4.11}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: classifyre-cli
-Version: 0.4.9
+Version: 0.4.11
 Summary: Classifyre CLI — scan and classify unstructured data sources
 License: MIT
 Keywords: data,ingestion,metadata,pii,secrets,unstructured

{classifyre_cli-0.4.9 → classifyre_cli-0.4.11}/package.json RENAMED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@classifyre/cli",
-  "version": "0.4.9",
+  "version": "0.4.11",
   "private": true,
   "scripts": {
     "build": "uv sync",

{classifyre_cli-0.4.9 → classifyre_cli-0.4.11}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "classifyre-cli"
-version = "0.4.9"
+version = "0.4.11"
 description = "Classifyre CLI — scan and classify unstructured data sources"
 readme = "README.md"
 requires-python = ">=3.12"
@@ -47,7 +47,7 @@ privacy = [
     # mid-run in frozen/venv contexts. 8.x eagerly loads all data at import time,
     # avoiding ModuleNotFoundError during Presidio phone number analysis.
     "phonenumbers>=8.13.0,<10.0.0",
-    "numpy>=1.26.0,<2.0.0",
+    "numpy>=1.26.0,<3.0.0",
 ]
 security = [
     "detect-secrets>=1.5.0",
@@ -91,6 +91,13 @@ custom = [
 regex = [
     "google-re2>=1.1",
 ]
+llm = [
+    "litellm>=1.86.2",
+    # Pure-wheel PDF renderer (permissive license, no system binaries) used to
+    # rasterise PDF pages to images for vision-capable LLM detectors.
+    "pypdfium2>=4.30.0",
+    "pillow>=12.2.0",
+]
 detectors = [
     { include-group = "file-processing" },
     { include-group = "privacy" },
@@ -101,6 +108,7 @@ detectors = [
     { include-group = "classification" },
     { include-group = "custom" },
     { include-group = "regex" },
+    { include-group = "llm" },
 ]
 file-processing = [
     "filetype>=1.2.0",
@@ -264,6 +272,10 @@ module = [
     "datasets",
     "setfit.*",
     "setfit",
+    "litellm.*",
+    "litellm",
+    "pypdfium2.*",
+    "pypdfium2",
     "sklearn.*",
     "sklearn",
     "numpy",

{classifyre_cli-0.4.9 → classifyre_cli-0.4.11}/src/detectors/custom/runners/_base.py RENAMED Viewed

@@ -2,6 +2,8 @@
 from __future__ import annotations
+import io
+import logging
 import re
 from abc import ABC, abstractmethod
 from datetime import UTC, datetime
@@ -38,6 +40,32 @@ _IMAGE_CONTENT_TYPES = [
     "image/bmp",
     "image/tiff",
 ]
+# Content types HuggingFace image detectors accept. Non-image renderable files
+# (PDFs) are rasterised page-by-page via render_to_images before classification,
+# mirroring the vision LLM detector's input handling.
+_IMAGE_INPUT_CONTENT_TYPES = [*_IMAGE_CONTENT_TYPES, "application/pdf"]
+logger = logging.getLogger(__name__)
+def _load_input_images(content: bytes, content_type: str, pil: Any) -> list[tuple[int, Any]]:
+    """Open ``content`` as a list of ``(page_index, image)`` pairs.
+    ``pil`` is the object whose ``open`` is used to load each image. Content whose
+    normalised MIME type starts with ``image/`` yields a single entry at index 0; any
+    other type accepted by ``supported_mime_type`` is passed through ``render_to_images``
+    and yields one entry per rendered page. Anything else, or any failure while loading,
+    yields ``[]`` (failures are logged as warnings).
+    """
+    from ....utils.file_to_images import render_to_images, supported_mime_type
+    # Drop MIME parameters (e.g. "; charset=...") and case before the image/* test.
+    base_type = content_type.split(";", 1)[0].strip().lower()
+    loaded: list[tuple[int, Any]] = []
+    try:
+        if base_type.startswith("image/"):
+            loaded.append((0, pil.open(io.BytesIO(content))))
+        elif supported_mime_type(content_type):
+            for page_number, png_bytes in enumerate(render_to_images(content, content_type)):
+                loaded.append((page_number, pil.open(io.BytesIO(png_bytes))))
+    except Exception as exc:  # pragma: no cover - defensive
+        logger.warning("Failed to load input images (%s): %s", base_type, exc)
+        # Discard any pages opened before the failure: the result is all-or-nothing.
+        return []
+    return loaded
 def _resolve_pipeline_severity(

{classifyre_cli-0.4.9 → classifyre_cli-0.4.11}/src/detectors/custom/runners/_image_classification.py RENAMED Viewed

@@ -2,7 +2,6 @@
 from __future__ import annotations
-import io
 import logging
 from typing import Any
@@ -11,8 +10,9 @@ from ....models.generated_single_asset_scan_results import DetectionResult
 from ...dependencies import ensure_torch, require_module
 from ._base import (
     _DEFAULT_IMAGE_CLASSIFICATION_MODEL,
-    _IMAGE_CONTENT_TYPES,
+    _IMAGE_INPUT_CONTENT_TYPES,
     BaseRunner,
+    _load_input_images,
     _resolve_pipeline_severity,
 )
@@ -54,45 +54,55 @@ class ImageClassificationRunner(BaseRunner):
         raise NotImplementedError("ImageClassificationRunner uses detect() directly")
     def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
-        if not content_type.startswith("image/"):
-            return []
         if isinstance(content, str):
             logger.warning("image_classification: received string content, expected bytes")
             return []
+        # image/* opens directly; PDFs are rasterised to one image per page.
+        images = _load_input_images(content, content_type, self._pil)
+        if not images:
+            return []
         schema = self._schema
         threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.0
+        multi_page = len(images) > 1
         results: list[DetectionResult] = []
-        try:
-            image = self._pil.open(io.BytesIO(content))
-            predictions: list[dict[str, Any]] = self._pipe(image) or []
-            for pred in predictions:
-                label: str = pred.get("label", "unknown")
-                score: float = float(pred.get("score", 0.0))
-                if score < threshold:
-                    continue
-                severity = _resolve_pipeline_severity(label, schema.severity_map)
-                results.append(
-                    self._make_result(
-                        finding_type=f"classification:{label}",
-                        category="CONTENT",
-                        severity=severity,
-                        confidence=score,
-                        matched_content=f"Image classified as: {label} ({score:.3f})",
-                        location=None,
-                        metadata={
-                            "image_size": f"{image.size[0]}x{image.size[1]}",
-                            "image_mode": image.mode,
-                            "model": self._model_id,
-                        },
+        for page_index, image in images:
+            try:
+                predictions: list[dict[str, Any]] = self._pipe(image) or []
+                for pred in predictions:
+                    label: str = pred.get("label", "unknown")
+                    score: float = float(pred.get("score", 0.0))
+                    if score < threshold:
+                        continue
+                    severity = _resolve_pipeline_severity(label, schema.severity_map)
+                    page_suffix = f" (page {page_index + 1})" if multi_page else ""
+                    metadata: dict[str, Any] = {
+                        "image_size": f"{image.size[0]}x{image.size[1]}",
+                        "image_mode": image.mode,
+                        "model": self._model_id,
+                    }
+                    if multi_page:
+                        metadata["page"] = page_index + 1
+                    results.append(
+                        self._make_result(
+                            finding_type=f"classification:{label}",
+                            category="CONTENT",
+                            severity=severity,
+                            confidence=score,
+                            matched_content=(
+                                f"Image classified as: {label} ({score:.3f}){page_suffix}"
+                            ),
+                            location=None,
+                            metadata=metadata,
+                        )
                     )
+            except Exception as exc:
+                logger.error(
+                    "image_classification error (model=%s): %s", self._model_id, exc, exc_info=True
                 )
-        except Exception as exc:
-            logger.error(
-                "image_classification error (model=%s): %s", self._model_id, exc, exc_info=True
-            )
         results.sort(key=lambda r: r.confidence, reverse=True)
         return results
     def get_supported_content_types(self) -> list[str]:
-        return list(_IMAGE_CONTENT_TYPES)
+        return list(_IMAGE_INPUT_CONTENT_TYPES)

classifyre_cli-0.4.11/src/detectors/custom/runners/_llm.py ADDED Viewed

@@ -0,0 +1,295 @@
+"""AI/LLM pipeline runner — prompt-driven classification and field extraction."""
+from __future__ import annotations
+import base64
+import json
+import logging
+import os
+from datetime import UTC, datetime
+from typing import Any
+# Quiet litellm's import-time provider preload warnings (bedrock/sagemaker need
+# botocore, which we don't install) before the library is ever imported.
+os.environ.setdefault("LITELLM_LOG", "ERROR")
+from ....models.generated_detectors import LLMPipelineSchema, Severity
+from ....models.generated_single_asset_scan_results import (
+    DetectionResult,
+    DetectorType,
+)
+from ....utils.file_to_images import render_to_images, supported_mime_type
+from ...dependencies import require_module
+from ._base import _IMAGE_CONTENT_TYPES, _TEXT_CONTENT_TYPES, BaseRunner, _resolve_pipeline_severity
+logger = logging.getLogger(__name__)
+# Map the stored AI provider type onto the litellm model-string convention
+# ("<prefix>/<model>"). Provider types missing from this table fall back to the
+# "openai" prefix in LLMRunner._model_string.
+_PROVIDER_PREFIX: dict[str, str] = {
+    "CLAUDE": "anthropic",
+    "GEMINI": "gemini",
+    "OPENAI_COMPATIBLE": "openai",
+}
+# Content types a vision-capable LLM detector renders to images and sends to the
+# model directly. PDFs are rasterised page-by-page; images pass through.
+# LLMRunner.get_supported_content_types() only advertises these when the provider
+# runtime reports vision support.
+_VISION_CONTENT_TYPES = [*_IMAGE_CONTENT_TYPES, "application/pdf"]
+# Cap the number of rendered page images sent in a single completion to bound
+# token cost and request size for multi-page PDFs. The cap is applied twice in
+# LLMRunner._detect_vision: as the renderer's max_pages argument and as a slice
+# over the rendered list.
+_MAX_VISION_IMAGES = 20
+class LLMRunner(BaseRunner):
+    """AI detector — sends content to a configured LLM provider for classification + extraction.
+    Text is truncated to the schema's ``content_limit`` and sent as a chat message; binary
+    content is rendered to page images and sent as ``image_url`` blocks when the provider
+    runtime reports vision support. Rendering failures, provider errors and malformed
+    model output are logged and never propagate out of ``detect()``.
+    """
+    def __init__(
+        self, schema: LLMPipelineSchema, detector_key: str = "", detector_name: str = ""
+    ) -> None:
+        """Store the schema and configure the litellm module.
+        Raises:
+            ValueError: if ``schema.provider_runtime`` is ``None``.
+        """
+        self._schema = schema
+        self._detector_key = detector_key
+        self._detector_name = detector_name
+        runtime = schema.provider_runtime
+        if runtime is None:
+            raise ValueError(
+                f"AI detector '{detector_key}' is missing provider_runtime — the API must "
+                "inject resolved provider credentials before dispatch."
+            )
+        self._runtime = runtime
+        self._litellm = require_module("litellm", "llm", ["llm"])
+        # Let litellm silently drop params an endpoint doesn't support (e.g.
+        # response_format / temperature on some OpenAI-compatible gateways)
+        # instead of raising. Keep its own logging quiet.
+        self._litellm.drop_params = True
+        self._litellm.suppress_debug_info = True
+        logging.getLogger("LiteLLM").setLevel(logging.ERROR)
+    def run(self, text: str) -> None:  # type: ignore[override]  # pragma: no cover
+        raise NotImplementedError("LLMRunner uses detect() directly")
+    def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
+        """Classify ``content`` and return the findings, highest confidence first.
+        Bytes are routed to the vision path. Strings are processed only when
+        ``content_type`` is listed in ``_TEXT_CONTENT_TYPES`` and the stripped text is
+        non-empty; otherwise ``[]`` is returned.
+        """
+        if isinstance(content, bytes):
+            return self._detect_vision(content, content_type)
+        if content_type not in _TEXT_CONTENT_TYPES:
+            return []
+        text = content.strip()
+        if not text:
+            return []
+        schema = self._schema
+        content_limit = schema.content_limit or 8000
+        snippet = text[:content_limit]
+        messages = [
+            {"role": "system", "content": self._build_system_prompt()},
+            {"role": "user", "content": snippet},
+        ]
+        return self._complete_and_parse(messages, snippet)
+    def _detect_vision(self, content: bytes, content_type: str) -> list[DetectionResult]:
+        """Render a binary file (image/PDF) to images and classify via the model.
+        Returns ``[]`` when vision is disabled, the MIME type is unsupported, rendering
+        fails, or rendering produces no images.
+        """
+        if not self._vision_enabled():
+            return []
+        if not supported_mime_type(content_type):
+            return []
+        try:
+            images = render_to_images(
+                content,
+                content_type,
+                max_pages=_MAX_VISION_IMAGES,
+            )
+        except Exception as exc:
+            # An unreadable file must not abort the caller: log it and report no
+            # findings, the same way _complete_and_parse treats provider errors.
+            logger.warning(
+                "llm detector failed to render input (detector=%s, content_type=%s): %s",
+                self._detector_key,
+                content_type,
+                exc,
+            )
+            return []
+        if not images:
+            return []
+        image_blocks = [
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": f"data:image/png;base64,{base64.b64encode(png).decode('ascii')}"
+                },
+            }
+            for png in images[:_MAX_VISION_IMAGES]
+        ]
+        messages = [
+            {"role": "system", "content": self._build_system_prompt()},
+            {"role": "user", "content": image_blocks},
+        ]
+        # matched_content fallback descriptor — there is no text snippet for files.
+        descriptor = f"[{content_type}, {len(image_blocks)} page image(s)]"
+        return self._complete_and_parse(messages, descriptor, vision_pages=len(image_blocks))
+    def _complete_and_parse(
+        self,
+        messages: list[dict[str, Any]],
+        snippet: str,
+        *,
+        vision_pages: int | None = None,
+    ) -> list[DetectionResult]:
+        """Run one completion and convert the JSON reply into detection results.
+        ``snippet`` is the fallback ``matched_content`` for labels that carry none.
+        Any exception raised by the provider call or while reading the reply is logged
+        and yields ``[]``.
+        """
+        schema = self._schema
+        try:
+            response = self._litellm.completion(
+                model=self._model_string(),
+                api_key=self._runtime.api_key,
+                api_base=self._runtime.base_url or None,
+                temperature=schema.temperature if schema.temperature is not None else 0.0,
+                max_tokens=self._max_tokens(),
+                messages=messages,
+                response_format={"type": "json_object"},
+            )
+            raw = response.choices[0].message.content or "{}"
+            parsed = self._parse_json(raw)
+        except Exception as exc:
+            logger.error(
+                "llm detector error (detector=%s, model=%s): %s",
+                self._detector_key,
+                self._runtime.model,
+                exc,
+                exc_info=True,
+            )
+            return []
+        return self._results_from_payload(snippet, parsed, vision_pages=vision_pages)
+    def _vision_enabled(self) -> bool:
+        """Return whether the provider runtime has a truthy ``supports_vision`` attribute."""
+        return bool(getattr(self._runtime, "supports_vision", False))
+    def get_supported_content_types(self) -> list[str]:
+        """Return the text content types, plus the vision types when vision is enabled."""
+        types = list(_TEXT_CONTENT_TYPES)
+        if self._vision_enabled():
+            types.extend(_VISION_CONTENT_TYPES)
+        return types
+    # ── Internals ────────────────────────────────────────────────────────────
+    def _max_tokens(self) -> int | None:
+        # `max_tokens` is generated as a RootModel[int] wrapper, so unwrap `.root`
+        # before handing it to litellm — passing the model object serialises to an
+        # invalid request body and fails the whole completion.
+        raw = self._schema.max_tokens
+        if raw is None:
+            return None
+        return getattr(raw, "root", raw)
+    def _model_string(self) -> str:
+        """Return the ``<prefix>/<model>`` string; unmapped providers use ``openai``."""
+        prefix = _PROVIDER_PREFIX.get(self._runtime.provider.value, "openai")
+        return f"{prefix}/{self._runtime.model}"
+    def _build_system_prompt(self) -> str:
+        """Assemble the system prompt from the schema.
+        Sections, joined by blank lines: the schema's own prompt, the label list (if any),
+        the fields to extract (if any), the required JSON response shape, and the
+        schema's example response (if any).
+        """
+        schema = self._schema
+        parts: list[str] = [schema.system_prompt.strip()]
+        labels = schema.labels or []
+        if labels:
+            label_lines = "\n".join(
+                f"- {lbl.name}: {lbl.description}" if lbl.description else f"- {lbl.name}"
+                for lbl in labels
+            )
+            parts.append(
+                "Classify the content using these labels:\n"
+                + label_lines
+                + (
+                    "\nMultiple labels may apply."
+                    if schema.multi_label
+                    else "\nChoose the single best label."
+                )
+            )
+        fields = schema.output_fields or []
+        if fields:
+            field_lines = "\n".join(
+                f"- {f.name} ({f.type.value if f.type else 'string'}): {f.description}"
+                if f.description
+                else f"- {f.name} ({f.type.value if f.type else 'string'})"
+                for f in fields
+            )
+            parts.append("Also extract these fields:\n" + field_lines)
+        parts.append(
+            "Respond with a JSON object of the form: "
+            '{"labels": [{"name": "<label>", "confidence": <0-1>, '
+            '"matched_content": "<relevant snippet>"}], "fields": {<field name>: <value>}}. '
+            "Use only the labels listed above. Return an empty labels array when none apply."
+        )
+        if schema.response_example:
+            parts.append("Example response:\n" + schema.response_example.strip())
+        return "\n\n".join(parts)
+    @staticmethod
+    def _parse_json(raw: str) -> dict[str, Any]:
+        """Parse ``raw`` as a JSON object, tolerating text around the object.
+        If the whole string is not valid JSON, the span from the first ``{`` to the last
+        ``}`` is tried instead. Returns ``{}`` when nothing parses or the parsed value is
+        not a dict.
+        """
+        try:
+            parsed = json.loads(raw)
+        except json.JSONDecodeError:
+            start = raw.find("{")
+            end = raw.rfind("}")
+            if start == -1 or end == -1 or end <= start:
+                return {}
+            try:
+                parsed = json.loads(raw[start : end + 1])
+            except json.JSONDecodeError:
+                return {}
+        return parsed if isinstance(parsed, dict) else {}
+    @staticmethod
+    def _coerce_confidence(raw: Any) -> float | None:
+        """Return ``raw`` as a float, or ``None`` when it is not numeric.
+        Falsy values (``None``, ``0``, ``""``) map to ``0.0``. Model output is untrusted,
+        so values such as ``"high"`` or a nested list yield ``None`` instead of raising.
+        """
+        try:
+            return float(raw or 0.0)
+        except (TypeError, ValueError):
+            return None
+    def _results_from_payload(
+        self,
+        snippet: str,
+        payload: dict[str, Any],
+        *,
+        vision_pages: int | None = None,
+    ) -> list[DetectionResult]:
+        """Turn the parsed model payload into results sorted by descending confidence.
+        One result is produced per entry of ``payload["labels"]`` that is a dict, has a
+        non-blank ``name``, and a numeric ``confidence`` at or above the schema threshold
+        (default 0.5). A missing ``confidence`` counts as 1.0. Reported confidence is
+        capped at 0.99. ``snippet[:320]`` is used when an entry has no ``matched_content``.
+        """
+        schema = self._schema
+        threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.5
+        default_severity = schema.severity or Severity.info
+        extracted = self._coerce_fields(payload.get("fields"))
+        raw_labels = payload.get("labels")
+        label_entries: list[dict[str, Any]] = (
+            [lbl for lbl in raw_labels if isinstance(lbl, dict)]
+            if isinstance(raw_labels, list)
+            else []
+        )
+        results: list[DetectionResult] = []
+        for entry in label_entries:
+            label = str(entry.get("name", "")).strip()
+            if not label:
+                continue
+            confidence = self._coerce_confidence(entry.get("confidence", 1.0))
+            if confidence is None:
+                logger.warning(
+                    "llm detector ignored label %r with non-numeric confidence (detector=%s)",
+                    label,
+                    self._detector_key,
+                )
+                continue
+            # `not >=` instead of `<` so a NaN confidence is rejected too: NaN compares
+            # False both ways and would otherwise be reported at the 0.99 cap.
+            if not confidence >= threshold:
+                continue
+            severity = _resolve_pipeline_severity(label, schema.severity_map, default_severity)
+            matched = str(entry.get("matched_content") or "").strip() or snippet[:320]
+            results.append(
+                DetectionResult(
+                    detector_type=DetectorType.CUSTOM,
+                    finding_type=label,
+                    category="CLASSIFICATION",
+                    severity=severity,
+                    confidence=min(0.99, confidence),
+                    matched_content=matched,
+                    location=None,
+                    custom_detector_key=self._detector_key,
+                    custom_detector_name=self._detector_name,
+                    detected_at=datetime.now(UTC),
+                    metadata={
+                        "runner": "LLM",
+                        "provider": self._runtime.provider.value,
+                        "model": self._runtime.model,
+                        "label": label,
+                        "fields": extracted,
+                        "input": "vision" if vision_pages is not None else "text",
+                        **({"vision_pages": vision_pages} if vision_pages is not None else {}),
+                    },
+                    extracted_data=extracted or None,
+                    extraction_method="LLM",
+                )
+            )
+        results.sort(key=lambda r: r.confidence, reverse=True)
+        return results
+    @staticmethod
+    def _coerce_fields(raw: Any) -> dict[str, Any]:
+        """Return ``raw`` with string keys when it is a dict, otherwise ``{}``."""
+        return {str(k): v for k, v in raw.items()} if isinstance(raw, dict) else {}

classifyre_cli-0.4.11/src/detectors/custom/runners/_object_detection.py ADDED Viewed

@@ -0,0 +1,121 @@
+"""Object detection pipeline runner."""
+from __future__ import annotations
+import logging
+from typing import Any
+from ....models.generated_detectors import ObjectDetectionPipelineSchema
+from ....models.generated_single_asset_scan_results import DetectionResult, Location
+from ...dependencies import MissingDependencyError, ensure_torch, require_module
+from ._base import (
+    _IMAGE_INPUT_CONTENT_TYPES,
+    BaseRunner,
+    _load_input_images,
+    _resolve_pipeline_severity,
+)
+logger = logging.getLogger(__name__)
+class ObjectDetectionRunner(BaseRunner):
+    """Runner wrapping one HuggingFace ``object-detection`` pipeline.
+    Input bytes are opened as one or more images (one per page for rendered files) and
+    every detection that clears the schema's filters becomes a ``DetectionResult``.
+    """
+    def __init__(
+        self,
+        schema: ObjectDetectionPipelineSchema,
+        detector_key: str = "",
+        detector_name: str = "",
+    ) -> None:
+        """Resolve the optional dependencies and construct the pipeline.
+        Raises:
+            MissingDependencyError: if building the pipeline raises ``ImportError``.
+        """
+        self._schema = schema
+        self._detector_key = detector_key
+        self._detector_name = detector_name
+        ensure_torch("object_detection", ["custom", "detectors"])
+        transformers = require_module("transformers", "object_detection", ["custom", "detectors"])
+        self._pil = require_module("PIL.Image", "object_detection", ["custom", "detectors"])
+        build_args: dict[str, Any] = {"model": schema.model, "device": schema.device or "cpu"}
+        revision = schema.model_revision
+        if revision:
+            build_args["revision"] = revision
+        # Use the value's ``root`` attribute when it has one, otherwise the value itself.
+        raw_nms = schema.nms_threshold
+        nms = getattr(raw_nms, "root", raw_nms)
+        if nms is not None:
+            build_args["threshold"] = nms
+        try:
+            self._pipe: Any = transformers.pipeline("object-detection", **build_args)
+        except ImportError as exc:
+            raise MissingDependencyError(
+                "object_detection",
+                ["custom", "detectors"],
+                f"ObjectDetectionRunner requires additional dependencies: {exc}",
+            ) from exc
+    def run(self, text: str) -> None:  # type: ignore[override]  # pragma: no cover
+        raise NotImplementedError("ObjectDetectionRunner uses detect() directly")
+    def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
+        """Detect objects in ``content`` and return findings, highest confidence first.
+        String content and inputs that yield no images return ``[]``. Detections scoring
+        below the confidence threshold (default 0.5) or with a box smaller than
+        ``min_box_area`` are dropped. An error on one page is logged and the remaining
+        pages are still processed. The sorted list is truncated to ``top_k`` when set.
+        """
+        if isinstance(content, str):
+            logger.warning("object_detection: received string content, expected bytes")
+            return []
+        # image/* opens directly; PDFs are rasterised to one image per page.
+        pages = _load_input_images(content, content_type, self._pil)
+        if not pages:
+            return []
+        schema = self._schema
+        min_score = 0.5 if schema.confidence_threshold is None else schema.confidence_threshold
+        has_many_pages = len(pages) > 1
+        findings: list[DetectionResult] = []
+        for page_index, image in pages:
+            page_number = page_index + 1
+            try:
+                for det in self._pipe(image) or []:
+                    label: str = det.get("label", "unknown")
+                    score: float = float(det.get("score", 0.0))
+                    box: dict[str, int] = det.get("box", {})
+                    if score < min_score:
+                        continue
+                    if schema.min_box_area is not None:
+                        # Clamp at zero so an inverted box counts as empty, not negative.
+                        width = max(0, box.get("xmax", 0) - box.get("xmin", 0))
+                        height = max(0, box.get("ymax", 0) - box.get("ymin", 0))
+                        if width * height < schema.min_box_area:
+                            continue
+                    severity = _resolve_pipeline_severity(label, schema.severity_map)
+                    page_label = f"page {page_number} " if has_many_pages else ""
+                    # The page number is only recorded when the input had several pages.
+                    details: dict[str, Any] = {
+                        "box": box,
+                        "score": score,
+                        "image_size": f"{image.size[0]}x{image.size[1]}",
+                        "model": schema.model,
+                        **({"page": page_number} if has_many_pages else {}),
+                    }
+                    where = Location(
+                        description=(
+                            f"{page_label}box xmin={box.get('xmin')} ymin={box.get('ymin')}"
+                            f" xmax={box.get('xmax')} ymax={box.get('ymax')}"
+                        ),
+                    )
+                    findings.append(
+                        self._make_result(
+                            finding_type=label,
+                            category="CONTENT",
+                            severity=severity,
+                            confidence=score,
+                            matched_content=label,
+                            location=where,
+                            metadata=details,
+                        )
+                    )
+            except Exception as exc:
+                logger.error(
+                    "object_detection error (model=%s): %s", schema.model, exc, exc_info=True
+                )
+        ranked = sorted(findings, key=lambda item: item.confidence, reverse=True)
+        return ranked if schema.top_k is None else ranked[: schema.top_k]
+    def get_supported_content_types(self) -> list[str]:
+        """Return a copy of the accepted image-input content types."""
+        return list(_IMAGE_INPUT_CONTENT_TYPES)

classifyre-cli 0.4.9__tar.gz → 0.4.11__tar.gz

classifyre-cli 0.4.9__tar.gz → 0.4.11__tar.gz