PyPI - classifyre-cli - Versions diffs - 0.4.8__tar.gz → 0.4.10__tar.gz - Mend

classifyre-cli 0.4.8tar.gz → 0.4.10tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (179) hide show

{classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/.turbo/turbo-build.log RENAMED Viewed

@@ -1,3 +1,3 @@
 $ uv sync
-Resolved 256 packages in 185ms
+Resolved 265 packages in 156ms
 Checked 50 packages in 1ms

{classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: classifyre-cli
-Version: 0.4.8
+Version: 0.4.10
 Summary: Classifyre CLI — scan and classify unstructured data sources
 License: MIT
 Keywords: data,ingestion,metadata,pii,secrets,unstructured

{classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/package.json RENAMED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@classifyre/cli",
-  "version": "0.4.8",
+  "version": "0.4.10",
   "private": true,
   "scripts": {
     "build": "uv sync",

{classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "classifyre-cli"
-version = "0.4.8"
+version = "0.4.10"
 description = "Classifyre CLI — scan and classify unstructured data sources"
 readme = "README.md"
 requires-python = ">=3.12"
@@ -91,6 +91,9 @@ custom = [
 regex = [
     "google-re2>=1.1",
 ]
+llm = [
+    "litellm>=1.86.2",
+]
 detectors = [
     { include-group = "file-processing" },
     { include-group = "privacy" },
@@ -101,6 +104,7 @@ detectors = [
     { include-group = "classification" },
     { include-group = "custom" },
     { include-group = "regex" },
+    { include-group = "llm" },
 ]
 file-processing = [
     "filetype>=1.2.0",
@@ -264,6 +268,8 @@ module = [
     "datasets",
     "setfit.*",
     "setfit",
+    "litellm.*",
+    "litellm",
     "sklearn.*",
     "sklearn",
     "numpy",

classifyre_cli-0.4.10/src/detectors/custom/runners/_llm.py ADDED Viewed

@@ -0,0 +1,230 @@
+"""AI/LLM pipeline runner — prompt-driven classification and field extraction."""
+from __future__ import annotations
+import json
+import logging
+import os
+from datetime import UTC, datetime
+from typing import Any
+# Quiet litellm's import-time provider preload warnings (bedrock/sagemaker need
+# botocore, which we don't install) before the library is ever imported.
+os.environ.setdefault("LITELLM_LOG", "ERROR")
+from ....models.generated_detectors import LLMPipelineSchema, Severity
+from ....models.generated_single_asset_scan_results import (
+    DetectionResult,
+    DetectorType,
+)
+from ...dependencies import require_module
+from ._base import _TEXT_CONTENT_TYPES, BaseRunner, _resolve_pipeline_severity
+logger = logging.getLogger(__name__)
+# Map the stored AI provider type onto the litellm model-string convention.
+# Keys are the string values of the provider enum; each value becomes the
+# "<prefix>/" part of the "<prefix>/<model>" string built by
+# LLMRunner._model_string, which falls back to "openai" for any provider
+# value that has no entry here.
+_PROVIDER_PREFIX: dict[str, str] = {
+    "CLAUDE": "anthropic",
+    "GEMINI": "gemini",
+    "OPENAI_COMPATIBLE": "openai",
+}
+class LLMRunner(BaseRunner):
+    """AI detector — sends content to a configured LLM provider for classification + extraction."""
+    def __init__(
+        self, schema: LLMPipelineSchema, detector_key: str = "", detector_name: str = ""
+    ) -> None:
+        self._schema = schema
+        self._detector_key = detector_key
+        self._detector_name = detector_name
+        runtime = schema.provider_runtime
+        if runtime is None:
+            raise ValueError(
+                f"AI detector '{detector_key}' is missing provider_runtime — the API must "
+                "inject resolved provider credentials before dispatch."
+            )
+        self._runtime = runtime
+        self._litellm = require_module("litellm", "llm", ["llm"])
+        # Let litellm silently drop params an endpoint doesn't support (e.g.
+        # response_format / temperature on some OpenAI-compatible gateways)
+        # instead of raising. Keep its own logging quiet.
+        self._litellm.drop_params = True
+        self._litellm.suppress_debug_info = True
+        logging.getLogger("LiteLLM").setLevel(logging.ERROR)
+    def run(self, text: str) -> None:  # type: ignore[override]  # pragma: no cover
+        raise NotImplementedError("LLMRunner uses detect() directly")
+    def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
+        if isinstance(content, bytes):
+            return []
+        if content_type not in _TEXT_CONTENT_TYPES:
+            return []
+        text = content.strip()
+        if not text:
+            return []
+        schema = self._schema
+        content_limit = schema.content_limit or 8000
+        snippet = text[:content_limit]
+        messages = [
+            {"role": "system", "content": self._build_system_prompt()},
+            {"role": "user", "content": snippet},
+        ]
+        try:
+            response = self._litellm.completion(
+                model=self._model_string(),
+                api_key=self._runtime.api_key,
+                api_base=self._runtime.base_url or None,
+                temperature=schema.temperature if schema.temperature is not None else 0.0,
+                max_tokens=self._max_tokens(),
+                messages=messages,
+                response_format={"type": "json_object"},
+            )
+            raw = response.choices[0].message.content or "{}"
+            parsed = self._parse_json(raw)
+        except Exception as exc:
+            logger.error(
+                "llm detector error (detector=%s, model=%s): %s",
+                self._detector_key,
+                self._runtime.model,
+                exc,
+                exc_info=True,
+            )
+            return []
+        return self._results_from_payload(snippet, parsed)
+    def get_supported_content_types(self) -> list[str]:
+        return list(_TEXT_CONTENT_TYPES)
+    # ── Internals ────────────────────────────────────────────────────────────
+    def _max_tokens(self) -> int | None:
+        # `max_tokens` is generated as a RootModel[int] wrapper, so unwrap `.root`
+        # before handing it to litellm — passing the model object serialises to an
+        # invalid request body and fails the whole completion.
+        raw = self._schema.max_tokens
+        if raw is None:
+            return None
+        return getattr(raw, "root", raw)
+    def _model_string(self) -> str:
+        prefix = _PROVIDER_PREFIX.get(self._runtime.provider.value, "openai")
+        return f"{prefix}/{self._runtime.model}"
+    def _build_system_prompt(self) -> str:
+        schema = self._schema
+        parts: list[str] = [schema.system_prompt.strip()]
+        labels = schema.labels or []
+        if labels:
+            label_lines = "\n".join(
+                f"- {lbl.name}: {lbl.description}" if lbl.description else f"- {lbl.name}"
+                for lbl in labels
+            )
+            parts.append(
+                "Classify the content using these labels:\n"
+                + label_lines
+                + (
+                    "\nMultiple labels may apply."
+                    if schema.multi_label
+                    else "\nChoose the single best label."
+                )
+            )
+        fields = schema.output_fields or []
+        if fields:
+            field_lines = "\n".join(
+                f"- {f.name} ({f.type.value if f.type else 'string'}): {f.description}"
+                if f.description
+                else f"- {f.name} ({f.type.value if f.type else 'string'})"
+                for f in fields
+            )
+            parts.append("Also extract these fields:\n" + field_lines)
+        parts.append(
+            "Respond with a JSON object of the form: "
+            '{"labels": [{"name": "<label>", "confidence": <0-1>, '
+            '"matched_content": "<relevant snippet>"}], "fields": {<field name>: <value>}}. '
+            "Use only the labels listed above. Return an empty labels array when none apply."
+        )
+        if schema.response_example:
+            parts.append("Example response:\n" + schema.response_example.strip())
+        return "\n\n".join(parts)
+    @staticmethod
+    def _parse_json(raw: str) -> dict[str, Any]:
+        try:
+            parsed = json.loads(raw)
+        except json.JSONDecodeError:
+            start = raw.find("{")
+            end = raw.rfind("}")
+            if start == -1 or end == -1 or end <= start:
+                return {}
+            try:
+                parsed = json.loads(raw[start : end + 1])
+            except json.JSONDecodeError:
+                return {}
+        return parsed if isinstance(parsed, dict) else {}
+    def _results_from_payload(self, snippet: str, payload: dict[str, Any]) -> list[DetectionResult]:
+        schema = self._schema
+        threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.5
+        default_severity = schema.severity or Severity.info
+        extracted = self._coerce_fields(payload.get("fields"))
+        raw_labels = payload.get("labels")
+        label_entries: list[dict[str, Any]] = (
+            [lbl for lbl in raw_labels if isinstance(lbl, dict)]
+            if isinstance(raw_labels, list)
+            else []
+        )
+        results: list[DetectionResult] = []
+        for entry in label_entries:
+            label = str(entry.get("name", "")).strip()
+            if not label:
+                continue
+            confidence = float(entry.get("confidence", 1.0) or 0.0)
+            if confidence < threshold:
+                continue
+            severity = _resolve_pipeline_severity(label, schema.severity_map, default_severity)
+            matched = str(entry.get("matched_content") or "").strip() or snippet[:320]
+            results.append(
+                DetectionResult(
+                    detector_type=DetectorType.CUSTOM,
+                    finding_type=f"llm:{label}",
+                    category="CLASSIFICATION",
+                    severity=severity,
+                    confidence=min(0.99, confidence),
+                    matched_content=matched,
+                    location=None,
+                    custom_detector_key=self._detector_key,
+                    custom_detector_name=self._detector_name,
+                    detected_at=datetime.now(UTC),
+                    metadata={
+                        "runner": "LLM",
+                        "provider": self._runtime.provider.value,
+                        "model": self._runtime.model,
+                        "label": label,
+                        "fields": extracted,
+                    },
+                    extracted_data=extracted or None,
+                    extraction_method="LLM",
+                )
+            )
+        results.sort(key=lambda r: r.confidence, reverse=True)
+        return results
+    @staticmethod
+    def _coerce_fields(raw: Any) -> dict[str, Any]:
+        return {str(k): v for k, v in raw.items()} if isinstance(raw, dict) else {}

{classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/models/generated_detectors.py RENAMED Viewed

@@ -189,7 +189,7 @@ class DetectorCatalog(RootModel[list[DetectorCatalogEntry]]):
                 'categories': ['CLASSIFICATION', 'COMPLIANCE'],
                 'supported_asset_types': ['TXT', 'TABLE', 'URL', 'IMAGE'],
                 'recommended_model': 'mDeBERTa-v3 + SetFit + GLiNER + HuggingFace transformers',
-                'notes': 'User-defined rules and pipelines tailored to specific business needs. Supports regex, GLiNER2, LLM, text classification, image classification, feature extraction, and object detection pipelines.',
+                'notes': 'User-defined rules and pipelines tailored to specific business needs. Supports regex, GLiNER2, AI/LLM (prompt-driven classification + extraction via a configured provider), text classification, image classification, feature extraction, and object detection pipelines.',
             },
         ],
         description='Detector capability catalog used for planning and runtime routing',
@@ -954,18 +954,156 @@ class RegexPipelineSchema(BaseModel):
     validation: PipelineValidationConfig | None = None
+class LLMLabelDefinition(BaseModel):
+    """
+    One classification label the AI detector may assign to content.
+    """
+    model_config = ConfigDict(
+        extra='forbid',
+    )
+    name: str = Field(
+        ...,
+        description="Label name returned by the model (e.g. 'good', 'bad', 'violent').",
+    )
+    description: str | None = Field(
+        '', description='Guidance describing when this label applies.'
+    )
 class Type3(StrEnum):
+    string = 'string'
+    number = 'number'
+    boolean = 'boolean'
+    list_string_ = 'list[string]'
+    list_number_ = 'list[number]'
+class LLMOutputField(BaseModel):
+    """
+    One structured property the AI detector extracts and stores in finding metadata and extracted_data.
+    """
+    model_config = ConfigDict(
+        extra='forbid',
+    )
+    name: str = Field(
+        ..., description='Output field name — becomes a key in extracted_data JSON.'
+    )
+    description: str | None = Field(
+        '', description='Hint for what this field captures.'
+    )
+    type: Type3 | None = 'string'
+class Provider(StrEnum):
+    """
+    Resolved AI provider type.
+    """
+    OPENAI_COMPATIBLE = 'OPENAI_COMPATIBLE'
+    CLAUDE = 'CLAUDE'
+    GEMINI = 'GEMINI'
+class LLMProviderRuntime(BaseModel):
+    """
+    Runtime-only provider credentials injected by the API at dispatch time. Never persisted with the detector config and rejected on create/update.
+    """
+    model_config = ConfigDict(
+        extra='forbid',
+    )
+    provider: Provider = Field(..., description='Resolved AI provider type.')
+    model: str = Field(
+        ...,
+        description='Resolved model identifier (e.g. gpt-4o, claude-sonnet-4-5, gemini-2.0-flash).',
+    )
+    api_key: str = Field(..., description='Decrypted provider API key.')
+    base_url: str | None = Field(
+        None,
+        description='Base URL for OpenAI-compatible endpoints. Null for managed providers.',
+    )
+    context_size: int | None = Field(
+        None, description='Optional context window size configured for the provider.'
+    )
+class Type4(StrEnum):
     LLM = 'LLM'
+class MaxTokens(RootModel[int]):
+    # NOTE(review): the default below is None although the root is annotated
+    # as int — presumably a code-generator artefact of a nullable schema
+    # property. Confirm against the source schema and correct it there rather
+    # than in this file.
+    root: int = Field(
+        None,
+        description='Maximum tokens to generate. Provider default when null.',
+        ge=1,
+    )
 class LLMPipelineSchema(BaseModel):
+    """
+    AI detector pipeline. Sends content to a configured LLM provider with a system prompt, classifies it against a label set, and extracts structured fields. Predicted labels become findings (severity via severity_map); extracted fields are stored in finding metadata and extracted_data.
+    """
     model_config = ConfigDict(
         extra='forbid',
     )
     type: Literal['LLM'] = 'LLM'
+    system_prompt: str = Field(
+        ...,
+        description='Instruction describing what the model should detect, classify, and extract.',
+    )
+    response_example: str | None = Field(
+        None,
+        description='Optional few-shot example of the JSON the model should return.',
+    )
+    temperature: float | None = Field(
+        0.0,
+        description='Sampling temperature. Lower is more deterministic.',
+        ge=0.0,
+        le=2.0,
+    )
+    max_tokens: MaxTokens | None = Field(
+        None, description='Maximum tokens to generate. Provider default when null.'
+    )
+    labels: list[LLMLabelDefinition] | None = Field(
+        [],
+        description='Classification taxonomy the model assigns to content.',
+        validate_default=True,
+    )
+    multi_label: bool | None = Field(
+        False, description='Allow more than one label per asset.'
+    )
+    severity: Severity | None = Field(
+        'info',
+        description='Default severity when no severity_map rule matches a predicted label.',
+    )
+    severity_map: list[PipelineSeverityRule] | None = Field(
+        None,
+        description='Ordered rules mapping predicted labels to severity levels. First matching rule wins.',
+    )
+    confidence_threshold: float | None = Field(
+        0.5,
+        description='Minimum model confidence to report a label as a finding (0-1).',
+        ge=0.0,
+        le=1.0,
+    )
+    output_fields: list[LLMOutputField] | None = Field(
+        [],
+        description='Structured properties the model extracts. Stored in finding metadata and extracted_data.',
+        validate_default=True,
+    )
+    content_limit: int | None = Field(
+        8000, description='Maximum characters of content sent to the model.', ge=1
+    )
+    provider_runtime: LLMProviderRuntime | None = Field(
+        None,
+        description='Runtime-only credentials injected by the API at dispatch. Never persisted; rejected on create/update.',
+    )
-class Type4(StrEnum):
+class Type5(StrEnum):
     TEXT_CLASSIFICATION = 'TEXT_CLASSIFICATION'
@@ -1055,7 +1193,7 @@ class TextClassificationPipelineSchema(BaseModel):
     )
-class Type5(StrEnum):
+class Type6(StrEnum):
     IMAGE_CLASSIFICATION = 'IMAGE_CLASSIFICATION'
@@ -1108,7 +1246,7 @@ class ImageClassificationPipelineSchema(BaseModel):
     )
-class Type6(StrEnum):
+class Type7(StrEnum):
     FEATURE_EXTRACTION = 'FEATURE_EXTRACTION'
@@ -1180,7 +1318,7 @@ class FeatureExtractionPipelineSchema(BaseModel):
     )
-class Type7(StrEnum):
+class Type8(StrEnum):
     OBJECT_DETECTION = 'OBJECT_DETECTION'

{classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/outputs/rest.py RENAMED Viewed

@@ -1,17 +1,85 @@
 from __future__ import annotations
 import logging
+import random
 from typing import Any, Literal, cast
 from urllib.parse import urljoin
 import requests  # type: ignore[import-untyped]
 from pydantic import BaseModel, ConfigDict, Field
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry  # type: ignore[import-untyped]
 from .base import OutputRuntimeContext, OutputType
 logger = logging.getLogger(__name__)
+class _JitteredRetry(Retry):
+    """urllib3 Retry subclass that adds ±25 % multiplicative jitter to the
+    computed backoff so that multiple concurrent CLI jobs do not all retry
+    at exactly the same moment (thundering-herd mitigation).
+    The jitter is applied *after* the standard exponential backoff formula
+    and the backoff_max cap, so it never pushes the delay above
+    backoff_max * 1.25.
+    """
+    _JITTER_FACTOR: float = 0.25
+    def get_backoff_time(self) -> float:  # type: ignore[override]
+        base = super().get_backoff_time()
+        if base == 0:
+            return 0.0
+        lo = base * (1 - self._JITTER_FACTOR)
+        hi = base * (1 + self._JITTER_FACTOR)
+        return random.uniform(lo, hi)
+# Retry policy for CLI → API REST calls.
+#
+# What we retry and why:
+#   connect=8  — pod restarted / not yet ready (RemoteDisconnected, ConnectionReset,
+#                ConnectTimeout). Request never reached the application.
+#   read=8     — API is under load and slow to respond (ReadTimeout). Safe to retry
+#                because all endpoints are idempotent (bulk ingest is upsert-based,
+#                status/findings updates are set-operations).
+#   status=8   — transient HTTP errors from an overloaded or restarting API:
+#                  408 Request Timeout   - API-level timeout
+#                  429 Too Many Requests - rate-limited / backpressure
+#                  502 Bad Gateway       - proxy has no upstream yet
+#                  503 Service Unavail.  - under-pressure / pod not ready
+#                  504 Gateway Timeout   - upstream took too long
+#
+# backoff_factor=2, backoff_max=60: exponential cap at 60 s, with ±25 % jitter
+# (see _JitteredRetry). urllib3 sleeps backoff_factor * 2 ** (consecutive
+# failures - 1) seconds and skips the sleep after the first failure, so the
+# approximate wait before each retry is:
+#   retry 1 → immediate (0 s)
+#   retry 2 → ~4 s
+#   retry 3 → ~8 s
+#   retry 4 → ~16 s
+#   retry 5 → ~32 s
+#   retry 6 → ~60 s  (64 s, capped)
+#   retry 7 → ~60 s  (capped)
+#   retry 8 → ~60 s  (capped)
+# Total extra wait: ~240 s (~4 min) — covers extended load spikes on a
+# single-node VPS before event-loop pressure drops. total=8 allows the initial
+# attempt plus 8 retries, so assuming a 120 s per-request timeout (TODO confirm
+# against timeout_sec) a single call costs at worst 9 * 120 s + 240 s = ~22 min,
+# acceptable for long-running scans.
+# A Retry-After header on a 429 or 503 response takes precedence over this
+# schedule (urllib3 honours it by default).
+#
+# POST and PATCH are explicitly allowed: without this urllib3 only retries
+# its default method set (HEAD, GET, PUT, DELETE, OPTIONS, TRACE).
+_RETRY_POLICY = _JitteredRetry(
+    total=8,
+    connect=8,
+    read=8,
+    status=8,
+    backoff_factor=2,
+    backoff_max=60,
+    status_forcelist={408, 429, 502, 503, 504},
+    allowed_methods={"GET", "POST", "PUT", "PATCH", "DELETE", "HEAD", "OPTIONS"},
+    raise_on_status=False,
+)
 def _drop_none_recursive(value: Any) -> Any:
     if isinstance(value, dict):
         return {key: _drop_none_recursive(item) for key, item in value.items() if item is not None}
@@ -63,6 +131,9 @@ class RestOutputSink:
         self.base_url = base_url.rstrip("/")
         self.timeout_sec = timeout_sec
         self.session = requests.Session()
+        adapter = HTTPAdapter(max_retries=_RETRY_POLICY)
+        self.session.mount("http://", adapter)
+        self.session.mount("https://", adapter)
         self._runner_id = context.runner_id
         self._seen_hashes: set[str] = set()

classifyre-cli 0.4.8__tar.gz → 0.4.10__tar.gz

classifyre-cli 0.4.8tar.gz → 0.4.10tar.gz