PyPI - classifyre-cli - Versions diffs - 0.4.2__py3-none-any.whl - Mend

classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (101) hide show

classifyre_cli-0.4.2.dist-info/METADATA +167 -0
classifyre_cli-0.4.2.dist-info/RECORD +101 -0
classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
src/__init__.py +1 -0
src/detectors/__init__.py +105 -0
src/detectors/base.py +97 -0
src/detectors/broken_links/__init__.py +3 -0
src/detectors/broken_links/detector.py +280 -0
src/detectors/config.py +59 -0
src/detectors/content/__init__.py +0 -0
src/detectors/custom/__init__.py +13 -0
src/detectors/custom/detector.py +45 -0
src/detectors/custom/runners/__init__.py +56 -0
src/detectors/custom/runners/_base.py +177 -0
src/detectors/custom/runners/_factory.py +51 -0
src/detectors/custom/runners/_feature_extraction.py +138 -0
src/detectors/custom/runners/_gliner2.py +324 -0
src/detectors/custom/runners/_image_classification.py +98 -0
src/detectors/custom/runners/_llm.py +22 -0
src/detectors/custom/runners/_object_detection.py +107 -0
src/detectors/custom/runners/_regex.py +147 -0
src/detectors/custom/runners/_text_classification.py +109 -0
src/detectors/custom/trainer.py +293 -0
src/detectors/dependencies.py +109 -0
src/detectors/pii/__init__.py +0 -0
src/detectors/pii/detector.py +883 -0
src/detectors/secrets/__init__.py +0 -0
src/detectors/secrets/detector.py +399 -0
src/detectors/threat/__init__.py +0 -0
src/detectors/threat/code_security_detector.py +206 -0
src/detectors/threat/yara_detector.py +177 -0
src/main.py +608 -0
src/models/generated_detectors.py +1296 -0
src/models/generated_input.py +2732 -0
src/models/generated_single_asset_scan_results.py +240 -0
src/outputs/__init__.py +3 -0
src/outputs/base.py +69 -0
src/outputs/console.py +62 -0
src/outputs/factory.py +156 -0
src/outputs/file.py +83 -0
src/outputs/rest.py +258 -0
src/pipeline/__init__.py +7 -0
src/pipeline/content_provider.py +26 -0
src/pipeline/detector_pipeline.py +742 -0
src/pipeline/parsed_content_provider.py +59 -0
src/sandbox/__init__.py +5 -0
src/sandbox/runner.py +145 -0
src/sources/__init__.py +95 -0
src/sources/atlassian_common.py +389 -0
src/sources/azure_blob_storage/__init__.py +3 -0
src/sources/azure_blob_storage/source.py +130 -0
src/sources/base.py +296 -0
src/sources/confluence/__init__.py +3 -0
src/sources/confluence/source.py +733 -0
src/sources/databricks/__init__.py +3 -0
src/sources/databricks/source.py +1279 -0
src/sources/dependencies.py +81 -0
src/sources/google_cloud_storage/__init__.py +3 -0
src/sources/google_cloud_storage/source.py +114 -0
src/sources/hive/__init__.py +3 -0
src/sources/hive/source.py +709 -0
src/sources/jira/__init__.py +3 -0
src/sources/jira/source.py +605 -0
src/sources/mongodb/__init__.py +3 -0
src/sources/mongodb/source.py +550 -0
src/sources/mssql/__init__.py +3 -0
src/sources/mssql/source.py +1034 -0
src/sources/mysql/__init__.py +3 -0
src/sources/mysql/source.py +797 -0
src/sources/neo4j/__init__.py +0 -0
src/sources/neo4j/source.py +523 -0
src/sources/object_storage/base.py +679 -0
src/sources/oracle/__init__.py +3 -0
src/sources/oracle/source.py +982 -0
src/sources/postgresql/__init__.py +3 -0
src/sources/postgresql/source.py +774 -0
src/sources/powerbi/__init__.py +3 -0
src/sources/powerbi/source.py +774 -0
src/sources/recipe_normalizer.py +179 -0
src/sources/s3_compatible_storage/README.md +66 -0
src/sources/s3_compatible_storage/__init__.py +3 -0
src/sources/s3_compatible_storage/source.py +150 -0
src/sources/servicedesk/__init__.py +3 -0
src/sources/servicedesk/source.py +620 -0
src/sources/slack/__init__.py +3 -0
src/sources/slack/source.py +534 -0
src/sources/snowflake/__init__.py +3 -0
src/sources/snowflake/source.py +912 -0
src/sources/tableau/__init__.py +3 -0
src/sources/tableau/source.py +799 -0
src/sources/tabular_utils.py +165 -0
src/sources/wordpress/__init__.py +3 -0
src/sources/wordpress/source.py +590 -0
src/telemetry.py +96 -0
src/utils/__init__.py +1 -0
src/utils/content_extraction.py +108 -0
src/utils/file_parser.py +777 -0
src/utils/hashing.py +82 -0
src/utils/uv_sync.py +79 -0
src/utils/validation.py +56 -0

src/models/generated_single_asset_scan_results.py ADDED Viewed

@@ -0,0 +1,240 @@
+# generated by datamodel-codegen:
+#   filename:  single_asset_scan_results.json
+from __future__ import annotations
+from enum import StrEnum
+from typing import Any
+from pydantic import AwareDatetime, BaseModel, Field
+# String-valued enumeration of asset payload kinds. Every member's value is
+# identical to its name. Referenced by SingleAssetScanResults.asset_type.
+class AssetType(StrEnum):
+    """
+    Canonical type of the asset payload
+    """
+    TXT = 'TXT'
+    TABLE = 'TABLE'
+    IMAGE = 'IMAGE'
+    VIDEO = 'VIDEO'
+    AUDIO = 'AUDIO'
+    URL = 'URL'
+    BINARY = 'BINARY'
+    OTHER = 'OTHER'
+# Identifies which detector produced a finding. Referenced by
+# DetectionResult.detector_type and ScanStats.detectors_run.
+class DetectorType(StrEnum):
+    """
+    Type of detector for content analysis
+    """
+    SECRETS = 'SECRETS'
+    PII = 'PII'
+    YARA = 'YARA'
+    BROKEN_LINKS = 'BROKEN_LINKS'
+    CODE_SECURITY = 'CODE_SECURITY'
+    # Findings from this detector may also carry the custom_detector_* fields
+    # declared on DetectionResult.
+    CUSTOM = 'CUSTOM'
+# Preferred values for DetectionResult.category, which also accepts arbitrary
+# strings. Note that SECRETS and PII exist here as categories in addition to
+# being DetectorType members; the two enums are independent.
+class FindingCategory(StrEnum):
+    """
+    Normalized finding category for reporting and filtering
+    """
+    SECURITY = 'SECURITY'
+    PRIVACY = 'PRIVACY'
+    THREAT = 'THREAT'
+    CONTENT = 'CONTENT'
+    QUALITY = 'QUALITY'
+    FAIRNESS = 'FAIRNESS'
+    COMPLIANCE = 'COMPLIANCE'
+    SECRETS = 'SECRETS'
+    PII = 'PII'
+    CLASSIFICATION = 'CLASSIFICATION'
+# Severity scale used by DetectionResult.severity. Member names are lowercase
+# (matching their values), unlike the upper-case members of the other enums in
+# this module, so the serialized form is e.g. 'high' rather than 'HIGH'.
+class Severity(StrEnum):
+    """
+    Severity level of finding
+    """
+    critical = 'critical'
+    high = 'high'
+    medium = 'medium'
+    low = 'low'
+    info = 'info'
+# Every field is optional and defaults to None, so a Location may carry any
+# subset of path / line-column / offset information.
+class Location(BaseModel):
+    """
+    Location of finding in source content
+    """
+    path: str | None = Field(
+        None,
+        description="Human-readable source reference: 'schema.table, row N' for tabular, URL for web/Slack",
+        title='Path',
+    )
+    description: str | None = Field(
+        None,
+        description='Additional detail, e.g. column name where value was found',
+        title='Description',
+    )
+    # line/column are documented as 1-indexed, whereas start/end below are
+    # 0-indexed offsets; the two pairs use different bases.
+    line: int | None = Field(None, description='Line number (1-indexed)')
+    column: int | None = Field(None, description='Column number (1-indexed)')
+    # NOTE(review): whether `end` is inclusive or exclusive is not stated here.
+    start: int | None = Field(None, description='Start offset (0-indexed)')
+    end: int | None = Field(None, description='End offset (0-indexed)')
+# Required fields: scanned_at, duration_ms, detectors_run. All remaining
+# fields are optional and default to None.
+class ScanStats(BaseModel):
+    """
+    Statistics about detector scan for an asset
+    """
+    # Typed as AwareDatetime, i.e. a datetime that carries timezone info.
+    scanned_at: AwareDatetime = Field(
+        ..., description='Timestamp when the scan started'
+    )
+    duration_ms: int = Field(..., description='Duration of the scan in milliseconds')
+    detectors_run: list[DetectorType] = Field(
+        ..., description='List of detector types that were run'
+    )
+    content_size_bytes: int | None = Field(
+        None, description='Size of the content that was scanned'
+    )
+    findings_count: int | None = Field(
+        None, description='Total number of findings detected'
+    )
+    # warnings and errors are kept as separate free-text lists: warnings for
+    # non-fatal conditions, errors for detector failures.
+    warnings: list[str] | None = Field(
+        None,
+        description='Non-fatal issues during scan (e.g. content truncation, empty content)',
+    )
+    errors: list[str] | None = Field(None, description='Detector errors during scan')
+# One finding produced by a detector.
+# Required fields: detector_type, finding_type, category, severity,
+# confidence, matched_content. Everything else is optional (default None).
+class DetectionResult(BaseModel):
+    """
+    Result from detector scan
+    """
+    detector_type: DetectorType = Field(
+        ..., description='Type of detector that found this', title='Detector Type'
+    )
+    finding_type: str = Field(
+        ...,
+        description="Type of finding (e.g., 'aws_key', 'ssn', 'toxicity')",
+        title='Finding Type',
+    )
+    # Accepts either a FindingCategory member or any other string.
+    category: FindingCategory | str = Field(
+        ...,
+        description='Category of finding (normalized category preferred, string allowed for compatibility)',
+        title='Category',
+    )
+    # Declared without Field(): required, and carries no description/title.
+    severity: Severity
+    # ge/le constrain the accepted value to the closed interval [0.0, 1.0].
+    confidence: float = Field(
+        ..., description='Confidence score (0-1)', ge=0.0, le=1.0, title='Confidence'
+    )
+    # Holds the raw match; redacted_content below is the optional masked form.
+    matched_content: str = Field(
+        ..., description='The content that matched', title='Matched Content'
+    )
+    redacted_content: str | None = Field(
+        None,
+        description='Redacted version of matched content',
+        title='Redacted Content',
+    )
+    location: Location | None = Field(
+        None, description='Location of finding in content'
+    )
+    context_before: str | None = Field(
+        None, description='Text before the match', title='Context Before'
+    )
+    context_after: str | None = Field(
+        None, description='Text after the match', title='Context After'
+    )
+    runner_id: str | None = Field(
+        None,
+        description='ID of the runner that detected this finding',
+        title='Runner Id',
+    )
+    # The three custom_detector_* fields are described as applying when
+    # detector_type is CUSTOM; no validator in this class enforces that link.
+    custom_detector_id: str | None = Field(
+        None,
+        description='Database ID of custom detector instance when detector_type is CUSTOM',
+        title='Custom Detector Id',
+    )
+    custom_detector_key: str | None = Field(
+        None,
+        description='Stable key of custom detector instance when detector_type is CUSTOM',
+        title='Custom Detector Key',
+    )
+    custom_detector_name: str | None = Field(
+        None,
+        description='Display name of custom detector instance when detector_type is CUSTOM',
+        title='Custom Detector Name',
+    )
+    detected_at: AwareDatetime | None = Field(
+        None,
+        description='Timestamp when this finding was detected',
+        title='Detected At',
+    )
+    metadata: dict[str, Any] | None = Field(
+        None, description='Additional detector-specific metadata', title='Metadata'
+    )
+    extracted_data: dict[str, Any] | None = Field(
+        None,
+        description='Structured field values extracted from matched content',
+        title='Extracted Data',
+    )
+    # Free-form string: the values listed in the description are not enforced
+    # by the type.
+    extraction_method: str | None = Field(
+        None,
+        description='Which extraction strategy was used: REGEX, GLINER, CLASSIFIER_GLINER',
+        title='Extraction Method',
+    )
+# Top-level record for one scanned asset.
+# Required fields: hash, checksum, name, external_url, links, asset_type,
+# created_at, updated_at. Optional (default None): source_id, runner_id,
+# findings, scan_stats.
+class SingleAssetScanResults(BaseModel):
+    """
+    Single asset scan results with detector findings
+    """
+    # `hash` identifies the asset; `checksum` is a separate value described as
+    # covering the asset metadata for change detection.
+    hash: str = Field(..., description='Unique stable hash of the asset', title='Hash')
+    checksum: str = Field(
+        ...,
+        description='SHA-256 checksum of the asset metadata to detect changes',
+        title='Checksum',
+    )
+    name: str = Field(..., description='Name of the asset', title='Name')
+    external_url: str = Field(
+        ..., description='External URL of the asset', title='External Url'
+    )
+    # Required even when there are no links (no default is declared), so
+    # callers must pass an explicit list.
+    links: list[str] = Field(
+        ..., description='Linked asset hashes referenced by this asset', title='Links'
+    )
+    asset_type: AssetType = Field(
+        ..., description='Canonical asset content type', title='Asset Type'
+    )
+    source_id: str | None = Field(
+        None,
+        description='ID of the source this asset belongs to (optional for local runs)',
+        title='Source Id',
+    )
+    created_at: AwareDatetime = Field(
+        ..., description='The date and time the asset was created', title='Created At'
+    )
+    updated_at: AwareDatetime = Field(
+        ...,
+        description='The date and time the asset was last updated',
+        title='Updated At',
+    )
+    runner_id: str | None = Field(
+        None,
+        description='ID of the runner that produced this asset (optional for local runs)',
+        title='Runner Id',
+    )
+    findings: list[DetectionResult] | None = Field(
+        None, description='Detector findings for this asset', title='Findings'
+    )
+    scan_stats: ScanStats | None = Field(
+        None,
+        description='Statistics about the detector scan for this asset',
+        title='Scan Stats',
+    )

src/outputs/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .factory import create_output_sink, resolve_output_settings
+__all__ = ["create_output_sink", "resolve_output_settings"]

src/outputs/base.py ADDED Viewed

@@ -0,0 +1,69 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any, Literal, Protocol
+from pydantic import BaseModel
+# Closed set of output sink kinds; used as the type of the `output_type`
+# field on OutputSettings and on every envelope model below.
+OutputType = Literal["rest", "file", "console"]
+@dataclass(frozen=True)
+class OutputRuntimeContext:
+    """Immutable per-run values handed to an output sink's constructor."""
+    # Source the emitted assets belong to, or None when none was supplied.
+    source_id: str | None
+    # Runner producing the assets, or None when none was supplied.
+    runner_id: str | None
+    managed_runner: bool
+    # Copied by the sinks onto their own `batch_size` attribute.
+    batch_size: int
+@dataclass(frozen=True)
+class OutputSettings:
+    """Immutable, fully resolved output configuration.
+
+    The first five fields are required; the REST and file specific fields
+    have defaults because they only matter for their own output type.
+    """
+    output_type: OutputType
+    batch_size: int
+    source_id: str | None
+    runner_id: str | None
+    managed_runner: bool
+    # Base URL for the "rest" output type.
+    rest_url: str | None = None
+    # Timeout for the "rest" output type, in seconds.
+    rest_timeout_sec: int = 30
+    # Destination path for the "file" output type.
+    file_path: str | None = None
+# Wire format for one batch of assets. `event` is fixed to "batch" so readers
+# can tell the three envelope kinds apart.
+class BatchEnvelope(BaseModel):
+    event: Literal["batch"] = "batch"
+    output_type: OutputType
+    source_id: str | None = None
+    runner_id: str | None = None
+    # The console and file sinks pass their running batch counter here, which
+    # starts at 1 for the first non-empty batch.
+    batch_index: int
+    # Those sinks set this to len(assets).
+    asset_count: int
+    assets: list[dict[str, Any]]
+# Wire format for the end-of-run summary. `event` is fixed to "finish".
+class FinishEnvelope(BaseModel):
+    event: Literal["finish"] = "finish"
+    output_type: OutputType
+    source_id: str | None = None
+    runner_id: str | None = None
+    # Totals accumulated by the sink over all non-empty batches it emitted.
+    batch_count: int
+    total_assets: int
+# Wire format for a run failure. `event` is fixed to "error".
+class ErrorEnvelope(BaseModel):
+    event: Literal["error"] = "error"
+    output_type: OutputType
+    source_id: str | None = None
+    runner_id: str | None = None
+    # The console and file sinks store str(exception) here; no traceback or
+    # exception type is included.
+    error: str
+class OutputSink(Protocol):
+    """Structural interface that every output sink satisfies.
+
+    Sinks are matched by shape (typing.Protocol), not by inheritance; the
+    console and file sinks do not subclass this type.
+    """
+    batch_size: int
+    # Prepare the sink. The file sink opens its target here and raises from
+    # emit_batch/finish if start() was never called.
+    async def start(self) -> None: ...
+    # Emit one batch of asset dicts. The console and file sinks ignore
+    # `skip_findings` and do nothing for an empty batch.
+    # NOTE(review): the meaning of `skip_findings` is presumably defined by the
+    # REST sink, which is not visible here; confirm.
+    async def emit_batch(
+        self, assets: list[dict[str, Any]], *, skip_findings: bool = False
+    ) -> None: ...
+    # Signal successful completion of the run.
+    async def finish(self) -> None: ...
+    # Signal that the run failed with `error`.
+    async def fail(self, error: Exception) -> None: ...

src/outputs/console.py ADDED Viewed

@@ -0,0 +1,62 @@
+from __future__ import annotations
+import json
+from typing import Any
+from .base import (
+    BatchEnvelope,
+    ErrorEnvelope,
+    FinishEnvelope,
+    OutputRuntimeContext,
+    OutputType,
+)
+class ConsoleOutputSink:
+    output_type: OutputType = "console"
+    def __init__(self, context: OutputRuntimeContext):
+        self.context = context
+        self.batch_size = context.batch_size
+        self._batch_count = 0
+        self._total_assets = 0
+    async def start(self) -> None:
+        return None
+    async def emit_batch(
+        self, assets: list[dict[str, Any]], *, skip_findings: bool = False
+    ) -> None:
+        if not assets:
+            return
+        self._batch_count += 1
+        self._total_assets += len(assets)
+        payload = BatchEnvelope(
+            output_type=self.output_type,
+            source_id=self.context.source_id,
+            runner_id=self.context.runner_id,
+            batch_index=self._batch_count,
+            asset_count=len(assets),
+            assets=assets,
+        )
+        print(json.dumps(payload.model_dump(mode="json")), flush=True)
+    async def finish(self) -> None:
+        payload = FinishEnvelope(
+            output_type=self.output_type,
+            source_id=self.context.source_id,
+            runner_id=self.context.runner_id,
+            batch_count=self._batch_count,
+            total_assets=self._total_assets,
+        )
+        print(json.dumps(payload.model_dump(mode="json")), flush=True)
+    async def fail(self, error: Exception) -> None:
+        payload = ErrorEnvelope(
+            output_type=self.output_type,
+            source_id=self.context.source_id,
+            runner_id=self.context.runner_id,
+            error=str(error),
+        )
+        print(json.dumps(payload.model_dump(mode="json")), flush=True)

src/outputs/factory.py ADDED Viewed

@@ -0,0 +1,156 @@
+from __future__ import annotations
+import argparse
+import os
+from typing import Any, cast
+from .base import OutputRuntimeContext, OutputSettings, OutputSink, OutputType
+from .console import ConsoleOutputSink
+from .file import FileOutputSink
+from .rest import RestOutputSink
+def _normalize_output_type(value: str) -> OutputType:
+    normalized = value.strip().lower()
+    if normalized not in {"rest", "file", "console"}:
+        raise ValueError("output type must be one of: rest, file, console")
+    return cast(OutputType, normalized)
+def _parse_int(value: Any, fallback: int) -> int:
+    if value is None:
+        return fallback
+    if isinstance(value, bool):
+        return fallback
+    if isinstance(value, int):
+        return value
+    try:
+        return int(str(value).strip())
+    except (TypeError, ValueError):
+        return fallback
+def _coalesce(*values: Any) -> Any:
+    for value in values:
+        if value is None:
+            continue
+        if isinstance(value, str) and not value.strip():
+            continue
+        return value
+    return None
+def resolve_output_settings(
+    args: argparse.Namespace,
+) -> OutputSettings:
+    env_type = os.environ.get("CLASSIFYRE_OUTPUT_TYPE")
+    env_batch_size = os.environ.get("CLASSIFYRE_OUTPUT_BATCH_SIZE")
+    env_rest_url = os.environ.get("CLASSIFYRE_OUTPUT_REST_URL")
+    env_rest_timeout = os.environ.get("CLASSIFYRE_OUTPUT_REST_TIMEOUT_SEC")
+    env_file_path = os.environ.get("CLASSIFYRE_OUTPUT_FILE_PATH")
+    env_api_url = os.environ.get("API_URL")
+    source_id_value = _coalesce(
+        getattr(args, "source_id", None),
+        os.environ.get("SOURCE_ID"),
+    )
+    runner_id_value = _coalesce(
+        getattr(args, "runner_id", None),
+        os.environ.get("RUNNER_ID"),
+    )
+    source_id = str(source_id_value) if source_id_value is not None else None
+    runner_id = str(runner_id_value) if runner_id_value is not None else None
+    default_output_type: OutputType = "rest" if source_id else "console"
+    output_type = _normalize_output_type(
+        str(
+            _coalesce(
+                getattr(args, "output_type", None),
+                env_type,
+                default_output_type,
+            )
+        )
+    )
+    batch_size = _parse_int(
+        _coalesce(
+            getattr(args, "output_batch_size", None),
+            env_batch_size,
+            20,
+        ),
+        fallback=20,
+    )
+    if batch_size < 1:
+        raise ValueError("output_batch_size must be >= 1")
+    managed_runner = bool(getattr(args, "managed_runner", False))
+    if managed_runner and output_type != "rest":
+        raise ValueError("--managed-runner can only be used with output type 'rest'")
+    rest_url_value = _coalesce(
+        getattr(args, "output_rest_url", None),
+        env_rest_url,
+        env_api_url,
+    )
+    rest_url = str(rest_url_value) if rest_url_value is not None else None
+    rest_timeout_sec = _parse_int(_coalesce(env_rest_timeout, 30), 30)
+    if rest_timeout_sec < 1:
+        rest_timeout_sec = 30
+    file_path_value = _coalesce(
+        getattr(args, "output_file_path", None),
+        env_file_path,
+    )
+    file_path = str(file_path_value) if file_path_value is not None else None
+    if output_type == "rest":
+        if not source_id:
+            raise ValueError("REST output requires source_id (--source-id or SOURCE_ID)")
+        if not rest_url:
+            rest_url = "http://localhost:8000"
+        if managed_runner and not runner_id:
+            raise ValueError("managed REST output requires runner_id")
+    elif output_type == "file" and not file_path:
+        raise ValueError(
+            "file output requires output_file_path (--output-file-path or CLASSIFYRE_OUTPUT_FILE_PATH)"
+        )
+    return OutputSettings(
+        output_type=output_type,
+        batch_size=batch_size,
+        source_id=source_id,
+        runner_id=runner_id,
+        managed_runner=managed_runner,
+        rest_url=rest_url,
+        rest_timeout_sec=rest_timeout_sec,
+        file_path=file_path,
+    )
+def create_output_sink(args: argparse.Namespace) -> OutputSink:
+    settings = resolve_output_settings(args)
+    context = OutputRuntimeContext(
+        source_id=settings.source_id,
+        runner_id=settings.runner_id,
+        managed_runner=settings.managed_runner,
+        batch_size=settings.batch_size,
+    )
+    if settings.output_type == "rest":
+        if not settings.rest_url:
+            raise ValueError("rest_url must be provided for REST output")
+        return RestOutputSink(
+            context,
+            base_url=settings.rest_url,
+            timeout_sec=settings.rest_timeout_sec,
+        )
+    if settings.output_type == "file":
+        if not settings.file_path:
+            raise ValueError("file_path must be provided for file output")
+        return FileOutputSink(context, file_path=settings.file_path)
+    return ConsoleOutputSink(context)

src/outputs/file.py ADDED Viewed

@@ -0,0 +1,83 @@
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Any, TextIO
+from .base import (
+    BatchEnvelope,
+    ErrorEnvelope,
+    FinishEnvelope,
+    OutputRuntimeContext,
+    OutputType,
+)
+class FileOutputSink:
+    output_type: OutputType = "file"
+    def __init__(self, context: OutputRuntimeContext, file_path: str):
+        self.context = context
+        self.batch_size = context.batch_size
+        self.file_path = Path(file_path)
+        self._batch_count = 0
+        self._total_assets = 0
+        self._handle: TextIO | None = None
+    async def start(self) -> None:
+        self.file_path.parent.mkdir(parents=True, exist_ok=True)
+        self._handle = self.file_path.open("a", encoding="utf-8")
+    async def emit_batch(
+        self, assets: list[dict[str, Any]], *, skip_findings: bool = False
+    ) -> None:
+        if not assets:
+            return
+        handle = self._require_handle()
+        self._batch_count += 1
+        self._total_assets += len(assets)
+        payload = BatchEnvelope(
+            output_type=self.output_type,
+            source_id=self.context.source_id,
+            runner_id=self.context.runner_id,
+            batch_index=self._batch_count,
+            asset_count=len(assets),
+            assets=assets,
+        )
+        handle.write(json.dumps(payload.model_dump(mode="json")))
+        handle.write("\n")
+        handle.flush()
+    async def finish(self) -> None:
+        handle = self._require_handle()
+        payload = FinishEnvelope(
+            output_type=self.output_type,
+            source_id=self.context.source_id,
+            runner_id=self.context.runner_id,
+            batch_count=self._batch_count,
+            total_assets=self._total_assets,
+        )
+        handle.write(json.dumps(payload.model_dump(mode="json")))
+        handle.write("\n")
+        handle.flush()
+        handle.close()
+        self._handle = None
+    async def fail(self, error: Exception) -> None:
+        handle = self._require_handle()
+        payload = ErrorEnvelope(
+            output_type=self.output_type,
+            source_id=self.context.source_id,
+            runner_id=self.context.runner_id,
+            error=str(error),
+        )
+        handle.write(json.dumps(payload.model_dump(mode="json")))
+        handle.write("\n")
+        handle.flush()
+        handle.close()
+        self._handle = None
+    def _require_handle(self) -> TextIO:
+        if self._handle is None:
+            raise RuntimeError("File output sink was not started before attempting to emit.")
+        return self._handle