PyPI - classifyre-cli - Versions diffs - 0.4.2__py3-none-any.whl - Mend

classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (101) hide show

classifyre_cli-0.4.2.dist-info/METADATA +167 -0
classifyre_cli-0.4.2.dist-info/RECORD +101 -0
classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
src/__init__.py +1 -0
src/detectors/__init__.py +105 -0
src/detectors/base.py +97 -0
src/detectors/broken_links/__init__.py +3 -0
src/detectors/broken_links/detector.py +280 -0
src/detectors/config.py +59 -0
src/detectors/content/__init__.py +0 -0
src/detectors/custom/__init__.py +13 -0
src/detectors/custom/detector.py +45 -0
src/detectors/custom/runners/__init__.py +56 -0
src/detectors/custom/runners/_base.py +177 -0
src/detectors/custom/runners/_factory.py +51 -0
src/detectors/custom/runners/_feature_extraction.py +138 -0
src/detectors/custom/runners/_gliner2.py +324 -0
src/detectors/custom/runners/_image_classification.py +98 -0
src/detectors/custom/runners/_llm.py +22 -0
src/detectors/custom/runners/_object_detection.py +107 -0
src/detectors/custom/runners/_regex.py +147 -0
src/detectors/custom/runners/_text_classification.py +109 -0
src/detectors/custom/trainer.py +293 -0
src/detectors/dependencies.py +109 -0
src/detectors/pii/__init__.py +0 -0
src/detectors/pii/detector.py +883 -0
src/detectors/secrets/__init__.py +0 -0
src/detectors/secrets/detector.py +399 -0
src/detectors/threat/__init__.py +0 -0
src/detectors/threat/code_security_detector.py +206 -0
src/detectors/threat/yara_detector.py +177 -0
src/main.py +608 -0
src/models/generated_detectors.py +1296 -0
src/models/generated_input.py +2732 -0
src/models/generated_single_asset_scan_results.py +240 -0
src/outputs/__init__.py +3 -0
src/outputs/base.py +69 -0
src/outputs/console.py +62 -0
src/outputs/factory.py +156 -0
src/outputs/file.py +83 -0
src/outputs/rest.py +258 -0
src/pipeline/__init__.py +7 -0
src/pipeline/content_provider.py +26 -0
src/pipeline/detector_pipeline.py +742 -0
src/pipeline/parsed_content_provider.py +59 -0
src/sandbox/__init__.py +5 -0
src/sandbox/runner.py +145 -0
src/sources/__init__.py +95 -0
src/sources/atlassian_common.py +389 -0
src/sources/azure_blob_storage/__init__.py +3 -0
src/sources/azure_blob_storage/source.py +130 -0
src/sources/base.py +296 -0
src/sources/confluence/__init__.py +3 -0
src/sources/confluence/source.py +733 -0
src/sources/databricks/__init__.py +3 -0
src/sources/databricks/source.py +1279 -0
src/sources/dependencies.py +81 -0
src/sources/google_cloud_storage/__init__.py +3 -0
src/sources/google_cloud_storage/source.py +114 -0
src/sources/hive/__init__.py +3 -0
src/sources/hive/source.py +709 -0
src/sources/jira/__init__.py +3 -0
src/sources/jira/source.py +605 -0
src/sources/mongodb/__init__.py +3 -0
src/sources/mongodb/source.py +550 -0
src/sources/mssql/__init__.py +3 -0
src/sources/mssql/source.py +1034 -0
src/sources/mysql/__init__.py +3 -0
src/sources/mysql/source.py +797 -0
src/sources/neo4j/__init__.py +0 -0
src/sources/neo4j/source.py +523 -0
src/sources/object_storage/base.py +679 -0
src/sources/oracle/__init__.py +3 -0
src/sources/oracle/source.py +982 -0
src/sources/postgresql/__init__.py +3 -0
src/sources/postgresql/source.py +774 -0
src/sources/powerbi/__init__.py +3 -0
src/sources/powerbi/source.py +774 -0
src/sources/recipe_normalizer.py +179 -0
src/sources/s3_compatible_storage/README.md +66 -0
src/sources/s3_compatible_storage/__init__.py +3 -0
src/sources/s3_compatible_storage/source.py +150 -0
src/sources/servicedesk/__init__.py +3 -0
src/sources/servicedesk/source.py +620 -0
src/sources/slack/__init__.py +3 -0
src/sources/slack/source.py +534 -0
src/sources/snowflake/__init__.py +3 -0
src/sources/snowflake/source.py +912 -0
src/sources/tableau/__init__.py +3 -0
src/sources/tableau/source.py +799 -0
src/sources/tabular_utils.py +165 -0
src/sources/wordpress/__init__.py +3 -0
src/sources/wordpress/source.py +590 -0
src/telemetry.py +96 -0
src/utils/__init__.py +1 -0
src/utils/content_extraction.py +108 -0
src/utils/file_parser.py +777 -0
src/utils/hashing.py +82 -0
src/utils/uv_sync.py +79 -0
src/utils/validation.py +56 -0

src/sources/azure_blob_storage/source.py ADDED Viewed

@@ -0,0 +1,130 @@
+from __future__ import annotations
+import logging
+from collections.abc import Iterator
+from typing import Any
+from urllib.parse import quote
+from ...models.generated_input import AzureBlobStorageInput
+from ..dependencies import require_module
+from ..object_storage.base import ObjectRef, ObjectStorageSourceBase
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+class AzureBlobStorageSource(ObjectStorageSourceBase):
+    source_type = "azure_blob_storage"
+    provider_label = "AZURE_BLOB_STORAGE"
+    input_model = AzureBlobStorageInput
+    def _required_container(self) -> str:
+        container = str(self.config.required.container).strip()
+        if not container:
+            raise ValueError("required.container must be set")
+        return container
+    def _required_account_url(self) -> str:
+        account_url = str(self.config.required.account_url).strip()
+        if not account_url:
+            raise ValueError("required.account_url must be set")
+        return account_url.rstrip("/")
+    def _build_client(self) -> Any:
+        blob_module = require_module(
+            module_name="azure.storage.blob",
+            source_name="Azure Blob Storage",
+            uv_groups=["azure-blob-storage"],
+            detail="Azure Blob storage requires azure-storage-blob.",
+        )
+        blob_service_client_cls = blob_module.BlobServiceClient
+        connection_string = self._masked_value("azure_connection_string")
+        if connection_string:
+            return blob_service_client_cls.from_connection_string(connection_string)
+        account_url = self._required_account_url()
+        account_key = self._masked_value("azure_account_key")
+        sas_token = self._masked_value("azure_sas_token")
+        if account_key:
+            return blob_service_client_cls(account_url=account_url, credential=account_key)
+        if sas_token:
+            return blob_service_client_cls(account_url=account_url, credential=sas_token)
+        client_id = self._masked_value("azure_client_id")
+        client_secret = self._masked_value("azure_client_secret")
+        tenant_id = self._masked_value("azure_tenant_id")
+        identity_module = require_module(
+            module_name="azure.identity",
+            source_name="Azure Blob Storage",
+            uv_groups=["azure-blob-storage"],
+            detail="Managed identity and service principal auth require azure-identity.",
+        )
+        if client_id and client_secret and tenant_id:
+            credential = identity_module.ClientSecretCredential(
+                tenant_id=tenant_id,
+                client_id=client_id,
+                client_secret=client_secret,
+            )
+        else:
+            credential = identity_module.DefaultAzureCredential()
+        return blob_service_client_cls(account_url=account_url, credential=credential)
+    def _client(self) -> Any:
+        if self._cached_client is None:
+            self._cached_client = self._build_client()
+        return self._cached_client
+    def _list_objects(self) -> Iterator[ObjectRef]:
+        blob_service_client = self._client()
+        container_client = blob_service_client.get_container_client(self._required_container())
+        prefix = self._prefix()
+        max_keys = self._max_keys_per_page()
+        timeout = self._request_timeout_seconds()
+        list_blobs = container_client.list_blobs(name_starts_with=prefix, timeout=timeout)
+        for page in list_blobs.by_page(results_per_page=max_keys):
+            for item in page:
+                key = str(getattr(item, "name", "") or "")
+                if not key or key.endswith("/"):
+                    continue
+                size = int(getattr(item, "size", 0) or 0)
+                if size == 0 and not self._include_empty_objects():
+                    continue
+                if not self._object_matches_extension_filters(key):
+                    continue
+                content_settings = getattr(item, "content_settings", None)
+                content_type_hint = getattr(content_settings, "content_type", None)
+                yield ObjectRef(
+                    key=key,
+                    size=size,
+                    last_modified=self._parse_datetime(getattr(item, "last_modified", None)),
+                    etag=str(getattr(item, "etag", "") or "") or None,
+                    content_type_hint=str(content_type_hint) if content_type_hint else None,
+                )
+    def _download_object(self, ref: ObjectRef) -> tuple[bytes, str | None, bool]:
+        blob_service_client = self._client()
+        container_client = blob_service_client.get_container_client(self._required_container())
+        blob_client = container_client.get_blob_client(ref.key)
+        max_bytes = self._max_object_bytes()
+        timeout = self._request_timeout_seconds()
+        length = max_bytes if ref.size > max_bytes else None
+        downloader = blob_client.download_blob(offset=0, length=length, timeout=timeout)
+        file_bytes = downloader.readall()
+        return file_bytes, ref.content_type_hint, ref.size > max_bytes
+    def _external_url(self, key: str) -> str:
+        account_url = self._required_account_url().rstrip("/")
+        container = self._required_container()
+        encoded_container = quote(container, safe="")
+        encoded_key = quote(key, safe="/")
+        return f"{account_url}/{encoded_container}/{encoded_key}"

src/sources/base.py ADDED Viewed

@@ -0,0 +1,296 @@
+import os
+from abc import ABC, abstractmethod
+from collections.abc import AsyncGenerator, Generator
+from typing import TYPE_CHECKING, Any
+from ..models.generated_single_asset_scan_results import DetectionResult, SingleAssetScanResults
+if TYPE_CHECKING:
+    from ..utils.file_parser import ParsedBytes
+from ..utils.hashing import calculate_checksum, normalize_http_url
+from ..utils.validation import validate_output
+from .recipe_normalizer import normalize_source_recipe
+class BaseSource(ABC):
+    """
+    Abstract base class for all metadata extraction sources.
+    """
+    # Default batch size for streaming asset results
+    BATCH_SIZE: int = 50
+    HAS_SUCCESSFUL_RUN_ENV = "CLASSIFYRE_SOURCE_HAS_SUCCESSFUL_RUN"
+    def __init__(
+        self,
+        recipe: dict[str, Any],
+        source_id: str | None = None,
+        runner_id: str | None = None,
+    ):
+        """
+        Initialize the source with a validated recipe.
+        Args:
+            recipe: The source configuration recipe
+            source_id: Optional source ID (for API runs)
+            runner_id: Optional runner ID (for API runs)
+        """
+        normalized_recipe = normalize_source_recipe(recipe, recipe.get("type"))
+        self._apply_initial_sampling_override(normalized_recipe)
+        recipe.clear()
+        recipe.update(normalized_recipe)
+        self.recipe = normalized_recipe
+        self.source_id = source_id
+        self.runner_id = runner_id
+        self._aborted = False
+        self._discovery_only = False
+        self._attachment_name_by_hash: dict[str, str] = {}
+    def _apply_initial_sampling_override(self, recipe: dict[str, Any]) -> None:
+        pass
+    @staticmethod
+    def _read_bool_env(name: str) -> bool | None:
+        raw = os.environ.get(name)
+        if raw is None:
+            return None
+        normalized = raw.strip().lower()
+        if normalized in {"1", "true", "yes", "y", "on"}:
+            return True
+        if normalized in {"0", "false", "no", "n", "off"}:
+            return False
+        return None
+    def set_discovery_only(self, value: bool) -> None:
+        self._discovery_only = value
+    def evict_asset_cache(self, asset_hash: str) -> None:
+        """Free cached content for a processed asset. Override in subclasses."""
+        pass
+    @abstractmethod
+    def test_connection(self) -> dict[str, Any]:
+        """
+        Verify that the connection to the source is working.
+        Should return a dictionary conforming to the test-connection schema.
+        """
+        pass
+    STREAM_DETECTIONS: bool = False
+    async def extract(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
+        """
+        Orchestrates extraction + detection.  Calls ``extract_raw()`` for batches,
+        then runs the detector pipeline (if configured) before yielding results.
+        Sources should override ``extract_raw()`` instead of this method.
+        """
+        pipeline = self._build_pipeline()
+        async for batch in self.extract_raw():
+            if pipeline:
+                if self.STREAM_DETECTIONS:
+                    async for processed in pipeline.process_stream(batch):
+                        yield [processed]
+                    continue
+                batch = await pipeline.process(batch)  # noqa: PLW2901
+            if batch:
+                yield batch
+    @abstractmethod
+    async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
+        """
+        The main extraction logic.  Yields batches of raw assets **without**
+        running detectors.  The base ``extract()`` wraps this with pipeline
+        processing automatically.
+        Yields:
+            Batches of SingleAssetScanResults objects
+        """
+        yield []
+    def _build_pipeline(self) -> Any:
+        config = getattr(self, "config", None)
+        detectors = getattr(config, "detectors", None) if config else None
+        if not detectors or not any(getattr(d, "enabled", False) for d in detectors):
+            return None
+        from ..pipeline.detector_pipeline import DetectorPipeline
+        return DetectorPipeline.from_recipe(self.recipe, self, self.runner_id)
+    @abstractmethod
+    def generate_hash_id(self, asset_id: str) -> str:
+        """
+        Generate a unique stable ID for an asset.
+        """
+        pass
+    def calculate_checksum(self, data: dict[str, Any]) -> str:
+        """
+        Calculate a stable SHA-256 checksum for a dictionary.
+        """
+        return calculate_checksum(data)
+    @abstractmethod
+    def abort(self) -> None:
+        """
+        Signal the source to stop extraction as soon as possible.
+        """
+        self._aborted = True
+    def cleanup(self) -> None:
+        """
+        Optional: Clean up resources (close sessions, delete temp files).
+        """
+        # Default implementation does nothing.
+    def get_stats(self) -> dict[str, Any]:
+        """
+        Optional: Return statistics about the current extraction (total items, success/fail counts).
+        """
+        return {}
+    def discover(self) -> dict[str, Any]:
+        """
+        Optional: Discover available resources (e.g., list all spaces/projects)
+        without performing a full extraction.
+        """
+        return {}
+    def validate_output(self, data: dict[str, Any]) -> None:
+        """
+        Optional: Use the validation utility to ensure output conforms to schema.
+        Can be called during extraction to fail early on bad data.
+        """
+        source_type = self.recipe.get("type", "").lower()
+        validate_output(data, source_type)
+    def ensure_location(self, external_url: str, *, fallback: str | None = None) -> str:
+        """
+        Ensure the asset has a non-empty external URL.
+        """
+        location = (external_url or "").strip()
+        if location:
+            return location
+        if fallback:
+            fallback_value = fallback.strip()
+            if fallback_value:
+                return fallback_value
+        raise ValueError("Asset external_url is required")
+    def _attachment_file_name(self, asset_id: str, fallback_url: str) -> str:
+        """Return the stored file name for an attachment, or fallback_url if not recorded."""
+        stored = self._attachment_name_by_hash.get(asset_id)
+        if isinstance(stored, str) and stored.strip():
+            return stored.strip()
+        return fallback_url
+    def ocr_enabled(self) -> bool:
+        """Return whether sampling-level OCR is enabled for this source."""
+        config = getattr(self, "config", None)
+        sampling = getattr(config, "sampling", None) if config is not None else None
+        return bool(getattr(sampling, "enable_ocr", False))
+    def parse_asset_bytes(
+        self,
+        file_bytes: bytes,
+        *,
+        declared_mime_type: str | None = None,
+        file_name: str = "",
+    ) -> "ParsedBytes":
+        from ..utils.file_parser import parse_bytes
+        return parse_bytes(
+            file_bytes,
+            declared_mime_type=declared_mime_type,
+            file_name=file_name,
+            enable_ocr=self.ocr_enabled(),
+        )
+    def iter_asset_pages(
+        self,
+        file_bytes: bytes,
+        mime_type: str,
+        batch_size: int = 100,
+        include_column_names: bool = True,
+        *,
+        file_name: str = "",
+    ) -> Generator[str, None, None]:
+        from ..utils.file_parser import iter_file_pages
+        return iter_file_pages(
+            file_bytes,
+            mime_type,
+            batch_size,
+            include_column_names,
+            file_name=file_name,
+            enable_ocr=self.ocr_enabled(),
+        )
+    async def fetch_content_bytes(self, asset_id: str) -> tuple[bytes, str] | None:
+        """
+        Fetch raw bytes and MIME type for an asset (for binary/image detectors).
+        Returns (raw_bytes, mime_type) or None if binary content is not available.
+        Sources that store raw file bytes should override this method.
+        """
+        return None
+    async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
+        """
+        Async generator yielding (raw_content, text_content) pages for an asset.
+        Default: yields a single result from fetch_content.
+        Tabular sources override this to stream pages for ALL strategy.
+        """
+        result = await self.fetch_content(asset_id)
+        if result:
+            yield result
+    async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
+        """
+        Fetch full content for an asset (for detector scanning).
+        This method should be implemented by sources that support content fetching.
+        It retrieves the full content of an asset given its identifier.
+        Args:
+            asset_id: Asset identifier (page_id, post_id, document_id, etc.)
+        Returns:
+            Tuple of (raw_content, text_content) where:
+            - raw_content: Original HTML/markup content
+            - text_content: Plain text extracted from content
+            Returns None if content fetching is not supported or fails.
+        Note:
+            Default implementation returns None. Sources that support detector
+            integration should override this method.
+        """
+        return None
+    def enrich_finding_location(
+        self,
+        finding: DetectionResult,
+        asset: SingleAssetScanResults,
+        text_content: str,
+    ) -> None:
+        """
+        Set a human-readable path on finding.location so users can find the source.
+        Override per source type:
+        - Tabular (PostgreSQL, MySQL): "schema.table, row N"
+        - Web (WordPress): the page URL
+        - Slack: permalink or "channel / message_ts"
+        """
+        pass
+    def resolve_link_for_detection(self, link: str) -> str | None:
+        """
+        Resolve a stored asset link into a concrete HTTP(S) URL for link-based detectors.
+        Sources that store non-URL link identifiers (for example, hashed IDs) can override
+        this and map those identifiers back to their original URLs.
+        """
+        return normalize_http_url(link)

src/sources/confluence/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+# Re-export ConfluenceSource so it can be imported from this package directly.
+from .source import ConfluenceSource
+__all__ = ["ConfluenceSource"]