PyPI - classifyre-cli - Versions diffs - 0.4.2__py3-none-any.whl - Mend

classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (101) hide show

classifyre_cli-0.4.2.dist-info/METADATA +167 -0
classifyre_cli-0.4.2.dist-info/RECORD +101 -0
classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
src/__init__.py +1 -0
src/detectors/__init__.py +105 -0
src/detectors/base.py +97 -0
src/detectors/broken_links/__init__.py +3 -0
src/detectors/broken_links/detector.py +280 -0
src/detectors/config.py +59 -0
src/detectors/content/__init__.py +0 -0
src/detectors/custom/__init__.py +13 -0
src/detectors/custom/detector.py +45 -0
src/detectors/custom/runners/__init__.py +56 -0
src/detectors/custom/runners/_base.py +177 -0
src/detectors/custom/runners/_factory.py +51 -0
src/detectors/custom/runners/_feature_extraction.py +138 -0
src/detectors/custom/runners/_gliner2.py +324 -0
src/detectors/custom/runners/_image_classification.py +98 -0
src/detectors/custom/runners/_llm.py +22 -0
src/detectors/custom/runners/_object_detection.py +107 -0
src/detectors/custom/runners/_regex.py +147 -0
src/detectors/custom/runners/_text_classification.py +109 -0
src/detectors/custom/trainer.py +293 -0
src/detectors/dependencies.py +109 -0
src/detectors/pii/__init__.py +0 -0
src/detectors/pii/detector.py +883 -0
src/detectors/secrets/__init__.py +0 -0
src/detectors/secrets/detector.py +399 -0
src/detectors/threat/__init__.py +0 -0
src/detectors/threat/code_security_detector.py +206 -0
src/detectors/threat/yara_detector.py +177 -0
src/main.py +608 -0
src/models/generated_detectors.py +1296 -0
src/models/generated_input.py +2732 -0
src/models/generated_single_asset_scan_results.py +240 -0
src/outputs/__init__.py +3 -0
src/outputs/base.py +69 -0
src/outputs/console.py +62 -0
src/outputs/factory.py +156 -0
src/outputs/file.py +83 -0
src/outputs/rest.py +258 -0
src/pipeline/__init__.py +7 -0
src/pipeline/content_provider.py +26 -0
src/pipeline/detector_pipeline.py +742 -0
src/pipeline/parsed_content_provider.py +59 -0
src/sandbox/__init__.py +5 -0
src/sandbox/runner.py +145 -0
src/sources/__init__.py +95 -0
src/sources/atlassian_common.py +389 -0
src/sources/azure_blob_storage/__init__.py +3 -0
src/sources/azure_blob_storage/source.py +130 -0
src/sources/base.py +296 -0
src/sources/confluence/__init__.py +3 -0
src/sources/confluence/source.py +733 -0
src/sources/databricks/__init__.py +3 -0
src/sources/databricks/source.py +1279 -0
src/sources/dependencies.py +81 -0
src/sources/google_cloud_storage/__init__.py +3 -0
src/sources/google_cloud_storage/source.py +114 -0
src/sources/hive/__init__.py +3 -0
src/sources/hive/source.py +709 -0
src/sources/jira/__init__.py +3 -0
src/sources/jira/source.py +605 -0
src/sources/mongodb/__init__.py +3 -0
src/sources/mongodb/source.py +550 -0
src/sources/mssql/__init__.py +3 -0
src/sources/mssql/source.py +1034 -0
src/sources/mysql/__init__.py +3 -0
src/sources/mysql/source.py +797 -0
src/sources/neo4j/__init__.py +0 -0
src/sources/neo4j/source.py +523 -0
src/sources/object_storage/base.py +679 -0
src/sources/oracle/__init__.py +3 -0
src/sources/oracle/source.py +982 -0
src/sources/postgresql/__init__.py +3 -0
src/sources/postgresql/source.py +774 -0
src/sources/powerbi/__init__.py +3 -0
src/sources/powerbi/source.py +774 -0
src/sources/recipe_normalizer.py +179 -0
src/sources/s3_compatible_storage/README.md +66 -0
src/sources/s3_compatible_storage/__init__.py +3 -0
src/sources/s3_compatible_storage/source.py +150 -0
src/sources/servicedesk/__init__.py +3 -0
src/sources/servicedesk/source.py +620 -0
src/sources/slack/__init__.py +3 -0
src/sources/slack/source.py +534 -0
src/sources/snowflake/__init__.py +3 -0
src/sources/snowflake/source.py +912 -0
src/sources/tableau/__init__.py +3 -0
src/sources/tableau/source.py +799 -0
src/sources/tabular_utils.py +165 -0
src/sources/wordpress/__init__.py +3 -0
src/sources/wordpress/source.py +590 -0
src/telemetry.py +96 -0
src/utils/__init__.py +1 -0
src/utils/content_extraction.py +108 -0
src/utils/file_parser.py +777 -0
src/utils/hashing.py +82 -0
src/utils/uv_sync.py +79 -0
src/utils/validation.py +56 -0

src/detectors/secrets/__init__.py ADDED Viewed

File without changes

src/detectors/secrets/detector.py ADDED Viewed

@@ -0,0 +1,399 @@
+"""Secrets detector powered by the detect-secrets library.
+Operates entirely in-memory: splits text into lines and invokes each enabled
+plugin's ``analyze_line`` directly.  No temp files, no global Settings state,
+and no ``SecretsCollection`` needed.
+"""
+import importlib
+import logging
+import pkgutil
+from typing import Any
+from ...models.generated_detectors import DetectorConfig, SecretsDetectorConfig, Severity
+from ...models.generated_single_asset_scan_results import DetectionResult, DetectorType, Location
+from ..base import BaseDetector
+from ..dependencies import MissingDependencyError, require_module
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Lazy plugin discovery
+# ---------------------------------------------------------------------------
+# detect-secrets is an optional dependency (security group).  We must NOT
+# touch the package at module-import time because the CLI auto-installs it
+# lazily when the detector is instantiated.  _discover_plugins() is therefore
+# deferred until the first call to _get_plugin_specs(), which _build_plugins()
+# makes before it imports any plugin.
+# ---------------------------------------------------------------------------
+# Mutable container avoids the need for the ``global`` keyword.
+_plugin_cache: dict[str, Any] = {"_loaded": False}
+def _discover_plugins() -> dict[str, tuple[str, str]]:
+    """Return {pattern_key: (module_path, class_name)} by scanning detect_secrets.plugins."""
+    import detect_secrets.plugins
+    # Build {class_name -> module_path} from the installed package
+    class_to_mod: dict[str, str] = {}
+    for _, mod_name, is_pkg in pkgutil.iter_modules(detect_secrets.plugins.__path__):
+        if is_pkg or mod_name == "base":
+            continue
+        full_mod = f"detect_secrets.plugins.{mod_name}"
+        try:
+            mod = importlib.import_module(full_mod)
+            for name in dir(mod):
+                obj = getattr(mod, name)
+                if isinstance(obj, type) and obj.__module__ == full_mod:
+                    class_to_mod[name] = full_mod
+        except Exception:
+            continue
+    _pattern_to_class: dict[str, str] = {
+        "artifactory": "ArtifactoryDetector",
+        "aws": "AWSKeyDetector",
+        "azure_storage": "AzureStorageKeyDetector",
+        "basic_auth": "BasicAuthDetector",
+        "cloudant": "CloudantDetector",
+        "discord": "DiscordBotTokenDetector",
+        "github": "GitHubTokenDetector",
+        "gitlab": "GitLabTokenDetector",
+        "high_entropy_base64": "Base64HighEntropyString",
+        "high_entropy_hex": "HexHighEntropyString",
+        "ibm_cloud_iam": "IbmCloudIamDetector",
+        "ibm_cos_hmac": "IbmCosHmacDetector",
+        "ip_public": "IPPublicDetector",
+        "jwt": "JwtTokenDetector",
+        "keyword": "KeywordDetector",
+        "mailchimp": "MailchimpDetector",
+        "npm": "NpmDetector",
+        "openai": "OpenAIDetector",
+        "private_key": "PrivateKeyDetector",
+        "pypi": "PypiTokenDetector",
+        "sendgrid": "SendGridDetector",
+        "slack": "SlackDetector",
+        "softlayer": "SoftlayerDetector",
+        "square_oauth": "SquareOAuthDetector",
+        "stripe": "StripeDetector",
+        "telegram": "TelegramBotTokenDetector",
+        "twilio": "TwilioKeyDetector",
+    }
+    specs: dict[str, tuple[str, str]] = {}
+    for key, cls_name in _pattern_to_class.items():
+        mod = class_to_mod.get(cls_name)
+        if mod:
+            specs[key] = (mod, cls_name)
+        else:
+            logger.warning(
+                "Plugin class '%s' not found in installed detect-secrets; "
+                "pattern '%s' will be skipped",
+                cls_name,
+                key,
+            )
+    return specs
+def _get_plugin_specs() -> dict[str, tuple[str, str]]:
+    """Lazy accessor for plugin specs (populated on first call)."""
+    if not _plugin_cache["_loaded"]:
+        _plugin_cache["specs"] = _discover_plugins()
+        _plugin_cache["defaults"] = list(_plugin_cache["specs"].keys())
+        _plugin_cache["_loaded"] = True
+    return _plugin_cache["specs"]
+# Severity classification by keywords in detect-secrets finding type (lowercased).
+_SEVERITY_RULES: list[tuple[Severity, list[str]]] = [
+    (
+        Severity.critical,
+        [
+            "aws",
+            "private key",
+            "github",
+            "gitlab",
+            "slack",
+            "stripe",
+            "azure storage",
+            "google oauth",
+            "openai",
+        ],
+    ),
+    (
+        Severity.high,
+        [
+            "artifactory",
+            "basic auth",
+            "cloudant",
+            "discord",
+            "ibm",
+            "json web token",
+            "mailchimp",
+            "npm",
+            "pypi",
+            "sendgrid",
+            "softlayer",
+            "square",
+            "telegram",
+            "twilio",
+        ],
+    ),
+    (Severity.medium, ["entropy", "keyword", "ip public"]),
+]
+_SEVERITY_RANK: dict[Severity, int] = {
+    Severity.info: 0,
+    Severity.low: 1,
+    Severity.medium: 2,
+    Severity.high: 3,
+    Severity.critical: 4,
+}
+# Confidence by keywords in detect-secrets finding type (lowercased).
+_CONFIDENCE_RULES: list[tuple[float, list[str]]] = [
+    (
+        0.95,
+        [
+            "aws",
+            "github",
+            "gitlab",
+            "private key",
+            "slack",
+            "stripe",
+            "azure storage",
+            "openai",
+            "pypi",
+        ],
+    ),
+    (
+        0.85,
+        [
+            "artifactory",
+            "basic auth",
+            "cloudant",
+            "discord",
+            "ibm",
+            "mailchimp",
+            "npm",
+            "sendgrid",
+            "softlayer",
+            "square",
+            "telegram",
+            "twilio",
+        ],
+    ),
+    (0.80, ["json web token"]),
+    (0.75, ["entropy"]),
+    (0.70, ["keyword", "ip public"]),
+]
+class SecretsDetector(BaseDetector):
+    """Secrets detector backed by the detect-secrets library.
+    Each enabled plugin is imported and instantiated directly.  Text is scanned
+    line-by-line in memory via ``analyze_line`` -- no temp files, no global
+    Settings state, and no async locking required.
+    """
+    detector_type = "secrets"
+    detector_name = "secrets"
+    def __init__(self, config: DetectorConfig | None = None):
+        super().__init__(config)
+        self._cfg: SecretsDetectorConfig = (
+            config if isinstance(config, SecretsDetectorConfig) else SecretsDetectorConfig()
+        )
+        # Fail fast at construction time if detect-secrets is not installed.
+        try:
+            require_module("detect_secrets", "secrets", ["security", "detectors"])
+        except MissingDependencyError:
+            raise
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+    def _enabled_pattern_names(self) -> list[str]:
+        """Return the list of pattern string keys to activate."""
+        specs = _get_plugin_specs()
+        defaults = list(specs.keys())
+        raw = self._cfg.enabled_patterns
+        if raw is None:
+            return defaults
+        # Unwrap Pydantic RootModel
+        items = raw.root if hasattr(raw, "root") else raw
+        if not items:
+            return defaults
+        names: list[str] = []
+        for item in items:
+            # item may be a str or a SecretsEnabledPattern enum member
+            name = item.value if hasattr(item, "value") else str(item)
+            if name in specs:
+                names.append(name)
+            else:
+                logger.warning("Unknown secrets pattern '%s' ignored", name)
+        return names
+    def _build_plugins(self) -> list[Any]:
+        """Import and instantiate each enabled detect-secrets plugin."""
+        specs = _get_plugin_specs()
+        names = self._enabled_pattern_names()
+        plugins: list[Any] = []
+        for name in names:
+            mod_path, cls_name = specs[name]
+            try:
+                mod = importlib.import_module(mod_path)
+                cls = getattr(mod, cls_name)
+            except Exception as exc:
+                logger.warning("Failed to import plugin '%s' from %s: %s", cls_name, mod_path, exc)
+                continue
+            kwargs: dict[str, Any] = {}
+            if name == "high_entropy_base64":
+                limit = self._cfg.entropy_limit_base64
+                if limit is not None:
+                    kwargs["limit"] = float(limit.root if hasattr(limit, "root") else limit)
+            elif name == "high_entropy_hex":
+                limit = self._cfg.entropy_limit_hex
+                if limit is not None:
+                    kwargs["limit"] = float(limit.root if hasattr(limit, "root") else limit)
+            try:
+                plugin = cls(**kwargs)
+                plugins.append(plugin)
+                logger.debug("Initialized secrets plugin: %s", cls_name)
+            except Exception as exc:
+                logger.warning("Failed to instantiate plugin '%s': %s", cls_name, exc)
+        return plugins
+    @classmethod
+    def _get_severity(cls, secret_type: str) -> Severity:
+        t = secret_type.lower()
+        for severity, keywords in _SEVERITY_RULES:
+            if any(kw in t for kw in keywords):
+                return severity
+        return Severity.high
+    @classmethod
+    def _get_confidence(cls, secret_type: str) -> float:
+        t = secret_type.lower()
+        for confidence, keywords in _CONFIDENCE_RULES:
+            if any(kw in t for kw in keywords):
+                return confidence
+        return 0.85
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    async def detect(
+        self, content: str | bytes, content_type: str = "text/plain"
+    ) -> list[DetectionResult]:
+        if isinstance(content, bytes):
+            try:
+                content = content.decode("utf-8", errors="replace")
+            except Exception:
+                logger.warning(
+                    "Secrets detector received non-decodable binary content (%d bytes) and cannot scan it",
+                    len(content),
+                )
+                return []
+        plugins = self._build_plugins()
+        if not plugins:
+            return []
+        lines = content.splitlines()
+        confidence_threshold: float = self._cfg.confidence_threshold or 0.7
+        severity_threshold = self._cfg.severity_threshold
+        min_severity_rank = _SEVERITY_RANK.get(severity_threshold, 0) if severity_threshold else 0
+        results: list[DetectionResult] = []
+        for line_number, line_text in enumerate(lines, start=1):
+            for plugin in plugins:
+                try:
+                    secrets = plugin.analyze_line(
+                        filename="<inline>",
+                        line=line_text,
+                        line_number=line_number,
+                    )
+                except Exception as exc:
+                    logger.debug(
+                        "Plugin %s failed on line %d: %s",
+                        plugin.__class__.__name__,
+                        line_number,
+                        exc,
+                    )
+                    continue
+                for secret in secrets:
+                    try:
+                        secret_type = str(secret.type) if secret.type else ""
+                        secret_value = (
+                            str(secret.secret_value) if secret.secret_value is not None else ""
+                        )
+                        is_verified = bool(secret.is_verified)
+                    except Exception:
+                        continue
+                    if not secret_type:
+                        continue
+                    confidence = self._get_confidence(secret_type)
+                    if confidence < confidence_threshold:
+                        continue
+                    severity = self._get_severity(secret_type)
+                    if _SEVERITY_RANK.get(severity, 0) < min_severity_rank:
+                        continue
+                    if not secret_value:
+                        continue
+                    col_offset = line_text.find(secret_value) if secret_value in line_text else 0
+                    start = col_offset
+                    end = start + len(secret_value)
+                    results.append(
+                        DetectionResult(
+                            detector_type=DetectorType.SECRETS,
+                            finding_type=secret_type,
+                            category="SECRETS",
+                            severity=severity,
+                            confidence=confidence,
+                            matched_content=secret_value,
+                            location=Location(
+                                start=start,
+                                end=end,
+                                line=line_number,
+                                path=f"line {line_number}",
+                            ),
+                            metadata={
+                                "detector": "secrets",
+                                "plugin": secret_type,
+                                "is_verified": is_verified,
+                            },
+                        )
+                    )
+        if self._cfg.max_findings and len(results) > self._cfg.max_findings:
+            results = results[: self._cfg.max_findings]
+        return results
+    def get_supported_content_types(self) -> list[str]:
+        return [
+            "text/plain",
+            "application/json",
+            "application/yaml",
+            "application/x-yaml",
+            "text/yaml",
+            "application/xml",
+            "text/xml",
+        ]

src/detectors/threat/__init__.py ADDED Viewed

File without changes

src/detectors/threat/code_security_detector.py ADDED Viewed

@@ -0,0 +1,206 @@
+"""Code security detector using Bandit static analysis."""
+import json
+import logging
+import subprocess
+import sys
+import tempfile
+from importlib.util import find_spec
+from pathlib import Path
+from typing import Any
+from ...models.generated_detectors import (
+    CodeSecurityDetectorConfig,
+    DetectorConfig,
+    GenericDetectorConfig,
+    Severity,
+)
+from ...models.generated_single_asset_scan_results import (
+    DetectionResult,
+    DetectorType,
+)
+from ..base import BaseDetector
+from ..dependencies import MissingDependencyError, require_module
+logger = logging.getLogger(__name__)
+_SEVERITY_ORDER: dict[Severity, int] = {
+    Severity.info: 0,
+    Severity.low: 1,
+    Severity.medium: 2,
+    Severity.high: 3,
+    Severity.critical: 4,
+}
+class CodeSecurityDetector(BaseDetector):
+    """Detect insecure code patterns with Bandit (rule-based)."""
+    detector_type = "code_security"
+    detector_name = "code_security"
+    def __init__(self, config: DetectorConfig | None = None):
+        super().__init__(config)
+        self._cfg: CodeSecurityDetectorConfig | GenericDetectorConfig
+        if isinstance(config, CodeSecurityDetectorConfig):
+            self._cfg = config
+        elif isinstance(config, GenericDetectorConfig):
+            self._cfg = config
+        else:
+            self._cfg = CodeSecurityDetectorConfig()
+        # Importing `bandit` eagerly can trigger stevedore plugin discovery noise.
+        # We only verify Bandit availability here; execution happens in a subprocess.
+        if find_spec("bandit") is None:
+            try:
+                require_module("bandit", "code_security", ["security", "detectors"])
+            except MissingDependencyError:
+                raise
+    @staticmethod
+    def _severity_from_bandit(level: str) -> Severity:
+        normalized = level.upper()
+        if normalized == "HIGH":
+            return Severity.high
+        if normalized == "MEDIUM":
+            return Severity.medium
+        if normalized == "LOW":
+            return Severity.low
+        return Severity.info
+    @staticmethod
+    def _confidence_from_bandit(level: str) -> float:
+        normalized = level.upper()
+        if normalized == "HIGH":
+            return 0.95
+        if normalized == "MEDIUM":
+            return 0.8
+        if normalized == "LOW":
+            return 0.6
+        return 0.5
+    def _run_bandit_json(
+        self,
+        content: str,
+        skips: list[str] | None = None,
+        tests: list[str] | None = None,
+    ) -> tuple[list[dict[str, Any]], list[str]]:
+        with tempfile.NamedTemporaryFile(
+            mode="w",
+            suffix=".py",
+            encoding="utf-8",
+            delete=False,
+        ) as handle:
+            handle.write(content)
+            tmp_path = Path(handle.name)
+        try:
+            cmd = [sys.executable, "-m", "bandit", "-q", "-f", "json"]
+            if tests:
+                cmd += ["--test", ",".join(tests)]
+            if skips:
+                cmd += ["--skip", ",".join(skips)]
+            cmd.append(str(tmp_path))
+            proc = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                check=False,
+            )
+            if proc.returncode not in (0, 1):
+                stderr = proc.stderr.strip() or "Unknown Bandit execution error"
+                logger.error(f"Bandit execution failed: {stderr}")
+                return [], [stderr]
+            stdout = proc.stdout.strip() or "{}"
+            payload = json.loads(stdout)
+            if not isinstance(payload, dict):
+                return [], []
+            results = payload.get("results", [])
+            errors = payload.get("errors", [])
+            return (
+                [item for item in results if isinstance(item, dict)],
+                [str(item) for item in errors],
+            )
+        except Exception as exc:
+            logger.error(f"Code security scan failed: {exc}")
+            return [], [str(exc)]
+        finally:
+            tmp_path.unlink(missing_ok=True)
+    async def detect(
+        self, content: str | bytes, content_type: str = "text/plain"
+    ) -> list[DetectionResult]:
+        if isinstance(content, bytes):
+            return []
+        if not content.strip():
+            return []
+        threshold = self._cfg.confidence_threshold or 0.7
+        max_findings = self._cfg.max_findings or 25
+        findings: list[DetectionResult] = []
+        skips: list[str] | None = None
+        tests: list[str] | None = None
+        severity_threshold: Severity | None = None
+        if isinstance(self._cfg, CodeSecurityDetectorConfig):
+            skips = self._cfg.skips
+            tests = self._cfg.tests
+            severity_threshold = self._cfg.severity_threshold
+        issues, errors = self._run_bandit_json(content, skips=skips, tests=tests)
+        if not issues:
+            if errors:
+                logger.debug(f"Bandit returned no issues with errors: {errors}")
+            return []
+        min_severity_rank = _SEVERITY_ORDER.get(severity_threshold, 0) if severity_threshold else 0
+        for issue in issues:
+            confidence = self._confidence_from_bandit(str(issue.get("issue_confidence", "")))
+            if confidence < threshold:
+                continue
+            severity = self._severity_from_bandit(str(issue.get("issue_severity", "")))
+            if _SEVERITY_ORDER.get(severity, 0) < min_severity_rank:
+                continue
+            issue_text = str(issue.get("issue_text", "Potential insecure code pattern"))
+            code_snippet = str(issue.get("code", "")).strip()
+            finding_type = str(issue.get("test_id", issue.get("test_name", "code_security")))
+            findings.append(
+                DetectionResult(
+                    detector_type=DetectorType.CODE_SECURITY,
+                    finding_type=finding_type,
+                    category="SECURITY",
+                    severity=severity,
+                    confidence=confidence,
+                    matched_content=code_snippet or issue_text,
+                    location=None,
+                    metadata={
+                        "tool": "bandit",
+                        "issue_text": issue_text,
+                        "test_name": issue.get("test_name"),
+                        "test_id": issue.get("test_id"),
+                        "issue_severity": issue.get("issue_severity"),
+                        "issue_confidence": issue.get("issue_confidence"),
+                    },
+                )
+            )
+            if len(findings) >= max_findings:
+                break
+        return findings
+    def get_supported_content_types(self) -> list[str]:
+        return [
+            "text/plain",
+            "text/html",
+            "text/markdown",
+            "application/json",
+            "application/octet-stream",
+        ]