PyPI - classifyre-cli - Versions diffs - 0.4.2__py3-none-any.whl - Mend

classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (101) hide show

classifyre_cli-0.4.2.dist-info/METADATA +167 -0
classifyre_cli-0.4.2.dist-info/RECORD +101 -0
classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
src/__init__.py +1 -0
src/detectors/__init__.py +105 -0
src/detectors/base.py +97 -0
src/detectors/broken_links/__init__.py +3 -0
src/detectors/broken_links/detector.py +280 -0
src/detectors/config.py +59 -0
src/detectors/content/__init__.py +0 -0
src/detectors/custom/__init__.py +13 -0
src/detectors/custom/detector.py +45 -0
src/detectors/custom/runners/__init__.py +56 -0
src/detectors/custom/runners/_base.py +177 -0
src/detectors/custom/runners/_factory.py +51 -0
src/detectors/custom/runners/_feature_extraction.py +138 -0
src/detectors/custom/runners/_gliner2.py +324 -0
src/detectors/custom/runners/_image_classification.py +98 -0
src/detectors/custom/runners/_llm.py +22 -0
src/detectors/custom/runners/_object_detection.py +107 -0
src/detectors/custom/runners/_regex.py +147 -0
src/detectors/custom/runners/_text_classification.py +109 -0
src/detectors/custom/trainer.py +293 -0
src/detectors/dependencies.py +109 -0
src/detectors/pii/__init__.py +0 -0
src/detectors/pii/detector.py +883 -0
src/detectors/secrets/__init__.py +0 -0
src/detectors/secrets/detector.py +399 -0
src/detectors/threat/__init__.py +0 -0
src/detectors/threat/code_security_detector.py +206 -0
src/detectors/threat/yara_detector.py +177 -0
src/main.py +608 -0
src/models/generated_detectors.py +1296 -0
src/models/generated_input.py +2732 -0
src/models/generated_single_asset_scan_results.py +240 -0
src/outputs/__init__.py +3 -0
src/outputs/base.py +69 -0
src/outputs/console.py +62 -0
src/outputs/factory.py +156 -0
src/outputs/file.py +83 -0
src/outputs/rest.py +258 -0
src/pipeline/__init__.py +7 -0
src/pipeline/content_provider.py +26 -0
src/pipeline/detector_pipeline.py +742 -0
src/pipeline/parsed_content_provider.py +59 -0
src/sandbox/__init__.py +5 -0
src/sandbox/runner.py +145 -0
src/sources/__init__.py +95 -0
src/sources/atlassian_common.py +389 -0
src/sources/azure_blob_storage/__init__.py +3 -0
src/sources/azure_blob_storage/source.py +130 -0
src/sources/base.py +296 -0
src/sources/confluence/__init__.py +3 -0
src/sources/confluence/source.py +733 -0
src/sources/databricks/__init__.py +3 -0
src/sources/databricks/source.py +1279 -0
src/sources/dependencies.py +81 -0
src/sources/google_cloud_storage/__init__.py +3 -0
src/sources/google_cloud_storage/source.py +114 -0
src/sources/hive/__init__.py +3 -0
src/sources/hive/source.py +709 -0
src/sources/jira/__init__.py +3 -0
src/sources/jira/source.py +605 -0
src/sources/mongodb/__init__.py +3 -0
src/sources/mongodb/source.py +550 -0
src/sources/mssql/__init__.py +3 -0
src/sources/mssql/source.py +1034 -0
src/sources/mysql/__init__.py +3 -0
src/sources/mysql/source.py +797 -0
src/sources/neo4j/__init__.py +0 -0
src/sources/neo4j/source.py +523 -0
src/sources/object_storage/base.py +679 -0
src/sources/oracle/__init__.py +3 -0
src/sources/oracle/source.py +982 -0
src/sources/postgresql/__init__.py +3 -0
src/sources/postgresql/source.py +774 -0
src/sources/powerbi/__init__.py +3 -0
src/sources/powerbi/source.py +774 -0
src/sources/recipe_normalizer.py +179 -0
src/sources/s3_compatible_storage/README.md +66 -0
src/sources/s3_compatible_storage/__init__.py +3 -0
src/sources/s3_compatible_storage/source.py +150 -0
src/sources/servicedesk/__init__.py +3 -0
src/sources/servicedesk/source.py +620 -0
src/sources/slack/__init__.py +3 -0
src/sources/slack/source.py +534 -0
src/sources/snowflake/__init__.py +3 -0
src/sources/snowflake/source.py +912 -0
src/sources/tableau/__init__.py +3 -0
src/sources/tableau/source.py +799 -0
src/sources/tabular_utils.py +165 -0
src/sources/wordpress/__init__.py +3 -0
src/sources/wordpress/source.py +590 -0
src/telemetry.py +96 -0
src/utils/__init__.py +1 -0
src/utils/content_extraction.py +108 -0
src/utils/file_parser.py +777 -0
src/utils/hashing.py +82 -0
src/utils/uv_sync.py +79 -0
src/utils/validation.py +56 -0

src/detectors/threat/yara_detector.py ADDED Viewed

@@ -0,0 +1,177 @@
+"""YARA-based threat detector — compiles structured rule objects into a live ruleset."""
+import logging
+import re
+from ...models.generated_detectors import (
+    DetectorConfig,
+    Severity,
+    ThreatDetectorConfig,
+    YaraRuleConfig,
+)
+from ...models.generated_single_asset_scan_results import DetectionResult, DetectorType, Location
+from ..base import BaseDetector
+from ..dependencies import require_module
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+# Translates the lowercase severity string stored in a rule's meta section
+# into the Severity enum member attached to a finding.
+_SEVERITY_MAP: dict[str, Severity] = {
+    "critical": Severity.critical,
+    "high": Severity.high,
+    "medium": Severity.medium,
+    "low": Severity.low,
+}
+# Numeric rank per severity string (higher = more severe); used as the primary
+# sort key when ordering findings.
+_SEVERITY_ORDER: dict[str, int] = {"low": 1, "medium": 2, "high": 3, "critical": 4}
+# Matches every character other than ASCII letters, digits and underscore.
+_SAFE_NAME = re.compile(r"[^A-Za-z0-9_]")
+def _sanitize_name(name: str) -> str:
+    sanitized = _SAFE_NAME.sub("_", name)
+    return ("_" + sanitized) if sanitized and sanitized[0].isdigit() else sanitized or "Rule"
+def _build_source(rules: list[YaraRuleConfig]) -> str:
+    parts: list[str] = []
+    for rule in rules:
+        strings_block = "\n".join(
+            f"        $s{i} = {pattern}" for i, pattern in enumerate(rule.strings)
+        )
+        desc = (rule.description or "").replace('"', '\\"')
+        sev = rule.severity.value if hasattr(rule.severity, "value") else str(rule.severity)
+        cat = (rule.category or "").replace('"', '\\"')
+        parts.append(
+            f"rule {_sanitize_name(rule.name)} {{\n"
+            f"    meta:\n"
+            f'        description = "{desc}"\n'
+            f'        severity = "{sev}"\n'
+            f'        category = "{cat}"\n'
+            f"    strings:\n"
+            f"{strings_block}\n"
+            f"    condition:\n"
+            f"        {rule.condition}\n"
+            f"}}"
+        )
+    return "\n\n".join(parts)
+class YaraDetector(BaseDetector):
+    """
+    Threat detector powered by yara-python.
+    Takes structured rule objects from config, compiles them into a YARA ruleset,
+    and scans extracted text or raw bytes for matches. Use the bundled examples in
+    all_detectors_examples.json as starting points and extend with custom rules.
+    """
+    detector_type = "yara"
+    detector_name = "yara"
+    def __init__(self, config: DetectorConfig | None = None) -> None:
+        super().__init__(config)
+        self._yara = require_module("yara", "yara", ["security"])
+        self._threat_config = (
+            config if isinstance(config, ThreatDetectorConfig) else ThreatDetectorConfig()
+        )
+        self._rules = self._compile()
+    def _compile(self) -> object | None:
+        rules = self._threat_config.rules
+        if not rules:
+            return None
+        source = _build_source(rules)
+        try:
+            return self._yara.compile(source=source)
+        except Exception:
+            logger.exception("YARA compilation failed")
+            return None
+    async def detect(
+        self, content: str | bytes, content_type: str = "text/plain"
+    ) -> list[DetectionResult]:
+        if self._rules is None:
+            return []
+        data = content if isinstance(content, bytes) else content.encode("utf-8", errors="ignore")
+        timeout = self._threat_config.timeout or 60
+        try:
+            matches = self._rules.match(data=data, timeout=timeout)
+        except Exception as exc:
+            if "timeout" in str(exc).lower():
+                logger.warning("YARA scan timed out after %ds on %s", timeout, content_type)
+            else:
+                logger.error("YARA scan error on %s: %s", content_type, exc)
+            return []
+        threshold = self._threat_config.confidence_threshold or 0.7
+        results: list[DetectionResult] = []
+        for match in matches:
+            meta: dict[str, object] = getattr(match, "meta", {}) or {}
+            rule_name = str(getattr(match, "rule", "unknown"))
+            description = str(meta.get("description", rule_name))
+            severity = _SEVERITY_MAP.get(str(meta.get("severity", "medium")), Severity.medium)
+            matched_texts = [
+                inst.matched_data.decode("utf-8", errors="replace")
+                for sm in getattr(match, "strings", [])
+                for inst in getattr(sm, "instances", [])
+            ]
+            count = len(matched_texts)
+            confidence = min(0.70 + max(count - 1, 0) * 0.04, 0.99)
+            if confidence < threshold:
+                continue
+            results.append(
+                DetectionResult(
+                    detector_type=DetectorType.YARA,
+                    finding_type=rule_name,
+                    category="THREAT",
+                    severity=severity,
+                    confidence=confidence,
+                    matched_content=", ".join(matched_texts[:3]),
+                    location=Location(
+                        path=f"yara:{content_type}",
+                        description=description,
+                    ),
+                    metadata={
+                        "rule": rule_name,
+                        "description": description,
+                        "match_count": count,
+                        "tags": list(getattr(match, "tags", [])),
+                    },
+                )
+            )
+        results.sort(
+            key=lambda r: (_SEVERITY_ORDER.get(r.severity.value, 0), r.confidence),
+            reverse=True,
+        )
+        max_f = self._threat_config.max_findings
+        return results[:max_f] if max_f and len(results) > max_f else results
+    def get_supported_content_types(self) -> list[str]:
+        return [
+            "text/plain",
+            "text/html",
+            "text/csv",
+            "text/markdown",
+            "text/x-python",
+            "text/x-shellscript",
+            "text/javascript",
+            "application/json",
+            "application/xml",
+            "application/pdf",
+            "application/octet-stream",
+            "application/x-sh",
+            "application/x-executable",
+            "application/javascript",
+            "application/vnd.ms-excel",
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        ]
+    def requires_gpu(self) -> bool:
+        return False