agentic-threat-hunting-framework 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,376 @@
+ """Semantic similarity search for past hunts."""
+
+ import json
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ import click
+ import yaml
+ from rich.console import Console
+ from rich.table import Table
+
+ console = Console()
+
+ SIMILAR_EPILOG = """
+ \b
+ Examples:
+ # Find hunts similar to a text query
+ athf similar "password spraying via RDP"
+
+ # Find hunts similar to a specific hunt
+ athf similar --hunt H-0013
+
+ # Limit results to top 5
+ athf similar "kerberos" --limit 5
+
+ # Export as JSON
+ athf similar "credential theft" --format json
+
+ \b
+ Why This Helps AI:
+ • Semantic search (not just keyword matching)
+ • Find related hunts with different terminology
+ • Discover patterns across hunt history
+ • Better than grep for conceptual matches
+ • Identify similar hunts to avoid duplication
+ """
+
+
+ @click.command(epilog=SIMILAR_EPILOG)
+ @click.argument("query", required=False)
+ @click.option("--hunt", help="Hunt ID to find similar hunts for (e.g., H-0013)")
+ @click.option("--limit", default=10, type=int, help="Maximum number of results (default: 10)")
+ @click.option(
+     "--format",
+     "output_format",
+     type=click.Choice(["table", "json", "yaml"]),
+     default="table",
+     help="Output format (default: table)",
+ )
+ @click.option("--threshold", default=0.1, type=float, help="Minimum similarity score (0-1, default: 0.1)")
+ def similar(
+     query: Optional[str],
+     hunt: Optional[str],
+     limit: int,
+     output_format: str,
+     threshold: float,
+ ) -> None:
+     """Find hunts similar to a query or hunt ID.
+
+     Uses semantic similarity to find related hunts even when
+     terminology differs. Better than keyword search for discovering
+     patterns and avoiding duplicate hunts.
+
+     \b
+     Use Cases:
+     • Check if a similar hunt already exists
+     • Find related hunts for context
+     • Discover patterns across hunt history
+     • Identify hunt clusters by topic
+
+     \b
+     Examples:
+     # Text query
+     athf similar "password spraying"
+
+     # Similar to an existing hunt
+     athf similar --hunt H-0013
+
+     # Top 5 results
+     athf similar "lateral movement" --limit 5
+     """
+     # Validate inputs
+     if not query and not hunt:
+         console.print("[red]Error: Must provide either QUERY or --hunt option[/red]")
+         console.print("\n[dim]Examples:[/dim]")
+         console.print(' athf similar "password spraying"')
+         console.print(" athf similar --hunt H-0013")
+         raise click.Abort()
+
+     if query and hunt:
+         console.print("[red]Error: Cannot specify both QUERY and --hunt[/red]")
+         raise click.Abort()
+
+     # Get query text
+     query_text: str
+     if hunt:
+         hunt_text = _get_hunt_text(hunt)
+         if not hunt_text:
+             console.print(f"[red]Error: Hunt {hunt} not found[/red]")
+             raise click.Abort()
+         query_text = hunt_text
+     else:
+         query_text = query or ""  # Should never be None due to validation above
+
+     # Find similar hunts
+     results = _find_similar_hunts(query_text, limit=limit, threshold=threshold, exclude_hunt=hunt)
+
+     # Format and display results
+     if output_format == "json":
+         output = json.dumps(results, indent=2)
+         console.print(output)
+     elif output_format == "yaml":
+         output = yaml.dump(results, default_flow_style=False, sort_keys=False)
+         console.print(output)
+     else:  # table
+         _display_results_table(results, query_text=query_text, reference_hunt=hunt)
+
+
+ def _get_hunt_text(hunt_id: str) -> Optional[str]:
+     """Get full text content of a hunt."""
+     hunt_file = Path(f"hunts/{hunt_id}.md")
+     if not hunt_file.exists():
+         return None
+     return hunt_file.read_text()
+
+
+ def _find_similar_hunts(
+     query_text: str,
+     limit: int = 10,
+     threshold: float = 0.1,
+     exclude_hunt: Optional[str] = None,
+ ) -> List[Dict[str, Any]]:
+     """Find similar hunts using TF-IDF similarity."""
+     try:
+         from sklearn.feature_extraction.text import TfidfVectorizer  # type: ignore
+         from sklearn.metrics.pairwise import cosine_similarity  # type: ignore
+     except ImportError:
+         console.print("[red]Error: scikit-learn not installed[/red]")
+         console.print("[dim]Install with: pip install scikit-learn[/dim]")
+         raise click.Abort()
+
+     # Load all hunts
+     hunts_dir = Path("hunts")
+     hunt_files = list(hunts_dir.glob("H-*.md"))
+
+     if not hunt_files:
+         console.print("[yellow]No hunts found in hunts/ directory[/yellow]")
+         return []
+
+     # Extract hunt content and metadata
+     hunt_data = []
+     for hunt_file in hunt_files:
+         hunt_id = hunt_file.stem
+
+         # Skip excluded hunt
+         if exclude_hunt and hunt_id == exclude_hunt:
+             continue
+
+         content = hunt_file.read_text()
+         metadata = _extract_hunt_metadata(content)
+
+         # Extract searchable text (weighted semantic sections)
+         searchable_text = _extract_searchable_text(content, metadata)
+
+         hunt_data.append(
+             {
+                 "hunt_id": hunt_id,
+                 "content": content,
+                 "searchable_text": searchable_text,
+                 "metadata": metadata,
+             }
+         )
+
+     if not hunt_data:
+         console.print("[yellow]No hunts available for comparison[/yellow]")
+         return []
+
+     # Build TF-IDF vectors using searchable text (weighted semantic sections)
+     documents = [query_text] + [h["searchable_text"] for h in hunt_data]
+
+     vectorizer = TfidfVectorizer(
+         max_features=1000,
+         stop_words="english",
+         ngram_range=(1, 2),  # Unigrams and bigrams
+     )
+
+     tfidf_matrix = vectorizer.fit_transform(documents)
+
+     # Calculate similarity scores
+     query_vector = tfidf_matrix[0:1]
+     hunt_vectors = tfidf_matrix[1:]
+
+     similarities = cosine_similarity(query_vector, hunt_vectors)[0]
+
+     # Combine results with metadata
+     results = []
+     for i, hunt_info in enumerate(hunt_data):
+         score = float(similarities[i])
+
+         if score >= threshold:
+             metadata = hunt_info["metadata"]  # type: ignore[assignment]
+             results.append(
+                 {
+                     "hunt_id": hunt_info["hunt_id"],
+                     "similarity_score": round(score, 4),
+                     "title": metadata.get("title", "Unknown"),
+                     "status": metadata.get("status", "unknown"),
+                     "tactics": metadata.get("tactics", []),
+                     "techniques": metadata.get("techniques", []),
+                     "platform": metadata.get("platform", []),
+                 }
+             )
+
+     # Sort by similarity score (descending)
+     results.sort(key=lambda x: x["similarity_score"], reverse=True)
+
+     return results[:limit]
+
+
+ def _extract_hunt_metadata(content: str) -> Dict[str, Any]:
+     """Extract YAML frontmatter metadata from hunt file."""
+     if not content.startswith("---"):
+         return {}
+
+     try:
+         yaml_end = content.find("---", 3)
+         if yaml_end > 0:
+             frontmatter = content[3:yaml_end]
+             return yaml.safe_load(frontmatter) or {}
+     except yaml.YAMLError:
+         return {}
+
+     return {}
+
+
+ def _extract_searchable_text(content: str, metadata: Dict[str, Any]) -> str:
+     """Extract semantically important text for similarity matching.
+
+     Focuses on key sections and applies weighting to improve match accuracy:
+     - Title (3x weight)
+     - Hypothesis (2x weight)
+     - ABLE framework sections (1.5x weight)
+     - Tactics/Techniques (1x weight)
+
+     Ignores: SQL queries, results, timestamps, org IDs, lessons learned
+     """
+     parts = []
+
+     # Title (3x weight - most important)
+     title = metadata.get("title", "")
+     if title:
+         parts.extend([title] * 3)
+
+     # Tactics and techniques (1x weight)
+     tactics = metadata.get("tactics", [])
+     if isinstance(tactics, list):
+         parts.extend(tactics)
+     elif tactics:
+         parts.append(str(tactics))
+
+     techniques = metadata.get("techniques", [])
+     if isinstance(techniques, list):
+         parts.extend(techniques)
+     elif techniques:
+         parts.append(str(techniques))
+
+     platform = metadata.get("platform", [])
+     if isinstance(platform, list):
+         parts.extend(platform)
+     elif platform:
+         parts.append(str(platform))
+
+     # Extract hypothesis section (2x weight)
+     hypothesis = _extract_section(content, "## Hypothesis")
+     if hypothesis:
+         parts.extend([hypothesis] * 2)
+
+     # Extract ABLE framework sections (1.5x weight each)
+     able_sections = ["Actor", "Behavior", "Location", "Evidence"]
+     for section in able_sections:
+         text = _extract_section(content, f"### {section}")
+         if text:
+             # Weight 1.5x = add once + half again
+             parts.append(text)
+             parts.append(text[: len(text) // 2])  # Add first half again for 1.5x weight
+
+     return " ".join(parts)
+
+
+ def _extract_section(content: str, heading: str) -> str:
+     """Extract text from a markdown section until the next heading."""
+     lines = content.split("\n")
+     section_lines = []
+     in_section = False
+
+     for line in lines:
+         if line.startswith(heading):
+             in_section = True
+             continue
+
+         if in_section:
+             # Stop at the next heading (any level)
+             if line.startswith("#"):
+                 break
+             section_lines.append(line)
+
+     return " ".join(section_lines).strip()
+
+
+ def _display_results_table(
+     results: List[Dict[str, Any]],
+     query_text: str,
+     reference_hunt: Optional[str] = None,
+ ) -> None:
+     """Display results in rich table format."""
+     # Header (always show, even if no results)
+     if reference_hunt:
+         console.print(f"\n[bold]Similar to {reference_hunt}:[/bold]")
+     else:
+         query_preview = query_text[:60] + "..." if len(query_text) > 60 else query_text
+         console.print(f"\n[bold]Similar to:[/bold] [dim]{query_preview}[/dim]")
+
+     if not results:
+         console.print("[yellow]No similar hunts found[/yellow]")
+         return
+
+     console.print(f"[dim]Found {len(results)} similar hunts[/dim]\n")
+
+     # Table
+     table = Table(show_header=True, header_style="bold cyan")
+     table.add_column("Score", style="green", no_wrap=True, width=6)
+     table.add_column("Hunt ID", style="cyan", no_wrap=True, width=10)
+     table.add_column("Title", style="white")
+     table.add_column("Status", style="yellow", no_wrap=True, width=12)
+     table.add_column("Tactics", style="dim", width=20)
+
+     for result in results:
+         score = result["similarity_score"]
+         hunt_id = result["hunt_id"]
+         title = result["title"]
+         status = result["status"]
+
+         # Format tactics (abbreviate if too long)
+         tactics = result.get("tactics", [])
+         tactics_str = ", ".join(tactics[:2])
+         if len(tactics) > 2:
+             tactics_str += f" +{len(tactics) - 2}"
+
+         # Color-code score
+         if score >= 0.5:
+             score_str = f"[bold green]{score:.3f}[/bold green]"
+         elif score >= 0.3:
+             score_str = f"[green]{score:.3f}[/green]"
+         elif score >= 0.15:
+             score_str = f"[yellow]{score:.3f}[/yellow]"
+         else:
+             score_str = f"[dim]{score:.3f}[/dim]"
+
+         # Status emoji
+         status_map = {
+             "completed": "✅",
+             "in-progress": "🔄",
+             "planning": "📋",
+         }
+         status_emoji = status_map.get(status, "❓")
+         status_display = f"{status_emoji} {status}"
+
+         table.add_row(score_str, hunt_id, title, status_display, tactics_str)
+
+     console.print(table)
+
+     # Legend
+     console.print("\n[dim]Similarity Score Legend:[/dim]")
+     console.print(
+         "[dim] ≥0.50 = Very similar | 0.30-0.49 = Similar | 0.15-0.29 = Somewhat similar | <0.15 = Low similarity[/dim]\n"
+     )
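
The ranking in _find_similar_hunts comes down to a TF-IDF vectorizer plus cosine similarity, with the query vectorized alongside the hunt corpus and row 0 reserved for the query. A minimal standalone sketch of that idea (the corpus, query, and variable names below are illustrative only, not package code):

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # Toy corpus standing in for the searchable text of hunts/H-*.md files.
    past_hunts = {
        "H-0001": "password spraying against exposed RDP services",
        "H-0002": "kerberoasting and service ticket abuse in the domain",
        "H-0003": "credential theft via LSASS memory access",
    }
    query = "password spraying attempts over RDP logins"

    # Fit one vocabulary over the query plus all hunt texts (row 0 is the query).
    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
    matrix = vectorizer.fit_transform([query] + list(past_hunts.values()))

    # Cosine similarity of the query row against every hunt row, highest first.
    scores = cosine_similarity(matrix[0:1], matrix[1:])[0]
    for hunt_id, score in sorted(zip(past_hunts, scores), key=lambda x: x[1], reverse=True):
        print(f"{hunt_id}  {score:.3f}")
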
@@ -0,0 +1,116 @@
+ """MITRE ATT&CK Matrix reference data.
+
+ This module contains reference data for the MITRE ATT&CK Enterprise matrix,
+ including tactic ordering and technique counts.
+ """
+
+ # MITRE ATT&CK Enterprise Matrix v14 (January 2024)
+ # Approximate technique counts per tactic (top-level techniques, excluding sub-techniques)
+ ATTACK_TACTICS = {
+     "reconnaissance": {
+         "name": "Reconnaissance",
+         "technique_count": 10,
+         "order": 1,
+     },
+     "resource-development": {
+         "name": "Resource Development",
+         "technique_count": 7,
+         "order": 2,
+     },
+     "initial-access": {
+         "name": "Initial Access",
+         "technique_count": 9,
+         "order": 3,
+     },
+     "execution": {
+         "name": "Execution",
+         "technique_count": 12,
+         "order": 4,
+     },
+     "persistence": {
+         "name": "Persistence",
+         "technique_count": 19,
+         "order": 5,
+     },
+     "privilege-escalation": {
+         "name": "Privilege Escalation",
+         "technique_count": 13,
+         "order": 6,
+     },
+     "defense-evasion": {
+         "name": "Defense Evasion",
+         "technique_count": 42,
+         "order": 7,
+     },
+     "credential-access": {
+         "name": "Credential Access",
+         "technique_count": 15,
+         "order": 8,
+     },
+     "discovery": {
+         "name": "Discovery",
+         "technique_count": 30,
+         "order": 9,
+     },
+     "lateral-movement": {
+         "name": "Lateral Movement",
+         "technique_count": 9,
+         "order": 10,
+     },
+     "collection": {
+         "name": "Collection",
+         "technique_count": 17,
+         "order": 11,
+     },
+     "command-and-control": {
+         "name": "Command and Control",
+         "technique_count": 16,
+         "order": 12,
+     },
+     "exfiltration": {
+         "name": "Exfiltration",
+         "technique_count": 9,
+         "order": 13,
+     },
+     "impact": {
+         "name": "Impact",
+         "technique_count": 13,
+         "order": 14,
+     },
+ }
+
+ # Total techniques across all tactics
+ TOTAL_TECHNIQUES = sum(tactic["technique_count"] for tactic in ATTACK_TACTICS.values())
+
+
+ def get_tactic_display_name(tactic_key: str) -> str:
+     """Get the display name for a tactic key.
+
+     Args:
+         tactic_key: Tactic key (e.g., "credential-access")
+
+     Returns:
+         Display name (e.g., "Credential Access")
+     """
+     return ATTACK_TACTICS.get(tactic_key, {}).get("name", tactic_key.replace("-", " ").title())
+
+
+ def get_tactic_technique_count(tactic_key: str) -> int:
+     """Get the total technique count for a tactic.
+
+     Args:
+         tactic_key: Tactic key (e.g., "credential-access")
+
+     Returns:
+         Total technique count for the tactic
+     """
+     return ATTACK_TACTICS.get(tactic_key, {}).get("technique_count", 0)
+
+
+ def get_sorted_tactics() -> list[str]:
+     """Get all tactic keys sorted by ATT&CK matrix order.
+
+     Returns:
+         List of tactic keys in matrix order
+     """
+     return sorted(ATTACK_TACTICS.keys(), key=lambda k: ATTACK_TACTICS[k]["order"])
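
The constants and sorting helper above feed the coverage calculation in athf/core/hunt_manager.py (next diff). A small sketch of how the module's helpers compose (assumes the package is importable as athf; the per-tactic hunted counts are made up):

    from athf.core.attack_matrix import (
        TOTAL_TECHNIQUES,
        get_sorted_tactics,
        get_tactic_display_name,
        get_tactic_technique_count,
    )

    # Hypothetical number of techniques already hunted per tactic.
    hunted = {"credential-access": 4, "lateral-movement": 2}

    for key in get_sorted_tactics():
        total = get_tactic_technique_count(key)
        covered = hunted.get(key, 0)
        pct = (covered / total * 100) if total else 0.0
        print(f"{get_tactic_display_name(key):<22} {covered:>2}/{total:<3} ({pct:.0f}%)")

    print(f"Technique slots across the matrix: {TOTAL_TECHNIQUES}")
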
athf/core/hunt_manager.py CHANGED
@@ -2,8 +2,9 @@
  
  import re
  from pathlib import Path
- from typing import Dict, List, Optional
+ from typing import Any, Dict, List, Optional
  
+ from athf.core.attack_matrix import ATTACK_TACTICS, TOTAL_TECHNIQUES, get_sorted_tactics
  from athf.core.hunt_parser import parse_hunt_file
  
  
@@ -220,26 +221,93 @@ class HuntManager:
              "tp_fp_ratio": round(tp_fp_ratio, 2) if tp_fp_ratio != float("inf") else "∞",
          }
  
-     def calculate_attack_coverage(self) -> Dict[str, List[str]]:
-         """Calculate MITRE ATT&CK technique coverage.
+     def calculate_attack_coverage(self) -> Dict[str, Any]:
+         """Calculate MITRE ATT&CK technique coverage with hunt references.
  
          Returns:
-             Dict mapping tactics to lists of covered techniques
+             Dict with structure:
+             {
+                 "summary": {
+                     "total_hunts": int,
+                     "completed_hunts": int,
+                     "unique_techniques": int,
+                     "tactics_covered": int,
+                     "total_techniques": int,
+                     "overall_coverage_pct": float
+                 },
+                 "by_tactic": {
+                     "tactic-name": {
+                         "hunt_count": int,
+                         "hunt_ids": List[str],
+                         "techniques": {
+                             "T1234.001": ["H-0001", "H-0003"]
+                         },
+                         "techniques_covered": int,
+                         "total_techniques": int,
+                         "coverage_pct": float
+                     }
+                 }
+             }
          """
          hunts = self.list_hunts()
  
-         coverage: Dict = {}
+         # Initialize coverage structure for ALL ATT&CK tactics (not just ones with hunts)
+         coverage_by_tactic: Dict[str, Dict[str, Any]] = {}
+         for tactic_key in get_sorted_tactics():
+             coverage_by_tactic[tactic_key] = {
+                 "hunt_count": 0,
+                 "hunt_ids": set(),
+                 "techniques": {},
+                 "total_techniques": ATTACK_TACTICS[tactic_key]["technique_count"],
+             }
+
+         all_unique_techniques: set[str] = set()
  
          for hunt in hunts:
+             hunt_id = hunt.get("hunt_id", "UNKNOWN")
              tactics = hunt.get("tactics", [])
              techniques = hunt.get("techniques", [])
  
+             # Track all unique techniques across all hunts
+             all_unique_techniques.update(techniques)
+
              for tactic in tactics:
-                 if tactic not in coverage:
-                     coverage[tactic] = set()
+                 # Skip if tactic not in ATT&CK matrix (might be custom tactic)
+                 if tactic not in coverage_by_tactic:
+                     continue
  
+                 # Track hunt IDs for this tactic
+                 coverage_by_tactic[tactic]["hunt_ids"].add(hunt_id)
+
+                 # Track which hunts cover each technique under this tactic
                  for technique in techniques:
-                     coverage[tactic].add(technique)
+                     if technique not in coverage_by_tactic[tactic]["techniques"]:
+                         coverage_by_tactic[tactic]["techniques"][technique] = []
+                     coverage_by_tactic[tactic]["techniques"][technique].append(hunt_id)
+
+         # Calculate coverage percentages and convert sets to sorted lists
+         for tactic in coverage_by_tactic:
+             coverage_by_tactic[tactic]["hunt_count"] = len(coverage_by_tactic[tactic]["hunt_ids"])
+             coverage_by_tactic[tactic]["hunt_ids"] = sorted(coverage_by_tactic[tactic]["hunt_ids"])
+             coverage_by_tactic[tactic]["techniques_covered"] = len(coverage_by_tactic[tactic]["techniques"])
+
+             # Calculate coverage percentage
+             total = coverage_by_tactic[tactic]["total_techniques"]
+             covered = coverage_by_tactic[tactic]["techniques_covered"]
+             coverage_by_tactic[tactic]["coverage_pct"] = (covered / total * 100) if total > 0 else 0.0
+
+         # Calculate overall coverage
+         tactics_with_hunts = len([t for t in coverage_by_tactic.values() if t["hunt_count"] > 0])
+         overall_coverage_pct = (len(all_unique_techniques) / TOTAL_TECHNIQUES * 100) if TOTAL_TECHNIQUES > 0 else 0.0
+
+         # Build summary
+         summary = {
+             "total_hunts": len(hunts),
+             "completed_hunts": len([h for h in hunts if h.get("status") == "completed"]),
+             "unique_techniques": len(all_unique_techniques),
+             "tactics_covered": tactics_with_hunts,
+             "total_techniques": TOTAL_TECHNIQUES,
+             "overall_coverage_pct": overall_coverage_pct,
+         }
  
-         # Convert sets to sorted lists
-         return {tactic: sorted(list(techniques)) for tactic, techniques in coverage.items()}
+         return {"summary": summary, "by_tactic": coverage_by_tactic}