agentic-threat-hunting-framework 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,376 @@
+ """Semantic similarity search for past hunts."""
+
+ import json
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ import click
+ import yaml
+ from rich.console import Console
+ from rich.table import Table
+
+ console = Console()
+
+ SIMILAR_EPILOG = """
+ \b
+ Examples:
+ # Find hunts similar to a text query
+ athf similar "password spraying via RDP"
+
+ # Find hunts similar to a specific hunt
+ athf similar --hunt H-0013
+
+ # Limit results to top 5
+ athf similar "kerberos" --limit 5
+
+ # Export as JSON
+ athf similar "credential theft" --format json
+
+ \b
+ Why This Helps AI:
+ • Semantic search (not just keyword matching)
+ • Find related hunts with different terminology
+ • Discover patterns across hunt history
+ • Better than grep for conceptual matches
+ • Identify similar hunts to avoid duplication
+ """
+
+
+ @click.command(epilog=SIMILAR_EPILOG)
+ @click.argument("query", required=False)
+ @click.option("--hunt", help="Hunt ID to find similar hunts for (e.g., H-0013)")
+ @click.option("--limit", default=10, type=int, help="Maximum number of results (default: 10)")
+ @click.option(
+     "--format",
+     "output_format",
+     type=click.Choice(["table", "json", "yaml"]),
+     default="table",
+     help="Output format (default: table)",
+ )
+ @click.option("--threshold", default=0.1, type=float, help="Minimum similarity score (0-1, default: 0.1)")
+ def similar(
+     query: Optional[str],
+     hunt: Optional[str],
+     limit: int,
+     output_format: str,
+     threshold: float,
+ ) -> None:
+     """Find hunts similar to a query or hunt ID.
+
+     Uses semantic similarity to find related hunts even when
+     terminology differs. Better than keyword search for discovering
+     patterns and avoiding duplicate hunts.
+
+     \b
+     Use Cases:
+     • Check if a similar hunt already exists
+     • Find related hunts for context
+     • Discover patterns across hunt history
+     • Identify hunt clusters by topic
+
+     \b
+     Examples:
+     # Text query
+     athf similar "password spraying"
+
+     # Similar to an existing hunt
+     athf similar --hunt H-0013
+
+     # Top 5 results
+     athf similar "lateral movement" --limit 5
+     """
+     # Validate inputs
+     if not query and not hunt:
+         console.print("[red]Error: Must provide either QUERY or --hunt option[/red]")
+         console.print("\n[dim]Examples:[/dim]")
+         console.print(' athf similar "password spraying"')
+         console.print(" athf similar --hunt H-0013")
+         raise click.Abort()
+
+     if query and hunt:
+         console.print("[red]Error: Cannot specify both QUERY and --hunt[/red]")
+         raise click.Abort()
+
+     # Get query text
+     query_text: str
+     if hunt:
+         hunt_text = _get_hunt_text(hunt)
+         if not hunt_text:
+             console.print(f"[red]Error: Hunt {hunt} not found[/red]")
+             raise click.Abort()
+         query_text = hunt_text
+     else:
+         query_text = query or ""  # Should never be None due to validation above
+
+     # Find similar hunts
+     results = _find_similar_hunts(query_text, limit=limit, threshold=threshold, exclude_hunt=hunt)
+
+     # Format and display results
+     if output_format == "json":
+         output = json.dumps(results, indent=2)
+         console.print(output)
+     elif output_format == "yaml":
+         output = yaml.dump(results, default_flow_style=False, sort_keys=False)
+         console.print(output)
+     else:  # table
+         _display_results_table(results, query_text=query_text, reference_hunt=hunt)
+
+
+ def _get_hunt_text(hunt_id: str) -> Optional[str]:
+     """Get full text content of a hunt."""
+     hunt_file = Path(f"hunts/{hunt_id}.md")
+     if not hunt_file.exists():
+         return None
+     return hunt_file.read_text()
+
+
+ def _find_similar_hunts(
+     query_text: str,
+     limit: int = 10,
+     threshold: float = 0.1,
+     exclude_hunt: Optional[str] = None,
+ ) -> List[Dict[str, Any]]:
+     """Find similar hunts using TF-IDF similarity."""
+     try:
+         from sklearn.feature_extraction.text import TfidfVectorizer  # type: ignore
+         from sklearn.metrics.pairwise import cosine_similarity  # type: ignore
+     except ImportError:
+         console.print("[red]Error: scikit-learn not installed[/red]")
+         console.print("[dim]Install with: pip install scikit-learn[/dim]")
+         raise click.Abort()
+
+     # Load all hunts
+     hunts_dir = Path("hunts")
+     hunt_files = list(hunts_dir.glob("H-*.md"))
+
+     if not hunt_files:
+         console.print("[yellow]No hunts found in hunts/ directory[/yellow]")
+         return []
+
+     # Extract hunt content and metadata
+     hunt_data = []
+     for hunt_file in hunt_files:
+         hunt_id = hunt_file.stem
+
+         # Skip excluded hunt
+         if exclude_hunt and hunt_id == exclude_hunt:
+             continue
+
+         content = hunt_file.read_text()
+         metadata = _extract_hunt_metadata(content)
+
+         # Extract searchable text (weighted semantic sections)
+         searchable_text = _extract_searchable_text(content, metadata)
+
+         hunt_data.append(
+             {
+                 "hunt_id": hunt_id,
+                 "content": content,
+                 "searchable_text": searchable_text,
+                 "metadata": metadata,
+             }
+         )
+
+     if not hunt_data:
+         console.print("[yellow]No hunts available for comparison[/yellow]")
+         return []
+
+     # Build TF-IDF vectors using searchable text (weighted semantic sections)
+     documents = [query_text] + [h["searchable_text"] for h in hunt_data]
+
+     vectorizer = TfidfVectorizer(
+         max_features=1000,
+         stop_words="english",
+         ngram_range=(1, 2),  # Unigrams and bigrams
+     )
+
+     tfidf_matrix = vectorizer.fit_transform(documents)
+
+     # Calculate similarity scores
+     query_vector = tfidf_matrix[0:1]
+     hunt_vectors = tfidf_matrix[1:]
+
+     similarities = cosine_similarity(query_vector, hunt_vectors)[0]
+
+     # Combine results with metadata
+     results = []
+     for i, hunt_info in enumerate(hunt_data):
+         score = float(similarities[i])
+
+         if score >= threshold:
+             metadata = hunt_info["metadata"]  # type: ignore[assignment]
+             results.append(
+                 {
+                     "hunt_id": hunt_info["hunt_id"],
+                     "similarity_score": round(score, 4),
+                     "title": metadata.get("title", "Unknown"),
+                     "status": metadata.get("status", "unknown"),
+                     "tactics": metadata.get("tactics", []),
+                     "techniques": metadata.get("techniques", []),
+                     "platform": metadata.get("platform", []),
+                 }
+             )
+
+     # Sort by similarity score (descending)
+     results.sort(key=lambda x: x["similarity_score"], reverse=True)
+
+     return results[:limit]
+
+
+ def _extract_hunt_metadata(content: str) -> Dict[str, Any]:
+     """Extract YAML frontmatter metadata from hunt file."""
+     if not content.startswith("---"):
+         return {}
+
+     try:
+         yaml_end = content.find("---", 3)
+         if yaml_end > 0:
+             frontmatter = content[3:yaml_end]
+             return yaml.safe_load(frontmatter) or {}
+     except yaml.YAMLError:
+         return {}
+
+     return {}
+
+
+ def _extract_searchable_text(content: str, metadata: Dict[str, Any]) -> str:
+     """Extract semantically important text for similarity matching.
+
+     Focuses on key sections and applies weighting to improve match accuracy:
+     - Title (3x weight)
+     - Hypothesis (2x weight)
+     - ABLE framework sections (1.5x weight)
+     - Tactics/Techniques (1x weight)
+
+     Ignores: SQL queries, results, timestamps, org IDs, lessons learned
+     """
+     parts = []
+
+     # Title (3x weight - most important)
+     title = metadata.get("title", "")
+     if title:
+         parts.extend([title] * 3)
+
+     # Tactics and techniques (1x weight)
+     tactics = metadata.get("tactics", [])
+     if isinstance(tactics, list):
+         parts.extend(tactics)
+     elif tactics:
+         parts.append(str(tactics))
+
+     techniques = metadata.get("techniques", [])
+     if isinstance(techniques, list):
+         parts.extend(techniques)
+     elif techniques:
+         parts.append(str(techniques))
+
+     platform = metadata.get("platform", [])
+     if isinstance(platform, list):
+         parts.extend(platform)
+     elif platform:
+         parts.append(str(platform))
+
+     # Extract hypothesis section (2x weight)
+     hypothesis = _extract_section(content, "## Hypothesis")
+     if hypothesis:
+         parts.extend([hypothesis] * 2)
+
+     # Extract ABLE framework sections (1.5x weight each)
+     able_sections = ["Actor", "Behavior", "Location", "Evidence"]
+     for section in able_sections:
+         text = _extract_section(content, f"### {section}")
+         if text:
+             # Weight 1.5x = add once + half again
+             parts.append(text)
+             parts.append(text[: len(text) // 2])  # Add first half again for 1.5x weight
+
+     return " ".join(parts)
+
+
+ def _extract_section(content: str, heading: str) -> str:
+     """Extract text from a markdown section until the next heading."""
+     lines = content.split("\n")
+     section_lines = []
+     in_section = False
+
+     for line in lines:
+         if line.startswith(heading):
+             in_section = True
+             continue
+
+         if in_section:
+             # Stop at the next heading (any level)
+             if line.startswith("#"):
+                 break
+             section_lines.append(line)
+
+     return " ".join(section_lines).strip()
+
+
+ def _display_results_table(
+     results: List[Dict[str, Any]],
+     query_text: str,
+     reference_hunt: Optional[str] = None,
+ ) -> None:
+     """Display results in rich table format."""
+     # Header (always show, even if no results)
+     if reference_hunt:
+         console.print(f"\n[bold]Similar to {reference_hunt}:[/bold]")
+     else:
+         query_preview = query_text[:60] + "..." if len(query_text) > 60 else query_text
+         console.print(f"\n[bold]Similar to:[/bold] [dim]{query_preview}[/dim]")
+
+     if not results:
+         console.print("[yellow]No similar hunts found[/yellow]")
+         return
+
+     console.print(f"[dim]Found {len(results)} similar hunts[/dim]\n")
+
+     # Table
+     table = Table(show_header=True, header_style="bold cyan")
+     table.add_column("Score", style="green", no_wrap=True, width=6)
+     table.add_column("Hunt ID", style="cyan", no_wrap=True, width=10)
+     table.add_column("Title", style="white")
+     table.add_column("Status", style="yellow", no_wrap=True, width=12)
+     table.add_column("Tactics", style="dim", width=20)
+
+     for result in results:
+         score = result["similarity_score"]
+         hunt_id = result["hunt_id"]
+         title = result["title"]
+         status = result["status"]
+
+         # Format tactics (abbreviate if too long)
+         tactics = result.get("tactics", [])
+         tactics_str = ", ".join(tactics[:2])
+         if len(tactics) > 2:
+             tactics_str += f" +{len(tactics) - 2}"
+
+         # Color-code score
+         if score >= 0.5:
+             score_str = f"[bold green]{score:.3f}[/bold green]"
+         elif score >= 0.3:
+             score_str = f"[green]{score:.3f}[/green]"
+         elif score >= 0.15:
+             score_str = f"[yellow]{score:.3f}[/yellow]"
+         else:
+             score_str = f"[dim]{score:.3f}[/dim]"
+
+         # Status emoji
+         status_map = {
+             "completed": "✅",
+             "in-progress": "🔄",
+             "planning": "📋",
+         }
+         status_emoji = status_map.get(status, "❓")
+         status_display = f"{status_emoji} {status}"
+
+         table.add_row(score_str, hunt_id, title, status_display, tactics_str)
+
+     console.print(table)
+
+     # Legend
+     console.print("\n[dim]Similarity Score Legend:[/dim]")
+     console.print(
+         "[dim] ≥0.50 = Very similar | 0.30-0.49 = Similar | 0.15-0.29 = Somewhat similar | <0.15 = Low similarity[/dim]\n"
+     )
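
The ranking in _find_similar_hunts comes down to a TF-IDF vectorizer plus cosine similarity, with the query vectorized alongside the hunt corpus and row 0 reserved for the query. A minimal standalone sketch of that idea (the corpus, query, and variable names below are illustrative only, not package code):

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # Toy corpus standing in for the searchable text of hunts/H-*.md files.
    past_hunts = {
        "H-0001": "password spraying against exposed RDP services",
        "H-0002": "kerberoasting and service ticket abuse in the domain",
        "H-0003": "credential theft via LSASS memory access",
    }
    query = "password spraying attempts over RDP logins"

    # Fit one vocabulary over the query plus all hunt texts (row 0 is the query).
    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
    matrix = vectorizer.fit_transform([query] + list(past_hunts.values()))

    # Cosine similarity of the query row against every hunt row, highest first.
    scores = cosine_similarity(matrix[0:1], matrix[1:])[0]
    for hunt_id, score in sorted(zip(past_hunts, scores), key=lambda x: x[1], reverse=True):
        print(f"{hunt_id}  {score:.3f}")
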
@@ -0,0 +1,116 @@
+ """MITRE ATT&CK Matrix reference data.
+
+ This module contains reference data for the MITRE ATT&CK Enterprise matrix,
+ including tactic ordering and technique counts.
+ """
+
+ # MITRE ATT&CK Enterprise Matrix v14 (January 2024)
+ # Approximate technique counts per tactic (top-level techniques, excluding sub-techniques)
+ ATTACK_TACTICS = {
+     "reconnaissance": {
+         "name": "Reconnaissance",
+         "technique_count": 10,
+         "order": 1,
+     },
+     "resource-development": {
+         "name": "Resource Development",
+         "technique_count": 7,
+         "order": 2,
+     },
+     "initial-access": {
+         "name": "Initial Access",
+         "technique_count": 9,
+         "order": 3,
+     },
+     "execution": {
+         "name": "Execution",
+         "technique_count": 12,
+         "order": 4,
+     },
+     "persistence": {
+         "name": "Persistence",
+         "technique_count": 19,
+         "order": 5,
+     },
+     "privilege-escalation": {
+         "name": "Privilege Escalation",
+         "technique_count": 13,
+         "order": 6,
+     },
+     "defense-evasion": {
+         "name": "Defense Evasion",
+         "technique_count": 42,
+         "order": 7,
+     },
+     "credential-access": {
+         "name": "Credential Access",
+         "technique_count": 15,
+         "order": 8,
+     },
+     "discovery": {
+         "name": "Discovery",
+         "technique_count": 30,
+         "order": 9,
+     },
+     "lateral-movement": {
+         "name": "Lateral Movement",
+         "technique_count": 9,
+         "order": 10,
+     },
+     "collection": {
+         "name": "Collection",
+         "technique_count": 17,
+         "order": 11,
+     },
+     "command-and-control": {
+         "name": "Command and Control",
+         "technique_count": 16,
+         "order": 12,
+     },
+     "exfiltration": {
+         "name": "Exfiltration",
+         "technique_count": 9,
+         "order": 13,
+     },
+     "impact": {
+         "name": "Impact",
+         "technique_count": 13,
+         "order": 14,
+     },
+ }
+
+ # Total techniques across all tactics
+ TOTAL_TECHNIQUES = sum(tactic["technique_count"] for tactic in ATTACK_TACTICS.values())
+
+
+ def get_tactic_display_name(tactic_key: str) -> str:
+     """Get the display name for a tactic key.
+
+     Args:
+         tactic_key: Tactic key (e.g., "credential-access")
+
+     Returns:
+         Display name (e.g., "Credential Access")
+     """
+     return ATTACK_TACTICS.get(tactic_key, {}).get("name", tactic_key.replace("-", " ").title())
+
+
+ def get_tactic_technique_count(tactic_key: str) -> int:
+     """Get the total technique count for a tactic.
+
+     Args:
+         tactic_key: Tactic key (e.g., "credential-access")
+
+     Returns:
+         Total technique count for the tactic
+     """
+     return ATTACK_TACTICS.get(tactic_key, {}).get("technique_count", 0)
+
+
+ def get_sorted_tactics() -> list[str]:
+     """Get all tactic keys sorted by ATT&CK matrix order.
+
+     Returns:
+         List of tactic keys in matrix order
+     """
+     return sorted(ATTACK_TACTICS.keys(), key=lambda k: ATTACK_TACTICS[k]["order"])
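
The constants and sorting helper above feed the coverage calculation in athf/core/hunt_manager.py (next diff). A small sketch of how the module's helpers compose (assumes the package is importable as athf; the per-tactic hunted counts are made up):

    from athf.core.attack_matrix import (
        TOTAL_TECHNIQUES,
        get_sorted_tactics,
        get_tactic_display_name,
        get_tactic_technique_count,
    )

    # Hypothetical number of techniques already hunted per tactic.
    hunted = {"credential-access": 4, "lateral-movement": 2}

    for key in get_sorted_tactics():
        total = get_tactic_technique_count(key)
        covered = hunted.get(key, 0)
        pct = (covered / total * 100) if total else 0.0
        print(f"{get_tactic_display_name(key):<22} {covered:>2}/{total:<3} ({pct:.0f}%)")

    print(f"Technique slots across the matrix: {TOTAL_TECHNIQUES}")
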
athf/core/hunt_manager.py CHANGED
@@ -2,8 +2,9 @@
  
  import re
  from pathlib import Path
- from typing import Dict, List, Optional
+ from typing import Any, Dict, List, Optional
  
+ from athf.core.attack_matrix import ATTACK_TACTICS, TOTAL_TECHNIQUES, get_sorted_tactics
  from athf.core.hunt_parser import parse_hunt_file
  
  
@@ -220,26 +221,93 @@ class HuntManager:
              "tp_fp_ratio": round(tp_fp_ratio, 2) if tp_fp_ratio != float("inf") else "∞",
          }
  
-     def calculate_attack_coverage(self) -> Dict[str, List[str]]:
-         """Calculate MITRE ATT&CK technique coverage.
+     def calculate_attack_coverage(self) -> Dict[str, Any]:
+         """Calculate MITRE ATT&CK technique coverage with hunt references.
  
          Returns:
-             Dict mapping tactics to lists of covered techniques
+             Dict with structure:
+             {
+                 "summary": {
+                     "total_hunts": int,
+                     "completed_hunts": int,
+                     "unique_techniques": int,
+                     "tactics_covered": int,
+                     "total_techniques": int,
+                     "overall_coverage_pct": float
+                 },
+                 "by_tactic": {
+                     "tactic-name": {
+                         "hunt_count": int,
+                         "hunt_ids": List[str],
+                         "techniques": {
+                             "T1234.001": ["H-0001", "H-0003"]
+                         },
+                         "techniques_covered": int,
+                         "total_techniques": int,
+                         "coverage_pct": float
+                     }
+                 }
+             }
          """
          hunts = self.list_hunts()
  
-         coverage: Dict = {}
+         # Initialize coverage structure for ALL ATT&CK tactics (not just ones with hunts)
+         coverage_by_tactic: Dict[str, Dict[str, Any]] = {}
+         for tactic_key in get_sorted_tactics():
+             coverage_by_tactic[tactic_key] = {
+                 "hunt_count": 0,
+                 "hunt_ids": set(),
+                 "techniques": {},
+                 "total_techniques": ATTACK_TACTICS[tactic_key]["technique_count"],
+             }
+
+         all_unique_techniques: set[str] = set()
  
          for hunt in hunts:
+             hunt_id = hunt.get("hunt_id", "UNKNOWN")
              tactics = hunt.get("tactics", [])
              techniques = hunt.get("techniques", [])
  
+             # Track all unique techniques across all hunts
+             all_unique_techniques.update(techniques)
+
              for tactic in tactics:
-                 if tactic not in coverage:
-                     coverage[tactic] = set()
+                 # Skip if tactic not in ATT&CK matrix (might be custom tactic)
+                 if tactic not in coverage_by_tactic:
+                     continue
  
+                 # Track hunt IDs for this tactic
+                 coverage_by_tactic[tactic]["hunt_ids"].add(hunt_id)
+
+                 # Track which hunts cover each technique under this tactic
                  for technique in techniques:
-                     coverage[tactic].add(technique)
+                     if technique not in coverage_by_tactic[tactic]["techniques"]:
+                         coverage_by_tactic[tactic]["techniques"][technique] = []
+                     coverage_by_tactic[tactic]["techniques"][technique].append(hunt_id)
+
+         # Calculate coverage percentages and convert sets to sorted lists
+         for tactic in coverage_by_tactic:
+             coverage_by_tactic[tactic]["hunt_count"] = len(coverage_by_tactic[tactic]["hunt_ids"])
+             coverage_by_tactic[tactic]["hunt_ids"] = sorted(coverage_by_tactic[tactic]["hunt_ids"])
+             coverage_by_tactic[tactic]["techniques_covered"] = len(coverage_by_tactic[tactic]["techniques"])
+
+             # Calculate coverage percentage
+             total = coverage_by_tactic[tactic]["total_techniques"]
+             covered = coverage_by_tactic[tactic]["techniques_covered"]
+             coverage_by_tactic[tactic]["coverage_pct"] = (covered / total * 100) if total > 0 else 0.0
+
+         # Calculate overall coverage
+         tactics_with_hunts = len([t for t in coverage_by_tactic.values() if t["hunt_count"] > 0])
+         overall_coverage_pct = (len(all_unique_techniques) / TOTAL_TECHNIQUES * 100) if TOTAL_TECHNIQUES > 0 else 0.0
+
+         # Build summary
+         summary = {
+             "total_hunts": len(hunts),
+             "completed_hunts": len([h for h in hunts if h.get("status") == "completed"]),
+             "unique_techniques": len(all_unique_techniques),
+             "tactics_covered": tactics_with_hunts,
+             "total_techniques": TOTAL_TECHNIQUES,
+             "overall_coverage_pct": overall_coverage_pct,
+         }
  
-         # Convert sets to sorted lists
-         return {tactic: sorted(list(techniques)) for tactic, techniques in coverage.items()}
+         return {"summary": summary, "by_tactic": coverage_by_tactic}