agentic-threat-hunting-framework 0.2.4__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,419 @@
+ """Manage research files and operations."""
+
+ import re
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ import yaml
+
+
+ class ResearchParser:
+     """Parser for research files (YAML frontmatter + markdown)."""
+
+     def __init__(self, file_path: Path) -> None:
+         """Initialize parser with research file path."""
+         self.file_path = Path(file_path)
+         self.frontmatter: Dict[str, Any] = {}
+         self.content = ""
+         self.sections: Dict[str, str] = {}
+
+     def parse(self) -> Dict[str, Any]:
+         """Parse research file and return structured data.
+
+         Returns:
+             Dict containing frontmatter, content, and sections
+         """
+         if not self.file_path.exists():
+             raise FileNotFoundError(f"Research file not found: {self.file_path}")
+
+         with open(self.file_path, "r", encoding="utf-8") as f:
+             content = f.read()
+
+         # Parse YAML frontmatter
+         self.frontmatter = self._parse_frontmatter(content)
+
+         # Extract main content (after frontmatter)
+         self.content = self._extract_content(content)
+
+         # Parse research sections
+         self.sections = self._parse_sections(self.content)
+
+         return {
+             "file_path": str(self.file_path),
+             "research_id": self.frontmatter.get("research_id"),
+             "frontmatter": self.frontmatter,
+             "content": self.content,
+             "sections": self.sections,
+         }
+
+     def _parse_frontmatter(self, content: str) -> Dict[str, Any]:
+         """Extract and parse YAML frontmatter."""
+         frontmatter_pattern = r"^---\s*\n(.*?)\n---\s*\n"
+         match = re.match(frontmatter_pattern, content, re.DOTALL)
+
+         if not match:
+             return {}
+
+         frontmatter_text = match.group(1)
+
+         try:
+             return yaml.safe_load(frontmatter_text) or {}
+         except yaml.YAMLError as e:
+             raise ValueError(f"Invalid YAML frontmatter: {e}") from e
+
+     def _extract_content(self, content: str) -> str:
+         """Extract content after frontmatter."""
+         frontmatter_pattern = r"^---\s*\n.*?\n---\s*\n"
+         content_without_fm = re.sub(frontmatter_pattern, "", content, count=1, flags=re.DOTALL)
+         return content_without_fm.strip()
+
+     def _parse_sections(self, content: str) -> Dict[str, str]:
+         """Parse research sections from content.
+
+         Returns:
+             Dict with section names and content
+         """
+         sections = {}
+
+         # Define section patterns for the 5 research skills
+         section_patterns = {
+             "system_research": r"##\s+1\.\s+System Research.*?(?=##\s+2\.|$)",
+             "adversary_tradecraft": r"##\s+2\.\s+Adversary Tradecraft.*?(?=##\s+3\.|$)",
+             "telemetry_mapping": r"##\s+3\.\s+Telemetry Mapping.*?(?=##\s+4\.|$)",
+             "related_work": r"##\s+4\.\s+Related Work.*?(?=##\s+5\.|$)",
+             "synthesis": r"##\s+5\.\s+Research Synthesis.*?(?=##\s+[A-Z]|$)",
+         }
+
+         for section_name, pattern in section_patterns.items():
+             match = re.search(pattern, content, re.DOTALL | re.IGNORECASE)
+             if match:
+                 sections[section_name] = match.group(0).strip()
+
+         return sections
+
+
+ def parse_research_file(file_path: Path) -> Dict[str, Any]:
+     """Convenience function to parse a research file."""
+     parser = ResearchParser(file_path)
+     return parser.parse()
+
+
+ class ResearchManager:
+     """Manage research files and operations.
+
+     Similar pattern to HuntManager but for research documents.
+     Research files use R-XXXX IDs and are stored in research/ directory.
+     """
+
+     def __init__(self, research_dir: Optional[Path] = None) -> None:
+         """Initialize research manager.
+
+         Args:
+             research_dir: Directory containing research files (default: ./research)
+         """
+         self.research_dir = Path(research_dir) if research_dir else Path.cwd() / "research"
+
+         if not self.research_dir.exists():
+             self.research_dir.mkdir(parents=True, exist_ok=True)
+
+     def _find_all_research_files(self) -> List[Path]:
+         """Find all research files (R-*.md).
+
+         Returns:
+             List of paths to research files
+         """
+         research_files: List[Path] = []
+
+         # Find research files recursively (R-*.md)
+         research_files.extend(self.research_dir.rglob("R-*.md"))
+
+         return sorted(set(research_files))
+
+     def get_next_research_id(self, prefix: str = "R-") -> str:
+         """Calculate the next available research ID.
+
+         Args:
+             prefix: Research ID prefix (default: R-)
+
+         Returns:
+             Next research ID (e.g., R-0023)
+         """
+         research_files = self._find_all_research_files()
+
+         if not research_files:
+             return f"{prefix}0001"
+
+         # Extract numbers from research IDs with matching prefix
+         numbers = []
+         pattern = re.compile(rf"^{re.escape(prefix)}(\d+)$")
+
+         for research_file in research_files:
+             try:
+                 research_data = parse_research_file(research_file)
+                 research_id = research_data.get("frontmatter", {}).get("research_id")
+
+                 if not research_id or not isinstance(research_id, str):
+                     continue
+
+                 match = pattern.match(research_id)
+                 if match:
+                     numbers.append(int(match.group(1)))
+             except Exception:
+                 # Try to extract from filename if parsing fails
+                 match = pattern.match(research_file.stem)
+                 if match:
+                     numbers.append(int(match.group(1)))
+
+         if not numbers:
+             return f"{prefix}0001"
+
+         # Next number with zero-padding
+         next_num = max(numbers) + 1
+         return f"{prefix}{next_num:04d}"
+
+     def list_research(
+         self,
+         status: Optional[str] = None,
+         technique: Optional[str] = None,
+         topic: Optional[str] = None,
+     ) -> List[Dict[str, Any]]:
+         """List all research documents with optional filters.
+
+         Args:
+             status: Filter by status (draft, in_progress, completed)
+             technique: Filter by MITRE technique
+             topic: Filter by topic (substring match)
+
+         Returns:
+             List of research metadata dicts
+         """
+         research_list = []
+
+         for research_file in self._find_all_research_files():
+             try:
+                 research_data = parse_research_file(research_file)
+                 frontmatter = research_data.get("frontmatter", {})
+
+                 # Apply filters
+                 if status and frontmatter.get("status") != status:
+                     continue
+
+                 if technique:
+                     techniques = frontmatter.get("mitre_techniques", [])
+                     if technique not in techniques:
+                         continue
+
+                 if topic:
+                     research_topic = frontmatter.get("topic", "").lower()
+                     if topic.lower() not in research_topic:
+                         continue
+
+                 # Extract summary info
+                 research_list.append(
+                     {
+                         "research_id": frontmatter.get("research_id"),
+                         "topic": frontmatter.get("topic"),
+                         "status": frontmatter.get("status"),
+                         "created_date": frontmatter.get("created_date"),
+                         "depth": frontmatter.get("depth"),
+                         "mitre_techniques": frontmatter.get("mitre_techniques", []),
+                         "linked_hunts": frontmatter.get("linked_hunts", []),
+                         "duration_minutes": frontmatter.get("duration_minutes"),
+                         "total_cost_usd": frontmatter.get("total_cost_usd"),
+                         "file_path": str(research_file),
+                     }
+                 )
+
+             except Exception:
+                 # Skip files that can't be parsed
+                 continue
+
+         return research_list
+
+     def get_research(self, research_id: str) -> Optional[Dict[str, Any]]:
+         """Get a specific research document by ID.
+
+         Args:
+             research_id: Research ID (e.g., R-0001)
+
+         Returns:
+             Research data dict or None if not found
+         """
+         # Try direct file
+         research_file = self.research_dir / f"{research_id}.md"
+         if research_file.exists():
+             return parse_research_file(research_file)
+
+         # Try nested search
+         research_files = list(self.research_dir.rglob(f"{research_id}.md"))
+         if research_files:
+             return parse_research_file(research_files[0])
+
+         return None
+
+     def search_research(self, query: str) -> List[Dict[str, Any]]:
+         """Full-text search across research documents.
+
+         Args:
+             query: Search query string
+
+         Returns:
+             List of matching research documents
+         """
+         results = []
+         query_lower = query.lower()
+
+         for research_file in self._find_all_research_files():
+             try:
+                 with open(research_file, "r", encoding="utf-8") as f:
+                     content = f.read()
+
+                 if query_lower in content.lower():
+                     research_data = parse_research_file(research_file)
+                     frontmatter = research_data.get("frontmatter", {})
+
+                     results.append(
+                         {
+                             "research_id": frontmatter.get("research_id"),
+                             "topic": frontmatter.get("topic"),
+                             "status": frontmatter.get("status"),
+                             "file_path": str(research_file),
+                         }
+                     )
+
+             except Exception:
+                 continue
+
+         return results
+
+     def link_hunt_to_research(self, research_id: str, hunt_id: str) -> bool:
+         """Link a hunt to its source research.
+
+         Updates the research document's linked_hunts field.
+
+         Args:
+             research_id: Research ID (e.g., R-0001)
+             hunt_id: Hunt ID to link (e.g., H-0001)
+
+         Returns:
+             True if successful, False otherwise
+         """
+         research_data = self.get_research(research_id)
+         if not research_data:
+             return False
+
+         file_path = Path(research_data["file_path"])
+
+         try:
+             with open(file_path, "r", encoding="utf-8") as f:
+                 content = f.read()
+
+             # Parse frontmatter
+             frontmatter = research_data.get("frontmatter", {})
+             linked_hunts = frontmatter.get("linked_hunts", [])
+
+             # Add hunt if not already linked
+             if hunt_id not in linked_hunts:
+                 linked_hunts.append(hunt_id)
+
+             # Update the YAML frontmatter
+             # Find and replace linked_hunts line
+             if "linked_hunts:" in content:
+                 # Replace existing linked_hunts
+                 pattern = r"linked_hunts:.*?(?=\n[a-z_]+:|---)"
+                 replacement = f"linked_hunts: {linked_hunts}\n"
+                 content = re.sub(pattern, replacement, content, flags=re.DOTALL)
+             else:
+                 # Add linked_hunts before closing ---
+                 pattern = r"\n---\s*\n"
+                 replacement = f"\nlinked_hunts: {linked_hunts}\n---\n"
+                 content = re.sub(pattern, replacement, content, count=1)
+
+             with open(file_path, "w", encoding="utf-8") as f:
+                 f.write(content)
+
+             return True
+
+         except Exception:
+             return False
+
+     def create_research_file(
+         self,
+         research_id: str,
+         topic: str,
+         content: str,
+         frontmatter: Dict[str, Any],
+     ) -> Path:
+         """Create a new research file.
+
+         Args:
+             research_id: Research ID (e.g., R-0001)
+             topic: Research topic
+             content: Markdown content
+             frontmatter: YAML frontmatter dict
+
+         Returns:
+             Path to created file
+         """
+         # Ensure research_id and topic are in frontmatter
+         frontmatter["research_id"] = research_id
+         frontmatter["topic"] = topic
+         frontmatter.setdefault("created_date", datetime.now().strftime("%Y-%m-%d"))
+         frontmatter.setdefault("status", "completed")
+
+         # Build file content
+         yaml_content = yaml.dump(frontmatter, default_flow_style=False, sort_keys=False)
+         file_content = f"---\n{yaml_content}---\n\n{content}"
+
+         # Write file
+         file_path = self.research_dir / f"{research_id}.md"
+         with open(file_path, "w", encoding="utf-8") as f:
+             f.write(file_content)
+
+         return file_path
+
+     def calculate_stats(self) -> Dict[str, Any]:
+         """Calculate research program statistics.
+
+         Returns:
+             Dict with counts, costs, and other metrics
+         """
+         research_list = self.list_research()
+
+         if not research_list:
+             return {
+                 "total_research": 0,
+                 "completed_research": 0,
+                 "total_cost_usd": 0.0,
+                 "total_duration_minutes": 0,
+                 "avg_duration_minutes": 0.0,
+                 "by_status": {},
+                 "total_linked_hunts": 0,
+             }
+
+         total_research = len(research_list)
+         completed_research = len([r for r in research_list if r.get("status") == "completed"])
+
+         total_cost = sum(r.get("total_cost_usd", 0) or 0 for r in research_list)
+         total_duration = sum(r.get("duration_minutes", 0) or 0 for r in research_list)
+         avg_duration = total_duration / total_research if total_research > 0 else 0.0
+
+         # Count by status
+         by_status: Dict[str, int] = {}
+         for research in research_list:
+             status = research.get("status", "unknown")
+             by_status[status] = by_status.get(status, 0) + 1
+
+         # Count linked hunts
+         total_linked_hunts = sum(len(r.get("linked_hunts", [])) for r in research_list)
+
+         return {
+             "total_research": total_research,
+             "completed_research": completed_research,
+             "total_cost_usd": round(total_cost, 4),
+             "total_duration_minutes": total_duration,
+             "avg_duration_minutes": round(avg_duration, 1),
+             "by_status": by_status,
+             "total_linked_hunts": total_linked_hunts,
+         }
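
A minimal sketch (not part of the package) of the research document layout the new ResearchParser expects: YAML frontmatter carrying a research_id, followed by the five numbered "## N. ..." sections that _parse_sections matches. Topic and field values are illustrative, and the import path is an assumption, since the module's location inside the wheel is not shown in this diff.

import tempfile
from pathlib import Path

# Import path assumed; adjust to wherever this module lives in the installed package.
from agentic_threat_hunting_framework.research import parse_research_file

SAMPLE = """---
research_id: R-0001
topic: Kerberoasting detection coverage
status: completed
mitre_techniques: [T1558.003]
linked_hunts: []
---

## 1. System Research
How service tickets are issued in the environment.

## 2. Adversary Tradecraft
Tooling and request patterns associated with Kerberoasting.

## 3. Telemetry Mapping
Relevant event sources and fields.

## 4. Related Work
Prior hunts and public detections.

## 5. Research Synthesis
Candidate hypotheses for a hunt.
"""

with tempfile.TemporaryDirectory() as tmp:
    sample_path = Path(tmp) / "R-0001.md"
    sample_path.write_text(SAMPLE, encoding="utf-8")

    data = parse_research_file(sample_path)
    print(data["research_id"])       # R-0001
    print(sorted(data["sections"]))  # all five section keys, e.g. "system_research", "synthesis"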
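
A second sketch, equally illustrative, of the ResearchManager workflow this file supports, using only the calls defined above; the directory and field values are placeholders.

from pathlib import Path

# Import path assumed, as above.
from agentic_threat_hunting_framework.research import ResearchManager

manager = ResearchManager(Path("./research"))  # the directory is created if it does not exist

research_id = manager.get_next_research_id()   # "R-0001" on an empty directory
manager.create_research_file(
    research_id=research_id,
    topic="Kerberoasting detection coverage",
    content="## 1. System Research\n...",
    frontmatter={"mitre_techniques": ["T1558.003"], "linked_hunts": [], "status": "completed"},
)

print(manager.list_research(status="completed"))  # summary dicts built from frontmatter
print(manager.search_research("kerberos"))        # case-insensitive substring match

# link_hunt_to_research rewrites the linked_hunts frontmatter line in place; the value
# is written as a Python list repr (e.g. linked_hunts: ['H-0001']), which YAML reads
# back as a flow sequence on the next parse.
manager.link_hunt_to_research(research_id, "H-0001")

print(manager.calculate_stats())  # counts, cost/duration totals, by-status breakdown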