security-controls-mcp 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,180 @@
1
+ """Data loader for SCF controls and framework mappings."""
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+
8
+ class SCFData:
9
+ """Loads and provides access to SCF control data."""
10
+
11
+ def __init__(self):
12
+ self.controls: list[dict[str, Any]] = []
13
+ self.controls_by_id: dict[str, dict[str, Any]] = {}
14
+ self.framework_to_scf: dict[str, dict[str, list[str]]] = {}
15
+ self.frameworks: dict[str, dict[str, Any]] = {}
16
+ self._load_data()
17
+
18
+ def _load_data(self):
19
+ """Load SCF controls and reverse index from JSON files."""
20
+ data_dir = Path(__file__).parent / "data"
21
+
22
+ # Load controls
23
+ with open(data_dir / "scf-controls.json", "r", encoding="utf-8") as f:
24
+ data = json.load(f)
25
+ self.controls = data["controls"]
26
+
27
+ # Build ID index
28
+ self.controls_by_id = {ctrl["id"]: ctrl for ctrl in self.controls}
29
+
30
+ # Load reverse index
31
+ with open(data_dir / "framework-to-scf.json", "r", encoding="utf-8") as f:
32
+ self.framework_to_scf = json.load(f)
33
+
34
+ # Build framework metadata
35
+ self._build_framework_metadata()
36
+
37
+ def _build_framework_metadata(self):
38
+ """Build framework metadata from controls."""
39
+ # Framework display names (keys must match actual data which uses dots in version numbers)
40
+ framework_names = {
41
+ "nist_csf_2.0": "NIST Cybersecurity Framework 2.0",
42
+ "nist_800_53_r5": "NIST SP 800-53 Revision 5",
43
+ "iso_27001_2022": "ISO/IEC 27001:2022",
44
+ "iso_27002_2022": "ISO/IEC 27002:2022",
45
+ "cis_csc_8.1": "CIS Critical Security Controls v8.1",
46
+ "pci_dss_4.0.1": "PCI DSS v4.0.1",
47
+ "cmmc_2.0_level_1": "CMMC 2.0 Level 1",
48
+ "cmmc_2.0_level_2": "CMMC 2.0 Level 2",
49
+ "soc_2_tsc": "SOC 2 (TSC 2017:2022)",
50
+ "dora": "Digital Operational Resilience Act (DORA)",
51
+ "nis2": "Network and Information Security Directive (NIS2)",
52
+ "gdpr": "General Data Protection Regulation (GDPR)",
53
+ "ncsc_caf_4.0": "NCSC Cyber Assessment Framework 4.0",
54
+ "uk_cyber_essentials": "UK Cyber Essentials",
55
+ "fedramp_r5_moderate": "FedRAMP Revision 5 (Moderate)",
56
+ "hipaa_security_rule": "HIPAA Security Rule",
57
+ }
58
+
59
+ # Count controls per framework
60
+ for fw_key, fw_name in framework_names.items():
61
+ count = sum(1 for ctrl in self.controls if ctrl["framework_mappings"].get(fw_key))
62
+ self.frameworks[fw_key] = {
63
+ "key": fw_key,
64
+ "name": fw_name,
65
+ "controls_mapped": count,
66
+ }
67
+
68
+ def get_control(self, control_id: str) -> dict[str, Any] | None:
69
+ """Get control by SCF ID."""
70
+ return self.controls_by_id.get(control_id)
71
+
72
+ def search_controls(
73
+ self, query: str, frameworks: list[str] | None = None, limit: int = 10
74
+ ) -> list[dict[str, Any]]:
75
+ """Search controls by description. Case-insensitive string matching for v1."""
76
+ query_lower = query.lower()
77
+ results = []
78
+
79
+ for ctrl in self.controls:
80
+ # Check if query matches name or description (case-insensitive)
81
+ name_lower = ctrl["name"].lower() if ctrl["name"] else ""
82
+ desc_lower = ctrl["description"].lower() if ctrl["description"] else ""
83
+
84
+ if query_lower in name_lower or query_lower in desc_lower:
85
+ # Filter by frameworks if specified
86
+ if frameworks:
87
+ has_mapping = any(ctrl["framework_mappings"].get(fw) for fw in frameworks)
88
+ if not has_mapping:
89
+ continue
90
+
91
+ # Get mapped frameworks for response
92
+ mapped_frameworks = [
93
+ fw for fw, mappings in ctrl["framework_mappings"].items() if mappings
94
+ ]
95
+
96
+ # Create snippet (simple version - first 150 chars with highlight)
97
+ desc = ctrl["description"]
98
+ idx = desc.lower().find(query_lower)
99
+ if idx >= 0:
100
+ start = max(0, idx - 50)
101
+ end = min(len(desc), idx + len(query) + 100)
102
+ snippet = desc[start:end]
103
+ if start > 0:
104
+ snippet = "..." + snippet
105
+ if end < len(desc):
106
+ snippet = snippet + "..."
107
+ else:
108
+ snippet = desc[:150] + "..." if len(desc) > 150 else desc
109
+
110
+ results.append(
111
+ {
112
+ "control_id": ctrl["id"],
113
+ "name": ctrl["name"],
114
+ "snippet": snippet,
115
+ "relevance": 1.0, # Simple scoring for v1
116
+ "mapped_frameworks": mapped_frameworks,
117
+ }
118
+ )
119
+
120
+ if len(results) >= limit:
121
+ break
122
+
123
+ return results
124
+
125
+ def get_framework_controls(
126
+ self, framework: str, include_descriptions: bool = False
127
+ ) -> list[dict[str, Any]]:
128
+ """Get all controls that map to a framework."""
129
+ results = []
130
+
131
+ for ctrl in self.controls:
132
+ fw_mappings = ctrl["framework_mappings"].get(framework)
133
+ if fw_mappings:
134
+ result = {
135
+ "scf_id": ctrl["id"],
136
+ "scf_name": ctrl["name"],
137
+ "framework_control_ids": fw_mappings,
138
+ "weight": ctrl["weight"],
139
+ }
140
+
141
+ if include_descriptions:
142
+ result["description"] = ctrl["description"]
143
+
144
+ results.append(result)
145
+
146
+ return results
147
+
148
+ def map_frameworks(
149
+ self,
150
+ source_framework: str,
151
+ target_framework: str,
152
+ source_control: str | None = None,
153
+ ) -> list[dict[str, Any]]:
154
+ """Map controls between two frameworks via SCF."""
155
+ results = []
156
+
157
+ # If source_control specified, filter to only controls with that mapping
158
+ for ctrl in self.controls:
159
+ source_mappings = ctrl["framework_mappings"].get(source_framework)
160
+ target_mappings = ctrl["framework_mappings"].get(target_framework)
161
+
162
+ # Skip if no source mapping
163
+ if not source_mappings:
164
+ continue
165
+
166
+ # Filter by source_control if specified
167
+ if source_control and source_control not in source_mappings:
168
+ continue
169
+
170
+ results.append(
171
+ {
172
+ "scf_id": ctrl["id"],
173
+ "scf_name": ctrl["name"],
174
+ "source_controls": source_mappings,
175
+ "target_controls": target_mappings or [],
176
+ "weight": ctrl["weight"],
177
+ }
178
+ )
179
+
180
+ return results
@@ -0,0 +1,5 @@
1
+ """Extractors for importing security standards from PDF files."""
2
+
3
+ from .pdf_extractor import extract_standard
4
+
5
+ __all__ = ["extract_standard"]
@@ -0,0 +1,248 @@
1
+ """PDF extraction for security standards."""
2
+
3
+ import re
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+ from typing import Any, Dict, List
7
+
8
+ import pdfplumber
9
+
10
+
11
def extract_standard(
    pdf_path: Path,
    standard_id: str,
    title: str,
    version: str,
    purchased_from: str,
    purchase_date: str,
) -> Dict[str, Any]:
    """Read a standard from a PDF and return its metadata, structure and stats.

    Args:
        pdf_path: Path to the PDF file.
        standard_id: Unique identifier for the standard.
        title: Full title of the standard.
        version: Version string.
        purchased_from: Where it was purchased.
        purchase_date: When it was purchased.

    Returns:
        A dict with three keys: ``metadata`` (the caller-supplied fields plus
        import timestamp, licence text, page count and restrictions),
        ``structure`` (metadata, detected sections and annexes) and ``stats``
        (page, section, annex and clause counts).
    """
    # Pull the raw text of every page, numbering pages from 1. A page with
    # no extractable text contributes an empty string.
    with pdfplumber.open(pdf_path) as document:
        page_count = len(document.pages)
        pages_text = [
            {"page": number, "text": page.extract_text() or ""}
            for number, page in enumerate(document.pages, start=1)
        ]

    # Heuristic structure detection on the extracted text.
    sections = _detect_sections(pages_text)
    annexes = _detect_annexes(pages_text)

    metadata = {
        "standard_id": standard_id,
        "title": title,
        "version": version,
        "purchased_from": purchased_from,
        "purchase_date": purchase_date,
        "imported_date": datetime.now().isoformat(),
        "license": "Proprietary - Licensed to individual user",
        "pages": page_count,
        "restrictions": [
            "Personal use only",
            "No redistribution",
            "No derivative works without permission",
        ],
    }

    structure = {
        "metadata": metadata,
        "sections": sections,
        "annexes": annexes,
    }

    # Clause total = returned sections plus every control in every annex.
    annex_controls = sum(len(annex.get("controls", [])) for annex in annexes)

    stats = {
        "pages": page_count,
        "sections": len(sections),
        "annexes": len(annexes),
        "total_clauses": len(sections) + annex_controls,
    }

    return {"metadata": metadata, "structure": structure, "stats": stats}
83
+
84
+
85
def _detect_sections(pages_text: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Detect numbered section headings page by page and nest them.

    A heading is a single line made of a dotted number, horizontal
    whitespace, and a title that starts with an ASCII capital letter and is
    6 to 81 characters long, for example:
        - "1 Introduction"
        - "5.1.2 Cryptographic controls"

    Shorter titles (such as "1 Scope") and unnumbered headings (such as
    "Chapter 3: Requirements") do not match this pattern.
    NOTE(review): confirm whether the 6-character minimum is intended, since
    it excludes short headings like "Scope".

    Args:
        pages_text: One dict per page with the keys ``page`` (page number)
            and ``text`` (extracted text of that page).

    Returns:
        The detected sections arranged by ``_build_hierarchy``. Each section
        has ``id``, ``title``, ``page``, ``content`` (at most 2000
        characters) and ``subsections``. Headings followed by 20 characters
        of content or fewer on the same page are dropped.
    """
    # The separator is limited to same-line whitespace. With a plain "\s+" a
    # bare number on its own line (e.g. a page number) followed by a
    # capitalised line was wrongly treated as a heading.
    section_pattern = re.compile(
        r"^(\d+(?:\.\d+)*)[^\S\r\n]+([A-Z][^\n]{5,80})$", re.MULTILINE
    )

    sections = []

    for page_info in pages_text:
        page_num = page_info["page"]
        text = page_info["text"]

        # Collect the headings once; each section's content runs up to the
        # next heading on the page, or to the end of the page.
        matches = list(section_pattern.finditer(text))

        for index, match in enumerate(matches):
            content_start = match.end()
            if index + 1 < len(matches):
                content_end = matches[index + 1].start()
            else:
                content_end = len(text)
            content = text[content_start:content_end].strip()

            # Only include if we have meaningful content
            if len(content) > 20:
                sections.append(
                    {
                        "id": match.group(1),
                        "title": match.group(2).strip(),
                        "page": page_num,
                        "content": content[:2000],  # Limit length
                        "subsections": [],
                    }
                )

    # Build hierarchy (nest subsections)
    return _build_hierarchy(sections)
135
+
136
+
137
+ def _detect_annexes(pages_text: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
138
+ """Detect annexes (like Annex A in ISO 27001).
139
+
140
+ Annexes often contain control listings with IDs like:
141
+ - "A.5.15 Access control"
142
+ - "Annex B.2.1 Requirements"
143
+ """
144
+ annexes = []
145
+
146
+ # Pattern for annex headers
147
+ annex_pattern = re.compile(r"^Annex\s+([A-Z])[:\s]+([^\n]+)$", re.MULTILINE | re.IGNORECASE)
148
+
149
+ # Pattern for controls within annexes
150
+ control_pattern = re.compile(r"^([A-Z]\.\d+(?:\.\d+)*)\s+([A-Z][^\n]{5,80})$", re.MULTILINE)
151
+
152
+ current_annex = None
153
+
154
+ for page_info in pages_text:
155
+ page_num = page_info["page"]
156
+ text = page_info["text"]
157
+
158
+ # Check for new annex
159
+ annex_match = annex_pattern.search(text)
160
+ if annex_match:
161
+ # Save previous annex if exists
162
+ if current_annex:
163
+ annexes.append(current_annex)
164
+
165
+ # Start new annex
166
+ annex_id = annex_match.group(1)
167
+ annex_title = annex_match.group(2).strip()
168
+ current_annex = {
169
+ "id": annex_id,
170
+ "title": annex_title,
171
+ "page": page_num,
172
+ "controls": [],
173
+ }
174
+
175
+ # If we're in an annex, look for controls
176
+ if current_annex:
177
+ control_matches = control_pattern.finditer(text)
178
+
179
+ for match in control_matches:
180
+ control_id = match.group(1)
181
+ control_title = match.group(2).strip()
182
+
183
+ # Extract content
184
+ start_pos = match.end()
185
+ next_match = control_pattern.search(text, start_pos)
186
+
187
+ if next_match:
188
+ content = text[start_pos : next_match.start()].strip()
189
+ else:
190
+ content = text[start_pos:].strip()
191
+
192
+ if content and len(content) > 10:
193
+ current_annex["controls"].append(
194
+ {
195
+ "id": control_id,
196
+ "title": control_title,
197
+ "content": content[:1000],
198
+ "page": page_num,
199
+ "category": f"Annex {current_annex['id']}",
200
+ "type": "normative",
201
+ }
202
+ )
203
+
204
+ # Add final annex
205
+ if current_annex and current_annex["controls"]:
206
+ annexes.append(current_annex)
207
+
208
+ return annexes
209
+
210
+
211
+ def _build_hierarchy(sections: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
212
+ """Build hierarchical structure from flat section list.
213
+
214
+ Converts:
215
+ [{"id": "1"}, {"id": "1.1"}, {"id": "1.2"}, {"id": "2"}]
216
+ Into:
217
+ [{"id": "1", "subsections": [{"id": "1.1"}, {"id": "1.2"}]}, {"id": "2"}]
218
+ """
219
+ if not sections:
220
+ return []
221
+
222
+ # Build a tree structure
223
+ root = []
224
+ stack = [] # Stack of (section, level)
225
+
226
+ for section in sections:
227
+ section_id = section["id"]
228
+ level = section_id.count(".")
229
+
230
+ # Remove subsections key to avoid duplication
231
+ section = {k: v for k, v in section.items() if k != "subsections"}
232
+ section["subsections"] = []
233
+
234
+ # Pop stack until we find the parent level
235
+ while stack and stack[-1][1] >= level:
236
+ stack.pop()
237
+
238
+ if not stack:
239
+ # Top level section
240
+ root.append(section)
241
+ stack.append((section, level))
242
+ else:
243
+ # Add as subsection of parent
244
+ parent = stack[-1][0]
245
+ parent["subsections"].append(section)
246
+ stack.append((section, level))
247
+
248
+ return root