audit-packs-core 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
1
+ Metadata-Version: 2.4
2
+ Name: audit-packs-core
3
+ Version: 0.1.1
4
+ Summary: Core models, normalization, diff, and data-flow primitives for audit-packs
5
+ License: Apache-2.0
6
+ Requires-Python: >=3.11
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: PyYAML>=6.0
9
+
10
+ # audit-packs-core
11
+
12
+ [![PyPI version](https://img.shields.io/pypi/v/audit-packs-core.svg)](https://pypi.org/project/audit-packs-core/)
13
+ [![License](https://img.shields.io/badge/license-Apache--2.0-blue.svg)](../../LICENSE)
14
+
15
+ `audit-packs-core` is the foundational library for the `audit-packs` ecosystem. It provides the core data structures, schema models, parser interfaces, diffing utilities, and normalization primitives used across all other package modules.
16
+
17
+ ## Installation
18
+
19
+ ```bash
20
+ pip install audit-packs-core
21
+ ```
22
+
23
+ ## Features
24
+
25
+ - **Standardized Schema Models**: Defines standard structures for scanner findings, controls, frameworks, rules, and reports.
26
+ - **Normalization Primitives**: Converts scanner-specific findings into a scanner-agnostic intermediate representation.
27
+ - **Diffing Utilities**: Compares findings between parent and feature branches to detect newly introduced compliance gaps.
28
+ - **YAML Configuration Parser**: Parses standard YAML frameworks and control files.
29
+
30
+ ## Learn More
31
+
32
+ This library is part of the larger `audit-packs` Compliance Intelligence Engine. For the main command-line interface, GitHub Action integration, and framework mappings, see the [main repository](https://github.com/prakharsingh/audit-packs).
@@ -0,0 +1,23 @@
1
+ # audit-packs-core
2
+
3
+ [![PyPI version](https://img.shields.io/pypi/v/audit-packs-core.svg)](https://pypi.org/project/audit-packs-core/)
4
+ [![License](https://img.shields.io/badge/license-Apache--2.0-blue.svg)](https://github.com/prakharsingh/audit-packs/blob/HEAD/LICENSE)
5
+
6
+ `audit-packs-core` is the foundational library for the `audit-packs` ecosystem. It provides the core data structures, schema models, parser interfaces, diffing utilities, and normalization primitives used across all other package modules.
7
+
8
+ ## Installation
9
+
10
+ ```bash
11
+ pip install audit-packs-core
12
+ ```
13
+
14
+ ## Features
15
+
16
+ - **Standardized Schema Models**: Defines standard structures for scanner findings, controls, frameworks, rules, and reports.
17
+ - **Normalization Primitives**: Converts scanner-specific findings into a scanner-agnostic intermediate representation.
18
+ - **Diffing Utilities**: Compares findings between parent and feature branches to detect newly introduced compliance gaps.
19
+ - **YAML Configuration Parser**: Parses standard YAML frameworks and control files.
20
+
21
+ ## Learn More
22
+
23
+ This library is part of the larger `audit-packs` Compliance Intelligence Engine. For the main command-line interface, GitHub Action integration, and framework mappings, see the [main repository](https://github.com/prakharsingh/audit-packs).
@@ -0,0 +1,15 @@
1
+ [project]
2
+ name = "audit-packs-core"
3
+ version = "0.1.1"
4
+ description = "Core models, normalization, diff, and data-flow primitives for audit-packs"
5
+ readme = "README.md"
6
+ license = { text = "Apache-2.0" }
7
+ requires-python = ">=3.11"
8
+ dependencies = ["PyYAML>=6.0"]
9
+
10
+ [build-system]
11
+ requires = ["setuptools>=68"]
12
+ build-backend = "setuptools.build_meta"
13
+
14
+ [tool.setuptools.packages.find]
15
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,171 @@
1
+ from __future__ import annotations
2
+ import re
3
+ from dataclasses import dataclass
4
+
5
+
6
@dataclass(frozen=True)
class DataFlow:
    """One heuristic source -> sink pairing found in a single file.

    Instances are immutable and hashable. Line numbers are 1-based, as
    produced by the extractors in this module.
    """

    # Line on which the data source was matched.
    source_line: int
    # Category label of the source (e.g. "user_input", "env_var", "db_read").
    source_type: str
    # Names of transform calls seen strictly between source and sink.
    transforms: tuple[str, ...]
    # Line on which the data sink was matched.
    sink_line: int
    # Category label of the sink (e.g. "db_write", "api_call", "log", "response").
    sink_type: str
    # True when a protective transform was detected for this flow.
    has_transform: bool
14
+
15
+
16
# (compiled pattern, source_type) pairs checked against each line of Python
# text; the first pattern that matches a line decides that line's source type.
_PYTHON_SOURCE_PATTERNS = [
    # request.form, request.data, request.json
    (re.compile(r"\brequest\.(form|data|json)\b"), "user_input"),
    # input() calls
    (re.compile(r"\binput\s*\("), "user_input"),
    # os.environ
    (re.compile(r"\bos\.environ\b"), "env_var"),
    # ORM .get() / .filter() on known models
    (re.compile(r"\b(User|Patient|Customer)\.(get|filter|filter_by)\s*\("), "db_read"),
]

# Function names whose call (``name(``) on a line counts as a transform.
# NOTE(review): only the British spelling "anonymise" is listed — confirm
# whether "anonymize" should be recognised as well.
_PYTHON_TRANSFORM_NAMES = {"encrypt", "mask", "hash", "anonymise", "redact", "bcrypt"}

# (compiled pattern, sink_type) pairs; the first pattern that matches a line
# decides that line's sink type.
_PYTHON_SINK_PATTERNS = [
    (re.compile(r"\bdb\.session\.add\s*\("), "db_write"),
    (re.compile(r"\b\w+\.save\s*\(\s*\)"), "db_write"),
    (re.compile(r"\brequests\.(post|put)\s*\("), "api_call"),
    (re.compile(r"\blogging\.(info|warning|error|debug|critical)\s*\("), "log"),
    (re.compile(r"\bprint\s*\("), "log"),
    (re.compile(r"\bresponse\.json\s*\("), "response"),
]

# Source marker: a ``var.<name>`` reference or a Secrets Manager data block.
_HCL_SOURCE_PATTERN = re.compile(r'\bvar\.\w+|\bdata\s+"aws_secretsmanager_secret"')
# Transform marker: a ``kms_key_id =`` assignment or ``encrypted = true``.
_HCL_TRANSFORM_PATTERN = re.compile(r"\bkms_key_id\s*=|\bencrypted\s*=\s*true")
# Sink marker: one of the listed AWS resource block headers.
_HCL_SINK_PATTERN = re.compile(
    r'\bresource\s+"(aws_s3_bucket_object|aws_rds_cluster|aws_lambda_function)"'
)
43
+
44
+
45
def _extract_python_flows(text: str) -> list[DataFlow]:
    """Pair every source line with every later sink line in Python text.

    Each line is checked independently against the source patterns, the
    transform names and the sink patterns. A flow is emitted for every
    (source, sink) pair where the sink is on a later line than the source.

    Args:
        text: Full text of a Python file.

    Returns:
        One ``DataFlow`` per pair, ordered by source line and then sink line.
        ``transforms`` holds one entry per (line, name) transform call found
        strictly between the two lines, ordered by line and then
        alphabetically by name, so the result is identical across runs.
        ``has_transform`` is True exactly when ``transforms`` is non-empty.
    """
    # Compile once per call instead of once per line; sorting the set gives a
    # stable order (plain set iteration varies with string hash randomization).
    transform_patterns = [
        (name, re.compile(rf"\b{name}\s*\("))
        for name in sorted(_PYTHON_TRANSFORM_NAMES)
    ]

    sources: list[tuple[int, str]] = []
    sinks: list[tuple[int, str]] = []
    # (line number, transform name) in ascending line order.
    transform_hits: list[tuple[int, str]] = []

    for line_no, line in enumerate(text.splitlines(), start=1):
        # Only the first matching source pattern labels the line.
        for pattern, src_type in _PYTHON_SOURCE_PATTERNS:
            if pattern.search(line):
                sources.append((line_no, src_type))
                break

        transform_hits.extend(
            (line_no, name)
            for name, pattern in transform_patterns
            if pattern.search(line)
        )

        # Only the first matching sink pattern labels the line.
        for pattern, sink_type in _PYTHON_SINK_PATTERNS:
            if pattern.search(line):
                sinks.append((line_no, sink_type))
                break

    flows: list[DataFlow] = []
    for src_line, src_type in sources:
        for sink_line, sink_type in sinks:
            if sink_line <= src_line:
                continue
            transforms_between = tuple(
                name
                for hit_line, name in transform_hits
                if src_line < hit_line < sink_line
            )
            flows.append(
                DataFlow(
                    source_line=src_line,
                    source_type=src_type,
                    transforms=transforms_between,
                    sink_line=sink_line,
                    sink_type=sink_type,
                    has_transform=bool(transforms_between),
                )
            )

    return flows
95
+
96
+
97
def _extract_hcl_flows(text: str) -> list[DataFlow]:
    """Pair every source line with every later sink line in HCL-like text.

    The transform flag is file-wide: if the transform pattern matches any
    line, every returned flow has ``has_transform=True``. All flows are
    labelled ``env_var`` -> ``db_write`` and carry an empty ``transforms``.
    """
    numbered = list(enumerate(text.splitlines(), start=1))

    source_lines = [
        number for number, content in numbered if _HCL_SOURCE_PATTERN.search(content)
    ]
    sink_lines = [
        number for number, content in numbered if _HCL_SINK_PATTERN.search(content)
    ]
    protected = any(
        _HCL_TRANSFORM_PATTERN.search(content) for _, content in numbered
    )

    return [
        DataFlow(
            source_line=origin,
            source_type="env_var",
            transforms=(),
            sink_line=target,
            sink_type="db_write",
            has_transform=protected,
        )
        for origin in source_lines
        for target in sink_lines
        if target > origin
    ]
126
+
127
+
128
+ def extract_data_flows(file_text: str, language: str) -> list[DataFlow]:
129
+ """Extract source→transform→sink chains. language: 'python'|'hcl'|'yaml'|'json'."""
130
+ if language == "python":
131
+ return _extract_python_flows(file_text)
132
+ if language in ("hcl", "yaml", "json"):
133
+ return _extract_hcl_flows(file_text)
134
+ return []
135
+
136
+
137
def flow_confidence(flows: list[DataFlow], finding_line: int) -> float:
    """
    Score how strongly the nearest data flow supports a finding.

    A flow is considered when its source or its sink lies within ±50 lines of
    finding_line; with no such flow the neutral score 0.5 is returned.
    The flow whose nearer end is closest to finding_line is selected; on equal
    distance a flow without a transform is preferred.
    Scores for the selected flow:
        no transform, both ends in range → 0.9
        no transform, one end in range   → 0.7
        transform,    both ends in range → 0.2
        transform,    one end in range   → 0.5
    """
    window = 50

    def near(line_no: int) -> bool:
        return abs(line_no - finding_line) <= window

    candidates = [
        flow for flow in flows if near(flow.source_line) or near(flow.sink_line)
    ]
    if not candidates:
        return 0.5

    def rank(flow: DataFlow) -> tuple:
        nearest_end = min(
            abs(flow.source_line - finding_line),
            abs(flow.sink_line - finding_line),
        )
        return (nearest_end, 1 if flow.has_transform else 0)

    # min() keeps the first of equally ranked flows, like a stable sort would.
    chosen = min(candidates, key=rank)
    fully_inside = near(chosen.source_line) and near(chosen.sink_line)

    if chosen.has_transform:
        return 0.2 if fully_inside else 0.5
    return 0.9 if fully_inside else 0.7
@@ -0,0 +1,21 @@
1
+ import re
2
+
3
# Hunk header such as "@@ -12,3 +15,4 @@": group 1 is the first new-file line,
# group 2 the optional new-file line count (absent means a single line).
_HUNK = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")


def parse_unified_diff(diff_text: str) -> dict[str, set[int]]:
    """Map each file in a unified diff to the new-file lines its hunks cover.

    Only files introduced by a ``+++ b/<path>`` header are tracked. For every
    hunk the whole new-side range (start .. start + count - 1) is recorded,
    context lines included. Hunks with a new-side count of 0 add nothing.

    Args:
        diff_text: Text of a unified diff.

    Returns:
        ``{path: {line numbers}}`` for every tracked file with at least one
        covered line.
    """
    result: dict[str, set[int]] = {}
    current: str | None = None
    after_old_header = False
    for line in diff_text.splitlines():
        follows_old_header = after_old_header
        after_old_header = line.startswith("--- ")

        if line.startswith("+++ b/"):
            current = line[len("+++ b/") :].strip()
            continue
        if line.startswith("+++ "):
            # A "---"/"+++" pair whose target is not "b/<path>" (/dev/null, a
            # quoted path, a no-prefix diff) starts a file we do not track.
            # Forget the previous file so its hunks are not credited to it.
            # Requiring the preceding "--- " line keeps added content lines
            # that merely begin with "++ " from being taken for a header.
            if follows_old_header:
                current = None
            continue
        if after_old_header:
            continue

        m = _HUNK.match(line)
        if m and current is not None:
            start = int(m.group(1))
            count = int(m.group(2)) if m.group(2) is not None else 1
            if count > 0:
                result.setdefault(current, set()).update(range(start, start + count))
    return {f: lines for f, lines in result.items() if lines}
@@ -0,0 +1,81 @@
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass
3
+ from enum import Enum
4
+
5
# Severity labels in ascending order of seriousness.
SEVERITIES = ("low", "medium", "high", "critical")


def severity_rank(severity: str) -> int:
    """Return the position of *severity* in ``SEVERITIES`` (0 = "low").

    Raises:
        ValueError: If *severity* is not one of the known labels.
    """
    position = SEVERITIES.index(severity)
    return position
10
+
11
+
12
@dataclass(frozen=True)
class PathNode:
    """A single step on the evidence path attached to a finding."""

    # File the step points at.
    file: str
    # Line number of the step within that file.
    line: int
    # Source text captured for the step.
    snippet: str
    # Free-text explanation of the step.
    description: str
18
+
19
+
20
@dataclass(frozen=True)
class Finding:
    """A scanner-agnostic finding reported for one location in one file."""

    # Identifier of the rule/check that fired.
    check_id: str
    # Name of the engine that produced the finding.
    engine: str
    # File the finding refers to.
    file: str
    # Line number of the finding within that file.
    line: int
    # Severity label of the finding.
    severity: str
    # Message text of the finding.
    message: str
    # Evidence text backing the finding.
    evidence: str
    # Optional documentation context; empty by default.
    doc_context: str = ""
    # Optional ordered path of steps leading to the finding; empty by default.
    evidence_path: tuple[PathNode, ...] = ()
31
+
32
+
33
@dataclass(frozen=True)
class ControlFinding:
    """A finding tied to one control of one compliance framework."""

    # The underlying finding.
    finding: Finding
    # Name of the framework the control belongs to.
    framework: str
    # Identifier of the control within the framework.
    control_id: str
    # Title of the control.
    control_title: str
    # Evidence requirements attached to the control; empty by default.
    evidence_requirements: tuple = ()
40
+
41
+
42
class AssessmentStatus(str, Enum):
    """Status of a control after evidence collection.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    PASS = "pass"
    FAIL = "fail"
    NOT_APPLICABLE = "not_applicable"
    MANUAL = "manual"
49
+
50
+
51
class AdjudicationMode(str, Enum):
    """Selectable adjudication mode; members double as plain strings."""

    OFF = "off"
    ADVISORY = "advisory"
    ENFORCE = "enforce"
55
+
56
+
57
@dataclass(frozen=True)
class AdjudicationResult:
    """Immutable record of the adjudication outcome for one control finding."""

    # The control finding that was adjudicated.
    control_finding: ControlFinding
    # Score attributed to the detector.
    detector_score: float
    # Argument text from the verifier side.
    verifier_argument: str
    # Argument text from the challenger side.
    challenger_argument: str
    # Consensus score for the finding.
    consensus_score: float
    # Model consensus value.
    model_consensus: float
    # Free-text rationale for the outcome.
    rationale: str
66
+
67
+
68
@dataclass(frozen=True)
class ControlStatus:
    """Status-aware view of a single compliance control after assessment."""

    # Name of the framework the control belongs to.
    framework: str
    # Identifier of the control within the framework.
    control_id: str
    # Title of the control.
    control_title: str
    # Assessment outcome for the control.
    status: AssessmentStatus
    # (engine, check_id) pairs that guard this control
    check_ids: tuple
    # ControlFinding instances that caused a FAIL
    findings: tuple
    # raw evidence strings extracted from findings
    evidence: tuple
@@ -0,0 +1,104 @@
1
+ from audit_packs_core.models import Finding, PathNode
2
+
3
# Severity label for each value of a SARIF result's "level" field.
_LEVEL_TO_SEVERITY = {
    "error": "high",
    "warning": "medium",
    "note": "low",
    "none": "low",
}
# Severity label for each upper-cased value of a result's
# properties.severity field.
_PROP_TO_SEVERITY = {
    "CRITICAL": "critical",
    "HIGH": "high",
    "MEDIUM": "medium",
    "LOW": "low",
    "INFO": "low",
}
# Numeric score for each upper-cased value of a rule's
# properties.confidence field.
_CONFIDENCE_MAP = {"HIGH": 0.9, "MEDIUM": 0.6, "LOW": 0.3}
17
+
18
+
19
+ def _extract_evidence_path(result: dict) -> tuple[PathNode, ...]:
20
+ """Parse codeFlows[0].threadFlows[0].locations into PathNode tuples."""
21
+ code_flows = result.get("codeFlows", [])
22
+ if not code_flows:
23
+ return ()
24
+ thread_flows = code_flows[0].get("threadFlows", [])
25
+ if not thread_flows:
26
+ return ()
27
+ locations = thread_flows[0].get("locations", [])
28
+ nodes = []
29
+ for loc_entry in locations:
30
+ loc = loc_entry.get("location", {})
31
+ phys = loc.get("physicalLocation", {})
32
+ uri = phys.get("artifactLocation", {}).get("uri", "")
33
+ line = phys.get("region", {}).get("startLine", 0)
34
+ snippet = phys.get("region", {}).get("snippet", {}).get("text", "")
35
+ description = loc.get("message", {}).get("text", "")
36
+ nodes.append(
37
+ PathNode(file=uri, line=int(line), snippet=snippet, description=description)
38
+ )
39
+ return tuple(nodes)
40
+
41
+
42
+ def _normalize_rule_id(rule_id: str, engine: str) -> str:
43
+ """Strip dotted namespace prefix from semgrep rule IDs (e.g. 'org.foo.bar' → 'bar').
44
+
45
+ Only applied for semgrep because other engines (checkov, codeql, ast) use their
46
+ own ID schemes and stripping would break pack lookups or collapse distinct rules.
47
+ """
48
+ if engine == "semgrep" and "." in rule_id:
49
+ return rule_id.split(".")[-1]
50
+ return rule_id
51
+
52
+
53
def sarif_to_findings(sarif: dict, engine: str) -> list[Finding]:
    """Convert every located result in a SARIF document into a ``Finding``.

    Results without any location are skipped. Only the first location of a
    result is used. Severity comes from properties.severity when it maps to a
    known label, otherwise from the result level ("warning" when absent,
    "medium" when unknown). Evidence is the region snippet, or the message
    when there is no snippet.

    Args:
        sarif: Parsed SARIF document.
        engine: Engine name stored on each finding and used to normalise
            rule IDs.
    """
    collected: list[Finding] = []
    every_result = (
        entry for run in sarif.get("runs", []) for entry in run.get("results", [])
    )
    for entry in every_result:
        locations = entry.get("locations", [])
        if not locations:
            continue

        physical = locations[0].get("physicalLocation", {})
        uri = physical.get("artifactLocation", {}).get("uri", "")
        start_line = physical.get("region", {}).get("startLine", 1)
        text = entry.get("message", {}).get("text", "")
        code = physical.get("region", {}).get("snippet", {}).get("text", "")

        declared = entry.get("properties", {}).get("severity", "").upper()
        from_property = _PROP_TO_SEVERITY.get(declared)
        from_level = _LEVEL_TO_SEVERITY.get(entry.get("level", "warning"), "medium")

        trail = _extract_evidence_path(entry)
        rule = _normalize_rule_id(entry.get("ruleId", ""), engine)

        collected.append(
            Finding(
                check_id=rule,
                engine=engine,
                file=uri,
                line=int(start_line),
                severity=from_property or from_level,
                message=text,
                evidence=code or text,
                evidence_path=trail,
            )
        )
    return collected
87
+
88
+
89
def extract_rule_confidences(sarif: dict, engine: str = "") -> dict[str, float]:
    """Collect {rule_id → confidence_score} from the rule metadata of each run.

    Rules are read from tool.driver.rules. A rule contributes only when its
    properties.confidence (case-insensitive) is a known confidence label.
    Pass the same engine as given to sarif_to_findings so that the keys line
    up with Finding.check_id values.
    """
    scores: dict[str, float] = {}
    for run in sarif.get("runs", []):
        for rule in run.get("tool", {}).get("driver", {}).get("rules", []):
            key = _normalize_rule_id(rule.get("id", ""), engine)
            label = rule.get("properties", {}).get("confidence", "").upper()
            score = _CONFIDENCE_MAP.get(label)
            if score is not None:
                scores[key] = score
    return scores
@@ -0,0 +1,32 @@
1
+ Metadata-Version: 2.4
2
+ Name: audit-packs-core
3
+ Version: 0.1.1
4
+ Summary: Core models, normalization, diff, and data-flow primitives for audit-packs
5
+ License: Apache-2.0
6
+ Requires-Python: >=3.11
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: PyYAML>=6.0
9
+
10
+ # audit-packs-core
11
+
12
+ [![PyPI version](https://img.shields.io/pypi/v/audit-packs-core.svg)](https://pypi.org/project/audit-packs-core/)
13
+ [![License](https://img.shields.io/badge/license-Apache--2.0-blue.svg)](../../LICENSE)
14
+
15
+ `audit-packs-core` is the foundational library for the `audit-packs` ecosystem. It provides the core data structures, schema models, parser interfaces, diffing utilities, and normalization primitives used across all other package modules.
16
+
17
+ ## Installation
18
+
19
+ ```bash
20
+ pip install audit-packs-core
21
+ ```
22
+
23
+ ## Features
24
+
25
+ - **Standardized Schema Models**: Defines standard structures for scanner findings, controls, frameworks, rules, and reports.
26
+ - **Normalization Primitives**: Converts scanner-specific findings into a scanner-agnostic intermediate representation.
27
+ - **Diffing Utilities**: Compares findings between parent and feature branches to detect newly introduced compliance gaps.
28
+ - **YAML Configuration Parser**: Parses standard YAML frameworks and control files.
29
+
30
+ ## Learn More
31
+
32
+ This library is part of the larger `audit-packs` Compliance Intelligence Engine. For the main command-line interface, GitHub Action integration, and framework mappings, see the [main repository](https://github.com/prakharsingh/audit-packs).
@@ -0,0 +1,12 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/audit_packs_core/__init__.py
4
+ src/audit_packs_core/dataflow.py
5
+ src/audit_packs_core/diff.py
6
+ src/audit_packs_core/models.py
7
+ src/audit_packs_core/normalize.py
8
+ src/audit_packs_core.egg-info/PKG-INFO
9
+ src/audit_packs_core.egg-info/SOURCES.txt
10
+ src/audit_packs_core.egg-info/dependency_links.txt
11
+ src/audit_packs_core.egg-info/requires.txt
12
+ src/audit_packs_core.egg-info/top_level.txt
@@ -0,0 +1 @@
1
+ audit_packs_core