ventra 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. collector/__init__.py +11 -0
  2. collector/__main__.py +8 -0
  3. collector/aws/__init__.py +5 -0
  4. collector/aws/client_factory.py +183 -0
  5. collector/aws/control_plane/cloudtrail.py +331 -0
  6. collector/aws/control_plane/cloudtrail_s3.py +312 -0
  7. collector/aws/control_plane/config.py +96 -0
  8. collector/aws/detections/detective.py +102 -0
  9. collector/aws/detections/guardduty.py +116 -0
  10. collector/aws/detections/macie.py +103 -0
  11. collector/aws/detections/securityhub.py +101 -0
  12. collector/aws/identity/account.py +71 -0
  13. collector/aws/identity/iam.py +386 -0
  14. collector/aws/identity/kms.py +74 -0
  15. collector/aws/identity/secrets.py +56 -0
  16. collector/aws/identity/sts.py +74 -0
  17. collector/aws/network/vpc_flow.py +176 -0
  18. collector/aws/network/waf.py +144 -0
  19. collector/aws/registry.py +60 -0
  20. collector/aws/runner/runner.py +224 -0
  21. collector/aws/workloads/ec2.py +217 -0
  22. collector/aws/workloads/lambda_.py +75 -0
  23. collector/aws/workloads/s3.py +112 -0
  24. collector/azure/__init__.py +17 -0
  25. collector/cli.py +388 -0
  26. collector/devgui.py +350 -0
  27. collector/gcp/__init__.py +17 -0
  28. collector/lib/__init__.py +1 -0
  29. collector/lib/base.py +167 -0
  30. collector/lib/chain_of_custody/__init__.py +6 -0
  31. collector/lib/chain_of_custody/hashing.py +21 -0
  32. collector/lib/chain_of_custody/signing.py +67 -0
  33. collector/lib/models.py +232 -0
  34. collector/lib/packaging/__init__.py +5 -0
  35. collector/lib/packaging/packager.py +77 -0
  36. collector/lib/transport/__init__.py +5 -0
  37. collector/lib/transport/base.py +90 -0
  38. collector/tools/__init__.py +1 -0
  39. collector/tools/verify_readonly.py +66 -0
  40. ventra-0.1.0.dist-info/METADATA +178 -0
  41. ventra-0.1.0.dist-info/RECORD +45 -0
  42. ventra-0.1.0.dist-info/WHEEL +5 -0
  43. ventra-0.1.0.dist-info/entry_points.txt +3 -0
  44. ventra-0.1.0.dist-info/licenses/LICENSE +202 -0
  45. ventra-0.1.0.dist-info/top_level.txt +1 -0
collector/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ """Ventra collector — read-only cloud forensic triage acquisition.
2
+
3
+ The collector runs in the client's cloud shell, gathers exactly the logs and artifacts
4
+ incident responders need, and seals them into a signed evidence package described by the
5
+ Ventra Evidence Package Format (EPF).
6
+
7
+ Forensic invariant: nothing in this package may call a mutating cloud API. See
8
+ ``collector.tools.verify_readonly`` and the ``readonly-guard`` CI check.
9
+ """
10
+
11
+ __version__ = "0.1.0"
collector/__main__.py ADDED
@@ -0,0 +1,8 @@
1
+ """Allow ``python -m collector dev`` from a fresh clone before ``pip install``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .cli import main
6
+
7
if __name__ == "__main__":
    # Hand the CLI's return value to the interpreter as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
@@ -0,0 +1,5 @@
1
+ """AWS orchestration — registry, runner, and boto3 client factory.
2
+
3
+ Collector modules (identity, control_plane, network, …) live in this package.
4
+ Import ``collector.aws.registry`` to populate and access the registry.
5
+ """
@@ -0,0 +1,183 @@
1
+ """AWS client management used by every collector.
2
+
3
+ Wraps boto3 so collectors don't each reinvent region handling, pagination, and the
4
+ all-important AccessDenied detection (an AccessDenied is a *gap*, recorded as evidence, not a
5
+ crash). All clients are created from a single session so credentials are resolved once.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Iterator
11
+ from dataclasses import dataclass
12
+ from typing import Any
13
+
14
+ import boto3
15
+ from botocore.config import Config
16
+ from botocore.exceptions import ClientError, EndpointConnectionError, NoCredentialsError
17
+
18
# AWS error codes meaning "the caller is not allowed to see this", as opposed to a
# genuine failure.
ACCESS_DENIED_CODES = frozenset(
    (
        "AccessDenied",
        "AccessDeniedException",
        "UnauthorizedOperation",
        "AuthorizationError",
        "AuthFailure",
        "ForbiddenException",
    )
)
29
+
30
# AWS error codes meaning "this service or feature is not turned on here".
NOT_ENABLED_CODES = frozenset(
    (
        "ResourceNotFoundException",
        "BadRequestException",
        "InvalidInputException",
        "SubscriptionRequiredException",
        "OptInRequired",
        # Security Hub: the account is not subscribed to the hub.
        "InvalidAccessException",
        # WAFv2: e.g. a Web ACL without a logging configuration.
        "WAFNonexistentItemException",
        # Standalone accounts that do not belong to an AWS Organization.
        "AWSOrganizationsNotInUseException",
        # CloudTrail: the trail has no Insights enabled.
        "InsightNotEnabledException",
        # IAM: e.g. the account has no custom password policy.
        "NoSuchEntity",
    )
)
50
+
51
+
52
class AccessDenied(Exception):
    """Signals that an AWS API answered with an access-denied style error.

    The denied ``action`` and the service's ``message`` are kept as separate attributes;
    the exception text is ``"<action>: <message>"``.
    """

    def __init__(self, action: str, message: str) -> None:
        self.action = action
        self.message = message
        detail = f"{action}: {message}"
        super().__init__(detail)
59
+
60
+
61
class ServiceNotEnabled(Exception):
    """Signals that a service or feature is not enabled (or has no endpoint) for a call.

    ``service`` identifies what was being called and ``message`` carries the underlying
    explanation; the exception text is ``"<service>: <message>"``.
    """

    def __init__(self, service: str, message: str) -> None:
        self.service = service
        self.message = message
        detail = f"{service}: {message}"
        super().__init__(detail)
66
+
67
+
68
@dataclass
class CallerIdentity:
    """Identity of the credentials in use, as built by ``AwsClientFactory.caller_identity``."""

    # "Account" field of the STS GetCallerIdentity response.
    account_id: str
    # "Arn" field of the same response.
    arn: str
    # "UserId" field; an empty string when the response omits it.
    user_id: str
    # Second ":"-separated segment of the ARN, or "aws" when the value is not an ARN.
    partition: str
74
+
75
+
76
class AwsClientFactory:
    """Creates per-service, per-region boto3 clients from one session.

    Clients are cached per ``(service, region)`` pair. The ``paginate``, ``call`` and
    ``paginate_manual`` helpers translate access-denied and not-enabled API errors into
    the typed ``AccessDenied`` / ``ServiceNotEnabled`` exceptions so collectors can record
    them as gaps.
    """

    def __init__(self, session: boto3.Session | None = None) -> None:
        """Use ``session`` when given, otherwise a default ``boto3.Session()``."""
        self._session = session or boto3.Session()
        self._cfg = Config(retries={"max_attempts": 5, "mode": "adaptive"}, user_agent_extra="ventra")
        self._cache: dict[tuple[str, str | None], Any] = {}

    def client(self, service: str, region: str | None = None) -> Any:
        """Return the cached client for ``(service, region)``, creating it on first use."""
        key = (service, region)
        if key not in self._cache:
            self._cache[key] = self._session.client(service, region_name=region, config=self._cfg)
        return self._cache[key]

    # -- identity / region discovery -----------------------------------------------------

    def caller_identity(self) -> CallerIdentity:
        """Resolve the current caller through STS ``GetCallerIdentity``.

        Raises:
            RuntimeError: when no AWS credentials can be found.
        """
        try:
            ident = self.client("sts").get_caller_identity()
        except NoCredentialsError as exc:  # pragma: no cover
            raise RuntimeError(
                "No AWS credentials found. Run inside CloudShell or configure a profile."
            ) from exc
        arn = ident["Arn"]
        # The partition is the second ARN segment (arn:<partition>:...).
        partition = arn.split(":")[1] if arn.startswith("arn:") else "aws"
        return CallerIdentity(
            account_id=ident["Account"],
            arn=arn,
            user_id=ident.get("UserId", ""),
            partition=partition,
        )

    def enabled_regions(self) -> list[str]:
        """Regions enabled for this account (opt-in regions included if active)."""
        try:
            resp = self.client("ec2", "us-east-1").describe_regions(
                Filters=[{"Name": "opt-in-status", "Values": ["opt-in-not-required", "opted-in"]}]
            )
            return sorted(r["RegionName"] for r in resp["Regions"])
        except ClientError:
            # Fall back to the SDK's static partition list.
            return sorted(self._session.get_available_regions("ec2"))

    # -- safe call helpers ---------------------------------------------------------------

    def paginate(
        self, service: str, region: str | None, operation: str, result_key: str, **kwargs: Any
    ) -> Iterator[dict[str, Any]]:
        """Yield items across pages, translating access/enablement errors into typed gaps.

        This is a generator, so errors surface while the caller iterates; items yielded
        before the failure have already been delivered.

        Raises:
            AccessDenied: the API answered with an access-denied style error code.
            ServiceNotEnabled: the API answered with a not-enabled style error code, or
                the service endpoint could not be reached in ``region``.
            ClientError: any other API error, re-raised unchanged.
        """
        client = self.client(service, region)
        try:
            paginator = client.get_paginator(operation)
            for page in paginator.paginate(**kwargs):
                yield from page.get(result_key, [])
        except ClientError as exc:
            _raise_typed(exc, f"{service}:{operation}")
        except EndpointConnectionError as exc:
            # Same mapping as ``call``: an unreachable endpoint is a "not enabled" gap.
            # Returning silently here would make it indistinguishable from "no records".
            raise ServiceNotEnabled(f"{service}:{operation}", str(exc)) from exc

    def call(self, service: str, region: str | None, operation: str, **kwargs: Any) -> dict[str, Any]:
        """Invoke a single client operation and return its response.

        Raises:
            AccessDenied: the API answered with an access-denied style error code.
            ServiceNotEnabled: the API answered with a not-enabled style error code, or
                the service endpoint could not be reached in ``region``.
            ClientError: any other API error, re-raised unchanged.
        """
        client = self.client(service, region)
        try:
            return getattr(client, operation)(**kwargs)
        except ClientError as exc:
            _raise_typed(exc, f"{service}:{operation}")
            raise  # unreachable, keeps type-checkers happy
        except EndpointConnectionError as exc:
            # The service has no endpoint in this region — same gap as "not enabled".
            raise ServiceNotEnabled(f"{service}:{operation}", str(exc)) from exc

    def paginate_manual(
        self,
        service: str,
        region: str | None,
        operation: str,
        result_key: str,
        *,
        token_request_key: str = "NextToken",
        token_response_key: str = "NextToken",
        max_pages: int = 500,
        **kwargs: Any,
    ) -> Iterator[dict[str, Any]]:
        """Token-loop pagination for operations botocore has no paginator for
        (e.g. wafv2 ListWebACLs / detective ListInvestigations).

        Each page is fetched through ``call``, so the same typed exceptions apply.
        Iteration ends without any signal once ``max_pages`` pages have been read.
        """
        token: str | None = None
        for _ in range(max_pages):
            params = dict(kwargs)
            if token:
                params[token_request_key] = token
            page = self.call(service, region, operation, **params)
            items = page.get(result_key) or []
            yield from items
            new_token = page.get(token_response_key)
            # Stop on a missing, repeated, or itemless marker so a quirky
            # implementation can never loop us forever.
            if not new_token or new_token == token or not items:
                return
            token = new_token
174
+
175
+
176
def _raise_typed(exc: ClientError, action: str) -> None:
    """Re-raise a botocore ``ClientError`` as a typed gap exception where possible.

    Always raises; it never returns normally.

    Args:
        exc: the error raised by the boto3 client.
        action: ``"<service>:<operation>"`` label stored on the typed exception.

    Raises:
        AccessDenied: the error code is in ``ACCESS_DENIED_CODES``.
        ServiceNotEnabled: the error code is in ``NOT_ENABLED_CODES``.
        ClientError: ``exc`` itself, for every other error code.
    """
    error = exc.response.get("Error", {})
    code = error.get("Code", "")
    msg = error.get("Message", str(exc))
    # Chain explicitly so the original API error stays attached as ``__cause__``,
    # matching how ``AwsClientFactory.call`` raises ``ServiceNotEnabled``.
    if code in ACCESS_DENIED_CODES:
        raise AccessDenied(action, msg) from exc
    if code in NOT_ENABLED_CODES:
        raise ServiceNotEnabled(action, msg) from exc
    raise exc
@@ -0,0 +1,331 @@
1
+ """CloudTrail collector — the control-plane backbone of cloud IR.
2
+
3
+ Captures:
4
+ 1. **Trail configuration** — trails, selectors (management / data / network / insights),
5
+ log validation, and S3 delivery settings.
6
+ 2. **Management events** — via LookupEvents (portable, ~90-day lookback).
7
+ 3. **Insight events** — via LookupEvents and S3 log files when Insights is enabled.
8
+ 4. **Data events** — from the trail's S3 log files when data events are enabled.
9
+ 5. **Network activity events** — from the trail's S3 log files when enabled.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from datetime import UTC, datetime, timedelta
15
+
16
+ from ...lib.base import Collector
17
+ from ...lib.models import GapReason, SourceResult, SourceStatus
18
+ from ..client_factory import AccessDenied, ServiceNotEnabled
19
+ from .cloudtrail_s3 import (
20
+ DATA_CATEGORIES,
21
+ INSIGHT_CATEGORIES,
22
+ NETWORK_CATEGORIES,
23
+ collect_s3_trail_records,
24
+ coverage_summary,
25
+ data_events_configured,
26
+ insight_events_configured,
27
+ lookup_event_category,
28
+ merge_dedupe,
29
+ network_activity_configured,
30
+ trail_is_logging_to_s3,
31
+ )
32
+
33
+ # Bound the in-memory LookupEvents pull so a busy account can't exhaust a CloudShell.
34
+ MAX_LOOKUP_RECORDS = 200_000
35
+
36
+
37
class CloudTrailCollector(Collector):
    """Collects CloudTrail trail configuration and events for the selected regions.

    Management and insight events come from LookupEvents. Insight, data and
    network-activity events are additionally read from the trails' S3 log files when
    the matching selectors are configured. Anything that could not be collected is
    recorded as a ``(name, GapReason, detail)`` tuple instead of aborting the run.
    """

    name = "cloudtrail"
    tier = 1
    description = (
        "CloudTrail trail config, management and insight events (LookupEvents), "
        "and data / network-activity events from S3 logs."
    )
    required_actions = (
        "cloudtrail:DescribeTrails",
        "cloudtrail:GetTrailStatus",
        "cloudtrail:GetEventSelectors",
        "cloudtrail:GetInsightSelectors",
        "cloudtrail:LookupEvents",
        "s3:ListBucket",
        "s3:GetObject",
    )

    def collect(self) -> SourceResult:
        """Gather trail config and all event categories, write them out, and summarize.

        Returns:
            A ``SourceResult`` whose status is PARTIAL when any gap was recorded,
            COLLECTED when events were written without gaps, and EMPTY otherwise.
        """
        cf = self.ctx.client_factory
        gaps: list[tuple[str, GapReason, str]] = []

        config = self._trail_config(cf, gaps)
        config["event_coverage"] = coverage_summary(config.get("trails", []))

        window = self.ctx.time_window
        # Sample the clock once so both default bounds refer to the same instant.
        now = datetime.now(UTC)
        start = window.since or (now - timedelta(days=90))
        end = window.until or now

        mgmt_records, lookup_insight_records = self._collect_lookup_events(cf, gaps, start, end)
        insight_s3_records, insight_stats = self._collect_s3_category(
            cf,
            config,
            gaps,
            start,
            end,
            INSIGHT_CATEGORIES,
            insight_events_configured,
            "insight_events",
            require_s3=False,
        )
        insight_records = merge_dedupe(lookup_insight_records, insight_s3_records)

        data_records, data_stats = self._collect_s3_category(
            cf, config, gaps, start, end, DATA_CATEGORIES, data_events_configured, "data_events"
        )
        network_records, network_stats = self._collect_s3_category(
            cf,
            config,
            gaps,
            start,
            end,
            NETWORK_CATEGORIES,
            network_activity_configured,
            "network_activity",
        )

        if config["event_coverage"]["insight_events_configured"] and not insight_records:
            gaps.append(
                (
                    "insight_events",
                    GapReason.NOT_PRESENT,
                    "Insights enabled but no insight events in window.",
                )
            )

        config["s3_collection"] = {
            "insight_events": insight_stats,
            "data_events": data_stats,
            "network_activity": network_stats,
        }

        files = []
        total = 0
        # One output file per non-empty event category, in a fixed order.
        for records, filename in (
            (mgmt_records, "events.jsonl.gz"),
            (insight_records, "events_insights.jsonl.gz"),
            (data_records, "events_data.jsonl.gz"),
            (network_records, "events_network.jsonl.gz"),
        ):
            if records:
                files.append(self.write_jsonl(records, filename))
                total += len(records)

        files.append(self.write_json(config, "config.json"))

        if gaps:
            # Any recorded gap makes the result partial, with or without events.
            status = SourceStatus.PARTIAL
        elif total:
            status = SourceStatus.COLLECTED
        else:
            status = SourceStatus.EMPTY
            gaps.append(
                ("cloudtrail", GapReason.NOT_PRESENT, "No CloudTrail events in window.")
            )

        trail_count = len(config.get("trails", []))
        self.write_meta(
            {
                "source": self.name,
                "records": total,
                "management_events": len(mgmt_records),
                "insight_events": len(insight_records),
                "data_events": len(data_records),
                "network_activity_events": len(network_records),
                "lookup_insight_events": len(lookup_insight_records),
                "regions": self.ctx.regions,
                "window": window.to_manifest(),
                "trails": trail_count,
                "log_validation_enabled": config.get("any_log_validation_enabled"),
                "event_coverage": config["event_coverage"],
                "s3_collection": config["s3_collection"],
            }
        )

        notes = (
            f"{len(mgmt_records)} management, {len(insight_records)} insight, "
            f"{len(data_records)} data, {len(network_records)} network-activity; "
            f"{trail_count} trail(s)."
        )
        return SourceResult(
            name=self.name,
            status=status,
            files=files,
            record_count=total,
            gaps=gaps,
            notes=notes,
        )

    def _collect_lookup_events(
        self,
        cf,
        gaps: list[tuple[str, GapReason, str]],
        start: datetime,
        end: datetime,
    ) -> tuple[list[dict], list[dict]]:
        """Pull LookupEvents for every region, split into (management, insight) lists.

        Each event is tagged with ``_ventra_region``. Collection stops across all regions
        once ``MAX_LOOKUP_RECORDS`` events are held and more are still available; that
        truncation is recorded as a gap. Events received before an AccessDenied error in
        a region are kept.
        """
        management: list[dict] = []
        insights: list[dict] = []
        truncated = False
        for region in self.ctx.regions:
            if truncated:
                break
            try:
                for ev in cf.paginate(
                    "cloudtrail",
                    region,
                    "lookup_events",
                    "Events",
                    StartTime=start,
                    EndTime=end,
                ):
                    # Checked only when another event exists, so the flag is set
                    # exactly when something was actually left behind.
                    if len(management) + len(insights) >= MAX_LOOKUP_RECORDS:
                        truncated = True
                        break
                    ev["_ventra_region"] = region
                    if lookup_event_category(ev) == "Insight":
                        insights.append(ev)
                    else:
                        management.append(ev)
            except AccessDenied as exc:
                gaps.append(("cloudtrail", GapReason.ACCESS_DENIED, f"{region}: {exc.message}"))
            except ServiceNotEnabled:
                continue
        if truncated:
            gaps.append(
                (
                    "cloudtrail",
                    GapReason.COLLECTOR_ERROR,
                    f"LookupEvents truncated at {MAX_LOOKUP_RECORDS} records; "
                    "narrow the window (--since/--until) for full coverage.",
                )
            )
        return management, insights

    def _collect_s3_category(
        self,
        cf,
        config: dict,
        gaps: list[tuple[str, GapReason, str]],
        start: datetime,
        end: datetime,
        categories: frozenset[str],
        configured_fn,
        gap_name: str,
        *,
        require_s3: bool = True,
    ) -> tuple[list[dict], dict]:
        """Read one event category from the S3 logs of every trail that can supply it.

        Args:
            config: trail configuration as produced by ``_trail_config``.
            categories: event categories passed through to ``collect_s3_trail_records``.
            configured_fn: predicate telling whether a trail has this event type enabled.
            gap_name: name used for any gap recorded here.
            require_s3: when true, missing S3 delivery or an empty result is a gap.

        Returns:
            ``(records, stats)``; ``stats`` always has ``configured`` and ``records``.
        """
        trails = config.get("trails", [])
        configured = [t for t in trails if configured_fn(t)]
        if not configured:
            return [], {"configured": False, "records": 0}

        # S3 delivery must be judged on the trails that have the event type enabled.
        # A different trail logging to S3 cannot supply these events.
        eligible = [t for t in configured if trail_is_logging_to_s3(t)]
        if not eligible:
            if require_s3:
                gaps.append(
                    (
                        gap_name,
                        GapReason.LOGGING_NOT_CONFIGURED,
                        "Event type enabled but trail does not deliver logs to S3.",
                    )
                )
            return [], {"configured": True, "records": 0, "s3_logging": False}

        combined: list[dict] = []
        combined_stats: dict = {"configured": True, "records": 0, "s3_logging": True}
        account_id = self.ctx.account_id

        for trail in eligible:
            recs, stats = collect_s3_trail_records(
                cf,
                trail,
                account_id,
                self.ctx.regions,
                start,
                end,
                categories,
                gaps,
                log=lambda msg: self._log(msg),
            )
            combined.extend(recs)
            for key in ("objects_scanned", "objects_read"):
                combined_stats[key] = combined_stats.get(key, 0) + stats.get(key, 0)
            # Truncation on any trail marks the whole category as truncated.
            combined_stats["truncated"] = combined_stats.get("truncated") or stats.get("truncated")

        combined_stats["records"] = len(combined)
        if require_s3 and not combined:
            gaps.append(
                (
                    gap_name,
                    GapReason.NOT_PRESENT,
                    f"No {gap_name.replace('_', ' ')} log records in window (check S3 path/permissions).",
                )
            )
        return combined, combined_stats

    def _trail_config(self, cf, gaps) -> dict:
        """Describe every trail visible from the selected regions and enrich each one.

        Trails are de-duplicated by ARN. Each is enriched, from its home region, with
        ``Status``, ``EventSelectors`` and ``InsightSelectors``. ``Status`` and
        ``EventSelectors`` are only set when their call succeeds; ``InsightSelectors``
        is ``None`` when it could not be fetched.
        """
        trails: list[dict] = []
        any_validation = False
        seen: set[str] = set()
        for region in self.ctx.regions:
            try:
                described = cf.call("cloudtrail", region, "describe_trails").get("trailList", [])
            except AccessDenied as exc:
                gaps.append(
                    ("cloudtrail_config", GapReason.ACCESS_DENIED, f"{region}: {exc.message}")
                )
                continue
            except ServiceNotEnabled:
                continue
            for described_trail in described:
                arn = described_trail.get("TrailARN", "")
                if arn in seen:
                    continue
                seen.add(arn)
                trail = dict(described_trail)
                if trail.get("LogFileValidationEnabled"):
                    any_validation = True
                home = trail.get("HomeRegion", region)
                # Fetch each detail independently so one failure cannot hide the other.
                for key, operation, params in (
                    ("Status", "get_trail_status", {"Name": arn}),
                    ("EventSelectors", "get_event_selectors", {"TrailName": arn}),
                ):
                    detail = self._trail_detail(cf, gaps, home, arn, operation, params)
                    if detail is not None:
                        trail[key] = detail
                trail["InsightSelectors"] = self._trail_detail(
                    cf, gaps, home, arn, "get_insight_selectors", {"TrailName": arn}
                )
                trails.append(trail)

        return {
            "trails": trails,
            "trail_count": len(trails),
            "any_log_validation_enabled": any_validation,
            "multi_region_trail_present": any(t.get("IsMultiRegionTrail") for t in trails),
            "organization_trail_present": any(t.get("IsOrganizationTrail") for t in trails),
        }

    def _trail_detail(self, cf, gaps, home, arn, operation, params) -> dict | None:
        """Fetch one per-trail detail, recording a gap instead of raising on failure.

        Returns the API response, or ``None`` when the call failed. A not-enabled answer
        (e.g. Insights are off) is expected and is not recorded as a gap.
        """
        try:
            return cf.call("cloudtrail", home, operation, **params)
        except AccessDenied as exc:
            # Without status, S3 log collection for this trail is skipped — that
            # is a gap worth surfacing, not hiding.
            gaps.append(("cloudtrail_config", GapReason.ACCESS_DENIED, f"{arn}: {exc.message}"))
        except ServiceNotEnabled:
            pass
        except Exception as exc:  # noqa: BLE001 - keep the trail entry, but surface the failure
            gaps.append(
                (
                    "cloudtrail_config",
                    GapReason.COLLECTOR_ERROR,
                    f"{arn}: {operation} failed: {exc}",
                )
            )
        return None