ventra 0.0.0.post26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. collector/__init__.py +23 -0
  2. collector/__main__.py +48 -0
  3. collector/aws/__init__.py +5 -0
  4. collector/aws/client_factory.py +183 -0
  5. collector/aws/common/__init__.py +1 -0
  6. collector/aws/common/cw_logs.py +70 -0
  7. collector/aws/common/s3_logs.py +130 -0
  8. collector/aws/control_plane/cloudtrail.py +641 -0
  9. collector/aws/control_plane/cloudtrail_s3.py +338 -0
  10. collector/aws/control_plane/cloudtrail_validation.py +215 -0
  11. collector/aws/control_plane/config.py +96 -0
  12. collector/aws/control_plane/log_posture.py +344 -0
  13. collector/aws/detections/detective.py +102 -0
  14. collector/aws/detections/guardduty.py +116 -0
  15. collector/aws/detections/inspector2.py +106 -0
  16. collector/aws/detections/macie.py +103 -0
  17. collector/aws/detections/securityhub.py +101 -0
  18. collector/aws/identity/account.py +71 -0
  19. collector/aws/identity/iam.py +386 -0
  20. collector/aws/identity/kms.py +74 -0
  21. collector/aws/identity/secrets.py +56 -0
  22. collector/aws/network/cloudfront.py +205 -0
  23. collector/aws/network/elb_alb.py +232 -0
  24. collector/aws/network/route53_resolver.py +246 -0
  25. collector/aws/network/vpc_flow.py +176 -0
  26. collector/aws/network/waf.py +144 -0
  27. collector/aws/registry.py +73 -0
  28. collector/aws/runner/runner.py +232 -0
  29. collector/aws/workloads/ec2.py +217 -0
  30. collector/aws/workloads/eks_audit.py +179 -0
  31. collector/aws/workloads/lambda_.py +75 -0
  32. collector/aws/workloads/s3.py +112 -0
  33. collector/aws/workloads/s3_access.py +197 -0
  34. collector/azure/__init__.py +17 -0
  35. collector/cli.py +664 -0
  36. collector/devgui.py +353 -0
  37. collector/gcp/__init__.py +17 -0
  38. collector/lib/__init__.py +1 -0
  39. collector/lib/base.py +168 -0
  40. collector/lib/chain_of_custody/__init__.py +6 -0
  41. collector/lib/chain_of_custody/hashing.py +21 -0
  42. collector/lib/chain_of_custody/signing.py +67 -0
  43. collector/lib/ingest.py +60 -0
  44. collector/lib/models.py +233 -0
  45. collector/lib/packaging/__init__.py +5 -0
  46. collector/lib/packaging/packager.py +77 -0
  47. collector/lib/transport/__init__.py +5 -0
  48. collector/lib/transport/base.py +90 -0
  49. collector/tools/__init__.py +1 -0
  50. collector/tools/verify_readonly.py +66 -0
  51. ventra-0.0.0.post26.dist-info/METADATA +181 -0
  52. ventra-0.0.0.post26.dist-info/RECORD +56 -0
  53. ventra-0.0.0.post26.dist-info/WHEEL +5 -0
  54. ventra-0.0.0.post26.dist-info/entry_points.txt +3 -0
  55. ventra-0.0.0.post26.dist-info/licenses/LICENSE +202 -0
  56. ventra-0.0.0.post26.dist-info/top_level.txt +1 -0
collector/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ """Ventra collector — read-only cloud forensic triage acquisition.
2
+
3
+ The collector runs in the client's cloud shell, gathers exactly the logs and artifacts
4
+ incident responders need, and seals them into a signed evidence package described by the
5
+ Ventra Evidence Package Format (EPF).
6
+
7
+ Forensic invariant: nothing in this package may call a mutating cloud API. See
8
+ ``collector.tools.verify_readonly`` and the ``readonly-guard`` CI check.
9
+ """
10
+
11
+ from importlib.metadata import PackageNotFoundError
12
+ from importlib.metadata import version as _pkg_version
13
+
14
# The version comes from the installed distribution's metadata, which setuptools-scm derives
# from the git tag at build/install time. It is recorded in every manifest as
# ``tool_version``, so an evidence package always shows exactly which build collected it
# (a tagged release like ``0.2.0``, or a dev build like ``0.2.0.dev3+g1a2b3c4`` when run
# from a working tree).
try:
    _resolved_version = _pkg_version("ventra")
except PackageNotFoundError:
    # Running from a source tree that was never installed.
    _resolved_version = "0.0.0+unknown"

__version__ = _resolved_version

# Keep the package namespace clean: only ``__version__`` survives this module's setup.
del PackageNotFoundError, _pkg_version, _resolved_version
collector/__main__.py ADDED
@@ -0,0 +1,48 @@
1
+ """Allow ``python -m collector dev`` from a fresh clone before ``pip install``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import shutil
7
+ import sys
8
+
9
_MIN_PYTHON = (3, 11)


def _reexec_with_newer_python() -> None:
    """Re-run this module with Python 3.11+ when the default ``python3`` is too old.

    Returns immediately when the running interpreter already satisfies ``_MIN_PYTHON``.
    Otherwise each candidate interpreter name is looked up on ``PATH``, skipped when it
    resolves to the interpreter that is already running, and probed in a subprocess to
    confirm its version. The first one that qualifies replaces the current process via
    ``os.execv`` (so this function does not return in that case).

    Raises:
        SystemExit: with code 1, after printing install guidance to stderr, when no
            suitable interpreter is found.
    """
    if sys.version_info >= _MIN_PYTHON:
        return

    # Deferred import: only the rare too-old-interpreter path needs subprocess.
    import subprocess

    # Loop invariants, built once instead of on every candidate.
    probe = f"import sys; raise SystemExit(0 if sys.version_info >= {_MIN_PYTHON!r} else 1)"
    running = os.path.realpath(sys.executable)

    # Newest first. The versioned names beyond 3.12 matter on hosts where the bare
    # ``python3`` is old but a newer minor release is installed side by side.
    for cmd in ("python3.14", "python3.13", "python3.12", "python3.11", "python3"):
        path = shutil.which(cmd)
        if not path or os.path.realpath(path) == running:
            continue
        # A name such as ``python3`` says nothing about the version, so ask the binary.
        if subprocess.run([path, "-c", probe], capture_output=True).returncode != 0:
            continue
        os.execv(path, [path, "-m", "collector", *sys.argv[1:]])

    print(
        "error: Ventra requires Python 3.11 or newer.\n"
        f"  Current: {sys.executable} ({sys.version.split()[0]})\n"
        "  Install: brew install python@3.11\n"
        "  Then run: python3.11 -m collector dev",
        file=sys.stderr,
    )
    raise SystemExit(1)
42
+
43
+
44
if __name__ == "__main__":
    # The interpreter check runs before the CLI module is imported
    # (presumably because the CLI needs the newer Python; verify against collector/cli.py).
    _reexec_with_newer_python()
    from .cli import main

    # Propagate the CLI's return value as the process exit status.
    sys.exit(main())
@@ -0,0 +1,5 @@
1
+ """AWS orchestration — registry, runner, and boto3 client factory.
2
+
3
+ Collector modules (identity, control_plane, network, …) live in this package.
4
+ Import ``collector.aws.registry`` to populate and access the registry.
5
+ """
@@ -0,0 +1,183 @@
1
+ """AWS client management used by every collector.
2
+
3
+ Wraps boto3 so collectors don't each reinvent region handling, pagination, and the
4
+ all-important AccessDenied detection (an AccessDenied is a *gap*, recorded as evidence, not a
5
+ crash). All clients are created from a single session so credentials are resolved once.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Iterator
11
+ from dataclasses import dataclass
12
+ from typing import Any
13
+
14
+ import boto3
15
+ from botocore.config import Config
16
+ from botocore.exceptions import ClientError, EndpointConnectionError, NoCredentialsError
17
+
18
# Error codes that mean "you can't see this" rather than "something broke".
ACCESS_DENIED_CODES = frozenset(
    (
        "AccessDenied",
        "AccessDeniedException",
        "UnauthorizedOperation",
        "AuthorizationError",
        "AuthFailure",
        "ForbiddenException",
    )
)

# Error codes that mean "this service/feature isn't enabled here".
# NOTE(review): "BadRequestException" and "InvalidInputException" are generic codes;
# confirm they cannot mask a genuinely malformed request from a collector.
NOT_ENABLED_CODES = frozenset(
    (
        "ResourceNotFoundException",
        "BadRequestException",
        "InvalidInputException",
        "SubscriptionRequiredException",
        "OptInRequired",
        # Security Hub raises this when the account is not subscribed to the hub.
        "InvalidAccessException",
        # WAFv2 raises this when e.g. a Web ACL has no logging configuration.
        "WAFNonexistentItemException",
        # Standalone accounts that are not part of an AWS Organization.
        "AWSOrganizationsNotInUseException",
        # CloudTrail trails without Insights enabled.
        "InsightNotEnabledException",
        # IAM raises this when e.g. no custom password policy exists on the account.
        "NoSuchEntity",
    )
)
50
+
51
+
52
class AccessDenied(Exception):
    """Raised by helpers when an API returns an access-denied style error.

    The exception text is ``"<action>: <message>"``; both parts are also kept as
    attributes so callers can report them separately.
    """

    def __init__(self, action: str, message: str) -> None:
        self.action = action
        self.message = message
        super().__init__(f"{action}: {message}")
59
+
60
+
61
class ServiceNotEnabled(Exception):
    """Raised by helpers when an API reports that a service/feature isn't enabled here.

    The exception text is ``"<service>: <message>"``; both parts are also kept as
    attributes so callers can report them separately.
    """

    def __init__(self, service: str, message: str) -> None:
        self.service = service
        self.message = message
        super().__init__(f"{service}: {message}")
66
+
67
+
68
@dataclass
class CallerIdentity:
    """Who the collector is running as, as reported by STS ``GetCallerIdentity``."""

    # "Account" field of the STS response.
    account_id: str
    # "Arn" field of the STS response.
    arn: str
    # "UserId" field of the STS response (empty string when absent).
    user_id: str
    # Second ``:``-separated field of the ARN, or "aws" when the value is not an ARN.
    partition: str
74
+
75
+
76
class AwsClientFactory:
    """Creates per-service, per-region boto3 clients from one session.

    Clients are cached by ``(service, region)`` so each pair is built once, and all of
    them share the same retry/user-agent configuration.
    """

    def __init__(self, session: boto3.Session | None = None) -> None:
        """Use ``session`` when given, otherwise a default ``boto3.Session()``."""
        self._session = session or boto3.Session()
        self._cfg = Config(retries={"max_attempts": 5, "mode": "adaptive"}, user_agent_extra="ventra")
        self._cache: dict[tuple[str, str | None], Any] = {}

    def client(self, service: str, region: str | None = None) -> Any:
        """Return the cached client for ``(service, region)``, creating it on first use."""
        cache_key = (service, region)
        try:
            return self._cache[cache_key]
        except KeyError:
            pass
        created = self._session.client(service, region_name=region, config=self._cfg)
        self._cache[cache_key] = created
        return created

    # -- identity / region discovery -----------------------------------------------------

    def caller_identity(self) -> CallerIdentity:
        """Resolve the calling principal via STS.

        Raises:
            RuntimeError: when botocore reports that no credentials are available.
        """
        try:
            response = self.client("sts").get_caller_identity()
        except NoCredentialsError as exc:  # pragma: no cover
            raise RuntimeError(
                "No AWS credentials found. Run inside CloudShell or configure a profile."
            ) from exc
        arn = response["Arn"]
        # The partition is the second ARN field; anything that is not an ARN is
        # treated as the standard "aws" partition.
        if arn.startswith("arn:"):
            partition = arn.split(":")[1]
        else:
            partition = "aws"
        return CallerIdentity(
            account_id=response["Account"],
            arn=arn,
            user_id=response.get("UserId", ""),
            partition=partition,
        )

    def enabled_regions(self) -> list[str]:
        """Regions enabled for this account (opt-in regions included if active)."""
        opt_in_filter = {"Name": "opt-in-status", "Values": ["opt-in-not-required", "opted-in"]}
        try:
            described = self.client("ec2", "us-east-1").describe_regions(Filters=[opt_in_filter])
            names = [entry["RegionName"] for entry in described["Regions"]]
        except ClientError:
            # Fall back to the SDK's static partition list.
            names = self._session.get_available_regions("ec2")
        return sorted(names)

    # -- safe call helpers ---------------------------------------------------------------

    def paginate(
        self, service: str, region: str | None, operation: str, result_key: str, **kwargs: Any
    ) -> Iterator[dict[str, Any]]:
        """Yield items across pages, translating access/enablement errors into typed gaps.

        A ``ClientError`` is handed to ``_raise_typed``. An ``EndpointConnectionError``
        simply ends the iteration.
        NOTE(review): ``call`` raises ``ServiceNotEnabled`` for the same condition;
        confirm the silent stop here is intended.
        """
        bound = self.client(service, region)
        try:
            pages = bound.get_paginator(operation).paginate(**kwargs)
            for page in pages:
                for item in page.get(result_key, []):
                    yield item
        except ClientError as exc:
            _raise_typed(exc, f"{service}:{operation}")
        except EndpointConnectionError:
            return

    def call(self, service: str, region: str | None, operation: str, **kwargs: Any) -> dict[str, Any]:
        """Invoke one client operation and return its response.

        Raises:
            AccessDenied / ServiceNotEnabled: via ``_raise_typed`` for recognised codes.
            ClientError: re-raised for any other error code.
            ServiceNotEnabled: also when the endpoint cannot be reached.
        """
        bound = self.client(service, region)
        try:
            method = getattr(bound, operation)
            return method(**kwargs)
        except ClientError as exc:
            _raise_typed(exc, f"{service}:{operation}")
            raise  # unreachable, keeps type-checkers happy
        except EndpointConnectionError as exc:
            # The service has no endpoint in this region — same gap as "not enabled".
            raise ServiceNotEnabled(f"{service}:{operation}", str(exc)) from exc

    def paginate_manual(
        self,
        service: str,
        region: str | None,
        operation: str,
        result_key: str,
        *,
        token_request_key: str = "NextToken",
        token_response_key: str = "NextToken",
        max_pages: int = 500,
        **kwargs: Any,
    ) -> Iterator[dict[str, Any]]:
        """Token-loop pagination for operations botocore has no paginator for
        (e.g. wafv2 ListWebACLs / detective ListInvestigations).

        At most ``max_pages`` requests are made. Errors surface exactly as from ``call``.
        """
        token: str | None = None
        remaining = max_pages
        while remaining > 0:
            remaining -= 1
            request = {**kwargs}
            if token:
                request[token_request_key] = token
            page = self.call(service, region, operation, **request)
            items = page.get(result_key) or []
            yield from items
            next_token = page.get(token_response_key)
            # Continue only on a fresh, non-empty marker with a non-empty page, so a
            # quirky implementation can never loop us forever.
            if not (next_token and next_token != token and items):
                return
            token = next_token
174
+
175
+
176
def _raise_typed(exc: ClientError, action: str) -> None:
    """Re-raise a botocore ``ClientError`` as a typed gap exception where possible.

    Args:
        exc: The error whose ``response["Error"]`` carries the code and message.
        action: Label (``"service:operation"`` at the call sites in this module) used
            as the first argument of the typed exception.

    Raises:
        AccessDenied: when the error code is in ``ACCESS_DENIED_CODES``.
        ServiceNotEnabled: when the error code is in ``NOT_ENABLED_CODES``.
        ClientError: the original ``exc`` for every other code.

    This function never returns normally.
    """
    error = exc.response.get("Error", {})
    code = error.get("Code", "")
    msg = error.get("Message", str(exc))
    # Chain explicitly so the original ClientError is recorded as the direct cause,
    # matching the ``raise ... from exc`` style used elsewhere in this module.
    if code in ACCESS_DENIED_CODES:
        raise AccessDenied(action, msg) from exc
    if code in NOT_ENABLED_CODES:
        raise ServiceNotEnabled(action, msg) from exc
    raise exc
@@ -0,0 +1 @@
1
+ """Shared acquisition transports used by multiple log collectors."""
@@ -0,0 +1,70 @@
1
+ """Bounded reader for CloudWatch Logs-delivered logs (EKS audit, Route53 Resolver to CW).
2
+
3
+ One transport, many consumers: time-windowed ``FilterLogEvents`` with an optional stream
4
+ prefix, hard record caps, and typed-gap translation so a missing/denied log group is
5
+ recorded as evidence rather than crashing the run.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from datetime import datetime
11
+ from typing import Any
12
+
13
+ from ...lib.models import GapReason
14
+ from ..client_factory import AccessDenied, ServiceNotEnabled
15
+
16
MAX_CW_RECORDS = 200_000


def collect_cw_log_events(
    cf,
    region: str,
    log_group: str,
    start: datetime,
    end: datetime,
    gaps: list[tuple[str, GapReason, str]],
    gap_name: str,
    *,
    stream_prefix: str | None = None,
    max_records: int = MAX_CW_RECORDS,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Pull events from one log group in the window; returns (events, stats).

    Args:
        cf: Client factory; only its ``paginate`` method is used here.
        region: Region passed to ``cf.paginate`` and stamped onto each event.
        log_group: Log group name to query and stamp onto each event.
        start, end: Window bounds, sent as epoch milliseconds.
        gaps: Mutated in place; one ``(gap_name, reason, detail)`` tuple is appended
            for truncation, access denial, or a missing log group.
        gap_name: First element of every gap tuple appended here.
        stream_prefix: Optional ``logStreamNamePrefix`` filter.
        max_records: Hard cap on the number of events kept.

    Returns:
        ``(events, stats)`` where stats has the keys ``log_group``, ``region``,
        ``records`` and ``truncated``.
    """
    query: dict[str, Any] = {
        "logGroupName": log_group,
        "startTime": int(start.timestamp() * 1000),
        "endTime": int(end.timestamp() * 1000),
    }
    if stream_prefix:
        query["logStreamNamePrefix"] = stream_prefix

    collected: list[dict[str, Any]] = []
    hit_cap = False
    try:
        for event in cf.paginate("logs", region, "filter_log_events", "events", **query):
            # Only an event that arrives *after* the cap is full proves truncation.
            if len(collected) >= max_records:
                hit_cap = True
                gaps.append(
                    (
                        gap_name,
                        GapReason.COLLECTOR_ERROR,
                        f"{log_group}: truncated at {max_records} records; "
                        "narrow the window (--since/--until) for full coverage.",
                    )
                )
                break
            # Stamp provenance onto the event itself (mutates the yielded dict).
            event["_ventra_region"] = region
            event["_ventra_log_group"] = log_group
            collected.append(event)
    except AccessDenied as exc:
        gaps.append((gap_name, GapReason.ACCESS_DENIED, f"{log_group}: {exc.message}"))
    except ServiceNotEnabled as exc:
        gaps.append(
            (gap_name, GapReason.NOT_PRESENT, f"{log_group}: log group not found ({exc.message})")
        )

    summary: dict[str, Any] = {
        "log_group": log_group,
        "region": region,
        "records": len(collected),
        "truncated": hit_cap,
    }
    return collected, summary
@@ -0,0 +1,130 @@
1
+ """Bounded reader for S3-delivered line-format service logs (ELB, CloudFront, S3 access).
2
+
3
+ Mirrors the CloudTrail S3 path: list day-scoped prefixes, read objects (gzip or plain),
4
+ yield records, count everything, and translate access errors into manifest gaps instead of
5
+ crashes. Collectors ship raw lines; the ingester owns versioned parsing.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import gzip
11
+ import io
12
+ from collections.abc import Callable, Iterator
13
+ from datetime import datetime, timedelta
14
+ from typing import Any
15
+
16
+ from ...lib.models import GapReason
17
+ from ..client_factory import AccessDenied
18
+
19
+ # Keep CloudShell runs bounded.
20
+ MAX_LOG_OBJECTS = 2000
21
+ MAX_RECORDS = 200_000
22
+
23
+ # (key, line) -> record dict, or None to skip (comment line / out of window).
24
+ LineToRecord = Callable[[str, str], dict[str, Any] | None]
25
+
26
+
27
def iter_days(start: datetime, end: datetime) -> Iterator[datetime]:
    """Yield midnight of each day from ``start``'s day through ``end``'s day, inclusive.

    Both bounds are floored to 00:00:00 first, so the time-of-day of either argument
    never changes which days are produced. Nothing is yielded when ``end``'s day
    precedes ``start``'s day.
    """
    midnight = {"hour": 0, "minute": 0, "second": 0, "microsecond": 0}
    cursor = start.replace(**midnight)
    final_day = end.replace(**midnight)
    step = timedelta(days=1)
    while cursor <= final_day:
        yield cursor
        cursor += step
33
+
34
+
35
def slash_day_prefixes(base: str, start: datetime, end: datetime) -> list[str]:
    """``<base>YYYY/MM/DD/`` layout (ELB, Route53 Resolver to S3).

    Returns one prefix per day produced by ``iter_days(start, end)``, in order.
    """
    prefixes: list[str] = []
    for day in iter_days(start, end):
        date_path = "/".join((f"{day.year:04d}", f"{day.month:02d}", f"{day.day:02d}"))
        prefixes.append(f"{base}{date_path}/")
    return prefixes
38
+
39
+
40
def dash_day_prefixes(base: str, start: datetime, end: datetime) -> list[str]:
    """``<base>YYYY-MM-DD`` flat layout (CloudFront, S3 server access logs).

    Returns one prefix per day produced by ``iter_days(start, end)``, in order.
    Unlike the slash layout there is no trailing separator.
    """
    prefixes: list[str] = []
    for day in iter_days(start, end):
        date_stamp = "-".join((f"{day.year:04d}", f"{day.month:02d}", f"{day.day:02d}"))
        prefixes.append(f"{base}{date_stamp}")
    return prefixes
43
+
44
+
45
def bucket_region(cf, bucket: str, default: str = "us-east-1") -> str:
    """Best-effort bucket region so cross-region log buckets still list correctly.

    Args:
        cf: Client factory; its ``call`` method is used for ``get_bucket_location``.
        bucket: Bucket name to look up.
        default: Region used for the lookup request, and returned if the lookup fails.

    Returns:
        The bucket's region name. An empty/missing ``LocationConstraint`` maps to
        ``"us-east-1"`` and the legacy value ``"EU"`` maps to ``"eu-west-1"``, because
        neither is returned by S3 as a usable region name.
    """
    try:
        loc = cf.call("s3", default, "get_bucket_location", Bucket=bucket)
        constraint = loc.get("LocationConstraint")
    except Exception:  # noqa: BLE001 - region resolution is an optimization, not a requirement
        return default
    if not constraint:
        return "us-east-1"
    if constraint == "EU":
        # Legacy alias still reported for old buckets; it is not a valid region name.
        return "eu-west-1"
    return constraint
52
+
53
+
54
def _object_lines(body: bytes, key: str) -> Iterator[str]:
    """Yield the non-blank text lines of one log object.

    Objects whose key ends in ``.gz`` are gunzipped first. Bytes are decoded as UTF-8
    with undecodable sequences replaced, so decoding itself never raises. Lines made
    only of whitespace are dropped; kept lines are yielded unstripped.
    """
    raw = body
    if key.endswith(".gz"):
        with gzip.GzipFile(fileobj=io.BytesIO(body)) as archive:
            raw = archive.read()
    decoded = raw.decode("utf-8", errors="replace")
    yield from (candidate for candidate in decoded.splitlines() if candidate.strip())
63
+
64
+
65
def collect_s3_line_records(
    cf,
    region: str,
    bucket: str,
    prefixes: list[str],
    line_to_record: LineToRecord,
    gaps: list[tuple[str, GapReason, str]],
    gap_name: str,
    *,
    max_objects: int = MAX_LOG_OBJECTS,
    max_records: int = MAX_RECORDS,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Read line logs under ``prefixes`` in ``bucket``; returns (records, stats).

    Args:
        cf: Client factory; ``client`` supplies the S3 client used for object reads
            and ``paginate`` drives the ``list_objects_v2`` listing.
        region: Region used for both the listing and the object reads.
        bucket: Bucket holding the delivered logs.
        prefixes: Key prefixes to list, processed in order.
        line_to_record: Called as ``(key, line)`` for each non-blank line; a ``None``
            result skips the line.
        gaps: Mutated in place; ``(gap_name, reason, detail)`` tuples are appended for
            unreadable objects, denied listings, and truncation.
        gap_name: First element of every gap tuple appended here.
        max_objects: Cap on listed objects (keys ending in ``/`` count toward it).
        max_records: Cap on records kept.

    Returns:
        ``(records, stats)`` where stats has the keys ``bucket``, ``objects_scanned``,
        ``objects_read``, ``records`` and ``truncated``.
    """
    stats: dict[str, Any] = {
        "bucket": bucket,
        "objects_scanned": 0,
        "objects_read": 0,
        "records": 0,
        "truncated": False,
    }
    records: list[dict[str, Any]] = []
    s3 = cf.client("s3", region)

    for prefix in prefixes:
        if stats["truncated"]:
            break
        try:
            for obj in cf.paginate(
                "s3", region, "list_objects_v2", "Contents", Bucket=bucket, Prefix=prefix
            ):
                stats["objects_scanned"] += 1
                if stats["objects_scanned"] > max_objects:
                    stats["truncated"] = True
                    break
                key = obj.get("Key", "")
                if key.endswith("/"):
                    continue
                try:
                    body = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
                    # Decode eagerly inside the guard: a corrupt or truncated .gz object
                    # must become a gap for that object, not abort the whole collection.
                    lines = list(_object_lines(body, key))
                except Exception as exc:  # noqa: BLE001 - one unreadable object is a gap, not a crash
                    gaps.append((gap_name, GapReason.COLLECTOR_ERROR, f"{bucket}/{key}: {exc}"))
                    continue
                stats["objects_read"] += 1
                for line in lines:
                    if len(records) >= max_records:
                        stats["truncated"] = True
                        break
                    rec = line_to_record(key, line)
                    if rec is not None:
                        records.append(rec)
                        stats["records"] += 1
                if stats["truncated"]:
                    break
        except AccessDenied as exc:
            gaps.append((gap_name, GapReason.ACCESS_DENIED, f"{bucket}/{prefix}: {exc.message}"))

    if stats["truncated"]:
        # A single gap entry covers both caps; stats carries the exact counts.
        gaps.append(
            (
                gap_name,
                GapReason.COLLECTOR_ERROR,
                f"{bucket}: truncated at {max_objects} objects / {max_records} records; "
                "narrow the window (--since/--until) for full coverage.",
            )
        )
    return records, stats