ventra 0.0.0.post26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- collector/__init__.py +23 -0
- collector/__main__.py +48 -0
- collector/aws/__init__.py +5 -0
- collector/aws/client_factory.py +183 -0
- collector/aws/common/__init__.py +1 -0
- collector/aws/common/cw_logs.py +70 -0
- collector/aws/common/s3_logs.py +130 -0
- collector/aws/control_plane/cloudtrail.py +641 -0
- collector/aws/control_plane/cloudtrail_s3.py +338 -0
- collector/aws/control_plane/cloudtrail_validation.py +215 -0
- collector/aws/control_plane/config.py +96 -0
- collector/aws/control_plane/log_posture.py +344 -0
- collector/aws/detections/detective.py +102 -0
- collector/aws/detections/guardduty.py +116 -0
- collector/aws/detections/inspector2.py +106 -0
- collector/aws/detections/macie.py +103 -0
- collector/aws/detections/securityhub.py +101 -0
- collector/aws/identity/account.py +71 -0
- collector/aws/identity/iam.py +386 -0
- collector/aws/identity/kms.py +74 -0
- collector/aws/identity/secrets.py +56 -0
- collector/aws/network/cloudfront.py +205 -0
- collector/aws/network/elb_alb.py +232 -0
- collector/aws/network/route53_resolver.py +246 -0
- collector/aws/network/vpc_flow.py +176 -0
- collector/aws/network/waf.py +144 -0
- collector/aws/registry.py +73 -0
- collector/aws/runner/runner.py +232 -0
- collector/aws/workloads/ec2.py +217 -0
- collector/aws/workloads/eks_audit.py +179 -0
- collector/aws/workloads/lambda_.py +75 -0
- collector/aws/workloads/s3.py +112 -0
- collector/aws/workloads/s3_access.py +197 -0
- collector/azure/__init__.py +17 -0
- collector/cli.py +664 -0
- collector/devgui.py +353 -0
- collector/gcp/__init__.py +17 -0
- collector/lib/__init__.py +1 -0
- collector/lib/base.py +168 -0
- collector/lib/chain_of_custody/__init__.py +6 -0
- collector/lib/chain_of_custody/hashing.py +21 -0
- collector/lib/chain_of_custody/signing.py +67 -0
- collector/lib/ingest.py +60 -0
- collector/lib/models.py +233 -0
- collector/lib/packaging/__init__.py +5 -0
- collector/lib/packaging/packager.py +77 -0
- collector/lib/transport/__init__.py +5 -0
- collector/lib/transport/base.py +90 -0
- collector/tools/__init__.py +1 -0
- collector/tools/verify_readonly.py +66 -0
- ventra-0.0.0.post26.dist-info/METADATA +181 -0
- ventra-0.0.0.post26.dist-info/RECORD +56 -0
- ventra-0.0.0.post26.dist-info/WHEEL +5 -0
- ventra-0.0.0.post26.dist-info/entry_points.txt +3 -0
- ventra-0.0.0.post26.dist-info/licenses/LICENSE +202 -0
- ventra-0.0.0.post26.dist-info/top_level.txt +1 -0
collector/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Ventra collector — read-only cloud forensic triage acquisition.
|
|
2
|
+
|
|
3
|
+
The collector runs in the client's cloud shell, gathers exactly the logs and artifacts
|
|
4
|
+
incident responders need, and seals them into a signed evidence package described by the
|
|
5
|
+
Ventra Evidence Package Format (EPF).
|
|
6
|
+
|
|
7
|
+
Forensic invariant: nothing in this package may call a mutating cloud API. See
|
|
8
|
+
``collector.tools.verify_readonly`` and the ``readonly-guard`` CI check.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from importlib.metadata import PackageNotFoundError
|
|
12
|
+
from importlib.metadata import version as _pkg_version
|
|
13
|
+
|
|
14
|
+
# ``__version__`` is resolved from the installed distribution's metadata, which
# setuptools-scm derives from the git tag at build/install time. It is recorded in every
# manifest as ``tool_version``, so an evidence package always shows exactly which build
# collected it (a tagged release like ``0.2.0``, or a dev build like
# ``0.2.0.dev3+g1a2b3c4`` when run from a working tree).
__version__ = "0.0.0+unknown"  # kept when running from a source tree that was never installed
try:
    __version__ = _pkg_version("ventra")
except PackageNotFoundError:
    pass

# Keep the package namespace clean: the import helpers are not part of the public API.
del PackageNotFoundError, _pkg_version
|
collector/__main__.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Allow ``python -m collector dev`` from a fresh clone before ``pip install``."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import shutil
|
|
7
|
+
import sys
|
|
8
|
+
|
|
9
|
+
# Minimum interpreter version (major, minor); compared against ``sys.version_info`` below.
_MIN_PYTHON = (3, 11)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _reexec_with_newer_python() -> None:
    """Re-run this module under Python 3.11+ when the current interpreter is too old.

    Returns immediately when the running interpreter already satisfies ``_MIN_PYTHON``.
    Otherwise each candidate command found on ``PATH`` is probed in a subprocess; the
    first one that reports a new-enough version replaces the current process via
    ``os.execv`` (which does not return). If no candidate qualifies, an error is printed
    to stderr and ``SystemExit(1)`` is raised.
    """
    if not sys.version_info < _MIN_PYTHON:
        return

    # One-liner run by each candidate: exit status 0 means "new enough".
    probe = f"import sys; raise SystemExit(0 if sys.version_info >= {_MIN_PYTHON!r} else 1)"

    for name in ("python3.12", "python3.11", "python3"):
        candidate = shutil.which(name)
        if not candidate:
            continue
        # Skip the interpreter we are already running under (it is known to be too old).
        if os.path.realpath(candidate) == os.path.realpath(sys.executable):
            continue
        import subprocess

        outcome = subprocess.run([candidate, "-c", probe], capture_output=True)
        if outcome.returncode == 0:
            os.execv(candidate, [candidate, "-m", "collector", *sys.argv[1:]])

    message = "\n".join(
        [
            "error: Ventra requires Python 3.11 or newer.",
            f" Current: {sys.executable} ({sys.version.split()[0]})",
            " Install: brew install python@3.11",
            " Then run: python3.11 -m collector dev",
        ]
    )
    print(message, file=sys.stderr)
    raise SystemExit(1)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
if __name__ == "__main__":
    # Version gate first: this either returns, replaces the process, or exits.
    _reexec_with_newer_python()

    # Imported only after the gate has passed.
    # NOTE(review): presumably because the CLI needs 3.11+ features - confirm.
    from .cli import main

    sys.exit(main())
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""AWS client management used by every collector.
|
|
2
|
+
|
|
3
|
+
Wraps boto3 so collectors don't each reinvent region handling, pagination, and the
|
|
4
|
+
all-important AccessDenied detection (an AccessDenied is a *gap*, recorded as evidence, not a
|
|
5
|
+
crash). All clients are created from a single session so credentials are resolved once.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Iterator
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import boto3
|
|
15
|
+
from botocore.config import Config
|
|
16
|
+
from botocore.exceptions import ClientError, EndpointConnectionError, NoCredentialsError
|
|
17
|
+
|
|
18
|
+
# Error codes that mean "you can't see this" rather than "something broke".
|
|
19
|
+
# Checked by ``_raise_typed``: a matching error code is re-raised as ``AccessDenied``.
ACCESS_DENIED_CODES = frozenset(
    (
        "AccessDenied",
        "AccessDeniedException",
        "UnauthorizedOperation",
        "AuthorizationError",
        "AuthFailure",
        "ForbiddenException",
    )
)
|
|
29
|
+
|
|
30
|
+
# Error codes that mean "this service/feature isn't enabled here".
|
|
31
|
+
# Checked by ``_raise_typed``: a matching error code is re-raised as ``ServiceNotEnabled``.
# NOTE(review): ``BadRequestException`` / ``InvalidInputException`` are generic codes; a
# genuinely malformed request would also be reported as "not enabled" - confirm intended.
NOT_ENABLED_CODES = frozenset(
    [
        "ResourceNotFoundException",
        "BadRequestException",
        "InvalidInputException",
        "SubscriptionRequiredException",
        "OptInRequired",
        "InvalidAccessException",  # Security Hub: account is not subscribed to the hub.
        "WAFNonexistentItemException",  # WAFv2: e.g. a Web ACL has no logging configuration.
        "AWSOrganizationsNotInUseException",  # Standalone account, not in an AWS Organization.
        "InsightNotEnabledException",  # CloudTrail trail without Insights enabled.
        "NoSuchEntity",  # IAM: e.g. no custom password policy exists on the account.
    ]
)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class AccessDenied(Exception):
    """Raised by helpers when an API returns an access-denied style error.

    Attributes:
        action: Identifier of what was attempted, as supplied by the raiser.
        message: The error text, as supplied by the raiser.

    ``str(exc)`` is ``"<action>: <message>"``.
    """

    def __init__(self, action: str, message: str) -> None:
        self.action = action
        self.message = message
        super().__init__(f"{action}: {message}")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class ServiceNotEnabled(Exception):
    """Raised by helpers when an API reports a "not enabled / not present" style error.

    Attributes:
        service: Identifier of the service or action involved, as supplied by the raiser.
        message: The error text, as supplied by the raiser.

    ``str(exc)`` is ``"<service>: <message>"``.
    """

    def __init__(self, service: str, message: str) -> None:
        self.service = service
        self.message = message
        super().__init__(f"{service}: {message}")
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass
class CallerIdentity:
    """Identity of the credentials in use, as built by ``AwsClientFactory.caller_identity``."""

    # "Account" field of the STS GetCallerIdentity response.
    account_id: str
    # "Arn" field of the STS GetCallerIdentity response.
    arn: str
    # "UserId" field of the response; empty string when the field is absent.
    user_id: str
    # Second ":"-separated field of ``arn``; "aws" when ``arn`` does not start with "arn:".
    partition: str
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class AwsClientFactory:
    """Creates per-service, per-region boto3 clients from one session.

    Clients are cached per ``(service, region)`` pair, and the helper methods translate
    selected botocore errors into ``AccessDenied`` / ``ServiceNotEnabled``.
    """

    def __init__(self, session: boto3.Session | None = None) -> None:
        """Use ``session`` when given, otherwise a default ``boto3.Session()``."""
        self._session = session or boto3.Session()
        # Shared by every client: adaptive retries (max 5 attempts) and a "ventra" UA tag.
        self._cfg = Config(retries={"max_attempts": 5, "mode": "adaptive"}, user_agent_extra="ventra")
        self._cache: dict[tuple[str, str | None], Any] = {}

    def client(self, service: str, region: str | None = None) -> Any:
        """Return the cached client for ``(service, region)``, creating it on first use."""
        cache_key = (service, region)
        if cache_key in self._cache:
            return self._cache[cache_key]
        created = self._session.client(service, region_name=region, config=self._cfg)
        self._cache[cache_key] = created
        return created

    # -- identity / region discovery -----------------------------------------------------

    def caller_identity(self) -> CallerIdentity:
        """Resolve the current credentials via STS ``get_caller_identity``.

        Raises:
            RuntimeError: when botocore reports that no credentials are available.
        """
        try:
            ident = self.client("sts").get_caller_identity()
        except NoCredentialsError as exc:  # pragma: no cover
            raise RuntimeError(
                "No AWS credentials found. Run inside CloudShell or configure a profile."
            ) from exc

        arn = ident["Arn"]
        if arn.startswith("arn:"):
            partition = arn.split(":")[1]
        else:
            partition = "aws"
        return CallerIdentity(
            account_id=ident["Account"],
            arn=arn,
            user_id=ident.get("UserId", ""),
            partition=partition,
        )

    def enabled_regions(self) -> list[str]:
        """Regions enabled for this account (opt-in regions included if active).

        Queries EC2 ``describe_regions`` through ``us-east-1``; on a ``ClientError`` the
        session's static region list for EC2 is returned instead. The result is sorted.
        """
        opt_in_filter = {"Name": "opt-in-status", "Values": ["opt-in-not-required", "opted-in"]}
        try:
            resp = self.client("ec2", "us-east-1").describe_regions(Filters=[opt_in_filter])
            names = [entry["RegionName"] for entry in resp["Regions"]]
            names.sort()
            return names
        except ClientError:
            # Fall back to the SDK's static partition list.
            return sorted(self._session.get_available_regions("ec2"))

    # -- safe call helpers ---------------------------------------------------------------

    def paginate(
        self, service: str, region: str | None, operation: str, result_key: str, **kwargs: Any
    ) -> Iterator[dict[str, Any]]:
        """Yield items across pages, translating access/enablement errors into typed gaps.

        Each page contributes ``page[result_key]`` (nothing when the key is absent).
        A ``ClientError`` is handed to ``_raise_typed``.
        """
        client = self.client(service, region)
        try:
            pages = client.get_paginator(operation).paginate(**kwargs)
            for page in pages:
                for item in page.get(result_key, []):
                    yield item
        except ClientError as exc:
            _raise_typed(exc, f"{service}:{operation}")
        except EndpointConnectionError:
            # NOTE(review): ends the iteration silently, whereas ``call`` raises
            # ``ServiceNotEnabled`` for the same error - confirm this asymmetry is intended.
            return

    def call(self, service: str, region: str | None, operation: str, **kwargs: Any) -> dict[str, Any]:
        """Invoke one client operation and return its response.

        Raises:
            AccessDenied / ServiceNotEnabled: via ``_raise_typed`` for recognised codes.
            ServiceNotEnabled: when the endpoint cannot be reached.
        """
        action = f"{service}:{operation}"
        client = self.client(service, region)
        try:
            method = getattr(client, operation)
            return method(**kwargs)
        except ClientError as exc:
            _raise_typed(exc, action)
            raise  # unreachable, keeps type-checkers happy
        except EndpointConnectionError as exc:
            # The service has no endpoint in this region - same gap as "not enabled".
            raise ServiceNotEnabled(action, str(exc)) from exc

    def paginate_manual(
        self,
        service: str,
        region: str | None,
        operation: str,
        result_key: str,
        *,
        token_request_key: str = "NextToken",
        token_response_key: str = "NextToken",
        max_pages: int = 500,
        **kwargs: Any,
    ) -> Iterator[dict[str, Any]]:
        """Token-loop pagination for operations botocore has no paginator for
        (e.g. wafv2 ListWebACLs / detective ListInvestigations).

        Pages are fetched through ``call``. Iteration ends after ``max_pages`` requests or
        as soon as a page carries no token, repeats the previous token, or has no items.
        NOTE(review): hitting ``max_pages`` ends the iteration without any signal.
        """
        cursor: str | None = None
        for _page_no in range(max_pages):
            request = dict(kwargs)
            if cursor:
                request[token_request_key] = cursor
            page = self.call(service, region, operation, **request)

            batch = page.get(result_key) or []
            yield from batch

            next_cursor = page.get(token_response_key)
            # Continue only on a fresh, non-empty marker that came with items, so a quirky
            # implementation can never loop us forever.
            has_more = next_cursor and next_cursor != cursor and batch
            if not has_more:
                return
            cursor = next_cursor
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _raise_typed(exc: ClientError, action: str) -> None:
    """Re-raise ``exc`` as a typed gap exception when its error code is recognised.

    Always raises: ``AccessDenied`` for codes in ``ACCESS_DENIED_CODES``,
    ``ServiceNotEnabled`` for codes in ``NOT_ENABLED_CODES``, otherwise ``exc`` itself.
    The exception message is the response's error message, or ``str(exc)`` when absent.
    """
    error = exc.response.get("Error", {})
    code = error.get("Code", "")
    msg = error.get("Message", str(exc))

    if code in ACCESS_DENIED_CODES:
        raise AccessDenied(action, msg)
    if code in NOT_ENABLED_CODES:
        raise ServiceNotEnabled(action, msg)
    raise exc
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Shared acquisition transports used by multiple log collectors."""
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Bounded reader for CloudWatch Logs-delivered logs (EKS audit, Route53 Resolver to CW).
|
|
2
|
+
|
|
3
|
+
One transport, many consumers: time-windowed ``FilterLogEvents`` with an optional stream
|
|
4
|
+
prefix, hard record caps, and typed-gap translation so a missing/denied log group is
|
|
5
|
+
recorded as evidence rather than crashing the run.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from ...lib.models import GapReason
|
|
14
|
+
from ..client_factory import AccessDenied, ServiceNotEnabled
|
|
15
|
+
|
|
16
|
+
# Default ``max_records`` cap for ``collect_cw_log_events`` (events kept per call).
MAX_CW_RECORDS = 200_000
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def collect_cw_log_events(
    cf,
    region: str,
    log_group: str,
    start: datetime,
    end: datetime,
    gaps: list[tuple[str, GapReason, str]],
    gap_name: str,
    *,
    stream_prefix: str | None = None,
    max_records: int = MAX_CW_RECORDS,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Pull events from one log group in the window; returns (events, stats).

    Events come from ``cf.paginate`` over ``filter_log_events`` and are tagged in place
    with ``_ventra_region`` / ``_ventra_log_group``. At most ``max_records`` events are
    kept; hitting the cap marks ``stats["truncated"]`` and appends a gap. ``AccessDenied``
    and ``ServiceNotEnabled`` are recorded in ``gaps`` instead of propagating.
    """

    def _epoch_millis(moment: datetime) -> int:
        return int(moment.timestamp() * 1000)

    request: dict[str, Any] = {
        "logGroupName": log_group,
        "startTime": _epoch_millis(start),
        "endTime": _epoch_millis(end),
    }
    if stream_prefix:
        request["logStreamNamePrefix"] = stream_prefix

    collected: list[dict[str, Any]] = []
    hit_cap = False
    try:
        for event in cf.paginate("logs", region, "filter_log_events", "events", **request):
            if len(collected) >= max_records:
                # One more event arrived after the cap was reached: record the truncation.
                hit_cap = True
                gaps.append(
                    (
                        gap_name,
                        GapReason.COLLECTOR_ERROR,
                        f"{log_group}: truncated at {max_records} records; "
                        "narrow the window (--since/--until) for full coverage.",
                    )
                )
                break
            event["_ventra_region"] = region
            event["_ventra_log_group"] = log_group
            collected.append(event)
    except AccessDenied as exc:
        gaps.append((gap_name, GapReason.ACCESS_DENIED, f"{log_group}: {exc.message}"))
    except ServiceNotEnabled as exc:
        gaps.append(
            (gap_name, GapReason.NOT_PRESENT, f"{log_group}: log group not found ({exc.message})")
        )

    summary: dict[str, Any] = {
        "log_group": log_group,
        "region": region,
        "records": len(collected),
        "truncated": hit_cap,
    }
    return collected, summary
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Bounded reader for S3-delivered line-format service logs (ELB, CloudFront, S3 access).
|
|
2
|
+
|
|
3
|
+
Mirrors the CloudTrail S3 path: list day-scoped prefixes, read objects (gzip or plain),
|
|
4
|
+
yield records, count everything, and translate access errors into manifest gaps instead of
|
|
5
|
+
crashes. Collectors ship raw lines; the ingester owns versioned parsing.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import gzip
|
|
11
|
+
import io
|
|
12
|
+
from collections.abc import Callable, Iterator
|
|
13
|
+
from datetime import datetime, timedelta
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from ...lib.models import GapReason
|
|
17
|
+
from ..client_factory import AccessDenied
|
|
18
|
+
|
|
19
|
+
# Keep CloudShell runs bounded.
|
|
20
|
+
# Default ``max_objects`` cap for ``collect_s3_line_records`` (listed objects per call).
MAX_LOG_OBJECTS = 2000
|
|
21
|
+
# Default ``max_records`` cap for ``collect_s3_line_records`` (records kept per call).
MAX_RECORDS = 200_000
|
|
22
|
+
|
|
23
|
+
# (key, line) -> record dict, or None to skip (comment line / out of window).
|
|
24
|
+
# Signature of the per-line parser callback accepted by ``collect_s3_line_records``.
LineToRecord = Callable[[str, str], dict[str, Any] | None]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def iter_days(start: datetime, end: datetime) -> Iterator[datetime]:
    """Yield midnight of every day from ``start``'s day through ``end``'s day, inclusive.

    Nothing is yielded when ``end`` falls on an earlier day than ``start``.
    """

    def _midnight(moment: datetime) -> datetime:
        return moment.replace(hour=0, minute=0, second=0, microsecond=0)

    one_day = timedelta(days=1)
    cursor = _midnight(start)
    final = _midnight(end)
    while cursor <= final:
        yield cursor
        cursor = cursor + one_day
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def slash_day_prefixes(base: str, start: datetime, end: datetime) -> list[str]:
    """``<base>YYYY/MM/DD/`` layout (ELB, Route53 Resolver to S3).

    Returns one prefix per day produced by ``iter_days(start, end)``, in order.
    """
    prefixes: list[str] = []
    for d in iter_days(start, end):
        prefixes.append(f"{base}{d.year:04d}/{d.month:02d}/{d.day:02d}/")
    return prefixes
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def dash_day_prefixes(base: str, start: datetime, end: datetime) -> list[str]:
    """``<base>YYYY-MM-DD`` flat layout (CloudFront, S3 server access logs).

    Returns one prefix per day produced by ``iter_days(start, end)``, in order.
    """
    prefixes: list[str] = []
    for d in iter_days(start, end):
        prefixes.append(f"{base}{d.year:04d}-{d.month:02d}-{d.day:02d}")
    return prefixes
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def bucket_region(cf, bucket: str, default: str = "us-east-1") -> str:
    """Best-effort bucket region so cross-region log buckets still list correctly.

    Args:
        cf: Client factory; only ``cf.call(service, region, operation, **kwargs)`` is used.
        bucket: Name of the bucket to locate.
        default: Region used for the lookup call and returned when the lookup fails.

    Returns:
        The bucket's region name. S3 reports ``us-east-1`` buckets with an empty
        ``LocationConstraint`` and may report the legacy value ``"EU"`` for buckets in
        ``eu-west-1``; both are normalised to real region names so the result can be
        used directly to build a regional client.
    """
    try:
        loc = cf.call("s3", default, "get_bucket_location", Bucket=bucket)
        constraint = loc.get("LocationConstraint")
    except Exception:  # noqa: BLE001 - region resolution is an optimization, not a requirement
        return default

    if not constraint:
        # Null/empty constraint is how S3 spells us-east-1.
        return "us-east-1"
    if constraint == "EU":
        # Legacy alias; "EU" is not a usable region name.
        return "eu-west-1"
    return constraint
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _object_lines(body: bytes, key: str) -> Iterator[str]:
    """Yield the non-blank text lines of one log object.

    ``body`` is gunzipped first when ``key`` ends with ``.gz``. Bytes are decoded as UTF-8
    with undecodable sequences replaced; lines that are empty or whitespace-only are
    skipped, and yielded lines are not stripped.
    """
    raw = body
    if key.endswith(".gz"):
        with gzip.GzipFile(fileobj=io.BytesIO(body)) as stream:
            raw = stream.read()
    decoded = raw.decode("utf-8", errors="replace")
    yield from (row for row in decoded.splitlines() if row.strip())
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def collect_s3_line_records(
    cf,
    region: str,
    bucket: str,
    prefixes: list[str],
    line_to_record: LineToRecord,
    gaps: list[tuple[str, GapReason, str]],
    gap_name: str,
    *,
    max_objects: int = MAX_LOG_OBJECTS,
    max_records: int = MAX_RECORDS,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Read line logs under ``prefixes`` in ``bucket``; returns (records, stats).

    Args:
        cf: Client factory providing ``client`` and ``paginate``.
        region: Region used for the S3 client and the listing calls.
        bucket: Bucket holding the log objects.
        prefixes: Key prefixes to list, processed in order.
        line_to_record: Called as ``line_to_record(key, line)``; ``None`` skips the line.
        gaps: Mutated in place; receives ``(gap_name, reason, detail)`` tuples.
        gap_name: Name recorded on every gap appended here.
        max_objects: Cap on listed objects across all prefixes.
        max_records: Cap on records kept.

    Returns:
        ``(records, stats)`` where ``stats`` has the keys ``bucket``, ``objects_scanned``,
        ``objects_read``, ``records`` and ``truncated``.

    An object that cannot be fetched *or decoded* (e.g. a corrupt or truncated ``.gz``)
    is recorded as a gap and skipped, so one bad object never aborts the collection.
    ``AccessDenied`` while listing a prefix is recorded as a gap for that prefix.
    """
    stats: dict[str, Any] = {
        "bucket": bucket,
        "objects_scanned": 0,
        "objects_read": 0,
        "records": 0,
        "truncated": False,
    }
    records: list[dict[str, Any]] = []
    s3 = cf.client("s3", region)

    for prefix in prefixes:
        if stats["truncated"]:
            break
        try:
            for obj in cf.paginate(
                "s3", region, "list_objects_v2", "Contents", Bucket=bucket, Prefix=prefix
            ):
                stats["objects_scanned"] += 1
                if stats["objects_scanned"] > max_objects:
                    stats["truncated"] = True
                    break
                key = obj.get("Key", "")
                if key.endswith("/"):
                    # Folder placeholder object, nothing to read.
                    continue
                try:
                    body = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
                except Exception as exc:  # noqa: BLE001 - one unreadable object is a gap, not a crash
                    gaps.append((gap_name, GapReason.COLLECTOR_ERROR, f"{bucket}/{key}: {exc}"))
                    continue
                try:
                    # Materialize here so decompression errors surface inside this guard
                    # rather than midway through the record loop below.
                    lines = list(_object_lines(body, key))
                except Exception as exc:  # noqa: BLE001 - an undecodable object is a gap, not a crash
                    gaps.append(
                        (
                            gap_name,
                            GapReason.COLLECTOR_ERROR,
                            f"{bucket}/{key}: could not decode object ({exc})",
                        )
                    )
                    continue
                stats["objects_read"] += 1
                for line in lines:
                    if len(records) >= max_records:
                        stats["truncated"] = True
                        break
                    rec = line_to_record(key, line)
                    if rec is not None:
                        records.append(rec)
                        stats["records"] += 1
                if stats["truncated"]:
                    break
        except AccessDenied as exc:
            gaps.append((gap_name, GapReason.ACCESS_DENIED, f"{bucket}/{prefix}: {exc.message}"))

    if stats["truncated"]:
        gaps.append(
            (
                gap_name,
                GapReason.COLLECTOR_ERROR,
                f"{bucket}: truncated at {max_objects} objects / {max_records} records; "
                "narrow the window (--since/--until) for full coverage.",
            )
        )
    return records, stats
|