loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,499 @@
|
|
|
1
|
+
"""CloudTrail S3 exporter — pulls gzipped JSON event objects from S3 to local NDJSON.
|
|
2
|
+
|
|
3
|
+
Invoked via: loghunter export cloudtrail
|
|
4
|
+
|
|
5
|
+
CloudTrail writes objects under a rigid layout:
|
|
6
|
+
<prefix>/AWSLogs/<account-id>/CloudTrail/<region>/YYYY/MM/DD/<file>.json.gz
|
|
7
|
+
each containing {"Records": [ ...events... ]}. A sibling CloudTrail-Digest/ prefix
|
|
8
|
+
holds integrity manifests (not events) and is skipped.
|
|
9
|
+
|
|
10
|
+
AWS authentication is outside this tool: the user authenticates their shell
|
|
11
|
+
(aws login / SSO / env vars / instance role) before running loghunter, and boto3
|
|
12
|
+
resolves the ambient credential chain. We never read, store, or prompt for
|
|
13
|
+
AWS credentials.
|
|
14
|
+
|
|
15
|
+
Date-root discovery keys off the 4-digit-year prefix invariant, NOT a fixed
|
|
16
|
+
segment count — resilient to org-id segments and to the user pointing at any
|
|
17
|
+
level at or above the region.
|
|
18
|
+
|
|
19
|
+
The pull is two-phase: list-only (free) to estimate object bytes and prompt
|
|
20
|
+
above ``egress_warn_gb`` if needed, then download/parse on confirmation. The
|
|
21
|
+
prompt is suppressed when ``skip_confirm=True``.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import contextlib
|
|
27
|
+
import gzip
|
|
28
|
+
import io
|
|
29
|
+
import json
|
|
30
|
+
import re
|
|
31
|
+
import sys
|
|
32
|
+
from datetime import datetime, timedelta, timezone
|
|
33
|
+
from pathlib import Path
|
|
34
|
+
from typing import Any
|
|
35
|
+
|
|
36
|
+
from tqdm import tqdm
|
|
37
|
+
|
|
38
|
+
try:
|
|
39
|
+
import boto3
|
|
40
|
+
import botocore.exceptions as botocore_exc
|
|
41
|
+
except ImportError:
|
|
42
|
+
boto3 = None # type: ignore[assignment]
|
|
43
|
+
botocore_exc = None # type: ignore[assignment]
|
|
44
|
+
|
|
45
|
+
from loghunter.common.display import liveness
|
|
46
|
+
from loghunter.common.errors import ExportAborted
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
_YEAR_RE = re.compile(r"^\d{4}/$")
|
|
50
|
+
_ACCOUNT_ID_RE = re.compile(r"^\d+/$")
|
|
51
|
+
_ORG_ID_RE = re.compile(r"^o-[a-z0-9]+/$")
|
|
52
|
+
_DIGEST_SEGMENT = "CloudTrail-Digest/"
|
|
53
|
+
_EVENT_SEGMENT = "CloudTrail/"
|
|
54
|
+
_KNOWN_ANCESTOR_SEGMENTS = frozenset({"AWSLogs/", _EVENT_SEGMENT})
|
|
55
|
+
_AUTH_ERROR_CODES = {
|
|
56
|
+
"AccessDenied",
|
|
57
|
+
"ExpiredToken",
|
|
58
|
+
"InvalidToken",
|
|
59
|
+
"InvalidAccessKeyId",
|
|
60
|
+
"SignatureDoesNotMatch",
|
|
61
|
+
"RequestExpired",
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
# Size threshold for splitting NDJSON output into _partNN files.
|
|
65
|
+
# Exposed at module scope so tests can monkeypatch a tiny value.
|
|
66
|
+
_PART_SPLIT_BYTES = 2_000_000_000 # 2 GB
|
|
67
|
+
|
|
68
|
+
OUTPUT_EXTENSION = ".json.log"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def is_configured(backend_cfg: dict[str, Any]) -> bool:
    """Report whether ``[export.cloudtrail].path`` holds a non-blank value.

    A missing key and a whitespace-only string both count as unconfigured.
    """
    configured_path = backend_cfg.get("path", "")
    return len(configured_path.strip()) > 0
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def summary_descriptor(backend_cfg: dict[str, Any]) -> str:
    """Return the configured path verbatim for the summary's `Backend :` line.

    Yields the empty string when no ``path`` key is present; the value is not
    stripped or otherwise normalized.
    """
    descriptor = backend_cfg.get("path", "")
    return descriptor
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def implicit_default_query() -> dict[str, Any]:
    """Build the synthetic query used when CloudTrail runs without named queries.

    CloudTrail has no per-query SPL, so the only thing this query carries is
    an explicit ``output_basename``. Without it the output path would be
    derived from the query name ("default") and files would be called
    default_20260601_7d.json.log instead of cloudtrail_20260601_7d.json.log.
    """
    synthetic: dict[str, Any] = {}
    synthetic["output_basename"] = "cloudtrail"
    return synthetic
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _auth_error_message() -> str:
|
|
93
|
+
return (
|
|
94
|
+
"AWS credentials not found or expired — authenticate your shell "
|
|
95
|
+
"(e.g. your aws login) and try again"
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@contextlib.contextmanager
def _translate_boto_errors():
    """Re-raise botocore failures from the wrapped block as ValueError.

    One mapping table for every boto call site:

    * missing or partial credentials       -> the shared auth message
    * MissingDependencyException           -> hint to install botocore[crt]
    * ClientError with an auth error code  -> the shared auth message
    * any other ClientError / BotoCoreError
                                           -> "AWS error during CloudTrail export: <detail>"

    The specific BotoCoreError subclasses are listed before the BotoCoreError
    sweep so they win; ClientError is handled in its own clause. Anything that
    is not a botocore exception is deliberately left to propagate unchanged.
    The original exception is always chained as ``__cause__``.
    """
    try:
        yield
    except (
        botocore_exc.NoCredentialsError,
        botocore_exc.PartialCredentialsError,
    ) as err:
        raise ValueError(_auth_error_message()) from err
    except botocore_exc.MissingDependencyException as err:
        hint = (
            "AWS credential provider needs an extra dependency — run: "
            "pip install 'botocore[crt]' (your AWS profile likely uses "
            f"SSO/login-based credentials). botocore detail: {err}"
        )
        raise ValueError(hint) from err
    except botocore_exc.ClientError as err:
        error_section = err.response.get("Error", {})
        if error_section.get("Code", "") in _AUTH_ERROR_CODES:
            message = _auth_error_message()
        else:
            message = f"AWS error during CloudTrail export: {err}"
        raise ValueError(message) from err
    except botocore_exc.BotoCoreError as err:
        raise ValueError(f"AWS error during CloudTrail export: {err}") from err
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _parse_s3_path(s3_path: str) -> tuple[str, str]:
|
|
139
|
+
"""Split s3://bucket/key/prefix/ into (bucket, prefix). Prefix ends with /."""
|
|
140
|
+
if not s3_path.startswith("s3://"):
|
|
141
|
+
raise ValueError(
|
|
142
|
+
f"CloudTrail path must start with s3:// — got: {s3_path}"
|
|
143
|
+
)
|
|
144
|
+
rest = s3_path[5:]
|
|
145
|
+
if "/" in rest:
|
|
146
|
+
bucket, prefix = rest.split("/", 1)
|
|
147
|
+
else:
|
|
148
|
+
bucket, prefix = rest, ""
|
|
149
|
+
if not bucket:
|
|
150
|
+
raise ValueError(f"CloudTrail path is missing a bucket name: {s3_path}")
|
|
151
|
+
if prefix and not prefix.endswith("/"):
|
|
152
|
+
prefix += "/"
|
|
153
|
+
return bucket, prefix
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _has_cloudtrail_segment(prefix: str) -> bool:
    """Tell whether 'CloudTrail/' occurs in ``prefix`` as a complete segment.

    Both sides are given a leading '/' before the substring test, so a prefix
    that begins with 'CloudTrail/' matches while a segment that merely ends
    in those characters does not. 'CloudTrail-Digest/' does not match either,
    because the character after 'CloudTrail' is '-' rather than '/'.
    """
    needle = "/" + _EVENT_SEGMENT
    haystack = "/" + prefix
    return needle in haystack
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _is_cloudtrail_ancestor_segment(segment: str) -> bool:
    """Decide whether ``segment`` (trailing-slash form) may sit above CloudTrail/.

    Accepted: the literal segments in ``_KNOWN_ANCESTOR_SEGMENTS``, an
    all-digit account-id segment, or an ``o-...`` organization-id segment.

    The date-root walker consults this only while it is still outside a
    CloudTrail/ subtree. Rejecting everything else keeps the walk out of
    co-located service trees (elasticloadbalancing, RDS, vpc-flow-logs, ...),
    which saves S3 list calls and stops an AccessDenied in an unrelated branch
    from aborting an otherwise readable pull.

    Non-standard layouts can be handled by pointing [export.cloudtrail].path
    at or below CloudTrail/, where the walker descends every child.
    """
    if segment in _KNOWN_ANCESTOR_SEGMENTS:
        return True
    return any(
        pattern.match(segment) is not None
        for pattern in (_ACCOUNT_ID_RE, _ORG_ID_RE)
    )
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _list_common_prefixes(client, bucket: str, prefix: str) -> list[str]:
    """List the immediate child "directories" of ``prefix`` in ``bucket``.

    Issues a paginated ``list_objects_v2`` with ``Delimiter="/"`` and collects
    the ``Prefix`` of every ``CommonPrefixes`` entry across all pages. Botocore
    failures are translated to ValueError.
    """
    children: list[str] = []
    with _translate_boto_errors():
        pages = client.get_paginator("list_objects_v2").paginate(
            Bucket=bucket, Prefix=prefix, Delimiter="/"
        )
        for page in pages:
            # The key may be absent or None on a page with no sub-prefixes.
            entries = page.get("CommonPrefixes", []) or []
            children.extend(entry["Prefix"] for entry in entries)
    return children
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _find_date_roots(client, bucket: str, base_prefix: str) -> list[str]:
    """Breadth-first walk from ``base_prefix`` to the prefixes whose children are YYYY/.

    A prefix with year-shaped children is returned only when its own path
    contains the 'CloudTrail/' segment, so sibling service trees that share the
    YYYY/MM/DD layout are never picked up. The walk stops at any prefix with
    year-shaped children, accepted or not. CloudTrail-Digest/ is never entered.
    """
    date_roots: list[str] = []
    pending: list[str] = [base_prefix]
    seen: set[str] = set()
    cursor = 0  # index of the next prefix to visit; keeps FIFO order

    while cursor < len(pending):
        here = pending[cursor]
        cursor += 1
        if here in seen:
            continue
        seen.add(here)

        children = _list_common_prefixes(client, bucket, here)
        in_event_tree = _has_cloudtrail_segment(here)
        offset = len(here)  # children are full keys; strip the parent part

        # Year-shaped children mark a date-root candidate. It only counts when
        # the path runs through CloudTrail/; either way, stop descending here.
        if any(_YEAR_RE.match(child[offset:]) for child in children):
            if in_event_tree:
                date_roots.append(here)
            continue

        for child in children:
            segment = child[offset:]
            if segment == _DIGEST_SEGMENT:
                continue
            # Inside CloudTrail/ every child is followed; outside it only the
            # known structural ancestors are (skips ELB/RDS/etc. trees).
            if in_event_tree or _is_cloudtrail_ancestor_segment(segment):
                pending.append(child)

    return date_roots
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _list_objects_for_day(client, bucket: str, day_prefix: str) -> list[dict[str, Any]]:
    """Collect every ``.json.gz`` object whose key starts with ``day_prefix``.

    No delimiter is passed, so keys nested deeper under the day are included.
    Each element is the raw ``Contents`` entry returned by ``list_objects_v2``.
    Botocore failures are translated to ValueError.
    """
    matches: list[dict[str, Any]] = []
    with _translate_boto_errors():
        pages = client.get_paginator("list_objects_v2").paginate(
            Bucket=bucket, Prefix=day_prefix
        )
        for page in pages:
            # The key may be absent or None on an empty page.
            listing = page.get("Contents", []) or []
            matches.extend(
                entry for entry in listing if entry["Key"].endswith(".json.gz")
            )
    return matches
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _enumerate_days(since: datetime, until: datetime) -> list[tuple[int, int, int]]:
    """List the UTC calendar days touched by the half-open window [since, until).

    CloudTrail keys its S3 day prefixes in UTC, so both bounds are converted to
    UTC before their dates are taken. Example: a UTC-5 window 2026-06-01 00:00
    -> 2026-06-02 00:00 is 05:00 UTC -> 05:00 UTC the next day and therefore
    yields both 2026/06/01 and 2026/06/02. Precise trimming to the window is
    left to the per-event filter downstream.

    Returns:
        ``(year, month, day)`` tuples in ascending order; ``[]`` when
        ``until <= since``.
    """
    if until <= since:
        return []

    first_day = _to_utc(since).date()
    # Step back one microsecond so an ``until`` that falls exactly on midnight
    # does not pull in the following day (upper bound is exclusive).
    final_day = (_to_utc(until) - timedelta(microseconds=1)).date()

    day_count = (final_day - first_day).days + 1
    calendar = (first_day + timedelta(days=step) for step in range(day_count))
    return [(d.year, d.month, d.day) for d in calendar]
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def _parse_event_time(s: str) -> datetime | None:
|
|
288
|
+
"""Best-effort parse of CloudTrail eventTime. Returns None on failure."""
|
|
289
|
+
if not s:
|
|
290
|
+
return None
|
|
291
|
+
try:
|
|
292
|
+
return datetime.fromisoformat(s.replace("Z", "+00:00"))
|
|
293
|
+
except (ValueError, TypeError):
|
|
294
|
+
return None
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def _to_utc(dt: datetime) -> datetime:
|
|
298
|
+
"""Treat naive datetimes as local and convert to UTC for comparison."""
|
|
299
|
+
if dt.tzinfo is None:
|
|
300
|
+
return dt.astimezone(timezone.utc)
|
|
301
|
+
return dt.astimezone(timezone.utc)
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def _split_name(base: Path, part_num: int) -> Path:
|
|
305
|
+
"""Insert _part{NN} before all of base's suffixes.
|
|
306
|
+
|
|
307
|
+
cloudtrail_20260601_7d.json.log + 1 -> cloudtrail_20260601_7d_part01.json.log
|
|
308
|
+
"""
|
|
309
|
+
name = base.name
|
|
310
|
+
suffixes = "".join(base.suffixes)
|
|
311
|
+
if suffixes:
|
|
312
|
+
stem_full = name[: -len(suffixes)]
|
|
313
|
+
else:
|
|
314
|
+
stem_full = name
|
|
315
|
+
return base.with_name(f"{stem_full}_part{part_num:02d}{suffixes}")
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def fetch(
    query_config: dict[str, Any],
    cloudtrail_config: dict[str, Any],
    since: datetime,
    until: datetime,
    verbose: bool,
    *,
    skip_confirm: bool = False,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Pull CloudTrail events from S3 for the given window.

    Runs in two phases: a list-only pass that sizes the pull (and prompts when
    it exceeds ``egress_warn_gb``), then a download/parse pass. Objects that
    cannot be decompressed or decoded are reported on stderr and skipped;
    AWS errors are translated to ValueError and abort the pull.

    Args:
        query_config: Unused (CloudTrail has no per-query SPL).
        cloudtrail_config: [export.cloudtrail] section (path, egress_warn_gb).
        since: Start of window (inclusive).
        until: End of window (exclusive).
        verbose: If True, print discovery details.
        skip_confirm: Bypass the egress-cost prompt.

    Returns:
        (events, fetch_meta) where events are sorted by eventTime ascending and
        trimmed to [since, until), and
        fetch_meta = {"units": N, "unit_label": "objects"}.

    Raises:
        ValueError: bad path, no objects, AWS credential/access errors, missing boto3.
        ExportAborted: operator declined the egress-cost prompt.
    """
    # Function-scope import: zlib is needed only to name zlib.error below.
    import zlib

    if boto3 is None:
        # The distribution is published as "loghunter-cli".
        raise ValueError(
            "boto3 not installed — run: pip install 'loghunter-cli[cloudtrail]'"
        )

    path = cloudtrail_config.get("path", "").strip()
    if not path:
        raise ValueError(
            "[export.cloudtrail].path is empty — set it to an s3:// URL (see config_example.toml)"
        )
    bucket, base_prefix = _parse_s3_path(path)
    egress_warn_gb = float(cloudtrail_config.get("egress_warn_gb", 5.0))

    with _translate_boto_errors():
        client = boto3.client("s3")

    # Phase 1: list-only. The liveness block spans date-root discovery (the
    # slow prefix/delimiter walk), day enumeration and the per-day listing, so
    # the spinner starts as soon as discovery begins. On failure the block
    # exits by exception and no seal is written.
    objects: list[tuple[str, dict[str, Any]]] = []  # (bucket, obj_dict)
    with liveness("listing CloudTrail objects") as ln:
        date_roots = _find_date_roots(client, bucket, base_prefix)
        if verbose:
            print(f"loghunter cloudtrail: discovered {len(date_roots)} date root(s)", flush=True)
        days = _enumerate_days(since, until)
        for root in date_roots:
            for (y, m, d) in days:
                day_prefix = f"{root}{y:04d}/{m:02d}/{d:02d}/"
                for obj in _list_objects_for_day(client, bucket, day_prefix):
                    objects.append((bucket, obj))
        total_bytes = sum(int(o["Size"]) for _, o in objects)
        ln.seal(f"listed {len(objects)} objects ({total_bytes / 1e9:.1f} GB)")

    if not objects:
        window_str = (
            f"{since.strftime('%Y-%m-%d %H:%M')} → {until.strftime('%Y-%m-%d %H:%M')}"
        )
        raise ValueError(
            f"no CloudTrail objects found under {path} for {window_str} — "
            f"check the S3 path and date range"
        )

    # Egress guard
    if total_bytes > egress_warn_gb * 1e9 and not skip_confirm:
        prompt = (
            f"This pull will transfer ~{total_bytes / 1e9:.1f} GB from S3, "
            f"which may incur AWS egress charges. Continue? [y/N] "
        )
        try:
            answer = input(prompt).strip().lower()
        except EOFError:
            # No stdin (non-interactive run): treat as "no".
            answer = ""
        if answer not in {"y", "yes"}:
            raise ExportAborted("loghunter export: aborted by user")

    # Phase 2: fetch + parse, skip corrupt, propagate auth.
    # leave=True + a plain bar_format keeps the final count on screen.
    events: list[Any] = []
    for bkt, obj in tqdm(
        objects,
        desc="fetching",
        unit="obj",
        leave=True,
        bar_format="{desc}: {n_fmt} objects [{elapsed}]",
    ):
        key = obj["Key"]
        with _translate_boto_errors():
            body = client.get_object(Bucket=bkt, Key=key)["Body"].read()
        try:
            with gzip.GzipFile(fileobj=io.BytesIO(body)) as gz:
                envelope = json.load(gz)
        except (
            gzip.BadGzipFile,
            json.JSONDecodeError,
            OSError,
            UnicodeDecodeError,
            EOFError,    # gzip stream truncated before its end-of-stream marker
            zlib.error,  # corrupt deflate data inside a valid gzip header
        ) as exc:
            print(f"skipped unreadable object: {key} ({exc})", file=sys.stderr)
            continue

        # Valid JSON of the wrong shape is also "unreadable" for our purposes;
        # skip it instead of crashing on .get()/.extend().
        records = envelope.get("Records", []) or [] if isinstance(envelope, dict) else None
        if not isinstance(records, list):
            print(
                f"skipped unreadable object: {key} (unexpected JSON structure)",
                file=sys.stderr,
            )
            continue
        events.extend(records)

    # Sort + trim — one logical "order and window" operation. delay=0.25 so
    # trivially small exports do not flicker.
    with liveness("ordering and windowing events", delay=0.25) as ln:
        since_utc = _to_utc(since)
        until_utc = _to_utc(until)

        # Parse each eventTime once, keep only in-window events, then sort the
        # survivors. Filtering before a stable sort yields the same order as
        # sorting everything first, because dropped events never appear in the
        # output.
        in_window: list[tuple[datetime, dict[str, Any]]] = []
        for e in events:
            if not isinstance(e, dict):
                continue  # not an event record
            raw_time = e.get("eventTime", "")
            et = _parse_event_time(raw_time) if isinstance(raw_time, str) else None
            if et is None:
                continue  # drop events with no parseable timestamp
            if et.tzinfo is None:
                # Offset-less timestamp: read as UTC so the comparison with the
                # aware window bounds cannot raise TypeError.
                et = et.replace(tzinfo=timezone.utc)
            if since_utc <= et < until_utc:
                in_window.append((et, e))

        # Key on the timestamp only so ties never fall through to comparing dicts.
        in_window.sort(key=lambda pair: pair[0])
        trimmed: list[dict[str, Any]] = [e for _, e in in_window]
        ln.seal(f"sorted and trimmed to {len(trimmed)} events in window")

    return trimmed, {"units": len(objects), "unit_label": "objects"}
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def write(
    events: list[dict[str, Any]], outpath: Path, verbose: bool,
) -> tuple[int, dict[str, Any]]:
    """Write events as NDJSON, splitting at ~2 GB into _partNN files when needed.

    Naming: ``outpath`` is used as-is for the first (and only) file when output
    fits under the size limit. On first overflow the existing file is closed and
    renamed to its _part01 form, then writing continues into _part02, etc. A
    single line larger than the limit is still written whole to its own part.

    Args:
        events: Event dicts to serialize, one JSON document per line. Values
            ``json`` cannot encode are stringified via ``default=str``.
        outpath: Destination file; parent directories are created if missing.
        verbose: Accepted for signature parity; not read by this function.

    Returns:
        ``(line_count, write_meta)`` where ``write_meta`` carries
        ``{"bytes": int, "paths": list[Path]}``. ``paths`` lists every part
        actually produced — single-element when no split occurred,
        ``[_part01, _part02, …]`` after the first overflow. The caller uses
        ``len(paths) > 1`` to detect a split and reports ``+K more`` where
        ``K = len(paths) - 1``.
    """
    outpath.parent.mkdir(parents=True, exist_ok=True)

    current_path = outpath
    # newline="\n" disables platform newline translation, so the bytes on disk
    # equal the bytes counted below (and compared against the split limit).
    current_handle = current_path.open("w", encoding="utf-8", newline="\n")
    current_bytes = 0
    part_num = 0  # 0 means "no split yet"; first split renames current to _part01.
    total_lines = 0
    total_bytes = 0
    paths: list[Path] = [current_path]

    try:
        for ev in events:
            line = json.dumps(ev, default=str) + "\n"
            line_bytes = len(line.encode("utf-8"))

            # Never split an empty file: an oversized first line stays put.
            if current_bytes > 0 and current_bytes + line_bytes > _PART_SPLIT_BYTES:
                current_handle.close()
                if part_num == 0:
                    # First split: the bare-named file becomes _part01. Update
                    # the paths list in place — it is the same physical file.
                    # replace() overwrites a stale _part01 from an earlier run
                    # on every platform; rename() raises FileExistsError on
                    # Windows in that situation.
                    renamed = _split_name(outpath, 1)
                    current_path.replace(renamed)
                    paths[0] = renamed
                    part_num = 1
                next_part = part_num + 1
                current_path = _split_name(outpath, next_part)
                current_handle = current_path.open("w", encoding="utf-8", newline="\n")
                paths.append(current_path)
                current_bytes = 0
                part_num = next_part

            current_handle.write(line)
            current_bytes += line_bytes
            total_bytes += line_bytes
            total_lines += 1
    finally:
        # close() is idempotent, so this is safe even when the failure happened
        # between closing one part and opening the next.
        current_handle.close()

    return total_lines, {"bytes": total_bytes, "paths": paths}
|