loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,499 @@
1
+ """CloudTrail S3 exporter — pulls gzipped JSON event objects from S3 to local NDJSON.
2
+
3
+ Invoked via: loghunter export cloudtrail
4
+
5
+ CloudTrail writes objects under a rigid layout:
6
+ <prefix>/AWSLogs/<account-id>/CloudTrail/<region>/YYYY/MM/DD/<file>.json.gz
7
+ each containing {"Records": [ ...events... ]}. A sibling CloudTrail-Digest/ prefix
8
+ holds integrity manifests (not events) and is skipped.
9
+
10
+ AWS authentication is outside this tool: the user authenticates their shell
11
+ (aws login / SSO / env vars / instance role) before running loghunter, and boto3
12
+ resolves the ambient credential chain. We never read, store, or prompt for
13
+ AWS credentials.
14
+
15
+ Date-root discovery keys off the 4-digit-year prefix invariant, NOT a fixed
16
+ segment count — resilient to org-id segments and to the user pointing at any
17
+ level at or above the region.
18
+
19
+ The pull is two-phase: list-only (free) to estimate object bytes and prompt
20
+ above ``egress_warn_gb`` if needed, then download/parse on confirmation. The
21
+ prompt is suppressed when ``skip_confirm=True``.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import contextlib
27
+ import gzip
28
+ import io
29
+ import json
30
+ import re
31
+ import sys
32
+ from datetime import datetime, timedelta, timezone
33
+ from pathlib import Path
34
+ from typing import Any
35
+
36
+ from tqdm import tqdm
37
+
38
+ try:
39
+ import boto3
40
+ import botocore.exceptions as botocore_exc
41
+ except ImportError:
42
+ boto3 = None # type: ignore[assignment]
43
+ botocore_exc = None # type: ignore[assignment]
44
+
45
+ from loghunter.common.display import liveness
46
+ from loghunter.common.errors import ExportAborted
47
+
48
+
49
+ _YEAR_RE = re.compile(r"^\d{4}/$")
50
+ _ACCOUNT_ID_RE = re.compile(r"^\d+/$")
51
+ _ORG_ID_RE = re.compile(r"^o-[a-z0-9]+/$")
52
+ _DIGEST_SEGMENT = "CloudTrail-Digest/"
53
+ _EVENT_SEGMENT = "CloudTrail/"
54
+ _KNOWN_ANCESTOR_SEGMENTS = frozenset({"AWSLogs/", _EVENT_SEGMENT})
55
+ _AUTH_ERROR_CODES = {
56
+ "AccessDenied",
57
+ "ExpiredToken",
58
+ "InvalidToken",
59
+ "InvalidAccessKeyId",
60
+ "SignatureDoesNotMatch",
61
+ "RequestExpired",
62
+ }
63
+
64
+ # Size threshold for splitting NDJSON output into _partNN files.
65
+ # Exposed at module scope so tests can monkeypatch a tiny value.
66
+ _PART_SPLIT_BYTES = 2_000_000_000 # 2 GB
67
+
68
+ OUTPUT_EXTENSION = ".json.log"
69
+
70
+
71
def is_configured(backend_cfg: dict[str, Any]) -> bool:
    """Report whether [export.cloudtrail].path holds a non-blank value.

    A missing key and a whitespace-only string both count as "not configured".
    """
    configured_path = backend_cfg.get("path", "")
    return configured_path.strip() != ""
74
+
75
+
76
def summary_descriptor(backend_cfg: dict[str, Any]) -> str:
    """Return the configured path for the final summary's `Backend :` line.

    Yields the raw ``path`` value (e.g. s3://bucket/AWSLogs/), or an empty
    string when the key is absent.
    """
    if "path" in backend_cfg:
        return backend_cfg["path"]
    return ""
79
+
80
+
81
def implicit_default_query() -> dict[str, Any]:
    """Build the synthetic query used because CloudTrail has no per-query SPL.

    The query exists only to carry an explicit ``output_basename``. With an
    empty dict, _resolve_output_path would fall back to the query name
    ("default") and produce default_20260601_7d.json.log; the explicit
    basename yields cloudtrail_20260601_7d.json.log instead.
    """
    synthetic_query: dict[str, Any] = dict(output_basename="cloudtrail")
    return synthetic_query
90
+
91
+
92
+ def _auth_error_message() -> str:
93
+ return (
94
+ "AWS credentials not found or expired — authenticate your shell "
95
+ "(e.g. your aws login) and try again"
96
+ )
97
+
98
+
99
@contextlib.contextmanager
def _translate_boto_errors():
    """Re-raise botocore failures from the wrapped block as ValueError.

    Every boto call site wraps itself in this context manager so the mapping
    lives in one place:

    * missing or partial credentials -> the shared auth message;
    * MissingDependencyException -> a hint to install ``botocore[crt]``;
    * ClientError with a code in ``_AUTH_ERROR_CODES`` -> the shared auth
      message; any other ClientError -> a generic "AWS error" message;
    * any remaining BotoCoreError -> the same generic message.

    The handlers are ordered most-specific first so the BotoCoreError sweep
    only sees what the earlier clauses did not claim. ClientError is handled
    in its own clause because it is not part of the BotoCoreError hierarchy.

    Anything that is not a botocore exception is deliberately left alone so
    programmer bugs and OS errors surface unmasked.
    """
    try:
        yield
    except (
        botocore_exc.NoCredentialsError,
        botocore_exc.PartialCredentialsError,
    ) as exc:
        raise ValueError(_auth_error_message()) from exc
    except botocore_exc.MissingDependencyException as exc:
        install_hint = (
            "AWS credential provider needs an extra dependency — run: "
            "pip install 'botocore[crt]' (your AWS profile likely uses "
            f"SSO/login-based credentials). botocore detail: {exc}"
        )
        raise ValueError(install_hint) from exc
    except botocore_exc.ClientError as exc:
        error_code = exc.response.get("Error", {}).get("Code", "")
        if error_code not in _AUTH_ERROR_CODES:
            raise ValueError(f"AWS error during CloudTrail export: {exc}") from exc
        raise ValueError(_auth_error_message()) from exc
    except botocore_exc.BotoCoreError as exc:
        raise ValueError(f"AWS error during CloudTrail export: {exc}") from exc
136
+
137
+
138
+ def _parse_s3_path(s3_path: str) -> tuple[str, str]:
139
+ """Split s3://bucket/key/prefix/ into (bucket, prefix). Prefix ends with /."""
140
+ if not s3_path.startswith("s3://"):
141
+ raise ValueError(
142
+ f"CloudTrail path must start with s3:// — got: {s3_path}"
143
+ )
144
+ rest = s3_path[5:]
145
+ if "/" in rest:
146
+ bucket, prefix = rest.split("/", 1)
147
+ else:
148
+ bucket, prefix = rest, ""
149
+ if not bucket:
150
+ raise ValueError(f"CloudTrail path is missing a bucket name: {s3_path}")
151
+ if prefix and not prefix.endswith("/"):
152
+ prefix += "/"
153
+ return bucket, prefix
154
+
155
+
156
def _has_cloudtrail_segment(prefix: str) -> bool:
    """Tell whether 'CloudTrail/' appears in ``prefix`` as a complete path segment.

    Both sides are anchored with a leading '/', so a prefix that begins with
    'CloudTrail/' matches while a longer segment that merely ends in
    'CloudTrail/' does not. 'CloudTrail-Digest/' never matches because the
    character after 'CloudTrail' is '-' rather than '/'.
    """
    anchored_segment = f"/{_EVENT_SEGMENT}"
    anchored_prefix = f"/{prefix}"
    return anchored_segment in anchored_prefix
164
+
165
+
166
def _is_cloudtrail_ancestor_segment(segment: str) -> bool:
    """Decide whether ``segment`` (trailing-slash form) may sit above CloudTrail/.

    A segment qualifies when it is one of the fixed structural names
    (``AWSLogs/``, ``CloudTrail/``), an all-digit account id, or an
    organization id of the form ``o-xxxx/``.

    The date-root walker applies this filter only while it is still outside a
    CloudTrail/ subtree. Rejecting co-located service segments
    (elasticloadbalancing, RDS, vpc-flow-logs, ...) means the walk never lists
    inside them, which saves S3 calls and keeps an AccessDenied in an unrelated
    branch from aborting an otherwise readable CloudTrail pull.

    Non-standard layouts can bypass the filter by pointing
    [export.cloudtrail].path at or below CloudTrail/, where the walker
    descends every child.
    """
    if segment in _KNOWN_ANCESTOR_SEGMENTS:
        return True
    id_patterns = (_ACCOUNT_ID_RE, _ORG_ID_RE)
    return any(pattern.match(segment) for pattern in id_patterns)
188
+
189
+
190
def _list_common_prefixes(client, bucket: str, prefix: str) -> list[str]:
    """Collect the child "directories" exactly one level below ``prefix``.

    Uses a delimiter listing, so S3 rolls everything deeper up into
    ``CommonPrefixes``; every page of the listing is consumed. botocore errors
    are re-raised as ValueError by ``_translate_boto_errors``.
    """
    children: list[str] = []
    with _translate_boto_errors():
        pages = client.get_paginator("list_objects_v2").paginate(
            Bucket=bucket, Prefix=prefix, Delimiter="/"
        )
        for page in pages:
            # The key may be absent or None on a page with no sub-prefixes.
            children.extend(
                entry["Prefix"] for entry in page.get("CommonPrefixes") or []
            )
    return children
199
+
200
+
201
def _find_date_roots(client, bucket: str, base_prefix: str) -> list[str]:
    """Walk down from ``base_prefix`` and return every CloudTrail date root.

    A date root is a prefix whose immediate children look like ``YYYY/``
    segments. It is accepted only when its own path contains the
    'CloudTrail/' event segment, which prevents accidental discovery of
    sibling AWS service trees that share the YYYY/MM/DD layout.

    The walk is breadth-first. CloudTrail-Digest/ branches are always skipped.
    While still outside a CloudTrail/ subtree only known structural ancestors
    are descended; inside one, every child is.

    Args:
        client: S3 client used for the delimiter listings.
        bucket: Bucket name.
        base_prefix: Starting prefix ('' or ending with '/').

    Returns:
        Accepted date-root prefixes in discovery order.

    Raises:
        ValueError: a listing failed (translated from botocore).
    """
    # Local import keeps this block self-contained; deque gives O(1) pops from
    # the front of the BFS queue where list.pop(0) is O(n).
    from collections import deque

    accepted: list[str] = []
    pending: deque[str] = deque([base_prefix])
    visited: set[str] = set()

    while pending:
        current = pending.popleft()
        if current in visited:
            continue
        visited.add(current)

        children = _list_common_prefixes(client, bucket, current)
        inside_cloudtrail = _has_cloudtrail_segment(current)
        # Each child's own segment, i.e. the part after the current prefix.
        tails = [(child, child[len(current):]) for child in children]

        # If any immediate child looks like a 4-digit year, this is a date-root
        # candidate. Accept only when the path includes /CloudTrail/; either
        # way, never descend past a year level.
        if any(_YEAR_RE.match(tail) for _, tail in tails):
            if inside_cloudtrail:
                accepted.append(current)
            continue

        for child, tail in tails:
            if tail == _DIGEST_SEGMENT:
                continue
            if not inside_cloudtrail and not _is_cloudtrail_ancestor_segment(tail):
                # Sibling AWS-service tree (ELB/RDS/etc.) — skip.
                continue
            pending.append(child)

    return accepted
246
+
247
+
248
def _list_objects_for_day(client, bucket: str, day_prefix: str) -> list[dict[str, Any]]:
    """Return the listing entries for every ``.json.gz`` key under ``day_prefix``.

    The listing uses no delimiter, so keys at any depth below the day prefix
    are included. botocore errors are re-raised as ValueError by
    ``_translate_boto_errors``.
    """
    matches: list[dict[str, Any]] = []
    with _translate_boto_errors():
        pages = client.get_paginator("list_objects_v2").paginate(
            Bucket=bucket, Prefix=day_prefix
        )
        for page in pages:
            # "Contents" may be absent or None when a page holds no keys.
            matches.extend(
                entry
                for entry in page.get("Contents") or []
                if entry["Key"].endswith(".json.gz")
            )
    return matches
258
+
259
+
260
def _enumerate_days(since: datetime, until: datetime) -> list[tuple[int, int, int]]:
    """List the UTC calendar days touched by the half-open window [since, until).

    CloudTrail partitions its S3 keys by UTC day, so both bounds are converted
    to UTC before their dates are taken. Example: a UTC-5 window of
    2026-06-01 00:00 -> 2026-06-02 00:00 is 05:00 UTC -> 05:00 UTC the next
    day and therefore covers both 2026/06/01/ and 2026/06/02/. The exact
    per-event trim to [since, until) happens later.

    Returns:
        ``(year, month, day)`` tuples in ascending order; empty when
        ``until <= since``.
    """
    if until <= since:
        return []

    first_day = _to_utc(since).date()
    # Step back one microsecond so an ``until`` exactly at midnight does not
    # pull in the following day (the upper bound is exclusive).
    final_day = (_to_utc(until) - timedelta(microseconds=1)).date()

    span = (final_day - first_day).days
    calendar = (first_day + timedelta(days=offset) for offset in range(span + 1))
    return [(d.year, d.month, d.day) for d in calendar]
285
+
286
+
287
+ def _parse_event_time(s: str) -> datetime | None:
288
+ """Best-effort parse of CloudTrail eventTime. Returns None on failure."""
289
+ if not s:
290
+ return None
291
+ try:
292
+ return datetime.fromisoformat(s.replace("Z", "+00:00"))
293
+ except (ValueError, TypeError):
294
+ return None
295
+
296
+
297
+ def _to_utc(dt: datetime) -> datetime:
298
+ """Treat naive datetimes as local and convert to UTC for comparison."""
299
+ if dt.tzinfo is None:
300
+ return dt.astimezone(timezone.utc)
301
+ return dt.astimezone(timezone.utc)
302
+
303
+
304
+ def _split_name(base: Path, part_num: int) -> Path:
305
+ """Insert _part{NN} before all of base's suffixes.
306
+
307
+ cloudtrail_20260601_7d.json.log + 1 -> cloudtrail_20260601_7d_part01.json.log
308
+ """
309
+ name = base.name
310
+ suffixes = "".join(base.suffixes)
311
+ if suffixes:
312
+ stem_full = name[: -len(suffixes)]
313
+ else:
314
+ stem_full = name
315
+ return base.with_name(f"{stem_full}_part{part_num:02d}{suffixes}")
316
+
317
+
318
def _decode_cloudtrail_records(body: bytes) -> list[dict[str, Any]]:
    """Gunzip one CloudTrail object and return the event dicts in ``Records``.

    Raises ValueError when the payload is valid JSON but not a
    ``{"Records": [...]}`` envelope. gzip/JSON failures propagate unchanged for
    the caller to handle. Entries of ``Records`` that are not JSON objects are
    dropped, because downstream code calls ``.get`` on every event.
    """
    with gzip.GzipFile(fileobj=io.BytesIO(body)) as gz:
        envelope = json.load(gz)
    if not isinstance(envelope, dict):
        raise ValueError(
            f"expected a JSON object envelope, got {type(envelope).__name__}"
        )
    records = envelope.get("Records") or []
    if not isinstance(records, list):
        raise ValueError(
            f"expected 'Records' to be a list, got {type(records).__name__}"
        )
    return [record for record in records if isinstance(record, dict)]


def _sort_and_trim_events(
    events: list[dict[str, Any]], since_utc: datetime, until_utc: datetime,
) -> list[dict[str, Any]]:
    """Return the events inside [since_utc, until_utc), ordered by eventTime.

    Each eventTime is parsed once. Events with a missing, non-string or
    unparseable eventTime are dropped. The sort is stable, so events sharing a
    timestamp keep their arrival order.
    """
    in_window: list[tuple[datetime, dict[str, Any]]] = []
    for event in events:
        raw_time = event.get("eventTime", "")
        stamp = _parse_event_time(raw_time) if isinstance(raw_time, str) else None
        if stamp is None:
            continue  # drop events with no parseable timestamp
        if stamp.tzinfo is None:
            # A timestamp without an offset is taken to be UTC so it can be
            # compared with the aware window bounds instead of raising.
            stamp = stamp.replace(tzinfo=timezone.utc)
        if since_utc <= stamp < until_utc:
            in_window.append((stamp, event))
    in_window.sort(key=lambda pair: pair[0])
    return [event for _, event in in_window]


def fetch(
    query_config: dict[str, Any],
    cloudtrail_config: dict[str, Any],
    since: datetime,
    until: datetime,
    verbose: bool,
    *,
    skip_confirm: bool = False,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Pull CloudTrail events from S3 for the given window.

    Args:
        query_config: Unused (CloudTrail has no per-query SPL).
        cloudtrail_config: [export.cloudtrail] section (path, egress_warn_gb).
        since: Start of window (inclusive).
        until: End of window (exclusive).
        verbose: If True, print discovery details.
        skip_confirm: Bypass the egress-cost prompt.

    Returns:
        (events, fetch_meta) where fetch_meta = {"units": N, "unit_label": "objects"}.
        ``events`` is sorted by eventTime ascending and trimmed to the window.

    Raises:
        ValueError: bad path, no objects, AWS credential/access errors, missing boto3.
        ExportAborted: operator declined the egress-cost prompt.
    """
    # gzip reports some corrupt streams as zlib.error, which the module does
    # not import at file level.
    import zlib

    if boto3 is None:
        # NOTE(review): the distribution appears to be published as
        # "loghunter-cli"; confirm the package/extra name in this hint.
        raise ValueError("boto3 not installed — run: pip install loghunt[cloudtrail]")

    path = cloudtrail_config.get("path", "").strip()
    if not path:
        raise ValueError(
            "[export.cloudtrail].path is empty — set it to an s3:// URL (see config_example.toml)"
        )
    bucket, base_prefix = _parse_s3_path(path)
    egress_warn_gb = float(cloudtrail_config.get("egress_warn_gb", 5.0))

    with _translate_boto_errors():
        client = boto3.client("s3")

    # Phase 1: list-only. The liveness boundary covers _find_date_roots (the
    # slow S3 prefix/delimiter walk), _enumerate_days and the per-day list
    # loop, so the spinner starts the moment discovery begins. On failure the
    # block exits by exception, no seal is written, and the error propagates.
    objects: list[tuple[str, dict[str, Any]]] = []  # (bucket, obj_dict)
    with liveness("listing CloudTrail objects") as ln:
        date_roots = _find_date_roots(client, bucket, base_prefix)
        if verbose:
            print(f"loghunter cloudtrail: discovered {len(date_roots)} date root(s)", flush=True)
        days = _enumerate_days(since, until)
        for root in date_roots:
            for (y, m, d) in days:
                day_prefix = f"{root}{y:04d}/{m:02d}/{d:02d}/"
                objects.extend(
                    (bucket, obj)
                    for obj in _list_objects_for_day(client, bucket, day_prefix)
                )
        total_bytes = sum(int(o["Size"]) for _, o in objects)
        ln.seal(f"listed {len(objects)} objects ({total_bytes / 1e9:.1f} GB)")

    if not objects:
        window_str = (
            f"{since.strftime('%Y-%m-%d %H:%M')} → {until.strftime('%Y-%m-%d %H:%M')}"
        )
        raise ValueError(
            f"no CloudTrail objects found under {path} for {window_str} — "
            f"check the S3 path and date range"
        )

    # Egress guard
    if total_bytes > egress_warn_gb * 1e9 and not skip_confirm:
        prompt = (
            f"This pull will transfer ~{total_bytes / 1e9:.1f} GB from S3, "
            f"which may incur AWS egress charges. Continue? [y/N] "
        )
        try:
            answer = input(prompt).strip().lower()
        except EOFError:
            # No stdin (non-interactive run): treat as "no".
            answer = ""
        if answer not in {"y", "yes"}:
            raise ExportAborted("loghunter export: aborted by user")

    # Phase 2: fetch + parse, skip corrupt, propagate auth.
    # leave=True + clean bar_format makes this its own liveness narration —
    # countable phases stay on tqdm.
    events: list[dict[str, Any]] = []
    for bkt, obj in tqdm(
        objects,
        desc="fetching",
        unit="obj",
        leave=True,
        bar_format="{desc}: {n_fmt} objects [{elapsed}]",
    ):
        key = obj["Key"]
        with _translate_boto_errors():
            body = client.get_object(Bucket=bkt, Key=key)["Body"].read()
        try:
            events.extend(_decode_cloudtrail_records(body))
        except (OSError, EOFError, zlib.error, ValueError) as exc:
            # OSError covers gzip.BadGzipFile; ValueError covers
            # json.JSONDecodeError, UnicodeDecodeError and a malformed
            # envelope; EOFError / zlib.error are what gzip raises for
            # truncated or corrupt compressed streams.
            print(f"skipped unreadable object: {key} ({exc})", file=sys.stderr)
            continue

    # Sort + trim — one logical "order and window" operation. delay=0.25 so
    # trivially small exports do not flicker.
    with liveness("ordering and windowing events", delay=0.25) as ln:
        trimmed = _sort_and_trim_events(events, _to_utc(since), _to_utc(until))
        ln.seal(f"sorted and trimmed to {len(trimmed)} events in window")

    return trimmed, {"units": len(objects), "unit_label": "objects"}
441
+
442
+
443
def write(
    events: list[dict[str, Any]], outpath: Path, verbose: bool,
) -> tuple[int, dict[str, Any]]:
    """Write events as NDJSON, splitting into _partNN files at the size limit.

    Naming: ``outpath`` is used as-is for the first (and only) file when the
    output fits under ``_PART_SPLIT_BYTES``. On the first overflow the existing
    file is closed and renamed to its _part01 form, then writing continues
    into _part02, and so on. A single event larger than the limit still gets
    written; it simply ends up alone in its part.

    Files are written in binary mode from the UTF-8 encoded line, so the bytes
    counted are exactly the bytes on disk and the line terminator is always
    "\\n" regardless of platform.

    Args:
        events: Events to serialise, one JSON object per line. Values json
            cannot encode natively are stringified via ``default=str``.
        outpath: Target path; parent directories are created if missing.
        verbose: Not used by this function.

    Returns:
        ``(line_count, write_meta)`` where ``write_meta`` carries
        ``{"bytes": int, "paths": list[Path]}``. ``paths`` lists every part
        actually produced — single-element when no split occurred,
        ``[_part01, _part02, …]`` after the first overflow.
    """
    outpath.parent.mkdir(parents=True, exist_ok=True)

    current_path = outpath
    current_handle = current_path.open("wb")
    current_bytes = 0
    part_num = 0  # 0 means "no split yet"; first split renames current to _part01.
    total_lines = 0
    total_bytes = 0
    paths: list[Path] = [current_path]

    try:
        for ev in events:
            # Encode once; the same bytes are both measured and written.
            payload = (json.dumps(ev, default=str) + "\n").encode("utf-8")
            payload_size = len(payload)

            # Never split in front of the first line of a part, so every part
            # holds at least one event.
            if current_bytes > 0 and current_bytes + payload_size > _PART_SPLIT_BYTES:
                current_handle.close()
                if part_num == 0:
                    # First split: the bare-named file becomes _part01.
                    # replace() overwrites a stale _part01 from an earlier run
                    # on every platform, matching how the other parts are
                    # truncated on open.
                    renamed = _split_name(outpath, 1)
                    current_path.replace(renamed)
                    paths[0] = renamed
                    part_num = 1
                part_num += 1
                current_path = _split_name(outpath, part_num)
                current_handle = current_path.open("wb")
                paths.append(current_path)
                current_bytes = 0

            current_handle.write(payload)
            current_bytes += payload_size
            total_bytes += payload_size
            total_lines += 1
    finally:
        # Closing an already-closed handle is a no-op, so this is safe even if
        # the failure happened in the middle of a split.
        current_handle.close()

    return total_lines, {"bytes": total_bytes, "paths": paths}