loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,534 @@
1
+ """Log exporter orchestrator — backend, query, and output-path resolution.
2
+
3
+ Public entry point:
4
+ run_export(config, backend, query_names, since, until, out, verbose)
5
+
6
+ Architecture:
7
+ This module owns query resolution, backend selection, output-path naming,
8
+ and the fetch/write loop. It does not know any backend's internals.
9
+
10
+ Each backend is a module under exporters/ that exposes exactly four
11
+ module-level callables (duck-typed — no base class, no registry file):
12
+
13
+ is_configured(backend_cfg) -> bool
14
+ Used during auto-select to decide whether this backend is offerable.
15
+
16
+ summary_descriptor(backend_cfg) -> str
17
+ Rendered into the `Backend :` line of the final summary, e.g.
18
+ "host:port" for Splunk or "s3://bucket/prefix" for a future
19
+ object-store backend.
20
+
21
+ fetch(query_config, backend_config, since, until, verbose,
22
+ *, skip_confirm=False) -> (rows, fetch_meta)
23
+ fetch_meta carries at least {"units": int, "unit_label": str} and
24
+ MUST be invariant across queries within the same (since, until)
25
+ window for a given backend — work-unit count is a property of the
26
+ window, not the individual query. The orchestrator enforces this.
27
+ skip_confirm bypasses any backend-side cost prompt; backends that
28
+ have no prompt (Splunk) accept and ignore it.
29
+
30
+ write(rows, outpath, verbose) -> (int, dict)
31
+ Returns ``(line_count, write_meta)``. ``write_meta`` MUST carry at
32
+ least ``{"bytes": int, "paths": list[Path]}`` — bytes is the total
33
+ on-disk size summed across whatever files the backend produced,
34
+ paths lists every file written (single-element when the backend
35
+ does not split; ordered ``[_part01, _part02, …]`` when it does).
36
+ The orchestrator never reaches into the writer's private split
37
+ machinery — it reads the contract.
38
+
39
+ Optional module-level hooks the orchestrator consults if present:
40
+
41
+ implicit_default_query() -> dict
42
+ Used when a backend has no per-query stanza (e.g. CloudTrail has
43
+ no SPL). Returned dict becomes the synthetic "default" query.
44
+
45
+ OUTPUT_EXTENSION: str
46
+ Extension applied to auto-named output files. Default ".log".
47
+ CloudTrail uses ".json.log".
48
+
49
+ Adding a new backend means: (1) drop a module under exporters/ that
50
+ implements those four callables; (2) add its name to _KNOWN_BACKENDS;
51
+ (3) add a branch in _load_backend(). Nothing else changes here.
52
+
53
+ Splunk's hourly chunking helper (_build_hour_windows) is private to
54
+ splunk.py and is no longer reachable from this orchestrator.
55
+ """
56
+
57
+ from __future__ import annotations
58
+
59
+ import importlib
60
+ from datetime import datetime, timedelta
61
+ from pathlib import Path
62
+ from types import ModuleType
63
+ from typing import Any
64
+
65
+ # ExportAborted lives in common.errors so runner.py and exporter backends can
66
+ # both raise it without creating a runner ↔ exporter dependency. Re-exported
67
+ # here so `from loghunter.exporters import ExportAborted` still works for
68
+ # existing call sites and external code.
69
+ from loghunter.common.display import compact_home, human_bytes, liveness
70
+ from loghunter.common.errors import ExportAborted # noqa: F401 (re-export)
71
+ from loghunter.common.paths import be_like_water, effective_root, resolve_path
72
+
73
+
74
+ def _backend_cfg(config: dict[str, Any], name: str) -> dict[str, Any]:
75
+ """Return the [export.<name>] stanza, or an empty dict if absent.
76
+
77
+ Single read-site for backend config — keeps every fetch / is_configured /
78
+ summary_descriptor / query lookup honest to the [export.<backend>] shape.
79
+ """
80
+ return config.get("export", {}).get(name, {})
81
+
82
+
83
+ def _normalize_end_of_day_until(until: datetime) -> datetime:
84
+ """Normalize 23:59:xx (produced by --days) to next midnight for chunk alignment."""
85
+ if until.hour == 23 and until.minute == 59 and until.second >= 58:
86
+ return until.replace(minute=0, second=0, microsecond=0) + timedelta(hours=1)
87
+ return until
88
+
89
+
90
# Backend names the orchestrator recognizes. Auto-select probes them in this
# order, and every name listed here needs a matching branch in _load_backend().
_KNOWN_BACKENDS = ("splunk", "cloudtrail")
91
+
92
+
93
def run_export(
    config: dict[str, Any],
    backend: str | None,
    query_names: list[str],
    since: datetime | None,
    until: datetime | None,
    out: str | None,
    verbose: bool,
    *,
    skip_confirm: bool = False,
) -> None:
    """Pull log data from an external system and write to local flat files.

    Each query is streamed fetch → write in turn, so only one result set is
    held at a time and earlier queries are already on disk if a later one
    fails.

    Args:
        config: Loaded config dict (from common/config.py).
        backend: Backend name ("splunk", etc.) or None to auto-select.
        query_names: Named queries to run. Empty list uses default/single logic.
        since: Start of window, or None to use yesterday 00:00:00 local.
        until: End of window, or None to use today 00:00:00 local.
        out: Raw CLI --out string (preserves trailing slash) or None.
            be_like_water decides file-vs-directory inside the cascade.
        verbose: Threaded to fetch() / write() for backend-internal use
            (e.g. CloudTrail's list-phase line at level >= 1). The W4 grammar
            keeps stdout narration terse and identical across levels — no
            per-query SPL block, no auto-select chatter.
        skip_confirm: When True, skip any backend-side cost prompts (e.g. the
            CloudTrail egress guard). Threaded from the CLI's --yes / -y flag.

    Raises:
        ValueError: If an explicit file path is combined with multiple
            queries, if the backend's first fetch_meta lacks 'units' or
            'unit_label', or if a later query's fetch_meta disagrees with the
            first. ValueErrors raised by backend / query resolution propagate.
    """
    # Apply timeframe defaults independently. The defaults are timezone-aware
    # local midnights, except when the caller supplied exactly one endpoint and
    # it is naive: then the missing endpoint is made naive as well. Mixing a
    # naive endpoint with an aware default makes `until - since` raise
    # TypeError further down (after the first fetch has already run).
    today_midnight = datetime.now().astimezone().replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    supplied = since if since is not None else until
    if supplied is not None and supplied.tzinfo is None:
        today_midnight = today_midnight.replace(tzinfo=None)
    if since is None:
        since = today_midnight - timedelta(days=1)  # yesterday 00:00:00 local
    if until is None:
        until = today_midnight  # today 00:00:00 local (exclusive end)
    until = _normalize_end_of_day_until(until)

    # Resolve backend and load its module
    resolved_backend = _resolve_backend(config, backend)
    backend_module = _load_backend(resolved_backend)

    # Resolve queries (backends with no per-query config supply a synthetic
    # default via implicit_default_query()).
    resolved_queries = _resolve_queries(
        config, resolved_backend, query_names, backend_module=backend_module
    )

    # Guard: an explicit file path target is incompatible with multiple queries.
    # Expressed in terms of be_like_water's verdict — never .suffix.
    if out is not None:
        cli_resolved = be_like_water(out)
        if cli_resolved.is_file and len(resolved_queries) > 1:
            raise ValueError(
                f"Cannot use an explicit file path ({cli_resolved.path}) with "
                f"multiple queries — specify a directory instead."
            )

    local_since = since.astimezone() if since.tzinfo else since
    local_until = until.astimezone() if until.tzinfo else until
    window_str = (
        f"{local_since.strftime('%Y-%m-%d %H:%M')} → "
        f"{local_until.strftime('%Y-%m-%d %H:%M')} local"
    )

    extension = getattr(backend_module, "OUTPUT_EXTENSION", ".log")
    backend_cfg = _backend_cfg(config, resolved_backend)
    loghunter_cfg = config.get("loghunter", {})
    root = effective_root(config)

    # Resolve every query's output path up front so path problems surface
    # before the first fetch. There is NO bulk-fetch pre-pass — fetching
    # happens per query in the loop below.
    plan: list[tuple[str, dict[str, Any], Path]] = []
    for query_name, query_cfg in resolved_queries:
        outpath = _resolve_output_path(
            query_cfg, out, since, until, query_name,
            extension=extension,
            backend_config=backend_cfg,
            loghunter_config=loghunter_cfg,
            root=root,
        )
        plan.append((query_name, query_cfg, outpath))

    # Header — single plain stdout line. No box, no seplines, no color.
    print(
        f"loghunter export · {resolved_backend} "
        f"({backend_module.summary_descriptor(backend_cfg)})"
    )

    def _format_span(units: int, label: str) -> str:
        """Describe the window length in days (when whole) or hours, plus work units."""
        total_secs = (local_until - local_since).total_seconds()
        if total_secs > 0 and total_secs % 86400 == 0:
            n_days = int(total_secs / 86400)
            return f"{n_days} day{'s' if n_days != 1 else ''}, {units} {label}"
        n_hours = max(int(total_secs / 3600), 1)
        return f"{n_hours}h, {units} {label}"

    def _emit_result_line(
        n_written: int, write_meta: dict[str, Any], fallback_path: Path,
    ) -> tuple[int, int]:
        """Print the per-query result line and return (n_written, bytes)."""
        paths = list(write_meta.get("paths") or [fallback_path])
        bytes_written = int(write_meta.get("bytes", 0))
        path_display = compact_home(paths[0])
        if len(paths) > 1:
            path_display += f" (+{len(paths) - 1} more)"
        print(
            f" wrote {n_written:,} lines · {human_bytes(bytes_written)} → {path_display}"
        )
        return n_written, bytes_written

    grand_lines = 0
    grand_bytes = 0
    n_queries = len(plan)
    # (units, unit_label) from the first query's fetch_meta — the run-level
    # work-unit descriptor every later query must agree with.
    run_units: tuple[Any, Any] | None = None

    for position, (query_name, query_cfg, outpath) in enumerate(plan):
        rows, fetch_meta = backend_module.fetch(
            query_cfg, backend_cfg, since, until, verbose,
            skip_confirm=skip_confirm,
        )
        if position == 0:
            # The window line is printed AFTER the first fetch because
            # fetch_meta carries the work-unit count.
            try:
                run_units = (fetch_meta["units"], fetch_meta["unit_label"])
            except (KeyError, TypeError) as exc:
                raise ValueError(
                    f"Exporter backend '{resolved_backend}' returned invalid fetch metadata: "
                    f"missing 'units' or 'unit_label'."
                ) from exc
            print(f"window: {window_str} ({_format_span(*run_units)})")
            print()
        elif (fetch_meta.get("units"), fetch_meta.get("unit_label")) != run_units:
            # Work-unit metadata is a property of the window, not the query.
            raise ValueError(
                f"Exporter backend '{resolved_backend}' returned inconsistent fetch "
                f"metadata across queries for the same window — this is a backend bug."
            )

        print(f"running {query_name} …")
        with liveness(f"writing {outpath.name}") as ln:
            n_written, write_meta = backend_module.write(rows, outpath, verbose)
            ln.seal(f"{query_name}: wrote {n_written:,} lines")
        # Release this result set before the next fetch begins — keeps peak
        # memory close to one query, not N.
        rows = None  # type: ignore[assignment]
        nl, nb = _emit_result_line(n_written, write_meta, outpath)
        grand_lines += nl
        grand_bytes += nb

    # Final summary — one quiet line. No separator, no box.
    q_label = "queries" if n_queries != 1 else "query"
    print()
    print(
        f"done · {n_queries} {q_label} · "
        f"{grand_lines:,} lines · {human_bytes(grand_bytes)}"
    )
281
+
282
+
283
def _resolve_backend(config: dict[str, Any], backend: str | None) -> str:
    """Resolve which backend to use based on config and explicit request.

    Each backend module decides for itself whether its config section is
    sufficient via ``is_configured(backend_cfg)``.

    An explicit ``backend`` is validated on its own: only that backend's
    module is loaded and asked. The other known backends are not imported or
    probed, so a problem in an unrelated backend cannot break an explicit
    request. With ``backend=None`` the orchestrator iterates _KNOWN_BACKENDS,
    asks each, and auto-selects only when exactly one says yes.

    Args:
        config: Loaded config dict.
        backend: Requested backend name, or None to auto-select.

    Returns:
        The name of the backend to use.

    Raises:
        ValueError: If the requested backend is unknown, not implemented, or
            not configured; or, when auto-selecting, if zero or more than one
            backend is configured.
    """
    if backend is not None:
        if backend not in _KNOWN_BACKENDS:
            available = ", ".join(_KNOWN_BACKENDS)
            raise ValueError(f"Unknown backend '{backend}'. Available: {available}")

        module = _load_backend(backend)  # may raise "not yet implemented" — that's correct
        if not module.is_configured(_backend_cfg(config, backend)):
            raise ValueError(
                f"Backend '{backend}' is not configured — "
                f"add a [export.{backend}] section to your config (see config_example.toml)."
            )
        return backend

    # Auto-select: collect every known backend that reports itself configured.
    configured: list[str] = []
    for name in _KNOWN_BACKENDS:
        try:
            module = _load_backend(name)
        except ValueError:
            # Backend listed as known but not yet implemented. Not
            # auto-selectable.
            continue
        if module.is_configured(_backend_cfg(config, name)):
            configured.append(name)

    if len(configured) == 1:
        # The header printed by run_export names the backend — no stray
        # pre-fetch chatter on auto-select.
        return configured[0]
    if not configured:
        raise ValueError(
            "No export backend configured — add a [export.splunk] section with host to your config.\n"
            "Run 'loghunter init' to create a config."
        )
    available = ", ".join(sorted(configured))
    raise ValueError(
        f"Multiple backends configured: {available}\n"
        f"Specify one: loghunter export <backend_name>"
    )
329
+
330
+
331
def _resolve_queries(
    config: dict[str, Any],
    backend: str,
    query_names: list[str],
    *,
    backend_module: ModuleType | None = None,
) -> list[tuple[str, dict[str, Any]]]:
    """Turn requested query names into ``(name, query_config)`` pairs.

    Queries are read from the ``query`` table of the backend's stanza. When
    that table is empty and ``backend_module`` exposes an
    ``implicit_default_query()`` hook, the hook's return value is used as a
    synthetic single "default" query.

    With explicit ``query_names`` every name must be defined. With none given,
    "default" is used if present, otherwise the only defined query.

    Args:
        config: Loaded config dict.
        backend: Resolved backend name.
        query_names: Requested query names; empty means auto-select.
        backend_module: Backend module consulted for the optional hook.

    Returns:
        One ``(name, query_config)`` pair per selected query, in request order.

    Raises:
        ValueError: If a requested name is not defined, if no queries are
            defined, or if several are defined and none was requested.
    """
    defined: dict[str, Any] = _backend_cfg(config, backend).get("query", {})
    if (
        not defined
        and backend_module is not None
        and hasattr(backend_module, "implicit_default_query")
    ):
        defined = {"default": backend_module.implicit_default_query()}

    # Explicit request: every name must resolve, order is preserved.
    if query_names:
        pairs = []
        for requested in query_names:
            if requested not in defined:
                available = ", ".join(sorted(defined))
                raise ValueError(
                    f"Query '{requested}' not found in [export.{backend}.query]\n"
                    f"Available queries: {available}"
                )
            pairs.append((requested, defined[requested]))
        return pairs

    # Auto-select: "default" wins, then a lone query; anything else is an error.
    if "default" in defined:
        return [("default", defined["default"])]
    if len(defined) == 1:
        only_name = next(iter(defined))
        return [(only_name, defined[only_name])]
    if len(defined) == 0:
        raise ValueError(
            f"No queries defined under [export.{backend}.query] in config.\n"
            f"Add a [export.{backend}.query.<name>] section to your config."
        )
    available = ", ".join(sorted(defined))
    raise ValueError(
        f"Multiple queries defined for backend '{backend}': {available}\n"
        f"Specify a query name: loghunter export {backend} <query_name>"
    )
381
+
382
+
383
def _resolve_output_path(
    query_config: dict[str, Any],
    cli_out: str | None,
    since: datetime,
    until: datetime,
    query_name: str,
    *,
    extension: str = ".log",
    backend_config: dict[str, Any] | None = None,
    loghunter_config: dict[str, Any] | None = None,
    root: str = "",
) -> Path:
    """Resolve the final output path for a single query result.

    Five-tier cascade (most-specific wins):
      1. cli_out                       (--out, expanded with root="" — shell semantics)
      2. query_config["export_dir"]    (per-query — finest grain; config, root applies)
      3. backend_config["export_dir"]  ([export.<backend>].export_dir; config, root applies)
      4. loghunter_config["export_dir"] (global default; config, root applies)
      5. "."                           (CWD floor — literal, not a resolved value)

    The winning target string is passed through ``be_like_water`` to decide
    file vs directory. For a FILE verdict the path is returned as-is; for a
    DIRECTORY verdict an auto-name is appended.

    **Per-source auto-segmentation of the global base.** When the global tier
    (4) wins, ``[loghunter].export_dir`` is treated as a directory BASE and each
    export is written to ``<base>/<source>/`` (``source = output_basename or
    query_name`` — the log-family the admin chose, NOT the transport backend).
    Every other tier — CLI ``--out``, an explicit per-query / per-backend
    ``export_dir``, and the CWD floor — is the LITERAL final dir and does NOT
    segment. The ``from_global_base`` flag returned by ``_pick_export_target``
    is the sole signal; callers never see it.

    Args:
        query_config: The query's config stanza (``output_basename`` and
            ``export_dir`` are read from it).
        cli_out: Raw ``--out`` string, or None.
        since: Window start, used for the auto-named filename.
        until: Window end, used for the auto-named filename.
        query_name: Query name; the basename when ``output_basename`` is unset.
        extension: Appended to the auto-named filename; supplied by the
            backend via its optional ``OUTPUT_EXTENSION`` module attribute.
        backend_config: The ``[export.<backend>]`` stanza, if any.
        loghunter_config: The ``[loghunter]`` stanza, if any.
        root: LH_ROOT for relative config paths; the caller reads it once via
            ``effective_root(config)`` and threads it in.

    Returns:
        The file path the query's rows should be written to.

    Raises:
        ValueError: If the global tier wins and the source name reduces to
            ``..``, which would place the export outside the global base.
    """
    # Compute the source basename ONCE, up front: it drives both the directory
    # segment (global tier) and the auto-named filename.
    basename = query_config.get("output_basename") or query_name
    target, from_global_base = _pick_export_target(
        cli_out, query_config, backend_config, loghunter_config, root=root,
    )
    if from_global_base:
        # ``Path(basename).name`` keeps only the final component, which drops
        # any leading directories. It does NOT neutralize ``..``, however
        # (``Path("..").name`` and ``Path("a/..").name`` are both ".."), so
        # that case is rejected explicitly to keep the segment inside the base.
        segment = Path(basename).name
        if segment == "..":
            raise ValueError(
                f"Export source name '{basename}' cannot be used as a directory "
                f"under [loghunter].export_dir — use a bare name."
            )
        # Normalize the base to exactly one trailing separator, then append
        # the segment with directory intent (trailing slash) so be_like_water
        # sees a directory-shaped target.
        target = target.rstrip("/") + "/" + segment + "/"
    resolved = be_like_water(target)
    if resolved.is_file:
        return resolved.path
    return resolved.path / _auto_filename(basename, since, until, extension=extension)
445
+
446
+
447
+ def _pick_export_target(
448
+ cli: str | None,
449
+ query: dict[str, Any] | None,
450
+ backend: dict[str, Any] | None,
451
+ loghunter: dict[str, Any] | None,
452
+ *,
453
+ root: str = "",
454
+ ) -> tuple[str, bool]:
455
+ """Return ``(target, from_global_base)`` for the five-tier cascade.
456
+
457
+ ``target`` is the first set target STRING across the cascade;
458
+ ``from_global_base`` is True iff the WINNING tier is
459
+ ``loghunter["export_dir"]`` (tier 4) — the only tier that auto-segments per
460
+ source (see ``_resolve_output_path``). It is a real returned bool, not an
461
+ overloaded sentinel, and is consumed ONLY by ``_resolve_output_path``;
462
+ callers and backend modules never reason about it.
463
+
464
+ Preserves trailing slashes by working in strings, not Paths. CLI tier
465
+ resolves with root="" (shell semantics — ~-expansion only); the three
466
+ config tiers resolve through ``resolve_path(value, root)`` so LH_ROOT
467
+ applies. The CWD floor stays a literal "." even though
468
+ ``resolve_path("", root)`` returns None for empty config values. Every
469
+ config tier — per-query, per-backend, and global — uses the single
470
+ ``export_dir`` key.
471
+ """
472
+ if cli is not None:
473
+ resolved = resolve_path(cli, "")
474
+ if resolved is not None:
475
+ return resolved, False
476
+ for stanza, key, is_global in [
477
+ (query, "export_dir", False),
478
+ (backend, "export_dir", False),
479
+ (loghunter, "export_dir", True),
480
+ ]:
481
+ if stanza:
482
+ value = stanza.get(key)
483
+ if value:
484
+ resolved = resolve_path(value, root)
485
+ if resolved is not None:
486
+ return resolved, is_global
487
+ return ".", False
488
+
489
+
490
+ def _auto_filename(
491
+ basename: str,
492
+ since: datetime,
493
+ until: datetime,
494
+ *,
495
+ extension: str = ".log",
496
+ ) -> str:
497
+ """Derive an output filename from the time window.
498
+
499
+ Whole-day windows (both endpoints at midnight, integer days):
500
+ {basename}_{YYYYMMDD}_{N}d{extension}
501
+
502
+ All other windows (partial day, arbitrary range):
503
+ {basename}_{YYYYMMDD}_to_{YYYYMMDD_HHh}{extension}
504
+ """
505
+ local_since = since.astimezone() if since.tzinfo else since
506
+ local_until = until.astimezone() if until.tzinfo else until
507
+
508
+ start_str = local_since.strftime("%Y%m%d")
509
+
510
+ since_at_midnight = (
511
+ local_since.hour == 0 and local_since.minute == 0 and local_since.second == 0
512
+ )
513
+ until_at_midnight = (
514
+ local_until.hour == 0 and local_until.minute == 0 and local_until.second == 0
515
+ )
516
+
517
+ if since_at_midnight and until_at_midnight:
518
+ n_days = int((local_until - local_since).total_seconds() // 86400)
519
+ if n_days >= 1:
520
+ return f"{basename}_{start_str}_{n_days}d{extension}"
521
+
522
+ end_str = local_until.strftime("%Y%m%d_%Hh")
523
+ return f"{basename}_{start_str}_to_{end_str}{extension}"
524
+
525
+
526
+ def _load_backend(backend_name: str) -> ModuleType:
527
+ """Import and return the backend module for the given backend name."""
528
+ if backend_name == "splunk":
529
+ from loghunter.exporters import splunk as splunk_module
530
+ return splunk_module
531
+ if backend_name == "cloudtrail":
532
+ from loghunter.exporters import cloudtrail as cloudtrail_module
533
+ return cloudtrail_module
534
+ raise ValueError(f"Backend '{backend_name}' is not yet implemented.")