loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,534 @@
|
|
|
1
|
+
"""Log exporter orchestrator — backend, query, and output-path resolution.
|
|
2
|
+
|
|
3
|
+
Public entry point:
|
|
4
|
+
run_export(config, backend, query_names, since, until, out, verbose, *, skip_confirm=False)
|
|
5
|
+
|
|
6
|
+
Architecture:
|
|
7
|
+
This module owns query resolution, backend selection, output-path naming,
|
|
8
|
+
and the fetch/write loop. It does not know any backend's internals.
|
|
9
|
+
|
|
10
|
+
Each backend is a module under exporters/ that exposes exactly four
|
|
11
|
+
module-level callables (duck-typed — no base class, no registry file):
|
|
12
|
+
|
|
13
|
+
is_configured(backend_cfg) -> bool
|
|
14
|
+
Used during auto-select to decide whether this backend is offerable.
|
|
15
|
+
|
|
16
|
+
summary_descriptor(backend_cfg) -> str
|
|
17
|
+
Rendered into the `Backend :` line of the final summary, e.g.
|
|
18
|
+
"host:port" for Splunk or "s3://bucket/prefix" for a future
|
|
19
|
+
object-store backend.
|
|
20
|
+
|
|
21
|
+
fetch(query_config, backend_config, since, until, verbose,
|
|
22
|
+
*, skip_confirm=False) -> (rows, fetch_meta)
|
|
23
|
+
fetch_meta carries at least {"units": int, "unit_label": str} and
|
|
24
|
+
MUST be invariant across queries within the same (since, until)
|
|
25
|
+
window for a given backend — work-unit count is a property of the
|
|
26
|
+
window, not the individual query. The orchestrator enforces this.
|
|
27
|
+
skip_confirm bypasses any backend-side cost prompt; backends that
|
|
28
|
+
have no prompt (Splunk) accept and ignore it.
|
|
29
|
+
|
|
30
|
+
write(rows, outpath, verbose) -> (int, dict)
|
|
31
|
+
Returns ``(line_count, write_meta)``. ``write_meta`` MUST carry at
|
|
32
|
+
least ``{"bytes": int, "paths": list[Path]}`` — bytes is the total
|
|
33
|
+
on-disk size summed across whatever files the backend produced,
|
|
34
|
+
paths lists every file written (single-element when the backend
|
|
35
|
+
does not split; ordered ``[_part01, _part02, …]`` when it does).
|
|
36
|
+
The orchestrator never reaches into the writer's private split
|
|
37
|
+
machinery — it reads the contract.
|
|
38
|
+
|
|
39
|
+
Optional module-level hooks the orchestrator consults if present:
|
|
40
|
+
|
|
41
|
+
implicit_default_query() -> dict
|
|
42
|
+
Used when a backend has no per-query stanza (e.g. CloudTrail has
|
|
43
|
+
no SPL). Returned dict becomes the synthetic "default" query.
|
|
44
|
+
|
|
45
|
+
OUTPUT_EXTENSION: str
|
|
46
|
+
Extension applied to auto-named output files. Default ".log".
|
|
47
|
+
CloudTrail uses ".json.log".
|
|
48
|
+
|
|
49
|
+
Adding a new backend means: (1) drop a module under exporters/ that
|
|
50
|
+
implements those four callables; (2) add its name to _KNOWN_BACKENDS;
|
|
51
|
+
(3) add a branch in _load_backend(). Nothing else changes here.
|
|
52
|
+
|
|
53
|
+
Splunk's hourly chunking helper (_build_hour_windows) is private to
|
|
54
|
+
splunk.py and is no longer reachable from this orchestrator.
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
from __future__ import annotations
|
|
58
|
+
|
|
59
|
+
import importlib
|
|
60
|
+
from datetime import datetime, timedelta
|
|
61
|
+
from pathlib import Path
|
|
62
|
+
from types import ModuleType
|
|
63
|
+
from typing import Any
|
|
64
|
+
|
|
65
|
+
# ExportAborted lives in common.errors so runner.py and exporter backends can
|
|
66
|
+
# both raise it without creating a runner ↔ exporter dependency. Re-exported
|
|
67
|
+
# here so `from loghunter.exporters import ExportAborted` still works for
|
|
68
|
+
# existing call sites and external code.
|
|
69
|
+
from loghunter.common.display import compact_home, human_bytes, liveness
|
|
70
|
+
from loghunter.common.errors import ExportAborted # noqa: F401 (re-export)
|
|
71
|
+
from loghunter.common.paths import be_like_water, effective_root, resolve_path
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _backend_cfg(config: dict[str, Any], name: str) -> dict[str, Any]:
|
|
75
|
+
"""Return the [export.<name>] stanza, or an empty dict if absent.
|
|
76
|
+
|
|
77
|
+
Single read-site for backend config — keeps every fetch / is_configured /
|
|
78
|
+
summary_descriptor / query lookup honest to the [export.<backend>] shape.
|
|
79
|
+
"""
|
|
80
|
+
return config.get("export", {}).get(name, {})
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _normalize_end_of_day_until(until: datetime) -> datetime:
|
|
84
|
+
"""Normalize 23:59:xx (produced by --days) to next midnight for chunk alignment."""
|
|
85
|
+
if until.hour == 23 and until.minute == 59 and until.second >= 58:
|
|
86
|
+
return until.replace(minute=0, second=0, microsecond=0) + timedelta(hours=1)
|
|
87
|
+
return until
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# Backend names the orchestrator recognises. _resolve_backend iterates this
# tuple in order when auto-selecting and rejects explicit names not listed
# here; each name also needs a matching branch in _load_backend().
_KNOWN_BACKENDS = ("splunk", "cloudtrail")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def run_export(
    config: dict[str, Any],
    backend: str | None,
    query_names: list[str],
    since: datetime | None,
    until: datetime | None,
    out: str | None,
    verbose: bool,
    *,
    skip_confirm: bool = False,
) -> None:
    """Pull log data from an external system and write to local flat files.

    Queries are processed strictly one at a time (fetch → validate metadata →
    write → result line), so peak memory stays close to one result set and
    queries already written remain on disk if a later one fails.

    Args:
        config: Loaded config dict (from common/config.py).
        backend: Backend name ("splunk", etc.) or None to auto-select.
        query_names: Named queries to run. Empty list uses default/single logic.
        since: Start of window, or None to use yesterday 00:00:00 local.
        until: End of window, or None to use today 00:00:00 local.
        out: Raw CLI --out string (preserves trailing slash) or None.
            be_like_water decides file-vs-directory inside the cascade.
        verbose: Threaded to fetch() / write() for backend-internal use.
            Stdout narration produced here is identical across levels.
        skip_confirm: When True, skip any backend-side cost prompts. Passed
            through to fetch() unchanged.

    Raises:
        ValueError: If ``out`` resolves to a single file while more than one
            query is selected; if a backend returns fetch metadata without
            ``units`` / ``unit_label``; if fetch metadata differs between
            queries of the same run; or for any backend/query resolution
            error raised by ``_resolve_backend`` / ``_resolve_queries``.
    """
    # Apply timeframe defaults independently of each other.
    local_now = datetime.now().astimezone()
    today_midnight = local_now.replace(hour=0, minute=0, second=0, microsecond=0)
    if since is None:
        since = today_midnight - timedelta(days=1)  # yesterday 00:00:00 local
    if until is None:
        until = today_midnight  # today 00:00:00 local (exclusive end)
    until = _normalize_end_of_day_until(until)

    # Resolve backend and load its module.
    resolved_backend = _resolve_backend(config, backend)
    backend_module = _load_backend(resolved_backend)

    # Resolve queries (backends with no per-query config supply a synthetic
    # default via implicit_default_query()).
    resolved_queries = _resolve_queries(
        config, resolved_backend, query_names, backend_module=backend_module
    )

    # Guard: an explicit file path target is incompatible with multiple
    # queries. Expressed via be_like_water's verdict — never .suffix.
    if out is not None:
        cli_resolved = be_like_water(out)
        if cli_resolved.is_file and len(resolved_queries) > 1:
            raise ValueError(
                f"Cannot use an explicit file path ({cli_resolved.path}) with "
                f"multiple queries — specify a directory instead."
            )

    local_since = since.astimezone() if since.tzinfo else since
    local_until = until.astimezone() if until.tzinfo else until
    window_str = (
        f"{local_since.strftime('%Y-%m-%d %H:%M')} → "
        f"{local_until.strftime('%Y-%m-%d %H:%M')} local"
    )

    extension = getattr(backend_module, "OUTPUT_EXTENSION", ".log")
    backend_cfg = _backend_cfg(config, resolved_backend)
    loghunter_cfg = config.get("loghunter", {})
    root = effective_root(config)

    # Resolve every query's output path up front so path errors surface and
    # the header line lands before the first fetch. No bulk-fetch pre-pass.
    plan: list[tuple[str, dict[str, Any], Path]] = []
    for query_name, query_cfg in resolved_queries:
        outpath = _resolve_output_path(
            query_cfg, out, since, until, query_name,
            extension=extension,
            backend_config=backend_cfg,
            loghunter_config=loghunter_cfg,
            root=root,
        )
        plan.append((query_name, query_cfg, outpath))

    # Header — single plain stdout line.
    print(
        f"loghunter export · {resolved_backend} "
        f"({backend_module.summary_descriptor(backend_cfg)})"
    )

    def _format_span(units: int, label: str) -> str:
        """Describe the window length (whole days or hours) plus work units."""
        total_secs = (local_until - local_since).total_seconds()
        if total_secs > 0 and total_secs % 86400 == 0:
            n_days = int(total_secs / 86400)
            return f"{n_days} day{'s' if n_days != 1 else ''}, {units} {label}"
        n_hours = max(int(total_secs / 3600), 1)
        return f"{n_hours}h, {units} {label}"

    def _read_fetch_meta(fetch_meta: Any) -> tuple[Any, Any]:
        """Return (units, unit_label), rejecting malformed metadata uniformly."""
        try:
            return fetch_meta["units"], fetch_meta["unit_label"]
        except (KeyError, TypeError) as exc:
            raise ValueError(
                f"Exporter backend '{resolved_backend}' returned invalid fetch metadata: "
                f"missing 'units' or 'unit_label'."
            ) from exc

    def _emit_result_line(
        n_written: int, write_meta: dict[str, Any], fallback_path: Path,
    ) -> tuple[int, int]:
        """Print the per-query result line and return (n_written, bytes)."""
        paths = list(write_meta.get("paths") or [fallback_path])
        bytes_written = int(write_meta.get("bytes", 0))
        path_display = compact_home(paths[0])
        if len(paths) > 1:
            path_display += f" (+{len(paths) - 1} more)"
        print(
            f" wrote {n_written:,} lines · {human_bytes(bytes_written)} → {path_display}"
        )
        return n_written, bytes_written

    grand_lines = 0
    grand_bytes = 0
    n_queries = len(plan)
    # (units, unit_label) of the first query; every later query must match
    # because work-unit count is a property of the window, not the query.
    run_units: tuple[Any, Any] | None = None

    for query_name, query_cfg, outpath in plan:
        rows, fetch_meta = backend_module.fetch(
            query_cfg, backend_cfg, since, until, verbose,
            skip_confirm=skip_confirm,
        )
        query_units = _read_fetch_meta(fetch_meta)
        if run_units is None:
            # The window line needs the work-unit count, so it can only be
            # printed after the first fetch has returned.
            run_units = query_units
            print(f"window: {window_str} ({_format_span(*run_units)})")
            print()
        elif query_units != run_units:
            raise ValueError(
                f"Exporter backend '{resolved_backend}' returned inconsistent fetch "
                f"metadata across queries for the same window — this is a backend bug."
            )

        print(f"running {query_name} …")
        with liveness(f"writing {outpath.name}") as ln:
            n_written, write_meta = backend_module.write(rows, outpath, verbose)
            ln.seal(f"{query_name}: wrote {n_written:,} lines")
        # Drop the result set before the next fetch begins — keeps peak
        # memory close to one query, not N.
        rows = None  # type: ignore[assignment]
        nl, nb = _emit_result_line(n_written, write_meta, outpath)
        grand_lines += nl
        grand_bytes += nb

    # Final summary — one quiet line.
    q_label = "queries" if n_queries != 1 else "query"
    print()
    print(
        f"done · {n_queries} {q_label} · "
        f"{grand_lines:,} lines · {human_bytes(grand_bytes)}"
    )
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def _resolve_backend(config: dict[str, Any], backend: str | None) -> str:
    """Resolve which backend to use based on config and explicit request.

    Each backend module decides for itself whether its config section is
    sufficient via ``is_configured(backend_cfg)``.

    An explicit ``backend`` is validated on its own: only that module is
    loaded and asked, so a problem in an unrelated backend cannot break the
    request. With ``backend=None`` every name in ``_KNOWN_BACKENDS`` is
    probed and exactly one must report itself configured.

    Args:
        config: Loaded config dict.
        backend: Requested backend name, or None to auto-select.

    Returns:
        The name of the backend to use.

    Raises:
        ValueError: If the explicit name is unknown, not implemented, or not
            configured; or, on auto-select, if zero or several backends are
            configured.
    """
    if backend is not None:
        if backend not in _KNOWN_BACKENDS:
            available = ", ".join(_KNOWN_BACKENDS)
            raise ValueError(f"Unknown backend '{backend}'. Available: {available}")

        # May raise "not yet implemented" — that is the correct answer for an
        # explicit request, so it is deliberately not caught here.
        module = _load_backend(backend)
        if not module.is_configured(_backend_cfg(config, backend)):
            raise ValueError(
                f"Backend '{backend}' is not configured — "
                f"add a [export.{backend}] section to your config (see config_example.toml)."
            )
        return backend

    # Auto-select: collect every known backend that says it is configured.
    configured: list[str] = []
    for name in _KNOWN_BACKENDS:
        try:
            module = _load_backend(name)
        except ValueError:
            # Listed as known but has no loader branch yet — not offerable.
            continue
        if module.is_configured(_backend_cfg(config, name)):
            configured.append(name)

    if len(configured) == 1:
        # The header printed by run_export names the backend, so nothing is
        # announced here.
        return configured[0]
    if not configured:
        raise ValueError(
            "No export backend configured — add a [export.splunk] section with host to your config.\n"
            "Run 'loghunter init' to create a config."
        )
    available = ", ".join(sorted(configured))
    raise ValueError(
        f"Multiple backends configured: {available}\n"
        f"Specify one: loghunter export <backend_name>"
    )
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def _resolve_queries(
    config: dict[str, Any],
    backend: str,
    query_names: list[str],
    *,
    backend_module: ModuleType | None = None,
) -> list[tuple[str, dict[str, Any]]]:
    """Turn requested query names into ``(name, query_config)`` pairs.

    Queries come from the ``query`` table of the backend's stanza. When that
    table is empty and ``backend_module`` offers ``implicit_default_query()``,
    its return value is used as a single synthetic ``"default"`` query.

    With explicit ``query_names`` each name must exist and the pairs are
    returned in the requested order. With no names, ``"default"`` is chosen
    if defined, otherwise the sole defined query.

    Raises:
        ValueError: If a requested name is not defined, if no queries are
            defined at all, or if several are defined and none was named.
    """
    queries: dict[str, Any] = _backend_cfg(config, backend).get("query", {})
    if not queries:
        if backend_module is not None and hasattr(backend_module, "implicit_default_query"):
            queries = {"default": backend_module.implicit_default_query()}

    # Explicit names: look each one up, failing on the first unknown name.
    if query_names:
        selected = []
        for wanted in query_names:
            if wanted not in queries:
                available = ", ".join(sorted(queries))
                raise ValueError(
                    f"Query '{wanted}' not found in [export.{backend}.query]\n"
                    f"Available queries: {available}"
                )
            selected.append((wanted, queries[wanted]))
        return selected

    # No names given: auto-select.
    if "default" in queries:
        return [("default", queries["default"])]
    if len(queries) == 1:
        ((only_name, only_cfg),) = queries.items()
        return [(only_name, only_cfg)]
    if not queries:
        raise ValueError(
            f"No queries defined under [export.{backend}.query] in config.\n"
            f"Add a [export.{backend}.query.<name>] section to your config."
        )
    available = ", ".join(sorted(queries))
    raise ValueError(
        f"Multiple queries defined for backend '{backend}': {available}\n"
        f"Specify a query name: loghunter export {backend} <query_name>"
    )
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def _resolve_output_path(
    query_config: dict[str, Any],
    cli_out: str | None,
    since: datetime,
    until: datetime,
    query_name: str,
    *,
    extension: str = ".log",
    backend_config: dict[str, Any] | None = None,
    loghunter_config: dict[str, Any] | None = None,
    root: str = "",
) -> Path:
    """Resolve the final output path for a single query result.

    Five-tier cascade (most-specific wins), chosen by ``_pick_export_target``:
        1. cli_out (--out, expanded with root="" — shell semantics)
        2. query_config["export_dir"] (per-query; config, root applies)
        3. backend_config["export_dir"] (per-backend; config, root applies)
        4. loghunter_config["export_dir"] (global default; config, root applies)
        5. "." (CWD floor — literal, not a resolved value)

    The winning target string is passed through ``be_like_water`` to decide
    file vs directory. For a FILE verdict the path is returned as-is; for a
    DIRECTORY verdict an auto-generated filename is appended.

    **Per-source segmentation of the global base.** Only when tier 4 wins is
    the target treated as a directory BASE: the export goes to
    ``<base>/<source>/`` where ``source`` is ``output_basename`` or, failing
    that, ``query_name``. Every other tier is the literal final target and is
    not segmented.

    Args:
        query_config: The query's config stanza; may carry ``output_basename``
            and ``export_dir``.
        cli_out: Raw ``--out`` value, or None.
        since: Start of the export window (used for auto-naming).
        until: End of the export window (used for auto-naming).
        query_name: Name of the query; the fallback basename.
        extension: Suffix for auto-named files.
        backend_config: The backend's config stanza, if any.
        loghunter_config: The global ``[loghunter]`` stanza, if any.
        root: Base for relative config paths, passed to the cascade.

    Returns:
        The path the query's output should be written to.

    Raises:
        ValueError: If the global tier wins and the basename's final path
            component is empty or ``..`` — such a segment would either skip
            segmentation or climb out of the base directory.
    """
    # Compute the source basename once: it drives both the directory segment
    # (global tier) and the auto-named filename.
    basename = query_config.get("output_basename") or query_name
    target, from_global_base = _pick_export_target(
        cli_out, query_config, backend_config, loghunter_config, root=root,
    )
    if from_global_base:
        # ``.name`` drops any leading directories from the basename, but it
        # returns ".." unchanged for ".." and "" for "." — neither is a usable
        # sub-directory name, so reject them instead of building a path that
        # leaves (or silently collapses into) the base.
        segment = Path(basename).name
        if segment in ("", ".", ".."):
            raise ValueError(
                f"Cannot derive an export sub-directory from basename {basename!r} — "
                f"use a plain log-family name for output_basename."
            )
        # Normalize to exactly one separator before the segment and end with a
        # trailing slash so the target carries directory intent even when
        # <base>/<source>/ does not exist yet.
        target = target.rstrip("/") + "/" + segment + "/"
    resolved = be_like_water(target)
    if resolved.is_file:
        return resolved.path
    return resolved.path / _auto_filename(basename, since, until, extension=extension)
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
def _pick_export_target(
|
|
448
|
+
cli: str | None,
|
|
449
|
+
query: dict[str, Any] | None,
|
|
450
|
+
backend: dict[str, Any] | None,
|
|
451
|
+
loghunter: dict[str, Any] | None,
|
|
452
|
+
*,
|
|
453
|
+
root: str = "",
|
|
454
|
+
) -> tuple[str, bool]:
|
|
455
|
+
"""Return ``(target, from_global_base)`` for the five-tier cascade.
|
|
456
|
+
|
|
457
|
+
``target`` is the first set target STRING across the cascade;
|
|
458
|
+
``from_global_base`` is True iff the WINNING tier is
|
|
459
|
+
``loghunter["export_dir"]`` (tier 4) — the only tier that auto-segments per
|
|
460
|
+
source (see ``_resolve_output_path``). It is a real returned bool, not an
|
|
461
|
+
overloaded sentinel, and is consumed ONLY by ``_resolve_output_path``;
|
|
462
|
+
callers and backend modules never reason about it.
|
|
463
|
+
|
|
464
|
+
Preserves trailing slashes by working in strings, not Paths. CLI tier
|
|
465
|
+
resolves with root="" (shell semantics — ~-expansion only); the three
|
|
466
|
+
config tiers resolve through ``resolve_path(value, root)`` so LH_ROOT
|
|
467
|
+
applies. The CWD floor stays a literal "." even though
|
|
468
|
+
``resolve_path("", root)`` returns None for empty config values. Every
|
|
469
|
+
config tier — per-query, per-backend, and global — uses the single
|
|
470
|
+
``export_dir`` key.
|
|
471
|
+
"""
|
|
472
|
+
if cli is not None:
|
|
473
|
+
resolved = resolve_path(cli, "")
|
|
474
|
+
if resolved is not None:
|
|
475
|
+
return resolved, False
|
|
476
|
+
for stanza, key, is_global in [
|
|
477
|
+
(query, "export_dir", False),
|
|
478
|
+
(backend, "export_dir", False),
|
|
479
|
+
(loghunter, "export_dir", True),
|
|
480
|
+
]:
|
|
481
|
+
if stanza:
|
|
482
|
+
value = stanza.get(key)
|
|
483
|
+
if value:
|
|
484
|
+
resolved = resolve_path(value, root)
|
|
485
|
+
if resolved is not None:
|
|
486
|
+
return resolved, is_global
|
|
487
|
+
return ".", False
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
def _auto_filename(
|
|
491
|
+
basename: str,
|
|
492
|
+
since: datetime,
|
|
493
|
+
until: datetime,
|
|
494
|
+
*,
|
|
495
|
+
extension: str = ".log",
|
|
496
|
+
) -> str:
|
|
497
|
+
"""Derive an output filename from the time window.
|
|
498
|
+
|
|
499
|
+
Whole-day windows (both endpoints at midnight, integer days):
|
|
500
|
+
{basename}_{YYYYMMDD}_{N}d{extension}
|
|
501
|
+
|
|
502
|
+
All other windows (partial day, arbitrary range):
|
|
503
|
+
{basename}_{YYYYMMDD}_to_{YYYYMMDD_HHh}{extension}
|
|
504
|
+
"""
|
|
505
|
+
local_since = since.astimezone() if since.tzinfo else since
|
|
506
|
+
local_until = until.astimezone() if until.tzinfo else until
|
|
507
|
+
|
|
508
|
+
start_str = local_since.strftime("%Y%m%d")
|
|
509
|
+
|
|
510
|
+
since_at_midnight = (
|
|
511
|
+
local_since.hour == 0 and local_since.minute == 0 and local_since.second == 0
|
|
512
|
+
)
|
|
513
|
+
until_at_midnight = (
|
|
514
|
+
local_until.hour == 0 and local_until.minute == 0 and local_until.second == 0
|
|
515
|
+
)
|
|
516
|
+
|
|
517
|
+
if since_at_midnight and until_at_midnight:
|
|
518
|
+
n_days = int((local_until - local_since).total_seconds() // 86400)
|
|
519
|
+
if n_days >= 1:
|
|
520
|
+
return f"{basename}_{start_str}_{n_days}d{extension}"
|
|
521
|
+
|
|
522
|
+
end_str = local_until.strftime("%Y%m%d_%Hh")
|
|
523
|
+
return f"{basename}_{start_str}_to_{end_str}{extension}"
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
def _load_backend(backend_name: str) -> ModuleType:
|
|
527
|
+
"""Import and return the backend module for the given backend name."""
|
|
528
|
+
if backend_name == "splunk":
|
|
529
|
+
from loghunter.exporters import splunk as splunk_module
|
|
530
|
+
return splunk_module
|
|
531
|
+
if backend_name == "cloudtrail":
|
|
532
|
+
from loghunter.exporters import cloudtrail as cloudtrail_module
|
|
533
|
+
return cloudtrail_module
|
|
534
|
+
raise ValueError(f"Backend '{backend_name}' is not yet implemented.")
|