loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122):
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
loghunter/cli.py ADDED
@@ -0,0 +1,1108 @@
1
+ """CLI dispatcher — argument parsing, subcommand routing, and first-run experience.
2
+
3
+ Entry point: loghunter.cli:main (registered in pyproject.toml).
4
+
5
+ Dispatch table:
6
+ loghunter [options] PATH run all enabled detectors via runner
7
+ loghunter beacon|dns|syslog|... run a single detector
8
+ loghunter digest [PATH ...] orient-before-the-hunt card (sniff-driven)
9
+ loghunter export pull logs from external systems
10
+ loghunter init first-run setup wizard
11
+
12
+ Parsing is a small declarative spec (``_FLAG_LIST`` + ``_VERBS``) plus a
13
+ per-token loop (``_parse_args``). The spec governs allowed-flag membership,
14
+ validation, and generated per-command help. ``blob_path`` is NOT a flag —
15
+ it is an INTERNAL routing key synthesized post-sniff and MUST NOT enter
16
+ the spec.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import os
22
+ import sys
23
+ from dataclasses import dataclass
24
+ from datetime import datetime, timedelta, timezone
25
+ from pathlib import Path
26
+ from typing import Any
27
+
28
+ from loghunter.common import config as cfg
29
+ from loghunter.common.errors import DigestEmpty, ExportAborted
30
+ from loghunter.common.output import get_handler
31
+ from loghunter.common.paths import be_like_water, effective_root, resolve_path
32
+
33
+
34
+ # ── flag/verb spec ────────────────────────────────────────────────────────────
35
+
36
+
37
@dataclass(frozen=True)
class FlagSpec:
    """One advertised CLI flag.

    ``key`` is the underscore canonical key used by ``parsed[...]``, config,
    and runner kwargs — never the hyphenated display spelling.
    """
    # Canonical underscore key (e.g. "dry_run"); the name the parser stores under.
    key: str
    # Long spelling including the leading dashes (e.g. "--dry-run").
    long: str
    # Single-character short alias without the dash, or None when there is none.
    short: str | None
    # True when the flag must be written FLAG=VALUE; False for boolean switches.
    takes_value: bool
    # Placeholder rendered after "=" in generated help; "" for boolean switches.
    metavar: str
    # One-line description rendered in generated per-command help.
    help: str
50
+
51
+
52
# Ordered list — also the display order for generated per-command help.
# Positional fields per entry: key, long, short, takes_value, metavar, help.
_FLAG_LIST: tuple[FlagSpec, ...] = (
    FlagSpec("help", "--help", "h", False, "",
             "show this help and exit"),
    FlagSpec("verbose", "--verbose", "v", False, "",
             "verbose output (extended evidence and next-steps; -vv for full raw debug detail)"),
    FlagSpec("yes", "--yes", "y", False, "",
             "assume yes to advisory prompts (large-dataset, egress)"),
    FlagSpec("all", "--all", "a", False, "",
             "load all available data; overrides default window"),
    FlagSpec("out", "--out", "o", True, "PATH",
             "single per-run output target (file or dir; trailing / = dir)"),
    FlagSpec("config", "--config", "c", True, "FILE",
             "path to a config file (overrides search-path lookup)"),
    FlagSpec("since", "--since", "s", True, "DURATION|DATE",
             "window start (7d, 24h, or ISO date)"),
    FlagSpec("detect", "--detect", "d", True, "LIST",
             "detector selection (all, comma list, or 'all,!x,!y')"),
    FlagSpec("dry_run", "--dry-run", None, False, "",
             "show the plan without running detectors / writing output"),
    FlagSpec("export_allowlist", "--export-allowlist", None, False, "",
             "emit allowlist-ready lines instead of a report (stubbed)"),
    FlagSpec("output", "--output", None, True, "FORMAT",
             "output format (text, json, csv, html)"),
    FlagSpec("until", "--until", None, True, "DATE",
             "window end (ISO date)"),
    FlagSpec("days", "--days", None, True, "N-M",
             "days-ago range (e.g. 1-7); order-insensitive"),
    FlagSpec("hours", "--hours", None, True, "N-M",
             "hours-ago range (e.g. 0-2); order-insensitive"),
    FlagSpec("zeek_dir", "--zeek-dir", None, True, "PATH",
             "Zeek log directory (overrides config)"),
    FlagSpec("pihole_dir", "--pihole-dir", None, True, "PATH",
             "Pi-hole / dnsmasq log directory (overrides config)"),
    FlagSpec("syslog_dir", "--syslog-dir", None, True, "PATH",
             "rsyslog log directory (overrides config)"),
    FlagSpec("cloudtrail_dir", "--cloudtrail-dir", None, True, "PATH",
             "CloudTrail JSON directory (overrides config)"),
)

# Lookup tables derived from _FLAG_LIST: by canonical key, by long spelling
# (with leading dashes), and by short letter (flags without a short are omitted).
_FLAGS_BY_KEY: dict[str, FlagSpec] = {f.key: f for f in _FLAG_LIST}
_FLAGS_BY_LONG: dict[str, FlagSpec] = {f.long: f for f in _FLAG_LIST}
_FLAGS_BY_SHORT: dict[str, FlagSpec] = {f.short: f for f in _FLAG_LIST if f.short}
95
+
96
+
97
@dataclass(frozen=True)
class VerbSpec:
    """One verb in the dispatcher.

    ``name == ""`` represents the analyze/no-verb path. ``allowed`` is the set
    of canonical flag keys (not long spellings) — short flags are aliases for
    their canonical key, so a short flag is allowed iff its canonical key is
    in ``allowed``.
    """
    # Verb token as typed on the command line; "" for the no-verb analyze path.
    name: str
    # One-line description rendered under the usage line in per-command help.
    summary: str
    # Positional placeholder appended to the usage line (e.g. "[PATH]"); "" for none.
    positional_shape: str
    # Canonical flag keys this verb accepts.
    allowed: frozenset[str]
110
+
111
+
112
# Per-verb allowed-flag sets, expressed as canonical flag keys.
# The analyze (no-verb) path accepts every flag in the spec.
_ANALYZE_ALLOWED: frozenset[str] = frozenset({
    "help", "verbose", "yes", "all", "out", "config", "since", "detect",
    "dry_run", "export_allowlist", "output", "until", "days", "hours",
    "zeek_dir", "pihole_dir", "syslog_dir", "cloudtrail_dir",
})
# Single-detector verbs take everything analyze does except "detect".
_SINGLE_DET_ALLOWED: frozenset[str] = _ANALYZE_ALLOWED - {"detect"}
_DIGEST_ALLOWED: frozenset[str] = frozenset({
    "help", "verbose", "yes", "all", "out", "config", "since",
    "dry_run", "output", "until", "days", "hours", "zeek_dir",
})
_EXPORT_ALLOWED: frozenset[str] = frozenset({
    "help", "verbose", "yes", "out", "config", "since", "until", "days", "hours",
})
# init accepts only --help / -h.
_INIT_ALLOWED: frozenset[str] = frozenset({"help"})
126
+
127
+
128
# Verb table keyed by the verb token; "" is the analyze (no-verb) path.
# Each entry is VerbSpec(name, summary, positional_shape, allowed).
_VERBS: dict[str, VerbSpec] = {
    "": VerbSpec("", "run all enabled detectors",
                 "[PATH]", _ANALYZE_ALLOWED),
    "beacon": VerbSpec("beacon", "beacon detection (conn.log)",
                       "[PATH]", _SINGLE_DET_ALLOWED),
    "dns": VerbSpec("dns", "DNS clustering (Zeek or Pi-hole)",
                    "[PATH]", _SINGLE_DET_ALLOWED),
    "syslog": VerbSpec("syslog", "syslog anomaly detection",
                       "[PATH]", _SINGLE_DET_ALLOWED),
    "scan": VerbSpec("scan", "port scan detection (conn.log)",
                     "[PATH]", _SINGLE_DET_ALLOWED),
    "duration": VerbSpec("duration", "long connection detection (conn.log)",
                         "[PATH]", _SINGLE_DET_ALLOWED),
    "aws": VerbSpec("aws", "CloudTrail behavioral surfacing (per-principal)",
                    "[PATH]", _SINGLE_DET_ALLOWED),
    "digest": VerbSpec("digest", "orient-before-the-hunt card (schema sniffed)",
                       "[PATH ...]", _DIGEST_ALLOWED),
    "export": VerbSpec("export", "pull logs from external systems to local files",
                       "[BACKEND] [QUERY ...]", _EXPORT_ALLOWED),
    "init": VerbSpec("init", "first-run setup wizard",
                     "", _INIT_ALLOWED),
}
150
+
151
+
152
# Verb tokens that run exactly one detector; each also has a _VERBS entry
# using _SINGLE_DET_ALLOWED.
_SINGLE_DETECTOR_COMMANDS: frozenset[str] = frozenset({
    "beacon", "dns", "syslog", "scan", "duration", "aws",
})

# User-initiated stop (Ctrl-C during compute). Named so the future error-voice
# pass can find the message and exit code together. 130 is the Unix 128 + SIGINT
# convention. Ctrl-C AT THE CONFIRM PROMPTS (runner.py) is a separate path that
# routes through ExportAborted → exit 0; this is the mid-run sibling.
_STOPPED_MESSAGE = "Stopped."
_SIGINT_EXIT_CODE = 130
162
+
163
+
164
def main(argv: list[str] | None = None) -> None:
    """Run the CLI and translate failures into messages plus exit codes.

    Delegates to ``_main`` and exits with its return code when non-zero.
    Handler order matters and is kept as-is: Ctrl-C → "Stopped." / 130,
    ``ExportAborted`` → message on stdout / 0, ``ConfigError`` → 1,
    ``ValueError`` → 1 with a usage hint, ``OSError`` → 1.
    """

    def _fail(exc: BaseException, *, usage_hint: bool = False) -> None:
        # Shared "loghunter: <error>" reporter for the exit-1 handlers.
        print(f"loghunter: {exc}", file=sys.stderr)
        if usage_hint:
            print("Run 'loghunter --help' for usage.", file=sys.stderr)
        sys.exit(1)

    try:
        exit_code = _main(argv) or 0
    except KeyboardInterrupt:
        # Terminals usually echo "^C" without a newline; on a TTY emit a blank
        # line first so the message does not render as "^CStopped.". Non-TTY
        # stderr stays byte-exact at "Stopped.\n" for log capture and scripts.
        if sys.stderr.isatty():
            print(file=sys.stderr)
        print(_STOPPED_MESSAGE, file=sys.stderr)
        sys.exit(_SIGINT_EXIT_CODE)
    except ExportAborted as exc:
        print(str(exc))
        sys.exit(0)
    except cfg.ConfigError as exc:
        _fail(exc)
    except ValueError as exc:
        _fail(exc, usage_hint=True)
    except OSError as exc:
        _fail(exc)

    if exit_code:
        sys.exit(exit_code)
192
+
193
+
194
def _main(argv: list[str] | None = None) -> int:
    """Route the command line to a verb runner; ``main()`` formats exceptions.

    Returns an int exit code. Only the digest path can return non-zero; every
    other branch returns 0. An unrecognized first token prints an error and
    exits with status 1.
    """
    tokens = sys.argv[1:] if argv is None else argv

    if not tokens or tokens in (["--help"], ["-h"]):
        _print_global_usage()
        return 0

    first = tokens[0]
    if first in _SINGLE_DETECTOR_COMMANDS or first in ("digest", "init", "export"):
        # A verb name always wins over path detection.
        verb, rest = first, tokens[1:]
    elif first.startswith("-") or _looks_like_path(first):
        # No verb: the analyze path, which keeps every token.
        verb, rest = "", tokens
    else:
        print(f"loghunter: unknown command '{first}'", file=sys.stderr)
        print("Run 'loghunter --help' for usage.", file=sys.stderr)
        sys.exit(1)

    # Help short-circuit for a STANDALONE --help / -h token only. Spellings
    # such as `--help=x` are not help; they reach the strict parser and fail
    # with "takes no value". This runs before any verb runner is entered.
    if "--help" in rest or "-h" in rest:
        print(_render_verb_help(verb), end="")
        return 0

    if verb == "digest":
        return _run_digest(rest) or 0
    if verb == "init":
        _run_init(rest)
    elif verb == "export":
        _run_export(rest)
    elif verb:
        # Any remaining non-empty verb is one of the single-detector commands.
        _run_single_detector(verb, rest)
    else:
        _run_all_detectors(rest)
    return 0
247
+
248
+
249
+ def _looks_like_path(s: str) -> bool:
250
+ """Return True if the string looks like a filesystem path rather than a subcommand.
251
+
252
+ Verbs are matched first in ``_main`` so verb names always win; this only
253
+ decides whether a non-verb token routes to the analyze path or fails as
254
+ an unknown command. The ``os.path.exists`` clause catches bare filenames
255
+ in CWD (``loghunter conn.log``) that the prefix tests would miss.
256
+ """
257
+ if s.startswith("/") or s.startswith("~") or s.startswith("."):
258
+ return True
259
+ if "/" in s:
260
+ return True
261
+ try:
262
+ if os.path.exists(s):
263
+ return True
264
+ except (OSError, ValueError):
265
+ pass
266
+ return False
267
+
268
+
269
+ # ── usage / help generation ───────────────────────────────────────────────────
270
+
271
+
272
def _global_usage_text() -> str:
    """Compose the bare-loghunter / --help screen from the spec.

    Returns the full screen as one string; the trailing empty entry in
    ``lines`` makes the joined text end with a newline.
    """
    # NOTE(review): this view of the file may have collapsed the column
    # spacing inside these literals — confirm against the real source.
    lines = [
        "loghunter — network threat hunting for self-hosters",
        "",
        "Usage:",
        " loghunter [options] PATH run all enabled detectors",
        " loghunter beacon [options] PATH beacon detection (conn.log)",
        " loghunter dns [options] PATH DNS clustering (Zeek or Pi-hole)",
        " loghunter syslog [options] PATH syslog anomaly detection",
        " loghunter scan [options] PATH port scan detection (conn.log)",
        " loghunter duration [options] PATH long connection detection (conn.log)",
        " loghunter aws [options] PATH CloudTrail behavioral surfacing (per-principal)",
        "",
        " loghunter digest [options] PATH orient-before-the-hunt card; schema is",
        " inferred from the file (conn, dns, syslog,",
        " cloudtrail, or blob for unrecognized text)",
        "",
        " loghunter export pull logs from external systems",
        " loghunter init first-run setup wizard",
        "",
        "Common options (short forms shown for the frequently-typed flags):",
        " --help, -h --verbose, -v --yes, -y --all, -a",
        " --out, -o=PATH --config, -c=FILE --since, -s=… --detect, -d=LIST",
        "",
        "Less common: --dry-run --export-allowlist --output=FORMAT --until=DATE",
        " --days=N-M --hours=N-M",
        " --zeek-dir=PATH --syslog-dir=PATH --pihole-dir=PATH --cloudtrail-dir=PATH",
        "",
        "Run 'loghunter <command> --help' for command-specific options.",
        "",
    ]
    return "\n".join(lines)
305
+
306
+
307
def _print_global_usage() -> None:
    """Print the first-run usage message, appending a hint when no config is found."""
    # end="" because the usage text already ends with its own newline.
    print(_global_usage_text(), end="")
    # A None result from the config lookup is treated as "no config file yet".
    if cfg._find_config_file() is None:
        print(" No config found. Run 'loghunter init' to get started.")
        print(" Config will be written to ~/.loghunter/config.toml")


# Compatibility alias — internal helper kept under its historical name for
# tests/observers that import it directly.
_print_usage = _print_global_usage
318
+
319
+
320
def _render_verb_help(verb: str) -> str:
    """Build the ``<verb> --help`` / ``-h`` text from the flag and verb specs.

    The output is a usage line, the verb summary, and one row per flag the
    verb allows, in ``_FLAG_LIST`` order so the rendering is stable. Raises
    ``KeyError`` when ``verb`` has no ``_VERBS`` entry.
    """
    verb_spec = _VERBS[verb]

    command = f"loghunter {verb}" if verb else "loghunter"
    usage = f"Usage: {command} [options]"
    if verb_spec.positional_shape:
        usage += f" {verb_spec.positional_shape}"

    rows = [usage.rstrip(), "", verb_spec.summary, "", "Options:"]
    for flag in _FLAG_LIST:
        if flag.key in verb_spec.allowed:
            label = f" {flag.long}"
            if flag.short:
                label += f", -{flag.short}"
            if flag.takes_value:
                label += f"={flag.metavar}"
            # Pad the label column to 32 characters before the description.
            rows.append(f"{label:<32} {flag.help}".rstrip())
    return "\n".join(rows) + "\n"
338
+
339
+
340
+ # ── parser ────────────────────────────────────────────────────────────────────
341
+
342
+
343
def _parse_args(args: list[str], verb: str) -> dict[str, Any]:
    """Parse CLI tokens for ``verb`` into a kwargs dict.

    Validation order is BINDING: identity → verb-membership → value-shape. A
    globally-known but verb-disallowed flag yields the wrong-verb error
    regardless of value shape (``digest -d`` and ``digest --detect`` both
    report "not valid for digest", NOT "needs a value").

    Duplicate flags are last-wins. Values are single — never promoted to a
    list. Boolean flags are stored as ``True``; value flags store the text
    after ``=`` (which may be empty). ``-vv`` stores ``verbose_level = 2``.

    Positionals: ``parsed["path"]`` = first; ``parsed["paths"]`` = full list.
    Both keys are absent when no positional was given.

    Raises:
        ValueError: unknown verb, unknown flag, flag not valid for the verb,
            missing or unexpected value, or an attempt to bundle short flags.
    """
    if verb not in _VERBS:
        raise ValueError(f"unknown verb {verb!r}")
    allowed = _VERBS[verb].allowed
    verb_label = verb if verb else "analyze"

    def _alias(spec: FlagSpec) -> str:
        # " (-x)" suffix naming the short alias, or "" when there is none.
        return f" (-{spec.short})" if spec.short else ""

    def _wrong_verb_long(spec: FlagSpec) -> str:
        return f"{spec.long}{_alias(spec)} is not valid for {verb_label}"

    def _wrong_verb_short(spec: FlagSpec) -> str:
        # Short was typed; lead with the short, alias is the long.
        return f"-{spec.short} ({spec.long}) is not valid for {verb_label}"

    def _needs_value(spec: FlagSpec) -> str:
        short_hint = f"-{spec.short}=… or " if spec.short else ""
        return f"{spec.long}{_alias(spec)} needs a value: {short_hint}{spec.long}=…"

    def _no_value(spec: FlagSpec) -> str:
        return f"{spec.long}{_alias(spec)} takes no value"

    result: dict[str, Any] = {}
    positionals: list[str] = []

    def _store(spec: FlagSpec, has_eq: bool, val: str) -> None:
        # Value-shape check shared by long and short spellings; runs only
        # after identity and verb-membership have passed.
        if has_eq:
            if not spec.takes_value:
                raise ValueError(_no_value(spec))
            result[spec.key] = val
        else:
            if spec.takes_value:
                raise ValueError(_needs_value(spec))
            result[spec.key] = True

    for arg in args:
        # `-vv` is a SINGLE explicitly-registered literal token — the one-off
        # spelling for verbose-level 2. Recognized BEFORE the normal short-flag
        # branch so the bundling refusal never catches it. Anything longer
        # (-vvv, -vy) falls through to the bundling path. Wrong-verb parity:
        # `-vv` on a verb that disallows verbose raises the same wrong-verb
        # error as `-v`.
        if arg == "-vv":
            v_spec = _FLAGS_BY_SHORT.get("v")
            if v_spec is None:
                # No "v" short is registered, so there is no spec to describe;
                # report an unknown flag instead of dereferencing None.
                raise ValueError("unknown flag -vv")
            if v_spec.key not in allowed:
                raise ValueError(_wrong_verb_short(v_spec))
            result["verbose_level"] = 2
            continue

        if arg.startswith("--"):
            body, eq, val = arg[2:].partition("=")
            spec = _FLAGS_BY_LONG.get(f"--{body}")
            if spec is None:
                raise ValueError(f"unknown flag --{body}")
            if spec.key not in allowed:
                raise ValueError(_wrong_verb_long(spec))
            _store(spec, bool(eq), val)
        elif arg.startswith("-") and arg != "-":
            body, eq, val = arg[1:].partition("=")
            if len(body) == 1:
                spec = _FLAGS_BY_SHORT.get(body)
                if spec is None:
                    raise ValueError(f"unknown flag -{body}")
                if spec.key not in allowed:
                    raise ValueError(_wrong_verb_short(spec))
                _store(spec, bool(eq), val)
            elif len(body) > 1:
                # Bundling attempt — deliberately declined. Surface kindly when
                # every char is a known short; otherwise plain unknown-flag.
                if all(ch in _FLAGS_BY_SHORT for ch in body):
                    separated = " ".join(f"-{ch}" for ch in body)
                    raise ValueError(
                        f"short flags can't be combined (-{body}); "
                        f"pass separately: {separated}"
                    )
                raise ValueError(f"unknown flag -{body}")
            else:
                # Empty flag body, e.g. "-=x".
                raise ValueError(f"unknown flag {arg}")
        else:
            # Includes the bare "-" token.
            positionals.append(arg)

    if positionals:
        result["path"] = positionals[0]
        result["paths"] = positionals

    return result
449
+
450
+
451
+ # ── shared resolution helpers ─────────────────────────────────────────────────
452
+
453
+
454
+ def _assert_all_vs_timeframe(parsed: dict[str, Any]) -> None:
455
+ """``--all`` is mutually exclusive with explicit timeframe flags."""
456
+ if parsed.get("all") and any(k in parsed for k in ("since", "until", "days", "hours")):
457
+ raise ValueError(
458
+ "--all cannot be combined with --since, --until, --days, or --hours"
459
+ )
460
+
461
+
462
def _resolve_output_target(
    parsed: dict[str, Any], config: dict[str, Any],
) -> tuple[Path | None, Path | None]:
    """Pick the report destination from ``--out`` or ``[loghunter].report_dir``.

    A truthy ``--out`` wins and is resolved with an empty root; otherwise the
    configured ``report_dir`` is resolved against the effective root.

    Returns ``(output_file, output_dir)``: at most one is non-None, and both
    ``None`` means stdout.
    """
    loghunter_section = config.get("loghunter", {})
    root = effective_root(config)

    explicit_out = parsed.get("out")
    if explicit_out:
        target = resolve_path(explicit_out, "")
    else:
        target = resolve_path(loghunter_section.get("report_dir"), root)

    if target is None:
        return None, None

    resolved = be_like_water(target)
    return (resolved.path, None) if resolved.is_file else (None, resolved.path)
484
+
485
+
486
+ def _resolve_timeframe(
487
+ parsed: dict[str, Any],
488
+ now: datetime | None = None,
489
+ ) -> tuple[datetime | None, datetime | None]:
490
+ """Convert --since/--until/--days/--hours into a (since, until) datetime pair."""
491
+ if now is None:
492
+ now = datetime.now(timezone.utc)
493
+ since: datetime | None = None
494
+ until: datetime | None = None
495
+
496
+ if "days" in parsed:
497
+ a, b = _parse_range(str(parsed["days"]), "--days")
498
+ since = (now - timedelta(days=b)).replace(hour=0, minute=0, second=0, microsecond=0)
499
+ until = (now - timedelta(days=a)).replace(hour=23, minute=59, second=59, microsecond=0)
500
+ return since, until
501
+
502
+ if "hours" in parsed:
503
+ a, b = _parse_range(str(parsed["hours"]), "--hours")
504
+ since = now - timedelta(hours=b)
505
+ until = now - timedelta(hours=a)
506
+ return since, until
507
+
508
+ if "since" in parsed:
509
+ s = str(parsed["since"])
510
+ if s.endswith("d"):
511
+ since = now - timedelta(days=_parse_positive_int(s[:-1], "--since"))
512
+ elif s.endswith("h"):
513
+ since = now - timedelta(hours=_parse_positive_int(s[:-1], "--since"))
514
+ else:
515
+ since = _parse_iso_date(s, "--since")
516
+
517
+ if "until" in parsed:
518
+ until = _parse_iso_date(str(parsed["until"]), "--until")
519
+
520
+ return since, until
521
+
522
+
523
+ def _parse_range(value: str, flag: str) -> tuple[int, int]:
524
+ """Parse N-M range arguments for --days and --hours."""
525
+ parts = value.split("-")
526
+ if len(parts) != 2:
527
+ raise ValueError(f"{flag} expects a range like 3-5")
528
+ try:
529
+ start, end = sorted(int(part) for part in parts)
530
+ except ValueError as exc:
531
+ raise ValueError(f"{flag} expects numeric values like 3-5") from exc
532
+ return start, end
533
+
534
+
535
+ def _parse_positive_int(value: str, flag: str) -> int:
536
+ """Parse a positive integer embedded in a duration flag."""
537
+ try:
538
+ parsed = int(value)
539
+ except ValueError as exc:
540
+ raise ValueError(f"{flag} expects a duration like 7d or 24h") from exc
541
+ if parsed < 0:
542
+ raise ValueError(f"{flag} duration must be positive")
543
+ return parsed
544
+
545
+
546
+ def _parse_iso_date(value: str, flag: str) -> datetime:
547
+ """Parse an ISO date/time as UTC for CLI timeframe flags."""
548
+ try:
549
+ parsed = datetime.fromisoformat(value)
550
+ except ValueError as exc:
551
+ raise ValueError(f"{flag} expects a date like 2026-05-01") from exc
552
+ return parsed.replace(tzinfo=timezone.utc)
553
+
554
+
555
+ # ── runner-kwargs builders ────────────────────────────────────────────────────
556
+
557
+
558
+ def _resolve_verbose_level(parsed: dict[str, Any]) -> int:
559
+ """Collapse the parser's two-key verbose state into a single 0/1/2 dial.
560
+
561
+ ``-vv`` is registered as the literal token ``verbose_level=2`` (see
562
+ ``_parse_args``); ``-v`` / ``--verbose`` set ``verbose=True``. Their
563
+ last-wins resolution lands here:
564
+ none → 0; -v → 1; -vv → 2; combined → 2.
565
+ Only the text handler distinguishes all three levels; every other
566
+ consumer collapses to ``>= 1`` (export internals, csv/html description
567
+ gate, digest summariser-failure breadcrumb).
568
+ """
569
+ if parsed.get("verbose_level") == 2:
570
+ return 2
571
+ return 1 if parsed.get("verbose") else 0
572
+
573
+
574
# Canonical keys of the per-family source-dir flags (--zeek-dir, --syslog-dir,
# --pihole-dir, --cloudtrail-dir).
_ANALYZE_SOURCE_KEYS: tuple[str, ...] = (
    "zeek_dir", "syslog_dir", "pihole_dir", "cloudtrail_dir",
)
577
+
578
+
579
+ def _merge_family_value(
580
+ bucket: list[str], flag_value: str | None,
581
+ ) -> str | list[str] | None:
582
+ """Combine a positional-derived bucket with the explicit ``--<family>-dir``
583
+ flag value, returning the runner-kwarg shape for that family.
584
+
585
+ MERGE rule (sanctioned rail supersession in the rev-3 prompt — James
586
+ reconciles CODE.md after landing): positionals routed to the family
587
+ + the flag value BOTH contribute, both load. Order is positionals
588
+ first, flag appended; dedup is the loader's job (via ``.resolve()``).
589
+
590
+ Wire-shape compression for the runner kwarg:
591
+
592
+ - empty + no flag → ``None`` (no override; config fallback within scope)
593
+ - exactly one truthy value → scalar (string) — keeps programmatic
594
+ scalar-caller shape byte-identical with the prior single-Path contract
595
+ - 2+ values → ``list[str]`` — the multi-input shape
596
+
597
+ All three shapes flow through ``runner.run`` → ``resolve_sources`` →
598
+ ``_normalize_overrides``, which collapses to the same downstream
599
+ ``list[Path]`` regardless. Raw strings only — the CLI does NOT
600
+ ``Path(...)`` or ``resolve_path`` source values; ``_resolve_one`` is
601
+ the SOLE string→Path site.
602
+ """
603
+ merged: list[str] = [b for b in bucket if b]
604
+ if flag_value:
605
+ merged.append(flag_value)
606
+ if not merged:
607
+ return None
608
+ if len(merged) == 1:
609
+ return merged[0]
610
+ return merged
611
+
612
+
613
def _build_positional_buckets(
    paths: list[str], *, detector_module: Any | None,
) -> dict[str, list[str]]:
    """Group positionals by the source family the router assigns to each.

    Every path is passed to ``route_positional_source`` together with
    ``detector_module``. The result maps each returned family key to its
    positionals in input order; only families that received a path appear,
    so empty input gives an empty dict.
    """
    from loghunter.common.sources import route_positional_source

    grouped: dict[str, list[str]] = {}
    for positional in paths:
        family = route_positional_source(positional, detector_module=detector_module)
        if family not in grouped:
            grouped[family] = []
        grouped[family].append(positional)
    return grouped
629
+
630
+
631
def _runner_kwargs(
    parsed: dict[str, Any],
    config: dict[str, Any],
    detect: str | None = None,
    scope: frozenset[str] | None = None,
    source_buckets: dict[str, list[str]] | None = None,
) -> dict[str, Any]:
    """Assemble the keyword arguments for ``runner.run()``.

    Source-dir overrides are forwarded as raw strings, per-family lists, or
    ``None``; this function never converts them to paths. For each family in
    ``_ANALYZE_SOURCE_KEYS`` the positionals from ``source_buckets`` are
    merged with the matching ``--<family>-dir`` flag (positionals first, flag
    appended).

    ``detect`` overrides the parsed ``--detect`` value when truthy. ``scope``
    is passed through unchanged. ``output_format`` comes from ``--output``,
    then ``[loghunter].output_format``, then ``"text"``.

    Raises:
        ValueError: ``--all`` is combined with a timeframe flag, or a
            timeframe value is malformed.
    """
    _assert_all_vs_timeframe(parsed)

    since, until = _resolve_timeframe(parsed)
    loghunter_section = config.get("loghunter", {})
    output_file, output_dir = _resolve_output_target(parsed, config)

    positional_buckets = source_buckets or {}
    per_family: dict[str, str | list[str] | None] = {}
    for family in _ANALYZE_SOURCE_KEYS:
        per_family[family] = _merge_family_value(
            positional_buckets.get(family, []), parsed.get(family)
        )

    return {
        "config": config,
        "detect": detect or parsed.get("detect"),
        # Raw strings / lists / None — the resolver owns Path conversion.
        "zeek_dir": per_family["zeek_dir"],
        "syslog_dir": per_family["syslog_dir"],
        "pihole_dir": per_family["pihole_dir"],
        "cloudtrail_dir": per_family["cloudtrail_dir"],
        "scope": scope,
        "since": since,
        "until": until,
        "output_format": parsed.get(
            "output", loghunter_section.get("output_format", "text")
        ),
        "output_dir": output_dir,
        "output_file": output_file,
        "verbose_level": _resolve_verbose_level(parsed),
        "dry_run": bool(parsed.get("dry_run", False)),
        "export_allowlist": bool(parsed.get("export_allowlist", False)),
        "load_all": bool(parsed.get("all", False)),
        "skip_confirm": bool(parsed.get("yes", False)),
    }
692
+
693
+
694
+ # ── verb runners ──────────────────────────────────────────────────────────────
695
+
696
+
697
def _named_detector_module(detect: Any) -> Any:
    """Import and return the detector module named by a single-name selector.

    Only ``loghunter.detectors.<name>`` for the exact (stripped) selector is
    imported; nothing else is scanned or iterated.

    Args:
        detect: The raw ``--detect`` value (or a verb name). Any type is
            accepted.

    Returns:
        The imported module, or ``None`` when ``detect`` is not a string, is
        blank, equals ``all`` (case-insensitive), contains ``,`` or ``!``, or
        the import raises ``ImportError``.
    """
    import importlib

    selector = detect.strip() if isinstance(detect, str) else ""
    names_exactly_one = (
        bool(selector)
        and selector.lower() != "all"
        and not any(marker in selector for marker in (",", "!"))
    )
    if not names_exactly_one:
        return None

    try:
        module = importlib.import_module("loghunter.detectors." + selector)
    except ImportError:
        # Unknown / unimportable detector name: treated as "no module".
        module = None
    return module
716
+
717
+
718
def _run_all_detectors(args: list[str]) -> None:
    """Parse ``args`` and run the runner without pinning a detector.

    Positionals found under ``parsed["paths"]`` are grouped into per-family
    buckets by ``_build_positional_buckets``. The detector module handed to
    the router comes from ``_named_detector_module(parsed.get("detect"))``,
    which yields ``None`` unless the selector names exactly one importable
    detector. The buckets are passed to ``_runner_kwargs`` (which merges them
    with any explicit per-family flag), and the set of bucket keys becomes the
    run's ``scope``. With no positionals, ``scope`` is ``None`` and the
    buckets are empty.
    """
    import loghunter.runner as runner

    parsed = _parse_args(args, "")

    # Called only for its effect (presumably rejecting an unknown --output
    # before any work starts); the return value is not used.
    if "output" in parsed:
        get_handler(parsed["output"])

    config = cfg.load(parsed.get("config"))

    source_buckets: dict[str, list[str]] = {}
    positionals = parsed.get("paths") or []
    if positionals:
        source_buckets = _build_positional_buckets(
            positionals,
            detector_module=_named_detector_module(parsed.get("detect")),
        )
    # An empty bucket mapping means "unconstrained", not "scope to nothing".
    scope: frozenset[str] | None = (
        frozenset(source_buckets) if source_buckets else None
    )

    run_kwargs = _runner_kwargs(
        parsed, config, scope=scope, source_buckets=source_buckets,
    )
    runner.run(**run_kwargs)
750
+
751
+
752
def _run_single_detector(detector: str, args: list[str]) -> None:
    """Parse ``args`` and run the runner pinned to ``detector``.

    Positionals found under ``parsed["paths"]`` are grouped into per-family
    buckets by ``_build_positional_buckets``, using the module returned by
    ``_named_detector_module(detector)`` as the routing hint. The buckets are
    passed to ``_runner_kwargs`` (which merges them with any explicit
    per-family flag), and the set of bucket keys becomes the run's ``scope``.
    With no positionals, ``scope`` is ``None`` and the buckets are empty.

    Args:
        detector: Name of the detector verb; used both for argument parsing
            and as the ``detect`` value passed to the runner.
        args: Raw CLI tokens following the verb.
    """
    import loghunter.runner as runner

    parsed = _parse_args(args, detector)

    # Called only for its effect (presumably rejecting an unknown --output
    # before any work starts); the return value is not used.
    if "output" in parsed:
        get_handler(parsed["output"])

    config = cfg.load(parsed.get("config"))

    source_buckets: dict[str, list[str]] = {}
    positionals = parsed.get("paths") or []
    if positionals:
        source_buckets = _build_positional_buckets(
            positionals,
            detector_module=_named_detector_module(detector),
        )
    # An empty bucket mapping means "unconstrained", not "scope to nothing".
    scope: frozenset[str] | None = (
        frozenset(source_buckets) if source_buckets else None
    )

    run_kwargs = _runner_kwargs(
        parsed,
        config,
        detect=detector,
        scope=scope,
        source_buckets=source_buckets,
    )
    runner.run(**run_kwargs)
783
+
784
+
785
# Keys of the parsed-args dict that carry a source location. They are dropped
# from every per-path copy before the freshly sniffed key is written.
_SOURCE_DIR_KEYS = (
    "zeek_dir", "pihole_dir", "syslog_dir", "cloudtrail_dir", "blob_path",
)


def _route_sniffed_path(
    parsed: dict[str, Any],
    path: Path,
    result: Any,
) -> tuple[dict[str, Any], str]:
    """Return a per-path copy of ``parsed`` that points at ``path``.

    The copy omits every key listed in ``_SOURCE_DIR_KEYS`` so that a value
    from an earlier path (or from the original arguments) cannot leak into
    this one, then sets exactly one source key to ``str(path)``:

    * ``conn``        -> ``zeek_dir``
    * ``dns``         -> ``pihole_dir`` if ``result.origin == "pihole"``,
                         otherwise ``zeek_dir``
    * ``syslog``      -> ``zeek_dir`` if ``result.origin == "zeek"``,
                         otherwise ``syslog_dir``
    * ``cloudtrail``  -> ``cloudtrail_dir``
    * anything else   -> ``blob_path`` (an internal routing key, not a flag)

    Args:
        parsed: Parsed CLI arguments; not modified.
        path: The positional path that was sniffed.
        result: Sniff result exposing ``schema`` and, for the dns / syslog
            schemas, ``origin``.

    Returns:
        ``(per_path_parsed, result.schema)``.
    """
    per_path: dict[str, Any] = {}
    for key, value in parsed.items():
        if key in _SOURCE_DIR_KEYS:
            continue
        per_path[key] = value

    schema = result.schema
    if schema == "conn":
        target_key = "zeek_dir"
    elif schema == "dns":
        # Pi-hole query logs and Zeek dns.log share a schema but not a family.
        target_key = "pihole_dir" if result.origin == "pihole" else "zeek_dir"
    elif schema == "syslog":
        # Same origin split as dns: Zeek syslog.log versus a flat syslog file.
        target_key = "zeek_dir" if result.origin == "zeek" else "syslog_dir"
    elif schema == "cloudtrail":
        target_key = "cloudtrail_dir"
    else:
        # Fallback (the "blob" schema): blob_path is synthesized here after
        # the sniff and is never exposed as a user-facing flag.
        target_key = "blob_path"

    per_path[target_key] = str(path)
    return per_path, schema
824
+
825
+
826
def _run_digest(args: list[str]) -> int:
    """Run the digest verb for zero, one, or many positional paths.

    Flow:

    1. Parse the arguments and validate ``--output``: the value is passed to
       ``get_handler`` first, then anything other than ``"text"`` is rejected.
    2. Reject any ``--<family>-dir`` flag given together with a positional.
    3. No positional: one config-driven ``runner.run_digest`` call with
       ``schema="conn"``.
    4. Otherwise fan out: each path is sniffed, routed to a source key by
       ``_route_sniffed_path`` and rendered into one shared output stream
       that is opened lazily and always closed.

    Args:
        args: Raw CLI tokens following the ``digest`` verb.

    Returns:
        ``0`` when at least one path rendered or when no path was counted as
        an error; ``1`` otherwise. The bare (no positional) form returns
        ``0``, including when ``DigestEmpty`` is raised.

    Raises:
        ValueError: for a non-text ``--output`` or a source-dir flag combined
            with a positional path.
    """
    import loghunter.runner as runner
    from loghunter.common.loader import sniff_format_detailed

    def _narrate_recognized_empty(exc: Any) -> None:
        # The input was understood but held no parseable records: say so on
        # stderr instead of rendering a card.
        print(
            f"digest: {exc.basename}: recognized as {exc.schema} "
            "but no parseable records — skipping.",
            file=sys.stderr,
        )

    parsed = _parse_args(args, "digest")

    # --output is accepted for digest, but only text cards can be rendered.
    # get_handler runs first so an unknown format is reported the same way as
    # for the other verbs.
    out_fmt = parsed.get("output", "text")
    get_handler(out_fmt)
    if out_fmt != "text":
        raise ValueError(
            f"digest currently supports only --output=text (got {out_fmt!r})"
        )

    # A positional routes itself via the sniff, so an explicit source-dir
    # flag next to it would be silently ignored; refuse the combination.
    if parsed.get("paths"):
        conflicting = next(
            (
                key
                for key in (
                    "zeek_dir", "pihole_dir", "syslog_dir", "cloudtrail_dir",
                )
                if key in parsed
            ),
            None,
        )
        if conflicting is not None:
            raise ValueError(
                f"digest: --{conflicting.replace('_', '-')} is not valid alongside "
                "a positional PATH (positionals self-route via sniff)"
            )

    config = cfg.load(parsed.get("config"))

    positionals = parsed.get("paths") or []

    if not positionals:
        # Bare digest: a single conn card from the configured source. The
        # output target is resolved inside _digest_runner_kwargs here.
        try:
            runner.run_digest(
                **_digest_runner_kwargs(parsed, config, schema="conn")
            )
        except DigestEmpty as exc:
            _narrate_recognized_empty(exc)
        return 0

    # Fan-out: the shared output target is resolved once, never per path.
    get_stream, close_stream = _build_digest_fanout_stream(
        parsed, config, dry_run=bool(parsed.get("dry_run", False)),
    )

    is_multirun = len(positionals) > 1

    # Only `rendered` and `errored` feed the exit status; the other two
    # counters are bookkeeping.
    rendered = empty = recognized_empty = errored = 0
    try:
        for raw in positionals:
            path = Path(os.path.expanduser(raw))

            if not path.exists():
                print(f"digest: path not found: {path}", file=sys.stderr)
                errored += 1
                continue

            if path.is_dir():
                # A directory is skipped silently in a multi-path run. A lone
                # directory positional is reported and counted as an error.
                if not is_multirun:
                    print(
                        f"digest: PATH must be a file, not a directory: {path}",
                        file=sys.stderr,
                    )
                    errored += 1
                continue

            try:
                sniffed = sniff_format_detailed(path)
                if sniffed.state == "empty":
                    print(f"{path.name} is empty. Nothing to do!")
                    empty += 1
                    continue
                parsed_for_path, schema = _route_sniffed_path(
                    parsed, path, sniffed,
                )
                kwargs = _digest_runner_kwargs(
                    parsed_for_path,
                    config,
                    schema=schema,
                    resolve_output=False,
                )
                if schema != "blob":
                    kwargs["fallback_blob_path"] = path
                runner.run_digest(
                    **kwargs,
                    stream=get_stream(),
                    leading_separator=(rendered > 0),
                    show_progress=not is_multirun,
                )
            except DigestEmpty as exc:
                _narrate_recognized_empty(exc)
                recognized_empty += 1
            except (ValueError, OSError) as exc:
                print(f"digest: {path.name}: {exc}", file=sys.stderr)
                errored += 1
            else:
                # Reached only when run_digest returned normally; the
                # `continue` above bypasses this clause.
                rendered += 1
    finally:
        close_stream()

    return 0 if (rendered > 0 or errored == 0) else 1
951
+
952
+
953
def _build_digest_fanout_stream(
    parsed: dict[str, Any],
    config: dict[str, Any],
    dry_run: bool = False,
) -> tuple[Any, Any]:
    """Build a lazy ``(get_stream, close_stream)`` pair for digest output.

    Args:
        parsed: Parsed CLI arguments, forwarded to ``_resolve_output_target``.
        config: Loaded configuration, forwarded to ``_resolve_output_target``.
        dry_run: When true, output resolution is skipped entirely and the
            stdout pair is returned.

    Returns:
        ``get_stream()`` returns ``sys.stdout`` (looked up at call time) when
        no file or directory target is resolved. For a file target it creates
        the parent directory and opens the file for writing on the first call,
        then returns that same handle on later calls.
        ``close_stream()`` closes the file if it was ever opened and is a
        no-op otherwise.
    """

    def _stdout_stream() -> Any:
        # Resolved on every call so a later rebinding of sys.stdout is seen.
        return sys.stdout

    def _nothing_to_close() -> None:
        return None

    if dry_run:
        return (_stdout_stream, _nothing_to_close)

    output_file, output_dir = _resolve_output_target(parsed, config)

    if output_file is None and output_dir is None:
        return (_stdout_stream, _nothing_to_close)

    if output_file is None:
        # Directory target: one digest_<UTC timestamp>.txt for the whole run,
        # never derived from an input path.
        stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
        target = output_dir / f"digest_{stamp}.txt"
    else:
        target = output_file

    handle: Any = None

    def _get_stream() -> Any:
        nonlocal handle
        if handle is None:
            target.parent.mkdir(parents=True, exist_ok=True)
            handle = target.open("w", encoding="utf-8", newline="")
        return handle

    def _close_stream() -> None:
        if handle is not None:
            handle.close()

    return (_get_stream, _close_stream)
997
+
998
+
999
def _digest_runner_kwargs(
    parsed: dict[str, Any],
    config: dict[str, Any],
    schema: str = "conn",
    resolve_output: bool = True,
) -> dict[str, Any]:
    """Assemble the keyword arguments for ``runner.run_digest``.

    The four source-dir values (``zeek_dir``, ``pihole_dir``, ``syslog_dir``,
    ``cloudtrail_dir``) are copied from ``parsed`` untouched (raw value or
    ``None``); no path resolution happens here for them.

    ``blob_path`` is the one exception: it is an internal routing key (set by
    ``_route_sniffed_path``, not by a flag) and, when truthy, is
    user-expanded and wrapped in ``Path``.

    Args:
        parsed: Parsed CLI arguments, possibly a per-path variant.
        config: Loaded configuration.
        schema: Schema name forwarded to ``run_digest``.
        resolve_output: When false, ``output_file`` and ``output_dir`` are
            left as ``None`` and ``_resolve_output_target`` is not called.
            The fan-out caller uses this because it passes a shared stream
            instead.

    Returns:
        A dict ready to be splatted into ``runner.run_digest``.

    Raises:
        Whatever ``_assert_all_vs_timeframe``, ``_resolve_timeframe`` or
        ``_resolve_output_target`` raise for invalid argument combinations.
    """
    _assert_all_vs_timeframe(parsed)

    since, until = _resolve_timeframe(parsed)

    if resolve_output:
        output_file, output_dir = _resolve_output_target(parsed, config)
    else:
        output_file = output_dir = None

    raw_blob = parsed.get("blob_path")
    # expanduser only: no further resolution is applied to the blob path here.
    blob_path = Path(os.path.expanduser(raw_blob)) if raw_blob else None

    return {
        "config": config,
        # Source-dir overrides stay raw (str or None) on purpose.
        "zeek_dir": parsed.get("zeek_dir"),
        "pihole_dir": parsed.get("pihole_dir"),
        "syslog_dir": parsed.get("syslog_dir"),
        "cloudtrail_dir": parsed.get("cloudtrail_dir"),
        "blob_path": blob_path,
        "since": since,
        "until": until,
        "output_format": parsed.get("output", "text"),
        "output_dir": output_dir,
        "output_file": output_file,
        "verbose_level": _resolve_verbose_level(parsed),
        "dry_run": bool(parsed.get("dry_run", False)),
        "load_all": bool(parsed.get("all", False)),
        "skip_confirm": bool(parsed.get("yes", False)),
        "schema": schema,
    }
1055
+
1056
+
1057
def _run_init(args: list[str]) -> None:
    """Validate the ``init`` arguments, then start the setup wizard.

    ``_parse_args`` is invoked purely for validation: its result is discarded,
    so the only way ``args`` can influence this function is by making the
    parser raise. The wizard module is imported after that check, which means
    a rejected command line never loads it.

    Args:
        args: Raw CLI tokens following the ``init`` verb.
    """
    _parse_args(args, "init")

    from loghunter.cli_init import run_init as start_wizard

    start_wizard()
1068
+
1069
+
1070
def _run_export(args: list[str]) -> None:
    """Parse the ``export`` arguments and hand them to ``run_export``.

    The first positional is treated as the backend name when it is one of
    ``splunk`` / ``cloudtrail``; the remaining positionals are query names.
    Otherwise no backend is passed and every positional is a query name.

    Args:
        args: Raw CLI tokens following the ``export`` verb.
    """
    from loghunter.exporters import run_export

    parsed = _parse_args(args, "export")

    # The timeframe is resolved against the local, timezone-aware "now".
    since, until = _resolve_timeframe(parsed, now=datetime.now().astimezone())

    config = cfg.load(parsed.get("config"))

    positionals: list[str] = parsed.get("paths") or []

    known_backends = frozenset({"splunk", "cloudtrail"})
    leading = positionals[0] if positionals else None
    backend: str | None
    if leading in known_backends:
        backend, query_names = leading, positionals[1:]
    else:
        backend, query_names = None, positionals

    # The raw --out string is forwarded as-is (trailing slash included);
    # file-versus-directory is decided further down the export pipeline.
    out_str = parsed.get("out")

    # Export has a single verbosity switch, so any level >= 1 collapses to
    # True at this boundary.
    verbose = _resolve_verbose_level(parsed) >= 1

    run_export(
        config=config,
        backend=backend,
        query_names=query_names,
        since=since,
        until=until,
        out=out_str,
        verbose=verbose,
        skip_confirm=bool(parsed.get("yes", False)),
    )