loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
loghunter/cli_init.py ADDED
@@ -0,0 +1,567 @@
1
+ """First-run setup wizard.
2
+
3
+ CLI-INTERNAL split off ``loghunter/cli.py`` — first-run UX REMAINS CLI-layer
4
+ ownership. This module owns the wizard; ``cli.py`` keeps dispatch and arg
5
+ validation. Nothing here imports detectors, runner, or outputs.
6
+
7
+ The wizard mostly works by hitting Enter: it LOOKS before it asks (detect +
8
+ profile what's on disk) and NEVER clobbers config a user already set. Path
9
+ profiling is glob + stat ONLY — never reads a log line.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import os
15
+ import re
16
+ import tomllib
17
+ from datetime import datetime, timedelta
18
+ from pathlib import Path
19
+
20
+ # ── detection ─────────────────────────────────────────────────────────────────
21
+ #
22
+ # Detection looks at conventional public paths; nothing here ever reads a log
23
+ # LINE — stat + glob only. Constants are module-level so flow tests can
24
+ # monkeypatch them off the developer's real filesystem.
25
+
26
# Conventional Zeek log directories, probed in this order.
_ZEEK_CANDIDATES: tuple[str, ...] = (
    "/var/log/zeek", "/opt/zeek/logs", "/usr/local/zeek/logs", "/nsm/zeek/logs",
)

# Pairs of (probe, directory to register when the probe hits). A probe is
# either a literal file path or an absolute glob such as "/dir/*.log".
_PIHOLE_CANDIDATES: tuple[tuple[str, str], ...] = (
    ("/var/log/pihole/pihole.log", "/var/log/pihole"),
    ("/var/log/pihole.log", "/var/log"),
    ("/var/log/pihole/*.log", "/var/log/pihole"),
)

_SYSLOG_CANDIDATE: str = "/var/log"

# Per-family Zeek globs: they feed both the {logs} label and the size sum.
_ZEEK_GLOBS: tuple[str, ...] = (
    "conn*.log*",
    "dns*.log*",
    "ssl*.log*",
    "http*.log*",
    "weird*.log*",
    "notice*.log*",
)

# Deliberately narrow, even when the registered directory is /var/log: only
# the Pi-hole file is profiled so unrelated syslog files don't inflate the size.
_PIHOLE_GLOB: str = "pihole.log*"

# Same glob as the detector's OPTIONAL_LOGS, so the profile previews exactly
# what will be analyzed.
_SYSLOG_GLOB: str = "*.log*"

# Upper bound on the number of files a single profile pass will collect.
_PROFILE_FILE_CAP: int = 5000
_DOCS_URL: str = "https://github.com/spiralbend/loghunter"
55
+
56
+
57
def _detect_zeek() -> str | None:
    """Pick the Zeek log directory from the conventional candidates.

    A directory holding any ``conn*.log*`` file is returned immediately.
    Otherwise the earliest candidate holding any ``*.log*`` file is returned,
    or None when no candidate qualifies. Directories that raise OSError while
    being probed are skipped.
    """
    first_with_logs: str | None = None
    for candidate in _ZEEK_CANDIDATES:
        directory = Path(candidate)
        try:
            if not directory.is_dir():
                continue
            # Only the existence of a match matters; stop at the first one.
            if next(directory.glob("conn*.log*"), None) is not None:
                return candidate
            if first_with_logs is None:
                if next(directory.glob("*.log*"), None) is not None:
                    first_with_logs = candidate
        except OSError:
            continue
    return first_with_logs
73
+
74
+
75
def _detect_pihole() -> str | None:
    """Return the directory registered for the first Pi-hole probe that hits.

    Literal probes hit when the file exists. Glob probes hit when their parent
    directory exists and at least one entry matches the pattern. Probes that
    raise OSError are skipped; None means nothing was found.
    """
    for probe, candidate_dir in _PIHOLE_CANDIDATES:
        probe_path = Path(probe)
        try:
            if "*" not in probe:
                hit = probe_path.is_file()
            else:
                # Path.glob is relative to its receiver, so an absolute glob
                # is split into (parent directory, final-component pattern).
                parent = probe_path.parent
                hit = parent.is_dir() and any(parent.glob(probe_path.name))
            if hit:
                return candidate_dir
        except OSError:
            continue
    return None
92
+
93
+
94
def _detect_syslog() -> str | None:
    """Return the syslog candidate directory when it exists, otherwise None."""
    try:
        present = Path(_SYSLOG_CANDIDATE).is_dir()
    except OSError:
        return None
    return _SYSLOG_CANDIDATE if present else None
100
+
101
+
102
def _human_bytes(n: int) -> str:
    """Render a byte count as `~6 GB` / `~340 MB` / `~12 KB`.

    Values are floored to whole units (binary, 1024-based). The leading `~`
    signals that the figure is glob-scoped rather than a whole-directory size.
    """
    if n < 1024:
        return f"~{n} B"
    # power 1 → KB, 2 → MB, 3 → GB; anything larger falls through to TB.
    for power, unit in enumerate(("KB", "MB", "GB"), start=1):
        if n < 1024 ** (power + 1):
            return f"~{n // (1024 ** power)} {unit}"
    return f"~{n // (1024 ** 4)} TB"
114
+
115
+
116
def _fresh_bucket(delta: timedelta) -> str:
    """Translate an age into the wizard's relative-time phrase.

    Buckets: under an hour, under a day, under a week, under 30 days (whole
    days), under 60 days (whole weeks), and everything older (30-day months).
    """
    day = 86_400
    age = delta.total_seconds()
    if age < 3600:
        return "updated just now"
    if age < day:
        return "fresh today"
    if age < 7 * day:
        return "active this week"

    whole_days = int(age // day)
    if age < 30 * day:
        return f"last activity ~{whole_days} days ago"
    if age < 60 * day:
        weeks = whole_days // 7
        return f"but it looks stale — nothing new in ~{weeks} weeks"
    months = whole_days // 30
    return f"but it looks stale — nothing new in ~{months} months"
133
+
134
+
135
def _profile_dir(
    path: str,
    globs: tuple[str, ...],
    *,
    logs_label: str | None,
    now: datetime | None = None,
) -> dict | None:
    """Summarize a candidate directory using glob + stat only.

    Returns a dict with ``size_bytes``, ``size_str``, ``fresh_str``, ``logs``
    and ``bounded``, or None when there is nothing to report: the directory
    is missing, globbing fails, nothing matches, or the matched files total
    zero bytes. None is what sends the caller down its reduced dialogue form.

    A file whose stat raises OSError is skipped rather than failing the whole
    profile. ``bounded`` is True when collection stopped at the file cap.
    """
    root = Path(path).expanduser()
    try:
        is_directory = root.is_dir()
    except OSError:
        return None
    if not is_directory:
        return None

    hits: list[Path] = []
    families: list[str] = []  # glob prefixes that matched, first-seen order
    capped = False
    try:
        for pattern in globs:
            prefix = pattern.split("*", 1)[0].rstrip(".")  # "conn*.log*" → "conn"
            count_before = len(hits)
            for entry in root.glob(pattern):
                hits.append(entry)
                if len(hits) >= _PROFILE_FILE_CAP:
                    capped = True
                    break
            matched_any = len(hits) > count_before
            if matched_any and prefix and prefix not in families:
                families.append(prefix)
            if capped:
                break
    except OSError:
        return None

    size_sum = 0
    newest: float | None = None
    for entry in hits:
        try:
            info = entry.stat()
        except OSError:
            continue
        size_sum += info.st_size
        if newest is None or info.st_mtime > newest:
            newest = info.st_mtime

    if not hits or size_sum == 0 or newest is None:
        return None

    reference = now or datetime.now()
    age = reference - datetime.fromtimestamp(newest)

    if logs_label is not None:
        logs = logs_label
    else:
        # One or two families read as "a + b"; longer lists are comma-joined.
        # Joining an empty list yields "", the no-family case.
        separator = " + " if len(families) <= 2 else ", "
        logs = separator.join(families)

    return {
        "size_bytes": size_sum,
        "size_str": _human_bytes(size_sum),
        "fresh_str": _fresh_bucket(age),
        "logs": logs,
        "bounded": capped,
    }
208
+
209
+
210
+ # ── TOML serialization ────────────────────────────────────────────────────────
211
+
212
# Any C0 control character or DEL; none of these can be written safely.
_TOML_FORBIDDEN_RE = re.compile(r'[\x00-\x1f\x7f]')


def _toml_str(value: str) -> str:
    """Quote a path value for TOML.

    Values without a single quote use the literal form ('...'), which needs
    no escaping. Values containing a single quote use the basic form ("..."),
    with backslashes and double quotes escaped.

    Raises:
        ValueError: the value contains a control character. Refusing is
            better than silently writing invalid TOML.
    """
    if _TOML_FORBIDDEN_RE.search(value) is not None:
        raise ValueError(
            "loghunter init: path contains a control character that cannot "
            f"be written to TOML: {value!r}"
        )
    if "'" in value:
        # Backslashes first, so the ones added for quotes are not doubled.
        body = value.replace("\\", "\\\\").replace('"', '\\"')
        return '"' + body + '"'
    return "'" + value + "'"
229
+
230
+
231
+ # ── Section-bound keyed upsert ────────────────────────────────────────────────
232
+ #
233
+ # The four managed keys (root, zeek_dir, pihole_dir, syslog_dir) are rewritten
234
+ # ONLY inside the [loghunter] table span. A token appearing in any other
235
+ # stanza, a comment outside the span, or even a [loghunter.subtable] is never
236
+ # matched — that IS the non-clobber guarantee.
237
+
238
+ _LOGHUNTER_HEADER_RE = re.compile(r'^\[loghunter\]\s*(?:#.*)?$', re.MULTILINE)
239
+ _MANAGED_KEYS: tuple[str, ...] = ("root", "zeek_dir", "pihole_dir", "syslog_dir")
240
+
241
+
242
+ def _loghunter_span(text: str) -> tuple[int, int, int] | None:
243
+ """Locate the [loghunter] table span: (header_start, body_start, body_end).
244
+
245
+ body runs from the line AFTER the header to the line BEFORE the next
246
+ `^[` section header, or EOF. Returns None when the header is absent."""
247
+ m = _LOGHUNTER_HEADER_RE.search(text)
248
+ if m is None:
249
+ return None
250
+ header_start = m.start()
251
+ # body starts after the header line's trailing newline
252
+ nl = text.find("\n", m.end())
253
+ body_start = nl + 1 if nl != -1 else len(text)
254
+ # body ends at the next ^[ section header found after body_start
255
+ rest_offset = body_start
256
+ next_header_re = re.compile(r'^\[', re.MULTILINE)
257
+ nh = next_header_re.search(text, rest_offset)
258
+ body_end = nh.start() if nh else len(text)
259
+ return (header_start, body_start, body_end)
260
+
261
+
262
def _upsert_loghunter_key(
    text: str, key: str, value: str | None, *, fresh: bool,
) -> str:
    """Rewrite one managed key, strictly inside the [loghunter] table span.

    Args:
        text: Full TOML document text.
        key: Bare key name to manage (e.g. ``"zeek_dir"``).
        value: ``None`` means the user skipped this source; a string
            (including ``""``) is written as the key's value.
        fresh: True when the base text is the shipped example rather than a
            config the user already owns.

    Returns:
        The transformed document text:

        * ``value`` is a string: replace the active ``key = ...`` line if one
          exists, else replace a commented ``# key = ...`` sample, else insert
          a new line directly after the header. A missing ``[loghunter]``
          header is prepended first.
        * ``value`` is None and ``fresh``: comment out the active line, if any.
        * ``value`` is None and not ``fresh``: return *text* unchanged, since
          a user-set value is never touched.

    Raises:
        ValueError: propagated from ``_toml_str`` when *value* contains a
            control character.
    """
    span = _loghunter_span(text)
    if span is None:
        if value is None:
            # With no table there is no active line to comment out; adding a
            # header here would break the no-op promise of the skipped branch.
            return text
        # Follow the document's own line-ending convention for added lines.
        newline = "\r\n" if "\r\n" in text else "\n"
        text = "[loghunter]" + newline + text
        span = _loghunter_span(text)
        assert span is not None  # invariant: the header was just prepended
    else:
        newline = "\r\n" if "\r\n" in text else "\n"
    _, body_start, body_end = span
    body = text[body_start:body_end]

    # TOML permits indentation before a key, so leading blanks are tolerated;
    # missing an indented active key would insert a duplicate and invalidate
    # the file. `[^\r\n]*` stops before a CR, so CRLF endings survive.
    key_pat = re.escape(key)
    active_re = re.compile(rf'^[ \t]*{key_pat}[ \t]*=[^\r\n]*', re.MULTILINE)
    commented_re = re.compile(
        rf'^[ \t]*#[ \t]*{key_pat}[ \t]*=[^\r\n]*', re.MULTILINE
    )

    if value is not None:
        new_line = f"{key} = {_toml_str(value)}"
        # The active line wins over a commented sample. Uncommenting the
        # sample while an active line remains would duplicate the key.
        m = active_re.search(body) or commented_re.search(body)
        if m is not None:
            new_body = body[:m.start()] + new_line + body[m.end():]
        else:
            # body_start is already past the header's newline, so prepending
            # to the body lands the key on the first line after the header.
            new_body = new_line + newline + body
        return text[:body_start] + new_body + text[body_end:]

    # Skipped source.
    if not fresh:
        return text  # never touch a user-set value
    m = active_re.search(body)
    if m is None:
        return text  # no active line, so nothing to comment out
    new_body = body[:m.start()] + "# " + body[m.start():]
    return text[:body_start] + new_body + text[body_end:]
310
+
311
+
312
+ # ── Wizard dialogue ───────────────────────────────────────────────────────────
313
+
314
def _print_intro(existing_basis: bool) -> None:
    """Print the opening lines, noting when an existing config is the basis."""
    lines = ["OK, let's find your logs."]
    if existing_basis:
        lines.append("Found ~/.loghunter/, using that as basis (non-destructive)")
    lines.append("")  # trailing blank line
    print("\n".join(lines))
319
+
320
+
321
+ def _print_zeek_found(path: str, profile: dict | None) -> None:
322
+ if profile is not None:
323
+ print(f"Found Zeek at {path}.")
324
+ print(f"{profile['logs']}, {profile['size_str']}, {profile['fresh_str']}. Use this?")
325
+ else:
326
+ # No-data reduced form: single-line headline per the Rev 2 prompt.
327
+ print(f"Found Zeek at {path}. Use this?")
328
+ print("[Enter = yes · type a path · s = skip]")
329
+
330
+
331
def _print_zeek_not_found() -> None:
    """Tell the user Zeek wasn't detected and invite a manual path."""
    message = (
        "Didn't find Zeek. You might like it: https://zeek.org",
        "If it's just hiding, tell me where.",
        "[Enter = skip · type a path]",
    )
    for line in message:
        print(line)
335
+
336
+
337
+ def _print_pihole_found(path: str, profile: dict | None) -> None:
338
+ if profile is not None:
339
+ print(f"Found Pi-hole at {path}.")
340
+ print(f"{profile['size_str']} of query logs, {profile['fresh_str']}. Use this?")
341
+ else:
342
+ # No-data reduced form: single-line headline per the Rev 2 prompt.
343
+ print(f"Found Pi-hole at {path}. Use this?")
344
+ print("[Enter = yes · type a path · s = skip]")
345
+
346
+
347
def _print_pihole_not_found() -> None:
    """Tell the user Pi-hole wasn't detected and invite a manual path."""
    message = (
        "Pi-hole seems to be absent. Worth a look: https://pi-hole.net",
        "Point me at the logs if they're elsewhere.",
        "[Enter = skip · type a path]",
    )
    for line in message:
        print(line)
351
+
352
+
353
+ def _print_syslog(path: str, profile: dict | None) -> None:
354
+ if profile is not None:
355
+ print(f"syslog is where you'd expect… {path} — {profile['size_str']}, {profile['fresh_str']}.")
356
+ else:
357
+ print(f"syslog is where you'd expect… {path}.")
358
+ print("Use this? [Enter = yes · type a path · s = skip]")
359
+
360
+
361
def _print_gate() -> None:
    """Warn that no source was selected and offer redo or skip."""
    message = (
        "You should provide at least one: Zeek, Pi-hole, or syslog.",
        "Or you can point loghunter at individual files. Up to you.",
        "[r = redo · Enter = skip]",
    )
    for line in message:
        print(line)
365
+
366
+
367
def _print_root_prompt(default_root: str) -> None:
    """Ask where outputs should live, showing what pressing Enter selects."""
    question = "Last thing: where should LogHunter keep what it produces — exports and reports?"
    hint = f"[Enter = {default_root}]"
    print(question)
    print(hint)
370
+
371
+
372
def _print_confirm(active_sources: list[tuple[str, str]], root: str) -> None:
    """Print the closing summary: chosen sources, data root, and pointers.

    ``active_sources`` holds (label, path) pairs. An empty list is reported
    as "none", with a hint to pass files on the command line.
    """
    described = [f"{label} ({path})" for label, path in active_sources]
    if described:
        sources_line = ", ".join(described)
    else:
        sources_line = "(none — pass files on the command line)"

    print("Done — settings written to ~/.loghunter/config.toml.")
    print(f" reading: {sources_line}")
    print(f" data: {root}")
    print()
    print(f"LogHunter documentation lives here: {_DOCS_URL}")
    print("Or just run `loghunter` for a quick-start TL;DR.")
    print()
    print("Good hunting!")
385
+
386
+
387
+ # ── Flow helpers ──────────────────────────────────────────────────────────────
388
+
389
+ def _ask_source(found_path: str | None, found_printer, not_found_printer) -> str | None:
390
+ """Drive one source prompt. Returns the chosen path or None (skipped)."""
391
+ if found_path is not None:
392
+ found_printer()
393
+ answer = input("> ").strip()
394
+ if answer == "":
395
+ return found_path
396
+ if answer.lower() in ("s", "skip"):
397
+ return None
398
+ return os.path.expanduser(answer)
399
+ # NOT FOUND path
400
+ not_found_printer()
401
+ answer = input("> ").strip()
402
+ if answer == "" or answer.lower() in ("s", "skip"):
403
+ return None
404
+ return os.path.expanduser(answer)
405
+
406
+
407
+ def _read_existing_config_for_root(
408
+ target: Path,
409
+ ) -> tuple[bytes | None, str | None, dict | None]:
410
+ """Read an existing config file. Returns (raw_bytes, decoded_text,
411
+ parsed-loghunter-dict). Bytes are preserved verbatim for `.bak`
412
+ (read_text translates CRLF→LF under universal newlines, which would
413
+ break the non-clobber guarantee for Windows-line-ending files)."""
414
+ if not target.exists():
415
+ return (None, None, None)
416
+ try:
417
+ raw = target.read_bytes()
418
+ except OSError as exc:
419
+ raise ValueError(
420
+ f"loghunter init: cannot read existing config at {target}: {exc}"
421
+ ) from exc
422
+ try:
423
+ text = raw.decode("utf-8")
424
+ except UnicodeDecodeError as exc:
425
+ raise ValueError(
426
+ f"loghunter init: existing config at {target} is not UTF-8: {exc}"
427
+ ) from exc
428
+ try:
429
+ parsed = tomllib.loads(text)
430
+ except tomllib.TOMLDecodeError as exc:
431
+ raise ValueError(
432
+ f"loghunter init: existing config at {target} is not valid TOML: {exc}"
433
+ ) from exc
434
+ return (raw, text, parsed.get("loghunter", {}))
435
+
436
+
437
def _load_example_text() -> str:
    """Return the text of the shipped ``config_example.toml``.

    The packaged resource is tried first. If anything goes wrong there, the
    file is read from the ``data`` directory next to this module.
    """
    try:
        from importlib import resources
        bundled = resources.files("loghunter") / "data" / "config_example.toml"
        return bundled.read_text(encoding="utf-8")
    except Exception:
        on_disk = Path(__file__).parent / "data" / "config_example.toml"
        return on_disk.read_text(encoding="utf-8")
446
+
447
+
448
def run_init() -> None:
    """Detection-driven, non-clobbering wizard for the first-run config.

    Public entry point. ``cli.py`` validates argv via ``_parse_args(args,
    "init")`` (allowed set is help-only; standalone ``--help``/``-h`` is
    short-circuited before this function is invoked) and then delegates here.

    Flow: load the existing ``~/.loghunter/config.toml`` (or the shipped
    example), prompt for Zeek, Pi-hole and syslog in that order, prompt for
    the data root, back up an existing config byte-for-byte to
    ``config.toml.bak``, upsert the four managed keys and write the result.

    Raises:
        ValueError: the existing config is unreadable or invalid, a chosen
            value cannot be serialized, or the config directory, backup or
            config file cannot be written.
    """
    target = Path("~/.loghunter/config.toml").expanduser()
    existing_bytes, existing_text, existing_lh = _read_existing_config_for_root(target)
    existing_basis = existing_bytes is not None
    base_text = existing_text if existing_basis else _load_example_text()
    fresh = not existing_basis

    _print_intro(existing_basis)

    def _print_syslog_not_found() -> None:
        # An absent /var/log is extremely rare; mirror the other not-found prompts.
        print("syslog isn't where I'd expect — point me at the logs if they're elsewhere.")
        print("[Enter = skip · type a path]")

    # Run the three source prompts in order; a gate "redo" re-runs all three.
    while True:
        zeek_path = _detect_zeek()
        zeek_profile = (
            _profile_dir(zeek_path, _ZEEK_GLOBS, logs_label=None) if zeek_path else None
        )
        zeek_answer = _ask_source(
            zeek_path,
            lambda: _print_zeek_found(zeek_path, zeek_profile),
            _print_zeek_not_found,
        )
        print()

        pihole_path = _detect_pihole()
        pihole_profile = (
            _profile_dir(pihole_path, (_PIHOLE_GLOB,), logs_label="query logs")
            if pihole_path else None
        )
        pihole_answer = _ask_source(
            pihole_path,
            lambda: _print_pihole_found(pihole_path, pihole_profile),
            _print_pihole_not_found,
        )
        print()

        # syslog goes through the same prompt driver as the other sources, so
        # "s"/"skip" skips in the not-found case too instead of being stored
        # as a literal path named "s".
        syslog_path = _detect_syslog()
        syslog_profile = (
            _profile_dir(syslog_path, (_SYSLOG_GLOB,), logs_label=None)
            if syslog_path is not None else None
        )
        syslog_answer = _ask_source(
            syslog_path,
            lambda: _print_syslog(syslog_path, syslog_profile),
            _print_syslog_not_found,
        )
        print()

        if zeek_answer is None and pihole_answer is None and syslog_answer is None:
            _print_gate()
            gate = input("> ").strip().lower()
            if gate == "r":
                continue  # re-loop the three source prompts
        break

    # Root default: the existing config's explicit root wins (including "");
    # otherwise the live default ~/.loghunter. A non-string root cannot be
    # offered or serialized as a path, so it falls back to the live default.
    default_root = "~/.loghunter"
    if isinstance(existing_lh, dict):
        configured_root = existing_lh.get("root")
        if isinstance(configured_root, str):
            default_root = configured_root
    _print_root_prompt(default_root)
    root_input = input("> ").strip()
    root_value = default_root if root_input == "" else os.path.expanduser(root_input)
    print()

    try:
        target.parent.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        raise ValueError(
            f"loghunter init: cannot create config directory {target.parent}: {exc}"
        ) from exc

    # Write `.bak` BEFORE any transformation when re-initing an existing file.
    # The RAW bytes are backed up verbatim: a text-mode round-trip would
    # translate CRLF→LF and leave the .bak non-identical to the original.
    if existing_basis:
        bak_path = target.with_suffix(".toml.bak")
        try:
            bak_path.write_bytes(existing_bytes)
        except OSError as exc:
            raise ValueError(
                f"loghunter init: cannot write backup at {bak_path}: {exc}"
            ) from exc

    text = base_text
    for key, val in (
        ("root", root_value),
        ("zeek_dir", zeek_answer),
        ("pihole_dir", pihole_answer),
        ("syslog_dir", syslog_answer),
    ):
        text = _upsert_loghunter_key(text, key, val, fresh=fresh)

    # Bytes-out symmetric with bytes-in: write_text would platform-translate
    # newlines on Windows, while write_bytes preserves the byte stream.
    # Failures are reported the same way as a failed backup write.
    try:
        target.write_bytes(text.encode("utf-8"))
    except OSError as exc:
        raise ValueError(
            f"loghunter init: cannot write config at {target}: {exc}"
        ) from exc

    active: list[tuple[str, str]] = []
    if zeek_answer is not None:
        active.append(("Zeek", zeek_answer))
    if pihole_answer is not None:
        active.append(("Pi-hole", pihole_answer))
    if syslog_answer is not None:
        active.append(("syslog", syslog_answer))
    _print_confirm(active, root_value)
559
+
560
+
561
+ # Compat shim — pre-extraction tests called `cli._run_init([])`. The new entry
562
+ # is ``run_init()`` (cli.py validates argv via ``_parse_args(args, "init")``
563
+ # before delegating). Tests that drive the wizard end-to-end keep working.
564
def _run_init(args: list[str] | None = None) -> None:
    """Backward-compatible alias for ``run_init``.

    Accepts the old ``_run_init([])`` call shape; ``args`` is ignored.
    """
    del args  # accepted only for signature compatibility
    run_init()
@@ -0,0 +1 @@
1
+ """Common utilities shared across detectors and output handlers."""