loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,115 @@
1
+ """A-phase guard: the loader/ package presents the same import surface as the
2
+ former single common/loader.py module, and the test-patchable I/O seams
3
+ (``progress`` / ``_open_log``) remain SETTABLE at the package boundary AND
4
+ patch-through to the load pipeline.
5
+
6
+ This is the extraction's safety net (Glenn execution caution #1): a dropped
7
+ re-export or a facade that snapshots a pre-patch object fails here loudly.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from pathlib import Path
13
+
14
+ import loghunter.common.loader as loader
15
+
16
+
17
+ # The full re-export inventory. Every name that resolved at
18
+ # loghunter.common.loader.<name> before the package split MUST still resolve.
19
+ # Pinned literally so a dropped re-export is a hard failure, not silent drift.
20
+ _SURFACE = [
21
+ # display re-export (imported module-global, monkeypatched in 12 tests)
22
+ "progress",
23
+ # io
24
+ "_open_log", "_safe_resolve", "_union_dedupe",
25
+ # types
26
+ "LoadResult", "CoverageTracker", "SourceCoverage", "RotationSkipInfo",
27
+ "_data_window", "_PIHOLE_COLUMNS", "_CLOUDTRAIL_COLUMNS", "_SYSLOG_COLUMNS",
28
+ "_LOG_SUFFIXES",
29
+ # diagnostics
30
+ "_log_type", "_schema_warning", "_zeek_file_read_warning",
31
+ "_cloudtrail_parse_warning",
32
+ # sniff
33
+ "sniff_format", "sniff_format_detailed", "SniffResult", "_is_ndjson",
34
+ "_looks_like_syslog", "_SNIFF_MAX_PEEK", "_SNIFF_ORIGIN", "_SNIFF_RECOGNIZERS",
35
+ "_SYSLOG_SNIFF_BYTES",
36
+ # windowing
37
+ "_apply_ts_filter", "_missing_ts", "is_bounded", "is_zeek_bounded",
38
+ "_classify_rotation_name", "_rotation_base_and_index", "_peek_first_ts",
39
+ "_select_group", "_group_order_conflict", "_rotation_windowed_files",
40
+ "_COMPRESSION_EXTS", "_ROTATION_NUM_RE", "_DATE_RANK_BASE", "_EXPORT_WINDOW_RE",
41
+ # windowing — B+D named window model
42
+ "LoadWindow", "apply_default_window",
43
+ # discovery
44
+ "discover_files", "_DATE_DIR_RE", "_zeek_date_subdirs", "_file_matches_pattern",
45
+ "discover_zeek_files", "_syslog_files", "_discover_syslog_files",
46
+ "_dir_has_regular_files", "discover_cloudtrail_files", "_stem_hostname",
47
+ # discovery — B+D strategy resolvers (folded accessors)
48
+ "_zeek_dated_window", "_flat_default_floor", "_default_resolve_window",
49
+ "_zeek_resolve_window", "_flat_resolve_window",
50
+ # pipeline
51
+ "SourceLoader", "_SOURCE_LOADERS", "run_load", "load_required_logs",
52
+ "load_logs", "load_zeek_log", "load_syslog", "load_pihole", "load_cloudtrail",
53
+ "_zeek_records_from_lines", "_zeek_parse_from_lines", "_parse_ndjson_file",
54
+ "_parse_lines", "_zeek_strategy_parse", "_zeek_normalize",
55
+ "_syslog_strategy_parse", "_pihole_strategy_parse", "_cloudtrail_strategy_parse",
56
+ "_events_from_whole_document", "_syslog_should_skip", "_pihole_should_skip",
57
+ "_NORMALIZER_MAP", "resolve_load_windows",
58
+ ]
59
+
60
+
61
def test_full_surface_resolves():
    """Every name pinned in ``_SURFACE`` must be an attribute of the loader package."""
    absent = [attr for attr in _SURFACE if not hasattr(loader, attr)]
    assert not absent, f"loader package dropped re-exports: {absent}"
64
+
65
+
66
def test_source_loaders_registry_identity():
    """The from-imported ``_SOURCE_LOADERS`` is the package attribute itself,
    and it holds an entry for each of the four source keys checked below.
    """
    # Importing by string path must hand back the very object exposed on the
    # package, not a copy.
    from loghunter.common.loader import _SOURCE_LOADERS as registry

    assert registry is loader._SOURCE_LOADERS

    # Completeness rail: each detector source key needs a registered strategy.
    expected_keys = ("zeek_dir", "syslog_dir", "pihole_dir", "cloudtrail_dir")
    for source_key in expected_keys:
        assert source_key in loader._SOURCE_LOADERS
74
+
75
+
76
+ def _write_conn(tmp_path: Path) -> Path:
77
+ d = tmp_path / "zeek"
78
+ d.mkdir()
79
+ (d / "conn.log").write_text(
80
+ '{"ts": 1.0, "id.orig_h": "192.0.2.1", "id.resp_h": "192.0.2.2", '
81
+ '"id.resp_p": 53, "proto": "udp"}\n'
82
+ )
83
+ return d
84
+
85
+
86
def test_progress_patch_through(tmp_path, monkeypatch):
    """Replacing ``loader.progress`` on the package must be seen by ``load_logs``.

    A counting wrapper is installed on the package attribute; if the load path
    held an import-time snapshot of ``progress`` the wrapper would never run.
    """
    original_progress = loader.progress
    invocations = []

    def spy(it, **kwargs):
        # Record the call, then defer to the real implementation unchanged.
        invocations.append(1)
        return original_progress(it, **kwargs)

    monkeypatch.setattr(loader, "progress", spy)

    zeek_dir = _write_conn(tmp_path)
    loaded = loader.load_logs(zeek_dir, "conn*.log*")

    assert len(loaded) == 1
    assert len(invocations) >= 1, "progress patch did not reach the load pipeline"
101
+
102
+
103
def test_open_log_patch_through(tmp_path, monkeypatch):
    """Replacing ``loader._open_log`` on the package must be seen by ``load_logs``."""
    original_open = loader._open_log
    invocations = []

    def spy(path):
        # Record the call, then defer to the real opener unchanged.
        invocations.append(1)
        return original_open(path)

    monkeypatch.setattr(loader, "_open_log", spy)

    zeek_dir = _write_conn(tmp_path)
    loaded = loader.load_logs(zeek_dir, "conn*.log*")

    assert len(loaded) == 1
    assert len(invocations) >= 1, "_open_log patch did not reach the load pipeline"
@@ -0,0 +1,215 @@
1
+ """B+D: the named window model — one LoadWindow type, one resolve_load_windows
2
+ resolver shared by run() and run_digest(), and the contributor contract (a new
3
+ source declares its temporal policy on ONE registry entry — zero runner edits,
4
+ zero new accessor, zero digest twin).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import replace
10
+ from datetime import datetime, timedelta, timezone
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ import pytest
15
+
16
+ import loghunter.common.loader as loader
17
+ from loghunter import runner
18
+
19
+
20
+ # ── resolve_load_windows — short-circuits ────────────────────────────────────
21
+
22
+
23
def test_resolve_load_windows_short_circuits_on_explicit_window(tmp_path):
    """An explicit ``since``, ``load_all=True``, or an ``"all"`` default spec
    each make ``resolve_load_windows`` return an empty list.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    needed = {"conn*.log*": "zeek_dir"}
    roots = {"zeek_dir": [zeek_dir]}

    # Caller supplied an explicit lower bound.
    explicit_since = datetime(2026, 6, 1, tzinfo=timezone.utc)
    with_since = loader.resolve_load_windows(
        needed, roots, "1d", since=explicit_since, until=None, load_all=False
    )
    assert with_since == []

    # Caller asked for everything.
    with_load_all = loader.resolve_load_windows(
        needed, roots, "1d", since=None, until=None, load_all=True
    )
    assert with_load_all == []

    # Default spec of "all" → no windows.
    with_all_spec = loader.resolve_load_windows(
        needed, roots, "all", since=None, until=None, load_all=False
    )
    assert with_all_spec == []
39
+
40
+
41
def test_resolve_load_windows_skips_bounded_file_input(tmp_path):
    """When the source root is a single file rather than a directory, no window
    is resolved.
    """
    log_file = tmp_path / "conn.log"
    log_file.write_text("{}\n", encoding="utf-8")

    resolved = loader.resolve_load_windows(
        {"conn*.log*": "zeek_dir"},
        {"zeek_dir": [log_file]},
        "1d",
        since=None,
        until=None,
        load_all=False,
    )
    assert resolved == []
48
+
49
+
50
+ # ── per-family resolution shapes (the strategy resolver bodies) ───────────────
51
+
52
+
53
def test_resolve_load_windows_zeek_dated_precise_no_trim(tmp_path):
    """Dated Zeek layout → precise (since, until) select_window, trim_span None.

    One ``2026-01-05`` subdirectory with a ``1d`` default must yield a single
    window spanning that UTC day from 00:00:00 to 23:59:59.
    """
    zeek_root = tmp_path / "zeek"
    zeek_root.mkdir()
    (zeek_root / "2026-01-05").mkdir()

    resolved = loader.resolve_load_windows(
        {"conn*.log*": "zeek_dir"},
        {"zeek_dir": [zeek_root]},
        "1d",
        since=None,
        until=None,
        load_all=False,
    )
    assert len(resolved) == 1

    window = resolved[0]
    day_start = datetime(2026, 1, 5, 0, 0, 0, tzinfo=timezone.utc)
    day_end = datetime(2026, 1, 5, 23, 59, 59, tzinfo=timezone.utc)
    assert window.source == "zeek_dir"
    assert window.select_window == (day_start, day_end)
    assert window.trim_span is None
    assert window.keep_null is False  # zeek drops unparseable-ts rows
71
+
72
+
73
def test_resolve_load_windows_zeek_flat_load_full_trim(tmp_path):
    """Flat Zeek layout → load full (select_window None) + post-load trim_span."""
    zeek_root = tmp_path / "zeek"
    zeek_root.mkdir()
    # A log file directly in the root and no dated subdirectories = flat layout.
    (zeek_root / "conn.log").write_text("{}\n", encoding="utf-8")

    resolved = loader.resolve_load_windows(
        {"conn*.log*": "zeek_dir"},
        {"zeek_dir": [zeek_root]},
        "1d",
        since=None,
        until=None,
        load_all=False,
    )
    assert len(resolved) == 1

    window = resolved[0]
    assert window.select_window is None
    assert window.trim_span == timedelta(days=1)
86
+
87
+
88
def test_resolve_load_windows_flat_family_conservative_floor(tmp_path):
    """syslog → conservative (floor, None) select_window + precise trim_span.

    The floor must equal the first timestamp peeked from the file minus the
    default span, the upper bound must be open, and ``keep_null`` must be True
    (syslog retains unparseable-ts rows through the implicit window).
    """
    syslog_root = tmp_path / "syslog"
    syslog_root.mkdir()
    messages = syslog_root / "messages"
    messages.write_text("Jun 5 12:00:00 host kernel: line\n", encoding="utf-8")
    one_day = timedelta(days=1)

    resolved = loader.resolve_load_windows(
        {"*.log*": "syslog_dir"},
        {"syslog_dir": [syslog_root]},
        "1d",
        since=None,
        until=None,
        load_all=False,
    )
    assert len(resolved) == 1

    window = resolved[0]
    assert window.select_window is not None
    assert window.select_window[1] is None
    assert window.select_window[0] == loader._peek_first_ts(messages) - one_day
    assert window.trim_span == one_day
    assert window.keep_null is True
107
+
108
+
109
def test_resolve_load_windows_cloudtrail_opts_out(tmp_path):
    """CloudTrail is baseline-relative → default_window_eligible False → no window."""
    trail_root = tmp_path / "ct"
    trail_root.mkdir()
    (trail_root / "events.json").write_text("[]\n", encoding="utf-8")

    resolved = loader.resolve_load_windows(
        {"*.json*": "cloudtrail_dir"},
        {"cloudtrail_dir": [trail_root]},
        "1d",
        since=None,
        until=None,
        load_all=False,
    )
    assert resolved == []
118
+
119
+
120
+ # ── the contributor contract (Doneness #2) ───────────────────────────────────
121
+
122
+
123
def _fake_flat_source(**overrides) -> loader.SourceLoader:
    """Build a stand-in registry entry for a hypothetical new flat source.

    Only ``discover``, ``mode``, ``parse``, ``ts_policy``, ``columns``,
    ``should_skip`` and ``normalize`` are passed; every other ``SourceLoader``
    field (e.g. ``resolve_window``, ``default_window_eligible``) is left at its
    default. This is the fixture for the zero-runner-edits contract.

    Args:
        **overrides: Field values applied on top of the base entry via
            ``dataclasses.replace``.

    Returns:
        The base entry, or a copy with ``overrides`` applied when any are given.
    """

    def _discover(p, pattern, since, until):
        # Directory → its *.log files in sorted order; anything else → itself.
        if p.is_dir():
            return sorted(p.glob("*.log"))
        return [p]

    def _parse(line_iter, *, path, warnings):
        # The fake source never yields any records.
        return iter(())

    entry = loader.SourceLoader(
        discover=_discover,
        mode="stream",
        parse=_parse,
        ts_policy="keep",
        columns=["ts", "message"],
        should_skip=None,
        normalize=None,
    )
    if not overrides:
        return entry
    return replace(entry, **overrides)
139
+
140
+
141
def test_contributor_contract_new_source_inherits_universal_default(
    tmp_path, monkeypatch
):
    """A newly registered flat source with no ``resolve_window`` of its own gets
    the universal default window: load full (``select_window`` None) plus a
    post-load ``trim_span``, resolved purely through ``resolve_load_windows``.
    """
    # Register the fake source for the duration of this test only.
    monkeypatch.setitem(loader._SOURCE_LOADERS, "fake_dir", _fake_flat_source())
    source_root = tmp_path / "fakesrc"
    source_root.mkdir()

    resolved = loader.resolve_load_windows(
        {"*.log": "fake_dir"},
        {"fake_dir": [source_root]},
        "1d",
        since=None,
        until=None,
        load_all=False,
    )
    assert len(resolved) == 1

    window = resolved[0]
    assert window.source == "fake_dir"
    assert window.select_window is None  # universal default = load full
    assert window.trim_span == timedelta(days=1)
    assert window.keep_null is True  # read straight off ts_policy="keep"
160
+
161
+
162
def test_contributor_contract_new_source_can_opt_out(tmp_path, monkeypatch):
    """A new source registered with ``default_window_eligible=False`` (the
    cloudtrail pattern) mints no LoadWindow.
    """
    opted_out = _fake_flat_source(default_window_eligible=False)
    monkeypatch.setitem(loader._SOURCE_LOADERS, "fake_dir", opted_out)
    source_root = tmp_path / "fakesrc"
    source_root.mkdir()

    resolved = loader.resolve_load_windows(
        {"*.log": "fake_dir"},
        {"fake_dir": [source_root]},
        "1d",
        since=None,
        until=None,
        load_all=False,
    )
    assert resolved == []
177
+
178
+
179
+ # ── digest preservation: window resolution is Zeek-ONLY (caller-side gate) ─────
180
+
181
+
182
def test_digest_window_resolution_is_zeek_only(tmp_path, monkeypatch, capsys):
    """``run_digest`` calls the shared window resolver for Zeek only.

    A recording wrapper replaces ``loader.resolve_load_windows``. A dry-run
    Zeek digest must reach it exactly once; dry-run syslog and cloudtrail
    digests must not reach it at all.
    """
    original_resolver = loader.resolve_load_windows
    seen: list[Any] = []

    def spy(needed_sources, *args, **kwargs):
        # Remember what was asked for, then delegate to the real resolver.
        seen.append(needed_sources)
        return original_resolver(needed_sources, *args, **kwargs)

    monkeypatch.setattr(loader, "resolve_load_windows", spy)

    # Zeek: the default window is resolved once.
    zeek_root = tmp_path / "zeek"
    zeek_root.mkdir()
    runner.run_digest(
        config={"loghunter": {"zeek_dir": str(zeek_root)}},
        schema="conn",
        dry_run=True,
    )
    assert len(seen) == 1, "zeek digest resolves the default window"

    # Non-Zeek schemas: the resolver is never consulted.
    seen.clear()
    syslog_root = tmp_path / "syslog"
    syslog_root.mkdir()
    runner.run_digest(
        config={"loghunter": {"syslog_dir": str(syslog_root)}},
        schema="syslog",
        dry_run=True,
    )
    trail_root = tmp_path / "ct"
    trail_root.mkdir()
    runner.run_digest(
        config={"loghunter": {"cloudtrail_dir": str(trail_root)}},
        schema="cloudtrail",
        dry_run=True,
    )
    assert seen == [], "non-Zeek digests never resolve a default window (load full)"