loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,282 @@
1
+ """Tests for the shared digest stats module.
2
+
3
+ Two purposes:
4
+ 1. Lock the seam — `_rate` and `_share` behave correctly at their gates
5
+ and floors, and the constants live where they should.
6
+ 2. Prevent regressions of the factoring — `_rate` has a single source
7
+ of truth (function identity across all three importing cards), and
8
+ `RATE_FLOOR` resolves to the same numeric value everywhere.
9
+
10
+ The existing tests/test_digest_{dns,syslog,cloudtrail}.py suites continuing
11
+ to pass UNCHANGED is the load-bearing proof that Fix 2 was
12
+ behavior-preserving. These tests layer additional invariants at the
13
+ boundary.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import pandas as pd
19
+
20
+ from loghunter.digest import _stats
21
+ from loghunter.digest import cloudtrail as ct
22
+ from loghunter.digest import dns
23
+ from loghunter.digest import syslog
24
+
25
+
26
+ # ─── Sharing invariants ─────────────────────────────────────────────────────
27
+
28
def test_rate_identity_across_cards() -> None:
    """Each card module must expose the very same `_rate` object as `_stats`.

    Identity, not equality, is the point of this check: a card that
    re-introduces its own local copy of `_rate` would fail the `is` test.
    """
    for card in (ct, dns, syslog):
        assert card._rate is _stats._rate
35
+
36
+
37
def test_rate_floor_value_across_cards() -> None:
    """RATE_FLOOR must be 0.01 in every card module and in `_stats`.

    The constant is a float, so it is compared by value; identity on a
    float is brittle and misleading.
    """
    expected_floor = 0.01
    for module in (ct, dns, syslog, _stats):
        assert module.RATE_FLOOR == expected_floor
47
+
48
+
49
def test_share_gate_value() -> None:
    """SHARE_GATE lives in `_stats` and is the canonical 0.80 threshold.

    The cloudtrail card's `SHARE_GATE` must carry the same numeric value.
    """
    assert _stats.SHARE_GATE == 0.80
    # Compare by value, not identity: whether two float references are the
    # same object is an implementation detail, and the RATE_FLOOR test in
    # this module already checks its float constant with `==`.
    assert ct.SHARE_GATE == _stats.SHARE_GATE
53
+
54
+
55
+ # ─── _rate behavior ─────────────────────────────────────────────────────────
56
+
57
def test_rate_dashes_below_population_floor() -> None:
    """A four-event mask yields None even though every event matches.

    Per the original note, POPULATION_FLOOR is 5, so four events is below
    it regardless of the matching fraction.
    """
    event_count = 4
    hits = pd.Series([True] * event_count)
    who = pd.Series(["x"] * event_count)
    outcome = _stats._rate(hits, who)
    assert outcome is None
63
+
64
+
65
def test_rate_dashes_when_kind_count_is_zero() -> None:
    """Twenty events with no matching rows yield None.

    The population is large enough; the absence of any hit is what
    silences the statistic.
    """
    event_count = 20
    hits = pd.Series([False] * event_count)
    who = pd.Series(["x"] * event_count)
    outcome = _stats._rate(hits, who)
    assert outcome is None
71
+
72
+
73
def test_rate_dashes_below_rate_floor() -> None:
    """One hit in 200 events (0.5%) is under the 1% RATE_FLOOR, so None."""
    misses = 199
    hits = pd.Series([False] * misses + [True])
    who = pd.Series(["x"] * misses + ["badcode"])
    outcome = _stats._rate(hits, who)
    assert outcome is None
78
+
79
+
80
def test_rate_speaks_with_top_contributor() -> None:
    """Ten errored events out of fifty give (0.20, "AccessDenied").

    "AccessDenied" accounts for seven of the ten matching rows, making it
    the most frequent contributor among the errored subset.
    """
    errored = pd.Series([False] * 40 + [True] * 10)
    codes = pd.Series(
        ["clean"] * 40 + ["AccessDenied"] * 7 + ["ValidationException"] * 3
    )
    outcome = _stats._rate(errored, codes)
    assert outcome is not None
    rate, leader = outcome
    assert rate == 0.20
    assert leader == "AccessDenied"
92
+
93
+
94
def test_rate_drops_nan_contributors_in_mode() -> None:
    """NaN contributors among matching rows are ignored by the top lookup.

    Of the ten matching rows, five are "alice" and five are NaN; the
    expected top contributor is "alice" and the fraction is 0.10.
    """
    missing = float("nan")
    hits = pd.Series([True] * 10 + [False] * 90)
    who = pd.Series(["alice"] * 5 + [missing] * 5 + ["x"] * 90)
    outcome = _stats._rate(hits, who)
    assert outcome is not None
    rate, leader = outcome
    assert leader == "alice"
    assert rate == 0.10
106
+
107
+
108
+ # ─── _share behavior ───────────────────────────────────────────────────────
109
+
110
def test_share_speaks_on_single_distinct_value_at_100_percent() -> None:
    """A lone entity holding all ten events speaks with a share of 1.0.

    There is deliberately no population floor here: the share statistic
    exists to surface concentration, and low cardinality is the signal.
    """
    only_entity = "203.0.113.99"
    tally = pd.Series([10], index=[only_entity])
    outcome = _stats._share(tally, total=10)
    assert outcome is not None
    winner, portion = outcome
    assert winner == "203.0.113.99"
    assert portion == 1.0
120
+
121
+
122
def test_share_speaks_on_two_distinct_values_with_dominant() -> None:
    """99 of 100 events on one of two entities speaks with share 0.99.

    Per the original note, the old cliff floor would have suppressed this
    case; the share statistic does not.
    """
    labels = ["203.0.113.99", "203.0.113.10"]
    tally = pd.Series([99, 1], index=labels)
    outcome = _stats._share(tally, total=100)
    assert outcome is not None
    winner, portion = outcome
    assert winner == "203.0.113.99"
    assert portion == 0.99
131
+
132
+
133
def test_share_speaks_exactly_at_gate() -> None:
    """A share of exactly 0.80 speaks: the gate is inclusive (>=, not >)."""
    tally = pd.Series([80, 20], index=["a", "b"])
    outcome = _stats._share(tally, total=100)
    assert outcome is not None
    winner, portion = outcome
    assert winner == "a"
    assert portion == 0.80
141
+
142
+
143
def test_share_dashes_just_below_gate() -> None:
    """799 of 1000 (79.9%) falls just short of SHARE_GATE, so None."""
    tally = pd.Series([799, 201], index=["a", "b"])
    outcome = _stats._share(tally, total=1000)
    assert outcome is None
147
+
148
+
149
def test_share_dashes_on_diffuse_distribution() -> None:
    """With the largest entity at 30 of 100, nothing reaches the gate."""
    labels = ["a", "b", "c", "d", "e"]
    tally = pd.Series([30, 25, 20, 15, 10], index=labels)
    outcome = _stats._share(tally, total=100)
    assert outcome is None
154
+
155
+
156
def test_share_defensive_returns_on_empty_or_zero_total() -> None:
    """Empty counts or a zero total must yield None.

    Three cases are exercised in order: empty counts with total 0, empty
    counts with total 100, and non-empty counts with total 0.
    """
    cases = [
        (pd.Series([], dtype=int), 0),
        (pd.Series([], dtype=int), 100),
        (pd.Series([5], index=["a"]), 0),
    ]
    for tally, denominator in cases:
        assert _stats._share(tally, total=denominator) is None
160
+
161
+
162
def test_share_defensive_return_on_nan_rank1() -> None:
    """A NaN top count yields None rather than a crash or a NaN share."""
    missing = float("nan")
    tally = pd.Series([missing], index=["a"])
    outcome = _stats._share(tally, total=10)
    assert outcome is None
167
+
168
+
169
+ # ─── select_insights_and_fields behavior ────────────────────────────────────
170
+ #
171
+ # The shared selection helper that the four schema summarisers all use.
172
+ # Covers Glenn's precision ask: only suppress from fields when an insight
173
+ # actually ran (formatter present AND used). Missing formatter keeps the
174
+ # slot in fields, preserving "each fact appears exactly once."
175
+
176
+ from loghunter.common.finding import DigestSlot
177
+
178
+
179
def _cliff_slot(label: str, *, ratio: float, magnitude: float = 1.0) -> DigestSlot:
    """Build a speaking "cliff" DigestSlot for entity "entity-a".

    The cells are the entity name, the magnitude truncated to an int, and
    the ratio rendered with one decimal place followed by "x".
    """
    entity_name = "entity-a"
    magnitude_cell = f"{int(magnitude)}"
    ratio_cell = f"{ratio:.1f}x"
    return DigestSlot(
        label=label,
        statistic="cliff",
        cells=[entity_name, magnitude_cell, ratio_cell],
        entity=entity_name,
        magnitude=magnitude,
        ratio=ratio,
    )
185
+
186
+
187
def _dist_slot(label: str, cells_text: str) -> DigestSlot:
    """Build a "dist" DigestSlot whose only cell is `cells_text`."""
    single_cell = [cells_text]
    return DigestSlot(label=label, statistic="dist", cells=single_cell)
189
+
190
+
191
def _nonspeaking(label: str, statistic: str = "cliff") -> DigestSlot:
    """Build a DigestSlot with only a label and statistic set.

    No cells are supplied, which is how these tests model a slot that has
    nothing to say.
    """
    silent_slot = DigestSlot(label=label, statistic=statistic)
    return silent_slot
193
+
194
+
195
def test_select_promotes_top_three_by_salience() -> None:
    """The three highest-ratio cliff slots become insights, in ratio order.

    With ratios a=5, b=10, c=2, d=20 and a formatter for every label, the
    insights are d, b, a; the unpromoted slot c stays in fields.
    """
    ratios = {"a": 5.0, "b": 10.0, "c": 2.0, "d": 20.0}
    slots = [_cliff_slot(name, ratio=value) for name, value in ratios.items()]
    # Bind each label as a default so every formatter keeps its own name.
    formatters = {
        name: (lambda s, tag=name: f"{tag}-insight") for name in ratios
    }
    insights, fields = _stats.select_insights_and_fields(slots, formatters)
    # Top 3 by ratio desc: d (20), b (10), a (5). c is not promoted.
    assert insights == ["d-insight", "b-insight", "a-insight"]
    remaining = [slot.label for slot in fields]
    assert remaining == ["c"]
209
+
210
+
211
def test_select_dist_slots_pass_through_unfiltered() -> None:
    """A dist slot with cells lands in fields and produces no insight.

    The accompanying cliff slot has a formatter, so it is promoted to an
    insight and therefore absent from fields.
    """
    mix = _dist_slot("qtype-mix", "A 50% · AAAA 30%")
    volume = _cliff_slot("client-volume", ratio=5.0)
    formatters = {"client-volume": lambda s: "client-volume-insight"}
    insights, fields = _stats.select_insights_and_fields([mix, volume], formatters)
    assert insights == ["client-volume-insight"]
    # qtype-mix not promoted; client-volume promoted → suppressed.
    remaining = [slot.label for slot in fields]
    assert remaining == ["qtype-mix"]
223
+
224
+
225
def test_select_missing_formatter_keeps_slot_as_field() -> None:
    """A speaking slot without a formatter stays in fields, not lost.

    "no-fmt" has the higher ratio but no formatter, so it cannot become an
    insight; "with-fmt" is the only promoted slot. This protects the
    "each fact appears exactly once" rule from a missing formatter.
    """
    formatted = _cliff_slot("with-fmt", ratio=10.0)
    unformatted = _cliff_slot("no-fmt", ratio=20.0)  # higher salience, no fmt
    formatters = {"with-fmt": lambda s: "with-fmt-insight"}
    insights, fields = _stats.select_insights_and_fields(
        [formatted, unformatted], formatters
    )
    assert insights == ["with-fmt-insight"]
    remaining = [slot.label for slot in fields]
    assert remaining == ["no-fmt"]
239
+
240
+
241
def test_select_non_speaking_slots_omitted_from_both() -> None:
    """A slot built without cells appears in neither insights nor fields.

    Only the speaking slot is promoted, and fields ends up empty.
    """
    talker = _cliff_slot("speaks", ratio=10.0)
    mute = _nonspeaking("silent")
    formatters = {"speaks": lambda s: "speaks-insight"}
    insights, fields = _stats.select_insights_and_fields([talker, mute], formatters)
    assert insights == ["speaks-insight"]
    remaining = [slot.label for slot in fields]
    assert remaining == []
252
+
253
+
254
def test_select_all_speaking_promoted_yields_empty_fields() -> None:
    """Three speaking slots, all with formatters, leave fields empty.

    Every slot is promoted to an insight, so nothing remains for the
    fields block.
    """
    ratios = {"a": 5.0, "b": 10.0, "c": 2.0}
    slots = [_cliff_slot(name, ratio=value) for name, value in ratios.items()]
    # Bind each label as a default so every formatter keeps its own name.
    formatters = {
        name: (lambda s, tag=name: f"{tag}-insight") for name in ratios
    }
    insights, fields = _stats.select_insights_and_fields(slots, formatters)
    assert len(insights) == 3
    assert fields == []
266
+
267
+
268
def test_select_share_and_rate_salience_share_bypasses_population_floor() -> None:
    """A 90% share slot is ranked ahead of a 5x cliff slot.

    Per the original note, share salience uses the raw percentage and rate
    salience uses fraction / RATE_FLOOR. Only the share-versus-cliff
    ordering is exercised here; no rate slot is constructed.
    """
    concentrated = DigestSlot(
        label="source-ip",
        statistic="share",
        cells=["x", "90%"],
        entity="x",
        magnitude=90.0,
        ratio=None,
    )
    modest = _cliff_slot("event-source", ratio=5.0)
    formatters = {
        "source-ip": lambda s: f"share-{s.magnitude:.0f}",
        "event-source": lambda s: f"cliff-{s.ratio:.0f}",
    }
    ranked, _ = _stats.select_insights_and_fields([concentrated, modest], formatters)
    assert ranked == ["share-90", "cliff-5"]