loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,1189 @@
|
|
|
1
|
+
"""Tests for the digest verb and the conn schema summariser.
|
|
2
|
+
|
|
3
|
+
Covers:
|
|
4
|
+
- cliff statistic (gate, population floor, rank2=0)
|
|
5
|
+
- the four conn slots (host involvement, internal/external endpoint rules)
|
|
6
|
+
- histogram adaptive binning + axis label + empty-frame fallback
|
|
7
|
+
- mechanical lede derivation (sorted by raw slot.ratio, never by parsing cells)
|
|
8
|
+
- text renderer (order of zones, scale anchor, axis label)
|
|
9
|
+
- allowlist non-invocation (architectural fork)
|
|
10
|
+
- default-window paths for all three boundedness states
|
|
11
|
+
- CLI dispatch and whitelist enforcement
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import io
|
|
17
|
+
import json
|
|
18
|
+
from datetime import datetime, timedelta, timezone
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
import pandas as pd
|
|
23
|
+
import pytest
|
|
24
|
+
|
|
25
|
+
import loghunter.runner as runner
|
|
26
|
+
from loghunter.common.finding import DigestCard, DigestSlot, RunSummary
|
|
27
|
+
from loghunter.digest import conn as conn_digest
|
|
28
|
+
from loghunter.digest import _stats
|
|
29
|
+
from loghunter.outputs.text import (
|
|
30
|
+
TextHandler,
|
|
31
|
+
_bar_glyph,
|
|
32
|
+
_format_count,
|
|
33
|
+
_render_histogram,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _conn_insights_and_fields(slots: list[DigestSlot]) -> tuple[list[str], list[DigestSlot]]:
    """Run the shared insight/field selection helper with conn's formatter map.

    Test-side adapter: stands in for the deleted ``conn_digest._build_ledes``
    by forwarding ``slots`` and ``conn_digest._INSIGHT_FORMATTERS`` to
    ``_stats.select_insights_and_fields`` and returning its result unchanged.
    """
    formatters = conn_digest._INSIGHT_FORMATTERS
    return _stats.select_insights_and_fields(slots, formatters)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ─── Fixtures ────────────────────────────────────────────────────────────────
|
|
44
|
+
|
|
45
|
+
# Fixed, timezone-aware reference instant shared by the fixtures below.
_NOW = datetime(year=2026, month=6, day=11, hour=12, minute=0, tzinfo=timezone.utc)
# Same instant as POSIX seconds; used as the default ``ts`` of fixture rows.
_BASE_TS = _NOW.timestamp()
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _conn_row(
    src: str = "10.0.0.10",
    dst: str = "192.0.2.20",
    port: int = 443,
    proto: str = "tcp",
    ts: float = _BASE_TS,
    bytes_: float | None = 1000,
    conn_state: str | None = "SF",
    local_orig: bool | None = True,
) -> dict:
    """Return one canonical conn row as a plain dict.

    With no arguments the row is an internal (RFC1918) source talking to an
    external (RFC 5737) destination over TCP/443, carrying 1000 originator
    bytes with ``local_orig=True``. Every column can be overridden by keyword;
    ``bytes_`` is stored under the ``"bytes"`` key.
    """
    return dict(
        src=src,
        dst=dst,
        port=port,
        proto=proto,
        ts=ts,
        bytes=bytes_,
        conn_state=conn_state,
        local_orig=local_orig,
    )
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _conn_df(rows: list[dict]) -> pd.DataFrame:
    """Assemble row dicts into a DataFrame with the canonical conn columns.

    An empty ``rows`` list still yields a frame carrying all eight columns.
    """
    schema = ["src", "dst", "port", "proto", "ts", "bytes", "conn_state", "local_orig"]
    if rows:
        return pd.DataFrame(rows, columns=schema)
    return pd.DataFrame(columns=schema)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _run_summary(window: tuple[datetime, datetime] = (_NOW - timedelta(days=1), _NOW)) -> RunSummary:
    """Build a minimal ``RunSummary`` for a single conn source.

    ``window`` becomes ``data_window`` and defaults to the 24 hours ending at
    ``_NOW``; every other field is a fixed placeholder.
    """
    fields = {
        "data_window": window,
        "record_counts": {"conn*.log*": 100},
        "data_size_bytes": 0,
        "detectors_run": [],
        "detectors_skipped": {},
        "notes": [],
        "data_sources": ["zeek_conn"],
    }
    return RunSummary(**fields)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _write_conn_ndjson(path: Path, rows: list[dict]) -> None:
    """Serialise conn rows to ``path`` as Zeek-shaped NDJSON (loader will normalise).

    Parent directories are created as needed. The optional columns
    (``bytes`` -> ``orig_bytes``, ``conn_state``, ``local_orig``) are left out
    of a record when the row's value is missing or ``None``.
    """
    # (key in the emitted record, key in the fixture row)
    optional_fields = (
        ("orig_bytes", "bytes"),
        ("conn_state", "conn_state"),
        ("local_orig", "local_orig"),
    )
    path.parent.mkdir(parents=True, exist_ok=True)
    lines = []
    for row in rows:
        record = {
            "ts": row["ts"],
            "id.orig_h": row["src"],
            "id.resp_h": row["dst"],
            "id.resp_p": row["port"],
            "proto": row["proto"],
        }
        for zeek_key, row_key in optional_fields:
            value = row.get(row_key)
            if value is not None:
                record[zeek_key] = value
        lines.append(json.dumps(record))
    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# ─── Cliff statistic ─────────────────────────────────────────────────────────
|
|
118
|
+
|
|
119
|
+
def test_cliff_dashes_below_population_floor() -> None:
    """Only three entities: the cliff statistic must stay silent (``None``)."""
    ranked = pd.Series({"a": 100, "b": 10, "c": 5}).sort_values(ascending=False)
    assert conn_digest._cliff(ranked) is None
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def test_cliff_dashes_below_gate() -> None:
    """Six entities, but rank1/rank2 = 15/10 = 1.5 is under the 2.0 gate."""
    values = dict(zip("abcdef", [15, 10, 9, 8, 7, 6]))
    ranked = pd.Series(values).sort_values(ascending=False)
    assert conn_digest._cliff(ranked) is None
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def test_cliff_names_rank1_when_speaking() -> None:
    """A 40-vs-10 lead speaks: rank-1 entity, its magnitude, and a 4.0 ratio."""
    values = dict(zip("abcdef", [40, 10, 9, 8, 7, 6]))
    ranked = pd.Series(values).sort_values(ascending=False)
    cliff = conn_digest._cliff(ranked)
    assert cliff is not None
    top_entity, top_magnitude, top_ratio = cliff
    assert top_entity == "a"
    assert top_magnitude == 40.0
    assert top_ratio == pytest.approx(4.0)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def test_cliff_handles_rank2_zero() -> None:
    """A zero runner-up must produce ``None`` instead of a ratio."""
    values = dict(zip("abcde", [10, 0, 0, 0, 0]))
    ranked = pd.Series(values).sort_values(ascending=False)
    assert conn_digest._cliff(ranked) is None
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# ─── conn-share semantics ────────────────────────────────────────────────────
|
|
145
|
+
|
|
146
|
+
def test_conn_share_counts_host_involvement_across_src_and_dst() -> None:
    """A host seen only as a destination still counts toward conn-share."""
    # Five different sources (10.0.0.10 .. 10.0.0.14) all talk to 10.0.0.50,
    # which never appears as a source.
    common_dst = "10.0.0.50"
    rows = [_conn_row(src=f"10.0.0.{octet}", dst=common_dst) for octet in range(10, 15)]
    slot = conn_digest._slot_conn_share(_conn_df(rows))
    # 5 distinct srcs + the one common dst = 6 hosts -> population floor met.
    assert slot.cells is not None
    assert slot.entity == "10.0.0.50"
    # 5 involvements out of 5 rows = 100%.
    assert slot.magnitude == pytest.approx(100.0)
    # rank1 = 5 involvements, every other host has 1.
    assert slot.ratio == pytest.approx(5.0)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def test_conn_share_speaks_with_dominant_host() -> None:
    """Ten rows from one source against a single stray row make the slot speak."""
    dominant = [_conn_row(src="10.0.0.50", dst=f"192.0.2.{n}") for n in range(10)]
    stray = _conn_row(src="10.0.0.11", dst="192.0.2.11")
    slot = conn_digest._slot_conn_share(_conn_df(dominant + [stray]))
    assert slot.cells is not None
    assert slot.entity == "10.0.0.50"
    # Cells are (entity, percentage, ratio) as rendered strings.
    assert slot.cells[0] == "10.0.0.50"
    assert "%" in slot.cells[1]
    assert slot.cells[2].endswith("x")
    # The raw cliff ratio is carried on the slot for lede sorting.
    assert slot.ratio is not None and slot.ratio >= 2.0
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def test_conn_share_dashes_on_flat_pile() -> None:
    """Five unrelated src/dst pairs: nothing stands out, every slot field is None."""
    rows = [_conn_row(src=f"10.0.0.{n}", dst=f"192.0.2.{n}") for n in range(10, 15)]
    slot = conn_digest._slot_conn_share(_conn_df(rows))
    for field in ("cells", "entity", "magnitude", "ratio"):
        assert getattr(slot, field) is None
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# ─── densest-tuple, fan-out ──────────────────────────────────────────────────
|
|
191
|
+
|
|
192
|
+
def test_densest_tuple_speaks_with_dominant_flow() -> None:
    """Twenty repeats of one (src, dst, port) flow dominate five one-off flows."""
    rows = [_conn_row(src="10.0.0.10", dst="10.0.0.1", port=22) for _ in range(20)]
    rows.extend(
        _conn_row(src=f"10.0.0.{n + 20}", dst="192.0.2.99", port=443)
        for n in range(5)
    )
    slot = conn_digest._slot_densest_tuple(_conn_df(rows))
    assert slot.cells is not None
    expected = "10.0.0.10 → 10.0.0.1:22"
    assert slot.entity == expected
    assert slot.cells[0] == expected
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def test_fan_out_speaks_with_dominant_source() -> None:
    """One source reaching twenty destinations on port 53 is the fan-out entity."""
    rows = [_conn_row(src="10.0.0.53", dst=f"192.0.2.{n}", port=53) for n in range(20)]
    # Five other sources that each reach the same single destination.
    rows += [
        _conn_row(src=f"10.0.0.{n + 100}", dst="198.51.100.1", port=80)
        for n in range(5)
    ]
    slot = conn_digest._slot_fan_out(_conn_df(rows))
    assert slot.cells is not None
    assert slot.entity == "10.0.0.53:53"
    assert "dsts" in slot.cells[1]
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
# ─── byte-direction: internal/external endpoint rules ───────────────────────
|
|
218
|
+
|
|
219
|
+
def test_byte_direction_requires_internal_src_and_external_dst() -> None:
    """Only internal -> external rows count as outbound for byte-direction."""
    internal_to_internal = _conn_row(src="10.0.0.10", dst="10.0.0.20", bytes_=10_000)
    external_to_internal = _conn_row(
        src="198.51.100.1", dst="10.0.0.10", bytes_=10_000, local_orig=False
    )
    internal_to_external = _conn_row(src="10.0.0.10", dst="192.0.2.1", bytes_=10_000)
    frame = _conn_df([internal_to_internal, external_to_internal, internal_to_external])
    slot = conn_digest._slot_byte_direction(frame)
    # Only one genuinely outbound destination -> population floor (5) not met -> dash.
    assert slot.cells is None
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def test_byte_direction_uses_local_orig_when_present() -> None:
    """A public-looking source flagged ``local_orig=True`` is treated as internal."""
    source = "203.0.113.10"
    heavy = _conn_row(src=source, dst="192.0.2.50", local_orig=True, bytes_=50_000)
    # Six further destinations at 1k bytes each (seven destinations in total);
    # the 50k-byte one must come out as rank 1.
    light = [
        _conn_row(src=source, dst=f"198.51.100.{n}", local_orig=True, bytes_=1_000)
        for n in range(1, 7)
    ]
    slot = conn_digest._slot_byte_direction(_conn_df([heavy, *light]))
    assert slot.cells is not None
    assert slot.entity == "192.0.2.50"
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def test_byte_direction_local_orig_false_excludes_rfc1918_src() -> None:
    """An explicit ``local_orig=False`` overrides an RFC1918 source address."""
    # The 999_999-byte row has an RFC1918 src but local_orig=False, so it is
    # not outbound and must not dominate.
    excluded = _conn_row(src="10.0.0.10", dst="192.0.2.50", local_orig=False, bytes_=999_999)
    # Six outbound rows with varied sizes give a clear rank 1 elsewhere.
    outbound_sizes = [10_000, 1_000, 500, 200, 100, 50]
    outbound = [
        _conn_row(src="10.0.0.11", dst=f"198.51.100.{n}", local_orig=True, bytes_=size)
        for n, size in enumerate(outbound_sizes, start=1)
    ]
    slot = conn_digest._slot_byte_direction(_conn_df([excluded, *outbound]))
    assert slot.cells is not None
    assert slot.entity == "198.51.100.1"
    assert slot.entity != "192.0.2.50"
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def test_byte_direction_falls_back_to_rfc1918_when_local_orig_nan() -> None:
    """With ``local_orig`` missing, RFC1918(src) decides what counts as outbound.

    The fixture's 50k/1k = 50.0 ratio sits exactly on CLIFF_DISPLAY_CAP, so the
    rendered cell is capped while ``slot.ratio`` keeps the raw float. That
    locks the display/storage separation at a realistic call site.
    """
    source = "10.0.0.10"
    heavy = _conn_row(src=source, dst="192.0.2.50", local_orig=None, bytes_=50_000)
    light = [
        _conn_row(src=source, dst=f"198.51.100.{n}", local_orig=None, bytes_=1_000)
        for n in range(1, 7)
    ]
    slot = conn_digest._slot_byte_direction(_conn_df([heavy, *light]))
    assert slot.cells is not None
    assert slot.entity == "192.0.2.50"
    # Raw ratio preserved on the slot; rendered cell capped.
    assert slot.ratio == pytest.approx(50.0)
    assert slot.cells[2] == ">50x"
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def test_byte_direction_treats_nan_bytes_as_zero() -> None:
    """A destination whose bytes are missing contributes 0 and cannot be rank 1."""
    source = "10.0.0.10"
    missing_bytes = _conn_row(src=source, dst="192.0.2.50", bytes_=None)
    # Five outbound rows with varied sizes give a clear rank 1 elsewhere.
    sized = [
        _conn_row(src=source, dst=f"198.51.100.{n}", bytes_=size)
        for n, size in enumerate([50_000, 1_000, 500, 200, 100], start=1)
    ]
    slot = conn_digest._slot_byte_direction(_conn_df([missing_bytes, *sized]))
    assert slot.cells is not None
    # 192.0.2.50 (bytes counted as 0) must NOT be rank 1.
    assert slot.entity == "198.51.100.1"
    assert slot.entity != "192.0.2.50"
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
# ─── Zone-1 extras ───────────────────────────────────────────────────────────
|
|
305
|
+
|
|
306
|
+
def test_zone1_split_uses_rfc1918_per_host_not_local_orig() -> None:
    """The Zone-1 hosts split classifies each endpoint by RFC1918, not ``local_orig``."""
    # 10.0.0.50 appears only as dst (local_orig does not describe it) but is
    # RFC1918, so it must be counted as internal.
    only_row = _conn_row(src="198.51.100.5", dst="10.0.0.50", local_orig=False)
    body = conn_digest.summarize(_conn_df([only_row]))
    # The first zone1_extras entry is the combined "hosts" line.
    first_label, first_value = body["zone1_extras"][0]
    assert first_label == "hosts"
    assert "1 internal" in first_value
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def test_zone1_byte_totals_outbound_and_inbound() -> None:
    """Zone-1 extras report outbound and inbound byte totals separately."""
    outbound = _conn_row(src="10.0.0.10", dst="192.0.2.1", bytes_=1000, local_orig=True)
    inbound = _conn_row(src="198.51.100.5", dst="10.0.0.10", bytes_=500, local_orig=False)
    body = conn_digest.summarize(_conn_df([outbound, inbound]))
    extras = dict(body["zone1_extras"])
    assert extras["outbound bytes"] == "1000 B"
    assert extras["inbound bytes"] == "500 B"
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
# ─── Histogram ───────────────────────────────────────────────────────────────
|
|
334
|
+
|
|
335
|
+
def test_histogram_picks_hourly_for_short_span() -> None:
    """A 24-hour window with one event per hour yields 24 hourly bins."""
    window_start = datetime(2026, 6, 11, 0, 0, tzinfo=timezone.utc)
    window_end = window_start + timedelta(hours=24)
    stamps = [(window_start + timedelta(hours=hour)).timestamp() for hour in range(24)]
    counts, unit, peak = runner._compute_histogram(
        pd.Series(stamps), (window_start, window_end)
    )
    assert unit == "hr"
    assert len(counts) == 24
    assert peak == 1
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def test_histogram_picks_daily_for_long_span() -> None:
    """A 30-day window with one midday event per day yields 30 daily bins."""
    window_start = datetime(2026, 5, 1, 0, 0, tzinfo=timezone.utc)
    window_end = window_start + timedelta(days=30)
    stamps = [
        (window_start + timedelta(days=day, hours=12)).timestamp()
        for day in range(30)
    ]
    counts, unit, peak = runner._compute_histogram(
        pd.Series(stamps), (window_start, window_end)
    )
    assert unit == "day"
    assert len(counts) == 30
    assert peak == 1
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def test_histogram_peak_reflects_max_bin() -> None:
    """Five events inside hour 1 of a 4-hour window give peak 5 in bin 1."""
    window_start = datetime(2026, 6, 11, 0, 0, tzinfo=timezone.utc)
    window_end = window_start + timedelta(hours=4)
    hour_one = window_start + timedelta(hours=1)
    stamps = [(hour_one + timedelta(minutes=minute)).timestamp() for minute in (1, 2, 3, 4, 5)]
    counts, _unit, peak = runner._compute_histogram(
        pd.Series(stamps), (window_start, window_end)
    )
    assert peak == 5
    assert counts[1] == 5
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
def test_histogram_zero_span_single_record_emits_one_bin() -> None:
    """A window whose start equals its end still produces a one-bin histogram.

    Regression for the zero-span defect: the prior implementation returned
    `[], "hr", 0` whenever start == end, silently discarding non-empty ts.
    Here three events share one timestamp, so min-ts == max-ts.
    """
    shared_ts = datetime(2026, 6, 11, 12, 0, tzinfo=timezone.utc).timestamp()
    edge = datetime.fromtimestamp(shared_ts, tz=timezone.utc)
    series = pd.Series([shared_ts] * 3)
    counts, unit, peak = runner._compute_histogram(series, (edge, edge))
    assert counts == [3]
    assert peak == 3
    assert unit == "hr"
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def test_histogram_right_edge_event_lands_in_final_bin() -> None:
    """An event sitting exactly on ``data_window[1]`` is counted in the last bin.

    Regression for the half-open-window defect: when the span is an exact
    multiple of bin_seconds (e.g. 24 hours with hourly bins), the prior
    implementation filtered out offsets equal to bin_count and so undercounted
    the most-recent bin. data_window is derived from min(ts)/max(ts), which
    puts the max-ts event on the right edge by construction.
    """
    window_start = datetime(2026, 6, 11, 0, 0, tzinfo=timezone.utc)
    window_end = window_start + timedelta(hours=24)  # exact 24h -> bin_count == 24
    edges = pd.Series([window_start.timestamp(), window_end.timestamp()])
    counts, unit, peak = runner._compute_histogram(edges, (window_start, window_end))
    assert unit == "hr"  # locks the hourly binning branch
    assert len(counts) == 24
    assert counts[0] == 1
    assert counts[-1] == 1  # right-edge event lands in final bin
    assert peak == 1
    assert sum(counts) == 2  # no events lost
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def test_histogram_caps_long_span_to_max_bins() -> None:
    """219 raw daily bins are folded so the one-line renderer fits the terminal."""
    window_start = datetime(2026, 1, 1, 0, 0, tzinfo=timezone.utc)
    span_days = 219
    window_end = window_start + timedelta(days=span_days)
    stamps = [
        (window_start + timedelta(days=day, hours=12)).timestamp()
        for day in range(span_days)
    ]
    counts, unit, _peak = runner._compute_histogram(
        pd.Series(stamps), (window_start, window_end)
    )
    assert unit == "day"  # label stays nominal even when bins are folded
    assert len(counts) <= runner._HISTOGRAM_MAX_BINS
    # group_size = ceil(219 / 60) = 4 -> ceil(219 / 4) = 55 folded buckets
    assert len(counts) == 55
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def test_histogram_downsampling_preserves_total_event_count() -> None:
    """Folding adjacent bins sums them, so every raw event is still counted."""
    window_start = datetime(2026, 1, 1, 0, 0, tzinfo=timezone.utc)
    span_days = 219
    window_end = window_start + timedelta(days=span_days)
    stamps = [
        (window_start + timedelta(days=day, hours=12)).timestamp()
        for day in range(span_days)
    ]
    counts, _unit, _peak = runner._compute_histogram(
        pd.Series(stamps), (window_start, window_end)
    )
    assert sum(counts) == 219
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
def test_histogram_peak_reflects_post_fold_bucket() -> None:
    """The peak is taken after folding, so it matches the tallest rendered bucket.

    Fixture: a 219-day span (daily binning, capped to 55 buckets at
    group_size=4). Days 0-3 hold 3 events each and days 4-218 hold 1 event
    each, so the largest single-day raw count is 3 while the first folded
    bucket sums to 12 (3+3+3+3).
    """
    window_start = datetime(2026, 1, 1, 0, 0, tzinfo=timezone.utc)
    window_end = window_start + timedelta(days=219)

    def midday(day: int) -> float:
        return (window_start + timedelta(days=day, hours=12)).timestamp()

    busy_days = [midday(day) for day in range(4) for _ in range(3)]
    quiet_days = [midday(day) for day in range(4, 219)]
    series = pd.Series(busy_days + quiet_days)
    largest_raw_daily_count = 3  # by construction
    counts, _unit, peak = runner._compute_histogram(series, (window_start, window_end))
    assert peak == max(counts)
    assert peak > largest_raw_daily_count
    assert peak == 12  # days 0-3 fold into bucket 0
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
def test_histogram_short_span_unchanged_by_cap() -> None:
    """Spans of at most 60 raw bins come back unfolded.

    Pins the exact pre-cap counts so the width cap cannot perturb the common
    case unnoticed.
    """
    window_start = datetime(2026, 5, 1, 0, 0, tzinfo=timezone.utc)
    span_days = 30
    window_end = window_start + timedelta(days=span_days)
    stamps = [
        (window_start + timedelta(days=day, hours=12)).timestamp()
        for day in range(span_days)
    ]
    counts, unit, peak = runner._compute_histogram(
        pd.Series(stamps), (window_start, window_end)
    )
    assert unit == "day"
    assert counts == [1] * 30  # exact pre-cap values, no folding
    assert peak == 1
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
def test_histogram_empty_frame_renders_no_events_line() -> None:
    """An empty count list renders the no-events fallback text."""
    output = _render_histogram([], "hr", 0)
    assert "no events in window" in output
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
def test_render_histogram_carries_axis_unit_label() -> None:
    """The rendered histogram names its bin unit ("hourly" or "daily")."""
    sample_counts = [1, 2, 3]
    hourly_text = _render_histogram(sample_counts, "hr", 3)
    assert "hourly bins" in hourly_text
    daily_text = _render_histogram(sample_counts, "day", 3)
    assert "daily bins" in daily_text
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
def test_render_histogram_carries_scale_anchor() -> None:
    """The rendered histogram includes the peak value as a scale anchor."""
    output = _render_histogram([1, 5, 3], "hr", 5)
    assert "peak: 5" in output
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
def test_bar_glyph_low_and_high() -> None:
    """Pin the glyphs for an empty bar, a full bar, and a half-height bar."""
    assert _bar_glyph(0, 10) == "▁"
    assert _bar_glyph(10, 10) == "█"
    # Tuple membership, not substring membership: `x in "▃▄▅"` would also pass
    # for an empty string or a multi-glyph string such as "▃▄".
    assert _bar_glyph(5, 10) in ("▃", "▄", "▅")
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
def test_format_count_thresholds() -> None:
    """Counts render plain below 1000, then with ``k`` and ``M`` suffixes."""
    expectations = (
        (42, "42"),
        (1500, "1.5k"),
        (14_200, "14.2k"),
        (3_400_000, "3.4M"),
    )
    for raw, rendered in expectations:
        assert _format_count(raw) == rendered
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
# ─── Ledes ───────────────────────────────────────────────────────────────────
|
|
508
|
+
|
|
509
|
+
def test_insights_silent_on_flat_pile() -> None:
    """Five unrelated one-off flows produce no insights at all."""
    rows = [_conn_row(src=f"10.0.0.{n}", dst=f"192.0.2.{n}") for n in range(5)]
    body = conn_digest.summarize(_conn_df(rows))
    assert body["insights"] == []
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
def test_insights_sort_by_slot_ratio_not_cell_string() -> None:
    """Four speaking slots go in; the top three by raw ``ratio`` come out in order."""
    # (label, entity, magnitude cell, ratio cell, magnitude, ratio)
    specs = [
        ("conn-share", "A", "10%", "2.0x", 10.0, 2.0),
        ("densest-tuple", "B → C:1", "5", "5.0x", 5.0, 5.0),
        ("fan-out", "D:2", "8 dsts", "4.0x", 8.0, 4.0),
        ("byte-direction", "E", "30%", "3.0x", 30.0, 3.0),
    ]
    slots = [
        DigestSlot(
            label=label,
            statistic="cliff",
            cells=[entity, magnitude_cell, ratio_cell],
            entity=entity,
            magnitude=magnitude,
            ratio=ratio,
        )
        for label, entity, magnitude_cell, ratio_cell, magnitude, ratio in specs
    ]
    insights, _ = _conn_insights_and_fields(slots)
    assert len(insights) == 3
    # Ratio-descending: densest-tuple (5.0), fan-out (4.0), byte-direction (3.0).
    first, second, third = insights[0], insights[1], insights[2]
    assert "B → C:1" in first
    assert "D:2" in second
    # The byte-direction lede must not lead with the vestigial " → " glyph:
    # that slot stores the bare dst, and only densest-tuple owns the
    # between-endpoints arrow.
    assert third.startswith("E ")
    assert "→" not in third
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
def test_insights_verbalize_identity_and_magnitude() -> None:
    """An insight line names the entity and its magnitude, never the statistic."""
    speaking = DigestSlot(
        label="densest-tuple",
        statistic="cliff",
        cells=["X → Y:22", "482", "3.7x"],
        entity="X → Y:22",
        magnitude=482.0,
        ratio=3.7,
    )
    insights, _ = _conn_insights_and_fields([speaking])
    assert len(insights) == 1
    sentence = insights[0]
    assert "X → Y:22" in sentence
    assert "482" in sentence
    # The raw statistic vocabulary must not show up in the prose.
    lowered = sentence.lower()
    assert "cliff" not in lowered
    assert "rank1" not in lowered
|
|
553
|
+
|
|
554
|
+
|
|
555
|
+
# ─── Display cap ─────────────────────────────────────────────────────────────
|
|
556
|
+
|
|
557
|
+
def test_format_ratio_cell_below_cap_renders_one_decimal() -> None:
    """Ratios under the display cap render as ``<one decimal>x`` in a cell."""
    for ratio, cell in ((3.7, "3.7x"), (49.9, "49.9x")):
        assert conn_digest._format_ratio_cell(ratio) == cell
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
def test_format_ratio_cell_at_or_above_cap_renders_capped_form() -> None:
    """Ratios at or beyond the cap render as ``>50x``; the boundary is inclusive."""
    for ratio in (50.0, 625000.0):
        assert conn_digest._format_ratio_cell(ratio) == ">50x"
|
|
566
|
+
|
|
567
|
+
|
|
568
|
+
def test_format_ratio_lede_below_cap_renders_one_decimal() -> None:
    """Ratios under the display cap render in a lede with one decimal place."""
    for raw_ratio, expected_text in ((3.7, "3.7x"), (49.9, "49.9x")):
        assert conn_digest._format_ratio_lede(raw_ratio) == expected_text
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
def test_format_ratio_lede_at_or_above_cap_renders_prose_form() -> None:
    """Ratios at or beyond the display cap use the prose wording in a lede."""
    for raw_ratio in (50.0, 625000.0):
        assert conn_digest._format_ratio_lede(raw_ratio) == "more than 50x"
|
|
576
|
+
|
|
577
|
+
|
|
578
|
+
def test_ledes_sort_by_true_ratio_when_one_slot_is_capped() -> None:
    """Lede ordering follows the stored raw ratio, not the capped display text.

    One slot carries a very large raw ratio whose rendered form is capped;
    the other carries a small ratio that is rendered literally. The capped
    slot must still be listed first, and its raw number must not show up in
    the rendered lede.
    """
    small_ratio_slot = DigestSlot(
        label="densest-tuple",
        statistic="cliff",
        cells=["B → C:22", "9", "5.0x"],
        entity="B → C:22",
        magnitude=9.0,
        ratio=5.0,
    )
    huge_ratio_slot = DigestSlot(
        label="byte-direction",
        statistic="cliff",
        cells=["A", "100%", ">50x"],
        entity="A",
        magnitude=100.0,
        ratio=625000.0,
    )
    # The smaller ratio is passed first so a pass proves sorting took place.
    ledes, _fields = _conn_insights_and_fields([small_ratio_slot, huge_ratio_slot])
    assert len(ledes) == 2
    leading, trailing = ledes[0], ledes[1]
    # The slot with raw ratio 625000 leads, rendered in capped prose form.
    assert leading.startswith("A ")
    assert "more than 50x" in leading
    assert "625000" not in leading  # raw number must NOT leak into the rendered string
    # The small-ratio slot follows with its literal ratio.
    assert "B → C:22" in trailing
    assert "5.0x" in trailing
|
|
606
|
+
|
|
607
|
+
|
|
608
|
+
# ─── Summariser shape ────────────────────────────────────────────────────────
|
|
609
|
+
|
|
610
|
+
def test_summarizer_returns_zone1_insights_fields_keys() -> None:
    """The conn summariser body exposes exactly the three expected keys."""
    frame = _conn_df([_conn_row()])
    summary = conn_digest.summarize(frame)
    expected_keys = {"zone1_extras", "insights", "fields"}
    assert set(summary.keys()) == expected_keys
|
|
614
|
+
|
|
615
|
+
|
|
616
|
+
def test_summarizer_zone1_extras_lead_with_hosts() -> None:
    """The first zone-1 extra produced by the summariser is labelled ``hosts``."""
    summary = conn_digest.summarize(_conn_df([_conn_row()]))
    leading_extra = summary["zone1_extras"][0]
    assert leading_extra[0] == "hosts"
|
|
620
|
+
|
|
621
|
+
|
|
622
|
+
# ─── Renderer (flat shape) ──────────────────────────────────────────────────
|
|
623
|
+
|
|
624
|
+
def _render_card(card: DigestCard) -> str:
    """Render *card* through a ``TextHandler`` backed by an in-memory stream.

    Returns whatever text the handler's stream holds after rendering.
    """
    text_handler = TextHandler(stream=io.StringIO())
    text_handler.render_digest(card)
    rendered_text = text_handler._stream.getvalue()
    return rendered_text
|
|
628
|
+
|
|
629
|
+
|
|
630
|
+
def _empty_card() -> DigestCard:
    """Build a zero-record ``conn`` card with no insights and no fields."""
    window_end = _NOW
    window_start = _NOW - timedelta(days=1)
    ambient_rows = [
        ("hosts", "0"),
        ("outbound bytes", "0 B"),
        ("inbound bytes", "0 B"),
    ]
    return DigestCard(
        schema="conn",
        source_name="conn.log",
        data_window=(window_start, window_end),
        record_count=0,
        histogram_counts=[],
        histogram_unit="hr",
        histogram_peak=0,
        zone1_extras=ambient_rows,
        insights=[],
        # Non-speaking slots are filtered out before they reach the card.
        fields=[],
    )
|
|
643
|
+
|
|
644
|
+
|
|
645
|
+
def test_render_digest_identity_then_ambient() -> None:
    """The flat card opens with the identity lines, then the ambient block.

    Expected layout: source name, window line, ``schema · N lines · size``
    line, a blank line, then the label-aligned ambient rows. None of the
    banner, schema rule, or N.B. footer text may appear.
    """
    output = _render_card(_empty_card())
    output_lines = output.splitlines()
    # Identity line 1 is the source name; identity line 3 is the schema line.
    assert output_lines[0] == "conn.log"
    assert output_lines[2].startswith("conn · 0 lines ·")
    # The ambient block is flush-left, so a row starts with its label.
    assert any(row.startswith("hosts:") for row in output_lines)
    # No banner, no schema rule, no N.B. footer.
    for forbidden in ("LogHunter", "── digest", "N.B."):
        assert forbidden not in output
|
|
658
|
+
|
|
659
|
+
|
|
660
|
+
def test_render_digest_non_speaking_slots_are_filtered_by_summariser() -> None:
    """Rendering a card with no fields prints no field-label rows.

    Slot selection is the summariser's job, so a non-speaking slot never
    lands in `card.fields`; the renderer only prints what it is handed and
    does not paint a `label: -` placeholder row.
    """
    output = _render_card(_empty_card())
    for absent_label in ("conn-share:", "fan-out:"):
        assert absent_label not in output
|
|
667
|
+
|
|
668
|
+
|
|
669
|
+
def test_render_digest_field_block_shows_cells_for_speaking_non_insight_slot() -> None:
    """A slot placed in ``card.fields`` is rendered as a labelled row of its cells."""
    window_end = _NOW
    window_start = _NOW - timedelta(days=1)
    speaking_slot = DigestSlot(
        label="densest-tuple",
        statistic="cliff",
        cells=["X → Y:22", "482", "3.7x"],
        entity="X → Y:22",
        magnitude=482.0,
        ratio=3.7,
    )
    populated_card = DigestCard(
        schema="conn",
        source_name="conn.log",
        data_window=(window_start, window_end),
        record_count=10,
        histogram_counts=[1, 2, 3],
        histogram_unit="hr",
        histogram_peak=3,
        zone1_extras=[("hosts", "1")],
        insights=[],
        fields=[speaking_slot],
        data_size_bytes=0,
    )
    output = _render_card(populated_card)
    expected_row = "densest-tuple: X → Y:22 482 3.7x"
    assert expected_row in output
|
|
690
|
+
|
|
691
|
+
|
|
692
|
+
# ─── Architectural fork: allowlist non-invocation ────────────────────────────
|
|
693
|
+
|
|
694
|
+
def test_run_digest_does_not_call_allowlist(tmp_path: Path, monkeypatch) -> None:
    """run_digest completes without touching the allowlist machinery.

    Both ``build_matcher`` and ``AllowlistMatcher.filter_df`` are swapped for
    stubs that raise, so any call into either one fails the test.
    """
    from loghunter.common import allowlist as allowlist_mod

    zeek_dir = tmp_path / "zeek"
    conn_rows = []
    for hours_back in range(1, 7):
        conn_rows.append(
            _conn_row(
                src="10.0.0.10",
                dst="192.0.2.50",
                ts=_BASE_TS - 3600 * hours_back,
                local_orig=True,
            )
        )
    _write_conn_ndjson(zeek_dir / "conn.log", conn_rows)

    sentinel = RuntimeError("digest path violated pre-allowlist tap")

    def explode(*_args, **_kwargs):
        raise sentinel

    def explode_filter(self, df, name):
        raise sentinel

    monkeypatch.setattr(allowlist_mod, "build_matcher", explode)
    monkeypatch.setattr(allowlist_mod.AllowlistMatcher, "filter_df", explode_filter)

    config: dict[str, Any] = {"loghunter": {"default_window": "all"}}
    # The run must finish cleanly; reaching either stub raises the sentinel.
    runner.run_digest(
        config=config,
        zeek_dir=zeek_dir,
        load_all=True,
        skip_confirm=True,
    )
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
# ─── Default-window paths ────────────────────────────────────────────────────
|
|
725
|
+
|
|
726
|
+
def test_run_digest_flat_layout_default_window_uses_data_max_ts(
    tmp_path: Path, monkeypatch, capsys
) -> None:
    """The flat-layout default window is anchored to the data's max timestamp.

    Regression lock: a (now - span, now) fallback would silently drop every
    record of an archived log whose newest timestamp lies in the past. The
    expected window is [max_ts - span, max_ts], taken from the data itself.
    """
    zeek_dir = tmp_path / "zeek"
    # Newest record sits five years in the past; older ones trail it by a day each.
    far_past_max = _BASE_TS - 5 * 365 * 86400
    archived_rows = [
        _conn_row(
            src=f"10.0.0.{day}",
            dst="192.0.2.20",
            ts=far_past_max - day * 86400,
        )
        for day in range(6)
    ]
    _write_conn_ndjson(zeek_dir / "conn.log", archived_rows)

    config: dict[str, Any] = {"loghunter": {"default_window": "1d"}}
    runner.run_digest(config=config, zeek_dir=zeek_dir, skip_confirm=True)
    rendered = capsys.readouterr().out
    # The flat card has no banner carrying a "Default window" note, so the
    # date shown in the rendered window is the only observable signal.
    newest_day = datetime.fromtimestamp(far_past_max, tz=timezone.utc).strftime("%Y-%m-%d")
    assert newest_day in rendered
|
|
757
|
+
|
|
758
|
+
|
|
759
|
+
def test_run_digest_dated_layout_default_window_uses_zeek_dated_helper(
    tmp_path: Path, monkeypatch, capsys
) -> None:
    """With dated subdirectories, the default window reports the newest date."""
    zeek_dir = tmp_path / "zeek"
    older_noon = datetime(2026, 5, 30, 12, 0, tzinfo=timezone.utc).timestamp()
    newer_noon = datetime(2026, 5, 31, 12, 0, tzinfo=timezone.utc).timestamp()

    # One conn.log per dated subdirectory, three records each.
    older_rows = []
    newer_rows = []
    for offset in range(3):
        older_rows.append(
            _conn_row(src=f"10.0.0.{offset}", dst="192.0.2.10", ts=older_noon + offset)
        )
        newer_rows.append(
            _conn_row(src=f"10.0.0.{offset+10}", dst="192.0.2.20", ts=newer_noon + offset)
        )
    _write_conn_ndjson(zeek_dir / "2026-05-30" / "conn.log", older_rows)
    _write_conn_ndjson(zeek_dir / "2026-05-31" / "conn.log", newer_rows)

    config: dict[str, Any] = {"loghunter": {"default_window": "1d"}}
    runner.run_digest(config=config, zeek_dir=zeek_dir, skip_confirm=True)
    rendered = capsys.readouterr().out
    # The most recent dated directory must be represented in the output.
    assert "2026-05-31" in rendered
|
|
781
|
+
|
|
782
|
+
|
|
783
|
+
def test_run_digest_bounded_target_skips_default_window(
    tmp_path: Path, monkeypatch, capsys
) -> None:
    """An explicit single conn.log is a bounded target: no default-window note."""
    log_file = tmp_path / "conn.log"
    far_past_max = _BASE_TS - 5 * 365 * 86400
    archived_rows = []
    for day in range(10):
        archived_rows.append(
            _conn_row(
                src=f"10.0.0.{day}",
                dst="192.0.2.20",
                ts=far_past_max - day * 86400,
            )
        )
    _write_conn_ndjson(log_file, archived_rows)

    config: dict[str, Any] = {"loghunter": {"default_window": "1d"}}
    runner.run_digest(config=config, zeek_dir=log_file, skip_confirm=True)
    rendered = capsys.readouterr().out
    # NOTE(review): a sibling test's comment says the flat card no longer has a
    # surface for the "Default window" note. If so this check cannot fail, and
    # an assertion on the rendered record count would be stronger — confirm.
    assert "Default window" not in rendered
|
|
800
|
+
|
|
801
|
+
|
|
802
|
+
# ─── Single-file Zeek bypass: filename-era basename gate retired ─────────────
|
|
803
|
+
|
|
804
|
+
# The single-file Zeek loader path used to route through discover_zeek_files,
|
|
805
|
+
# whose single-file branch gates on fnmatch(basename, pattern). After
|
|
806
|
+
# content-sniffing was added at the CLI layer, that filename gate started
|
|
807
|
+
# silently dropping date-prefixed files (e.g. 2026-06-09.conn.log) into
|
|
808
|
+
# zero-row cards. run_digest now bypasses discover_zeek_files for an explicit
|
|
809
|
+
# single Zeek file — only the Zeek path needed the fix (pihole/syslog/cloudtrail
|
|
810
|
+
# loaders already accept explicit files without a basename gate); the detect
|
|
811
|
+
# path is unchanged (it still uses the basename gate as a type check).
|
|
812
|
+
|
|
813
|
+
# Zeek TSV header for a conn log: five directive lines, then the tab-separated
# #fields and #types rows (fifteen columns each). Every line ends with "\n".
_TSV_CONN_HEADER = "".join(
    header_line + "\n"
    for header_line in (
        "#separator \\x09",
        "#set_separator\t,",
        "#empty_field\t(empty)",
        "#unset_field\t-",
        "#path\tconn",
        "\t".join((
            "#fields",
            "ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p",
            "proto", "service", "duration", "orig_bytes", "resp_bytes",
            "conn_state", "local_orig", "local_resp", "tunnel_parents",
        )),
        "\t".join((
            "#types",
            "time", "string", "addr", "port", "addr", "port",
            "enum", "string", "interval", "count", "count",
            "string", "bool", "bool", "set[string]",
        )),
    )
)
|
|
826
|
+
|
|
827
|
+
|
|
828
|
+
def test_run_digest_date_prefixed_zeek_ndjson_renders_card_with_rows(
    tmp_path: Path, capsys
) -> None:
    """A date-prefixed Zeek NDJSON file given explicitly renders a populated card.

    Before the single-file bypass, the basename gate in discover_zeek_files
    rejected this name for not matching ``conn*.log*``; run_digest was left
    with an empty frame and printed ``(no events in window)``.
    """
    log_file = tmp_path / "2026-06-09.conn.log"
    conn_rows = []
    for offset in range(6):
        conn_rows.append(
            _conn_row(src=f"10.0.0.{offset}", dst="192.0.2.20", ts=_BASE_TS - offset)
        )
    _write_conn_ndjson(log_file, conn_rows)

    config: dict[str, Any] = {"loghunter": {"default_window": "all"}}
    runner.run_digest(
        config=config,
        zeek_dir=log_file,
        load_all=True,
        skip_confirm=True,
    )
    rendered = capsys.readouterr().out
    # A histogram peak is present instead of the empty-frame fallback text.
    assert "(no events in window)" not in rendered
    assert "peak:" in rendered
|
|
853
|
+
|
|
854
|
+
|
|
855
|
+
def test_run_digest_date_prefixed_zeek_tsv_renders_card_with_rows(
    tmp_path: Path, capsys
) -> None:
    """A date-prefixed Zeek TSV file with header and data rows renders a card.

    Shows that the single-file bypass reaches the Zeek strategy's
    prefix-preserving sniff (TSV vs NDJSON dispatch across ``run_load``) and
    that the conn normalizer is applied, rather than only proving that sniff
    picked the right schema.
    """
    log_file = tmp_path / "2026-06-09.conn.log"
    # Two records with different timestamps give the timeline a non-zero
    # span, which the ts-confidence guard in run_digest requires. Span itself
    # is not what is under test here: discovery and TSV parser routing are.
    data_rows = (
        (
            "1748649600.000000", "CTest01", "192.0.2.10", "51514", "203.0.113.20", "443",
            "tcp", "ssl", "3.5", "1500", "8200", "SF", "T", "F", "(empty)",
        ),
        (
            "1748649660.000000", "CTest02", "192.0.2.11", "51515", "203.0.113.20", "443",
            "tcp", "ssl", "2.1", "800", "4400", "SF", "T", "F", "(empty)",
        ),
    )
    data_section = "".join("\t".join(columns) + "\n" for columns in data_rows)
    file_text = _TSV_CONN_HEADER + data_section + "#close\t2026-01-01-00:00:00\n"
    log_file.write_text(file_text, encoding="utf-8")

    config: dict[str, Any] = {"loghunter": {"default_window": "all"}}
    runner.run_digest(
        config=config,
        zeek_dir=log_file,
        load_all=True,
        skip_confirm=True,
    )
    rendered = capsys.readouterr().out
    assert "(no events in window)" not in rendered
    assert "peak:" in rendered
|
|
887
|
+
|
|
888
|
+
|
|
889
|
+
def test_run_digest_zeek_tsv_header_only_raises_digest_empty(
    tmp_path: Path
) -> None:
    """A header-only Zeek TSV conn file makes run_digest raise DigestEmpty.

    The complete ``#path conn`` header is enough for sniff to route the file
    as conn, but with zero data rows the loader hands back an empty frame.
    That is RECOGNIZED-BUT-EMPTY, signalled by DigestEmpty (a control signal,
    not an error).

    Gate 2 seam: rendering a zero-row schema card read as "we hunted and
    found nothing" when the truth is "we recognized it, there was nothing to
    read". The CLI catches DigestEmpty in both entry paths and narrates
    "recognized X as conn but no parseable records — skipping"; this test
    pins the raise at the runner level.
    """
    from loghunter.common.errors import DigestEmpty

    log_file = tmp_path / "2026-06-09.conn.log"
    header_only_text = _TSV_CONN_HEADER + "#close\t2026-01-01-00:00:00\n"
    log_file.write_text(header_only_text, encoding="utf-8")

    config: dict[str, Any] = {"loghunter": {"default_window": "all"}}
    with pytest.raises(DigestEmpty) as exc_info:
        runner.run_digest(
            config=config,
            zeek_dir=log_file,
            load_all=True,
            skip_confirm=True,
        )
    raised = exc_info.value
    assert raised.schema == "conn"
    assert raised.basename == log_file.name
|
|
918
|
+
|
|
919
|
+
|
|
920
|
+
def test_run_digest_plain_conn_log_still_renders_card_with_rows(
    tmp_path: Path, capsys
) -> None:
    """Regression: a file named exactly ``conn.log`` still renders a populated card.

    That name matched the old basename pattern, so this guards against the
    single-file bypass breaking the case that already worked.
    """
    log_file = tmp_path / "conn.log"
    conn_rows = []
    for offset in range(6):
        conn_rows.append(
            _conn_row(src=f"10.0.0.{offset}", dst="192.0.2.20", ts=_BASE_TS - offset)
        )
    _write_conn_ndjson(log_file, conn_rows)

    config: dict[str, Any] = {"loghunter": {"default_window": "all"}}
    runner.run_digest(
        config=config,
        zeek_dir=log_file,
        load_all=True,
        skip_confirm=True,
    )
    rendered = capsys.readouterr().out
    assert "(no events in window)" not in rendered
    assert "peak:" in rendered
|
|
940
|
+
|
|
941
|
+
|
|
942
|
+
# ─── Identity line 1: every card carries its source name ────────────────────
|
|
943
|
+
|
|
944
|
+
|
|
945
|
+
def test_run_digest_single_file_identity_line_is_basename(
    tmp_path: Path, capsys,
) -> None:
    """End-to-end: a single-file digest opens with the file's basename.

    There is no banner, and identity line 3 carries the exact record count
    without any glob-pattern key.
    """
    log_file = tmp_path / "2026-05-30.conn.log"
    conn_rows = []
    for offset in range(6):
        conn_rows.append(
            _conn_row(src=f"10.0.0.{offset}", dst="192.0.2.20", ts=_BASE_TS - offset)
        )
    _write_conn_ndjson(log_file, conn_rows)

    config: dict[str, Any] = {"loghunter": {"default_window": "all"}}
    runner.run_digest(
        config=config,
        zeek_dir=log_file,
        load_all=True,
        skip_confirm=True,
    )
    output_lines = capsys.readouterr().out.splitlines()
    assert output_lines[0] == "2026-05-30.conn.log"
    # Identity line 3: exact count, no glob-pattern key.
    schema_line = next(row for row in output_lines if row.startswith("conn · "))
    assert "6 lines" in schema_line
    assert "conn*.log*" not in schema_line
    # The flat grammar has no "Source:" banner row.
    assert not any(row.startswith("Source:") for row in output_lines)
|
|
971
|
+
|
|
972
|
+
|
|
973
|
+
def test_run_digest_directory_mode_identity_line_is_dir_name(
    tmp_path: Path, capsys,
) -> None:
    """A directory-mode digest opens with the directory's basename.

    source_name no longer distinguishes file targets from directory
    targets: every card gets an identity line.
    """
    zeek_dir = tmp_path / "zeek"
    conn_rows = []
    for offset in range(6):
        conn_rows.append(
            _conn_row(src=f"10.0.0.{offset}", dst="192.0.2.20", ts=_BASE_TS - offset)
        )
    _write_conn_ndjson(zeek_dir / "conn.log", conn_rows)

    config: dict[str, Any] = {"loghunter": {"default_window": "all"}}
    runner.run_digest(
        config=config,
        zeek_dir=zeek_dir,
        load_all=True,
        skip_confirm=True,
    )
    output_lines = capsys.readouterr().out.splitlines()
    assert output_lines[0] == "zeek"
    # The flat grammar prints neither a "Source:" nor a "Records:" row.
    for banner_prefix in ("Source:", "Records:"):
        assert not any(row.startswith(banner_prefix) for row in output_lines)
|
|
996
|
+
|
|
997
|
+
|
|
998
|
+
# ─── CLI dispatch and whitelist enforcement ──────────────────────────────────
|
|
999
|
+
|
|
1000
|
+
# One newline-terminated Zeek conn record in NDJSON form, written by the CLI
# dispatch tests as the entire content of a conn.log file.
_ZEEK_NDJSON_CONN_LINE = (
    '{"ts": 1779750000.0, '
    '"id.orig_h": "192.0.2.10", '
    '"id.resp_h": "198.51.100.20", '
    '"id.resp_p": 443, '
    '"proto": "tcp", '
    '"duration": 1.23}'
    "\n"
)
|
|
1004
|
+
|
|
1005
|
+
|
|
1006
|
+
def _write_zeek_conn_file(tmp_path: Path) -> Path:
    """Write the sample NDJSON conn record to ``conn.log`` under *tmp_path*.

    Returns the path of the file that was written.
    """
    target = tmp_path / "conn.log"
    target.write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")
    return target
|
|
1010
|
+
|
|
1011
|
+
|
|
1012
|
+
def test_cli_digest_dispatch_routes_to_run_digest(tmp_path: Path, monkeypatch) -> None:
    """``digest <file> --all`` is dispatched to ``run_digest`` with conn kwargs."""
    import loghunter.cli as cli
    import loghunter.runner as runner_mod

    received: dict[str, Any] = {}

    def fake_run_digest(**kwargs):
        received.update(kwargs)

    monkeypatch.setattr(runner_mod, "run_digest", fake_run_digest)
    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})

    conn_log = _write_zeek_conn_file(tmp_path)
    cli._main(["digest", str(conn_log), "--all"])

    assert received.get("schema") == "conn"
    # The CLI forwards the raw string; Path conversion belongs to the resolver.
    assert received.get("zeek_dir") == str(conn_log)
    assert received.get("load_all") is True
|
|
1028
|
+
|
|
1029
|
+
|
|
1030
|
+
def test_cli_digest_rejects_detect_flag(monkeypatch) -> None:
    """``digest --detect=...`` raises a ValueError that mentions the flag."""
    import loghunter.cli as cli

    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})
    argv = ["digest", "--detect=beacon"]
    with pytest.raises(ValueError, match="--detect"):
        cli._main(argv)
|
|
1035
|
+
|
|
1036
|
+
|
|
1037
|
+
def test_cli_digest_rejects_non_text_output(tmp_path: Path, monkeypatch) -> None:
    """``digest --output=json`` raises a ValueError whose message mentions ``text``."""
    import loghunter.cli as cli

    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})
    conn_log = _write_zeek_conn_file(tmp_path)
    argv = ["digest", str(conn_log), "--output=json", "--all"]
    with pytest.raises(ValueError, match="text"):
        cli._main(argv)
|
|
1043
|
+
|
|
1044
|
+
|
|
1045
|
+
def test_cli_digest_rejects_filter_flag() -> None:
    """``--filter`` is not part of the digest spec, so it is a plain unknown flag."""
    import loghunter.cli as cli

    argv = ["digest", "--filter=src=192.0.2.10"]
    with pytest.raises(ValueError, match="unknown flag --filter"):
        cli._main(argv)
|
|
1050
|
+
|
|
1051
|
+
|
|
1052
|
+
def test_cli_digest_rejects_arbitrary_unknown_long_flag() -> None:
    """An unrecognised long flag (``--field``) is reported as unknown."""
    import loghunter.cli as cli

    argv = ["digest", "--field=src"]
    with pytest.raises(ValueError, match="unknown flag --field"):
        cli._main(argv)
|
|
1056
|
+
|
|
1057
|
+
|
|
1058
|
+
def test_cli_digest_rejects_unknown_short_flag() -> None:
    """An unrecognised short flag (``-x``) is reported as unknown."""
    import loghunter.cli as cli

    argv = ["digest", "-x"]
    with pytest.raises(ValueError, match=r"unknown flag -x"):
        cli._main(argv)
|
|
1062
|
+
|
|
1063
|
+
|
|
1064
|
+
def test_cli_digest_accepts_y_short_flag(tmp_path: Path, monkeypatch) -> None:
    """The ``-y`` short flag reaches ``run_digest`` as ``skip_confirm=True``."""
    import loghunter.cli as cli
    import loghunter.runner as runner_mod

    received: dict[str, Any] = {}

    def fake_run_digest(**kwargs):
        received.update(kwargs)

    monkeypatch.setattr(runner_mod, "run_digest", fake_run_digest)
    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})

    conn_log = _write_zeek_conn_file(tmp_path)
    cli._main(["digest", str(conn_log), "-y", "--all"])
    assert received.get("skip_confirm") is True
|
|
1077
|
+
|
|
1078
|
+
|
|
1079
|
+
def test_cli_digest_missing_path_surfaces_actionable_error_and_exits_nonzero(
    monkeypatch, capsys,
) -> None:
    """A missing path is reported on stderr and the digest run returns 1.

    Per-path errors are surfaced inline; with no card rendered and at least
    one errored path, the three-way tally yields exit code 1.
    """
    import loghunter.cli as cli

    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})
    exit_code = cli._main(["digest", "/no/such/file/here.log"])
    streams = capsys.readouterr()
    assert "digest: path not found" in streams.err
    assert exit_code == 1
|
|
1090
|
+
|
|
1091
|
+
|
|
1092
|
+
def test_cli_digest_directory_positional_is_rejected_and_exits_nonzero(
    tmp_path: Path, monkeypatch, capsys,
) -> None:
    """A directory passed as the positional is rejected on stderr with exit 1.

    The v1 sniff insists on filenames, so directories do not fan out.
    """
    import loghunter.cli as cli

    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})
    logs_dir = tmp_path / "logs"
    logs_dir.mkdir()
    exit_code = cli._main(["digest", str(logs_dir)])
    streams = capsys.readouterr()
    assert "must be a file, not a directory" in streams.err
    assert exit_code == 1
|
|
1105
|
+
|
|
1106
|
+
|
|
1107
|
+
def test_cli_digest_empty_file_prints_message_and_skips(
    tmp_path: Path, monkeypatch, capsys,
) -> None:
    """A zero-byte file prints the "is empty" message and never reaches run_digest."""
    import loghunter.cli as cli
    import loghunter.runner as runner_mod

    received: dict[str, Any] = {}

    def fake_run_digest(**kwargs):
        received.update(kwargs)

    monkeypatch.setattr(runner_mod, "run_digest", fake_run_digest)
    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})

    empty_file = tmp_path / "nothing.log"
    empty_file.write_text("", encoding="utf-8")
    cli._main(["digest", str(empty_file)])

    streams = capsys.readouterr()
    assert "nothing.log is empty. Nothing to do!" in streams.out
    assert received == {}, "run_digest must NOT be invoked for an empty file"
|
|
1124
|
+
|
|
1125
|
+
|
|
1126
|
+
def test_cli_digest_whitespace_only_file_prints_message_and_skips(
    tmp_path: Path, monkeypatch, capsys,
) -> None:
    """A file holding only whitespace is treated as empty and skipped."""
    import loghunter.cli as cli
    import loghunter.runner as runner_mod

    received: dict[str, Any] = {}

    def fake_run_digest(**kwargs):
        received.update(kwargs)

    monkeypatch.setattr(runner_mod, "run_digest", fake_run_digest)
    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})

    blank_file = tmp_path / "blanks.log"
    blank_file.write_text("\n \n\t\n", encoding="utf-8")
    cli._main(["digest", str(blank_file)])

    streams = capsys.readouterr()
    assert "blanks.log is empty. Nothing to do!" in streams.out
    assert received == {}
|
|
1143
|
+
|
|
1144
|
+
|
|
1145
|
+
def test_cli_digest_unrecognized_text_routes_to_blob(
    tmp_path: Path, monkeypatch,
) -> None:
    """Text that matches no known schema is dispatched with ``schema="blob"``."""
    import loghunter.cli as cli
    import loghunter.runner as runner_mod

    received: dict[str, Any] = {}

    def fake_run_digest(**kwargs):
        received.update(kwargs)

    monkeypatch.setattr(runner_mod, "run_digest", fake_run_digest)
    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})

    unknown_file = tmp_path / "mystery.txt"
    unknown_file.write_text("hello world\nlorem ipsum\n", encoding="utf-8")
    cli._main(["digest", str(unknown_file)])

    assert received.get("schema") == "blob"
    assert received.get("blob_path") == unknown_file
|
|
1161
|
+
|
|
1162
|
+
|
|
1163
|
+
def test_cli_digest_bare_no_positional_uses_config_zeek_dir(
|
|
1164
|
+
tmp_path: Path, monkeypatch,
|
|
1165
|
+
) -> None:
|
|
1166
|
+
"""No positional → CLI passes config through unchanged; the config-driven
|
|
1167
|
+
conn fallback fires inside ``resolve_digest_source`` in ``run_digest``.
|
|
1168
|
+
|
|
1169
|
+
This test asserts the CLI seam shape (zeek_dir override is None — the
|
|
1170
|
+
config flows in via the config dict). The actual config-fallback
|
|
1171
|
+
resolution is tested at the resolver layer
|
|
1172
|
+
(tests/test_sources.py:test_digest_conn_override_wins-style coverage
|
|
1173
|
+
+ tests/test_root_provenance.py:test_runner_run_digest_applies_root_to_config_source_dirs).
|
|
1174
|
+
"""
|
|
1175
|
+
import loghunter.cli as cli
|
|
1176
|
+
import loghunter.runner as runner_mod
|
|
1177
|
+
called: dict[str, Any] = {}
|
|
1178
|
+
def fake_run_digest(**kwargs):
|
|
1179
|
+
called.update(kwargs)
|
|
1180
|
+
monkeypatch.setattr(runner_mod, "run_digest", fake_run_digest)
|
|
1181
|
+
zeek = tmp_path / "zeek"
|
|
1182
|
+
zeek.mkdir()
|
|
1183
|
+
monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {"zeek_dir": str(zeek)}})
|
|
1184
|
+
|
|
1185
|
+
cli._main(["digest"])
|
|
1186
|
+
assert called.get("schema") == "conn"
|
|
1187
|
+
# CLI seam: no override (None); config flows in via the config dict.
|
|
1188
|
+
assert called.get("zeek_dir") is None
|
|
1189
|
+
assert called["config"]["loghunter"]["zeek_dir"] == str(zeek)
|