PyPI - loghunter-cli - Versions diffs - 0.1.0.dev0__py3-none-any.whl - Mend

loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122) hide show

loghunter/__init__.py +3 -0
loghunter/cli.py +1108 -0
loghunter/cli_init.py +567 -0
loghunter/common/__init__.py +1 -0
loghunter/common/allowlist.py +436 -0
loghunter/common/clustering.py +326 -0
loghunter/common/config.py +221 -0
loghunter/common/display.py +323 -0
loghunter/common/errors.py +45 -0
loghunter/common/finding.py +239 -0
loghunter/common/loader/__init__.py +136 -0
loghunter/common/loader/diagnostics.py +94 -0
loghunter/common/loader/discovery.py +335 -0
loghunter/common/loader/io.py +76 -0
loghunter/common/loader/pipeline.py +1010 -0
loghunter/common/loader/sniff.py +184 -0
loghunter/common/loader/types.py +207 -0
loghunter/common/loader/windowing.py +523 -0
loghunter/common/output.py +93 -0
loghunter/common/paths.py +105 -0
loghunter/common/sources.py +392 -0
loghunter/data/allowlist/connections.txt +50 -0
loghunter/data/allowlist/domains_devices.txt +5 -0
loghunter/data/allowlist/domains_homelab.txt +5 -0
loghunter/data/allowlist/domains_universal.txt +125 -0
loghunter/data/config_example.toml +144 -0
loghunter/detectors/__init__.py +5 -0
loghunter/detectors/auth.py +27 -0
loghunter/detectors/aws.py +671 -0
loghunter/detectors/beacon.py +258 -0
loghunter/detectors/dns.py +778 -0
loghunter/detectors/dnsblock.py +29 -0
loghunter/detectors/duration.py +178 -0
loghunter/detectors/protocol.py +26 -0
loghunter/detectors/scan.py +735 -0
loghunter/detectors/ssl.py +25 -0
loghunter/detectors/syslog.py +266 -0
loghunter/detectors/weird.py +27 -0
loghunter/digest/__init__.py +43 -0
loghunter/digest/_stats.py +182 -0
loghunter/digest/blob.py +698 -0
loghunter/digest/cloudtrail.py +341 -0
loghunter/digest/conn.py +367 -0
loghunter/digest/dns.py +364 -0
loghunter/digest/syslog.py +269 -0
loghunter/exporters/__init__.py +534 -0
loghunter/exporters/cloudtrail.py +499 -0
loghunter/exporters/splunk.py +222 -0
loghunter/outputs/__init__.py +1 -0
loghunter/outputs/allowlist.py +75 -0
loghunter/outputs/csv.py +70 -0
loghunter/outputs/email.py +44 -0
loghunter/outputs/html.py +99 -0
loghunter/outputs/json.py +77 -0
loghunter/outputs/text.py +1422 -0
loghunter/parsers/__init__.py +1 -0
loghunter/parsers/cloudtrail.py +287 -0
loghunter/parsers/dnsmasq.py +331 -0
loghunter/parsers/syslog.py +150 -0
loghunter/parsers/zeek.py +294 -0
loghunter/parsers/zeek_tsv.py +310 -0
loghunter/runner.py +1895 -0
loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
migrations/cloudtrail_parquet.py +59 -0
migrations/conn_fft.py +550 -0
migrations/conn_scan.py +1097 -0
migrations/dns_dbscan.py +520 -0
migrations/get_syslog.py +402 -0
migrations/syslog_drain3.py +479 -0
scratch/junk/parquet.py +59 -0
tests/__init__.py +1 -0
tests/_cloudtrail_fakes.py +116 -0
tests/conftest.py +17 -0
tests/test_allowlist_defaults_accessor.py +90 -0
tests/test_architecture_spine.py +302 -0
tests/test_aws_detector.py +504 -0
tests/test_be_like_water.py +106 -0
tests/test_cli_help.py +342 -0
tests/test_cli_multi_positional.py +458 -0
tests/test_cloudtrail_exporter.py +631 -0
tests/test_cloudtrail_exporter_botocore.py +207 -0
tests/test_cloudtrail_parser.py +393 -0
tests/test_clustering.py +85 -0
tests/test_clustering_interruptible.py +404 -0
tests/test_config_cli.py +1006 -0
tests/test_config_example_drift.py +164 -0
tests/test_digest_blob.py +1237 -0
tests/test_digest_cli.py +1040 -0
tests/test_digest_cloudtrail.py +980 -0
tests/test_digest_conn.py +1189 -0
tests/test_digest_dns.py +770 -0
tests/test_digest_stats.py +282 -0
tests/test_digest_syslog.py +724 -0
tests/test_display.py +370 -0
tests/test_dns_detector.py +1010 -0
tests/test_dnsmasq_parser.py +467 -0
tests/test_duration_detector.py +491 -0
tests/test_export_orchestrator_shape.py +153 -0
tests/test_init_wizard.py +707 -0
tests/test_loader.py +3639 -0
tests/test_loader_package_surface.py +115 -0
tests/test_loader_window_model.py +215 -0
tests/test_output_path_cascade.py +575 -0
tests/test_resolve_path.py +111 -0
tests/test_root_provenance.py +212 -0
tests/test_runner.py +2599 -0
tests/test_scan_detector.py +455 -0
tests/test_search_paths.py +50 -0
tests/test_sniff_orchestrator.py +373 -0
tests/test_sniff_recognizers.py +573 -0
tests/test_source_resolution_seam.py +471 -0
tests/test_sources.py +648 -0
tests/test_splunk_exporter.py +351 -0
tests/test_syslog_detector.py +458 -0
tests/test_syslog_parser.py +582 -0
tests/test_text_output.py +1225 -0
tests/test_zeek_tsv_parser.py +580 -0

loghunter/common/allowlist.py ADDED Viewed

@@ -0,0 +1,436 @@
+"""Allowlist loading and matching.
+Two formats:
+- Pattern files: flat text, one glob or regex per line, # comments.
+  Used for high-volume domain and IP lists.
+- Stanza entries: TOML [[allowlist.entry]] blocks with match type, detector scoping,
+  and human-readable comments. Loaded from config inline or from allowlist_dir/*.toml.
+AllowlistMatcher is the runner's single interface for pre-detector suppression.
+Flat numeric rule format (for conn log suppression)
+─────────────────────────────────────────────────────
+One rule per line. # comments supported. Blank lines ignored.
+A rule is whitespace-separated tokens:
+  IP/CIDR/wildcard fields  — zero, one, or two; unordered for pair matching
+  port/proto token         — leading colon: :443  :123/udp  :*/tcp
+Examples:
+  192.0.2.10  198.51.100.1  :22/tcp    # specific pair, specific port+proto
+  192.0.2.10  :22                       # any flow involving this IP on port 22
+  192.0.2.0/24  :443                    # entire subnet, port 443, any proto
+  *  :123/udp                           # any host, UDP 123
+  :6556                                 # port only — suppress everywhere
+  192.0.2.33                            # bare IP — all traffic involving this host
+"""
+from __future__ import annotations
+import ipaddress
+import re
+from dataclasses import dataclass, field
+from fnmatch import fnmatch
+from pathlib import Path
+from typing import Any
+import pandas as pd
@dataclass
class AllowlistEntry:
    """One stanza-style allowlist record.

    The stanza loaders in this module copy the ``match``, ``comment`` and
    ``detectors`` keys of a TOML ``[[allowlist.entry]]`` table into the fields
    of the same name and gather every remaining key in ``extra``.
    """

    # Match type of the stanza (this module understands "ip_pair" and "dst_port").
    match: str
    # Free-form, human-readable note attached to the entry.
    comment: str = ""
    # Detector names the entry is scoped to; copied onto derived numeric rules.
    detectors: list[str] = field(default_factory=list)
    # All other stanza keys, e.g. src / dst / dst_port / value.
    extra: dict[str, Any] = field(default_factory=dict)
@dataclass
class NumericRule:
    """Suppression rule for connection rows, as parsed from a flat rule file.

    Every field defaults to "unconstrained". The IP pair is matched without
    regard to direction: a rule fires whichever end is the source and whichever
    is the destination.
    """

    # First IP field: exact address, CIDR range, '*' wildcard, or None (anything).
    ip1: str | None = None
    # Second IP field, same forms as ip1.
    ip2: str | None = None
    # Destination port number, or None to match any port.
    port: int | None = None
    # 'tcp', 'udp', 'icmp', or None to match any protocol.
    proto: str | None = None
    # When non-empty, the rule applies only to the listed detectors.
    detectors: list[str] = field(default_factory=list)
    # Human-readable note carried along with the rule.
    comment: str = ""
class AllowlistMatcher:
    """Pre-loaded allowlist ready to query.

    Holds three independent collections: domain patterns (globs, or regexes
    prefixed with ``re:``), stanza entries, and numeric connection rules.
    ``filter_df`` is the row-suppression entry point; ``is_domain_allowed``
    answers single-domain queries.
    """

    def __init__(
        self,
        domain_patterns: list[str] | None = None,
        entries: list[AllowlistEntry] | None = None,
        numeric_rules: list[NumericRule] | None = None,
    ) -> None:
        """Store the supplied collections; ``None`` means an empty collection."""
        self._domain_patterns: list[str] = domain_patterns or []
        self._entries: list[AllowlistEntry] = entries or []
        self._numeric_rules: list[NumericRule] = numeric_rules or []

    def is_domain_allowed(self, domain: str, detector: str | None = None) -> bool:
        """Return True if *domain* matches any loaded domain pattern.

        Matching is case-insensitive because DNS names are: glob patterns and
        the domain are both lower-cased before comparison, and ``re:`` patterns
        are searched with ``re.IGNORECASE``. ``detector`` is accepted for
        interface symmetry but does not influence domain matching.
        """
        if not self._domain_patterns:
            return False
        # Fold once per query rather than once per pattern. Lower-casing both
        # sides also makes fnmatch behave the same on every platform (it
        # normalises case on Windows but not on POSIX).
        folded = domain.lower()
        for pattern in self._domain_patterns:
            if pattern.startswith("re:"):
                if re.search(pattern[3:], domain, re.IGNORECASE):
                    return True
            elif fnmatch(folded, pattern.lower()):
                return True
        return False

    def filter_df(self, df: pd.DataFrame, detector: str) -> pd.DataFrame:
        """Remove allowlisted rows from a normalized log DataFrame.

        A frame with a ``query`` column is treated as DNS data and filtered by
        the domain patterns; any other frame is filtered by the numeric rules
        using the ``src``, ``dst``, ``port`` and ``proto`` columns. Rules that
        reference absent columns are skipped rather than raising. An empty
        frame is returned as-is; otherwise the result is a copy.
        """
        if df.empty:
            return df
        if "query" in df.columns:
            return self._filter_domain_df(df, detector)
        return self._filter_numeric_df(df, detector)

    def _filter_domain_df(self, df: pd.DataFrame, detector: str) -> pd.DataFrame:
        """Remove DNS rows whose query matches a loaded domain pattern."""
        # The second probe prepends a dummy label so a pattern such as
        # "*.example.com" also suppresses the bare apex "example.com".
        drop_mask = df["query"].map(
            lambda q: self.is_domain_allowed(str(q), detector)
            or self.is_domain_allowed("x." + str(q), detector)
        )
        return df[~drop_mask].copy()

    def _filter_numeric_df(self, df: pd.DataFrame, detector: str) -> pd.DataFrame:
        """Remove connection rows matching flat numeric suppression rules."""
        has_src = "src" in df.columns
        has_dst = "dst" in df.columns
        has_port = "port" in df.columns
        has_proto = "proto" in df.columns
        drop_mask = pd.Series(False, index=df.index)
        for rule in self._numeric_rules:
            # A rule scoped to specific detectors is ignored for all others.
            if rule.detectors and detector not in rule.detectors:
                continue
            rule_mask = _numeric_rule_mask(df, rule, has_src, has_dst, has_port, has_proto)
            drop_mask |= rule_mask
        return df[~drop_mask].copy()
def _numeric_rule_mask(
    df: pd.DataFrame,
    rule: NumericRule,
    has_src: bool,
    has_dst: bool,
    has_port: bool,
    has_proto: bool,
) -> pd.Series:
    """Return a boolean Series marking the rows of *df* that *rule* allowlists.

    The ``has_*`` flags say which canonical columns exist in *df*. A rule that
    constrains a column the frame lacks matches nothing. A rule with no
    constraints at all matches every row.
    """
    wants_port = rule.port is not None
    wants_proto = rule.proto is not None
    wants_ip = rule.ip1 is not None or rule.ip2 is not None

    # Any constraint on a missing column makes the rule unsatisfiable.
    unsatisfiable = (
        (wants_port and not has_port)
        or (wants_proto and not has_proto)
        or (wants_ip and not (has_src and has_dst))
    )
    if unsatisfiable:
        return pd.Series(False, index=df.index)

    # Start from "everything matches" and narrow per constrained dimension.
    selected = pd.Series(True, index=df.index)
    if wants_port:
        selected = selected & (df["port"] == rule.port)
    if wants_proto:
        selected = selected & (df["proto"] == rule.proto)
    if wants_ip:
        selected = selected & _ip_pair_mask(df, rule.ip1, rule.ip2)
    return selected
def _ip_pair_mask(
    df: pd.DataFrame,
    ip1: str | None,
    ip2: str | None,
) -> pd.Series:
    """Mark rows whose (src, dst) pair satisfies the rule, ignoring direction.

    When only one of *ip1* / *ip2* is given, a row matches if either endpoint
    matches it. When both are given, a row matches if the endpoints match the
    two specs in either order.
    """
    sources = df["src"]
    destinations = df["dst"]

    if ip1 is None or ip2 is None:
        # One-sided rule: whichever spec is present may sit at either end.
        lone = ip1 if ip2 is None else ip2
        return _ip_series_matches(sources, lone) | _ip_series_matches(destinations, lone)

    # Two-sided rule: accept ip1 -> ip2 as well as ip2 -> ip1.
    forward = _ip_series_matches(sources, ip1) & _ip_series_matches(destinations, ip2)
    backward = _ip_series_matches(sources, ip2) & _ip_series_matches(destinations, ip1)
    return forward | backward
def _ip_series_matches(series: pd.Series, spec: str) -> pd.Series:
    """Return a boolean Series: True where the address in *series* matches *spec*.

    *spec* is one of: ``'*'`` (matches every row), a CIDR range (anything
    containing ``/``; parsed with the stdlib ``ipaddress`` module, and matching
    nothing if it does not parse), or an exact address compared as a string.
    """
    if spec == "*":
        return pd.Series(True, index=series.index)

    if "/" not in spec:
        # Plain address: straightforward string equality.
        return series == spec

    try:
        network = ipaddress.ip_network(spec, strict=False)
    except ValueError:
        # Unparseable CIDR: the rule cannot match anything.
        return pd.Series(False, index=series.index)
    return series.map(lambda address: _ip_in_network(address, network))
def _ip_in_network(ip_str: Any, net: ipaddress.IPv4Network | ipaddress.IPv6Network) -> bool:
    """Report whether *ip_str* is a parseable IP address that lies inside *net*.

    Non-string values and strings that are not valid addresses yield False.
    """
    if isinstance(ip_str, str):
        try:
            address = ipaddress.ip_address(ip_str)
        except ValueError:
            return False
        return address in net
    return False
def _is_ip_literal(token: str) -> bool:
    """Return True if *token* parses as an IP address or CIDR network."""
    try:
        ipaddress.ip_network(token, strict=False)
    except ValueError:
        return False
    return True


def _parse_numeric_rule_line(line: str) -> NumericRule | None:
    """Parse one line of a flat numeric rule file into a NumericRule.

    Everything from the first ``#`` onward is a comment. The remainder is split
    on whitespace. A token with a leading colon is a port/proto token
    (``:443``, ``:123/udp``, ``:*/tcp``), unless it is itself a valid IP
    literal such as the IPv6 forms ``::1`` or ``::/0``. Every other token is an
    IP field (address, CIDR, or ``*``).

    Returns None for blank lines, comment-only lines, or malformed rules: a
    non-numeric port, a port outside 0-65535, or more than two IP fields.
    """
    body = line.partition("#")[0].strip()
    if not body:
        return None

    ip_tokens: list[str] = []
    port: int | None = None
    proto: str | None = None
    for token in body.split():
        # IPv6 literals may begin with "::", so a leading colon alone does not
        # make a token a port spec. No valid port token parses as an IP.
        if not token.startswith(":") or _is_ip_literal(token):
            ip_tokens.append(token)
            continue

        # Port/proto token: :443  :123/udp  :*/tcp
        port_part = token[1:]
        if "/" in port_part:
            port_str, proto_str = port_part.rsplit("/", 1)
            if proto_str != "*":
                proto = proto_str.lower()
        else:
            port_str = port_part
        if port_str == "*":
            continue  # wildcard port leaves the port unconstrained
        try:
            port = int(port_str)
        except ValueError:
            return None  # malformed port
        if not 0 <= port <= 65535:
            return None  # not a valid port number

    if len(ip_tokens) > 2:
        return None  # too many IP fields
    ip1 = ip_tokens[0] if len(ip_tokens) >= 1 else None
    ip2 = ip_tokens[1] if len(ip_tokens) >= 2 else None
    return NumericRule(ip1=ip1, ip2=ip2, port=port, proto=proto)
def load_pattern_file(path: Path) -> list[str]:
    """Read a flat domain pattern file and return its patterns in file order.

    The file is decoded as UTF-8. On each line, everything from the first
    ``#`` onward is discarded (so inline comments work as they do in numeric
    rule files), surrounding whitespace is trimmed, and lines left empty are
    skipped.
    """
    text = path.read_text(encoding="utf-8")
    candidates = (raw.partition("#")[0].strip() for raw in text.splitlines())
    return [pattern for pattern in candidates if pattern]
def load_numeric_rule_file(path: Path) -> list[NumericRule]:
    """Parse every line of a flat numeric rule file (UTF-8) into NumericRule objects.

    Lines the line parser rejects (blank, comment-only, malformed) are left
    out; the remaining rules keep their file order.
    """
    lines = path.read_text(encoding="utf-8").splitlines()
    parsed = (_parse_numeric_rule_line(raw) for raw in lines)
    return [rule for rule in parsed if rule is not None]
def load_stanza_file(path: Path) -> list[AllowlistEntry]:
    """Read a TOML stanza file and turn each ``[[allowlist.entry]]`` table into an AllowlistEntry.

    ``match`` is required on every table (a missing key raises ``KeyError``);
    ``comment`` and ``detectors`` are optional. All other keys of a table are
    preserved in the entry's ``extra`` mapping.
    """
    import tomllib

    with path.open("rb") as handle:
        document = tomllib.load(handle)

    reserved = ("match", "comment", "detectors")
    loaded: list[AllowlistEntry] = []
    for table in document.get("allowlist", {}).get("entry", []):
        leftovers = {key: value for key, value in table.items() if key not in reserved}
        loaded.append(
            AllowlistEntry(
                match=table["match"],
                comment=table.get("comment", ""),
                detectors=_as_list(table.get("detectors", [])),
                extra=leftovers,
            )
        )
    return loaded
# Domain allowlists bundled with the package. They are addressed relative to
# this module so the same lookup works for editable installs (loghunter/data/)
# and regular installs (site-packages/loghunter/data/).
# Package-local: NOT routed through LH_ROOT (that rail is for config-file values).
_PACKAGE_DIR = Path(__file__).parent.parent
_SHIPPED_DOMAIN_FILES: list[Path] = [
    _PACKAGE_DIR / "data" / "allowlist" / filename
    for filename in (
        "domains_universal.txt",
        "domains_homelab.txt",
        "domains_devices.txt",
    )
]
def build_matcher(config: dict[str, Any]) -> AllowlistMatcher:
    """Assemble an AllowlistMatcher from the ``[allowlist]`` section of *config*.

    Path values taken from the config are passed through
    ``resolve_path(value, root)``, with ``root`` obtained from
    ``effective_root(config)``. When a key is missing from the section, its
    value is taken from ``default_allowlist_paths()`` instead.

    Load order:
      1. shipped domain pattern files, then configured ``domain_patterns``;
      2. inline ``entry`` stanzas, then every ``*.toml`` in ``allowlist_dir``
         (sorted by path);
      3. configured ``connection_rules`` files, then numeric rules derived
         from the stanza entries.
    """
    from loghunter.common.config import default_allowlist_paths
    from loghunter.common.paths import effective_root, resolve_path

    section = config.get("allowlist", {})
    root = effective_root(config)
    fallback = default_allowlist_paths()

    def configured(key: str) -> Any:
        """Return the section's value for *key*, or the default when it is absent."""
        value = section.get(key)
        return fallback[key] if value is None else value

    def existing_files(key: str):
        """Yield, in order, each path listed under *key* that resolves and exists."""
        for raw_path in _as_list(configured(key)):
            resolved = resolve_path(raw_path, root)
            if resolved is None:
                continue
            candidate = Path(resolved)
            if candidate.exists():
                yield candidate

    # --- Domain patterns: shipped files form the base, user files layer on top.
    domain_patterns: list[str] = [
        pattern
        for shipped in _SHIPPED_DOMAIN_FILES
        if shipped.exists()
        for pattern in load_pattern_file(shipped)
    ]
    for pattern_file in existing_files("domain_patterns"):
        # A configured path equal to a shipped file was already loaded above.
        if pattern_file not in _SHIPPED_DOMAIN_FILES:
            domain_patterns.extend(load_pattern_file(pattern_file))

    # --- Stanza entries: inline config tables first, then allowlist_dir/*.toml.
    reserved = ("match", "comment", "detectors")
    entries: list[AllowlistEntry] = []
    for table in section.get("entry", []):
        leftovers = {key: value for key, value in table.items() if key not in reserved}
        entries.append(
            AllowlistEntry(
                match=table["match"],
                comment=table.get("comment", ""),
                detectors=_as_list(table.get("detectors", [])),
                extra=leftovers,
            )
        )
    stanza_dir = resolve_path(configured("allowlist_dir"), root)
    if stanza_dir:
        stanza_dir_path = Path(stanza_dir)
        if stanza_dir_path.is_dir():
            for stanza_file in sorted(stanza_dir_path.glob("*.toml")):
                entries.extend(load_stanza_file(stanza_file))

    # --- Numeric connection rules: only from configured files, never from
    # bundled package data, because they describe local topology.
    numeric_rules: list[NumericRule] = []
    for rule_file in existing_files("connection_rules"):
        numeric_rules.extend(load_numeric_rule_file(rule_file))

    # ip_pair / dst_port stanzas are also expressed as numeric rules so that
    # filter_df() applies them without a format migration.
    converted = (_stanza_to_numeric_rule(entry) for entry in entries)
    numeric_rules.extend(rule for rule in converted if rule is not None)

    return AllowlistMatcher(
        domain_patterns=domain_patterns,
        entries=entries,
        numeric_rules=numeric_rules,
    )
def _as_list(value: Any) -> list[str]:
    """Coerce a config value into a list of non-empty, trimmed strings.

    ``None`` gives an empty list. A string is split on commas. A list has each
    item converted with ``str``. In both of those cases items that are empty
    after trimming are dropped. Any other value becomes a single-item list
    holding its trimmed string form.
    """
    if isinstance(value, str):
        pieces = value.split(",")
    elif isinstance(value, list):
        pieces = [str(item) for item in value]
    elif value is None:
        return []
    else:
        return [str(value).strip()]

    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]
def _stanza_to_numeric_rule(entry: AllowlistEntry) -> NumericRule | None:
    """Convert a TOML stanza entry to a NumericRule for use in filter_df().

    Supported match types:
      * ``ip_pair`` reads ``src``, ``dst`` and the optional ``dst_port`` from
        ``entry.extra``.
      * ``dst_port`` reads ``value`` from ``entry.extra``.

    Returns None for any other match type, for a port that cannot be converted
    to an integer (mirroring the flat-file parser, which drops rules with a
    malformed port), and for an entry that ends up with no constraint at all.
    A NumericRule with no IP, port or proto matches every row, so passing such
    an entry through would silently suppress all connection data.
    """
    if entry.match == "ip_pair":
        src = entry.extra.get("src")
        dst = entry.extra.get("dst")
        raw_port = entry.extra.get("dst_port")
    elif entry.match == "dst_port":
        src = dst = None
        raw_port = entry.extra.get("value")
    else:
        return None

    port: int | None = None
    if raw_port is not None:
        try:
            port = int(raw_port)
        except (TypeError, ValueError):
            return None  # malformed port

    if src is None and dst is None and port is None:
        return None  # nothing to match on; refuse to build a match-all rule

    return NumericRule(
        ip1=src,
        ip2=dst,
        port=port,
        detectors=list(entry.detectors),
        comment=entry.comment,
    )