PyPI - loghunter-cli - Versions diffs - 0.1.0.dev0__py3-none-any.whl - Mend

loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122) hide show

loghunter/__init__.py +3 -0
loghunter/cli.py +1108 -0
loghunter/cli_init.py +567 -0
loghunter/common/__init__.py +1 -0
loghunter/common/allowlist.py +436 -0
loghunter/common/clustering.py +326 -0
loghunter/common/config.py +221 -0
loghunter/common/display.py +323 -0
loghunter/common/errors.py +45 -0
loghunter/common/finding.py +239 -0
loghunter/common/loader/__init__.py +136 -0
loghunter/common/loader/diagnostics.py +94 -0
loghunter/common/loader/discovery.py +335 -0
loghunter/common/loader/io.py +76 -0
loghunter/common/loader/pipeline.py +1010 -0
loghunter/common/loader/sniff.py +184 -0
loghunter/common/loader/types.py +207 -0
loghunter/common/loader/windowing.py +523 -0
loghunter/common/output.py +93 -0
loghunter/common/paths.py +105 -0
loghunter/common/sources.py +392 -0
loghunter/data/allowlist/connections.txt +50 -0
loghunter/data/allowlist/domains_devices.txt +5 -0
loghunter/data/allowlist/domains_homelab.txt +5 -0
loghunter/data/allowlist/domains_universal.txt +125 -0
loghunter/data/config_example.toml +144 -0
loghunter/detectors/__init__.py +5 -0
loghunter/detectors/auth.py +27 -0
loghunter/detectors/aws.py +671 -0
loghunter/detectors/beacon.py +258 -0
loghunter/detectors/dns.py +778 -0
loghunter/detectors/dnsblock.py +29 -0
loghunter/detectors/duration.py +178 -0
loghunter/detectors/protocol.py +26 -0
loghunter/detectors/scan.py +735 -0
loghunter/detectors/ssl.py +25 -0
loghunter/detectors/syslog.py +266 -0
loghunter/detectors/weird.py +27 -0
loghunter/digest/__init__.py +43 -0
loghunter/digest/_stats.py +182 -0
loghunter/digest/blob.py +698 -0
loghunter/digest/cloudtrail.py +341 -0
loghunter/digest/conn.py +367 -0
loghunter/digest/dns.py +364 -0
loghunter/digest/syslog.py +269 -0
loghunter/exporters/__init__.py +534 -0
loghunter/exporters/cloudtrail.py +499 -0
loghunter/exporters/splunk.py +222 -0
loghunter/outputs/__init__.py +1 -0
loghunter/outputs/allowlist.py +75 -0
loghunter/outputs/csv.py +70 -0
loghunter/outputs/email.py +44 -0
loghunter/outputs/html.py +99 -0
loghunter/outputs/json.py +77 -0
loghunter/outputs/text.py +1422 -0
loghunter/parsers/__init__.py +1 -0
loghunter/parsers/cloudtrail.py +287 -0
loghunter/parsers/dnsmasq.py +331 -0
loghunter/parsers/syslog.py +150 -0
loghunter/parsers/zeek.py +294 -0
loghunter/parsers/zeek_tsv.py +310 -0
loghunter/runner.py +1895 -0
loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
migrations/cloudtrail_parquet.py +59 -0
migrations/conn_fft.py +550 -0
migrations/conn_scan.py +1097 -0
migrations/dns_dbscan.py +520 -0
migrations/get_syslog.py +402 -0
migrations/syslog_drain3.py +479 -0
scratch/junk/parquet.py +59 -0
tests/__init__.py +1 -0
tests/_cloudtrail_fakes.py +116 -0
tests/conftest.py +17 -0
tests/test_allowlist_defaults_accessor.py +90 -0
tests/test_architecture_spine.py +302 -0
tests/test_aws_detector.py +504 -0
tests/test_be_like_water.py +106 -0
tests/test_cli_help.py +342 -0
tests/test_cli_multi_positional.py +458 -0
tests/test_cloudtrail_exporter.py +631 -0
tests/test_cloudtrail_exporter_botocore.py +207 -0
tests/test_cloudtrail_parser.py +393 -0
tests/test_clustering.py +85 -0
tests/test_clustering_interruptible.py +404 -0
tests/test_config_cli.py +1006 -0
tests/test_config_example_drift.py +164 -0
tests/test_digest_blob.py +1237 -0
tests/test_digest_cli.py +1040 -0
tests/test_digest_cloudtrail.py +980 -0
tests/test_digest_conn.py +1189 -0
tests/test_digest_dns.py +770 -0
tests/test_digest_stats.py +282 -0
tests/test_digest_syslog.py +724 -0
tests/test_display.py +370 -0
tests/test_dns_detector.py +1010 -0
tests/test_dnsmasq_parser.py +467 -0
tests/test_duration_detector.py +491 -0
tests/test_export_orchestrator_shape.py +153 -0
tests/test_init_wizard.py +707 -0
tests/test_loader.py +3639 -0
tests/test_loader_package_surface.py +115 -0
tests/test_loader_window_model.py +215 -0
tests/test_output_path_cascade.py +575 -0
tests/test_resolve_path.py +111 -0
tests/test_root_provenance.py +212 -0
tests/test_runner.py +2599 -0
tests/test_scan_detector.py +455 -0
tests/test_search_paths.py +50 -0
tests/test_sniff_orchestrator.py +373 -0
tests/test_sniff_recognizers.py +573 -0
tests/test_source_resolution_seam.py +471 -0
tests/test_sources.py +648 -0
tests/test_splunk_exporter.py +351 -0
tests/test_syslog_detector.py +458 -0
tests/test_syslog_parser.py +582 -0
tests/test_text_output.py +1225 -0
tests/test_zeek_tsv_parser.py +580 -0

migrations/syslog_drain3.py ADDED Viewed

@@ -0,0 +1,479 @@
+#!/usr/bin/env python3
+"""
+syslog_hunt.py — Syslog structural anomaly detection.
+Reads a flat syslog file (one RFC 3164 line per line, as produced by
+get_syslog.py), runs drain3 log templating followed by rarity-based
+anomaly scoring, and writes a plain-text report to ./hunt_output/.
+Pipeline:
+    1. Load & parse  — strip RFC 3164 PRI prefix and syslog header
+    2. Normalize     — collapse PID variants (sshd[1234] → sshd[*])
+    3. Template      — drain3 structural clustering
+    4. Score         — rarity ranking (bottom N percentile = anomalous)
+    5. Reboot detect — suppress per-host kernel boot bursts, emit single line
+    6. Report        — flat list of anomalous raw syslog lines
+Usage:
+    python syslog_hunt.py syslog_20260515_1d.log
+    python syslog_hunt.py --rarity 5 --max-count 2 syslog.log
+    python syslog_hunt.py --exclude host1.example.com host2.example.com syslog.log
+Cron example (daily, 06:00):
+    0 6 * * * cd /opt/hunt && python syslog_hunt.py syslog_$(date +%%Y%%m%%d)_1d.log
+Dependencies:
+    pip install drain3
+"""
+import argparse
+import re
+import sys
+from collections import defaultdict
+from datetime import datetime, timezone, timedelta
+from pathlib import Path
+# ── Dependency check ──────────────────────────────────────────────────────────
+try:
+    from drain3 import TemplateMiner
+    from drain3.template_miner_config import TemplateMinerConfig
+except ImportError:
+    print("ERROR: drain3 not installed. Run: pip install drain3")
+    sys.exit(1)
+# ── Compiled patterns ─────────────────────────────────────────────────────────
+# Leading "<digits>" priority tag at the very start of a line.
+PRI_RE        = re.compile(r'^<\d+>')
+# "Mon DD HH:MM:SS host " header (three word characters, day, time, one
+# non-space host token) — applied after the PRI tag has been removed.
+SYSLOG_HDR_RE = re.compile(r'^\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\S+\s+')
+# Bracketed all-digit token such as "[1234]"; normalize() rewrites it to "[*]".
+PROC_PID_RE   = re.compile(r'\[\d+\]')
+# Syslog timestamp for approximate event time parsing (no year — use current year)
+SYSLOG_TS_RE  = re.compile(r'^(\w{3})\s+(\d{1,2})\s+(\d{2}:\d{2}:\d{2})')
+# Reboot signal patterns — any of these in a message body triggers reboot detection
+# NOTE(review): re.IGNORECASE already makes the whole pattern case-insensitive,
+# so the explicit "[Ss]" class is redundant (harmless).
+REBOOT_SIGNALS_RE = re.compile(
+    r'(systemd-logind.*[Ss]ystem is rebooting|'
+    r'rsyslogd.*exiting on signal 15|'
+    r'systemd-shutdown.*Sending SIGTERM to remaining|'
+    r'kernel: Linux version\s)',
+    re.IGNORECASE
+)
+# ── Pipeline defaults ─────────────────────────────────────────────────────────
+# The three DRAIN_* values are copied onto TemplateMinerConfig in run_drain3().
+DRAIN_SIM_THRESH          = 0.5
+DRAIN_DEPTH               = 4
+DRAIN_PARAMETRIZE_NUMERIC = True
+# Defaults for the --rarity and --max-count command-line options.
+DEFAULT_RARITY_PCT        = 10
+DEFAULT_MAX_COUNT         = 1   # hard ceiling on template count regardless of percentile
+REBOOT_SUPPRESS_WINDOW    = 300  # seconds: suppress anomalies within this window of reboot
+# ── Text formatting ───────────────────────────────────────────────────────────
+# Column width of the rules drawn by banner() and section().
+WIDTH = 72
+def banner(text):
+    return "\n" + "═" * WIDTH + f"\n  {text}\n" + "═" * WIDTH
+def section(text):
+    return f"\n── {text} " + "─" * max(0, WIDTH - len(text) - 4)
+# ── Parsing ───────────────────────────────────────────────────────────────────
+def parse_host(raw):
+    """Extract hostname from RFC 3164 syslog line."""
+    stripped = PRI_RE.sub("", raw).strip()
+    parts = stripped.split()
+    return parts[3] if len(parts) >= 4 else "unknown"
+def strip_header(raw):
+    """Remove RFC 3164 PRI prefix and timestamp+hostname."""
+    raw = PRI_RE.sub("", raw)
+    return SYSLOG_HDR_RE.sub("", raw).strip()
+def normalize(msg):
+    """Collapse PID brackets so sshd[1234] and sshd[5678] share a template."""
+    return PROC_PID_RE.sub("[*]", msg)
+def parse_syslog_ts(raw):
+    """
+    Parse the syslog timestamp from a raw line. Returns a datetime in local
+    time (naive, current year assumed) or None if unparseable.
+    """
+    stripped = PRI_RE.sub("", raw).strip()
+    m = SYSLOG_TS_RE.match(stripped)
+    if not m:
+        return None
+    month_str, day_str, time_str = m.group(1), m.group(2), m.group(3)
+    year = datetime.now().year
+    try:
+        return datetime.strptime(
+            f"{year} {month_str} {day_str.zfill(2)} {time_str}",
+            "%Y %b %d %H:%M:%S"
+        )
+    except ValueError:
+        return None
+# ── Load ──────────────────────────────────────────────────────────────────────
+def load_syslog(path, exclude_hosts):
+    """
+    Read flat syslog file. Returns list of dicts:
+        raw      — original line
+        host     — parsed hostname
+        message  — stripped + normalized message body
+        ts       — datetime (local, naive) or None
+    """
+    events        = []
+    skipped_empty = 0
+    skipped_host  = 0
+    with open(path, encoding="utf-8", errors="replace") as f:
+        for line in f:
+            raw = line.rstrip("\n")
+            if not raw or raw.startswith("#"):
+                continue
+            host = parse_host(raw)
+            if exclude_hosts and host in exclude_hosts:
+                skipped_host += 1
+                continue
+            msg = normalize(strip_header(raw))
+            if not msg:
+                skipped_empty += 1
+                continue
+            events.append({
+                "raw":     raw,
+                "host":    host,
+                "message": msg,
+                "ts":      parse_syslog_ts(raw),
+            })
+    print(f"  Loaded        : {len(events):,} events")
+    if skipped_host:
+        print(f"  Excluded hosts: {skipped_host:,} events")
+    if skipped_empty:
+        print(f"  Skipped empty : {skipped_empty:,} events")
+    return events
+# ── Templating ────────────────────────────────────────────────────────────────
+def run_drain3(events):
+    """
+    Run drain3 on all events. Adds template_id and template_str in-place.
+    Each event's "message" is fed to a single TemplateMiner configured from
+    the module-level DRAIN_* constants; progress is printed to stdout.
+    Returns the same list object that was passed in.
+    """
+    cfg = TemplateMinerConfig()
+    cfg.drain_sim_th               = DRAIN_SIM_THRESH
+    cfg.drain_depth                = DRAIN_DEPTH
+    cfg.parametrize_numeric_tokens = DRAIN_PARAMETRIZE_NUMERIC
+    miner        = TemplateMiner(config=cfg)
+    n            = len(events)
+    # Refresh the progress line roughly every 5% of the input (at least every event).
+    report_every = max(1, n // 20)
+    print(f"  Templating {n:,} events...", end="", flush=True)
+    for i, ev in enumerate(events):
+        result = miner.add_log_message(ev["message"])
+        # NOTE(review): presumably the values recorded here reflect the cluster
+        # state at the moment this message was added — confirm against drain3.
+        ev["template_id"]  = result["cluster_id"]
+        ev["template_str"] = result["template_mined"]
+        if (i + 1) % report_every == 0:
+            # "\r" returns to column 0 so the percentage overwrites itself.
+            print(f"\r  Templating {n:,} events... {(i+1)/n*100:.0f}%",
+                  end="", flush=True)
+    n_templates = len({ev["template_id"] for ev in events})
+    print(f"\r  Templating complete: {n_templates:,} unique templates "
+          f"from {n:,} events")
+    return events
+# ── Rarity scoring ────────────────────────────────────────────────────────────
+def score_rarity(events, rarity_pct, max_count):
+    """
+    Flag events whose template count falls at or below the effective threshold.
+    Effective threshold = min(percentile-derived value, max_count).
+    Adds is_anomaly bool in-place. Returns (threshold, freq_dict).
+    """
+    freq = defaultdict(int)
+    for ev in events:
+        freq[ev["template_id"]] += 1
+    sorted_counts = sorted(freq.values())
+    idx           = max(0, int(len(sorted_counts) * rarity_pct / 100) - 1)
+    pct_threshold = sorted_counts[idx]
+    threshold = min(pct_threshold, max_count)
+    rare_ids = {tid for tid, count in freq.items() if count <= threshold}
+    for ev in events:
+        ev["is_anomaly"] = ev["template_id"] in rare_ids
+    n_anom = sum(ev["is_anomaly"] for ev in events)
+    print(f"  Rarity threshold : <= {threshold} events "
+          f"(pct={pct_threshold}, max_count cap={max_count})")
+    print(f"  Anomalous        : {len(rare_ids):,} templates  |  "
+          f"{n_anom:,} events ({n_anom/len(events)*100:.2f}%)")
+    return threshold, dict(freq)
+# ── Reboot detection ──────────────────────────────────────────────────────────
+def detect_reboots(events):
+    """
+    Scan all events for reboot signals. For each host, record the timestamp
+    of each detected reboot. Returns dict: host -> list of reboot datetimes.
+    """
+    reboots = defaultdict(list)
+    for ev in events:
+        if ev["ts"] and REBOOT_SIGNALS_RE.search(ev["raw"]):
+            reboots[ev["host"]].append(ev["ts"])
+    for host in reboots:
+        reboots[host].sort()
+    return dict(reboots)
+def apply_reboot_suppression(noise_events, reboots):
+    """
+    For each anomalous event, check if it falls within REBOOT_SUPPRESS_WINDOW
+    seconds after a detected reboot on the same host. If so, suppress it.
+    Returns:
+        kept          — anomalous events not suppressed
+        reboot_lines  — synthetic reboot annotation lines (one per reboot)
+        suppressed_n  — count of suppressed events
+    """
+    reboot_lines    = []
+    suppressed_n    = 0
+    kept            = []
+    emitted_reboots = set()  # (host, reboot_ts) already announced
+    for ev in noise_events:
+        host = ev["host"]
+        ts   = ev["ts"]
+        if ts is None or host not in reboots:
+            kept.append(ev)
+            continue
+        suppressed = False
+        for rts in reboots[host]:
+            delta = (ts - rts).total_seconds()
+            if 0 <= delta <= REBOOT_SUPPRESS_WINDOW:
+                # Emit a single reboot line the first time we see this reboot
+                key = (host, rts)
+                if key not in emitted_reboots:
+                    emitted_reboots.add(key)
+                    reboot_lines.append({
+                        "ts":   rts,
+                        "host": host,
+                        "raw":  f"*** {host} rebooted at "
+                                f"{rts.strftime('%a %b %d %H:%M:%S')} ***",
+                        "synthetic": True,
+                    })
+                suppressed = True
+                suppressed_n += 1
+                break
+        if not suppressed:
+            kept.append(ev)
+    return kept, reboot_lines, suppressed_n
+# ── Report building ───────────────────────────────────────────────────────────
+def time_range_str(events):
+    """Return a human-readable time range string from event timestamps."""
+    timestamps = [ev["ts"] for ev in events if ev["ts"] is not None]
+    if not timestamps:
+        return "unknown"
+    earliest = min(timestamps)
+    latest   = max(timestamps)
+    fmt      = "%a %b %d %H:%M:%S"
+    if earliest.date() == latest.date():
+        return (f"{earliest.strftime(fmt)} – "
+                f"{latest.strftime('%H:%M:%S')}")
+    return f"{earliest.strftime(fmt)} – {latest.strftime(fmt)}"
+def build_report(events, freq, threshold, rarity_pct, max_count,
+                 input_path, reboots):
+    run_ts    = datetime.now().strftime("%a %b %d %H:%M:%S %Y")
+    total     = len(events)
+    noise_raw = [ev for ev in events if ev["is_anomaly"]]
+    # Apply reboot suppression
+    kept, reboot_lines, suppressed_n = apply_reboot_suppression(
+        noise_raw, reboots
+    )
+    # Merge kept anomalies with synthetic reboot lines, sort by timestamp
+    all_findings = kept + reboot_lines
+    all_findings.sort(key=lambda ev: ev["ts"] if ev.get("ts") else datetime.min)
+    n_noise     = len(kept)
+    n_synthetic = len(reboot_lines)
+    pct_noise   = n_noise / total * 100 if total else 0
+    # Per-host totals (original events only)
+    host_total = defaultdict(int)
+    host_noise = defaultdict(int)
+    for ev in events:
+        host_total[ev["host"]] += 1
+    for ev in kept:
+        host_noise[ev["host"]] += 1
+    n_templates = len({ev["template_id"] for ev in kept})
+    out = []
+    # ── Header ──
+    out.append(banner(f"syslog_hunt.py  |  Anomaly Report  |  {run_ts}"))
+    # ── Summary ──
+    out.append(section("Summary"))
+    out.append(f"  Input              : {input_path.name}")
+    out.append(f"  Scan range         : {time_range_str(events)}")
+    out.append(f"  Total events       : {total:,}")
+    out.append(f"  Rarity threshold   : <= {threshold} events")
+    out.append(f"  Anomalous templates: {n_templates:,}")
+    out.append(f"  Anomalous events   : {n_noise:,}  ({pct_noise:.2f}%)")
+    if suppressed_n:
+        out.append(f"  Reboot-suppressed  : {suppressed_n:,} events "
+                   f"({n_synthetic} reboot(s) detected)")
+    # ── Host breakdown ──
+    out.append(section("Anomaly rate by host"))
+    sorted_hosts = sorted(
+        host_total.keys(),
+        key=lambda h: host_noise.get(h, 0) / host_total[h],
+        reverse=True,
+    )
+    for host in sorted_hosts:
+        tot  = host_total[host]
+        anom = host_noise.get(host, 0)
+        rate = anom / tot * 100 if tot else 0
+        bar  = "█" * min(40, int(rate * 4))
+        out.append(
+            f"  {host:<35}  {anom:>5,} / {tot:>8,}  ({rate:>5.2f}%)  {bar}"
+        )
+    # ── Findings ──
+    n_findings = len(all_findings)
+    out.append(section(f"Findings — {n_noise} anomalous events "
+                       f"({n_templates} templates)"
+                       + (f" + {n_synthetic} reboot(s)" if n_synthetic else "")))
+    for ev in all_findings:
+        out.append(f"  {ev['raw'][:200]}")
+    out.append(banner("End of report"))
+    return "\n".join(out) + "\n"
+# ── Main ──────────────────────────────────────────────────────────────────────
+def main():
+    parser = argparse.ArgumentParser(
+        description="Syslog structural anomaly detection.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__,
+    )
+    parser.add_argument(
+        "input",
+        type=Path,
+        help="Flat syslog file (one raw line per line)",
+    )
+    parser.add_argument(
+        "--rarity", "-r",
+        type=int,
+        default=DEFAULT_RARITY_PCT,
+        metavar="PCT",
+        help=f"Bottom N percentile flagged as anomalous (default: {DEFAULT_RARITY_PCT})",
+    )
+    parser.add_argument(
+        "--max-count", "-m",
+        type=int,
+        default=DEFAULT_MAX_COUNT,
+        dest="max_count",
+        help=f"Hard cap on template count (default: {DEFAULT_MAX_COUNT})",
+    )
+    parser.add_argument(
+        "--exclude", "-x",
+        nargs="+",
+        default=[],
+        metavar="HOST",
+        help="Hosts to exclude (e.g. --exclude host1.example.com host2.example.com)",
+    )
+    parser.add_argument(
+        "--out", "-o",
+        type=Path,
+        default=None,
+        help="Override output file path",
+    )
+    args = parser.parse_args()
+    if not args.input.exists():
+        print(f"ERROR: file not found: {args.input}")
+        sys.exit(1)
+    exclude_hosts = set(args.exclude)
+    # Output path
+    out_dir = Path("./hunt_output")
+    out_dir.mkdir(exist_ok=True)
+    if args.out:
+        outpath = args.out
+    else:
+        ts      = datetime.now().strftime("%Y%m%d_%H%M")
+        stem    = args.input.stem
+        outpath = out_dir / f"{stem}_anomalies_{ts}.txt"
+    # ── Run ──
+    print(banner(f"syslog_hunt.py  |  {args.input.name}"))
+    print(f"  File      : {args.input}  "
+          f"({args.input.stat().st_size / 1e6:.1f} MB)")
+    print(f"  Rarity    : bottom {args.rarity}th percentile  "
+          f"|  max_count cap={args.max_count}")
+    if exclude_hosts:
+        print(f"  Excluded  : {', '.join(sorted(exclude_hosts))}")
+    print(f"  Output    : {outpath}")
+    print(section("Stage 1 — Load"))
+    events = load_syslog(args.input, exclude_hosts)
+    if not events:
+        print("No events loaded. Check file and host exclusions.")
+        sys.exit(1)
+    hosts = sorted({ev["host"] for ev in events})
+    print(f"  Hosts     : {', '.join(hosts)}")
+    print(f"  Range     : {time_range_str(events)}")
+    print(section("Stage 2 — drain3 Templating"))
+    events = run_drain3(events)
+    print(section("Stage 3 — Rarity Scoring"))
+    threshold, freq = score_rarity(events, args.rarity, args.max_count)
+    print(section("Stage 4 — Reboot Detection"))
+    reboots = detect_reboots(events)
+    if reboots:
+        for host, times in sorted(reboots.items()):
+            for t in times:
+                print(f"  {host}: reboot at {t.strftime('%a %b %d %H:%M:%S')}")
+    else:
+        print("  No reboots detected.")
+    print(section("Stage 5 — Building Report"))
+    report = build_report(
+        events, freq, threshold,
+        args.rarity, args.max_count,
+        args.input, reboots,
+    )
+    outpath.write_text(report, encoding="utf-8")
+    print(f"  Written   : {outpath}  "
+          f"({outpath.stat().st_size / 1024:.1f} KB)")
+    print(report)
+if __name__ == "__main__":
+    main()

scratch/junk/parquet.py ADDED Viewed

@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+# flatten_own.py — one CloudTrail file → parquet (same projection as flaws)
+import json, sys, os
+import pandas as pd
+# Input path: first command-line argument, else a hard-coded sample file name.
+SRC = sys.argv[1] if len(sys.argv) > 1 else "cloudtrail_20260520_to_20260603_00h.json.log"
+# Output path: SRC with only its final extension replaced by ".parquet"
+# (so "x.json.log" becomes "x.json.parquet").
+OUT = os.path.splitext(SRC)[0] + ".parquet"
+# Event-name prefixes that flatten() treats as read operations (its "is_read" column).
+READ_PREFIXES = ("Get","List","Describe","Head","Lookup","Search","BatchGet","Select","Query","Scan")
+def principal(ui):
+    p = ui.get("principalId","") or ""
+    return p.split(":")[-1] if ":" in p else (p or ui.get("type","?"))
+def flatten(e):
+    ui = e.get("userIdentity",{}) or {}
+    attrs = (ui.get("sessionContext",{}) or {}).get("attributes",{}) or {}
+    name = e.get("eventName") or ""
+    return {
+        "eventTime": e.get("eventTime"),
+        "eventSource": (e.get("eventSource") or "").replace(".amazonaws.com",""),
+        "eventName": name,
+        "eventType": e.get("eventType"),
+        "awsRegion": e.get("awsRegion"),
+        "sourceIP": e.get("sourceIPAddress"),
+        "userAgent": e.get("userAgent"),
+        "id_type": ui.get("type"),
+        "principal": principal(ui),
+        "arn": ui.get("arn"),
+        "accountId": ui.get("accountId") or e.get("recipientAccountId"),
+        "invokedBy": ui.get("invokedBy"),
+        "mfa": attrs.get("mfaAuthenticated") == "true",
+        "accessKeyId": ui.get("accessKeyId"),
+        "readOnly_raw": e.get("readOnly"),
+        "is_read": name.startswith(READ_PREFIXES),
+        "errorCode": e.get("errorCode"),
+        "errorMessage": e.get("errorMessage"),
+        "has_request": bool(e.get("requestParameters")),
+        "has_response": bool(e.get("responseElements")),
+        "has_resources": bool(e.get("resources")),
+        "eventVersion": e.get("eventVersion"),
+        "eventID": e.get("eventID"),
+    }
+# tolerate either {"Records":[...]} OR one-JSON-per-line (your sample was JSONL)
+with open(SRC) as f:
+    text = f.read().strip()
+try:
+    obj = json.loads(text)
+    recs = obj["Records"] if isinstance(obj, dict) and "Records" in obj else (obj if isinstance(obj, list) else [obj])
+except json.JSONDecodeError:
+    recs = [json.loads(ln) for ln in text.splitlines() if ln.strip()]
+df = pd.DataFrame(flatten(e) for e in recs)
+df["eventTime"] = pd.to_datetime(df["eventTime"], errors="coerce", utc=True)
+df = df.sort_values("eventTime").reset_index(drop=True)
+df.to_parquet(OUT, engine="pyarrow", compression="zstd", index=False)
+print(f"{len(df):,} events → {OUT}  ({os.path.getsize(OUT)/1e6:.1f} MB)")
+print(f"span: {df['eventTime'].min()} → {df['eventTime'].max()}")

tests/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+"""LogHunter test suite."""

tests/_cloudtrail_fakes.py ADDED Viewed

@@ -0,0 +1,116 @@
+"""Shared FakeS3Client + envelope helpers for CloudTrail exporter tests.
+Lives here (not in either test file) so that the always-run mock test set in
+tests/test_cloudtrail_exporter.py does not transitively import botocore, while
+the botocore-gated set in tests/test_cloudtrail_exporter_botocore.py can reuse
+the same fakes. No botocore reference in this module.
+"""
+from __future__ import annotations
+import gzip
+import json
+from typing import Any
+def _gz_envelope(records: list[dict]) -> bytes:
+    """Encode a {"Records": [...]} envelope as gzipped JSON."""
+    return gzip.compress(json.dumps({"Records": records}).encode("utf-8"))
+class _Body:
+    def __init__(self, content: bytes):
+        self._content = content
+    def read(self) -> bytes:
+        return self._content
+class _FakePaginator:
+    def __init__(
+        self,
+        data: dict[str, dict[str, Any]],
+        log: list[str] | None = None,
+        prefix_errors: dict[str, Exception] | None = None,
+    ):
+        self.data = data
+        self.log = log if log is not None else []
+        self.prefix_errors = prefix_errors or {}
+    def paginate(self, Bucket: str, Prefix: str = "", Delimiter: str | None = None):
+        self.log.append(Prefix)
+        if Prefix in self.prefix_errors:
+            raise self.prefix_errors[Prefix]
+        keys = [k for k in self.data if k.startswith(Prefix)]
+        if Delimiter == "/":
+            common = set()
+            contents = []
+            for key in keys:
+                rest = key[len(Prefix):]
+                if "/" in rest:
+                    common.add(Prefix + rest.split("/", 1)[0] + "/")
+                else:
+                    contents.append({"Key": key, "Size": self.data[key]["size"]})
+            yield {
+                "CommonPrefixes": [{"Prefix": p} for p in sorted(common)],
+                "Contents": contents,
+            }
+        else:
+            yield {
+                "Contents": [
+                    {"Key": k, "Size": self.data[k]["size"]} for k in sorted(keys)
+                ],
+            }
+class FakeS3Client:
+    """Minimal in-memory S3 stub: list_objects_v2 (via paginator) + get_object."""
+    def __init__(self, data: dict[str, dict[str, Any]] | None = None):
+        self.data: dict[str, dict[str, Any]] = data or {}
+        self.get_object_keys: list[str] = []
+        self._get_object_errors: dict[str, Exception] = {}
+        self._list_error: Exception | None = None
+        self.list_prefix_log: list[str] = []
+        self._list_error_for_prefix: dict[str, Exception] = {}
+    def add_object(self, key: str, body: bytes, size: int | None = None) -> None:
+        self.data[key] = {"body": body, "size": size if size is not None else len(body)}
+    def add_year_root_marker(self, prefix: str) -> None:
+        """Force a 'CommonPrefix' under ``prefix`` for a YYYY/ directory.
+        Adds a synthetic '__keep__' key so listing finds the directory.
+        """
+        self.data[f"{prefix}__keep__"] = {"body": b"", "size": 0}
+    def set_get_object_error(self, key: str, exc: Exception) -> None:
+        self._get_object_errors[key] = exc
+    def set_list_error(self, exc: Exception) -> None:
+        self._list_error = exc
+    def set_list_error_for_prefix(self, prefix: str, exc: Exception) -> None:
+        """Raise ``exc`` when list_objects_v2 is called with exactly ``prefix``."""
+        self._list_error_for_prefix[prefix] = exc
+    def get_paginator(self, op: str):
+        if op != "list_objects_v2":
+            raise NotImplementedError(op)
+        if self._list_error is not None:
+            err = self._list_error
+            class _ErrorPaginator:
+                def paginate(self, **_):
+                    raise err
+            return _ErrorPaginator()
+        return _FakePaginator(
+            self.data, self.list_prefix_log, self._list_error_for_prefix
+        )
+    def get_object(self, Bucket: str, Key: str):
+        self.get_object_keys.append(Key)
+        if Key in self._get_object_errors:
+            raise self._get_object_errors[Key]
+        return {"Body": _Body(self.data[Key]["body"])}

tests/conftest.py ADDED Viewed

@@ -0,0 +1,17 @@
+"""Test fixtures shared across the suite."""
+from __future__ import annotations
+import pytest
+def pytest_configure(config: pytest.Config) -> None:
+    """Register custom markers."""
+    # Reserved for future opt-in/opt-out behaviour; the drift tripwire still
+    # uses it as a self-documenting hint that the test depends on real shipped
+    # _DEFAULTS, even though there is no longer an autouse fixture to opt out of.
+    config.addinivalue_line(
+        "markers",
+        "real_defaults: documents that the test depends on the actual shipped "
+        "_DEFAULTS (no per-test mutation of config defaults applied)",
+    )