loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,336 @@
1
+ Metadata-Version: 2.4
2
+ Name: loghunter-cli
3
+ Version: 0.1.0.dev0
4
+ Summary: ML-assisted network and log analysis toolkit for security practitioners and threat hunters.
5
+ Author-email: David Augros <code@augros.org>
6
+ License: MIT
7
+ Requires-Python: >=3.11
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: pandas>=2.0
11
+ Requires-Dist: numpy>=1.26
12
+ Requires-Dist: scikit-learn>=1.3
13
+ Requires-Dist: hdbscan>=0.8
14
+ Requires-Dist: drain3>=0.9
15
+ Requires-Dist: tqdm>=4.0
16
+ Requires-Dist: tldextract>=3.0
17
+ Provides-Extra: fast
18
+ Requires-Dist: fast-hdbscan>=0.2; extra == "fast"
19
+ Provides-Extra: splunk
20
+ Requires-Dist: splunk-sdk; extra == "splunk"
21
+ Provides-Extra: cloudtrail
22
+ Requires-Dist: boto3; extra == "cloudtrail"
23
+ Requires-Dist: botocore[crt]; extra == "cloudtrail"
24
+ Provides-Extra: all
25
+ Requires-Dist: loghunter-cli[fast]; extra == "all"
26
+ Requires-Dist: loghunter-cli[splunk]; extra == "all"
27
+ Requires-Dist: loghunter-cli[cloudtrail]; extra == "all"
28
+ Dynamic: license-file
29
+
30
+ # LogHunter
31
+
32
+ LogHunter is a local-first command-line threat-hunting workbench for self-hosters. You
33
+ point it at the logs you already have — Zeek, Pi-hole/dnsmasq, syslog, CloudTrail — and it
34
+ tells you what's in them and runs transparent detectors over them: beaconing, suspicious
35
+ DNS, port scans, rare syslog events, abnormally long connections, and unusual CloudTrail
36
+ activity. Every run names the technique behind each detector, so you always know whether a
37
+ finding came from a published algorithm or an honest heuristic.
38
+
39
+ **Not a SIEM. Not an agent. Not magic.** Nothing to deploy, no database, no daemon, no
40
+ account. Install it, point it at a directory of logs, read the output. It runs on the
41
+ admin's own box, over logs at rest.
42
+
43
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](#license)
44
+ ![Python 3.11+](https://img.shields.io/badge/python-3.11%2B-blue.svg)
45
+
46
+ > **Status: early / pre-1.0 (`0.1.0.dev0`).** The six detectors below work and are
47
+ > covered by tests, but interfaces may still move before 1.0. Feedback welcome.
48
+
49
+ <!-- TODO(screenshots): a real terminal capture of `loghunter ~/zeek` and a `digest` card go here. -->
50
+
51
+ A run opens with a summary banner — what was loaded, and which technique each detector
52
+ used — then groups findings by detector (illustrative output; addresses are
53
+ [RFC 5737](https://datatracker.ietf.org/doc/html/rfc5737) documentation space):
54
+
55
+ ```
56
+ LogHunter · Threat Hunt
57
+ ══════════════════════════════════════════════════════════════════════════════
58
+ Data found: 2026-05-31 00:00 → 2026-06-01 00:00 (24h)
59
+ Records: 1,284,402 conn.log · 318,221 dns.log · 44,019 *.log
60
+ Detectors: beacon (FFT) · dns (fast-HDBSCAN) · syslog (drain3) · scan [pattern] · duration [heuristics]
61
+ ══════════════════════════════════════════════════════════════════════════════
62
+
63
+ beacon — 2 findings · 1 H 1 M
64
+ ────────────────────────────────────────────────────────────────────────────────
65
+ [H] 192.0.2.37 → 198.51.100.20:443/tcp score 0.91 period 60.0s 1,440 conns
66
+ [M] 192.0.2.37 → 198.51.100.61:8443/tcp score 0.74 period 300.0s 288 conns
67
+
68
+ dns — 1 finding · 1 M
69
+ ────────────────────────────────────────────────────────────────────────────────
70
+ [M] dga-lookups.example entropy 3.91 14 subdomains cluster -1 (noise)
71
+ ```
72
+
73
+ The two-tier styling of the `Detectors:` line is deliberate: published techniques glow in
74
+ parentheses — `(FFT)`, `(HDBSCAN)`, `(drain3)` — while honest house methods are plain in
75
+ brackets — `[pattern]`, `[heuristics]`, `[statistical]`. The restraint is the point. A
76
+ heuristic is never dressed up as an algorithm, which is what makes the glow trustworthy.
77
+
78
+ ## Quick start
79
+
80
+ ```bash
81
+ pip install loghunter-cli
82
+
83
+ # one-time, detection-driven setup — finds your logs and writes a config
84
+ loghunter init
85
+
86
+ # hunt across everything enabled in your config
87
+ loghunter
88
+
89
+ # or point at a directory / file directly
90
+ loghunter ~/zeek-logs
91
+ loghunter syslog /var/log
92
+
93
+ # orient before you hunt — a fast, factual profile of a single file
94
+ loghunter digest /var/log/messages
95
+ ```
96
+
97
+ No config file is required to get started — `loghunter <path>` works against a directory or
98
+ a single file. `loghunter init` just makes it repeatable.
99
+
100
+ ## Why use LogHunter?
101
+
102
+ - **It runs where your logs are.** No services, no database, no daemon, no agent to push.
103
+ `pip install`, point it at a directory, get output. The only setup step that exists at all
104
+ is `loghunter init`, and that only writes a config file.
105
+ - **Real methods, made visible.** Beaconing is found with an FFT over connection timing;
106
+ DNS with HDBSCAN clustering over per-query behavior; rare syslog events with drain3
107
+ log-templating plus rarity scoring; CloudTrail with a transparent per-principal z-score
108
+ composite. Every run tells you which technique ran. You can read *why* something was
109
+ surfaced — no black box.
110
+ - **Big-tent ingestion.** One tool reads Zeek (NDJSON *and* TSV, flat *or* date-partitioned
111
+ directories), Pi-hole/dnsmasq, flat RFC 3164 syslog (Debian *and* RHEL/Fedora layouts),
112
+ and CloudTrail. Rotation and `.gz`/`.bz2`/`.xz` compression are handled transparently.
113
+ - **Orient before you hunt.** `loghunter digest FILE` reads a log and reports facts about
114
+ it — time span, top talkers, the shape of the mix — with zero verdicts. It's sonar, not a
115
+ baggage scanner: it tells you what's there so you know where to point the detectors.
116
+ - **Filter before analyze.** A flat-file allowlist suppresses known-good infrastructure
117
+ *before* any detector sees the data, so your noise floor is yours to set and detectors
118
+ never have to know the allowlist exists.
119
+ - **Honest output.** Findings carry a severity, the evidence behind the score, and (with
120
+ `-v`/`-vv`) the analyst pivots to chase next. Machine formats (`json`, `csv`, `html`) are
121
+ lossless; the terminal view is the one that summarizes.
122
+
123
+ ## Why *not* use LogHunter?
124
+
125
+ - **It is not real-time and not a SIEM.** It runs over logs at rest, in batches. There's no
126
+ streaming, no alerting pipeline, no live correlation across sources at scale. If you need an
127
+ always-on detection platform, you need a SIEM; LogHunter is the workbench you reach for to
128
+ *hunt*.
129
+ - **It is stateless between runs.** There's no persisted baseline and no rolling history.
130
+ CloudTrail "first-seen" novelty, for example, is relative to the window you loaded — not to
131
+ all of recorded time.
132
+ - **Detector coverage is v1.** Six detectors ship today (below). `auth`, `ssl`, `protocol`,
133
+ and `weird` are planned but not built.
134
+ - **The richest network signal wants Zeek.** Pi-hole/dnsmasq gives you DNS only — no RTT,
135
+ TTL, or connection correlation. LogHunter will tell you so and keep working, but Zeek is
136
+ where it shines.
137
+ - **It surfaces, it doesn't block.** This is a tool for a human triaging behavior, not a
138
+ signature IDS or an enforcement point.
139
+
140
+ ## What it hunts
141
+
142
+ | Detector | Surfaces | Method | Source |
143
+ |-----------|-----------------------------------------------------|------------------------------|--------------------------------|
144
+ | `beacon` | periodic C2-style callbacks | FFT over connection timing | Zeek `conn.log` |
145
+ | `dns` | DGA / tunneling / anomalous lookups | HDBSCAN clustering | Zeek `dns.log` **or** Pi-hole |
146
+ | `syslog` | rare events & reboots | drain3 templating + rarity | syslog (flat) **or** Zeek `syslog.log` |
147
+ | `scan` | vertical / horizontal / block / slow port scans | pattern (heuristic) | Zeek `conn.log` |
148
+ | `duration`| abnormally long-lived connections | heuristics | Zeek `conn.log` |
149
+ | `aws` | per-principal anomalous CloudTrail behavior | statistical (z-score composite) | CloudTrail `*.json` |
150
+
151
+ `dns` and `syslog` each answer **one** question across **two** source families — Zeek and
152
+ Pi-hole for DNS, flat rsyslog and Zeek's own `syslog.log` for syslog — and adapt to whichever
153
+ fidelity they're handed.
154
+
155
+ Run them all (`loghunter`), select some (`loghunter --detect=beacon,dns`), or exclude
156
+ (`loghunter --detect='all,!syslog'`). Each detector is also its own subcommand:
157
+ `loghunter beacon ~/zeek`.
158
+
159
+ ## How a run works
160
+
161
+ ```
162
+ discover & parse → allowlist (suppress) → detect → render
163
+ ```
164
+
165
+ Responsibilities don't bleed across that line. The **loader** finds files, decompresses,
166
+ normalizes every connection source to one canonical schema, and absorbs storage variation
167
+ (TSV vs. NDJSON, flat vs. dated directories, rotation). The **allowlist** suppresses
168
+ known-good traffic *before* analysis. **Detectors** only analyze — they never open files,
169
+ read config, or suppress. **Output handlers** only render. The CLI is the one place that
170
+ turns an error into an actionable message and owns the exit code.
171
+
172
+ Because detectors are pure analysis, every one is importable and callable as an ordinary
173
+ Python function — useful in a notebook when you want to experiment.
174
+
175
+ ### Analysis window
176
+
177
+ Pointed at a **directory**, an unqualified run looks back over the last `default_window`
178
+ (`1d` out of the box) of *that source's own* data — the right default for a live log dir
179
+ you don't want to read in full every time. Pointed at a **single file**, it reads the whole
180
+ file. Override either way:
181
+
182
+ ```bash
183
+ loghunter --since=7d ~/zeek # last 7 days
184
+ loghunter --since=2026-05-01 --until=2026-05-08 ~/zeek
185
+ loghunter --days=2-4 ~/zeek # 2 to 4 days ago
186
+ loghunter --all ~/zeek # the entire archive
187
+ ```
188
+
189
+ CloudTrail is the one source that opts out of the default window — novelty detection needs
190
+ full history, so it always loads in full unless you narrow it explicitly.
191
+
192
+ ## Orient before the hunt: `digest`
193
+
194
+ ```bash
195
+ loghunter digest /var/log/messages
196
+ loghunter digest conn.log dns.log # several files → several cards
197
+ ```
198
+
199
+ `digest` content-sniffs each file, routes it to the right summarizer (conn, dns, syslog,
200
+ cloudtrail), and falls back to a fast byte-profiler — **blob** — for anything it doesn't
201
+ recognize. A card is flush-left and factual: the file's time window, line count and size, a
202
+ scale-anchored histogram, and a handful of plain-language insights ("one client accounts for
203
+ 71% of queries"). It states facts and superlatives, never verdicts — no "suspicious," no
204
+ "anomalous." It reads your data *before* the allowlist, because everything in the file,
205
+ allowlisted or not, is part of "what's in here." The blob profiler is bounded: it samples a
206
+ big file rather than reading it, so a one-gigabyte mystery file costs the same as a
207
+ one-kilobyte one.
208
+
209
+ ## Installation
210
+
211
+ LogHunter is published on PyPI as **`loghunter-cli`** (the command, import package, and config
212
+ section are all `loghunter`).
213
+
214
+ ```bash
215
+ pip install loghunter-cli                 # core
215
+ pip install 'loghunter-cli[fast]'         # fast-hdbscan accelerator for DNS clustering
216
+ pip install 'loghunter-cli[splunk]'       # Splunk exporter
217
+ pip install 'loghunter-cli[cloudtrail]'   # CloudTrail (S3) exporter
218
+ pip install 'loghunter-cli[all]'          # everything above
220
+ ```
221
+
222
+ Requires **Python 3.11+**. A bare `pip install loghunter-cli` always works — the DNS clustering
223
+ runs on stock `hdbscan` (a base dependency); `[fast]` swaps in a numba-accelerated backend
224
+ when you want it, and the tool tells you which one is active on every run.
225
+
226
+ From source:
227
+
228
+ ```bash
229
+ git clone https://github.com/spiralbend/loghunter
230
+ cd loghunter
231
+ pip install -e '.[all]'
232
+ ```
233
+
234
+ ## Configuration
235
+
236
+ Configuration is optional — LogHunter runs against a path with none. When you want it
237
+ repeatable, `loghunter init` looks at the conventional locations on your box, profiles what
238
+ it finds (which log families, rough size, freshness — without reading a single log line),
239
+ and writes a fully-annotated `~/.loghunter/config.toml`. It never clobbers settings you
240
+ already have.
241
+
242
+ Config is loaded from the first of the following that is found:
243
+
244
+ 1. `--config=FILE`
245
+ 2. `~/.loghunter/config.toml`
246
+ 3. `/etc/loghunter/config.toml`
247
+
248
+ Everything LogHunter owns lives under the hidden `~/.loghunter/` — config, allowlists,
249
+ exports, reports — so it can't collide with a project directory. A trimmed example:
250
+
251
+ ```toml
252
+ [loghunter]
253
+ detect = "all" # "all" | "dns,beacon" | "all,!syslog"
254
+ zeek_dir = "/var/log/zeek"
255
+ syslog_dir = "/var/log"
256
+ # pihole_dir = "/var/log/pihole"
257
+ # cloudtrail_dir = "/var/log/cloudtrail"
258
+
259
+ home_net = ["10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16"]
260
+ default_window = "1d" # lookback for a directory; "" or "all" = full
261
+ output_format = "text" # text | json | csv | html
262
+ ```
263
+
264
+ Findings print to your terminal by default — keep it pipeable. Set `report_dir` (or pass
265
+ `--out=PATH`) to write report files instead. Every tunable a detector exposes is documented
266
+ as a commented "engine room" at the bottom of the generated config; you rarely need it, and
267
+ `loghunter <detector> --help` lists the full surface.
268
+
269
+ ## Log sources it speaks
270
+
271
+ - **Zeek** — `conn.log`, `dns.log`, `syslog.log`, in NDJSON or TSV, from a flat directory or
272
+ date-partitioned subdirectories. Rotation and gzip/bzip2/xz compression are transparent.
273
+ - **Pi-hole / dnsmasq** — DNS event logs, aggregated per domain for clustering.
274
+ - **syslog** — flat RFC 3164. Discovery is content-sniffed, not filename-matched, so it
275
+ handles both the Debian convention (`syslog`, `auth.log`, `kern.log`) and the RHEL/Fedora
276
+ one (extensionless `messages`, `secure`, `maillog`) — and won't mistake `dnf.log` or a
277
+ binary like `wtmp` for a log stream.
278
+ - **CloudTrail** — gzipped JSON event records, read locally or pulled from S3 (below).
279
+
280
+ ## The allowlist
281
+
282
+ Two kinds of allowlist file, never conflated:
283
+
284
+ - **Flat files = suppression.** One rule per line — an IP, a CIDR, a `:port/proto`, or a
285
+ domain glob/regex. Matching traffic is dropped before any detector runs. LogHunter ships a
286
+ curated domain list and never ships numeric connection suppressions (those depend on your
287
+ hosts, and shipping them could hide real findings).
288
+ - **TOML stanzas = classification.** When a detector needs to know *what* something is
289
+ (a nameserver, a backup client) rather than whether to drop it.
290
+
291
+ A bare host IP with no port suppresses *all* traffic involving that host — powerful, and
292
+ called out as such wherever it appears.
293
+
294
+ ## Pulling logs in: exporters
295
+
296
+ LogHunter can fetch logs from external systems to local files, which it then analyzes like
297
+ any other source — the syslog detector can't tell whether the data came from rsyslog or a
298
+ Splunk export.
299
+
300
+ ```bash
301
+ loghunter export # run the configured "default" query
302
+ loghunter export auth # run a named query
303
+ ```
304
+
305
+ - **Splunk** — named SPL queries under `[export.splunk.query.<name>]`. Prefer the
306
+ `LOGHUNTER_SPLUNK_USER` / `LOGHUNTER_SPLUNK_PASS` environment variables over plaintext
307
+ credentials in config.
308
+ - **CloudTrail** — pulls gzipped JSON from an S3 prefix. AWS authentication is *not* handled
309
+ here: you authenticate your shell, and boto3 resolves the ambient credential chain.
310
+ LogHunter never reads, stores, or prompts for AWS credentials, and warns before a large
311
+ egress.
312
+
313
+ ## Output formats
314
+
315
+ `text` (default, grouped and summarized), `json` (one finding per line, pipeable), `csv`
316
+ (flattened), and `html` (a self-contained file). Pass `--output=json` or set `output_format`
317
+ in config. `-v` adds the curated "why it scored" detail; `-vv` adds raw debug — template
318
+ strings, cluster membership, full evidence. Color is enhancement-only and TTY-gated: piped
319
+ or redirected output is always plain, and the machine formats never emit an escape code.
320
+
321
+ ## Building from source & running tests
322
+
323
+ ```bash
324
+ git clone https://github.com/spiralbend/loghunter
325
+ cd loghunter
326
+ pip install -e '.[all]'
327
+ python -m pytest
328
+ ```
329
+
330
+ `main` is kept runnable. Architecture tests cover the boundaries that matter — detector
331
+ discovery, run planning, loader metadata, allowlist suppression, output registration, and
332
+ CLI error formatting.
333
+
334
+ ## License
335
+
336
+ LogHunter is licensed under the [MIT License](LICENSE).
@@ -0,0 +1,122 @@
1
+ loghunter/__init__.py,sha256=JnW8Vxk8h2j3xybPMLovIw1PKGc0XtQ7bVMgiczJZeo,114
2
+ loghunter/cli.py,sha256=zME4pnkoPP268fEGytWwaCkqoyrljIDFwef8wBIND20,44799
3
+ loghunter/cli_init.py,sha256=fM6OF-dpgoSadJ0Z2oeUEw5_y2XxLYpxwFWB5N-DGIY,21808
4
+ loghunter/runner.py,sha256=SoXC-3MeEixpQDDiW3lDb5bClOrVzl5uuK0csGE0uCE,81179
5
+ loghunter/common/__init__.py,sha256=OLqoJrEX505jlqZYM0ziUIGhtxGcpeWM3ohq7kyoD18,68
6
+ loghunter/common/allowlist.py,sha256=VJdAtTXnnETxxuD-nC-YDRddGvS97CXjsYMV9Q20w8w,15943
7
+ loghunter/common/clustering.py,sha256=LyY2njihWuKrWJWr_qqmloYlpPqXd0Rsifq4S-C_afM,13822
8
+ loghunter/common/config.py,sha256=EVXlmvfLqfc7VMzsyW_Pyv19i1X7JZg38ZitHEpflZw,8266
9
+ loghunter/common/display.py,sha256=tB-1FeFsHlava16Qhs8dGrKeY3oFJ_jVFeY_cUZNQog,12078
10
+ loghunter/common/errors.py,sha256=43z9UPJv01Kx5uYcWI7QG5wKR4v6pYuUw5hxzpwZXBQ,1835
11
+ loghunter/common/finding.py,sha256=dy_kDOxsjJbzb8ArxfpUZZA6Lqyftbzqs0g_3kWCXQw,9557
12
+ loghunter/common/output.py,sha256=Q8REssHwUf7o79O5JYJPTrpDA3taP2b3DmqcJi6QwbE,3235
13
+ loghunter/common/paths.py,sha256=h6lUBfjxBmaeW7tq4fsy_fL8dL_IAvthVEMpu11NQPY,4176
14
+ loghunter/common/sources.py,sha256=gVtF9GhKHLuObkWhR2NBFh9JIGjwdwYRwtY0T7rO39M,16295
15
+ loghunter/common/loader/__init__.py,sha256=l9wpHOvGG0SuZTJbyrdd7lggIT7wx6-4V1kftOgw_Pk,4327
16
+ loghunter/common/loader/diagnostics.py,sha256=q4NoFpaP8H5FfLzxNUWr7F3bevbZHe1IjqfhxwJ5ZkM,3020
17
+ loghunter/common/loader/discovery.py,sha256=A4U4sOMY3pI9OEKgIGqEyC2VPc-3gglP1hpI_oOzmfc,15053
18
+ loghunter/common/loader/io.py,sha256=8oAwk5p1v0z4aX-xWAl9pCt4Rx49HYfMVQsR20JZReE,2893
19
+ loghunter/common/loader/pipeline.py,sha256=9ANkonqPTekWy8_nFU6oyAuhnYxZTlS3CBA1arAaUtA,42457
20
+ loghunter/common/loader/sniff.py,sha256=3Jhye2sUrFK6lFyMEjguqGOr0hrFCy9pp1HLQdfG4h4,7494
21
+ loghunter/common/loader/types.py,sha256=rMEE4vXVv9n2NBc_N6Fnv0kMjafBjarwsZA9qfRMZhQ,8296
22
+ loghunter/common/loader/windowing.py,sha256=f7YwMuKEkHO-21qd3je15I9zy1uI7nTQLeWKRjjfj6I,24237
23
+ loghunter/data/config_example.toml,sha256=FqBHFypjZbfd_OylOwU8vlQs_YJk4vHl3uvJZJW43so,7874
24
+ loghunter/data/allowlist/connections.txt,sha256=smNirWSiy8aKtfzt_Y5PEV-ifFs99DcJIpi30OED05E,2807
25
+ loghunter/data/allowlist/domains_devices.txt,sha256=qXKZweOzkJQFcMxo7d36oD4i-8TKgx9h2Wa6HTSirrQ,262
26
+ loghunter/data/allowlist/domains_homelab.txt,sha256=1MA3Zgf2ucofFP6jVWPjFeh9k8Jbj3X9J832Z8_lYMI,266
27
+ loghunter/data/allowlist/domains_universal.txt,sha256=dHDgUkaWQ7u8rkk8nFnY7WGnqckFSzWe0UmydmvQSM8,5305
28
+ loghunter/detectors/__init__.py,sha256=xp-UAxICRpgrLivPLx8xBeRjr2tgCKS0k1-PVCcMSXE,248
29
+ loghunter/detectors/auth.py,sha256=BD3SaqPzjWTB2hYvRSlMKGYDKTelDNwK8mWv6BDikUU,753
30
+ loghunter/detectors/aws.py,sha256=oPHIepYlIDn7pWpvFLfCRJckn1Oxm0bIk8zUZJDLXsQ,27678
31
+ loghunter/detectors/beacon.py,sha256=Hq4nWG90V91rKpTdBmXs2WGVNCCSd9Pm7gs5aAE7zxU,8437
32
+ loghunter/detectors/dns.py,sha256=H-gsRipx6wEzxcSG2oU5eOqs0IEW1M8JmH9jSoImhuY,29595
33
+ loghunter/detectors/dnsblock.py,sha256=Pj_5DZ3HimaqoSuoSo2_ZZQecBtekgolZzVO380Dfmk,949
34
+ loghunter/detectors/duration.py,sha256=zPyAj3RRAVkWQ12zWZkQZHQa2t9qUsSj5y8WkGp8QwI,6109
35
+ loghunter/detectors/protocol.py,sha256=Xz1xda-cuxjxQPmLtZ20cvQqWC-kjbCL5ZwsZF_zvcI,808
36
+ loghunter/detectors/scan.py,sha256=20O9nY01jxl8fdg1K27XLZXOTB6H8vSWkAFimgB20C8,29191
37
+ loghunter/detectors/ssl.py,sha256=UMGNtcQ3O9jkII2RauGzh_dtMxArP5V-hKVOHSYV7GI,717
38
+ loghunter/detectors/syslog.py,sha256=rRRwPgV7w2e3B20pNfXFs0WWCZ0TXXvIo2GaxIMpfBQ,9887
39
+ loghunter/detectors/weird.py,sha256=dKTmo2BALl-zGMny0W7yTxnUzWqxByN3HmouMU_PXps,756
40
+ loghunter/digest/__init__.py,sha256=G2HSlAqHpPgXMTrLqN4WoDgvKrVNO7s6R0N1AL0s818,1790
41
+ loghunter/digest/_stats.py,sha256=DpC6NGMgRO5fUy_nQL3JSe4NjCCff8TH5UpwODk-FSA,7961
42
+ loghunter/digest/blob.py,sha256=ec-2xy1klY1ifjtc96HgXas53KjT5cgow2Z0bsRglcw,30265
43
+ loghunter/digest/cloudtrail.py,sha256=mNQs-F25gfYxWSo-TPxqf1Ex735-QQA3mp1mcLtSWto,13928
44
+ loghunter/digest/conn.py,sha256=V4jupXKGntwimeUbj2ZwgxkxrhnQLGIIzHM1RIi-67U,13655
45
+ loghunter/digest/dns.py,sha256=U89aGpbJOZ6wx1AfaxS3_s3in1WkGbnySPqx3qTdFAU,13645
46
+ loghunter/digest/syslog.py,sha256=p7AabWoCyOpTHQr1XcJSOD25Lilu6dX4QYxj_uH6ROA,10771
47
+ loghunter/exporters/__init__.py,sha256=SAcr6rq2v6JOEcSIHfO4SI9WMt2My9eZGf9KDCcwYwA,23483
48
+ loghunter/exporters/cloudtrail.py,sha256=PJFTzu2iAh4WIyvSlGZEdJtq5sC1Zx2KBCD7Pr7-cFo,19627
49
+ loghunter/exporters/splunk.py,sha256=VQH4dGYKpJqYEc0-82ehoVnuN99KKza4DoFOdo7vGAM,8333
50
+ loghunter/outputs/__init__.py,sha256=KKGBVAJyvijLkkhydNU35UlHgMPCBtjwFYFUsBl95Qg,90
51
+ loghunter/outputs/allowlist.py,sha256=pO30cUZtwecHNuYESkwL8jVEV-uPcbSLD3YUayMtxrQ,2390
52
+ loghunter/outputs/csv.py,sha256=77iqHEzP42zwch0tXhHfymu1L0GbD9-xG7F-vO5kFlY,2559
53
+ loghunter/outputs/email.py,sha256=gKXu-MqnJlT3QTP3Q2tgtGdgpLJN0ljljJfJ0hddc1U,1335
54
+ loghunter/outputs/html.py,sha256=newe0pW9tVesqtDB7j-lb5QakdgMOsmNCQxTViYE78Q,3681
55
+ loghunter/outputs/json.py,sha256=1C4klX8xGnuJVxbfNGZBZETm6jQ0puC6axywclEsQWU,2859
56
+ loghunter/outputs/text.py,sha256=EFNZEEWyBiDmzq3W_UKhFpYaWFUHDMhy-m2eZSoQ1qc,59436
57
+ loghunter/parsers/__init__.py,sha256=AbI3M68ELw0VpyAfeS2y3Xdl090hE5_4Kuu4RjKM8T8,81
58
+ loghunter/parsers/cloudtrail.py,sha256=viEUH0VI_hnv1aW9dBP9sFTxIvAcUNYPDIyH4jUKF7E,11775
59
+ loghunter/parsers/dnsmasq.py,sha256=F_L-ZfyklWVdrlUamHjSwada381Zjp-1sO1iuphcoHI,13902
60
+ loghunter/parsers/syslog.py,sha256=rQhxfV0Jq6L6tjOeyjab1p6I1NdeyB_H8LC3kzyuCnA,5724
61
+ loghunter/parsers/zeek.py,sha256=Skbt3VOL6XmHisnahBpteVzOd6VaXfSLV3NesoSOYHo,12530
62
+ loghunter/parsers/zeek_tsv.py,sha256=VyTlFWdC98e-oylR3yb6Y_9o0jYhsM2N_KTJPOLZQ_s,10866
63
+ loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE,sha256=5fFNc6512wTlNpmNEVjE5ea6ZCN_1zUaNgjVaHM__3s,1069
64
+ migrations/cloudtrail_parquet.py,sha256=RTygEpOpgkgBF0-fsN2RQebDpL_ulM6WYWskuqTFYNg,2541
65
+ migrations/conn_fft.py,sha256=Ce0E9JisJOwP8JxZwKVN2NJLOdORlcvU-VuyFJ3vSWw,20955
66
+ migrations/conn_scan.py,sha256=WLNC5etwXx5ed6QxhPJBaObe51krA0tmNAbbE9v0a-A,47659
67
+ migrations/dns_dbscan.py,sha256=ZEse_mJbf1QzwH4qYntMPXTx4z32EqxyxYjAuQcMXoo,20437
68
+ migrations/get_syslog.py,sha256=lF1EaSOTgFOGWX7zyQnjIc9n6pyzjSndxw6CnRvVJ2U,15827
69
+ migrations/syslog_drain3.py,sha256=KFX9phlDVlsHNUJxuhDDnKrLB2FGzSkvcXxfGBZNwbs,17745
70
+ scratch/junk/parquet.py,sha256=RTygEpOpgkgBF0-fsN2RQebDpL_ulM6WYWskuqTFYNg,2541
71
+ tests/__init__.py,sha256=3012BSYwXeGf22gxMpxLqbMwWJqFGfYSWKkKseGSmqg,28
72
+ tests/_cloudtrail_fakes.py,sha256=dTDjdsdbVmV0rwdwkI8UQYyS0I3_7D_C0rNvEkdSx9U,4114
73
+ tests/conftest.py,sha256=zHaPHZSRmV7C-6k8aLpfdBB2vy60m3XBF92Qeeg9AJQ,629
74
+ tests/test_allowlist_defaults_accessor.py,sha256=tZko1MWVqRBfZ6u7_qSNKrF7Hd0SM2tFOMOJy-jjL1w,3547
75
+ tests/test_architecture_spine.py,sha256=LS6ta8shX0i5V2o4BQUs9PejafKTNzW7EM01WEa0-bU,11776
76
+ tests/test_aws_detector.py,sha256=bHXzlj3KPPDf9wpVHWcUaeAUEYnGdP0WU1kRlD8MHgk,21548
77
+ tests/test_be_like_water.py,sha256=3xNmCnZPubkbpp_7p2ngNi_cirUBsctHZxbSTJbPZik,4305
78
+ tests/test_cli_help.py,sha256=17WxnGi2-axprMiHrcPE-Ma3YbW5J-Xvo2kUOtz0jxg,12196
79
+ tests/test_cli_multi_positional.py,sha256=CPCgGiLNpyXpIM4XCkdR6S8rZD6lT7urF-_x_NQdorU,17315
80
+ tests/test_cloudtrail_exporter.py,sha256=ZrWvDGJLyEwH1wEkaO2ug3jNLhm55_tikGOLItkkslo,28537
81
+ tests/test_cloudtrail_exporter_botocore.py,sha256=UFJUTidxo_t5D3tFt1pYA-6FaT8sUeymN7TyquqTgpI,9325
82
+ tests/test_cloudtrail_parser.py,sha256=1YEx-CBXkNVsgtEhrx8Lr5n01YF_iNW2oHa7iJQgFXE,14984
83
+ tests/test_clustering.py,sha256=xWWGvOOzXanJmBx5PfGn37N9EzLjGqnmueerJPSLD6g,3217
84
+ tests/test_clustering_interruptible.py,sha256=jKs9yz6h87jjpPmSHa8vaM5MgWkmjDjlbpdpbp-jmsY,15572
85
+ tests/test_config_cli.py,sha256=TbH7GhCEztDLa3_-eLyigU5z2DqBntpX4UeOoRc6eyE,37853
86
+ tests/test_config_example_drift.py,sha256=AAVMhZF4LGGK_rx0O4HMa6_qvkF1IBepVUfwOzIDqzI,6498
87
+ tests/test_digest_blob.py,sha256=pFqN2D16XthRCuJCkf-DENtAXWbl4SwiaqRwJDQfgS0,47135
88
+ tests/test_digest_cli.py,sha256=JkP67mrkpOb3I0cy7J3ptNWf-pTKGfa0rbzIr6Xyby8,39081
89
+ tests/test_digest_cloudtrail.py,sha256=vhh0e05YAiJXSDZLG9IBv8y1m2Qtkd3lbllS8_vLEeY,41486
90
+ tests/test_digest_conn.py,sha256=XS8V93DWlfCJSflkPUYGnVf7E7tSJVELfNk3A1MZ0_E,48378
91
+ tests/test_digest_dns.py,sha256=6x7DCDMD8OKG_5-3hoFAAikBMf71DqYbCA6Om0XxDWQ,33067
92
+ tests/test_digest_stats.py,sha256=Pzpa3igG2gmIuD6NcpovNqIOwuvlIDOduXO_v4TFrxE,11409
93
+ tests/test_digest_syslog.py,sha256=ti9Wcl85YbKGkiVzXarCnERcRN6R-IeeEJpKHKZBMs8,29488
94
+ tests/test_display.py,sha256=uJR3NbEYb9Gj0fSFw36auzs7QaZ0LTXBoeeE4_B-k-I,13659
95
+ tests/test_dns_detector.py,sha256=8BQx_46O34SGOnPWUPp_-fAPMEer2XL9SiVxbs6oNqQ,44359
96
+ tests/test_dnsmasq_parser.py,sha256=6JpuqV1XXRveGYUKtuv1b1q7kUZHwXDnlTXAntAVHdc,18164
97
+ tests/test_duration_detector.py,sha256=QABfLZqOJS1imfgEqs-tJcbsahRShDbwfDT71cyX7tE,20711
98
+ tests/test_export_orchestrator_shape.py,sha256=U0wOt73CTqr2olWolxjy4pR0MeL4xm3F1QWgj_A_750,6002
99
+ tests/test_init_wizard.py,sha256=nSlcJaWKrEgaJpCYfuTmyhKj7nlxYB0bU0XgQ7FMxuc,28977
100
+ tests/test_loader.py,sha256=4ZdLxE65NddAtTezTzk2Wt8oB_jZUps8xuoxJhvRElk,142998
101
+ tests/test_loader_package_surface.py,sha256=8YKrjIG8j5bCedHK9ZMraEJFii466-a-Sp8ReKOgixw,4751
102
+ tests/test_loader_window_model.py,sha256=I0JxPv1x8x1ym_LtpDBPKDL62ufCfzsAMUtgRoHbKmk,8149
103
+ tests/test_output_path_cascade.py,sha256=9rwLs6bzXp-RmFKT4HCrPfwuUKNbrpDv1GgSoE3hxPs,23887
104
+ tests/test_resolve_path.py,sha256=Fn9JZCBq2L9dpzOEn_wBUfrnYRMcKroxXNBKBRObHHE,4194
105
+ tests/test_root_provenance.py,sha256=YQipG-Gk1zpN6gRf3pwW5T2sxhFkcUcBBWh7OVcwHeM,8526
106
+ tests/test_runner.py,sha256=1juGXDIU4XaRJ0zkXSEPa382UBx7GSaNvLX8Q_L8EyA,100144
107
+ tests/test_scan_detector.py,sha256=mMUxKi_2nLhe1pskGEF9dQgYKbAqT13Af4vjqB2HvKI,19061
108
+ tests/test_search_paths.py,sha256=CCcRK3SnTcz47XzaJAYM9LxVCFAh27FpI0dCSU2rd78,1941
109
+ tests/test_sniff_orchestrator.py,sha256=cApdN6_a9G_8WUw6qqkYqbJceEHf8phn9VP2o5UHac0,13161
110
+ tests/test_sniff_recognizers.py,sha256=6ZTksrwiLMg6R9ibOmyM4Zdg0DqK88P6P80UgKlJr8k,23085
111
+ tests/test_source_resolution_seam.py,sha256=_s1cXLCg1j5LfSjY9LDVubME5Ke5bQsq65xW6ULDLYE,18212
112
+ tests/test_sources.py,sha256=zIp_9LbTT_zB7xEsw7h1n6Ncgnpnc7cRA23SpYgSRqY,23811
113
+ tests/test_splunk_exporter.py,sha256=xgLdlAUTOPZno73RxvEmPt3rSUcIW1ACieoMq6XiRvU,13531
114
+ tests/test_syslog_detector.py,sha256=U5nUviGL-Lls_oKhoE1Pzk8eZuQGn11GI8TN-_jYlq4,18834
115
+ tests/test_syslog_parser.py,sha256=GY5b3p3dmy7Ub33MsrxQmboVdVrkBmNVC9klxoCmL9E,23917
116
+ tests/test_text_output.py,sha256=xQ7ruLPoCBn9-cDJyaYGiQoZhbGLL8jCH1QB8rj2P90,50027
117
+ tests/test_zeek_tsv_parser.py,sha256=0y3PEJ51OtAbtv6zagcWaQ4Cd9VQvubmqmZKt4sdItc,24504
118
+ loghunter_cli-0.1.0.dev0.dist-info/METADATA,sha256=owWqBy4ZY1KNgqeEbr8cBDBFd0V7LtYtWV_5xefRd-c,16748
119
+ loghunter_cli-0.1.0.dev0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
120
+ loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt,sha256=UIjid5xRmph1b5_1BHEiMdOy51etHMo7iVBtd2rktv4,49
121
+ loghunter_cli-0.1.0.dev0.dist-info/top_level.txt,sha256=5_xAFYP6ny5UGBOdWcEceM0sM11Yo_bePjQ_UUZoPdo,35
122
+ loghunter_cli-0.1.0.dev0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ loghunter = loghunter.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 David Augros
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,4 @@
1
+ loghunter
2
+ migrations
3
+ scratch
4
+ tests
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env python3
2
+ # flatten_own.py — one CloudTrail file → parquet (same projection as flaws)
3
+ import json, sys, os
4
+ import pandas as pd
5
+
6
# Input file: first command-line argument when given, otherwise the default
# capture name resolved against the current working directory.
if len(sys.argv) > 1:
    SRC = sys.argv[1]
else:
    SRC = "cloudtrail_20260520_to_20260603_00h.json.log"

# Output is written next to the input; only the LAST extension is replaced,
# so "x.json.log" becomes "x.json.parquet".
OUT = f"{os.path.splitext(SRC)[0]}.parquet"

# Event-name prefixes treated as read-only calls (consumed via str.startswith).
READ_PREFIXES = (
    "Get",
    "List",
    "Describe",
    "Head",
    "Lookup",
    "Search",
    "BatchGet",
    "Select",
    "Query",
    "Scan",
)
10
+
11
def principal(ui):
    """Return a short principal label for a ``userIdentity`` dict.

    When ``principalId`` contains a colon, the text after the last colon is
    returned (presumably the session-name part of an assumed-role id --
    confirm against the CloudTrail userIdentity reference). Otherwise the
    whole ``principalId`` is returned. If ``principalId`` is missing, null or
    empty, the identity ``type`` is used, and "?" when that is also missing,
    null or empty.
    """
    principal_id = ui.get("principalId") or ""
    if ":" in principal_id:
        return principal_id.split(":")[-1]
    # `or "?"` rather than a .get() default: a record with an explicit
    # `"type": null` must still yield the "?" placeholder instead of None.
    return principal_id or ui.get("type") or "?"
14
+
15
def flatten(e):
    """Project one raw event dict onto the flat row stored in the parquet file.

    Missing or null nested objects (``userIdentity``, ``sessionContext``,
    ``attributes``) are treated as empty dicts, so absent fields come out as
    ``None`` (or ``False`` for the boolean flags) instead of raising.
    Key order is significant: it becomes the column order of the output.
    """
    identity = e.get("userIdentity") or {}
    session = identity.get("sessionContext") or {}
    session_attrs = session.get("attributes") or {}
    event_name = e.get("eventName") or ""
    event_source = e.get("eventSource") or ""

    # Event-level fields.
    row = {
        "eventTime": e.get("eventTime"),
        # Drop the service-domain suffix so only the short service name remains.
        "eventSource": event_source.replace(".amazonaws.com", ""),
        "eventName": event_name,
        "eventType": e.get("eventType"),
        "awsRegion": e.get("awsRegion"),
        "sourceIP": e.get("sourceIPAddress"),
        "userAgent": e.get("userAgent"),
    }

    # Caller-identity fields.
    row.update({
        "id_type": identity.get("type"),
        "principal": principal(identity),
        "arn": identity.get("arn"),
        # Fall back to the recipient account when the identity carries none.
        "accountId": identity.get("accountId") or e.get("recipientAccountId"),
        "invokedBy": identity.get("invokedBy"),
        # True only for the exact string "true".
        "mfa": session_attrs.get("mfaAuthenticated") == "true",
        "accessKeyId": identity.get("accessKeyId"),
    })

    # Read/write classification, errors, payload-presence flags, bookkeeping.
    row.update({
        "readOnly_raw": e.get("readOnly"),
        # Name-prefix heuristic, independent of the raw readOnly value above.
        "is_read": event_name.startswith(READ_PREFIXES),
        "errorCode": e.get("errorCode"),
        "errorMessage": e.get("errorMessage"),
        "has_request": bool(e.get("requestParameters")),
        "has_response": bool(e.get("responseElements")),
        "has_resources": bool(e.get("resources")),
        "eventVersion": e.get("eventVersion"),
        "eventID": e.get("eventID"),
    })
    return row
44
+
45
# Tolerate either {"Records":[...]}, a bare JSON array, a single JSON object,
# OR one-JSON-object-per-line (JSONL).
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default.
with open(SRC, encoding="utf-8") as f:
    text = f.read().strip()
try:
    obj = json.loads(text)
except json.JSONDecodeError:
    # Not one JSON document: parse line by line, skipping blank lines.
    recs = [json.loads(ln) for ln in text.splitlines() if ln.strip()]
else:
    if isinstance(obj, dict) and "Records" in obj:
        recs = obj["Records"]
    elif isinstance(obj, list):
        recs = obj
    else:
        recs = [obj]

if not recs:
    # An empty DataFrame has no "eventTime" column, so the steps below would
    # fail with an unhelpful KeyError; stop with a clear message instead.
    sys.exit(f"no events found in {SRC}")

df = pd.DataFrame([flatten(e) for e in recs])
# Unparseable timestamps become NaT rather than aborting the conversion.
df["eventTime"] = pd.to_datetime(df["eventTime"], errors="coerce", utc=True)
df = df.sort_values("eventTime").reset_index(drop=True)
df.to_parquet(OUT, engine="pyarrow", compression="zstd", index=False)
print(f"{len(df):,} events → {OUT} ({os.path.getsize(OUT)/1e6:.1f} MB)")
print(f"span: {df['eventTime'].min()} → {df['eventTime'].max()}")