ocsf-mapper 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. ocsf_mapper-0.3.1/PKG-INFO +349 -0
  2. ocsf_mapper-0.3.1/README.md +308 -0
  3. ocsf_mapper-0.3.1/pyproject.toml +70 -0
  4. ocsf_mapper-0.3.1/setup.cfg +4 -0
  5. ocsf_mapper-0.3.1/src/ocsf_mapper/__init__.py +33 -0
  6. ocsf_mapper-0.3.1/src/ocsf_mapper/_fastjson.py +52 -0
  7. ocsf_mapper-0.3.1/src/ocsf_mapper/apply.py +378 -0
  8. ocsf_mapper-0.3.1/src/ocsf_mapper/audit.py +115 -0
  9. ocsf_mapper-0.3.1/src/ocsf_mapper/benchmark.py +191 -0
  10. ocsf_mapper-0.3.1/src/ocsf_mapper/catalog.py +109 -0
  11. ocsf_mapper-0.3.1/src/ocsf_mapper/cli.py +408 -0
  12. ocsf_mapper-0.3.1/src/ocsf_mapper/coverage.py +92 -0
  13. ocsf_mapper-0.3.1/src/ocsf_mapper/generate.py +233 -0
  14. ocsf_mapper-0.3.1/src/ocsf_mapper/lint.py +123 -0
  15. ocsf_mapper-0.3.1/src/ocsf_mapper/mapping_diff.py +281 -0
  16. ocsf_mapper-0.3.1/src/ocsf_mapper/ops.py +181 -0
  17. ocsf_mapper-0.3.1/src/ocsf_mapper/parallel.py +181 -0
  18. ocsf_mapper-0.3.1/src/ocsf_mapper/providers/__init__.py +53 -0
  19. ocsf_mapper-0.3.1/src/ocsf_mapper/providers/anthropic.py +38 -0
  20. ocsf_mapper-0.3.1/src/ocsf_mapper/providers/base.py +21 -0
  21. ocsf_mapper-0.3.1/src/ocsf_mapper/providers/fixture.py +58 -0
  22. ocsf_mapper-0.3.1/src/ocsf_mapper/providers/openai.py +42 -0
  23. ocsf_mapper-0.3.1/src/ocsf_mapper/py.typed +0 -0
  24. ocsf_mapper-0.3.1/src/ocsf_mapper/redact.py +154 -0
  25. ocsf_mapper-0.3.1/src/ocsf_mapper/registry.py +78 -0
  26. ocsf_mapper-0.3.1/src/ocsf_mapper/replay.py +186 -0
  27. ocsf_mapper-0.3.1/src/ocsf_mapper/schema.py +149 -0
  28. ocsf_mapper-0.3.1/src/ocsf_mapper/schema_diff.py +261 -0
  29. ocsf_mapper-0.3.1/src/ocsf_mapper/sinks/__init__.py +70 -0
  30. ocsf_mapper-0.3.1/src/ocsf_mapper/sinks/base.py +41 -0
  31. ocsf_mapper-0.3.1/src/ocsf_mapper/sinks/csv.py +66 -0
  32. ocsf_mapper-0.3.1/src/ocsf_mapper/sinks/jsonl.py +31 -0
  33. ocsf_mapper-0.3.1/src/ocsf_mapper/sinks/parquet.py +64 -0
  34. ocsf_mapper-0.3.1/src/ocsf_mapper/sinks/security_lake.py +157 -0
  35. ocsf_mapper-0.3.1/src/ocsf_mapper/sinks/stdout.py +16 -0
  36. ocsf_mapper-0.3.1/src/ocsf_mapper/stream.py +93 -0
  37. ocsf_mapper-0.3.1/src/ocsf_mapper/validate.py +90 -0
  38. ocsf_mapper-0.3.1/src/ocsf_mapper/web/__init__.py +16 -0
  39. ocsf_mapper-0.3.1/src/ocsf_mapper/web/app.py +650 -0
  40. ocsf_mapper-0.3.1/src/ocsf_mapper.egg-info/PKG-INFO +349 -0
  41. ocsf_mapper-0.3.1/src/ocsf_mapper.egg-info/SOURCES.txt +67 -0
  42. ocsf_mapper-0.3.1/src/ocsf_mapper.egg-info/dependency_links.txt +1 -0
  43. ocsf_mapper-0.3.1/src/ocsf_mapper.egg-info/entry_points.txt +2 -0
  44. ocsf_mapper-0.3.1/src/ocsf_mapper.egg-info/requires.txt +26 -0
  45. ocsf_mapper-0.3.1/src/ocsf_mapper.egg-info/top_level.txt +1 -0
  46. ocsf_mapper-0.3.1/tests/test_apply.py +187 -0
  47. ocsf_mapper-0.3.1/tests/test_audit.py +162 -0
  48. ocsf_mapper-0.3.1/tests/test_benchmark.py +116 -0
  49. ocsf_mapper-0.3.1/tests/test_catalog.py +63 -0
  50. ocsf_mapper-0.3.1/tests/test_cef_leef.py +150 -0
  51. ocsf_mapper-0.3.1/tests/test_cli.py +124 -0
  52. ocsf_mapper-0.3.1/tests/test_coverage.py +68 -0
  53. ocsf_mapper-0.3.1/tests/test_fastjson.py +94 -0
  54. ocsf_mapper-0.3.1/tests/test_generate.py +92 -0
  55. ocsf_mapper-0.3.1/tests/test_lint.py +115 -0
  56. ocsf_mapper-0.3.1/tests/test_mapping_diff.py +216 -0
  57. ocsf_mapper-0.3.1/tests/test_ops.py +282 -0
  58. ocsf_mapper-0.3.1/tests/test_parallel.py +192 -0
  59. ocsf_mapper-0.3.1/tests/test_provider_mocks.py +291 -0
  60. ocsf_mapper-0.3.1/tests/test_providers.py +86 -0
  61. ocsf_mapper-0.3.1/tests/test_redact.py +138 -0
  62. ocsf_mapper-0.3.1/tests/test_registry.py +105 -0
  63. ocsf_mapper-0.3.1/tests/test_replay.py +189 -0
  64. ocsf_mapper-0.3.1/tests/test_schema.py +57 -0
  65. ocsf_mapper-0.3.1/tests/test_schema_diff.py +243 -0
  66. ocsf_mapper-0.3.1/tests/test_sinks.py +280 -0
  67. ocsf_mapper-0.3.1/tests/test_stream.py +110 -0
  68. ocsf_mapper-0.3.1/tests/test_validate.py +73 -0
  69. ocsf_mapper-0.3.1/tests/test_web.py +361 -0
@@ -0,0 +1,349 @@
1
+ Metadata-Version: 2.4
2
+ Name: ocsf-mapper
3
+ Version: 0.3.1
4
+ Summary: Map arbitrary log sources to OCSF events via a declarative JSON DSL.
5
+ Author: ocsf-parse contributors
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/ocsf/ocsf-parse
8
+ Project-URL: Issues, https://github.com/ocsf/ocsf-parse/issues
9
+ Keywords: ocsf,security,logs,parser,siem
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Information Technology
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Security
19
+ Requires-Python: >=3.9
20
+ Description-Content-Type: text/markdown
21
+ Provides-Extra: anthropic
22
+ Requires-Dist: anthropic>=0.34; extra == "anthropic"
23
+ Provides-Extra: openai
24
+ Requires-Dist: openai>=1.40; extra == "openai"
25
+ Provides-Extra: parquet
26
+ Requires-Dist: pyarrow>=14; extra == "parquet"
27
+ Provides-Extra: fast
28
+ Requires-Dist: orjson>=3.10; extra == "fast"
29
+ Provides-Extra: web
30
+ Requires-Dist: fastapi>=0.110; extra == "web"
31
+ Requires-Dist: uvicorn[standard]>=0.27; extra == "web"
32
+ Requires-Dist: jinja2>=3.1; extra == "web"
33
+ Requires-Dist: python-multipart>=0.0.9; extra == "web"
34
+ Provides-Extra: dev
35
+ Requires-Dist: pytest>=8; extra == "dev"
36
+ Requires-Dist: pytest-cov>=5; extra == "dev"
37
+ Requires-Dist: httpx>=0.27; extra == "dev"
38
+ Requires-Dist: fastapi>=0.110; extra == "dev"
39
+ Requires-Dist: jinja2>=3.1; extra == "dev"
40
+ Requires-Dist: python-multipart>=0.0.9; extra == "dev"
41
+
42
+ # ocsf-parse
43
+
44
+ Self-service tool that maps any log source into [OCSF](https://github.com/ocsf/ocsf-schema)
45
+ events through a small declarative JSON DSL. One Python engine, JSON config
46
+ per source, master-data catalog, CI-linted, schema-validated, with a local
47
+ web UI and LLM-assisted onboarding.
48
+
49
+ [![CI](https://github.com/SauNu84/ocsf-parse/actions/workflows/ci.yml/badge.svg)](https://github.com/SauNu84/ocsf-parse/actions/workflows/ci.yml)
50
+
51
+ ## What it does
52
+
53
+ **38 reference mappings**, **20 OCSF event classes**, **8 of 8 OCSF
54
+ categories** — from Windows Event Log / Sysmon / auditd through CloudTrail /
55
+ Okta / Azure AD to Suricata / Wazuh / CrowdStrike. Each mapping ships with
56
+ a paired ~100-event sample, is lint-checked on every PR, and validates
57
+ against the vendored OCSF schema.
58
+
59
+ | OCSF category | Classes covered | Sources |
60
+ |---|---|---|
61
+ | System Activity | `file_activity`, `kernel_activity`, `process_activity`, `scheduled_job_activity`, `registry_key_activity` | auditd_file, dlp_events, falco_kernel, sysmon_process, cron, windows_registry |
62
+ | Findings | `security_finding`, `detection_finding`, `vulnerability_finding` | wazuh, splunk_es_alert, crowdstrike_falcon, suricata_alert, qualys_scan, ueba_alert, cef_generic, leef_generic |
63
+ | IAM | `authentication`, `entity_management` | okta, sshd, cloudtrail (ConsoleLogin), windows_event_log, azure_ad_signin, slack_audit, gitlab_audit |
64
+ | Network | `network_activity`, `http_activity`, `dns_activity`, `email_activity` | nginx, apache, cloudflare, palo_alto, vpc_flow_logs, waf_logs, zeek_dns, m365_email, google_workspace |
65
+ | Discovery | `inventory_info`, `config_state`, `device_config_state_change` | osquery_inventory, aws_config, jamf_inventory, prisma_cloud |
66
+ | Application Activity | `api_activity` | cloudtrail (non-login), github_audit, k8s_audit |
67
+ | Remediation | `remediation_activity` | soar_remediation |
68
+ | Unmanned Systems | `drone_flights_activity` | drone_telemetry |
69
+
70
+ Browse the master-data view with `ocsf-mapper catalog` or
71
+ [`catalog.json`](./catalog.json).
72
+
73
+ ## Quickstart
74
+
75
+ > **First time here?** Read [`QUICKSTART.md`](./QUICKSTART.md) — a
76
+ > 5-minute concrete walkthrough showing exactly what comes out
77
+ > when you point this at your logs. Come back here when you want
78
+ > the full install / CLI / SDK reference.
79
+
80
+ ### From PyPI
81
+
82
+ ```bash
83
+ pip install 'ocsf-mapper[web,parquet,fast]'
84
+ ocsf-mapper list
85
+ ```
86
+
87
+ ### From source (for development)
88
+
89
+ ```bash
90
+ git clone --recurse-submodules https://github.com/SauNu84/ocsf-parse
91
+ cd ocsf-parse
92
+ pip install -e '.[dev,web,parquet,fast]' # full feature set incl. orjson
93
+ ```
94
+
95
+ ### From Docker (zero-install)
96
+
97
+ ```bash
98
+ docker run --rm -p 8000:8000 ghcr.io/saunu84/ocsf-mapper:0.3.0
99
+ # → web UI on http://127.0.0.1:8000
100
+
101
+ # or use the floating "latest" tag:
102
+ docker run --rm ghcr.io/saunu84/ocsf-mapper:latest list
103
+ ```
104
+
105
+ > `[fast]` pulls in `orjson` — 5-10× faster JSON parse/dump than stdlib.
106
+ > Drop-in via `ocsf_mapper._fastjson`; falls back to stdlib if absent.
107
+
108
+ ### CLI
109
+
110
+ ```bash
111
+ # Browse what's available
112
+ ocsf-mapper list # table of mappings
113
+ ocsf-mapper catalog # master-data: vendor + priority + OCSF class
114
+
115
+ # Map a log to OCSF events (sink inferred from output extension)
116
+ ocsf-mapper apply mappings/cloudtrail.json samples/cloudtrail.jsonl out.jsonl
117
+ ocsf-mapper apply mappings/okta.json samples/okta.jsonl out.csv
118
+ ocsf-mapper apply mappings/sshd.json samples/sshd.log # → stdout
119
+
120
+ # Pipe stdin → stdout
121
+ cat samples/cloudtrail.jsonl | ocsf-mapper apply mappings/cloudtrail.json - | jq .
122
+
123
+ # Partitioned Parquet for AWS Security Lake
124
+ ocsf-mapper apply mappings/cloudtrail.json samples/cloudtrail.jsonl out/ --sink security-lake
125
+ # → out/<class_uid>/eventDay=YYYYMMDD/part-NNNNN.parquet
126
+
127
+ # tail -f a live log
128
+ ocsf-mapper tail mappings/nginx.json /var/log/nginx/access.log out.jsonl
129
+
130
+ # Validate already-OCSF events
131
+ ocsf-mapper validate out.jsonl authentication
132
+
133
+ # CI gate — re-lint every mapping against its pinned sample
134
+ ocsf-mapper lint # exits 0 iff all mappings pass
135
+
136
+ # LLM-assisted mapping draft (needs ANTHROPIC_API_KEY or OPENAI_API_KEY)
137
+ ocsf-mapper generate my_new_source samples/my_new_log.jsonl mappings/my_new.json
138
+
139
+ # Detect breaking OCSF schema changes before a submodule bump bites
140
+ ocsf-mapper schema-diff [<git-ref>] # default: HEAD~1 of ocsf-schema submodule
141
+
142
+ # Throughput diagnostic: per-phase timing (parse / route / map / write)
143
+ ocsf-mapper benchmark mappings/cloudtrail.json samples/cloudtrail.jsonl
144
+
145
+ # Scale up: fan out across N worker processes (linear speedup to CPU count)
146
+ ocsf-mapper apply mappings/cloudtrail.json input.log out.jsonl --workers 8
147
+
148
+ # Redact PII before writing (email / ipv4 / ssn / phone / jwt / Luhn-valid ccn)
149
+ ocsf-mapper apply mappings/X.json input.log out.jsonl --redact
150
+
151
+ # Local web UI (127.0.0.1 only) — includes live-tail mode on the Output tab
152
+ ocsf-mapper serve # → http://127.0.0.1:8000
153
+ ```
154
+
155
+ For a 10 TB-class workload, combine `--workers N` with `--sink security-lake`
156
+ and the streaming SecurityLakeSink (auto-rolls part files every 50 000 events
157
+ per partition). See [§"At scale"](#at-scale) below for the architecture.
158
+
159
+ If you cloned without `--recurse-submodules`:
160
+
161
+ ```bash
162
+ git submodule update --init --recursive
163
+ ```
164
+
165
+ ### Web UI
166
+
167
+ `ocsf-mapper serve` launches a FastAPI + HTMX + Monaco app on `127.0.0.1`.
168
+
169
+ - **Homepage** — card grid of every mapping with priority badge, OCSF
170
+ class+uid, lint status, and a coverage bar.
171
+ - **Per-source page** — five HTMX-swappable tabs:
172
+ - *Sample* — raw lines of the pinned sample.
173
+ - *Output* — drop any log file → side-by-side raw / mapped OCSF /
174
+ per-event validation.
175
+ - *Mapping* — Monaco JSON editor. Save runs the linter against the
176
+ pinned sample server-side and only writes the file if it passes.
177
+ - *Validation* — full validator report across the pinned sample with a
178
+ recurring-issues rollup.
179
+ - *Coverage* — per-class bars (required + recommended attrs populated)
180
+ + lists of missing fields.
181
+ - **`/new` wizard** — upload a sample, fill in vendor / priority, the
182
+ generator drafts a mapping via the configured LLM provider, you review
183
+ the JSON in Monaco, hit save, the linter gate runs before the file is
184
+ written.
185
+
186
+ ## SDK
187
+
188
+ ```python
189
+ from ocsf_mapper import apply_stream, validate, list_mappings
190
+ from ocsf_mapper.sinks import JsonlSink
191
+ from ocsf_mapper.sinks.security_lake import SecurityLakeSink, infer_schema_from
192
+ from ocsf_mapper.coverage import coverage
193
+ from ocsf_mapper.stream import stream_apply
194
+ from ocsf_mapper.parallel import apply_parallel
195
+ from ocsf_mapper.benchmark import benchmark
196
+ from ocsf_mapper.redact import RedactingSink
197
+ import json
198
+
199
+ config = json.loads(open("mappings/cloudtrail.json").read())
200
+
201
+ # Batch
202
+ with JsonlSink("out.jsonl") as sink:
203
+ sink.write_many(apply_stream(config, open("samples/cloudtrail.jsonl")))
204
+
205
+ # Partitioned Parquet for downstream Security Lake ingest. Streams to disk
206
+ # every flush_every events per (class_uid × eventDay) partition — memory
207
+ # is bounded regardless of input size.
208
+ schema = infer_schema_from(next(iter(apply_stream(config, open("samples/cloudtrail.jsonl")))))
209
+ with SecurityLakeSink("out_dir", flush_every=50_000, schema=schema) as sink:
210
+ sink.write_many(apply_stream(config, open("samples/cloudtrail.jsonl")))
211
+
212
+ # Multi-process for 10 TB-class workloads
213
+ apply_parallel(config, "huge.log", "out.jsonl", n_workers=8, sink_kind="jsonl")
214
+
215
+ # Coverage scoring (what % of required + recommended attrs are populated)
216
+ print(coverage(config))
217
+
218
+ # Per-phase timing
219
+ print(benchmark(config, "samples/cloudtrail.jsonl"))
220
+
221
+ # PII redaction wrapper around any sink
222
+ with RedactingSink(JsonlSink("scrubbed.jsonl")) as sink:
223
+ sink.write_many(apply_stream(config, open("samples/cloudtrail.jsonl")))
224
+
225
+ # Live tail
226
+ import threading
227
+ stop = threading.Event()
228
+ with JsonlSink("live.jsonl") as sink:
229
+ stream_apply(config, "/var/log/cloudtrail.log", sink, stop=stop)
230
+ ```
231
+
232
+ ## At scale
233
+
234
+ For 10 TB-class workloads on a single box:
235
+
236
+ | Knob | Effect |
237
+ |---|---|
238
+ | `pip install ocsf-mapper[fast]` | orjson swap — 5-10× faster JSON parse/dump |
239
+ | `--workers N` | Linear speedup to CPU count |
240
+ | `--sink security-lake` | Streaming partitioned Parquet, memory bounded |
241
+ | `infer_schema_from(...)` + `schema=` | Skip pyarrow type re-inference per flush |
242
+
243
+ Combined: ~30-50× over the single-threaded baseline. On an 8-core box this
244
+ puts 10 TB at ~1-2 days instead of ~40 days. See
245
+ [`BENCHMARKS.md`](./BENCHMARKS.md) for per-mapping throughput numbers
246
+ and the phase breakdown that motivates the multi-process gain.
247
+
248
+ For genuinely large workloads, the tool is intended as a **mapping
249
+ *development*** environment — develop the JSON DSL config locally, then
250
+ ship the *same config* to a real distributed runtime (Spark / Flink /
251
+ Beam / Vector) for production. See [`DESIGN_DISTRIBUTED.md`](./DESIGN_DISTRIBUTED.md)
252
+ for the architecture: how the JSON DSL travels into a Spark UDF, a Flink
253
+ streaming job, or Vector/Fluentbit agents, plus CI-gate patterns for
254
+ schema-bump pre-flight and coverage drift.
255
+
256
+ ## Repository layout
257
+
258
+ ```
259
+ mappings/ JSON DSL configs per log source
260
+ samples/ Paired sample log files (used by lint and tests)
261
+ catalog.json Master-data: vendor + priority + OCSF target per source
262
+ ocsf-schema/ Vendored ocsf/ocsf-schema (git submodule, pinned)
263
+ src/ocsf_mapper/
264
+ apply.py DSL executor + public apply()/apply_stream()
265
+ ops.py 11 op kinds (const, path, group, raw, lookup, time,
266
+ range, int, bool, expr, for_each)
267
+ validate.py Structural validator
268
+ registry.py Programmatic mapping inventory
269
+ catalog.py catalog.json reader + table printer
270
+ coverage.py Per-class completeness scoring
271
+ lint.py CI gate
272
+ schema.py OCSF schema loader (categories, classes, dictionary)
273
+ generate.py LLM-assisted two-phase mapping generator
274
+ stream.py tail -f-style streaming helpers
275
+ providers/ LLM provider abstraction (Anthropic, OpenAI, fixture)
276
+ sinks/ Output destinations (jsonl, csv, parquet,
277
+ security-lake, stdout)
278
+ web/ FastAPI + Jinja2 + HTMX app
279
+ cli.py ocsf-mapper CLI entry point
280
+ parallel.py apply_parallel: multi-process fan-out
281
+ benchmark.py Per-phase throughput diagnostic
282
+ mapping_diff.py Mapping comparison (ocsf-mapper diff)
283
+ schema_diff.py Detect schema-bump breakage vs older git ref
284
+ redact.py PII redaction (email/ipv4/ssn/phone/jwt/ccn)
285
+ _fastjson.py orjson-or-stdlib JSON shim
286
+ scripts/
287
+ generate_samples.py Deterministic sample-data generator
288
+ lint_mappings.py Thin wrapper around python -m ocsf_mapper.lint
289
+ tests/ pytest suite (~225 tests, 90% coverage)
290
+ ```
291
+
292
+ ## Adding a new log source
293
+
294
+ 1. Drop your sample (JSON / regex-parseable text) into `samples/<name>.<ext>`.
295
+ 2. Write `mappings/<name>.json` per the DSL in [`PLAN.md`](./PLAN.md) §4 —
296
+ *or* use `ocsf-mapper generate <name> samples/<name>.<ext>` (or the
297
+ web UI's `/new` wizard) to draft one with LLM assistance.
298
+ 3. Add an entry to `catalog.json` with vendor + priority + OCSF target.
299
+ 4. `ocsf-mapper lint mappings/` — must exit 0.
300
+ 5. `pytest` — must stay green.
301
+
302
+ ## Status
303
+
304
+ - [x] **Phase A — SDK**: pip-installable package, CLI (11 subcommands —
305
+ `apply`, `validate`, `list`, `catalog`, `lint`, `schema-diff`,
306
+ `benchmark`, `diff`, `generate`, `tail`, `serve`), 38 reference
307
+ mappings, master-data catalog, GitHub Actions CI on Python 3.9 /
308
+ 3.11 / 3.12.
309
+ - [x] **Phase B — Web UI**: homepage card grid (with priority badges and
310
+ coverage bars), per-source page with 5 HTMX-swappable tabs
311
+ (Sample, Output, Mapping editor with Monaco, Validation, Coverage),
312
+ new-source wizard at `/new`.
313
+ - [x] **Phase C — LLM-assisted onboarding**: Anthropic / OpenAI / fixture
314
+ provider abstraction, two-phase generator (`suggest_classes` →
315
+ `draft_mapping`), `ocsf-mapper generate` CLI, UI wizard with
316
+ server-side lint gate.
317
+ - [~] **Phase D — Polish**:
318
+ - [x] Per-mapping coverage scoring (required + recommended attrs)
319
+ - [x] Partitioned Parquet sink for AWS Security Lake
320
+ (`<root>/<class_uid>/eventDay=YYYYMMDD/*.parquet`)
321
+ - [x] `tail -f` live streaming mode (`ocsf-mapper tail`)
322
+ - [x] Schema-bump diff (`ocsf-mapper schema-diff` joins schema deltas
323
+ against mappings to surface silent breakage)
324
+ - [x] PII redaction layer (`apply ... --redact`; scrubs email / ipv4
325
+ / ssn / phone / jwt / Luhn-valid ccn)
326
+ - [x] Live-tail UI mode (Server-Sent Events from the Output
327
+ tab stream new OCSF events as the source file appends)
328
+ - [x] Mapping comparison (`ocsf-mapper diff <a.json> <b.json>`)
329
+ - [x] **Distribution (v0.3)**: PyPI publish workflow, Docker image
330
+ pushed to GHCR, Spark UDF reference at `examples/spark/`,
331
+ landing page deployed via GitHub Pages.
332
+
333
+ ### v0.4+ roadmap
334
+
335
+ Detail in [`PLAN.md`](./PLAN.md) §3a.
336
+
337
+ - **Bucket B — more mappings**: GitHub / GitLab / Slack / Kubernetes
338
+ audit logs; CEF / LEEF generic parsers; Windows-extension classes
339
+ (registry, etc); OCSF categories 7 (remediation) and 8 (unmanned
340
+ systems).
341
+ - **Bucket C — production engineering**: Prometheus `/metrics`, audit
342
+ log of mapping edits, replay tool for backfilling new fields,
343
+ provider test coverage, mapping versioning.
344
+ - **Phase E — distributed runtimes**: Flink streaming adapter, Vector
345
+ / Fluentbit transpiler, Apache Beam adapter.
346
+
347
+ See [`CHANGELOG.md`](./CHANGELOG.md) for the per-feature commit timeline
348
+ and [`PLAN.md`](./PLAN.md) for the original architecture and design
349
+ decisions.
@@ -0,0 +1,308 @@
1
+ # ocsf-parse
2
+
3
+ Self-service tool that maps any log source into [OCSF](https://github.com/ocsf/ocsf-schema)
4
+ events through a small declarative JSON DSL. One Python engine, JSON config
5
+ per source, master-data catalog, CI-linted, schema-validated, with a local
6
+ web UI and LLM-assisted onboarding.
7
+
8
+ [![CI](https://github.com/SauNu84/ocsf-parse/actions/workflows/ci.yml/badge.svg)](https://github.com/SauNu84/ocsf-parse/actions/workflows/ci.yml)
9
+
10
+ ## What it does
11
+
12
+ **38 reference mappings**, **20 OCSF event classes**, **8 of 8 OCSF
13
+ categories** — from Windows Event Log / Sysmon / auditd through CloudTrail /
14
+ Okta / Azure AD to Suricata / Wazuh / CrowdStrike. Each mapping ships with
15
+ a paired ~100-event sample, is lint-checked on every PR, and validates
16
+ against the vendored OCSF schema.
17
+
18
+ | OCSF category | Classes covered | Sources |
19
+ |---|---|---|
20
+ | System Activity | `file_activity`, `kernel_activity`, `process_activity`, `scheduled_job_activity`, `registry_key_activity` | auditd_file, dlp_events, falco_kernel, sysmon_process, cron, windows_registry |
21
+ | Findings | `security_finding`, `detection_finding`, `vulnerability_finding` | wazuh, splunk_es_alert, crowdstrike_falcon, suricata_alert, qualys_scan, ueba_alert, cef_generic, leef_generic |
22
+ | IAM | `authentication`, `entity_management` | okta, sshd, cloudtrail (ConsoleLogin), windows_event_log, azure_ad_signin, slack_audit, gitlab_audit |
23
+ | Network | `network_activity`, `http_activity`, `dns_activity`, `email_activity` | nginx, apache, cloudflare, palo_alto, vpc_flow_logs, waf_logs, zeek_dns, m365_email, google_workspace |
24
+ | Discovery | `inventory_info`, `config_state`, `device_config_state_change` | osquery_inventory, aws_config, jamf_inventory, prisma_cloud |
25
+ | Application Activity | `api_activity` | cloudtrail (non-login), github_audit, k8s_audit |
26
+ | Remediation | `remediation_activity` | soar_remediation |
27
+ | Unmanned Systems | `drone_flights_activity` | drone_telemetry |
28
+
29
+ Browse the master-data view with `ocsf-mapper catalog` or
30
+ [`catalog.json`](./catalog.json).
31
+
32
+ ## Quickstart
33
+
34
+ > **First time here?** Read [`QUICKSTART.md`](./QUICKSTART.md) — a
35
+ > 5-minute concrete walkthrough showing exactly what comes out
36
+ > when you point this at your logs. Come back here when you want
37
+ > the full install / CLI / SDK reference.
38
+
39
+ ### From PyPI
40
+
41
+ ```bash
42
+ pip install 'ocsf-mapper[web,parquet,fast]'
43
+ ocsf-mapper list
44
+ ```
45
+
46
+ ### From source (for development)
47
+
48
+ ```bash
49
+ git clone --recurse-submodules https://github.com/SauNu84/ocsf-parse
50
+ cd ocsf-parse
51
+ pip install -e '.[dev,web,parquet,fast]' # full feature set incl. orjson
52
+ ```
53
+
54
+ ### From Docker (zero-install)
55
+
56
+ ```bash
57
+ docker run --rm -p 8000:8000 ghcr.io/saunu84/ocsf-mapper:0.3.0
58
+ # → web UI on http://127.0.0.1:8000
59
+
60
+ # or use the floating "latest" tag:
61
+ docker run --rm ghcr.io/saunu84/ocsf-mapper:latest list
62
+ ```
63
+
64
+ > `[fast]` pulls in `orjson` — 5-10× faster JSON parse/dump than stdlib.
65
+ > Drop-in via `ocsf_mapper._fastjson`; falls back to stdlib if absent.
66
+
67
+ ### CLI
68
+
69
+ ```bash
70
+ # Browse what's available
71
+ ocsf-mapper list # table of mappings
72
+ ocsf-mapper catalog # master-data: vendor + priority + OCSF class
73
+
74
+ # Map a log to OCSF events (sink inferred from output extension)
75
+ ocsf-mapper apply mappings/cloudtrail.json samples/cloudtrail.jsonl out.jsonl
76
+ ocsf-mapper apply mappings/okta.json samples/okta.jsonl out.csv
77
+ ocsf-mapper apply mappings/sshd.json samples/sshd.log # → stdout
78
+
79
+ # Pipe stdin → stdout
80
+ cat samples/cloudtrail.jsonl | ocsf-mapper apply mappings/cloudtrail.json - | jq .
81
+
82
+ # Partitioned Parquet for AWS Security Lake
83
+ ocsf-mapper apply mappings/cloudtrail.json samples/cloudtrail.jsonl out/ --sink security-lake
84
+ # → out/<class_uid>/eventDay=YYYYMMDD/part-NNNNN.parquet
85
+
86
+ # tail -f a live log
87
+ ocsf-mapper tail mappings/nginx.json /var/log/nginx/access.log out.jsonl
88
+
89
+ # Validate already-OCSF events
90
+ ocsf-mapper validate out.jsonl authentication
91
+
92
+ # CI gate — re-lint every mapping against its pinned sample
93
+ ocsf-mapper lint # exits 0 iff all mappings pass
94
+
95
+ # LLM-assisted mapping draft (needs ANTHROPIC_API_KEY or OPENAI_API_KEY)
96
+ ocsf-mapper generate my_new_source samples/my_new_log.jsonl mappings/my_new.json
97
+
98
+ # Detect breaking OCSF schema changes before a submodule bump bites
99
+ ocsf-mapper schema-diff [<git-ref>] # default: HEAD~1 of ocsf-schema submodule
100
+
101
+ # Throughput diagnostic: per-phase timing (parse / route / map / write)
102
+ ocsf-mapper benchmark mappings/cloudtrail.json samples/cloudtrail.jsonl
103
+
104
+ # Scale up: fan out across N worker processes (linear speedup to CPU count)
105
+ ocsf-mapper apply mappings/cloudtrail.json input.log out.jsonl --workers 8
106
+
107
+ # Redact PII before writing (email / ipv4 / ssn / phone / jwt / Luhn-valid ccn)
108
+ ocsf-mapper apply mappings/X.json input.log out.jsonl --redact
109
+
110
+ # Local web UI (127.0.0.1 only) — includes live-tail mode on the Output tab
111
+ ocsf-mapper serve # → http://127.0.0.1:8000
112
+ ```
113
+
114
+ For a 10 TB-class workload, combine `--workers N` with `--sink security-lake`
115
+ and the streaming SecurityLakeSink (auto-rolls part files every 50 000 events
116
+ per partition). See [§"At scale"](#at-scale) below for the architecture.
117
+
118
+ If you cloned without `--recurse-submodules`:
119
+
120
+ ```bash
121
+ git submodule update --init --recursive
122
+ ```
123
+
124
+ ### Web UI
125
+
126
+ `ocsf-mapper serve` launches a FastAPI + HTMX + Monaco app on `127.0.0.1`.
127
+
128
+ - **Homepage** — card grid of every mapping with priority badge, OCSF
129
+ class+uid, lint status, and a coverage bar.
130
+ - **Per-source page** — five HTMX-swappable tabs:
131
+ - *Sample* — raw lines of the pinned sample.
132
+ - *Output* — drop any log file → side-by-side raw / mapped OCSF /
133
+ per-event validation.
134
+ - *Mapping* — Monaco JSON editor. Save runs the linter against the
135
+ pinned sample server-side and only writes the file if it passes.
136
+ - *Validation* — full validator report across the pinned sample with a
137
+ recurring-issues rollup.
138
+ - *Coverage* — per-class bars (required + recommended attrs populated)
139
+ + lists of missing fields.
140
+ - **`/new` wizard** — upload a sample, fill in vendor / priority, the
141
+ generator drafts a mapping via the configured LLM provider, you review
142
+ the JSON in Monaco, hit save, the linter gate runs before the file is
143
+ written.
144
+
145
+ ## SDK
146
+
147
+ ```python
148
+ from ocsf_mapper import apply_stream, validate, list_mappings
149
+ from ocsf_mapper.sinks import JsonlSink
150
+ from ocsf_mapper.sinks.security_lake import SecurityLakeSink, infer_schema_from
151
+ from ocsf_mapper.coverage import coverage
152
+ from ocsf_mapper.stream import stream_apply
153
+ from ocsf_mapper.parallel import apply_parallel
154
+ from ocsf_mapper.benchmark import benchmark
155
+ from ocsf_mapper.redact import RedactingSink
156
+ import json
157
+
158
+ config = json.loads(open("mappings/cloudtrail.json").read())
159
+
160
+ # Batch
161
+ with JsonlSink("out.jsonl") as sink:
162
+ sink.write_many(apply_stream(config, open("samples/cloudtrail.jsonl")))
163
+
164
+ # Partitioned Parquet for downstream Security Lake ingest. Streams to disk
165
+ # every flush_every events per (class_uid × eventDay) partition — memory
166
+ # is bounded regardless of input size.
167
+ schema = infer_schema_from(next(iter(apply_stream(config, open("samples/cloudtrail.jsonl")))))
168
+ with SecurityLakeSink("out_dir", flush_every=50_000, schema=schema) as sink:
169
+ sink.write_many(apply_stream(config, open("samples/cloudtrail.jsonl")))
170
+
171
+ # Multi-process for 10 TB-class workloads
172
+ apply_parallel(config, "huge.log", "out.jsonl", n_workers=8, sink_kind="jsonl")
173
+
174
+ # Coverage scoring (what % of required + recommended attrs are populated)
175
+ print(coverage(config))
176
+
177
+ # Per-phase timing
178
+ print(benchmark(config, "samples/cloudtrail.jsonl"))
179
+
180
+ # PII redaction wrapper around any sink
181
+ with RedactingSink(JsonlSink("scrubbed.jsonl")) as sink:
182
+ sink.write_many(apply_stream(config, open("samples/cloudtrail.jsonl")))
183
+
184
+ # Live tail
185
+ import threading
186
+ stop = threading.Event()
187
+ with JsonlSink("live.jsonl") as sink:
188
+ stream_apply(config, "/var/log/cloudtrail.log", sink, stop=stop)
189
+ ```
190
+
191
+ ## At scale
192
+
193
+ For 10 TB-class workloads on a single box:
194
+
195
+ | Knob | Effect |
196
+ |---|---|
197
+ | `pip install ocsf-mapper[fast]` | orjson swap — 5-10× faster JSON parse/dump |
198
+ | `--workers N` | Linear speedup to CPU count |
199
+ | `--sink security-lake` | Streaming partitioned Parquet, memory bounded |
200
+ | `infer_schema_from(...)` + `schema=` | Skip pyarrow type re-inference per flush |
201
+
202
+ Combined: ~30-50× over the single-threaded baseline. On an 8-core box this
203
+ puts 10 TB at ~1-2 days instead of ~40 days. See
204
+ [`BENCHMARKS.md`](./BENCHMARKS.md) for per-mapping throughput numbers
205
+ and the phase breakdown that motivates the multi-process gain.
206
+
207
+ For genuinely large workloads, the tool is intended as a **mapping
208
+ *development*** environment — develop the JSON DSL config locally, then
209
+ ship the *same config* to a real distributed runtime (Spark / Flink /
210
+ Beam / Vector) for production. See [`DESIGN_DISTRIBUTED.md`](./DESIGN_DISTRIBUTED.md)
211
+ for the architecture: how the JSON DSL travels into a Spark UDF, a Flink
212
+ streaming job, or Vector/Fluentbit agents, plus CI-gate patterns for
213
+ schema-bump pre-flight and coverage drift.
214
+
215
+ ## Repository layout
216
+
217
+ ```
218
+ mappings/ JSON DSL configs per log source
219
+ samples/ Paired sample log files (used by lint and tests)
220
+ catalog.json Master-data: vendor + priority + OCSF target per source
221
+ ocsf-schema/ Vendored ocsf/ocsf-schema (git submodule, pinned)
222
+ src/ocsf_mapper/
223
+ apply.py DSL executor + public apply()/apply_stream()
224
+ ops.py 11 op kinds (const, path, group, raw, lookup, time,
225
+ range, int, bool, expr, for_each)
226
+ validate.py Structural validator
227
+ registry.py Programmatic mapping inventory
228
+ catalog.py catalog.json reader + table printer
229
+ coverage.py Per-class completeness scoring
230
+ lint.py CI gate
231
+ schema.py OCSF schema loader (categories, classes, dictionary)
232
+ generate.py LLM-assisted two-phase mapping generator
233
+ stream.py tail -f-style streaming helpers
234
+ providers/ LLM provider abstraction (Anthropic, OpenAI, fixture)
235
+ sinks/ Output destinations (jsonl, csv, parquet,
236
+ security-lake, stdout)
237
+ web/ FastAPI + Jinja2 + HTMX app
238
+ cli.py ocsf-mapper CLI entry point
239
+ parallel.py apply_parallel: multi-process fan-out
240
+ benchmark.py Per-phase throughput diagnostic
241
+ mapping_diff.py Mapping comparison (ocsf-mapper diff)
242
+ schema_diff.py Detect schema-bump breakage vs older git ref
243
+ redact.py PII redaction (email/ipv4/ssn/phone/jwt/ccn)
244
+ _fastjson.py orjson-or-stdlib JSON shim
245
+ scripts/
246
+ generate_samples.py Deterministic sample-data generator
247
+ lint_mappings.py Thin wrapper around python -m ocsf_mapper.lint
248
+ tests/ pytest suite (~225 tests, 90% coverage)
249
+ ```
250
+
251
+ ## Adding a new log source
252
+
253
+ 1. Drop your sample (JSON / regex-parseable text) into `samples/<name>.<ext>`.
254
+ 2. Write `mappings/<name>.json` per the DSL in [`PLAN.md`](./PLAN.md) §4 —
255
+ *or* use `ocsf-mapper generate <name> samples/<name>.<ext>` (or the
256
+ web UI's `/new` wizard) to draft one with LLM assistance.
257
+ 3. Add an entry to `catalog.json` with vendor + priority + OCSF target.
258
+ 4. `ocsf-mapper lint mappings/` — must exit 0.
259
+ 5. `pytest` — must stay green.
260
+
261
+ ## Status
262
+
263
+ - [x] **Phase A — SDK**: pip-installable package, CLI (11 subcommands —
264
+ `apply`, `validate`, `list`, `catalog`, `lint`, `schema-diff`,
265
+ `benchmark`, `diff`, `generate`, `tail`, `serve`), 38 reference
266
+ mappings, master-data catalog, GitHub Actions CI on Python 3.9 /
267
+ 3.11 / 3.12.
268
+ - [x] **Phase B — Web UI**: homepage card grid (with priority badges and
269
+ coverage bars), per-source page with 5 HTMX-swappable tabs
270
+ (Sample, Output, Mapping editor with Monaco, Validation, Coverage),
271
+ new-source wizard at `/new`.
272
+ - [x] **Phase C — LLM-assisted onboarding**: Anthropic / OpenAI / fixture
273
+ provider abstraction, two-phase generator (`suggest_classes` →
274
+ `draft_mapping`), `ocsf-mapper generate` CLI, UI wizard with
275
+ server-side lint gate.
276
+ - [~] **Phase D — Polish**:
277
+ - [x] Per-mapping coverage scoring (required + recommended attrs)
278
+ - [x] Partitioned Parquet sink for AWS Security Lake
279
+ (`<root>/<class_uid>/eventDay=YYYYMMDD/*.parquet`)
280
+ - [x] `tail -f` live streaming mode (`ocsf-mapper tail`)
281
+ - [x] Schema-bump diff (`ocsf-mapper schema-diff` joins schema deltas
282
+ against mappings to surface silent breakage)
283
+ - [x] PII redaction layer (`apply ... --redact`; scrubs email / ipv4
284
+ / ssn / phone / jwt / Luhn-valid ccn)
285
+ - [x] Live-tail UI mode (Server-Sent Events from the Output
286
+ tab stream new OCSF events as the source file appends)
287
+ - [x] Mapping comparison (`ocsf-mapper diff <a.json> <b.json>`)
288
+ - [x] **Distribution (v0.3)**: PyPI publish workflow, Docker image
289
+ pushed to GHCR, Spark UDF reference at `examples/spark/`,
290
+ landing page deployed via GitHub Pages.
291
+
292
+ ### v0.4+ roadmap
293
+
294
+ Detail in [`PLAN.md`](./PLAN.md) §3a.
295
+
296
+ - **Bucket B — more mappings**: GitHub / GitLab / Slack / Kubernetes
297
+ audit logs; CEF / LEEF generic parsers; Windows-extension classes
298
+ (registry, etc); OCSF categories 7 (remediation) and 8 (unmanned
299
+ systems).
300
+ - **Bucket C — production engineering**: Prometheus `/metrics`, audit
301
+ log of mapping edits, replay tool for backfilling new fields,
302
+ provider test coverage, mapping versioning.
303
+ - **Phase E — distributed runtimes**: Flink streaming adapter, Vector
304
+ / Fluentbit transpiler, Apache Beam adapter.
305
+
306
+ See [`CHANGELOG.md`](./CHANGELOG.md) for the per-feature commit timeline
307
+ and [`PLAN.md`](./PLAN.md) for the original architecture and design
308
+ decisions.