PyPI - ocsf-mapper - Versions diffs - 0.3.1__tar.gz - Mend

ocsf-mapper 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69) hide show

ocsf_mapper-0.3.1/PKG-INFO +349 -0
ocsf_mapper-0.3.1/README.md +308 -0
ocsf_mapper-0.3.1/pyproject.toml +70 -0
ocsf_mapper-0.3.1/setup.cfg +4 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/__init__.py +33 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/_fastjson.py +52 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/apply.py +378 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/audit.py +115 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/benchmark.py +191 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/catalog.py +109 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/cli.py +408 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/coverage.py +92 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/generate.py +233 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/lint.py +123 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/mapping_diff.py +281 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/ops.py +181 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/parallel.py +181 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/providers/__init__.py +53 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/providers/anthropic.py +38 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/providers/base.py +21 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/providers/fixture.py +58 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/providers/openai.py +42 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/py.typed +0 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/redact.py +154 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/registry.py +78 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/replay.py +186 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/schema.py +149 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/schema_diff.py +261 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/sinks/__init__.py +70 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/sinks/base.py +41 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/sinks/csv.py +66 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/sinks/jsonl.py +31 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/sinks/parquet.py +64 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/sinks/security_lake.py +157 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/sinks/stdout.py +16 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/stream.py +93 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/validate.py +90 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/web/__init__.py +16 -0
ocsf_mapper-0.3.1/src/ocsf_mapper/web/app.py +650 -0
ocsf_mapper-0.3.1/src/ocsf_mapper.egg-info/PKG-INFO +349 -0
ocsf_mapper-0.3.1/src/ocsf_mapper.egg-info/SOURCES.txt +67 -0
ocsf_mapper-0.3.1/src/ocsf_mapper.egg-info/dependency_links.txt +1 -0
ocsf_mapper-0.3.1/src/ocsf_mapper.egg-info/entry_points.txt +2 -0
ocsf_mapper-0.3.1/src/ocsf_mapper.egg-info/requires.txt +26 -0
ocsf_mapper-0.3.1/src/ocsf_mapper.egg-info/top_level.txt +1 -0
ocsf_mapper-0.3.1/tests/test_apply.py +187 -0
ocsf_mapper-0.3.1/tests/test_audit.py +162 -0
ocsf_mapper-0.3.1/tests/test_benchmark.py +116 -0
ocsf_mapper-0.3.1/tests/test_catalog.py +63 -0
ocsf_mapper-0.3.1/tests/test_cef_leef.py +150 -0
ocsf_mapper-0.3.1/tests/test_cli.py +124 -0
ocsf_mapper-0.3.1/tests/test_coverage.py +68 -0
ocsf_mapper-0.3.1/tests/test_fastjson.py +94 -0
ocsf_mapper-0.3.1/tests/test_generate.py +92 -0
ocsf_mapper-0.3.1/tests/test_lint.py +115 -0
ocsf_mapper-0.3.1/tests/test_mapping_diff.py +216 -0
ocsf_mapper-0.3.1/tests/test_ops.py +282 -0
ocsf_mapper-0.3.1/tests/test_parallel.py +192 -0
ocsf_mapper-0.3.1/tests/test_provider_mocks.py +291 -0
ocsf_mapper-0.3.1/tests/test_providers.py +86 -0
ocsf_mapper-0.3.1/tests/test_redact.py +138 -0
ocsf_mapper-0.3.1/tests/test_registry.py +105 -0
ocsf_mapper-0.3.1/tests/test_replay.py +189 -0
ocsf_mapper-0.3.1/tests/test_schema.py +57 -0
ocsf_mapper-0.3.1/tests/test_schema_diff.py +243 -0
ocsf_mapper-0.3.1/tests/test_sinks.py +280 -0
ocsf_mapper-0.3.1/tests/test_stream.py +110 -0
ocsf_mapper-0.3.1/tests/test_validate.py +73 -0
ocsf_mapper-0.3.1/tests/test_web.py +361 -0

ocsf_mapper-0.3.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,349 @@
+Metadata-Version: 2.4
+Name: ocsf-mapper
+Version: 0.3.1
+Summary: Map arbitrary log sources to OCSF events via a declarative JSON DSL.
+Author: ocsf-parse contributors
+License: MIT
+Project-URL: Homepage, https://github.com/ocsf/ocsf-parse
+Project-URL: Issues, https://github.com/ocsf/ocsf-parse/issues
+Keywords: ocsf,security,logs,parser,siem
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Information Technology
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Security
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+Provides-Extra: anthropic
+Requires-Dist: anthropic>=0.34; extra == "anthropic"
+Provides-Extra: openai
+Requires-Dist: openai>=1.40; extra == "openai"
+Provides-Extra: parquet
+Requires-Dist: pyarrow>=14; extra == "parquet"
+Provides-Extra: fast
+Requires-Dist: orjson>=3.10; extra == "fast"
+Provides-Extra: web
+Requires-Dist: fastapi>=0.110; extra == "web"
+Requires-Dist: uvicorn[standard]>=0.27; extra == "web"
+Requires-Dist: jinja2>=3.1; extra == "web"
+Requires-Dist: python-multipart>=0.0.9; extra == "web"
+Provides-Extra: dev
+Requires-Dist: pytest>=8; extra == "dev"
+Requires-Dist: pytest-cov>=5; extra == "dev"
+Requires-Dist: httpx>=0.27; extra == "dev"
+Requires-Dist: fastapi>=0.110; extra == "dev"
+Requires-Dist: jinja2>=3.1; extra == "dev"
+Requires-Dist: python-multipart>=0.0.9; extra == "dev"
+# ocsf-parse
+Self-service tool that maps any log source into [OCSF](https://github.com/ocsf/ocsf-schema)
+events through a small declarative JSON DSL. One Python engine, JSON config
+per source, master-data catalog, CI-linted, schema-validated, with a local
+web UI and LLM-assisted onboarding.
+[![CI](https://github.com/SauNu84/ocsf-parse/actions/workflows/ci.yml/badge.svg)](https://github.com/SauNu84/ocsf-parse/actions/workflows/ci.yml)
+## What it does
+**38 reference mappings**, **20 OCSF event classes**, **8 of 8 OCSF
+categories** — from Windows Event Log / Sysmon / auditd through CloudTrail /
+Okta / Azure AD to Suricata / Wazuh / CrowdStrike. Each mapping ships with
+a paired ~100-event sample, is lint-checked on every PR, and validates
+against the vendored OCSF schema.
+| OCSF category | Classes covered | Sources |
+|---|---|---|
+| System Activity | `file_activity`, `kernel_activity`, `process_activity`, `scheduled_job_activity`, `registry_key_activity` | auditd_file, dlp_events, falco_kernel, sysmon_process, cron, windows_registry |
+| Findings | `security_finding`, `detection_finding`, `vulnerability_finding` | wazuh, splunk_es_alert, crowdstrike_falcon, suricata_alert, qualys_scan, ueba_alert, cef_generic, leef_generic |
+| IAM | `authentication`, `entity_management` | okta, sshd, cloudtrail (ConsoleLogin), windows_event_log, azure_ad_signin, slack_audit, gitlab_audit |
+| Network | `network_activity`, `http_activity`, `dns_activity`, `email_activity` | nginx, apache, cloudflare, palo_alto, vpc_flow_logs, waf_logs, zeek_dns, m365_email, google_workspace |
+| Discovery | `inventory_info`, `config_state`, `device_config_state_change` | osquery_inventory, aws_config, jamf_inventory, prisma_cloud |
+| Application Activity | `api_activity` | cloudtrail (non-login), github_audit, k8s_audit |
+| Remediation | `remediation_activity` | soar_remediation |
+| Unmanned Systems | `drone_flights_activity` | drone_telemetry |
+Browse the master-data view with `ocsf-mapper catalog` or
+[`catalog.json`](./catalog.json).
+## Quickstart
+> **First time here?** Read [`QUICKSTART.md`](./QUICKSTART.md) — a
+> 5-minute concrete walkthrough showing exactly what comes out
+> when you point this at your logs. Come back here when you want
+> the full install / CLI / SDK reference.
+### From PyPI
+```bash
+pip install 'ocsf-mapper[web,parquet,fast]'
+ocsf-mapper list
+```
+### From source (for development)
+```bash
+git clone --recurse-submodules https://github.com/SauNu84/ocsf-parse
+cd ocsf-parse
+pip install -e '.[dev,web,parquet,fast]'    # full feature set incl. orjson
+```
+### From Docker (zero-install)
+```bash
+docker run --rm -p 8000:8000 ghcr.io/saunu84/ocsf-mapper:0.3.0
+# → web UI on http://127.0.0.1:8000
+# or pin to "latest" tag:
+docker run --rm ghcr.io/saunu84/ocsf-mapper:latest list
+```
+> `[fast]` pulls in `orjson` — 5-10× faster JSON parse/dump than stdlib.
+> Drop-in via `ocsf_mapper._fastjson`; falls back to stdlib if absent.
+### CLI
+```bash
+# Browse what's available
+ocsf-mapper list                       # table of mappings
+ocsf-mapper catalog                    # master-data: vendor + priority + OCSF class
+# Map a log to OCSF events (sink inferred from output extension)
+ocsf-mapper apply mappings/cloudtrail.json samples/cloudtrail.jsonl out.jsonl
+ocsf-mapper apply mappings/okta.json       samples/okta.jsonl       out.csv
+ocsf-mapper apply mappings/sshd.json       samples/sshd.log         # → stdout
+# Pipe stdin → stdout
+cat samples/cloudtrail.jsonl | ocsf-mapper apply mappings/cloudtrail.json - | jq .
+# Partitioned Parquet for AWS Security Lake
+ocsf-mapper apply mappings/cloudtrail.json samples/cloudtrail.jsonl out/ --sink security-lake
+# → out/<class_uid>/eventDay=YYYYMMDD/part-NNNNN.parquet
+# tail -f a live log
+ocsf-mapper tail mappings/nginx.json /var/log/nginx/access.log out.jsonl
+# Validate already-OCSF events
+ocsf-mapper validate out.jsonl authentication
+# CI gate — re-lint every mapping against its pinned sample
+ocsf-mapper lint                       # exits 0 iff all mappings pass
+# LLM-assisted mapping draft (needs ANTHROPIC_API_KEY or OPENAI_API_KEY)
+ocsf-mapper generate my_new_source samples/my_new_log.jsonl mappings/my_new.json
+# Detect breaking OCSF schema changes before a submodule bump bites
+ocsf-mapper schema-diff [<git-ref>]    # default: HEAD~1 of ocsf-schema submodule
+# Throughput diagnostic: per-phase timing (parse / route / map / write)
+ocsf-mapper benchmark mappings/cloudtrail.json samples/cloudtrail.jsonl
+# Scale up: fan out across N worker processes (linear speedup to CPU count)
+ocsf-mapper apply mappings/cloudtrail.json input.log out.jsonl --workers 8
+# Redact PII before writing (email / ipv4 / ssn / phone / jwt / Luhn-valid ccn)
+ocsf-mapper apply mappings/X.json input.log out.jsonl --redact
+# Local web UI (127.0.0.1 only) — includes live-tail mode on the Output tab
+ocsf-mapper serve                      # → http://127.0.0.1:8000
+```
+For a 10 TB-class workload, combine `--workers N` with `--sink security-lake`
+and the streaming SecurityLakeSink (auto-rolls part files every 50 000 events
+per partition). See [§"At scale"](#at-scale) below for the architecture.
+If you cloned without `--recurse-submodules`:
+```bash
+git submodule update --init --recursive
+```
+### Web UI
+`ocsf-mapper serve` launches a FastAPI + HTMX + Monaco app on `127.0.0.1`.
+- **Homepage** — card grid of every mapping with priority badge, OCSF
+  class+uid, lint status, and a coverage bar.
+- **Per-source page** — five HTMX-swappable tabs:
+  - *Sample* — raw lines of the pinned sample.
+  - *Output* — drop any log file → side-by-side raw / mapped OCSF /
+    per-event validation.
+  - *Mapping* — Monaco JSON editor. Save runs the linter against the
+    pinned sample server-side and only writes the file if it passes.
+  - *Validation* — full validator report across the pinned sample with a
+    recurring-issues rollup.
+  - *Coverage* — per-class bars (required + recommended attrs populated)
+    + lists of missing fields.
+- **`/new` wizard** — upload a sample, fill in vendor / priority, the
+  generator drafts a mapping via the configured LLM provider, you review
+  the JSON in Monaco, hit save, the linter gate runs before the file is
+  written.
+## SDK
+```python
+from ocsf_mapper import apply_stream, validate, list_mappings
+from ocsf_mapper.sinks import JsonlSink
+from ocsf_mapper.sinks.security_lake import SecurityLakeSink, infer_schema_from
+from ocsf_mapper.coverage import coverage
+from ocsf_mapper.stream import stream_apply
+from ocsf_mapper.parallel import apply_parallel
+from ocsf_mapper.benchmark import benchmark
+from ocsf_mapper.redact import RedactingSink
+import json
+config = json.loads(open("mappings/cloudtrail.json").read())
+# Batch
+with JsonlSink("out.jsonl") as sink:
+    sink.write_many(apply_stream(config, open("samples/cloudtrail.jsonl")))
+# Partitioned Parquet for downstream Security Lake ingest. Streams to disk
+# every flush_every events per (class_uid × eventDay) partition — memory
+# is bounded regardless of input size.
+schema = infer_schema_from(next(iter(apply_stream(config, open("samples/cloudtrail.jsonl")))))
+with SecurityLakeSink("out_dir", flush_every=50_000, schema=schema) as sink:
+    sink.write_many(apply_stream(config, open("samples/cloudtrail.jsonl")))
+# Multi-process for 10 TB-class workloads
+apply_parallel(config, "huge.log", "out.jsonl", n_workers=8, sink_kind="jsonl")
+# Coverage scoring (what % of required + recommended attrs are populated)
+print(coverage(config))
+# Per-phase timing
+print(benchmark(config, "samples/cloudtrail.jsonl"))
+# PII redaction wrapper around any sink
+with RedactingSink(JsonlSink("scrubbed.jsonl")) as sink:
+    sink.write_many(apply_stream(config, open("samples/cloudtrail.jsonl")))
+# Live tail
+import threading
+stop = threading.Event()
+with JsonlSink("live.jsonl") as sink:
+    stream_apply(config, "/var/log/cloudtrail.log", sink, stop=stop)
+```
+## At scale
+For 10 TB-class workloads on a single box:
+| Knob | Effect |
+|---|---|
+| `pip install ocsf-mapper[fast]` | orjson swap — 5-10× faster JSON parse/dump |
+| `--workers N` | Linear speedup to CPU count |
+| `--sink security-lake` | Streaming partitioned Parquet, memory bounded |
+| `infer_schema_from(...)` + `schema=` | Skip pyarrow type re-inference per flush |
+Combined: ~30-50× over the single-threaded baseline. On an 8-core box this
+puts 10 TB at ~1-2 days instead of ~40 days. See
+[`BENCHMARKS.md`](./BENCHMARKS.md) for per-mapping throughput numbers
+and the phase breakdown that motivates the multi-process gain.
+For genuinely large workloads, the tool is intended as a **mapping
+*development*** environment — develop the JSON DSL config locally, then
+ship the *same config* to a real distributed runtime (Spark / Flink /
+Beam / Vector) for production. See [`DESIGN_DISTRIBUTED.md`](./DESIGN_DISTRIBUTED.md)
+for the architecture: how the JSON DSL travels into a Spark UDF, a Flink
+streaming job, or Vector/Fluentbit agents, plus CI-gate patterns for
+schema-bump pre-flight and coverage drift.
+## Repository layout
+```
+mappings/                JSON DSL configs per log source
+samples/                 Paired sample log files (used by lint and tests)
+catalog.json             Master-data: vendor + priority + OCSF target per source
+ocsf-schema/             Vendored ocsf/ocsf-schema (git submodule, pinned)
+src/ocsf_mapper/
+  apply.py               DSL executor + public apply()/apply_stream()
+  ops.py                 11 op kinds (const, path, group, raw, lookup, time,
+                          range, int, bool, expr, for_each)
+  validate.py            Structural validator
+  registry.py            Programmatic mapping inventory
+  catalog.py             catalog.json reader + table printer
+  coverage.py            Per-class completeness scoring
+  lint.py                CI gate
+  schema.py              OCSF schema loader (categories, classes, dictionary)
+  generate.py            LLM-assisted two-phase mapping generator
+  stream.py              tail -f-style streaming helpers
+  providers/             LLM provider abstraction (Anthropic, OpenAI, fixture)
+  sinks/                 Output destinations (jsonl, csv, parquet,
+                          security-lake, stdout)
+  web/                   FastAPI + Jinja2 + HTMX app
+  cli.py                 ocsf-mapper CLI entry point
+  parallel.py            apply_parallel: multi-process fan-out
+  benchmark.py           Per-phase throughput diagnostic
+  schema_diff.py         Detect schema-bump breakage vs older git ref
+  redact.py              PII redaction (email/ipv4/ssn/phone/jwt/ccn)
+  _fastjson.py           orjson-or-stdlib JSON shim
+scripts/
+  generate_samples.py    Deterministic sample-data generator
+  lint_mappings.py       Thin wrapper around python -m ocsf_mapper.lint
+tests/                   pytest suite (~225 tests, 90% coverage)
+```
+## Adding a new log source
+1. Drop your sample (JSON / regex-parseable text) into `samples/<name>.<ext>`.
+2. Write `mappings/<name>.json` per the DSL in [`PLAN.md`](./PLAN.md) §4 —
+   *or* use `ocsf-mapper generate <name> samples/<name>.<ext>` (or the
+   web UI's `/new` wizard) to draft one with LLM assistance.
+3. Add an entry to `catalog.json` with vendor + priority + OCSF target.
+4. `ocsf-mapper lint mappings/` — must exit 0.
+5. `pytest` — must stay green.
+## Status
+- [x] **Phase A — SDK**: pip-installable package, CLI (11 subcommands —
+      `apply`, `validate`, `list`, `catalog`, `lint`, `schema-diff`,
+      `benchmark`, `diff`, `generate`, `tail`, `serve`), 29 reference
+      mappings, master-data catalog, GitHub Actions CI on Python 3.9 /
+      3.11 / 3.12.
+- [x] **Phase B — Web UI**: homepage card grid (with priority badges and
+      coverage bars), per-source page with 5 HTMX-swappable tabs
+      (Sample, Output, Mapping editor with Monaco, Validation, Coverage),
+      new-source wizard at `/new`.
+- [x] **Phase C — LLM-assisted onboarding**: Anthropic / OpenAI / fixture
+      provider abstraction, two-phase generator (`suggest_classes` →
+      `draft_mapping`), `ocsf-mapper generate` CLI, UI wizard with
+      server-side lint gate.
+- [~] **Phase D — Polish**:
+  - [x] Per-mapping coverage scoring (required + recommended attrs)
+  - [x] Partitioned Parquet sink for AWS Security Lake
+        (`<root>/<class_uid>/eventDay=YYYYMMDD/*.parquet`)
+  - [x] `tail -f` live streaming mode (`ocsf-mapper tail`)
+  - [x] Schema-bump diff (`ocsf-mapper schema-diff` joins schema deltas
+        against mappings to surface silent breakage)
+  - [x] PII redaction layer (`apply ... --redact`; scrubs email / ipv4
+        / ssn / phone / jwt / Luhn-valid ccn)
+  - [x] Live-tail UI mode (Server-Sent Events stream new OCSF events
+        to the Output tab as the source file appends)
+  - [x] Mapping comparison (`ocsf-mapper diff <a.json> <b.json>`)
+- [x] **Distribution (v0.3)**: PyPI publish workflow, Docker image
+      pushed to GHCR, Spark UDF reference at `examples/spark/`,
+      landing page deployed via GitHub Pages.
+### v0.4+ roadmap
+Detail in [`PLAN.md`](./PLAN.md) §3a.
+- **Bucket B — more mappings**: GitHub / GitLab / Slack / Kubernetes
+  audit logs; CEF / LEEF generic parsers; Windows-extension classes
+  (registry, etc); OCSF categories 7 (remediation) and 8 (unmanned
+  systems).
+- **Bucket C — production engineering**: Prometheus `/metrics`, audit
+  log of mapping edits, replay tool for backfilling new fields,
+  provider test coverage, mapping versioning.
+- **Phase E — distributed runtimes**: Flink streaming adapter, Vector
+  / Fluentbit transpiler, Apache Beam adapter.
+See [`CHANGELOG.md`](./CHANGELOG.md) for the per-feature commit timeline
+and [`PLAN.md`](./PLAN.md) for the original architecture and design
+decisions.

ocsf_mapper-0.3.1/README.md ADDED Viewed

@@ -0,0 +1,308 @@
+# ocsf-parse
+Self-service tool that maps any log source into [OCSF](https://github.com/ocsf/ocsf-schema)
+events through a small declarative JSON DSL. One Python engine, JSON config
+per source, master-data catalog, CI-linted, schema-validated, with a local
+web UI and LLM-assisted onboarding.
+[![CI](https://github.com/SauNu84/ocsf-parse/actions/workflows/ci.yml/badge.svg)](https://github.com/SauNu84/ocsf-parse/actions/workflows/ci.yml)
+## What it does
+**38 reference mappings**, **20 OCSF event classes**, **8 of 8 OCSF
+categories** — from Windows Event Log / Sysmon / auditd through CloudTrail /
+Okta / Azure AD to Suricata / Wazuh / CrowdStrike. Each mapping ships with
+a paired ~100-event sample, is lint-checked on every PR, and validates
+against the vendored OCSF schema.
+| OCSF category | Classes covered | Sources |
+|---|---|---|
+| System Activity | `file_activity`, `kernel_activity`, `process_activity`, `scheduled_job_activity`, `registry_key_activity` | auditd_file, dlp_events, falco_kernel, sysmon_process, cron, windows_registry |
+| Findings | `security_finding`, `detection_finding`, `vulnerability_finding` | wazuh, splunk_es_alert, crowdstrike_falcon, suricata_alert, qualys_scan, ueba_alert, cef_generic, leef_generic |
+| IAM | `authentication`, `entity_management` | okta, sshd, cloudtrail (ConsoleLogin), windows_event_log, azure_ad_signin, slack_audit, gitlab_audit |
+| Network | `network_activity`, `http_activity`, `dns_activity`, `email_activity` | nginx, apache, cloudflare, palo_alto, vpc_flow_logs, waf_logs, zeek_dns, m365_email, google_workspace |
+| Discovery | `inventory_info`, `config_state`, `device_config_state_change` | osquery_inventory, aws_config, jamf_inventory, prisma_cloud |
+| Application Activity | `api_activity` | cloudtrail (non-login), github_audit, k8s_audit |
+| Remediation | `remediation_activity` | soar_remediation |
+| Unmanned Systems | `drone_flights_activity` | drone_telemetry |
+Browse the master-data view with `ocsf-mapper catalog` or
+[`catalog.json`](./catalog.json).
+## Quickstart
+> **First time here?** Read [`QUICKSTART.md`](./QUICKSTART.md) — a
+> 5-minute concrete walkthrough showing exactly what comes out
+> when you point this at your logs. Come back here when you want
+> the full install / CLI / SDK reference.
+### From PyPI
+```bash
+pip install 'ocsf-mapper[web,parquet,fast]'
+ocsf-mapper list
+```
+### From source (for development)
+```bash
+git clone --recurse-submodules https://github.com/SauNu84/ocsf-parse
+cd ocsf-parse
+pip install -e '.[dev,web,parquet,fast]'    # full feature set incl. orjson
+```
+### From Docker (zero-install)
+```bash
+docker run --rm -p 8000:8000 ghcr.io/saunu84/ocsf-mapper:0.3.0
+# → web UI on http://127.0.0.1:8000
+# or pin to "latest" tag:
+docker run --rm ghcr.io/saunu84/ocsf-mapper:latest list
+```
+> `[fast]` pulls in `orjson` — 5-10× faster JSON parse/dump than stdlib.
+> Drop-in via `ocsf_mapper._fastjson`; falls back to stdlib if absent.
+### CLI
+```bash
+# Browse what's available
+ocsf-mapper list                       # table of mappings
+ocsf-mapper catalog                    # master-data: vendor + priority + OCSF class
+# Map a log to OCSF events (sink inferred from output extension)
+ocsf-mapper apply mappings/cloudtrail.json samples/cloudtrail.jsonl out.jsonl
+ocsf-mapper apply mappings/okta.json       samples/okta.jsonl       out.csv
+ocsf-mapper apply mappings/sshd.json       samples/sshd.log         # → stdout
+# Pipe stdin → stdout
+cat samples/cloudtrail.jsonl | ocsf-mapper apply mappings/cloudtrail.json - | jq .
+# Partitioned Parquet for AWS Security Lake
+ocsf-mapper apply mappings/cloudtrail.json samples/cloudtrail.jsonl out/ --sink security-lake
+# → out/<class_uid>/eventDay=YYYYMMDD/part-NNNNN.parquet
+# tail -f a live log
+ocsf-mapper tail mappings/nginx.json /var/log/nginx/access.log out.jsonl
+# Validate already-OCSF events
+ocsf-mapper validate out.jsonl authentication
+# CI gate — re-lint every mapping against its pinned sample
+ocsf-mapper lint                       # exits 0 iff all mappings pass
+# LLM-assisted mapping draft (needs ANTHROPIC_API_KEY or OPENAI_API_KEY)
+ocsf-mapper generate my_new_source samples/my_new_log.jsonl mappings/my_new.json
+# Detect breaking OCSF schema changes before a submodule bump bites
+ocsf-mapper schema-diff [<git-ref>]    # default: HEAD~1 of ocsf-schema submodule
+# Throughput diagnostic: per-phase timing (parse / route / map / write)
+ocsf-mapper benchmark mappings/cloudtrail.json samples/cloudtrail.jsonl
+# Scale up: fan out across N worker processes (linear speedup to CPU count)
+ocsf-mapper apply mappings/cloudtrail.json input.log out.jsonl --workers 8
+# Redact PII before writing (email / ipv4 / ssn / phone / jwt / Luhn-valid ccn)
+ocsf-mapper apply mappings/X.json input.log out.jsonl --redact
+# Local web UI (127.0.0.1 only) — includes live-tail mode on the Output tab
+ocsf-mapper serve                      # → http://127.0.0.1:8000
+```
+For a 10 TB-class workload, combine `--workers N` with `--sink security-lake`
+and the streaming SecurityLakeSink (auto-rolls part files every 50 000 events
+per partition). See [§"At scale"](#at-scale) below for the architecture.
+If you cloned without `--recurse-submodules`:
+```bash
+git submodule update --init --recursive
+```
+### Web UI
+`ocsf-mapper serve` launches a FastAPI + HTMX + Monaco app on `127.0.0.1`.
+- **Homepage** — card grid of every mapping with priority badge, OCSF
+  class+uid, lint status, and a coverage bar.
+- **Per-source page** — five HTMX-swappable tabs:
+  - *Sample* — raw lines of the pinned sample.
+  - *Output* — drop any log file → side-by-side raw / mapped OCSF /
+    per-event validation.
+  - *Mapping* — Monaco JSON editor. Save runs the linter against the
+    pinned sample server-side and only writes the file if it passes.
+  - *Validation* — full validator report across the pinned sample with a
+    recurring-issues rollup.
+  - *Coverage* — per-class bars (required + recommended attrs populated)
+    + lists of missing fields.
+- **`/new` wizard** — upload a sample, fill in vendor / priority, the
+  generator drafts a mapping via the configured LLM provider, you review
+  the JSON in Monaco, hit save, the linter gate runs before the file is
+  written.
+## SDK
+```python
+from ocsf_mapper import apply_stream, validate, list_mappings
+from ocsf_mapper.sinks import JsonlSink
+from ocsf_mapper.sinks.security_lake import SecurityLakeSink, infer_schema_from
+from ocsf_mapper.coverage import coverage
+from ocsf_mapper.stream import stream_apply
+from ocsf_mapper.parallel import apply_parallel
+from ocsf_mapper.benchmark import benchmark
+from ocsf_mapper.redact import RedactingSink
+import json
+config = json.loads(open("mappings/cloudtrail.json").read())
+# Batch
+with JsonlSink("out.jsonl") as sink:
+    sink.write_many(apply_stream(config, open("samples/cloudtrail.jsonl")))
+# Partitioned Parquet for downstream Security Lake ingest. Streams to disk
+# every flush_every events per (class_uid × eventDay) partition — memory
+# is bounded regardless of input size.
+schema = infer_schema_from(next(iter(apply_stream(config, open("samples/cloudtrail.jsonl")))))
+with SecurityLakeSink("out_dir", flush_every=50_000, schema=schema) as sink:
+    sink.write_many(apply_stream(config, open("samples/cloudtrail.jsonl")))
+# Multi-process for 10 TB-class workloads
+apply_parallel(config, "huge.log", "out.jsonl", n_workers=8, sink_kind="jsonl")
+# Coverage scoring (what % of required + recommended attrs are populated)
+print(coverage(config))
+# Per-phase timing
+print(benchmark(config, "samples/cloudtrail.jsonl"))
+# PII redaction wrapper around any sink
+with RedactingSink(JsonlSink("scrubbed.jsonl")) as sink:
+    sink.write_many(apply_stream(config, open("samples/cloudtrail.jsonl")))
+# Live tail
+import threading
+stop = threading.Event()
+with JsonlSink("live.jsonl") as sink:
+    stream_apply(config, "/var/log/cloudtrail.log", sink, stop=stop)
+```
+## At scale
+For 10 TB-class workloads on a single box:
+| Knob | Effect |
+|---|---|
+| `pip install ocsf-mapper[fast]` | orjson swap — 5-10× faster JSON parse/dump |
+| `--workers N` | Linear speedup to CPU count |
+| `--sink security-lake` | Streaming partitioned Parquet, memory bounded |
+| `infer_schema_from(...)` + `schema=` | Skip pyarrow type re-inference per flush |
+Combined: ~30-50× over the single-threaded baseline. On an 8-core box this
+puts 10 TB at ~1-2 days instead of ~40 days. See
+[`BENCHMARKS.md`](./BENCHMARKS.md) for per-mapping throughput numbers
+and the phase breakdown that motivates the multi-process gain.
+For genuinely large workloads, the tool is intended as a **mapping
+*development*** environment — develop the JSON DSL config locally, then
+ship the *same config* to a real distributed runtime (Spark / Flink /
+Beam / Vector) for production. See [`DESIGN_DISTRIBUTED.md`](./DESIGN_DISTRIBUTED.md)
+for the architecture: how the JSON DSL travels into a Spark UDF, a Flink
+streaming job, or Vector/Fluentbit agents, plus CI-gate patterns for
+schema-bump pre-flight and coverage drift.
+## Repository layout
+```
+mappings/                JSON DSL configs per log source
+samples/                 Paired sample log files (used by lint and tests)
+catalog.json             Master-data: vendor + priority + OCSF target per source
+ocsf-schema/             Vendored ocsf/ocsf-schema (git submodule, pinned)
+src/ocsf_mapper/
+  apply.py               DSL executor + public apply()/apply_stream()
+  ops.py                 11 op kinds (const, path, group, raw, lookup, time,
+                          range, int, bool, expr, for_each)
+  validate.py            Structural validator
+  registry.py            Programmatic mapping inventory
+  catalog.py             catalog.json reader + table printer
+  coverage.py            Per-class completeness scoring
+  lint.py                CI gate
+  schema.py              OCSF schema loader (categories, classes, dictionary)
+  generate.py            LLM-assisted two-phase mapping generator
+  stream.py              tail -f-style streaming helpers
+  providers/             LLM provider abstraction (Anthropic, OpenAI, fixture)
+  sinks/                 Output destinations (jsonl, csv, parquet,
+                          security-lake, stdout)
+  web/                   FastAPI + Jinja2 + HTMX app
+  cli.py                 ocsf-mapper CLI entry point
+  parallel.py            apply_parallel: multi-process fan-out
+  benchmark.py           Per-phase throughput diagnostic
+  schema_diff.py         Detect schema-bump breakage vs older git ref
+  redact.py              PII redaction (email/ipv4/ssn/phone/jwt/ccn)
+  _fastjson.py           orjson-or-stdlib JSON shim
+scripts/
+  generate_samples.py    Deterministic sample-data generator
+  lint_mappings.py       Thin wrapper around python -m ocsf_mapper.lint
+tests/                   pytest suite (~225 tests, 90% coverage)
+```
+## Adding a new log source
+1. Drop your sample (JSON / regex-parseable text) into `samples/<name>.<ext>`.
+2. Write `mappings/<name>.json` per the DSL in [`PLAN.md`](./PLAN.md) §4 —
+   *or* use `ocsf-mapper generate <name> samples/<name>.<ext>` (or the
+   web UI's `/new` wizard) to draft one with LLM assistance.
+3. Add an entry to `catalog.json` with vendor + priority + OCSF target.
+4. `ocsf-mapper lint mappings/` — must exit 0.
+5. `pytest` — must stay green.
+## Status
+- [x] **Phase A — SDK**: pip-installable package, CLI (11 subcommands —
+      `apply`, `validate`, `list`, `catalog`, `lint`, `schema-diff`,
+      `benchmark`, `diff`, `generate`, `tail`, `serve`), 29 reference
+      mappings, master-data catalog, GitHub Actions CI on Python 3.9 /
+      3.11 / 3.12.
+- [x] **Phase B — Web UI**: homepage card grid (with priority badges and
+      coverage bars), per-source page with 5 HTMX-swappable tabs
+      (Sample, Output, Mapping editor with Monaco, Validation, Coverage),
+      new-source wizard at `/new`.
+- [x] **Phase C — LLM-assisted onboarding**: Anthropic / OpenAI / fixture
+      provider abstraction, two-phase generator (`suggest_classes` →
+      `draft_mapping`), `ocsf-mapper generate` CLI, UI wizard with
+      server-side lint gate.
+- [~] **Phase D — Polish**:
+  - [x] Per-mapping coverage scoring (required + recommended attrs)
+  - [x] Partitioned Parquet sink for AWS Security Lake
+        (`<root>/<class_uid>/eventDay=YYYYMMDD/*.parquet`)
+  - [x] `tail -f` live streaming mode (`ocsf-mapper tail`)
+  - [x] Schema-bump diff (`ocsf-mapper schema-diff` joins schema deltas
+        against mappings to surface silent breakage)
+  - [x] PII redaction layer (`apply ... --redact`; scrubs email / ipv4
+        / ssn / phone / jwt / Luhn-valid ccn)
+  - [x] Live-tail UI mode (Server-Sent Events stream new OCSF events
+        to the Output tab as the source file appends)
+  - [x] Mapping comparison (`ocsf-mapper diff <a.json> <b.json>`)
+- [x] **Distribution (v0.3)**: PyPI publish workflow, Docker image
+      pushed to GHCR, Spark UDF reference at `examples/spark/`,
+      landing page deployed via GitHub Pages.
+### v0.4+ roadmap
+Detail in [`PLAN.md`](./PLAN.md) §3a.
+- **Bucket B — more mappings**: GitHub / GitLab / Slack / Kubernetes
+  audit logs; CEF / LEEF generic parsers; Windows-extension classes
+  (registry, etc); OCSF categories 7 (remediation) and 8 (unmanned
+  systems).
+- **Bucket C — production engineering**: Prometheus `/metrics`, audit
+  log of mapping edits, replay tool for backfilling new fields,
+  provider test coverage, mapping versioning.
+- **Phase E — distributed runtimes**: Flink streaming adapter, Vector
+  / Fluentbit transpiler, Apache Beam adapter.
+See [`CHANGELOG.md`](./CHANGELOG.md) for the per-feature commit timeline
+and [`PLAN.md`](./PLAN.md) for the original architecture and design
+decisions.