PyPI - ocsf-mapper - Versions diffs - 0.3.1__py3-none-any.whl - Mend

ocsf-mapper 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

ocsf_mapper/__init__.py +33 -0
ocsf_mapper/_fastjson.py +52 -0
ocsf_mapper/apply.py +378 -0
ocsf_mapper/audit.py +115 -0
ocsf_mapper/benchmark.py +191 -0
ocsf_mapper/catalog.py +109 -0
ocsf_mapper/cli.py +408 -0
ocsf_mapper/coverage.py +92 -0
ocsf_mapper/generate.py +233 -0
ocsf_mapper/lint.py +123 -0
ocsf_mapper/mapping_diff.py +281 -0
ocsf_mapper/ops.py +181 -0
ocsf_mapper/parallel.py +181 -0
ocsf_mapper/providers/__init__.py +53 -0
ocsf_mapper/providers/anthropic.py +38 -0
ocsf_mapper/providers/base.py +21 -0
ocsf_mapper/providers/fixture.py +58 -0
ocsf_mapper/providers/openai.py +42 -0
ocsf_mapper/py.typed +0 -0
ocsf_mapper/redact.py +154 -0
ocsf_mapper/registry.py +78 -0
ocsf_mapper/replay.py +186 -0
ocsf_mapper/schema.py +149 -0
ocsf_mapper/schema_diff.py +261 -0
ocsf_mapper/sinks/__init__.py +70 -0
ocsf_mapper/sinks/base.py +41 -0
ocsf_mapper/sinks/csv.py +66 -0
ocsf_mapper/sinks/jsonl.py +31 -0
ocsf_mapper/sinks/parquet.py +64 -0
ocsf_mapper/sinks/security_lake.py +157 -0
ocsf_mapper/sinks/stdout.py +16 -0
ocsf_mapper/stream.py +93 -0
ocsf_mapper/validate.py +90 -0
ocsf_mapper/web/__init__.py +16 -0
ocsf_mapper/web/app.py +650 -0
ocsf_mapper-0.3.1.dist-info/METADATA +349 -0
ocsf_mapper-0.3.1.dist-info/RECORD +40 -0
ocsf_mapper-0.3.1.dist-info/WHEEL +5 -0
ocsf_mapper-0.3.1.dist-info/entry_points.txt +2 -0
ocsf_mapper-0.3.1.dist-info/top_level.txt +1 -0

ocsf_mapper/__init__.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""ocsf_mapper — declarative log-to-OCSF mapper.
+Public API (stable surface, populated as modules land in Phase A):
+    from ocsf_mapper import apply, apply_stream
+The CLI lives at ``ocsf_mapper.cli``.
+"""
+from ocsf_mapper.apply import (
+    apply,
+    apply_stream,
+    apply_with_class,
+    apply_stream_with_class,
+)
+from ocsf_mapper.validate import validate, validate_stream
+from ocsf_mapper.schema import Schema
+from ocsf_mapper.registry import list_mappings
+# `ocsf_mapper.catalog` is a CLI module — import it directly when needed
+# (avoids `python -m ocsf_mapper.catalog` double-import warning).
+# Names exported by ``from ocsf_mapper import *``; each is imported above.
+__all__ = [
+    "apply",
+    "apply_stream",
+    "apply_with_class",
+    "apply_stream_with_class",
+    "validate",
+    "validate_stream",
+    "Schema",
+    "list_mappings",
+]
+# Package version string.
+__version__ = "0.3.1"

ocsf_mapper/_fastjson.py ADDED Viewed

@@ -0,0 +1,52 @@
+"""Drop-in JSON helpers — orjson if installed, stdlib json otherwise.
+Why this exists: on JSON-shaped sources (CloudTrail, Okta, Cloudflare,
+WAF, etc.) the per-event JSON parse is one of the largest line items in
+``apply_stream``'s hot path. ``orjson`` is 5-10× faster than stdlib for
+both ``loads`` and ``dumps``. Adding it as an optional dependency means
+the speedup is opt-in (``pip install ocsf-mapper[fast]``) and the package
+keeps a clean zero-dependency floor.
+Module-level constants:
+  HAS_ORJSON: bool   True iff orjson imported successfully
+Functions:
+  loads(s)  — accepts str or bytes; returns the parsed object
+  dumps(o)  — returns a *str* (so callers don't need to know the backend)
+"""
+from __future__ import annotations
+import json as _stdlib_json
+from typing import Any, Union
+try:
+    import orjson as _orjson  # type: ignore[import-not-found]
+    HAS_ORJSON = True
+except ImportError:  # pragma: no cover - exercised only on installs without orjson
+    _orjson = None
+    HAS_ORJSON = False
+if HAS_ORJSON:
+    def loads(s: Union[str, bytes]) -> Any:
+        # orjson is fastest on bytes; encode str inputs once.
+        if isinstance(s, str):
+            return _orjson.loads(s.encode("utf-8"))
+        return _orjson.loads(s)
+    def dumps(obj: Any) -> str:
+        # orjson.dumps returns bytes; decode once at the boundary so callers
+        # treat the result like the stdlib's str output.
+        return _orjson.dumps(obj).decode("utf-8")
+else:
+    def loads(s: Union[str, bytes]) -> Any:
+        if isinstance(s, bytes):
+            s = s.decode("utf-8")
+        return _stdlib_json.loads(s)
+    def dumps(obj: Any) -> str:
+        # Match orjson's behaviour: no spaces, UTF-8 escapes off.
+        return _stdlib_json.dumps(obj, ensure_ascii=False, separators=(",", ":"))

ocsf_mapper/apply.py ADDED Viewed

@@ -0,0 +1,378 @@
+"""Mapping engine — turns raw log lines into OCSF events using a JSON DSL config.
+This module owns the orchestration:
+  raw line ── parse_record ──> record dict
+                                  │
+                                  ▼
+                            pick_class (routing)
+                                  │
+                                  ▼
+                            map_record (apply ops per target)
+                                  │
+                                  ▼
+                                prune
+                                  │
+                                  ▼
+                              OCSF event
+Op execution is delegated to :mod:`ocsf_mapper.ops`. The public surface is
+:func:`apply` (single line) and :func:`apply_stream` (iterator). Both also
+have ``_with_class`` variants that additionally return the chosen class
+name — used by the linter and any future tooling that needs to validate
+per-class.
+"""
+from __future__ import annotations
+import re
+from functools import lru_cache
+from typing import Any, Iterable, Iterator, Mapping, Optional, Tuple
+from ocsf_mapper._fastjson import loads as _json_loads
+from ocsf_mapper.ops import apply_op, resolve_expr, set_path
+# ---------------------------------------------------------------------------
+# parsing — raw line to record
+# ---------------------------------------------------------------------------
+@lru_cache(maxsize=128)
+def _compile_regex(pattern: str) -> re.Pattern:
+    """Cache compiled regex patterns across calls.
+    parse_record is on the hot path; re.match() on a string-form pattern
+    re-compiles each call. With this cache the compile cost is paid once
+    per unique parser, which matters at 10⁶+ events/run.
+    """
+    return re.compile(pattern)
+def parse_record(raw_line: str, parser_spec: Any) -> Optional[dict]:
+    """Parse a raw line into a record dict.
+    Returns ``None`` for lines that don't match the configured parser; the
+    caller is expected to skip them.
+    Supported parser kinds:
+      * ``"json"`` — one JSON object per line.
+      * ``{"regex": "<pattern>", "groups": [...]}`` — named regex groups.
+      * ``"cef"`` — ArcSight CEF format. Produces ``{cef_version,
+        device_vendor, device_product, device_version, signature_id,
+        name, severity, ext: {...}}`` with the ``key=value`` extension
+        parsed into ``ext``.
+      * ``"leef"`` — IBM LEEF format. Produces ``{leef_version, vendor,
+        product, version, event_id, ext: {...}}``.
+    The ``"cef"`` and ``"leef"`` forms also expose the extension keys at
+    the top level so DSL paths like ``$.src`` work without going through
+    ``$.ext.src``.
+    """
+    if parser_spec == "json":
+        rec = _json_loads(raw_line)
+        rec["__raw__"] = raw_line.rstrip("\n")
+        return rec
+    if parser_spec == "cef":
+        return _parse_cef(raw_line)
+    if parser_spec == "leef":
+        return _parse_leef(raw_line)
+    if isinstance(parser_spec, dict) and "regex" in parser_spec:
+        m = _compile_regex(parser_spec["regex"]).match(raw_line.rstrip("\n"))
+        if not m:
+            return None
+        groups = m.groupdict()
+        rec: dict = {"__groups__": groups, "__raw__": raw_line.rstrip("\n")}
+        # Also expose groups at top level so JSON-path ops can address them.
+        rec.update(groups)
+        return rec
+    raise ValueError(f"unknown parser: {parser_spec!r}")
+# ---------------------------------------------------------------------------
+# CEF / LEEF parsers (vendor-neutral SIEM transports)
+# ---------------------------------------------------------------------------
+def _parse_cef(raw_line: str) -> Optional[dict]:
+    """Parse an ArcSight CEF line.
+    Format::
+        CEF:Version|Device Vendor|Device Product|Device Version|Signature ID|Name|Severity|Extension
+    Eight pipe-separated fields after the ``CEF:`` prefix. The
+    extension is a free-form ``key=value`` blob — keys are
+    space-delimited, values run until the next ``<space><known-key>=``.
+    """
+    line = raw_line.rstrip("\n")
+    if not line.startswith("CEF:"):
+        return None
+    body = line[4:]
+    parts = _split_cef_header(body, n_fields=8)
+    if parts is None or len(parts) < 8:
+        return None
+    cef_version, vendor, product, version, sig_id, name, severity, ext_blob = parts
+    ext = _parse_cef_extension(ext_blob)
+    rec: dict = {
+        "cef_version":    cef_version,
+        "device_vendor":  vendor,
+        "device_product": product,
+        "device_version": version,
+        "signature_id":   sig_id,
+        "name":           name,
+        "severity":       severity,
+        "ext":            ext,
+        "__raw__":        line,
+    }
+    # Flatten extension keys to the top level so $.<key> works directly.
+    for k, v in ext.items():
+        if k not in rec:
+            rec[k] = v
+    return rec
+def _split_cef_header(body: str, n_fields: int) -> Optional[list[str]]:
+    """Split a CEF body on unescaped ``|``. Honours ``\\|`` and ``\\\\`` escapes."""
+    fields: list[str] = []
+    buf: list[str] = []
+    i = 0
+    while i < len(body) and len(fields) < n_fields - 1:
+        c = body[i]
+        if c == "\\" and i + 1 < len(body):
+            nxt = body[i + 1]
+            if nxt in ("|", "\\", "="):
+                buf.append(nxt)
+                i += 2
+                continue
+        if c == "|":
+            fields.append("".join(buf))
+            buf = []
+            i += 1
+            continue
+        buf.append(c)
+        i += 1
+    # Everything remaining is the final field (severity + extension).
+    fields.append("".join(buf) + body[i:])
+    return fields
+def _parse_cef_extension(blob: str) -> dict[str, str]:
+    """Parse a CEF ``key=value key2=value2`` extension blob.
+    Values can contain spaces — we look for the next ``<space>word=`` to
+    delimit. Honours ``\\=`` and ``\\\\`` escapes inside values.
+    """
+    if not blob:
+        return {}
+    # Find all "key=" positions in the string (start, or preceded by space).
+    key_pat = re.compile(r"(?:^|(?<=\s))([A-Za-z_][\w.]*?)=")
+    matches = list(key_pat.finditer(blob))
+    out: dict[str, str] = {}
+    for idx, m in enumerate(matches):
+        key = m.group(1)
+        val_start = m.end()
+        val_end = matches[idx + 1].start() if idx + 1 < len(matches) else len(blob)
+        raw_val = blob[val_start:val_end].rstrip()
+        # Unescape \= \\ \|
+        val = raw_val.replace("\\\\", "\x00").replace("\\=", "=").replace("\\|", "|").replace("\x00", "\\")
+        out[key] = val
+    return out
+def _parse_leef(raw_line: str) -> Optional[dict]:
+    """Parse an IBM LEEF line.
+    LEEF 1.0::
+        LEEF:1.0|Vendor|Product|Version|EventID|<tab-separated key=value>
+    LEEF 2.0::
+        LEEF:2.0|Vendor|Product|Version|EventID|<delim>|<key=value...>
+    where ``<delim>`` is the character used to separate extension pairs
+    (commonly tab ``\\t``, ``|``, ``\\x09``, or a single character).
+    The record shape mirrors :func:`_parse_cef`.
+    """
+    line = raw_line.rstrip("\n")
+    if not line.startswith("LEEF:"):
+        return None
+    body = line[5:]
+    # Peek at the version to decide how many pipes to split on. The
+    # extension is *one* trailing field, so use ``maxsplit`` rather than
+    # a plain ``split("|")`` (which would shred any pipes that appear
+    # inside extension values).
+    head_only = body.split("|", 1)
+    if not head_only:
+        return None
+    leef_version = head_only[0]
+    if leef_version.startswith("2"):
+        parts = body.split("|", 6)         # 7 fields: 6 pipes
+        if len(parts) < 7:
+            return None
+        _, vendor, product, version, event_id, delim_field, ext_blob = parts
+        delim = _normalise_leef_delim(delim_field)
+    else:
+        parts = body.split("|", 5)         # 6 fields: 5 pipes
+        if len(parts) < 6:
+            return None
+        _, vendor, product, version, event_id, ext_blob = parts
+        delim = "\t"
+    ext = _parse_leef_extension(ext_blob, delim)
+    rec: dict = {
+        "leef_version": leef_version,
+        "vendor":       vendor,
+        "product":      product,
+        "version":      version,
+        "event_id":     event_id,
+        "ext":          ext,
+        "__raw__":      line,
+    }
+    for k, v in ext.items():
+        if k not in rec:
+            rec[k] = v
+    return rec
+def _normalise_leef_delim(delim_field: str) -> str:
+    """Map common LEEF 2.0 delimiter encodings to a literal character."""
+    d = delim_field.strip()
+    if d in ("\\t", "x09", "0x09", "9"):
+        return "\t"
+    if not d:
+        return "\t"
+    return d[0]
+def _parse_leef_extension(blob: str, delim: str) -> dict[str, str]:
+    """Split a LEEF extension blob on ``delim``, parse k=v pairs."""
+    if not blob:
+        return {}
+    out: dict[str, str] = {}
+    for pair in blob.split(delim):
+        if "=" not in pair:
+            continue
+        k, v = pair.split("=", 1)
+        if k:
+            out[k.strip()] = v
+    return out
+# ---------------------------------------------------------------------------
+# routing — record to class name
+# ---------------------------------------------------------------------------
+def pick_class(record: Mapping[str, Any], routing: Optional[dict], classes: dict) -> str:
+    """Pick which OCSF class to apply for this record.
+    If ``routing`` is absent or has no matching rule, falls back to the first
+    class declared in ``classes`` (or ``routing.default_class`` if set).
+    """
+    if not routing:
+        return next(iter(classes))
+    field_val = resolve_expr(routing["field"], record)
+    for rule in routing["rules"]:
+        if rule.get("default"):
+            return rule["class"]
+        matches = rule.get("matches", [])
+        if rule.get("prefix"):
+            if any(str(field_val or "").startswith(m) for m in matches):
+                return rule["class"]
+        else:
+            if str(field_val) in matches:
+                return rule["class"]
+    return routing.get("default_class") or next(iter(classes))
+# ---------------------------------------------------------------------------
+# mapping — record to OCSF event
+# ---------------------------------------------------------------------------
+def prune(obj: Any) -> Any:
+    """Recursively drop ``None`` values and empty dicts/lists.
+    Mapping configs intentionally over-declare targets; pruning keeps the
+    output clean for the validator (and matches what real OCSF events look
+    like — missing optional fields are simply absent).
+    """
+    if isinstance(obj, dict):
+        out = {}
+        for k, v in obj.items():
+            pv = prune(v)
+            if pv not in (None, {}, []):
+                out[k] = pv
+        return out
+    if isinstance(obj, list):
+        return [prune(x) for x in obj if x is not None]
+    return obj
+def map_record(record: Mapping[str, Any], class_block: dict) -> dict:
+    """Run all ops in ``class_block['mapping']`` against ``record``, build an OCSF event."""
+    event: dict = {}
+    already_set: dict = {}
+    for target, op in class_block["mapping"].items():
+        val = apply_op(op, record, already_set)
+        set_path(event, target, val)
+        # Top-level scalar targets are exposed to subsequent `expr` ops.
+        if "." not in target and isinstance(val, (int, float, str)):
+            already_set[target] = val
+    return prune(event)
+# ---------------------------------------------------------------------------
+# public API
+# ---------------------------------------------------------------------------
+def apply(config: dict, raw_line: str) -> Optional[dict]:
+    """Map a single raw line to one OCSF event, or ``None`` if unparseable."""
+    result = _apply_with_class(config, raw_line)
+    return None if result is None else result[0]
+def apply_with_class(config: dict, raw_line: str) -> Optional[Tuple[dict, str]]:
+    """Same as :func:`apply` but also returns the chosen OCSF class name."""
+    return _apply_with_class(config, raw_line)
+def apply_stream(config: dict, lines: Iterable[str]) -> Iterator[dict]:
+    """Map a stream of raw lines. Empty / unparseable lines are skipped."""
+    for line in lines:
+        if not line.strip():
+            continue
+        ev = apply(config, line)
+        if ev is not None:
+            yield ev
+def apply_stream_with_class(
+    config: dict, lines: Iterable[str]
+) -> Iterator[Tuple[dict, str]]:
+    """Like :func:`apply_stream` but yields ``(event, class_name)`` pairs."""
+    for line in lines:
+        if not line.strip():
+            continue
+        r = _apply_with_class(config, line)
+        if r is not None:
+            yield r
+# ---------------------------------------------------------------------------
+# internals
+# ---------------------------------------------------------------------------
+def _apply_with_class(config: dict, raw_line: str) -> Optional[Tuple[dict, str]]:
+    rec = parse_record(raw_line, config["parser"])
+    if rec is None:
+        return None
+    cls = pick_class(rec, config.get("routing"), config["classes"])
+    block = config["classes"][cls]
+    event = map_record(rec, block)
+    return event, cls

ocsf_mapper/audit.py ADDED Viewed

@@ -0,0 +1,115 @@
+"""NDJSON audit log of mapping-config edits.
+Compliance question: "who changed cloudtrail.json last Tuesday?" Without
+auth on the web UI, "who" is best-effort — we pick the first non-empty
+of ``OCSF_AUDIT_USER`` / ``USER`` / ``USERNAME`` env vars, falling back
+to ``"local"``. The audit log records:
+  - timestamp (ISO 8601 UTC)
+  - user
+  - action ("create" | "update")
+  - mapping name
+  - bytes before / after (for size-delta visibility)
+  - lint status ("OK" | "FAIL" | "SKIP" | "REJECTED")
+  - error list (empty on success, non-empty on rejected saves)
+Each event is one JSON line in ``<root>/audit/mapping_edits.ndjson``.
+Append-only by design — never edit in place. Operationally:
+    tail -f audit/mapping_edits.ndjson | jq .
+    grep '"mapping":"cloudtrail"' audit/mapping_edits.ndjson | jq -s 'sort_by(.ts)'
+The audit directory is created lazily on first write.
+"""
+from __future__ import annotations
+import json
+import os
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Iterable, Optional
+# Sub-directory of the root that holds the audit log.
+_AUDIT_SUBDIR = "audit"
+# File name of the NDJSON audit log inside that sub-directory.
+_AUDIT_FILENAME = "mapping_edits.ndjson"
+def _resolve_user() -> str:
+    for var in ("OCSF_AUDIT_USER", "USER", "USERNAME"):
+        v = os.environ.get(var)
+        if v:
+            return v
+    return "local"
+def audit_path(root: Path | str) -> Path:
+    return Path(root) / _AUDIT_SUBDIR / _AUDIT_FILENAME
+def log_edit(
+    root: Path | str,
+    *,
+    mapping: str,
+    action: str,
+    lint_status: str,
+    errors: Optional[Iterable[str]] = None,
+    bytes_before: Optional[int] = None,
+    bytes_after: Optional[int] = None,
+    user: Optional[str] = None,
+) -> None:
+    """Append one event to the audit log.
+    ``mapping`` is the source short name (``cloudtrail``, ``okta``, ...).
+    ``action`` is ``"create"`` for new sources via the wizard, ``"update"``
+    for the Mapping-tab editor. ``lint_status`` is the result of
+    ``lint_one()`` on the candidate file before the save was committed.
+    ``errors`` is the lint error list, empty on success.
+    Silent best-effort: if the audit directory can't be created we log
+    a warning to stderr but don't raise — losing the audit trail
+    shouldn't break a save.
+    """
+    path = audit_path(root)
+    record = {
+        "ts":           datetime.now(timezone.utc).isoformat(timespec="seconds"),
+        "user":         user or _resolve_user(),
+        "action":       action,
+        "mapping":      mapping,
+        "lint_status":  lint_status,
+        "errors":       list(errors) if errors else [],
+        "bytes_before": bytes_before,
+        "bytes_after":  bytes_after,
+    }
+    try:
+        path.parent.mkdir(parents=True, exist_ok=True)
+        with path.open("a", encoding="utf-8") as fp:
+            fp.write(json.dumps(record, ensure_ascii=False))
+            fp.write("\n")
+    except OSError as e:  # pragma: no cover - filesystem fault
+        import sys
+        print(f"warning: audit log write failed: {e}", file=sys.stderr)
+def read_audit(root: Path | str, limit: Optional[int] = None) -> list[dict]:
+    """Return the audit log as a list of dicts, newest first.
+    ``limit`` truncates to the most-recent N events. Returns an empty
+    list if the audit file doesn't exist yet.
+    """
+    path = audit_path(root)
+    if not path.exists():
+        return []
+    out: list[dict] = []
+    for line in path.read_text(encoding="utf-8").splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            out.append(json.loads(line))
+        except json.JSONDecodeError:
+            continue
+    out.reverse()  # newest first
+    if limit is not None:
+        out = out[:limit]
+    return out