ocsf-mapper 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
+ """ocsf_mapper — declarative log-to-OCSF mapper.
2
+
3
+ Public API (stable surface, populated as modules land in Phase A):
4
+
5
+ from ocsf_mapper import apply, apply_stream
6
+
7
+ The CLI lives at ``ocsf_mapper.cli`` (coming in Phase A).
8
+ """
9
+
10
+ from ocsf_mapper.apply import (
11
+ apply,
12
+ apply_stream,
13
+ apply_with_class,
14
+ apply_stream_with_class,
15
+ )
16
+ from ocsf_mapper.validate import validate, validate_stream
17
+ from ocsf_mapper.schema import Schema
18
+ from ocsf_mapper.registry import list_mappings
19
+
20
+ # `ocsf_mapper.catalog` is a CLI module — import it directly when needed
21
+ # (avoids `python -m ocsf_mapper.catalog` double-import warning).
22
+
23
+ __all__ = [
24
+ "apply",
25
+ "apply_stream",
26
+ "apply_with_class",
27
+ "apply_stream_with_class",
28
+ "validate",
29
+ "validate_stream",
30
+ "Schema",
31
+ "list_mappings",
32
+ ]
33
+ __version__ = "0.3.1"
@@ -0,0 +1,52 @@
1
+ """Drop-in JSON helpers — orjson if installed, stdlib json otherwise.
2
+
3
+ Why this exists: on JSON-shaped sources (CloudTrail, Okta, Cloudflare,
4
+ WAF, etc.) the per-event JSON parse is one of the largest line items in
5
+ ``apply_stream``'s hot path. ``orjson`` is 5-10× faster than stdlib for
6
+ both ``loads`` and ``dumps``. Adding it as an optional dependency means
7
+ the speedup is opt-in (``pip install ocsf-mapper[fast]``) and the package
8
+ keeps a clean zero-dependency floor.
9
+
10
+ Module-level constants:
11
+
12
+ HAS_ORJSON: bool True iff orjson imported successfully
13
+
14
+ Functions:
15
+
16
+ loads(s) — accepts str or bytes; returns the parsed object
17
+ dumps(o) — returns a *str* (so callers don't need to know the backend)
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import json as _stdlib_json
23
+ from typing import Any, Union
24
+
25
# Probe for the optional accelerator exactly once, at import time.
try:
    import orjson as _orjson  # type: ignore[import-not-found]
except ImportError:  # pragma: no cover - exercised only on installs without orjson
    _orjson = None
    HAS_ORJSON = False
else:
    HAS_ORJSON = True


if not HAS_ORJSON:
    def loads(s: Union[str, bytes]) -> Any:
        """Parse JSON with the stdlib backend; ``bytes`` input is decoded as UTF-8 first."""
        text = s.decode("utf-8") if isinstance(s, bytes) else s
        return _stdlib_json.loads(text)

    def dumps(obj: Any) -> str:
        """Serialise ``obj`` to a compact ``str``."""
        # Match orjson's behaviour: no spaces, UTF-8 escapes off.
        return _stdlib_json.dumps(obj, separators=(",", ":"), ensure_ascii=False)
else:
    def loads(s: Union[str, bytes]) -> Any:
        """Parse JSON with orjson; ``str`` input is encoded to UTF-8 bytes first."""
        # orjson is fastest on bytes; encode str inputs once.
        payload = s.encode("utf-8") if isinstance(s, str) else s
        return _orjson.loads(payload)

    def dumps(obj: Any) -> str:
        """Serialise ``obj`` with orjson and return the result as ``str``."""
        # orjson.dumps returns bytes; decode once at the boundary so callers
        # treat the result like the stdlib's str output.
        encoded = _orjson.dumps(obj)
        return encoded.decode("utf-8")
ocsf_mapper/apply.py ADDED
@@ -0,0 +1,378 @@
1
+ """Mapping engine — turns raw log lines into OCSF events using a JSON DSL config.
2
+
3
+ This module owns the orchestration:
4
+
5
+ raw line ── parse_record ──> record dict
6
+
7
+
8
+ pick_class (routing)
9
+
10
+
11
+ map_record (apply ops per target)
12
+
13
+
14
+ prune
15
+
16
+
17
+ OCSF event
18
+
19
+ Op execution is delegated to :mod:`ocsf_mapper.ops`. The public surface is
20
+ :func:`apply` (single line) and :func:`apply_stream` (iterator). Both also
21
+ have ``_with_class`` variants that additionally return the chosen class
22
+ name — used by the linter and any future tooling that needs to validate
23
+ per-class.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import re
29
+ from functools import lru_cache
30
+ from typing import Any, Iterable, Iterator, Mapping, Optional, Tuple
31
+
32
+ from ocsf_mapper._fastjson import loads as _json_loads
33
+ from ocsf_mapper.ops import apply_op, resolve_expr, set_path
34
+
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # parsing — raw line to record
38
+ # ---------------------------------------------------------------------------
39
+
40
+
41
@lru_cache(maxsize=128)
def _compile_regex(pattern: str) -> re.Pattern:
    """Return the compiled form of ``pattern``, memoised per pattern string.

    ``parse_record`` calls this for every line it handles with a regex
    parser. The ``lru_cache`` wrapper keeps up to 128 distinct patterns, so
    repeated calls with the same pattern string get the same compiled
    object back instead of going through ``re.compile`` again.
    """
    compiled = re.compile(pattern)
    return compiled
50
+
51
+
52
def parse_record(raw_line: str, parser_spec: Any) -> Optional[dict]:
    """Parse a raw line into a record dict.

    Returns ``None`` for lines that don't match the configured parser; the
    caller is expected to skip them. For the ``"json"`` parser this covers
    both malformed JSON and well-formed JSON whose top-level value is not an
    object (a record must be a dict so ``__raw__`` can be attached).

    Supported parser kinds:

    * ``"json"`` — one JSON object per line.
    * ``{"regex": PATTERN, "groups": [...]}`` — named regex groups.
    * ``"cef"`` — ArcSight CEF format. Produces ``{cef_version,
      device_vendor, device_product, device_version, signature_id,
      name, severity, ext: {...}}`` with the ``key=value`` extension
      parsed into ``ext``.
    * ``"leef"`` — IBM LEEF format. Produces ``{leef_version, vendor,
      product, version, event_id, ext: {...}}``.

    The ``"cef"`` and ``"leef"`` forms also expose the extension keys at
    the top level so DSL paths like ``$.src`` work without going through
    ``$.ext.src``.

    Every returned record carries the original line (minus trailing
    newlines) under ``__raw__``.

    Raises:
        ValueError: if ``parser_spec`` is not one of the supported kinds.
    """
    if parser_spec == "json":
        try:
            rec = _json_loads(raw_line)
        except ValueError:
            # json.JSONDecodeError and orjson.JSONDecodeError both derive
            # from ValueError; a malformed line is "unparseable", not fatal.
            return None
        if not isinstance(rec, dict):
            # Scalars / arrays cannot carry ``__raw__`` and are not records.
            return None
        rec["__raw__"] = raw_line.rstrip("\n")
        return rec
    if parser_spec == "cef":
        return _parse_cef(raw_line)
    if parser_spec == "leef":
        return _parse_leef(raw_line)
    if isinstance(parser_spec, dict) and "regex" in parser_spec:
        line = raw_line.rstrip("\n")
        m = _compile_regex(parser_spec["regex"]).match(line)
        if not m:
            return None
        groups = m.groupdict()
        rec = {"__groups__": groups, "__raw__": line}
        # Also expose groups at top level so JSON-path ops can address them.
        rec.update(groups)
        return rec
    raise ValueError(f"unknown parser: {parser_spec!r}")
91
+
92
+
93
+ # ---------------------------------------------------------------------------
94
+ # CEF / LEEF parsers (vendor-neutral SIEM transports)
95
+ # ---------------------------------------------------------------------------
96
+
97
+
98
def _parse_cef(raw_line: str) -> Optional[dict]:
    """Turn one ArcSight CEF line into a record dict, or ``None`` if it isn't CEF.

    Expected layout::

        CEF:Version|Device Vendor|Device Product|Device Version|Signature ID|Name|Severity|Extension

    The text after the ``CEF:`` prefix is split into eight fields by
    ``_split_cef_header``; the eighth (the extension blob) is parsed into a
    dict by ``_parse_cef_extension``. Lines without the prefix, or with
    fewer than eight fields, yield ``None``.
    """
    prefix = "CEF:"
    line = raw_line.rstrip("\n")
    if not line.startswith(prefix):
        return None

    header = _split_cef_header(line[len(prefix):], n_fields=8)
    if header is None or len(header) < 8:
        return None

    (cef_version, vendor, product, version,
     sig_id, name, severity, ext_blob) = header
    extension = _parse_cef_extension(ext_blob)

    rec: dict = {
        "cef_version": cef_version,
        "device_vendor": vendor,
        "device_product": product,
        "device_version": version,
        "signature_id": sig_id,
        "name": name,
        "severity": severity,
        "ext": extension,
        "__raw__": line,
    }
    # Mirror extension keys at the top level so $.<key> works directly;
    # header fields win when a name collides.
    for key, value in extension.items():
        rec.setdefault(key, value)
    return rec
134
+
135
+
136
def _split_cef_header(body: str, n_fields: int) -> Optional[list[str]]:
    """Split a CEF body on unescaped ``|`` into at most ``n_fields`` pieces.

    A backslash followed by ``|``, ``\\`` or ``=`` is an escape: the
    backslash is dropped and the following character is kept literally.
    Once ``n_fields - 1`` separators have been consumed, the rest of
    ``body`` is returned untouched as the last piece. Shorter input simply
    yields fewer pieces.
    """
    escapable = {"|", "\\", "="}
    max_splits = n_fields - 1
    end = len(body)

    pieces: list[str] = []
    current: list[str] = []
    pos = 0
    while pos < end and len(pieces) < max_splits:
        ch = body[pos]
        if ch == "|":
            pieces.append("".join(current))
            current = []
            pos += 1
        elif ch == "\\" and pos + 1 < end and body[pos + 1] in escapable:
            current.append(body[pos + 1])
            pos += 2
        else:
            current.append(ch)
            pos += 1
    # Whatever is left (the unfinished field, or the raw tail after the last
    # consumed separator) becomes the final piece.
    pieces.append("".join(current) + body[pos:])
    return pieces
159
+
160
+
161
def _parse_cef_extension(blob: str) -> dict[str, str]:
    """Parse a CEF ``key=value key2=value2`` extension blob into a dict.

    A key is a word (letters, digits, ``_``, ``.``; not starting with a
    digit) at the start of the blob or right after whitespace, immediately
    followed by ``=``. Each value runs up to the next such key, so values
    may contain spaces; trailing whitespace is stripped. Inside values the
    escapes ``\\=``, ``\\|`` and ``\\\\`` are unescaped.
    """
    if not blob:
        return {}

    key_re = re.compile(r"(?:^|(?<=\s))([A-Za-z_][\w.]*?)=")
    hits = list(key_re.finditer(blob))
    # A value ends where the next key begins; the last one runs to the end.
    boundaries = [hit.start() for hit in hits[1:]] + [len(blob)]

    parsed: dict[str, str] = {}
    for hit, stop in zip(hits, boundaries):
        raw = blob[hit.end():stop].rstrip()
        # Park escaped backslashes behind a NUL placeholder so they cannot
        # combine with a following "=" or "|" during the next two passes.
        protected = raw.replace("\\\\", "\x00")
        unescaped = protected.replace("\\=", "=").replace("\\|", "|")
        parsed[hit.group(1)] = unescaped.replace("\x00", "\\")
    return parsed
182
+
183
+
184
def _parse_leef(raw_line: str) -> Optional[dict]:
    """Turn one IBM LEEF line into a record dict, or ``None`` if it isn't LEEF.

    LEEF 1.0 (extension pairs are tab-separated)::

        LEEF:1.0|Vendor|Product|Version|EventID|key=value...

    LEEF 2.0 (an extra header field names the pair delimiter)::

        LEEF:2.0|Vendor|Product|Version|EventID|DELIM|key=value...

    A version field starting with ``2`` selects the 2.0 layout; anything
    else is treated as 1.0. Lines without the ``LEEF:`` prefix or with too
    few header fields yield ``None``. The record shape mirrors
    :func:`_parse_cef`: header fields, an ``ext`` dict, ``__raw__``, and the
    extension keys copied to the top level where they don't collide.
    """
    line = raw_line.rstrip("\n")
    if not line.startswith("LEEF:"):
        return None
    body = line[5:]

    leef_version = body.partition("|")[0]
    is_v2 = leef_version.startswith("2")
    # The extension is one trailing field, so cap the split: pipes that
    # appear inside extension values must survive.
    n_fields = 7 if is_v2 else 6
    parts = body.split("|", n_fields - 1)
    if len(parts) < n_fields:
        return None

    if is_v2:
        _, vendor, product, version, event_id, delim_field, ext_blob = parts
        delim = _normalise_leef_delim(delim_field)
    else:
        _, vendor, product, version, event_id, ext_blob = parts
        delim = "\t"

    extension = _parse_leef_extension(ext_blob, delim)
    rec: dict = {
        "leef_version": leef_version,
        "vendor": vendor,
        "product": product,
        "version": version,
        "event_id": event_id,
        "ext": extension,
        "__raw__": line,
    }
    for key, value in extension.items():
        rec.setdefault(key, value)
    return rec
238
+
239
+
240
def _normalise_leef_delim(delim_field: str) -> str:
    """Map a LEEF 2.0 delimiter header field to the literal delimiter character.

    Accepted encodings, checked in this order after stripping whitespace:

    * empty, ``\\t`` (backslash + ``t``) or ``9`` — tab;
    * ``x`` or ``0x`` followed by 1-4 hex digits — the character with that
      code point (``x09`` is tab, ``x5E`` is ``^``, ``0x7C`` is ``|``);
    * anything else — its first character.

    Previously only the ``09`` hex spellings were recognised, so a header
    such as ``x5E`` degraded to the literal ``x``.
    """
    d = delim_field.strip()
    if not d or d in ("\\t", "9"):
        return "\t"
    hex_form = re.fullmatch(r"(?:0x|x)([0-9A-Fa-f]{1,4})", d)
    if hex_form:
        return chr(int(hex_form.group(1), 16))
    return d[0]
248
+
249
+
250
def _parse_leef_extension(blob: str, delim: str) -> dict[str, str]:
    """Break a LEEF extension blob into a ``{key: value}`` dict.

    The blob is split on ``delim``; each piece is split at its first ``=``.
    Pieces without ``=`` or with nothing before it are ignored. Keys are
    whitespace-stripped, values are kept verbatim.
    """
    if not blob:
        return {}
    fields: dict[str, str] = {}
    for chunk in blob.split(delim):
        key, sep, value = chunk.partition("=")
        if sep and key:
            fields[key.strip()] = value
    return fields
262
+
263
+
264
+ # ---------------------------------------------------------------------------
265
+ # routing — record to class name
266
+ # ---------------------------------------------------------------------------
267
+
268
+
269
def pick_class(record: Mapping[str, Any], routing: Optional[dict], classes: dict) -> str:
    """Choose the OCSF class name to apply to ``record``.

    With no ``routing`` the first key of ``classes`` is used. Otherwise
    ``routing["field"]`` is resolved against the record and the rules in
    ``routing["rules"]`` are tried in order:

    * a rule flagged ``default`` matches unconditionally;
    * a rule flagged ``prefix`` matches when the stringified field value
      starts with any entry of its ``matches`` list;
    * any other rule matches when the stringified field value is one of
      its ``matches`` entries.

    If nothing matches, ``routing["default_class"]`` is used when set,
    else the first key of ``classes``.
    """
    if not routing:
        return next(iter(classes))

    field_val = resolve_expr(routing["field"], record)
    for rule in routing["rules"]:
        if rule.get("default"):
            return rule["class"]
        candidates = rule.get("matches", [])
        if rule.get("prefix"):
            text = str(field_val or "")
            hit = any(text.startswith(candidate) for candidate in candidates)
        else:
            hit = str(field_val) in candidates
        if hit:
            return rule["class"]

    fallback = routing.get("default_class")
    return fallback if fallback else next(iter(classes))
289
+
290
+
291
+ # ---------------------------------------------------------------------------
292
+ # mapping — record to OCSF event
293
+ # ---------------------------------------------------------------------------
294
+
295
+
296
def prune(obj: Any) -> Any:
    """Recursively drop ``None`` values and empty dicts/lists.

    Mapping configs intentionally over-declare targets; pruning keeps the
    output clean for the validator (and matches what real OCSF events look
    like — missing optional fields are simply absent).

    Dict values and list elements are pruned first and then discarded if
    the result is ``None``, ``{}`` or ``[]``. That means a list element
    that only contained ``None`` fields disappears instead of lingering as
    ``{}``. Other falsy values (``0``, ``""``, ``False``) are kept. The
    top-level object itself is returned even if it ends up empty.
    """
    if isinstance(obj, dict):
        cleaned = {}
        for key, value in obj.items():
            pruned = prune(value)
            if pruned not in (None, {}, []):
                cleaned[key] = pruned
        return cleaned
    if isinstance(obj, list):
        kept = []
        for item in obj:
            pruned = prune(item)
            # Test the pruned element, not the raw one: an element can
            # become empty only after its own contents were pruned.
            if pruned not in (None, {}, []):
                kept.append(pruned)
        return kept
    return obj
313
+
314
+
315
def map_record(record: Mapping[str, Any], class_block: dict) -> dict:
    """Build an OCSF event by running every op in ``class_block['mapping']``.

    Targets are processed in the mapping's iteration order. Each op's
    result is written into the event at its target path via ``set_path``;
    the finished event is passed through ``prune`` before being returned.
    """
    event: dict = {}
    scalars_so_far: dict = {}
    scalar_types = (int, float, str)
    for target, op in class_block["mapping"].items():
        value = apply_op(op, record, scalars_so_far)
        set_path(event, target, value)
        # Top-level scalar targets are exposed to subsequent `expr` ops.
        is_top_level = "." not in target
        if is_top_level and isinstance(value, scalar_types):
            scalars_so_far[target] = value
    return prune(event)
326
+
327
+
328
+ # ---------------------------------------------------------------------------
329
+ # public API
330
+ # ---------------------------------------------------------------------------
331
+
332
+
333
def apply(config: dict, raw_line: str) -> Optional[dict]:
    """Map one raw line to an OCSF event; ``None`` when the line is unparseable."""
    outcome = _apply_with_class(config, raw_line)
    if outcome is None:
        return None
    # Keep only the event; the class name is what ``apply_with_class`` adds.
    return outcome[0]
337
+
338
+
339
def apply_with_class(config: dict, raw_line: str) -> Optional[Tuple[dict, str]]:
    """Like :func:`apply`, but return ``(event, class_name)`` instead of just the event.

    Yields ``None`` when the line cannot be parsed.
    """
    result = _apply_with_class(config, raw_line)
    return result
342
+
343
+
344
def apply_stream(config: dict, lines: Iterable[str]) -> Iterator[dict]:
    """Lazily map raw lines to OCSF events, skipping blank and unparseable lines."""
    non_blank = (candidate for candidate in lines if candidate.strip())
    for candidate in non_blank:
        event = apply(config, candidate)
        if event is None:
            continue
        yield event
352
+
353
+
354
def apply_stream_with_class(
    config: dict, lines: Iterable[str]
) -> Iterator[Tuple[dict, str]]:
    """Lazily yield ``(event, class_name)`` pairs, skipping blank and unparseable lines."""
    non_blank = (candidate for candidate in lines if candidate.strip())
    for candidate in non_blank:
        pair = _apply_with_class(config, candidate)
        if pair is None:
            continue
        yield pair
364
+
365
+
366
+ # ---------------------------------------------------------------------------
367
+ # internals
368
+ # ---------------------------------------------------------------------------
369
+
370
+
371
def _apply_with_class(config: dict, raw_line: str) -> Optional[Tuple[dict, str]]:
    """Run parse, route and map for one line.

    Returns ``(event, class_name)``, or ``None`` when ``parse_record``
    rejects the line. Reads ``config["parser"]``, ``config["classes"]`` and
    the optional ``config["routing"]``.
    """
    record = parse_record(raw_line, config["parser"])
    if record is None:
        return None
    classes = config["classes"]
    class_name = pick_class(record, config.get("routing"), classes)
    return map_record(record, classes[class_name]), class_name
ocsf_mapper/audit.py ADDED
@@ -0,0 +1,115 @@
1
+ """NDJSON audit log of mapping-config edits.
2
+
3
+ Compliance question: "who changed cloudtrail.json last Tuesday?" Without
4
+ auth on the web UI, "who" is best-effort — we pick the first non-empty
5
+ of ``OCSF_AUDIT_USER`` / ``USER`` / ``USERNAME`` env vars, falling back
6
+ to ``"local"``. The audit log records:
7
+
8
+ - timestamp (ISO 8601 UTC)
9
+ - user
10
+ - action ("create" | "update")
11
+ - mapping name
12
+ - bytes before / after (for size-delta visibility)
13
+ - lint status ("OK" | "FAIL" | "SKIP" | "REJECTED")
14
+ - error list (empty on success, non-empty on rejected saves)
15
+
16
+ Each event is one JSON line in ``<root>/audit/mapping_edits.ndjson``.
17
+ Append-only by design — never edit in place. Operationally:
18
+
19
+ tail -f audit/mapping_edits.ndjson | jq .
20
+ grep '"mapping":"cloudtrail"' audit/mapping_edits.ndjson | jq -s 'sort_by(.ts)'
21
+
22
+ The audit directory is created lazily on first write.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import json
28
+ import os
29
+ from datetime import datetime, timezone
30
+ from pathlib import Path
31
+ from typing import Iterable, Optional
32
+
33
+
34
+ _AUDIT_SUBDIR = "audit"
35
+ _AUDIT_FILENAME = "mapping_edits.ndjson"
36
+
37
+
38
def _resolve_user() -> str:
    """Return the first non-empty of ``OCSF_AUDIT_USER`` / ``USER`` / ``USERNAME``.

    Falls back to ``"local"`` when none of the three environment variables
    is set to a non-empty value.
    """
    candidates = (
        os.environ.get(name)
        for name in ("OCSF_AUDIT_USER", "USER", "USERNAME")
    )
    return next((value for value in candidates if value), "local")
44
+
45
+
46
def audit_path(root: Path | str) -> Path:
    """Return the audit-log file location under ``root`` (nothing is created)."""
    base = Path(root)
    return base / _AUDIT_SUBDIR / _AUDIT_FILENAME
48
+
49
+
50
def log_edit(
    root: Path | str,
    *,
    mapping: str,
    action: str,
    lint_status: str,
    errors: Optional[Iterable[str]] = None,
    bytes_before: Optional[int] = None,
    bytes_after: Optional[int] = None,
    user: Optional[str] = None,
) -> None:
    """Append one edit event, as a single JSON line, to the audit log.

    Args:
        root: Directory under which the audit file lives (see ``audit_path``).
        mapping: Source short name (``cloudtrail``, ``okta``, ...).
        action: ``"create"`` for new sources via the wizard, ``"update"``
            for the Mapping-tab editor.
        lint_status: Result of ``lint_one()`` on the candidate file before
            the save was committed.
        errors: Lint error list; recorded as ``[]`` when missing or empty.
        bytes_before: Size before the edit, recorded as given.
        bytes_after: Size after the edit, recorded as given.
        user: Explicit user name; when falsy, ``_resolve_user()`` decides.

    Best-effort by design: the audit directory is created on demand, and an
    ``OSError`` while creating it or appending is reported on stderr
    instead of being raised — losing the audit trail shouldn't break a save.
    """
    target = audit_path(root)
    entry = {
        "ts": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "user": user or _resolve_user(),
        "action": action,
        "mapping": mapping,
        "lint_status": lint_status,
        "errors": list(errors) if errors else [],
        "bytes_before": bytes_before,
        "bytes_after": bytes_after,
    }
    try:
        target.parent.mkdir(parents=True, exist_ok=True)
        with target.open("a", encoding="utf-8") as handle:
            handle.write(json.dumps(entry, ensure_ascii=False) + "\n")
    except OSError as exc:  # pragma: no cover - filesystem fault
        import sys
        print(f"warning: audit log write failed: {exc}", file=sys.stderr)
92
+
93
+
94
def read_audit(root: Path | str, limit: Optional[int] = None) -> list[dict]:
    """Load the audit log, newest event first.

    Blank lines and lines that are not valid JSON are skipped. When
    ``limit`` is given, the result is sliced to its first ``limit`` entries
    (i.e. the most recent ones). A missing audit file yields an empty list.
    """
    log_file = audit_path(root)
    if not log_file.exists():
        return []

    events: list[dict] = []
    for raw in log_file.read_text(encoding="utf-8").splitlines():
        text = raw.strip()
        if not text:
            continue
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            continue
        events.append(parsed)

    events.reverse()  # the file is append-only, so reversed order is newest first
    return events if limit is None else events[:limit]