loki-mode 7.18.1 → 7.18.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,509 @@
1
+ #!/usr/bin/env python3
2
+ """Shared crash-report scrubber for Loki Mode (Phase 0, local-only).
3
+
4
+ This is the single security chokepoint for the crash-reporting feature. It is
5
+ called by all three routes (bash client via python3, Bun client via findPython3,
6
+ and -- in a later phase -- the FastAPI backend via import) so redaction can
7
+ never drift between them. It layers crash-specific deny rules on top of the
8
+ proof-of-run redactor and then emits a WHITELIST-ONLY payload.
9
+
10
+ Design rules (docs/CRASH-REPORTING-PLAN.md sections 5 and Phase 0):
11
+ - Run proof_redact.redact_tree over the raw dict first (reuse the hardened,
12
+ ReDoS-checked patterns).
13
+ - Apply crash-specific deny rules (emails, IPs, repo names) to surviving
14
+ strings BEFORE whitelisting.
15
+ - Emit ONLY whitelisted keys; anything else is DROPPED, not redacted, so
16
+ free-text (prompts, diffs, briefs) can never reach the payload.
17
+ - Compute fingerprint and project_id_hash AFTER scrub, on the REDACTED data,
18
+ so the client and the backend derive identical values.
19
+ - FAIL CLOSED: never raise out of scrub_and_whitelist; on any internal error
20
+ return a minimal safe dict with NO raw data.
21
+ """
22
+
23
+ import hashlib
24
+ import json
25
+ import re
26
+ import sys
27
+ import os
28
+
29
+ # Make proof_redact importable regardless of cwd (same trick as
30
+ # proof-generator.py lines 32-37).
31
+ _HERE = os.path.dirname(os.path.abspath(__file__))
32
+ if _HERE not in sys.path:
33
+ sys.path.insert(0, _HERE)
34
+
35
+ import proof_redact # noqa: E402
36
+
37
+ # Separate from proof_redact.RULES_VERSION. Bump only when crash scrub or
38
+ # whitelist behavior changes in a way callers must be able to detect.
39
+ CRASH_RULES_VERSION = "1.0"
40
+
41
+ # Number of normalized stack frames kept in the signature / fingerprint.
42
+ STACK_FRAMES_N = 5
43
+
44
+ # The ONLY keys allowed to leave the machine. Anything not here is dropped.
45
+ _WHITELIST = (
46
+ "os",
47
+ "arch",
48
+ "loki_version",
49
+ "node_version",
50
+ "bun_version",
51
+ "error_class",
52
+ "stack_signature",
53
+ "rarv_phase",
54
+ "exit_code",
55
+ "friction_kind",
56
+ "project_id_hash",
57
+ "fingerprint",
58
+ "rules_version",
59
+ "redactions_count",
60
+ "captured_at",
61
+ )
62
+
63
+ # Crash-specific deny patterns. All quantifiers are bounded to stay ReDoS-safe;
64
+ # no nested unbounded groups. Slight over-redaction is preferred over a pattern
65
+ # that can backtrack.
66
+
67
+ # Email: bounded local part and domain. RFC-imperfect on purpose; we want a
68
+ # linear scan, not a validator.
69
+ _EMAIL = re.compile(
70
+ r"[A-Za-z0-9._%+\-]{1,64}@[A-Za-z0-9.\-]{1,255}\.[A-Za-z]{2,24}"
71
+ )
72
+
73
+ # IPv4: four bounded octets. Over-matches things like version strings on
74
+ # purpose (acceptable -- versions are emitted via the whitelisted, separately
75
+ # sourced loki_version field, not parsed from free text).
76
+ _IPV4 = re.compile(r"\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b")
77
+
78
+ # IPv6: a simple bounded colon-hex form. Requires at least two groups so a bare
79
+ # "a:b" style label does not match. Bounded repetition keeps it linear.
80
+ _IPV6 = re.compile(r"\b(?:[A-Fa-f0-9]{1,4}:){2,7}[A-Fa-f0-9]{1,4}\b")
81
+
82
+
83
+ def _parse_owner_repo(git_remote):
84
+ """Extract a normalized 'owner/repo' from a git remote URL, or None.
85
+
86
+ Handles https://host/owner/repo(.git) and scp-style git@host:owner/repo(.git).
87
+ Returns the literal 'owner/repo' string for deny matching, lowercased host
88
+ is irrelevant here (we want the owner/repo path segment).
89
+ """
90
+ if not git_remote or not isinstance(git_remote, str):
91
+ return None
92
+ s = git_remote.strip()
93
+ # scp-style: git@github.com:owner/repo.git
94
+ m = re.match(r"^[^@]+@[^:]+:(.+)$", s)
95
+ if m:
96
+ path = m.group(1)
97
+ else:
98
+ # url-style: strip scheme and host, keep the path.
99
+ m = re.match(r"^[A-Za-z][A-Za-z0-9+.\-]*://[^/]+/(.+)$", s)
100
+ if m:
101
+ path = m.group(1)
102
+ else:
103
+ path = s
104
+ path = path.rstrip("/")
105
+ if path.endswith(".git"):
106
+ path = path[:-4]
107
+ # Keep only the last two path segments (owner/repo).
108
+ parts = [p for p in path.split("/") if p]
109
+ if len(parts) >= 2:
110
+ return parts[-2] + "/" + parts[-1]
111
+ return None
112
+
113
+
114
def _apply_crash_deny(s, repo_literals):
    """Apply the crash-specific deny rules to one string.

    Emails are replaced with "[REDACTED:EMAIL]", IPv6 and then IPv4 addresses
    with "[REDACTED:IP]", and every occurrence of each repo literal with
    "[REDACTED:REPO]".

    Args:
        s: the value to scrub. Non-strings and empty strings are returned
            unchanged with a count of 0.
        repo_literals: iterable of literal strings (repo names, owner/repo)
            to redact. Literals are matched with str.replace and never
            interpolated into a regex, so a repo name containing regex
            metacharacters cannot break the scan. Empty and non-string
            entries are ignored.

    Returns:
        (new_string, number_of_redactions).
    """
    if not isinstance(s, str) or not s:
        return s, 0

    total = 0
    # IPv6 runs before IPv4, matching the order the rules were written in.
    for pattern, placeholder in (
        (_EMAIL, "[REDACTED:EMAIL]"),
        (_IPV6, "[REDACTED:IP]"),
        (_IPV4, "[REDACTED:IP]"),
    ):
        s, n = pattern.subn(placeholder, s)
        total += n

    # Replace the longest literals first. If "repo" were replaced before
    # "owner/repo", the text "owner/repo" would become
    # "owner/[REDACTED:REPO]" and the owner name would survive. The secondary
    # sort key only makes the order deterministic for equal lengths.
    literals = sorted(
        {lit for lit in (repo_literals or ()) if isinstance(lit, str) and lit},
        key=lambda lit: (-len(lit), lit),
    )
    for lit in literals:
        hits = s.count(lit)
        if hits:
            total += hits
            s = s.replace(lit, "[REDACTED:REPO]")
    return s, total
135
+
136
+
137
def _crash_deny_tree(obj, repo_literals):
    """Walk a JSON-like structure and apply the crash deny rules everywhere.

    Strings are scrubbed directly. Dicts are rebuilt with both their string
    keys and their values scrubbed. Lists and tuples are rebuilt as lists.
    Any other value is returned untouched.

    Returns:
        (new_obj, number_of_redactions).
    """
    if isinstance(obj, str):
        return _apply_crash_deny(obj, repo_literals)

    if isinstance(obj, dict):
        scrubbed = {}
        hits = 0
        for key, value in obj.items():
            # Only string keys can carry text; other key types pass through.
            if isinstance(key, str):
                key, key_hits = _apply_crash_deny(key, repo_literals)
                hits += key_hits
            value, value_hits = _crash_deny_tree(value, repo_literals)
            scrubbed[key] = value
            hits += value_hits
        return scrubbed, hits

    if isinstance(obj, (list, tuple)):
        pairs = [_crash_deny_tree(item, repo_literals) for item in obj]
        return [new for new, _ in pairs], sum(count for _, count in pairs)

    return obj, 0
162
+
163
+
164
def normalize_stack(frames, n=STACK_FRAMES_N):
    """Extract function/symbol names from raw stack frames; return the top n.

    File paths, line numbers, columns, hex addresses, the leading "at " and
    trailing parenthesised locations are stripped, so two machines with
    different home paths produce the same output for the same logical stack.

    Recognised frame shapes:
      - Python:   'File "/p/f.py", line 42, in func_name'
      - Node/Bun: 'at func_name (/p/f.js:10:5)' and anonymous 'at /p/f.js:1:1'

    Lines that match none of these (a "Traceback (most recent call last):"
    header, the final "ValueError: boom" line) are not frames and are
    skipped, as are non-string and blank entries.

    Args:
        frames: list or tuple of raw frame strings. Any other type yields [].
        n: maximum number of symbols to return. Values <= 0 yield [].

    Returns:
        A list of at most n symbol strings, in input order.
    """
    out = []
    if not isinstance(frames, (list, tuple)):
        return out
    if n <= 0:
        # The limit check at the bottom of the loop only runs after an
        # append, so without this guard n=0 would still return one symbol.
        return out

    for frame in frames:
        if not isinstance(frame, str):
            continue
        line = frame.strip()
        if not line:
            continue

        # Python traceback frame: the text after '... in ' is the function.
        py_frame = re.search(r",\s*line\s+\d+,\s*in\s+(.+)$", line)
        # Node/Bun 'at <symbol> (<loc>)': the symbol is what precedes '('.
        node_call = None if py_frame else re.match(r"^at\s+(.*?)\s*\(", line)
        if py_frame:
            sym = py_frame.group(1).strip()
        elif node_call:
            sym = node_call.group(1).strip()
        else:
            bare = re.match(r"^at\s+(.+)$", line)
            if not bare:
                # Not a frame line at all.
                continue
            candidate = bare.group(1).strip()
            # 'at <path>:<line>:<col>' carries no symbol, only a location.
            looks_like_location = (
                re.search(r":\d+(:\d+)?$", candidate)
                or "/" in candidate
                or "\\" in candidate
            )
            sym = "<anonymous>" if looks_like_location else candidate

        # Strip anything location-like that slipped through: a trailing
        # parenthesised part, a trailing hex address, a trailing :line(:col).
        sym = re.sub(r"\s*\(.*\)\s*$", "", sym)
        sym = re.sub(r"\s*0x[0-9A-Fa-f]+\s*$", "", sym)
        sym = re.sub(r"[:@]\d+(:\d+)?$", "", sym)
        sym = sym.strip() or "<anonymous>"

        out.append(sym)
        if len(out) >= n:
            break
    return out
228
+
229
+
230
def compute_fingerprint(error_class, stack_signature):
    """Return the sha256 hexdigest of error_class, a newline, and the frames.

    The hashed text is error_class + "\\n" + the stack_signature entries
    joined by "\\n". A non-string error_class counts as "", a non-list
    stack_signature counts as no frames, and non-string frames are passed
    through str().

    It is meant to be computed on already-scrubbed values so the client and
    the backend (recomputing from the received whitelisted payload) arrive at
    the same value.
    """
    head = error_class if isinstance(error_class, str) else ""
    frames = stack_signature if isinstance(stack_signature, list) else []
    body = "\n".join(
        item if isinstance(item, str) else str(item) for item in frames
    )

    # Feeding the pieces separately hashes the same bytes as one
    # concatenated string would.
    digest = hashlib.sha256()
    digest.update(head.encode("utf-8"))
    digest.update(b"\n")
    digest.update(body.encode("utf-8"))
    return digest.hexdigest()
242
+
243
+
244
def project_id_hash(git_remote):
    """Non-reversible project id from the git remote, sha256 UNSALTED hexdigest.

    Normalization, so that equivalent spellings of one remote share an id:
      - URL-style (scheme://[userinfo@]host/path): the scheme and any
        "user[:password]@" prefix are dropped and the host is lowercased.
      - scp-style (user@host:path): rewritten to "host/path", host lowercased.
      - anything else is used as-is.
      - a trailing slash and a trailing ".git" are removed.
    As a result "git@github.com:o/r.git", "ssh://git@github.com/o/r.git",
    "https://token@github.com/o/r" and "https://GitHub.com/o/r/" all hash to
    the same value. URL-style is checked first, because a URL that contains
    both "@" and a later ":" (a port, or a ref in the path) would otherwise
    be misread as scp-style.

    If there is no remote (None, a non-string, or blank), the literal
    "no-remote" is hashed.

    Unsalted tradeoff (stated explicitly): unsalted gives cross-user dedup
    (two users hitting the same bug in the same public repo collapse to one
    triage issue, which is the point of the occurrence counter). A per-user
    salt would kill that dedup. Unsalted is dictionary-attackable for known
    public repos, but the project id reveals only "which public repo", which
    is already public, so the privacy cost is acceptable. Private-repo
    origins still hash to an opaque value with no path/name leakage.
    """
    if not isinstance(git_remote, str) or not git_remote.strip():
        return hashlib.sha256(b"no-remote").hexdigest()
    s = git_remote.strip()

    url = re.match(r"^[A-Za-z][A-Za-z0-9+.\-]*://([^/]+)(/.*)?$", s)
    if url:
        # Drop "user[:password]@" so credentials never influence the id.
        host = url.group(1).rsplit("@", 1)[-1].lower()
        norm = host + (url.group(2) or "")
    else:
        # scp-style git@host:path -> host/path
        scp = re.match(r"^[^@]+@([^:]+):(.+)$", s)
        if scp:
            norm = scp.group(1).lower() + "/" + scp.group(2)
        else:
            norm = s

    norm = norm.rstrip("/")
    if norm.endswith(".git"):
        norm = norm[:-4]
    # A slash may have been hiding in front of ".git".
    norm = norm.rstrip("/")
    return hashlib.sha256(norm.encode("utf-8")).hexdigest()
281
+
282
+
283
+ # Allowed friction_kind values. Anything else is dropped to None so a misused
284
+ # caller cannot smuggle free text through this whitelisted field.
285
+ _FRICTION_KINDS = ("retry_loop", "rate_limit_loop", "gate_failure")
286
+
287
+ # Allowed RARV phase labels (uppercase), verified from autonomy/run.sh
288
+ # get_rarv_phase_name and its callers. The literal lowercase "iteration"
289
+ # default (run.sh:9437) is handled separately so it can be kept lowercase.
290
+ _RARV_PHASES = frozenset(
291
+ ("REASON", "ACT", "REFLECT", "VERIFY", "UNKNOWN", "CONVERGE", "CLOSE")
292
+ )
293
+
294
+ # Characters allowed in a sanitized short token (identifier-like + a few
295
+ # separators common in error class / phase names, e.g. "errno.ENOENT",
296
+ # "Foo:Bar", "REVIEW").
297
+ _TOKEN_ALLOWED = re.compile(r"[^A-Za-z0-9_.:\-]")
298
+
299
+
300
def _sanitize_token(value, max_len):
    """Reduce a string to a strict short token, or "" if nothing survives.

    Only the leading whitespace-delimited word is considered. Every character
    outside [A-Za-z0-9_.:-] is removed from it and the result is cut to
    max_len characters. Non-string input gives "", which lets each caller
    choose its own fallback.

    This is the shared core for hardening whitelisted fields that could
    otherwise hold free text: whatever comes out cannot be an arbitrary
    message or a secret containing whitespace or punctuation, no matter how
    short the input was.
    """
    if not isinstance(value, str):
        return ""
    words = value.split(maxsplit=1)
    if not words:
        return ""
    return _TOKEN_ALLOWED.sub("", words[0])[:max_len]
316
+
317
+
318
def sanitize_error_class(value):
    """Reduce error_class to a strict class-name shape.

    error_class is a whitelisted field that could carry free text, so it is
    hardened on its own rather than relying on the regex scrubber: only the
    leading token is kept, restricted to [A-Za-z0-9_.:-] and cut to 64
    characters. When nothing survives, "UnknownError" is returned.
    """
    return _sanitize_token(value, 64) or "UnknownError"
329
+
330
+
331
def sanitize_rarv_phase(value):
    """Allowlist rarv_phase to the known RARV phase set, else None.

    rarv_phase is a closed enum, so it is allowlisted rather than
    shape-sanitized: a leading-token sanitize would still let through a token
    that is itself a short secret, whereas an allowlist leaves no room for
    free text at all.

    Surrounding whitespace is ignored and matching is case-insensitive.
    Known phases are returned uppercase; the one exception is "iteration",
    which is returned lowercase. Non-strings and unknown values give None.
    """
    if not isinstance(value, str):
        return None
    label = value.strip()
    shouted = label.upper()
    if shouted in _RARV_PHASES:
        return shouted
    return "iteration" if label.lower() == "iteration" else None
353
+
354
+
355
def sanitize_exit_code(value):
    """Coerce exit_code to an integer, or None.

    An exit code is always numeric, so the result is int(value) when that
    conversion succeeds and None otherwise. No string content (and therefore
    no secret) can ride in exit_code even if a caller passes a string.

    Args:
        value: anything. bool is rejected even though it is an int subclass.

    Returns:
        An int, or None when value cannot be converted. Never raises for the
        conversion: TypeError, ValueError (e.g. "abc", NaN) and OverflowError
        (float infinity) all map to None.
    """
    if isinstance(value, bool):
        # bool is a subclass of int; treat it as non-numeric here.
        return None
    if isinstance(value, int):
        return value
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")). json.load accepts "Infinity", so
        # such a value can arrive from a caller-supplied JSON document.
        return None
371
+
372
+
373
def sanitize_friction_kind(value):
    """Allowlist friction_kind to the known set, else None.

    Only an exact string member of the known friction kinds is returned.
    Everything else becomes None, so a misused caller cannot place free text
    (and thus a short secret) into this whitelisted field.
    """
    if not isinstance(value, str):
        return None
    return value if value in _FRICTION_KINDS else None
382
+
383
+
384
def _safe_minimal():
    """Build the fail-closed result, which never contains any raw data.

    It consists of a fixed error_class, the current rules version, and a
    zero redaction count.
    """
    fallback = dict(
        error_class="ScrubError",
        rules_version=CRASH_RULES_VERSION,
        redactions_count=0,
    )
    return fallback
391
+
392
+
393
def scrub_and_whitelist(
    raw,
    home=None,
    repo_root=None,
    git_remote=None,
    public_repo=None,
    private_repo=None,
):
    """Scrub a raw crash context and emit a whitelist-only payload.

    Steps:
      1. proof_redact.redact_tree over raw, bracketed by
         set_context/reset_context.
      2. Crash-specific deny rules (emails, IPv4/IPv6, repo names) over every
         surviving string.
      3. Derived fields computed on the REDACTED data: stack_signature (from
         "stack_signature", or "stack" when that is not a list), the
         sanitized error_class, fingerprint and project_id_hash.
      4. Whitelist-only emit. Keys outside the whitelist are dropped.
         Whitelisted keys that are not recomputed here (os, arch, the
         *_version fields, captured_at) are passed through only when they
         hold a scalar; strings are capped at 128 characters. A dict or list
         under such a key is dropped, because a container could otherwise
         carry arbitrary free text into the payload.

    Args:
        raw: the crash context dict. Any other type yields the safe minimal
            result.
        home, repo_root: forwarded to proof_redact.set_context.
        git_remote: used for project_id_hash and as a source of an
            owner/repo deny literal.
        public_repo, private_repo: optional extra repo literals to redact.

    Returns:
        The whitelisted payload dict. Never raises: on any internal error
        the safe minimal dict is returned instead.
    """
    # Longest pass-through string that is emitted unchanged; legitimate
    # platform / version / timestamp strings are far shorter than this.
    max_passthrough_len = 128
    # Whitelisted keys whose emitted value is always rebuilt below, so the
    # caller-supplied value is never used.
    recomputed = frozenset(
        (
            "stack_signature",
            "error_class",
            "rarv_phase",
            "exit_code",
            "friction_kind",
            "fingerprint",
            "project_id_hash",
            "rules_version",
            "redactions_count",
        )
    )
    try:
        if not isinstance(raw, dict):
            return _safe_minimal()

        # 1. proof_redact pass. The context is reset before and after so
        # nothing leaks in from, or out to, another call.
        proof_redact.reset_context()
        try:
            proof_redact.set_context(home=home, repo_root=repo_root)
            redacted, proof_count = proof_redact.redact_tree(raw)
        finally:
            proof_redact.reset_context()

        # 2. crash-specific deny rules on surviving strings.
        repo_literals = [
            lit for lit in (public_repo, private_repo)
            if lit and isinstance(lit, str)
        ]
        owner_repo = _parse_owner_repo(git_remote)
        if owner_repo:
            repo_literals.append(owner_repo)
        redacted, crash_count = _crash_deny_tree(redacted, repo_literals)

        total_redactions = proof_count + crash_count

        # 3. derived fields on REDACTED data. Prefer an explicit list under
        # "stack_signature"; otherwise fall back to "stack".
        frames = redacted.get("stack_signature")
        if not isinstance(frames, list):
            frames = redacted.get("stack")
        stack_signature = normalize_stack(frames, n=STACK_FRAMES_N)

        # error_class is reduced to a strict short token BEFORE it enters the
        # output or the fingerprint, so the fingerprint is computed on
        # exactly the value that is emitted.
        error_class = sanitize_error_class(redacted.get("error_class"))

        fingerprint = compute_fingerprint(error_class, stack_signature)
        pid_hash = project_id_hash(git_remote)

        # 4. whitelist-only emit.
        out = {}
        for key in _WHITELIST:
            if key not in redacted:
                continue
            if key in recomputed:
                # Placeholder that only reserves the key's position in the
                # output; the overlay below always replaces it.
                out[key] = None
                continue
            value = redacted[key]
            if isinstance(value, str):
                out[key] = value[:max_passthrough_len]
            elif value is None or isinstance(value, (bool, int, float)):
                out[key] = value
            # Anything else (dict, list, ...) is dropped.

        out["stack_signature"] = stack_signature
        out["error_class"] = error_class
        # rarv_phase: allowlisted (or None) so it cannot carry free text.
        if "rarv_phase" in redacted:
            out["rarv_phase"] = sanitize_rarv_phase(redacted.get("rarv_phase"))
        # exit_code: coerced to int (or None) so no string can ride in it.
        if "exit_code" in redacted:
            out["exit_code"] = sanitize_exit_code(redacted.get("exit_code"))
        # friction_kind is allowlisted; None when not a known value.
        out["friction_kind"] = sanitize_friction_kind(redacted.get("friction_kind"))
        out["fingerprint"] = fingerprint
        out["project_id_hash"] = pid_hash
        out["rules_version"] = CRASH_RULES_VERSION
        out["redactions_count"] = total_redactions
        return out
    except Exception:
        # Fail closed: never leak raw data through an exception path.
        return _safe_minimal()
484
+
485
+
486
if __name__ == "__main__":
    # Command-line entry used by the bash/TS clients: one JSON document on
    # stdin, the scrubbed JSON on stdout. Unparseable input and input that is
    # not a JSON object both produce the safe minimal payload and exit 0.
    try:
        data = json.load(sys.stdin)
    except Exception:
        data = None
    if not isinstance(data, dict):
        print(json.dumps(_safe_minimal()))
        sys.exit(0)

    # Callers may supply home/repo/remote context in a "_ctx" sidecar object
    # instead of argv. It is removed from the data before scrubbing, and it
    # would not be emitted anyway because it is not whitelisted.
    ctx = {}
    if isinstance(data.get("_ctx"), dict):
        ctx = data.pop("_ctx", {})

    context_names = (
        "home",
        "repo_root",
        "git_remote",
        "public_repo",
        "private_repo",
    )
    result = scrub_and_whitelist(
        data, **{name: ctx.get(name) for name in context_names}
    )
    print(json.dumps(result))