@simbimbo/memory-ocmemog 0.1.10 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. package/CHANGELOG.md +30 -0
  2. package/README.md +85 -18
  3. package/brain/runtime/__init__.py +2 -12
  4. package/brain/runtime/config.py +1 -24
  5. package/brain/runtime/inference.py +1 -151
  6. package/brain/runtime/instrumentation.py +1 -15
  7. package/brain/runtime/memory/__init__.py +3 -13
  8. package/brain/runtime/memory/api.py +1 -1219
  9. package/brain/runtime/memory/candidate.py +1 -185
  10. package/brain/runtime/memory/conversation_state.py +1 -1823
  11. package/brain/runtime/memory/distill.py +1 -344
  12. package/brain/runtime/memory/embedding_engine.py +1 -92
  13. package/brain/runtime/memory/freshness.py +1 -112
  14. package/brain/runtime/memory/health.py +1 -40
  15. package/brain/runtime/memory/integrity.py +1 -186
  16. package/brain/runtime/memory/memory_consolidation.py +1 -58
  17. package/brain/runtime/memory/memory_links.py +1 -107
  18. package/brain/runtime/memory/memory_salience.py +1 -233
  19. package/brain/runtime/memory/memory_synthesis.py +1 -31
  20. package/brain/runtime/memory/memory_taxonomy.py +1 -33
  21. package/brain/runtime/memory/pondering_engine.py +1 -654
  22. package/brain/runtime/memory/promote.py +1 -277
  23. package/brain/runtime/memory/provenance.py +1 -406
  24. package/brain/runtime/memory/reinforcement.py +1 -71
  25. package/brain/runtime/memory/retrieval.py +1 -210
  26. package/brain/runtime/memory/semantic_search.py +1 -64
  27. package/brain/runtime/memory/store.py +1 -429
  28. package/brain/runtime/memory/unresolved_state.py +1 -91
  29. package/brain/runtime/memory/vector_index.py +1 -323
  30. package/brain/runtime/model_roles.py +1 -9
  31. package/brain/runtime/model_router.py +1 -22
  32. package/brain/runtime/providers.py +1 -66
  33. package/brain/runtime/security/redaction.py +1 -12
  34. package/brain/runtime/state_store.py +1 -23
  35. package/brain/runtime/storage_paths.py +1 -39
  36. package/docs/architecture/memory.md +20 -24
  37. package/docs/release-checklist.md +19 -6
  38. package/docs/usage.md +33 -17
  39. package/index.ts +8 -1
  40. package/ocmemog/__init__.py +11 -0
  41. package/ocmemog/doctor.py +1255 -0
  42. package/ocmemog/runtime/__init__.py +18 -0
  43. package/ocmemog/runtime/_compat_bridge.py +28 -0
  44. package/ocmemog/runtime/config.py +35 -0
  45. package/ocmemog/runtime/identity.py +115 -0
  46. package/ocmemog/runtime/inference.py +164 -0
  47. package/ocmemog/runtime/instrumentation.py +20 -0
  48. package/ocmemog/runtime/memory/__init__.py +91 -0
  49. package/ocmemog/runtime/memory/api.py +1431 -0
  50. package/ocmemog/runtime/memory/candidate.py +192 -0
  51. package/ocmemog/runtime/memory/conversation_state.py +1831 -0
  52. package/ocmemog/runtime/memory/distill.py +282 -0
  53. package/ocmemog/runtime/memory/embedding_engine.py +151 -0
  54. package/ocmemog/runtime/memory/freshness.py +114 -0
  55. package/ocmemog/runtime/memory/health.py +57 -0
  56. package/ocmemog/runtime/memory/integrity.py +208 -0
  57. package/ocmemog/runtime/memory/memory_consolidation.py +60 -0
  58. package/ocmemog/runtime/memory/memory_links.py +109 -0
  59. package/ocmemog/runtime/memory/memory_salience.py +235 -0
  60. package/ocmemog/runtime/memory/memory_synthesis.py +33 -0
  61. package/ocmemog/runtime/memory/memory_taxonomy.py +35 -0
  62. package/ocmemog/runtime/memory/pondering_engine.py +681 -0
  63. package/ocmemog/runtime/memory/promote.py +279 -0
  64. package/ocmemog/runtime/memory/provenance.py +408 -0
  65. package/ocmemog/runtime/memory/reinforcement.py +73 -0
  66. package/ocmemog/runtime/memory/retrieval.py +224 -0
  67. package/ocmemog/runtime/memory/semantic_search.py +66 -0
  68. package/ocmemog/runtime/memory/store.py +433 -0
  69. package/ocmemog/runtime/memory/unresolved_state.py +93 -0
  70. package/ocmemog/runtime/memory/vector_index.py +411 -0
  71. package/ocmemog/runtime/model_roles.py +16 -0
  72. package/ocmemog/runtime/model_router.py +29 -0
  73. package/ocmemog/runtime/providers.py +79 -0
  74. package/ocmemog/runtime/roles.py +92 -0
  75. package/ocmemog/runtime/security/__init__.py +8 -0
  76. package/ocmemog/runtime/security/redaction.py +17 -0
  77. package/ocmemog/runtime/state_store.py +34 -0
  78. package/ocmemog/runtime/storage_paths.py +70 -0
  79. package/ocmemog/sidecar/app.py +311 -23
  80. package/ocmemog/sidecar/compat.py +50 -13
  81. package/ocmemog/sidecar/transcript_watcher.py +391 -190
  82. package/openclaw.plugin.json +4 -0
  83. package/package.json +1 -1
  84. package/scripts/ocmemog-backfill-vectors.py +5 -3
  85. package/scripts/ocmemog-continuity-benchmark.py +1 -1
  86. package/scripts/ocmemog-demo.py +1 -1
  87. package/scripts/ocmemog-doctor.py +15 -0
  88. package/scripts/ocmemog-install.sh +29 -7
  89. package/scripts/ocmemog-integrated-proof.py +373 -0
  90. package/scripts/ocmemog-reindex-vectors.py +5 -3
  91. package/scripts/ocmemog-release-check.sh +330 -0
  92. package/scripts/ocmemog-sidecar.sh +4 -2
  93. package/scripts/ocmemog-test-rig.py +5 -3
  94. package/brain/runtime/memory/artifacts.py +0 -33
  95. package/brain/runtime/memory/context_builder.py +0 -112
  96. package/brain/runtime/memory/interaction_memory.py +0 -57
  97. package/brain/runtime/memory/memory_gate.py +0 -38
  98. package/brain/runtime/memory/memory_graph.py +0 -54
  99. package/brain/runtime/memory/person_identity.py +0 -83
  100. package/brain/runtime/memory/person_memory.py +0 -138
  101. package/brain/runtime/memory/sentiment_memory.py +0 -67
  102. package/brain/runtime/memory/tool_catalog.py +0 -68
@@ -0,0 +1,1255 @@
1
+ """Operator-facing diagnostics command for ocmemog runtime and sidecar state."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import importlib
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+ from collections.abc import Iterable
11
+ from dataclasses import dataclass, asdict
12
+ from typing import Any, Callable
13
+ from urllib.request import Request, urlopen
14
+ from urllib.error import HTTPError
15
+ import contextlib
16
+
17
+ from ocmemog.runtime import state_store
18
+ from ocmemog.runtime.memory import embedding_engine, health, store
19
+ from ocmemog.sidecar import compat as sidecar_compat
20
+
21
+
22
@dataclass(frozen=True)
class FixResult:
    """Immutable outcome record for a single doctor fix action.

    The field meanings below are inferred from the field names only;
    the code that produces FixResult instances is not visible here.
    """

    # NOTE(review): presumably the fix action name (e.g. "repair-queue") - confirm.
    action: str
    # NOTE(review): presumably the key of the check this fix belongs to - confirm.
    check_key: str
    # Human-readable summary of what happened.
    message: str
    # NOTE(review): presumably the number of items the fix modified - confirm.
    changed: int
    # Whether the fix completed successfully.
    ok: bool
29
+
30
+
31
@dataclass(frozen=True)
class CheckResult:
    """Immutable outcome record for a single doctor check.

    Checks in this module build one of these with a status of "ok",
    "warn" or "fail", a one-line message, and a free-form details dict.
    """

    # Stable identifier such as "runtime/imports" or "queue/health".
    key: str
    # Short human-readable name of the check.
    label: str
    # One of "ok", "warn", "fail" (the values the checks in this module emit).
    status: str
    # One-line summary of the finding.
    message: str
    # Structured diagnostic payload; shape varies per check.
    details: dict[str, Any]
    # True when a fix action is associated with this check.
    fixable: bool = False
    # NOTE(review): presumably set once a fix has been applied - confirm.
    fixed: bool = False
    # Name of the associated fix action (e.g. "create-missing-paths"), if any.
    fix_action: str | None = None
    # NOTE(review): presumably details reported by the fix run - confirm.
    fix_details: dict[str, Any] | None = None
42
+
43
+
44
@dataclass(frozen=True)
class DoctorCheck:
    """Immutable descriptor pairing a check callable with an optional fix.

    Both callables take a single ignored positional argument, matching
    the `_run_*(_: None)` functions defined in this module.
    """

    # Stable identifier of the check.
    key: str
    # Short human-readable name of the check.
    label: str
    # Callable that runs the check and returns its CheckResult.
    check: Callable[[None], CheckResult]
    # NOTE(review): presumably the fix action name used to select `fix` - confirm.
    fix_key: str | None = None
    # Optional callable that applies the fix and returns a FixResult.
    fix: Callable[[None], FixResult] | None = None
51
+
52
+
53
# Numeric rank per check status; "fail" ranks highest and "ok" lowest.
# NOTE(review): presumably used to pick the worst status across checks -
# the consumer is not visible in this part of the file.
_STATUS_PRECEDENCE = {"fail": 2, "warn": 1, "ok": 0}


# Environment variables that the sidecar toggle check validates as booleans
# whenever they are explicitly set.
_ENV_TOGGLE_KEYS = (
    "OCMEMOG_TRANSCRIPT_WATCHER",
    "OCMEMOG_AUTO_HYDRATION",
    "OCMEMOG_INGEST_ASYNC_WORKER",
    "OCMEMOG_SHUTDOWN_DRAIN_QUEUE",
    "OCMEMOG_SHUTDOWN_TIMING",
    "OCMEMOG_SHUTDOWN_DUMP_THREADS",
    "OCMEMOG_USE_OLLAMA",
    "OCMEMOG_REINFORCE_SENTIMENT",
)
# Tables for which a missing schema_version column is not reported as an
# issue by the SQLite schema check.
_SCHEMA_VERSION_NON_STANDARD_TABLES = {"artifacts", "vector_embeddings"}
# NOTE(review): presumably the timeout for the sidecar HTTP probes - the
# consumer is not visible in this part of the file.
_HTTP_TIMEOUT_SECONDS = 2.0
68
+
69
+
70
def _queue_backlog_severity(depth: int) -> str:
    """Map an ingest queue depth onto a severity label.

    Returns "none" for depth <= 0, "low" up to 25, "medium" up to 250,
    "high" up to 1000, and "critical" above that.
    """
    tiers = ((0, "none"), (25, "low"), (250, "medium"), (1000, "high"))
    for ceiling, label in tiers:
        if depth <= ceiling:
            return label
    return "critical"
80
+
81
+
82
def _vector_backlog_severity(missing: int) -> str:
    """Map a count of rows lacking vector embeddings onto a severity label.

    Returns "none" for missing <= 0, "low" up to 200, "medium" up to 2000,
    "high" up to 10000, and "critical" above that.
    """
    ceilings = (0, 200, 2000, 10000)
    labels = ("none", "low", "medium", "high")
    for ceiling, label in zip(ceilings, labels):
        if missing <= ceiling:
            return label
    return "critical"
92
+
93
+
94
def _parse_float_env(name: str, default: float, *, minimum: float | None = None) -> tuple[float, str | None]:
    """Read a float from environment variable *name*.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset, empty, or invalid.
        minimum: Optional inclusive lower bound.

    Returns:
        A ``(value, issue)`` pair. ``issue`` is ``None`` when the variable is
        unset/empty or holds a valid value; otherwise it is a short message
        describing the problem and ``value`` is ``default``.
    """
    import math  # local import so this block does not depend on file-level imports

    raw = os.environ.get(name)
    if raw is None or raw == "":
        return default, None
    try:
        value = float(raw)
    except ValueError:
        return default, f"{name} must be numeric"
    # float() accepts "nan"/"inf"; NaN also compares False against any
    # minimum, so without this guard it would be returned as valid config.
    if not math.isfinite(value):
        return default, f"{name} must be a finite number"
    if minimum is not None and value < minimum:
        return default, f"{name} must be >= {minimum}"
    return value, None
105
+
106
+
107
def _parse_int_env(name: str, default: int, *, minimum: int | None = None) -> tuple[int, str | None]:
    """Read an integer from environment variable *name*.

    Returns a ``(value, issue)`` pair: ``issue`` is ``None`` when the
    variable is unset/empty or valid; otherwise ``value`` is ``default``
    and ``issue`` describes the problem.
    """
    text = os.environ.get(name)
    if not text:  # unset or empty string
        return default, None

    try:
        parsed = int(text)
    except ValueError:  # the only error int() raises for a str argument
        return default, f"{name} must be integer"

    below_minimum = minimum is not None and parsed < minimum
    if below_minimum:
        return default, f"{name} must be >= {minimum}"
    return parsed, None
118
+
119
+
120
def _parse_bool_env(name: str, default: bool = False) -> tuple[bool, str | None]:
    """Read a boolean from environment variable *name*.

    Accepts 1/true/yes/on/y/t and 0/false/no/off/n/f, case-insensitively
    and ignoring surrounding whitespace. Returns ``(value, issue)``;
    an unset or empty variable yields ``(default, None)`` and an
    unrecognized value yields ``(default, "<name> must be a boolean value")``.
    """
    truthy = {"1", "true", "yes", "on", "y", "t"}
    falsy = {"0", "false", "no", "off", "n", "f"}

    text = os.environ.get(name)
    if not text:  # unset or empty string
        return default, None

    token = text.strip().lower()
    if token in truthy:
        return True, None
    if token in falsy:
        return False, None
    return default, f"{name} must be a boolean value"
130
+
131
+
132
def _queue_status_to_icon(status: str) -> str:
    """Translate a check status into its display tag.

    "fail" maps to "FAIL", "warn" to "WARN", and anything else to "PASS".
    """
    tags = {"fail": "FAIL", "warn": "WARN"}
    return tags.get(status, "PASS")
138
+
139
+
140
def _normalize_fixes(raw: Iterable[str] | None) -> list[str]:
    """Flatten fix-action arguments into a sorted list of unique names.

    Each item may itself be a comma-separated list; empty items and
    blank fragments are ignored and surrounding whitespace is stripped.
    """
    if not raw:
        return []
    fragments = (
        piece.strip()
        for entry in raw
        if entry
        for piece in entry.split(",")
    )
    return sorted({fragment for fragment in fragments if fragment})
152
+
153
+
154
@contextlib.contextmanager
def _scoped_state_dir(state_dir: str | None):
    """Temporarily point OCMEMOG_STATE_DIR at *state_dir*.

    With a falsy *state_dir* the environment is left untouched. Otherwise
    the variable is set for the duration of the ``with`` body and then
    restored to its previous value (or removed if it was previously unset).
    """
    if not state_dir:
        yield
        return

    env_key = "OCMEMOG_STATE_DIR"
    saved = os.environ.get(env_key)
    os.environ[env_key] = state_dir
    try:
        yield
    finally:
        if saved is not None:
            os.environ[env_key] = saved
        else:
            os.environ.pop(env_key, None)
168
+
169
+
170
def _run_imports(_: None) -> CheckResult:
    """Check that every required runtime and sidecar module can be imported.

    Returns a "fail" CheckResult listing the per-module import errors, or
    an "ok" CheckResult when all modules import.
    """
    module_names = (
        "ocmemog.runtime",
        "ocmemog.runtime.config",
        "ocmemog.runtime.memory",
        "ocmemog.runtime.memory.store",
        "ocmemog.runtime.memory.health",
        "ocmemog.runtime.memory.integrity",
        "ocmemog.runtime.memory.vector_index",
        "ocmemog.runtime.inference",
        "ocmemog.runtime.providers",
        "ocmemog.runtime.memory.embedding_engine",
        "ocmemog.sidecar.compat",
    )

    failures: list[str] = []
    for name in module_names:
        try:
            importlib.import_module(name)
        except Exception as exc:  # an import can raise anything; report, don't crash
            failures.append(f"{name}: {exc}")

    details: dict[str, Any] = {"tested": list(module_names)}
    if not failures:
        return CheckResult(
            key="runtime/imports",
            label="runtime module imports",
            status="ok",
            message="All runtime modules imported.",
            details=details,
        )

    details["errors"] = failures
    return CheckResult(
        key="runtime/imports",
        label="runtime module imports",
        status="fail",
        message="Some required modules failed to import.",
        details=details,
    )
210
+
211
+
212
def _run_state_paths(_: None) -> CheckResult:
    """Check that the state directories exist and are writable.

    For each of the root, data, memory and reports directories reported by
    ``state_store``, the directory is created if needed and a small probe
    file is written and removed. Any exception marks that directory as
    failed. The result is always flagged fixable via "create-missing-paths".
    """
    directories = [
        state_store.root_dir(),
        state_store.data_dir(),
        state_store.memory_dir(),
        state_store.reports_dir(),
    ]

    checked: list[str] = []
    problems: list[str] = []
    for directory in directories:
        checked.append(str(directory))
        try:
            directory.mkdir(parents=True, exist_ok=True)
            marker = directory / ".ocmemog_doctor_probe"
            marker.write_text("ok", encoding="utf-8")
            marker.unlink()
        except Exception as exc:
            problems.append(f"{directory}: {exc}")

    details: dict[str, Any] = {"tested": checked}
    if problems:
        details["failed"] = problems
        status = "fail"
        message = "State directories are not fully writable."
    else:
        status = "ok"
        message = "State directories exist and are writable."

    return CheckResult(
        key="state/path-writable",
        label="state path writability",
        status=status,
        message=message,
        details=details,
        fixable=True,
        fix_action="create-missing-paths",
    )
245
+
246
+
247
def _run_sqlite_schema(_: None) -> CheckResult:
    """Check that the SQLite store opens and has the expected schema.

    The check initializes the store, then verifies that every required
    table exists, runs ``PRAGMA quick_check(1)``, collects per-table row
    counts, and groups rows by ``schema_version`` where that column exists.

    Returns:
        A CheckResult keyed "sqlite/schema-access" with status
        "fail" (database unusable, tables missing, or quick check not "ok"),
        "warn" (row-count/schema queries failed, a schema_version column is
        missing, or unexpected version values were found), or "ok".
    """
    key = "sqlite/schema-access"
    label = "sqlite and schema"

    required = {
        "memory_events",
        "environment_cognition",
        "experiences",
        "directives",
        "candidates",
        "promotions",
        "demotions",
        "cold_storage",
        "memory_index",
        "vector_embeddings",
        "artifacts",
        "knowledge",
        "preferences",
        "identity",
        "runbooks",
        "lessons",
        "reflections",
        "tasks",
        "conversation_turns",
        "conversation_checkpoints",
        "conversation_state",
    } | set(store.MEMORY_TABLES)

    counts: dict[str, int] = {table: 0 for table in required}
    version_map: dict[str, dict[str, int]] = {}
    version_issues: list[str] = []
    try:
        store.init_db()
        conn = store.connect()
        try:
            tables = {row[0] for row in conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()}
            missing = sorted(required - tables)
            quick = str(conn.execute("PRAGMA quick_check(1)").fetchone()[0] or "unknown")
            # Only tables that actually exist can be counted or inspected.
            for table in sorted(required & tables):
                try:
                    counts[table] = int(conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0] or 0)
                except Exception as exc:
                    version_issues.append(f"{table} row count query failed: {exc}")

                try:
                    columns = {row[1] for row in conn.execute(f"PRAGMA table_info({table})").fetchall()}
                    if "schema_version" not in columns:
                        if table not in _SCHEMA_VERSION_NON_STANDARD_TABLES:
                            version_issues.append(f"{table} missing schema_version column")
                        continue
                    rows = conn.execute(
                        f"SELECT COALESCE(schema_version, '<null>') AS schema_version, COUNT(*) AS count "
                        f"FROM {table} GROUP BY COALESCE(schema_version, '<null>')"
                    ).fetchall()
                    version_map[table] = {str(item[0]): int(item[1]) for item in rows}
                except Exception as exc:
                    version_issues.append(f"{table} schema query failed: {exc}")
        finally:
            conn.close()
    except Exception as exc:
        return CheckResult(
            key=key,
            label=label,
            status="fail",
            message=f"SQLite schema check failed: {exc}",
            details={"error": str(exc)},
        )

    # version_map keys were stringified above, so the expected version must
    # be stringified too; comparing against a non-string constant would
    # otherwise flag every row as unexpected.
    expected_version = str(store.SCHEMA_VERSION)
    for table, versions in version_map.items():
        # NOTE(review): these two tables are exempt from the version
        # comparison; the reason is not visible in this file - confirm.
        if table in ("memory_events", "environment_cognition"):
            continue
        unexpected = sorted(item for item in versions if item != expected_version)
        if unexpected:
            version_issues.append(f"{table} has unexpected schema_version value(s): {', '.join(unexpected)}")

    details = {
        "required_tables": sorted(required),
        "missing_tables": missing,
        "sqlite_quick_check": quick,
        "row_counts": {name: counts[name] for name in sorted(counts)},
        "schema_version_expected": store.SCHEMA_VERSION,
        "schema_versions": version_map,
        "schema_version_issues": version_issues,
    }

    if missing:
        status = "fail"
        message = "One or more expected schema tables are missing."
    elif quick.lower() != "ok":
        status = "fail"
        message = "SQLite quick check failed."
    elif version_issues:
        status = "warn"
        message = "Schema metadata includes unexpected versions or schema column issues."
    else:
        status = "ok"
        message = "SQLite schema and DB open state are healthy."

    return CheckResult(key=key, label=label, status=status, message=message, details=details)
371
+
372
+
373
def _import_sidecar_app():
    """Import and return the ``ocmemog.sidecar.app`` module.

    The import happens at call time rather than at module load, so each
    caller can catch an import failure itself.
    """
    sidecar_app = importlib.import_module("ocmemog.sidecar.app")
    return sidecar_app
375
+
376
+
377
def _run_queue_health(_: None) -> CheckResult:
    """Inspect the sidecar ingest queue file and async worker configuration.

    The check reads the queue path, depth and stats from the sidecar app
    module, validates the worker poll/batch environment settings, and scans
    the queue file line by line to count entries that are not valid JSON.

    Returns:
        A CheckResult keyed "queue/health". Status is "fail" when the
        sidecar app cannot be imported or the inspection raises, "warn"
        when any problem message was produced, and "ok" otherwise. The
        result is always flagged fixable via "repair-queue".
    """
    key = "queue/health"
    label = "queue health"

    try:
        app = _import_sidecar_app()
    except Exception as exc:
        return CheckResult(
            key=key,
            label=label,
            status="fail",
            message=f"Failed to import sidecar app for queue checks: {exc}",
            details={"error": str(exc)},
            fixable=True,
            fix_action="repair-queue",
        )

    try:
        queue_path = app._queue_path()
        depth = app._queue_depth()
        stats = dict(app.QUEUE_STATS)
        # NOTE(review): stat() raises if the queue file does not exist, which
        # surfaces as a "fail" below - confirm _queue_path() guarantees it.
        queue_size = queue_path.stat().st_size
        # NOTE(review): assumed to return a plain bool, unlike
        # app._parse_bool_env_value which returns (parsed, valid) - confirm.
        worker_enabled = app._parse_bool_env("OCMEMOG_INGEST_ASYNC_WORKER", default=True)

        queue_config: list[str] = []
        worker_poll_seconds = None
        worker_batch_max = None
        try:
            worker_poll_seconds = float(os.environ.get("OCMEMOG_INGEST_ASYNC_POLL_SECONDS", "5"))
        except ValueError:
            queue_config.append("OCMEMOG_INGEST_ASYNC_POLL_SECONDS")
        else:
            if worker_poll_seconds < 0:
                queue_config.append("OCMEMOG_INGEST_ASYNC_POLL_SECONDS must be >= 0")
        try:
            worker_batch_max = int(os.environ.get("OCMEMOG_INGEST_ASYNC_BATCH_MAX", "25"))
        except ValueError:
            queue_config.append("OCMEMOG_INGEST_ASYNC_BATCH_MAX")
        else:
            if worker_batch_max < 1:
                queue_config.append("OCMEMOG_INGEST_ASYNC_BATCH_MAX must be >= 1")

        invalid = 0
        total = 0
        invalid_samples: list[dict[str, Any]] = []
        # Iterate the file instead of read_text().splitlines(): this avoids
        # loading a large queue into memory, and splitlines() also breaks on
        # U+2028/U+2029/VT/FF, which may occur inside a JSON string and would
        # split one valid record into several "invalid" ones.
        with queue_path.open(encoding="utf-8") as handle:
            for raw_line in handle:
                line = raw_line.strip()
                if not line:
                    continue
                total += 1
                try:
                    json.loads(line)
                except (ValueError, RecursionError):
                    invalid += 1
                    if len(invalid_samples) < 3:
                        # line_no counts non-blank lines only, not raw file lines.
                        invalid_samples.append({"line_no": total, "line": line[:160]})

        backlog_severity = _queue_backlog_severity(depth)
        worker_thread = app._INGEST_WORKER_THREAD

        messages: list[str] = []
        if invalid:
            messages.append(f"Queue has {invalid} invalid line(s).")
        if depth > 25:
            messages.append(f"Queue backlog is elevated ({depth}).")
        if queue_config:
            messages.append("Queue config has invalid values: " + ", ".join(sorted(set(queue_config))))
        if depth > 0 and not worker_enabled and not queue_config:
            messages.append("Ingest worker is disabled but queue has pending entries.")
        if depth > 0 and worker_enabled and worker_thread is not None and not worker_thread.is_alive():
            messages.append("Ingest worker thread exists but is not currently alive.")

        # Every warning condition above contributes a message, so the status
        # follows directly from whether any message was produced.
        status = "warn" if messages else "ok"
        message = "; ".join(messages) if messages else "Queue state is healthy."

        hints: list[str] = []
        if invalid > 0:
            hints.append("Run --fix repair-queue to drop invalid queue entries.")
        if depth > 0 and not worker_enabled:
            hints.append("Enable OCMEMOG_INGEST_ASYNC_WORKER or flush with POST /memory/ingest_flush.")
        if depth > 1000:
            hints.append("Queue depth is very high; inspect upstream ingest failures and sidecar reachability.")
        if not queue_config and worker_batch_max and worker_batch_max > 40:
            hints.append("Ingest batch size is large; reduce OCMEMOG_INGEST_ASYNC_BATCH_MAX if queue consumers lag.")
    except Exception as exc:
        return CheckResult(
            key=key,
            label=label,
            status="fail",
            message=f"Queue health check failed: {exc}",
            details={"error": str(exc)},
            fixable=True,
            fix_action="repair-queue",
        )

    return CheckResult(
        key=key,
        label=label,
        status=status,
        message=message,
        details={
            "queue_depth": depth,
            "queue_path": str(queue_path),
            "invalid_lines": invalid,
            "lines_seen": total,
            "stats": stats,
            "queue_bytes": queue_size,
            "queue_worker_enabled": worker_enabled,
            "queue_worker_poll_seconds": worker_poll_seconds,
            "queue_worker_batch_max": worker_batch_max,
            "queue_config_issues": queue_config,
            "invalid_payload_samples": invalid_samples,
            "ingest_worker_running": bool(app._INGEST_WORKER_THREAD and app._INGEST_WORKER_THREAD.is_alive()),
            "queue_backlog_severity": backlog_severity,
            "queue_hints": hints,
        },
        fixable=True,
        fix_action="repair-queue",
    )
497
+
498
+
499
def _run_transcript_watcher_sanity(_: None) -> CheckResult:
    """Validate the transcript watcher's environment configuration.

    Returns:
        A CheckResult keyed "sidecar/transcript-watcher". Status is "fail"
        when the sidecar app cannot be imported, "warn" when the enable
        toggle is not valid boolean syntax or any setting has an issue, and
        "ok" when the watcher is disabled or its configuration is clean.
    """
    key = "sidecar/transcript-watcher"
    label = "sidecar transcript watcher"

    try:
        app = _import_sidecar_app()
    except Exception as exc:
        return CheckResult(
            key=key,
            label=label,
            status="fail",
            message=f"Failed to import sidecar app for transcript watcher checks: {exc}",
            details={"error": str(exc)},
        )

    enabled, valid_toggle = app._parse_bool_env_value(os.environ.get("OCMEMOG_TRANSCRIPT_WATCHER"), default=False)
    issues: list[str] = []
    hints: list[str] = []
    config: dict[str, Any] = {
        "enabled": enabled,
        "watcher_thread_running": bool(app._WATCHER_THREAD and app._WATCHER_THREAD.is_alive()),
    }

    if not valid_toggle:
        config["watcher_toggle_parse_valid"] = False
        return CheckResult(
            key=key,
            label=label,
            status="warn",
            message="Transcript watcher env toggle is not valid boolean syntax.",
            details={"config": config, "issues": ["OCMEMOG_TRANSCRIPT_WATCHER must be a boolean value"], "hints": []},
        )

    if not enabled:
        return CheckResult(
            key=key,
            label=label,
            status="ok",
            message="Transcript watcher is disabled.",
            details={"enabled": False, "issues": [], "hints": [], "config": config},
        )

    transcript_path = os.environ.get("OCMEMOG_TRANSCRIPT_PATH", "").strip()
    transcript_dir = os.environ.get("OCMEMOG_TRANSCRIPT_DIR", "").strip()
    session_dir = os.environ.get("OCMEMOG_SESSION_DIR", "").strip()

    # Hints state the same bound (>= 1) that the parsers enforce.
    poll_seconds, issue = _parse_float_env("OCMEMOG_TRANSCRIPT_POLL_SECONDS", 30.0, minimum=1)
    if issue:
        issues.append(issue)
        hints.append("Set OCMEMOG_TRANSCRIPT_POLL_SECONDS to a number >= 1.")
    batch_seconds, issue = _parse_float_env("OCMEMOG_INGEST_BATCH_SECONDS", 30.0, minimum=1)
    if issue:
        issues.append(issue)
        hints.append("Set OCMEMOG_INGEST_BATCH_SECONDS to a number >= 1.")
    batch_max, issue = _parse_int_env("OCMEMOG_INGEST_BATCH_MAX", 25, minimum=1)
    if issue:
        issues.append(issue)
        hints.append("Set OCMEMOG_INGEST_BATCH_MAX to an integer >= 1.")
    reinforce_enabled, issue = _parse_bool_env("OCMEMOG_REINFORCE_SENTIMENT", True)
    if issue:
        issues.append(issue)
        hints.append("Set OCMEMOG_REINFORCE_SENTIMENT to true/false.")

    config.update(
        {
            "transcript_path": transcript_path or None,
            "transcript_dir": transcript_dir or None,
            "session_dir": session_dir or None,
            "transcript_glob": os.environ.get("OCMEMOG_TRANSCRIPT_GLOB", "*.log"),
            "session_glob": os.environ.get("OCMEMOG_SESSION_GLOB", "*.jsonl"),
            "batch_seconds": batch_seconds,
            "batch_max": batch_max,
            "poll_seconds": poll_seconds,
            "start_at_end": os.environ.get("OCMEMOG_TRANSCRIPT_START_AT_END", "true"),
            "watcher_toggle_parse_valid": True,
            "reinforce_sentiment": reinforce_enabled,
        }
    )

    for raw_value in (transcript_path, transcript_dir, session_dir):
        if not raw_value:
            continue
        target = Path(raw_value).expanduser().resolve()
        if not target.exists():
            hints.append(f"Configured path '{target}' does not currently exist; watcher will create as needed.")
        elif target.is_file() and target.suffix == "":
            # NOTE(review): also applied to OCMEMOG_TRANSCRIPT_PATH, so a
            # suffix-less transcript file is reported - confirm intended.
            issues.append(f"Configured path '{target}' looks like a directory but is a file path.")

    ingest_endpoint = os.environ.get("OCMEMOG_INGEST_ENDPOINT", "http://127.0.0.1:17891/memory/ingest_async")
    turn_ingest_endpoint = os.environ.get("OCMEMOG_TURN_INGEST_ENDPOINT", "")
    config["ingest_endpoint"] = ingest_endpoint
    config["turn_ingest_endpoint"] = turn_ingest_endpoint or ingest_endpoint.replace(
        "/memory/ingest_async", "/conversation/ingest_turn"
    )
    # Require a real http:// or https:// scheme; a bare "http" prefix check
    # would also accept values such as "httpx://host" or "http-proxy".
    if not config["turn_ingest_endpoint"].lower().startswith(("http://", "https://")):
        issues.append("OCMEMOG_TURN_INGEST_ENDPOINT must be an absolute HTTP(S) URL when overridden.")

    if issues:
        status = "warn"
        message = "Transcript watcher config has issues."
    else:
        status = "ok"
        message = "Transcript watcher config is healthy."

    return CheckResult(
        key=key,
        label=label,
        status=status,
        message=message,
        details={
            "config": config,
            "issues": issues,
            "hints": hints,
            "enabled": enabled,
            "watcher_running": bool(app._WATCHER_THREAD and app._WATCHER_THREAD.is_alive()),
            "watcher_toggle_parse_valid": valid_toggle,
        },
    )
616
+
617
+
618
def _collect_vector_backlog() -> dict[str, Any]:
    """Count, per memory table, rows that have no vector embedding yet.

    For every table in ``store.MEMORY_TABLES`` the backlog is the table's
    row count minus the number of ``vector_embeddings`` rows whose
    ``source_type`` equals the table name, floored at zero.

    Returns:
        A dict with "per_table" (table -> missing count), "total_missing",
        "severity" (see ``_vector_backlog_severity``) and "errors" (messages
        for failed initialization or per-table queries; a table whose query
        failed is reported with a backlog of 0).

    Raises:
        Whatever ``store.connect()`` raises; only initialization and
        per-table query failures are handled here.
    """
    query_errors: list[str] = []
    try:
        store.init_db()
    except Exception as exc:
        # Initialization stays best-effort, but the failure is surfaced to
        # the caller instead of being silently discarded.
        query_errors.append(f"init_db: {exc}")

    backlog: dict[str, int] = {}
    total_missing = 0
    conn = store.connect()
    try:
        for table in store.MEMORY_TABLES:
            try:
                total = int(conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0] or 0)
                indexed = int(
                    conn.execute(
                        "SELECT COUNT(*) FROM vector_embeddings WHERE source_type = ?",
                        (table,),
                    ).fetchone()[0]
                    or 0
                )
            except Exception as exc:
                query_errors.append(f"{table}: {exc}")
                backlog[table] = 0
                continue
            missing = max(total - indexed, 0)
            backlog[table] = missing
            total_missing += missing
    finally:
        conn.close()

    return {
        "per_table": backlog,
        "total_missing": total_missing,
        "severity": _vector_backlog_severity(total_missing),
        "errors": query_errors,
    }
652
+
653
+
654
def _run_transcript_root_readability(_: None) -> CheckResult:
    """Check that every allowed transcript root is a readable directory.

    Roots come from the sidecar app's ``_allowed_transcript_roots()``. Each
    one is classified as missing, not a directory, inaccessible (lacking
    read or execute permission), or readable.

    Returns a CheckResult keyed "sidecar/transcript-roots": "fail" when the
    sidecar app cannot be imported or the roots cannot be evaluated, "warn"
    when OCMEMOG_TRANSCRIPT_ROOTS is set but yields no roots or when any
    root is unusable, and "ok" otherwise.
    """
    key = "sidecar/transcript-roots"
    label = "sidecar transcript roots"

    try:
        app = _import_sidecar_app()
    except Exception as exc:
        return CheckResult(
            key=key,
            label=label,
            status="fail",
            message=f"Failed to import sidecar app for transcript-root checks: {exc}",
            details={"error": str(exc)},
        )

    env_roots = os.environ.get("OCMEMOG_TRANSCRIPT_ROOTS")
    buckets: dict[str, list[str]] = {"missing": [], "not_dir": [], "denied": [], "readable": []}
    try:
        roots = app._allowed_transcript_roots()
        root_values = [str(root) for root in roots]
        for root in roots:
            if not root.exists():
                bucket = "missing"
            elif not root.is_dir():
                bucket = "not_dir"
            elif not os.access(str(root), os.R_OK | os.X_OK):
                bucket = "denied"
            else:
                bucket = "readable"
            buckets[bucket].append(str(root))
    except Exception as exc:
        return CheckResult(
            key=key,
            label=label,
            status="fail",
            message=f"Could not evaluate transcript roots: {exc}",
            details={"error": str(exc)},
        )

    unusable = buckets["missing"] + buckets["not_dir"] + buckets["denied"]
    if env_roots is not None and not roots:
        status = "warn"
        message = "OCMEMOG_TRANSCRIPT_ROOTS is set but contains no usable entries."
    elif unusable:
        status = "warn"
        message = "One or more transcript root paths are not usable."
    else:
        status = "ok"
        message = "Transcript root paths are readable."

    return CheckResult(
        key=key,
        label=label,
        status=status,
        message=message,
        details={
            "configured_via_env": env_roots is not None,
            "roots": root_values,
            "readable_roots": buckets["readable"],
            "missing_roots": buckets["missing"],
            "non_directories": buckets["not_dir"],
            "inaccessible_roots": buckets["denied"],
        },
    )
716
+
717
+
718
def _run_sidecar_toggle_sanity(_: None) -> CheckResult:
    """Validate the boolean environment toggles that are explicitly set.

    Every variable in ``_ENV_TOGGLE_KEYS`` that is present in the
    environment is parsed with the sidecar app's ``_parse_bool_env_value``.

    Returns a CheckResult keyed "sidecar/env-toggles": "fail" when the
    sidecar app cannot be imported, "warn" when any toggle fails to parse,
    and "ok" otherwise (including when no toggle is set).
    """
    key = "sidecar/env-toggles"
    label = "sidecar environment toggles"

    try:
        app = _import_sidecar_app()
    except Exception as exc:
        return CheckResult(
            key=key,
            label=label,
            status="fail",
            message=f"Failed to import sidecar app for env toggle checks: {exc}",
            details={"error": str(exc)},
        )

    toggles: dict[str, dict[str, Any]] = {}
    for name in _ENV_TOGGLE_KEYS:
        raw = os.environ.get(name)
        if raw is None:
            continue
        parsed, valid = app._parse_bool_env_value(raw, default=False)
        toggles[name] = {"raw": str(raw), "parsed": parsed, "valid": valid}

    # Dict order follows _ENV_TOGGLE_KEYS, so this matches collection order.
    invalid = [name for name, entry in toggles.items() if not entry["valid"]]

    if invalid:
        status = "warn"
        message = "Invalid boolean env toggle value(s): " + ", ".join(sorted(invalid))
    elif not toggles:
        status = "ok"
        message = "No explicitly configured boolean toggles were found."
    else:
        status = "ok"
        message = "Boolean env toggles are valid."

    return CheckResult(
        key=key,
        label=label,
        status=status,
        message=message,
        details={"toggles": toggles, "invalid": invalid},
    )
761
+
762
+
763
def _run_sidecar_http_auth(_: None) -> CheckResult:
    """Probe the sidecar ``/healthz`` endpoint and sanity-check token auth.

    The endpoint comes from ``OCMEMOG_ENDPOINT`` (default
    ``http://127.0.0.1:17891``) and the token from ``OCMEMOG_API_TOKEN``.

    * With a token: an unauthenticated probe is expected to return 401, and
      at least one authenticated probe (``x-ocmemog-token`` or
      ``Authorization: Bearer``) is expected to return 200. When the
      unauthenticated probe gets no HTTP response at all, the authenticated
      probes are skipped: they would only wait out another timeout each and
      add a misleading token-mismatch hint.
    * Without a token: a single probe must return 200 with a truthy ``ok``.

    Args:
        _: Unused placeholder argument.

    Returns:
        A ``CheckResult`` keyed ``sidecar/http-auth`` that is ``ok`` when no
        issue was found and ``warn`` otherwise. The message joins all issues,
        and the details carry the probes, hints and endpoint.
    """
    endpoint = os.environ.get("OCMEMOG_ENDPOINT", "http://127.0.0.1:17891")
    token = os.environ.get("OCMEMOG_API_TOKEN")
    probes: list[dict[str, Any]] = []
    issues: list[str] = []
    hints: list[str] = []

    def _probe(label: str, headers: dict[str, str] | None = None):
        """Run one health probe and record it in ``probes``."""
        code, payload, error = _probe_health_json(endpoint, headers=headers)
        probes.append(
            {
                "label": label,
                "status": code,
                "error": error,
                "ok": bool(payload.get("ok")) if isinstance(payload, dict) else None,
            }
        )
        return code, payload, error

    def _describe_error(code: int | None, error: str) -> str:
        """Turn a probe error into an operator-facing issue line."""
        if code is None:
            return f"Sidecar health endpoint is not currently reachable: {error}"
        return f"Sidecar health endpoint returned an unusable response (HTTP {code}): {error}"

    details: dict[str, Any] = {"token_required": bool(token)}

    if token:
        token_ok: list[str] = []
        code, _payload, error = _probe("unauthenticated")
        if error:
            issues.append(_describe_error(code, error))
        elif code == 200:
            issues.append("Token configured, but authenticated endpoints are accepting unauthenticated access.")
            hints.append("Verify OCMEMOG_API_TOKEN is exported in both sidecar and operator processes.")
        elif code != 401:
            issues.append(f"Expected 401 for unauthenticated access, got {code}.")

        # Only exercise the token headers when the sidecar answered at all.
        if not (error and code is None):
            for label, headers in (
                ("x-token", {"x-ocmemog-token": token}),
                ("bearer", {"authorization": f"Bearer {token}"}),
            ):
                auth_code, _auth_payload, auth_error = _probe(label, headers)
                if not auth_error and auth_code == 200:
                    token_ok.append(label)
            if not token_ok:
                issues.append("Token-based authenticated health check failed.")
                hints.append("Verify the token on operator and sidecar match and the expected header is supported.")

        details["token_probe_headers"] = token_ok
    else:
        code, payload, error = _probe("unauthenticated")
        if error:
            issues.append(_describe_error(code, error))
        elif code != 200 or not isinstance(payload, dict) or not payload.get("ok", False):
            issues.append("Sidecar health endpoint returned a non-OK response.")

    details["probes"] = probes
    details["hints"] = hints
    details["endpoint"] = endpoint

    return CheckResult(
        key="sidecar/http-auth",
        label="sidecar HTTP auth",
        status="warn" if issues else "ok",
        message="; ".join(issues) if issues else "Sidecar HTTP auth configuration is healthy.",
        details=details,
    )
855
+
856
+
857
def _probe_health_json(
    endpoint: str,
    headers: dict[str, str] | None = None,
    *,
    timeout: float = _HTTP_TIMEOUT_SECONDS,
    max_body_bytes: int = 65536,
) -> tuple[int | None, dict[str, Any] | None, str | None]:
    """GET ``<endpoint>/healthz`` and decode its JSON body.

    Args:
        endpoint: Base URL of the sidecar; a trailing slash is ignored.
        headers: Extra request headers, merged over the default ``Accept``.
        timeout: Timeout in seconds handed to ``urlopen``.
        max_body_bytes: Upper bound on the number of body bytes read. The
            default is generous so that a well-formed health payload is not
            cut off mid-document and misreported as a failure.

    Returns:
        A ``(status, payload, error)`` tuple:

        * response returned by ``urlopen``: ``(status, dict, None)``;
          ``(status, None, None)`` for an empty body; and
          ``(status, None, message)`` when the body is not JSON or is JSON
          but not an object.
        * response raised as ``HTTPError``: ``(code, dict_or_None, None)``.
        * any other failure (connection refused, timeout, ...):
          ``(None, None, message)``.
    """
    request = Request(f"{endpoint.rstrip('/')}/healthz", method="GET")
    for name, value in {"Accept": "application/json", **(headers or {})}.items():
        request.add_header(name, value)

    try:
        with urlopen(request, timeout=timeout) as response:
            status = getattr(response, "status", 200)
            raw = response.read(max_body_bytes).decode("utf-8", errors="ignore")
    except HTTPError as exc:
        # HTTPError doubles as the response object; read it best-effort and
        # always release the underlying connection.
        try:
            raw = exc.read(max_body_bytes).decode("utf-8", errors="ignore")
        except Exception:
            raw = ""
        finally:
            try:
                exc.close()
            except Exception:
                pass  # nothing useful to report if cleanup itself fails
        error_payload: dict[str, Any] | None = None
        if raw:
            try:
                decoded = json.loads(raw)
            except Exception:
                decoded = None
            if isinstance(decoded, dict):
                error_payload = decoded
        return getattr(exc, "code", None), error_payload, None
    except Exception as exc:
        return None, None, str(exc)

    if not raw:
        return status, None, None
    try:
        payload = json.loads(raw)
    except (ValueError, RecursionError) as exc:
        # Keep the HTTP status so callers can tell "answered with garbage"
        # apart from "did not answer".
        return status, None, f"invalid JSON payload: {exc}"
    if not isinstance(payload, dict):
        return status, None, "non-dict JSON payload"
    return status, payload, None
892
+
893
+
894
def _fix_create_paths(_: None) -> FixResult:
    """Create the state directories and confirm each one is writable.

    For the root, data, memory and reports directories reported by
    ``state_store``, the directory is created if needed and a small probe
    file is written and removed again.

    Args:
        _: Unused placeholder argument.

    Returns:
        A ``FixResult`` for the ``create-missing-paths`` action. ``changed``
        is the number of directories that did not exist beforehand and were
        actually created; this holds on failure too, where it counts the
        directories created before the error.
    """
    created: list[str] = []
    try:
        for target in (
            state_store.root_dir(),
            state_store.data_dir(),
            state_store.memory_dir(),
            state_store.reports_dir(),
        ):
            # Record existence first so ``changed`` reflects real changes
            # rather than the number of paths visited.
            already_present = target.is_dir()
            target.mkdir(parents=True, exist_ok=True)
            if not already_present:
                created.append(str(target))
            probe = target / ".ocmemog_doctor_probe"
            probe.write_text("ok", encoding="utf-8")
            probe.unlink()
    except Exception as exc:
        return FixResult(
            action="create-missing-paths",
            check_key="state/path-writable",
            message=f"Could not create state paths: {exc}",
            changed=len(created),
            ok=False,
        )
    return FixResult(
        action="create-missing-paths",
        check_key="state/path-writable",
        message="Created required state directories and confirmed writable state.",
        changed=len(created),
        ok=True,
    )
918
+
919
+
920
def _fix_repair_queue(_: None) -> FixResult:
    """Rewrite the sidecar queue file without its unparseable lines.

    Blank lines are skipped, lines that are not valid JSON are dropped, and
    every remaining entry is re-serialized before the queue is written back
    through the sidecar app's ``_write_queue_lines``.

    The file is read and rewritten under ``QUEUE_LOCK`` as one critical
    section, so entries appended by another lock holder between the read and
    the write cannot be silently discarded.

    Args:
        _: Unused placeholder argument.

    Returns:
        A ``FixResult`` for the ``repair-queue`` action; ``changed`` is the
        number of invalid entries removed, or 0 when the repair failed.
    """
    try:
        app = _import_sidecar_app()
        queue_path = app._queue_path()
        kept: list[str] = []
        dropped = 0
        with app.QUEUE_LOCK:
            for raw_line in queue_path.read_text(encoding="utf-8").splitlines():
                line = raw_line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except Exception:
                    dropped += 1
                    continue
                kept.append(json.dumps(entry, ensure_ascii=False))
            app._write_queue_lines(kept)
    except Exception as exc:
        return FixResult(
            action="repair-queue",
            check_key="queue/health",
            message=f"Queue repair failed: {exc}",
            changed=0,
            ok=False,
        )
    return FixResult(
        action="repair-queue",
        check_key="queue/health",
        message=f"Removed {dropped} invalid queue entry(ies).",
        changed=dropped,
        ok=True,
    )
954
+
955
+
956
def _run_sidecar_import(_: None) -> CheckResult:
    """Check that the sidecar app module imports and exposes an ``app`` attribute.

    Args:
        _: Unused placeholder argument.

    Returns:
        A ``CheckResult`` keyed ``sidecar/app-import``: ``fail`` when the
        import raises or the module has no ``app`` attribute, ``ok``
        otherwise (with the attribute's type name in the details).
    """
    module_name = "ocmemog.sidecar.app"

    def _outcome(status: str, message: str, details: dict[str, Any]) -> CheckResult:
        return CheckResult(
            key="sidecar/app-import",
            label="sidecar app import",
            status=status,
            message=message,
            details=details,
        )

    try:
        module = _import_sidecar_app()
    except Exception as exc:
        return _outcome(
            "fail",
            f"Failed to import sidecar app module: {exc}",
            {"error": str(exc)},
        )

    if hasattr(module, "app"):
        return _outcome(
            "ok",
            "sidecar app module imports and exposes FastAPI app.",
            {"module": module_name, "app_type": type(module.app).__name__},
        )
    return _outcome(
        "fail",
        "ocmemog.sidecar.app did not expose FastAPI app object.",
        {"module": module_name},
    )
983
+
984
+
985
def _check_http(endpoint: str) -> str | None:
    """Summarize the health of the sidecar HTTP endpoint as an error string.

    Args:
        endpoint: Base URL handed to ``_probe_health_json``.

    Returns:
        ``None`` when the probe returned a status below 400 and a dict
        payload with a truthy ``ok``; otherwise a short description of the
        first problem found.
    """
    try:
        code, body, problem = _probe_health_json(endpoint)
        if problem:
            verdict = problem
        elif not code or code >= 400:
            verdict = f"health endpoint status {code}"
        elif not body:
            verdict = "empty response"
        elif isinstance(body, dict) and body.get("ok"):
            verdict = None
        else:
            verdict = "health endpoint returned non-ok payload"
    except Exception as exc:
        verdict = str(exc)
    return verdict
999
+
1000
+
1001
def _run_runtime_probe(_: None) -> CheckResult:
    """Probe runtime readiness, memory health, embeddings and the sidecar.

    The stages run in order and each contributes messages and details:

    1. ``sidecar_compat.probe_runtime()`` — a failure here ends the check
       immediately with ``fail``.
    2. ``health.get_memory_health()`` plus ``_collect_vector_backlog()``.
    3. A warning when the runtime mode is not ``ready``.
    4. A test call to ``embedding_engine.generate_embedding``.
    5. An HTTP probe of ``OCMEMOG_ENDPOINT`` through ``_check_http``.

    Args:
        _: Unused placeholder argument.

    Returns:
        A ``CheckResult`` keyed ``vector/runtime-probe`` whose message joins
        every finding, or a single "healthy" line when there are none.
    """
    details: dict[str, Any] = {}
    status = "ok"
    messages: list[str] = []

    def _at_least_warn(current: str) -> str:
        """Raise *current* to ``warn`` unless it already ranks as high."""
        return current if _status_rank(current) >= _status_rank("warn") else "warn"

    try:
        runtime_status = sidecar_compat.probe_runtime()
        details["runtime_mode"] = runtime_status.mode
        details["missing_deps"] = runtime_status.missing_deps
        details["warnings"] = runtime_status.warnings
        details["todo"] = runtime_status.todo
    except Exception as exc:
        details["runtime_error"] = str(exc)
        # Surface the reason in the message; previously it was collected and
        # then dropped by the early return.
        return CheckResult(
            key="vector/runtime-probe",
            label="vector/runtime probe",
            status="fail",
            message=f"Runtime probe failed: {exc}",
            details=details,
        )

    try:
        payload = health.get_memory_health()
        details["memory_health"] = payload
        vector_backlog = _collect_vector_backlog()
        details["vector_backlog"] = vector_backlog
        # ``integrity`` may be missing or None; fall back to the flat status key.
        integrity = payload.get("integrity") or {}
        memory_integrity_ok = integrity.get("ok", payload.get("vector_index_integrity_status"))
        if not memory_integrity_ok:
            status = "fail"
            messages.append("memory health reported failed integrity.")
        if vector_backlog.get("errors"):
            status = _at_least_warn(status)
            messages.append(
                "Vector backlog probe reported query warnings: " + "; ".join(vector_backlog["errors"][:3])
            )
        if vector_backlog["total_missing"] > 0:
            status = _at_least_warn(status)
            messages.append(
                f"Vector backlog is elevated ({vector_backlog['total_missing']} rows, severity={vector_backlog['severity']})."
            )
            details["vector_backlog_hint"] = "Run scripts/ocmemog-backfill-vectors.py to reduce missing vector debt."
    except Exception as exc:
        status = "fail"
        details["memory_health_error"] = str(exc)
        messages.append(f"memory health check failed: {exc}")

    if runtime_status.mode != "ready":
        status = _at_least_warn(status)
        messages.append(
            f"runtime mode is degraded ({len(runtime_status.missing_deps)} missing item(s): "
            f"{', '.join(runtime_status.missing_deps) or 'none'})."
        )

    try:
        if not embedding_engine.generate_embedding("ocmemog doctor probe"):
            status = "fail"
            messages.append("embedding probe returned no vector.")
    except Exception as exc:
        status = "fail"
        details["embedding_error"] = str(exc)
        messages.append(f"embedding probe failed: {exc}")

    endpoint = os.environ.get("OCMEMOG_ENDPOINT", "http://127.0.0.1:17891")
    sidecar_error = _check_http(endpoint)
    if sidecar_error:
        status = _at_least_warn(status)
        details["sidecar_http_error"] = sidecar_error
        messages.append("sidecar HTTP probe not currently available.")
    else:
        details["sidecar_http"] = "ok"

    if not messages:
        messages.append("runtime, vector, and sidecar probe checks look healthy.")

    return CheckResult(
        key="vector/runtime-probe",
        label="vector/runtime probe",
        status=status,
        message="; ".join(messages),
        details=details,
    )
1082
+
1083
+
1084
def _status_rank(status: str) -> int:
    """Return the precedence of *status*; names not in the table rank 0."""
    if status in _STATUS_PRECEDENCE:
        return _STATUS_PRECEDENCE[status]
    return 0
1086
+
1087
+
1088
def _overall_status(results: Iterable[CheckResult]) -> str:
    """Return the highest-ranked status among *results*, starting from ``ok``.

    Ties keep the earliest candidate, so ``ok`` wins over any later status
    of equal rank.
    """
    candidates = ["ok"]
    candidates.extend(item.status for item in results)
    # ``max`` returns the first maximal element, matching a strict ">" scan.
    return max(candidates, key=_status_rank)
1094
+
1095
+
1096
# Registry of doctor checks. ``run_doctor_checks`` walks it in this order;
# entries that carry ``fix_key``/``fix`` can be repaired on request.
DOCTOR_CHECKS: tuple[DoctorCheck, ...] = (
    DoctorCheck(
        key="runtime/imports",
        label="runtime module imports",
        check=_run_imports,
    ),
    DoctorCheck(
        key="state/path-writable",
        label="state path writability",
        check=_run_state_paths,
        fix_key="create-missing-paths",
        fix=_fix_create_paths,
    ),
    DoctorCheck(
        key="sqlite/schema-access",
        label="sqlite schema access",
        check=_run_sqlite_schema,
    ),
    DoctorCheck(
        key="queue/health",
        label="queue health",
        check=_run_queue_health,
        fix_key="repair-queue",
        fix=_fix_repair_queue,
    ),
    DoctorCheck(
        key="sidecar/http-auth",
        label="sidecar HTTP auth",
        check=_run_sidecar_http_auth,
    ),
    DoctorCheck(
        key="sidecar/transcript-roots",
        label="sidecar transcript roots",
        check=_run_transcript_root_readability,
    ),
    DoctorCheck(
        key="sidecar/transcript-watcher",
        label="sidecar transcript watcher",
        check=_run_transcript_watcher_sanity,
    ),
    DoctorCheck(
        key="sidecar/env-toggles",
        label="sidecar environment toggles",
        check=_run_sidecar_toggle_sanity,
    ),
    DoctorCheck(
        key="sidecar/app-import",
        label="sidecar app import",
        check=_run_sidecar_import,
    ),
    DoctorCheck(
        key="vector/runtime-probe",
        label="vector/runtime probe",
        check=_run_runtime_probe,
    ),
)
1108
+
1109
+
1110
# Accepted fix action names, each mapped to the key of the check it repairs.
_KNOWN_FIXES: dict[str, str] = dict(
    (
        ("create-missing-paths", "state/path-writable"),
        ("repair-queue", "queue/health"),
    )
)
1114
+
1115
+
1116
# run_doctor_checks: run the registered doctor checks and return a report dict.
#
# Parameters (all keyword-only):
#   fix_actions    - fix action names to apply; normalized via _normalize_fixes
#                    and validated against _KNOWN_FIXES.
#   include_checks - check keys to run; when empty or None every entry of
#                    DOCTOR_CHECKS is run.
#   state_dir      - handed to _scoped_state_dir, which wraps the whole run.
#   strict         - when true, an overall "warn" is reported as "fail".
#
# Returns a dict with "status", "checks", "fixes" and "strict" keys.
# Raises ValueError for unknown check keys or unknown fix actions.
#
# NOTE(review): this listing has lost its indentation, so the exact nesting of
# the statements that follow `fix = check.fix(None)` cannot be confirmed here.
+ def run_doctor_checks(*, fix_actions: list[str] | None = None, include_checks: set[str] | None = None, state_dir: str | None = None, strict: bool = False):
1116
+ include_checks = set(include_checks or [])
1117
+ known_check_keys = {check.key for check in DOCTOR_CHECKS}
1118
+ if include_checks and (unknown_checks := (set(include_checks) - known_check_keys)):
1119
+ unknown = ", ".join(sorted(unknown_checks))
1120
+ raise ValueError(f"unknown --check key(s): {unknown}")
1121
+
1122
# Registry order is preserved; an empty include set selects every check.
+ selected = [check for check in DOCTOR_CHECKS if not include_checks or check.key in include_checks]
1123
+ fix_actions = _normalize_fixes(fix_actions)
1124
+ if any(item not in _KNOWN_FIXES for item in fix_actions):
1125
+ unknown = sorted(set(fix_actions) - set(_KNOWN_FIXES))
1126
+ raise ValueError(f"unknown --fix action(s): {', '.join(unknown)}")
1127
+
1128
+ with _scoped_state_dir(state_dir):
1129
+ results: list[CheckResult] = []
1130
+ applied_fixes: list[FixResult] = []
1131
+
1132
+ for check in selected:
1133
+ result = check.check(None)
1134
# A fix runs only for a non-ok check that defines a fix whose key was requested.
+ if result.status != "ok" and check.fix and check.fix_key in fix_actions:
1135
+ fix = check.fix(None)
1136
# The check is re-run after a successful fix so the report shows the new state.
+ if fix.ok:
1137
+ result = check.check(None)
1138
+ result = CheckResult(
1139
+ key=result.key,
1140
+ label=result.label,
1141
+ status=result.status,
1142
+ message=result.message,
1143
+ details=result.details,
1144
+ fixable=result.fixable,
1145
+ fixed=True,
1146
+ fix_action=check.fix_key,
1147
+ fix_details=asdict(fix),
1148
+ )
1149
+ applied_fixes.append(fix)
1150
+ results.append(result)
1151
+
1152
+ status = _overall_status(results)
1153
# Strict mode promotes an overall warn to fail.
+ if strict and status == "warn":
1154
+ status = "fail"
1155
+
1156
+ return {
1157
+ "status": status,
1158
+ "checks": [asdict(item) for item in results],
1159
+ "fixes": [asdict(item) for item in applied_fixes],
1160
+ "strict": strict,
1161
+ }
1163
+
1164
+
1165
def _render_text(report: dict[str, Any]) -> None:
    """Print a human-readable doctor report to stdout.

    One line is printed per check (icon, key, message), followed by its
    details as sorted JSON when present and a ``fix:`` line when the check
    carries a fix action and is marked fixed. A status-count summary and the
    overall status close the report.

    Args:
        report: Report dict with ``checks``, ``fixes`` and ``status`` keys.
    """
    entries = report["checks"]

    print("ocmemog doctor")
    for entry in entries:
        icon = _queue_status_to_icon(entry["status"])
        print(f"{icon:<4} {entry['key']}: {entry['message']}")

        extra = entry.get("details") or {}
        if extra:
            print(f" details: {json.dumps(extra, sort_keys=True)}")

        if entry.get("fix_action") and entry.get("fixed"):
            applied = entry.get("fix_details") or {}
            changed = applied.get("changed", 0)
            print(f" fix: {applied.get('message', 'fix applied')} (changed={changed})")

    counts = {
        level: sum(1 for entry in entries if entry["status"] == level)
        for level in ("ok", "warn", "fail")
    }
    counts["applied_fixes"] = len(report["fixes"])
    overall = report["status"]
    print(f"summary: {json.dumps(counts, sort_keys=True)}")
    print(f"overall: {overall}")
1188
+
1189
+
1190
def _render_json(report: dict[str, Any]) -> None:
    """Print the report to stdout as indented, key-sorted JSON.

    The document has ``ok`` (true only when the overall status is ``ok``),
    ``status``, ``checks`` and ``fixes``.

    Args:
        report: Report dict with ``status``, ``checks`` and ``fixes`` keys.
    """
    overall = report["status"]
    document = dict(
        ok=(overall == "ok"),
        status=overall,
        checks=report["checks"],
        fixes=report["fixes"],
    )
    print(json.dumps(document, indent=2, sort_keys=True))
1198
+
1199
+
1200
+ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
1201
+ parser = argparse.ArgumentParser(
1202
+ prog="ocmemog-doctor",
1203
+ description="Run operator-oriented health checks for ocmemog.",
1204
+ )
1205
+ parser.add_argument(
1206
+ "--json",
1207
+ action="store_true",
1208
+ help="Emit machine-readable JSON output.",
1209
+ )
1210
+ parser.add_argument(
1211
+ "--fix",
1212
+ action="append",
1213
+ default=[],
1214
+ help="Apply explicit low-risk fix action(s): create-missing-paths, repair-queue",
1215
+ )
1216
+ parser.add_argument(
1217
+ "--state-dir",
1218
+ help="Use an explicit state directory for all checks.",
1219
+ )
1220
+ parser.add_argument(
1221
+ "--check",
1222
+ action="append",
1223
+ default=[],
1224
+ help="Run only selected check key(s) (repeatable or comma-separated).",
1225
+ )
1226
+ parser.add_argument(
1227
+ "--strict",
1228
+ action="store_true",
1229
+ help="Treat warn results as failures to hard-gate release checks.",
1230
+ )
1231
+ return parser.parse_args(argv)
1232
+
1233
+
1234
def main(argv: list[str] | None = None) -> int:
    """Run the doctor CLI and return its process exit code.

    Args:
        argv: Argument list forwarded to ``parse_args``.

    Returns:
        ``2`` when the overall status is ``fail``, ``1`` when it is
        ``warn``, and ``0`` otherwise.
    """
    args = parse_args(argv)
    wanted_checks = set(_normalize_fixes(args.check))
    report = run_doctor_checks(
        fix_actions=args.fix,
        include_checks=wanted_checks,
        state_dir=args.state_dir,
        strict=args.strict,
    )

    render = _render_json if args.json else _render_text
    render(report)

    exit_codes = {"fail": 2, "warn": 1}
    return exit_codes.get(report["status"], 0)
1252
+
1253
+
1254
# Script entry point: exit with the code computed by ``main``.
if __name__ == "__main__":
    exit_code = main()
    raise SystemExit(exit_code)