nexo-brain 7.32.0 → 7.33.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,6 +24,103 @@ def _core():
24
24
  return module
25
25
 
26
26
 
27
def _cognitive():
    """Return the ``cognitive._core`` module, importing it on first use.

    The import is deferred so that importing this DB module never pulls in the
    heavy cognitive stack, and so it cannot fail at import time in a stripped
    environment where numpy/fastembed are missing. An ImportError can still
    surface from this call; callers are expected to guard it.
    """
    name = "cognitive._core"
    cached = sys.modules.get(name)
    return cached if cached is not None else importlib.import_module(name)
38
+
39
+
40
# Model identifier stored in the ``embedding_model`` column next to every
# precomputed observation vector, so the fusion path can refuse to compare
# vectors produced by an incompatible model.
# NOTE(review): _embed_text_blob stamps this same identifier on every vector it
# returns, including vectors computed while the deterministic offline fallback
# is active — confirm those are comparable with real-model vectors.
OBSERVATION_EMBEDDING_MODEL = "bge-base-embeddings"
43
+
44
+
45
def _model_is_warm() -> bool:
    """Report whether embedding right now avoids a cold model load.

    True in exactly two situations:
      * the cognitive core reports model downloads as disabled
        (NEXO_SKIP_COGNITIVE_MODEL_DOWNLOAD), i.e. the deterministic offline
        fallback is in use; or
      * the cognitive core already holds a loaded model (``_model`` is set).

    Any failure to import or query the cognitive core counts as cold, so the
    latency path degrades to FTS instead of paying a download/load cost.
    """
    try:
        cog = _cognitive()
    except Exception:
        return False
    try:
        offline = bool(cog._model_download_disabled())
    except Exception:
        return False
    if offline:
        return True
    loaded_model = getattr(cog, "_model", None)
    return loaded_model is not None
67
+
68
+
69
def _embedding_columns_present(conn) -> bool:
    """Return True when memory_observations has both embedding columns.

    Reads the column names via ``PRAGMA table_info``; any error while doing so
    (including an unusable connection) is reported as False.
    """
    try:
        info = conn.execute("PRAGMA table_info(memory_observations)").fetchall()
        # Index 1 of each table_info row is the column name.
        names = {row[1] for row in info}
    except Exception:
        return False
    return {"embedding", "embedding_model"}.issubset(names)
75
+
76
+
77
def _embed_text_blob(text: str):
    """Embed ``text`` and return ``(blob, model_name)``.

    Returns ``(None, "")`` when the text is empty after stripping, when the
    serialised vector is empty, or when anything in the cognitive core raises.
    Never raises. This function does not check model warmth itself; callers on
    the latency path gate on ``_model_is_warm()`` first.
    """
    payload = str(text or "").strip()
    if not payload:
        return None, ""
    try:
        cog = _cognitive()
        blob = cog._array_to_blob(cog.embed(payload))
        if blob:
            return blob, OBSERVATION_EMBEDDING_MODEL
    except Exception:
        return None, ""
    return None, ""
95
+
96
+
97
def _write_observation_embedding(uid: str, *, summary: str = "", subject: str = "") -> bool:
    """Precompute and store the embedding for one observation. Never raises.

    Called AFTER the observation row is committed, so a failure here must never
    surface to the write path: the observation is already durable and the
    vector is a shadow optimisation that the bounded backfill can fill in later.

    Args:
        uid: ``observation_uid`` of the row to update.
        summary: Observation summary text.
        subject: Observation subject text; embedded in front of the summary.

    Returns:
        True only when a vector was computed and the UPDATE did not report
        zero affected rows. False when the table/columns are missing, the model
        is cold, the text cannot be embedded, no row carries ``uid``, or any
        error occurs.
    """
    try:
        conn = _core().get_db()
        if not _table_exists(conn, "memory_observations") or not _embedding_columns_present(conn):
            return False
        if not _model_is_warm():
            return False
        parts = [str(subject or "").strip(), str(summary or "").strip()]
        text = " ".join(part for part in parts if part)
        blob, model_name = _embed_text_blob(text)
        if blob is None:
            return False
        cursor = conn.execute(
            "UPDATE memory_observations SET embedding = ?, embedding_model = ? WHERE observation_uid = ?",
            (blob, model_name, uid),
        )
        conn.commit()
        # A rowcount of 0 means no observation carries this uid, so nothing was
        # stored. An unknown rowcount (wrapper without the attribute, or -1) is
        # treated as success, matching the previous behaviour.
        return getattr(cursor, "rowcount", -1) != 0
    except Exception:
        return False
122
+
123
+
27
124
  _REDACT_PATTERNS = (
28
125
  re.compile(r"sk-[a-zA-Z0-9_\-]{20,}"),
29
126
  re.compile(r"ghp_[a-zA-Z0-9]{20,}"),
@@ -254,6 +351,12 @@ def _row_to_observation(row) -> dict:
254
351
  item["evidence_refs"] = _parse_json(item.pop("evidence_refs_json", "[]"), [])
255
352
  item["entities"] = _parse_json(item.pop("entities_json", "[]"), [])
256
353
  item["metadata"] = _parse_json(item.pop("metadata_json", "{}"), {})
354
+ # The shadow embedding BLOB is an internal optimisation, not user-facing
355
+ # payload. Drop the raw bytes (they are not JSON-serialisable) but expose a
356
+ # cheap boolean so callers/tests can assert it was precomputed.
357
+ embedding_blob = item.pop("embedding", None)
358
+ item.pop("embedding_model", None)
359
+ item["has_embedding"] = bool(embedding_blob)
257
360
  return item
258
361
 
259
362
 
@@ -671,6 +774,11 @@ def upsert_memory_observation(observation: dict) -> dict:
671
774
  ),
672
775
  )
673
776
  conn.commit()
777
+ # Precompute the semantic embedding AFTER the write is durable. This is a
778
+ # shadow optimisation: it is guarded, never blocks the write, and skips
779
+ # entirely when the model is cold/unavailable (the bounded backfill fills
780
+ # those rows later). The summary/subject already passed redaction above.
781
+ _write_observation_embedding(uid, summary=clean_summary, subject=clean_subject)
674
782
  row = conn.execute("SELECT * FROM memory_observations WHERE observation_uid = ?", (uid,)).fetchone()
675
783
  result = _row_to_observation(row) if row else {"observation_uid": uid}
676
784
  result["ok"] = True
@@ -898,6 +1006,174 @@ def search_memory_observations_fts(
898
1006
  return [_row_to_observation(row) for row in rows]
899
1007
 
900
1008
 
1009
def backfill_observation_embeddings(*, limit: int = 200) -> dict:
    """Bounded, idempotent backfill of precomputed observation embeddings.

    Only touches rows whose ``embedding IS NULL`` and that have some subject or
    summary text to embed. Rows with no text can never receive a vector, so
    they are excluded from both the work window and ``remaining``; otherwise a
    caller looping until ``remaining == 0`` would never terminate and such rows
    would permanently occupy the newest-first window.

    Skips entirely when the model is cold so it never triggers a download.

    Args:
        limit: Maximum rows to process in this call, clamped to 1..1000.
            Non-numeric values fall back to 200.

    Returns:
        A dict. Skips return ``ok``/``updated``/``skipped``/``reason``; SELECT or
        commit failures return ``ok=False`` with ``error``; a normal pass
        returns ``ok`` (no row failed), ``updated``, ``failed``, ``seen`` and
        ``remaining`` (embeddable rows still pending, 0 if the count fails).
    """
    conn = _core().get_db()
    if not _table_exists(conn, "memory_observations"):
        return {"ok": True, "updated": 0, "skipped": True, "reason": "memory_observations table unavailable"}
    if not _embedding_columns_present(conn):
        return {"ok": True, "updated": 0, "skipped": True, "reason": "embedding columns unavailable"}
    if not _model_is_warm():
        return {"ok": True, "updated": 0, "skipped": True, "reason": "embedding model cold"}

    try:
        max_rows = max(1, min(int(limit or 200), 1000))
    except (TypeError, ValueError):
        max_rows = 200

    # Pending = not embedded yet AND has text once blanks/tabs/newlines are
    # trimmed. Keeps `embedding IS NULL` so the partial index stays usable.
    ws = "' ' || char(9) || char(10) || char(13)"
    pending = (
        "embedding IS NULL AND ("
        f"TRIM(COALESCE(subject, ''), {ws}) <> '' OR "
        f"TRIM(COALESCE(summary, ''), {ws}) <> '')"
    )

    try:
        rows = conn.execute(
            f"""
            SELECT id, observation_uid, subject, summary
            FROM memory_observations
            WHERE {pending}
            ORDER BY created_at DESC, id DESC
            LIMIT ?
            """,
            (max_rows,),
        ).fetchall()
    except Exception as exc:
        return {"ok": False, "updated": 0, "error": _truncate(str(exc), 300)}

    updated = 0
    failed = 0
    for row in rows:
        item = dict(row)
        parts = [str(item.get("subject") or "").strip(), str(item.get("summary") or "").strip()]
        text = " ".join(part for part in parts if part)
        blob, model_name = _embed_text_blob(text)
        if blob is None:
            failed += 1
            continue
        try:
            conn.execute(
                "UPDATE memory_observations SET embedding = ?, embedding_model = ? WHERE id = ?",
                (blob, model_name, item.get("id")),
            )
            updated += 1
        except Exception:
            failed += 1

    if updated:
        try:
            conn.commit()
        except Exception as exc:
            # Nothing was made durable; report it the same way a failed SELECT
            # is reported instead of letting the exception escape.
            return {
                "ok": False,
                "updated": 0,
                "failed": failed,
                "seen": len(rows),
                "error": _truncate(str(exc), 300),
            }

    try:
        remaining = int(
            conn.execute(f"SELECT COUNT(*) FROM memory_observations WHERE {pending}").fetchone()[0]
        )
    except Exception:
        remaining = 0
    return {
        "ok": failed == 0,
        "updated": updated,
        "failed": failed,
        "seen": len(rows),
        "remaining": remaining,
    }
1075
+
1076
+
1077
def vector_scan_observations(
    query_vector,
    *,
    limit: int = 50,
    scan_limit: int = 400,
    start_ts: float | None = None,
    end_ts: float | None = None,
    project_key: str = "",
    min_score: float = 0.0,
) -> list[dict]:
    """Bounded cosine scan over precomputed observation embeddings.

    Never raises: any failure (DB unavailable, cognitive core missing, bad
    argument types, SQL error) yields an empty list so the caller degrades to
    FTS. Rows that fail to deserialise or score are skipped individually.

    Only rows tagged with ``OBSERVATION_EMBEDDING_MODEL`` are compared, so
    vectors written by a different model are never scored against the query.

    Args:
        query_vector: Query embedding; ``None`` returns ``[]`` immediately.
        limit: Maximum matches returned, clamped to 1..200.
        scan_limit: Maximum embedded rows deserialised/compared, clamped to
            1..2000, so a single query never walks an unbounded table.
        start_ts: Optional inclusive lower bound on ``created_at``.
        end_ts: Optional exclusive upper bound on ``created_at``.
        project_key: Optional exact ``project_key`` filter; blank/None disables.
        min_score: Scores must be strictly greater than this to be returned.

    Returns:
        Up to ``limit`` dicts ``{"observation_uid", "vector_score"}`` sorted by
        score, highest first.
    """
    if query_vector is None:
        return []
    try:
        conn = _core().get_db()
        if not _table_exists(conn, "memory_observations") or not _embedding_columns_present(conn):
            return []
        cog = _cognitive()

        clauses = ["embedding IS NOT NULL", "embedding_model = ?"]
        params: list[Any] = [OBSERVATION_EMBEDDING_MODEL]
        if start_ts is not None:
            clauses.append("created_at >= ?")
            params.append(float(start_ts))
        if end_ts is not None:
            clauses.append("created_at < ?")
            params.append(float(end_ts))
        project = str(project_key or "").strip()
        if project:
            clauses.append("project_key = ?")
            params.append(project)

        bounded_scan = max(1, min(int(scan_limit or 400), 2000))
        bounded_limit = max(1, min(int(limit or 50), 200))
        floor = float(min_score or 0.0)

        rows = conn.execute(
            f"""
            SELECT observation_uid, embedding
            FROM memory_observations
            WHERE {' AND '.join(clauses)}
            ORDER BY salience DESC, created_at DESC, id DESC
            LIMIT ?
            """,
            params + [bounded_scan],
        ).fetchall()
    except Exception:
        return []

    scored: list[dict] = []
    for row in rows:
        try:
            blob = row["embedding"]
            if not blob:
                continue
            candidate_vector = cog._blob_to_array(blob)
            score = float(cog.cosine_similarity(query_vector, candidate_vector))
            uid = row["observation_uid"]
        except Exception:
            continue
        # Written as `not (score > floor)` so NaN is rejected as well; a NaN in
        # the list would make the sort order below undefined.
        if not score > floor:
            continue
        scored.append({"observation_uid": uid, "vector_score": score})

    scored.sort(key=lambda item: item["vector_score"], reverse=True)
    return scored[:bounded_limit]
1148
+
1149
+
1150
def get_memory_observations_by_uids(uids: list[str]) -> dict[str, dict]:
    """Fetch observation rows by uid (bounded). Returns ``{uid: observation}``.

    Used by the retrieval fusion path to materialise semantic-only matches that
    the lexical/FTS scan did not surface.

    Never raises: DB or argument errors return ``{}`` and a row that cannot be
    converted is skipped. Blank and ``None`` entries are ignored, duplicates
    are collapsed (first occurrence wins) before the 200-uid cap is applied,
    and the database is not touched when nothing is left to look up.

    Args:
        uids: Observation uids to fetch; ``None``/empty returns ``{}``.

    Returns:
        Mapping of ``observation_uid`` to the converted observation dict for
        every requested uid that exists.
    """
    try:
        unique = dict.fromkeys(str(uid).strip() for uid in (uids or []) if uid is not None)
        unique.pop("", None)
        wanted = list(unique)[:200]
        if not wanted:
            return {}

        conn = _core().get_db()
        if not _table_exists(conn, "memory_observations"):
            return {}
        placeholders = ",".join("?" for _ in wanted)
        rows = conn.execute(
            f"SELECT * FROM memory_observations WHERE observation_uid IN ({placeholders})",
            wanted,
        ).fetchall()
    except Exception:
        return {}

    result: dict[str, dict] = {}
    for row in rows:
        try:
            item = _row_to_observation(row)
            result[item.get("observation_uid")] = item
        except Exception:
            # One malformed row must not discard the rest of the batch.
            continue
    return result
1175
+
1176
+
901
1177
  def memory_observation_stats(days: int = 7) -> dict:
902
1178
  conn = _core().get_db()
903
1179
  if not _table_exists(conn, "memory_observations"):
package/src/db/_schema.py CHANGED
@@ -2119,6 +2119,81 @@ def _m64_local_context_live_dirs(conn):
2119
2119
  )
2120
2120
 
2121
2121
 
2122
def _m84_local_chunks_fts(conn):
    """FTS5 keyword index over local_chunks for semantic-keyword local-file recall.

    Additive + idempotent + reversible. Creates an FTS5 virtual table
    ``local_chunks_fts`` keyed by ``local_chunks.rowid`` (local_chunks is NOT
    WITHOUT ROWID, so its implicit rowid is stable and usable as the FTS key)
    plus triggers that mirror local_chunks insert/delete/update. The FTS row
    stores a denormalized snapshot of the owning asset's ``privacy_class`` /
    ``status`` so privacy can be coarse-prefiltered INSIDE the FTS query without
    a heavy join. The authoritative privacy check stays on the real
    local_assets join + is_queryable_path in the retrieval path.

    NOTE: no bulk INSERT...SELECT backfill here (that would scan/lock the live
    19GB DB at schema time). The incremental, resumable backfill runs in the
    cron tick (local_context/api.py:_backfill_fts_rows). No vector_blob column,
    no VACUUM — explicitly out of scope.

    Args:
        conn: Open database connection the DDL is executed on.

    NOTE(review): no trigger is created on local_assets, so the snapshot
    columns reflect the asset as it was when the chunk row was inserted or last
    had ``text``/``asset_id`` updated. A later privacy_class/status change on
    the asset is not mirrored here — confirm the coarse prefilter tolerates a
    stale snapshot.
    NOTE(review): this function does not call ``conn.commit()`` itself —
    confirm the migration runner commits.
    NOTE(review): confirm this migration is registered/invoked; no registration
    is visible next to this definition.
    """
    try:
        conn.execute(
            """
            CREATE VIRTUAL TABLE IF NOT EXISTS local_chunks_fts USING fts5(
                text,
                privacy_class UNINDEXED,
                asset_status UNINDEXED,
                tokenize='unicode61 remove_diacritics 2'
            )
            """
        )
    except Exception:
        # Hosts without FTS5 support: fall back to a plain shadow table so the
        # triggers below still succeed and the dual-read simply never flips on
        # (FTS MATCH would raise OperationalError -> retrieval stays on LIKE).
        # NOTE(review): the broad except also takes this path for failures that
        # are unrelated to FTS5 availability.
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS local_chunks_fts (
                rowid INTEGER PRIMARY KEY,
                text TEXT DEFAULT '',
                privacy_class TEXT DEFAULT '',
                asset_status TEXT DEFAULT ''
            )
            """
        )
    # Mirror triggers. When the owning asset row is missing, the snapshot
    # columns default to 'normal' / 'active' via COALESCE. The update trigger
    # replaces the FTS row (delete + insert) and fires only for changes to
    # ``text`` or ``asset_id``.
    conn.executescript(
        """
        CREATE TRIGGER IF NOT EXISTS local_chunks_fts_insert
        AFTER INSERT ON local_chunks BEGIN
            INSERT INTO local_chunks_fts(rowid, text, privacy_class, asset_status)
            VALUES (
                new.rowid,
                new.text,
                COALESCE((SELECT privacy_class FROM local_assets WHERE asset_id=new.asset_id), 'normal'),
                COALESCE((SELECT status FROM local_assets WHERE asset_id=new.asset_id), 'active')
            );
        END;

        CREATE TRIGGER IF NOT EXISTS local_chunks_fts_delete
        AFTER DELETE ON local_chunks BEGIN
            DELETE FROM local_chunks_fts WHERE rowid = old.rowid;
        END;

        CREATE TRIGGER IF NOT EXISTS local_chunks_fts_update
        AFTER UPDATE OF text, asset_id ON local_chunks BEGIN
            DELETE FROM local_chunks_fts WHERE rowid = old.rowid;
            INSERT INTO local_chunks_fts(rowid, text, privacy_class, asset_status)
            VALUES (
                new.rowid,
                new.text,
                COALESCE((SELECT privacy_class FROM local_assets WHERE asset_id=new.asset_id), 'normal'),
                COALESCE((SELECT status FROM local_assets WHERE asset_id=new.asset_id), 'active')
            );
        END;
        """
    )
2195
+
2196
+
2122
2197
  def _backfill_diary_quality(conn):
2123
2198
  for table in ("session_diary", "diary_archive"):
2124
2199
  conn.execute(f"""
@@ -3112,6 +3187,32 @@ def _m82_confidence_checks(conn):
3112
3187
  )
3113
3188
 
3114
3189
 
3190
def _m83_observation_embeddings(conn):
    """Add the shadow embedding columns to memory_observations.

    Additive + reversible: two nullable columns (``embedding`` BLOB and
    ``embedding_model`` TEXT) plus a partial index over rows that still lack an
    embedding. The vectors themselves are written elsewhere, at observation
    write time and by the bounded backfill; this migration only prepares the
    schema and commits.
    """
    # Partial-DB upgrade paths can mark earlier migrations applied without
    # materialising every table, which would make the ALTERs below fail with
    # "no such table: memory_observations". m60 is idempotent, so re-run it.
    _m60_memory_observations(conn)

    new_columns = (
        ("embedding", "BLOB"),
        ("embedding_model", "TEXT DEFAULT ''"),
    )
    for column, declaration in new_columns:
        _migrate_add_column(conn, "memory_observations", column, declaration)

    # Lets the bounded backfill locate un-embedded rows cheaply.
    pending_index_sql = (
        "CREATE INDEX IF NOT EXISTS idx_memory_obs_embedding_pending "
        "ON memory_observations(id) WHERE embedding IS NULL"
    )
    conn.execute(pending_index_sql)
    conn.commit()
3214
+
3215
+
3115
3216
  MIGRATIONS = [
3116
3217
  (1, "learnings_columns", _m1_learnings_columns),
3117
3218
  (2, "followups_reasoning", _m2_followups_reasoning),
@@ -3195,6 +3296,7 @@ MIGRATIONS = [
3195
3296
  (80, "opportunity_orchestrator", _m80_opportunity_orchestrator),
3196
3297
  (81, "core_rules_product_metadata", _m81_core_rules_product_metadata),
3197
3298
  (82, "confidence_checks", _m82_confidence_checks),
3299
+ (83, "observation_embeddings", _m83_observation_embeddings),
3198
3300
  ]
3199
3301
 
3200
3302
 
@@ -288,37 +288,69 @@ def _content_hash(fact_type: str, content: str) -> str:
288
288
  # ---------------------------------------------------------------------------
289
289
 
290
290
 
291
def _get_auto_capture_logger():
    """Return the ``nexo.auto_capture`` logger, configuring it on first use.

    Capture failures must be observable rather than silent, but setting up the
    logger must itself never raise. If the log directory cannot be resolved or
    created (e.g. tmp/read-only homes in tests), the logger falls back to a
    NullHandler. Once any handler is attached the logger is returned as-is, so
    configuration happens at most once per process.

    The log file is opened as UTF-8 explicitly: logged results can contain
    non-ASCII text, and the platform default encoding is not guaranteed to be
    able to represent it.
    """
    import logging

    log = logging.getLogger("nexo.auto_capture")
    if log.handlers:
        return log
    try:
        import paths

        d = paths.logs_dir()
        d.mkdir(parents=True, exist_ok=True)
        handler = logging.FileHandler(d / "auto_capture.log", encoding="utf-8")
        handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
        log.addHandler(handler)
        log.setLevel(logging.INFO)
    except Exception:
        log.addHandler(logging.NullHandler())
    return log


def _auto_learning_add(title: str, content: str) -> bool:
    """Persist a correction-derived learning via tools_learnings.handle_learning_add.

    Failures are LOGGED (never silently swallowed) and retried once after a
    50 ms pause, so a transient failure such as DB-lock contention can self-heal
    within the same prompt. Never raises and never breaks the prompt flow; all
    logging/retry stays inside the hook's exit-0 contract.

    Args:
        title: Learning title.
        content: Learning body.

    Returns:
        True when one of the two attempts reports success, False otherwise
        (including when ``tools_learnings`` cannot be imported).
    """
    import time

    log = _get_auto_capture_logger()
    try:
        import tools_learnings  # type: ignore
    except Exception as exc:
        log.info("auto_capture: tools_learnings unavailable (%s)", exc.__class__.__name__)
        return False

    for attempt in range(2):
        try:
            result = tools_learnings.handle_learning_add(
                category="auto",
                title=title,
                content=content,
                priority="medium",
                reasoning="auto-captured from correction pattern in UserPromptSubmit/PostToolUse hook",
            )
            # The handler's result is interpreted by type: a string is a
            # failure when it starts with "ERROR", a dict succeeds when it
            # carries ok/id/learning_id, anything else by truthiness.
            if isinstance(result, str):
                ok = not result.strip().upper().startswith("ERROR")
            elif isinstance(result, dict):
                ok = bool(result.get("ok") or result.get("id") or result.get("learning_id"))
            else:
                ok = bool(result)
            if ok:
                return True
            log.warning("auto_capture learning_add failed (attempt %d/2): %r", attempt + 1, result)
        except Exception as exc:
            log.warning("auto_capture learning_add error (attempt %d/2): %s", attempt + 1, exc)
        if attempt == 0:
            # Tiny backoff before the single retry.
            time.sleep(0.05)
    return False
353
+
322
354
 
323
355
  # ---------------------------------------------------------------------------
324
356
  # Core processing
@@ -408,7 +440,11 @@ def process_conversation(messages: list[str]) -> dict:
408
440
  learning_added = True
409
441
  learnings_added += 1
410
442
 
411
- _dedup_record(dedup_conn, content_hash, fact_type)
443
+ # A FAILED correction-learning must stay retryable on the next identical
444
+ # prompt, so only arm the 1h dedup gate for a correction once its learning
445
+ # actually persisted. Non-correction facts dedup as before.
446
+ if not (fact_type == "correction" and not learning_added):
447
+ _dedup_record(dedup_conn, content_hash, fact_type)
412
448
 
413
449
  extracted_details.append({
414
450
  "type": fact_type,
@@ -173,6 +173,46 @@ def applies_overlap(left: str, right: str) -> bool:
173
173
  return False
174
174
 
175
175
 
176
def normalized_key(title: str, applies_to: str = "") -> str:
    """Build a stable dedup key for a learning.

    The key is the title passed through ``_normalize_text``. When
    ``applies_to`` yields at least one non-empty normalised token, the unique
    tokens are sorted, comma-joined and appended after a ``|`` separator, so
    the order and repetition of scope tokens do not affect the key.

    Public thin wrapper so callers (e.g. the nightly consolidation brief
    builder) depend only on the resolver's public surface for dedup math.
    """
    key = _normalize_text(title)
    scope = {_normalize_applies_token(token) for token in _split_applies_to(applies_to)}
    scope.discard("")
    if not scope:
        return key
    return f"{key}|{','.join(sorted(scope))}"
191
+
192
+
193
def candidate_similarity(text_a: str, text_b: str) -> float:
    """Score the similarity of two free-text snippets with the resolver's math.

    Delegates to ``hybrid_similarity_score`` using ``extract_keywords`` and
    fixed thresholds (strong 0.82, moderate 0.74, keyword floor 0.08).
    Returns 0.0 without calling the scorer when either side is empty after
    stripping.

    NOTE(review): the thresholds are meant to match the per-candidate
    ``_similarity`` path so consolidation_prep and the resolver stay in
    lockstep — confirm they are kept in sync.
    """
    first, second = (str(value or "").strip() for value in (text_a, text_b))
    if not (first and second):
        return 0.0
    score = hybrid_similarity_score(
        first,
        second,
        keyword_extractor=extract_keywords,
        strong_semantic_threshold=0.82,
        moderate_semantic_threshold=0.74,
        moderate_keyword_floor=0.08,
    )
    return float(score)
214
+
215
+
176
216
  def _table_columns(conn: sqlite3.Connection, table: str) -> set[str]:
177
217
  try:
178
218
  return {str(row["name"]) for row in conn.execute(f"PRAGMA table_info({table})").fetchall()}
@@ -413,7 +453,9 @@ __all__ = [
413
453
  "CANONICAL_ACTIONS",
414
454
  "applies_overlap",
415
455
  "authority_rank",
456
+ "candidate_similarity",
416
457
  "looks_contradictory",
417
458
  "normalize_authority",
459
+ "normalized_key",
418
460
  "resolve_learning_candidate",
419
461
  ]