kc-cli 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. kc/__init__.py +5 -0
  2. kc/__main__.py +11 -0
  3. kc/artifacts/__init__.py +1 -0
  4. kc/artifacts/diff.py +76 -0
  5. kc/artifacts/frontmatter.py +26 -0
  6. kc/artifacts/markdown.py +116 -0
  7. kc/atomic_write.py +33 -0
  8. kc/cli.py +284 -0
  9. kc/commands/__init__.py +1 -0
  10. kc/commands/artifact.py +1190 -0
  11. kc/commands/citation.py +231 -0
  12. kc/commands/common.py +346 -0
  13. kc/commands/conformance.py +293 -0
  14. kc/commands/context.py +190 -0
  15. kc/commands/doctor.py +81 -0
  16. kc/commands/eval.py +133 -0
  17. kc/commands/export.py +97 -0
  18. kc/commands/guide.py +571 -0
  19. kc/commands/index.py +54 -0
  20. kc/commands/init.py +207 -0
  21. kc/commands/lint.py +238 -0
  22. kc/commands/source.py +464 -0
  23. kc/commands/status.py +52 -0
  24. kc/commands/task.py +260 -0
  25. kc/config.py +127 -0
  26. kc/embedding_models/potion-base-8M/README.md +97 -0
  27. kc/embedding_models/potion-base-8M/config.json +13 -0
  28. kc/embedding_models/potion-base-8M/model.safetensors +0 -0
  29. kc/embedding_models/potion-base-8M/modules.json +14 -0
  30. kc/embedding_models/potion-base-8M/tokenizer.json +1 -0
  31. kc/errors.py +141 -0
  32. kc/fingerprints.py +35 -0
  33. kc/ids.py +23 -0
  34. kc/locks.py +65 -0
  35. kc/models/__init__.py +17 -0
  36. kc/models/artifact.py +34 -0
  37. kc/models/citation.py +60 -0
  38. kc/models/context.py +23 -0
  39. kc/models/eval.py +21 -0
  40. kc/models/plan.py +37 -0
  41. kc/models/source.py +37 -0
  42. kc/models/source_range.py +29 -0
  43. kc/models/source_revision.py +19 -0
  44. kc/models/task.py +35 -0
  45. kc/output.py +838 -0
  46. kc/paths.py +126 -0
  47. kc/provenance/__init__.py +1 -0
  48. kc/provenance/citations.py +296 -0
  49. kc/search/__init__.py +1 -0
  50. kc/search/extract.py +268 -0
  51. kc/search/fts.py +284 -0
  52. kc/search/semantic.py +346 -0
  53. kc/store/__init__.py +1 -0
  54. kc/store/jsonl.py +55 -0
  55. kc/store/sqlite.py +444 -0
  56. kc/store/transaction.py +67 -0
  57. kc/templates/agents/skills/kc/SKILL.md +282 -0
  58. kc/templates/agents/skills/kc/agents/openai.yaml +5 -0
  59. kc/templates/agents/skills/kc/scripts/resolve_query_citations.py +134 -0
  60. kc/workspace.py +98 -0
  61. kc_cli-0.4.0.dist-info/METADATA +522 -0
  62. kc_cli-0.4.0.dist-info/RECORD +65 -0
  63. kc_cli-0.4.0.dist-info/WHEEL +4 -0
  64. kc_cli-0.4.0.dist-info/entry_points.txt +2 -0
  65. kc_cli-0.4.0.dist-info/licenses/LICENSE +21 -0
kc/search/fts.py ADDED
@@ -0,0 +1,284 @@
1
+ """Range retrieval over SQLite FTS5 plus semantic vectors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sqlite3
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Any
9
+ from urllib.parse import quote
10
+
11
+ import orjson
12
+
13
+ from kc.errors import KcError
14
+ from kc.models.source import SourceRecord
15
+ from kc.models.source_range import SourceRangeRecord
16
+ from kc.paths import ensure_data_dir_exists
17
+ from kc.search.semantic import (
18
+ assert_semantic_index_ready,
19
+ build_semantic_index,
20
+ semantic_index_status,
21
+ semantic_rankings,
22
+ )
23
+ from kc.store.jsonl import read_jsonl
24
+ from kc.store.sqlite import index_status, init_db, rebuild_index
25
+
26
+
27
@dataclass(frozen=True)
class Bm25Rank:
    """Immutable record of where one source range landed in the BM25 ordering."""

    # Identifier of the matched source range.
    range_id: str
    # Position in the BM25 result list (the producer numbers from 1).
    rank: int
    # bm25() value reported by SQLite for this range.
    score: float
32
+
33
+
34
+ @dataclass(frozen=True)
35
+ class CombinedRank:
36
+ range_id: str
37
+ bm25_rank: int | None = None
38
+ bm25_score: float | None = None
39
+ semantic_rank: int | None = None
40
+ semantic_score: float | None = None
41
+ rrf_score: float | None = None
42
+
43
+
44
def ensure_index(db_path: Path, sources_path: Path, ranges_path: Path) -> None:
    """Bring the SQLite index and the semantic vectors up to date.

    Loads source and range records from their JSONL files, rebuilds the SQLite
    index when it is missing or stale, then rebuilds the semantic index when
    its metadata or vectors are absent or out of date.

    Raises:
        KcError: Any error from the semantic build other than
            ``KC_RETRIEVAL_MODEL_UNAVAILABLE``, which is tolerated silently.
    """
    ensure_data_dir_exists()
    sources = read_jsonl(sources_path, SourceRecord)
    ranges = read_jsonl(ranges_path, SourceRangeRecord)

    fts_state = index_status(db_path, sources, ranges)
    if not fts_state["sqlite_exists"] or fts_state["stale"]:
        rebuild_index(db_path, sources, ranges)

    vector_state = semantic_index_status(db_path, ranges)
    vectors_current = (
        vector_state["index_metadata"]
        and vector_state["metadata_match"]
        and not vector_state["missing_vectors"]
        and not vector_state["stale_vectors"]
    )
    if vectors_current:
        return

    try:
        build_semantic_index(db_path, ranges)
    except KcError as exc:
        # A missing or broken embedding model must not block lexical indexing.
        if exc.code == "KC_RETRIEVAL_MODEL_UNAVAILABLE":
            return
        raise
63
+
64
+
65
def _build_fts_query(query: str) -> str:
    """Turn free text into an FTS5 MATCH expression of OR-ed quoted terms.

    The query is split on whitespace, double quotes are removed from each
    token, and each remaining token is wrapped in double quotes so FTS5
    treats it as a literal string rather than query syntax.

    Returns:
        The MATCH expression, or an empty string when no usable term is left
        (for example a blank query or one made only of quote characters).
    """
    cleaned = (token.replace('"', "").strip() for token in query.split())
    # Filter after stripping quotes: a token made only of quote characters
    # would otherwise become an empty "" phrase in the MATCH expression.
    return " OR ".join(f'"{term}"' for term in cleaned if term)
68
+
69
+
70
+ def citation_token(
71
+ source_id: str,
72
+ locator: dict[str, Any],
73
+ *,
74
+ range_id: str | None = None,
75
+ legacy: bool = False,
76
+ ) -> str:
77
+ prefix = f"[kc:{source_id}"
78
+ if range_id and not legacy:
79
+ prefix += f":{range_id}"
80
+ if locator.get("kind") == "line_range":
81
+ return f"{prefix}:L{locator.get('start_line')}-L{locator.get('end_line')}]"
82
+ if locator.get("kind") == "json_pointer":
83
+ pointer = quote(str(locator.get("pointer", "/")), safe="/~")
84
+ return f"{prefix}:JP:{pointer}]"
85
+ if locator.get("kind") == "csv_row_range":
86
+ return f"{prefix}:CSV:R{locator.get('start_row')}-R{locator.get('end_row')}]"
87
+ return f"{prefix}]"
88
+
89
+
90
def rrf_score(ranks: list[int], *, k: int = 60) -> float:
    """Return the reciprocal-rank-fusion score: the sum of ``1 / (k + rank)``.

    Args:
        ranks: Rank positions contributed by the individual rankers.
        k: Constant added to every rank before taking the reciprocal.
    """
    contributions = [1.0 / (k + position) for position in ranks]
    return sum(contributions)
92
+
93
+
94
def _connect(db_path: Path) -> sqlite3.Connection:
    """Open the SQLite database at ``db_path`` with name-addressable rows."""
    connection = sqlite3.connect(str(db_path))
    # sqlite3.Row lets callers read columns by name.
    connection.row_factory = sqlite3.Row
    return connection
98
+
99
+
100
def _bm25_rankings(
    conn: sqlite3.Connection,
    query: str,
    *,
    domain: str | None = None,
    limit: int = 100,
) -> list[Bm25Rank]:
    """Run the lexical search and return the hits ranked by BM25.

    Args:
        conn: Open connection to the index database.
        query: Free-text query; converted to an FTS5 MATCH expression.
        domain: Optional substring that the FTS ``domain`` column must contain.
        limit: Maximum number of rows to fetch.

    Returns:
        Hits numbered from 1 in ascending ``bm25()`` order, ties broken by
        range id. Empty when the query yields no searchable term.

    Raises:
        KcError: ``KC_INDEX_BUILD_FAILED`` when SQLite rejects the statement.
    """
    match_expr = _build_fts_query(query)
    if not match_expr:
        return []

    statement = """
        SELECT
            f.range_id,
            bm25(source_ranges_fts) AS bm25_score
        FROM source_ranges_fts f
        JOIN source_ranges r ON r.range_id = f.range_id
        JOIN sources s ON s.source_id = f.source_id
        WHERE source_ranges_fts MATCH ?
    """
    bindings: list[Any] = [match_expr]
    if domain:
        statement += " AND f.domain LIKE ?"
        bindings.append(f"%{domain}%")
    statement += " ORDER BY bm25_score, f.range_id LIMIT ?"
    bindings.append(limit)

    try:
        hits = conn.execute(statement, bindings).fetchall()
    except sqlite3.OperationalError as exc:
        raise KcError(
            code="KC_INDEX_BUILD_FAILED",
            message=f"Search failed: {exc}",
            details={"query": query},
        ) from exc

    ranked: list[Bm25Rank] = []
    for position, hit in enumerate(hits, start=1):
        ranked.append(
            Bm25Rank(range_id=hit["range_id"], rank=position, score=float(hit["bm25_score"]))
        )
    return ranked
137
+
138
+
139
def _rows_for_ranges(conn: sqlite3.Connection, range_ids: list[str]) -> dict[str, sqlite3.Row]:
    """Fetch range rows joined with their source, keyed by range id.

    Ids that match no row are simply absent from the returned mapping; an
    empty id list returns an empty mapping without touching the database.
    """
    if not range_ids:
        return {}

    # One "?" per id keeps the IN (...) list fully parameterised.
    marks = ", ".join(["?"] * len(range_ids))
    cursor = conn.execute(
        f"""
        SELECT
            r.range_id,
            r.source_id,
            r.locator_json,
            r.source_fingerprint,
            r.record_json AS range_record_json,
            r.excerpt,
            s.display_name,
            s.status AS source_status,
            s.authority_json,
            s.fingerprint AS current_source_fingerprint
        FROM source_ranges r
        JOIN sources s ON s.source_id = r.source_id
        WHERE r.range_id IN ({marks})
        """,
        range_ids,
    )
    lookup: dict[str, sqlite3.Row] = {}
    for record in cursor.fetchall():
        lookup[record["range_id"]] = record
    return lookup
163
+
164
+
165
def _format_result(
    row: sqlite3.Row,
    rank: CombinedRank,
    *,
    hybrid_rank: int,
) -> dict[str, Any]:
    """Assemble the result dictionary for one ranked source range.

    Args:
        row: Joined range/source row; its ``locator_json`` and
            ``authority_json`` columns are decoded from JSON.
        rank: Per-ranker positions and scores for the range.
        hybrid_rank: Final position of the range in the fused ordering.
    """
    locator = orjson.loads(row["locator_json"])
    authority = orjson.loads(row["authority_json"])
    range_id = row["range_id"]
    source_id = row["source_id"]

    scores = {
        "bm25_rank": rank.bm25_rank,
        "bm25_score": rank.bm25_score,
        "semantic_rank": rank.semantic_rank,
        "semantic_score": rank.semantic_score,
        "hybrid_rank": hybrid_rank,
        "rrf_score": rank.rrf_score,
    }
    # Two token flavours: the current one carries the range id, the legacy one does not.
    current_token = citation_token(source_id, locator, range_id=range_id)
    legacy_token = citation_token(source_id, locator, legacy=True)

    return {
        "range_id": range_id,
        "source_id": source_id,
        "display_name": row["display_name"],
        "locator": locator,
        "excerpt": row["excerpt"],
        "scores": scores,
        "citation_token": current_token,
        "legacy_citation_token": legacy_token,
        "source_authority": authority,
        "source_status": row["source_status"],
        "source_fingerprint": row["source_fingerprint"],
        "current_source_fingerprint": row["current_source_fingerprint"],
    }
194
+
195
+
196
def _combine_hybrid(
    bm25: list[Bm25Rank],
    semantic: list[Any],
    *,
    rrf_k: int,
    limit: int,
) -> list[CombinedRank]:
    """Fuse the lexical and semantic rankings with reciprocal rank fusion.

    Args:
        bm25: Lexical hits.
        semantic: Semantic hits; each item exposes ``range_id``, ``rank`` and
            ``score``.
        rrf_k: Constant forwarded to ``rrf_score``.
        limit: Number of fused entries to keep.

    Returns:
        Fused entries ordered by descending RRF score, then BM25 rank, then
        semantic rank, then range id; truncated to ``limit``.
    """
    # Later duplicates of a range id overwrite earlier ones within each list.
    lexical = {hit.range_id: hit for hit in bm25}
    vector = {hit.range_id: hit for hit in semantic}

    fused: list[CombinedRank] = []
    for range_id in lexical.keys() | vector.keys():
        lex_hit = lexical.get(range_id)
        vec_hit = vector.get(range_id)
        bm25_rank = lex_hit.rank if lex_hit is not None else None
        semantic_rank = vec_hit.rank if vec_hit is not None else None
        present_ranks = [r for r in (bm25_rank, semantic_rank) if r is not None]
        fused.append(
            CombinedRank(
                range_id=range_id,
                bm25_rank=bm25_rank,
                bm25_score=lex_hit.score if lex_hit is not None else None,
                semantic_rank=semantic_rank,
                semantic_score=vec_hit.score if vec_hit is not None else None,
                rrf_score=rrf_score(present_ranks, k=rrf_k),
            )
        )

    # A range missing from one ranker sorts after every range that ranker returned.
    absent = 1_000_000

    def ordering(entry: CombinedRank) -> tuple[float, int, int, str]:
        return (
            -(entry.rrf_score or 0.0),
            absent if entry.bm25_rank is None else entry.bm25_rank,
            absent if entry.semantic_rank is None else entry.semantic_rank,
            entry.range_id,
        )

    return sorted(fused, key=ordering)[:limit]
243
+
244
+
245
def search_ranges(
    db_path: Path,
    query: str,
    *,
    domain: str | None = None,
    limit: int = 10,
    rrf_k: int = 60,
    ranges: list[SourceRangeRecord] | None = None,
    metadata: dict[str, Any] | None = None,
) -> list[dict[str, Any]]:
    """Search source ranges with BM25 plus semantic ranking, fused by RRF.

    When the semantic model or index is unavailable the search falls back to
    lexical results only.

    Args:
        db_path: Location of the SQLite index.
        query: Free-text query.
        domain: Optional domain filter passed to both rankers.
        limit: Maximum number of results to return.
        rrf_k: Constant used by reciprocal rank fusion.
        ranges: Range records used to validate the semantic index; treated as
            an empty list when omitted.
        metadata: Optional dict that receives ``mode`` (``"hybrid"`` or
            ``"fts_fallback"``) and ``semantic_unavailable_reason``.

    Returns:
        Formatted result dictionaries in fused-rank order.

    Raises:
        KcError: Any error other than ``KC_RETRIEVAL_MODEL_UNAVAILABLE``.
    """
    init_db(db_path)
    conn = _connect(db_path)
    try:
        # Over-fetch from each ranker so fusion has enough candidates.
        pool_size = max(limit * 5, 100)
        known_ranges = [] if ranges is None else ranges
        lexical_hits = _bm25_rankings(conn, query, domain=domain, limit=pool_size)

        mode = "hybrid"
        fallback_reason: str | None = None
        try:
            model = assert_semantic_index_ready(db_path, known_ranges)
            semantic_hits = semantic_rankings(conn, query, model, domain=domain, limit=pool_size)
        except KcError as exc:
            if exc.code != "KC_RETRIEVAL_MODEL_UNAVAILABLE":
                raise
            semantic_hits = []
            mode = "fts_fallback"
            fallback_reason = exc.message

        fused = _combine_hybrid(lexical_hits, semantic_hits, rrf_k=rrf_k, limit=limit)
        if metadata is not None:
            metadata["mode"] = mode
            metadata["semantic_unavailable_reason"] = fallback_reason

        rows = _rows_for_ranges(conn, [entry.range_id for entry in fused])
        # Ranges without a joined row are skipped but still consume their rank number.
        return [
            _format_result(rows[entry.range_id], entry, hybrid_rank=position)
            for position, entry in enumerate(fused, start=1)
            if entry.range_id in rows
        ]
    finally:
        conn.close()
kc/search/semantic.py ADDED
@@ -0,0 +1,346 @@
1
+ """Local semantic retrieval support for source ranges."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import sqlite3
7
+ from dataclasses import dataclass
8
+ from datetime import UTC, datetime
9
+ from functools import lru_cache
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ import numpy as np
14
+ import numpy.typing as npt
15
+ import orjson
16
+
17
+ from kc.errors import KcError
18
+ from kc.models.source_range import SourceRangeRecord
19
+ from kc.store.sqlite import init_db
20
+
21
# Embedding backend and the model directory shipped inside the package.
MODEL_PROVIDER = "model2vec"
BUNDLED_MODEL_NAME = "potion-base-8M"
# Values the bundled model is validated against when it is loaded.
EXPECTED_DIMENSION = 256
EXPECTED_CHECKSUM = "sha256:1630533e875a4725d11a703f48cb9598e59f3cfe3fa0d5a19a54045d52922848"
# Key of the semantic-model row in the index_metadata table.
SEMANTIC_METADATA_KEY = "semantic_model"
# File suffixes whose line endings are normalised before checksumming.
TEXT_MODEL_SUFFIXES = {".json", ".md"}
27
+
28
+
29
@dataclass(frozen=True)
class SemanticRank:
    """Immutable record of where one source range landed in the semantic ordering."""

    # Identifier of the scored source range.
    range_id: str
    # Position in the similarity ordering (the producer numbers from 1).
    rank: int
    # Cosine similarity between the query and the range embedding.
    score: float
34
+
35
+
36
def bundled_model_dir() -> Path:
    """Return the directory of the bundled embedding model.

    The path is ``embedding_models/<BUNDLED_MODEL_NAME>`` under the directory
    two levels above this file.
    """
    package_root = Path(__file__).resolve().parent.parent
    return package_root.joinpath("embedding_models", BUNDLED_MODEL_NAME)
38
+
39
+
40
def model_directory_checksum(model_dir: Path) -> str:
    """Return a ``sha256:<hex>`` digest over every file under ``model_dir``.

    Files are visited in order of their POSIX-style relative path. For each
    file the digest absorbs the relative path, a NUL byte, the file content
    (as returned by ``_checksum_file_bytes``) and another NUL byte.
    """
    root = Path(model_dir)
    files_by_name = {
        entry.relative_to(root).as_posix(): entry
        for entry in root.rglob("*")
        if entry.is_file()
    }
    hasher = hashlib.sha256()
    for relative_name in sorted(files_by_name):
        hasher.update(relative_name.encode("utf-8"))
        hasher.update(b"\0")
        hasher.update(_checksum_file_bytes(files_by_name[relative_name]))
        hasher.update(b"\0")
    return f"sha256:{hasher.hexdigest()}"
51
+
52
+
53
def _checksum_file_bytes(path: Path) -> bytes:
    """Read ``path`` as bytes, normalising line endings for text model files.

    Files whose suffix is in ``TEXT_MODEL_SUFFIXES`` have ``\\r\\n`` and lone
    ``\\r`` replaced by ``\\n``; all other files are returned unchanged.
    """
    raw = path.read_bytes()
    if path.suffix.lower() not in TEXT_MODEL_SUFFIXES:
        return raw
    return raw.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
58
+
59
+
60
def _model_unavailable(message: str, **details: Any) -> KcError:
    """Build (not raise) a ``KC_RETRIEVAL_MODEL_UNAVAILABLE`` error.

    Args:
        message: Human-readable explanation.
        **details: Extra context stored as the error's ``details`` mapping.
    """
    error = KcError(
        code="KC_RETRIEVAL_MODEL_UNAVAILABLE",
        message=message,
        details=details,
        suggested_action="run kc index build after fixing the local model",
    )
    return error
67
+
68
+
69
def semantic_model_metadata(model: Any | None = None) -> dict[str, Any]:
    """Describe the bundled semantic model.

    Args:
        model: Optional loaded model; its ``dim`` attribute is used as the
            dimension when present and truthy, otherwise
            ``EXPECTED_DIMENSION`` is reported.

    Returns:
        A dict with ``provider``, ``model``, ``dimension``, ``checksum``
        (computed from the bundled model directory on every call) and
        ``purpose``.
    """
    reported_dim = getattr(model, "dim", None)
    dimension = int(reported_dim or EXPECTED_DIMENSION)
    checksum = model_directory_checksum(bundled_model_dir())
    return {
        "provider": MODEL_PROVIDER,
        "model": BUNDLED_MODEL_NAME,
        "dimension": dimension,
        "checksum": checksum,
        "purpose": "ranking_only",
    }
78
+
79
+
80
@lru_cache(maxsize=1)
def load_semantic_model() -> Any:
    """Load and validate the bundled model2vec model.

    The result of a successful call is memoised by ``lru_cache``.

    Raises:
        KcError: ``KC_RETRIEVAL_MODEL_UNAVAILABLE`` when the model directory
            is missing, its checksum differs from ``EXPECTED_CHECKSUM``,
            ``model2vec`` cannot be imported, loading fails, or the model
            reports a dimension other than ``EXPECTED_DIMENSION``.
    """
    model_dir = bundled_model_dir()
    location = str(model_dir)
    if not model_dir.exists():
        raise _model_unavailable("Bundled semantic model directory is missing.", model_dir=location)

    actual_checksum = model_directory_checksum(model_dir)
    if actual_checksum != EXPECTED_CHECKSUM:
        raise _model_unavailable(
            "Bundled semantic model checksum does not match the configured checksum.",
            model_dir=location,
            expected_checksum=EXPECTED_CHECKSUM,
            actual_checksum=actual_checksum,
        )

    # Imported lazily so a missing optional dependency surfaces as a KcError.
    try:
        from model2vec import StaticModel
    except ImportError as exc:
        raise _model_unavailable("model2vec is not installed.", dependency="model2vec") from exc

    try:
        loaded = StaticModel.from_pretrained(location)
    except Exception as exc:
        raise _model_unavailable("Failed to load bundled semantic model.", model_dir=location) from exc

    # A model that reports no dimension (missing or zero ``dim``) is accepted as-is.
    reported_dim = int(getattr(loaded, "dim", 0) or 0)
    if reported_dim and reported_dim != EXPECTED_DIMENSION:
        raise _model_unavailable(
            "Bundled semantic model dimension does not match the configured dimension.",
            expected_dimension=EXPECTED_DIMENSION,
            actual_dimension=reported_dim,
        )
    return loaded
109
+
110
+
111
def is_model_available() -> tuple[bool, dict[str, Any] | None, str | None]:
    """Report whether the bundled semantic model can be loaded.

    Returns:
        ``(True, metadata, None)`` on success, or ``(False, None, message)``
        carrying the ``KcError`` message when loading or describing the model
        fails.
    """
    try:
        metadata = semantic_model_metadata(load_semantic_model())
    except KcError as exc:
        return False, None, exc.message
    return True, metadata, None
117
+
118
+
119
+ def _encode(model: Any, texts: str | list[str]) -> npt.NDArray[np.float32]:
120
+ try:
121
+ encoded = model.encode(texts, show_progress_bar=False, use_multiprocessing=False)
122
+ except TypeError:
123
+ encoded = model.encode(texts)
124
+ return np.asarray(encoded, dtype=np.float32)
125
+
126
+
127
def embed_text(model: Any, text: str) -> npt.NDArray[np.float32]:
    """Embed one string and return it as a flat float32 vector."""
    encoded = _encode(model, text)
    flat = encoded.reshape(-1)
    return flat.astype(np.float32)
129
+
130
+
131
def embed_texts(model: Any, texts: list[str]) -> npt.NDArray[np.float32]:
    """Embed a list of strings into a float32 array with one row per text.

    An empty list yields an array of shape ``(0, EXPECTED_DIMENSION)``
    without calling the model. A one-dimensional encoder result is promoted
    to a single row.
    """
    if not texts:
        return np.empty((0, EXPECTED_DIMENSION), dtype=np.float32)
    matrix = _encode(model, texts)
    if matrix.ndim == 1:
        matrix = matrix[np.newaxis, :]
    return matrix.astype(np.float32)
138
+
139
+
140
def embedding_to_blob(embedding: npt.NDArray[np.float32]) -> bytes:
    """Serialise an embedding to raw bytes: flattened float32 values."""
    flat = np.ravel(np.asarray(embedding, dtype=np.float32))
    return flat.tobytes()
142
+
143
+
144
def blob_to_embedding(blob: bytes) -> npt.NDArray[np.float32]:
    """Interpret raw bytes as a one-dimensional float32 array."""
    vector = np.frombuffer(blob, dtype=np.float32)
    return vector
146
+
147
+
148
def cosine_similarity(a: npt.NDArray[np.float32], b: npt.NDArray[np.float32]) -> float:
    """Return the cosine similarity of two vectors.

    Both inputs are flattened and converted to float32. The result is
    ``0.0`` when either vector has zero length.
    """
    left = np.asarray(a, dtype=np.float32).reshape(-1)
    right = np.asarray(b, dtype=np.float32).reshape(-1)
    magnitude = np.linalg.norm(left) * np.linalg.norm(right)
    # Guard the division: a zero vector has no direction.
    return float(np.dot(left, right) / magnitude) if magnitude != 0 else 0.0
155
+
156
+
157
+ def _now() -> str:
158
+ return datetime.now(UTC).isoformat()
159
+
160
+
161
def _connect(db_path: Path) -> sqlite3.Connection:
    """Open the SQLite database at ``db_path`` with name-addressable rows."""
    connection = sqlite3.connect(str(db_path))
    # sqlite3.Row lets callers read columns by name.
    connection.row_factory = sqlite3.Row
    return connection
165
+
166
+
167
def _table_exists(conn: sqlite3.Connection, table: str) -> bool:
    """Return whether a table or view named ``table`` exists in the database."""
    lookup = conn.execute(
        "SELECT name FROM sqlite_master WHERE type IN ('table', 'view') AND name = ?",
        (table,),
    )
    return lookup.fetchone() is not None
173
+
174
+
175
def _metadata_from_db(conn: sqlite3.Connection) -> dict[str, Any] | None:
    """Read the stored semantic-model metadata from ``index_metadata``.

    Returns:
        The decoded JSON value stored under ``SEMANTIC_METADATA_KEY``, or
        ``None`` when no such row exists.
    """
    cursor = conn.execute(
        "SELECT value_json FROM index_metadata WHERE key = ?",
        (SEMANTIC_METADATA_KEY,),
    )
    found = cursor.fetchone()
    return None if found is None else orjson.loads(found["value_json"])
180
+
181
+
182
def build_semantic_index(db_path: Path, ranges: list[SourceRangeRecord]) -> dict[str, Any]:
    """Re-embed every range and replace the stored semantic index.

    All existing rows in ``source_range_embeddings`` are deleted and one row
    per range is inserted; the model metadata (plus build time and range
    count) is written to ``index_metadata``. The changes are committed
    together.

    Args:
        db_path: Location of the SQLite index.
        ranges: Range records whose ``excerpt`` is embedded.

    Returns:
        ``{"enabled": True, "model": <stored metadata>, "embeddings": <count>}``.

    Raises:
        KcError: ``KC_RETRIEVAL_MODEL_UNAVAILABLE`` when the model cannot be
            loaded or returns embeddings of the wrong count or dimension.
    """
    init_db(db_path)
    model = load_semantic_model()
    metadata = semantic_model_metadata(model)
    vectors = embed_texts(model, [item.excerpt for item in ranges])

    if len(vectors) != len(ranges):
        raise _model_unavailable(
            "Semantic model returned an unexpected number of embeddings.",
            expected=len(ranges),
            actual=len(vectors),
        )

    expected_dim = int(metadata["dimension"])
    # With no ranges there is nothing to measure, so trust the metadata.
    if vectors.ndim == 2 and ranges:
        actual_dim = int(vectors.shape[1])
    else:
        actual_dim = expected_dim
    if actual_dim != expected_dim:
        raise _model_unavailable(
            "Semantic model returned embeddings with an unexpected dimension.",
            expected_dimension=metadata["dimension"],
            actual_dimension=actual_dim,
        )

    conn = _connect(db_path)
    try:
        timestamp = _now()
        payload = [
            (
                source_range.range_id,
                source_range.source_id,
                source_range.source_fingerprint,
                source_range.text_hash,
                metadata["model"],
                metadata["checksum"],
                metadata["dimension"],
                embedding_to_blob(vector),
                timestamp,
            )
            for source_range, vector in zip(ranges, vectors)
        ]
        conn.execute("DELETE FROM source_range_embeddings")
        conn.executemany(
            """
            INSERT INTO source_range_embeddings(
                range_id, source_id, source_fingerprint, text_hash, model_name,
                model_checksum, dimension, embedding, updated_at
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            payload,
        )
        stored_metadata = metadata | {"built_at": timestamp, "ranges": len(ranges)}
        conn.execute(
            "INSERT OR REPLACE INTO index_metadata(key, value_json, updated_at) VALUES (?, ?, ?)",
            (SEMANTIC_METADATA_KEY, orjson.dumps(stored_metadata).decode(), timestamp),
        )
        conn.commit()
        return {"enabled": True, "model": stored_metadata, "embeddings": len(ranges)}
    finally:
        conn.close()
236
+
237
+
238
def semantic_index_status(db_path: Path, ranges: list[SourceRangeRecord] | None = None) -> dict[str, Any]:
    """Report the state of the semantic model and its stored index.

    Args:
        db_path: Location of the SQLite index.
        ranges: When given, each range is compared with its stored vector to
            count missing and stale entries.

    Returns:
        A dict with ``model_available``, ``model``, ``unavailable_reason``,
        ``index_metadata``, ``metadata_match``, ``vector_count``,
        ``missing_vectors`` and ``stale_vectors``. The last two stay ``None``
        when ``ranges`` is omitted or when the database or the embeddings
        table does not exist.
    """
    available, model_metadata, unavailable_reason = is_model_available()
    status: dict[str, Any] = {
        "model_available": available,
        "model": model_metadata,
        "unavailable_reason": unavailable_reason,
        "index_metadata": None,
        "metadata_match": False,
        "vector_count": 0,
        "missing_vectors": None,
        "stale_vectors": None,
    }
    if not db_path.exists():
        return status

    conn = _connect(db_path)
    try:
        if not _table_exists(conn, "source_range_embeddings"):
            return status

        count_row = conn.execute("SELECT COUNT(*) FROM source_range_embeddings").fetchone()
        status["vector_count"] = int(count_row[0])

        index_metadata = _metadata_from_db(conn)
        status["index_metadata"] = index_metadata
        if model_metadata and index_metadata:
            compared_keys = ("provider", "model", "dimension", "checksum", "purpose")
            status["metadata_match"] = all(
                index_metadata.get(key) == model_metadata.get(key) for key in compared_keys
            )

        if ranges is None:
            return status

        stored: dict[str, sqlite3.Row] = {}
        for row in conn.execute(
            """
            SELECT range_id, source_fingerprint, text_hash, model_checksum, dimension
            FROM source_range_embeddings
            """
        ).fetchall():
            stored[row["range_id"]] = row

        model_info = model_metadata or {}
        checksum = model_info.get("checksum")
        dimension_value = model_info.get("dimension")
        dimension = None if dimension_value is None else int(dimension_value)

        missing = 0
        stale = 0
        for source_range in ranges:
            row = stored.get(source_range.range_id)
            if row is None:
                missing += 1
                continue
            # A vector is fresh only if it matches the range content and the current model.
            fresh = (
                row["source_fingerprint"] == source_range.source_fingerprint
                and row["text_hash"] == source_range.text_hash
                and row["model_checksum"] == checksum
                and dimension is not None
                and int(row["dimension"]) == dimension
            )
            if not fresh:
                stale += 1
        status["missing_vectors"] = missing
        status["stale_vectors"] = stale
        return status
    finally:
        conn.close()
297
+
298
+
299
def assert_semantic_index_ready(
    db_path: Path,
    ranges: list[SourceRangeRecord],
) -> Any:
    """Return the loaded model if the semantic index is usable for ``ranges``.

    Raises:
        KcError: ``KC_RETRIEVAL_MODEL_UNAVAILABLE`` when the model cannot be
            loaded, the stored index metadata is missing or does not match
            the model, or any vector is missing or stale.
    """
    model = load_semantic_model()
    status = semantic_index_status(db_path, ranges)

    metadata_ok = status["index_metadata"] and status["metadata_match"]
    if not metadata_ok:
        raise _model_unavailable(
            "Semantic index metadata is missing or stale. Run kc index build.",
            status=status,
        )

    missing = status["missing_vectors"]
    stale = status["stale_vectors"]
    if missing or stale:
        raise _model_unavailable(
            "Semantic index vectors are missing or stale. Run kc index build.",
            missing_vectors=missing,
            stale_vectors=stale,
        )
    return model
317
+
318
+
319
def semantic_rankings(
    conn: sqlite3.Connection,
    query: str,
    model: Any,
    *,
    domain: str | None = None,
    limit: int = 100,
) -> list[SemanticRank]:
    """Rank stored range embeddings by cosine similarity to the query.

    Args:
        conn: Open connection to the index database.
        query: Free-text query; embedded with ``model``.
        model: Loaded embedding model.
        domain: Optional substring that the source's ``domain_json`` column
            must contain.
        limit: Number of top-scoring ranges to keep.

    Returns:
        Entries numbered from 1 in descending similarity order, ties broken
        by range id.
    """
    query_vector = embed_text(model, query)

    statement = """
        SELECT e.range_id, e.embedding
        FROM source_range_embeddings e
        JOIN sources s ON s.source_id = e.source_id
    """
    bindings: list[Any] = []
    if domain:
        statement += " WHERE s.domain_json LIKE ?"
        bindings.append(f"%{domain}%")

    similarities: list[tuple[str, float]] = [
        (row["range_id"], cosine_similarity(query_vector, blob_to_embedding(row["embedding"])))
        for row in conn.execute(statement, bindings).fetchall()
    ]
    ordered = sorted(similarities, key=lambda pair: (-pair[1], pair[0]))

    top: list[SemanticRank] = []
    for position, (range_id, score) in enumerate(ordered[:limit], start=1):
        top.append(SemanticRank(range_id=range_id, rank=position, score=score))
    return top
kc/store/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Persistence helpers for kc."""
kc/store/jsonl.py ADDED
@@ -0,0 +1,55 @@
1
+ """JSONL canonical store helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterable, Sequence
6
+ from pathlib import Path
7
+
8
+ import orjson
9
+ from pydantic import BaseModel
10
+
11
+ from kc.atomic_write import atomic_write_bytes
12
+ from kc.errors import KcError
13
+
14
+
15
+ def read_jsonl[T: BaseModel](path: Path, model: type[T]) -> list[T]:
16
+ if not path.exists():
17
+ return []
18
+ records: list[T] = []
19
+ for line_no, line in enumerate(path.read_text(encoding="utf-8").splitlines(), start=1):
20
+ if not line.strip():
21
+ continue
22
+ try:
23
+ records.append(model.model_validate(orjson.loads(line)))
24
+ except Exception as exc:
25
+ raise KcError(
26
+ code="KC_CONFIG_INVALID",
27
+ message=f"Invalid JSONL record in {path} at line {line_no}: {exc}",
28
+ details={"path": str(path), "line": line_no},
29
+ ) from exc
30
+ return records
31
+
32
+
33
def write_jsonl(path: Path, records: Sequence[BaseModel]) -> None:
    """Replace ``path`` with one JSON line per record.

    The parent directory is created when needed, and the serialised content
    is handed to ``atomic_write_bytes`` in a single call.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = bytearray()
    for record in records:
        serialized.extend(
            orjson.dumps(record.model_dump(mode="json"), option=orjson.OPT_APPEND_NEWLINE)
        )
    atomic_write_bytes(path, bytes(serialized))
40
+
41
+
42
def append_jsonl(path: Path, records: Iterable[BaseModel]) -> None:
    """Append ``records`` to a JSONL file by rewriting the whole file.

    Existing non-blank lines are decoded and wrapped in ``_RawModel`` so
    ``write_jsonl`` can emit them again ahead of the new records.
    """
    carried_over: list[BaseModel] = []
    if path.exists():
        raw_lines = path.read_bytes().splitlines()
        # Decode every line before wrapping any of them.
        decoded = list(map(orjson.loads, filter(bytes.strip, raw_lines)))
        carried_over = [_RawModel(value=entry) for entry in decoded]
    write_jsonl(path, [*carried_over, *records])
49
+
50
+
51
class _RawModel(BaseModel):
    """Wrapper that carries an already-decoded JSON object through ``write_jsonl``."""

    # The decoded JSON object exactly as it was read from disk.
    value: dict

    def model_dump(self, *args, **kwargs):  # type: ignore[no-untyped-def]
        # Ignore all dump options and hand back the wrapped object itself
        # rather than a {"value": ...} wrapper.
        return self.value