kc-cli 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kc/__init__.py +5 -0
- kc/__main__.py +11 -0
- kc/artifacts/__init__.py +1 -0
- kc/artifacts/diff.py +76 -0
- kc/artifacts/frontmatter.py +26 -0
- kc/artifacts/markdown.py +116 -0
- kc/atomic_write.py +33 -0
- kc/cli.py +284 -0
- kc/commands/__init__.py +1 -0
- kc/commands/artifact.py +1190 -0
- kc/commands/citation.py +231 -0
- kc/commands/common.py +346 -0
- kc/commands/conformance.py +293 -0
- kc/commands/context.py +190 -0
- kc/commands/doctor.py +81 -0
- kc/commands/eval.py +133 -0
- kc/commands/export.py +97 -0
- kc/commands/guide.py +571 -0
- kc/commands/index.py +54 -0
- kc/commands/init.py +207 -0
- kc/commands/lint.py +238 -0
- kc/commands/source.py +464 -0
- kc/commands/status.py +52 -0
- kc/commands/task.py +260 -0
- kc/config.py +127 -0
- kc/embedding_models/potion-base-8M/README.md +97 -0
- kc/embedding_models/potion-base-8M/config.json +13 -0
- kc/embedding_models/potion-base-8M/model.safetensors +0 -0
- kc/embedding_models/potion-base-8M/modules.json +14 -0
- kc/embedding_models/potion-base-8M/tokenizer.json +1 -0
- kc/errors.py +141 -0
- kc/fingerprints.py +35 -0
- kc/ids.py +23 -0
- kc/locks.py +65 -0
- kc/models/__init__.py +17 -0
- kc/models/artifact.py +34 -0
- kc/models/citation.py +60 -0
- kc/models/context.py +23 -0
- kc/models/eval.py +21 -0
- kc/models/plan.py +37 -0
- kc/models/source.py +37 -0
- kc/models/source_range.py +29 -0
- kc/models/source_revision.py +19 -0
- kc/models/task.py +35 -0
- kc/output.py +838 -0
- kc/paths.py +126 -0
- kc/provenance/__init__.py +1 -0
- kc/provenance/citations.py +296 -0
- kc/search/__init__.py +1 -0
- kc/search/extract.py +268 -0
- kc/search/fts.py +284 -0
- kc/search/semantic.py +346 -0
- kc/store/__init__.py +1 -0
- kc/store/jsonl.py +55 -0
- kc/store/sqlite.py +444 -0
- kc/store/transaction.py +67 -0
- kc/templates/agents/skills/kc/SKILL.md +282 -0
- kc/templates/agents/skills/kc/agents/openai.yaml +5 -0
- kc/templates/agents/skills/kc/scripts/resolve_query_citations.py +134 -0
- kc/workspace.py +98 -0
- kc_cli-0.4.0.dist-info/METADATA +522 -0
- kc_cli-0.4.0.dist-info/RECORD +65 -0
- kc_cli-0.4.0.dist-info/WHEEL +4 -0
- kc_cli-0.4.0.dist-info/entry_points.txt +2 -0
- kc_cli-0.4.0.dist-info/licenses/LICENSE +21 -0
kc/search/fts.py
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"""Range retrieval over SQLite FTS5 plus semantic vectors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sqlite3
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
from urllib.parse import quote
|
|
10
|
+
|
|
11
|
+
import orjson
|
|
12
|
+
|
|
13
|
+
from kc.errors import KcError
|
|
14
|
+
from kc.models.source import SourceRecord
|
|
15
|
+
from kc.models.source_range import SourceRangeRecord
|
|
16
|
+
from kc.paths import ensure_data_dir_exists
|
|
17
|
+
from kc.search.semantic import (
|
|
18
|
+
assert_semantic_index_ready,
|
|
19
|
+
build_semantic_index,
|
|
20
|
+
semantic_index_status,
|
|
21
|
+
semantic_rankings,
|
|
22
|
+
)
|
|
23
|
+
from kc.store.jsonl import read_jsonl
|
|
24
|
+
from kc.store.sqlite import index_status, init_db, rebuild_index
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True)
class Bm25Rank:
    """Immutable record of where one source range landed in the BM25 ranking."""

    # Identifier of the ranked source range.
    range_id: str
    # Position in the BM25 result list.
    rank: int
    # BM25 score attached to this range.
    score: float
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass(frozen=True)
|
|
35
|
+
class CombinedRank:
|
|
36
|
+
range_id: str
|
|
37
|
+
bm25_rank: int | None = None
|
|
38
|
+
bm25_score: float | None = None
|
|
39
|
+
semantic_rank: int | None = None
|
|
40
|
+
semantic_score: float | None = None
|
|
41
|
+
rrf_score: float | None = None
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def ensure_index(db_path: Path, sources_path: Path, ranges_path: Path) -> None:
    """Bring the SQLite index and the semantic vectors in line with the JSONL stores.

    The source and range records are read from their JSONL files. The SQLite
    index is rebuilt when it does not exist or is reported stale. The
    semantic index is then rebuilt when its metadata is absent or does not
    match, or when any vector is missing or stale.

    Raises:
        KcError: any error from the semantic build other than
            ``KC_RETRIEVAL_MODEL_UNAVAILABLE``, which is tolerated.
    """
    ensure_data_dir_exists()
    source_records = read_jsonl(sources_path, SourceRecord)
    range_records = read_jsonl(ranges_path, SourceRangeRecord)

    fts_state = index_status(db_path, source_records, range_records)
    needs_rebuild = (not fts_state["sqlite_exists"]) or fts_state["stale"]
    if needs_rebuild:
        rebuild_index(db_path, source_records, range_records)

    vector_state = semantic_index_status(db_path, range_records)
    vectors_current = (
        vector_state["index_metadata"]
        and vector_state["metadata_match"]
        and not vector_state["missing_vectors"]
        and not vector_state["stale_vectors"]
    )
    if vectors_current:
        return

    try:
        build_semantic_index(db_path, range_records)
    except KcError as exc:
        # An unavailable model is not fatal here; any other failure is.
        if exc.code == "KC_RETRIEVAL_MODEL_UNAVAILABLE":
            return
        raise
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _build_fts_query(query: str) -> str:
|
|
66
|
+
terms = [t.replace('"', "").strip() for t in query.split() if t.strip()]
|
|
67
|
+
return " OR ".join(f'"{term}"' for term in terms)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def citation_token(
|
|
71
|
+
source_id: str,
|
|
72
|
+
locator: dict[str, Any],
|
|
73
|
+
*,
|
|
74
|
+
range_id: str | None = None,
|
|
75
|
+
legacy: bool = False,
|
|
76
|
+
) -> str:
|
|
77
|
+
prefix = f"[kc:{source_id}"
|
|
78
|
+
if range_id and not legacy:
|
|
79
|
+
prefix += f":{range_id}"
|
|
80
|
+
if locator.get("kind") == "line_range":
|
|
81
|
+
return f"{prefix}:L{locator.get('start_line')}-L{locator.get('end_line')}]"
|
|
82
|
+
if locator.get("kind") == "json_pointer":
|
|
83
|
+
pointer = quote(str(locator.get("pointer", "/")), safe="/~")
|
|
84
|
+
return f"{prefix}:JP:{pointer}]"
|
|
85
|
+
if locator.get("kind") == "csv_row_range":
|
|
86
|
+
return f"{prefix}:CSV:R{locator.get('start_row')}-R{locator.get('end_row')}]"
|
|
87
|
+
return f"{prefix}]"
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def rrf_score(ranks: list[int], *, k: int = 60) -> float:
    """Return the reciprocal-rank-fusion score for a set of ranks.

    Each rank contributes ``1 / (k + rank)``; the contributions are summed.

    Args:
        ranks: Rank positions from the individual rankers.
        k: Smoothing constant added to every rank.
    """
    contributions = [1.0 / (k + position) for position in ranks]
    return sum(contributions)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _connect(db_path: Path) -> sqlite3.Connection:
|
|
95
|
+
conn = sqlite3.connect(str(db_path))
|
|
96
|
+
conn.row_factory = sqlite3.Row
|
|
97
|
+
return conn
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _bm25_rankings(
    conn: sqlite3.Connection,
    query: str,
    *,
    domain: str | None = None,
    limit: int = 100,
) -> list[Bm25Rank]:
    """Rank source ranges for ``query`` with the FTS5 ``bm25()`` function.

    Args:
        conn: Open connection to the index database.
        query: Free-form search text.
        domain: Optional substring that the FTS ``domain`` column must contain.
        limit: Maximum number of rows to return.

    Returns:
        Matches ordered by ascending ``bm25()`` value (ties broken by range
        id), numbered from 1. Empty when the query has no usable terms.

    Raises:
        KcError: ``KC_INDEX_BUILD_FAILED`` when SQLite rejects the statement.
    """
    match_expr = _build_fts_query(query)
    if not match_expr:
        return []

    fragments = [
        """
        SELECT
            f.range_id,
            bm25(source_ranges_fts) AS bm25_score
        FROM source_ranges_fts f
        JOIN source_ranges r ON r.range_id = f.range_id
        JOIN sources s ON s.source_id = f.source_id
        WHERE source_ranges_fts MATCH ?
        """
    ]
    bindings: list[Any] = [match_expr]
    if domain:
        fragments.append(" AND f.domain LIKE ?")
        bindings.append(f"%{domain}%")
    # FTS5 bm25() is smaller for better matches, so ascending order is best-first.
    fragments.append(" ORDER BY bm25_score, f.range_id LIMIT ?")
    bindings.append(limit)
    statement = "".join(fragments)

    try:
        matches = conn.execute(statement, bindings).fetchall()
    except sqlite3.OperationalError as exc:
        raise KcError(
            code="KC_INDEX_BUILD_FAILED",
            message=f"Search failed: {exc}",
            details={"query": query},
        ) from exc

    ranked: list[Bm25Rank] = []
    for position, match in enumerate(matches, start=1):
        ranked.append(
            Bm25Rank(
                range_id=match["range_id"],
                rank=position,
                score=float(match["bm25_score"]),
            )
        )
    return ranked
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _rows_for_ranges(conn: sqlite3.Connection, range_ids: list[str]) -> dict[str, sqlite3.Row]:
|
|
140
|
+
if not range_ids:
|
|
141
|
+
return {}
|
|
142
|
+
placeholders = ", ".join("?" for _ in range_ids)
|
|
143
|
+
rows = conn.execute(
|
|
144
|
+
f"""
|
|
145
|
+
SELECT
|
|
146
|
+
r.range_id,
|
|
147
|
+
r.source_id,
|
|
148
|
+
r.locator_json,
|
|
149
|
+
r.source_fingerprint,
|
|
150
|
+
r.record_json AS range_record_json,
|
|
151
|
+
r.excerpt,
|
|
152
|
+
s.display_name,
|
|
153
|
+
s.status AS source_status,
|
|
154
|
+
s.authority_json,
|
|
155
|
+
s.fingerprint AS current_source_fingerprint
|
|
156
|
+
FROM source_ranges r
|
|
157
|
+
JOIN sources s ON s.source_id = r.source_id
|
|
158
|
+
WHERE r.range_id IN ({placeholders})
|
|
159
|
+
""",
|
|
160
|
+
range_ids,
|
|
161
|
+
).fetchall()
|
|
162
|
+
return {row["range_id"]: row for row in rows}
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _format_result(
    row: sqlite3.Row,
    rank: CombinedRank,
    *,
    hybrid_rank: int,
) -> dict[str, Any]:
    """Build the result dictionary for one ranked source range.

    Args:
        row: Joined range/source row carrying the columns read below.
        rank: Ranking details for this range.
        hybrid_rank: Final position of the range in the merged result list.

    Returns:
        Dictionary with identity fields, the decoded locator, the excerpt, a
        nested ``scores`` dictionary, both citation token forms, the decoded
        source authority and the stored and current source fingerprints.
    """
    locator = orjson.loads(row["locator_json"])
    authority = orjson.loads(row["authority_json"])
    source_id = row["source_id"]
    range_id = row["range_id"]

    scores = {
        "bm25_rank": rank.bm25_rank,
        "bm25_score": rank.bm25_score,
        "semantic_rank": rank.semantic_rank,
        "semantic_score": rank.semantic_score,
        "hybrid_rank": hybrid_rank,
        "rrf_score": rank.rrf_score,
    }
    # The legacy token form leaves the range id out.
    current_token = citation_token(source_id, locator, range_id=range_id)
    legacy_token = citation_token(source_id, locator, legacy=True)

    return {
        "range_id": range_id,
        "source_id": source_id,
        "display_name": row["display_name"],
        "locator": locator,
        "excerpt": row["excerpt"],
        "scores": scores,
        "citation_token": current_token,
        "legacy_citation_token": legacy_token,
        "source_authority": authority,
        "source_status": row["source_status"],
        "source_fingerprint": row["source_fingerprint"],
        "current_source_fingerprint": row["current_source_fingerprint"],
    }
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _combine_hybrid(
    bm25: list[Bm25Rank],
    semantic: list[Any],
    *,
    rrf_k: int,
    limit: int,
) -> list[CombinedRank]:
    """Fuse the BM25 and semantic rankings with reciprocal rank fusion.

    Args:
        bm25: BM25 ranking entries.
        semantic: Semantic ranking entries exposing ``range_id``, ``rank`` and
            ``score`` attributes.
        rrf_k: Smoothing constant forwarded to ``rrf_score``.
        limit: Maximum number of merged entries to return.

    Returns:
        Merged entries sorted by descending RRF score, then BM25 rank, then
        semantic rank, then range id, truncated to ``limit``.
    """
    fused: dict[str, CombinedRank] = {
        hit.range_id: CombinedRank(
            range_id=hit.range_id,
            bm25_rank=hit.rank,
            bm25_score=hit.score,
        )
        for hit in bm25
    }
    for hit in semantic:
        lexical = fused.get(hit.range_id)
        fused[hit.range_id] = CombinedRank(
            range_id=hit.range_id,
            bm25_rank=None if lexical is None else lexical.bm25_rank,
            bm25_score=None if lexical is None else lexical.bm25_score,
            semantic_rank=hit.rank,
            semantic_score=hit.score,
        )

    def _scored(entry: CombinedRank) -> CombinedRank:
        # Only the ranks that exist take part in the fusion score.
        present = [
            position
            for position in (entry.bm25_rank, entry.semantic_rank)
            if position is not None
        ]
        return CombinedRank(
            range_id=entry.range_id,
            bm25_rank=entry.bm25_rank,
            bm25_score=entry.bm25_score,
            semantic_rank=entry.semantic_rank,
            semantic_score=entry.semantic_score,
            rrf_score=rrf_score(present, k=rrf_k),
        )

    # An absent rank sorts after every real rank.
    absent_rank = 1_000_000

    def _order(entry: CombinedRank) -> tuple[float, int, int, str]:
        return (
            -(entry.rrf_score or 0.0),
            absent_rank if entry.bm25_rank is None else entry.bm25_rank,
            absent_rank if entry.semantic_rank is None else entry.semantic_rank,
            entry.range_id,
        )

    ordered = sorted((_scored(entry) for entry in fused.values()), key=_order)
    return ordered[:limit]
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def search_ranges(
    db_path: Path,
    query: str,
    *,
    domain: str | None = None,
    limit: int = 10,
    rrf_k: int = 60,
    ranges: list[SourceRangeRecord] | None = None,
    metadata: dict[str, Any] | None = None,
) -> list[dict[str, Any]]:
    """Search source ranges with BM25 plus semantic ranking.

    Args:
        db_path: Path of the SQLite index database.
        query: Free-form search text.
        domain: Optional domain substring filter passed to both rankers.
        limit: Maximum number of results to return.
        rrf_k: Smoothing constant for reciprocal rank fusion.
        ranges: Range records used for the semantic readiness check; ``None``
            is treated as an empty list.
        metadata: Optional dictionary that receives ``mode`` (``"hybrid"`` or
            ``"fts_fallback"``) and ``semantic_unavailable_reason``.

    Returns:
        Formatted result dictionaries in merged-rank order. Ranked ids with
        no matching database row are skipped.

    Raises:
        KcError: Any error other than ``KC_RETRIEVAL_MODEL_UNAVAILABLE``,
            which instead switches the search to FTS-only mode.
    """
    init_db(db_path)
    conn = _connect(db_path)
    try:
        # Each ranker returns a wider pool than the final page so fusion has choices.
        pool_size = max(limit * 5, 100)
        known_ranges = [] if ranges is None else ranges
        lexical = _bm25_rankings(conn, query, domain=domain, limit=pool_size)

        retrieval_mode = "hybrid"
        fallback_reason: str | None = None
        try:
            model = assert_semantic_index_ready(db_path, known_ranges)
            vector_hits = semantic_rankings(conn, query, model, domain=domain, limit=pool_size)
        except KcError as exc:
            if exc.code != "KC_RETRIEVAL_MODEL_UNAVAILABLE":
                raise
            vector_hits = []
            retrieval_mode = "fts_fallback"
            fallback_reason = exc.message

        ranked = _combine_hybrid(lexical, vector_hits, rrf_k=rrf_k, limit=limit)
        if metadata is not None:
            metadata.update(
                mode=retrieval_mode,
                semantic_unavailable_reason=fallback_reason,
            )

        details = _rows_for_ranges(conn, [entry.range_id for entry in ranked])
        return [
            _format_result(details[entry.range_id], entry, hybrid_rank=position)
            for position, entry in enumerate(ranked, start=1)
            if entry.range_id in details
        ]
    finally:
        conn.close()
|
kc/search/semantic.py
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
"""Local semantic retrieval support for source ranges."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import sqlite3
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from datetime import UTC, datetime
|
|
9
|
+
from functools import lru_cache
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
import numpy.typing as npt
|
|
15
|
+
import orjson
|
|
16
|
+
|
|
17
|
+
from kc.errors import KcError
|
|
18
|
+
from kc.models.source_range import SourceRangeRecord
|
|
19
|
+
from kc.store.sqlite import init_db
|
|
20
|
+
|
|
21
|
+
# Library that provides the embedding model.
MODEL_PROVIDER = "model2vec"
# Directory name of the model shipped under embedding_models/.
BUNDLED_MODEL_NAME = "potion-base-8M"
# Embedding width the bundled model must produce.
EXPECTED_DIMENSION = 256
# Expected checksum of the bundled model directory.
EXPECTED_CHECKSUM = "sha256:1630533e875a4725d11a703f48cb9598e59f3cfe3fa0d5a19a54045d52922848"
# Key of the semantic-model row in the index_metadata table.
SEMANTIC_METADATA_KEY = "semantic_model"
# File suffixes whose line endings are normalized before hashing.
TEXT_MODEL_SUFFIXES = {".json", ".md"}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True)
class SemanticRank:
    """Immutable record of where one source range landed in the semantic ranking."""

    # Identifier of the ranked source range.
    range_id: str
    # Position in the semantic result list.
    rank: int
    # Similarity score attached to this range.
    score: float
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def bundled_model_dir() -> Path:
    """Return the path of the embedding model directory bundled with the package.

    The path is ``embedding_models/<BUNDLED_MODEL_NAME>`` located two levels
    above this file's resolved location.
    """
    package_root = Path(__file__).resolve().parents[1]
    return package_root.joinpath("embedding_models", BUNDLED_MODEL_NAME)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def model_directory_checksum(model_dir: Path) -> str:
    """Compute a SHA-256 checksum over every file below ``model_dir``.

    Files are visited in order of their POSIX-style relative path. For each
    file the digest absorbs the UTF-8 relative path, a NUL byte, the bytes
    returned by ``_checksum_file_bytes`` and another NUL byte.

    Returns:
        The digest formatted as ``"sha256:<hex>"``.
    """
    base = Path(model_dir)
    hasher = hashlib.sha256()
    entries = [
        (candidate.relative_to(base).as_posix(), candidate)
        for candidate in base.rglob("*")
        if candidate.is_file()
    ]
    entries.sort(key=lambda pair: pair[0])
    for relative_name, file_path in entries:
        hasher.update(relative_name.encode("utf-8"))
        hasher.update(b"\0")
        hasher.update(_checksum_file_bytes(file_path))
        hasher.update(b"\0")
    return f"sha256:{hasher.hexdigest()}"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _checksum_file_bytes(path: Path) -> bytes:
    """Return the bytes of ``path`` that feed the model checksum.

    Files whose lower-cased suffix is in ``TEXT_MODEL_SUFFIXES`` have their
    line endings normalized to LF; every other file is returned unchanged.
    """
    raw = path.read_bytes()
    if path.suffix.lower() not in TEXT_MODEL_SUFFIXES:
        return raw
    # Collapse CRLF first so the remaining lone CRs can be mapped to LF.
    return raw.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _model_unavailable(message: str, **details: Any) -> KcError:
    """Create (without raising) a ``KC_RETRIEVAL_MODEL_UNAVAILABLE`` error.

    Args:
        message: Human-readable description of the problem.
        **details: Extra context stored as the error's ``details``.
    """
    error = KcError(
        code="KC_RETRIEVAL_MODEL_UNAVAILABLE",
        message=message,
        details=details,
        suggested_action="run kc index build after fixing the local model",
    )
    return error
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def semantic_model_metadata(model: Any | None = None) -> dict[str, Any]:
    """Describe the bundled semantic model as a metadata dictionary.

    Args:
        model: Optional loaded model. Its ``dim`` attribute supplies the
            dimension; ``EXPECTED_DIMENSION`` is used when the attribute is
            missing or falsy.

    Returns:
        Dictionary with ``provider``, ``model``, ``dimension``, ``checksum``
        (computed from the bundled model directory) and ``purpose``.
    """
    reported = getattr(model, "dim", EXPECTED_DIMENSION)
    dimension = int(reported or EXPECTED_DIMENSION)
    checksum = model_directory_checksum(bundled_model_dir())
    return {
        "provider": MODEL_PROVIDER,
        "model": BUNDLED_MODEL_NAME,
        "dimension": dimension,
        "checksum": checksum,
        "purpose": "ranking_only",
    }
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@lru_cache(maxsize=1)
def load_semantic_model() -> Any:
    """Load the bundled model2vec model after verifying it.

    The model directory must exist and its checksum must equal
    ``EXPECTED_CHECKSUM``. When the loaded model reports a non-zero ``dim``,
    that value must equal ``EXPECTED_DIMENSION``. The returned model is
    memoized by ``lru_cache``.

    Raises:
        KcError: ``KC_RETRIEVAL_MODEL_UNAVAILABLE`` when the directory is
            missing, the checksum or dimension differs, ``model2vec`` cannot
            be imported, or loading fails.
    """
    location = bundled_model_dir()
    if not location.exists():
        raise _model_unavailable("Bundled semantic model directory is missing.", model_dir=str(location))

    actual_checksum = model_directory_checksum(location)
    if actual_checksum != EXPECTED_CHECKSUM:
        raise _model_unavailable(
            "Bundled semantic model checksum does not match the configured checksum.",
            model_dir=str(location),
            expected_checksum=EXPECTED_CHECKSUM,
            actual_checksum=actual_checksum,
        )

    # Imported here so a missing model2vec surfaces as a KcError, not an ImportError.
    try:
        from model2vec import StaticModel
    except ImportError as exc:
        raise _model_unavailable("model2vec is not installed.", dependency="model2vec") from exc

    try:
        loaded = StaticModel.from_pretrained(str(location))
    except Exception as exc:
        raise _model_unavailable("Failed to load bundled semantic model.", model_dir=str(location)) from exc

    # A missing or zero ``dim`` skips the dimension check.
    reported_dim = int(getattr(loaded, "dim", 0) or 0)
    if reported_dim not in (0, EXPECTED_DIMENSION):
        raise _model_unavailable(
            "Bundled semantic model dimension does not match the configured dimension.",
            expected_dimension=EXPECTED_DIMENSION,
            actual_dimension=reported_dim,
        )
    return loaded
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def is_model_available() -> tuple[bool, dict[str, Any] | None, str | None]:
    """Report whether the semantic model can be loaded.

    Returns:
        ``(True, metadata, None)`` when the model loads, otherwise
        ``(False, None, message)`` with the message of the ``KcError`` that
        was raised.
    """
    try:
        model = load_semantic_model()
        metadata = semantic_model_metadata(model)
    except KcError as exc:
        return False, None, exc.message
    return True, metadata, None
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _encode(model: Any, texts: str | list[str]) -> npt.NDArray[np.float32]:
|
|
120
|
+
try:
|
|
121
|
+
encoded = model.encode(texts, show_progress_bar=False, use_multiprocessing=False)
|
|
122
|
+
except TypeError:
|
|
123
|
+
encoded = model.encode(texts)
|
|
124
|
+
return np.asarray(encoded, dtype=np.float32)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def embed_text(model: Any, text: str) -> npt.NDArray[np.float32]:
    """Embed a single string and return the result as a flat float32 vector."""
    encoded = _encode(model, text)
    flat = encoded.reshape(-1)
    return flat.astype(np.float32)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def embed_texts(model: Any, texts: list[str]) -> npt.NDArray[np.float32]:
    """Embed a list of strings into a float32 matrix with one row per text.

    An empty list returns an empty ``(0, EXPECTED_DIMENSION)`` array without
    calling the model.
    """
    if len(texts) == 0:
        return np.empty((0, EXPECTED_DIMENSION), dtype=np.float32)

    matrix = _encode(model, texts)
    # A 1-D result is treated as a single embedding row.
    if matrix.ndim == 1:
        matrix = matrix.reshape(1, -1)
    return matrix.astype(np.float32)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def embedding_to_blob(embedding: npt.NDArray[np.float32]) -> bytes:
    """Serialize an embedding to the raw bytes of its flattened float32 values."""
    flat = np.asarray(embedding, dtype=np.float32).reshape(-1)
    return flat.tobytes()
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def blob_to_embedding(blob: bytes) -> npt.NDArray[np.float32]:
    """Interpret ``blob`` as a 1-D array of float32 values."""
    vector = np.frombuffer(blob, dtype=np.float32)
    return vector
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def cosine_similarity(a: npt.NDArray[np.float32], b: npt.NDArray[np.float32]) -> float:
    """Return the cosine similarity of two vectors.

    Both inputs are flattened and converted to float32 first. The result is
    ``0.0`` when the product of the two norms is zero.
    """
    u = np.asarray(a, dtype=np.float32).reshape(-1)
    v = np.asarray(b, dtype=np.float32).reshape(-1)
    scale = np.linalg.norm(u) * np.linalg.norm(v)
    # A zero-length vector has no direction; avoid dividing by zero.
    if scale == 0:
        return 0.0
    similarity = np.dot(u, v) / scale
    return float(similarity)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _now() -> str:
|
|
158
|
+
return datetime.now(UTC).isoformat()
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _connect(db_path: Path) -> sqlite3.Connection:
|
|
162
|
+
conn = sqlite3.connect(str(db_path))
|
|
163
|
+
conn.row_factory = sqlite3.Row
|
|
164
|
+
return conn
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _table_exists(conn: sqlite3.Connection, table: str) -> bool:
|
|
168
|
+
row = conn.execute(
|
|
169
|
+
"SELECT name FROM sqlite_master WHERE type IN ('table', 'view') AND name = ?",
|
|
170
|
+
(table,),
|
|
171
|
+
).fetchone()
|
|
172
|
+
return row is not None
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _metadata_from_db(conn: sqlite3.Connection) -> dict[str, Any] | None:
    """Read the stored semantic-model metadata from ``index_metadata``.

    Returns:
        The decoded JSON value stored under ``SEMANTIC_METADATA_KEY``, or
        ``None`` when no such row exists.
    """
    stored = conn.execute(
        "SELECT value_json FROM index_metadata WHERE key = ?",
        (SEMANTIC_METADATA_KEY,),
    ).fetchone()
    return None if stored is None else orjson.loads(stored["value_json"])
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def build_semantic_index(db_path: Path, ranges: list[SourceRangeRecord]) -> dict[str, Any]:
    """Embed every range excerpt and replace the stored semantic index.

    All existing rows in ``source_range_embeddings`` are deleted, one row per
    range is inserted, and the model metadata (plus build time and range
    count) is written to ``index_metadata``; the changes are committed
    together.

    Returns:
        ``{"enabled": True, "model": <stored metadata>, "embeddings": <count>}``.

    Raises:
        KcError: ``KC_RETRIEVAL_MODEL_UNAVAILABLE`` when the model cannot be
            loaded or returns an unexpected number or width of embeddings.
    """
    init_db(db_path)
    model = load_semantic_model()
    metadata = semantic_model_metadata(model)
    expected_dim = int(metadata["dimension"])

    vectors = embed_texts(model, [item.excerpt for item in ranges])
    if len(vectors) != len(ranges):
        raise _model_unavailable(
            "Semantic model returned an unexpected number of embeddings.",
            expected=len(ranges),
            actual=len(vectors),
        )

    # With no ranges there is no embedding width to check.
    if vectors.ndim == 2 and len(ranges) > 0:
        actual_dim = int(vectors.shape[1])
    else:
        actual_dim = expected_dim
    if actual_dim != expected_dim:
        raise _model_unavailable(
            "Semantic model returned embeddings with an unexpected dimension.",
            expected_dimension=metadata["dimension"],
            actual_dimension=actual_dim,
        )

    conn = _connect(db_path)
    try:
        timestamp = _now()
        embedding_rows = [
            (
                source_range.range_id,
                source_range.source_id,
                source_range.source_fingerprint,
                source_range.text_hash,
                metadata["model"],
                metadata["checksum"],
                metadata["dimension"],
                embedding_to_blob(vector),
                timestamp,
            )
            for source_range, vector in zip(ranges, vectors)
        ]
        conn.execute("DELETE FROM source_range_embeddings")
        conn.executemany(
            """
            INSERT INTO source_range_embeddings(
                range_id, source_id, source_fingerprint, text_hash, model_name,
                model_checksum, dimension, embedding, updated_at
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            embedding_rows,
        )
        stored_metadata = {**metadata, "built_at": timestamp, "ranges": len(ranges)}
        conn.execute(
            "INSERT OR REPLACE INTO index_metadata(key, value_json, updated_at) VALUES (?, ?, ?)",
            (SEMANTIC_METADATA_KEY, orjson.dumps(stored_metadata).decode(), timestamp),
        )
        conn.commit()
        return {"enabled": True, "model": stored_metadata, "embeddings": len(ranges)}
    finally:
        conn.close()
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def semantic_index_status(db_path: Path, ranges: list[SourceRangeRecord] | None = None) -> dict[str, Any]:
    """Report the state of the semantic model and of the stored vectors.

    Args:
        db_path: Path of the SQLite index database.
        ranges: When given, each range is compared with its stored vector to
            count missing and stale entries.

    Returns:
        Dictionary with ``model_available``, ``model``,
        ``unavailable_reason``, ``index_metadata``, ``metadata_match``,
        ``vector_count``, ``missing_vectors`` and ``stale_vectors``. The last
        two stay ``None`` unless ``ranges`` is given. When the database file
        or the ``source_range_embeddings`` table does not exist, the
        index-related fields keep their initial values.
    """
    available, model_metadata, unavailable_reason = is_model_available()
    status: dict[str, Any] = {
        "model_available": available,
        "model": model_metadata,
        "unavailable_reason": unavailable_reason,
        "index_metadata": None,
        "metadata_match": False,
        "vector_count": 0,
        "missing_vectors": None,
        "stale_vectors": None,
    }
    if not db_path.exists():
        return status

    conn = _connect(db_path)
    try:
        if not _table_exists(conn, "source_range_embeddings"):
            return status

        count_row = conn.execute("SELECT COUNT(*) FROM source_range_embeddings").fetchone()
        status["vector_count"] = int(count_row[0])

        index_metadata = _metadata_from_db(conn)
        status["index_metadata"] = index_metadata
        if model_metadata and index_metadata:
            compared_keys = ("provider", "model", "dimension", "checksum", "purpose")
            status["metadata_match"] = all(
                index_metadata.get(key) == model_metadata.get(key) for key in compared_keys
            )

        if ranges is None:
            return status

        stored = {
            row["range_id"]: row
            for row in conn.execute(
                """
                SELECT range_id, source_fingerprint, text_hash, model_checksum, dimension
                FROM source_range_embeddings
                """
            ).fetchall()
        }
        expected_checksum = model_metadata.get("checksum") if model_metadata else None
        raw_dimension = model_metadata.get("dimension") if model_metadata else None
        expected_dimension = int(raw_dimension) if raw_dimension is not None else None

        missing = 0
        stale = 0
        for source_range in ranges:
            row = stored.get(source_range.range_id)
            if row is None:
                missing += 1
                continue
            # A vector is fresh only when its content, model and width all still match.
            fresh = (
                row["source_fingerprint"] == source_range.source_fingerprint
                and row["text_hash"] == source_range.text_hash
                and row["model_checksum"] == expected_checksum
                and expected_dimension is not None
                and int(row["dimension"]) == expected_dimension
            )
            if not fresh:
                stale += 1
        status["missing_vectors"] = missing
        status["stale_vectors"] = stale
        return status
    finally:
        conn.close()
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def assert_semantic_index_ready(
    db_path: Path,
    ranges: list[SourceRangeRecord],
) -> Any:
    """Return the loaded semantic model if the stored index is usable.

    Raises:
        KcError: ``KC_RETRIEVAL_MODEL_UNAVAILABLE`` when the model cannot be
            loaded, the index metadata is missing or mismatched, or any
            vector for ``ranges`` is missing or stale.
    """
    model = load_semantic_model()
    status = semantic_index_status(db_path, ranges)

    metadata_ok = status["index_metadata"] and status["metadata_match"]
    if not metadata_ok:
        raise _model_unavailable(
            "Semantic index metadata is missing or stale. Run kc index build.",
            status=status,
        )

    missing = status["missing_vectors"]
    stale = status["stale_vectors"]
    if missing or stale:
        raise _model_unavailable(
            "Semantic index vectors are missing or stale. Run kc index build.",
            missing_vectors=missing,
            stale_vectors=stale,
        )
    return model
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def semantic_rankings(
    conn: sqlite3.Connection,
    query: str,
    model: Any,
    *,
    domain: str | None = None,
    limit: int = 100,
) -> list[SemanticRank]:
    """Rank stored range embeddings by cosine similarity to ``query``.

    Args:
        conn: Open connection to the index database.
        query: Text embedded with ``model`` and compared with every vector.
        model: Loaded embedding model.
        domain: Optional substring that the source's ``domain_json`` must contain.
        limit: Maximum number of entries to return.

    Returns:
        Entries ordered by descending similarity (ties broken by range id),
        numbered from 1.
    """
    query_vector = embed_text(model, query)

    statement = """
        SELECT e.range_id, e.embedding
        FROM source_range_embeddings e
        JOIN sources s ON s.source_id = e.source_id
        """
    bindings: list[Any] = []
    if domain:
        statement += " WHERE s.domain_json LIKE ?"
        bindings.append(f"%{domain}%")

    candidates: list[tuple[str, float]] = [
        (
            row["range_id"],
            cosine_similarity(query_vector, blob_to_embedding(row["embedding"])),
        )
        for row in conn.execute(statement, bindings).fetchall()
    ]
    ordered = sorted(candidates, key=lambda pair: (-pair[1], pair[0]))
    return [
        SemanticRank(range_id=range_id, rank=position, score=score)
        for position, (range_id, score) in enumerate(ordered[:limit], start=1)
    ]
|
kc/store/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Persistence helpers for kc."""
|
kc/store/jsonl.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""JSONL canonical store helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Iterable, Sequence
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import orjson
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
|
|
11
|
+
from kc.atomic_write import atomic_write_bytes
|
|
12
|
+
from kc.errors import KcError
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def read_jsonl[T: BaseModel](path: Path, model: type[T]) -> list[T]:
|
|
16
|
+
if not path.exists():
|
|
17
|
+
return []
|
|
18
|
+
records: list[T] = []
|
|
19
|
+
for line_no, line in enumerate(path.read_text(encoding="utf-8").splitlines(), start=1):
|
|
20
|
+
if not line.strip():
|
|
21
|
+
continue
|
|
22
|
+
try:
|
|
23
|
+
records.append(model.model_validate(orjson.loads(line)))
|
|
24
|
+
except Exception as exc:
|
|
25
|
+
raise KcError(
|
|
26
|
+
code="KC_CONFIG_INVALID",
|
|
27
|
+
message=f"Invalid JSONL record in {path} at line {line_no}: {exc}",
|
|
28
|
+
details={"path": str(path), "line": line_no},
|
|
29
|
+
) from exc
|
|
30
|
+
return records
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def write_jsonl(path: Path, records: Sequence[BaseModel]) -> None:
    """Replace the contents of ``path`` with ``records``, one JSON object per line.

    The parent directory is created when needed, and the serialized bytes are
    handed to ``atomic_write_bytes`` in a single call.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = b"".join(
        orjson.dumps(record.model_dump(mode="json"), option=orjson.OPT_APPEND_NEWLINE)
        for record in records
    )
    atomic_write_bytes(path, payload)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def append_jsonl(path: Path, records: Iterable[BaseModel]) -> None:
    """Add ``records`` to the end of the JSONL file at ``path``.

    The existing non-blank lines are decoded and wrapped in ``_RawModel``,
    then the whole file is rewritten through ``write_jsonl`` with the new
    records after the existing ones.
    """
    carried: list[BaseModel] = []
    if path.exists():
        decoded = []
        for raw in path.read_bytes().splitlines():
            if raw.strip():
                decoded.append(orjson.loads(raw))
        # Wrap only after every line has decoded successfully.
        carried = [_RawModel(value=item) for item in decoded]
    write_jsonl(path, [*carried, *records])
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class _RawModel(BaseModel):
    """Wrapper around an already-decoded JSON object.

    ``model_dump`` hands back the wrapped dictionary itself, so serializing
    an instance emits the original object.
    """

    # The decoded JSON object being carried.
    value: dict

    def model_dump(self, *args, **kwargs):  # type: ignore[no-untyped-def]
        """Return the wrapped dictionary, ignoring every argument."""
        return self.value
|