@heytherevibin/skillforge 0.2.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +44 -53
- package/RELEASING.md +1 -1
- package/SECURITY.md +2 -2
- package/STRATEGY.md +1 -3
- package/bin/cli.js +32 -138
- package/package.json +2 -2
- package/python/app/chunking.py +116 -0
- package/python/app/context_fusion.py +77 -0
- package/python/app/events_cli.py +1 -1
- package/python/app/index_cli.py +89 -0
- package/python/app/main.py +380 -214
- package/python/app/mcp_contract.py +121 -0
- package/python/app/mcp_server.py +80 -28
- package/python/app/project_index.py +600 -0
- package/python/app/redaction.py +128 -0
- package/python/app/route_cli.py +42 -19
- package/python/requirements.txt +0 -4
- package/python/tests/test_chunking.py +34 -0
- package/python/tests/test_context_fusion.py +45 -0
- package/python/tests/test_mcp_contract.py +137 -0
- package/python/tests/test_project_index.py +76 -0
- package/python/tests/test_redaction.py +51 -0
- package/python/app/auth.py +0 -63
- package/python/app/cli.py +0 -78
|
@@ -0,0 +1,600 @@
|
|
|
1
|
+
"""Project-local RAG: walk a repo, chunk text files, store embeddings in per-project SQLite."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import sqlite3
|
|
7
|
+
import time
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Callable, Iterator
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
|
|
13
|
+
from app.chunking import SkillChunk, chunk_max_chars, chunk_overlap_chars, chunk_raw_document
|
|
14
|
+
|
|
15
|
+
# Basenames to skip entirely (noise / vendor / artifacts).
# Directory *basenames* pruned during the walk (VCS dirs, package caches,
# build output, tool caches, and SkillForge's own state dir).
DEFAULT_IGNORE_DIR_NAMES: frozenset[str] = frozenset(
    {
        ".git",
        ".hg",
        ".svn",
        "node_modules",
        "__pycache__",
        ".venv",
        "venv",
        ".tox",
        "dist",
        "build",
        ".next",
        ".nuxt",
        "target",
        "coverage",
        ".pytest_cache",
        ".mypy_cache",
        ".ruff_cache",
        ".terraform",
        ".parcel-cache",
        ".cache",
        ".skillforge",
    }
)

# Suffixes we never try to read as UTF-8 text.
# Media, archives, fonts, databases, and compiled artifacts; also ".lock"
# (lockfiles with a .json suffix, e.g. package-lock.json, are NOT excluded here).
SKIP_EXTENSIONS: frozenset[str] = frozenset(
    {
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".webp",
        ".ico",
        ".pdf",
        ".zip",
        ".tar",
        ".gz",
        ".tgz",
        ".bz2",
        ".xz",
        ".7z",
        ".rar",
        ".mp3",
        ".mp4",
        ".mov",
        ".wav",
        ".woff",
        ".woff2",
        ".ttf",
        ".eot",
        ".db",
        ".sqlite",
        ".sqlite3",
        ".bin",
        ".exe",
        ".dll",
        ".so",
        ".dylib",
        ".o",
        ".a",
        ".class",
        ".jar",
        ".pyc",
        ".pyo",
        ".pyd",
        ".lock",
    }
)

# Suffixes accepted as indexable text. Compared against path.suffix.lower()
# in is_indexable_file, so entries are expected to be lowercase.
TEXT_EXTENSIONS: frozenset[str] = frozenset(
    {
        ".md",
        ".mdx",
        ".txt",
        ".rst",
        ".py",
        ".pyi",
        ".js",
        ".jsx",
        ".mjs",
        ".cjs",
        ".ts",
        ".tsx",
        ".json",
        ".jsonc",
        ".yaml",
        ".yml",
        ".toml",
        ".rs",
        ".go",
        ".java",
        ".kt",
        ".kts",
        ".rb",
        ".php",
        ".cs",
        ".swift",
        ".m",
        ".mm",
        ".h",
        ".hpp",
        ".c",
        ".cc",
        ".cpp",
        ".cxx",
        ".scss",
        ".sass",
        ".css",
        ".less",
        ".html",
        ".htm",
        ".vue",
        ".svelte",
        ".sql",
        ".graphql",
        ".sh",
        ".bash",
        ".zsh",
        ".fish",
        ".ps1",
        ".env",
        ".ini",
        ".cfg",
        ".conf",
        ".properties",
        ".xml",
        ".gradle",
        ".cmake",
        ".clj",
        ".cljs",
        ".ex",
        ".exs",
        ".erl",
        ".hrl",
        ".lua",
        ".nim",
        ".dart",
        ".scala",
        ".sol",
        ".r",
        # NOTE(review): ".R" is unreachable — lookups use path.suffix.lower(),
        # so ".r" above already covers it. Harmless, but dead data.
        ".R",
        ".jl",
        ".pl",
        ".pm",
        ".proto",
        ".tex",
        ".liquid",
    }
)

# Extension-less doc/build basenames (matched against path.stem.lower(),
# optionally with a .md/.txt suffix) that are indexable despite having no
# recognized text extension.
SPECIAL_FILENAMES: frozenset[str] = frozenset(
    {
        "dockerfile",
        "makefile",
        "gemfile",
        "rakefile",
        "procfile",
        "jenkinsfile",
        "licence",
        "license",
        "readme",
        "changelog",
        "contributing",
        "code_of_conduct",
    }
)

# Approximate cap for rows loaded into memory during retrieval.
# NOTE(review): read once at import time — changing
# SKILLFORGE_PROJECT_RAG_MAX_CHUNKS after import has no effect (unlike the
# other env knobs in this module, which are re-read per call).
PROJECT_RAG_MAX_ROWS_DEFAULT = int(os.getenv("SKILLFORGE_PROJECT_RAG_MAX_CHUNKS", "20000"))
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def index_max_file_bytes() -> int:
    """Largest file size, in bytes, that will be read for indexing.

    Controlled by ``SKILLFORGE_INDEX_MAX_FILE_BYTES`` (default 512 KiB) and
    floored at 4096 so the limit can never be made uselessly small.
    """
    configured = int(os.getenv("SKILLFORGE_INDEX_MAX_FILE_BYTES", "524288"))
    return configured if configured >= 4096 else 4096
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def project_rag_max_chars() -> int:
    """Character budget for project-RAG context retrieval.

    Controlled by ``SKILLFORGE_PROJECT_RAG_MAX_CHARS`` (default 24000);
    negative values are clamped to 0, which disables retrieval.
    """
    budget = int(os.getenv("SKILLFORGE_PROJECT_RAG_MAX_CHARS", "24000"))
    return budget if budget > 0 else 0
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def extra_ignore_dir_names() -> frozenset[str]:
    """User-supplied directory basenames to skip, from ``SKILLFORGE_INDEX_IGNORE_DIRS``.

    The variable holds a comma- or semicolon-separated list; surrounding
    whitespace and empty entries are dropped.
    """
    configured = os.getenv("SKILLFORGE_INDEX_IGNORE_DIRS", "").strip()
    if not configured:
        return frozenset()
    candidates = (part.strip() for part in configured.replace(";", ",").split(","))
    return frozenset(name for name in candidates if name)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def should_skip_dir(name: str) -> bool:
    """Return True when a directory basename should be pruned from the walk.

    Combines the built-in ignore list, the env-configured extras, and the
    macOS ``.DS_Store`` metadata entry.
    """
    if name == ".DS_Store":
        return True
    return name in DEFAULT_IGNORE_DIR_NAMES or name in extra_ignore_dir_names()
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def is_indexable_file(path: Path) -> bool:
    """Decide whether *path* looks like a text file worth indexing.

    Checks in order: known-binary suffixes are rejected, known text suffixes
    are accepted, and finally special doc/build basenames (README, Dockerfile,
    LICENSE, ...) are accepted when bare or carrying a .md/.txt suffix.

    Fixes over the original: removes the unused ``name_l`` local, and drops
    the trailing exact-basename check ("dockerfile", "makefile", ...) which
    was unreachable — any such name has an empty suffix and a stem already in
    SPECIAL_FILENAMES, so the stem branch returns True first.
    """
    ext = path.suffix.lower()
    if ext in SKIP_EXTENSIONS:
        return False
    if ext in TEXT_EXTENSIONS:
        return True
    return path.stem.lower() in SPECIAL_FILENAMES and ext in ("", ".md", ".txt")
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def iter_project_files(root: Path) -> Iterator[Path]:
    """Yield indexable files under *root* in a deterministic (sorted) order.

    Ignored directories are pruned in place via ``os.walk``'s topdown
    contract; symlinks are never followed. Files that vanish or error during
    the walk are silently skipped.
    """
    top = root.resolve()
    for current_dir, subdirs, names in os.walk(top, topdown=True, followlinks=False):
        # Assigning to the slice (not rebinding) is what makes os.walk prune;
        # sorting keeps traversal order stable across platforms.
        subdirs[:] = sorted(name for name in subdirs if not should_skip_dir(name))
        base = Path(current_dir)
        for filename in sorted(names):
            candidate = base / filename
            try:
                regular = candidate.is_file()
            except OSError:
                # Stat raced with deletion or permissions — skip quietly.
                continue
            if regular and is_indexable_file(candidate):
                yield candidate
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def ensure_project_index_schema(con: sqlite3.Connection) -> None:
    """Create the per-project index tables if missing (idempotent), then commit.

    ``project_index_meta`` is a key/value store for index metadata;
    ``project_chunks`` holds one embedded text chunk per row, indexed by path
    so per-file deletes are cheap.
    """
    ddl = """
    CREATE TABLE IF NOT EXISTS project_index_meta (
        key TEXT PRIMARY KEY,
        value TEXT NOT NULL
    );
    CREATE TABLE IF NOT EXISTS project_chunks (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        path TEXT NOT NULL,
        line_start INTEGER NOT NULL,
        line_end INTEGER NOT NULL,
        mtime REAL NOT NULL,
        file_size INTEGER NOT NULL,
        content TEXT NOT NULL,
        embedding BLOB NOT NULL
    );
    CREATE INDEX IF NOT EXISTS idx_project_chunks_path ON project_chunks(path);
    """
    con.executescript(ddl)
    con.commit()
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _meta_get(con: sqlite3.Connection, key: str) -> str | None:
|
|
272
|
+
cur = con.execute("SELECT value FROM project_index_meta WHERE key = ?", (key,))
|
|
273
|
+
row = cur.fetchone()
|
|
274
|
+
return str(row[0]) if row else None
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def _meta_set(con: sqlite3.Connection, key: str, value: str) -> None:
|
|
278
|
+
con.execute(
|
|
279
|
+
"INSERT INTO project_index_meta (key, value) VALUES (?, ?) "
|
|
280
|
+
"ON CONFLICT(key) DO UPDATE SET value = excluded.value",
|
|
281
|
+
(key, value),
|
|
282
|
+
)
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def _delete_chunks_for_path(con: sqlite3.Connection, relpath: str) -> None:
|
|
286
|
+
con.execute("DELETE FROM project_chunks WHERE path = ?", (relpath,))
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _blob_from_vec(vec: np.ndarray) -> bytes:
|
|
290
|
+
v = np.asarray(vec, dtype=np.float32).reshape(-1)
|
|
291
|
+
return v.tobytes()
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def _vec_from_blob(blob: bytes, dim: int) -> np.ndarray:
|
|
295
|
+
arr = np.frombuffer(blob, dtype=np.float32)
|
|
296
|
+
if arr.size != dim:
|
|
297
|
+
raise ValueError(f"embedding size mismatch: got {arr.size}, expected {dim}")
|
|
298
|
+
return arr
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def index_project(
    con: sqlite3.Connection,
    project_root: str | Path,
    embed_model: Any,
    *,
    reset: bool = False,
    now: Callable[[], float] | None = None,
) -> dict[str, Any]:
    """Chunk text files under ``project_root`` and store rows in ``project_chunks``.

    Uses the same chunking window env vars as skills (``SKILLFORGE_CHUNK_*``).

    Walks the tree via ``iter_project_files``, re-embeds every surviving file
    (one batched ``encode`` call per file), replaces that file's rows, and
    finally records run metadata in ``project_index_meta``.

    Args:
        con: open SQLite connection; schema is created if missing.
        project_root: directory to index; FileNotFoundError if not a directory.
        embed_model: sentence-embedding model exposing ``encode`` and
            ``get_sentence_embedding_dimension`` — assumed SentenceTransformer-
            compatible (TODO confirm against callers).
        reset: when True, wipe all existing chunk rows first.
        now: optional clock override — affects only the ``last_index_ts``
            metadata value, not ``elapsed_sec``.

    Returns:
        Summary dict (root, files_indexed, chunks_written,
        files_skipped_oversize, elapsed_sec, errors). The persisted
        ``last_index_stats`` metadata keeps only the first 50 errors; the
        return value keeps them all.
    """
    ensure_project_index_schema(con)
    t0 = time.time()
    root = Path(project_root).expanduser().resolve()
    if not root.is_dir():
        raise FileNotFoundError(f"project_root is not a directory: {root}")

    embed_model_name = os.getenv("SKILLFORGE_EMBED_MODEL", "all-MiniLM-L6-v2")
    edim = int(embed_model.get_sentence_embedding_dimension())
    mc = chunk_max_chars()
    oc = chunk_overlap_chars()
    max_bytes = index_max_file_bytes()

    if reset:
        con.execute("DELETE FROM project_chunks")
        con.commit()

    files_seen = 0
    chunks_written = 0
    files_skipped_size = 0
    errors: list[str] = []

    for abs_path in iter_project_files(root):
        try:
            rel = abs_path.relative_to(root).as_posix()
        except ValueError:
            # Shouldn't happen (walk starts at root), but be defensive.
            continue
        try:
            st = abs_path.stat()
        except OSError as e:
            errors.append(f"{rel}: stat {e}")
            continue
        if st.st_size > max_bytes:
            # Oversize files are counted but never read.
            files_skipped_size += 1
            continue
        try:
            text = abs_path.read_text(encoding="utf-8", errors="replace")
        except OSError as e:
            errors.append(f"{rel}: read {e}")
            continue

        chunks: list[SkillChunk] = chunk_raw_document(text, max_chars=mc, overlap=oc)
        if not chunks:
            # File became empty: drop any stale rows for it.
            _delete_chunks_for_path(con, rel)
            continue

        files_seen += 1
        # Replace-by-delete: stale rows go first, fresh rows are inserted below.
        _delete_chunks_for_path(con, rel)
        flat_texts: list[str] = []
        rows: list[tuple[Any, ...]] = []
        mtime = float(st.st_mtime)
        fsize = int(st.st_size)
        for ch in chunks:
            # Prefix the relative path so the embedding carries file identity.
            embed_in = f"{rel}\n{ch.text}"
            flat_texts.append(embed_in)
        try:
            emb = embed_model.encode(flat_texts, show_progress_bar=False, convert_to_numpy=True)
        except Exception as e:
            errors.append(f"{rel}: embed {e}")
            # NOTE(review): nothing commits until after the loop, so this
            # rollback also discards the deletes/inserts of every EARLIER file
            # in this run, not just the failed one — confirm this is intended
            # (a per-file commit or SAVEPOINT would isolate the failure).
            con.rollback()
            continue
        emb = np.asarray(emb, dtype=np.float32)
        if emb.ndim == 1:
            # A single chunk comes back as a 1-D vector; normalize to (1, dim).
            emb = emb.reshape(1, -1)
        # L2-normalize rows so retrieval's dot product is cosine similarity;
        # zero-norm rows are left unscaled to avoid division by zero.
        norms = np.linalg.norm(emb, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        emb = emb / norms
        if emb.shape[1] != edim:
            errors.append(f"{rel}: unexpected embed dim {emb.shape[1]}")
            # NOTE(review): same broad-rollback caveat as above.
            con.rollback()
            continue

        for ch, row_emb in zip(chunks, emb):
            rows.append(
                (rel, ch.line_start, ch.line_end, mtime, fsize, ch.text, _blob_from_vec(row_emb))
            )
        con.executemany(
            "INSERT INTO project_chunks (path, line_start, line_end, mtime, file_size, content, embedding) "
            "VALUES (?, ?, ?, ?, ?, ?, ?)",
            rows,
        )
        chunks_written += len(rows)

    con.commit()
    _meta_set(con, "embed_model", embed_model_name)
    _meta_set(con, "embedding_dim", str(edim))
    _meta_set(con, "last_index_ts", str(time.time() if now is None else now()))
    _meta_set(
        con,
        "last_index_stats",
        json.dumps({
            "root": str(root),
            "files_indexed": files_seen,
            "chunks_written": chunks_written,
            "files_skipped_oversize": files_skipped_size,
            "reset": reset,
            "elapsed_sec": round(time.time() - t0, 3),
            "errors": errors[:50],
            "chunk_max_chars": mc,
            "chunk_overlap": oc,
        }),
    )
    con.commit()

    return {
        "root": str(root),
        "files_indexed": files_seen,
        "chunks_written": chunks_written,
        "files_skipped_oversize": files_skipped_size,
        "elapsed_sec": round(time.time() - t0, 3),
        "errors": errors,
    }
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def retrieve_project_context_items(
    con: sqlite3.Connection,
    embed_model,
    prompt: str,
    max_total_chars: int | None = None,
    *,
    max_rows: int | None = None,
    overhead_per_chunk: int = 56,
) -> list[dict[str, Any]]:
    """Return ranked project file chunks (same shape as skill context items + ``path``).

    Loads up to ``max_rows`` chunk rows, scores them by cosine similarity
    against the prompt, then greedily fills a character budget in descending
    score order. Returns [] when the budget is zero, the prompt is blank, the
    store is empty, or the store was built with a different embedding
    model/dimension.
    """
    budget = project_rag_max_chars() if max_total_chars is None else max_total_chars
    if budget <= 0 or not prompt.strip():
        return []

    ensure_project_index_schema(con)
    # Refuse to score against a store built by a different model or dim.
    stored_model = _meta_get(con, "embed_model")
    expected_model = os.getenv("SKILLFORGE_EMBED_MODEL", "all-MiniLM-L6-v2")
    if stored_model and stored_model != expected_model:
        return []
    stored_dim = _meta_get(con, "embedding_dim")
    model_dim = int(embed_model.get_sentence_embedding_dimension())
    edim = int(stored_dim) if stored_dim else model_dim
    if stored_dim and int(stored_dim) != model_dim:
        return []

    limit = PROJECT_RAG_MAX_ROWS_DEFAULT if max_rows is None else max_rows
    fetched = con.execute(
        "SELECT path, line_start, line_end, content, embedding FROM project_chunks LIMIT ?",
        (limit,),
    ).fetchall()
    if not fetched:
        return []

    records: list[tuple[str, int, int, str]] = []
    vectors: list[np.ndarray] = []
    for path, line_start, line_end, content, blob in fetched:
        try:
            vec = _vec_from_blob(blob, edim)
        except ValueError:
            # Wrong-size embedding row (corrupt or stale): skip it.
            continue
        records.append((str(path), int(line_start), int(line_end), str(content)))
        vectors.append(vec)
    if not vectors:
        return []

    matrix = np.stack(vectors, axis=0)
    query = np.asarray(
        embed_model.encode(prompt, convert_to_numpy=True), dtype=np.float32
    ).reshape(-1)
    query = query / max(float(np.linalg.norm(query)), 1e-12)
    scores = (matrix @ query).flatten()

    # Greedy budget fill: skip (rather than stop at) chunks that would
    # overflow, so smaller lower-ranked chunks can still use leftover budget.
    selected: list[dict[str, Any]] = []
    used = 0
    for idx in np.argsort(-scores):
        j = int(idx)
        path, line_start, line_end, text = records[j]
        cost = len(text) + overhead_per_chunk
        if used + cost > budget:
            continue
        selected.append({
            "skill": None,
            "path": path,
            "line_start": line_start,
            "line_end": line_end,
            "text": text,
            "score": float(scores[j]),
        })
        used += cost
    return selected
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
def load_project_fusion_pool(
    con: sqlite3.Connection,
    embed_model,
    prompt: str,
    pool_limit: int,
    *,
    max_rows: int | None = None,
) -> tuple[list[dict[str, Any]], np.ndarray, np.ndarray]:
    """Top-``pool_limit`` project chunks by query similarity with embeddings (no char budget).

    Returns ``(items, embeddings, relevance_scores)``: item dicts tagged
    ``source="file"``, the matching embedding rows, and their cosine scores.
    All three are empty when the pool limit is non-positive, the prompt is
    blank, the store is empty, or the store's model/dimension disagrees with
    ``embed_model``.
    """
    model_dim = int(embed_model.get_sentence_embedding_dimension())
    if pool_limit <= 0 or not prompt.strip():
        return [], np.zeros((0, model_dim)), np.array([])

    ensure_project_index_schema(con)
    stored_model = _meta_get(con, "embed_model")
    expected_model = os.getenv("SKILLFORGE_EMBED_MODEL", "all-MiniLM-L6-v2")
    if stored_model and stored_model != expected_model:
        return [], np.zeros((0, model_dim)), np.array([])

    stored_dim = _meta_get(con, "embedding_dim")
    edim = int(stored_dim) if stored_dim else model_dim
    if stored_dim and int(stored_dim) != model_dim:
        return [], np.zeros((0, model_dim)), np.array([])

    limit = PROJECT_RAG_MAX_ROWS_DEFAULT if max_rows is None else max_rows
    fetched = con.execute(
        "SELECT path, line_start, line_end, content, embedding FROM project_chunks LIMIT ?",
        (limit,),
    ).fetchall()
    if not fetched:
        return [], np.zeros((0, edim)), np.array([])

    records: list[tuple[str, int, int, str]] = []
    vectors: list[np.ndarray] = []
    for path, line_start, line_end, content, blob in fetched:
        try:
            vec = _vec_from_blob(blob, edim)
        except ValueError:
            # Wrong-size embedding row (corrupt or stale): skip it.
            continue
        records.append((str(path), int(line_start), int(line_end), str(content)))
        vectors.append(vec)
    if not vectors:
        return [], np.zeros((0, edim)), np.array([])

    matrix = np.stack(vectors, axis=0)
    query = np.asarray(
        embed_model.encode(prompt, convert_to_numpy=True), dtype=np.float32
    ).reshape(-1)
    query = query / max(float(np.linalg.norm(query)), 1e-12)
    scores = (matrix @ query).flatten()
    top = np.argsort(-scores)[: min(int(pool_limit), scores.shape[0])]

    items: list[dict[str, Any]] = []
    kept_vecs: list[np.ndarray] = []
    kept_scores: list[float] = []
    for idx in top:
        j = int(idx)
        path, line_start, line_end, text = records[j]
        items.append({
            "skill": None,
            "path": path,
            "line_start": line_start,
            "line_end": line_end,
            "text": text,
            "score": float(scores[j]),
            "source": "file",
        })
        kept_vecs.append(matrix[j])
        kept_scores.append(float(scores[j]))
    if not kept_vecs:
        return [], np.zeros((0, edim)), np.array([])
    return items, np.stack(kept_vecs, axis=0), np.asarray(kept_scores, dtype=np.float32)
|
|
581
|
+
|
|
582
|
+
|
|
583
|
+
def project_index_stats(con: sqlite3.Connection) -> dict[str, Any]:
    """Summarize the project index: chunk row count plus stored run metadata.

    ``last_index`` is the decoded ``last_index_stats`` JSON when present and
    valid, the raw string when it fails to parse, or None when absent.
    """
    ensure_project_index_schema(con)
    row_count = int(con.execute("SELECT COUNT(*) FROM project_chunks").fetchone()[0])
    raw = _meta_get(con, "last_index_stats")
    last_index: Any = None
    if raw:
        try:
            last_index = json.loads(raw)
        except json.JSONDecodeError:
            # Not valid JSON: surface the raw string rather than dropping it.
            last_index = raw
    return {
        "chunk_rows": row_count,
        "embed_model": _meta_get(con, "embed_model"),
        "embedding_dim": _meta_get(con, "embedding_dim"),
        "last_index_ts": _meta_get(con, "last_index_ts"),
        "last_index": last_index,
    }
|