@heytherevibin/skillforge 0.2.1 → 0.7.0

This diff shows the changes between publicly released versions of the package, as published to a supported registry; it is provided for informational purposes only.
@@ -0,0 +1,600 @@
+ """Project-local RAG: walk a repo, chunk text files, store embeddings in per-project SQLite."""
+ from __future__ import annotations
+
+ import json
+ import os
+ import sqlite3
+ import time
+ from pathlib import Path
+ from typing import Any, Callable, Iterator
+
+ import numpy as np
+
+ from app.chunking import SkillChunk, chunk_max_chars, chunk_overlap_chars, chunk_raw_document
+
+ # Basenames to skip entirely (noise / vendor / artifacts).
+ DEFAULT_IGNORE_DIR_NAMES: frozenset[str] = frozenset({
+     ".git", ".hg", ".svn",
+     "node_modules", "__pycache__", ".venv", "venv", ".tox",
+     "dist", "build", ".next", ".nuxt", "target", "coverage",
+     ".pytest_cache", ".mypy_cache", ".ruff_cache",
+     ".terraform", ".parcel-cache", ".cache", ".skillforge",
+ })
+
+ # Suffixes we never try to read as UTF-8 text.
+ SKIP_EXTENSIONS: frozenset[str] = frozenset({
+     # images
+     ".png", ".jpg", ".jpeg", ".gif", ".webp", ".ico",
+     # documents / archives
+     ".pdf", ".zip", ".tar", ".gz", ".tgz", ".bz2", ".xz", ".7z", ".rar",
+     # audio / video
+     ".mp3", ".mp4", ".mov", ".wav",
+     # fonts
+     ".woff", ".woff2", ".ttf", ".eot",
+     # databases
+     ".db", ".sqlite", ".sqlite3",
+     # native / compiled artifacts
+     ".bin", ".exe", ".dll", ".so", ".dylib", ".o", ".a", ".class", ".jar",
+     ".pyc", ".pyo", ".pyd",
+     # lockfiles
+     ".lock",
+ })
+
+ TEXT_EXTENSIONS: frozenset[str] = frozenset({
+     # docs
+     ".md", ".mdx", ".txt", ".rst", ".tex",
+     # python / js / ts
+     ".py", ".pyi", ".js", ".jsx", ".mjs", ".cjs", ".ts", ".tsx",
+     # config / data
+     ".json", ".jsonc", ".yaml", ".yml", ".toml", ".env", ".ini", ".cfg",
+     ".conf", ".properties", ".xml",
+     # systems / jvm / mobile
+     ".rs", ".go", ".java", ".kt", ".kts", ".swift", ".m", ".mm",
+     ".h", ".hpp", ".c", ".cc", ".cpp", ".cxx",
+     # web
+     ".scss", ".sass", ".css", ".less", ".html", ".htm", ".vue", ".svelte",
+     ".liquid",
+     # query / schema
+     ".sql", ".graphql", ".proto",
+     # shells / build
+     ".sh", ".bash", ".zsh", ".fish", ".ps1", ".gradle", ".cmake",
+     # other languages (suffixes are compared lowercased, so ".r" also covers ".R")
+     ".rb", ".php", ".cs", ".clj", ".cljs", ".ex", ".exs", ".erl", ".hrl",
+     ".lua", ".nim", ".dart", ".scala", ".sol", ".r", ".jl", ".pl", ".pm",
+ })
+
+ SPECIAL_FILENAMES: frozenset[str] = frozenset({
+     "dockerfile", "makefile", "gemfile", "rakefile", "procfile", "jenkinsfile",
+     "licence", "license", "readme", "changelog", "contributing", "code_of_conduct",
+ })
+
+ # Approximate cap for rows loaded into memory during retrieval.
+ PROJECT_RAG_MAX_ROWS_DEFAULT = int(os.getenv("SKILLFORGE_PROJECT_RAG_MAX_CHUNKS", "20000"))
+
+
+ def index_max_file_bytes() -> int:
+     return max(4096, int(os.getenv("SKILLFORGE_INDEX_MAX_FILE_BYTES", "524288")))
+
+
+ def project_rag_max_chars() -> int:
+     return max(0, int(os.getenv("SKILLFORGE_PROJECT_RAG_MAX_CHARS", "24000")))
+
+
+ def extra_ignore_dir_names() -> frozenset[str]:
+     raw = os.getenv("SKILLFORGE_INDEX_IGNORE_DIRS", "").strip()
+     if not raw:
+         return frozenset()
+     # Accept either ";" or "," as separators.
+     parts = {p.strip() for p in raw.replace(";", ",").split(",") if p.strip()}
+     return frozenset(parts)
+
+
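+ # Editorial sketch, not part of the released file: how the env knobs above
+ # compose. The override values here are assumptions chosen for the demo.
+ def _example_env_overrides() -> None:
+     os.environ["SKILLFORGE_INDEX_MAX_FILE_BYTES"] = "1048576"  # raise the cap to 1 MiB
+     os.environ["SKILLFORGE_INDEX_IGNORE_DIRS"] = "docs;fixtures,tmp"  # ";" and "," both split
+     assert index_max_file_bytes() == 1048576
+     assert extra_ignore_dir_names() == frozenset({"docs", "fixtures", "tmp"})
+
+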
+ def should_skip_dir(name: str) -> bool:
+     if name in DEFAULT_IGNORE_DIR_NAMES or name in extra_ignore_dir_names():
+         return True
+     # Defensive: ".DS_Store" is macOS Finder metadata (normally a file, but
+     # cheap to exclude here as well).
+     if name == ".DS_Store":
+         return True
+     return False
+
+
+ def is_indexable_file(path: Path) -> bool:
+     ext = path.suffix.lower()
+     if ext in SKIP_EXTENSIONS:
+         return False
+     if ext in TEXT_EXTENSIONS:
+         return True
+     # Extensionless tool files (Dockerfile, Makefile, ...) and doc basenames
+     # (README.md, CHANGELOG.txt, ...) are matched by stem.
+     stem = path.stem.lower()
+     if stem in SPECIAL_FILENAMES and ext in ("", ".md", ".txt"):
+         return True
+     return False
+
+
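+ # Editorial sketch, not part of the released file: spot-checks for the filter.
+ def _example_filter_spot_checks() -> None:
+     assert is_indexable_file(Path("src/app.py"))           # known text suffix
+     assert is_indexable_file(Path("Dockerfile"))           # special extensionless name
+     assert not is_indexable_file(Path("logo.png"))         # binary suffix
+     assert not is_indexable_file(Path("data.unknownext"))  # unrecognized suffix
+
+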
+ def iter_project_files(root: Path) -> Iterator[Path]:
+     root = root.resolve()
+     for dirpath, dirnames, filenames in os.walk(root, topdown=True, followlinks=False):
+         dp = Path(dirpath)
+         # Prune ignored directories in place so os.walk never descends into them.
+         dirnames[:] = sorted(d for d in dirnames if not should_skip_dir(d))
+         for fn in sorted(filenames):
+             p = dp / fn
+             try:
+                 if not p.is_file():
+                     continue
+             except OSError:
+                 continue
+             if not is_indexable_file(p):
+                 continue
+             yield p
+
+
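+ # Editorial sketch, not part of the released file: list what the walker
+ # yields, relative to the (resolved) root.
+ def _example_walk(root: str = ".") -> list[str]:
+     base = Path(root).resolve()
+     return [p.relative_to(base).as_posix() for p in iter_project_files(base)]
+
+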
+ def ensure_project_index_schema(con: sqlite3.Connection) -> None:
+     con.executescript("""
+         CREATE TABLE IF NOT EXISTS project_index_meta (
+             key   TEXT PRIMARY KEY,
+             value TEXT NOT NULL
+         );
+         CREATE TABLE IF NOT EXISTS project_chunks (
+             id         INTEGER PRIMARY KEY AUTOINCREMENT,
+             path       TEXT NOT NULL,
+             line_start INTEGER NOT NULL,
+             line_end   INTEGER NOT NULL,
+             mtime      REAL NOT NULL,
+             file_size  INTEGER NOT NULL,
+             content    TEXT NOT NULL,
+             embedding  BLOB NOT NULL
+         );
+         CREATE INDEX IF NOT EXISTS idx_project_chunks_path ON project_chunks(path);
+     """)
+     con.commit()
+
+
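+ # Editorial sketch, not part of the released file: opening a per-project
+ # database. The ".skillforge/index.db" path is an assumption for illustration
+ # (the walker above skips ".skillforge", which suggests index state lives there).
+ def _example_open_project_db(project_root: Path) -> sqlite3.Connection:
+     db_path = project_root / ".skillforge" / "index.db"
+     db_path.parent.mkdir(parents=True, exist_ok=True)
+     con = sqlite3.connect(db_path)
+     ensure_project_index_schema(con)
+     return con
+
+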
+ def _meta_get(con: sqlite3.Connection, key: str) -> str | None:
+     cur = con.execute("SELECT value FROM project_index_meta WHERE key = ?", (key,))
+     row = cur.fetchone()
+     return str(row[0]) if row else None
+
+
+ def _meta_set(con: sqlite3.Connection, key: str, value: str) -> None:
+     con.execute(
+         "INSERT INTO project_index_meta (key, value) VALUES (?, ?) "
+         "ON CONFLICT(key) DO UPDATE SET value = excluded.value",
+         (key, value),
+     )
+
+
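+ # Editorial sketch, not part of the released file: _meta_set upserts, so
+ # repeated writes to one key keep a single row.
+ def _example_meta_upsert(con: sqlite3.Connection) -> None:
+     _meta_set(con, "embed_model", "model-a")
+     _meta_set(con, "embed_model", "model-b")
+     assert _meta_get(con, "embed_model") == "model-b"
+
+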
+ def _delete_chunks_for_path(con: sqlite3.Connection, relpath: str) -> None:
+     con.execute("DELETE FROM project_chunks WHERE path = ?", (relpath,))
+
+
+ def _blob_from_vec(vec: np.ndarray) -> bytes:
+     v = np.asarray(vec, dtype=np.float32).reshape(-1)
+     return v.tobytes()
+
+
+ def _vec_from_blob(blob: bytes, dim: int) -> np.ndarray:
+     arr = np.frombuffer(blob, dtype=np.float32)
+     if arr.size != dim:
+         raise ValueError(f"embedding size mismatch: got {arr.size}, expected {dim}")
+     return arr
+
+
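+ # Editorial sketch, not part of the released file: the BLOB helpers round-trip
+ # float32 vectors losslessly, which retrieval depends on.
+ def _example_blob_roundtrip() -> None:
+     v = np.arange(4, dtype=np.float32)
+     assert np.array_equal(_vec_from_blob(_blob_from_vec(v), 4), v)
+
+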
+ def index_project(
+     con: sqlite3.Connection,
+     project_root: str | Path,
+     embed_model,
+     *,
+     reset: bool = False,
+     now: Callable[[], float] | None = None,
+ ) -> dict[str, Any]:
+     """Chunk text files under ``project_root`` and store rows in ``project_chunks``.
+
+     Uses the same chunking window env vars as skills (``SKILLFORGE_CHUNK_*``).
+     """
+     ensure_project_index_schema(con)
+     t0 = time.time()
+     root = Path(project_root).expanduser().resolve()
+     if not root.is_dir():
+         raise FileNotFoundError(f"project_root is not a directory: {root}")
+
+     embed_model_name = os.getenv("SKILLFORGE_EMBED_MODEL", "all-MiniLM-L6-v2")
+     edim = int(embed_model.get_sentence_embedding_dimension())
+     mc = chunk_max_chars()
+     oc = chunk_overlap_chars()
+     max_bytes = index_max_file_bytes()
+
+     if reset:
+         con.execute("DELETE FROM project_chunks")
+         con.commit()
+
+     files_seen = 0
+     chunks_written = 0
+     files_skipped_size = 0
+     errors: list[str] = []
+
+     for abs_path in iter_project_files(root):
+         try:
+             rel = abs_path.relative_to(root).as_posix()
+         except ValueError:
+             continue
+         try:
+             st = abs_path.stat()
+         except OSError as e:
+             errors.append(f"{rel}: stat {e}")
+             continue
+         if st.st_size > max_bytes:
+             files_skipped_size += 1
+             continue
+         try:
+             text = abs_path.read_text(encoding="utf-8", errors="replace")
+         except OSError as e:
+             errors.append(f"{rel}: read {e}")
+             continue
+
+         chunks: list[SkillChunk] = chunk_raw_document(text, max_chars=mc, overlap=oc)
+         if not chunks:
+             _delete_chunks_for_path(con, rel)
+             continue
+
+         _delete_chunks_for_path(con, rel)
+         flat_texts: list[str] = []
+         rows: list[tuple[Any, ...]] = []
+         mtime = float(st.st_mtime)
+         fsize = int(st.st_size)
+         for ch in chunks:
+             # Prefix each chunk with its path so the embedding carries file context.
+             flat_texts.append(f"{rel}\n{ch.text}")
+         try:
+             emb = embed_model.encode(flat_texts, show_progress_bar=False, convert_to_numpy=True)
+         except Exception as e:
+             errors.append(f"{rel}: embed {e}")
+             con.rollback()
+             continue
+         emb = np.asarray(emb, dtype=np.float32)
+         if emb.ndim == 1:
+             emb = emb.reshape(1, -1)
+         # L2-normalize so retrieval can use a plain dot product as cosine similarity.
+         norms = np.linalg.norm(emb, axis=1, keepdims=True)
+         norms[norms == 0] = 1.0
+         emb = emb / norms
+         if emb.shape[1] != edim:
+             errors.append(f"{rel}: unexpected embed dim {emb.shape[1]}")
+             con.rollback()
+             continue
+
+         for ch, row_emb in zip(chunks, emb):
+             rows.append(
+                 (rel, ch.line_start, ch.line_end, mtime, fsize, ch.text, _blob_from_vec(row_emb))
+             )
+         con.executemany(
+             "INSERT INTO project_chunks (path, line_start, line_end, mtime, file_size, content, embedding) "
+             "VALUES (?, ?, ?, ?, ?, ?, ?)",
+             rows,
+         )
+         # Commit per file so a later file's rollback cannot discard earlier rows;
+         # count the file only once its rows are safely written.
+         con.commit()
+         files_seen += 1
+         chunks_written += len(rows)
+
+     con.commit()
+     _meta_set(con, "embed_model", embed_model_name)
+     _meta_set(con, "embedding_dim", str(edim))
+     _meta_set(con, "last_index_ts", str(time.time() if now is None else now()))
+     _meta_set(
+         con,
+         "last_index_stats",
+         json.dumps({
+             "root": str(root),
+             "files_indexed": files_seen,
+             "chunks_written": chunks_written,
+             "files_skipped_oversize": files_skipped_size,
+             "reset": reset,
+             "elapsed_sec": round(time.time() - t0, 3),
+             "errors": errors[:50],
+             "chunk_max_chars": mc,
+             "chunk_overlap": oc,
+         }),
+     )
+     con.commit()
+
+     return {
+         "root": str(root),
+         "files_indexed": files_seen,
+         "chunks_written": chunks_written,
+         "files_skipped_oversize": files_skipped_size,
+         "elapsed_sec": round(time.time() - t0, 3),
+         "errors": errors,
+     }
+
+
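+ # Editorial sketch, not part of the released file: a minimal end-to-end
+ # indexing run. Assumes the sentence-transformers package is installed; the
+ # model name mirrors the SKILLFORGE_EMBED_MODEL default used above.
+ def _example_index_repo(root: str = ".") -> dict[str, Any]:
+     from sentence_transformers import SentenceTransformer
+
+     con = sqlite3.connect(":memory:")
+     model = SentenceTransformer("all-MiniLM-L6-v2")
+     return index_project(con, root, model, reset=True)
+
+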
+ def retrieve_project_context_items(
+     con: sqlite3.Connection,
+     embed_model,
+     prompt: str,
+     max_total_chars: int | None = None,
+     *,
+     max_rows: int | None = None,
+     overhead_per_chunk: int = 56,
+ ) -> list[dict[str, Any]]:
+     """Return ranked project file chunks (same shape as skill context items + ``path``)."""
+     cap = project_rag_max_chars() if max_total_chars is None else max_total_chars
+     if cap <= 0 or not prompt.strip():
+         return []
+
+     ensure_project_index_schema(con)
+     # Bail out if the stored index was built with a different embedding model
+     # or dimension; mixing embedding spaces would make the scores meaningless.
+     store_model = _meta_get(con, "embed_model")
+     want_model = os.getenv("SKILLFORGE_EMBED_MODEL", "all-MiniLM-L6-v2")
+     if store_model and store_model != want_model:
+         return []
+
+     dim_s = _meta_get(con, "embedding_dim")
+     edim = int(dim_s) if dim_s else int(embed_model.get_sentence_embedding_dimension())
+     if dim_s and int(dim_s) != int(embed_model.get_sentence_embedding_dimension()):
+         return []
+
+     row_limit = max_rows if max_rows is not None else PROJECT_RAG_MAX_ROWS_DEFAULT
+     cur = con.execute(
+         "SELECT path, line_start, line_end, content, embedding FROM project_chunks LIMIT ?",
+         (row_limit,),
+     )
+     fetch = cur.fetchall()
+     if not fetch:
+         return []
+
+     paths: list[str] = []
+     ls: list[int] = []
+     le: list[int] = []
+     texts: list[str] = []
+     mat_list: list[np.ndarray] = []
+     for path, line_start, line_end, content, blob in fetch:
+         try:
+             v = _vec_from_blob(blob, edim)
+         except ValueError:
+             continue
+         paths.append(str(path))
+         ls.append(int(line_start))
+         le.append(int(line_end))
+         texts.append(str(content))
+         mat_list.append(v)
+     if not mat_list:
+         return []
+
+     # Stored rows are L2-normalized, so one matrix-vector product yields cosine scores.
+     mat = np.stack(mat_list, axis=0)
+     qv = np.asarray(embed_model.encode(prompt, convert_to_numpy=True), dtype=np.float32).reshape(-1)
+     qv = qv / max(float(np.linalg.norm(qv)), 1e-12)
+     scores = mat @ qv
+     order = np.argsort(-scores)
+
+     out: list[dict[str, Any]] = []
+     total = 0
+     for o in order:
+         i = int(o)
+         piece_len = len(texts[i]) + overhead_per_chunk
+         if total + piece_len > cap:
+             # Skip (rather than stop): a smaller chunk further down may still fit.
+             continue
+         out.append({
+             "skill": None,
+             "path": paths[i],
+             "line_start": ls[i],
+             "line_end": le[i],
+             "text": texts[i],
+             "score": float(scores[i]),
+         })
+         total += piece_len
+     return out
+
+
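+ # Editorial sketch, not part of the released file: ranked retrieval against an
+ # already-indexed project, printed as "path:start-end score".
+ def _example_retrieve(con: sqlite3.Connection, embed_model, prompt: str) -> None:
+     for it in retrieve_project_context_items(con, embed_model, prompt)[:3]:
+         print(f"{it['path']}:{it['line_start']}-{it['line_end']}  {it['score']:.3f}")
+
+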
+ def load_project_fusion_pool(
+     con: sqlite3.Connection,
+     embed_model,
+     prompt: str,
+     pool_limit: int,
+     *,
+     max_rows: int | None = None,
+ ) -> tuple[list[dict[str, Any]], np.ndarray, np.ndarray]:
+     """Top-``pool_limit`` project chunks by query similarity with embeddings (no char budget)."""
+
+     def _empty(dim: int) -> tuple[list[dict[str, Any]], np.ndarray, np.ndarray]:
+         # Empty pool, with float32 shapes consistent with the non-empty return.
+         return [], np.zeros((0, dim), dtype=np.float32), np.array([], dtype=np.float32)
+
+     model_dim = int(embed_model.get_sentence_embedding_dimension())
+     if pool_limit <= 0 or not prompt.strip():
+         return _empty(model_dim)
+
+     ensure_project_index_schema(con)
+     # Same guards as retrieve_project_context_items: refuse to score against an
+     # index built with a different model or dimension.
+     store_model = _meta_get(con, "embed_model")
+     want_model = os.getenv("SKILLFORGE_EMBED_MODEL", "all-MiniLM-L6-v2")
+     if store_model and store_model != want_model:
+         return _empty(model_dim)
+
+     dim_s = _meta_get(con, "embedding_dim")
+     edim = int(dim_s) if dim_s else model_dim
+     if dim_s and int(dim_s) != model_dim:
+         return _empty(model_dim)
+
+     row_cap = max_rows if max_rows is not None else PROJECT_RAG_MAX_ROWS_DEFAULT
+     cur = con.execute(
+         "SELECT path, line_start, line_end, content, embedding FROM project_chunks LIMIT ?",
+         (row_cap,),
+     )
+     fetch = cur.fetchall()
+     if not fetch:
+         return _empty(edim)
+
+     paths: list[str] = []
+     ls: list[int] = []
+     le: list[int] = []
+     texts: list[str] = []
+     mat_list: list[np.ndarray] = []
+     for path, line_start, line_end, content, blob in fetch:
+         try:
+             v = _vec_from_blob(blob, edim)
+         except ValueError:
+             continue
+         paths.append(str(path))
+         ls.append(int(line_start))
+         le.append(int(line_end))
+         texts.append(str(content))
+         mat_list.append(v)
+     if not mat_list:
+         return _empty(edim)
+
+     mat = np.stack(mat_list, axis=0)
+     qv = np.asarray(embed_model.encode(prompt, convert_to_numpy=True), dtype=np.float32).reshape(-1)
+     qv = qv / max(float(np.linalg.norm(qv)), 1e-12)
+     scores = mat @ qv
+     take = min(int(pool_limit), scores.shape[0])
+     order = np.argsort(-scores)[:take]
+
+     items: list[dict[str, Any]] = []
+     rows: list[np.ndarray] = []
+     rels: list[float] = []
+     for o in order:
+         i = int(o)
+         items.append({
+             "skill": None,
+             "path": paths[i],
+             "line_start": ls[i],
+             "line_end": le[i],
+             "text": texts[i],
+             "score": float(scores[i]),
+             "source": "file",
+         })
+         rows.append(mat[i])
+         rels.append(float(scores[i]))
+     if not rows:
+         return _empty(edim)
+     return items, np.stack(rows, axis=0), np.asarray(rels, dtype=np.float32)
+
+
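+ # Editorial sketch, not part of the released file: merging the project pool
+ # with another candidate pool by score. "other_items" is a hypothetical
+ # stand-in for the skill-derived pool this function is meant to fuse with.
+ def _example_fuse(
+     con: sqlite3.Connection, embed_model, prompt: str, other_items: list[dict[str, Any]]
+ ) -> list[dict[str, Any]]:
+     items, _mat, _rel = load_project_fusion_pool(con, embed_model, prompt, 32)
+     return sorted(items + other_items, key=lambda d: d["score"], reverse=True)
+
+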
+ def project_index_stats(con: sqlite3.Connection) -> dict[str, Any]:
+     ensure_project_index_schema(con)
+     cur = con.execute("SELECT COUNT(*) FROM project_chunks")
+     n = int(cur.fetchone()[0])
+     raw_stats = _meta_get(con, "last_index_stats")
+     parsed: Any = None
+     if raw_stats:
+         try:
+             parsed = json.loads(raw_stats)
+         except json.JSONDecodeError:
+             parsed = raw_stats
+     return {
+         "chunk_rows": n,
+         "embed_model": _meta_get(con, "embed_model"),
+         "embedding_dim": _meta_get(con, "embedding_dim"),
+         "last_index_ts": _meta_get(con, "last_index_ts"),
+         "last_index": parsed,
+     }
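+
+
+ # Editorial sketch, not part of the released file: quick health check.
+ def _example_stats(con: sqlite3.Connection) -> None:
+     info = project_index_stats(con)
+     print(info["chunk_rows"], info["embed_model"], info["last_index_ts"])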