@heytherevibin/skillforge 0.2.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,116 @@
1
+ """Split SKILL.md bodies into line-bounded chunks for RAG-style retrieval."""
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ from dataclasses import dataclass
6
+
7
+
8
def chunk_max_chars() -> int:
    """Return the per-chunk character cap (env ``SKILLFORGE_CHUNK_MAX_CHARS``).

    Defaults to 1200 and is floored at 400. A malformed (non-integer) env value
    falls back to the default instead of crashing the chunker at import time.
    """
    raw = os.getenv("SKILLFORGE_CHUNK_MAX_CHARS", "1200")
    try:
        value = int(raw)
    except ValueError:
        value = 1200  # bad env value: use the documented default
    return max(400, value)
10
+
11
+
12
def chunk_overlap_chars() -> int:
    """Return the inter-window overlap in characters (env ``SKILLFORGE_CHUNK_OVERLAP``).

    Defaults to 200 and is floored at 0. A malformed (non-integer) env value
    falls back to the default instead of raising, matching ``chunk_max_chars``.
    """
    raw = os.getenv("SKILLFORGE_CHUNK_OVERLAP", "200")
    try:
        value = int(raw)
    except ValueError:
        value = 200  # bad env value: use the documented default
    return max(0, value)
14
+
15
+
16
@dataclass
class SkillChunk:
    """One span of a skill body with 1-based inclusive line numbers (within the body text)."""

    # The chunk's text content (the chunkers strip whitespace before construction
    # on most paths — TODO confirm for chunk_raw_document's short-document path).
    text: str
    # 1-based line number of the chunk's first line within the body.
    line_start: int
    # 1-based line number of the chunk's last line (inclusive).
    line_end: int
23
+
24
+
25
def _split_long_segment(text: str, line_start: int, max_chars: int, overlap: int) -> list[SkillChunk]:
    """Character windows with overlap; ``line_start`` is the body line of ``text[0]`` (1-based)."""
    if not text:
        return []
    # Map each character index to its 1-based body line number.
    line_for_index: list[int] = []
    current_line = line_start
    for character in text:
        line_for_index.append(current_line)
        if character == "\n":
            current_line += 1
    total = len(text)
    chunks: list[SkillChunk] = []
    start = 0
    while start < total:
        stop = min(start + max_chars, total)
        window = text[start:stop].strip()
        if window:
            first = line_for_index[start]
            last = line_for_index[stop - 1]
            chunks.append(SkillChunk(window, first, last))
        if stop >= total:
            break
        # Step forward by the window size minus the overlap, always progressing.
        start += max(1, stop - start - overlap)
    if chunks:
        return chunks
    # Every window stripped to nothing; fall back to one chunk of the whole text.
    whole = text.strip()
    if not whole:
        return []
    last_line = line_start + max(0, text.count("\n"))
    return [SkillChunk(whole, line_start, max(line_start, last_line))]
56
+
57
+
58
def chunk_skill_body(body: str, *, max_chars: int | None = None, overlap: int | None = None) -> list[SkillChunk]:
    """Chunk by markdown headings (lines starting with ``#``) then hard-split long sections.

    Args:
        body: Skill body text. Empty/whitespace-only body yields no chunks
            (caller may treat as single empty).
        max_chars: Per-chunk character cap; defaults to :func:`chunk_max_chars`.
        overlap: Window overlap for over-long sections; defaults to
            :func:`chunk_overlap_chars`.

    Returns:
        Chunks with 1-based inclusive line numbers into ``body``.
    """
    mc = max_chars if max_chars is not None else chunk_max_chars()
    ov = overlap if overlap is not None else chunk_overlap_chars()
    b = body or ""
    if not b.strip():
        return []

    lines = b.split("\n")
    # Each section is (raw_text, first_line, last_line), split at heading lines.
    sections: list[tuple[str, int, int]] = []
    cur: list[str] = []
    cur_start = 1
    for i, line in enumerate(lines):
        ln = i + 1
        if line.startswith("#") and cur:
            sections.append(("\n".join(cur), cur_start, ln - 1))
            cur = [line]
            cur_start = ln
        else:
            cur.append(line)
    if cur:
        sections.append(("\n".join(cur), cur_start, len(lines)))

    chunks: list[SkillChunk] = []
    for text, ls, le in sections:
        stripped = text.strip()
        if not stripped:
            continue
        # Re-anchor line numbers to the stripped span: stripping leading blank
        # lines would otherwise shift every reported line number upward, and
        # trailing blanks would let line_end overshoot the actual text.
        ls += text[: len(text) - len(text.lstrip())].count("\n")
        le = min(le, ls + stripped.count("\n"))
        if len(stripped) <= mc:
            chunks.append(SkillChunk(stripped, ls, le))
        else:
            chunks.extend(_split_long_segment(stripped, ls, mc, ov))
    return chunks if chunks else [SkillChunk(b.strip(), 1, max(1, len(lines)))]
94
+
95
+
96
def chunk_raw_document(
    body: str,
    *,
    max_chars: int | None = None,
    overlap: int | None = None,
) -> list[SkillChunk]:
    """Chunk arbitrary file text with line-bounded windows (no markdown section split).

    Line numbers are 1-based within the normalized document (``\\r\\n`` → ``\\n``).
    """
    cap = chunk_max_chars() if max_chars is None else max_chars
    ovl = chunk_overlap_chars() if overlap is None else overlap
    if not body:
        return []
    text = body.replace("\r\n", "\n")
    if not text.strip():
        return []
    # Short documents become a single chunk covering every line.
    if len(text) <= cap:
        total_lines = text.count("\n") + 1
        return [SkillChunk(text, 1, max(1, total_lines))]
    return _split_long_segment(text, 1, cap, ovl)
@@ -0,0 +1,77 @@
1
+ """MMR-based selection to fuse skill + project chunks under one character budget."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Any
5
+
6
+ import numpy as np
7
+
8
+
9
+ def mmr_select(
10
+ embeddings: np.ndarray,
11
+ relevance: np.ndarray,
12
+ text_lengths: np.ndarray,
13
+ *,
14
+ char_budget: int,
15
+ overhead_per_chunk: int | np.ndarray,
16
+ lambda_mult: float,
17
+ ) -> tuple[list[int], list[dict[str, Any]]]:
18
+ """Greedy MMR over normalized row embeddings.
19
+
20
+ Each step maximizes ``lambda_mult * rel[i] - (1 - lambda_mult) * max_{j in selected} sim(i, j)``.
21
+
22
+ Returns selected **indices** in pick order and a trace row per pick (for telemetry).
23
+ """
24
+ n = int(embeddings.shape[0])
25
+ if n == 0 or char_budget <= 0:
26
+ return [], []
27
+
28
+ lam = float(lambda_mult)
29
+ lam = max(0.0, min(1.0, lam))
30
+ rel = np.asarray(relevance, dtype=np.float64).reshape(-1)
31
+ lens = np.asarray(text_lengths, dtype=np.int64).reshape(-1)
32
+ emb = np.asarray(embeddings, dtype=np.float32)
33
+ if isinstance(overhead_per_chunk, int):
34
+ ovh = np.full(n, int(overhead_per_chunk), dtype=np.int64)
35
+ else:
36
+ ovh = np.asarray(overhead_per_chunk, dtype=np.int64).reshape(-1)
37
+ if emb.shape[0] != n or rel.shape[0] != n or lens.shape[0] != n or ovh.shape[0] != n:
38
+ raise ValueError("embeddings, relevance, text_lengths, and overheads must align")
39
+
40
+ selected: list[int] = []
41
+ trace: list[dict[str, Any]] = []
42
+ used = 0
43
+ remaining = set(range(n))
44
+
45
+ while remaining:
46
+ best_i: int | None = None
47
+ best_mmr = -1e18
48
+ for i in remaining:
49
+ need = int(lens[i]) + int(ovh[i])
50
+ if need <= 0 or used + need > char_budget:
51
+ continue
52
+ if not selected:
53
+ div = 0.0
54
+ else:
55
+ sims = emb[i] @ emb[np.array(selected, dtype=np.int64)].T
56
+ div = float(np.max(sims))
57
+ mmr = lam * float(rel[i]) - (1.0 - lam) * div
58
+ if mmr > best_mmr:
59
+ best_mmr = mmr
60
+ best_i = i
61
+ if best_i is None:
62
+ break
63
+ if selected:
64
+ sims = emb[best_i] @ emb[np.array(selected, dtype=np.int64)].T
65
+ div_used = float(np.max(sims))
66
+ else:
67
+ div_used = 0.0
68
+ selected.append(best_i)
69
+ used += int(lens[best_i]) + int(ovh[best_i])
70
+ remaining.remove(best_i)
71
+ trace.append({
72
+ "pool_index": best_i,
73
+ "mmr": round(float(best_mmr), 6),
74
+ "relevance": round(float(rel[best_i]), 6),
75
+ "max_sim_to_selected": round(div_used, 6),
76
+ })
77
+ return selected, trace
@@ -123,7 +123,7 @@ def main() -> None:
123
123
  db_path = resolve_orchestrator_db(pr)
124
124
 
125
125
  if not db_path.exists():
126
- print("No database yet — run skillforge mcp or skillforge start first (or route once with this project_root).")
126
+ print("No database yet — run skillforge mcp first (or route once with this project_root).")
127
127
  print(f" Expected: {db_path}")
128
128
  return
129
129
 
@@ -0,0 +1,89 @@
1
+ """CLI: index project files into ``<project>/.skillforge/orchestrator.db`` for project RAG."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import asyncio
6
+ import json
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ from app.db_paths import resolve_orchestrator_db
11
+ from app.main import build_router_and_skills, init_db
12
+ from app.project_index import index_project, project_index_stats
13
+
14
+
15
+ def _parse_args(argv: list[str] | None) -> argparse.Namespace:
16
+ p = argparse.ArgumentParser(
17
+ description=(
18
+ "Chunk and embed text files under project_root into the per-repo orchestrator DB. "
19
+ "Use with MCP route_skills/include_project_rag or skillforge route --include-project-rag."
20
+ ),
21
+ )
22
+ p.add_argument(
23
+ "--project-root",
24
+ required=True,
25
+ help="Repository root directory to index (writes .skillforge/orchestrator.db).",
26
+ )
27
+ p.add_argument(
28
+ "--reset",
29
+ action="store_true",
30
+ help="Clear all project_chunks rows before re-indexing.",
31
+ )
32
+ p.add_argument(
33
+ "--stats-only",
34
+ action="store_true",
35
+ help="Print index metadata from DB and exit (no scan/embed).",
36
+ )
37
+ p.add_argument(
38
+ "--quiet",
39
+ action="store_true",
40
+ help="Skip progress messages on stderr from skill loading.",
41
+ )
42
+ return p.parse_args(argv)
43
+
44
+
45
async def _run(args: argparse.Namespace) -> int:
    """Execute the index CLI: open the per-repo DB, then either print stats or (re)index.

    Returns a process exit code: 0 on success, 2 when --project-root is blank.
    """
    root_text = args.project_root.strip()
    if not root_text:
        print("skillforge index: --project-root is required.", file=sys.stderr)
        return 2
    project_root = Path(root_text).expanduser().resolve()
    db_path = resolve_orchestrator_db(str(project_root))
    db_path.parent.mkdir(parents=True, exist_ok=True)

    con = init_db(db_path)
    try:
        if args.stats_only:
            # Report existing index metadata without scanning or embedding.
            payload = {"db": str(db_path), **project_index_stats(con)}
            print(json.dumps(payload, indent=2))
            return 0

        # Skill loading and indexing are blocking; run them off the event loop.
        router, _ = await asyncio.to_thread(
            build_router_and_skills,
            log=not args.quiet,
            log_prefix="[skillforge-index]",
        )
        stats = await asyncio.to_thread(
            index_project,
            con,
            project_root,
            router.embed_model,
            reset=args.reset,
        )
        report = {"db": str(db_path), "index_state": project_index_stats(con), **stats}
        print(json.dumps(report, indent=2))
        return 0
    finally:
        con.close()
81
+
82
+
83
def main(argv: list[str] | None = None) -> None:
    """CLI entry point: parse arguments, run the indexer, exit with its status code."""
    parsed = _parse_args(argv)
    exit_code = asyncio.run(_run(parsed))
    raise SystemExit(exit_code)


if __name__ == "__main__":
    main()