@heytherevibin/skillforge 0.2.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +44 -53
- package/RELEASING.md +1 -1
- package/SECURITY.md +2 -2
- package/STRATEGY.md +1 -3
- package/bin/cli.js +32 -138
- package/package.json +2 -2
- package/python/app/chunking.py +116 -0
- package/python/app/context_fusion.py +77 -0
- package/python/app/events_cli.py +1 -1
- package/python/app/index_cli.py +89 -0
- package/python/app/main.py +380 -214
- package/python/app/mcp_contract.py +121 -0
- package/python/app/mcp_server.py +80 -28
- package/python/app/project_index.py +600 -0
- package/python/app/redaction.py +128 -0
- package/python/app/route_cli.py +42 -19
- package/python/requirements.txt +0 -4
- package/python/tests/test_chunking.py +34 -0
- package/python/tests/test_context_fusion.py +45 -0
- package/python/tests/test_mcp_contract.py +137 -0
- package/python/tests/test_project_index.py +76 -0
- package/python/tests/test_redaction.py +51 -0
- package/python/app/auth.py +0 -63
- package/python/app/cli.py +0 -78
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Split SKILL.md bodies into line-bounded chunks for RAG-style retrieval."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def chunk_max_chars() -> int:
    """Maximum characters per chunk; tunable via ``SKILLFORGE_CHUNK_MAX_CHARS``, floored at 400."""
    configured = int(os.getenv("SKILLFORGE_CHUNK_MAX_CHARS", "1200"))
    return configured if configured > 400 else 400
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def chunk_overlap_chars() -> int:
    """Characters shared between consecutive windows; ``SKILLFORGE_CHUNK_OVERLAP``, never negative."""
    configured = int(os.getenv("SKILLFORGE_CHUNK_OVERLAP", "200"))
    return configured if configured > 0 else 0
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class SkillChunk:
    """One span of a skill body with 1-based inclusive line numbers (within the body text)."""

    text: str  # chunk text (the chunkers strip surrounding whitespace before storing it)
    line_start: int  # first body line covered, 1-based
    line_end: int  # last body line covered, inclusive
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _split_long_segment(text: str, line_start: int, max_chars: int, overlap: int) -> list[SkillChunk]:
    """Character windows with overlap; ``line_start`` is the body line of ``text[0]`` (1-based).

    Fix: line numbers are attributed to the *stripped* window content, so a
    window that begins or ends with newlines no longer reports lines that
    contain none of its text.
    """
    if not text:
        return []
    # Map every character index to its 1-based body line number.
    line_no = line_start
    line_at_idx: list[int] = []
    for ch in text:
        line_at_idx.append(line_no)
        if ch == "\n":
            line_no += 1
    n = len(text)
    out: list[SkillChunk] = []
    i = 0
    while i < n:
        end = min(i + max_chars, n)
        window = text[i:end]
        piece = window.strip()
        if piece:
            # Skip the stripped leading/trailing whitespace when locating the
            # first and last characters that actually appear in ``piece``.
            first = i + (len(window) - len(window.lstrip()))
            last = end - (len(window) - len(window.rstrip())) - 1
            out.append(SkillChunk(piece, line_at_idx[first], line_at_idx[last]))
        if end >= n:
            break
        # Always advance at least one char so pathological overlap values terminate.
        i += max(1, end - i - overlap)
    if out:
        return out
    # Defensive fallback: every window stripped empty yet the text has content.
    st = text.strip()
    if not st:
        return []
    le_fallback = line_start + max(0, text.count("\n"))
    return [SkillChunk(st, line_start, max(line_start, le_fallback))]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def chunk_skill_body(body: str, *, max_chars: int | None = None, overlap: int | None = None) -> list[SkillChunk]:
    """Chunk by markdown headings (lines starting with ``#``) then hard-split long sections.

    Empty body yields no chunks (caller may treat as single empty).
    """
    limit = chunk_max_chars() if max_chars is None else max_chars
    lap = chunk_overlap_chars() if overlap is None else overlap
    text = body or ""
    if not text.strip():
        return []

    lines = text.split("\n")
    # Gather (section_text, first_line, last_line) spans, cutting before each heading.
    sections: list[tuple[str, int, int]] = []
    buf: list[str] = []
    buf_start = 1
    for idx, line in enumerate(lines, start=1):
        if buf and line.startswith("#"):
            sections.append(("\n".join(buf), buf_start, idx - 1))
            buf = [line]
            buf_start = idx
        else:
            buf.append(line)
    if buf:
        sections.append(("\n".join(buf), buf_start, len(lines)))

    chunks: list[SkillChunk] = []
    for sect, first, last in sections:
        sect = sect.strip()
        if not sect:
            continue
        if len(sect) > limit:
            chunks.extend(_split_long_segment(sect, first, limit, lap))
        else:
            chunks.append(SkillChunk(sect, first, last))
    if chunks:
        return chunks
    # Defensive: sections were all blank even though the body had content.
    return [SkillChunk(text.strip(), 1, max(1, len(lines)))]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def chunk_raw_document(
    body: str,
    *,
    max_chars: int | None = None,
    overlap: int | None = None,
) -> list[SkillChunk]:
    """Chunk arbitrary file text with line-bounded windows (no markdown section split).

    Line numbers are 1-based within the normalized document (``\\r\\n`` → ``\\n``).
    """
    limit = chunk_max_chars() if max_chars is None else max_chars
    lap = chunk_overlap_chars() if overlap is None else overlap
    if not body:
        return []
    text = body.replace("\r\n", "\n")
    if not text.strip():
        return []
    total_lines = text.count("\n") + 1
    if len(text) > limit:
        return _split_long_segment(text, 1, limit, lap)
    # Short document: keep it as a single chunk spanning every line.
    return [SkillChunk(text, 1, max(1, total_lines))]
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""MMR-based selection to fuse skill + project chunks under one character budget."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def mmr_select(
    embeddings: np.ndarray,
    relevance: np.ndarray,
    text_lengths: np.ndarray,
    *,
    char_budget: int,
    overhead_per_chunk: int | np.ndarray,
    lambda_mult: float,
) -> tuple[list[int], list[dict[str, Any]]]:
    """Greedy MMR over normalized row embeddings.

    Each step maximizes ``lambda_mult * rel[i] - (1 - lambda_mult) * max_{j in selected} sim(i, j)``.

    Args:
        embeddings: ``(n, d)`` row embeddings (normalized, so dot product = cosine).
        relevance: per-row relevance scores, shape ``(n,)``.
        text_lengths: character length of each chunk, shape ``(n,)``.
        char_budget: total character budget (lengths + overheads) across picks.
        overhead_per_chunk: fixed per-chunk overhead — a scalar (Python ``int``
            or numpy integer) applied uniformly, or a per-row array of shape ``(n,)``.
        lambda_mult: relevance/diversity trade-off, clamped to ``[0, 1]``.

    Returns selected **indices** in pick order and a trace row per pick (for telemetry).

    Raises:
        ValueError: if the per-row inputs do not all have length ``n``.
    """
    n = int(embeddings.shape[0])
    if n == 0 or char_budget <= 0:
        return [], []

    lam = max(0.0, min(1.0, float(lambda_mult)))
    rel = np.asarray(relevance, dtype=np.float64).reshape(-1)
    lens = np.asarray(text_lengths, dtype=np.int64).reshape(-1)
    emb = np.asarray(embeddings, dtype=np.float32)
    # Fix: also treat numpy integer scalars (e.g. np.int64) as a uniform
    # overhead; previously they fell into the array branch and failed the
    # alignment check below.
    if isinstance(overhead_per_chunk, (int, np.integer)):
        ovh = np.full(n, int(overhead_per_chunk), dtype=np.int64)
    else:
        ovh = np.asarray(overhead_per_chunk, dtype=np.int64).reshape(-1)
    if emb.shape[0] != n or rel.shape[0] != n or lens.shape[0] != n or ovh.shape[0] != n:
        raise ValueError("embeddings, relevance, text_lengths, and overheads must align")

    selected: list[int] = []
    trace: list[dict[str, Any]] = []
    used = 0
    remaining = set(range(n))

    while remaining:
        best_i: int | None = None
        best_mmr = -1e18
        best_div = 0.0
        # Hoist the selected-embedding matrix out of the candidate loop.
        sel_emb = emb[np.array(selected, dtype=np.int64)] if selected else None
        for i in remaining:
            need = int(lens[i]) + int(ovh[i])
            if need <= 0 or used + need > char_budget:
                continue  # would blow the budget (or degenerate length)
            div = float(np.max(emb[i] @ sel_emb.T)) if sel_emb is not None else 0.0
            mmr = lam * float(rel[i]) - (1.0 - lam) * div
            if mmr > best_mmr:
                best_mmr = mmr
                best_i = i
                best_div = div  # remember the winner's diversity term for the trace
        if best_i is None:
            break  # nothing affordable remains
        selected.append(best_i)
        used += int(lens[best_i]) + int(ovh[best_i])
        remaining.remove(best_i)
        trace.append({
            "pool_index": best_i,
            "mmr": round(float(best_mmr), 6),
            "relevance": round(float(rel[best_i]), 6),
            "max_sim_to_selected": round(best_div, 6),
        })
    return selected, trace
|
package/python/app/events_cli.py
CHANGED
|
@@ -123,7 +123,7 @@ def main() -> None:
|
|
|
123
123
|
db_path = resolve_orchestrator_db(pr)
|
|
124
124
|
|
|
125
125
|
if not db_path.exists():
|
|
126
|
-
print("No database yet — run skillforge mcp
|
|
126
|
+
print("No database yet — run skillforge mcp first (or route once with this project_root).")
|
|
127
127
|
print(f" Expected: {db_path}")
|
|
128
128
|
return
|
|
129
129
|
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""CLI: index project files into ``<project>/.skillforge/orchestrator.db`` for project RAG."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from app.db_paths import resolve_orchestrator_db
|
|
11
|
+
from app.main import build_router_and_skills, init_db
|
|
12
|
+
from app.project_index import index_project, project_index_stats
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _parse_args(argv: list[str] | None) -> argparse.Namespace:
|
|
16
|
+
p = argparse.ArgumentParser(
|
|
17
|
+
description=(
|
|
18
|
+
"Chunk and embed text files under project_root into the per-repo orchestrator DB. "
|
|
19
|
+
"Use with MCP route_skills/include_project_rag or skillforge route --include-project-rag."
|
|
20
|
+
),
|
|
21
|
+
)
|
|
22
|
+
p.add_argument(
|
|
23
|
+
"--project-root",
|
|
24
|
+
required=True,
|
|
25
|
+
help="Repository root directory to index (writes .skillforge/orchestrator.db).",
|
|
26
|
+
)
|
|
27
|
+
p.add_argument(
|
|
28
|
+
"--reset",
|
|
29
|
+
action="store_true",
|
|
30
|
+
help="Clear all project_chunks rows before re-indexing.",
|
|
31
|
+
)
|
|
32
|
+
p.add_argument(
|
|
33
|
+
"--stats-only",
|
|
34
|
+
action="store_true",
|
|
35
|
+
help="Print index metadata from DB and exit (no scan/embed).",
|
|
36
|
+
)
|
|
37
|
+
p.add_argument(
|
|
38
|
+
"--quiet",
|
|
39
|
+
action="store_true",
|
|
40
|
+
help="Skip progress messages on stderr from skill loading.",
|
|
41
|
+
)
|
|
42
|
+
return p.parse_args(argv)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
async def _run(args: argparse.Namespace) -> int:
    """Execute the index command; returns a process exit code (0 ok, 2 usage error)."""
    raw_root = args.project_root.strip()
    if not raw_root:
        print("skillforge index: --project-root is required.", file=sys.stderr)
        return 2
    root = Path(raw_root).expanduser().resolve()
    db_path = resolve_orchestrator_db(str(root))
    db_path.parent.mkdir(parents=True, exist_ok=True)

    con = init_db(db_path)
    try:
        if args.stats_only:
            # Report existing index metadata without scanning or embedding.
            payload = {"db": str(db_path), **project_index_stats(con)}
            print(json.dumps(payload, indent=2))
            return 0

        # Skill loading and indexing are blocking work; run them off the event loop.
        router, _ = await asyncio.to_thread(
            build_router_and_skills,
            log=not args.quiet,
            log_prefix="[skillforge-index]",
        )
        stats = await asyncio.to_thread(
            index_project,
            con,
            root,
            router.embed_model,
            reset=args.reset,
        )
        summary = {"db": str(db_path), "index_state": project_index_stats(con), **stats}
        print(json.dumps(summary, indent=2))
        return 0
    finally:
        con.close()
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def main(argv: list[str] | None = None) -> None:
    """Entry point: parse arguments, run the async command, exit with its code."""
    exit_code = asyncio.run(_run(_parse_args(argv)))
    raise SystemExit(exit_code)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# Allow direct execution (python -m app.index_cli or as a script).
if __name__ == "__main__":
    main()
|