@heytherevibin/skillforge 0.2.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +43 -0
- package/README.md +89 -56
- package/RELEASING.md +1 -1
- package/SECURITY.md +2 -2
- package/STRATEGY.md +1 -3
- package/bin/cli.js +32 -138
- package/package.json +2 -2
- package/python/app/chunking.py +116 -0
- package/python/app/context_fusion.py +77 -0
- package/python/app/events_cli.py +1 -1
- package/python/app/index_cli.py +89 -0
- package/python/app/main.py +632 -229
- package/python/app/mcp_contract.py +121 -0
- package/python/app/mcp_server.py +304 -30
- package/python/app/project_index.py +600 -0
- package/python/app/redaction.py +128 -0
- package/python/app/route_cli.py +42 -19
- package/python/app/route_policies.py +133 -0
- package/python/app/routing_signals.py +95 -0
- package/python/requirements.txt +1 -4
- package/python/tests/test_chunking.py +34 -0
- package/python/tests/test_context_fusion.py +45 -0
- package/python/tests/test_mcp_contract.py +137 -0
- package/python/tests/test_project_index.py +76 -0
- package/python/tests/test_redaction.py +51 -0
- package/python/tests/test_route_policies.py +115 -0
- package/python/tests/test_routing_signals.py +77 -0
- package/python/app/auth.py +0 -63
- package/python/app/cli.py +0 -78
package/bin/cli.js
CHANGED
|
@@ -5,17 +5,15 @@
|
|
|
5
5
|
* Usage:
|
|
6
6
|
* skillforge, skillforge --help Show help (primary path: MCP, not a web app)
|
|
7
7
|
* skillforge mcp MCP stdio server (Claude / Cursor / …)
|
|
8
|
-
* skillforge start [--port=8000] Optional headless HTTP API (no browser UI)
|
|
9
8
|
* skillforge events [--watch] [--limit=N] Print SQLite routing events
|
|
10
9
|
* skillforge route [words…] [--prompt=…] Same routing as MCP route_skills (terminal)
|
|
11
|
-
* skillforge
|
|
10
|
+
* skillforge index --project-root=… Chunk/embed repo files for project RAG
|
|
12
11
|
* skillforge install One-time Python venv + deps
|
|
13
|
-
* skillforge skills … / pack … /
|
|
12
|
+
* skillforge skills … / pack … / reset
|
|
14
13
|
*/
|
|
15
14
|
|
|
16
15
|
const path = require('path');
|
|
17
16
|
const fs = require('fs');
|
|
18
|
-
const crypto = require('crypto');
|
|
19
17
|
const { spawn, spawnSync } = require('child_process');
|
|
20
18
|
const os = require('os');
|
|
21
19
|
const packs = require('../lib/packs');
|
|
@@ -26,8 +24,8 @@ const CONFIG_DIR = path.join(os.homedir(), '.skillforge');
|
|
|
26
24
|
const VENV_DIR = path.join(CONFIG_DIR, 'venv');
|
|
27
25
|
const DATA_DIR = path.join(CONFIG_DIR, 'data');
|
|
28
26
|
const USER_SKILLS_DIR = path.join(CONFIG_DIR, 'skills');
|
|
29
|
-
|
|
30
|
-
const
|
|
27
|
+
/** Bearer-token file for the removed HTTP API (<=0.6.x); deleted on first CLI use. */
|
|
28
|
+
const LEGACY_AUTH_FILE = path.join(CONFIG_DIR, 'auth.json');
|
|
31
29
|
const SETUP_MARKER = path.join(CONFIG_DIR, '.setup-complete');
|
|
32
30
|
|
|
33
31
|
const args = process.argv.slice(2);
|
|
@@ -87,6 +85,18 @@ function ensureDirs() {
|
|
|
87
85
|
}
|
|
88
86
|
}
|
|
89
87
|
|
|
88
|
+
/**
 * Delete the bearer-token file left behind by the HTTP API.
 *
 * v0.7.0 removed the HTTP server and `skillforge auth`; a leftover tokens file
 * is misleading, so it is removed the first time the CLI runs afterwards.
 *
 * Best-effort: a failure is reported through `err()` and never thrown, so the
 * command the user actually asked for still runs.
 *
 * NOTE(review): this runs before every command, including `skillforge mcp`.
 * If `info()` writes to stdout it would precede the MCP stdio stream once —
 * confirm that `info()` targets stderr.
 */
function dropLegacyAuthJsonIfPresent() {
  try {
    // Remove directly instead of existsSync() followed by rmSync(): when two
    // CLI processes start at the same time, the loser of that race would hit
    // ENOENT and print a bogus "could not remove" error.
    fs.rmSync(LEGACY_AUTH_FILE);
  } catch (e) {
    if (e && e.code === 'ENOENT') {
      // Nothing to clean up (never existed, or another process removed it).
      return;
    }
    err(`Could not remove legacy auth.json: ${e.message}`);
    return;
  }
  info('Removed legacy ~/.skillforge/auth.json (HTTP API was removed in v0.7).');
}
|
|
99
|
+
|
|
90
100
|
function runSetup() {
|
|
91
101
|
info('First-time setup — this happens once and takes ~2 minutes');
|
|
92
102
|
ensureDirs();
|
|
@@ -145,77 +155,7 @@ function setupIfNeeded() {
|
|
|
145
155
|
}
|
|
146
156
|
}
|
|
147
157
|
|
|
148
|
-
// ---- API key check ----
|
|
149
|
-
function checkApiKey() {
|
|
150
|
-
if (!process.env.ANTHROPIC_API_KEY) {
|
|
151
|
-
err('ANTHROPIC_API_KEY environment variable is not set.');
|
|
152
|
-
log(c.dim(' Get a key at https://console.anthropic.com/'));
|
|
153
|
-
log(c.dim(' Then set it:'));
|
|
154
|
-
log(c.dim(' export ANTHROPIC_API_KEY=sk-ant-...'));
|
|
155
|
-
process.exit(1);
|
|
156
|
-
}
|
|
157
|
-
}
|
|
158
|
-
|
|
159
|
-
// ---- auth management ----
|
|
160
|
-
function loadAuth() {
|
|
161
|
-
if (!fs.existsSync(AUTH_FILE)) return {};
|
|
162
|
-
try { return JSON.parse(fs.readFileSync(AUTH_FILE, 'utf8')); } catch { return {}; }
|
|
163
|
-
}
|
|
164
|
-
function saveAuth(map) {
|
|
165
|
-
ensureDirs();
|
|
166
|
-
fs.writeFileSync(AUTH_FILE, JSON.stringify(map, null, 2), { mode: 0o600 });
|
|
167
|
-
}
|
|
168
|
-
function authToEnvVar(map) {
|
|
169
|
-
// map is { token: userId }. Convert and inject as JSON env var.
|
|
170
|
-
return JSON.stringify(map);
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
function authAdd(user) {
|
|
174
|
-
if (!user) { err('Usage: skillforge auth add <user-id>'); process.exit(1); }
|
|
175
|
-
const map = loadAuth();
|
|
176
|
-
// Generate a token
|
|
177
|
-
const token = 'sf_' + crypto.randomBytes(24).toString('base64url');
|
|
178
|
-
map[token] = user;
|
|
179
|
-
saveAuth(map);
|
|
180
|
-
ok(`Created token for user "${user}":`);
|
|
181
|
-
log('');
|
|
182
|
-
log(' ' + c.bold(token));
|
|
183
|
-
log('');
|
|
184
|
-
log(c.dim('Use this token in the Authorization header:'));
|
|
185
|
-
log(c.dim(` Authorization: Bearer ${token}`));
|
|
186
|
-
log(c.dim('Restart the server for the token to take effect.'));
|
|
187
|
-
}
|
|
188
|
-
|
|
189
|
-
function authList() {
|
|
190
|
-
const map = loadAuth();
|
|
191
|
-
const tokens = Object.entries(map);
|
|
192
|
-
if (tokens.length === 0) {
|
|
193
|
-
info('No auth tokens. Server runs in single-user mode.');
|
|
194
|
-
log(c.dim(' Add one with: skillforge auth add <user-id>'));
|
|
195
|
-
return;
|
|
196
|
-
}
|
|
197
|
-
log(c.bold('Auth tokens:'));
|
|
198
|
-
for (const [token, user] of tokens) {
|
|
199
|
-
log(` ${c.dim(token.slice(0, 16) + '...')} → ${user}`);
|
|
200
|
-
}
|
|
201
|
-
}
|
|
202
|
-
|
|
203
|
-
function authRemove(user) {
|
|
204
|
-
if (!user) { err('Usage: skillforge auth remove <user-id>'); process.exit(1); }
|
|
205
|
-
const map = loadAuth();
|
|
206
|
-
const before = Object.keys(map).length;
|
|
207
|
-
for (const [t, u] of Object.entries(map)) {
|
|
208
|
-
if (u === user) delete map[t];
|
|
209
|
-
}
|
|
210
|
-
const removed = before - Object.keys(map).length;
|
|
211
|
-
saveAuth(map);
|
|
212
|
-
if (removed > 0) ok(`Revoked ${removed} token(s) for "${user}"`);
|
|
213
|
-
else info(`No tokens for "${user}"`);
|
|
214
|
-
}
|
|
215
|
-
|
|
216
|
-
// ---- server lifecycle ----
|
|
217
158
|
function buildEnv(extra = {}) {
|
|
218
|
-
const authMap = loadAuth();
|
|
219
159
|
return {
|
|
220
160
|
...process.env,
|
|
221
161
|
SKILLFORGE_BUNDLED_SKILLS: path.join(PKG_ROOT, 'skills'),
|
|
@@ -223,36 +163,10 @@ function buildEnv(extra = {}) {
|
|
|
223
163
|
SKILLFORGE_DB_PATH: path.join(DATA_DIR, 'orchestrator.db'),
|
|
224
164
|
PYTHONPATH: path.join(PKG_ROOT, 'python'),
|
|
225
165
|
PYTHONUNBUFFERED: '1',
|
|
226
|
-
...(Object.keys(authMap).length > 0 ? { SKILLFORGE_AUTH_TOKENS: authToEnvVar(authMap) } : {}),
|
|
227
166
|
...extra,
|
|
228
167
|
};
|
|
229
168
|
}
|
|
230
169
|
|
|
231
|
-
function startServer({ port = 8000 } = {}) {
|
|
232
|
-
setupIfNeeded();
|
|
233
|
-
checkApiKey();
|
|
234
|
-
|
|
235
|
-
const env = buildEnv({ SKILLFORGE_PORT: String(port) });
|
|
236
|
-
const authEnabled = Object.keys(loadAuth()).length > 0;
|
|
237
|
-
|
|
238
|
-
info(`Starting HTTP API on http://localhost:${port}`);
|
|
239
|
-
log(c.dim(' Live log: skillforge events --watch'));
|
|
240
|
-
log(c.dim(` Skills dir: ${USER_SKILLS_DIR} (drop folders here to add)`));
|
|
241
|
-
log(c.dim(` Data dir: ${DATA_DIR}`));
|
|
242
|
-
log(c.dim(` Auth: ${authEnabled ? 'enabled (bearer token required)' : 'disabled (single-user)'}`));
|
|
243
|
-
log('');
|
|
244
|
-
|
|
245
|
-
const proc = spawn(
|
|
246
|
-
venvPython(),
|
|
247
|
-
['-m', 'uvicorn', 'app.main:app', '--host', '0.0.0.0', '--port', String(port)],
|
|
248
|
-
{ stdio: 'inherit', env }
|
|
249
|
-
);
|
|
250
|
-
|
|
251
|
-
proc.on('exit', (code) => process.exit(code || 0));
|
|
252
|
-
process.on('SIGINT', () => proc.kill('SIGINT'));
|
|
253
|
-
process.on('SIGTERM', () => proc.kill('SIGTERM'));
|
|
254
|
-
}
|
|
255
|
-
|
|
256
170
|
function printMcpConfig() {
|
|
257
171
|
setupIfNeeded();
|
|
258
172
|
const useLocal = args.includes('--local');
|
|
@@ -313,12 +227,14 @@ function runRouteCmd() {
|
|
|
313
227
|
proc.on('exit', (code) => process.exit(code ?? 0));
|
|
314
228
|
}
|
|
315
229
|
|
|
316
|
-
function
|
|
230
|
+
/**
 * `skillforge index …`: run `python -m app.index_cli` with the interpreter
 * returned by `venvPython()`, forwarding every CLI argument after the
 * subcommand, then exit this process with the child's status.
 */
function runIndexCmd() {
  setupIfNeeded();
  const forwarded = args.slice(1);
  const proc = spawn(venvPython(), ['-m', 'app.index_cli', ...forwarded], {
    stdio: 'inherit',
    env: buildEnv(),
  });
  // Without an 'error' listener a failed spawn (e.g. the interpreter is
  // missing) surfaces as an unhandled 'error' event instead of a clean exit.
  proc.on('error', (e) => {
    err(`Could not start the indexer: ${e.message}`);
    process.exit(1);
  });
  // `code` is null when the child was terminated by a signal; that is a
  // failure and must not be reported as exit status 0.
  proc.on('exit', (code) => process.exit(code ?? 1));
}
|
|
323
239
|
|
|
324
240
|
// ---- skill management ----
|
|
@@ -341,7 +257,7 @@ function skillsAdd(srcPath) {
|
|
|
341
257
|
const dest = path.join(USER_SKILLS_DIR, name);
|
|
342
258
|
fs.cpSync(src, dest, { recursive: true });
|
|
343
259
|
ok(`Added skill "${name}" → ${dest}`);
|
|
344
|
-
log(c.dim(' Restart
|
|
260
|
+
log(c.dim(' Restart skillforge mcp (or trigger catalog reload) to pick up the new skill.'));
|
|
345
261
|
}
|
|
346
262
|
|
|
347
263
|
function skillsList() {
|
|
@@ -373,7 +289,7 @@ function skillsRemove(name) {
|
|
|
373
289
|
}
|
|
374
290
|
const target = path.join(USER_SKILLS_DIR, name);
|
|
375
291
|
if (!fs.existsSync(target)) {
|
|
376
|
-
err(`No user skill named "${name}". Bundled skills cannot be removed (use disable_skill via MCP
|
|
292
|
+
err(`No user skill named "${name}". Bundled skills cannot be removed (use disable_skill via MCP).`);
|
|
377
293
|
process.exit(1);
|
|
378
294
|
}
|
|
379
295
|
fs.rmSync(target, { recursive: true, force: true });
|
|
@@ -398,10 +314,9 @@ ${c.bold('Run modes:')}
|
|
|
398
314
|
skillforge --help This message (recommended first step)
|
|
399
315
|
skillforge mcp MCP stdio — primary integration for Claude / Cursor
|
|
400
316
|
skillforge mcp config [--local] [--with-anthropic] Print JSON for MCP host (merge into mcp.json)
|
|
401
|
-
skillforge start [--port=8000] Optional HTTP API (no web dashboard)
|
|
402
317
|
skillforge events [--watch] [--limit=N] [--verbose] [--user=…] Live routing log + usage (see --help)
|
|
403
|
-
skillforge route [words…] [--project-root=…] [--
|
|
404
|
-
skillforge
|
|
318
|
+
skillforge route [words…] [--project-root=…] [--include-project-rag] Route a prompt (see skillforge route --help)
|
|
319
|
+
skillforge index --project-root=… [--reset] [--stats-only] Index repo text for include_project_rag
|
|
405
320
|
|
|
406
321
|
${c.bold('Skills:')}
|
|
407
322
|
skillforge skills list List bundled and user skills
|
|
@@ -414,11 +329,6 @@ ${c.bold('Skill packs (install from git):')}
|
|
|
414
329
|
skillforge pack update <name> Update a pack
|
|
415
330
|
skillforge pack remove <name> Uninstall a pack
|
|
416
331
|
|
|
417
|
-
${c.bold('Auth (multi-user mode):')}
|
|
418
|
-
skillforge auth add <user> Create a bearer token for a user
|
|
419
|
-
skillforge auth list List users with tokens
|
|
420
|
-
skillforge auth remove <user> Revoke all tokens for a user
|
|
421
|
-
|
|
422
332
|
${c.bold('Maintenance:')}
|
|
423
333
|
skillforge reset Wipe learned state and event log
|
|
424
334
|
skillforge install Re-run setup (auto-runs on first launch)
|
|
@@ -436,29 +346,25 @@ ${c.bold('MCP integration:')}
|
|
|
436
346
|
|
|
437
347
|
// ---- main ----
|
|
438
348
|
async function main() {
|
|
349
|
+
dropLegacyAuthJsonIfPresent();
|
|
350
|
+
|
|
439
351
|
if (args.includes('--help') || args.includes('-h') || cmd === 'help') {
|
|
440
352
|
showHelp();
|
|
441
353
|
return;
|
|
442
354
|
}
|
|
443
355
|
|
|
444
|
-
const portArg = args.find((a) => a.startsWith('--port='));
|
|
445
|
-
const port = portArg ? parseInt(portArg.split('=')[1], 10) : 8000;
|
|
446
|
-
|
|
447
356
|
switch (cmd) {
|
|
448
357
|
case undefined:
|
|
449
358
|
showHelp();
|
|
450
359
|
break;
|
|
451
|
-
case 'start':
|
|
452
|
-
startServer({ port });
|
|
453
|
-
break;
|
|
454
360
|
case 'events':
|
|
455
361
|
runEventsCmd();
|
|
456
362
|
break;
|
|
457
363
|
case 'route':
|
|
458
364
|
runRouteCmd();
|
|
459
365
|
break;
|
|
460
|
-
case '
|
|
461
|
-
|
|
366
|
+
case 'index':
|
|
367
|
+
runIndexCmd();
|
|
462
368
|
break;
|
|
463
369
|
case 'mcp':
|
|
464
370
|
if (args[1] === 'config') {
|
|
@@ -492,7 +398,7 @@ async function main() {
|
|
|
492
398
|
const result = packs.installPack(args[2]);
|
|
493
399
|
ok(`Installed pack "${result.name}" (${result.version}) with ${result.skills.length} skill(s):`);
|
|
494
400
|
result.skills.forEach(s => log(' ' + c.dim('•'), s));
|
|
495
|
-
log(c.dim(' Restart
|
|
401
|
+
log(c.dim(' Restart skillforge mcp (or trigger catalog reload) to pick up new skills.'));
|
|
496
402
|
} else if (sub === 'list') {
|
|
497
403
|
const list = packs.listPacks();
|
|
498
404
|
if (list.length === 0) {
|
|
@@ -522,18 +428,6 @@ async function main() {
|
|
|
522
428
|
}
|
|
523
429
|
break;
|
|
524
430
|
}
|
|
525
|
-
case 'auth': {
|
|
526
|
-
const sub = args[1];
|
|
527
|
-
if (sub === 'add') authAdd(args[2]);
|
|
528
|
-
else if (sub === 'list') authList();
|
|
529
|
-
else if (sub === 'remove' || sub === 'rm') authRemove(args[2]);
|
|
530
|
-
else {
|
|
531
|
-
err(`Unknown auth subcommand: ${sub}`);
|
|
532
|
-
log(c.dim(' Try: add, list, remove'));
|
|
533
|
-
process.exit(1);
|
|
534
|
-
}
|
|
535
|
-
break;
|
|
536
|
-
}
|
|
537
431
|
default:
|
|
538
432
|
err(`Unknown command: ${cmd}`);
|
|
539
433
|
showHelp();
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@heytherevibin/skillforge",
|
|
3
|
-
"version": "0.
|
|
4
|
-
"description": "Skill orchestration for Claude: hybrid embedding and router-based routing, MCP
|
|
3
|
+
"version": "0.8.0",
|
|
4
|
+
"description": "Skill orchestration for Claude: hybrid embedding and router-based routing, MCP stdio server, per-user learning, and a large bundled SKILL.md catalog.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"claude",
|
|
7
7
|
"skills",
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Split SKILL.md bodies into line-bounded chunks for RAG-style retrieval."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def chunk_max_chars() -> int:
    """Return the maximum chunk size in characters.

    Reads ``SKILLFORGE_CHUNK_MAX_CHARS`` (default ``1200``) and never returns
    less than 400. A blank or non-integer value falls back to the default
    instead of raising ``ValueError`` from deep inside a chunking call.
    """
    default = 1200
    raw = os.getenv("SKILLFORGE_CHUNK_MAX_CHARS", "")
    try:
        value = int(raw) if raw.strip() else default
    except ValueError:
        value = default
    return max(400, value)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def chunk_overlap_chars() -> int:
    """Return the overlap between consecutive character windows.

    Reads ``SKILLFORGE_CHUNK_OVERLAP`` (default ``200``) and never returns a
    negative number. A blank or non-integer value falls back to the default
    instead of raising ``ValueError``.
    """
    default = 200
    raw = os.getenv("SKILLFORGE_CHUNK_OVERLAP", "")
    try:
        value = int(raw) if raw.strip() else default
    except ValueError:
        value = default
    return max(0, value)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class SkillChunk:
    """A piece of body text together with the line span it was taken from.

    Line numbers are 1-based and inclusive, counted within the body text.
    """

    # The chunk's text.
    text: str
    # First body line covered by the chunk (1-based).
    line_start: int
    # Last body line covered by the chunk (inclusive).
    line_end: int
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _split_long_segment(text: str, line_start: int, max_chars: int, overlap: int) -> list[SkillChunk]:
|
|
26
|
+
"""Character windows with overlap; ``line_start`` is the body line of ``text[0]`` (1-based)."""
|
|
27
|
+
if not text:
|
|
28
|
+
return []
|
|
29
|
+
line_no = line_start
|
|
30
|
+
line_at_idx: list[int] = []
|
|
31
|
+
for ch in text:
|
|
32
|
+
line_at_idx.append(line_no)
|
|
33
|
+
if ch == "\n":
|
|
34
|
+
line_no += 1
|
|
35
|
+
n = len(text)
|
|
36
|
+
out: list[SkillChunk] = []
|
|
37
|
+
i = 0
|
|
38
|
+
while i < n:
|
|
39
|
+
end = min(i + max_chars, n)
|
|
40
|
+
piece = text[i:end].strip()
|
|
41
|
+
if piece:
|
|
42
|
+
ls = line_at_idx[i]
|
|
43
|
+
le = line_at_idx[end - 1]
|
|
44
|
+
out.append(SkillChunk(piece, ls, le))
|
|
45
|
+
if end >= n:
|
|
46
|
+
break
|
|
47
|
+
adv = max(1, end - i - overlap)
|
|
48
|
+
i += adv
|
|
49
|
+
if out:
|
|
50
|
+
return out
|
|
51
|
+
st = text.strip()
|
|
52
|
+
if not st:
|
|
53
|
+
return []
|
|
54
|
+
le_fallback = line_start + max(0, text.count("\n"))
|
|
55
|
+
return [SkillChunk(st, line_start, max(line_start, le_fallback))]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def chunk_skill_body(body: str, *, max_chars: int | None = None, overlap: int | None = None) -> list[SkillChunk]:
    """Chunk by markdown headings (lines starting with ``#``) then hard-split long sections.

    Args:
        body: Skill body text; ``None`` or blank yields no chunks.
        max_chars: Maximum chunk size; defaults to ``chunk_max_chars()``.
        overlap: Window overlap for hard-split sections; defaults to
            ``chunk_overlap_chars()``.

    Returns:
        Chunks in document order. A section that fits in ``max_chars`` becomes
        one chunk whose line span is the whole section (surrounding blank lines
        included). A longer section is handed to ``_split_long_segment``.

    Empty body yields no chunks (caller may treat as single empty).
    """
    mc = max_chars if max_chars is not None else chunk_max_chars()
    ov = overlap if overlap is not None else chunk_overlap_chars()
    b = body or ""
    if not b.strip():
        return []

    lines = b.split("\n")
    sections: list[tuple[str, int, int]] = []
    cur: list[str] = []
    cur_start = 1
    for ln, line in enumerate(lines, start=1):
        # NOTE(review): every line starting with "#" opens a section, including
        # "#" comments inside fenced code blocks — confirm that is intended.
        if line.startswith("#") and cur:
            sections.append(("\n".join(cur), cur_start, ln - 1))
            cur = [line]
            cur_start = ln
        else:
            cur.append(line)
    if cur:
        sections.append(("\n".join(cur), cur_start, len(lines)))

    chunks: list[SkillChunk] = []
    for raw, ls, le in sections:
        text = raw.strip()
        if not text:
            continue
        if len(text) <= mc:
            chunks.append(SkillChunk(text, ls, le))
            continue
        # The splitter expects the line of text[0]. strip() may have removed
        # leading blank lines, so move the start line past them; otherwise
        # every line number reported for this section is shifted.
        lead = len(raw) - len(raw.lstrip())
        first_line = ls + raw.count("\n", 0, lead)
        chunks.extend(_split_long_segment(text, first_line, mc, ov))
    return chunks if chunks else [SkillChunk(b.strip(), 1, max(1, len(lines)))]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def chunk_raw_document(
    body: str,
    *,
    max_chars: int | None = None,
    overlap: int | None = None,
) -> list[SkillChunk]:
    """Chunk arbitrary file text with line-bounded windows (no markdown section split).

    Line numbers are 1-based within the normalized document (``\\r\\n`` → ``\\n``).

    Args:
        body: File text; empty or all-whitespace text yields no chunks.
        max_chars: Maximum chunk size; defaults to ``chunk_max_chars()``.
        overlap: Window overlap; defaults to ``chunk_overlap_chars()``.

    Returns:
        A single chunk holding the normalized text when it fits in
        ``max_chars``, otherwise the windows produced by
        ``_split_long_segment``.
    """
    mc = max_chars if max_chars is not None else chunk_max_chars()
    ov = overlap if overlap is not None else chunk_overlap_chars()
    if not body:
        return []
    normalized = body.replace("\r\n", "\n")
    if not normalized.strip():
        return []
    if len(normalized) <= mc:
        # A trailing newline terminates the last line; it does not start a new
        # one. Counting it would make line_end point one past the final line.
        line_count = normalized.count("\n") + (0 if normalized.endswith("\n") else 1)
        return [SkillChunk(normalized, 1, max(1, line_count))]
    return _split_long_segment(normalized, 1, mc, ov)
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""MMR-based selection to fuse skill + project chunks under one character budget."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def mmr_select(
    embeddings: np.ndarray,
    relevance: np.ndarray,
    text_lengths: np.ndarray,
    *,
    char_budget: int,
    overhead_per_chunk: int | np.ndarray,
    lambda_mult: float,
) -> tuple[list[int], list[dict[str, Any]]]:
    """Greedy MMR over normalized row embeddings.

    Each step maximizes ``lambda_mult * rel[i] - (1 - lambda_mult) * max_{j in selected} sim(i, j)``
    among the candidates that still fit in the remaining character budget.
    ``sim`` is the dot product of the embedding rows, which is why the rows are
    expected to be normalized.

    Args:
        embeddings: One row per candidate.
        relevance: Relevance score per candidate.
        text_lengths: Text length per candidate, in characters.
        char_budget: Total characters (text + overhead) that may be selected.
        overhead_per_chunk: Extra characters charged per pick, either one
            scalar for every candidate (Python or NumPy scalar) or one value
            per candidate.
        lambda_mult: Relevance/diversity trade-off; clamped to ``[0, 1]``.

    Returns selected **indices** in pick order and a trace row per pick (for telemetry).

    Raises:
        ValueError: If the per-candidate inputs do not all have one entry per
            embedding row.
    """
    n = int(embeddings.shape[0])
    if n == 0 or char_budget <= 0:
        return [], []

    lam = max(0.0, min(1.0, float(lambda_mult)))
    rel = np.asarray(relevance, dtype=np.float64).reshape(-1)
    lens = np.asarray(text_lengths, dtype=np.int64).reshape(-1)
    emb = np.asarray(embeddings, dtype=np.float32)
    # np.ndim() also recognises NumPy scalars (np.int64 is not a Python int),
    # which would otherwise become a length-1 array and fail the alignment check.
    if np.ndim(overhead_per_chunk) == 0:
        ovh = np.full(n, int(overhead_per_chunk), dtype=np.int64)
    else:
        ovh = np.asarray(overhead_per_chunk, dtype=np.int64).reshape(-1)
    if emb.shape[0] != n or rel.shape[0] != n or lens.shape[0] != n or ovh.shape[0] != n:
        raise ValueError("embeddings, relevance, text_lengths, and overheads must align")

    # Cost of each candidate as plain Python ints (no int64 overflow).
    need = [int(length) + int(extra) for length, extra in zip(lens.tolist(), ovh.tolist())]

    selected: list[int] = []
    trace: list[dict[str, Any]] = []
    used = 0
    # Ascending list: ties are resolved in favour of the lowest index without
    # relying on set iteration order. Zero/negative-cost rows are never picked.
    remaining = [i for i in range(n) if need[i] > 0]

    while remaining:
        # `used` only grows, so a candidate that does not fit now never will.
        remaining = [i for i in remaining if used + need[i] <= char_budget]
        if not remaining:
            break

        # Loop-invariant for this round: embeddings of everything picked so far.
        picked = emb[np.array(selected, dtype=np.int64)] if selected else None

        best_i: int | None = None
        best_mmr = -1e18
        best_div = 0.0
        for i in remaining:
            div = float(np.max(emb[i] @ picked.T)) if picked is not None else 0.0
            mmr = lam * float(rel[i]) - (1.0 - lam) * div
            if mmr > best_mmr:
                best_mmr = mmr
                best_i = i
                best_div = div
        if best_i is None:
            # Every remaining score was NaN or not above the sentinel.
            break

        selected.append(best_i)
        used += need[best_i]
        remaining.remove(best_i)
        trace.append({
            "pool_index": best_i,
            "mmr": round(float(best_mmr), 6),
            "relevance": round(float(rel[best_i]), 6),
            "max_sim_to_selected": round(best_div, 6),
        })
    return selected, trace
|
package/python/app/events_cli.py
CHANGED
|
@@ -123,7 +123,7 @@ def main() -> None:
|
|
|
123
123
|
db_path = resolve_orchestrator_db(pr)
|
|
124
124
|
|
|
125
125
|
if not db_path.exists():
|
|
126
|
-
print("No database yet — run skillforge mcp
|
|
126
|
+
print("No database yet — run skillforge mcp first (or route once with this project_root).")
|
|
127
127
|
print(f" Expected: {db_path}")
|
|
128
128
|
return
|
|
129
129
|
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""CLI: index project files into ``<project>/.skillforge/orchestrator.db`` for project RAG."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from app.db_paths import resolve_orchestrator_db
|
|
11
|
+
from app.main import build_router_and_skills, init_db
|
|
12
|
+
from app.project_index import index_project, project_index_stats
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _parse_args(argv: list[str] | None) -> argparse.Namespace:
|
|
16
|
+
p = argparse.ArgumentParser(
|
|
17
|
+
description=(
|
|
18
|
+
"Chunk and embed text files under project_root into the per-repo orchestrator DB. "
|
|
19
|
+
"Use with MCP route_skills/include_project_rag or skillforge route --include-project-rag."
|
|
20
|
+
),
|
|
21
|
+
)
|
|
22
|
+
p.add_argument(
|
|
23
|
+
"--project-root",
|
|
24
|
+
required=True,
|
|
25
|
+
help="Repository root directory to index (writes .skillforge/orchestrator.db).",
|
|
26
|
+
)
|
|
27
|
+
p.add_argument(
|
|
28
|
+
"--reset",
|
|
29
|
+
action="store_true",
|
|
30
|
+
help="Clear all project_chunks rows before re-indexing.",
|
|
31
|
+
)
|
|
32
|
+
p.add_argument(
|
|
33
|
+
"--stats-only",
|
|
34
|
+
action="store_true",
|
|
35
|
+
help="Print index metadata from DB and exit (no scan/embed).",
|
|
36
|
+
)
|
|
37
|
+
p.add_argument(
|
|
38
|
+
"--quiet",
|
|
39
|
+
action="store_true",
|
|
40
|
+
help="Skip progress messages on stderr from skill loading.",
|
|
41
|
+
)
|
|
42
|
+
return p.parse_args(argv)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
async def _run(args: argparse.Namespace) -> int:
|
|
46
|
+
root_s = args.project_root.strip()
|
|
47
|
+
if not root_s:
|
|
48
|
+
print("skillforge index: --project-root is required.", file=sys.stderr)
|
|
49
|
+
return 2
|
|
50
|
+
root = Path(root_s).expanduser().resolve()
|
|
51
|
+
db_path = resolve_orchestrator_db(str(root))
|
|
52
|
+
db_path.parent.mkdir(parents=True, exist_ok=True)
|
|
53
|
+
|
|
54
|
+
con = init_db(db_path)
|
|
55
|
+
try:
|
|
56
|
+
if args.stats_only:
|
|
57
|
+
print(json.dumps({"db": str(db_path), **project_index_stats(con)}, indent=2))
|
|
58
|
+
return 0
|
|
59
|
+
|
|
60
|
+
router, _ = await asyncio.to_thread(
|
|
61
|
+
build_router_and_skills,
|
|
62
|
+
log=not args.quiet,
|
|
63
|
+
log_prefix="[skillforge-index]",
|
|
64
|
+
)
|
|
65
|
+
stats = await asyncio.to_thread(
|
|
66
|
+
index_project,
|
|
67
|
+
con,
|
|
68
|
+
root,
|
|
69
|
+
router.embed_model,
|
|
70
|
+
reset=args.reset,
|
|
71
|
+
)
|
|
72
|
+
print(
|
|
73
|
+
json.dumps(
|
|
74
|
+
{"db": str(db_path), "index_state": project_index_stats(con), **stats},
|
|
75
|
+
indent=2,
|
|
76
|
+
)
|
|
77
|
+
)
|
|
78
|
+
return 0
|
|
79
|
+
finally:
|
|
80
|
+
con.close()
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def main(argv: list[str] | None = None) -> None:
    """Entry point: parse ``argv``, run the async indexer, exit with its return code.

    Always leaves by raising ``SystemExit`` carrying the value returned by ``_run``.
    """
    parsed = _parse_args(argv)
    exit_code = asyncio.run(_run(parsed))
    raise SystemExit(exit_code)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
if __name__ == "__main__":
|
|
89
|
+
main()
|