memstack-skill-loader 3.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memstack_skill_loader/__init__.py +1 -0
- memstack_skill_loader/__main__.py +18 -0
- memstack_skill_loader/compression.py +345 -0
- memstack_skill_loader/config.py +114 -0
- memstack_skill_loader/dashboard.html +829 -0
- memstack_skill_loader/dashboard.py +360 -0
- memstack_skill_loader/indexer.py +240 -0
- memstack_skill_loader/license.py +409 -0
- memstack_skill_loader/search.py +164 -0
- memstack_skill_loader/server.py +883 -0
- memstack_skill_loader/stats.py +428 -0
- memstack_skill_loader/tfidf_search.py +142 -0
- memstack_skill_loader/version_check.py +93 -0
- memstack_skill_loader-3.5.0.dist-info/METADATA +10 -0
- memstack_skill_loader-3.5.0.dist-info/RECORD +18 -0
- memstack_skill_loader-3.5.0.dist-info/WHEEL +5 -0
- memstack_skill_loader-3.5.0.dist-info/entry_points.txt +2 -0
- memstack_skill_loader-3.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
"""Local HTTP dashboard server for MemStack usage analytics."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import pickle
|
|
6
|
+
import re
|
|
7
|
+
import sys
|
|
8
|
+
import webbrowser
|
|
9
|
+
from http.server import BaseHTTPRequestHandler, HTTPServer
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from .config import load_config
|
|
13
|
+
from .license import is_pro_exclusive
|
|
14
|
+
import sqlite3
|
|
15
|
+
from .stats import DB_PATH, get_dashboard_data, get_project_details, get_skill_fire_counts
|
|
16
|
+
|
|
17
|
+
# Location of the dashboard page, resolved relative to this module's own file.
_HTML_PATH = Path(__file__).parent / "dashboard.html"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _load_ignore_set() -> frozenset[str]:
    """Return the lower-cased skill names listed in ``.memstack-ignore``.

    The file is looked up in the current working directory. Blank lines and
    lines starting with ``#`` are skipped. A missing or unreadable file
    yields an empty set.
    """
    ignore_file = Path.cwd() / ".memstack-ignore"
    if not ignore_file.exists():
        return frozenset()

    try:
        raw_text = ignore_file.read_text(encoding="utf-8")
    except Exception:
        return frozenset()

    names = set()
    for entry in raw_text.splitlines():
        entry = entry.strip()
        if entry and not entry.startswith("#"):
            names.add(entry.lower())
    return frozenset(names)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Skill slugs grouped under the category label they map to.
_CATEGORY_MEMBERS = (
    ("Automation", ("automation", "cron-scheduler", "n8n-workflow",
                    "api-integration", "webhook-receiver")),
    ("Business", ("business", "quill", "scan", "governor")),
    ("Content", ("content", "content-pipeline", "humanize")),
    ("Deployment", ("deployment", "railway-deploy", "docker-deploy",
                    "vps-deploy")),
    ("Development", ("development", "forge", "shard", "state", "work",
                     "verify", "project", "familiar")),
    ("Core", ("compress", "diary", "echo", "grimoire", "sight",
              "token-optimization")),
    ("Marketing", ("marketing", "seo-geo")),
    ("Product", ("product",)),
    ("Security", ("security", "advanced-security", "env-manager-pro")),
)

# Flat slug -> category lookup derived from the grouped table above.
_CATEGORY_MAP = {
    member: label
    for label, members in _CATEGORY_MEMBERS
    for member in members
}


def _derive_category(slug: str) -> str:
    """Look up the category for *slug*; unknown slugs map to ``"Other"``."""
    if slug in _CATEGORY_MAP:
        return _CATEGORY_MAP[slug]
    return "Other"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _strip_non_ascii(text: str) -> str:
    """Drop every character outside the 7-bit ASCII range, then trim whitespace."""
    ascii_only = "".join(ch for ch in text if ord(ch) < 0x80)
    return ascii_only.strip()
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _skills_error_payload(message: str) -> dict:
    """Return the empty catalog payload reported when the index is unusable."""
    return {
        "project": os.path.basename(os.getcwd()),
        "skills": [],
        "total": 0,
        "free_count": 0,
        "pro_count": 0,
        "disabled_count": 0,
        "error": message,
    }


def _get_skills_data() -> dict:
    """Build the full skill catalog with fire counts and status.

    Reads the pickled TF-IDF index from the configured vector-DB directory
    and combines each indexed skill with its fire count, Pro flag, derived
    category and enabled state (from ``.memstack-ignore``).

    Returns:
        A dict with ``project``, ``skills``, ``total``, ``free_count``,
        ``pro_count`` and ``disabled_count``. On success it also carries the
        fixed ``display_*`` counts; when the index is missing or cannot be
        loaded it carries an ``error`` message and an empty ``skills`` list.
    """
    config = load_config()
    pkl_path = config.resolved_vector_db_path / "tfidf_index.pkl"

    if not pkl_path.exists():
        return _skills_error_payload("No skill index found. Run reindex_skills first.")

    try:
        # NOTE(security): unpickling executes arbitrary code. This is only
        # acceptable because the file is produced locally by the indexer;
        # never point the vector-DB path at untrusted data.
        with open(pkl_path, "rb") as f:
            index = pickle.load(f)
    except Exception:
        return _skills_error_payload("Failed to load skill index.")

    # A pickle that loads but is not the expected mapping is treated like a
    # corrupt index rather than crashing the request handler.
    if not isinstance(index, dict):
        return _skills_error_payload("Failed to load skill index.")

    skills_raw = index.get("skills", [])
    fire_counts = get_skill_fire_counts()
    ignored = _load_ignore_set()

    skills = []
    free_count = 0
    pro_count = 0
    disabled_count = 0

    for s in skills_raw:
        if not isinstance(s, dict):
            continue  # skip malformed entries instead of failing the catalog

        name_raw = s.get("name", "")
        slug = s.get("slug", name_raw)
        is_pro = is_pro_exclusive(slug)

        # A skill is disabled when any of its spellings is listed: the slug,
        # the display name, or the display name with emoji/unicode removed.
        name_ascii = _strip_non_ascii(name_raw).lower()
        enabled = (slug.lower() not in ignored
                   and name_raw.lower() not in ignored
                   and name_ascii not in ignored)

        if is_pro:
            pro_count += 1
        else:
            free_count += 1
        if not enabled:
            disabled_count += 1

        skills.append({
            "name": name_raw,
            "slug": slug,
            "description": s.get("description", ""),
            "source_label": s.get("source_label", ""),
            "category": _derive_category(slug),
            "is_pro": is_pro,
            "enabled": enabled,
            "fire_count": fire_counts.get(name_raw, 0),
        })

    return {
        "project": os.path.basename(os.getcwd()),
        "skills": skills,
        "total": len(skills),
        "free_count": free_count,
        "pro_count": pro_count,
        "disabled_count": disabled_count,
        # Display counts — use fixed values since index may not have all Pro stubs
        "display_free_count": 85,
        "display_pro_count": 29,
        "display_total": 114,
    }
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _write_ignore_file(ignore_path: Path, lines: list[str]) -> None:
    """Write *lines* to a sibling ``.tmp`` file, then rename it over *ignore_path*."""
    ignore_path.parent.mkdir(parents=True, exist_ok=True)
    tmp = ignore_path.with_suffix(".tmp")
    tmp.write_text("\n".join(lines) + "\n", encoding="utf-8")
    tmp.replace(ignore_path)


def _toggle_skill(skill_name: str, action: str) -> dict:
    """Enable or disable a skill by updating ``.memstack-ignore`` in the CWD.

    Args:
        skill_name: Display name or slug of the skill.
        action: ``"enable"`` or ``"disable"``.

    Returns:
        ``{"success": True, "skill", "enabled", "disabled_count"}`` on
        success, or ``{"success": False, "error"}`` when the action is
        invalid, the skill is not in the index, or the ignore file cannot be
        read or updated.
    """
    ignore_path = Path.cwd() / ".memstack-ignore"

    # Validate action
    if action not in ("enable", "disable"):
        return {"success": False, "error": "Invalid action. Use 'enable' or 'disable'."}

    # Validate skill exists in index. This is best-effort: if the index
    # cannot be loaded, the toggle proceeds without validation.
    config = load_config()
    pkl_path = config.resolved_vector_db_path / "tfidf_index.pkl"
    if pkl_path.exists():
        known = None
        try:
            # NOTE(security): unpickling executes arbitrary code; the index
            # must only ever be a locally generated file.
            with open(pkl_path, "rb") as f:
                index = pickle.load(f)
            indexed = index.get("skills", [])
            known = {s.get("name", "").lower() for s in indexed}
            known |= {s.get("slug", "").lower() for s in indexed}
        except Exception:
            known = None
        if known is not None and skill_name.lower() not in known:
            return {"success": False, "error": f"Skill '{skill_name}' not found in index."}

    # Read existing ignore file. A read failure must abort: continuing with
    # an empty list would overwrite the file and drop every existing entry.
    existing_lines: list[str] = []
    if ignore_path.exists():
        try:
            existing_lines = ignore_path.read_text(encoding="utf-8").splitlines()
        except (OSError, UnicodeDecodeError):
            return {"success": False, "error": "Could not read .memstack-ignore."}

    name_lower = skill_name.lower()
    ascii_name = _strip_non_ascii(skill_name)
    name_lower_ascii = ascii_name.lower()

    if action == "disable":
        # Use a set so the reported count matches _load_ignore_set(), which
        # also de-duplicates entries.
        active = {
            line.strip().lower()
            for line in existing_lines
            if line.strip() and not line.strip().startswith("#")
        }
        if name_lower in active or name_lower_ascii in active:
            return {"success": True, "skill": skill_name, "enabled": False,
                    "disabled_count": len(active)}

        # Only ASCII is written to .memstack-ignore. A name with no ASCII
        # characters would become a blank line that disables nothing.
        if not ascii_name:
            return {"success": False,
                    "error": f"Skill '{skill_name}' has no ASCII name to record."}

        try:
            _write_ignore_file(ignore_path, existing_lines + [ascii_name])
        except OSError:
            return {"success": False, "error": "Could not update .memstack-ignore."}
        return {"success": True, "skill": skill_name, "enabled": False,
                "disabled_count": len(_load_ignore_set())}

    # action == "enable"
    if not ignore_path.exists():
        return {"success": True, "skill": skill_name, "enabled": True,
                "disabled_count": 0}

    # Never match the empty string, so blank lines are left alone when the
    # ASCII form of the name is empty.
    targets = {name_lower, name_lower_ascii} - {""}
    new_lines = [
        line for line in existing_lines
        if line.strip().lower() not in targets
    ]
    if len(new_lines) == len(existing_lines):
        # Nothing to remove: the skill was not listed.
        return {"success": True, "skill": skill_name, "enabled": True,
                "disabled_count": len(_load_ignore_set())}

    try:
        if not any(l.strip() and not l.strip().startswith("#") for l in new_lines):
            # No active entries remain, so the file itself is removed.
            ignore_path.unlink()
            return {"success": True, "skill": skill_name, "enabled": True,
                    "disabled_count": 0}
        _write_ignore_file(ignore_path, new_lines)
    except OSError:
        return {"success": False, "error": "Could not update .memstack-ignore."}

    return {"success": True, "skill": skill_name, "enabled": True,
            "disabled_count": len(_load_ignore_set())}
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _get_category_skills() -> dict:
    """Return skill fire counts grouped by category.

    Returns:
        A mapping of category to a list of ``{"name", "fires"}`` dicts,
        ordered by descending fire count within each category. Rows with a
        NULL category are grouped under ``"uncategorized"``. An empty dict
        is returned when the stats database is missing or the query fails.
    """
    if not DB_PATH.exists():
        return {}
    try:
        conn = sqlite3.connect(str(DB_PATH), timeout=5)
        try:
            rows = conn.execute(
                """
                SELECT COALESCE(category, 'uncategorized') as cat, skill_name, COUNT(*) as fires
                FROM skill_fires
                GROUP BY cat, skill_name
                ORDER BY cat, fires DESC
                """,
            ).fetchall()
        finally:
            # Close even when the query raises, so the handle is not leaked.
            conn.close()
    except sqlite3.Error:
        return {}

    result: dict[str, list[dict]] = {}
    for cat, name, fires in rows:
        result.setdefault(cat, []).append({"name": name, "fires": fires})
    return result
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _get_diary_entries() -> list[dict]:
    """Collect markdown diary entries from the known diary directories.

    Looks in ``~/.memstack/diary`` and then ``./memory/sessions``. Within
    each directory, files are visited in reverse filename order. Files that
    cannot be read are skipped.
    """
    search_roots = (
        Path.home() / ".memstack" / "diary",
        Path.cwd() / "memory" / "sessions",
    )

    collected = []
    for root in search_roots:
        if not root.exists():
            continue
        for md_file in sorted(root.glob("*.md"), reverse=True):
            try:
                text = md_file.read_text(encoding="utf-8", errors="replace")
            except Exception:
                continue
            stem = md_file.stem
            # The first line, minus markdown heading markers, is the title.
            heading = text.split("\n", 1)[0].strip("#").strip()
            collected.append({
                # Filenames look like 2026-04-04-multi-agent-session.md, so
                # the first ten characters are the date.
                "date": stem[:10],
                "filename": md_file.name,
                "title": heading or stem,
                "content": text,
                "source": str(root),
            })
    return collected
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
class _Handler(BaseHTTPRequestHandler):
    """Request handler for the dashboard page and its JSON API.

    ``GET /`` serves the dashboard HTML, the ``GET /api/*`` routes return
    JSON, and ``POST /api/skills/toggle`` enables or disables a skill.
    Any other path gets a 404.
    """

    def log_message(self, format, *args):
        pass  # suppress default logging

    def do_GET(self):
        """Serve the dashboard page or one of the read-only JSON endpoints."""
        if self.path == "/":
            self._respond(200, "text/html", _HTML_PATH.read_bytes())
            return

        # Built per request so the data providers are looked up when called.
        api_routes = {
            "/api/stats": get_dashboard_data,
            "/api/skills": _get_skills_data,
            "/api/diary": _get_diary_entries,
            "/api/projects": get_project_details,
            "/api/category-skills": _get_category_skills,
        }
        provider = api_routes.get(self.path)
        if provider is None:
            self.send_error(404)
            return
        self._respond_json(200, provider())

    def do_POST(self):
        """Handle ``POST /api/skills/toggle`` with a JSON object body.

        The body must contain a non-empty string ``skill`` and an ``action``
        of ``"enable"`` or ``"disable"``. Malformed requests get a 400 with
        ``{"success": False, "error": ...}`` instead of an unhandled
        exception.
        """
        if self.path != "/api/skills/toggle":
            self.send_error(404)
            return

        try:
            content_length = int(self.headers.get("Content-Length", 0))
        except (TypeError, ValueError):
            self._respond_json(400, {"success": False, "error": "Invalid Content-Length header."})
            return
        # A negative length would make rfile.read() block until EOF.
        if content_length <= 0:
            self._respond_json(400, {"success": False, "error": "Request body required."})
            return

        raw = self.rfile.read(content_length)
        try:
            data = json.loads(raw)
        except (json.JSONDecodeError, ValueError):
            data = None
        # Valid JSON that is not an object (list, number, ...) has no fields.
        if not isinstance(data, dict):
            self._respond_json(400, {"success": False, "error": "Invalid JSON body."})
            return

        skill = data.get("skill", "")
        action = data.get("action", "")
        skill = skill.strip() if isinstance(skill, str) else ""
        action = action.strip() if isinstance(action, str) else ""

        if not skill:
            self._respond_json(400, {"success": False, "error": "Missing 'skill' field."})
            return

        if action not in ("enable", "disable"):
            self._respond_json(
                400, {"success": False, "error": "Invalid action. Use 'enable' or 'disable'."})
            return

        result = _toggle_skill(skill, action)
        self._respond_json(200 if result.get("success") else 400, result)

    def _respond_json(self, status: int, payload) -> None:
        """Serialize *payload* as JSON and send it with the given status."""
        self._respond(status, "application/json", json.dumps(payload).encode())

    def _respond(self, status: int, content_type: str, body: bytes):
        """Send a complete response: status line, headers and *body*."""
        self.send_response(status)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(body)))
        # NOTE(review): the wildcard lets any web page the user visits read
        # these endpoints (including diary contents) while the dashboard is
        # running. Consider dropping this header — confirm no other origin
        # needs it first.
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        self.wfile.write(body)
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def start_dashboard(port: int = 3333, open_browser: bool = True):
    """Run the dashboard HTTP server on 127.0.0.1 until interrupted (blocking).

    Args:
        port: TCP port to bind on the loopback interface.
        open_browser: When true, open the dashboard URL in the default
            browser before serving.
    """
    bind_address = ("127.0.0.1", port)
    httpd = HTTPServer(bind_address, _Handler)

    url = f"http://localhost:{port}"
    print(f"MemStack\u2122 Dashboard running at {url}", file=sys.stderr)

    if open_browser:
        webbrowser.open(url)

    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the dashboard.
        print("\nDashboard stopped.", file=sys.stderr)
    finally:
        httpd.server_close()
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
"""Skill indexer — reads SKILL.md files and builds LanceDB vector index."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import lancedb
|
|
7
|
+
|
|
8
|
+
from .config import Config, load_config
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _parse_frontmatter(content: str) -> tuple[dict, str]:
    """Extract simple ``key: value`` front matter and the body from markdown.

    The front matter must start with ``---`` at the very beginning of
    *content* and end at the next line that consists solely of ``---``.
    A ``---`` that appears inside a value does not terminate the block.

    Only flat ``key: value`` lines are understood (this is not a YAML
    parser). Surrounding single or double quotes are removed from values.

    Returns:
        ``(metadata, body)``. When there is no front matter, or it is never
        closed, ``metadata`` is empty and ``body`` is *content* unchanged.
    """
    if not content.startswith("---"):
        return {}, content

    lines = content.split("\n")
    # Scan from the second line for a delimiter line, tracking the character
    # offset of each line so the original text can be sliced.
    offset = len(lines[0]) + 1
    close_start = -1
    close_end = -1
    for line in lines[1:]:
        if line.rstrip() == "---":
            close_start = offset
            close_end = offset + len(line)
            break
        offset += len(line) + 1

    if close_start == -1:
        return {}, content

    frontmatter_text = content[3:close_start].strip()
    body = content[close_end:].strip()

    metadata = {}
    for line in frontmatter_text.split("\n"):
        key, sep, value = line.strip().partition(":")
        if sep:
            metadata[key.strip()] = value.strip().strip('"').strip("'")

    return metadata, body
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _display_name_from_filename(dirname: str) -> str:
    """Title-case a directory name, e.g. 'railway-deploy' -> 'Railway Deploy'."""
    # Hyphens and underscores both become word separators.
    separators_to_spaces = str.maketrans("-_", "  ")
    return dirname.translate(separators_to_spaces).title()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _display_name_from_meta_name(meta_name: str) -> str:
    """Title-case a hyphenated ID, dropping a leading 'memstack' segment.

    Example: 'memstack-codebase-index' -> 'Codebase Index'.
    """
    first, *remaining = meta_name.split("-")
    words = remaining if first == "memstack" else [first, *remaining]
    return " ".join(words).title()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _display_name_from_h1(body: str) -> str:
    """Return the first H1 heading in *body*, cut at a dash-separated subtitle.

    A subtitle introduced by " — " is removed; if there is none, one
    introduced by " - " is removed instead (e.g. "Codebase Index —
    Scanning..." becomes "Codebase Index"). Returns "" when *body* has no
    H1 heading.
    """
    for raw_line in body.split("\n"):
        candidate = raw_line.strip()
        if not candidate.startswith("# "):
            continue
        title = candidate[2:].strip()
        # The em-dash takes precedence; only one separator is ever applied.
        for separator in (" — ", " - "):
            if separator in title:
                return title.split(separator)[0].strip()
        return title
    return ""
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def discover_skills(config: Config) -> list[dict]:
    """Walk the configured skill sources and parse every matching skill file.

    For each entry in ``config.skill_sources``, files matching the source's
    glob pattern are read in sorted order. Missing sources and unreadable
    files are reported on stderr and skipped, so one bad path does not
    abort the whole discovery run.

    Returns:
        One dict per skill with ``name``, ``slug`` (the parent directory
        name), ``description``, ``filename``, ``filepath``, ``source_label``
        and the raw ``content``.
    """
    skills = []

    for source in config.skill_sources:
        source_path = Path(source.path).expanduser()
        if not source_path.exists():
            print(f"Warning: skill source not found: {source_path}", file=sys.stderr)
            continue

        for skill_file in sorted(source_path.glob(source.pattern)):
            try:
                content = skill_file.read_text(encoding="utf-8", errors="replace")
            except OSError as exc:
                # Covers permission errors and a pattern that matched a directory.
                print(f"Warning: could not read skill file {skill_file}: {exc}",
                      file=sys.stderr)
                continue
            metadata, body = _parse_frontmatter(content)

            # Primary: H1 heading (human-readable)
            display_name = _display_name_from_h1(body)
            # Fallback: frontmatter name (machine ID -> title case)
            if not display_name:
                meta_name = metadata.get("name", "")
                if meta_name:
                    display_name = _display_name_from_meta_name(meta_name)
            # Last resort: directory name
            if not display_name:
                display_name = _display_name_from_filename(skill_file.parent.name)

            description = metadata.get("description", "")
            if not description:
                # Use the first non-empty body line (heading markers
                # removed) that does not start with "*".
                for line in body.split("\n"):
                    line = line.strip().lstrip("#").strip()
                    if line and not line.startswith("*"):
                        description = line
                        break

            skills.append({
                "name": display_name,
                "slug": skill_file.parent.name,
                "description": description,
                "filename": skill_file.name,
                "filepath": str(skill_file.resolve()),
                "source_label": source.label,
                "content": content,
            })

    return skills
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _build_tfidf_index(skills: list[dict], texts: list[str], config: Config) -> None:
    """Build a TF-IDF index over *texts* and save it as ``tfidf_index.pkl``.

    Pickle is used here because sklearn's TfidfVectorizer and sparse matrices
    cannot be serialized with JSON. The pickle is generated and consumed by
    this same codebase (indexer writes, tfidf_search reads).

    Args:
        skills: Skill dicts as produced by ``discover_skills``.
        texts: One text per skill, in the same order as *skills*.
        config: Supplies ``resolved_vector_db_path``, the output directory.
    """
    import pickle
    from sklearn.feature_extraction.text import TfidfVectorizer

    vectorizer = TfidfVectorizer(
        stop_words="english",
        ngram_range=(1, 2),
        max_features=5000,
        sublinear_tf=True,
    )
    matrix = vectorizer.fit_transform(texts)

    # Store skill metadata (without vectors) alongside the TF-IDF index
    meta_keys = ("name", "slug", "description", "filename", "filepath",
                 "source_label", "content")
    skill_meta = [{key: s[key] for key in meta_keys} for s in skills]

    pkl_path = Path(config.resolved_vector_db_path) / "tfidf_index.pkl"
    # Write to a sibling temp file and rename it into place, so a reader
    # never sees a partially written pickle.
    tmp_path = pkl_path.with_name(pkl_path.name + ".tmp")
    with open(tmp_path, "wb") as f:
        pickle.dump({"vectorizer": vectorizer, "matrix": matrix, "skills": skill_meta}, f)
    tmp_path.replace(pkl_path)

    # TF-IDF index saved silently
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _load_embedding_model_quietly(model_name: str):
    """Import sentence-transformers and load *model_name* with fds 1 and 2 silenced.

    The redirect is done at the OS file-descriptor level because C
    extensions bypass Python's ``sys.stdout``/``sys.stderr``. The import
    happens inside the redirect because it also emits warnings. Both
    descriptors are restored even if loading fails.
    """
    import os

    # Flush Python-level buffers first, so output written before this call
    # is not emitted into the null device once the descriptors are swapped.
    for stream in (sys.stdout, sys.stderr):
        try:
            stream.flush()
        except (AttributeError, OSError, ValueError):
            pass  # stream is missing or closed: nothing to flush

    null_fd = os.open(os.devnull, os.O_WRONLY)
    saved = []  # (target fd, duplicate of its original destination)
    try:
        for target_fd in (1, 2):
            saved.append((target_fd, os.dup(target_fd)))
            os.dup2(null_fd, target_fd)
        from sentence_transformers import SentenceTransformer
        return SentenceTransformer(model_name)
    finally:
        for target_fd, backup_fd in reversed(saved):
            os.dup2(backup_fd, target_fd)
            os.close(backup_fd)
        os.close(null_fd)


def build_index(config: Config | None = None) -> int:
    """Build/rebuild the LanceDB vector index and the TF-IDF index.

    Args:
        config: Configuration to use; loaded via ``load_config()`` when
            omitted.

    Returns:
        The number of skills indexed, or 0 when no skills were discovered.
    """
    if config is None:
        config = load_config()

    skills = discover_skills(config)
    if not skills:
        print("No skills found to index.", file=sys.stderr)
        return 0

    import os
    import time
    import logging
    import warnings

    # Suppress all noisy output before importing ML libraries
    os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
    os.environ["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = "1"
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    os.environ["SAFETENSORS_FAST_GPU"] = "0"
    os.environ["TQDM_DISABLE"] = "1"
    warnings.filterwarnings("ignore")
    for noisy_logger in ("transformers", "sentence_transformers",
                         "huggingface_hub", "safetensors"):
        logging.getLogger(noisy_logger).setLevel(logging.ERROR)

    print("Loading embedding model (first run may take 30-60 seconds)...", file=sys.stderr)

    # Suppress safetensors LOAD REPORT, tqdm progress bars, and HF Hub
    # warnings while the model is imported and loaded.
    model = _load_embedding_model_quietly(config.embedding_model)

    print(f"Indexing {len(skills)} skills...", file=sys.stderr)
    t0 = time.time()

    texts = [f"{s['name']} {s['description']}" for s in skills]
    vectors = model.encode(texts, show_progress_bar=False)

    records = [
        {
            "name": skill["name"],
            "description": skill["description"],
            "filename": skill["filename"],
            "filepath": skill["filepath"],
            "source_label": skill["source_label"],
            "content": skill["content"],
            "vector": vector.tolist(),
        }
        for skill, vector in zip(skills, vectors)
    ]

    db_path = str(config.resolved_vector_db_path)
    db = lancedb.connect(db_path)

    try:
        db.drop_table("skills")
    except Exception:
        pass  # best effort: ignore any failure to drop an existing table

    db.create_table("skills", data=records)
    elapsed = time.time() - t0
    print(f"Done! {len(records)} skills indexed in {elapsed:.1f}s", file=sys.stderr)

    # Also build TF-IDF index for fast search (no PyTorch needed at search time)
    _build_tfidf_index(skills, texts, config)

    return len(records)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def main():
    """Rebuild the index from the CLI; exit with status 1 if nothing was indexed."""
    indexed = build_index(load_config())
    if indexed == 0:
        sys.exit(1)


if __name__ == "__main__":
    main()
|