memstack-skill-loader 3.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ """MemStack Skill Loader — MCP server for semantic skill search."""
@@ -0,0 +1,18 @@
1
+ """Entry point for python -m memstack_skill_loader."""
2
+
3
+ import asyncio
4
+ import sys
5
+
6
+ from .server import run
7
+
8
+
9
def main():
    """Run the dashboard when the first CLI argument is ``dashboard``, else the server."""
    wants_dashboard = sys.argv[1:2] == ["dashboard"]
    if not wants_dashboard:
        asyncio.run(run())
        return

    # The dashboard module is only imported when this branch is taken.
    from .dashboard import start_dashboard

    start_dashboard()


if __name__ == "__main__":
    main()
@@ -0,0 +1,345 @@
1
+ """Skill-aware compression for MemStack MCP Skill Loader.
2
+
3
+ Compresses skill content before serving to reduce token consumption.
4
+ Compression is tiered: free tier gets basic stripping, Pro tier gets
5
+ advanced section-aware compression.
6
+
7
+ All compression preserves:
8
+ - Code blocks (fenced with triple backticks)
9
+ - Checklists and action items
10
+ - Decision tables (content, not formatting padding)
11
+ - URLs and links
12
+ - Conditional logic
13
+ """
14
+
15
+ import hashlib
16
+ import re
17
+ from collections import OrderedDict
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # Cache
21
+ # ---------------------------------------------------------------------------
22
+
23
# Maximum number of entries kept in the compression cache.
_MAX_CACHE = 200
# Maps (slug, tier, content digest) -> compressed text, in insertion/use order.
_cache: OrderedDict[tuple[str, str, str], str] = OrderedDict()


def _cache_key(slug: str, tier: str, content: str) -> tuple[str, str, str]:
    """Build the cache key for a skill.

    The key is ``(slug, tier, digest)`` where ``digest`` is the first eight hex
    characters of the MD5 of ``content``. MD5 is used as a fingerprint only,
    hence ``usedforsecurity=False``.
    """
    fingerprint = hashlib.md5(content.encode(), usedforsecurity=False)
    return (slug, tier, fingerprint.hexdigest()[:8])
30
+
31
+
32
def clear_cache() -> None:
    """Drop every cached compression result (intended to be called on reindex)."""
    # Mutate the module-level OrderedDict in place; the name is never rebound.
    _cache.clear()
35
+
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Token estimation
39
+ # ---------------------------------------------------------------------------
40
+
41
def estimate_tokens(text: str) -> int:
    """Return a rough token count for ``text``.

    Uses a fixed ratio of four characters per token and never returns less
    than 1, so an empty string is reported as one token.
    """
    approx = len(text) // 4
    return approx if approx >= 1 else 1
44
+
45
+
46
+ # ---------------------------------------------------------------------------
47
+ # Compression helpers — operate on text OUTSIDE code blocks only
48
+ # ---------------------------------------------------------------------------
49
+
50
# A fenced block: an opening ``` through the nearest following ```.
_CODE_BLOCK_RE = re.compile(r"(```[\s\S]*?```)", re.DOTALL)


def _split_code_blocks(content: str) -> list[tuple[str, bool]]:
    """Partition ``content`` into ordered ``(text, is_code_block)`` segments.

    Concatenating the ``text`` of every segment reproduces ``content``.
    An opening fence with no closing fence is not matched by the pattern and
    therefore stays inside a non-code segment.
    """
    segments: list[tuple[str, bool]] = []
    cursor = 0
    for fence in _CODE_BLOCK_RE.finditer(content):
        begin, finish = fence.span()
        if cursor < begin:
            # Plain text sitting between the previous fence and this one.
            segments.append((content[cursor:begin], False))
        segments.append((fence.group(0), True))
        cursor = finish

    tail = content[cursor:]
    if tail:
        segments.append((tail, False))
    return segments
65
+
66
+
67
def _apply_outside_code(content: str, fn) -> str:
    """Run ``fn`` over every non-code segment of ``content`` and rejoin.

    Fenced code blocks are passed through untouched. ``fn`` is called once per
    text segment (not once for the whole document) and must return a string.
    """
    return "".join(
        segment if fenced else fn(segment)
        for segment, fenced in _split_code_blocks(content)
    )
77
+
78
+
79
+ # ---------------------------------------------------------------------------
80
+ # Free tier transforms
81
+ # ---------------------------------------------------------------------------
82
+
83
+ # Patterns for "when to use" sections at the top of a skill
84
# A level-1/2 heading titled "When to use", "Trigger", "Use when" or
# "Description", together with its body up to the next level-1/2 heading
# (or the end of the searched text).
_WHEN_TO_USE_RE = re.compile(
    r"^(##?\s*(?:When\s+to\s+[Uu]se|Trigger|Use\s+[Ww]hen|Description)\b.*?)(?=^##?\s|\Z)",
    re.MULTILINE | re.DOTALL,
)

# A single line that repeats metadata such as "Name: ..." or "**Version: ...".
# Only spaces/tabs are allowed around the colon: with ``\s*`` a bare
# "Description:" line would match across the newline and delete the line
# that follows it as well.
_FRONTMATTER_ECHO_RE = re.compile(
    r"^\*?\*?(?:Name|Version|License|Pro since|Description)[ \t]*:[ \t]*.*$",
    re.MULTILINE | re.IGNORECASE,
)

# Markdown image with an http(s) target, plus any whitespace after it.
_BADGE_RE = re.compile(r"!\[.*?\]\(https?://.*?\)\s*", re.MULTILINE)
# A line made of three or more hyphens (horizontal rule).
_HR_RE = re.compile(r"^-{3,}\s*$", re.MULTILINE)
# <p align="center"> style tags; a plain </p> is not matched.
_ALIGN_HTML_RE = re.compile(r"</?p\s+align\s*=\s*[\"']center[\"']\s*/?>", re.IGNORECASE)
# Three or more consecutive newlines.
_MULTI_BLANK_RE = re.compile(r"\n{3,}")
# Spaces/tabs at the end of a line.
_TRAILING_WS_RE = re.compile(r"[ \t]+$", re.MULTILINE)
_EMOJI_HEADING_RE = re.compile(
    r"^(#{1,6}\s+)"  # heading prefix
    r"[\U0001F300-\U0001FAFF\U00002702-\U000027B0\U0000FE00-\U0000FE0F\U0000200D]+"  # emoji cluster
    r"\s*",  # trailing space after emoji
    re.MULTILINE,
)
# A pipe followed by two or more spaces/tabs (table cell padding).
_TABLE_PADDING_RE = re.compile(r"\|[ \t]{2,}")
106
+
107
+
108
def _strip_when_to_use(text: str) -> str:
    """Remove a leading 'When to use' / trigger / description section.

    The section is removed only when its heading starts within the first
    40 lines of ``text``. The section end is located in the full text, not in
    the 40-line prefix: searching only the prefix lets the pattern's ``\\Z``
    alternative fire at the cutoff, which would delete the first part of a
    long section and leave the rest behind without its heading.

    Args:
        text: Markdown text to scan.

    Returns:
        ``text`` with the first matching section removed, or ``text``
        unchanged when no such section starts in the first 40 lines.
    """
    head_length = len("\n".join(text.split("\n")[:40]))
    m = _WHEN_TO_USE_RE.search(text)
    if m is None or m.start() >= head_length:
        return text
    return text[:m.start()] + text[m.end():]
122
+
123
+
124
def _strip_frontmatter_echo(text: str) -> str:
    """Blank out every line matched by ``_FRONTMATTER_ECHO_RE``.

    The matched text is replaced with an empty string; the line break that
    followed it is left in place.
    """
    without_echo = _FRONTMATTER_ECHO_RE.sub("", text)
    return without_echo
126
+
127
+
128
def _strip_badges(text: str) -> str:
    """Delete markdown images with http(s) targets, plus the whitespace after them."""
    without_badges = _BADGE_RE.sub("", text)
    return without_badges
130
+
131
+
132
def _strip_hrs(text: str) -> str:
    """Replace each horizontal-rule line (three or more hyphens) with a newline."""
    without_rules = _HR_RE.sub("\n", text)
    return without_rules
134
+
135
+
136
def _strip_align_html(text: str) -> str:
    """Delete ``<p align="center">`` style tags; the text between tags is kept."""
    without_tags = _ALIGN_HTML_RE.sub("", text)
    return without_tags
138
+
139
+
140
def _normalize_whitespace(text: str) -> str:
    """Strip trailing spaces/tabs from lines, then cap blank runs at one blank line.

    Trailing whitespace is removed first so that lines containing only
    spaces count as blank when runs of newlines are collapsed.
    """
    trimmed = _TRAILING_WS_RE.sub("", text)
    return _MULTI_BLANK_RE.sub("\n\n", trimmed)
144
+
145
+
146
def _strip_emoji_headings(text: str) -> str:
    """Drop an emoji cluster that directly follows a markdown heading marker.

    Only the ``#`` prefix (capture group 1) is kept from the match; the rest
    of the heading text is outside the match and is left as-is.
    """
    keep_prefix_only = r"\1"
    return _EMOJI_HEADING_RE.sub(keep_prefix_only, text)
149
+
150
+
151
def _compact_tables(text: str) -> str:
    """Shrink runs of two or more spaces/tabs after a ``|`` down to one space."""
    single_space_cell = "| "
    return _TABLE_PADDING_RE.sub(single_space_cell, text)
154
+
155
+
156
+ # ---------------------------------------------------------------------------
157
+ # Pro tier transforms
158
+ # ---------------------------------------------------------------------------
159
+
160
_CHECKLIST_SUB_RE = re.compile(
    r"^(- \[[ x]\] .+)\n"  # main checklist item
    r"((?: {1,2}- .+\n)+)",  # one or more sub-items indented by one or two spaces
    re.MULTILINE,
)


def _compact_checklists(text: str) -> str:
    """Flatten simple sub-items into their parent checklist item.

    A checklist line (``- [ ] ...`` / ``- [x] ...``) followed by sub-bullets
    indented by one or two spaces becomes a single line of the form
    ``- [ ] parent (sub one; sub two)``. If any sub-item is longer than
    80 characters the whole group is left exactly as it was.

    Args:
        text: Markdown text (callers pass text outside code blocks).

    Returns:
        The text with qualifying checklist groups flattened.
    """
    def _flatten(m: re.Match) -> str:
        main = m.group(1).rstrip()
        sub_texts = []
        for raw in m.group(2).strip().split("\n"):
            # After strip() every sub-item line starts with its "-" bullet
            # (guaranteed by the pattern). Drop exactly that one marker;
            # lstrip("- ") would also eat dashes that belong to the content,
            # e.g. turning "--dry-run" into "dry-run".
            item = raw.strip()[1:].strip()
            if len(item) > 80:
                return m.group(0)  # too complex, leave as-is
            sub_texts.append(item)
        return f"{main} ({'; '.join(sub_texts)})\n"

    return _CHECKLIST_SUB_RE.sub(_flatten, text)
181
+
182
+
183
def _summarize_long_sections(text: str) -> str:
    """For texts longer than 150 lines, shrink long non-critical sections.

    A section starts at a level 1-3 heading and runs to the next such heading.
    Sections longer than 10 lines are reduced to their heading, their first
    non-empty non-heading line, and a blank line, unless the heading contains
    a protected keyword or the section contains a code fence.

    Heading detection skips lines inside fenced code blocks. Otherwise a
    ``# comment`` line inside a fence is taken for a heading, which splits the
    code block into pseudo-sections that can then be summarized away.

    Args:
        text: Full markdown text, code blocks included.

    Returns:
        The text with eligible sections summarized, or ``text`` unchanged
        when it has 150 lines or fewer or contains no headings.
    """
    lines = text.split("\n")
    if len(lines) <= 150:
        return text

    # Identify sections, tracking fence state so code comments are not headings.
    heading_re = re.compile(r"^#{1,3}\s+")
    sections: list[tuple[int, str]] = []
    in_fence = False
    for i, line in enumerate(lines):
        if not in_fence and heading_re.match(line):
            sections.append((i, line))
        # An odd number of fence markers on a line opens or closes a block;
        # an even number (inline open+close) leaves the state unchanged.
        if line.count("```") % 2:
            in_fence = not in_fence

    if not sections:
        return text

    # Sections whose heading mentions any of these are never summarized.
    protected = (
        "checklist", "pitfall", "gotcha", "common mistake", "warning",
        "important", "critical", "prerequisite", "setup", "install",
        "implementation", "step", "example", "code",
    )

    result_lines = list(lines)
    # Walk backwards so indices taken from ``lines`` stay valid while
    # ``result_lines`` shrinks.
    for idx in range(len(sections) - 1, -1, -1):
        start, heading_line = sections[idx]
        end = sections[idx + 1][0] if idx + 1 < len(sections) else len(lines)

        if end - start <= 10:
            continue

        heading = heading_line.lower()
        if any(kw in heading for kw in protected):
            continue

        # Any fence marker in the section protects it.
        if any("```" in body_line for body_line in lines[start:end]):
            continue

        # Keep the heading plus the first non-empty, non-heading line.
        stripped_body = (body_line.strip() for body_line in lines[start + 1:end])
        first_sentence = next(
            (s for s in stripped_body if s and not s.startswith("#")),
            "",
        )
        if first_sentence:
            result_lines[start:end] = [lines[start], first_sentence, ""]
        # else: nothing usable to keep, leave the section as-is

    return "\n".join(result_lines)
240
+
241
+
242
def _trim_redundant_examples(text: str) -> str:
    """Keep the first two code blocks of a run and replace later ones with a note.

    Code blocks belong to the same run while the text between them is at most
    50 characters after stripping. Longer text starts a new run.

    The early exit counts code blocks. Counting segments (``< 7``) assumed
    text on both sides of every block and skipped documents that begin or
    end with a code block even when they held three or more examples.

    Args:
        text: Full markdown text, code blocks included.

    Returns:
        The text with the third and later code blocks of each run replaced
        by an omission note, or ``text`` unchanged when it holds fewer than
        three code blocks.
    """
    parts = _split_code_blocks(text)
    code_block_count = sum(1 for _, is_code in parts if is_code)
    if code_block_count < 3:
        return text

    omitted_note = "\n*[Additional example omitted — use `get_skill(name, full=true)` for all examples]*\n"

    result = []
    consecutive_code = 0
    for txt, is_code in parts:
        if is_code:
            consecutive_code += 1
            result.append(txt if consecutive_code <= 2 else omitted_note)
            continue
        # Short (<=50 chars) or blank text keeps the current run going.
        if len(txt.strip()) > 50:
            consecutive_code = 0
        result.append(txt)

    return "".join(result)
265
+
266
+
267
+ # ---------------------------------------------------------------------------
268
+ # Main entry point
269
+ # ---------------------------------------------------------------------------
270
+
271
# Below this estimated size a compressed result is considered over-trimmed.
_MIN_COMPRESSED_TOKENS = 50


def compress_skill(content: str, tier: str = "free") -> str:
    """Compress skill content based on tier.

    Free-tier transforms always run; the Pro-tier transforms run in addition
    when ``tier == "pro"``. Any other tier value gets the free-tier result.

    The "when to use" stripping is applied only to the text that precedes the
    first code block. Running it on every text segment would also delete
    matching sections (for example a later "## Description ..." heading) that
    sit deep inside the skill, after implementation content.

    Args:
        content: Raw skill markdown content (frontmatter already stripped).
        tier: "free" or "pro".

    Returns:
        Compressed content string, or the original ``content`` unchanged when
        compression shrank a skill of at least ``_MIN_COMPRESSED_TOKENS``
        estimated tokens to below that size.
    """
    original_content = content
    original_tokens = estimate_tokens(content)

    # --- Free tier transforms (always applied) ---
    # Intro-section stripping: leading text segment only.
    segments = _split_code_blocks(content)
    if segments and not segments[0][1]:
        segments[0] = (_strip_when_to_use(segments[0][0]), False)
        content = "".join(segment for segment, _ in segments)

    # Remaining text transforms, applied only outside code blocks, in order.
    free_transforms = (
        _strip_frontmatter_echo,
        _strip_badges,
        _strip_hrs,
        _strip_align_html,
        _strip_emoji_headings,
        _compact_tables,
    )
    for transform in free_transforms:
        content = _apply_outside_code(content, transform)

    # --- Pro tier transforms ---
    if tier == "pro":
        content = _apply_outside_code(content, _compact_checklists)
        # These two see the whole document, code blocks included.
        content = _summarize_long_sections(content)
        content = _trim_redundant_examples(content)

    # Normalize whitespace last (applies everywhere outside code)
    content = _apply_outside_code(content, _normalize_whitespace)

    # Final trim of leading/trailing whitespace
    content = content.strip()

    # Safety: if compression was too aggressive, return original uncompressed
    compressed_tokens = estimate_tokens(content)
    if compressed_tokens < _MIN_COMPRESSED_TOKENS <= original_tokens:
        return original_content

    return content
315
+
316
+
317
def get_or_compress(skill: dict, tier: str = "free") -> tuple[str, int, int]:
    """Get compressed skill content, using the cache if available.

    Args:
        skill: Skill dict with a 'content' key and a 'slug' key; 'name' is
            used for the cache key only when 'slug' is absent.
        tier: "free" or "pro".

    Returns:
        Tuple of (compressed_content, tokens_before, tokens_after).

    Raises:
        KeyError: If 'content' is missing, or both 'slug' and 'name' are.
    """
    raw = skill["content"]
    tokens_before = estimate_tokens(raw)

    # ``skill.get("slug", skill["name"])`` evaluates the default eagerly and
    # raises KeyError for a skill that has a slug but no name.
    slug = skill["slug"] if "slug" in skill else skill["name"]
    key = _cache_key(slug, tier, raw)

    if key in _cache:
        # Mark as most recently used.
        _cache.move_to_end(key)
        compressed = _cache[key]
        return compressed, tokens_before, estimate_tokens(compressed)

    compressed = compress_skill(raw, tier=tier)

    # Add to cache with LRU eviction (oldest entry is dropped first).
    _cache[key] = compressed
    if len(_cache) > _MAX_CACHE:
        _cache.popitem(last=False)

    return compressed, tokens_before, estimate_tokens(compressed)
@@ -0,0 +1,114 @@
1
+ """Config loading and validation for MemStack Skill Loader."""
2
+
3
+ import json
4
+ import os
5
+ import sys
6
+ from dataclasses import dataclass, field
7
+ from pathlib import Path
8
+
9
+
10
@dataclass
class SkillSource:
    """One configured location to load skill files from."""

    # Source kind; the config loader defaults this to "local".
    type: str
    # Directory path of the source, stored as a string.
    path: str
    # File pattern for skill files — presumably a glob relative to ``path``;
    # confirm against the indexer.
    pattern: str = "**/SKILL.md"
    # Human-readable name of the source.
    label: str = "Unknown"
16
+
17
+
18
@dataclass
class Config:
    """Runtime settings for the skill loader.

    ``_config_dir`` is the directory that relative paths are resolved
    against; it defaults to the current working directory.
    """

    skill_sources: list[SkillSource] = field(default_factory=list)
    embedding_model: str = "all-MiniLM-L6-v2"
    default_top_k: int = 3
    vector_db_path: str = "./vectors"
    auto_reindex_on_start: bool = False
    _config_dir: Path = field(default_factory=lambda: Path.cwd(), repr=False)

    @property
    def resolved_vector_db_path(self) -> Path:
        """Absolute vector DB path: ``~`` expanded, relative paths anchored at ``_config_dir``."""
        candidate = Path(self.vector_db_path).expanduser()
        anchored = candidate if candidate.is_absolute() else self._config_dir / candidate
        return anchored.resolve()

    @property
    def pro_skills_dir(self) -> Path:
        """Return the pro-skills directory — customer download first, bundled fallback.

        The download under ``~/.memstack/pro-skills`` is used only when it
        contains a ``.complete`` marker file.
        """
        downloaded = Path.home() / ".memstack" / "pro-skills"
        marker = downloaded / ".complete"
        if downloaded.exists() and marker.exists():
            return downloaded
        # Three directory levels above this file.
        three_levels_up = Path(__file__).resolve().parent.parent.parent
        return three_levels_up / "pro-skills"

    def with_pro_skills(self) -> "Config":
        """Return a copy of this config with the pro-skills source appended.

        Returns ``self`` unchanged when the pro-skills directory does not exist.
        """
        pro_dir = self.pro_skills_dir
        if pro_dir.exists():
            extra_source = SkillSource(
                type="local",
                path=str(pro_dir),
                pattern="**/SKILL.md",
                label="MemStack Pro",
            )
            return Config(
                skill_sources=self.skill_sources + [extra_source],
                embedding_model=self.embedding_model,
                default_top_k=self.default_top_k,
                vector_db_path=self.vector_db_path,
                auto_reindex_on_start=self.auto_reindex_on_start,
                _config_dir=self._config_dir,
            )
        return self
61
+
62
+
63
def load_config(config_path: Path | None = None) -> Config:
    """Load config from a JSON file, falling back to defaults when unusable.

    Defaults are returned (with a message on stderr) when the file is missing,
    is not valid JSON, or its top-level value is not a JSON object.

    Args:
        config_path: Path to the JSON config. When ``None``, ``config.json``
            three directory levels above this file is used.

    Returns:
        The loaded ``Config``. When ``MEMSTACK_PRO_LICENSE_KEY`` is set, the
        pro-skills source is added via ``Config.with_pro_skills``.
    """
    if config_path is None:
        config_path = Path(__file__).resolve().parent.parent.parent / "config.json"

    if not config_path.exists():
        print(f"Config not found at {config_path}, using defaults", file=sys.stderr)
        return Config()

    try:
        with open(config_path, encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Invalid JSON in {config_path}: {e}", file=sys.stderr)
        return Config()

    # Valid JSON that is not an object (e.g. a list) would crash on
    # ``data.get`` below; treat it like any other unusable config.
    if not isinstance(data, dict):
        print(f"Invalid config in {config_path}: expected a JSON object, using defaults", file=sys.stderr)
        return Config()

    sources = []
    # When set, this directory replaces the path of every configured source.
    env_skills_dir = os.environ.get("MEMSTACK_SKILLS_DIR")
    for s in data.get("skill_sources") or []:
        if not isinstance(s, dict) or "path" not in s:
            print(f"Warning: skill source missing 'path', skipping: {s}", file=sys.stderr)
            continue
        skill_path = env_skills_dir if env_skills_dir else str(Path(s["path"]).expanduser())
        sources.append(SkillSource(
            type=s.get("type", "local"),
            path=skill_path,
            pattern=s.get("pattern", "**/SKILL.md"),
            label=s.get("label", "Unknown"),
        ))

    config_dir = config_path.parent.resolve()
    # Fallback location tried for any source whose path does not exist.
    sibling = config_dir.parent / "memstack" / "skills"
    for source in sources:
        if Path(source.path).expanduser().exists():
            continue
        if sibling.exists():
            source.path = str(sibling)
            print(f"Auto-detected skills at {sibling}", file=sys.stderr)

    config = Config(
        skill_sources=sources,
        embedding_model=data.get("embedding_model", "all-MiniLM-L6-v2"),
        default_top_k=data.get("default_top_k", 3),
        vector_db_path=data.get("vector_db_path", "./vectors"),
        auto_reindex_on_start=data.get("auto_reindex_on_start", False),
        _config_dir=config_dir,
    )

    # Auto-detect pro-skills if license key is set and directory exists
    if os.environ.get("MEMSTACK_PRO_LICENSE_KEY"):
        config = config.with_pro_skills()

    return config