PyPI - skill-weave - Versions diffs - 0.3.0__py3-none-any.whl - Mend

skill-weave 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

skill_weave/__init__.py +56 -0
skill_weave/advanced.py +326 -0
skill_weave/annotate.py +179 -0
skill_weave/learner.py +234 -0
skill_weave/router.py +190 -0
skill_weave/weaver.py +338 -0
skill_weave-0.3.0.dist-info/METADATA +296 -0
skill_weave-0.3.0.dist-info/RECORD +11 -0
skill_weave-0.3.0.dist-info/WHEEL +5 -0
skill_weave-0.3.0.dist-info/licenses/LICENSE +21 -0
skill_weave-0.3.0.dist-info/top_level.txt +1 -0

skill_weave/__init__.py ADDED Viewed

@@ -0,0 +1,56 @@
+"""Skill Weave — Adaptive skill routing for multi-agent systems.
+Core router: 4-dimension weighted scoring (semantic × recency × success × cost).
+Advanced router: 3-stage pipeline (Tree Filter → BM25 → LLM Re-rank).
+Active learning: online weight adjustment via bandit + gradient feedback.
+Skill weaving: multi-skill DAG orchestration (chains, parallel, conditional).
+Production-proven with 141 real-world skills, 95.7% benchmark accuracy.
+"""
+from .router import Skill, RouteResult, SkillRouter
+# Lazy imports for optional heavy dependencies
+__all__ = [
+    "SkillRouter",
+    "SkillWeave",
+    "Skill",
+    "RouteResult",
+    "TreeFilter",
+    "BM25Scorer",
+    "FeedbackLearner",
+    "WeavePlanner",
+    "WeaveNode",
+    "WeaveEdge",
+    "WeaveChain",
+    "MergeStrategy",
+    "NodeType",
+    "annotate_skill",
+    "inject_annotations",
+    "load_skill_metadata",
+]
+__version__ = "0.3.0"
+def __getattr__(name: str):
+    if name == "SkillWeave":
+        from .advanced import SkillWeave as _SkillWeave
+        return _SkillWeave
+    if name == "TreeFilter":
+        from .advanced import TreeFilter as _TreeFilter
+        return _TreeFilter
+    if name == "BM25Scorer":
+        from .advanced import BM25Scorer as _BM25Scorer
+        return _BM25Scorer
+    if name in ("annotate_skill", "inject_annotations", "load_skill_metadata"):
+        from . import annotate as _annotate
+        return getattr(_annotate, name)
+    if name == "FeedbackLearner":
+        from .learner import FeedbackLearner as _FeedbackLearner
+        return _FeedbackLearner
+    if name in ("WeavePlanner", "WeaveNode", "WeaveEdge", "WeaveChain",
+                "MergeStrategy", "NodeType"):
+        from . import weaver as _weaver
+        return getattr(_weaver, name)
+    raise AttributeError(f"module 'skill_weave' has no attribute {name!r}")

skill_weave/advanced.py ADDED Viewed

@@ -0,0 +1,326 @@
+"""Advanced hybrid router — 3-stage pipeline: Tree Filter → BM25 → LLM Re-rank.
+Production-proven with 138 skills, 95.7% accuracy, 81% context overhead reduction.
+"""
+from __future__ import annotations
+import math, re, os, time, json
+from collections import Counter
+from pathlib import Path
+from typing import Callable, Optional
+from .annotate import load_skill_metadata
+class BM25Scorer:
+    """Character-level BM25 for Chinese + word-level for English."""
+    def __init__(self, k1: float = 1.5, b: float = 0.75):
+        self.k1 = k1
+        self.b = b
+        self.doc_lengths: list[int] = []
+        self.avgdl: float = 0.0
+        self.inverted_index: dict[str, dict[int, int]] = {}
+        self.N = 0
+    def _tokenize(self, text: str) -> list[str]:
+        tokens = []
+        # Chinese: character-level 2-grams + single chars
+        for ch in text:
+            if '一' <= ch <= '鿿':
+                tokens.append(ch)
+        # Add 2-grams for Chinese
+        chinese_chars = [c for c in text if '一' <= c <= '鿿']
+        for i in range(len(chinese_chars)-1):
+            tokens.append(chinese_chars[i] + chinese_chars[i+1])
+        # English: word-level
+        words = re.findall(r'[a-zA-Z0-9]+', text.lower())
+        tokens.extend(words)
+        return tokens
+    def index(self, documents: list[str]):
+        self.N = len(documents)
+        self.doc_lengths = []
+        self.inverted_index = {}
+        for doc_id, doc in enumerate(documents):
+            tokens = self._tokenize(doc)
+            self.doc_lengths.append(len(tokens))
+            tf = Counter(tokens)
+            for term, count in tf.items():
+                if term not in self.inverted_index:
+                    self.inverted_index[term] = {}
+                self.inverted_index[term][doc_id] = count
+        self.avgdl = sum(self.doc_lengths) / max(self.N, 1)
+    def score(self, query: str) -> list[tuple[int, float]]:
+        query_tokens = self._tokenize(query)
+        scores: list[float] = [0.0] * self.N
+        for token in query_tokens:
+            if token not in self.inverted_index:
+                continue
+            postings = self.inverted_index[token]
+            df = len(postings)
+            idf = math.log((self.N - df + 0.5) / (df + 0.5) + 1.0)
+            for doc_id, tf in postings.items():
+                dl = self.doc_lengths[doc_id]
+                numerator = tf * (self.k1 + 1)
+                denominator = tf + self.k1 * (1 - self.b + self.b * dl / self.avgdl)
+                scores[doc_id] += idf * numerator / denominator
+        return [(i, s) for i, s in enumerate(scores) if s > 0]
+class TreeFilter:
+    """Zero-token tree-path + keyword prefix matcher for L1 coarse filtering.
+    Matches query against: tree path, skill name, description, where/when fields.
+    Uses synonym expansion for Chinese-English cross-lingual matching.
+    """
+    # Common Chinese-English synonym pairs for skill routing
+    SYNONYMS = {
+        "搜索": ["search", "检索", "查找", "查询", "arxiv", "论文", "文献"],
+        "ssh": ["nas", "远程", "服务器", "部署", "连接", "docker"],
+        "浏览器": ["browser", "playwright", "selenium", "自动化"],
+        "代码": ["code", "编程", "github", "pr", "审查", "review"],
+        "ppt": ["powerpoint", "演示", "幻灯片"],
+        "调试": ["debug", "错误", "bug"],
+        "mcp": ["工具", "协议", "mcporter", "调用"],
+        "微调": ["fine", "tuning", "训练", "llm", "axolotl"],
+        "设计": ["design", "ui", "界面", "架构", "diagram"],
+        "安全": ["security", "审查", "审计", "vetter", "漏洞"],
+        "视频": ["video", "youtube", "动画"],
+        "游戏": ["game", "minecraft", "pokemon"],
+        "深度": ["research", "deep", "研究", "分析"],
+        "linear": ["任务", "项目管理", "issue"],
+        "飞书": ["feishu", "lark", "消息", "群聊"],
+        "图片": ["image", "ascii", "生成", "图"],
+        "音乐": ["music", "音频", "song", "生成"],
+        "语音": ["tts", "speech", "音频", "播报"],
+        "数据": ["data", "分析", "jupyter", "科学"],
+    }
+    def __init__(self, skills: list[dict]):
+        self.skills = skills
+        # Build reverse synonym map
+        self._syn_expand: dict[str, set[str]] = {}
+        for key, values in self.SYNONYMS.items():
+            all_terms = [key] + list(values)
+            for t in all_terms:
+                t_lower = t.lower()
+                if t_lower not in self._syn_expand:
+                    self._syn_expand[t_lower] = set()
+                self._syn_expand[t_lower].update(all_terms)
+    def _expand_query(self, query: str) -> set[str]:
+        """Expand query with synonyms for broader matching."""
+        query_lower = query.lower()
+        tokens = set(re.findall(r'[a-z0-9\u4e00-\u9fff]+', query_lower))
+        expanded = set(tokens)
+        for token in tokens:
+            if token in self._syn_expand:
+                expanded.update(self._syn_expand[token])
+        return expanded
+    def filter(self, query: str, exclude_tier3: bool = True, max_skills: int = 20) -> list[int]:
+        query_lower = query.lower()
+        query_tokens = self._expand_query(query)
+        matched: list[tuple[int, int]] = []
+        for i, skill in enumerate(self.skills):
+            tier = skill.get("tier", 2)
+            name = skill.get("name", "").lower()
+            # Skip T3 cold storage unless user explicitly names the skill
+            if exclude_tier3 and tier == 3:
+                if name not in query_lower:
+                    continue
+            score = 0
+            # Name exact/substring match (strongest signal)
+            name_parts = re.findall(r'[a-z0-9\u4e00-\u9fff]+', name.replace('-', ' '))
+            for np in name_parts:
+                if np in query_lower or np in query_tokens:
+                    score += 10
+            # Tree path segment match
+            tree = skill.get("tree", "").lower()
+            segments = [s.strip() for s in tree.split(">")]
+            for seg in segments:
+                seg_words = re.findall(r'[a-z0-9\u4e00-\u9fff]+', seg)
+                for w in seg_words:
+                    if w in query_tokens:
+                        score += 5
+            # Description / where / when keyword overlap
+            for field in ["description", "where", "when"]:
+                text = skill.get(field, "").lower()
+                words = set(re.findall(r'[a-z0-9\u4e00-\u9fff]+', text))
+                overlap = words & query_tokens
+                score += len(overlap) * 2
+            if score > 0:
+                matched.append((i, score))
+        matched.sort(key=lambda x: x[1], reverse=True)
+        return [idx for idx, _ in matched[:max_skills]]
+class SkillWeave:
+    """Complete hybrid router: Tree Filter (L1) → BM25 (L2) → LLM Re-rank (L3).
+    Production deployment with 138 skills, 95.7% benchmark accuracy,
+    81% context reduction via tree-based pre-filtering.
+    """
+    def __init__(self, skill_dir: str, llm_rank_fn: Optional[Callable] = None):
+        self.skill_dir = skill_dir
+        self.llm_rank_fn = llm_rank_fn
+        self.skills = load_skill_metadata(skill_dir)
+        self.tree_filter = TreeFilter(self.skills)
+        self.bm25 = BM25Scorer()
+        self._build_index()
+        # Stats
+        self.tier_dist = Counter(s.get("tier", 2) for s in self.skills)
+    def _build_index(self):
+        docs = []
+        for s in self.skills:
+            doc = f"{s['name']} {s['description']} {s['tree']} {s.get('where','')} {s.get('when','')}"
+            docs.append(doc)
+        self.bm25.index(docs)
+    @property
+    def stats(self) -> dict:
+        return {
+            "total_skills": len(self.skills),
+            "tier_distribution": dict(self.tier_dist),
+            "annotated": sum(1 for s in self.skills if s.get("tree") != "(unspecified)"),
+        }
+    def route(
+        self,
+        query: str,
+        top_k: int = 5,
+        stage1_limit: int = 20,
+        exclude_tier3: bool = True,
+    ) -> list[dict]:
+        """3-stage routing pipeline.
+        Args:
+            query: Task description
+            top_k: Final result count
+            stage1_limit: Max candidates from tree filter
+            exclude_tier3: Skip cold-storage skills unless named
+        Returns:
+            [{"skill": str, "score": float, "stage": str}, ...]
+        """
+        # Stage 1: Tree filter
+        candidates = self.tree_filter.filter(
+            query, exclude_tier3=exclude_tier3, max_skills=stage1_limit
+        )
+        if not candidates:
+            # Fallback: try without tier exclusion
+            candidates = self.tree_filter.filter(
+                query, exclude_tier3=False, max_skills=stage1_limit
+            )
+        if not candidates:
+            # Last resort: BM25 over ALL non-T3 skills
+            candidates = [
+                i for i, s in enumerate(self.skills)
+                if s.get("tier", 2) != 3 or s["name"].lower() in query.lower()
+            ]
+            if not candidates:
+                return []
+            stage1_limit = min(len(candidates), 50)  # wider net for BM25-only
+        # Stage 2: BM25 re-rank
+        bm25_scores = self.bm25.score(query)
+        bm25_map = dict(bm25_scores)
+        scored = []
+        for idx in candidates:
+            score = bm25_map.get(idx, 0.0)
+            scored.append((idx, score))
+        scored.sort(key=lambda x: x[1], reverse=True)
+        top_bm25 = scored[:top_k * 2]  # keep more for LLM
+        # Stage 3: LLM re-rank (if available)
+        if self.llm_rank_fn and len(top_bm25) > top_k:
+            try:
+                candidate_names = [self.skills[i]["name"] for i, _ in top_bm25]
+                ranked_names = self.llm_rank_fn(query, candidate_names)
+                # Map back
+                name_to_idx = {self.skills[i]["name"]: i for i, _ in top_bm25}
+                results = []
+                for name in ranked_names:
+                    if name in name_to_idx:
+                        idx = name_to_idx[name]
+                        results.append({
+                            "skill": name,
+                            "score": bm25_map.get(idx, 0.0),
+                            "stage": "llm_rerank",
+                        })
+                return results[:top_k]
+            except Exception:
+                pass  # fall through to BM25-only
+        # BM25-only results
+        return [
+            {
+                "skill": self.skills[idx]["name"],
+                "score": round(score, 4),
+                "stage": "bm25",
+            }
+            for idx, score in top_bm25[:top_k]
+        ]
+    def run_benchmark(self, queries: list[dict], verbose: bool = False) -> dict:
+        """Run benchmark against expected results.
+        Args:
+            queries: [{"query": "...", "expected": "skill-name"}, ...]
+            verbose: Print per-query details
+        Returns:
+            {"accuracy": float, "total": int, "correct": int, "details": [...]}
+        """
+        correct = 0
+        details = []
+        for q in queries:
+            results = self.route(q["query"], top_k=1)
+            top_name = results[0]["skill"] if results else ""
+            hit = top_name == q["expected"]
+            if hit:
+                correct += 1
+            details.append({
+                "query": q["query"],
+                "expected": q["expected"],
+                "predicted": top_name,
+                "correct": hit,
+                "candidates": [r["skill"] for r in results[:3]],
+            })
+            if verbose:
+                icon = "✅" if hit else "❌"
+                print(f"{icon} {q['query'][:50]:50s} → {top_name} (expected {q['expected']})")
+        return {
+            "accuracy": round(correct / len(queries), 4) if queries else 0,
+            "total": len(queries),
+            "correct": correct,
+            "details": details,
+        }

skill_weave/annotate.py ADDED Viewed

@@ -0,0 +1,179 @@
+"""Skill annotation engine — generates and manages 4-dimension metadata.
+Dimensions: where (scene), when (triggers), when_not (exclusions), tree (hierarchy path).
+"""
+import json, re, sys, os, yaml
+from pathlib import Path
+DIMENSION_PROMPT = """You are a skill taxonomy expert. For the given skill, generate 4-dimension metadata.
+## Skill Content
+Name: {name}
+Description: {description}
+Body Summary:
+{body_summary}
+## Output Format
+Output YAML with these 4 fields:
+where: |
+  <One sentence: what scenario/domain does this skill apply to?>
+when: |
+  <When should this skill be loaded? 2-4 trigger conditions>
+when_not: |
+  <When should this skill NOT be loaded? 2-4 exclusion conditions, reference alternative skill names>
+tree: <Hierarchy path, e.g. "devops > ssh > nas-host-ops">
+Only output YAML, no explanation."""
+def annotate_skill(skill_path: str, llm_fn=None) -> dict:
+    """Generate 4-dimension metadata for a skill file.
+    Args:
+        skill_path: Path to SKILL.md
+        llm_fn: Optional function(text) -> str for LLM-based annotation
+    Returns:
+        Dict with where, when, when_not, tree keys
+    """
+    with open(skill_path) as f:
+        content = f.read()
+    if not content.startswith("---"):
+        return {}
+    parts = content.split("---", 2)
+    if len(parts) < 3:
+        return {}
+    fm = yaml.safe_load(parts[1]) or {}
+    name = fm.get("name", "")
+    desc = fm.get("description", "")
+    body = parts[2]
+    # Extract body summary (strip code blocks, take first 800 chars)
+    lines = []
+    in_code = False
+    for line in body.split("\n"):
+        if line.strip().startswith("```"):
+            in_code = not in_code
+            continue
+        if in_code:
+            continue
+        clean = line.strip().lstrip("#").strip()
+        if clean:
+            lines.append(clean)
+    summary = " ".join(lines[:20])[:800]
+    if llm_fn:
+        prompt = DIMENSION_PROMPT.format(name=name, description=desc, body_summary=summary)
+        try:
+            raw = llm_fn(prompt)
+            result = yaml.safe_load(raw) or {}
+        except Exception:
+            result = {}
+    else:
+        # Rule-based fallback
+        result = _rule_based_annotation(name, desc)
+    return {
+        "where": result.get("where", "(unspecified)"),
+        "when": result.get("when", "(unspecified)"),
+        "when_not": result.get("when_not", "(unspecified)"),
+        "tree": result.get("tree", "(unspecified)"),
+    }
+def _rule_based_annotation(name: str, desc: str) -> dict:
+    """Fallback rule-based annotation when no LLM available."""
+    combined = f"{name} {desc}".lower()
+    # Tree detection
+    tree = "general"
+    if any(w in combined for w in ["ssh", "nas", "docker", "容器", "部署"]):
+        tree = "devops > infrastructure"
+    elif any(w in combined for w in ["github", "git", "pr", "仓库"]):
+        tree = "devops > github"
+    elif any(w in combined for w in ["feishu", "飞书", "lark"]):
+        tree = "platform > feishu"
+    elif any(w in combined for w in ["search", "搜索", "arxiv", "论文"]):
+        tree = "research > search"
+    elif any(w in combined for w in ["browser", "浏览器", "playwright", "selenium"]):
+        tree = "automation > browser"
+    elif any(w in combined for w in ["fine", "微调", "训练", "llm"]):
+        tree = "mlops > training"
+    elif any(w in combined for w in ["code", "代码", "debug", "调试"]):
+        tree = "devops > code"
+    elif any(w in combined for w in ["design", "设计", "ui", "界面"]):
+        tree = "creative > design"
+    elif any(w in combined for w in ["mcp", "工具", "协议"]):
+        tree = "infrastructure > mcp"
+    return {
+        "where": f"Applied in {desc[:60]}",
+        "when": f"When user requests: {desc[:80]}",
+        "when_not": "When simpler alternatives exist or user explicitly excludes this domain",
+        "tree": tree,
+    }
+def inject_annotations(skill_path: str, annotations: dict) -> bool:
+    """Write 4-dimension annotations into SKILL.md frontmatter.
+    Returns True on success.
+    """
+    with open(skill_path) as f:
+        content = f.read()
+    if not content.startswith("---"):
+        return False
+    parts = content.split("---", 2)
+    if len(parts) < 3:
+        return False
+    fm = yaml.safe_load(parts[1]) or {}
+    fm["where"] = annotations["where"]
+    fm["when"] = annotations["when"]
+    fm["when_not"] = annotations["when_not"]
+    fm["tree"] = annotations["tree"]
+    new_fm = yaml.dump(fm, allow_unicode=True, default_flow_style=False, sort_keys=False).strip()
+    new_content = f"---\n{new_fm}\n---{parts[2]}"
+    with open(skill_path, "w") as f:
+        f.write(new_content)
+    return True
+def load_skill_metadata(skill_dir: str) -> list[dict]:
+    """Scan a skill directory and return all skills with dimension metadata."""
+    skills = []
+    for root, dirs, files in os.walk(skill_dir):
+        if "SKILL.md" in files:
+            path = os.path.join(root, "SKILL.md")
+            try:
+                with open(path) as f:
+                    content = f.read()
+                if content.startswith("---"):
+                    parts = content.split("---", 2)
+                    if len(parts) >= 3:
+                        fm = yaml.safe_load(parts[1]) or {}
+                        name = fm.get("name", "")
+                        if name:
+                            skills.append({
+                                "name": name,
+                                "description": fm.get("description", ""),
+                                "where": fm.get("where", "(unspecified)"),
+                                "when": fm.get("when", "(unspecified)"),
+                                "when_not": fm.get("when_not", "(unspecified)"),
+                                "tree": fm.get("tree", "(unspecified)"),
+                                "tier": fm.get("tier", 2),
+                                "path": path,
+                            })
+            except Exception:
+                pass
+    return skills