@heytherevibin/skillforge 0.2.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +43 -0
- package/README.md +89 -56
- package/RELEASING.md +1 -1
- package/SECURITY.md +2 -2
- package/STRATEGY.md +1 -3
- package/bin/cli.js +32 -138
- package/package.json +2 -2
- package/python/app/chunking.py +116 -0
- package/python/app/context_fusion.py +77 -0
- package/python/app/events_cli.py +1 -1
- package/python/app/index_cli.py +89 -0
- package/python/app/main.py +632 -229
- package/python/app/mcp_contract.py +121 -0
- package/python/app/mcp_server.py +304 -30
- package/python/app/project_index.py +600 -0
- package/python/app/redaction.py +128 -0
- package/python/app/route_cli.py +42 -19
- package/python/app/route_policies.py +133 -0
- package/python/app/routing_signals.py +95 -0
- package/python/requirements.txt +1 -4
- package/python/tests/test_chunking.py +34 -0
- package/python/tests/test_context_fusion.py +45 -0
- package/python/tests/test_mcp_contract.py +137 -0
- package/python/tests/test_project_index.py +76 -0
- package/python/tests/test_redaction.py +51 -0
- package/python/tests/test_route_policies.py +115 -0
- package/python/tests/test_routing_signals.py +77 -0
- package/python/app/auth.py +0 -63
- package/python/app/cli.py +0 -78
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""Best-effort redaction of secrets and user home paths in exported context (defense in depth)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
_HOME_RESOLVED: str | None = None
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def redaction_enabled() -> bool:
    """True unless SKILLFORGE_REDACT_CONTEXT is explicitly set to an off value ("0"/"false"/"no"/empty)."""
    flag = os.getenv("SKILLFORGE_REDACT_CONTEXT", "1").strip().lower()
    off_values = ("0", "false", "no", "")
    return flag not in off_values
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def redact_home_in_paths_enabled() -> bool:
    """True unless SKILLFORGE_REDACT_HOME_IN_PATHS is explicitly set to an off value ("0"/"false"/"no"/empty)."""
    flag = os.getenv("SKILLFORGE_REDACT_HOME_IN_PATHS", "1").strip().lower()
    off_values = ("0", "false", "no", "")
    return flag not in off_values
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _home_prefix() -> str | None:
    """Resolved home directory, cached after the first call; ``None`` when resolution fails."""
    global _HOME_RESOLVED
    if _HOME_RESOLVED is None:
        # First call: resolve once and memoize. An empty string marks "resolution failed"
        # so we do not retry on every call.
        try:
            _HOME_RESOLVED = str(Path.home().resolve())
        except Exception:
            _HOME_RESOLVED = ""
    return _HOME_RESOLVED or None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Pattern → replacement pairs for known secret shapes. Provider-specific
# patterns come first; the generic Bearer/Basic and env-assignment patterns
# require a minimum length to avoid eating ordinary words.
COMPILED: list[tuple[re.Pattern[str], str]] = [
    (re.compile(r"sk-ant-api\d\d-[A-Za-z0-9_\-]{20,}"), "[REDACTED_ANTHROPIC_KEY]"),
    (re.compile(r"\bAIza[0-9A-Za-z\-_]{35}\b"), "[REDACTED_GOOGLE_API_KEY]"),
    (re.compile(r"xox[baprs]-[0-9A-Za-z\-]{10,}"), "[REDACTED_SLACK_TOKEN]"),
    (re.compile(r"gh[pP]_[0-9A-Za-z]{36,}"), "[REDACTED_GITHUB_TOKEN]"),
    (re.compile(r"github_pat_[0-9A-Za-z_]{20,}"), "[REDACTED_GITHUB_PAT]"),
    (re.compile(
        r"-----BEGIN [A-Z ]*PRIVATE KEY-----[\s\S]*?-----END [A-Z ]*PRIVATE KEY-----",
        re.MULTILINE,
    ), "[REDACTED_PRIVATE_KEY]"),
    (re.compile(r"\bAKIA[0-9A-Z]{16}\b"), "[REDACTED_AWS_ACCESS_KEY_ID]"),
    (re.compile(r"\bASIA[0-9A-Z]{16}\b"), "[REDACTED_AWS_TEMP_KEY_ID]"),
    # OAuth / Bearer-style (avoid eating normal words — require length)
    (re.compile(r"\bBearer\s+[A-Za-z0-9\-._~+/]{16,}={0,2}\b", re.IGNORECASE), "Bearer [REDACTED]"),
    (re.compile(r"\bBasic\s+[A-Za-z0-9+/]{16,}={0,2}\b", re.IGNORECASE), "Basic [REDACTED]"),
    # Env assignment leaks in pasted logs
    (re.compile(
        r"\b(ANTHROPIC_API_KEY|OPENAI_API_KEY|"
        r"AWS_SECRET_ACCESS_KEY|AWS_SESSION_TOKEN|GITHUB_TOKEN|"
        r"HF_TOKEN|HUGGINGFACE_TOKEN|SLACK_BOT_TOKEN|DATABASE_URL|"
        r"SUPABASE_SERVICE_ROLE_KEY|SUPABASE_JWT_SECRET)\s*=\s*(\S+)",
        re.IGNORECASE,
    ), r"\1=[REDACTED]"),
]


def redact_secret_patterns(text: str) -> tuple[str, int]:
    """Replace known secret shapes; returns ``(new_text, number_of_pattern_matches)``.

    Uses ``re.subn`` so each pattern scans the text exactly once; the previous
    ``findall``-then-``sub`` approach scanned twice per matching pattern.
    The match count is identical (``subn`` reports one count per match, just
    as ``len(findall(...))`` did, including for multi-group patterns).
    """
    if not text:
        # Empty (or falsy) input: nothing to scan.
        return text, 0
    hits = 0
    out = text
    for pat, repl in COMPILED:
        out, n = pat.subn(repl, out)
        hits += n
    return out, hits
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def redact_home_path_prefix(path: str) -> tuple[str, int]:
    """If ``path`` starts with the resolved home directory, replace that prefix with ``[HOME]``.

    Returns ``(possibly_redacted_path, hit_count)`` where hit_count is 0 or 1.
    Controlled by ``redact_home_in_paths_enabled()``; a no-op when disabled or
    when the home directory cannot be resolved.
    """
    if not path or not redact_home_in_paths_enabled():
        return path, 0
    home = _home_prefix()
    if not home:
        # Home could not be resolved — nothing to compare against.
        return path, 0
    # Normalize slashes for comparison
    norm = path.replace("\\", "/")
    home_n = home.replace("\\", "/")
    if norm == home_n or norm.rstrip("/") == home_n.rstrip("/"):
        # The path IS the home directory (with or without a trailing slash).
        return "[HOME]", 1
    # NOTE(review): the second ``startswith`` below can never match — ``norm``
    # has every backslash replaced above, so it cannot start with home_n + "\\".
    if norm.startswith(home_n + "/") or norm.startswith(home_n + "\\"):
        # Slice the ORIGINAL string so the tail keeps its characters, then
        # forward-slash-normalize just the remainder.
        rest = path[len(home) :].lstrip("/\\")
        return "[HOME]/" + rest.replace("\\", "/"), 1
    # Windows-style profile (best effort when HOME is /Users/x but path is C:\Users\x)
    if len(path) > 3 and path[1] == ":":
        try:
            from os.path import expanduser

            eu = expanduser("~")
            # Case-insensitive compare, as Windows paths are case-insensitive.
            if eu and path.lower().startswith(eu.lower().replace("/", "\\")):
                return "[HOME]/" + path[len(eu) :].lstrip("\\/").replace("\\", "/"), 1
        except Exception:
            # Best effort only — never fail redaction on platform quirks.
            pass
    return path, 0
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def redact_context_path_field(path: str | None) -> tuple[str | None, int]:
    """Redact the home prefix of a context item's ``path`` field; no-op for empty/None."""
    if not path:
        return path, 0
    return redact_home_path_prefix(path)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def sanitize_context_items(items: list[dict]) -> tuple[int, int]:
    """Mutate each item's ``text`` / ``path`` in place. Returns ``(secret_hits, path_hits)``."""
    secret_hits = 0
    path_hits = 0
    for item in items:
        # Scrub secret-shaped substrings out of the chunk text.
        new_text, n_text = redact_secret_patterns(item.get("text") or "")
        if n_text:
            secret_hits += n_text
            item["text"] = new_text
        # Replace a leading home-directory prefix in the path, when present.
        raw_path = item.get("path")
        if raw_path is not None:
            new_path, n_path = redact_context_path_field(str(raw_path))
            if n_path:
                path_hits += n_path
                item["path"] = new_path
    return secret_hits, path_hits
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def redact_display_path(p: str | Path) -> str:
    """Single path string safe for logs / ``_meta`` (home prefix only + pattern redaction)."""
    out, _ = redact_home_path_prefix(str(p))
    out, _ = redact_secret_patterns(out)
    return out
|
package/python/app/route_cli.py
CHANGED
|
@@ -9,7 +9,14 @@ import time
|
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
|
|
11
11
|
from app.db_paths import resolve_orchestrator_db
|
|
12
|
-
from app.main import
|
|
12
|
+
from app.main import (
|
|
13
|
+
build_router_and_skills,
|
|
14
|
+
format_context_items_markdown,
|
|
15
|
+
init_db,
|
|
16
|
+
run_route_turn,
|
|
17
|
+
)
|
|
18
|
+
from app.mcp_contract import MCP_RESPONSE_SCHEMA_VERSION, build_route_skills_meta
|
|
19
|
+
from app.redaction import redaction_enabled, redact_display_path
|
|
13
20
|
|
|
14
21
|
|
|
15
22
|
def _parse_args(argv: list[str] | None) -> argparse.Namespace:
|
|
@@ -28,6 +35,11 @@ def _parse_args(argv: list[str] | None) -> argparse.Namespace:
|
|
|
28
35
|
p.add_argument("--session-id", default="", help="Stable session id (reuse across turns for reroute stats).")
|
|
29
36
|
p.add_argument("--user-id", default="", help="Logical user id for weights/sessions/events.")
|
|
30
37
|
p.add_argument("--json-meta", action="store_true", help="Print routing metadata as JSON on stderr after output.")
|
|
38
|
+
p.add_argument(
|
|
39
|
+
"--include-project-rag",
|
|
40
|
+
action="store_true",
|
|
41
|
+
help="Append chunks from `skillforge index` (same DB as --project-root). Requires --project-root.",
|
|
42
|
+
)
|
|
31
43
|
return p.parse_args(argv)
|
|
32
44
|
|
|
33
45
|
|
|
@@ -38,6 +50,9 @@ async def _run(args: argparse.Namespace) -> int:
|
|
|
38
50
|
return 2
|
|
39
51
|
|
|
40
52
|
pr = (args.project_root or "").strip() or None
|
|
53
|
+
if args.include_project_rag and not pr:
|
|
54
|
+
print("skillforge route: --include-project-rag requires --project-root.", file=sys.stderr)
|
|
55
|
+
return 2
|
|
41
56
|
db_path = resolve_orchestrator_db(pr)
|
|
42
57
|
con = init_db(db_path)
|
|
43
58
|
|
|
@@ -53,6 +68,8 @@ async def _run(args: argparse.Namespace) -> int:
|
|
|
53
68
|
conversation=[],
|
|
54
69
|
user_id=user_id,
|
|
55
70
|
session_id=session_id,
|
|
71
|
+
project_root=pr,
|
|
72
|
+
include_project_rag=bool(args.include_project_rag),
|
|
56
73
|
)
|
|
57
74
|
finally:
|
|
58
75
|
con.close()
|
|
@@ -60,6 +77,7 @@ async def _run(args: argparse.Namespace) -> int:
|
|
|
60
77
|
picked_names = result["picked_names"]
|
|
61
78
|
reasoning = result["reasoning"]
|
|
62
79
|
sid = result["session_id"]
|
|
80
|
+
context_items = result.get("context_items") or []
|
|
63
81
|
|
|
64
82
|
if pr:
|
|
65
83
|
try:
|
|
@@ -73,36 +91,41 @@ async def _run(args: argparse.Namespace) -> int:
|
|
|
73
91
|
"route_ms": round(result["route_ms"], 1),
|
|
74
92
|
"user_id": user_id,
|
|
75
93
|
"source": "cli_route",
|
|
94
|
+
"schema_version": MCP_RESPONSE_SCHEMA_VERSION,
|
|
95
|
+
"context_mode": router.context_mode,
|
|
96
|
+
"context_items_count": len(context_items),
|
|
97
|
+
"project_rag_items_count": (result.get("event") or {}).get("project_rag_items_count", 0),
|
|
76
98
|
}
|
|
77
99
|
(d / "last_route.json").write_text(json.dumps(snap, indent=2), encoding="utf-8")
|
|
78
100
|
except OSError:
|
|
79
101
|
pass
|
|
80
102
|
|
|
103
|
+
db_disp = redact_display_path(db_path) if redaction_enabled() else str(db_path)
|
|
81
104
|
blocks = [
|
|
82
|
-
f"# Skillforge — routed {len(picked_names)} skill(s)",
|
|
83
|
-
f"_DB:_ `{
|
|
105
|
+
f"# Skillforge — routed {len(picked_names)} skill(s); context=`{router.context_mode}`",
|
|
106
|
+
f"_DB:_ `{db_disp}`",
|
|
84
107
|
f"_Reasoning: {reasoning}_" if reasoning else "",
|
|
85
108
|
"",
|
|
86
109
|
]
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
blocks.append(f"---\n## Skill: {s.name}\n\n{s.body}\n")
|
|
91
|
-
if not picked_names:
|
|
110
|
+
if context_items:
|
|
111
|
+
blocks.append(format_context_items_markdown(context_items))
|
|
112
|
+
elif not picked_names:
|
|
92
113
|
blocks.append("_No skills matched this prompt closely enough to load._")
|
|
93
|
-
|
|
114
|
+
response_text = "\n".join(b for b in blocks if b is not None)
|
|
115
|
+
print(response_text)
|
|
94
116
|
|
|
95
117
|
if args.json_meta:
|
|
96
|
-
meta =
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
"
|
|
105
|
-
|
|
118
|
+
meta = build_route_skills_meta(
|
|
119
|
+
result=result,
|
|
120
|
+
picked_names=picked_names,
|
|
121
|
+
user_id=user_id,
|
|
122
|
+
db_path=db_path,
|
|
123
|
+
skills_map=skills,
|
|
124
|
+
response_text=response_text,
|
|
125
|
+
context_items=context_items,
|
|
126
|
+
fusion=(result.get("event") or {}).get("context_fusion"),
|
|
127
|
+
context_redaction=(result.get("event") or {}).get("context_redaction"),
|
|
128
|
+
)
|
|
106
129
|
print(json.dumps(meta, indent=2), file=sys.stderr)
|
|
107
130
|
|
|
108
131
|
return 0
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""Pluggable route policies: regex on prompt → force-include skill names.
|
|
2
|
+
|
|
3
|
+
Load order (first file that exists / first successful parse wins for env):
|
|
4
|
+
|
|
5
|
+
1. ``SKILLFORGE_ROUTE_POLICIES`` — JSON object inline (e.g. ``{\"rules\":[...]}``).
|
|
6
|
+
2. ``SKILLFORGE_ROUTE_POLICIES_FILE`` — path to a JSON file.
|
|
7
|
+
3. ``<project_root>/.skillforge/policies.json``
|
|
8
|
+
4. ``<project_root>/skillforge-policies.json``
|
|
9
|
+
|
|
10
|
+
Rule shape::
|
|
11
|
+
|
|
12
|
+
{
|
|
13
|
+
"rules": [
|
|
14
|
+
{
|
|
15
|
+
"if_text_matches": "(?i)(auth|oauth|jwt|password)",
|
|
16
|
+
"include": ["security-review"]
|
|
17
|
+
}
|
|
18
|
+
]
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
``if_text_matches`` is passed to ``re.search`` (``re.DOTALL``). ``include`` is a skill
|
|
22
|
+
name or list of names. Forced skills are appended after router picks until
|
|
23
|
+
``MAX_ACTIVE_SKILLS`` is reached.
|
|
24
|
+
"""
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import json
|
|
28
|
+
import os
|
|
29
|
+
import re
|
|
30
|
+
import sqlite3
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
from typing import Any
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def load_route_policies_config(project_root: str | None) -> dict[str, Any]:
|
|
36
|
+
"""Return a dict with key ``rules`` (list). Empty rules if nothing configured."""
|
|
37
|
+
raw_env = os.getenv("SKILLFORGE_ROUTE_POLICIES", "").strip()
|
|
38
|
+
if raw_env:
|
|
39
|
+
try:
|
|
40
|
+
data = json.loads(raw_env)
|
|
41
|
+
return data if isinstance(data, dict) else {"rules": []}
|
|
42
|
+
except json.JSONDecodeError:
|
|
43
|
+
return {"rules": []}
|
|
44
|
+
|
|
45
|
+
paths: list[Path] = []
|
|
46
|
+
path_env = os.getenv("SKILLFORGE_ROUTE_POLICIES_FILE", "").strip()
|
|
47
|
+
if path_env:
|
|
48
|
+
paths.append(Path(path_env).expanduser())
|
|
49
|
+
if project_root:
|
|
50
|
+
pr = Path(project_root).expanduser().resolve()
|
|
51
|
+
paths.append(pr / ".skillforge" / "policies.json")
|
|
52
|
+
paths.append(pr / "skillforge-policies.json")
|
|
53
|
+
|
|
54
|
+
for p in paths:
|
|
55
|
+
if p.is_file():
|
|
56
|
+
try:
|
|
57
|
+
data = json.loads(p.read_text(encoding="utf-8"))
|
|
58
|
+
return data if isinstance(data, dict) else {"rules": []}
|
|
59
|
+
except (OSError, json.JSONDecodeError):
|
|
60
|
+
continue
|
|
61
|
+
return {"rules": []}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def merge_policy_includes(
    prompt: str,
    picked_names: list[str],
    policies: dict[str, Any],
    by_name: dict[str, Any],
    con: sqlite3.Connection,
    user_id: str,
    *,
    max_active: int,
) -> tuple[list[str], list[dict[str, Any]]]:
    """Append policy-driven skills after ``picked_names`` without duplicates.

    Returns (merged_pick_list, audit_rows for events / explain_route).
    Malformed rules are skipped silently; failure modes that a user would want
    to debug (bad regex, unknown/disabled skill, cap reached) are recorded in
    the audit list instead of raising.
    """
    # Local import avoids circular import at module load time.
    from app.main import get_skill_weight

    rules = policies.get("rules") if isinstance(policies, dict) else None
    if not isinstance(rules, list):
        rules = []

    audit: list[dict[str, Any]] = []
    merged = list(picked_names)
    extras: list[str] = []

    for rule in rules:
        if not isinstance(rule, dict):
            continue
        # "pattern" is accepted as an alias for "if_text_matches".
        pat = rule.get("if_text_matches") or rule.get("pattern") or ""
        if not isinstance(pat, str) or not pat.strip():
            continue
        try:
            matched = bool(re.search(pat, prompt, flags=re.DOTALL))
        except re.error:
            # User-supplied regex is invalid: audit it and keep routing.
            audit.append({"pattern": pat, "effect": "invalid_regex"})
            continue
        if not matched:
            continue

        inc = rule.get("include")
        if isinstance(inc, str):
            # A single skill name is promoted to a one-element list.
            inc = [inc]
        if not isinstance(inc, list):
            continue

        for name in inc:
            if not isinstance(name, str) or not name.strip():
                continue
            name = name.strip()
            if name not in by_name:
                audit.append({"pattern": pat, "skill": name, "effect": "unknown_skill"})
                continue
            _w, disabled = get_skill_weight(con, name, user_id=user_id)
            if disabled:
                # User-disabled skills are never force-included.
                audit.append({"pattern": pat, "skill": name, "effect": "disabled"})
                continue
            if name in merged or name in extras:
                audit.append({"pattern": pat, "skill": name, "effect": "already_in_list"})
                continue
            extras.append(name)
            audit.append({"pattern": pat, "skill": name, "effect": "added"})

    # Append the collected extras only while there is headroom under the cap;
    # the first overflow is audited and the rest are dropped with it.
    for n in extras:
        if len(merged) >= max_active:
            audit.append({"skill": n, "effect": "skipped_max_active", "max": max_active})
            break
        if n not in merged:
            merged.append(n)

    return merged, audit
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""Conversation-aware routing text, skill routing cards, and sparse retrieval signals."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any, Protocol
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
_TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9_\-./]{2,}", re.I)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class _SkillCard(Protocol):
    """Structural type: any object exposing these four text fields can be rendered as a routing card."""

    title: str
    description: str
    triggers: str
    anti_triggers: str
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def build_route_query_text(
    prompt: str,
    conversation: list[Any] | None,
    *,
    max_turns: int | None = None,
    max_chars_per_msg: int | None = None,
) -> str:
    """Merge recent turns with the current user message for embedding shortlist / hybrid scores.

    When ``SKILLFORGE_ROUTER_CONV_MAX_TURNS`` is 0 (default), returns ``prompt`` only (legacy behavior).
    """
    # Explicit keyword args win; otherwise fall back to env-configured limits.
    turns = max_turns if max_turns is not None else int(os.getenv("SKILLFORGE_ROUTER_CONV_MAX_TURNS", "0"))
    per_msg = (
        max_chars_per_msg
        if max_chars_per_msg is not None
        else int(os.getenv("SKILLFORGE_ROUTER_CONV_MSG_CHARS", "320"))
    )
    prompt = (prompt or "").strip()
    history = conversation or []
    if turns <= 0 or not history:
        return prompt

    rendered: list[str] = []
    for msg in history[-turns:]:
        if not isinstance(msg, dict):
            continue
        body = str(msg.get("content") or "").strip()
        if not body:
            continue
        if len(body) > per_msg:
            # Truncate long messages, marking the cut with an ellipsis.
            body = body[:per_msg] + "…"
        role = str(msg.get("role") or "user")
        rendered.append(f"{role}: {body}")
    if not rendered:
        return prompt
    return "Conversation context:\n" + "\n".join(rendered) + "\n\nCurrent user message:\n" + prompt
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def skill_routing_card(s: _SkillCard) -> str:
    """Text embedded for each skill + used in hybrid / router prompts."""
    header = f"{(s.title or '').strip()}: {(s.description or '').strip()}"
    lines = [header]
    triggers = (getattr(s, "triggers", None) or "").strip()
    if triggers:
        lines.append(f"Triggers: {triggers}")
    anti = (getattr(s, "anti_triggers", None) or "").strip()
    if anti:
        lines.append(f"Anti-triggers: {anti}")
    return "\n".join(lines)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def tokenize_skills_query(text: str) -> list[str]:
    """Lowercased sparse tokens (3+ chars, word-ish) for keyword-overlap scoring."""
    matches = _TOKEN_RE.findall(text or "")
    return [m.lower() for m in matches]
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def normalize_minmax(arr: np.ndarray) -> np.ndarray:
    """Min-max scale a flattened float64 view of ``arr`` into [0, 1]; constant input maps to zeros."""
    flat = np.asarray(arr, dtype=np.float64).reshape(-1)
    if flat.size == 0:
        return flat
    lo = float(flat.min())
    hi = float(flat.max())
    if hi <= lo:
        # Degenerate range (all values equal): avoid divide-by-zero.
        return np.zeros_like(flat)
    return (flat - lo) / (hi - lo)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def keyword_overlap_scores(route_query: str, skill_cards: list[str]) -> np.ndarray:
    """Per-skill overlap counts (unnormalized); combine with dense via hybrid alpha."""
    query_tokens = set(tokenize_skills_query(route_query))
    if not query_tokens:
        # No usable tokens in the query: every card scores zero.
        return np.zeros(len(skill_cards), dtype=np.float64)
    counts = [
        float(len(query_tokens & set(tokenize_skills_query(card))))
        for card in skill_cards
    ]
    return np.array(counts, dtype=np.float64)
|
package/python/requirements.txt
CHANGED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Unit tests for skill body chunking (no ML)."""
from __future__ import annotations

from app.chunking import chunk_raw_document, chunk_skill_body


def test_chunk_respects_headings() -> None:
    # Markdown "##" sections should land in separate chunks when the budget allows.
    body = "# Title\n\nintro\n\n## A\n\none\n\n## B\n\ntwo three"
    chunks = chunk_skill_body(body, max_chars=500, overlap=50)
    assert len(chunks) >= 2
    names = [c.text for c in chunks]
    assert any("one" in t for t in names)
    assert any("two three" in t for t in names)


def test_chunk_line_numbers_monotonic() -> None:
    # A tiny max_chars forces splitting; each chunk's 1-based line span must be well-formed.
    body = "a\nb\nc\nd"
    chunks = chunk_skill_body(body, max_chars=5, overlap=0)
    assert chunks
    for c in chunks:
        assert c.line_start <= c.line_end
        assert c.line_start >= 1


def test_empty_body() -> None:
    # Empty input yields no chunks rather than raising.
    assert chunk_skill_body("", max_chars=100, overlap=0) == []


def test_chunk_raw_document_small_file() -> None:
    # A document under the budget comes back as a single chunk starting at line 1.
    body = "line1\nline2\nline3"
    chunks = chunk_raw_document(body, max_chars=100, overlap=0)
    assert len(chunks) == 1
    assert chunks[0].line_start == 1
    assert "line1" in chunks[0].text
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Tests for MMR context fusion (numpy only)."""
from __future__ import annotations

import numpy as np

from app.context_fusion import mmr_select


def test_mmr_prefers_diverse_second_item() -> None:
    """Two near-duplicate high-rel docs: second pick should favor the orthogonal one when lambda < 1."""
    # query-aligned
    e0 = np.array([1.0, 0.0, 0.0], dtype=np.float32)
    e1 = np.array([0.99, 0.14, 0.0], dtype=np.float32)  # almost same as e0
    e2 = np.array([0.0, 1.0, 0.0], dtype=np.float32)  # different direction
    emb = np.stack([e0, e1, e2], axis=0)
    rel = np.array([1.0, 0.98, 0.5], dtype=np.float64)
    lens = np.array([10, 10, 10], dtype=np.int64)
    ovh = np.full(3, 8, dtype=np.int64)
    order, trace = mmr_select(
        emb,
        rel,
        lens,
        char_budget=500,
        overhead_per_chunk=ovh,
        lambda_mult=0.5,
    )
    # Highest-relevance doc first; then the diverse doc (index 2) should beat
    # the near-duplicate (index 1) under the diversity penalty.
    assert order[0] == 0
    assert order[1] == 2
    assert len(trace) == len(order)


def test_mmr_respects_char_budget() -> None:
    # Each candidate costs 100 + 10 overhead chars; a 150-char budget fits only one.
    emb = np.eye(3, dtype=np.float32)
    rel = np.array([1.0, 0.9, 0.8])
    lens = np.array([100, 100, 100], dtype=np.int64)
    ovh = np.array([10, 10, 10], dtype=np.int64)
    order, _ = mmr_select(
        emb,
        rel,
        lens,
        char_budget=150,
        overhead_per_chunk=ovh,
        lambda_mult=1.0,
    )
    assert len(order) == 1
|