npm - @geravant/sinain - Versions diffs - 1.13.0 → 1.14.0 - Mend

@geravant/sinain 1.13.0 → 1.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

package/sinain-memory/eval/retrieval_evaluator.py DELETED Viewed

@@ -1,186 +0,0 @@
-#!/usr/bin/env python3
-"""Retrieval Quality Evaluator — Recall@k and NDCG@k for knowledge graph queries.
-Inspired by mempalace's LongMemEval benchmark infrastructure. Measures whether the
-right knowledge surfaces when the agent needs it, complementing sinain's existing
-output quality evaluation (schemas + assertions + LLM judges).
-Usage:
-    python3 eval/retrieval_evaluator.py \
-        --db memory/knowledge-graph.db \
-        --benchmark eval/retrieval_benchmark.jsonl \
-        [--k 1,3,5] [--format json|text]
-Benchmark dataset format (JSONL):
-    {"query": "OCR pipeline stalls on macOS 14", "expected_entities": ["fact:sck-capture-fix"], "category": "error-resolution"}
-"""
-import argparse
-import json
-import math
-import sys
-from collections import defaultdict
-from pathlib import Path
-def load_benchmark(path: str) -> list[dict]:
-    """Load benchmark QA pairs from a JSONL file.
-    Args:
-        path: Location of the benchmark file; one JSON object per line.
-    Returns:
-        The decoded objects in file order. Blank and whitespace-only lines are skipped.
-    Raises:
-        OSError: If the file cannot be opened.
-        json.JSONDecodeError: If a non-blank line is not valid JSON.
-    """
-    # Decode explicitly as UTF-8 so non-ASCII queries do not depend on the
-    # platform's locale encoding.
-    with open(path, encoding="utf-8") as f:
-        return [json.loads(stripped) for line in f if (stripped := line.strip())]
-def extract_keywords(query: str) -> list[str]:
-    """Reduce a natural-language query to its lower-cased search terms.
-    A term starts with a letter and continues with letters, digits or hyphens.
-    Terms of one or two characters and a small set of English stopwords are
-    dropped; the order and any repeats from the query are kept.
-    """
-    import re
-    ignored = {"the", "is", "in", "on", "for", "and", "or", "of", "to", "a", "an", "it", "was", "not", "how", "what", "when", "does"}
-    keywords = []
-    for token in re.findall(r"[a-zA-Z][a-zA-Z0-9-]+", query.lower()):
-        if len(token) <= 2 or token in ignored:
-            continue
-        keywords.append(token)
-    return keywords
-def dcg_at_k(relevant_positions: list[int], k: int) -> float:
-    """Return the Discounted Cumulative Gain over the first *k* ranks.
-    Each entry of *relevant_positions* is a 0-indexed rank with gain 1;
-    entries at rank *k* or later contribute nothing.
-    """
-    total = 0.0
-    for rank in relevant_positions:
-        if rank >= k:
-            continue
-        # Ranks are 0-indexed, so rank 0 is discounted by log2(2) == 1.
-        total += 1.0 / math.log2(rank + 2)
-    return total
-def ndcg_at_k(relevant_positions: list[int], num_relevant: int, k: int) -> float:
-    """Return DCG@k divided by the best attainable DCG@k.
-    The ideal ranking puts min(num_relevant, k) relevant items in the leading
-    ranks. When that ideal score is not positive the result is 0.0.
-    """
-    best_case = dcg_at_k(list(range(min(num_relevant, k))), k)
-    if not best_case > 0:
-        return 0.0
-    return dcg_at_k(relevant_positions, k) / best_case
-def evaluate_retrieval(
-    benchmark_path: str,
-    db_path: str,
-    k_values: "list[int] | None" = None,
-) -> dict:
-    """Run benchmark queries against graph_query.py, compute Recall@k and NDCG@k.
-    Recall@k here is binary per query: 1.0 when at least one expected entity
-    appears among the first k results, otherwise 0.0. Benchmark items that
-    yield no keywords or list no expected entities are skipped.
-    Args:
-        benchmark_path: JSONL benchmark file, read with load_benchmark().
-        db_path: Database path passed through to query_facts_by_entities().
-        k_values: Cut-offs to score at; [1, 3, 5] when omitted.
-    Returns:
-        {"error": ...} for an empty benchmark. Otherwise a dict with "summary"
-        (query counts plus the mean of every metric), "categories" (count and
-        metric means per category) and "details" (one record per scored query).
-    Raises:
-        ValueError: If k_values is an empty list.
-    """
-    # None rather than a list literal: a default list is a single object shared
-    # by every call.
-    if k_values is None:
-        k_values = [1, 3, 5]
-    # Import graph_query from parent dir
-    sys.path.insert(0, str(Path(__file__).parent.parent))
-    from graph_query import query_facts_by_entities
-    items = load_benchmark(benchmark_path)
-    if not items:
-        return {"error": "Empty benchmark dataset"}
-    if not k_values:
-        raise ValueError("k_values must contain at least one cut-off")
-    max_k = max(k_values)
-    metrics: dict[str, list[float]] = defaultdict(list)
-    category_metrics: dict[str, dict[str, list[float]]] = defaultdict(lambda: defaultdict(list))
-    details: list[dict] = []
-    for item in items:
-        query = item["query"]
-        expected = set(item.get("expected_entities", []))
-        category = item.get("category", "general")
-        keywords = extract_keywords(query)
-        if not keywords or not expected:
-            continue
-        # Only the largest cut-off is fetched; smaller k values reuse its prefix.
-        results = query_facts_by_entities(db_path, keywords, max_facts=max_k)
-        result_ids = [r["entityId"] for r in results]
-        # 0-indexed ranks at which an expected entity was returned
-        relevant_positions = [i for i, rid in enumerate(result_ids) if rid in expected]
-        for k in k_values:
-            recall = 1.0 if any(pos < k for pos in relevant_positions) else 0.0
-            ndcg = ndcg_at_k(relevant_positions, len(expected), k)
-            metrics[f"recall@{k}"].append(recall)
-            metrics[f"ndcg@{k}"].append(ndcg)
-            category_metrics[category][f"recall@{k}"].append(recall)
-            category_metrics[category][f"ndcg@{k}"].append(ndcg)
-        details.append({
-            "query": query,
-            "category": category,
-            "expected": list(expected),
-            "retrieved": result_ids[:max_k],
-            "hit@1": any(pos < 1 for pos in relevant_positions),
-            "hit@5": any(pos < 5 for pos in relevant_positions),
-        })
-    # Aggregate
-    summary = {
-        "total_queries": len(items),
-        "evaluated": len(details),
-    }
-    for metric_name, values in sorted(metrics.items()):
-        summary[metric_name] = round(sum(values) / len(values), 4) if values else 0.0
-    # Per-category breakdown
-    categories = {}
-    for cat, cat_metrics in sorted(category_metrics.items()):
-        categories[cat] = {
-            # Every metric list of a category has one value per scored query.
-            "count": len(next(iter(cat_metrics.values()))),
-        }
-        for metric_name, values in sorted(cat_metrics.items()):
-            categories[cat][metric_name] = round(sum(values) / len(values), 4) if values else 0.0
-    return {
-        "summary": summary,
-        "categories": categories,
-        "details": details,
-    }
-def format_report_text(result: dict) -> str:
-    """Format evaluation result as human-readable text for daily report injection.
-    Args:
-        result: Dict in the shape returned by evaluate_retrieval(): either the
-            "summary"/"categories" payload or the {"error": ...} form.
-    Returns:
-        A Markdown section headed "## Retrieval Quality". Metrics are listed as
-        percentages, followed by recall@5 per category and, when its recall@5
-        is below 80%, the weakest category. For a result without a summary the
-        section carries a single error line instead.
-    """
-    lines = ["## Retrieval Quality"]
-    s = result.get("summary")
-    if s is None:
-        # evaluate_retrieval() returns only {"error": ...} for an empty
-        # benchmark; report that rather than failing on the missing key.
-        lines.append(f"- error: {result.get('error', 'no summary available')}")
-        return "\n".join(lines)
-    for key in sorted(s):
-        if key.startswith(("recall@", "ndcg@")):
-            lines.append(f"- {key}: {s[key]:.2%}")
-    cats = result.get("categories") or {}
-    if cats:
-        lines.append("")
-        lines.append("**By category:**")
-        for cat, cm in sorted(cats.items()):
-            r5 = cm.get("recall@5", 0)
-            lines.append(f"- {cat} (n={cm['count']}): recall@5={r5:.0%}")
-        # Weakest category; one without a recall@5 value is treated as perfect
-        # so it is never singled out.
-        weakest = min(cats.items(), key=lambda x: x[1].get("recall@5", 1.0))
-        if weakest[1].get("recall@5", 1.0) < 0.8:
-            lines.append(f"\n**Weakest**: {weakest[0]} ({weakest[1].get('recall@5', 0):.0%})")
-    return "\n".join(lines)
-def main() -> None:
-    """Command-line entry point: parse options, run the evaluation, print the report."""
-    cli = argparse.ArgumentParser(description="Retrieval Quality Evaluator")
-    cli.add_argument("--db", required=True, help="Path to knowledge-graph.db")
-    cli.add_argument("--benchmark", required=True, help="Path to retrieval_benchmark.jsonl")
-    cli.add_argument("--k", default="1,3,5", help="Comma-separated k values for Recall@k")
-    cli.add_argument("--format", choices=["json", "text"], default="json", help="Output format")
-    opts = cli.parse_args()
-    cutoffs = list(map(int, opts.k.split(",")))
-    outcome = evaluate_retrieval(opts.benchmark, opts.db, cutoffs)
-    # JSON is the default; "text" yields the Markdown section for the daily report.
-    rendered = (
-        format_report_text(outcome)
-        if opts.format == "text"
-        else json.dumps(outcome, indent=2, ensure_ascii=False)
-    )
-    print(rendered)
-if __name__ == "__main__":
-    main()

package/sinain-memory/eval/schemas.py DELETED Viewed

@@ -1,247 +0,0 @@
-"""JSON Schema definitions for all sinain-koog script outputs.
-Each schema corresponds to the JSON printed by output_json() in its respective
-script.  Used by tick_evaluator.py for mechanical validation (Tier 1 eval).
-"""
-import json
-from typing import Any
-# ---------------------------------------------------------------------------
-# signal_analyzer.py output
-# Required keys: signals, recommendedAction, idle.
-# NOTE(review): the playbook-log fixtures in tests/conftest.py store "signals"
-# as objects with description/priority, while this schema expects strings —
-# confirm those log entries are not meant to validate against it.
-# ---------------------------------------------------------------------------
-SIGNAL_ANALYZER_SCHEMA: dict = {
-    "type": "object",
-    "required": ["signals", "recommendedAction", "idle"],
-    "properties": {
-        "signals": {
-            "type": "array",
-            "items": {"type": "string"},
-        },
-        # Either null or an object naming one of three actions.
-        "recommendedAction": {
-            "oneOf": [
-                {"type": "null"},
-                {
-                    "type": "object",
-                    "required": ["action"],
-                    "properties": {
-                        "action": {"enum": ["sessions_spawn", "telegram_tip", "skip"]},
-                        "task": {"type": "string"},
-                        # When present, confidence must lie in [0, 1].
-                        "confidence": {"type": "number", "minimum": 0, "maximum": 1},
-                    },
-                },
-            ],
-        },
-        "idle": {"type": "boolean"},
-    },
-}
-# ---------------------------------------------------------------------------
-# feedback_analyzer.py output
-# Required keys: feedbackScores, effectiveness, curateDirective.
-# ---------------------------------------------------------------------------
-FEEDBACK_ANALYZER_SCHEMA: dict = {
-    "type": "object",
-    "required": ["feedbackScores", "effectiveness", "curateDirective"],
-    "properties": {
-        # Only "avg" is mandatory; "high" and "low" are optional string lists.
-        "feedbackScores": {
-            "type": "object",
-            "required": ["avg"],
-            "properties": {
-                "avg": {"type": "number"},
-                "high": {"type": "array", "items": {"type": "string"}},
-                "low": {"type": "array", "items": {"type": "string"}},
-            },
-        },
-        # Non-negative integer counters plus a rate constrained to [0, 1].
-        "effectiveness": {
-            "type": "object",
-            "required": ["outputs", "positive", "negative", "neutral", "rate"],
-            "properties": {
-                "outputs": {"type": "integer", "minimum": 0},
-                "positive": {"type": "integer", "minimum": 0},
-                "negative": {"type": "integer", "minimum": 0},
-                "neutral": {"type": "integer", "minimum": 0},
-                "rate": {"type": "number", "minimum": 0, "maximum": 1},
-            },
-        },
-        "curateDirective": {
-            "enum": ["aggressive_prune", "normal", "stability", "insufficient_data"],
-        },
-        "interpretation": {"type": "string"},
-    },
-}
-# ---------------------------------------------------------------------------
-# memory_miner.py output
-# Required keys: findings, newPatterns. The remaining keys are optional
-# lists of strings.
-# ---------------------------------------------------------------------------
-MEMORY_MINER_SCHEMA: dict = {
-    "type": "object",
-    "required": ["findings", "newPatterns"],
-    "properties": {
-        "findings": {"type": "string"},
-        "newPatterns": {"type": "array", "items": {"type": "string"}},
-        "contradictions": {"type": "array", "items": {"type": "string"}},
-        "preferences": {"type": "array", "items": {"type": "string"}},
-        "minedSources": {"type": "array", "items": {"type": "string"}},
-    },
-}
-# ---------------------------------------------------------------------------
-# playbook_curator.py output
-# Required keys: changes, playbookLines. "error" is an optional string.
-# ---------------------------------------------------------------------------
-PLAYBOOK_CURATOR_SCHEMA: dict = {
-    "type": "object",
-    "required": ["changes", "playbookLines"],
-    "properties": {
-        # All three change lists must be present, even when empty.
-        "changes": {
-            "type": "object",
-            "required": ["added", "pruned", "promoted"],
-            "properties": {
-                "added": {"type": "array", "items": {"type": "string"}},
-                "pruned": {"type": "array", "items": {"type": "string"}},
-                "promoted": {"type": "array", "items": {"type": "string"}},
-            },
-        },
-        "staleItemActions": {"type": "array", "items": {"type": "string"}},
-        "playbookLines": {"type": "integer", "minimum": 0},
-        "error": {"type": "string"},
-    },
-}
-# ---------------------------------------------------------------------------
-# insight_synthesizer.py output (skip and non-skip cases)
-# Only "skip" is required; suggestion/insight/totalChars and skipReason are
-# all optional, so the schema does not tie them to the value of "skip".
-# ---------------------------------------------------------------------------
-INSIGHT_SYNTHESIZER_SCHEMA: dict = {
-    "type": "object",
-    "required": ["skip"],
-    "properties": {
-        "skip": {"type": "boolean"},
-        "suggestion": {"type": "string"},
-        "insight": {"type": "string"},
-        "totalChars": {"type": "integer", "minimum": 0},
-        "skipReason": {"type": "string"},
-    },
-}
-# ---------------------------------------------------------------------------
-# module_manager.py extract output
-# Required keys: extracted, domain, status. The pattern/vocabulary counters
-# are optional non-negative integers.
-# ---------------------------------------------------------------------------
-MODULE_EXTRACT_SCHEMA: dict = {
-    "type": "object",
-    "required": ["extracted", "domain", "status"],
-    "properties": {
-        "extracted": {"type": "string"},
-        "domain": {"type": "string"},
-        "patternsEstablished": {"type": "integer", "minimum": 0},
-        "patternsEmerging": {"type": "integer", "minimum": 0},
-        "vocabularyTerms": {"type": "integer", "minimum": 0},
-        "modulePath": {"type": "string"},
-        "status": {"enum": ["suspended", "active"]},
-        "activateWith": {"type": "string"},
-    },
-}
-# ---------------------------------------------------------------------------
-# Registry: script name → schema
-# Note that "module_extract" is keyed by operation rather than by its script,
-# which the schema's own banner gives as module_manager.py.
-# ---------------------------------------------------------------------------
-SCHEMA_REGISTRY: dict[str, dict] = {
-    "signal_analyzer": SIGNAL_ANALYZER_SCHEMA,
-    "feedback_analyzer": FEEDBACK_ANALYZER_SCHEMA,
-    "memory_miner": MEMORY_MINER_SCHEMA,
-    "playbook_curator": PLAYBOOK_CURATOR_SCHEMA,
-    "insight_synthesizer": INSIGHT_SYNTHESIZER_SCHEMA,
-    "module_extract": MODULE_EXTRACT_SCHEMA,
-}
-# ---------------------------------------------------------------------------
-# Lightweight JSON Schema validator (no external dependency)
-# ---------------------------------------------------------------------------
-def validate(instance: Any, schema: dict) -> list[str]:
-    """Validate *instance* against a JSON Schema subset.
-    Returns a list of error strings (empty = valid).  Supports:
-    type, required, properties, items, enum, oneOf, minimum, maximum.
-    Each error is prefixed with the location of the offending value, "." being
-    the root.
-    """
-    errors: list[str] = []
-    _validate(instance, schema, "", errors)
-    return errors
-def _validate(instance: Any, schema: dict, path: str, errors: list[str]) -> None:
-    """Append to *errors* one message per violation of *schema* in *instance*.
-    *path* is the location of *instance* within the document ("" for the root).
-    A schema that carries oneOf or enum is decided by that keyword alone; its
-    other keywords are not evaluated. A type mismatch stops further checks on
-    the same value.
-    """
-    # --- oneOf ---
-    if "oneOf" in schema:
-        matches = 0
-        for sub in schema["oneOf"]:
-            # Collect into a scratch list so failed variants leave no trace.
-            sub_errors: list[str] = []
-            _validate(instance, sub, path, sub_errors)
-            if not sub_errors:
-                matches += 1
-        if matches == 0:
-            errors.append(f"{path or '.'}: does not match any oneOf variant")
-        elif matches > 1:
-            # oneOf means exactly one variant; an ambiguous match is invalid too.
-            errors.append(f"{path or '.'}: matches {matches} oneOf variants, expected exactly one")
-        return
-    # --- enum ---
-    if "enum" in schema:
-        if instance not in schema["enum"]:
-            errors.append(f"{path or '.'}: {instance!r} not in {schema['enum']}")
-        return
-    # --- type ---
-    expected_type = schema.get("type")
-    if expected_type and not _type_check(instance, expected_type):
-        errors.append(f"{path or '.'}: expected {expected_type}, got {type(instance).__name__}")
-        return
-    if isinstance(instance, dict):
-        # --- required ---
-        for key in schema.get("required", ()):
-            if key not in instance:
-                errors.append(f"{path}.{key}: required field missing")
-        # --- properties ---
-        for key, sub_schema in schema.get("properties", {}).items():
-            if key in instance:
-                _validate(instance[key], sub_schema, f"{path}.{key}", errors)
-    # --- items ---
-    if "items" in schema and isinstance(instance, list):
-        for i, item in enumerate(instance):
-            _validate(item, schema["items"], f"{path}[{i}]", errors)
-    # --- minimum / maximum ---
-    # bool is an int subclass in Python; bounds only apply to real numbers.
-    if isinstance(instance, (int, float)) and not isinstance(instance, bool):
-        if "minimum" in schema and instance < schema["minimum"]:
-            errors.append(f"{path or '.'}: {instance} < minimum {schema['minimum']}")
-        if "maximum" in schema and instance > schema["maximum"]:
-            errors.append(f"{path or '.'}: {instance} > maximum {schema['maximum']}")
-def _type_check(instance: Any, expected: str) -> bool:
-    """Return True when *instance* has the JSON Schema type named *expected*.
-    Booleans satisfy neither "integer" nor "number": bool subclasses int in
-    Python but is a separate JSON type. Unrecognised type names are accepted.
-    """
-    is_bool = isinstance(instance, bool)
-    if expected == "object":
-        return isinstance(instance, dict)
-    if expected == "array":
-        return isinstance(instance, list)
-    if expected == "string":
-        return isinstance(instance, str)
-    if expected == "number":
-        return isinstance(instance, (int, float)) and not is_bool
-    if expected == "integer":
-        return isinstance(instance, int) and not is_bool
-    if expected == "boolean":
-        return is_bool
-    if expected == "null":
-        return instance is None
-    return True

package/sinain-memory/tests/__init__.py DELETED Viewed

File without changes

package/sinain-memory/tests/conftest.py DELETED Viewed

@@ -1,189 +0,0 @@
-"""Shared fixtures for sinain-koog pytest test suite."""
-import json
-import sys
-from datetime import datetime, timezone
-from pathlib import Path
-import pytest
-# Ensure sinain-koog source is importable
-KOOG_DIR = Path(__file__).resolve().parent.parent
-if str(KOOG_DIR) not in sys.path:
-    sys.path.insert(0, str(KOOG_DIR))
-@pytest.fixture
-def tmp_memory_dir(tmp_path):
-    """Return a scratch ``memory`` directory pre-populated with sample data.
-    Created under tmp_path/memory: four empty log/report subdirectories,
-    sinain-playbook.md, three dated session-note files, and one JSONL file in
-    playbook-logs named after today's UTC date that holds two entries (an
-    active tick followed by an idle tick).
-    """
-    memory = tmp_path / "memory"
-    memory.mkdir()
-    for subdir in ("playbook-logs", "playbook-archive", "eval-logs", "eval-reports"):
-        (memory / subdir).mkdir()
-    # Sample playbook, assembled from its individual lines
-    playbook_parts = [
-        "<!-- mining-index: 2026-02-21,2026-02-20 -->\n",
-        "# Sinain Playbook\n\n",
-        "## Established Patterns\n",
-        "- When OCR pipeline stalls, check camera frame queue depth (score: 0.8)\n",
-        "- When user explores new framework, spawn research agent proactively (score: 0.6)\n\n",
-        "## Observed\n",
-        "- User prefers concise Telegram messages over detailed ones\n",
-        "- Late evening sessions tend to be exploratory/research-heavy\n\n",
-        "## Stale\n",
-        "- Flutter overlay rendering glitch on macOS 15 [since: 2026-02-18]\n\n",
-        "<!-- effectiveness: outputs=8,positive=5,negative=1,neutral=2,rate=0.63,updated=2026-02-21 -->\n",
-    ]
-    (memory / "sinain-playbook.md").write_text("".join(playbook_parts), encoding="utf-8")
-    # Sample daily memory files
-    for date in ("2026-02-21", "2026-02-20", "2026-02-19"):
-        notes = f"# {date} Session Notes\n\n- Worked on OCR pipeline\n- Explored Flutter overlays\n"
-        (memory / f"{date}.md").write_text(notes, encoding="utf-8")
-    # Sample playbook-log entries: first an active tick ...
-    active_tick = {
-        "ts": "2026-02-28T10:00:00Z",
-        "idle": False,
-        "sessionSummary": "Debugging OCR pipeline",
-        "signals": [{"description": "OCR pipeline backpressure detected", "priority": "high"}],
-        "recommendedAction": {"action": "sessions_spawn", "task": "Debug OCR backpressure", "confidence": 0.8},
-        "feedbackScores": {"avg": 0.35, "high": ["OCR fix"], "low": []},
-        "effectiveness": {"outputs": 8, "positive": 5, "negative": 1, "neutral": 2, "rate": 0.63},
-        "curateDirective": "normal",
-        "playbookChanges": {
-            "changes": {"added": ["new pattern"], "pruned": [], "promoted": []},
-            "staleItemActions": [],
-            "playbookLines": 12,
-        },
-        "output": {
-            "skip": False,
-            "suggestion": "Consider frame batching for OCR pipeline",
-            "insight": "Evening sessions correlate with exploratory work patterns",
-            "totalChars": 95,
-        },
-        "skipped": False,
-        "actionsConsidered": [
-            {"action": "sessions_spawn", "reason": "Debug OCR backpressure", "chosen": True}
-        ],
-    }
-    # ... then an idle tick that was skipped but carries a mining result.
-    idle_tick = {
-        "ts": "2026-02-28T10:30:00Z",
-        "idle": True,
-        "sessionSummary": "User idle",
-        "signals": [],
-        "recommendedAction": None,
-        "feedbackScores": {"avg": 0, "high": [], "low": []},
-        "effectiveness": {"outputs": 8, "positive": 5, "negative": 1, "neutral": 2, "rate": 0.63},
-        "curateDirective": "normal",
-        "playbookChanges": {
-            "changes": {"added": [], "pruned": [], "promoted": []},
-            "staleItemActions": [],
-            "playbookLines": 12,
-        },
-        "output": {
-            "skip": True,
-            "skipReason": "User is idle and no new patterns detected in playbook since last analysis",
-        },
-        "skipped": True,
-        "miningResult": {
-            "findings": "Found cross-day OCR pattern",
-            "newPatterns": ["frame dropping improves OCR accuracy"],
-            "contradictions": [],
-            "preferences": ["user prefers minimal configs"],
-            "minedSources": ["2026-02-21.md"],
-        },
-        "actionsConsidered": [],
-    }
-    # The log file is named after the current UTC date, not the entry timestamps.
-    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
-    log_file = memory / "playbook-logs" / f"{today}.jsonl"
-    jsonl = "".join(json.dumps(entry) + "\n" for entry in (active_tick, idle_tick))
-    log_file.write_text(jsonl, encoding="utf-8")
-    return memory
-@pytest.fixture
-def tmp_modules_dir(tmp_path):
-    """Return a scratch ``modules`` directory holding a registry and one module.
-    module-registry.json lists two modules (react-native-dev active,
-    ocr-pipeline suspended); only react-native-dev gets a directory, with a
-    manifest.json and a patterns.md.
-    """
-    modules = tmp_path / "modules"
-    modules.mkdir()
-    # Registry
-    registered = {
-        "react-native-dev": {
-            "status": "active",
-            "priority": 85,
-            "activatedAt": "2026-02-20T10:00:00Z",
-            "lastTriggered": None,
-            "locked": False,
-        },
-        "ocr-pipeline": {
-            "status": "suspended",
-            "priority": 70,
-            "activatedAt": None,
-            "lastTriggered": None,
-            "locked": False,
-        },
-    }
-    registry_text = json.dumps({"version": 1, "modules": registered}, indent=2)
-    (modules / "module-registry.json").write_text(registry_text, encoding="utf-8")
-    # Module directories
-    rn_dir = modules / "react-native-dev"
-    rn_dir.mkdir()
-    manifest = {
-        "id": "react-native-dev",
-        "name": "React Native Development",
-        "description": "Patterns for RN development",
-        "version": "1.0.0",
-        "priority": {"default": 85, "range": [50, 100]},
-        "triggers": {},
-        "locked": False,
-    }
-    (rn_dir / "manifest.json").write_text(json.dumps(manifest, indent=2), encoding="utf-8")
-    patterns = "# React Native Development\n\n## Established Patterns\n- Use Hermes engine\n"
-    (rn_dir / "patterns.md").write_text(patterns, encoding="utf-8")
-    return modules
-@pytest.fixture
-def sample_log_entry():
-    """Return one non-idle playbook-log entry as a plain dict."""
-    playbook_changes = {
-        "changes": {"added": ["new pattern"], "pruned": [], "promoted": []},
-        "staleItemActions": [],
-        "playbookLines": 12,
-    }
-    output = {
-        "skip": False,
-        "suggestion": "Consider frame batching for OCR pipeline",
-        "insight": "Evening sessions correlate with exploratory work patterns",
-        "totalChars": 95,
-    }
-    entry = {
-        "ts": "2026-02-28T10:00:00Z",
-        "idle": False,
-        "signals": [{"description": "OCR pipeline backpressure detected", "priority": "high"}],
-        "recommendedAction": {"action": "sessions_spawn", "task": "Debug OCR backpressure", "confidence": 0.8},
-        "feedbackScores": {"avg": 0.35, "high": ["OCR fix"], "low": []},
-        "effectiveness": {"outputs": 8, "positive": 5, "negative": 1, "neutral": 2, "rate": 0.63},
-        "curateDirective": "normal",
-        "interpretation": "",
-        "playbookChanges": playbook_changes,
-        "output": output,
-    }
-    return entry