npm - @rudderhq/agent-runtime-gemini-local - Versions diffs - 0.2.1 → 0.2.2-canary.1 - Mend

@rudderhq/agent-runtime-gemini-local 0.2.1 → 0.2.2-canary.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

package/skills/conversation-to-skill/scripts/run_loop.py ADDED Viewed

@@ -0,0 +1,319 @@
+#!/usr/bin/env python3
+"""Run the eval + improve loop until all pass or max iterations reached."""
+from __future__ import annotations
+import argparse
+import json
+import random
+import sys
+import tempfile
+import time
+import webbrowser
+from pathlib import Path
+if __package__ in (None, ""):
+    sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+from scripts.generate_report import generate_html
+from scripts.improve_description import improve_description
+from scripts.model_backends import detect_backend
+from scripts.run_eval import find_project_root, run_eval
+from scripts.utils import parse_skill_md
+def split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]:
+    """Split an eval set into (train, test), stratified by ``should_trigger``.
+    Args:
+        eval_set: Eval items; each must carry a ``should_trigger`` key.
+        holdout: Fraction of each group moved into the test set. At least one
+            slot per group is always reserved for the test set (``max(1, ...)``),
+            so a group with a single item ends up entirely in the test set.
+        seed: Seed for the shuffle, making the split reproducible.
+    Returns:
+        ``(train_set, test_set)``; each lists trigger items before non-trigger items.
+    """
+    # A private generator yields the same shuffle order as seeding the module-level
+    # RNG, but leaves the process-wide random state untouched for other callers.
+    rng = random.Random(seed)
+    trigger = [item for item in eval_set if item["should_trigger"]]
+    no_trigger = [item for item in eval_set if not item["should_trigger"]]
+    rng.shuffle(trigger)
+    rng.shuffle(no_trigger)
+    n_trigger_test = max(1, int(len(trigger) * holdout))
+    n_no_trigger_test = max(1, int(len(no_trigger) * holdout))
+    test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test]
+    train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:]
+    return train_set, test_set
+def _summarize_results(result_list: list[dict]) -> dict:
+    """Return passed/failed/total counts for a list of per-query results."""
+    passed = sum(1 for item in result_list if item["pass"])
+    total = len(result_list)
+    return {"passed": passed, "failed": total - passed, "total": total}
+def _print_eval_stats(label: str, results: list[dict], elapsed: float) -> None:
+    """Print aggregate and per-query trigger statistics for ``results`` to stderr.
+    Counts are per run rather than per query: every item contributes ``runs``
+    trials of which ``triggers`` fired.
+    """
+    pos = [item for item in results if item["should_trigger"]]
+    neg = [item for item in results if not item["should_trigger"]]
+    tp = sum(item["triggers"] for item in pos)
+    fn = sum(item["runs"] for item in pos) - tp
+    fp = sum(item["triggers"] for item in neg)
+    tn = sum(item["runs"] for item in neg) - fp
+    total = tp + tn + fp + fn
+    # An empty denominator means the ratio is undefined; it is reported as 100%.
+    precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
+    recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
+    accuracy = (tp + tn) / total if total > 0 else 0.0
+    print(
+        f"{label}: {tp + tn}/{total} correct, precision={precision:.0%} "
+        f"recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)",
+        file=sys.stderr,
+    )
+    for item in results:
+        status = "PASS" if item["pass"] else "FAIL"
+        rate_str = f"{item['triggers']}/{item['runs']}"
+        print(f"  [{status}] rate={rate_str} expected={item['should_trigger']}: {item['query'][:60]}", file=sys.stderr)
+def run_loop(
+    *,
+    eval_set: list[dict],
+    skill_path: Path,
+    description_override: str | None,
+    num_workers: int,
+    timeout: int,
+    max_iterations: int,
+    runs_per_query: int,
+    trigger_threshold: float,
+    holdout: float,
+    model: str | None,
+    backend: str,
+    verbose: bool,
+    live_report_path: Path | None = None,
+    log_dir: Path | None = None,
+) -> dict:
+    """Alternate evaluating and rewriting a skill description until train queries pass.
+    Each iteration evaluates the current description on train and test queries
+    together, records the outcome in the history, and (unless every train query
+    passed or the iteration budget is spent) asks ``improve_description`` for a
+    new description based on train results only.
+    Args:
+        eval_set: Eval items with at least ``query`` and ``should_trigger`` keys.
+        skill_path: Skill directory, handed to ``parse_skill_md``.
+        description_override: Starting description; falls back to the one in SKILL.md.
+        num_workers, timeout, runs_per_query, trigger_threshold: Forwarded to ``run_eval``.
+        max_iterations: Maximum number of evaluation rounds; must be at least 1.
+        holdout: Fraction held out as a test set; ``0`` (or less) disables the split.
+        model: Forwarded to ``run_eval`` and ``improve_description``.
+        backend: Backend selector, resolved through ``detect_backend``.
+        verbose: When true, progress is printed to stderr.
+        live_report_path: If set, an auto-refreshing HTML report is rewritten here
+            after every evaluation.
+        log_dir: Forwarded to ``improve_description``.
+    Returns:
+        A dict with the resolved backend, exit reason, original/best/final
+        descriptions, best scores, split sizes and the full per-iteration history.
+    Raises:
+        ValueError: If ``max_iterations`` is less than 1.
+    """
+    if max_iterations < 1:
+        # Zero rounds would leave the history empty and make picking a "best"
+        # entry fail with an opaque max() error after all the setup work.
+        raise ValueError(f"max_iterations must be at least 1, got {max_iterations}")
+    project_root = find_project_root()
+    name, original_description, content = parse_skill_md(skill_path)
+    current_description = description_override or original_description
+    backend = detect_backend(backend)
+    if holdout > 0:
+        train_set, test_set = split_eval_set(eval_set, holdout)
+        if verbose:
+            print(f"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})", file=sys.stderr)
+    else:
+        train_set = eval_set
+        test_set = []
+    # Train membership is fixed for the whole run, so build the lookup once.
+    # Results are attributed to train/test by query text.
+    train_queries = {item["query"] for item in train_set}
+    history = []
+    exit_reason = "unknown"
+    for iteration in range(1, max_iterations + 1):
+        if verbose:
+            print(f"\n{'=' * 60}", file=sys.stderr)
+            print(f"Iteration {iteration}/{max_iterations}", file=sys.stderr)
+            print(f"Backend: {backend}", file=sys.stderr)
+            print(f"Description: {current_description}", file=sys.stderr)
+            print(f"{'=' * 60}", file=sys.stderr)
+        # Train and test are evaluated in a single batch.
+        all_queries = train_set + test_set
+        t0 = time.time()
+        all_results = run_eval(
+            eval_set=all_queries,
+            skill_name=name,
+            description=current_description,
+            num_workers=num_workers,
+            timeout=timeout,
+            project_root=project_root,
+            runs_per_query=runs_per_query,
+            trigger_threshold=trigger_threshold,
+            model=model,
+            backend=backend,
+        )
+        eval_elapsed = time.time() - t0
+        train_result_list = [item for item in all_results["results"] if item["query"] in train_queries]
+        test_result_list = [item for item in all_results["results"] if item["query"] not in train_queries]
+        train_summary = _summarize_results(train_result_list)
+        train_results = {"results": train_result_list, "summary": train_summary}
+        if test_set:
+            test_summary = _summarize_results(test_result_list)
+            test_results = {"results": test_result_list, "summary": test_summary}
+        else:
+            test_summary = None
+            test_results = None
+        history.append({
+            "iteration": iteration,
+            "description": current_description,
+            "train_passed": train_summary["passed"],
+            "train_failed": train_summary["failed"],
+            "train_total": train_summary["total"],
+            "train_results": train_results["results"],
+            "test_passed": test_summary["passed"] if test_summary else None,
+            "test_failed": test_summary["failed"] if test_summary else None,
+            "test_total": test_summary["total"] if test_summary else None,
+            "test_results": test_results["results"] if test_results else None,
+            # Un-prefixed aliases of the train numbers.
+            "passed": train_summary["passed"],
+            "failed": train_summary["failed"],
+            "total": train_summary["total"],
+            "results": train_results["results"],
+        })
+        if live_report_path:
+            partial_output = {
+                "backend": backend,
+                "original_description": original_description,
+                "best_description": current_description,
+                "best_score": "in progress",
+                "iterations_run": len(history),
+                "holdout": holdout,
+                "train_size": len(train_set),
+                "test_size": len(test_set),
+                "history": history,
+            }
+            live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))
+        if verbose:
+            _print_eval_stats("Train", train_results["results"], eval_elapsed)
+            if test_summary:
+                # Test queries ran in the same batch, so no separate timing exists.
+                _print_eval_stats("Test ", test_results["results"], 0)
+        if train_summary["failed"] == 0:
+            exit_reason = f"all_passed (iteration {iteration})"
+            if verbose:
+                print(f"\nAll train queries passed on iteration {iteration}.", file=sys.stderr)
+            break
+        if iteration == max_iterations:
+            exit_reason = f"max_iterations ({max_iterations})"
+            if verbose:
+                print(f"\nMax iterations reached ({max_iterations}).", file=sys.stderr)
+            break
+        if verbose:
+            print("\nImproving description...", file=sys.stderr)
+        t0 = time.time()
+        # Strip every test_* field so the improver never sees held-out results.
+        blinded_history = [
+            {k: v for k, v in item.items() if not k.startswith("test_")}
+            for item in history
+        ]
+        new_description = improve_description(
+            backend=backend,
+            skill_name=name,
+            skill_content=content,
+            current_description=current_description,
+            eval_results=train_results,
+            history=blinded_history,
+            model=model,
+            log_dir=log_dir,
+            iteration=iteration,
+        )
+        improve_elapsed = time.time() - t0
+        if verbose:
+            print(f"Proposed ({improve_elapsed:.1f}s): {new_description}", file=sys.stderr)
+        current_description = new_description
+    # The best iteration is chosen on held-out results when a test set exists,
+    # otherwise on train results; max() keeps the earliest entry on ties.
+    if test_set:
+        best = max(history, key=lambda item: item["test_passed"] or 0)
+        best_score = f"{best['test_passed']}/{best['test_total']}"
+    else:
+        best = max(history, key=lambda item: item["train_passed"])
+        best_score = f"{best['train_passed']}/{best['train_total']}"
+    if verbose:
+        print(f"\nExit reason: {exit_reason}", file=sys.stderr)
+        print(f"Best score: {best_score} (iteration {best['iteration']})", file=sys.stderr)
+    return {
+        "backend": backend,
+        "exit_reason": exit_reason,
+        "original_description": original_description,
+        "best_description": best["description"],
+        "best_score": best_score,
+        "best_train_score": f"{best['train_passed']}/{best['train_total']}",
+        "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None,
+        "final_description": current_description,
+        "iterations_run": len(history),
+        "holdout": holdout,
+        "train_size": len(train_set),
+        "test_size": len(test_set),
+        "history": history,
+    }
+def main():
+    """Command-line entry point: parse arguments, run the loop, and emit results.
+    The result JSON is printed to stdout. Unless ``--report none`` is given, a
+    live HTML report is opened in the browser and rewritten with the final
+    results. With ``--results-dir``, ``results.json`` and ``report.html`` are
+    also saved to a timestamped subdirectory. Exits with status 1 when the
+    skill directory has no SKILL.md.
+    """
+    parser = argparse.ArgumentParser(description="Run eval + improve loop")
+    parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
+    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
+    parser.add_argument("--description", default=None, help="Override starting description")
+    parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers")
+    parser.add_argument("--timeout", type=int, default=60, help="Timeout per query in seconds")
+    parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations")
+    parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
+    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
+    parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)")
+    parser.add_argument("--model", default=None, help="Optional backend model identifier")
+    parser.add_argument("--backend", default="auto", choices=["auto", "claude", "codex"], help="Optimization backend")
+    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
+    parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto', 'none' to disable)")
+    parser.add_argument("--results-dir", default=None, help="Save outputs to a timestamped subdirectory here")
+    args = parser.parse_args()
+    # JSON interchange text is UTF-8; do not depend on the locale's default encoding.
+    eval_set = json.loads(Path(args.eval_set).read_text(encoding="utf-8"))
+    skill_path = Path(args.skill_path)
+    if not (skill_path / "SKILL.md").exists():
+        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
+        sys.exit(1)
+    name, _, _ = parse_skill_md(skill_path)
+    if args.report != "none":
+        if args.report == "auto":
+            timestamp = time.strftime("%Y%m%d_%H%M%S")
+            live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
+        else:
+            live_report_path = Path(args.report)
+        # Placeholder page that reloads itself until the first real report is written.
+        live_report_path.write_text("<html><body><h1>Starting optimization loop...</h1><meta http-equiv='refresh' content='5'></body></html>")
+        # webbrowser expects a URL; a bare (possibly relative) file path is not
+        # portable, so hand it an absolute file:// URI instead.
+        webbrowser.open(live_report_path.resolve().as_uri())
+    else:
+        live_report_path = None
+    if args.results_dir:
+        timestamp = time.strftime("%Y-%m-%d_%H%M%S")
+        results_dir = Path(args.results_dir) / timestamp
+        results_dir.mkdir(parents=True, exist_ok=True)
+    else:
+        results_dir = None
+    log_dir = results_dir / "logs" if results_dir else None
+    output = run_loop(
+        eval_set=eval_set,
+        skill_path=skill_path,
+        description_override=args.description,
+        num_workers=args.num_workers,
+        timeout=args.timeout,
+        max_iterations=args.max_iterations,
+        runs_per_query=args.runs_per_query,
+        trigger_threshold=args.trigger_threshold,
+        holdout=args.holdout,
+        model=args.model,
+        backend=args.backend,
+        verbose=args.verbose,
+        live_report_path=live_report_path,
+        log_dir=log_dir,
+    )
+    json_output = json.dumps(output, indent=2)
+    print(json_output)
+    if results_dir:
+        (results_dir / "results.json").write_text(json_output)
+    if live_report_path:
+        # Render the final report once and reuse it for both destinations.
+        final_html = generate_html(output, auto_refresh=False, skill_name=name)
+        live_report_path.write_text(final_html)
+        print(f"\nReport: {live_report_path}", file=sys.stderr)
+        if results_dir:
+            (results_dir / "report.html").write_text(final_html)
+    if results_dir:
+        print(f"Results saved to: {results_dir}", file=sys.stderr)
+# Run the CLI only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()

package/skills/conversation-to-skill/scripts/utils.py ADDED Viewed

@@ -0,0 +1,223 @@
+"""Shared utilities for skill-creator scripts."""
+from __future__ import annotations
+import re
+from pathlib import Path
+# Anchored at the start of the text: an opening ``---`` line, a lazily captured
+# body, and a closing ``---`` line that is followed by a newline or end of text.
+FRONTMATTER_PATTERN = re.compile(r"^---\s*\n(.*?)\n---\s*(?:\n|$)", re.DOTALL)
+def extract_frontmatter_text(content: str) -> str:
+    """Extract the text between the opening and closing ``---`` fences of a SKILL.md.
+    Raises:
+        ValueError: If ``content`` does not begin with a fenced frontmatter block.
+    """
+    found = FRONTMATTER_PATTERN.match(content)
+    if found is not None:
+        return found.group(1)
+    raise ValueError("SKILL.md missing frontmatter (expected opening and closing ---)")
+def _count_indent(line: str) -> int:
+    """Return the number of leading space characters in ``line`` (tabs are not counted)."""
+    without_leading_spaces = line.lstrip(" ")
+    return len(line) - len(without_leading_spaces)
+def _split_flow_items(inner: str) -> list[str]:
+    """Split the inside of a ``[...]`` flow list on its top-level commas.
+    Commas inside a quoted item or inside nested brackets do not split. A quote
+    character only opens a quoted span at the start of an item, so an apostrophe
+    in the middle of a plain word is treated as ordinary text.
+    """
+    parts: list[str] = []
+    current: list[str] = []
+    quote = ""
+    depth = 0
+    for char in inner:
+        if quote:
+            if char == quote:
+                quote = ""
+        elif char in {'"', "'"} and not "".join(current).strip():
+            quote = char
+        elif char == "[":
+            depth += 1
+        elif char == "]" and depth > 0:
+            depth -= 1
+        elif char == "," and depth == 0:
+            parts.append("".join(current))
+            current = []
+            continue
+        current.append(char)
+    parts.append("".join(current))
+    return parts
+def _parse_scalar(value: str):
+    """Convert one inline YAML value to a Python object.
+    Recognised forms, checked in order: empty text (``""``), a value wrapped in
+    matching single or double quotes (returned without the quotes),
+    ``true``/``false``, ``null``/``~``, integers, simple decimals, and
+    ``[a, b]`` flow lists whose items are parsed recursively. Anything else is
+    returned as the stripped string.
+    """
+    value = value.strip()
+    if not value:
+        return ""
+    # Require two characters so a lone quote is not mistaken for an empty quoted string.
+    if len(value) >= 2 and value[0] == value[-1] and value[0] in {'"', "'"}:
+        return value[1:-1]
+    lowered = value.lower()
+    if lowered in {"true", "false"}:
+        return lowered == "true"
+    if lowered in {"null", "~"}:
+        return None
+    if re.fullmatch(r"-?\d+", value):
+        return int(value)
+    if re.fullmatch(r"-?\d+\.\d+", value):
+        return float(value)
+    if value.startswith("[") and value.endswith("]"):
+        inner = value[1:-1].strip()
+        if not inner:
+            return []
+        return [_parse_scalar(part) for part in _split_flow_items(inner)]
+    return value
+def _fold_lines(lines: list[str]) -> str:
+    """Fold lines the way a ``>`` block scalar reads: join runs with spaces, split on blanks.
+    Consecutive non-empty lines become one space-joined paragraph; empty strings
+    separate paragraphs. Paragraphs that are empty after stripping are dropped,
+    and the rest are joined with a blank line between them.
+    """
+    groups: list[list[str]] = []
+    pending: list[str] = []
+    for text in lines:
+        if text != "":
+            pending.append(text)
+        elif pending:
+            groups.append(pending)
+            pending = []
+    if pending:
+        groups.append(pending)
+    stripped = (" ".join(group).strip() for group in groups)
+    return "\n\n".join(piece for piece in stripped if piece)
+def _parse_block_scalar(lines: list[str], start: int, indent: int, style: str):
+    """Collect a block scalar starting at ``lines[start]``.
+    Args:
+        lines: All frontmatter lines.
+        start: Index of the first candidate content line.
+        indent: Minimum indentation for a line to belong to the block; the first
+            non-blank line indented less than this ends the block.
+        style: Block indicator. A value starting with ``|`` keeps line breaks;
+            anything else folds the lines via ``_fold_lines``.
+    Returns:
+        ``(value, next_index)`` where ``next_index`` is the first line not consumed.
+    """
+    collected: list[str] = []
+    block_indent = None
+    index = start
+    while index < len(lines):
+        raw = lines[index]
+        if not raw.strip():
+            collected.append("")
+            index += 1
+            continue
+        current_indent = _count_indent(raw)
+        if current_indent < indent:
+            break
+        if block_indent is None:
+            # Take the block's indentation from its first content line, so a block
+            # indented deeper than the minimum does not keep stray leading spaces.
+            block_indent = current_indent
+        collected.append(raw[min(current_indent, block_indent):])
+        index += 1
+    # Blank lines after the last content line only separate the block from what
+    # follows; without this they would leak into literal ("|") values.
+    while collected and collected[-1] == "":
+        collected.pop()
+    if style.startswith("|"):
+        value = "\n".join(collected)
+    else:
+        value = _fold_lines(collected)
+    return value, index
+def _looks_like_mapping_entry(text: str) -> bool:
+    """Return True when ``text`` starts with a ``key:`` mapping entry.
+    The colon must be followed by whitespace or the end of the line, so plain
+    text such as ``http://example.com`` or ``a:b`` is not mistaken for a key.
+    Lines starting with ``- `` are list items and never match.
+    """
+    if text.startswith("- "):
+        return False
+    return re.match(r"[A-Za-z0-9_-]+:(?:\s|$)", text) is not None
+def _next_nonempty_index(lines: list[str], start: int, min_indent: int):
+    """Find the first non-blank line at or after ``start``.
+    Returns its index when it is indented by at least ``min_indent`` spaces.
+    Returns ``None`` when that line is indented less, or when only blank lines
+    (or no lines) remain.
+    """
+    for position in range(start, len(lines)):
+        candidate = lines[position]
+        if candidate.strip():
+            leading_spaces = len(candidate) - len(candidate.lstrip(" "))
+            return position if leading_spaces >= min_indent else None
+    return None
+def _parse_list(lines: list[str], start: int, indent: int) -> tuple[list, int]:
+    """Parse a block list whose ``- `` entries sit at exactly ``indent`` spaces.
+    Args:
+        lines: All frontmatter lines.
+        start: Index of the first list entry.
+        indent: Column at which every entry of this list must start.
+    Returns:
+        ``(items, next_index)`` where ``next_index`` is the first line not consumed.
+    Raises:
+        ValueError: If a line at or below this level is indented differently from
+            ``indent`` or does not start with ``- ``.
+    """
+    items = []
+    index = start
+    while index < len(lines):
+        raw = lines[index]
+        if not raw.strip():
+            # Blank lines between entries are skipped.
+            index += 1
+            continue
+        current_indent = _count_indent(raw)
+        if current_indent < indent:
+            # A dedent ends this list; the caller continues from ``index``.
+            break
+        # The entry marker must be "- " (dash plus space); a bare "-" is rejected here.
+        if current_indent != indent or not raw[indent:].startswith("- "):
+            raise ValueError(f"Invalid list entry near line: {raw}")
+        remainder = raw[indent + 2 :].strip()
+        index += 1
+        if remainder:
+            # Inline value on the same line as the dash, e.g. "- foo".
+            items.append(_parse_scalar(remainder))
+            continue
+        # Nothing after the dash: the value, if any, is on the following deeper lines.
+        next_index = _next_nonempty_index(lines, index, indent + 2)
+        if next_index is None:
+            items.append("")
+            continue
+        nested_indent = _count_indent(lines[next_index])
+        nested_text = lines[next_index][nested_indent:]
+        # Dispatch on the first nested line: sub-list, sub-mapping, or folded text.
+        if nested_text.startswith("- "):
+            value, index = _parse_list(lines, next_index, nested_indent)
+        elif _looks_like_mapping_entry(nested_text):
+            value, index = _parse_mapping(lines, next_index, nested_indent)
+        else:
+            value, index = _parse_block_scalar(lines, next_index, nested_indent, ">")
+        items.append(value)
+    return items, index
+def _parse_mapping(lines: list[str], start: int, indent: int):
+    """Parse a block mapping whose keys sit at exactly ``indent`` spaces.
+    Args:
+        lines: All frontmatter lines.
+        start: Index of the first ``key:`` line.
+        indent: Column at which every key of this mapping must start.
+    Returns:
+        ``(mapping, next_index)`` where ``next_index`` is the first line not
+        consumed. A key that appears twice keeps its last value.
+    Raises:
+        ValueError: On a line indented deeper than ``indent`` where a key is
+            expected, on a list item at key level, or on a line without a colon.
+    """
+    mapping = {}
+    index = start
+    while index < len(lines):
+        raw = lines[index]
+        if not raw.strip():
+            index += 1
+            continue
+        current_indent = _count_indent(raw)
+        if current_indent < indent:
+            # A dedent ends this mapping; the caller continues from ``index``.
+            break
+        if current_indent != indent:
+            raise ValueError(f"Unexpected indentation near line: {raw}")
+        text = raw[indent:]
+        if text.startswith("- "):
+            raise ValueError(f"Unexpected list item near line: {raw}")
+        if ":" not in text:
+            raise ValueError(f"Invalid mapping entry near line: {raw}")
+        key, remainder = text.split(":", 1)
+        key = key.strip()
+        remainder = remainder.strip()
+        index += 1
+        # Block indicators, including the "+" (keep) chomping forms; without those
+        # two, "key: |+" was read as a plain string and the indented body then
+        # failed with "Unexpected indentation". Chomping itself is not modelled:
+        # only literal ("|") versus folded (">") is distinguished.
+        if remainder in {"|", ">", "|-", ">-", "|+", ">+"}:
+            value, index = _parse_block_scalar(lines, index, indent + 2, remainder)
+        elif remainder:
+            value = _parse_scalar(remainder)
+        else:
+            # Bare "key:" — the value, if any, is on the following deeper lines.
+            next_index = _next_nonempty_index(lines, index, indent + 2)
+            if next_index is None:
+                value = ""
+            else:
+                nested_indent = _count_indent(lines[next_index])
+                nested_text = lines[next_index][nested_indent:]
+                # Dispatch on the first nested line: list, mapping, or folded text.
+                if nested_text.startswith("- "):
+                    value, index = _parse_list(lines, next_index, nested_indent)
+                elif _looks_like_mapping_entry(nested_text):
+                    value, index = _parse_mapping(lines, next_index, nested_indent)
+                else:
+                    value, index = _parse_block_scalar(lines, next_index, nested_indent, ">")
+        mapping[key] = value
+    return mapping, index
+def parse_frontmatter(frontmatter_text: str) -> dict:
+    """Parse SKILL.md frontmatter, written in a small YAML subset, without PyYAML.
+    Raises:
+        ValueError: If the text is not a valid top-level mapping, or if non-blank
+            content remains after the mapping has been parsed.
+    """
+    rows = frontmatter_text.splitlines()
+    parsed, stop = _parse_mapping(rows, 0, 0)
+    for leftover in rows[stop:]:
+        if leftover.strip():
+            raise ValueError(f"Unexpected trailing content in frontmatter: {leftover}")
+    return parsed
+def load_skill_frontmatter(skill_path: Path) -> tuple[dict, str]:
+    """Load and parse the frontmatter from a skill directory.
+    Args:
+        skill_path: Directory containing ``SKILL.md``.
+    Returns:
+        ``(frontmatter, content)``: the parsed mapping and the full file text.
+    Raises:
+        ValueError: If the frontmatter is missing, malformed, or not a mapping.
+        OSError: If ``SKILL.md`` cannot be read.
+    """
+    # Read as UTF-8 explicitly: the platform default encoding (for example cp1252
+    # on Windows) would mis-decode or reject non-ASCII text in the file.
+    content = (skill_path / "SKILL.md").read_text(encoding="utf-8")
+    frontmatter_text = extract_frontmatter_text(content)
+    frontmatter = parse_frontmatter(frontmatter_text)
+    if not isinstance(frontmatter, dict):
+        raise ValueError("Frontmatter must be a mapping")
+    return frontmatter, content
+def parse_skill_md(skill_path: Path) -> tuple[str, str, str]:
+    """Read a skill's SKILL.md and return ``(name, description, full_content)``.
+    Missing or falsy ``name``/``description`` values come back as empty strings;
+    other values are converted with ``str``.
+    """
+    meta, raw_text = load_skill_frontmatter(skill_path)
+    name_text, description_text = (str(meta.get(field, "") or "") for field in ("name", "description"))
+    return name_text, description_text, raw_text

package/skills/rudder/references/organization-skills.md CHANGED Viewed

@@ -152,7 +152,7 @@ the full optional enabled-skill set intentionally.
 ## Notes
-- Built-in Rudder skills live in the organization library but are not auto-enabled.
+- Built-in Rudder skills live in the organization library and are always loaded for agent runs.
 - New organizations also seed optional community preset skills into the organization library. They stay organization-managed and default-off for agents.
 - If a skill reference is missing or ambiguous, Rudder returns `422`.
 - Prefer linking back to the relevant issue, approval, and agent when commenting about skill changes.

package/skills/skill-creator/SKILL.md ADDED Viewed

@@ -0,0 +1,9 @@
+---
+name: skill-creator
+description: |
+  Create new skills, improve existing skills, and evaluate whether a skill definition is actually doing useful work.
+---
+# Skill Creator
+Use this skill when the task is to create a skill, refine a skill, or judge whether a skill definition should be changed.

package/skills/skill-optimizer/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,29 @@
+# Changelog
+## v4.3 framing evidence hardening
+- Added sequencing guidance for requests that ask to optimize a skill after another live task: complete and verify the primary task first, then optimize from evidence.
+- Added explicit treatment of strong user corrections as high-signal evidence, especially framing corrections and wrong-abstraction-level failures.
+- Added framing checks for user outcome vs UI surface, scenario spine vs fixture rows, source of truth vs derivative signal, and product intent vs local convenience.
+- Relaxed the final response contract for larger workflows so the primary task result can be reported before concise skill changes.
+## v4.2 open-source package
+- Added package mode and open-source project structure guidance.
+- Added explicit adapter file lookup under `references/adapters/`.
+- Added packaging expectations for README, examples, evals, changelog, and distributable skill zip.
+- Preserved the generic analysis framework: core optimizer plus modular domain adapters.
+## v4.1 adapter hardening
+- Added explicit domain adapter use rule: source of truth, required inputs, review owner, authority gates, privacy, output template, validation cases, and must-not behaviors.
+- Added benchmark reporting split for trigger accuracy, patch-quality coverage, and downstream transfer.
+- Added warning that synthetic verifier scores are regression signals, not official leaderboard results.
+## v4.0 generic
+- Reframed Skill Optimizer from a software-focused hardening checklist into a domain-general analysis framework.
+- Added universal optimization lens covering purpose, triggers, inputs, workflow, tools, outputs, quality, safety, failure, and maintainability.
+- Moved domain-specific checks into modular adapter patterns.
+- Added trigger optimization guidance and benchmark mode.
+- Preserved strict patch safety around high-impact actions.