probegen 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- probegen-0.1.0/PKG-INFO +96 -0
- probegen-0.1.0/README.md +71 -0
- probegen-0.1.0/probegen/__init__.py +5 -0
- probegen-0.1.0/probegen/__main__.py +7 -0
- probegen-0.1.0/probegen/cli/__init__.py +1 -0
- probegen-0.1.0/probegen/cli/doctor_cmd.py +146 -0
- probegen-0.1.0/probegen/cli/embed_batch.py +54 -0
- probegen-0.1.0/probegen/cli/find_similar.py +76 -0
- probegen-0.1.0/probegen/cli/get_behavior_diff.py +255 -0
- probegen-0.1.0/probegen/cli/init_cmd.py +542 -0
- probegen-0.1.0/probegen/cli/main.py +35 -0
- probegen-0.1.0/probegen/cli/post_comment.py +98 -0
- probegen-0.1.0/probegen/cli/resolve_run_id.py +64 -0
- probegen-0.1.0/probegen/cli/run_stage.py +113 -0
- probegen-0.1.0/probegen/cli/setup_mcp.py +62 -0
- probegen-0.1.0/probegen/cli/write_probes.py +200 -0
- probegen-0.1.0/probegen/config.py +158 -0
- probegen-0.1.0/probegen/context.py +196 -0
- probegen-0.1.0/probegen/errors.py +72 -0
- probegen-0.1.0/probegen/export.py +111 -0
- probegen-0.1.0/probegen/github.py +299 -0
- probegen-0.1.0/probegen/integrations/__init__.py +1 -0
- probegen-0.1.0/probegen/integrations/braintrust.py +110 -0
- probegen-0.1.0/probegen/integrations/langsmith.py +108 -0
- probegen-0.1.0/probegen/integrations/phoenix.py +115 -0
- probegen-0.1.0/probegen/integrations/promptfoo.py +128 -0
- probegen-0.1.0/probegen/models/__init__.py +34 -0
- probegen-0.1.0/probegen/models/_base.py +9 -0
- probegen-0.1.0/probegen/models/eval_case.py +117 -0
- probegen-0.1.0/probegen/models/manifests.py +122 -0
- probegen-0.1.0/probegen/models/probes.py +98 -0
- probegen-0.1.0/probegen/models/raw_change_data.py +64 -0
- probegen-0.1.0/probegen/prompts/__init__.py +1 -0
- probegen-0.1.0/probegen/prompts/stage1_template.py +82 -0
- probegen-0.1.0/probegen/prompts/stage2_template.py +38 -0
- probegen-0.1.0/probegen/prompts/stage3_template.py +156 -0
- probegen-0.1.0/probegen/stages/__init__.py +1 -0
- probegen-0.1.0/probegen/stages/_common.py +168 -0
- probegen-0.1.0/probegen/stages/stage1.py +39 -0
- probegen-0.1.0/probegen/stages/stage2.py +39 -0
- probegen-0.1.0/probegen/stages/stage3.py +57 -0
- probegen-0.1.0/probegen/tools/__init__.py +1 -0
- probegen-0.1.0/probegen/tools/embedding.py +216 -0
- probegen-0.1.0/probegen/tools/similarity.py +83 -0
- probegen-0.1.0/probegen/write_probes.py +7 -0
- probegen-0.1.0/probegen.egg-info/PKG-INFO +96 -0
- probegen-0.1.0/probegen.egg-info/SOURCES.txt +51 -0
- probegen-0.1.0/probegen.egg-info/dependency_links.txt +1 -0
- probegen-0.1.0/probegen.egg-info/entry_points.txt +2 -0
- probegen-0.1.0/probegen.egg-info/requires.txt +17 -0
- probegen-0.1.0/probegen.egg-info/top_level.txt +1 -0
- probegen-0.1.0/pyproject.toml +50 -0
- probegen-0.1.0/setup.cfg +4 -0
probegen-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: probegen
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Change-coupled eval probe generation for LLM systems
|
|
5
|
+
Author: OpenAI Codex
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: arize-phoenix-client==2.0.0
|
|
10
|
+
Requires-Dist: braintrust==0.9.0
|
|
11
|
+
Requires-Dist: claude-agent-sdk==0.1.48
|
|
12
|
+
Requires-Dist: click==8.3.1
|
|
13
|
+
Requires-Dist: httpx==0.28.1
|
|
14
|
+
Requires-Dist: langsmith==0.7.17
|
|
15
|
+
Requires-Dist: numpy==2.4.3
|
|
16
|
+
Requires-Dist: openai==2.28.0
|
|
17
|
+
Requires-Dist: pydantic==2.12.5
|
|
18
|
+
Requires-Dist: PyYAML==6.0.3
|
|
19
|
+
Requires-Dist: rich==14.3.3
|
|
20
|
+
Requires-Dist: tiktoken==0.12.0
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest==9.0.2; extra == "dev"
|
|
23
|
+
Requires-Dist: pytest-cov==7.0.0; extra == "dev"
|
|
24
|
+
Requires-Dist: respx==0.22.0; extra == "dev"
|
|
25
|
+
|
|
26
|
+
# Probegen
|
|
27
|
+
|
|
28
|
+
Probegen detects behaviorally significant pull request changes in LLM systems and proposes targeted evaluation probes for review before writing them to an evaluation platform. Probegen is **non-blocking** — it runs as a parallel CI job and never prevents PR merges.
|
|
29
|
+
|
|
30
|
+
## What it does
|
|
31
|
+
|
|
32
|
+
Probegen runs in CI on pull requests. It:
|
|
33
|
+
|
|
34
|
+
1. Detects changes to prompts, instructions, guardrails, validators, tool descriptions, classifiers, retry policies, output schemas, and other agent harness artifacts that are likely to alter agent behavior.
|
|
35
|
+
2. Retrieves nearby evaluation coverage from your existing eval stack when mappings exist.
|
|
36
|
+
3. Falls back to starter probe generation when no eval corpus exists yet.
|
|
37
|
+
4. Generates ranked probe proposals tailored to the specific change, including multi-turn conversational probes when the agent is conversational.
|
|
38
|
+
5. Exports those probes as files and, after explicit approval, writes them to the configured platform.
|
|
39
|
+
|
|
40
|
+
Probegen is not an eval runner. It generates eval inputs that plug into LangSmith, Braintrust, Arize Phoenix, Promptfoo, or file-based workflows.
|
|
41
|
+
|
|
42
|
+
Probegen works out of the box even if you have no evals yet. In that case it generates plausible starter probes from the diff, system prompt or guardrails, and whatever product context you provide. The more eval coverage and product detail you give it, the sharper its novelty detection and boundary analysis become.
|
|
43
|
+
|
|
44
|
+
## Prerequisites
|
|
45
|
+
|
|
46
|
+
- Python 3.11+
|
|
47
|
+
- Node.js 22+ — required in CI by the GitHub Action (installed automatically). Only needed locally if running `probegen run-stage` directly.
|
|
48
|
+
- An Anthropic API key
|
|
49
|
+
- An eval platform API key only if you want direct platform integration or automatic writeback
|
|
50
|
+
|
|
51
|
+
## Quick Start (GitHub Action)
|
|
52
|
+
|
|
53
|
+
1. Install the package: `pip install probegen`
|
|
54
|
+
2. Run interactive setup: `probegen init` — generates `probegen.yaml`, workflow file, and `context/` stubs
|
|
55
|
+
3. Fill in `context/product.md` and `context/bad_examples.md` (and other context files for best results)
|
|
56
|
+
4. Add GitHub secrets:
|
|
57
|
+
|
|
58
|
+
| Secret | Purpose | Where to get it |
|
|
59
|
+
|---|---|---|
|
|
60
|
+
| `ANTHROPIC_API_KEY` | Required — powers all three stages | console.anthropic.com → API Keys |
|
|
61
|
+
| `OPENAI_API_KEY` | Required for coverage-aware mode | platform.openai.com → API Keys |
|
|
62
|
+
| `LANGSMITH_API_KEY` | If using LangSmith | smith.langchain.com → Settings |
|
|
63
|
+
| `BRAINTRUST_API_KEY` | If using Braintrust | braintrust.dev → Settings |
|
|
64
|
+
| `PHOENIX_API_KEY` | If using Arize Phoenix | app.phoenix.arize.com → Settings |
|
|
65
|
+
|
|
66
|
+
5. Create the approval label in GitHub:
|
|
67
|
+
```
|
|
68
|
+
gh label create "probegen:approve" --color 0075ca --description "Approve Probegen probe writeback"
|
|
69
|
+
```
|
|
70
|
+
6. Commit `probegen.yaml`, `.github/workflows/probegen.yml`, and `context/`.
|
|
71
|
+
7. Open a PR that touches a prompt or guardrail.
|
|
72
|
+
8. Run `probegen doctor` to verify your setup.
|
|
73
|
+
|
|
74
|
+
## Cost control
|
|
75
|
+
|
|
76
|
+
Each stage has a configurable Anthropic API spend budget (see `budgets:` in `probegen.yaml`). Typical costs per PR:
|
|
77
|
+
|
|
78
|
+
- Stage 1 (change detection): $0.05–0.30
|
|
79
|
+
- Stage 2 (coverage analysis): $0.10–0.50
|
|
80
|
+
- Stage 3 (probe generation): $0.10–0.60
|
|
81
|
+
|
|
82
|
+
Increase budget limits if stages time out on large diffs or complex repos.
|
|
83
|
+
|
|
84
|
+
## Advanced Configuration
|
|
85
|
+
|
|
86
|
+
The full configuration reference is available in [probegen.yaml.example](probegen.yaml.example).
|
|
87
|
+
|
|
88
|
+
## Real example quickstart
|
|
89
|
+
|
|
90
|
+
If you want to test Probegen against a real LangGraph repo instead of wiring everything from scratch, use the in-repo demo under [examples/langgraph-agentic-rag](examples/langgraph-agentic-rag) and follow [examples/langgraph-agentic-rag/docs/quickstart.md](examples/langgraph-agentic-rag/docs/quickstart.md).
|
|
91
|
+
|
|
92
|
+
## Context pack and trace safety
|
|
93
|
+
|
|
94
|
+
Probegen works without a context pack, but probe quality drops significantly. At minimum, fill in product context and known failure modes. This matters even more in starter mode, where Probegen has no existing eval corpus to compare against.
|
|
95
|
+
|
|
96
|
+
Production traces are never sanitized by the tool. If you add files under `context/traces/`, anonymize them first. Remove names, emails, account IDs, and any other sensitive data before committing them.
|
probegen-0.1.0/README.md
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# Probegen
|
|
2
|
+
|
|
3
|
+
Probegen detects behaviorally significant pull request changes in LLM systems and proposes targeted evaluation probes for review before writing them to an evaluation platform. Probegen is **non-blocking** — it runs as a parallel CI job and never prevents PR merges.
|
|
4
|
+
|
|
5
|
+
## What it does
|
|
6
|
+
|
|
7
|
+
Probegen runs in CI on pull requests. It:
|
|
8
|
+
|
|
9
|
+
1. Detects changes to prompts, instructions, guardrails, validators, tool descriptions, classifiers, retry policies, output schemas, and other agent harness artifacts that are likely to alter agent behavior.
|
|
10
|
+
2. Retrieves nearby evaluation coverage from your existing eval stack when mappings exist.
|
|
11
|
+
3. Falls back to starter probe generation when no eval corpus exists yet.
|
|
12
|
+
4. Generates ranked probe proposals tailored to the specific change, including multi-turn conversational probes when the agent is conversational.
|
|
13
|
+
5. Exports those probes as files and, after explicit approval, writes them to the configured platform.
|
|
14
|
+
|
|
15
|
+
Probegen is not an eval runner. It generates eval inputs that plug into LangSmith, Braintrust, Arize Phoenix, Promptfoo, or file-based workflows.
|
|
16
|
+
|
|
17
|
+
Probegen works out of the box even if you have no evals yet. In that case it generates plausible starter probes from the diff, system prompt or guardrails, and whatever product context you provide. The more eval coverage and product detail you give it, the sharper its novelty detection and boundary analysis become.
|
|
18
|
+
|
|
19
|
+
## Prerequisites
|
|
20
|
+
|
|
21
|
+
- Python 3.11+
|
|
22
|
+
- Node.js 22+ — required in CI by the GitHub Action (installed automatically). Only needed locally if running `probegen run-stage` directly.
|
|
23
|
+
- An Anthropic API key
|
|
24
|
+
- An eval platform API key only if you want direct platform integration or automatic writeback
|
|
25
|
+
|
|
26
|
+
## Quick Start (GitHub Action)
|
|
27
|
+
|
|
28
|
+
1. Install the package: `pip install probegen`
|
|
29
|
+
2. Run interactive setup: `probegen init` — generates `probegen.yaml`, workflow file, and `context/` stubs
|
|
30
|
+
3. Fill in `context/product.md` and `context/bad_examples.md` (and other context files for best results)
|
|
31
|
+
4. Add GitHub secrets:
|
|
32
|
+
|
|
33
|
+
| Secret | Purpose | Where to get it |
|
|
34
|
+
|---|---|---|
|
|
35
|
+
| `ANTHROPIC_API_KEY` | Required — powers all three stages | console.anthropic.com → API Keys |
|
|
36
|
+
| `OPENAI_API_KEY` | Required for coverage-aware mode | platform.openai.com → API Keys |
|
|
37
|
+
| `LANGSMITH_API_KEY` | If using LangSmith | smith.langchain.com → Settings |
|
|
38
|
+
| `BRAINTRUST_API_KEY` | If using Braintrust | braintrust.dev → Settings |
|
|
39
|
+
| `PHOENIX_API_KEY` | If using Arize Phoenix | app.phoenix.arize.com → Settings |
|
|
40
|
+
|
|
41
|
+
5. Create the approval label in GitHub:
|
|
42
|
+
```
|
|
43
|
+
gh label create "probegen:approve" --color 0075ca --description "Approve Probegen probe writeback"
|
|
44
|
+
```
|
|
45
|
+
6. Commit `probegen.yaml`, `.github/workflows/probegen.yml`, and `context/`.
|
|
46
|
+
7. Open a PR that touches a prompt or guardrail.
|
|
47
|
+
8. Run `probegen doctor` to verify your setup.
|
|
48
|
+
|
|
49
|
+
## Cost control
|
|
50
|
+
|
|
51
|
+
Each stage has a configurable Anthropic API spend budget (see `budgets:` in `probegen.yaml`). Typical costs per PR:
|
|
52
|
+
|
|
53
|
+
- Stage 1 (change detection): $0.05–0.30
|
|
54
|
+
- Stage 2 (coverage analysis): $0.10–0.50
|
|
55
|
+
- Stage 3 (probe generation): $0.10–0.60
|
|
56
|
+
|
|
57
|
+
Increase budget limits if stages time out on large diffs or complex repos.
|
|
58
|
+
|
|
59
|
+
## Advanced Configuration
|
|
60
|
+
|
|
61
|
+
The full configuration reference is available in [probegen.yaml.example](probegen.yaml.example).
|
|
62
|
+
|
|
63
|
+
## Real example quickstart
|
|
64
|
+
|
|
65
|
+
If you want to test Probegen against a real LangGraph repo instead of wiring everything from scratch, use the in-repo demo under [examples/langgraph-agentic-rag](examples/langgraph-agentic-rag) and follow [examples/langgraph-agentic-rag/docs/quickstart.md](examples/langgraph-agentic-rag/docs/quickstart.md).
|
|
66
|
+
|
|
67
|
+
## Context pack and trace safety
|
|
68
|
+
|
|
69
|
+
Probegen works without a context pack, but probe quality drops significantly. At minimum, fill in product context and known failure modes. This matters even more in starter mode, where Probegen has no existing eval corpus to compare against.
|
|
70
|
+
|
|
71
|
+
Production traces are never sanitized by the tool. If you add files under `context/traces/`, anonymize them first. Remove names, emails, account IDs, and any other sensitive data before committing them.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import fnmatch
|
|
4
|
+
import os
|
|
5
|
+
import subprocess
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
|
|
10
|
+
from probegen.config import ProbegenConfig
|
|
11
|
+
from probegen.errors import ConfigError
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _git_ls_files(cwd: Path) -> list[str]:
|
|
15
|
+
try:
|
|
16
|
+
completed = subprocess.run(
|
|
17
|
+
["git", "ls-files"],
|
|
18
|
+
cwd=cwd,
|
|
19
|
+
check=True,
|
|
20
|
+
capture_output=True,
|
|
21
|
+
text=True,
|
|
22
|
+
)
|
|
23
|
+
return completed.stdout.splitlines()
|
|
24
|
+
except (subprocess.CalledProcessError, FileNotFoundError):
|
|
25
|
+
return []
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@click.command("doctor")
@click.option("--config", "config_path", default="probegen.yaml", show_default=True, type=click.Path(dir_okay=False, path_type=Path))
@click.option("--ci", is_flag=True, help="Run additional CI-specific checks (requires GITHUB_TOKEN).")
def doctor_command(config_path: Path, ci: bool) -> None:
    """Verify Probegen setup and report any issues.

    Runs a sequence of environment and configuration checks, prints a
    pass/fail line for each, and finishes with a summary count. Never raises
    on a failed check — failures are reported, not fatal.
    """
    checks: list[tuple[bool, str]] = []
    root = Path.cwd()

    # Check 1: probegen.yaml exists — nothing else can be validated without it.
    config_exists = config_path.exists()
    checks.append((config_exists, f"probegen.yaml found at {config_path}"))
    if not config_exists:
        click.echo(_format_checks(checks))
        # Plain string literal: the message has no placeholders (the original
        # used an f-string with nothing to interpolate — lint F541).
        click.echo("\nRun `probegen init` to create probegen.yaml.")
        return

    # Check 2: config parses and validates.
    config: ProbegenConfig | None = None
    try:
        config = ProbegenConfig.load(config_path, allow_missing=False)
        checks.append((True, "probegen.yaml is valid"))
    except ConfigError as exc:
        checks.append((False, f"probegen.yaml has errors: {exc}"))

    if config is not None:
        # Check 3: ANTHROPIC_API_KEY (required for all stages).
        anthropic_key = os.environ.get("ANTHROPIC_API_KEY", "")
        checks.append((bool(anthropic_key), "ANTHROPIC_API_KEY is set"))

        # Check 4: per-platform API keys, only for platforms that are configured.
        if config.platforms.langsmith:
            key_name = config.platforms.langsmith.api_key_env
            checks.append((bool(os.environ.get(key_name)), f"{key_name} is set (langsmith)"))

        if config.platforms.braintrust:
            key_name = config.platforms.braintrust.api_key_env
            checks.append((bool(os.environ.get(key_name)), f"{key_name} is set (braintrust)"))

        if config.platforms.arize_phoenix:
            key_name = config.platforms.arize_phoenix.api_key_env
            checks.append((bool(os.environ.get(key_name)), f"{key_name} is set (arize_phoenix)"))

        # Check 5: OPENAI_API_KEY is needed for embeddings when mappings exist.
        if config.mappings:
            openai_key = os.environ.get("OPENAI_API_KEY", "")
            checks.append((bool(openai_key), "OPENAI_API_KEY is set (required for coverage-aware mode)"))

        # Check 6: every configured hint pattern should match at least one
        # tracked file, otherwise it is dead configuration.
        tracked_files = _git_ls_files(root)
        if tracked_files:
            all_patterns = [*config.behavior_artifacts.paths, *config.guardrail_artifacts.paths]
            if all_patterns:
                for pattern in all_patterns:
                    # NOTE(review): exclusions come from behavior_artifacts only,
                    # even for guardrail patterns — confirm guardrail_artifacts.exclude
                    # is intentionally ignored here.
                    matched = [
                        f for f in tracked_files
                        if fnmatch.fnmatch(f, pattern)
                        and not any(fnmatch.fnmatch(f, ex) for ex in config.behavior_artifacts.exclude)
                    ]
                    checks.append((
                        bool(matched),
                        f"Pattern '{pattern}' matches {len(matched)} tracked file(s)",
                    ))
            else:
                checks.append((False, "No hint patterns configured in behavior_artifacts or guardrail_artifacts"))

        # Check 7: key context files exist and are non-empty.
        context_files = [
            config.context.product,
            config.context.bad_examples,
        ]
        for rel_path in context_files:
            full_path = root / rel_path
            non_empty = full_path.exists() and full_path.stat().st_size > 0
            checks.append((non_empty, f"Context file {rel_path} exists and is non-empty"))

        # Check 8: in CI, verify the approval label exists on the repository.
        if ci:
            token = os.environ.get("GITHUB_TOKEN", "")
            repo = os.environ.get("GITHUB_REPOSITORY", "")
            label_name = config.approval.label
            if token and repo:
                label_ok = _check_github_label(repo, token, label_name)
                checks.append((label_ok, f"GitHub label '{label_name}' exists in {repo}"))
            else:
                checks.append((False, "GITHUB_TOKEN or GITHUB_REPOSITORY not set — skipping label check"))

    click.echo(_format_checks(checks))
    passed = sum(1 for ok, _ in checks if ok)
    total = len(checks)
    click.echo(f"\n{passed}/{total} checks passed.")
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _check_github_label(repo: str, token: str, label_name: str) -> bool:
    """Return True iff *label_name* exists on the GitHub repo (best-effort).

    Any failure — missing httpx, network error, bad token, absent label —
    yields False rather than raising, since this backs a doctor check.
    """
    url = f"https://api.github.com/repos/{repo}/labels/{label_name}"
    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {token}",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    try:
        import httpx

        response = httpx.get(url, headers=headers, timeout=10.0)
    except Exception:
        return False
    return response.status_code == 200
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _format_checks(checks: list[tuple[bool, str]]) -> str:
|
|
138
|
+
lines = []
|
|
139
|
+
for ok, message in checks:
|
|
140
|
+
symbol = "✓" if ok else "✗"
|
|
141
|
+
lines.append(f" {symbol} {message}")
|
|
142
|
+
return "\n".join(lines)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# Allow direct execution of this command module for local debugging.
if __name__ == "__main__":
    doctor_command()
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import click
|
|
8
|
+
|
|
9
|
+
from probegen.errors import EmbeddingError
|
|
10
|
+
from probegen.tools.embedding import embed_batch
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@click.command("embed-batch")
@click.option("--inputs", "inputs_path", required=True, type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option("--output", "output_path", required=True, type=click.Path(dir_okay=False, path_type=Path))
@click.option("--model", default="text-embedding-3-small", show_default=True)
@click.option("--cache", "cache_path", default=".probegen/embedding_cache.db", show_default=True, type=click.Path(path_type=Path))
@click.option("--dimensions", type=int, default=None)
def embed_batch_command(
    inputs_path: Path,
    output_path: Path,
    model: str,
    cache_path: Path,
    dimensions: int | None,
) -> None:
    """Embed a JSON batch of inputs and write the embeddings to *output_path*.

    Exits 1 on an embedding failure (message on stderr), 0 otherwise. A cache
    warning is reported on stderr but does not change the exit status.
    """
    payload = json.loads(inputs_path.read_text(encoding="utf-8"))
    try:
        embeddings, cache_warning = embed_batch(
            payload,
            model=model,
            cache_path=cache_path,
            dimensions=dimensions,
        )
    except EmbeddingError as exc:
        # Echo and exit non-zero directly, matching find_similar_command's
        # error style. (The original routed this through a helper that
        # returned the exit code — needlessly indirect for one call site.)
        click.echo(str(exc), err=True)
        raise SystemExit(1) from exc

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(embeddings, indent=2), encoding="utf-8")
    if cache_warning:
        click.echo(
            "probegen embed-batch: embedding cache warning — some cache reads or writes failed; "
            "embeddings are still valid and have been written to the output file.",
            err=True,
        )
    # Explicit success exit (equivalent to returning, kept for clarity in CI logs).
    raise SystemExit(0)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _emit_error(message: str, code: int) -> int:
    # Print the error to stderr and hand back the exit code, so a caller can
    # fail in one statement: `raise SystemExit(_emit_error(msg, 1))`.
    click.echo(message, err=True)
    return code
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Allow direct execution of this command module for local debugging.
if __name__ == "__main__":
    embed_batch_command()
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import click
|
|
8
|
+
|
|
9
|
+
from probegen.errors import EmbeddingError
|
|
10
|
+
from probegen.tools.embedding import embed_batch
|
|
11
|
+
from probegen.tools.similarity import classify_similarity, cosine_similarity
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@click.command("find-similar")
@click.option("--candidate", "candidate_path", required=True, type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option("--corpus", "corpus_path", required=True, type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option("--output", "output_path", required=True, type=click.Path(dir_okay=False, path_type=Path))
@click.option("--duplicate-threshold", default=0.88, show_default=True, type=float)
@click.option("--boundary-threshold", default=0.72, show_default=True, type=float)
@click.option("--model", default="text-embedding-3-small", show_default=True)
@click.option("--cache", "cache_path", default=".probegen/embedding_cache.db", show_default=True, type=click.Path(path_type=Path))
@click.option("--dimensions", type=int, default=None)
def find_similar_command(
    candidate_path: Path,
    corpus_path: Path,
    output_path: Path,
    duplicate_threshold: float,
    boundary_threshold: float,
    model: str,
    cache_path: Path,
    dimensions: int | None,
) -> None:
    """Score a candidate against an embedded corpus and write ranked results.

    Embeds the candidate, computes cosine similarity against every corpus
    entry, classifies each score, and writes a JSON report sorted by
    similarity (highest first). Exits 1 if embedding fails.
    """
    candidate = json.loads(candidate_path.read_text(encoding="utf-8"))
    corpus = json.loads(corpus_path.read_text(encoding="utf-8"))

    try:
        embedded_candidate, _ = embed_batch(
            [candidate],
            model=model,
            cache_path=cache_path,
            dimensions=dimensions,
        )
    except EmbeddingError as exc:
        click.echo(str(exc), err=True)
        raise SystemExit(1) from exc

    candidate_embedding = embedded_candidate[0]["embedding"]

    def _scored(entry: dict[str, Any]) -> dict[str, Any]:
        # One result row: similarity score plus its classification bucket.
        score = cosine_similarity(candidate_embedding, entry["embedding"])
        return {
            "corpus_id": entry["id"],
            "similarity": score,
            "classification": classify_similarity(
                score,
                duplicate_threshold=duplicate_threshold,
                boundary_threshold=boundary_threshold,
            ),
        }

    results: list[dict[str, Any]] = sorted(
        (_scored(entry) for entry in corpus),
        key=lambda row: row["similarity"],
        reverse=True,
    )

    top_match = results[0] if results else None
    payload = {
        "candidate_id": candidate["id"],
        "results": results,
        "top_match": top_match,
        "max_similarity": top_match["similarity"] if top_match else 0.0,
        # An empty corpus means nothing is similar — treat as novel.
        "overall_classification": top_match["classification"] if top_match else "novel",
    }
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# Allow direct execution of this command module for local debugging.
if __name__ == "__main__":
    find_similar_command()
|