npm - harness-evolver - Versions diffs - 4.2.4 → 4.2.6 - Mend

harness-evolver 4.2.4 → 4.2.6

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (6)

package/.claude-plugin/plugin.json +1 -1
package/bin/install.js +71 -12
package/package.json +1 -1
package/skills/evolve/SKILL.md +2 -0
package/skills/setup/SKILL.md +2 -0
package/tools/setup.py +116 -88

package/.claude-plugin/plugin.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "harness-evolver",
   "description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
-  "version": "4.2.4",
+  "version": "4.2.6",
   "author": {
     "name": "Raphael Valdetaro"
   },

package/bin/install.js CHANGED Viewed

@@ -374,7 +374,7 @@ function installPythonDeps() {
   return false;
 }
-async function configureLangSmith(rl) {
+async function configureLangSmith(rl, nonInteractive) {
   const langsmithCredsDir = process.platform === "darwin"
     ? path.join(HOME, "Library", "Application Support", "langsmith-cli")
     : path.join(HOME, ".config", "langsmith-cli");
@@ -393,13 +393,28 @@ async function configureLangSmith(rl) {
     try {
       const content = fs.readFileSync(langsmithCredsFile, "utf8");
       if (content.includes("LANGSMITH_API_KEY=lsv2_")) {
-        stepDone("API key found in credentials file");
-        hasKey = true;
+        // Validate existing key with a real request
+        const existingKey = content.match(/LANGSMITH_API_KEY=(lsv2_[^\s\n]+)/)?.[1];
+        if (existingKey) {
+          try {
+            execSync(`curl -sf -o /dev/null -w "%{http_code}" -H "x-api-key: ${existingKey}" https://api.smith.langchain.com/info`, { stdio: "pipe", timeout: 10000 });
+            stepDone("API key found and validated");
+            hasKey = true;
+          } catch {
+            barLine(c.yellow("API key found but could not be validated — LangSmith may be unreachable"));
+            barLine(c.dim("Will ask for a new key just in case."));
+          }
+        }
       }
     } catch {}
   }
   if (!hasKey) {
+    if (nonInteractive) {
+      stepError("No API key found — set LANGSMITH_API_KEY in environment and re-run");
+      barLine(c.dim("Run: export LANGSMITH_API_KEY=lsv2_pt_your_key"));
+      return;
+    }
     barLine(c.dim("Get yours at https://smith.langchain.com/settings"));
     barLine(c.dim("LangSmith is required. The evolver won't work without it."));
     barEmpty();
@@ -410,6 +425,13 @@ async function configureLangSmith(rl) {
       const key = apiKey.trim();
       if (key && key.startsWith("lsv2_")) {
+        // Validate key with a real request before saving
+        try {
+          execSync(`curl -sf -o /dev/null -w "%{http_code}" -H "x-api-key: ${key}" https://api.smith.langchain.com/info`, { stdio: "pipe", timeout: 10000 });
+        } catch {
+          barLine(c.yellow("Key could not be validated — LangSmith may be unreachable"));
+          barLine(c.dim("Saving anyway. If it doesn't work, re-run the installer."));
+        }
         try {
           fs.mkdirSync(langsmithCredsDir, { recursive: true });
           fs.writeFileSync(langsmithCredsFile, `LANGSMITH_API_KEY=${key}\n`);
@@ -454,7 +476,7 @@ async function configureLangSmith(rl) {
   }
 }
-async function configureOptionalIntegrations(rl) {
+async function configureOptionalIntegrations(rl, nonInteractive) {
   barEmpty();
   step(c.bold("Optional Integrations"));
   barEmpty();
@@ -474,7 +496,7 @@ async function configureOptionalIntegrations(rl) {
   if (hasContext7) {
     stepDone("Context7 MCP already configured");
-  } else {
+  } else if (!nonInteractive) {
     barLine(c.bold("Context7 MCP") + " \u2014 " + c.dim("up-to-date library documentation"));
     const c7Answer = await ask(rl, `${c.cyan(S.stepActive)}  Install Context7 MCP? [y/N]: `);
     if (c7Answer.trim().toLowerCase() === "y") {
@@ -506,7 +528,7 @@ async function configureOptionalIntegrations(rl) {
   if (hasLcDocs) {
     stepDone("LangChain Docs MCP already configured");
-  } else {
+  } else if (!nonInteractive) {
     barLine(c.bold("LangChain Docs MCP") + " \u2014 " + c.dim("LangChain/LangGraph/LangSmith docs"));
     const lcAnswer = await ask(rl, `${c.cyan(S.stepActive)}  Install LangChain Docs MCP? [y/N]: `);
     if (lcAnswer.trim().toLowerCase() === "y") {
@@ -525,6 +547,8 @@ async function configureOptionalIntegrations(rl) {
 // ─── Main ───────────────────────────────────────────────────────────────────
 async function main() {
+  const nonInteractive = process.argv.includes("--yes") || process.argv.includes("-y");
   banner();
   header("harness-evolver");
@@ -540,6 +564,21 @@ async function main() {
     }
   } catch {}
+  // Check installed version
+  const versionPath = path.join(HOME, ".evolver", "VERSION");
+  let installedVersion = null;
+  if (fs.existsSync(versionPath)) {
+    installedVersion = fs.readFileSync(versionPath, "utf8").trim();
+  }
+  if (installedVersion && installedVersion !== VERSION) {
+    step(`Upgrading ${c.dim(installedVersion)} → ${c.cyan(VERSION)}`);
+  } else if (installedVersion === VERSION) {
+    step(`Reinstalling ${c.cyan(VERSION)}`);
+  } else {
+    step(`Fresh install ${c.cyan(VERSION)}`);
+  }
   barEmpty();
   // Python check
@@ -567,6 +606,11 @@ async function main() {
   const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
+  function askOrDefault(question, defaultValue) {
+    if (nonInteractive) return Promise.resolve(defaultValue);
+    return ask(rl, question);
+  }
   // Runtime selection
   barEmpty();
   stepPrompt("Which runtime(s) to install for?");
@@ -577,7 +621,7 @@ async function main() {
     barLine(c.dim("Select multiple: 1,2 or 1 2"));
   }
-  const runtimeAnswer = await ask(rl, `${c.cyan(S.stepActive)}  Choice [1]: `);
+  const runtimeAnswer = await askOrDefault(`${c.cyan(S.stepActive)}  Choice [1]: `, "1");
   const runtimeInput = (runtimeAnswer.trim() || "1");
   let selected;
@@ -598,7 +642,7 @@ async function main() {
   barLine(`  ${c.bold("1")}  Global ${c.dim(`(~/${selected[0].dir})`)}`);
   barLine(`  ${c.bold("2")}  Local  ${c.dim(`(./${selected[0].dir})`)}`);
-  const scopeAnswer = await ask(rl, `${c.cyan(S.stepActive)}  Choice [1]: `);
+  const scopeAnswer = await askOrDefault(`${c.cyan(S.stepActive)}  Choice [1]: `, "1");
   const scope = (scopeAnswer.trim() === "2") ? "local" : "global";
   stepDone(`Scope: ${c.cyan(scope)}`);
@@ -632,8 +676,20 @@ async function main() {
   const toolCount = installTools();
   stepDone(`${toolCount} tools installed to ~/.evolver/tools/`);
-  // Version marker
-  const versionPath = path.join(HOME, ".evolver", "VERSION");
+  // Suggest .worktreeinclude for worktree support
+  barEmpty();
+  const cwdGit = fs.existsSync(path.join(process.cwd(), ".git"));
+  const cwdWorktreeInclude = fs.existsSync(path.join(process.cwd(), ".worktreeinclude"));
+  if (cwdGit && !cwdWorktreeInclude) {
+    step("Worktree support");
+    barLine(c.dim("For /evolver:evolve to work, .evolver.json needs to be in worktrees."));
+    barLine(c.dim("Create .worktreeinclude in your project root with:"));
+    barLine(c.dim("  .evolver.json"));
+    barLine(c.dim("  .env"));
+    stepDone("Tip shown");
+  }
+  // Version marker (versionPath declared earlier for upgrade check)
   fs.mkdirSync(path.dirname(versionPath), { recursive: true });
   fs.writeFileSync(versionPath, VERSION);
@@ -642,10 +698,10 @@ async function main() {
   installPythonDeps();
   // Configure LangSmith
-  await configureLangSmith(rl);
+  await configureLangSmith(rl, nonInteractive);
   // Optional integrations
-  await configureOptionalIntegrations(rl);
+  await configureOptionalIntegrations(rl, nonInteractive);
   // Done
   barEmpty();
@@ -657,6 +713,9 @@ async function main() {
   barLine(`  ${c.cyan("/evolver:status")} \u2014 check progress`);
   barLine(`  ${c.cyan("/evolver:deploy")}  \u2014 finalize and push`);
   barEmpty();
+  barLine(c.dim("Plugin marketplace (auto-updates):"));
+  barLine(`  ${c.cyan("/plugin install harness-evolver")}  ${c.dim("— from Claude Code marketplace")}`);
+  barEmpty();
   barLine(c.dim("GitHub: https://github.com/raphaelchristi/harness-evolver"));
   footer();

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "harness-evolver",
-  "version": "4.2.4",
+  "version": "4.2.6",
   "description": "LangSmith-native autonomous agent optimization for Claude Code",
   "author": "Raphael Valdetaro",
   "license": "MIT",

package/skills/evolve/SKILL.md CHANGED Viewed

@@ -23,6 +23,8 @@ EVOLVER_PY="${EVOLVER_PY:-$([ -f "$HOME/.evolver/venv/bin/python" ] && echo "$HO
 Use `$EVOLVER_PY` instead of `python3` for ALL tool invocations.
+**IMPORTANT: Never pass `LANGSMITH_API_KEY` inline in Bash commands.** The key is loaded automatically by the SessionStart hook and by each tool's `ensure_langsmith_api_key()`. Passing it inline exposes it in the output.
 ## Parse Arguments
 - `--iterations N` (default: from interactive question or 5)

package/skills/setup/SKILL.md CHANGED Viewed

@@ -45,6 +45,8 @@ EVOLVER_PY="${EVOLVER_PY:-$([ -f "$HOME/.evolver/venv/bin/python" ] && echo "$HO
 Use `$EVOLVER_PY` instead of `python3` for ALL tool invocations. This ensures the venv with langsmith is used.
+**IMPORTANT: Never pass `LANGSMITH_API_KEY` inline in Bash commands.** The key is loaded automatically by the SessionStart hook (from credentials file or environment) and by each Python tool's `ensure_langsmith_api_key()`. Passing it inline exposes it in the output. If the key is missing, tell the user to run `export LANGSMITH_API_KEY=lsv2_pt_...` instead.
 ## Phase 1: Explore Project (automatic)
 ```bash

package/tools/setup.py CHANGED Viewed

@@ -462,101 +462,129 @@ def main():
         else:
             print(f"Dataset: '{dataset_name}'")
-    # Create dataset
-    print(f"Creating dataset '{dataset_name}'...")
-    if args.dataset_from_file:
-        dataset, count = create_dataset_from_file(client, dataset_name, args.dataset_from_file)
-        print(f"  Created from file: {count} examples")
-    elif args.dataset_from_langsmith:
-        dataset, count = create_dataset_from_langsmith(
-            client, dataset_name, args.dataset_from_langsmith,
-        )
-        if not dataset:
-            print("  No traces found in source project. Creating empty dataset.")
+    # Create dataset — wrapped in try/except to clean up orphaned datasets on failure
+    dataset = None
+    try:
+        print(f"Creating dataset '{dataset_name}'...")
+        if args.dataset_from_file:
+            dataset, count = create_dataset_from_file(client, dataset_name, args.dataset_from_file)
+            print(f"  Created from file: {count} examples")
+        elif args.dataset_from_langsmith:
+            dataset, count = create_dataset_from_langsmith(
+                client, dataset_name, args.dataset_from_langsmith,
+            )
+            if not dataset:
+                print("  No traces found in source project. Creating empty dataset.")
+                dataset = create_empty_dataset(client, dataset_name)
+                count = 0
+            else:
+                print(f"  Created from LangSmith traces: {count} examples")
+        else:
             dataset = create_empty_dataset(client, dataset_name)
             count = 0
+            print("  Created empty dataset (testgen will populate)")
+        # Configure evaluators
+        print(f"Configuring evaluators for goals: {goals}")
+        evaluators, evaluator_keys = get_evaluators(goals, args.evaluators)
+        print(f"  Active evaluators: {evaluator_keys}")
+        llm_evaluators = [k for k in evaluator_keys if k in ("correctness", "conciseness")]
+        if llm_evaluators:
+            print(f"  LLM evaluators (agent-based): {llm_evaluators}")
+        # Run baseline (code-based evaluators only; LLM scoring done by evaluator agent)
+        baseline_experiment = None
+        baseline_score = 0.0
+        if not args.skip_baseline and count > 0:
+            print(f"Running baseline target ({count} examples)...")
+            try:
+                baseline_experiment, baseline_score = run_baseline(
+                    client, dataset_name, args.entry_point, evaluators,
+                )
+                print(f"  Baseline has_output score: {baseline_score:.3f}")
+                print(f"  Experiment: {baseline_experiment}")
+                if llm_evaluators:
+                    print(f"  Note: LLM scoring pending — evaluator agent will run during /evolver:evolve")
+            except Exception as e:
+                print(f"  Baseline evaluation failed: {e}", file=sys.stderr)
+                print("  Continuing with score 0.0")
+        elif count == 0:
+            print("Skipping baseline (no examples in dataset yet)")
         else:
-            print(f"  Created from LangSmith traces: {count} examples")
-    else:
-        dataset = create_empty_dataset(client, dataset_name)
-        count = 0
-        print("  Created empty dataset (testgen will populate)")
-    # Configure evaluators
-    print(f"Configuring evaluators for goals: {goals}")
-    evaluators, evaluator_keys = get_evaluators(goals, args.evaluators)
-    print(f"  Active evaluators: {evaluator_keys}")
-    llm_evaluators = [k for k in evaluator_keys if k in ("correctness", "conciseness")]
-    if llm_evaluators:
-        print(f"  LLM evaluators (agent-based): {llm_evaluators}")
-    # Run baseline (code-based evaluators only; LLM scoring done by evaluator agent)
-    baseline_experiment = None
-    baseline_score = 0.0
-    if not args.skip_baseline and count > 0:
-        print(f"Running baseline target ({count} examples)...")
+            print("Skipping baseline (--skip-baseline)")
+        # Resolve Python interpreter in entry_point to absolute path
+        # This ensures the entry point works in worktrees where venvs don't exist
+        entry_point = args.entry_point
+        parts = entry_point.split()
+        if parts:
+            python_path = parts[0]
+            # Resolve relative Python paths (e.g., ../.venv/bin/python, .venv/bin/python)
+            if "/" in python_path and not os.path.isabs(python_path):
+                abs_python = os.path.abspath(python_path)
+                if os.path.exists(abs_python):
+                    parts[0] = abs_python
+                    entry_point = " ".join(parts)
+                    print(f"  Resolved Python path: {abs_python}")
+        # Compute project_dir relative to git root (for worktree path resolution)
+        project_dir = ""
         try:
-            baseline_experiment, baseline_score = run_baseline(
-                client, dataset_name, args.entry_point, evaluators,
+            git_prefix = subprocess.run(
+                ["git", "rev-parse", "--show-prefix"],
+                capture_output=True, text=True, timeout=5,
             )
-            print(f"  Baseline has_output score: {baseline_score:.3f}")
-            print(f"  Experiment: {baseline_experiment}")
-            if llm_evaluators:
-                print(f"  Note: LLM scoring pending — evaluator agent will run during /evolver:evolve")
-        except Exception as e:
-            print(f"  Baseline evaluation failed: {e}", file=sys.stderr)
-            print("  Continuing with score 0.0")
-    elif count == 0:
-        print("Skipping baseline (no examples in dataset yet)")
-    else:
-        print("Skipping baseline (--skip-baseline)")
-    # Compute project_dir relative to git root (for worktree path resolution)
-    project_dir = ""
-    try:
-        git_prefix = subprocess.run(
-            ["git", "rev-parse", "--show-prefix"],
-            capture_output=True, text=True, timeout=5,
-        )
-        if git_prefix.returncode == 0:
-            project_dir = git_prefix.stdout.strip().rstrip("/")
-    except Exception:
-        pass
-    # Write config
-    config = {
-        "version": "3.0.0",
-        "project": project_name,
-        "dataset": dataset_name,
-        "dataset_id": str(dataset.id) if dataset else None,
-        "project_dir": project_dir,
-        "entry_point": args.entry_point,
-        "evaluators": evaluator_keys,
-        "optimization_goals": goals,
-        "production_project": args.production_project,
-        "baseline_experiment": baseline_experiment,
-        "best_experiment": baseline_experiment,
-        "best_score": baseline_score,
-        "iterations": 0,
-        "framework": args.framework,
-        "created_at": datetime.now(timezone.utc).isoformat(),
-        "history": [{
-            "version": "baseline",
-            "experiment": baseline_experiment,
-            "score": baseline_score,
-        }] if baseline_experiment else [],
-    }
+            if git_prefix.returncode == 0:
+                project_dir = git_prefix.stdout.strip().rstrip("/")
+        except Exception:
+            pass
-    with open(args.output, "w") as f:
-        json.dump(config, f, indent=2)
+        # Write config
+        config = {
+            "version": "3.0.0",
+            "project": project_name,
+            "dataset": dataset_name,
+            "dataset_id": str(dataset.id) if dataset else None,
+            "project_dir": project_dir,
+            "entry_point": entry_point,
+            "evaluators": evaluator_keys,
+            "optimization_goals": goals,
+            "production_project": args.production_project,
+            "baseline_experiment": baseline_experiment,
+            "best_experiment": baseline_experiment,
+            "best_score": baseline_score,
+            "iterations": 0,
+            "framework": args.framework,
+            "created_at": datetime.now(timezone.utc).isoformat(),
+            "history": [{
+                "version": "baseline",
+                "experiment": baseline_experiment,
+                "score": baseline_score,
+            }] if baseline_experiment else [],
+        }
+        with open(args.output, "w") as f:
+            json.dump(config, f, indent=2)
+        print(f"\nSetup complete. Config saved to {args.output}")
+        print(f"  Project: {project_name}")
+        print(f"  Dataset: {dataset_name} ({count} examples)")
+        print(f"  Evaluators: {evaluator_keys}")
+        if baseline_experiment:
+            print(f"  Baseline: {baseline_score:.3f}")
+        print(f"\nNext: run /evolver:evolve")
-    print(f"\nSetup complete. Config saved to {args.output}")
-    print(f"  Project: {project_name}")
-    print(f"  Dataset: {dataset_name} ({count} examples)")
-    print(f"  Evaluators: {evaluator_keys}")
-    if baseline_experiment:
-        print(f"  Baseline: {baseline_score:.3f}")
-    print(f"\nNext: run /evolver:evolve")
+    except Exception as e:
+        # Cleanup orphaned dataset if setup fails after dataset creation
+        if dataset:
+            print(f"Setup failed: {e}", file=sys.stderr)
+            print(f"Cleaning up orphaned dataset '{dataset_name}'...", file=sys.stderr)
+            try:
+                client.delete_dataset(dataset_id=dataset.id)
+                print("  Dataset deleted.", file=sys.stderr)
+            except Exception:
+                print(f"  WARNING: Could not delete dataset. Clean up manually in LangSmith.", file=sys.stderr)
+        raise
 if __name__ == "__main__":