PyPI - npcsh - Versions diffs - 1.1.17__py3-none-any.whl → 1.1.18__py3-none-any.whl - Mend

npcsh 1.1.17__py3-none-any.whl → 1.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (169) hide show

npcsh/benchmark/runner.py CHANGED Viewed

@@ -5,14 +5,14 @@ Provides a convenient interface for running Terminal-Bench evaluations
 with different models and providers.
 """
-import os
 import subprocess
 import sys
+import json
 from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
 from typing import Optional, List, Dict, Any
-import json
 @dataclass
@@ -21,7 +21,7 @@ class BenchmarkConfig:
     model: str = "claude-sonnet-4-20250514"
     provider: str = "anthropic"
     dataset: str = "terminal-bench"
-    dataset_version: str = "2.0"
+    dataset_version: Optional[str] = None  # If None, use latest
     n_concurrent: int = 4
     task_ids: Optional[List[str]] = None
     output_dir: Optional[str] = None
@@ -84,33 +84,52 @@ class BenchmarkRunner:
     def check_dependencies(self) -> Dict[str, bool]:
         """Check if required dependencies are installed."""
+        import shutil
         deps = {
             "harbor": False,
             "terminal-bench": False,
             "docker": False,
         }
-        # Check harbor
-        try:
-            result = subprocess.run(
-                ["harbor", "--version"],
-                capture_output=True,
-                text=True
-            )
-            deps["harbor"] = result.returncode == 0
-        except FileNotFoundError:
-            pass
+        # Find binaries in the same Python environment as current interpreter
+        # Use sys.prefix to get the virtualenv/pyenv directory (don't resolve symlinks)
+        bin_dir = Path(sys.prefix) / "bin"
+        if not bin_dir.exists():
+            # Fallback: use executable's directory without resolving
+            bin_dir = Path(sys.executable).parent
-        # Check terminal-bench (tb CLI)
-        try:
-            result = subprocess.run(
-                ["tb", "--help"],
-                capture_output=True,
-                text=True
-            )
-            deps["terminal-bench"] = result.returncode == 0
-        except FileNotFoundError:
-            pass
+        # Check harbor - first in current Python's bin dir, then PATH
+        harbor_bin = bin_dir / "harbor"
+        if not harbor_bin.exists():
+            harbor_bin = shutil.which("harbor")
+        if harbor_bin:
+            try:
+                result = subprocess.run(
+                    [str(harbor_bin), "--version"],
+                    capture_output=True,
+                    text=True
+                )
+                deps["harbor"] = result.returncode == 0
+            except (FileNotFoundError, OSError):
+                pass
+        # Check terminal-bench (tb CLI) - first in current Python's bin dir, then PATH
+        tb_bin = bin_dir / "tb"
+        if not tb_bin.exists():
+            tb_bin = shutil.which("tb")
+        if tb_bin:
+            try:
+                result = subprocess.run(
+                    [str(tb_bin), "--help"],
+                    capture_output=True,
+                    text=True
+                )
+                deps["terminal-bench"] = result.returncode == 0
+            except (FileNotFoundError, OSError):
+                pass
         # Check docker
         try:
@@ -146,9 +165,10 @@ class BenchmarkRunner:
         model: str = "claude-sonnet-4-20250514",
         provider: str = "anthropic",
         dataset: str = "terminal-bench",
-        dataset_version: str = "2.0",
+        dataset_version: Optional[str] = None,
         n_concurrent: int = 4,
         task_ids: Optional[List[str]] = None,
+        n_tasks: Optional[int] = None,
         npc_name: Optional[str] = None,
         timeout: int = 600,
     ) -> BenchmarkResult:
@@ -159,9 +179,10 @@ class BenchmarkRunner:
             model: Model name (e.g., "claude-sonnet-4-20250514", "gpt-4o")
             provider: Provider name (e.g., "anthropic", "openai", "gemini")
             dataset: Dataset name (default: "terminal-bench")
-            dataset_version: Dataset version (default: "2.0")
+            dataset_version: Dataset version (optional, uses latest if None)
             n_concurrent: Number of concurrent task executions
             task_ids: Optional list of specific task IDs to run
+            n_tasks: Optional limit on number of tasks to run
             npc_name: Optional NPC name to use (e.g., "sibiji", "corca")
             timeout: Per-task timeout in seconds
@@ -193,9 +214,22 @@ class BenchmarkRunner:
         else:
             agent_path = "npcsh.benchmark:NpcshAgent"
+        # Find harbor in the same Python environment as current interpreter
+        # Use sys.prefix to get the virtualenv/pyenv directory (don't resolve symlinks)
+        import shutil
+        bin_dir = Path(sys.prefix) / "bin"
+        if not bin_dir.exists():
+            bin_dir = Path(sys.executable).parent
+        harbor_bin = str(bin_dir / "harbor")
+        if not Path(harbor_bin).exists():
+            harbor_bin = shutil.which("harbor") or "harbor"
+        # Build dataset string (with optional version)
+        dataset_str = f"{dataset}@{dataset_version}" if dataset_version else dataset
         cmd = [
-            "harbor", "run",
-            "-d", f"{dataset}@{dataset_version}",
+            harbor_bin, "run",
+            "-d", dataset_str,
             "--agent-import-path", agent_path,
             "-m", full_model,
             "-n", str(n_concurrent),
@@ -203,12 +237,18 @@ class BenchmarkRunner:
         ]
         if task_ids:
-            cmd.extend(["--task-ids", ",".join(task_ids)])
+            for task_id in task_ids:
+                cmd.extend(["--task-name", task_id])
+        if n_tasks:
+            cmd.extend(["-l", str(n_tasks)])
-        print(f"\nRunning Terminal-Bench evaluation:")
+        print("\nRunning Terminal-Bench evaluation:")
         print(f"  Model: {full_model}")
-        print(f"  Dataset: {dataset}@{dataset_version}")
+        print(f"  Dataset: {dataset_str}")
         print(f"  Concurrent tasks: {n_concurrent}")
+        if n_tasks:
+            print(f"  Max tasks: {n_tasks}")
         print(f"  Output: {output_dir}")
         if npc_name:
             print(f"  NPC: {npc_name}")
@@ -311,7 +351,7 @@ class BenchmarkRunner:
         self,
         models: List[tuple],
         dataset: str = "terminal-bench",
-        dataset_version: str = "2.0",
+        dataset_version: Optional[str] = None,
         n_concurrent: int = 4,
         task_ids: Optional[List[str]] = None,
     ) -> Dict[str, BenchmarkResult]:
@@ -321,7 +361,7 @@ class BenchmarkRunner:
         Args:
             models: List of (model, provider) tuples
             dataset: Dataset name
-            dataset_version: Dataset version
+            dataset_version: Dataset version (optional)
             n_concurrent: Number of concurrent tasks
             task_ids: Optional specific task IDs
@@ -338,9 +378,9 @@ class BenchmarkRunner:
         results = {}
         for model, provider in models:
-            print(f"\n{'='*60}")
+            print("\n" + '='*60)
             print(f"Evaluating: {provider}/{model}")
-            print(f"{'='*60}")
+            print('='*60)
             result = self.run(
                 model=model,
@@ -365,9 +405,9 @@ class BenchmarkRunner:
     def _print_comparison_summary(self, results: Dict[str, BenchmarkResult]) -> None:
         """Print a comparison summary table."""
-        print(f"\n{'='*60}")
+        print("\n" + '='*60)
         print("COMPARISON SUMMARY")
-        print(f"{'='*60}")
+        print('='*60)
         print(f"{'Model':<40} {'Accuracy':>10} {'Tasks':>10}")
         print("-" * 60)
@@ -436,20 +476,22 @@ def run_benchmark(
 def quick_test(
     model: str = "claude-sonnet-4-20250514",
     provider: str = "anthropic",
+    n_tasks: int = 3,
 ) -> BenchmarkResult:
     """
     Run a quick test with a few tasks to verify setup.
-    This runs only 3 easy tasks to quickly verify that everything is working.
+    This runs only a few tasks to quickly verify that everything is working.
     """
     runner = BenchmarkRunner()
-    # Use a small subset of easy tasks for quick testing
+    # Use -l flag to limit number of tasks instead of specifying task names
+    # This avoids issues with task names changing in the dataset
     return runner.run(
         model=model,
         provider=provider,
         n_concurrent=1,
-        task_ids=["ssl-cert", "git-server", "reshard-dataset"],  # Example easy tasks
+        n_tasks=n_tasks,
     )
@@ -484,8 +526,8 @@ Examples:
                        help="Provider name")
     parser.add_argument("--dataset", "-d", default="terminal-bench",
                        help="Dataset name")
-    parser.add_argument("--version", "-v", default="2.0",
-                       help="Dataset version")
+    parser.add_argument("--version", "-v", default=None,
+                       help="Dataset version (optional, uses latest if not specified)")
     parser.add_argument("--concurrent", "-n", type=int, default=4,
                        help="Number of concurrent tasks")
     parser.add_argument("--npc", help="NPC name to use")
@@ -541,7 +583,7 @@ Examples:
             ("gpt-4o", "openai"),
             ("gemini-2.0-flash", "gemini"),
         ]
-        results = runner.compare_models(
+        runner.compare_models(
             models_to_compare,
             n_concurrent=args.concurrent
         )
@@ -560,7 +602,7 @@ Examples:
             n_concurrent=args.concurrent,
             npc_name=args.npc,
         )
-        print(f"\nBenchmark complete!")
+        print("\nBenchmark complete!")
         print(f"Accuracy: {result.accuracy:.1%}")
         print(f"Results saved to: {result.output_dir}")

npcsh/benchmark/templates/install-npcsh.sh.j2 ADDED Viewed

@@ -0,0 +1,35 @@
+#!/bin/bash
+# Installation script for npcsh in Terminal-Bench containers
+# This template is rendered by Harbor before execution
+set -e
+echo "Installing npcsh for Terminal-Bench evaluation..."
+# Install Python dependencies if needed
+if ! command -v pip &> /dev/null; then
+    echo "Installing pip..."
+    apt-get update && apt-get install -y python3-pip
+fi
+# Install npcsh with lite dependencies (API providers only, no local models)
+# Use --break-system-packages for PEP 668 compliance (Ubuntu 24.04+)
+echo "Installing npcsh[lite]..."
+pip install --quiet --break-system-packages npcsh[lite] || pip install --quiet npcsh[lite]
+# Verify installation
+echo "Verifying npcsh installation..."
+npc --help > /dev/null 2>&1 || {
+    echo "ERROR: npcsh installation failed"
+    exit 1
+}
+# Set up default configuration
+export NPCSH_STREAM_OUTPUT=0
+export NPCSH_LOG_LEVEL=warning
+{% if version %}
+echo "npcsh version: {{ version }}"
+{% endif %}
+echo "npcsh installation complete!"

npcsh/build.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import os
 import shutil
 import textwrap
 from pathlib import Path
@@ -11,8 +10,7 @@ def build_flask_server(config, **kwargs):
     server_script = output_dir / 'npc_server.py'
     server_code = textwrap.dedent(f'''
-    import os
-    from npcpy.serve import start_flask_server
+      from npcpy.serve import start_flask_server
     from npcpy.npc_compiler import Team
     from sqlalchemy import create_engine
@@ -111,7 +109,7 @@ def build_docker_compose(config, **kwargs):
         volumes:
           - npc-data:/root/.npcsh
         environment:
-          - NPCSH_DB_PATH=/root/.npcsh/npcsh_history.db
+          - NPCSH_DB_PATH=/root/npcsh_history.db
     volumes:
       npc-data:

npcsh/completion.py CHANGED Viewed

@@ -2,8 +2,7 @@
 Readline and tab completion for npcsh
 """
 import os
-import shutil
-from typing import List, Any, Optional
+from typing import List, Any
 try:
     import readline
@@ -66,11 +65,8 @@ def get_file_completions(text: str) -> List[str]:
     completions = []
     if text.startswith("~"):
-        expanded = os.path.expanduser(text)
-        prefix = "~"
-        search_path = expanded
+        search_path = os.path.expanduser(text)
     else:
-        prefix = ""
         search_path = text
     # Get directory to search

npcsh/config.py CHANGED Viewed

@@ -3,7 +3,6 @@ npcsh configuration management
 """
 import os
 import importlib.metadata
-from typing import Optional, Dict, Any
 # Version
 try:
@@ -14,7 +13,6 @@ except importlib.metadata.PackageNotFoundError:
 # Default paths
 DEFAULT_NPC_TEAM_PATH = "~/.npcsh/npc_team"
 PROJECT_NPC_TEAM_PATH = "./npc_team"
-HISTORY_DB_DEFAULT_PATH = "~/.npcsh_history.db"
 READLINE_HISTORY_FILE = os.path.expanduser("~/.npcsh_history")
 # Environment defaults
@@ -44,7 +42,7 @@ NPCSH_REASONING_PROVIDER = os.environ.get("NPCSH_REASONING_PROVIDER", "ollama")
 NPCSH_STREAM_OUTPUT = os.environ.get("NPCSH_STREAM_OUTPUT", "0") == "1"
 NPCSH_API_URL = os.environ.get("NPCSH_API_URL", None)
 NPCSH_SEARCH_PROVIDER = os.environ.get("NPCSH_SEARCH_PROVIDER", "duckduckgo")
-NPCSH_BUILD_KG = os.environ.get("NPCSH_BUILD_KG") == "1"
+NPCSH_BUILD_KG = os.environ.get("NPCSH_BUILD_KG", "1") != "0"
 def get_shell_config_file() -> str:

npcsh 1.1.17__py3-none-any.whl → 1.1.18__py3-none-any.whl

npcsh 1.1.17__py3-none-any.whl → 1.1.18__py3-none-any.whl