npcsh 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl

This diff reflects the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (197)
  1. npcsh/_state.py +122 -91
  2. npcsh/alicanto.py +2 -2
  3. npcsh/benchmark/__init__.py +8 -2
  4. npcsh/benchmark/npcsh_agent.py +87 -22
  5. npcsh/benchmark/runner.py +85 -43
  6. npcsh/benchmark/templates/install-npcsh.sh.j2 +35 -0
  7. npcsh/build.py +2 -4
  8. npcsh/completion.py +2 -6
  9. npcsh/config.py +2 -3
  10. npcsh/conversation_viewer.py +389 -0
  11. npcsh/corca.py +0 -1
  12. npcsh/diff_viewer.py +452 -0
  13. npcsh/execution.py +0 -1
  14. npcsh/guac.py +0 -1
  15. npcsh/mcp_helpers.py +2 -3
  16. npcsh/mcp_server.py +5 -10
  17. npcsh/npc.py +10 -11
  18. npcsh/npc_team/jinxs/bin/benchmark.jinx +1 -1
  19. npcsh/npc_team/jinxs/bin/config_tui.jinx +299 -0
  20. npcsh/npc_team/jinxs/bin/memories.jinx +316 -0
  21. npcsh/npc_team/jinxs/bin/setup.jinx +240 -0
  22. npcsh/npc_team/jinxs/bin/sync.jinx +143 -150
  23. npcsh/npc_team/jinxs/bin/team_tui.jinx +327 -0
  24. npcsh/npc_team/jinxs/incognide/add_tab.jinx +1 -1
  25. npcsh/npc_team/jinxs/incognide/close_pane.jinx +1 -1
  26. npcsh/npc_team/jinxs/incognide/close_tab.jinx +1 -1
  27. npcsh/npc_team/jinxs/incognide/confirm.jinx +1 -1
  28. npcsh/npc_team/jinxs/incognide/focus_pane.jinx +1 -1
  29. npcsh/npc_team/jinxs/incognide/list_panes.jinx +1 -1
  30. npcsh/npc_team/jinxs/incognide/navigate.jinx +1 -1
  31. npcsh/npc_team/jinxs/incognide/notify.jinx +1 -1
  32. npcsh/npc_team/jinxs/incognide/open_pane.jinx +1 -1
  33. npcsh/npc_team/jinxs/incognide/read_pane.jinx +1 -1
  34. npcsh/npc_team/jinxs/incognide/run_terminal.jinx +1 -1
  35. npcsh/npc_team/jinxs/incognide/send_message.jinx +1 -1
  36. npcsh/npc_team/jinxs/incognide/split_pane.jinx +1 -1
  37. npcsh/npc_team/jinxs/incognide/switch_npc.jinx +1 -1
  38. npcsh/npc_team/jinxs/incognide/switch_tab.jinx +1 -1
  39. npcsh/npc_team/jinxs/incognide/write_file.jinx +1 -1
  40. npcsh/npc_team/jinxs/incognide/zen_mode.jinx +1 -1
  41. npcsh/npc_team/jinxs/lib/core/search/db_search.jinx +321 -17
  42. npcsh/npc_team/jinxs/lib/core/search/file_search.jinx +312 -67
  43. npcsh/npc_team/jinxs/lib/core/search/kg_search.jinx +366 -44
  44. npcsh/npc_team/jinxs/lib/core/search/mem_review.jinx +73 -0
  45. npcsh/npc_team/jinxs/lib/core/search/mem_search.jinx +328 -20
  46. npcsh/npc_team/jinxs/lib/core/search/web_search.jinx +242 -10
  47. npcsh/npc_team/jinxs/lib/core/sleep.jinx +22 -11
  48. npcsh/npc_team/jinxs/lib/core/sql.jinx +10 -6
  49. npcsh/npc_team/jinxs/lib/research/paper_search.jinx +387 -76
  50. npcsh/npc_team/jinxs/lib/research/semantic_scholar.jinx +372 -55
  51. npcsh/npc_team/jinxs/lib/utils/jinxs.jinx +299 -144
  52. npcsh/npc_team/jinxs/modes/alicanto.jinx +356 -0
  53. npcsh/npc_team/jinxs/modes/arxiv.jinx +720 -0
  54. npcsh/npc_team/jinxs/modes/corca.jinx +430 -0
  55. npcsh/npc_team/jinxs/modes/guac.jinx +542 -0
  56. npcsh/npc_team/jinxs/modes/plonk.jinx +379 -0
  57. npcsh/npc_team/jinxs/modes/pti.jinx +357 -0
  58. npcsh/npc_team/jinxs/modes/reattach.jinx +291 -0
  59. npcsh/npc_team/jinxs/modes/spool.jinx +350 -0
  60. npcsh/npc_team/jinxs/modes/wander.jinx +455 -0
  61. npcsh/npc_team/jinxs/{bin → modes}/yap.jinx +13 -7
  62. npcsh/npcsh.py +7 -4
  63. npcsh/plonk.py +0 -1
  64. npcsh/pti.py +0 -1
  65. npcsh/routes.py +1 -3
  66. npcsh/spool.py +0 -1
  67. npcsh/ui.py +0 -1
  68. npcsh/wander.py +0 -1
  69. npcsh/yap.py +0 -1
  70. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/add_tab.jinx +1 -1
  71. npcsh-1.1.19.data/data/npcsh/npc_team/alicanto.jinx +356 -0
  72. npcsh-1.1.19.data/data/npcsh/npc_team/arxiv.jinx +720 -0
  73. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/benchmark.jinx +1 -1
  74. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/close_pane.jinx +1 -1
  75. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/close_tab.jinx +1 -1
  76. npcsh-1.1.19.data/data/npcsh/npc_team/config_tui.jinx +299 -0
  77. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/confirm.jinx +1 -1
  78. npcsh-1.1.19.data/data/npcsh/npc_team/corca.jinx +430 -0
  79. npcsh-1.1.19.data/data/npcsh/npc_team/db_search.jinx +348 -0
  80. npcsh-1.1.19.data/data/npcsh/npc_team/file_search.jinx +339 -0
  81. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/focus_pane.jinx +1 -1
  82. npcsh-1.1.19.data/data/npcsh/npc_team/guac.jinx +542 -0
  83. npcsh-1.1.19.data/data/npcsh/npc_team/jinxs.jinx +331 -0
  84. npcsh-1.1.19.data/data/npcsh/npc_team/kg_search.jinx +418 -0
  85. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/list_panes.jinx +1 -1
  86. npcsh-1.1.19.data/data/npcsh/npc_team/mem_review.jinx +73 -0
  87. npcsh-1.1.19.data/data/npcsh/npc_team/mem_search.jinx +388 -0
  88. npcsh-1.1.19.data/data/npcsh/npc_team/memories.jinx +316 -0
  89. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/navigate.jinx +1 -1
  90. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/notify.jinx +1 -1
  91. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/open_pane.jinx +1 -1
  92. npcsh-1.1.19.data/data/npcsh/npc_team/paper_search.jinx +412 -0
  93. npcsh-1.1.19.data/data/npcsh/npc_team/plonk.jinx +379 -0
  94. npcsh-1.1.19.data/data/npcsh/npc_team/pti.jinx +357 -0
  95. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/read_pane.jinx +1 -1
  96. npcsh-1.1.19.data/data/npcsh/npc_team/reattach.jinx +291 -0
  97. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/run_terminal.jinx +1 -1
  98. npcsh-1.1.19.data/data/npcsh/npc_team/semantic_scholar.jinx +386 -0
  99. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/send_message.jinx +1 -1
  100. npcsh-1.1.19.data/data/npcsh/npc_team/setup.jinx +240 -0
  101. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/sleep.jinx +22 -11
  102. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/split_pane.jinx +1 -1
  103. npcsh-1.1.19.data/data/npcsh/npc_team/spool.jinx +350 -0
  104. npcsh-1.1.19.data/data/npcsh/npc_team/sql.jinx +20 -0
  105. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/switch_npc.jinx +1 -1
  106. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/switch_tab.jinx +1 -1
  107. npcsh-1.1.19.data/data/npcsh/npc_team/sync.jinx +223 -0
  108. npcsh-1.1.19.data/data/npcsh/npc_team/team_tui.jinx +327 -0
  109. npcsh-1.1.19.data/data/npcsh/npc_team/wander.jinx +455 -0
  110. npcsh-1.1.19.data/data/npcsh/npc_team/web_search.jinx +283 -0
  111. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/write_file.jinx +1 -1
  112. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/yap.jinx +13 -7
  113. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/zen_mode.jinx +1 -1
  114. {npcsh-1.1.17.dist-info → npcsh-1.1.19.dist-info}/METADATA +110 -14
  115. npcsh-1.1.19.dist-info/RECORD +244 -0
  116. {npcsh-1.1.17.dist-info → npcsh-1.1.19.dist-info}/WHEEL +1 -1
  117. {npcsh-1.1.17.dist-info → npcsh-1.1.19.dist-info}/entry_points.txt +4 -3
  118. npcsh/npc_team/jinxs/bin/spool.jinx +0 -161
  119. npcsh/npc_team/jinxs/bin/wander.jinx +0 -242
  120. npcsh/npc_team/jinxs/lib/research/arxiv.jinx +0 -76
  121. npcsh-1.1.17.data/data/npcsh/npc_team/arxiv.jinx +0 -76
  122. npcsh-1.1.17.data/data/npcsh/npc_team/db_search.jinx +0 -44
  123. npcsh-1.1.17.data/data/npcsh/npc_team/file_search.jinx +0 -94
  124. npcsh-1.1.17.data/data/npcsh/npc_team/jinxs.jinx +0 -176
  125. npcsh-1.1.17.data/data/npcsh/npc_team/kg_search.jinx +0 -96
  126. npcsh-1.1.17.data/data/npcsh/npc_team/mem_search.jinx +0 -80
  127. npcsh-1.1.17.data/data/npcsh/npc_team/paper_search.jinx +0 -101
  128. npcsh-1.1.17.data/data/npcsh/npc_team/semantic_scholar.jinx +0 -69
  129. npcsh-1.1.17.data/data/npcsh/npc_team/spool.jinx +0 -161
  130. npcsh-1.1.17.data/data/npcsh/npc_team/sql.jinx +0 -16
  131. npcsh-1.1.17.data/data/npcsh/npc_team/sync.jinx +0 -230
  132. npcsh-1.1.17.data/data/npcsh/npc_team/wander.jinx +0 -242
  133. npcsh-1.1.17.data/data/npcsh/npc_team/web_search.jinx +0 -51
  134. npcsh-1.1.17.dist-info/RECORD +0 -219
  135. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/alicanto.npc +0 -0
  136. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/alicanto.png +0 -0
  137. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/browser_action.jinx +0 -0
  138. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/browser_screenshot.jinx +0 -0
  139. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/build.jinx +0 -0
  140. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/chat.jinx +0 -0
  141. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/click.jinx +0 -0
  142. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/close_browser.jinx +0 -0
  143. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/cmd.jinx +0 -0
  144. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/compile.jinx +0 -0
  145. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/compress.jinx +0 -0
  146. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/convene.jinx +0 -0
  147. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/corca.npc +0 -0
  148. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/corca.png +0 -0
  149. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/corca_example.png +0 -0
  150. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/delegate.jinx +0 -0
  151. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/edit_file.jinx +0 -0
  152. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/frederic.npc +0 -0
  153. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/frederic4.png +0 -0
  154. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/guac.npc +0 -0
  155. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/guac.png +0 -0
  156. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/help.jinx +0 -0
  157. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/incognide.jinx +0 -0
  158. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/init.jinx +0 -0
  159. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/kadiefa.npc +0 -0
  160. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/kadiefa.png +0 -0
  161. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/key_press.jinx +0 -0
  162. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/launch_app.jinx +0 -0
  163. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/load_file.jinx +0 -0
  164. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/npcsh.ctx +0 -0
  165. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/npcsh_sibiji.png +0 -0
  166. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/nql.jinx +0 -0
  167. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/open_browser.jinx +0 -0
  168. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/ots.jinx +0 -0
  169. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/paste.jinx +0 -0
  170. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/plonk.npc +0 -0
  171. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/plonk.png +0 -0
  172. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/plonkjr.npc +0 -0
  173. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/plonkjr.png +0 -0
  174. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/python.jinx +0 -0
  175. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/roll.jinx +0 -0
  176. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/sample.jinx +0 -0
  177. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/screenshot.jinx +0 -0
  178. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/search.jinx +0 -0
  179. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/serve.jinx +0 -0
  180. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/set.jinx +0 -0
  181. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/sh.jinx +0 -0
  182. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/shh.jinx +0 -0
  183. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/sibiji.npc +0 -0
  184. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/sibiji.png +0 -0
  185. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/spool.png +0 -0
  186. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/switch.jinx +0 -0
  187. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/switches.jinx +0 -0
  188. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/teamviz.jinx +0 -0
  189. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/trigger.jinx +0 -0
  190. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/type_text.jinx +0 -0
  191. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/usage.jinx +0 -0
  192. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/verbose.jinx +0 -0
  193. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/vixynt.jinx +0 -0
  194. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/wait.jinx +0 -0
  195. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/yap.png +0 -0
  196. {npcsh-1.1.17.dist-info → npcsh-1.1.19.dist-info}/licenses/LICENSE +0 -0
  197. {npcsh-1.1.17.dist-info → npcsh-1.1.19.dist-info}/top_level.txt +0 -0
npcsh/benchmark/npcsh_agent.py CHANGED
@@ -9,7 +9,6 @@ import json
 import os
 import shlex
 from pathlib import Path
-from typing import Optional
 
 from harbor.agents.installed.base import BaseInstalledAgent, ExecInput
 from harbor.models.agent.context import AgentContext
@@ -54,7 +53,28 @@ class NpcshAgent(BaseInstalledAgent):
         Returns:
             List of ExecInput commands to execute
         """
-        escaped_instruction = shlex.quote(instruction)
+        # Wrap the instruction with explicit jinx usage directions and retry logic
+        tool_instruction = f"""You have access to jinxs including edit_file (for writing/creating files), sh (for running shell commands), and python (for running Python code).
+
+IMPORTANT RULES:
+1. You MUST use these jinxs to complete the task. Do NOT just output code as text - use the edit_file jinx to actually write files to disk.
+2. After implementing a solution, you MUST verify it works by running any provided test scripts.
+3. If a test fails or produces an error, you MUST try a DIFFERENT approach. Do not give up.
+4. Keep trying different approaches until you succeed or have tried at least 10 different solutions.
+5. NEVER assume success - always check the actual output of test commands.
+
+Task: {instruction}
+
+WORKFLOW:
+1. Implement your solution using edit_file and sh
+2. Run any test scripts mentioned in the task
+3. Check the output carefully - look for "PASS", "SUCCESS", "OK" or similar
+4. If the test failed, analyze why and try a completely different approach
+5. Repeat until the test passes
+
+Remember: Use edit_file to write code files. Use sh to run commands. VERIFY your solution works before concluding."""
+
+        escaped_instruction = shlex.quote(tool_instruction)
         model_name = self.model_name
 
         if model_name and "/" in model_name:
@@ -82,18 +102,25 @@ class NpcshAgent(BaseInstalledAgent):
         # Build environment variables for API keys
         env_vars = []
         api_key_map = {
-            "anthropic": "ANTHROPIC_API_KEY",
-            "openai": "OPENAI_API_KEY",
-            "gemini": "GOOGLE_API_KEY",
-            "google": "GOOGLE_API_KEY",
-            "deepseek": "DEEPSEEK_API_KEY",
-            "groq": "GROQ_API_KEY",
-            "openrouter": "OPENROUTER_API_KEY",
+            "anthropic": ["ANTHROPIC_API_KEY"],
+            "openai": ["OPENAI_API_KEY"],
+            "gemini": ["GOOGLE_API_KEY", "GEMINI_API_KEY"],
+            "google": ["GOOGLE_API_KEY", "GEMINI_API_KEY"],
+            "deepseek": ["DEEPSEEK_API_KEY"],
+            "groq": ["GROQ_API_KEY"],
+            "openrouter": ["OPENROUTER_API_KEY"],
         }
 
-        for prov, env_key in api_key_map.items():
-            if env_key in os.environ:
-                env_vars.append(f'{env_key}="{os.environ[env_key]}"')
+        added_keys = set()
+        for prov, env_keys in api_key_map.items():
+            for env_key in env_keys:
+                if env_key in os.environ:
+                    # For Gemini, always pass as GOOGLE_API_KEY (what litellm expects)
+                    target_key = "GOOGLE_API_KEY" if env_key == "GEMINI_API_KEY" else env_key
+                    if target_key not in added_keys:
+                        env_vars.append(f'{target_key}="{os.environ[env_key]}"')
+                        added_keys.add(target_key)
+                    break
 
         env_prefix = " ".join(env_vars) + " " if env_vars else ""
 
@@ -105,24 +132,33 @@ class NpcshAgent(BaseInstalledAgent):
 
         # Create output directory
         commands.append(ExecInput(
-            cmd=f"mkdir -p {shlex.quote(output_dir)}",
-            timeout=30
+            command=f"mkdir -p {shlex.quote(output_dir)}",
+            timeout_sec=30
+        ))
+
+        # Create .npcsh_global file to use global team and avoid interactive prompts
+        commands.append(ExecInput(
+            command="touch /app/.npcsh_global",
+            timeout_sec=10
         ))
 
         # Run npcsh with the instruction
+        # Using corca NPC which has edit_file tool for writing files
         # Using the npc CLI which supports single-command execution
+        # NPCSH_DEFAULT_MODE=agent enables automatic tool execution
         npcsh_cmd = (
            f'{env_prefix}'
            f'NPCSH_CHAT_MODEL="{model}" '
            f'NPCSH_CHAT_PROVIDER="{npcsh_provider}" '
            f'NPCSH_STREAM_OUTPUT=0 '
-            f'npc {escaped_instruction} '
+            f'NPCSH_DEFAULT_MODE=agent '
+            f'npc --npc corca {escaped_instruction} '
            f'2>&1 | tee {shlex.quote(output_file)}'
         )
 
         commands.append(ExecInput(
-            cmd=npcsh_cmd,
-            timeout=600, # 10 minute timeout for complex tasks
+            command=npcsh_cmd,
+            timeout_sec=600, # 10 minute timeout for complex tasks
         ))
 
         return commands
@@ -198,7 +234,28 @@ class NpcshAgentWithNpc(NpcshAgent):
 
     def create_run_agent_commands(self, instruction: str) -> list:
         """Create commands using a specific NPC."""
-        escaped_instruction = shlex.quote(instruction)
+        # Wrap the instruction with explicit jinx usage directions and retry logic
+        tool_instruction = f"""You have access to jinxs including edit_file (for writing/creating files), sh (for running shell commands), and python (for running Python code).
+
+IMPORTANT RULES:
+1. You MUST use these jinxs to complete the task. Do NOT just output code as text - use the edit_file jinx to actually write files to disk.
+2. After implementing a solution, you MUST verify it works by running any provided test scripts.
+3. If a test fails or produces an error, you MUST try a DIFFERENT approach. Do not give up.
+4. Keep trying different approaches until you succeed or have tried at least 10 different solutions.
+5. NEVER assume success - always check the actual output of test commands.
+
+Task: {instruction}
+
+WORKFLOW:
+1. Implement your solution using edit_file and sh
+2. Run any test scripts mentioned in the task
+3. Check the output carefully - look for "PASS", "SUCCESS", "OK" or similar
+4. If the test failed, analyze why and try a completely different approach
+5. Repeat until the test passes
+
+Remember: Use edit_file to write code files. Use sh to run commands. VERIFY your solution works before concluding."""
+
+        escaped_instruction = shlex.quote(tool_instruction)
         model_name = self.model_name
 
         if model_name and "/" in model_name:
@@ -240,23 +297,31 @@ class NpcshAgentWithNpc(NpcshAgent):
         commands = []
 
         commands.append(ExecInput(
-            cmd=f"mkdir -p {shlex.quote(output_dir)}",
-            timeout=30
+            command=f"mkdir -p {shlex.quote(output_dir)}",
+            timeout_sec=30
+        ))
+
+        # Create .npcsh_global file to use global team and avoid interactive prompts
+        commands.append(ExecInput(
+            command="touch /app/.npcsh_global",
+            timeout_sec=10
         ))
 
         # Use specific NPC with --npc flag
+        # NPCSH_DEFAULT_MODE=agent enables automatic tool execution
         npcsh_cmd = (
            f'{env_prefix}'
            f'NPCSH_CHAT_MODEL="{model}" '
            f'NPCSH_CHAT_PROVIDER="{npcsh_provider}" '
            f'NPCSH_STREAM_OUTPUT=0 '
+            f'NPCSH_DEFAULT_MODE=agent '
            f'npc --npc {self.npc_name} {escaped_instruction} '
            f'2>&1 | tee {shlex.quote(output_file)}'
         )
 
         commands.append(ExecInput(
-            cmd=npcsh_cmd,
-            timeout=600,
+            command=npcsh_cmd,
+            timeout_sec=600,
         ))
 
         return commands
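The API-key handling in this file is easier to follow in isolation. Below is a minimal standalone sketch of the provider-to-environment-variable resolution introduced above; the mapping and normalization logic are taken from the hunk, while the resolve_env_vars helper name and the __main__ demo are illustrative additions, not part of the package.

import os

# Map each provider to the environment variables it may be keyed by.
# GEMINI_API_KEY is accepted as input but forwarded as GOOGLE_API_KEY,
# which is the variable litellm reads.
API_KEY_MAP = {
    "anthropic": ["ANTHROPIC_API_KEY"],
    "openai": ["OPENAI_API_KEY"],
    "gemini": ["GOOGLE_API_KEY", "GEMINI_API_KEY"],
    "google": ["GOOGLE_API_KEY", "GEMINI_API_KEY"],
    "deepseek": ["DEEPSEEK_API_KEY"],
    "groq": ["GROQ_API_KEY"],
    "openrouter": ["OPENROUTER_API_KEY"],
}

def resolve_env_vars(environ=os.environ):
    """Collect KEY="value" pairs for every provider whose key is set."""
    env_vars = []
    added_keys = set()
    for prov, env_keys in API_KEY_MAP.items():
        for env_key in env_keys:
            if env_key in environ:
                target_key = "GOOGLE_API_KEY" if env_key == "GEMINI_API_KEY" else env_key
                if target_key not in added_keys:
                    env_vars.append(f'{target_key}="{environ[env_key]}"')
                    added_keys.add(target_key)
                break  # first matching key wins for this provider
    return env_vars

if __name__ == "__main__":
    # Only GEMINI_API_KEY is set; it is forwarded as GOOGLE_API_KEY, and the
    # gemini/google aliases do not emit the same key twice thanks to added_keys.
    print(resolve_env_vars({"GEMINI_API_KEY": "abc123"}))  # ['GOOGLE_API_KEY="abc123"']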
npcsh/benchmark/runner.py CHANGED
@@ -5,14 +5,14 @@ Provides a convenient interface for running Terminal-Bench evaluations
 with different models and providers.
 """
 
-import os
 import subprocess
 import sys
+import json
 from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
 from typing import Optional, List, Dict, Any
-import json
+
 
 
 @dataclass
@@ -21,7 +21,7 @@ class BenchmarkConfig:
     model: str = "claude-sonnet-4-20250514"
     provider: str = "anthropic"
     dataset: str = "terminal-bench"
-    dataset_version: str = "2.0"
+    dataset_version: Optional[str] = None # If None, use latest
     n_concurrent: int = 4
     task_ids: Optional[List[str]] = None
     output_dir: Optional[str] = None
@@ -84,33 +84,52 @@ class BenchmarkRunner:
 
     def check_dependencies(self) -> Dict[str, bool]:
         """Check if required dependencies are installed."""
+        import shutil
+
         deps = {
             "harbor": False,
             "terminal-bench": False,
             "docker": False,
         }
 
-        # Check harbor
-        try:
-            result = subprocess.run(
-                ["harbor", "--version"],
-                capture_output=True,
-                text=True
-            )
-            deps["harbor"] = result.returncode == 0
-        except FileNotFoundError:
-            pass
+        # Find binaries in the same Python environment as current interpreter
+        # Use sys.prefix to get the virtualenv/pyenv directory (don't resolve symlinks)
+        bin_dir = Path(sys.prefix) / "bin"
+        if not bin_dir.exists():
+            # Fallback: use executable's directory without resolving
+            bin_dir = Path(sys.executable).parent
 
-        # Check terminal-bench (tb CLI)
-        try:
-            result = subprocess.run(
-                ["tb", "--help"],
-                capture_output=True,
-                text=True
-            )
-            deps["terminal-bench"] = result.returncode == 0
-        except FileNotFoundError:
-            pass
+        # Check harbor - first in current Python's bin dir, then PATH
+        harbor_bin = bin_dir / "harbor"
+        if not harbor_bin.exists():
+            harbor_bin = shutil.which("harbor")
+
+        if harbor_bin:
+            try:
+                result = subprocess.run(
+                    [str(harbor_bin), "--version"],
+                    capture_output=True,
+                    text=True
+                )
+                deps["harbor"] = result.returncode == 0
+            except (FileNotFoundError, OSError):
+                pass
+
+        # Check terminal-bench (tb CLI) - first in current Python's bin dir, then PATH
+        tb_bin = bin_dir / "tb"
+        if not tb_bin.exists():
+            tb_bin = shutil.which("tb")
+
+        if tb_bin:
+            try:
+                result = subprocess.run(
+                    [str(tb_bin), "--help"],
+                    capture_output=True,
+                    text=True
+                )
+                deps["terminal-bench"] = result.returncode == 0
+            except (FileNotFoundError, OSError):
+                pass
 
         # Check docker
         try:
@@ -146,9 +165,10 @@
         model: str = "claude-sonnet-4-20250514",
         provider: str = "anthropic",
         dataset: str = "terminal-bench",
-        dataset_version: str = "2.0",
+        dataset_version: Optional[str] = None,
         n_concurrent: int = 4,
         task_ids: Optional[List[str]] = None,
+        n_tasks: Optional[int] = None,
         npc_name: Optional[str] = None,
         timeout: int = 600,
     ) -> BenchmarkResult:
@@ -159,9 +179,10 @@
             model: Model name (e.g., "claude-sonnet-4-20250514", "gpt-4o")
             provider: Provider name (e.g., "anthropic", "openai", "gemini")
             dataset: Dataset name (default: "terminal-bench")
-            dataset_version: Dataset version (default: "2.0")
+            dataset_version: Dataset version (optional, uses latest if None)
             n_concurrent: Number of concurrent task executions
             task_ids: Optional list of specific task IDs to run
+            n_tasks: Optional limit on number of tasks to run
             npc_name: Optional NPC name to use (e.g., "sibiji", "corca")
             timeout: Per-task timeout in seconds
 
@@ -193,9 +214,22 @@
         else:
             agent_path = "npcsh.benchmark:NpcshAgent"
 
+        # Find harbor in the same Python environment as current interpreter
+        # Use sys.prefix to get the virtualenv/pyenv directory (don't resolve symlinks)
+        import shutil
+        bin_dir = Path(sys.prefix) / "bin"
+        if not bin_dir.exists():
+            bin_dir = Path(sys.executable).parent
+        harbor_bin = str(bin_dir / "harbor")
+        if not Path(harbor_bin).exists():
+            harbor_bin = shutil.which("harbor") or "harbor"
+
+        # Build dataset string (with optional version)
+        dataset_str = f"{dataset}@{dataset_version}" if dataset_version else dataset
+
         cmd = [
-            "harbor", "run",
-            "-d", f"{dataset}@{dataset_version}",
+            harbor_bin, "run",
+            "-d", dataset_str,
             "--agent-import-path", agent_path,
             "-m", full_model,
             "-n", str(n_concurrent),
@@ -203,12 +237,18 @@
         ]
 
         if task_ids:
-            cmd.extend(["--task-ids", ",".join(task_ids)])
+            for task_id in task_ids:
+                cmd.extend(["--task-name", task_id])
+
+        if n_tasks:
+            cmd.extend(["-l", str(n_tasks)])
 
-        print(f"\nRunning Terminal-Bench evaluation:")
+        print("\nRunning Terminal-Bench evaluation:")
         print(f" Model: {full_model}")
-        print(f" Dataset: {dataset}@{dataset_version}")
+        print(f" Dataset: {dataset_str}")
         print(f" Concurrent tasks: {n_concurrent}")
+        if n_tasks:
+            print(f" Max tasks: {n_tasks}")
         print(f" Output: {output_dir}")
         if npc_name:
             print(f" NPC: {npc_name}")
@@ -311,7 +351,7 @@
         self,
         models: List[tuple],
         dataset: str = "terminal-bench",
-        dataset_version: str = "2.0",
+        dataset_version: Optional[str] = None,
         n_concurrent: int = 4,
         task_ids: Optional[List[str]] = None,
     ) -> Dict[str, BenchmarkResult]:
@@ -321,7 +361,7 @@
         Args:
             models: List of (model, provider) tuples
             dataset: Dataset name
-            dataset_version: Dataset version
+            dataset_version: Dataset version (optional)
             n_concurrent: Number of concurrent tasks
             task_ids: Optional specific task IDs
 
@@ -338,9 +378,9 @@
         results = {}
 
         for model, provider in models:
-            print(f"\n{'='*60}")
+            print("\n" + '='*60)
             print(f"Evaluating: {provider}/{model}")
-            print(f"{'='*60}")
+            print('='*60)
 
             result = self.run(
                 model=model,
@@ -365,9 +405,9 @@
 
     def _print_comparison_summary(self, results: Dict[str, BenchmarkResult]) -> None:
         """Print a comparison summary table."""
-        print(f"\n{'='*60}")
+        print("\n" + '='*60)
         print("COMPARISON SUMMARY")
-        print(f"{'='*60}")
+        print('='*60)
         print(f"{'Model':<40} {'Accuracy':>10} {'Tasks':>10}")
         print("-" * 60)
 
@@ -436,20 +476,22 @@ def run_benchmark(
 def quick_test(
     model: str = "claude-sonnet-4-20250514",
     provider: str = "anthropic",
+    n_tasks: int = 3,
 ) -> BenchmarkResult:
     """
     Run a quick test with a few tasks to verify setup.
 
-    This runs only 3 easy tasks to quickly verify that everything is working.
+    This runs only a few tasks to quickly verify that everything is working.
     """
     runner = BenchmarkRunner()
 
-    # Use a small subset of easy tasks for quick testing
+    # Use -l flag to limit number of tasks instead of specifying task names
+    # This avoids issues with task names changing in the dataset
    return runner.run(
        model=model,
        provider=provider,
        n_concurrent=1,
-        task_ids=["ssl-cert", "git-server", "reshard-dataset"], # Example easy tasks
+        n_tasks=n_tasks,
    )
 
 
@@ -484,8 +526,8 @@ Examples:
                         help="Provider name")
     parser.add_argument("--dataset", "-d", default="terminal-bench",
                         help="Dataset name")
-    parser.add_argument("--version", "-v", default="2.0",
-                        help="Dataset version")
+    parser.add_argument("--version", "-v", default=None,
+                        help="Dataset version (optional, uses latest if not specified)")
     parser.add_argument("--concurrent", "-n", type=int, default=4,
                         help="Number of concurrent tasks")
     parser.add_argument("--npc", help="NPC name to use")
@@ -541,7 +583,7 @@ Examples:
             ("gpt-4o", "openai"),
             ("gemini-2.0-flash", "gemini"),
         ]
-        results = runner.compare_models(
+        runner.compare_models(
             models_to_compare,
             n_concurrent=args.concurrent
         )
@@ -560,7 +602,7 @@ Examples:
             n_concurrent=args.concurrent,
             npc_name=args.npc,
         )
-        print(f"\nBenchmark complete!")
+        print("\nBenchmark complete!")
         print(f"Accuracy: {result.accuracy:.1%}")
         print(f"Results saved to: {result.output_dir}")
 
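For orientation, here is a hedged usage sketch of the updated runner API, based only on the signatures visible in this diff (quick_test gains n_tasks, and run accepts n_tasks plus an optional dataset_version); it assumes harbor, tb, and Docker are installed, as check_dependencies() expects.

from npcsh.benchmark.runner import BenchmarkRunner, quick_test

# Smoke test: cap the run at a few tasks via the new n_tasks parameter
# (translated to `harbor run -l 3`) instead of hard-coded task IDs.
result = quick_test(model="claude-sonnet-4-20250514", provider="anthropic", n_tasks=3)
print(f"Accuracy: {result.accuracy:.1%}")

# Full run against the latest dataset: dataset_version=None drops the
# @version suffix from the -d argument passed to harbor.
runner = BenchmarkRunner()
result = runner.run(
    model="gpt-4o",
    provider="openai",
    n_concurrent=4,
    n_tasks=10,
    dataset_version=None,
)
print(f"Results saved to: {result.output_dir}")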
npcsh/benchmark/templates/install-npcsh.sh.j2 ADDED
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Installation script for npcsh in Terminal-Bench containers
+# This template is rendered by Harbor before execution
+
+set -e
+
+echo "Installing npcsh for Terminal-Bench evaluation..."
+
+# Install Python dependencies if needed
+if ! command -v pip &> /dev/null; then
+    echo "Installing pip..."
+    apt-get update && apt-get install -y python3-pip
+fi
+
+# Install npcsh with lite dependencies (API providers only, no local models)
+# Use --break-system-packages for PEP 668 compliance (Ubuntu 24.04+)
+echo "Installing npcsh[lite]..."
+pip install --quiet --break-system-packages npcsh[lite] || pip install --quiet npcsh[lite]
+
+# Verify installation
+echo "Verifying npcsh installation..."
+npc --help > /dev/null 2>&1 || {
+    echo "ERROR: npcsh installation failed"
+    exit 1
+}
+
+# Set up default configuration
+export NPCSH_STREAM_OUTPUT=0
+export NPCSH_LOG_LEVEL=warning
+
+{% if version %}
+echo "npcsh version: {{ version }}"
+{% endif %}
+
+echo "npcsh installation complete!"
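The {% if version %} block marks this file as a Jinja2 template; per its header comment it is rendered by Harbor before execution. As a rough illustration only (Harbor's actual rendering call is not part of this diff), the rendering step amounts to something like:

from pathlib import Path
from jinja2 import Template

# Substitute the optional `version` variable and materialize the shell script.
template_path = Path("npcsh/benchmark/templates/install-npcsh.sh.j2")
script = Template(template_path.read_text()).render(version="1.1.19")
Path("install-npcsh.sh").write_text(script)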
npcsh/build.py CHANGED
@@ -1,4 +1,3 @@
-import os
 import shutil
 import textwrap
 from pathlib import Path
@@ -11,8 +10,7 @@ def build_flask_server(config, **kwargs):
     server_script = output_dir / 'npc_server.py'
 
     server_code = textwrap.dedent(f'''
-    import os
-    from npcpy.serve import start_flask_server
+    from npcpy.serve import start_flask_server
     from npcpy.npc_compiler import Team
     from sqlalchemy import create_engine
 
@@ -111,7 +109,7 @@ def build_docker_compose(config, **kwargs):
     volumes:
       - npc-data:/root/.npcsh
     environment:
-      - NPCSH_DB_PATH=/root/.npcsh/npcsh_history.db
+      - NPCSH_DB_PATH=/root/npcsh_history.db
 
 volumes:
   npc-data:
npcsh/completion.py CHANGED
@@ -2,8 +2,7 @@
 Readline and tab completion for npcsh
 """
 import os
-import shutil
-from typing import List, Any, Optional
+from typing import List, Any
 
 try:
     import readline
@@ -66,11 +65,8 @@ def get_file_completions(text: str) -> List[str]:
     completions = []
 
     if text.startswith("~"):
-        expanded = os.path.expanduser(text)
-        prefix = "~"
-        search_path = expanded
+        search_path = os.path.expanduser(text)
     else:
-        prefix = ""
         search_path = text
 
     # Get directory to search
npcsh/config.py CHANGED
@@ -3,7 +3,6 @@ npcsh configuration management
 """
 import os
 import importlib.metadata
-from typing import Optional, Dict, Any
 
 # Version
 try:
@@ -14,7 +13,6 @@ except importlib.metadata.PackageNotFoundError:
 # Default paths
 DEFAULT_NPC_TEAM_PATH = "~/.npcsh/npc_team"
 PROJECT_NPC_TEAM_PATH = "./npc_team"
-HISTORY_DB_DEFAULT_PATH = "~/.npcsh_history.db"
 READLINE_HISTORY_FILE = os.path.expanduser("~/.npcsh_history")
 
 # Environment defaults
@@ -44,7 +42,8 @@ NPCSH_REASONING_PROVIDER = os.environ.get("NPCSH_REASONING_PROVIDER", "ollama")
 NPCSH_STREAM_OUTPUT = os.environ.get("NPCSH_STREAM_OUTPUT", "0") == "1"
 NPCSH_API_URL = os.environ.get("NPCSH_API_URL", None)
 NPCSH_SEARCH_PROVIDER = os.environ.get("NPCSH_SEARCH_PROVIDER", "duckduckgo")
-NPCSH_BUILD_KG = os.environ.get("NPCSH_BUILD_KG") == "1"
+NPCSH_BUILD_KG = os.environ.get("NPCSH_BUILD_KG", "1") != "0"
+NPCSH_EDIT_APPROVAL = os.environ.get("NPCSH_EDIT_APPROVAL", "off") # off, interactive, auto
 
 
 def get_shell_config_file() -> str:
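Note the semantic flip in NPCSH_BUILD_KG above: knowledge-graph building previously required NPCSH_BUILD_KG=1 and is now enabled unless the variable is explicitly set to "0". A small illustration of the new default (the build_kg_enabled helper is ours, not part of the package):

import os

def build_kg_enabled(environ=os.environ):
    # 1.1.19 behavior: on by default, only an explicit "0" disables it.
    return environ.get("NPCSH_BUILD_KG", "1") != "0"

assert build_kg_enabled({}) is True                        # unset -> enabled
assert build_kg_enabled({"NPCSH_BUILD_KG": "0"}) is False  # explicit opt-out
assert build_kg_enabled({"NPCSH_BUILD_KG": "1"}) is True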