npcsh 1.1.17__py3-none-any.whl → 1.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- npcsh/_state.py +114 -91
- npcsh/alicanto.py +2 -2
- npcsh/benchmark/__init__.py +8 -2
- npcsh/benchmark/npcsh_agent.py +46 -12
- npcsh/benchmark/runner.py +85 -43
- npcsh/benchmark/templates/install-npcsh.sh.j2 +35 -0
- npcsh/build.py +2 -4
- npcsh/completion.py +2 -6
- npcsh/config.py +1 -3
- npcsh/conversation_viewer.py +389 -0
- npcsh/corca.py +0 -1
- npcsh/execution.py +0 -1
- npcsh/guac.py +0 -1
- npcsh/mcp_helpers.py +2 -3
- npcsh/mcp_server.py +5 -10
- npcsh/npc.py +10 -11
- npcsh/npc_team/jinxs/bin/benchmark.jinx +1 -1
- npcsh/npc_team/jinxs/lib/core/search/db_search.jinx +321 -17
- npcsh/npc_team/jinxs/lib/core/search/file_search.jinx +312 -67
- npcsh/npc_team/jinxs/lib/core/search/kg_search.jinx +366 -44
- npcsh/npc_team/jinxs/lib/core/search/mem_review.jinx +73 -0
- npcsh/npc_team/jinxs/lib/core/search/mem_search.jinx +328 -20
- npcsh/npc_team/jinxs/lib/core/search/web_search.jinx +242 -10
- npcsh/npc_team/jinxs/lib/core/sleep.jinx +22 -11
- npcsh/npc_team/jinxs/lib/core/sql.jinx +10 -6
- npcsh/npc_team/jinxs/lib/research/paper_search.jinx +387 -76
- npcsh/npc_team/jinxs/lib/research/semantic_scholar.jinx +372 -55
- npcsh/npc_team/jinxs/lib/utils/jinxs.jinx +299 -144
- npcsh/npc_team/jinxs/modes/alicanto.jinx +356 -0
- npcsh/npc_team/jinxs/modes/arxiv.jinx +720 -0
- npcsh/npc_team/jinxs/modes/corca.jinx +430 -0
- npcsh/npc_team/jinxs/modes/guac.jinx +544 -0
- npcsh/npc_team/jinxs/modes/plonk.jinx +379 -0
- npcsh/npc_team/jinxs/modes/pti.jinx +357 -0
- npcsh/npc_team/jinxs/modes/reattach.jinx +291 -0
- npcsh/npc_team/jinxs/modes/spool.jinx +350 -0
- npcsh/npc_team/jinxs/modes/wander.jinx +455 -0
- npcsh/npc_team/jinxs/{bin → modes}/yap.jinx +13 -7
- npcsh/npcsh.py +7 -4
- npcsh/plonk.py +0 -1
- npcsh/pti.py +0 -1
- npcsh/routes.py +1 -3
- npcsh/spool.py +0 -1
- npcsh/ui.py +0 -1
- npcsh/wander.py +0 -1
- npcsh/yap.py +0 -1
- npcsh-1.1.18.data/data/npcsh/npc_team/alicanto.jinx +356 -0
- npcsh-1.1.18.data/data/npcsh/npc_team/arxiv.jinx +720 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/benchmark.jinx +1 -1
- npcsh-1.1.18.data/data/npcsh/npc_team/corca.jinx +430 -0
- npcsh-1.1.18.data/data/npcsh/npc_team/db_search.jinx +348 -0
- npcsh-1.1.18.data/data/npcsh/npc_team/file_search.jinx +339 -0
- npcsh-1.1.18.data/data/npcsh/npc_team/guac.jinx +544 -0
- npcsh-1.1.18.data/data/npcsh/npc_team/jinxs.jinx +331 -0
- npcsh-1.1.18.data/data/npcsh/npc_team/kg_search.jinx +418 -0
- npcsh-1.1.18.data/data/npcsh/npc_team/mem_review.jinx +73 -0
- npcsh-1.1.18.data/data/npcsh/npc_team/mem_search.jinx +388 -0
- npcsh-1.1.18.data/data/npcsh/npc_team/paper_search.jinx +412 -0
- npcsh-1.1.18.data/data/npcsh/npc_team/plonk.jinx +379 -0
- npcsh-1.1.18.data/data/npcsh/npc_team/pti.jinx +357 -0
- npcsh-1.1.18.data/data/npcsh/npc_team/reattach.jinx +291 -0
- npcsh-1.1.18.data/data/npcsh/npc_team/semantic_scholar.jinx +386 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/sleep.jinx +22 -11
- npcsh-1.1.18.data/data/npcsh/npc_team/spool.jinx +350 -0
- npcsh-1.1.18.data/data/npcsh/npc_team/sql.jinx +20 -0
- npcsh-1.1.18.data/data/npcsh/npc_team/wander.jinx +455 -0
- npcsh-1.1.18.data/data/npcsh/npc_team/web_search.jinx +283 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/yap.jinx +13 -7
- {npcsh-1.1.17.dist-info → npcsh-1.1.18.dist-info}/METADATA +90 -1
- npcsh-1.1.18.dist-info/RECORD +235 -0
- {npcsh-1.1.17.dist-info → npcsh-1.1.18.dist-info}/WHEEL +1 -1
- {npcsh-1.1.17.dist-info → npcsh-1.1.18.dist-info}/entry_points.txt +0 -3
- npcsh/npc_team/jinxs/bin/spool.jinx +0 -161
- npcsh/npc_team/jinxs/bin/wander.jinx +0 -242
- npcsh/npc_team/jinxs/lib/research/arxiv.jinx +0 -76
- npcsh-1.1.17.data/data/npcsh/npc_team/arxiv.jinx +0 -76
- npcsh-1.1.17.data/data/npcsh/npc_team/db_search.jinx +0 -44
- npcsh-1.1.17.data/data/npcsh/npc_team/file_search.jinx +0 -94
- npcsh-1.1.17.data/data/npcsh/npc_team/jinxs.jinx +0 -176
- npcsh-1.1.17.data/data/npcsh/npc_team/kg_search.jinx +0 -96
- npcsh-1.1.17.data/data/npcsh/npc_team/mem_search.jinx +0 -80
- npcsh-1.1.17.data/data/npcsh/npc_team/paper_search.jinx +0 -101
- npcsh-1.1.17.data/data/npcsh/npc_team/semantic_scholar.jinx +0 -69
- npcsh-1.1.17.data/data/npcsh/npc_team/spool.jinx +0 -161
- npcsh-1.1.17.data/data/npcsh/npc_team/sql.jinx +0 -16
- npcsh-1.1.17.data/data/npcsh/npc_team/wander.jinx +0 -242
- npcsh-1.1.17.data/data/npcsh/npc_team/web_search.jinx +0 -51
- npcsh-1.1.17.dist-info/RECORD +0 -219
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/add_tab.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/alicanto.npc +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/alicanto.png +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/browser_action.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/browser_screenshot.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/build.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/chat.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/click.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/close_browser.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/close_pane.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/close_tab.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/cmd.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/compile.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/compress.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/confirm.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/convene.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/corca.npc +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/corca.png +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/corca_example.png +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/delegate.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/edit_file.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/focus_pane.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/frederic.npc +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/frederic4.png +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/guac.npc +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/guac.png +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/help.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/incognide.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/init.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/kadiefa.npc +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/kadiefa.png +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/key_press.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/launch_app.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/list_panes.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/load_file.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/navigate.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/notify.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/npcsh.ctx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/npcsh_sibiji.png +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/nql.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/open_browser.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/open_pane.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/ots.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/paste.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/plonk.npc +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/plonk.png +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/plonkjr.npc +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/plonkjr.png +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/python.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/read_pane.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/roll.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/run_terminal.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/sample.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/screenshot.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/search.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/send_message.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/serve.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/set.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/sh.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/shh.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/sibiji.npc +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/sibiji.png +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/split_pane.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/spool.png +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/switch.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/switch_npc.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/switch_tab.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/switches.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/sync.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/teamviz.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/trigger.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/type_text.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/usage.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/verbose.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/vixynt.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/wait.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/write_file.jinx +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/yap.png +0 -0
- {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/zen_mode.jinx +0 -0
- {npcsh-1.1.17.dist-info → npcsh-1.1.18.dist-info}/licenses/LICENSE +0 -0
- {npcsh-1.1.17.dist-info → npcsh-1.1.18.dist-info}/top_level.txt +0 -0
npcsh/benchmark/runner.py
CHANGED
|
@@ -5,14 +5,14 @@ Provides a convenient interface for running Terminal-Bench evaluations
|
|
|
5
5
|
with different models and providers.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
-
import os
|
|
9
8
|
import subprocess
|
|
10
9
|
import sys
|
|
10
|
+
import json
|
|
11
11
|
from dataclasses import dataclass, field
|
|
12
12
|
from datetime import datetime
|
|
13
13
|
from pathlib import Path
|
|
14
14
|
from typing import Optional, List, Dict, Any
|
|
15
|
-
|
|
15
|
+
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
@dataclass
|
|
@@ -21,7 +21,7 @@ class BenchmarkConfig:
|
|
|
21
21
|
model: str = "claude-sonnet-4-20250514"
|
|
22
22
|
provider: str = "anthropic"
|
|
23
23
|
dataset: str = "terminal-bench"
|
|
24
|
-
dataset_version: str =
|
|
24
|
+
dataset_version: Optional[str] = None # If None, use latest
|
|
25
25
|
n_concurrent: int = 4
|
|
26
26
|
task_ids: Optional[List[str]] = None
|
|
27
27
|
output_dir: Optional[str] = None
|
|
@@ -84,33 +84,52 @@ class BenchmarkRunner:
|
|
|
84
84
|
|
|
85
85
|
def check_dependencies(self) -> Dict[str, bool]:
|
|
86
86
|
"""Check if required dependencies are installed."""
|
|
87
|
+
import shutil
|
|
88
|
+
|
|
87
89
|
deps = {
|
|
88
90
|
"harbor": False,
|
|
89
91
|
"terminal-bench": False,
|
|
90
92
|
"docker": False,
|
|
91
93
|
}
|
|
92
94
|
|
|
93
|
-
#
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
)
|
|
100
|
-
deps["harbor"] = result.returncode == 0
|
|
101
|
-
except FileNotFoundError:
|
|
102
|
-
pass
|
|
95
|
+
# Find binaries in the same Python environment as current interpreter
|
|
96
|
+
# Use sys.prefix to get the virtualenv/pyenv directory (don't resolve symlinks)
|
|
97
|
+
bin_dir = Path(sys.prefix) / "bin"
|
|
98
|
+
if not bin_dir.exists():
|
|
99
|
+
# Fallback: use executable's directory without resolving
|
|
100
|
+
bin_dir = Path(sys.executable).parent
|
|
103
101
|
|
|
104
|
-
# Check
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
102
|
+
# Check harbor - first in current Python's bin dir, then PATH
|
|
103
|
+
harbor_bin = bin_dir / "harbor"
|
|
104
|
+
if not harbor_bin.exists():
|
|
105
|
+
harbor_bin = shutil.which("harbor")
|
|
106
|
+
|
|
107
|
+
if harbor_bin:
|
|
108
|
+
try:
|
|
109
|
+
result = subprocess.run(
|
|
110
|
+
[str(harbor_bin), "--version"],
|
|
111
|
+
capture_output=True,
|
|
112
|
+
text=True
|
|
113
|
+
)
|
|
114
|
+
deps["harbor"] = result.returncode == 0
|
|
115
|
+
except (FileNotFoundError, OSError):
|
|
116
|
+
pass
|
|
117
|
+
|
|
118
|
+
# Check terminal-bench (tb CLI) - first in current Python's bin dir, then PATH
|
|
119
|
+
tb_bin = bin_dir / "tb"
|
|
120
|
+
if not tb_bin.exists():
|
|
121
|
+
tb_bin = shutil.which("tb")
|
|
122
|
+
|
|
123
|
+
if tb_bin:
|
|
124
|
+
try:
|
|
125
|
+
result = subprocess.run(
|
|
126
|
+
[str(tb_bin), "--help"],
|
|
127
|
+
capture_output=True,
|
|
128
|
+
text=True
|
|
129
|
+
)
|
|
130
|
+
deps["terminal-bench"] = result.returncode == 0
|
|
131
|
+
except (FileNotFoundError, OSError):
|
|
132
|
+
pass
|
|
114
133
|
|
|
115
134
|
# Check docker
|
|
116
135
|
try:
|
|
@@ -146,9 +165,10 @@ class BenchmarkRunner:
|
|
|
146
165
|
model: str = "claude-sonnet-4-20250514",
|
|
147
166
|
provider: str = "anthropic",
|
|
148
167
|
dataset: str = "terminal-bench",
|
|
149
|
-
dataset_version: str =
|
|
168
|
+
dataset_version: Optional[str] = None,
|
|
150
169
|
n_concurrent: int = 4,
|
|
151
170
|
task_ids: Optional[List[str]] = None,
|
|
171
|
+
n_tasks: Optional[int] = None,
|
|
152
172
|
npc_name: Optional[str] = None,
|
|
153
173
|
timeout: int = 600,
|
|
154
174
|
) -> BenchmarkResult:
|
|
@@ -159,9 +179,10 @@ class BenchmarkRunner:
|
|
|
159
179
|
model: Model name (e.g., "claude-sonnet-4-20250514", "gpt-4o")
|
|
160
180
|
provider: Provider name (e.g., "anthropic", "openai", "gemini")
|
|
161
181
|
dataset: Dataset name (default: "terminal-bench")
|
|
162
|
-
dataset_version: Dataset version (
|
|
182
|
+
dataset_version: Dataset version (optional, uses latest if None)
|
|
163
183
|
n_concurrent: Number of concurrent task executions
|
|
164
184
|
task_ids: Optional list of specific task IDs to run
|
|
185
|
+
n_tasks: Optional limit on number of tasks to run
|
|
165
186
|
npc_name: Optional NPC name to use (e.g., "sibiji", "corca")
|
|
166
187
|
timeout: Per-task timeout in seconds
|
|
167
188
|
|
|
@@ -193,9 +214,22 @@ class BenchmarkRunner:
|
|
|
193
214
|
else:
|
|
194
215
|
agent_path = "npcsh.benchmark:NpcshAgent"
|
|
195
216
|
|
|
217
|
+
# Find harbor in the same Python environment as current interpreter
|
|
218
|
+
# Use sys.prefix to get the virtualenv/pyenv directory (don't resolve symlinks)
|
|
219
|
+
import shutil
|
|
220
|
+
bin_dir = Path(sys.prefix) / "bin"
|
|
221
|
+
if not bin_dir.exists():
|
|
222
|
+
bin_dir = Path(sys.executable).parent
|
|
223
|
+
harbor_bin = str(bin_dir / "harbor")
|
|
224
|
+
if not Path(harbor_bin).exists():
|
|
225
|
+
harbor_bin = shutil.which("harbor") or "harbor"
|
|
226
|
+
|
|
227
|
+
# Build dataset string (with optional version)
|
|
228
|
+
dataset_str = f"{dataset}@{dataset_version}" if dataset_version else dataset
|
|
229
|
+
|
|
196
230
|
cmd = [
|
|
197
|
-
|
|
198
|
-
"-d",
|
|
231
|
+
harbor_bin, "run",
|
|
232
|
+
"-d", dataset_str,
|
|
199
233
|
"--agent-import-path", agent_path,
|
|
200
234
|
"-m", full_model,
|
|
201
235
|
"-n", str(n_concurrent),
|
|
@@ -203,12 +237,18 @@ class BenchmarkRunner:
|
|
|
203
237
|
]
|
|
204
238
|
|
|
205
239
|
if task_ids:
|
|
206
|
-
|
|
240
|
+
for task_id in task_ids:
|
|
241
|
+
cmd.extend(["--task-name", task_id])
|
|
242
|
+
|
|
243
|
+
if n_tasks:
|
|
244
|
+
cmd.extend(["-l", str(n_tasks)])
|
|
207
245
|
|
|
208
|
-
print(
|
|
246
|
+
print("\nRunning Terminal-Bench evaluation:")
|
|
209
247
|
print(f" Model: {full_model}")
|
|
210
|
-
print(f" Dataset: {
|
|
248
|
+
print(f" Dataset: {dataset_str}")
|
|
211
249
|
print(f" Concurrent tasks: {n_concurrent}")
|
|
250
|
+
if n_tasks:
|
|
251
|
+
print(f" Max tasks: {n_tasks}")
|
|
212
252
|
print(f" Output: {output_dir}")
|
|
213
253
|
if npc_name:
|
|
214
254
|
print(f" NPC: {npc_name}")
|
|
@@ -311,7 +351,7 @@ class BenchmarkRunner:
|
|
|
311
351
|
self,
|
|
312
352
|
models: List[tuple],
|
|
313
353
|
dataset: str = "terminal-bench",
|
|
314
|
-
dataset_version: str =
|
|
354
|
+
dataset_version: Optional[str] = None,
|
|
315
355
|
n_concurrent: int = 4,
|
|
316
356
|
task_ids: Optional[List[str]] = None,
|
|
317
357
|
) -> Dict[str, BenchmarkResult]:
|
|
@@ -321,7 +361,7 @@ class BenchmarkRunner:
|
|
|
321
361
|
Args:
|
|
322
362
|
models: List of (model, provider) tuples
|
|
323
363
|
dataset: Dataset name
|
|
324
|
-
dataset_version: Dataset version
|
|
364
|
+
dataset_version: Dataset version (optional)
|
|
325
365
|
n_concurrent: Number of concurrent tasks
|
|
326
366
|
task_ids: Optional specific task IDs
|
|
327
367
|
|
|
@@ -338,9 +378,9 @@ class BenchmarkRunner:
|
|
|
338
378
|
results = {}
|
|
339
379
|
|
|
340
380
|
for model, provider in models:
|
|
341
|
-
print(
|
|
381
|
+
print("\n" + '='*60)
|
|
342
382
|
print(f"Evaluating: {provider}/{model}")
|
|
343
|
-
print(
|
|
383
|
+
print('='*60)
|
|
344
384
|
|
|
345
385
|
result = self.run(
|
|
346
386
|
model=model,
|
|
@@ -365,9 +405,9 @@ class BenchmarkRunner:
|
|
|
365
405
|
|
|
366
406
|
def _print_comparison_summary(self, results: Dict[str, BenchmarkResult]) -> None:
|
|
367
407
|
"""Print a comparison summary table."""
|
|
368
|
-
print(
|
|
408
|
+
print("\n" + '='*60)
|
|
369
409
|
print("COMPARISON SUMMARY")
|
|
370
|
-
print(
|
|
410
|
+
print('='*60)
|
|
371
411
|
print(f"{'Model':<40} {'Accuracy':>10} {'Tasks':>10}")
|
|
372
412
|
print("-" * 60)
|
|
373
413
|
|
|
@@ -436,20 +476,22 @@ def run_benchmark(
|
|
|
436
476
|
def quick_test(
|
|
437
477
|
model: str = "claude-sonnet-4-20250514",
|
|
438
478
|
provider: str = "anthropic",
|
|
479
|
+
n_tasks: int = 3,
|
|
439
480
|
) -> BenchmarkResult:
|
|
440
481
|
"""
|
|
441
482
|
Run a quick test with a few tasks to verify setup.
|
|
442
483
|
|
|
443
|
-
This runs only
|
|
484
|
+
This runs only a few tasks to quickly verify that everything is working.
|
|
444
485
|
"""
|
|
445
486
|
runner = BenchmarkRunner()
|
|
446
487
|
|
|
447
|
-
# Use
|
|
488
|
+
# Use -l flag to limit number of tasks instead of specifying task names
|
|
489
|
+
# This avoids issues with task names changing in the dataset
|
|
448
490
|
return runner.run(
|
|
449
491
|
model=model,
|
|
450
492
|
provider=provider,
|
|
451
493
|
n_concurrent=1,
|
|
452
|
-
|
|
494
|
+
n_tasks=n_tasks,
|
|
453
495
|
)
|
|
454
496
|
|
|
455
497
|
|
|
@@ -484,8 +526,8 @@ Examples:
|
|
|
484
526
|
help="Provider name")
|
|
485
527
|
parser.add_argument("--dataset", "-d", default="terminal-bench",
|
|
486
528
|
help="Dataset name")
|
|
487
|
-
parser.add_argument("--version", "-v", default=
|
|
488
|
-
help="Dataset version")
|
|
529
|
+
parser.add_argument("--version", "-v", default=None,
|
|
530
|
+
help="Dataset version (optional, uses latest if not specified)")
|
|
489
531
|
parser.add_argument("--concurrent", "-n", type=int, default=4,
|
|
490
532
|
help="Number of concurrent tasks")
|
|
491
533
|
parser.add_argument("--npc", help="NPC name to use")
|
|
@@ -541,7 +583,7 @@ Examples:
|
|
|
541
583
|
("gpt-4o", "openai"),
|
|
542
584
|
("gemini-2.0-flash", "gemini"),
|
|
543
585
|
]
|
|
544
|
-
|
|
586
|
+
runner.compare_models(
|
|
545
587
|
models_to_compare,
|
|
546
588
|
n_concurrent=args.concurrent
|
|
547
589
|
)
|
|
@@ -560,7 +602,7 @@ Examples:
|
|
|
560
602
|
n_concurrent=args.concurrent,
|
|
561
603
|
npc_name=args.npc,
|
|
562
604
|
)
|
|
563
|
-
print(
|
|
605
|
+
print("\nBenchmark complete!")
|
|
564
606
|
print(f"Accuracy: {result.accuracy:.1%}")
|
|
565
607
|
print(f"Results saved to: {result.output_dir}")
|
|
566
608
|
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# Installation script for npcsh in Terminal-Bench containers
|
|
3
|
+
# This template is rendered by Harbor before execution
|
|
4
|
+
|
|
5
|
+
set -e
|
|
6
|
+
|
|
7
|
+
echo "Installing npcsh for Terminal-Bench evaluation..."
|
|
8
|
+
|
|
9
|
+
# Install Python dependencies if needed
|
|
10
|
+
if ! command -v pip &> /dev/null; then
|
|
11
|
+
echo "Installing pip..."
|
|
12
|
+
apt-get update && apt-get install -y python3-pip
|
|
13
|
+
fi
|
|
14
|
+
|
|
15
|
+
# Install npcsh with lite dependencies (API providers only, no local models)
|
|
16
|
+
# Use --break-system-packages for PEP 668 compliance (Ubuntu 24.04+)
|
|
17
|
+
echo "Installing npcsh[lite]..."
|
|
18
|
+
pip install --quiet --break-system-packages npcsh[lite] || pip install --quiet npcsh[lite]
|
|
19
|
+
|
|
20
|
+
# Verify installation
|
|
21
|
+
echo "Verifying npcsh installation..."
|
|
22
|
+
npc --help > /dev/null 2>&1 || {
|
|
23
|
+
echo "ERROR: npcsh installation failed"
|
|
24
|
+
exit 1
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
# Set up default configuration
|
|
28
|
+
export NPCSH_STREAM_OUTPUT=0
|
|
29
|
+
export NPCSH_LOG_LEVEL=warning
|
|
30
|
+
|
|
31
|
+
{% if version %}
|
|
32
|
+
echo "npcsh version: {{ version }}"
|
|
33
|
+
{% endif %}
|
|
34
|
+
|
|
35
|
+
echo "npcsh installation complete!"
|
npcsh/build.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import os
|
|
2
1
|
import shutil
|
|
3
2
|
import textwrap
|
|
4
3
|
from pathlib import Path
|
|
@@ -11,8 +10,7 @@ def build_flask_server(config, **kwargs):
|
|
|
11
10
|
server_script = output_dir / 'npc_server.py'
|
|
12
11
|
|
|
13
12
|
server_code = textwrap.dedent(f'''
|
|
14
|
-
|
|
15
|
-
from npcpy.serve import start_flask_server
|
|
13
|
+
from npcpy.serve import start_flask_server
|
|
16
14
|
from npcpy.npc_compiler import Team
|
|
17
15
|
from sqlalchemy import create_engine
|
|
18
16
|
|
|
@@ -111,7 +109,7 @@ def build_docker_compose(config, **kwargs):
|
|
|
111
109
|
volumes:
|
|
112
110
|
- npc-data:/root/.npcsh
|
|
113
111
|
environment:
|
|
114
|
-
- NPCSH_DB_PATH=/root
|
|
112
|
+
- NPCSH_DB_PATH=/root/npcsh_history.db
|
|
115
113
|
|
|
116
114
|
volumes:
|
|
117
115
|
npc-data:
|
npcsh/completion.py
CHANGED
|
@@ -2,8 +2,7 @@
|
|
|
2
2
|
Readline and tab completion for npcsh
|
|
3
3
|
"""
|
|
4
4
|
import os
|
|
5
|
-
import
|
|
6
|
-
from typing import List, Any, Optional
|
|
5
|
+
from typing import List, Any
|
|
7
6
|
|
|
8
7
|
try:
|
|
9
8
|
import readline
|
|
@@ -66,11 +65,8 @@ def get_file_completions(text: str) -> List[str]:
|
|
|
66
65
|
completions = []
|
|
67
66
|
|
|
68
67
|
if text.startswith("~"):
|
|
69
|
-
|
|
70
|
-
prefix = "~"
|
|
71
|
-
search_path = expanded
|
|
68
|
+
search_path = os.path.expanduser(text)
|
|
72
69
|
else:
|
|
73
|
-
prefix = ""
|
|
74
70
|
search_path = text
|
|
75
71
|
|
|
76
72
|
# Get directory to search
|
npcsh/config.py
CHANGED
|
@@ -3,7 +3,6 @@ npcsh configuration management
|
|
|
3
3
|
"""
|
|
4
4
|
import os
|
|
5
5
|
import importlib.metadata
|
|
6
|
-
from typing import Optional, Dict, Any
|
|
7
6
|
|
|
8
7
|
# Version
|
|
9
8
|
try:
|
|
@@ -14,7 +13,6 @@ except importlib.metadata.PackageNotFoundError:
|
|
|
14
13
|
# Default paths
|
|
15
14
|
DEFAULT_NPC_TEAM_PATH = "~/.npcsh/npc_team"
|
|
16
15
|
PROJECT_NPC_TEAM_PATH = "./npc_team"
|
|
17
|
-
HISTORY_DB_DEFAULT_PATH = "~/.npcsh_history.db"
|
|
18
16
|
READLINE_HISTORY_FILE = os.path.expanduser("~/.npcsh_history")
|
|
19
17
|
|
|
20
18
|
# Environment defaults
|
|
@@ -44,7 +42,7 @@ NPCSH_REASONING_PROVIDER = os.environ.get("NPCSH_REASONING_PROVIDER", "ollama")
|
|
|
44
42
|
NPCSH_STREAM_OUTPUT = os.environ.get("NPCSH_STREAM_OUTPUT", "0") == "1"
|
|
45
43
|
NPCSH_API_URL = os.environ.get("NPCSH_API_URL", None)
|
|
46
44
|
NPCSH_SEARCH_PROVIDER = os.environ.get("NPCSH_SEARCH_PROVIDER", "duckduckgo")
|
|
47
|
-
NPCSH_BUILD_KG = os.environ.get("NPCSH_BUILD_KG")
|
|
45
|
+
NPCSH_BUILD_KG = os.environ.get("NPCSH_BUILD_KG", "1") != "0"
|
|
48
46
|
|
|
49
47
|
|
|
50
48
|
def get_shell_config_file() -> str:
|