npcsh 1.1.17__py3-none-any.whl → 1.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. npcsh/_state.py +114 -91
  2. npcsh/alicanto.py +2 -2
  3. npcsh/benchmark/__init__.py +8 -2
  4. npcsh/benchmark/npcsh_agent.py +46 -12
  5. npcsh/benchmark/runner.py +85 -43
  6. npcsh/benchmark/templates/install-npcsh.sh.j2 +35 -0
  7. npcsh/build.py +2 -4
  8. npcsh/completion.py +2 -6
  9. npcsh/config.py +1 -3
  10. npcsh/conversation_viewer.py +389 -0
  11. npcsh/corca.py +0 -1
  12. npcsh/execution.py +0 -1
  13. npcsh/guac.py +0 -1
  14. npcsh/mcp_helpers.py +2 -3
  15. npcsh/mcp_server.py +5 -10
  16. npcsh/npc.py +10 -11
  17. npcsh/npc_team/jinxs/bin/benchmark.jinx +1 -1
  18. npcsh/npc_team/jinxs/lib/core/search/db_search.jinx +321 -17
  19. npcsh/npc_team/jinxs/lib/core/search/file_search.jinx +312 -67
  20. npcsh/npc_team/jinxs/lib/core/search/kg_search.jinx +366 -44
  21. npcsh/npc_team/jinxs/lib/core/search/mem_review.jinx +73 -0
  22. npcsh/npc_team/jinxs/lib/core/search/mem_search.jinx +328 -20
  23. npcsh/npc_team/jinxs/lib/core/search/web_search.jinx +242 -10
  24. npcsh/npc_team/jinxs/lib/core/sleep.jinx +22 -11
  25. npcsh/npc_team/jinxs/lib/core/sql.jinx +10 -6
  26. npcsh/npc_team/jinxs/lib/research/paper_search.jinx +387 -76
  27. npcsh/npc_team/jinxs/lib/research/semantic_scholar.jinx +372 -55
  28. npcsh/npc_team/jinxs/lib/utils/jinxs.jinx +299 -144
  29. npcsh/npc_team/jinxs/modes/alicanto.jinx +356 -0
  30. npcsh/npc_team/jinxs/modes/arxiv.jinx +720 -0
  31. npcsh/npc_team/jinxs/modes/corca.jinx +430 -0
  32. npcsh/npc_team/jinxs/modes/guac.jinx +544 -0
  33. npcsh/npc_team/jinxs/modes/plonk.jinx +379 -0
  34. npcsh/npc_team/jinxs/modes/pti.jinx +357 -0
  35. npcsh/npc_team/jinxs/modes/reattach.jinx +291 -0
  36. npcsh/npc_team/jinxs/modes/spool.jinx +350 -0
  37. npcsh/npc_team/jinxs/modes/wander.jinx +455 -0
  38. npcsh/npc_team/jinxs/{bin → modes}/yap.jinx +13 -7
  39. npcsh/npcsh.py +7 -4
  40. npcsh/plonk.py +0 -1
  41. npcsh/pti.py +0 -1
  42. npcsh/routes.py +1 -3
  43. npcsh/spool.py +0 -1
  44. npcsh/ui.py +0 -1
  45. npcsh/wander.py +0 -1
  46. npcsh/yap.py +0 -1
  47. npcsh-1.1.18.data/data/npcsh/npc_team/alicanto.jinx +356 -0
  48. npcsh-1.1.18.data/data/npcsh/npc_team/arxiv.jinx +720 -0
  49. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/benchmark.jinx +1 -1
  50. npcsh-1.1.18.data/data/npcsh/npc_team/corca.jinx +430 -0
  51. npcsh-1.1.18.data/data/npcsh/npc_team/db_search.jinx +348 -0
  52. npcsh-1.1.18.data/data/npcsh/npc_team/file_search.jinx +339 -0
  53. npcsh-1.1.18.data/data/npcsh/npc_team/guac.jinx +544 -0
  54. npcsh-1.1.18.data/data/npcsh/npc_team/jinxs.jinx +331 -0
  55. npcsh-1.1.18.data/data/npcsh/npc_team/kg_search.jinx +418 -0
  56. npcsh-1.1.18.data/data/npcsh/npc_team/mem_review.jinx +73 -0
  57. npcsh-1.1.18.data/data/npcsh/npc_team/mem_search.jinx +388 -0
  58. npcsh-1.1.18.data/data/npcsh/npc_team/paper_search.jinx +412 -0
  59. npcsh-1.1.18.data/data/npcsh/npc_team/plonk.jinx +379 -0
  60. npcsh-1.1.18.data/data/npcsh/npc_team/pti.jinx +357 -0
  61. npcsh-1.1.18.data/data/npcsh/npc_team/reattach.jinx +291 -0
  62. npcsh-1.1.18.data/data/npcsh/npc_team/semantic_scholar.jinx +386 -0
  63. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/sleep.jinx +22 -11
  64. npcsh-1.1.18.data/data/npcsh/npc_team/spool.jinx +350 -0
  65. npcsh-1.1.18.data/data/npcsh/npc_team/sql.jinx +20 -0
  66. npcsh-1.1.18.data/data/npcsh/npc_team/wander.jinx +455 -0
  67. npcsh-1.1.18.data/data/npcsh/npc_team/web_search.jinx +283 -0
  68. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/yap.jinx +13 -7
  69. {npcsh-1.1.17.dist-info → npcsh-1.1.18.dist-info}/METADATA +90 -1
  70. npcsh-1.1.18.dist-info/RECORD +235 -0
  71. {npcsh-1.1.17.dist-info → npcsh-1.1.18.dist-info}/WHEEL +1 -1
  72. {npcsh-1.1.17.dist-info → npcsh-1.1.18.dist-info}/entry_points.txt +0 -3
  73. npcsh/npc_team/jinxs/bin/spool.jinx +0 -161
  74. npcsh/npc_team/jinxs/bin/wander.jinx +0 -242
  75. npcsh/npc_team/jinxs/lib/research/arxiv.jinx +0 -76
  76. npcsh-1.1.17.data/data/npcsh/npc_team/arxiv.jinx +0 -76
  77. npcsh-1.1.17.data/data/npcsh/npc_team/db_search.jinx +0 -44
  78. npcsh-1.1.17.data/data/npcsh/npc_team/file_search.jinx +0 -94
  79. npcsh-1.1.17.data/data/npcsh/npc_team/jinxs.jinx +0 -176
  80. npcsh-1.1.17.data/data/npcsh/npc_team/kg_search.jinx +0 -96
  81. npcsh-1.1.17.data/data/npcsh/npc_team/mem_search.jinx +0 -80
  82. npcsh-1.1.17.data/data/npcsh/npc_team/paper_search.jinx +0 -101
  83. npcsh-1.1.17.data/data/npcsh/npc_team/semantic_scholar.jinx +0 -69
  84. npcsh-1.1.17.data/data/npcsh/npc_team/spool.jinx +0 -161
  85. npcsh-1.1.17.data/data/npcsh/npc_team/sql.jinx +0 -16
  86. npcsh-1.1.17.data/data/npcsh/npc_team/wander.jinx +0 -242
  87. npcsh-1.1.17.data/data/npcsh/npc_team/web_search.jinx +0 -51
  88. npcsh-1.1.17.dist-info/RECORD +0 -219
  89. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/add_tab.jinx +0 -0
  90. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/alicanto.npc +0 -0
  91. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/alicanto.png +0 -0
  92. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/browser_action.jinx +0 -0
  93. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/browser_screenshot.jinx +0 -0
  94. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/build.jinx +0 -0
  95. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/chat.jinx +0 -0
  96. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/click.jinx +0 -0
  97. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/close_browser.jinx +0 -0
  98. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/close_pane.jinx +0 -0
  99. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/close_tab.jinx +0 -0
  100. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/cmd.jinx +0 -0
  101. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/compile.jinx +0 -0
  102. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/compress.jinx +0 -0
  103. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/confirm.jinx +0 -0
  104. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/convene.jinx +0 -0
  105. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/corca.npc +0 -0
  106. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/corca.png +0 -0
  107. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/corca_example.png +0 -0
  108. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/delegate.jinx +0 -0
  109. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/edit_file.jinx +0 -0
  110. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/focus_pane.jinx +0 -0
  111. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/frederic.npc +0 -0
  112. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/frederic4.png +0 -0
  113. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/guac.npc +0 -0
  114. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/guac.png +0 -0
  115. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/help.jinx +0 -0
  116. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/incognide.jinx +0 -0
  117. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/init.jinx +0 -0
  118. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/kadiefa.npc +0 -0
  119. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/kadiefa.png +0 -0
  120. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/key_press.jinx +0 -0
  121. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/launch_app.jinx +0 -0
  122. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/list_panes.jinx +0 -0
  123. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/load_file.jinx +0 -0
  124. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/navigate.jinx +0 -0
  125. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/notify.jinx +0 -0
  126. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/npcsh.ctx +0 -0
  127. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/npcsh_sibiji.png +0 -0
  128. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/nql.jinx +0 -0
  129. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/open_browser.jinx +0 -0
  130. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/open_pane.jinx +0 -0
  131. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/ots.jinx +0 -0
  132. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/paste.jinx +0 -0
  133. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/plonk.npc +0 -0
  134. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/plonk.png +0 -0
  135. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/plonkjr.npc +0 -0
  136. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/plonkjr.png +0 -0
  137. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/python.jinx +0 -0
  138. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/read_pane.jinx +0 -0
  139. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/roll.jinx +0 -0
  140. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/run_terminal.jinx +0 -0
  141. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/sample.jinx +0 -0
  142. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/screenshot.jinx +0 -0
  143. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/search.jinx +0 -0
  144. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/send_message.jinx +0 -0
  145. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/serve.jinx +0 -0
  146. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/set.jinx +0 -0
  147. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/sh.jinx +0 -0
  148. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/shh.jinx +0 -0
  149. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/sibiji.npc +0 -0
  150. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/sibiji.png +0 -0
  151. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/split_pane.jinx +0 -0
  152. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/spool.png +0 -0
  153. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/switch.jinx +0 -0
  154. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/switch_npc.jinx +0 -0
  155. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/switch_tab.jinx +0 -0
  156. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/switches.jinx +0 -0
  157. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/sync.jinx +0 -0
  158. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/teamviz.jinx +0 -0
  159. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/trigger.jinx +0 -0
  160. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/type_text.jinx +0 -0
  161. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/usage.jinx +0 -0
  162. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/verbose.jinx +0 -0
  163. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/vixynt.jinx +0 -0
  164. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/wait.jinx +0 -0
  165. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/write_file.jinx +0 -0
  166. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/yap.png +0 -0
  167. {npcsh-1.1.17.data → npcsh-1.1.18.data}/data/npcsh/npc_team/zen_mode.jinx +0 -0
  168. {npcsh-1.1.17.dist-info → npcsh-1.1.18.dist-info}/licenses/LICENSE +0 -0
  169. {npcsh-1.1.17.dist-info → npcsh-1.1.18.dist-info}/top_level.txt +0 -0
npcsh/benchmark/runner.py CHANGED
@@ -5,14 +5,14 @@ Provides a convenient interface for running Terminal-Bench evaluations
5
5
  with different models and providers.
6
6
  """
7
7
 
8
- import os
9
8
  import subprocess
10
9
  import sys
10
+ import json
11
11
  from dataclasses import dataclass, field
12
12
  from datetime import datetime
13
13
  from pathlib import Path
14
14
  from typing import Optional, List, Dict, Any
15
- import json
15
+
16
16
 
17
17
 
18
18
  @dataclass
@@ -21,7 +21,7 @@ class BenchmarkConfig:
21
21
  model: str = "claude-sonnet-4-20250514"
22
22
  provider: str = "anthropic"
23
23
  dataset: str = "terminal-bench"
24
- dataset_version: str = "2.0"
24
+ dataset_version: Optional[str] = None # If None, use latest
25
25
  n_concurrent: int = 4
26
26
  task_ids: Optional[List[str]] = None
27
27
  output_dir: Optional[str] = None
@@ -84,33 +84,52 @@ class BenchmarkRunner:
84
84
 
85
85
  def check_dependencies(self) -> Dict[str, bool]:
86
86
  """Check if required dependencies are installed."""
87
+ import shutil
88
+
87
89
  deps = {
88
90
  "harbor": False,
89
91
  "terminal-bench": False,
90
92
  "docker": False,
91
93
  }
92
94
 
93
- # Check harbor
94
- try:
95
- result = subprocess.run(
96
- ["harbor", "--version"],
97
- capture_output=True,
98
- text=True
99
- )
100
- deps["harbor"] = result.returncode == 0
101
- except FileNotFoundError:
102
- pass
95
+ # Find binaries in the same Python environment as current interpreter
96
+ # Use sys.prefix to get the virtualenv/pyenv directory (don't resolve symlinks)
97
+ bin_dir = Path(sys.prefix) / "bin"
98
+ if not bin_dir.exists():
99
+ # Fallback: use executable's directory without resolving
100
+ bin_dir = Path(sys.executable).parent
103
101
 
104
- # Check terminal-bench (tb CLI)
105
- try:
106
- result = subprocess.run(
107
- ["tb", "--help"],
108
- capture_output=True,
109
- text=True
110
- )
111
- deps["terminal-bench"] = result.returncode == 0
112
- except FileNotFoundError:
113
- pass
102
+ # Check harbor - first in current Python's bin dir, then PATH
103
+ harbor_bin = bin_dir / "harbor"
104
+ if not harbor_bin.exists():
105
+ harbor_bin = shutil.which("harbor")
106
+
107
+ if harbor_bin:
108
+ try:
109
+ result = subprocess.run(
110
+ [str(harbor_bin), "--version"],
111
+ capture_output=True,
112
+ text=True
113
+ )
114
+ deps["harbor"] = result.returncode == 0
115
+ except (FileNotFoundError, OSError):
116
+ pass
117
+
118
+ # Check terminal-bench (tb CLI) - first in current Python's bin dir, then PATH
119
+ tb_bin = bin_dir / "tb"
120
+ if not tb_bin.exists():
121
+ tb_bin = shutil.which("tb")
122
+
123
+ if tb_bin:
124
+ try:
125
+ result = subprocess.run(
126
+ [str(tb_bin), "--help"],
127
+ capture_output=True,
128
+ text=True
129
+ )
130
+ deps["terminal-bench"] = result.returncode == 0
131
+ except (FileNotFoundError, OSError):
132
+ pass
114
133
 
115
134
  # Check docker
116
135
  try:
@@ -146,9 +165,10 @@ class BenchmarkRunner:
146
165
  model: str = "claude-sonnet-4-20250514",
147
166
  provider: str = "anthropic",
148
167
  dataset: str = "terminal-bench",
149
- dataset_version: str = "2.0",
168
+ dataset_version: Optional[str] = None,
150
169
  n_concurrent: int = 4,
151
170
  task_ids: Optional[List[str]] = None,
171
+ n_tasks: Optional[int] = None,
152
172
  npc_name: Optional[str] = None,
153
173
  timeout: int = 600,
154
174
  ) -> BenchmarkResult:
@@ -159,9 +179,10 @@ class BenchmarkRunner:
159
179
  model: Model name (e.g., "claude-sonnet-4-20250514", "gpt-4o")
160
180
  provider: Provider name (e.g., "anthropic", "openai", "gemini")
161
181
  dataset: Dataset name (default: "terminal-bench")
162
- dataset_version: Dataset version (default: "2.0")
182
+ dataset_version: Dataset version (optional, uses latest if None)
163
183
  n_concurrent: Number of concurrent task executions
164
184
  task_ids: Optional list of specific task IDs to run
185
+ n_tasks: Optional limit on number of tasks to run
165
186
  npc_name: Optional NPC name to use (e.g., "sibiji", "corca")
166
187
  timeout: Per-task timeout in seconds
167
188
 
@@ -193,9 +214,22 @@ class BenchmarkRunner:
193
214
  else:
194
215
  agent_path = "npcsh.benchmark:NpcshAgent"
195
216
 
217
+ # Find harbor in the same Python environment as current interpreter
218
+ # Use sys.prefix to get the virtualenv/pyenv directory (don't resolve symlinks)
219
+ import shutil
220
+ bin_dir = Path(sys.prefix) / "bin"
221
+ if not bin_dir.exists():
222
+ bin_dir = Path(sys.executable).parent
223
+ harbor_bin = str(bin_dir / "harbor")
224
+ if not Path(harbor_bin).exists():
225
+ harbor_bin = shutil.which("harbor") or "harbor"
226
+
227
+ # Build dataset string (with optional version)
228
+ dataset_str = f"{dataset}@{dataset_version}" if dataset_version else dataset
229
+
196
230
  cmd = [
197
- "harbor", "run",
198
- "-d", f"{dataset}@{dataset_version}",
231
+ harbor_bin, "run",
232
+ "-d", dataset_str,
199
233
  "--agent-import-path", agent_path,
200
234
  "-m", full_model,
201
235
  "-n", str(n_concurrent),
@@ -203,12 +237,18 @@ class BenchmarkRunner:
203
237
  ]
204
238
 
205
239
  if task_ids:
206
- cmd.extend(["--task-ids", ",".join(task_ids)])
240
+ for task_id in task_ids:
241
+ cmd.extend(["--task-name", task_id])
242
+
243
+ if n_tasks:
244
+ cmd.extend(["-l", str(n_tasks)])
207
245
 
208
- print(f"\nRunning Terminal-Bench evaluation:")
246
+ print("\nRunning Terminal-Bench evaluation:")
209
247
  print(f" Model: {full_model}")
210
- print(f" Dataset: {dataset}@{dataset_version}")
248
+ print(f" Dataset: {dataset_str}")
211
249
  print(f" Concurrent tasks: {n_concurrent}")
250
+ if n_tasks:
251
+ print(f" Max tasks: {n_tasks}")
212
252
  print(f" Output: {output_dir}")
213
253
  if npc_name:
214
254
  print(f" NPC: {npc_name}")
@@ -311,7 +351,7 @@ class BenchmarkRunner:
311
351
  self,
312
352
  models: List[tuple],
313
353
  dataset: str = "terminal-bench",
314
- dataset_version: str = "2.0",
354
+ dataset_version: Optional[str] = None,
315
355
  n_concurrent: int = 4,
316
356
  task_ids: Optional[List[str]] = None,
317
357
  ) -> Dict[str, BenchmarkResult]:
@@ -321,7 +361,7 @@ class BenchmarkRunner:
321
361
  Args:
322
362
  models: List of (model, provider) tuples
323
363
  dataset: Dataset name
324
- dataset_version: Dataset version
364
+ dataset_version: Dataset version (optional)
325
365
  n_concurrent: Number of concurrent tasks
326
366
  task_ids: Optional specific task IDs
327
367
 
@@ -338,9 +378,9 @@ class BenchmarkRunner:
338
378
  results = {}
339
379
 
340
380
  for model, provider in models:
341
- print(f"\n{'='*60}")
381
+ print("\n" + '='*60)
342
382
  print(f"Evaluating: {provider}/{model}")
343
- print(f"{'='*60}")
383
+ print('='*60)
344
384
 
345
385
  result = self.run(
346
386
  model=model,
@@ -365,9 +405,9 @@ class BenchmarkRunner:
365
405
 
366
406
  def _print_comparison_summary(self, results: Dict[str, BenchmarkResult]) -> None:
367
407
  """Print a comparison summary table."""
368
- print(f"\n{'='*60}")
408
+ print("\n" + '='*60)
369
409
  print("COMPARISON SUMMARY")
370
- print(f"{'='*60}")
410
+ print('='*60)
371
411
  print(f"{'Model':<40} {'Accuracy':>10} {'Tasks':>10}")
372
412
  print("-" * 60)
373
413
 
@@ -436,20 +476,22 @@ def run_benchmark(
436
476
  def quick_test(
437
477
  model: str = "claude-sonnet-4-20250514",
438
478
  provider: str = "anthropic",
479
+ n_tasks: int = 3,
439
480
  ) -> BenchmarkResult:
440
481
  """
441
482
  Run a quick test with a few tasks to verify setup.
442
483
 
443
- This runs only 3 easy tasks to quickly verify that everything is working.
484
+ This runs only a few tasks to quickly verify that everything is working.
444
485
  """
445
486
  runner = BenchmarkRunner()
446
487
 
447
- # Use a small subset of easy tasks for quick testing
488
+ # Use -l flag to limit number of tasks instead of specifying task names
489
+ # This avoids issues with task names changing in the dataset
448
490
  return runner.run(
449
491
  model=model,
450
492
  provider=provider,
451
493
  n_concurrent=1,
452
- task_ids=["ssl-cert", "git-server", "reshard-dataset"], # Example easy tasks
494
+ n_tasks=n_tasks,
453
495
  )
454
496
 
455
497
 
@@ -484,8 +526,8 @@ Examples:
484
526
  help="Provider name")
485
527
  parser.add_argument("--dataset", "-d", default="terminal-bench",
486
528
  help="Dataset name")
487
- parser.add_argument("--version", "-v", default="2.0",
488
- help="Dataset version")
529
+ parser.add_argument("--version", "-v", default=None,
530
+ help="Dataset version (optional, uses latest if not specified)")
489
531
  parser.add_argument("--concurrent", "-n", type=int, default=4,
490
532
  help="Number of concurrent tasks")
491
533
  parser.add_argument("--npc", help="NPC name to use")
@@ -541,7 +583,7 @@ Examples:
541
583
  ("gpt-4o", "openai"),
542
584
  ("gemini-2.0-flash", "gemini"),
543
585
  ]
544
- results = runner.compare_models(
586
+ runner.compare_models(
545
587
  models_to_compare,
546
588
  n_concurrent=args.concurrent
547
589
  )
@@ -560,7 +602,7 @@ Examples:
560
602
  n_concurrent=args.concurrent,
561
603
  npc_name=args.npc,
562
604
  )
563
- print(f"\nBenchmark complete!")
605
+ print("\nBenchmark complete!")
564
606
  print(f"Accuracy: {result.accuracy:.1%}")
565
607
  print(f"Results saved to: {result.output_dir}")
566
608
 
@@ -0,0 +1,35 @@
1
+ #!/bin/bash
2
+ # Installation script for npcsh in Terminal-Bench containers
3
+ # This template is rendered by Harbor before execution
4
+
5
+ set -e
6
+
7
+ echo "Installing npcsh for Terminal-Bench evaluation..."
8
+
9
+ # Install Python dependencies if needed
10
+ if ! command -v pip &> /dev/null; then
11
+ echo "Installing pip..."
12
+ apt-get update && apt-get install -y python3-pip
13
+ fi
14
+
15
+ # Install npcsh with lite dependencies (API providers only, no local models)
16
+ # Use --break-system-packages for PEP 668 compliance (Ubuntu 24.04+)
17
+ echo "Installing npcsh[lite]..."
18
+ pip install --quiet --break-system-packages npcsh[lite] || pip install --quiet npcsh[lite]
19
+
20
+ # Verify installation
21
+ echo "Verifying npcsh installation..."
22
+ npc --help > /dev/null 2>&1 || {
23
+ echo "ERROR: npcsh installation failed"
24
+ exit 1
25
+ }
26
+
27
+ # Set up default configuration
28
+ export NPCSH_STREAM_OUTPUT=0
29
+ export NPCSH_LOG_LEVEL=warning
30
+
31
+ {% if version %}
32
+ echo "npcsh version: {{ version }}"
33
+ {% endif %}
34
+
35
+ echo "npcsh installation complete!"
npcsh/build.py CHANGED
@@ -1,4 +1,3 @@
1
- import os
2
1
  import shutil
3
2
  import textwrap
4
3
  from pathlib import Path
@@ -11,8 +10,7 @@ def build_flask_server(config, **kwargs):
11
10
  server_script = output_dir / 'npc_server.py'
12
11
 
13
12
  server_code = textwrap.dedent(f'''
14
- import os
15
- from npcpy.serve import start_flask_server
13
+ from npcpy.serve import start_flask_server
16
14
  from npcpy.npc_compiler import Team
17
15
  from sqlalchemy import create_engine
18
16
 
@@ -111,7 +109,7 @@ def build_docker_compose(config, **kwargs):
111
109
  volumes:
112
110
  - npc-data:/root/.npcsh
113
111
  environment:
114
- - NPCSH_DB_PATH=/root/.npcsh/npcsh_history.db
112
+ - NPCSH_DB_PATH=/root/npcsh_history.db
115
113
 
116
114
  volumes:
117
115
  npc-data:
npcsh/completion.py CHANGED
@@ -2,8 +2,7 @@
2
2
  Readline and tab completion for npcsh
3
3
  """
4
4
  import os
5
- import shutil
6
- from typing import List, Any, Optional
5
+ from typing import List, Any
7
6
 
8
7
  try:
9
8
  import readline
@@ -66,11 +65,8 @@ def get_file_completions(text: str) -> List[str]:
66
65
  completions = []
67
66
 
68
67
  if text.startswith("~"):
69
- expanded = os.path.expanduser(text)
70
- prefix = "~"
71
- search_path = expanded
68
+ search_path = os.path.expanduser(text)
72
69
  else:
73
- prefix = ""
74
70
  search_path = text
75
71
 
76
72
  # Get directory to search
npcsh/config.py CHANGED
@@ -3,7 +3,6 @@ npcsh configuration management
3
3
  """
4
4
  import os
5
5
  import importlib.metadata
6
- from typing import Optional, Dict, Any
7
6
 
8
7
  # Version
9
8
  try:
@@ -14,7 +13,6 @@ except importlib.metadata.PackageNotFoundError:
14
13
  # Default paths
15
14
  DEFAULT_NPC_TEAM_PATH = "~/.npcsh/npc_team"
16
15
  PROJECT_NPC_TEAM_PATH = "./npc_team"
17
- HISTORY_DB_DEFAULT_PATH = "~/.npcsh_history.db"
18
16
  READLINE_HISTORY_FILE = os.path.expanduser("~/.npcsh_history")
19
17
 
20
18
  # Environment defaults
@@ -44,7 +42,7 @@ NPCSH_REASONING_PROVIDER = os.environ.get("NPCSH_REASONING_PROVIDER", "ollama")
44
42
  NPCSH_STREAM_OUTPUT = os.environ.get("NPCSH_STREAM_OUTPUT", "0") == "1"
45
43
  NPCSH_API_URL = os.environ.get("NPCSH_API_URL", None)
46
44
  NPCSH_SEARCH_PROVIDER = os.environ.get("NPCSH_SEARCH_PROVIDER", "duckduckgo")
47
- NPCSH_BUILD_KG = os.environ.get("NPCSH_BUILD_KG") == "1"
45
+ NPCSH_BUILD_KG = os.environ.get("NPCSH_BUILD_KG", "1") != "0"
48
46
 
49
47
 
50
48
  def get_shell_config_file() -> str: