npcsh 1.1.16__py3-none-any.whl → 1.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. npcsh/_state.py +24 -9
  2. npcsh/benchmark/__init__.py +22 -0
  3. npcsh/benchmark/npcsh_agent.py +262 -0
  4. npcsh/benchmark/runner.py +569 -0
  5. npcsh/npc_team/jinxs/bin/benchmark.jinx +146 -0
  6. npcsh/npc_team/jinxs/bin/nql.jinx +7 -7
  7. npcsh/npc_team/jinxs/bin/roll.jinx +20 -23
  8. npcsh/npc_team/jinxs/bin/sample.jinx +6 -7
  9. npcsh/npc_team/jinxs/bin/spool.jinx +4 -4
  10. npcsh/npc_team/jinxs/bin/sync.jinx +6 -6
  11. npcsh/npc_team/jinxs/bin/vixynt.jinx +8 -8
  12. npcsh/npc_team/jinxs/bin/wander.jinx +109 -19
  13. npcsh/npc_team/jinxs/bin/yap.jinx +5 -5
  14. npcsh/npc_team/jinxs/incognide/add_tab.jinx +11 -0
  15. npcsh/npc_team/jinxs/incognide/close_pane.jinx +9 -0
  16. npcsh/npc_team/jinxs/incognide/close_tab.jinx +10 -0
  17. npcsh/npc_team/jinxs/incognide/confirm.jinx +10 -0
  18. npcsh/npc_team/jinxs/incognide/focus_pane.jinx +9 -0
  19. npcsh/npc_team/jinxs/{npc_studio/npc-studio.jinx → incognide/incognide.jinx} +2 -2
  20. npcsh/npc_team/jinxs/incognide/list_panes.jinx +8 -0
  21. npcsh/npc_team/jinxs/incognide/navigate.jinx +10 -0
  22. npcsh/npc_team/jinxs/incognide/notify.jinx +10 -0
  23. npcsh/npc_team/jinxs/incognide/open_pane.jinx +13 -0
  24. npcsh/npc_team/jinxs/incognide/read_pane.jinx +9 -0
  25. npcsh/npc_team/jinxs/incognide/run_terminal.jinx +10 -0
  26. npcsh/npc_team/jinxs/incognide/send_message.jinx +10 -0
  27. npcsh/npc_team/jinxs/incognide/split_pane.jinx +12 -0
  28. npcsh/npc_team/jinxs/incognide/switch_npc.jinx +10 -0
  29. npcsh/npc_team/jinxs/incognide/switch_tab.jinx +10 -0
  30. npcsh/npc_team/jinxs/incognide/write_file.jinx +11 -0
  31. npcsh/npc_team/jinxs/incognide/zen_mode.jinx +9 -0
  32. npcsh/npc_team/jinxs/lib/browser/browser_action.jinx +4 -4
  33. npcsh/npc_team/jinxs/lib/browser/browser_screenshot.jinx +1 -1
  34. npcsh/npc_team/jinxs/lib/browser/open_browser.jinx +2 -2
  35. npcsh/npc_team/jinxs/lib/computer_use/click.jinx +2 -2
  36. npcsh/npc_team/jinxs/lib/computer_use/key_press.jinx +1 -1
  37. npcsh/npc_team/jinxs/lib/computer_use/launch_app.jinx +1 -1
  38. npcsh/npc_team/jinxs/lib/computer_use/screenshot.jinx +1 -1
  39. npcsh/npc_team/jinxs/lib/computer_use/trigger.jinx +2 -2
  40. npcsh/npc_team/jinxs/lib/computer_use/type_text.jinx +1 -1
  41. npcsh/npc_team/jinxs/lib/computer_use/wait.jinx +1 -1
  42. npcsh/npc_team/jinxs/lib/core/chat.jinx +4 -4
  43. npcsh/npc_team/jinxs/lib/core/cmd.jinx +4 -4
  44. npcsh/npc_team/jinxs/lib/core/compress.jinx +8 -8
  45. npcsh/npc_team/jinxs/lib/core/edit_file.jinx +3 -0
  46. npcsh/npc_team/jinxs/lib/core/ots.jinx +7 -7
  47. npcsh/npc_team/jinxs/lib/core/search/db_search.jinx +44 -0
  48. npcsh/npc_team/jinxs/lib/core/search/file_search.jinx +94 -0
  49. npcsh/npc_team/jinxs/lib/core/search/kg_search.jinx +96 -0
  50. npcsh/npc_team/jinxs/lib/core/search/mem_search.jinx +80 -0
  51. npcsh/npc_team/jinxs/lib/core/search/web_search.jinx +51 -0
  52. npcsh/npc_team/jinxs/lib/core/search.jinx +52 -129
  53. npcsh/npc_team/jinxs/lib/core/sh.jinx +1 -1
  54. npcsh/npc_team/jinxs/lib/core/sleep.jinx +7 -7
  55. npcsh/npc_team/jinxs/lib/core/sql.jinx +7 -7
  56. npcsh/npc_team/jinxs/lib/orchestration/convene.jinx +7 -7
  57. npcsh/npc_team/jinxs/lib/orchestration/delegate.jinx +8 -9
  58. npcsh/npc_team/jinxs/lib/research/arxiv.jinx +2 -2
  59. npcsh/npc_team/jinxs/lib/research/paper_search.jinx +3 -3
  60. npcsh/npc_team/jinxs/lib/research/semantic_scholar.jinx +2 -2
  61. npcsh/npc_team/jinxs/lib/utils/build.jinx +5 -5
  62. npcsh/npc_team/jinxs/lib/utils/compile.jinx +2 -2
  63. npcsh/npc_team/jinxs/lib/utils/help.jinx +1 -1
  64. npcsh/npc_team/jinxs/lib/utils/init.jinx +5 -5
  65. npcsh/npc_team/jinxs/lib/utils/jinxs.jinx +1 -1
  66. npcsh/npc_team/jinxs/lib/utils/serve.jinx +2 -2
  67. npcsh/npc_team/jinxs/lib/utils/set.jinx +2 -2
  68. npcsh/npc_team/jinxs/lib/utils/switch.jinx +3 -3
  69. npcsh/npc_team/jinxs/lib/utils/switches.jinx +1 -1
  70. npcsh/npc_team/jinxs/lib/utils/teamviz.jinx +2 -2
  71. npcsh/npc_team/sibiji.npc +1 -1
  72. npcsh/npcsh.py +81 -43
  73. npcsh-1.1.17.data/data/npcsh/npc_team/add_tab.jinx +11 -0
  74. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/arxiv.jinx +2 -2
  75. npcsh-1.1.17.data/data/npcsh/npc_team/benchmark.jinx +146 -0
  76. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/browser_action.jinx +4 -4
  77. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/browser_screenshot.jinx +1 -1
  78. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/build.jinx +5 -5
  79. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/chat.jinx +4 -4
  80. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/click.jinx +2 -2
  81. npcsh-1.1.17.data/data/npcsh/npc_team/close_pane.jinx +9 -0
  82. npcsh-1.1.17.data/data/npcsh/npc_team/close_tab.jinx +10 -0
  83. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/cmd.jinx +4 -4
  84. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/compile.jinx +2 -2
  85. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/compress.jinx +8 -8
  86. npcsh-1.1.17.data/data/npcsh/npc_team/confirm.jinx +10 -0
  87. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/convene.jinx +7 -7
  88. npcsh-1.1.17.data/data/npcsh/npc_team/db_search.jinx +44 -0
  89. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/delegate.jinx +8 -9
  90. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/edit_file.jinx +3 -0
  91. npcsh-1.1.17.data/data/npcsh/npc_team/file_search.jinx +94 -0
  92. npcsh-1.1.17.data/data/npcsh/npc_team/focus_pane.jinx +9 -0
  93. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/help.jinx +1 -1
  94. npcsh-1.1.16.data/data/npcsh/npc_team/npc-studio.jinx → npcsh-1.1.17.data/data/npcsh/npc_team/incognide.jinx +2 -2
  95. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/init.jinx +5 -5
  96. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/jinxs.jinx +1 -1
  97. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/key_press.jinx +1 -1
  98. npcsh-1.1.17.data/data/npcsh/npc_team/kg_search.jinx +96 -0
  99. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/launch_app.jinx +1 -1
  100. npcsh-1.1.17.data/data/npcsh/npc_team/list_panes.jinx +8 -0
  101. npcsh-1.1.17.data/data/npcsh/npc_team/mem_search.jinx +80 -0
  102. npcsh-1.1.17.data/data/npcsh/npc_team/navigate.jinx +10 -0
  103. npcsh-1.1.17.data/data/npcsh/npc_team/notify.jinx +10 -0
  104. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/nql.jinx +7 -7
  105. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/open_browser.jinx +2 -2
  106. npcsh-1.1.17.data/data/npcsh/npc_team/open_pane.jinx +13 -0
  107. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/ots.jinx +7 -7
  108. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/paper_search.jinx +3 -3
  109. npcsh-1.1.17.data/data/npcsh/npc_team/read_pane.jinx +9 -0
  110. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/roll.jinx +20 -23
  111. npcsh-1.1.17.data/data/npcsh/npc_team/run_terminal.jinx +10 -0
  112. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/sample.jinx +6 -7
  113. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/screenshot.jinx +1 -1
  114. npcsh-1.1.17.data/data/npcsh/npc_team/search.jinx +54 -0
  115. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/semantic_scholar.jinx +2 -2
  116. npcsh-1.1.17.data/data/npcsh/npc_team/send_message.jinx +10 -0
  117. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/serve.jinx +2 -2
  118. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/set.jinx +2 -2
  119. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/sh.jinx +1 -1
  120. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/sibiji.npc +1 -1
  121. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/sleep.jinx +7 -7
  122. npcsh-1.1.17.data/data/npcsh/npc_team/split_pane.jinx +12 -0
  123. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/spool.jinx +4 -4
  124. npcsh-1.1.17.data/data/npcsh/npc_team/sql.jinx +16 -0
  125. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/switch.jinx +3 -3
  126. npcsh-1.1.17.data/data/npcsh/npc_team/switch_npc.jinx +10 -0
  127. npcsh-1.1.17.data/data/npcsh/npc_team/switch_tab.jinx +10 -0
  128. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/switches.jinx +1 -1
  129. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/sync.jinx +6 -6
  130. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/teamviz.jinx +2 -2
  131. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/trigger.jinx +2 -2
  132. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/type_text.jinx +1 -1
  133. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/vixynt.jinx +8 -8
  134. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/wait.jinx +1 -1
  135. npcsh-1.1.17.data/data/npcsh/npc_team/wander.jinx +242 -0
  136. npcsh-1.1.17.data/data/npcsh/npc_team/web_search.jinx +51 -0
  137. npcsh-1.1.17.data/data/npcsh/npc_team/write_file.jinx +11 -0
  138. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/yap.jinx +5 -5
  139. npcsh-1.1.17.data/data/npcsh/npc_team/zen_mode.jinx +9 -0
  140. {npcsh-1.1.16.dist-info → npcsh-1.1.17.dist-info}/METADATA +10 -7
  141. npcsh-1.1.17.dist-info/RECORD +219 -0
  142. {npcsh-1.1.16.dist-info → npcsh-1.1.17.dist-info}/entry_points.txt +2 -0
  143. npcsh-1.1.16.data/data/npcsh/npc_team/search.jinx +0 -131
  144. npcsh-1.1.16.data/data/npcsh/npc_team/sql.jinx +0 -16
  145. npcsh-1.1.16.data/data/npcsh/npc_team/wander.jinx +0 -152
  146. npcsh-1.1.16.dist-info/RECORD +0 -170
  147. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/alicanto.npc +0 -0
  148. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/alicanto.png +0 -0
  149. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/close_browser.jinx +0 -0
  150. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/corca.npc +0 -0
  151. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/corca.png +0 -0
  152. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/corca_example.png +0 -0
  153. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/frederic.npc +0 -0
  154. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/frederic4.png +0 -0
  155. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/guac.npc +0 -0
  156. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/guac.png +0 -0
  157. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/kadiefa.npc +0 -0
  158. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/kadiefa.png +0 -0
  159. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/load_file.jinx +0 -0
  160. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/npcsh.ctx +0 -0
  161. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/npcsh_sibiji.png +0 -0
  162. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/paste.jinx +0 -0
  163. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/plonk.npc +0 -0
  164. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/plonk.png +0 -0
  165. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/plonkjr.npc +0 -0
  166. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/plonkjr.png +0 -0
  167. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/python.jinx +0 -0
  168. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/shh.jinx +0 -0
  169. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/sibiji.png +0 -0
  170. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/spool.png +0 -0
  171. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/usage.jinx +0 -0
  172. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/verbose.jinx +0 -0
  173. {npcsh-1.1.16.data → npcsh-1.1.17.data}/data/npcsh/npc_team/yap.png +0 -0
  174. {npcsh-1.1.16.dist-info → npcsh-1.1.17.dist-info}/WHEEL +0 -0
  175. {npcsh-1.1.16.dist-info → npcsh-1.1.17.dist-info}/licenses/LICENSE +0 -0
  176. {npcsh-1.1.16.dist-info → npcsh-1.1.17.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,569 @@
1
+ """
2
+ Benchmark runner for npcsh on Terminal-Bench.
3
+
4
+ Provides a convenient interface for running Terminal-Bench evaluations
5
+ with different models and providers.
6
+ """
7
+
8
+ import os
9
+ import subprocess
10
+ import sys
11
+ from dataclasses import dataclass, field
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from typing import Optional, List, Dict, Any
15
+ import json
16
+
17
+
18
@dataclass
class BenchmarkConfig:
    """Settings that describe a single Terminal-Bench evaluation run.

    Defaults mirror the keyword defaults of ``BenchmarkRunner.run``.
    """
    model: str = "claude-sonnet-4-20250514"  # model identifier
    provider: str = "anthropic"  # provider slug (anthropic, openai, gemini, ...)
    dataset: str = "terminal-bench"  # benchmark dataset name
    dataset_version: str = "2.0"  # dataset release to evaluate against
    n_concurrent: int = 4  # how many tasks run in parallel
    task_ids: Optional[List[str]] = None  # None means run every task
    output_dir: Optional[str] = None  # None means use the runner's default
    npc_name: Optional[str] = None  # Use specific NPC
    timeout: int = 600  # Per-task timeout in seconds
    extra_args: List[str] = field(default_factory=list)  # passthrough CLI args
31
+
32
+
33
@dataclass
class BenchmarkResult:
    """Aggregated outcome of one benchmark run."""
    success: bool  # whether the run itself completed
    total_tasks: int = 0  # number of tasks attempted
    passed_tasks: int = 0  # tasks that passed
    failed_tasks: int = 0  # tasks that failed
    accuracy: float = 0.0  # passed_tasks / total_tasks (0.0 when no tasks)
    total_tokens: int = 0  # tokens summed across task results
    total_cost_usd: float = 0.0  # cost summed across task results
    duration_seconds: float = 0.0  # wall-clock duration of the run
    output_dir: str = ""  # where artifacts were written
    error: Optional[str] = None  # populated when something went wrong
    task_results: List[Dict[str, Any]] = field(default_factory=list)  # per-task records
47
+
48
+
49
+ class BenchmarkRunner:
50
+ """
51
+ Runner for Terminal-Bench evaluations with npcsh.
52
+
53
+ Example usage:
54
+ runner = BenchmarkRunner()
55
+
56
+ # Run with default settings (Claude Sonnet)
57
+ result = runner.run()
58
+
59
+ # Run with specific model
60
+ result = runner.run(model="gpt-4o", provider="openai")
61
+
62
+ # Compare multiple models
63
+ results = runner.compare_models([
64
+ ("claude-sonnet-4-20250514", "anthropic"),
65
+ ("gpt-4o", "openai"),
66
+ ("gemini-2.0-flash", "gemini"),
67
+ ])
68
+ """
69
+
70
+ def __init__(self, output_base_dir: Optional[str] = None):
71
+ """
72
+ Initialize the benchmark runner.
73
+
74
+ Args:
75
+ output_base_dir: Base directory for benchmark outputs.
76
+ Defaults to ~/.npcsh/benchmarks/
77
+ """
78
+ if output_base_dir:
79
+ self.output_base_dir = Path(output_base_dir)
80
+ else:
81
+ self.output_base_dir = Path.home() / ".npcsh" / "benchmarks"
82
+
83
+ self.output_base_dir.mkdir(parents=True, exist_ok=True)
84
+
85
+ def check_dependencies(self) -> Dict[str, bool]:
86
+ """Check if required dependencies are installed."""
87
+ deps = {
88
+ "harbor": False,
89
+ "terminal-bench": False,
90
+ "docker": False,
91
+ }
92
+
93
+ # Check harbor
94
+ try:
95
+ result = subprocess.run(
96
+ ["harbor", "--version"],
97
+ capture_output=True,
98
+ text=True
99
+ )
100
+ deps["harbor"] = result.returncode == 0
101
+ except FileNotFoundError:
102
+ pass
103
+
104
+ # Check terminal-bench (tb CLI)
105
+ try:
106
+ result = subprocess.run(
107
+ ["tb", "--help"],
108
+ capture_output=True,
109
+ text=True
110
+ )
111
+ deps["terminal-bench"] = result.returncode == 0
112
+ except FileNotFoundError:
113
+ pass
114
+
115
+ # Check docker
116
+ try:
117
+ result = subprocess.run(
118
+ ["docker", "--version"],
119
+ capture_output=True,
120
+ text=True
121
+ )
122
+ deps["docker"] = result.returncode == 0
123
+ except FileNotFoundError:
124
+ pass
125
+
126
+ return deps
127
+
128
+ def install_dependencies(self) -> bool:
129
+ """Install Terminal-Bench dependencies."""
130
+ print("Installing Terminal-Bench dependencies...")
131
+
132
+ try:
133
+ # Install harbor and terminal-bench via pip/uv
134
+ subprocess.run(
135
+ [sys.executable, "-m", "pip", "install", "harbor", "terminal-bench"],
136
+ check=True
137
+ )
138
+ print("Dependencies installed successfully!")
139
+ return True
140
+ except subprocess.CalledProcessError as e:
141
+ print(f"Failed to install dependencies: {e}")
142
+ return False
143
+
144
+ def run(
145
+ self,
146
+ model: str = "claude-sonnet-4-20250514",
147
+ provider: str = "anthropic",
148
+ dataset: str = "terminal-bench",
149
+ dataset_version: str = "2.0",
150
+ n_concurrent: int = 4,
151
+ task_ids: Optional[List[str]] = None,
152
+ npc_name: Optional[str] = None,
153
+ timeout: int = 600,
154
+ ) -> BenchmarkResult:
155
+ """
156
+ Run Terminal-Bench evaluation with npcsh.
157
+
158
+ Args:
159
+ model: Model name (e.g., "claude-sonnet-4-20250514", "gpt-4o")
160
+ provider: Provider name (e.g., "anthropic", "openai", "gemini")
161
+ dataset: Dataset name (default: "terminal-bench")
162
+ dataset_version: Dataset version (default: "2.0")
163
+ n_concurrent: Number of concurrent task executions
164
+ task_ids: Optional list of specific task IDs to run
165
+ npc_name: Optional NPC name to use (e.g., "sibiji", "corca")
166
+ timeout: Per-task timeout in seconds
167
+
168
+ Returns:
169
+ BenchmarkResult with evaluation metrics
170
+ """
171
+ # Check dependencies
172
+ deps = self.check_dependencies()
173
+ if not deps["harbor"]:
174
+ print("Harbor not installed. Installing...")
175
+ if not self.install_dependencies():
176
+ return BenchmarkResult(
177
+ success=False,
178
+ error="Failed to install dependencies"
179
+ )
180
+
181
+ # Create output directory for this run
182
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
183
+ run_name = f"{provider}_{model}_{timestamp}".replace("/", "_")
184
+ output_dir = self.output_base_dir / run_name
185
+ output_dir.mkdir(parents=True, exist_ok=True)
186
+
187
+ # Build the harbor command
188
+ full_model = f"{provider}/{model}"
189
+
190
+ # Choose agent based on whether NPC is specified
191
+ if npc_name:
192
+ agent_path = "npcsh.benchmark:NpcshAgentWithNpc"
193
+ else:
194
+ agent_path = "npcsh.benchmark:NpcshAgent"
195
+
196
+ cmd = [
197
+ "harbor", "run",
198
+ "-d", f"{dataset}@{dataset_version}",
199
+ "--agent-import-path", agent_path,
200
+ "-m", full_model,
201
+ "-n", str(n_concurrent),
202
+ "-o", str(output_dir),
203
+ ]
204
+
205
+ if task_ids:
206
+ cmd.extend(["--task-ids", ",".join(task_ids)])
207
+
208
+ print(f"\nRunning Terminal-Bench evaluation:")
209
+ print(f" Model: {full_model}")
210
+ print(f" Dataset: {dataset}@{dataset_version}")
211
+ print(f" Concurrent tasks: {n_concurrent}")
212
+ print(f" Output: {output_dir}")
213
+ if npc_name:
214
+ print(f" NPC: {npc_name}")
215
+ print(f"\nCommand: {' '.join(cmd)}\n")
216
+
217
+ start_time = datetime.now()
218
+
219
+ try:
220
+ # Run the benchmark
221
+ process = subprocess.run(
222
+ cmd,
223
+ capture_output=True,
224
+ text=True,
225
+ timeout=timeout * n_concurrent * 2 # Overall timeout
226
+ )
227
+
228
+ duration = (datetime.now() - start_time).total_seconds()
229
+
230
+ # Parse results
231
+ result = self._parse_results(output_dir, duration)
232
+ result.output_dir = str(output_dir)
233
+
234
+ if process.returncode != 0:
235
+ result.error = process.stderr
236
+
237
+ # Save run metadata
238
+ self._save_run_metadata(output_dir, {
239
+ "model": full_model,
240
+ "provider": provider,
241
+ "dataset": dataset,
242
+ "dataset_version": dataset_version,
243
+ "n_concurrent": n_concurrent,
244
+ "npc_name": npc_name,
245
+ "duration_seconds": duration,
246
+ "result": {
247
+ "success": result.success,
248
+ "accuracy": result.accuracy,
249
+ "total_tasks": result.total_tasks,
250
+ "passed_tasks": result.passed_tasks,
251
+ }
252
+ })
253
+
254
+ return result
255
+
256
+ except subprocess.TimeoutExpired:
257
+ return BenchmarkResult(
258
+ success=False,
259
+ error="Benchmark timed out",
260
+ output_dir=str(output_dir)
261
+ )
262
+ except Exception as e:
263
+ return BenchmarkResult(
264
+ success=False,
265
+ error=str(e),
266
+ output_dir=str(output_dir)
267
+ )
268
+
269
+ def _parse_results(self, output_dir: Path, duration: float) -> BenchmarkResult:
270
+ """Parse benchmark results from output directory."""
271
+ result = BenchmarkResult(
272
+ success=True,
273
+ duration_seconds=duration
274
+ )
275
+
276
+ # Look for results file
277
+ results_file = output_dir / "results.json"
278
+ if results_file.exists():
279
+ try:
280
+ with open(results_file) as f:
281
+ data = json.load(f)
282
+
283
+ result.total_tasks = data.get("total", 0)
284
+ result.passed_tasks = data.get("passed", 0)
285
+ result.failed_tasks = data.get("failed", 0)
286
+
287
+ if result.total_tasks > 0:
288
+ result.accuracy = result.passed_tasks / result.total_tasks
289
+
290
+ result.task_results = data.get("tasks", [])
291
+
292
+ # Aggregate token usage
293
+ for task in result.task_results:
294
+ result.total_tokens += task.get("tokens", 0)
295
+ result.total_cost_usd += task.get("cost_usd", 0.0)
296
+
297
+ except (json.JSONDecodeError, KeyError) as e:
298
+ result.error = f"Failed to parse results: {e}"
299
+
300
+ return result
301
+
302
+ def _save_run_metadata(self, output_dir: Path, metadata: Dict[str, Any]) -> None:
303
+ """Save run metadata to output directory."""
304
+ metadata_file = output_dir / "run_metadata.json"
305
+ metadata["timestamp"] = datetime.now().isoformat()
306
+
307
+ with open(metadata_file, "w") as f:
308
+ json.dump(metadata, f, indent=2)
309
+
310
+ def compare_models(
311
+ self,
312
+ models: List[tuple],
313
+ dataset: str = "terminal-bench",
314
+ dataset_version: str = "2.0",
315
+ n_concurrent: int = 4,
316
+ task_ids: Optional[List[str]] = None,
317
+ ) -> Dict[str, BenchmarkResult]:
318
+ """
319
+ Compare multiple models on the same benchmark.
320
+
321
+ Args:
322
+ models: List of (model, provider) tuples
323
+ dataset: Dataset name
324
+ dataset_version: Dataset version
325
+ n_concurrent: Number of concurrent tasks
326
+ task_ids: Optional specific task IDs
327
+
328
+ Returns:
329
+ Dictionary mapping model names to results
330
+
331
+ Example:
332
+ results = runner.compare_models([
333
+ ("claude-sonnet-4-20250514", "anthropic"),
334
+ ("gpt-4o", "openai"),
335
+ ("gemini-2.0-flash", "gemini"),
336
+ ])
337
+ """
338
+ results = {}
339
+
340
+ for model, provider in models:
341
+ print(f"\n{'='*60}")
342
+ print(f"Evaluating: {provider}/{model}")
343
+ print(f"{'='*60}")
344
+
345
+ result = self.run(
346
+ model=model,
347
+ provider=provider,
348
+ dataset=dataset,
349
+ dataset_version=dataset_version,
350
+ n_concurrent=n_concurrent,
351
+ task_ids=task_ids,
352
+ )
353
+
354
+ results[f"{provider}/{model}"] = result
355
+
356
+ print(f"\nResult for {provider}/{model}:")
357
+ print(f" Accuracy: {result.accuracy:.1%}")
358
+ print(f" Tasks: {result.passed_tasks}/{result.total_tasks}")
359
+ print(f" Duration: {result.duration_seconds:.1f}s")
360
+
361
+ # Print comparison summary
362
+ self._print_comparison_summary(results)
363
+
364
+ return results
365
+
366
+ def _print_comparison_summary(self, results: Dict[str, BenchmarkResult]) -> None:
367
+ """Print a comparison summary table."""
368
+ print(f"\n{'='*60}")
369
+ print("COMPARISON SUMMARY")
370
+ print(f"{'='*60}")
371
+ print(f"{'Model':<40} {'Accuracy':>10} {'Tasks':>10}")
372
+ print("-" * 60)
373
+
374
+ sorted_results = sorted(
375
+ results.items(),
376
+ key=lambda x: x[1].accuracy,
377
+ reverse=True
378
+ )
379
+
380
+ for model_name, result in sorted_results:
381
+ print(
382
+ f"{model_name:<40} "
383
+ f"{result.accuracy:>9.1%} "
384
+ f"{result.passed_tasks:>4}/{result.total_tasks:<4}"
385
+ )
386
+
387
+ def list_past_runs(self) -> List[Dict[str, Any]]:
388
+ """List all past benchmark runs."""
389
+ runs = []
390
+
391
+ for run_dir in self.output_base_dir.iterdir():
392
+ if run_dir.is_dir():
393
+ metadata_file = run_dir / "run_metadata.json"
394
+ if metadata_file.exists():
395
+ try:
396
+ with open(metadata_file) as f:
397
+ metadata = json.load(f)
398
+ metadata["run_dir"] = str(run_dir)
399
+ runs.append(metadata)
400
+ except json.JSONDecodeError:
401
+ pass
402
+
403
+ return sorted(runs, key=lambda x: x.get("timestamp", ""), reverse=True)
404
+
405
+
406
def run_benchmark(
    model: str = "claude-sonnet-4-20250514",
    provider: str = "anthropic",
    **kwargs
) -> BenchmarkResult:
    """
    Convenience function to run a Terminal-Bench evaluation.

    Args:
        model: Model name
        provider: Provider name
        **kwargs: Additional arguments passed to BenchmarkRunner.run()

    Returns:
        BenchmarkResult

    Example:
        from npcsh.benchmark import run_benchmark

        # Run with Claude
        result = run_benchmark("claude-sonnet-4-20250514", "anthropic")
        print(f"Accuracy: {result.accuracy:.1%}")

        # Run with GPT-4
        result = run_benchmark("gpt-4o", "openai")
    """
    # Thin wrapper: construct a default runner and delegate.
    return BenchmarkRunner().run(model=model, provider=provider, **kwargs)
434
+
435
+
436
def quick_test(
    model: str = "claude-sonnet-4-20250514",
    provider: str = "anthropic",
) -> BenchmarkResult:
    """
    Run a quick test with a few tasks to verify setup.

    This runs only 3 easy tasks to quickly verify that everything is working.
    """
    # Small fixed task subset, run serially, so the check stays fast.
    sample_tasks = ["ssl-cert", "git-server", "reshard-dataset"]  # Example easy tasks
    return BenchmarkRunner().run(
        model=model,
        provider=provider,
        n_concurrent=1,
        task_ids=sample_tasks,
    )
454
+
455
+
456
def main():
    """CLI entry point for npcsh-bench command.

    Dispatches on mutually exclusive modes: --check, --list-runs,
    --compare, --quick, or (default) a full benchmark run.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Run Terminal-Bench with npcsh",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Check dependencies
  npcsh-bench --check

  # Quick test with Claude
  npcsh-bench --quick -m claude-sonnet-4-20250514 -p anthropic

  # Full benchmark run
  npcsh-bench -m gpt-4o -p openai -n 8

  # List past runs
  npcsh-bench --list-runs

  # Compare models (requires manual setup)
  npcsh-bench --compare
"""
    )
    # BUG FIX: --model/--provider previously had no defaults, so a bare
    # `npcsh-bench` (or --quick) invocation passed None into run() and
    # built the model id "None/None". Defaults now match run()'s defaults.
    parser.add_argument("--model", "-m", default="claude-sonnet-4-20250514",
                        help="Model name")
    parser.add_argument("--provider", "-p", default="anthropic",
                        help="Provider name")
    parser.add_argument("--dataset", "-d", default="terminal-bench",
                        help="Dataset name")
    parser.add_argument("--version", "-v", default="2.0",
                        help="Dataset version")
    parser.add_argument("--concurrent", "-n", type=int, default=4,
                        help="Number of concurrent tasks")
    parser.add_argument("--npc", help="NPC name to use")
    parser.add_argument("--quick", action="store_true",
                        help="Run quick test with few tasks")
    parser.add_argument("--list-runs", action="store_true",
                        help="List past benchmark runs")
    parser.add_argument("--check", action="store_true",
                        help="Check if dependencies are installed")
    parser.add_argument("--compare", action="store_true",
                        help="Compare multiple models (Claude, GPT-4, Gemini)")

    args = parser.parse_args()

    runner = BenchmarkRunner()

    if args.check:
        print("Checking Terminal-Bench dependencies...\n")
        deps = runner.check_dependencies()
        all_good = True
        for dep, installed in deps.items():
            status = "OK" if installed else "MISSING"
            symbol = "+" if installed else "-"
            print(f" [{symbol}] {dep}: {status}")
            if not installed:
                all_good = False

        if not all_good:
            print("\nTo install missing dependencies:")
            print(" pip install harbor terminal-bench")
            print("\nOr install with npcsh:")
            print(" pip install npcsh[bench]")
        else:
            print("\nAll dependencies installed!")

    elif args.list_runs:
        runs = runner.list_past_runs()
        if not runs:
            print("No past benchmark runs found.")
        else:
            print(f"Found {len(runs)} past runs:\n")
            for run in runs:
                print(f" {run.get('timestamp', 'unknown')}: {run.get('model', 'unknown')}")
                result = run.get('result', {})
                print(f" Accuracy: {result.get('accuracy', 0):.1%}")
                print(f" Tasks: {result.get('passed_tasks', 0)}/{result.get('total_tasks', 0)}")
                print()

    elif args.compare:
        print("Comparing models on Terminal-Bench 2.0...\n")
        # BUG FIX: the Claude entry was missing from this list even though
        # the --compare help text promises Claude, GPT-4 and Gemini.
        models_to_compare = [
            ("claude-sonnet-4-20250514", "anthropic"),
            ("gpt-4o", "openai"),
            ("gemini-2.0-flash", "gemini"),
        ]
        # compare_models prints its own summary; no need to keep the return.
        runner.compare_models(
            models_to_compare,
            n_concurrent=args.concurrent
        )

    elif args.quick:
        result = quick_test(args.model, args.provider)
        print(f"\nQuick test result: {'PASS' if result.success else 'FAIL'}")
        print(f"Accuracy: {result.accuracy:.1%}")

    else:
        result = runner.run(
            model=args.model,
            provider=args.provider,
            dataset=args.dataset,
            dataset_version=args.version,
            n_concurrent=args.concurrent,
            npc_name=args.npc,
        )
        print(f"\nBenchmark complete!")
        print(f"Accuracy: {result.accuracy:.1%}")
        print(f"Results saved to: {result.output_dir}")


if __name__ == "__main__":
    main()