npcsh 1.1.16__py3-none-any.whl → 1.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (217)
  1. npcsh/_state.py +138 -100
  2. npcsh/alicanto.py +2 -2
  3. npcsh/benchmark/__init__.py +28 -0
  4. npcsh/benchmark/npcsh_agent.py +296 -0
  5. npcsh/benchmark/runner.py +611 -0
  6. npcsh/benchmark/templates/install-npcsh.sh.j2 +35 -0
  7. npcsh/build.py +2 -4
  8. npcsh/completion.py +2 -6
  9. npcsh/config.py +1 -3
  10. npcsh/conversation_viewer.py +389 -0
  11. npcsh/corca.py +0 -1
  12. npcsh/execution.py +0 -1
  13. npcsh/guac.py +0 -1
  14. npcsh/mcp_helpers.py +2 -3
  15. npcsh/mcp_server.py +5 -10
  16. npcsh/npc.py +10 -11
  17. npcsh/npc_team/jinxs/bin/benchmark.jinx +146 -0
  18. npcsh/npc_team/jinxs/bin/nql.jinx +7 -7
  19. npcsh/npc_team/jinxs/bin/roll.jinx +20 -23
  20. npcsh/npc_team/jinxs/bin/sample.jinx +6 -7
  21. npcsh/npc_team/jinxs/bin/sync.jinx +6 -6
  22. npcsh/npc_team/jinxs/bin/vixynt.jinx +8 -8
  23. npcsh/npc_team/jinxs/incognide/add_tab.jinx +11 -0
  24. npcsh/npc_team/jinxs/incognide/close_pane.jinx +9 -0
  25. npcsh/npc_team/jinxs/incognide/close_tab.jinx +10 -0
  26. npcsh/npc_team/jinxs/incognide/confirm.jinx +10 -0
  27. npcsh/npc_team/jinxs/incognide/focus_pane.jinx +9 -0
  28. npcsh/npc_team/jinxs/{npc_studio/npc-studio.jinx → incognide/incognide.jinx} +2 -2
  29. npcsh/npc_team/jinxs/incognide/list_panes.jinx +8 -0
  30. npcsh/npc_team/jinxs/incognide/navigate.jinx +10 -0
  31. npcsh/npc_team/jinxs/incognide/notify.jinx +10 -0
  32. npcsh/npc_team/jinxs/incognide/open_pane.jinx +13 -0
  33. npcsh/npc_team/jinxs/incognide/read_pane.jinx +9 -0
  34. npcsh/npc_team/jinxs/incognide/run_terminal.jinx +10 -0
  35. npcsh/npc_team/jinxs/incognide/send_message.jinx +10 -0
  36. npcsh/npc_team/jinxs/incognide/split_pane.jinx +12 -0
  37. npcsh/npc_team/jinxs/incognide/switch_npc.jinx +10 -0
  38. npcsh/npc_team/jinxs/incognide/switch_tab.jinx +10 -0
  39. npcsh/npc_team/jinxs/incognide/write_file.jinx +11 -0
  40. npcsh/npc_team/jinxs/incognide/zen_mode.jinx +9 -0
  41. npcsh/npc_team/jinxs/lib/browser/browser_action.jinx +4 -4
  42. npcsh/npc_team/jinxs/lib/browser/browser_screenshot.jinx +1 -1
  43. npcsh/npc_team/jinxs/lib/browser/open_browser.jinx +2 -2
  44. npcsh/npc_team/jinxs/lib/computer_use/click.jinx +2 -2
  45. npcsh/npc_team/jinxs/lib/computer_use/key_press.jinx +1 -1
  46. npcsh/npc_team/jinxs/lib/computer_use/launch_app.jinx +1 -1
  47. npcsh/npc_team/jinxs/lib/computer_use/screenshot.jinx +1 -1
  48. npcsh/npc_team/jinxs/lib/computer_use/trigger.jinx +2 -2
  49. npcsh/npc_team/jinxs/lib/computer_use/type_text.jinx +1 -1
  50. npcsh/npc_team/jinxs/lib/computer_use/wait.jinx +1 -1
  51. npcsh/npc_team/jinxs/lib/core/chat.jinx +4 -4
  52. npcsh/npc_team/jinxs/lib/core/cmd.jinx +4 -4
  53. npcsh/npc_team/jinxs/lib/core/compress.jinx +8 -8
  54. npcsh/npc_team/jinxs/lib/core/edit_file.jinx +3 -0
  55. npcsh/npc_team/jinxs/lib/core/ots.jinx +7 -7
  56. npcsh/npc_team/jinxs/lib/core/search/db_search.jinx +348 -0
  57. npcsh/npc_team/jinxs/lib/core/search/file_search.jinx +339 -0
  58. npcsh/npc_team/jinxs/lib/core/search/kg_search.jinx +418 -0
  59. npcsh/npc_team/jinxs/lib/core/search/mem_review.jinx +73 -0
  60. npcsh/npc_team/jinxs/lib/core/search/mem_search.jinx +388 -0
  61. npcsh/npc_team/jinxs/lib/core/search/web_search.jinx +283 -0
  62. npcsh/npc_team/jinxs/lib/core/search.jinx +52 -129
  63. npcsh/npc_team/jinxs/lib/core/sh.jinx +1 -1
  64. npcsh/npc_team/jinxs/lib/core/sleep.jinx +29 -18
  65. npcsh/npc_team/jinxs/lib/core/sql.jinx +15 -11
  66. npcsh/npc_team/jinxs/lib/orchestration/convene.jinx +7 -7
  67. npcsh/npc_team/jinxs/lib/orchestration/delegate.jinx +8 -9
  68. npcsh/npc_team/jinxs/lib/research/paper_search.jinx +389 -78
  69. npcsh/npc_team/jinxs/lib/research/semantic_scholar.jinx +373 -56
  70. npcsh/npc_team/jinxs/lib/utils/build.jinx +5 -5
  71. npcsh/npc_team/jinxs/lib/utils/compile.jinx +2 -2
  72. npcsh/npc_team/jinxs/lib/utils/help.jinx +1 -1
  73. npcsh/npc_team/jinxs/lib/utils/init.jinx +5 -5
  74. npcsh/npc_team/jinxs/lib/utils/jinxs.jinx +300 -145
  75. npcsh/npc_team/jinxs/lib/utils/serve.jinx +2 -2
  76. npcsh/npc_team/jinxs/lib/utils/set.jinx +2 -2
  77. npcsh/npc_team/jinxs/lib/utils/switch.jinx +3 -3
  78. npcsh/npc_team/jinxs/lib/utils/switches.jinx +1 -1
  79. npcsh/npc_team/jinxs/lib/utils/teamviz.jinx +2 -2
  80. npcsh/npc_team/jinxs/modes/alicanto.jinx +356 -0
  81. npcsh/npc_team/jinxs/modes/arxiv.jinx +720 -0
  82. npcsh/npc_team/jinxs/modes/corca.jinx +430 -0
  83. npcsh/npc_team/jinxs/modes/guac.jinx +544 -0
  84. npcsh/npc_team/jinxs/modes/plonk.jinx +379 -0
  85. npcsh/npc_team/jinxs/modes/pti.jinx +357 -0
  86. npcsh/npc_team/jinxs/modes/reattach.jinx +291 -0
  87. npcsh/npc_team/jinxs/modes/spool.jinx +350 -0
  88. npcsh/npc_team/jinxs/modes/wander.jinx +455 -0
  89. {npcsh-1.1.16.data/data/npcsh/npc_team → npcsh/npc_team/jinxs/modes}/yap.jinx +8 -2
  90. npcsh/npc_team/sibiji.npc +1 -1
  91. npcsh/npcsh.py +87 -46
  92. npcsh/plonk.py +0 -1
  93. npcsh/pti.py +0 -1
  94. npcsh/routes.py +1 -3
  95. npcsh/spool.py +0 -1
  96. npcsh/ui.py +0 -1
  97. npcsh/wander.py +0 -1
  98. npcsh/yap.py +0 -1
  99. npcsh-1.1.18.data/data/npcsh/npc_team/add_tab.jinx +11 -0
  100. npcsh-1.1.18.data/data/npcsh/npc_team/alicanto.jinx +356 -0
  101. npcsh-1.1.18.data/data/npcsh/npc_team/arxiv.jinx +720 -0
  102. npcsh-1.1.18.data/data/npcsh/npc_team/benchmark.jinx +146 -0
  103. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/browser_action.jinx +4 -4
  104. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/browser_screenshot.jinx +1 -1
  105. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/build.jinx +5 -5
  106. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/chat.jinx +4 -4
  107. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/click.jinx +2 -2
  108. npcsh-1.1.18.data/data/npcsh/npc_team/close_pane.jinx +9 -0
  109. npcsh-1.1.18.data/data/npcsh/npc_team/close_tab.jinx +10 -0
  110. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/cmd.jinx +4 -4
  111. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/compile.jinx +2 -2
  112. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/compress.jinx +8 -8
  113. npcsh-1.1.18.data/data/npcsh/npc_team/confirm.jinx +10 -0
  114. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/convene.jinx +7 -7
  115. npcsh-1.1.18.data/data/npcsh/npc_team/corca.jinx +430 -0
  116. npcsh-1.1.18.data/data/npcsh/npc_team/db_search.jinx +348 -0
  117. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/delegate.jinx +8 -9
  118. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/edit_file.jinx +3 -0
  119. npcsh-1.1.18.data/data/npcsh/npc_team/file_search.jinx +339 -0
  120. npcsh-1.1.18.data/data/npcsh/npc_team/focus_pane.jinx +9 -0
  121. npcsh-1.1.18.data/data/npcsh/npc_team/guac.jinx +544 -0
  122. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/help.jinx +1 -1
  123. npcsh-1.1.16.data/data/npcsh/npc_team/npc-studio.jinx → npcsh-1.1.18.data/data/npcsh/npc_team/incognide.jinx +2 -2
  124. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/init.jinx +5 -5
  125. npcsh-1.1.18.data/data/npcsh/npc_team/jinxs.jinx +331 -0
  126. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/key_press.jinx +1 -1
  127. npcsh-1.1.18.data/data/npcsh/npc_team/kg_search.jinx +418 -0
  128. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/launch_app.jinx +1 -1
  129. npcsh-1.1.18.data/data/npcsh/npc_team/list_panes.jinx +8 -0
  130. npcsh-1.1.18.data/data/npcsh/npc_team/mem_review.jinx +73 -0
  131. npcsh-1.1.18.data/data/npcsh/npc_team/mem_search.jinx +388 -0
  132. npcsh-1.1.18.data/data/npcsh/npc_team/navigate.jinx +10 -0
  133. npcsh-1.1.18.data/data/npcsh/npc_team/notify.jinx +10 -0
  134. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/nql.jinx +7 -7
  135. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/open_browser.jinx +2 -2
  136. npcsh-1.1.18.data/data/npcsh/npc_team/open_pane.jinx +13 -0
  137. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/ots.jinx +7 -7
  138. npcsh-1.1.18.data/data/npcsh/npc_team/paper_search.jinx +412 -0
  139. npcsh-1.1.18.data/data/npcsh/npc_team/plonk.jinx +379 -0
  140. npcsh-1.1.18.data/data/npcsh/npc_team/pti.jinx +357 -0
  141. npcsh-1.1.18.data/data/npcsh/npc_team/read_pane.jinx +9 -0
  142. npcsh-1.1.18.data/data/npcsh/npc_team/reattach.jinx +291 -0
  143. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/roll.jinx +20 -23
  144. npcsh-1.1.18.data/data/npcsh/npc_team/run_terminal.jinx +10 -0
  145. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/sample.jinx +6 -7
  146. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/screenshot.jinx +1 -1
  147. npcsh-1.1.18.data/data/npcsh/npc_team/search.jinx +54 -0
  148. npcsh-1.1.18.data/data/npcsh/npc_team/semantic_scholar.jinx +386 -0
  149. npcsh-1.1.18.data/data/npcsh/npc_team/send_message.jinx +10 -0
  150. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/serve.jinx +2 -2
  151. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/set.jinx +2 -2
  152. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/sh.jinx +1 -1
  153. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/sibiji.npc +1 -1
  154. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/sleep.jinx +29 -18
  155. npcsh-1.1.18.data/data/npcsh/npc_team/split_pane.jinx +12 -0
  156. npcsh-1.1.18.data/data/npcsh/npc_team/spool.jinx +350 -0
  157. npcsh-1.1.18.data/data/npcsh/npc_team/sql.jinx +20 -0
  158. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/switch.jinx +3 -3
  159. npcsh-1.1.18.data/data/npcsh/npc_team/switch_npc.jinx +10 -0
  160. npcsh-1.1.18.data/data/npcsh/npc_team/switch_tab.jinx +10 -0
  161. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/switches.jinx +1 -1
  162. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/sync.jinx +6 -6
  163. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/teamviz.jinx +2 -2
  164. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/trigger.jinx +2 -2
  165. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/type_text.jinx +1 -1
  166. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/vixynt.jinx +8 -8
  167. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/wait.jinx +1 -1
  168. npcsh-1.1.18.data/data/npcsh/npc_team/wander.jinx +455 -0
  169. npcsh-1.1.18.data/data/npcsh/npc_team/web_search.jinx +283 -0
  170. npcsh-1.1.18.data/data/npcsh/npc_team/write_file.jinx +11 -0
  171. {npcsh/npc_team/jinxs/bin → npcsh-1.1.18.data/data/npcsh/npc_team}/yap.jinx +8 -2
  172. npcsh-1.1.18.data/data/npcsh/npc_team/zen_mode.jinx +9 -0
  173. {npcsh-1.1.16.dist-info → npcsh-1.1.18.dist-info}/METADATA +99 -7
  174. npcsh-1.1.18.dist-info/RECORD +235 -0
  175. {npcsh-1.1.16.dist-info → npcsh-1.1.18.dist-info}/WHEEL +1 -1
  176. {npcsh-1.1.16.dist-info → npcsh-1.1.18.dist-info}/entry_points.txt +2 -3
  177. npcsh/npc_team/jinxs/bin/spool.jinx +0 -161
  178. npcsh/npc_team/jinxs/bin/wander.jinx +0 -152
  179. npcsh/npc_team/jinxs/lib/research/arxiv.jinx +0 -76
  180. npcsh-1.1.16.data/data/npcsh/npc_team/arxiv.jinx +0 -76
  181. npcsh-1.1.16.data/data/npcsh/npc_team/jinxs.jinx +0 -176
  182. npcsh-1.1.16.data/data/npcsh/npc_team/paper_search.jinx +0 -101
  183. npcsh-1.1.16.data/data/npcsh/npc_team/search.jinx +0 -131
  184. npcsh-1.1.16.data/data/npcsh/npc_team/semantic_scholar.jinx +0 -69
  185. npcsh-1.1.16.data/data/npcsh/npc_team/spool.jinx +0 -161
  186. npcsh-1.1.16.data/data/npcsh/npc_team/sql.jinx +0 -16
  187. npcsh-1.1.16.data/data/npcsh/npc_team/wander.jinx +0 -152
  188. npcsh-1.1.16.dist-info/RECORD +0 -170
  189. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/alicanto.npc +0 -0
  190. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/alicanto.png +0 -0
  191. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/close_browser.jinx +0 -0
  192. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/corca.npc +0 -0
  193. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/corca.png +0 -0
  194. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/corca_example.png +0 -0
  195. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/frederic.npc +0 -0
  196. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/frederic4.png +0 -0
  197. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/guac.npc +0 -0
  198. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/guac.png +0 -0
  199. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/kadiefa.npc +0 -0
  200. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/kadiefa.png +0 -0
  201. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/load_file.jinx +0 -0
  202. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/npcsh.ctx +0 -0
  203. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/npcsh_sibiji.png +0 -0
  204. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/paste.jinx +0 -0
  205. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/plonk.npc +0 -0
  206. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/plonk.png +0 -0
  207. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/plonkjr.npc +0 -0
  208. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/plonkjr.png +0 -0
  209. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/python.jinx +0 -0
  210. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/shh.jinx +0 -0
  211. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/sibiji.png +0 -0
  212. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/spool.png +0 -0
  213. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/usage.jinx +0 -0
  214. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/verbose.jinx +0 -0
  215. {npcsh-1.1.16.data → npcsh-1.1.18.data}/data/npcsh/npc_team/yap.png +0 -0
  216. {npcsh-1.1.16.dist-info → npcsh-1.1.18.dist-info}/licenses/LICENSE +0 -0
  217. {npcsh-1.1.16.dist-info → npcsh-1.1.18.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,611 @@
1
+ """
2
+ Benchmark runner for npcsh on Terminal-Bench.
3
+
4
+ Provides a convenient interface for running Terminal-Bench evaluations
5
+ with different models and providers.
6
+ """
7
+
8
+ import subprocess
9
+ import sys
10
+ import json
11
+ from dataclasses import dataclass, field
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from typing import Optional, List, Dict, Any
15
+
16
+
17
+
18
@dataclass
class BenchmarkConfig:
    """Bundle of settings describing a single benchmark run.

    Every field has a default, so ``BenchmarkConfig()`` describes a run
    against the latest ``terminal-bench`` dataset with Claude Sonnet.
    """

    # Model and provider used for the evaluation.
    model: str = "claude-sonnet-4-20250514"
    provider: str = "anthropic"

    # Dataset selection; a version of None means "use latest".
    dataset: str = "terminal-bench"
    dataset_version: Optional[str] = None

    # Number of tasks executed concurrently.
    n_concurrent: int = 4

    # Optional list of task IDs to run.
    task_ids: Optional[List[str]] = None

    # Optional output directory for the run.
    output_dir: Optional[str] = None

    # Optional name of a specific NPC to use.
    npc_name: Optional[str] = None

    # Per-task timeout in seconds.
    timeout: int = 600

    # Additional arguments; each instance gets its own fresh list.
    extra_args: List[str] = field(default_factory=list)
31
+
32
+
33
@dataclass
class BenchmarkResult:
    """Outcome and aggregate metrics of a benchmark run.

    Only ``success`` is required; every other field defaults to an
    "empty" value so a failed run can be reported with just a flag and
    an error message.
    """

    # Whether the run is considered successful.
    success: bool

    # Task counts.
    total_tasks: int = 0
    passed_tasks: int = 0
    failed_tasks: int = 0

    # Fraction of tasks that passed (0.0 when no tasks were recorded).
    accuracy: float = 0.0

    # Usage totals summed over the per-task results.
    total_tokens: int = 0
    total_cost_usd: float = 0.0

    # Wall-clock duration of the run in seconds.
    duration_seconds: float = 0.0

    # Directory holding the run's output files.
    output_dir: str = ""

    # Error description, or None when nothing went wrong.
    error: Optional[str] = None

    # Raw per-task result dictionaries; each instance gets its own list.
    task_results: List[Dict[str, Any]] = field(default_factory=list)
47
+
48
+
49
class BenchmarkRunner:
    """
    Runner for Terminal-Bench evaluations with npcsh.

    Example usage:
        runner = BenchmarkRunner()

        # Run with default settings (Claude Sonnet)
        result = runner.run()

        # Run with specific model
        result = runner.run(model="gpt-4o", provider="openai")

        # Compare multiple models
        results = runner.compare_models([
            ("claude-sonnet-4-20250514", "anthropic"),
            ("gpt-4o", "openai"),
            ("gemini-2.0-flash", "gemini"),
        ])
    """

    def __init__(self, output_base_dir: Optional[str] = None):
        """
        Initialize the benchmark runner.

        The base directory is created if it does not exist yet.

        Args:
            output_base_dir: Base directory for benchmark outputs.
                Defaults to ~/.npcsh/benchmarks/
        """
        if output_base_dir:
            self.output_base_dir = Path(output_base_dir)
        else:
            self.output_base_dir = Path.home() / ".npcsh" / "benchmarks"

        self.output_base_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _find_binary(name: str) -> Optional[str]:
        """Locate an executable, preferring the current Python environment.

        Looks in ``<sys.prefix>/bin`` first (falling back to the directory
        of ``sys.executable`` when that directory does not exist, without
        resolving symlinks), then on ``PATH``.

        Returns:
            The path as a string, or None when the executable is not found.
        """
        import shutil

        bin_dir = Path(sys.prefix) / "bin"
        if not bin_dir.exists():
            bin_dir = Path(sys.executable).parent

        candidate = bin_dir / name
        if candidate.exists():
            return str(candidate)
        return shutil.which(name)

    @staticmethod
    def _command_succeeds(cmd: List[str]) -> bool:
        """Return True when ``cmd`` can be launched and exits with status 0."""
        try:
            completed = subprocess.run(cmd, capture_output=True, text=True)
        except OSError:
            # Covers a missing binary (FileNotFoundError) as well as
            # permission and exec-format errors.
            return False
        return completed.returncode == 0

    def check_dependencies(self) -> Dict[str, bool]:
        """Check if required dependencies are installed.

        Returns:
            Mapping of "harbor", "terminal-bench" and "docker" to whether
            the corresponding CLI could be executed successfully.
        """
        harbor_bin = self._find_binary("harbor")
        # terminal-bench is probed through its `tb` CLI.
        tb_bin = self._find_binary("tb")

        return {
            "harbor": bool(harbor_bin)
            and self._command_succeeds([harbor_bin, "--version"]),
            "terminal-bench": bool(tb_bin)
            and self._command_succeeds([tb_bin, "--help"]),
            "docker": self._command_succeeds(["docker", "--version"]),
        }

    def install_dependencies(self) -> bool:
        """Install Terminal-Bench dependencies.

        Runs pip from the current interpreter to install ``harbor`` and
        ``terminal-bench``.

        Returns:
            True on success, False when pip exits with an error.
        """
        print("Installing Terminal-Bench dependencies...")

        try:
            subprocess.run(
                [sys.executable, "-m", "pip", "install", "harbor", "terminal-bench"],
                check=True
            )
            print("Dependencies installed successfully!")
            return True
        except subprocess.CalledProcessError as e:
            print(f"Failed to install dependencies: {e}")
            return False

    def run(
        self,
        model: str = "claude-sonnet-4-20250514",
        provider: str = "anthropic",
        dataset: str = "terminal-bench",
        dataset_version: Optional[str] = None,
        n_concurrent: int = 4,
        task_ids: Optional[List[str]] = None,
        n_tasks: Optional[int] = None,
        npc_name: Optional[str] = None,
        timeout: int = 600,
    ) -> "BenchmarkResult":
        """
        Run Terminal-Bench evaluation with npcsh.

        Args:
            model: Model name (e.g., "claude-sonnet-4-20250514", "gpt-4o")
            provider: Provider name (e.g., "anthropic", "openai", "gemini")
            dataset: Dataset name (default: "terminal-bench")
            dataset_version: Dataset version (optional, uses latest if None)
            n_concurrent: Number of concurrent task executions
            task_ids: Optional list of specific task IDs to run
            n_tasks: Optional limit on number of tasks to run
            npc_name: Optional NPC name to use (e.g., "sibiji", "corca")
            timeout: Per-task timeout in seconds

        Returns:
            BenchmarkResult with evaluation metrics. ``success`` is False
            (and ``error`` is set) when dependencies cannot be installed,
            the harbor process exits with a non-zero status or times out,
            the results file cannot be parsed, or any other exception
            occurs during the run.
        """
        # Check dependencies
        deps = self.check_dependencies()
        if not deps["harbor"]:
            print("Harbor not installed. Installing...")
            if not self.install_dependencies():
                return BenchmarkResult(
                    success=False,
                    error="Failed to install dependencies"
                )

        # Create output directory for this run
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        run_name = f"{provider}_{model}_{timestamp}".replace("/", "_")
        output_dir = self.output_base_dir / run_name
        output_dir.mkdir(parents=True, exist_ok=True)

        full_model = f"{provider}/{model}"

        # Choose agent based on whether NPC is specified.
        # NOTE(review): npc_name only selects the agent class here; it is
        # not passed on the harbor command line. Confirm the agent obtains
        # the NPC name some other way.
        if npc_name:
            agent_path = "npcsh.benchmark:NpcshAgentWithNpc"
        else:
            agent_path = "npcsh.benchmark:NpcshAgent"

        # Fall back to the bare command name so a missing binary surfaces
        # as an error from subprocess rather than a None in the command.
        harbor_bin = self._find_binary("harbor") or "harbor"

        # Build dataset string (with optional version)
        dataset_str = f"{dataset}@{dataset_version}" if dataset_version else dataset

        cmd = [
            harbor_bin, "run",
            "-d", dataset_str,
            "--agent-import-path", agent_path,
            "-m", full_model,
            "-n", str(n_concurrent),
            "-o", str(output_dir),
        ]

        if task_ids:
            for task_id in task_ids:
                cmd.extend(["--task-name", task_id])

        if n_tasks:
            cmd.extend(["-l", str(n_tasks)])

        print("\nRunning Terminal-Bench evaluation:")
        print(f" Model: {full_model}")
        print(f" Dataset: {dataset_str}")
        print(f" Concurrent tasks: {n_concurrent}")
        if n_tasks:
            print(f" Max tasks: {n_tasks}")
        print(f" Output: {output_dir}")
        if npc_name:
            print(f" NPC: {npc_name}")
        print(f"\nCommand: {' '.join(cmd)}\n")

        start_time = datetime.now()

        def elapsed() -> float:
            return (datetime.now() - start_time).total_seconds()

        try:
            # NOTE(review): the overall timeout grows with concurrency and
            # is independent of the number of tasks; confirm this is the
            # intended budget for a full dataset run.
            process = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout * n_concurrent * 2  # Overall timeout
            )

            duration = elapsed()

            # Parse results
            result = self._parse_results(output_dir, duration)
            result.output_dir = str(output_dir)

            if process.returncode != 0:
                # A failed harbor process must not be reported as a
                # successful run, even if partial results were parsed.
                result.success = False
                result.error = (
                    process.stderr
                    or f"harbor exited with status {process.returncode}"
                )

            # Save run metadata
            self._save_run_metadata(output_dir, {
                "model": full_model,
                "provider": provider,
                "dataset": dataset,
                "dataset_version": dataset_version,
                "n_concurrent": n_concurrent,
                "npc_name": npc_name,
                "duration_seconds": duration,
                "result": {
                    "success": result.success,
                    "accuracy": result.accuracy,
                    "total_tasks": result.total_tasks,
                    "passed_tasks": result.passed_tasks,
                }
            })

            return result

        except subprocess.TimeoutExpired:
            return BenchmarkResult(
                success=False,
                error="Benchmark timed out",
                duration_seconds=elapsed(),
                output_dir=str(output_dir)
            )
        except Exception as e:
            # Top-level boundary: report any failure through the result
            # object instead of raising to the caller.
            return BenchmarkResult(
                success=False,
                error=str(e),
                duration_seconds=elapsed(),
                output_dir=str(output_dir)
            )

    def _parse_results(self, output_dir: Path, duration: float) -> "BenchmarkResult":
        """Parse benchmark results from output directory.

        Reads ``results.json`` from ``output_dir`` when it exists. A missing
        file yields a successful result with zero tasks; an unreadable or
        malformed file yields ``success=False`` with ``error`` set.

        NOTE(review): assumes the file is a JSON object with "total",
        "passed", "failed" and "tasks" keys - confirm against Harbor's
        actual output layout.
        """
        result = BenchmarkResult(
            success=True,
            duration_seconds=duration
        )

        results_file = output_dir / "results.json"
        if not results_file.exists():
            return result

        try:
            with open(results_file, encoding="utf-8") as f:
                data = json.load(f)
            if not isinstance(data, dict):
                raise ValueError("top-level JSON value is not an object")
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            result.success = False
            result.error = f"Failed to parse results: {e}"
            return result

        result.total_tasks = data.get("total", 0)
        result.passed_tasks = data.get("passed", 0)
        result.failed_tasks = data.get("failed", 0)

        if result.total_tasks > 0:
            result.accuracy = result.passed_tasks / result.total_tasks

        result.task_results = data.get("tasks", [])

        # Aggregate token usage, tolerating missing or null entries.
        for task in result.task_results:
            if not isinstance(task, dict):
                continue
            result.total_tokens += task.get("tokens") or 0
            result.total_cost_usd += task.get("cost_usd") or 0.0

        return result

    def _save_run_metadata(self, output_dir: Path, metadata: Dict[str, Any]) -> None:
        """Save run metadata to output directory.

        Writes ``run_metadata.json`` containing ``metadata`` plus a
        "timestamp" key holding the current local time in ISO format.
        The caller's dictionary is left unmodified.
        """
        metadata_file = output_dir / "run_metadata.json"
        payload = {**metadata, "timestamp": datetime.now().isoformat()}

        with open(metadata_file, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2)

    def compare_models(
        self,
        models: List[tuple],
        dataset: str = "terminal-bench",
        dataset_version: Optional[str] = None,
        n_concurrent: int = 4,
        task_ids: Optional[List[str]] = None,
    ) -> Dict[str, "BenchmarkResult"]:
        """
        Compare multiple models on the same benchmark.

        Models are evaluated one after another with :meth:`run`, and a
        summary table is printed once all of them have finished.

        Args:
            models: List of (model, provider) tuples
            dataset: Dataset name
            dataset_version: Dataset version (optional)
            n_concurrent: Number of concurrent tasks
            task_ids: Optional specific task IDs

        Returns:
            Dictionary mapping "provider/model" names to results

        Example:
            results = runner.compare_models([
                ("claude-sonnet-4-20250514", "anthropic"),
                ("gpt-4o", "openai"),
                ("gemini-2.0-flash", "gemini"),
            ])
        """
        results = {}

        for model, provider in models:
            print("\n" + '='*60)
            print(f"Evaluating: {provider}/{model}")
            print('='*60)

            result = self.run(
                model=model,
                provider=provider,
                dataset=dataset,
                dataset_version=dataset_version,
                n_concurrent=n_concurrent,
                task_ids=task_ids,
            )

            results[f"{provider}/{model}"] = result

            print(f"\nResult for {provider}/{model}:")
            print(f" Accuracy: {result.accuracy:.1%}")
            print(f" Tasks: {result.passed_tasks}/{result.total_tasks}")
            print(f" Duration: {result.duration_seconds:.1f}s")

        # Print comparison summary
        self._print_comparison_summary(results)

        return results

    def _print_comparison_summary(self, results: Dict[str, "BenchmarkResult"]) -> None:
        """Print a comparison summary table, best accuracy first."""
        print("\n" + '='*60)
        print("COMPARISON SUMMARY")
        print('='*60)
        print(f"{'Model':<40} {'Accuracy':>10} {'Tasks':>10}")
        print("-" * 60)

        sorted_results = sorted(
            results.items(),
            key=lambda x: x[1].accuracy,
            reverse=True
        )

        for model_name, result in sorted_results:
            print(
                f"{model_name:<40} "
                f"{result.accuracy:>9.1%} "
                f"{result.passed_tasks:>4}/{result.total_tasks:<4}"
            )

    def list_past_runs(self) -> List[Dict[str, Any]]:
        """List all past benchmark runs.

        Scans the sub-directories of the output base directory for
        ``run_metadata.json`` files. Entries that cannot be read, are not
        valid JSON, or do not contain a JSON object are skipped.

        Returns:
            Metadata dictionaries (each with an added "run_dir" key),
            sorted by their "timestamp" value, newest first.
        """
        runs = []

        for run_dir in self.output_base_dir.iterdir():
            if not run_dir.is_dir():
                continue

            metadata_file = run_dir / "run_metadata.json"
            if not metadata_file.exists():
                continue

            try:
                with open(metadata_file, encoding="utf-8") as f:
                    metadata = json.load(f)
            except (OSError, ValueError):
                # Best effort: one broken run must not hide the others.
                continue

            if not isinstance(metadata, dict):
                continue

            metadata["run_dir"] = str(run_dir)
            runs.append(metadata)

        # Coerce to str so a null/non-string timestamp cannot break sorting.
        return sorted(
            runs,
            key=lambda x: str(x.get("timestamp") or ""),
            reverse=True
        )
444
+
445
+
446
def run_benchmark(
    model: str = "claude-sonnet-4-20250514",
    provider: str = "anthropic",
    **kwargs
) -> BenchmarkResult:
    """
    Run a Terminal-Bench evaluation with a freshly created runner.

    A new BenchmarkRunner (default output directory) is built for each
    call and its ``run()`` method is invoked.

    Args:
        model: Model name
        provider: Provider name
        **kwargs: Additional arguments passed to BenchmarkRunner.run()

    Returns:
        BenchmarkResult

    Example:
        from npcsh.benchmark import run_benchmark

        # Run with Claude
        result = run_benchmark("claude-sonnet-4-20250514", "anthropic")
        print(f"Accuracy: {result.accuracy:.1%}")

        # Run with GPT-4
        result = run_benchmark("gpt-4o", "openai")
    """
    return BenchmarkRunner().run(model=model, provider=provider, **kwargs)
474
+
475
+
476
def quick_test(
    model: str = "claude-sonnet-4-20250514",
    provider: str = "anthropic",
    n_tasks: int = 3,
) -> BenchmarkResult:
    """
    Verify the benchmark setup by running only a handful of tasks.

    Tasks are executed one at a time (``n_concurrent=1``) and the run is
    capped at ``n_tasks`` tasks.
    """
    # Capping the task count (harbor's -l flag) rather than naming tasks
    # keeps this working when task names in the dataset change.
    limited_run = {
        "model": model,
        "provider": provider,
        "n_concurrent": 1,
        "n_tasks": n_tasks,
    }
    return BenchmarkRunner().run(**limited_run)
496
+
497
+
498
def main():
    """CLI entry point for npcsh-bench command.

    Dispatches on the first matching mode flag, in this order:
    ``--check``, ``--list-runs``, ``--compare``, ``--quick``; with none of
    them given, a full benchmark run is started.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Run Terminal-Bench with npcsh",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Check dependencies
  npcsh-bench --check

  # Quick test with Claude
  npcsh-bench --quick -m claude-sonnet-4-20250514 -p anthropic

  # Full benchmark run
  npcsh-bench -m gpt-4o -p openai -n 8

  # List past runs
  npcsh-bench --list-runs

  # Compare models (requires manual setup)
  npcsh-bench --compare
"""
    )
    # Explicit defaults: without them argparse yields None, which would be
    # passed through to run()/quick_test() and override their own defaults,
    # producing a model string of "None/None".
    parser.add_argument("--model", "-m", default="claude-sonnet-4-20250514",
                        help="Model name")
    parser.add_argument("--provider", "-p", default="anthropic",
                        help="Provider name")
    parser.add_argument("--dataset", "-d", default="terminal-bench",
                        help="Dataset name")
    parser.add_argument("--version", "-v", default=None,
                        help="Dataset version (optional, uses latest if not specified)")
    parser.add_argument("--concurrent", "-n", type=int, default=4,
                        help="Number of concurrent tasks")
    parser.add_argument("--npc", help="NPC name to use")
    parser.add_argument("--quick", action="store_true",
                        help="Run quick test with few tasks")
    parser.add_argument("--list-runs", action="store_true",
                        help="List past benchmark runs")
    parser.add_argument("--check", action="store_true",
                        help="Check if dependencies are installed")
    # Help text lists the models that are actually compared below.
    parser.add_argument("--compare", action="store_true",
                        help="Compare multiple models (GPT-4o, Gemini)")

    args = parser.parse_args()

    runner = BenchmarkRunner()

    if args.check:
        print("Checking Terminal-Bench dependencies...\n")
        deps = runner.check_dependencies()
        for dep, installed in deps.items():
            status = "OK" if installed else "MISSING"
            symbol = "+" if installed else "-"
            print(f" [{symbol}] {dep}: {status}")

        if not all(deps.values()):
            print("\nTo install missing dependencies:")
            print(" pip install harbor terminal-bench")
            print("\nOr install with npcsh:")
            print(" pip install npcsh[bench]")
        else:
            print("\nAll dependencies installed!")

    elif args.list_runs:
        runs = runner.list_past_runs()
        if not runs:
            print("No past benchmark runs found.")
        else:
            print(f"Found {len(runs)} past runs:\n")
            for run in runs:
                print(f" {run.get('timestamp', 'unknown')}: {run.get('model', 'unknown')}")
                result = run.get('result', {})
                print(f" Accuracy: {result.get('accuracy', 0):.1%}")
                print(f" Tasks: {result.get('passed_tasks', 0)}/{result.get('total_tasks', 0)}")
                print()

    elif args.compare:
        print("Comparing models on Terminal-Bench 2.0...\n")
        models_to_compare = [
            ("gpt-4o", "openai"),
            ("gemini-2.0-flash", "gemini"),
        ]
        # Honour --dataset/--version here too; their defaults match
        # compare_models' own defaults, so plain `--compare` is unchanged.
        runner.compare_models(
            models_to_compare,
            dataset=args.dataset,
            dataset_version=args.version,
            n_concurrent=args.concurrent
        )

    elif args.quick:
        result = quick_test(args.model, args.provider)
        print(f"\nQuick test result: {'PASS' if result.success else 'FAIL'}")
        print(f"Accuracy: {result.accuracy:.1%}")
        if result.error:
            print(f"Error: {result.error}", file=sys.stderr)

    else:
        result = runner.run(
            model=args.model,
            provider=args.provider,
            dataset=args.dataset,
            dataset_version=args.version,
            n_concurrent=args.concurrent,
            npc_name=args.npc,
        )
        print("\nBenchmark complete!")
        print(f"Accuracy: {result.accuracy:.1%}")
        print(f"Results saved to: {result.output_dir}")
        if result.error:
            # Surface failures instead of only reporting 0.0% accuracy.
            print(f"Error: {result.error}", file=sys.stderr)


if __name__ == "__main__":
    main()
@@ -0,0 +1,35 @@
1
#!/bin/bash
# Installation script for npcsh in Terminal-Bench containers.
# This is a template that Harbor renders before execution; the
# {% ... %} / {{ ... }} markers below are template syntax, not shell.

# Abort on the first failing command.
set -e

echo "Installing npcsh for Terminal-Bench evaluation..."

# Install pip through apt when no `pip` command is on PATH.
if ! command -v pip &> /dev/null; then
    echo "Installing pip..."
    apt-get update && apt-get install -y python3-pip
fi

# Install npcsh with lite dependencies (API providers only, no local models).
# --break-system-packages is tried first for PEP 668 environments
# (Ubuntu 24.04+); if that attempt fails, pip is retried without the flag.
# The requirement is quoted so the shell never treats [lite] as a glob.
echo "Installing npcsh[lite]..."
pip install --quiet --break-system-packages "npcsh[lite]" || pip install --quiet "npcsh[lite]"

# Verify installation by checking that the `npc` command runs.
echo "Verifying npcsh installation..."
npc --help > /dev/null 2>&1 || {
    echo "ERROR: npcsh installation failed"
    exit 1
}

# Set up default configuration.
# NOTE(review): these exports only affect this shell and its children;
# confirm the agent is launched from a process that inherits them.
export NPCSH_STREAM_OUTPUT=0
export NPCSH_LOG_LEVEL=warning

{% if version %}
echo "npcsh version: {{ version }}"
{% endif %}

echo "npcsh installation complete!"
npcsh/build.py CHANGED
@@ -1,4 +1,3 @@
1
- import os
2
1
  import shutil
3
2
  import textwrap
4
3
  from pathlib import Path
@@ -11,8 +10,7 @@ def build_flask_server(config, **kwargs):
11
10
  server_script = output_dir / 'npc_server.py'
12
11
 
13
12
  server_code = textwrap.dedent(f'''
14
- import os
15
- from npcpy.serve import start_flask_server
13
+ from npcpy.serve import start_flask_server
16
14
  from npcpy.npc_compiler import Team
17
15
  from sqlalchemy import create_engine
18
16
 
@@ -111,7 +109,7 @@ def build_docker_compose(config, **kwargs):
111
109
  volumes:
112
110
  - npc-data:/root/.npcsh
113
111
  environment:
114
- - NPCSH_DB_PATH=/root/.npcsh/npcsh_history.db
112
+ - NPCSH_DB_PATH=/root/npcsh_history.db
115
113
 
116
114
  volumes:
117
115
  npc-data:
npcsh/completion.py CHANGED
@@ -2,8 +2,7 @@
2
2
  Readline and tab completion for npcsh
3
3
  """
4
4
  import os
5
- import shutil
6
- from typing import List, Any, Optional
5
+ from typing import List, Any
7
6
 
8
7
  try:
9
8
  import readline
@@ -66,11 +65,8 @@ def get_file_completions(text: str) -> List[str]:
66
65
  completions = []
67
66
 
68
67
  if text.startswith("~"):
69
- expanded = os.path.expanduser(text)
70
- prefix = "~"
71
- search_path = expanded
68
+ search_path = os.path.expanduser(text)
72
69
  else:
73
- prefix = ""
74
70
  search_path = text
75
71
 
76
72
  # Get directory to search