npcsh 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl

This diff reflects the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (197)
  1. npcsh/_state.py +122 -91
  2. npcsh/alicanto.py +2 -2
  3. npcsh/benchmark/__init__.py +8 -2
  4. npcsh/benchmark/npcsh_agent.py +87 -22
  5. npcsh/benchmark/runner.py +85 -43
  6. npcsh/benchmark/templates/install-npcsh.sh.j2 +35 -0
  7. npcsh/build.py +2 -4
  8. npcsh/completion.py +2 -6
  9. npcsh/config.py +2 -3
  10. npcsh/conversation_viewer.py +389 -0
  11. npcsh/corca.py +0 -1
  12. npcsh/diff_viewer.py +452 -0
  13. npcsh/execution.py +0 -1
  14. npcsh/guac.py +0 -1
  15. npcsh/mcp_helpers.py +2 -3
  16. npcsh/mcp_server.py +5 -10
  17. npcsh/npc.py +10 -11
  18. npcsh/npc_team/jinxs/bin/benchmark.jinx +1 -1
  19. npcsh/npc_team/jinxs/bin/config_tui.jinx +299 -0
  20. npcsh/npc_team/jinxs/bin/memories.jinx +316 -0
  21. npcsh/npc_team/jinxs/bin/setup.jinx +240 -0
  22. npcsh/npc_team/jinxs/bin/sync.jinx +143 -150
  23. npcsh/npc_team/jinxs/bin/team_tui.jinx +327 -0
  24. npcsh/npc_team/jinxs/incognide/add_tab.jinx +1 -1
  25. npcsh/npc_team/jinxs/incognide/close_pane.jinx +1 -1
  26. npcsh/npc_team/jinxs/incognide/close_tab.jinx +1 -1
  27. npcsh/npc_team/jinxs/incognide/confirm.jinx +1 -1
  28. npcsh/npc_team/jinxs/incognide/focus_pane.jinx +1 -1
  29. npcsh/npc_team/jinxs/incognide/list_panes.jinx +1 -1
  30. npcsh/npc_team/jinxs/incognide/navigate.jinx +1 -1
  31. npcsh/npc_team/jinxs/incognide/notify.jinx +1 -1
  32. npcsh/npc_team/jinxs/incognide/open_pane.jinx +1 -1
  33. npcsh/npc_team/jinxs/incognide/read_pane.jinx +1 -1
  34. npcsh/npc_team/jinxs/incognide/run_terminal.jinx +1 -1
  35. npcsh/npc_team/jinxs/incognide/send_message.jinx +1 -1
  36. npcsh/npc_team/jinxs/incognide/split_pane.jinx +1 -1
  37. npcsh/npc_team/jinxs/incognide/switch_npc.jinx +1 -1
  38. npcsh/npc_team/jinxs/incognide/switch_tab.jinx +1 -1
  39. npcsh/npc_team/jinxs/incognide/write_file.jinx +1 -1
  40. npcsh/npc_team/jinxs/incognide/zen_mode.jinx +1 -1
  41. npcsh/npc_team/jinxs/lib/core/search/db_search.jinx +321 -17
  42. npcsh/npc_team/jinxs/lib/core/search/file_search.jinx +312 -67
  43. npcsh/npc_team/jinxs/lib/core/search/kg_search.jinx +366 -44
  44. npcsh/npc_team/jinxs/lib/core/search/mem_review.jinx +73 -0
  45. npcsh/npc_team/jinxs/lib/core/search/mem_search.jinx +328 -20
  46. npcsh/npc_team/jinxs/lib/core/search/web_search.jinx +242 -10
  47. npcsh/npc_team/jinxs/lib/core/sleep.jinx +22 -11
  48. npcsh/npc_team/jinxs/lib/core/sql.jinx +10 -6
  49. npcsh/npc_team/jinxs/lib/research/paper_search.jinx +387 -76
  50. npcsh/npc_team/jinxs/lib/research/semantic_scholar.jinx +372 -55
  51. npcsh/npc_team/jinxs/lib/utils/jinxs.jinx +299 -144
  52. npcsh/npc_team/jinxs/modes/alicanto.jinx +356 -0
  53. npcsh/npc_team/jinxs/modes/arxiv.jinx +720 -0
  54. npcsh/npc_team/jinxs/modes/corca.jinx +430 -0
  55. npcsh/npc_team/jinxs/modes/guac.jinx +542 -0
  56. npcsh/npc_team/jinxs/modes/plonk.jinx +379 -0
  57. npcsh/npc_team/jinxs/modes/pti.jinx +357 -0
  58. npcsh/npc_team/jinxs/modes/reattach.jinx +291 -0
  59. npcsh/npc_team/jinxs/modes/spool.jinx +350 -0
  60. npcsh/npc_team/jinxs/modes/wander.jinx +455 -0
  61. npcsh/npc_team/jinxs/{bin → modes}/yap.jinx +13 -7
  62. npcsh/npcsh.py +7 -4
  63. npcsh/plonk.py +0 -1
  64. npcsh/pti.py +0 -1
  65. npcsh/routes.py +1 -3
  66. npcsh/spool.py +0 -1
  67. npcsh/ui.py +0 -1
  68. npcsh/wander.py +0 -1
  69. npcsh/yap.py +0 -1
  70. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/add_tab.jinx +1 -1
  71. npcsh-1.1.19.data/data/npcsh/npc_team/alicanto.jinx +356 -0
  72. npcsh-1.1.19.data/data/npcsh/npc_team/arxiv.jinx +720 -0
  73. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/benchmark.jinx +1 -1
  74. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/close_pane.jinx +1 -1
  75. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/close_tab.jinx +1 -1
  76. npcsh-1.1.19.data/data/npcsh/npc_team/config_tui.jinx +299 -0
  77. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/confirm.jinx +1 -1
  78. npcsh-1.1.19.data/data/npcsh/npc_team/corca.jinx +430 -0
  79. npcsh-1.1.19.data/data/npcsh/npc_team/db_search.jinx +348 -0
  80. npcsh-1.1.19.data/data/npcsh/npc_team/file_search.jinx +339 -0
  81. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/focus_pane.jinx +1 -1
  82. npcsh-1.1.19.data/data/npcsh/npc_team/guac.jinx +542 -0
  83. npcsh-1.1.19.data/data/npcsh/npc_team/jinxs.jinx +331 -0
  84. npcsh-1.1.19.data/data/npcsh/npc_team/kg_search.jinx +418 -0
  85. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/list_panes.jinx +1 -1
  86. npcsh-1.1.19.data/data/npcsh/npc_team/mem_review.jinx +73 -0
  87. npcsh-1.1.19.data/data/npcsh/npc_team/mem_search.jinx +388 -0
  88. npcsh-1.1.19.data/data/npcsh/npc_team/memories.jinx +316 -0
  89. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/navigate.jinx +1 -1
  90. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/notify.jinx +1 -1
  91. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/open_pane.jinx +1 -1
  92. npcsh-1.1.19.data/data/npcsh/npc_team/paper_search.jinx +412 -0
  93. npcsh-1.1.19.data/data/npcsh/npc_team/plonk.jinx +379 -0
  94. npcsh-1.1.19.data/data/npcsh/npc_team/pti.jinx +357 -0
  95. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/read_pane.jinx +1 -1
  96. npcsh-1.1.19.data/data/npcsh/npc_team/reattach.jinx +291 -0
  97. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/run_terminal.jinx +1 -1
  98. npcsh-1.1.19.data/data/npcsh/npc_team/semantic_scholar.jinx +386 -0
  99. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/send_message.jinx +1 -1
  100. npcsh-1.1.19.data/data/npcsh/npc_team/setup.jinx +240 -0
  101. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/sleep.jinx +22 -11
  102. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/split_pane.jinx +1 -1
  103. npcsh-1.1.19.data/data/npcsh/npc_team/spool.jinx +350 -0
  104. npcsh-1.1.19.data/data/npcsh/npc_team/sql.jinx +20 -0
  105. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/switch_npc.jinx +1 -1
  106. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/switch_tab.jinx +1 -1
  107. npcsh-1.1.19.data/data/npcsh/npc_team/sync.jinx +223 -0
  108. npcsh-1.1.19.data/data/npcsh/npc_team/team_tui.jinx +327 -0
  109. npcsh-1.1.19.data/data/npcsh/npc_team/wander.jinx +455 -0
  110. npcsh-1.1.19.data/data/npcsh/npc_team/web_search.jinx +283 -0
  111. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/write_file.jinx +1 -1
  112. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/yap.jinx +13 -7
  113. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/zen_mode.jinx +1 -1
  114. {npcsh-1.1.17.dist-info → npcsh-1.1.19.dist-info}/METADATA +110 -14
  115. npcsh-1.1.19.dist-info/RECORD +244 -0
  116. {npcsh-1.1.17.dist-info → npcsh-1.1.19.dist-info}/WHEEL +1 -1
  117. {npcsh-1.1.17.dist-info → npcsh-1.1.19.dist-info}/entry_points.txt +4 -3
  118. npcsh/npc_team/jinxs/bin/spool.jinx +0 -161
  119. npcsh/npc_team/jinxs/bin/wander.jinx +0 -242
  120. npcsh/npc_team/jinxs/lib/research/arxiv.jinx +0 -76
  121. npcsh-1.1.17.data/data/npcsh/npc_team/arxiv.jinx +0 -76
  122. npcsh-1.1.17.data/data/npcsh/npc_team/db_search.jinx +0 -44
  123. npcsh-1.1.17.data/data/npcsh/npc_team/file_search.jinx +0 -94
  124. npcsh-1.1.17.data/data/npcsh/npc_team/jinxs.jinx +0 -176
  125. npcsh-1.1.17.data/data/npcsh/npc_team/kg_search.jinx +0 -96
  126. npcsh-1.1.17.data/data/npcsh/npc_team/mem_search.jinx +0 -80
  127. npcsh-1.1.17.data/data/npcsh/npc_team/paper_search.jinx +0 -101
  128. npcsh-1.1.17.data/data/npcsh/npc_team/semantic_scholar.jinx +0 -69
  129. npcsh-1.1.17.data/data/npcsh/npc_team/spool.jinx +0 -161
  130. npcsh-1.1.17.data/data/npcsh/npc_team/sql.jinx +0 -16
  131. npcsh-1.1.17.data/data/npcsh/npc_team/sync.jinx +0 -230
  132. npcsh-1.1.17.data/data/npcsh/npc_team/wander.jinx +0 -242
  133. npcsh-1.1.17.data/data/npcsh/npc_team/web_search.jinx +0 -51
  134. npcsh-1.1.17.dist-info/RECORD +0 -219
  135. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/alicanto.npc +0 -0
  136. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/alicanto.png +0 -0
  137. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/browser_action.jinx +0 -0
  138. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/browser_screenshot.jinx +0 -0
  139. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/build.jinx +0 -0
  140. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/chat.jinx +0 -0
  141. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/click.jinx +0 -0
  142. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/close_browser.jinx +0 -0
  143. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/cmd.jinx +0 -0
  144. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/compile.jinx +0 -0
  145. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/compress.jinx +0 -0
  146. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/convene.jinx +0 -0
  147. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/corca.npc +0 -0
  148. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/corca.png +0 -0
  149. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/corca_example.png +0 -0
  150. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/delegate.jinx +0 -0
  151. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/edit_file.jinx +0 -0
  152. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/frederic.npc +0 -0
  153. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/frederic4.png +0 -0
  154. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/guac.npc +0 -0
  155. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/guac.png +0 -0
  156. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/help.jinx +0 -0
  157. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/incognide.jinx +0 -0
  158. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/init.jinx +0 -0
  159. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/kadiefa.npc +0 -0
  160. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/kadiefa.png +0 -0
  161. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/key_press.jinx +0 -0
  162. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/launch_app.jinx +0 -0
  163. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/load_file.jinx +0 -0
  164. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/npcsh.ctx +0 -0
  165. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/npcsh_sibiji.png +0 -0
  166. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/nql.jinx +0 -0
  167. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/open_browser.jinx +0 -0
  168. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/ots.jinx +0 -0
  169. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/paste.jinx +0 -0
  170. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/plonk.npc +0 -0
  171. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/plonk.png +0 -0
  172. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/plonkjr.npc +0 -0
  173. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/plonkjr.png +0 -0
  174. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/python.jinx +0 -0
  175. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/roll.jinx +0 -0
  176. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/sample.jinx +0 -0
  177. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/screenshot.jinx +0 -0
  178. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/search.jinx +0 -0
  179. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/serve.jinx +0 -0
  180. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/set.jinx +0 -0
  181. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/sh.jinx +0 -0
  182. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/shh.jinx +0 -0
  183. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/sibiji.npc +0 -0
  184. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/sibiji.png +0 -0
  185. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/spool.png +0 -0
  186. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/switch.jinx +0 -0
  187. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/switches.jinx +0 -0
  188. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/teamviz.jinx +0 -0
  189. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/trigger.jinx +0 -0
  190. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/type_text.jinx +0 -0
  191. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/usage.jinx +0 -0
  192. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/verbose.jinx +0 -0
  193. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/vixynt.jinx +0 -0
  194. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/wait.jinx +0 -0
  195. {npcsh-1.1.17.data → npcsh-1.1.19.data}/data/npcsh/npc_team/yap.png +0 -0
  196. {npcsh-1.1.17.dist-info → npcsh-1.1.19.dist-info}/licenses/LICENSE +0 -0
  197. {npcsh-1.1.17.dist-info → npcsh-1.1.19.dist-info}/top_level.txt +0 -0
npcsh/benchmark/npcsh_agent.py CHANGED
@@ -9,7 +9,6 @@ import json
 import os
 import shlex
 from pathlib import Path
-from typing import Optional
 
 from harbor.agents.installed.base import BaseInstalledAgent, ExecInput
 from harbor.models.agent.context import AgentContext
@@ -54,7 +53,28 @@ class NpcshAgent(BaseInstalledAgent):
         Returns:
             List of ExecInput commands to execute
         """
-        escaped_instruction = shlex.quote(instruction)
+        # Wrap the instruction with explicit jinx usage directions and retry logic
+        tool_instruction = f"""You have access to jinxs including edit_file (for writing/creating files), sh (for running shell commands), and python (for running Python code).
+
+IMPORTANT RULES:
+1. You MUST use these jinxs to complete the task. Do NOT just output code as text - use the edit_file jinx to actually write files to disk.
+2. After implementing a solution, you MUST verify it works by running any provided test scripts.
+3. If a test fails or produces an error, you MUST try a DIFFERENT approach. Do not give up.
+4. Keep trying different approaches until you succeed or have tried at least 10 different solutions.
+5. NEVER assume success - always check the actual output of test commands.
+
+Task: {instruction}
+
+WORKFLOW:
+1. Implement your solution using edit_file and sh
+2. Run any test scripts mentioned in the task
+3. Check the output carefully - look for "PASS", "SUCCESS", "OK" or similar
+4. If the test failed, analyze why and try a completely different approach
+5. Repeat until the test passes
+
+Remember: Use edit_file to write code files. Use sh to run commands. VERIFY your solution works before concluding."""
+
+        escaped_instruction = shlex.quote(tool_instruction)
         model_name = self.model_name
 
         if model_name and "/" in model_name:
@@ -82,18 +102,25 @@ class NpcshAgent(BaseInstalledAgent):
         # Build environment variables for API keys
         env_vars = []
         api_key_map = {
-            "anthropic": "ANTHROPIC_API_KEY",
-            "openai": "OPENAI_API_KEY",
-            "gemini": "GOOGLE_API_KEY",
-            "google": "GOOGLE_API_KEY",
-            "deepseek": "DEEPSEEK_API_KEY",
-            "groq": "GROQ_API_KEY",
-            "openrouter": "OPENROUTER_API_KEY",
+            "anthropic": ["ANTHROPIC_API_KEY"],
+            "openai": ["OPENAI_API_KEY"],
+            "gemini": ["GOOGLE_API_KEY", "GEMINI_API_KEY"],
+            "google": ["GOOGLE_API_KEY", "GEMINI_API_KEY"],
+            "deepseek": ["DEEPSEEK_API_KEY"],
+            "groq": ["GROQ_API_KEY"],
+            "openrouter": ["OPENROUTER_API_KEY"],
         }
 
-        for prov, env_key in api_key_map.items():
-            if env_key in os.environ:
-                env_vars.append(f'{env_key}="{os.environ[env_key]}"')
+        added_keys = set()
+        for prov, env_keys in api_key_map.items():
+            for env_key in env_keys:
+                if env_key in os.environ:
+                    # For Gemini, always pass as GOOGLE_API_KEY (what litellm expects)
+                    target_key = "GOOGLE_API_KEY" if env_key == "GEMINI_API_KEY" else env_key
+                    if target_key not in added_keys:
+                        env_vars.append(f'{target_key}="{os.environ[env_key]}"')
+                        added_keys.add(target_key)
+                    break
 
         env_prefix = " ".join(env_vars) + " " if env_vars else ""
 
@@ -105,24 +132,33 @@ class NpcshAgent(BaseInstalledAgent):
 
         # Create output directory
         commands.append(ExecInput(
-            cmd=f"mkdir -p {shlex.quote(output_dir)}",
-            timeout=30
+            command=f"mkdir -p {shlex.quote(output_dir)}",
+            timeout_sec=30
+        ))
+
+        # Create .npcsh_global file to use global team and avoid interactive prompts
+        commands.append(ExecInput(
+            command="touch /app/.npcsh_global",
+            timeout_sec=10
         ))
 
         # Run npcsh with the instruction
+        # Using corca NPC which has edit_file tool for writing files
         # Using the npc CLI which supports single-command execution
+        # NPCSH_DEFAULT_MODE=agent enables automatic tool execution
         npcsh_cmd = (
            f'{env_prefix}'
            f'NPCSH_CHAT_MODEL="{model}" '
            f'NPCSH_CHAT_PROVIDER="{npcsh_provider}" '
            f'NPCSH_STREAM_OUTPUT=0 '
-            f'npc {escaped_instruction} '
+            f'NPCSH_DEFAULT_MODE=agent '
+            f'npc --npc corca {escaped_instruction} '
            f'2>&1 | tee {shlex.quote(output_file)}'
         )
 
         commands.append(ExecInput(
-            cmd=npcsh_cmd,
-            timeout=600, # 10 minute timeout for complex tasks
+            command=npcsh_cmd,
+            timeout_sec=600, # 10 minute timeout for complex tasks
         ))
 
         return commands
@@ -198,7 +234,28 @@ class NpcshAgentWithNpc(NpcshAgent):
 
     def create_run_agent_commands(self, instruction: str) -> list:
         """Create commands using a specific NPC."""
-        escaped_instruction = shlex.quote(instruction)
+        # Wrap the instruction with explicit jinx usage directions and retry logic
+        tool_instruction = f"""You have access to jinxs including edit_file (for writing/creating files), sh (for running shell commands), and python (for running Python code).
+
+IMPORTANT RULES:
+1. You MUST use these jinxs to complete the task. Do NOT just output code as text - use the edit_file jinx to actually write files to disk.
+2. After implementing a solution, you MUST verify it works by running any provided test scripts.
+3. If a test fails or produces an error, you MUST try a DIFFERENT approach. Do not give up.
+4. Keep trying different approaches until you succeed or have tried at least 10 different solutions.
+5. NEVER assume success - always check the actual output of test commands.
+
+Task: {instruction}
+
+WORKFLOW:
+1. Implement your solution using edit_file and sh
+2. Run any test scripts mentioned in the task
+3. Check the output carefully - look for "PASS", "SUCCESS", "OK" or similar
+4. If the test failed, analyze why and try a completely different approach
+5. Repeat until the test passes
+
+Remember: Use edit_file to write code files. Use sh to run commands. VERIFY your solution works before concluding."""
+
+        escaped_instruction = shlex.quote(tool_instruction)
         model_name = self.model_name
 
         if model_name and "/" in model_name:
@@ -240,23 +297,31 @@ class NpcshAgentWithNpc(NpcshAgent):
         commands = []
 
         commands.append(ExecInput(
-            cmd=f"mkdir -p {shlex.quote(output_dir)}",
-            timeout=30
+            command=f"mkdir -p {shlex.quote(output_dir)}",
+            timeout_sec=30
+        ))
+
+        # Create .npcsh_global file to use global team and avoid interactive prompts
+        commands.append(ExecInput(
+            command="touch /app/.npcsh_global",
+            timeout_sec=10
         ))
 
         # Use specific NPC with --npc flag
+        # NPCSH_DEFAULT_MODE=agent enables automatic tool execution
         npcsh_cmd = (
            f'{env_prefix}'
            f'NPCSH_CHAT_MODEL="{model}" '
            f'NPCSH_CHAT_PROVIDER="{npcsh_provider}" '
            f'NPCSH_STREAM_OUTPUT=0 '
+            f'NPCSH_DEFAULT_MODE=agent '
            f'npc --npc {self.npc_name} {escaped_instruction} '
            f'2>&1 | tee {shlex.quote(output_file)}'
         )
 
         commands.append(ExecInput(
-            cmd=npcsh_cmd,
-            timeout=600,
+            command=npcsh_cmd,
+            timeout_sec=600,
         ))
 
         return commands
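The API-key handling in this file is easier to follow in isolation. Below is a minimal standalone sketch of the provider-to-environment-variable resolution introduced above; the mapping and normalization logic are taken from the hunk, while the resolve_env_vars helper name and the __main__ demo are illustrative additions, not part of the package.

import os

# Map each provider to the environment variables it may be keyed by.
# GEMINI_API_KEY is accepted as input but forwarded as GOOGLE_API_KEY,
# which is the variable litellm reads.
API_KEY_MAP = {
    "anthropic": ["ANTHROPIC_API_KEY"],
    "openai": ["OPENAI_API_KEY"],
    "gemini": ["GOOGLE_API_KEY", "GEMINI_API_KEY"],
    "google": ["GOOGLE_API_KEY", "GEMINI_API_KEY"],
    "deepseek": ["DEEPSEEK_API_KEY"],
    "groq": ["GROQ_API_KEY"],
    "openrouter": ["OPENROUTER_API_KEY"],
}

def resolve_env_vars(environ=os.environ):
    """Collect KEY="value" pairs for every provider whose key is set."""
    env_vars = []
    added_keys = set()
    for prov, env_keys in API_KEY_MAP.items():
        for env_key in env_keys:
            if env_key in environ:
                target_key = "GOOGLE_API_KEY" if env_key == "GEMINI_API_KEY" else env_key
                if target_key not in added_keys:
                    env_vars.append(f'{target_key}="{environ[env_key]}"')
                    added_keys.add(target_key)
                break  # first matching key wins for this provider
    return env_vars

if __name__ == "__main__":
    # Only GEMINI_API_KEY is set; it is forwarded as GOOGLE_API_KEY, and the
    # gemini/google aliases do not emit the same key twice thanks to added_keys.
    print(resolve_env_vars({"GEMINI_API_KEY": "abc123"}))  # ['GOOGLE_API_KEY="abc123"']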
npcsh/benchmark/runner.py CHANGED
@@ -5,14 +5,14 @@ Provides a convenient interface for running Terminal-Bench evaluations
 with different models and providers.
 """
 
-import os
 import subprocess
 import sys
+import json
 from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
 from typing import Optional, List, Dict, Any
-import json
+
 
 
 @dataclass
@@ -21,7 +21,7 @@ class BenchmarkConfig:
     model: str = "claude-sonnet-4-20250514"
     provider: str = "anthropic"
     dataset: str = "terminal-bench"
-    dataset_version: str = "2.0"
+    dataset_version: Optional[str] = None # If None, use latest
     n_concurrent: int = 4
     task_ids: Optional[List[str]] = None
     output_dir: Optional[str] = None
@@ -84,33 +84,52 @@ class BenchmarkRunner:
 
     def check_dependencies(self) -> Dict[str, bool]:
         """Check if required dependencies are installed."""
+        import shutil
+
         deps = {
             "harbor": False,
             "terminal-bench": False,
             "docker": False,
         }
 
-        # Check harbor
-        try:
-            result = subprocess.run(
-                ["harbor", "--version"],
-                capture_output=True,
-                text=True
-            )
-            deps["harbor"] = result.returncode == 0
-        except FileNotFoundError:
-            pass
+        # Find binaries in the same Python environment as current interpreter
+        # Use sys.prefix to get the virtualenv/pyenv directory (don't resolve symlinks)
+        bin_dir = Path(sys.prefix) / "bin"
+        if not bin_dir.exists():
+            # Fallback: use executable's directory without resolving
+            bin_dir = Path(sys.executable).parent
 
-        # Check terminal-bench (tb CLI)
-        try:
-            result = subprocess.run(
-                ["tb", "--help"],
-                capture_output=True,
-                text=True
-            )
-            deps["terminal-bench"] = result.returncode == 0
-        except FileNotFoundError:
-            pass
+        # Check harbor - first in current Python's bin dir, then PATH
+        harbor_bin = bin_dir / "harbor"
+        if not harbor_bin.exists():
+            harbor_bin = shutil.which("harbor")
+
+        if harbor_bin:
+            try:
+                result = subprocess.run(
+                    [str(harbor_bin), "--version"],
+                    capture_output=True,
+                    text=True
+                )
+                deps["harbor"] = result.returncode == 0
+            except (FileNotFoundError, OSError):
+                pass
+
+        # Check terminal-bench (tb CLI) - first in current Python's bin dir, then PATH
+        tb_bin = bin_dir / "tb"
+        if not tb_bin.exists():
+            tb_bin = shutil.which("tb")
+
+        if tb_bin:
+            try:
+                result = subprocess.run(
+                    [str(tb_bin), "--help"],
+                    capture_output=True,
+                    text=True
+                )
+                deps["terminal-bench"] = result.returncode == 0
+            except (FileNotFoundError, OSError):
+                pass
 
         # Check docker
         try:
@@ -146,9 +165,10 @@
         model: str = "claude-sonnet-4-20250514",
         provider: str = "anthropic",
         dataset: str = "terminal-bench",
-        dataset_version: str = "2.0",
+        dataset_version: Optional[str] = None,
         n_concurrent: int = 4,
         task_ids: Optional[List[str]] = None,
+        n_tasks: Optional[int] = None,
         npc_name: Optional[str] = None,
         timeout: int = 600,
     ) -> BenchmarkResult:
@@ -159,9 +179,10 @@
             model: Model name (e.g., "claude-sonnet-4-20250514", "gpt-4o")
             provider: Provider name (e.g., "anthropic", "openai", "gemini")
             dataset: Dataset name (default: "terminal-bench")
-            dataset_version: Dataset version (default: "2.0")
+            dataset_version: Dataset version (optional, uses latest if None)
             n_concurrent: Number of concurrent task executions
             task_ids: Optional list of specific task IDs to run
+            n_tasks: Optional limit on number of tasks to run
             npc_name: Optional NPC name to use (e.g., "sibiji", "corca")
             timeout: Per-task timeout in seconds
 
@@ -193,9 +214,22 @@
         else:
             agent_path = "npcsh.benchmark:NpcshAgent"
 
+        # Find harbor in the same Python environment as current interpreter
+        # Use sys.prefix to get the virtualenv/pyenv directory (don't resolve symlinks)
+        import shutil
+        bin_dir = Path(sys.prefix) / "bin"
+        if not bin_dir.exists():
+            bin_dir = Path(sys.executable).parent
+        harbor_bin = str(bin_dir / "harbor")
+        if not Path(harbor_bin).exists():
+            harbor_bin = shutil.which("harbor") or "harbor"
+
+        # Build dataset string (with optional version)
+        dataset_str = f"{dataset}@{dataset_version}" if dataset_version else dataset
+
         cmd = [
-            "harbor", "run",
-            "-d", f"{dataset}@{dataset_version}",
+            harbor_bin, "run",
+            "-d", dataset_str,
             "--agent-import-path", agent_path,
             "-m", full_model,
             "-n", str(n_concurrent),
@@ -203,12 +237,18 @@
         ]
 
         if task_ids:
-            cmd.extend(["--task-ids", ",".join(task_ids)])
+            for task_id in task_ids:
+                cmd.extend(["--task-name", task_id])
+
+        if n_tasks:
+            cmd.extend(["-l", str(n_tasks)])
 
-        print(f"\nRunning Terminal-Bench evaluation:")
+        print("\nRunning Terminal-Bench evaluation:")
         print(f" Model: {full_model}")
-        print(f" Dataset: {dataset}@{dataset_version}")
+        print(f" Dataset: {dataset_str}")
         print(f" Concurrent tasks: {n_concurrent}")
+        if n_tasks:
+            print(f" Max tasks: {n_tasks}")
         print(f" Output: {output_dir}")
         if npc_name:
             print(f" NPC: {npc_name}")
@@ -311,7 +351,7 @@
         self,
         models: List[tuple],
         dataset: str = "terminal-bench",
-        dataset_version: str = "2.0",
+        dataset_version: Optional[str] = None,
         n_concurrent: int = 4,
         task_ids: Optional[List[str]] = None,
     ) -> Dict[str, BenchmarkResult]:
@@ -321,7 +361,7 @@
         Args:
             models: List of (model, provider) tuples
             dataset: Dataset name
-            dataset_version: Dataset version
+            dataset_version: Dataset version (optional)
             n_concurrent: Number of concurrent tasks
             task_ids: Optional specific task IDs
 
@@ -338,9 +378,9 @@
         results = {}
 
         for model, provider in models:
-            print(f"\n{'='*60}")
+            print("\n" + '='*60)
             print(f"Evaluating: {provider}/{model}")
-            print(f"{'='*60}")
+            print('='*60)
 
             result = self.run(
                 model=model,
@@ -365,9 +405,9 @@
 
     def _print_comparison_summary(self, results: Dict[str, BenchmarkResult]) -> None:
         """Print a comparison summary table."""
-        print(f"\n{'='*60}")
+        print("\n" + '='*60)
         print("COMPARISON SUMMARY")
-        print(f"{'='*60}")
+        print('='*60)
         print(f"{'Model':<40} {'Accuracy':>10} {'Tasks':>10}")
         print("-" * 60)
 
@@ -436,20 +476,22 @@ def run_benchmark(
 def quick_test(
     model: str = "claude-sonnet-4-20250514",
     provider: str = "anthropic",
+    n_tasks: int = 3,
 ) -> BenchmarkResult:
     """
     Run a quick test with a few tasks to verify setup.
 
-    This runs only 3 easy tasks to quickly verify that everything is working.
+    This runs only a few tasks to quickly verify that everything is working.
     """
     runner = BenchmarkRunner()
 
-    # Use a small subset of easy tasks for quick testing
+    # Use -l flag to limit number of tasks instead of specifying task names
+    # This avoids issues with task names changing in the dataset
    return runner.run(
        model=model,
        provider=provider,
        n_concurrent=1,
-        task_ids=["ssl-cert", "git-server", "reshard-dataset"], # Example easy tasks
+        n_tasks=n_tasks,
    )
 
 
@@ -484,8 +526,8 @@ Examples:
                         help="Provider name")
     parser.add_argument("--dataset", "-d", default="terminal-bench",
                         help="Dataset name")
-    parser.add_argument("--version", "-v", default="2.0",
-                        help="Dataset version")
+    parser.add_argument("--version", "-v", default=None,
+                        help="Dataset version (optional, uses latest if not specified)")
     parser.add_argument("--concurrent", "-n", type=int, default=4,
                         help="Number of concurrent tasks")
     parser.add_argument("--npc", help="NPC name to use")
@@ -541,7 +583,7 @@ Examples:
             ("gpt-4o", "openai"),
             ("gemini-2.0-flash", "gemini"),
         ]
-        results = runner.compare_models(
+        runner.compare_models(
             models_to_compare,
             n_concurrent=args.concurrent
         )
@@ -560,7 +602,7 @@ Examples:
             n_concurrent=args.concurrent,
             npc_name=args.npc,
         )
-        print(f"\nBenchmark complete!")
+        print("\nBenchmark complete!")
         print(f"Accuracy: {result.accuracy:.1%}")
         print(f"Results saved to: {result.output_dir}")
 
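For orientation, here is a hedged usage sketch of the updated runner API, based only on the signatures visible in this diff (quick_test gains n_tasks, and run accepts n_tasks plus an optional dataset_version); it assumes harbor, tb, and Docker are installed, as check_dependencies() expects.

from npcsh.benchmark.runner import BenchmarkRunner, quick_test

# Smoke test: cap the run at a few tasks via the new n_tasks parameter
# (translated to `harbor run -l 3`) instead of hard-coded task IDs.
result = quick_test(model="claude-sonnet-4-20250514", provider="anthropic", n_tasks=3)
print(f"Accuracy: {result.accuracy:.1%}")

# Full run against the latest dataset: dataset_version=None drops the
# @version suffix from the -d argument passed to harbor.
runner = BenchmarkRunner()
result = runner.run(
    model="gpt-4o",
    provider="openai",
    n_concurrent=4,
    n_tasks=10,
    dataset_version=None,
)
print(f"Results saved to: {result.output_dir}")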
npcsh/benchmark/templates/install-npcsh.sh.j2 ADDED
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Installation script for npcsh in Terminal-Bench containers
+# This template is rendered by Harbor before execution
+
+set -e
+
+echo "Installing npcsh for Terminal-Bench evaluation..."
+
+# Install Python dependencies if needed
+if ! command -v pip &> /dev/null; then
+    echo "Installing pip..."
+    apt-get update && apt-get install -y python3-pip
+fi
+
+# Install npcsh with lite dependencies (API providers only, no local models)
+# Use --break-system-packages for PEP 668 compliance (Ubuntu 24.04+)
+echo "Installing npcsh[lite]..."
+pip install --quiet --break-system-packages npcsh[lite] || pip install --quiet npcsh[lite]
+
+# Verify installation
+echo "Verifying npcsh installation..."
+npc --help > /dev/null 2>&1 || {
+    echo "ERROR: npcsh installation failed"
+    exit 1
+}
+
+# Set up default configuration
+export NPCSH_STREAM_OUTPUT=0
+export NPCSH_LOG_LEVEL=warning
+
+{% if version %}
+echo "npcsh version: {{ version }}"
+{% endif %}
+
+echo "npcsh installation complete!"
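The {% if version %} block marks this file as a Jinja2 template; per its header comment it is rendered by Harbor before execution. As a rough illustration only (Harbor's actual rendering call is not part of this diff), the rendering step amounts to something like:

from pathlib import Path
from jinja2 import Template

# Substitute the optional `version` variable and materialize the shell script.
template_path = Path("npcsh/benchmark/templates/install-npcsh.sh.j2")
script = Template(template_path.read_text()).render(version="1.1.19")
Path("install-npcsh.sh").write_text(script)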
npcsh/build.py CHANGED
@@ -1,4 +1,3 @@
-import os
 import shutil
 import textwrap
 from pathlib import Path
@@ -11,8 +10,7 @@ def build_flask_server(config, **kwargs):
     server_script = output_dir / 'npc_server.py'
 
     server_code = textwrap.dedent(f'''
-    import os
-    from npcpy.serve import start_flask_server
+    from npcpy.serve import start_flask_server
     from npcpy.npc_compiler import Team
     from sqlalchemy import create_engine
 
@@ -111,7 +109,7 @@ def build_docker_compose(config, **kwargs):
     volumes:
       - npc-data:/root/.npcsh
     environment:
-      - NPCSH_DB_PATH=/root/.npcsh/npcsh_history.db
+      - NPCSH_DB_PATH=/root/npcsh_history.db
 
 volumes:
   npc-data:
npcsh/completion.py CHANGED
@@ -2,8 +2,7 @@
 Readline and tab completion for npcsh
 """
 import os
-import shutil
-from typing import List, Any, Optional
+from typing import List, Any
 
 try:
     import readline
@@ -66,11 +65,8 @@ def get_file_completions(text: str) -> List[str]:
     completions = []
 
     if text.startswith("~"):
-        expanded = os.path.expanduser(text)
-        prefix = "~"
-        search_path = expanded
+        search_path = os.path.expanduser(text)
     else:
-        prefix = ""
         search_path = text
 
     # Get directory to search
npcsh/config.py CHANGED
@@ -3,7 +3,6 @@ npcsh configuration management
 """
 import os
 import importlib.metadata
-from typing import Optional, Dict, Any
 
 # Version
 try:
@@ -14,7 +13,6 @@ except importlib.metadata.PackageNotFoundError:
 # Default paths
 DEFAULT_NPC_TEAM_PATH = "~/.npcsh/npc_team"
 PROJECT_NPC_TEAM_PATH = "./npc_team"
-HISTORY_DB_DEFAULT_PATH = "~/.npcsh_history.db"
 READLINE_HISTORY_FILE = os.path.expanduser("~/.npcsh_history")
 
 # Environment defaults
@@ -44,7 +42,8 @@ NPCSH_REASONING_PROVIDER = os.environ.get("NPCSH_REASONING_PROVIDER", "ollama")
 NPCSH_STREAM_OUTPUT = os.environ.get("NPCSH_STREAM_OUTPUT", "0") == "1"
 NPCSH_API_URL = os.environ.get("NPCSH_API_URL", None)
 NPCSH_SEARCH_PROVIDER = os.environ.get("NPCSH_SEARCH_PROVIDER", "duckduckgo")
-NPCSH_BUILD_KG = os.environ.get("NPCSH_BUILD_KG") == "1"
+NPCSH_BUILD_KG = os.environ.get("NPCSH_BUILD_KG", "1") != "0"
+NPCSH_EDIT_APPROVAL = os.environ.get("NPCSH_EDIT_APPROVAL", "off") # off, interactive, auto
 
 
 def get_shell_config_file() -> str:
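Note the semantic flip in NPCSH_BUILD_KG above: knowledge-graph building previously required NPCSH_BUILD_KG=1 and is now enabled unless the variable is explicitly set to "0". A small illustration of the new default (the build_kg_enabled helper is ours, not part of the package):

import os

def build_kg_enabled(environ=os.environ):
    # 1.1.19 behavior: on by default, only an explicit "0" disables it.
    return environ.get("NPCSH_BUILD_KG", "1") != "0"

assert build_kg_enabled({}) is True                        # unset -> enabled
assert build_kg_enabled({"NPCSH_BUILD_KG": "0"}) is False  # explicit opt-out
assert build_kg_enabled({"NPCSH_BUILD_KG": "1"}) is True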