clawbench-cli 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (226)
  1. clawbench/__init__.py +35 -0
  2. clawbench/__main__.py +8 -0
  3. clawbench/batch.py +619 -0
  4. clawbench/cli.py +397 -0
  5. clawbench/data/chrome-extension/README.md +127 -0
  6. clawbench/data/chrome-extension/background.js +50 -0
  7. clawbench/data/chrome-extension/content.js +70 -0
  8. clawbench/data/chrome-extension/manifest.json +25 -0
  9. clawbench/data/chrome-extension/setup.sh +27 -0
  10. clawbench/data/chrome-extension/stealth.js +200 -0
  11. clawbench/data/docker/Dockerfile +51 -0
  12. clawbench/data/docker/entrypoint.sh +394 -0
  13. clawbench/data/docker/setup-openclaw.sh +112 -0
  14. clawbench/data/eval/README.md +95 -0
  15. clawbench/data/eval/agentic_eval.md +53 -0
  16. clawbench/data/extension-server/.python-version +1 -0
  17. clawbench/data/extension-server/README.md +54 -0
  18. clawbench/data/extension-server/pyproject.toml +7 -0
  19. clawbench/data/extension-server/server.py +360 -0
  20. clawbench/data/extension-server/uv.lock +644 -0
  21. clawbench/data/models/model.schema.json +44 -0
  22. clawbench/data/models/models.example.yaml +16 -0
  23. clawbench/data/shared/alex_green_personal_info.json +451 -0
  24. clawbench/data/test-cases/001-daily-life-food-uber-eats/task.json +25 -0
  25. clawbench/data/test-cases/002-daily-life-food-doordash/task.json +25 -0
  26. clawbench/data/test-cases/004-daily-life-food-instacart/extra_info/grocery_list.json +36 -0
  27. clawbench/data/test-cases/004-daily-life-food-instacart/task.json +30 -0
  28. clawbench/data/test-cases/006-daily-life-food-uber-eats/task.json +24 -0
  29. clawbench/data/test-cases/007-daily-life-food-instacart/extra_info/meal_plan.json +21 -0
  30. clawbench/data/test-cases/007-daily-life-food-instacart/task.json +30 -0
  31. clawbench/data/test-cases/011-daily-life-housing-zillow/task.json +25 -0
  32. clawbench/data/test-cases/015-daily-life-housing-craigslist/extra_info/listing_details.json +26 -0
  33. clawbench/data/test-cases/015-daily-life-housing-craigslist/task.json +30 -0
  34. clawbench/data/test-cases/035-daily-life-health-medical-betterhelp/task.json +25 -0
  35. clawbench/data/test-cases/041-daily-life-pets-rover/task.json +25 -0
  36. clawbench/data/test-cases/043-daily-life-pets-rover/extra_info/pet_info.json +12 -0
  37. clawbench/data/test-cases/043-daily-life-pets-rover/task.json +30 -0
  38. clawbench/data/test-cases/045-daily-life-personal-care-booksy/task.json +25 -0
  39. clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/extra_info/address_info.json +7 -0
  40. clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/task.json +30 -0
  41. clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/extra_info/job_links.json +5 -0
  42. clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/task.json +30 -0
  43. clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/extra_info/job_links.json +5 -0
  44. clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/task.json +30 -0
  45. clawbench/data/test-cases/091-job-search-hr-job-apply-indeed/task.json +25 -0
  46. clawbench/data/test-cases/120-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  47. clawbench/data/test-cases/121-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  48. clawbench/data/test-cases/128-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  49. clawbench/data/test-cases/134-office-secretary-tasks-calendar-calendly/task.json +25 -0
  50. clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/extra_info/meeting_details.json +30 -0
  51. clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/task.json +30 -0
  52. clawbench/data/test-cases/139-office-secretary-tasks-calendar-calendly/task.json +25 -0
  53. clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/extra_info/task_list.json +29 -0
  54. clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/task.json +30 -0
  55. clawbench/data/test-cases/179-dev-tech-github-ops-github/extra_info/config.json +13 -0
  56. clawbench/data/test-cases/179-dev-tech-github-ops-github/task.json +30 -0
  57. clawbench/data/test-cases/180-dev-tech-github-ops-github/task.json +25 -0
  58. clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/extra_info/raw_results.json +47 -0
  59. clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/task.json +30 -0
  60. clawbench/data/test-cases/242-academia-research-research-tools-overleaf/task.json +25 -0
  61. clawbench/data/test-cases/246-academia-research-research-tools-zotero/task.json +25 -0
  62. clawbench/data/test-cases/247-academia-research-research-tools-semantic-scholar/task.json +25 -0
  63. clawbench/data/test-cases/265-education-learning-general-coursera/task.json +25 -0
  64. clawbench/data/test-cases/266-education-learning-general-leetcode/extra_info/solution_code.py +9 -0
  65. clawbench/data/test-cases/266-education-learning-general-leetcode/task.json +30 -0
  66. clawbench/data/test-cases/273-education-learning-general-edx/task.json +25 -0
  67. clawbench/data/test-cases/274-education-learning-general-udemy/task.json +25 -0
  68. clawbench/data/test-cases/279-travel-general-airbnb/task.json +25 -0
  69. clawbench/data/test-cases/280-travel-general-booking-com/task.json +25 -0
  70. clawbench/data/test-cases/363-entertainment-hobbies-general-ticketmaster/task.json +25 -0
  71. clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/extra_info/book_list.json +14 -0
  72. clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/task.json +30 -0
  73. clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/extra_info/event_details.json +10 -0
  74. clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/task.json +30 -0
  75. clawbench/data/test-cases/403-personal-management-account-security-1password-web/extra_info/credentials.json +34 -0
  76. clawbench/data/test-cases/403-personal-management-account-security-1password-web/task.json +30 -0
  77. clawbench/data/test-cases/413-personal-management-personal-tools-todoist/extra_info/task_list.json +52 -0
  78. clawbench/data/test-cases/413-personal-management-personal-tools-todoist/task.json +30 -0
  79. clawbench/data/test-cases/468-rating-voting-general-glassdoor/extra_info/interview_experience.json +10 -0
  80. clawbench/data/test-cases/468-rating-voting-general-glassdoor/task.json +30 -0
  81. clawbench/data/test-cases/469-rating-voting-general-tripadvisor/extra_info/review_content.json +6 -0
  82. clawbench/data/test-cases/469-rating-voting-general-tripadvisor/task.json +30 -0
  83. clawbench/data/test-cases/470-rating-voting-general-trustpilot/extra_info/review_content.json +6 -0
  84. clawbench/data/test-cases/470-rating-voting-general-trustpilot/task.json +30 -0
  85. clawbench/data/test-cases/474-rating-voting-general-capterra/task.json +25 -0
  86. clawbench/data/test-cases/475-rating-voting-general-g2/task.json +25 -0
  87. clawbench/data/test-cases/482-creation-init-general-confluence/extra_info/content.json +3 -0
  88. clawbench/data/test-cases/482-creation-init-general-confluence/task.json +30 -0
  89. clawbench/data/test-cases/483-creation-init-general-airtable/task.json +25 -0
  90. clawbench/data/test-cases/484-creation-init-general-clickup/task.json +28 -0
  91. clawbench/data/test-cases/485-creation-init-general-webflow/task.json +25 -0
  92. clawbench/data/test-cases/486-creation-init-general-mailchimp/extra_info/content.json +3 -0
  93. clawbench/data/test-cases/486-creation-init-general-mailchimp/task.json +30 -0
  94. clawbench/data/test-cases/487-creation-init-general-typeform/extra_info/survey_questions.json +85 -0
  95. clawbench/data/test-cases/487-creation-init-general-typeform/task.json +30 -0
  96. clawbench/data/test-cases/488-creation-init-general-substack/extra_info/content.json +3 -0
  97. clawbench/data/test-cases/488-creation-init-general-substack/task.json +30 -0
  98. clawbench/data/test-cases/489-creation-init-general-ghost/extra_info/content.json +3 -0
  99. clawbench/data/test-cases/489-creation-init-general-ghost/task.json +30 -0
  100. clawbench/data/test-cases/501-creation-init-general-asana/extra_info/project_description.json +8 -0
  101. clawbench/data/test-cases/501-creation-init-general-asana/task.json +33 -0
  102. clawbench/data/test-cases/529-daily-life-shopping-delivery-king-arthur-baking/task.json +25 -0
  103. clawbench/data/test-cases/533-daily-life-utilities-inmyarea/task.json +25 -0
  104. clawbench/data/test-cases/535-daily-life-home-home-depot/task.json +25 -0
  105. clawbench/data/test-cases/537-daily-life-food-crumbl/task.json +25 -0
  106. clawbench/data/test-cases/539-daily-life-health-jefit/task.json +25 -0
  107. clawbench/data/test-cases/542-daily-life-pets-wag/task.json +25 -0
  108. clawbench/data/test-cases/551-finance-investment-crypto-wallet-trezor/task.json +25 -0
  109. clawbench/data/test-cases/552-finance-investment-business-payment-plooto/task.json +25 -0
  110. clawbench/data/test-cases/555-finance-investment-insurance-insureon/task.json +25 -0
  111. clawbench/data/test-cases/559-finance-investment-crowdfunding-frontfundr/task.json +25 -0
  112. clawbench/data/test-cases/564-daily-life-event-registration-race-roster/task.json +25 -0
  113. clawbench/data/test-cases/565-job-search-hr-job-search-jopwell/task.json +25 -0
  114. clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/extra_info/listing_details.json +26 -0
  115. clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/task.json +30 -0
  116. clawbench/data/test-cases/569-job-search-hr-job-search-careerbuilder/task.json +25 -0
  117. clawbench/data/test-cases/570-job-search-hr-job-search-hired/task.json +25 -0
  118. clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/extra_info/listing_details.json +26 -0
  119. clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/task.json +30 -0
  120. clawbench/data/test-cases/576-office-secretary-tasks-reports-ftc-reportfraud/task.json +25 -0
  121. clawbench/data/test-cases/583-office-secretary-tasks-support-tickets-freshdesk/task.json +25 -0
  122. clawbench/data/test-cases/598-academia-research-legal-docs-formswift/task.json +25 -0
  123. clawbench/data/test-cases/606-education-learning-kids-courses-outschool/task.json +25 -0
  124. clawbench/data/test-cases/607-education-learning-art-courses-creativebug/task.json +25 -0
  125. clawbench/data/test-cases/609-education-learning-meditation-spirit-rock-meditation-center/task.json +25 -0
  126. clawbench/data/test-cases/615-travel-flights-spirit-airlines/task.json +25 -0
  127. clawbench/data/test-cases/618-travel-train-bus-12go-asia/task.json +25 -0
  128. clawbench/data/test-cases/625-travel-camping-outdoor-parks-canada-reservations/task.json +25 -0
  129. clawbench/data/test-cases/626-travel-bus-flixbus/task.json +25 -0
  130. clawbench/data/test-cases/627-travel-flights-momondo/task.json +25 -0
  131. clawbench/data/test-cases/632-shopping-commerce-beauty-care-olaplex/task.json +25 -0
  132. clawbench/data/test-cases/634-shopping-commerce-apparel-dooney-bourke/task.json +25 -0
  133. clawbench/data/test-cases/635-shopping-commerce-gifts-uncommon-goods/task.json +25 -0
  134. clawbench/data/test-cases/636-shopping-commerce-auto-parts-rockauto/task.json +25 -0
  135. clawbench/data/test-cases/638-shopping-commerce-print-custom-vistaprint/task.json +25 -0
  136. clawbench/data/test-cases/639-shopping-commerce-luxury-mansur-gavriel/task.json +25 -0
  137. clawbench/data/test-cases/671-entertainment-gaming-humble-bundle/task.json +25 -0
  138. clawbench/data/test-cases/672-entertainment-hobbies-anime-streaming-crunchyroll/task.json +25 -0
  139. clawbench/data/test-cases/674-entertainment-hobbies-masterclass-masterclass/task.json +25 -0
  140. clawbench/data/test-cases/676-government-civic-legal-docs-legalnature/task.json +25 -0
  141. clawbench/data/test-cases/685-personal-management-budget-mgmt-everydollar/task.json +25 -0
  142. clawbench/data/test-cases/687-personal-management-vpn-subscription-ipvanish/task.json +25 -0
  143. clawbench/data/test-cases/688-personal-management-insurance-compare-insurify/task.json +25 -0
  144. clawbench/data/test-cases/695-automation-workflows-recurring-order-stumptown-coffee/task.json +25 -0
  145. clawbench/data/test-cases/697-automation-workflows-recurring-order-bean-box/task.json +25 -0
  146. clawbench/data/test-cases/699-automation-workflows-recurring-order-mistobox/task.json +25 -0
  147. clawbench/data/test-cases/700-deletion-revocation-data-deletion-deleteme/task.json +25 -0
  148. clawbench/data/test-cases/705-rating-voting-wine-review-vivino/task.json +25 -0
  149. clawbench/data/test-cases/706-rating-voting-beer-review-beeradvocate/task.json +25 -0
  150. clawbench/data/test-cases/707-rating-voting-social-wine-untappd/task.json +25 -0
  151. clawbench/data/test-cases/708-rating-voting-professor-review-ratemyprofessors/task.json +28 -0
  152. clawbench/data/test-cases/709-rating-voting-service-review-angi/task.json +25 -0
  153. clawbench/data/test-cases/710-creation-init-interior-design-roomsketcher/task.json +25 -0
  154. clawbench/data/test-cases/711-creation-init-color-design-coolors/task.json +25 -0
  155. clawbench/data/test-cases/712-creation-init-website-create-squarespace/task.json +25 -0
  156. clawbench/data/test-cases/713-creation-init-website-build-wix/task.json +25 -0
  157. clawbench/data/test-cases/735-home-services-maintenance-house-cleaning-bark/task.json +25 -0
  158. clawbench/data/test-cases/736-home-services-maintenance-plumbing-ace-hardware/task.json +25 -0
  159. clawbench/data/test-cases/737-home-services-maintenance-kitchen-remodel-lowes/task.json +25 -0
  160. clawbench/data/test-cases/738-home-services-maintenance-equipment-install-amazon-home-services/task.json +25 -0
  161. clawbench/data/test-cases/750-automotive-vehicle-services-car-insurance-compare-kanetix/task.json +25 -0
  162. clawbench/data/test-cases/751-automotive-vehicle-services-car-lease-sixt/task.json +25 -0
  163. clawbench/data/test-cases/754-automotive-vehicle-services-used-car-listing-autotrader/task.json +25 -0
  164. clawbench/data/test-cases/763-automotive-vehicle-services-car-lease-autoslash/task.json +25 -0
  165. clawbench/data/test-cases/766-nonprofit-charity-donation-doctors-without-borders-msf/task.json +25 -0
  166. clawbench/data/test-cases/768-nonprofit-charity-community-crowdfund-ioby/task.json +25 -0
  167. clawbench/data/test-cases/770-nonprofit-charity-volunteer-apply-on-make-a-wish-foundation-website-complete-and-submit-a-volunteer-application-form-selecting-the-wish-granter-role-and-entering-city-phoenix-az/task.json +25 -0
  168. clawbench/data/test-cases/774-nonprofit-charity-nonprofit-job-apply-charity-village/task.json +25 -0
  169. clawbench/data/test-cases/776-nonprofit-charity-volunteer-signup-idealist/task.json +25 -0
  170. clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/extra_info/payment_info.json +3 -0
  171. clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/task.json +30 -0
  172. clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/extra_info/address_info.json +4 -0
  173. clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/task.json +30 -0
  174. clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/extra_info/email_info.json +3 -0
  175. clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/task.json +30 -0
  176. clawbench/data/test-cases/782-beauty-personal-care-skincare-purchase-paulas-choice/task.json +24 -0
  177. clawbench/data/test-cases/783-beauty-personal-care-beauty-booking-ulta-beauty/task.json +24 -0
  178. clawbench/data/test-cases/785-beauty-personal-care-skincare-curology/task.json +25 -0
  179. clawbench/data/test-cases/788-beauty-personal-care-makeup-the-ordinary/task.json +25 -0
  180. clawbench/data/test-cases/789-beauty-personal-care-makeup-fenty-beauty/task.json +25 -0
  181. clawbench/data/test-cases/793-beauty-personal-care-beauty-retail-mac-cosmetics/task.json +25 -0
  182. clawbench/data/test-cases/794-beauty-personal-care-salon-booking-styleseat/task.json +25 -0
  183. clawbench/data/test-cases/795-pet-animal-care-pet-adoption-aspca/task.json +25 -0
  184. clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/extra_info/pet_info.json +12 -0
  185. clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/task.json +30 -0
  186. clawbench/data/test-cases/799-pet-animal-care-pet-insurance-aspca-pet-health-insurance/task.json +25 -0
  187. clawbench/data/test-cases/801-pet-animal-care-pet-friendly-travel-bringfido/task.json +25 -0
  188. clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/extra_info/pet_info.json +12 -0
  189. clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/task.json +30 -0
  190. clawbench/data/test-cases/807-pet-animal-care-pet-dna-embark/task.json +25 -0
  191. clawbench/data/test-cases/809-pet-animal-care-pet-adopt-petfinder/task.json +28 -0
  192. clawbench/data/test-cases/812-pet-animal-care-pet-subscription-ollie/task.json +25 -0
  193. clawbench/data/test-cases/815-personal-management-records-mgmt-myheritage/task.json +25 -0
  194. clawbench/data/test-cases/821-education-learning-reading-self-study-blinkist/task.json +25 -0
  195. clawbench/data/test-cases/861-entertainment-hobbies-movies-cineplex/task.json +25 -0
  196. clawbench/data/test-cases/862-entertainment-hobbies-movies-amc-theatres/task.json +25 -0
  197. clawbench/data/test-cases/864-entertainment-hobbies-show-tickets-ticketmaster/task.json +25 -0
  198. clawbench/data/test-cases/865-travel-outdoor-hipcamp/task.json +25 -0
  199. clawbench/data/test-cases/867-entertainment-hobbies-movies-fandango/task.json +25 -0
  200. clawbench/data/test-cases/872-daily-life-food-opentable/task.json +25 -0
  201. clawbench/data/test-cases/873-daily-life-food-resy/task.json +28 -0
  202. clawbench/data/test-cases/876-entertainment-hobbies-show-tickets-vivid-seats/task.json +25 -0
  203. clawbench/data/test-cases/877-entertainment-hobbies-show-tickets-stubhub/task.json +25 -0
  204. clawbench/data/test-cases/878-travel-outdoor-ontario-parks/task.json +25 -0
  205. clawbench/data/test-cases/883-education-learning-hobby-class-sur-la-table/task.json +25 -0
  206. clawbench/data/test-cases/884-entertainment-hobbies-experience-breakout-games/task.json +25 -0
  207. clawbench/data/test-cases/885-entertainment-hobbies-experience-bowlero/task.json +25 -0
  208. clawbench/data/test-cases/886-entertainment-hobbies-experience-topgolf/task.json +25 -0
  209. clawbench/data/test-cases/lite.json +226 -0
  210. clawbench/data/test-cases/lite.schema.json +105 -0
  211. clawbench/data/test-cases/task.schema.json +132 -0
  212. clawbench/data/tools/build_clawbench_lite_enc.py +161 -0
  213. clawbench/doctor.py +171 -0
  214. clawbench/engine.py +180 -0
  215. clawbench/generate_resume_pdf.py +140 -0
  216. clawbench/hf_upload.py +78 -0
  217. clawbench/image.py +127 -0
  218. clawbench/paths.py +150 -0
  219. clawbench/resume_template.json +104 -0
  220. clawbench/run.py +942 -0
  221. clawbench/tui.py +1401 -0
  222. clawbench_cli-0.1.2.dist-info/METADATA +770 -0
  223. clawbench_cli-0.1.2.dist-info/RECORD +226 -0
  224. clawbench_cli-0.1.2.dist-info/WHEEL +4 -0
  225. clawbench_cli-0.1.2.dist-info/entry_points.txt +4 -0
  226. clawbench_cli-0.1.2.dist-info/licenses/LICENSE +201 -0
clawbench/tui.py ADDED
@@ -0,0 +1,1401 @@
1
+ """Interactive TUI for ClawBench — select mode, models, and cases with rich UI."""
2
+
3
+ import inspect
4
+ import json
5
+ import multiprocessing
6
+ import os
7
+ import platform
8
+ import shutil
9
+ import subprocess
10
+ import sys
11
+ from pathlib import Path
12
+
13
+ import questionary
14
+ import yaml
15
+ from questionary import Style
16
+ from rich.console import Console
17
+ from rich.panel import Panel
18
+ from rich.status import Status
19
+ from rich.table import Table
20
+ from rich.text import Text
21
+
22
+
23
def _patch_questionary_defaults() -> None:
    """Patch questionary's baked-in defaults.

    1. Remove the ``?`` qmark prefix from every prompt type.
    2. Replace the instruction hint with ``(↑↓)`` for select/checkbox only.

    Works by rewriting each prompt function's ``__defaults__`` tuple in
    place; functions whose signature cannot be introspected, or whose
    defaults tuple does not line up with their defaulted parameters, are
    skipped untouched.
    """

    def _rewrite_defaults(fn, mutate) -> None:
        # Apply ``mutate(param, current_default) -> new_default`` to every
        # defaulted parameter of *fn*, writing back only if something changed.
        if fn is None or not callable(fn) or not hasattr(fn, "__defaults__"):
            return
        defaults = fn.__defaults__
        if not defaults:
            return
        try:
            sig = inspect.signature(fn)
        except (TypeError, ValueError):
            # Builtins / C-level callables may not be introspectable.
            return
        params_with_defaults = [
            p for p in sig.parameters.values()
            if p.default is not inspect.Parameter.empty
        ]
        # Safety check: only rewrite when positions map 1:1 onto __defaults__.
        if len(params_with_defaults) != len(defaults):
            return
        new = list(defaults)
        changed = False
        for i, p in enumerate(params_with_defaults):
            replacement = mutate(p, new[i])
            if replacement != new[i]:
                new[i] = replacement
                changed = True
        if changed:
            fn.__defaults__ = tuple(new)

    # Pass 1: strip qmark on all prompt functions
    for name in dir(questionary):
        _rewrite_defaults(
            getattr(questionary, name, None),
            lambda p, cur: "" if p.name == "qmark" else cur,
        )

    # Pass 2: set instruction=(↑↓) on select and checkbox only
    for name in ("select", "checkbox"):
        _rewrite_defaults(
            getattr(questionary, name, None),
            lambda p, cur: "(↑↓)" if p.name == "instruction" and cur is None else cur,
        )
82
+
83
+
84
+ _patch_questionary_defaults()
85
+
86
+ from clawbench import engine as _engine
87
+ from clawbench import paths as _paths
88
+
89
# Resolved filesystem locations: user-editable models config and the
# bundled test-case directory.
MODELS_YAML = _paths.user_models_yaml()
CASES_DIR = _paths.test_cases_dir()
PROJECT_ROOT = _paths.bundled_data_dir().parent  # kept for any downstream reference

# Wire protocols a model entry may declare in models.yaml.
API_TYPES = [
    "openai-completions",
    "openai-responses",
    "anthropic-messages",
    "google-generative-ai",
]
# Reasoning-effort levels offered by the thinking selector.
THINKING_LEVELS = ["off", "minimal", "low", "medium", "high", "xhigh", "adaptive"]
100
+
101
# Provider presets for the "Add a new model" flow. Selecting a provider
# fills in base_url + api_type automatically and shows a handful of
# example model names so the user doesn't have to remember the exact
# string for each vendor.
#
# Each preset entry carries: "label" (menu text), "base_url", "api_type"
# (one of API_TYPES above), and "examples" (model-id suggestions).
PROVIDER_PRESETS: dict[str, dict] = {
    "anthropic": {
        "label": "Anthropic (Claude)",
        "base_url": "https://api.anthropic.com",
        "api_type": "anthropic-messages",
        # Native Anthropic API uses hyphens ("claude-opus-4-6"), NOT
        # the dotted form ("claude-opus-4.6") that OpenRouter uses.
        "examples": [
            "claude-opus-4-6",
            "claude-sonnet-4-6",
            "claude-haiku-4-5",
        ],
    },
    "openai": {
        "label": "OpenAI (GPT / o-series)",
        "base_url": "https://api.openai.com/v1",
        "api_type": "openai-completions",
        "examples": [
            "gpt-5.4",
            "gpt-5.4-mini",
            "gpt-5.4-nano",
            "o3-mini",
        ],
    },
    "google": {
        "label": "Google (Gemini)",
        "base_url": "https://generativelanguage.googleapis.com",
        "api_type": "google-generative-ai",
        # 3.x is still in -preview as of April 2026; 2.5 is the
        # current stable tier. We show both so users can pick.
        "examples": [
            "gemini-3.1-pro-preview",
            "gemini-3-flash-preview",
            "gemini-2.5-pro",
            "gemini-2.5-flash",
        ],
    },
    "openrouter": {
        "label": "OpenRouter (multi-provider gateway)",
        "base_url": "https://openrouter.ai/api/v1",
        "api_type": "openai-completions",
        # OpenRouter normalizes some vendor ids — Claude uses DOTS
        # here ("claude-sonnet-4.6"), not the hyphens of the native
        # Anthropic API. When in doubt, paste the id exactly as
        # listed on https://openrouter.ai/models.
        "examples": [
            "anthropic/claude-sonnet-4.6",
            "openai/gpt-5.4",
            "google/gemini-3-flash-preview",
            "qwen/qwen3.5-plus-02-15",
        ],
    },
}
158
+
159
+ console = Console()
160
+
161
# ---------------------------------------------------------------------------
# Theme — picked at startup, persisted per user, applied to all questionary
# prompts. Dark and light variants use explicit hex colors rather than ANSI
# names so the contrast is correct regardless of what ANSI palette the
# user's terminal happens to have.
# ---------------------------------------------------------------------------

# Per-user config location; CONFIG_FILE holds the persisted {"theme": ...} JSON.
CONFIG_DIR = _paths.user_config_dir()
CONFIG_FILE = _paths.user_config_json()
170
+
171
+
172
def _make_style(theme: str) -> Style:
    """Build a questionary Style from Apple HIG system colors.

    Light appearance uses deeper tones for legibility on white;
    dark appearance uses brighter tones for dark backgrounds.

    Reference — Apple Human Interface Guidelines system colors:
                   Light        Dark
        Blue       #007AFF      #0A84FF
        Indigo     #5856D6      #5E5CE6
        Green      #34C759      #30D158
        Gray       #8E8E93      #8E8E93
    """
    light = theme == "light"
    # Pick the appearance-appropriate Apple system colors once, then build
    # a single token list — the two themes differ only in these values.
    indigo = "#5856D6" if light else "#5E5CE6"  # Apple Indigo
    green = "#34C759" if light else "#30D158"   # Apple Green
    gray = "#8E8E93"                            # Apple Gray (same both ways)
    fg = "#000000" if light else "#ffffff"      # body text
    return Style([
        ("qmark", f"fg:{indigo} bold"),
        ("question", f"fg:{fg} bold"),
        ("answer", f"fg:{green} bold"),
        ("pointer", f"fg:{indigo} bold"),
        ("highlighted", f"fg:{indigo} bold"),
        ("selected", f"fg:{green}"),
        ("separator", f"fg:{gray}"),
        ("instruction", f"fg:{gray}"),
        ("text", f"fg:{fg}"),
    ])
209
+
210
+
211
# Neutral style for the very first prompt (theme picker itself).
#
# This runs BEFORE we know whether the user's terminal is dark or light,
# so we can't commit to any color choice — a bright cyan that looks fine
# on a black background is painfully neon on a white one, and vice versa.
# The trick: use ``reverse`` (swaps foreground and background) for the
# highlighted row, which gives strong contrast on any terminal without
# picking a single RGB value. Everything else is plain ``bold`` or the
# terminal's own default foreground.
_NEUTRAL_STYLE = Style([
    ("qmark", ""),
    ("question", "bold"),
    ("answer", "bold"),
    ("pointer", "bold"),
    ("highlighted", "reverse bold"),
    ("selected", "bold"),
    ("separator", "fg:ansibrightblack"),
    ("instruction", "fg:ansibrightblack"),
    ("text", ""),
])
231
+
232
+
233
def _load_saved_theme() -> str | None:
    """Return the persisted theme ("dark" or "light"), or None if unset/invalid."""
    try:
        raw = CONFIG_FILE.read_text()
        config = json.loads(raw)
    except (FileNotFoundError, json.JSONDecodeError, OSError):
        # Missing, unreadable, or corrupt config — treat as "no preference".
        return None
    saved = config.get("theme")
    if saved in ("dark", "light"):
        return saved
    return None
240
+
241
+
242
def _save_theme(theme: str) -> None:
    """Persist the chosen theme to the user config file (best-effort)."""
    payload = json.dumps({"theme": theme}, indent=2)
    try:
        CONFIG_DIR.mkdir(parents=True, exist_ok=True)
        CONFIG_FILE.write_text(payload)
    except OSError:
        # Best-effort persistence; not worth crashing the TUI.
        pass
248
+
249
+
250
def _pick_theme() -> str:
    """Prompt for dark/light terminal. Returns the chosen theme string."""
    console.print()
    console.print(
        " [dim]Pick the color theme that matches your terminal background.[/]"
    )
    # Use the color-agnostic neutral style: we don't yet know the background.
    options = [
        questionary.Choice("Dark (dark background, light text)", value="dark"),
        questionary.Choice("Light (light background, dark text)", value="light"),
    ]
    answer = questionary.select(
        "Terminal theme:",
        choices=options,
        style=_NEUTRAL_STYLE,
    ).ask()
    if answer is None:
        # Prompt was cancelled (Ctrl-C / ESC) — bail out of the TUI entirely.
        sys.exit(0)
    _save_theme(answer)
    console.print(
        f" [dim]Saved theme={answer} to {CONFIG_FILE} — use "
        f"'Change theme' in the menu to switch later.[/]"
    )
    console.print()
    return answer
273
+
274
+
275
# Module-level STYLE: starts as the neutral fallback, gets replaced in main()
# once we know the user's theme preference. All prompt call sites read
# ``STYLE`` at call time, so rebinding it via ``global`` inside main() works.
STYLE: Style = _NEUTRAL_STYLE

# Rich markup accent colors, updated alongside STYLE in main().
# ACCENT — section headers ("--- Select Model ---"), panel borders.
# ACCENT2 — inline values, secondary highlights.
#
# Apple HIG system colors (light / dark):
#   Indigo  #5856D6 / #5E5CE6
#   Blue    #007AFF / #0A84FF
#   Green   #34C759 / #30D158
ACCENT = "#5E5CE6"   # Apple Indigo dark; replaced in main()
ACCENT2 = "#0A84FF"  # Apple Blue dark; replaced in main()
290
+
291
+
292
+ # ---------------------------------------------------------------------------
293
+ # Data loading
294
+ # ---------------------------------------------------------------------------
295
+
296
def load_models_data() -> dict:
    """Read the user's models YAML; returns {} when the file is missing or blank."""
    if MODELS_YAML.exists():
        # safe_load returns None for an empty file — normalize to {}.
        return yaml.safe_load(MODELS_YAML.read_text()) or {}
    return {}
300
+
301
+
302
def save_models(data: dict) -> None:
    """Write the models mapping back to the user's YAML, preserving key order."""
    MODELS_YAML.parent.mkdir(parents=True, exist_ok=True)
    serialized = yaml.safe_dump(data, sort_keys=False)
    MODELS_YAML.write_text(serialized)
305
+
306
+
307
def load_models() -> list[str]:
    """Return all configured model names, alphabetically sorted."""
    # Iterating a dict yields its keys, so sorted() over the mapping suffices.
    return sorted(load_models_data())
309
+
310
+
311
def load_cases() -> list[str]:
    """Return sorted test-case directory names; exit with an error when none exist."""
    found = sorted(task.parent.name for task in CASES_DIR.glob("*/task.json"))
    if found:
        return found
    console.print("[red bold]ERROR:[/] No test cases found in test-cases/")
    sys.exit(1)
317
+
318
+
319
+ # ---------------------------------------------------------------------------
320
+ # Helpers
321
+ # ---------------------------------------------------------------------------
322
+
323
def _recommend_concurrent() -> int:
    """Suggest a max concurrency level from CPU count and physical RAM.

    Heuristic: one worker per two CPUs and per two GB of RAM, whichever
    is smaller, clamped to at least 1. Prints the system stats used.

    Returns:
        The recommended maximum number of concurrent runs.
    """
    cpus = multiprocessing.cpu_count()
    try:
        # os.sysconf is POSIX-only: AttributeError on Windows, ValueError
        # for names the platform doesn't define, OSError on failure.
        mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")
        mem_gb = mem_bytes / (1024**3)
    except (AttributeError, ValueError, OSError):
        mem_gb = 8  # conservative fallback when RAM can't be probed
    by_cpu = cpus // 2
    by_ram = int(mem_gb // 2)
    recommended = max(1, min(by_cpu, by_ram))
    console.print(
        f" System: [{ACCENT2}]{cpus}[/] CPUs, [{ACCENT2}]{mem_gb:.0f}[/] GB RAM "
        f"— recommended max: [green bold]{recommended}[/]"
    )
    return recommended
338
+
339
+
340
+ def _case_display(case: str) -> str:
341
+ """Format a case name for display: '886 886-entertainment-hobbies-...'"""
342
+ prefix = case.split("-", 1)[0]
343
+ return f"{prefix:>3} {case}"
344
+
345
+
346
+ def _parse_range_input(raw: str, cases: list[str]) -> list[str]:
347
+ """Parse comma-separated IDs, ranges (e.g. 1-50), or * into case names."""
348
+ if raw.strip() == "*":
349
+ return list(cases)
350
+
351
+ # Build ID map: both '001' and '1' → full case name
352
+ id_map: dict[str, str] = {}
353
+ for c in cases:
354
+ prefix = c.split("-", 1)[0]
355
+ id_map[prefix] = c
356
+ stripped = prefix.lstrip("0") or "0"
357
+ id_map[stripped] = c
358
+
359
+ selected: list[str] = []
360
+ for part in raw.split(","):
361
+ part = part.strip()
362
+ if not part:
363
+ continue
364
+ if "-" in part:
365
+ lo, hi = part.split("-", 1)
366
+ for i in range(int(lo), int(hi) + 1):
367
+ key = str(i)
368
+ if key in id_map and id_map[key] not in selected:
369
+ selected.append(id_map[key])
370
+ else:
371
+ if part in id_map and id_map[part] not in selected:
372
+ selected.append(id_map[part])
373
+ return selected
374
+
375
+
376
def run_cmd(cmd: list[str], *, hint: str | None = None) -> None:
    """Echo *cmd* in a panel (plus an optional hint), then exec into it.

    Never returns on success: os.execvp replaces this process with the
    launched command, handing the terminal over to it.
    """
    console.print()
    rendered = " ".join(cmd)
    console.print(Panel(rendered, title="[bold]Command[/]", border_style="green"))
    if hint:
        console.print()
        console.print(hint)
    console.print()
    os.execvp(cmd[0], cmd)
384
+
385
+
386
def _confirm_launch(summary: dict) -> bool:
    """Show a summary panel and ask for confirmation."""
    rows = Table(show_header=False, box=None, padding=(0, 2))
    rows.add_column(style=f"bold {ACCENT}")
    rows.add_column()
    for label, value in summary.items():
        rows.add_row(label, str(value))
    console.print()
    console.print(Panel(rows, title="[bold]Launch Summary[/]", border_style=ACCENT))
    console.print()
    return questionary.confirm("Launch?", default=True, style=STYLE).ask()
397
+
398
+
399
def _show_models_table(data: dict) -> None:
    """Print a rich table of configured models."""
    if not data:
        console.print(" [dim]No models configured yet.[/]\n")
        return

    table = Table(title="Configured Models", border_style="dim")
    table.add_column("Name", style="bold green")
    table.add_column("API Type", style=ACCENT2)
    table.add_column("Base URL")
    table.add_column("Thinking", style="yellow")

    # Dict keys are unique, so sorting names alone matches sorting items.
    for name in sorted(data):
        cfg = data[name]
        row = (
            name,
            cfg.get("api_type", "—"),
            cfg.get("base_url", "—"),
            cfg.get("thinking_level", "—"),
        )
        table.add_row(*row)

    console.print(table)
    console.print()
418
+
419
+
420
+ # ---------------------------------------------------------------------------
421
+ # Mode: Single run
422
+ # ---------------------------------------------------------------------------
423
+
424
def mode_single(models: list[str], cases: list[str]) -> None:
    """Interactive flow for a single model x case run.

    Loops on the model picker so the user can add a new model inline,
    then hands off to `clawbench run` via run_cmd (which execs and
    never returns). Any cancelled prompt (None answer) aborts quietly.
    """
    _ADD_NEW = "+ Add new model"

    model = None
    while model is None:
        console.print(f"\n[bold {ACCENT}]--- Select Model ---[/]\n")
        picked = questionary.select(
            "Model:",
            choices=models + [questionary.Choice(_ADD_NEW, value=_ADD_NEW)],
            style=STYLE,
        ).ask()
        if picked is None:
            return  # Esc / Ctrl-C
        if picked == _ADD_NEW:
            _add_model(load_models_data())
            models = load_models()
            if not models:
                return
            continue  # re-show the picker with the fresh list
        model = picked

    console.print(f"\n[bold {ACCENT}]--- Select Test Case ---[/]\n")
    case = questionary.select(
        "Case (arrow keys, or type to filter):",
        choices=cases,
        style=STYLE,
        use_search_filter=True,
        use_jk_keys=False,
    ).ask()
    if case is None:
        return

    if not _confirm_launch({"Mode": "Single run", "Model": model, "Case": case}):
        return

    run_cmd(
        [
            sys.executable, "-m", "clawbench", "run",
            case, model,
        ],
        hint=(
            " [dim]Tip: once the container starts, open the noVNC URL\n"
            " printed below to watch the agent operate the browser\n"
            " in real-time.[/]"
        ),
    )
468
+
469
+
470
+ # ---------------------------------------------------------------------------
471
+ # Mode: Batch run
472
+ # ---------------------------------------------------------------------------
473
+
474
def mode_batch(models: list[str], cases: list[str]) -> None:
    """Interactive flow for a batch run (multiple models x many cases).

    Collects: model set (with inline add-new), a case-selection strategy
    (all / numeric range / explicit picks), a concurrency limit, and an
    optional dry-run flag. Execs `clawbench batch` via run_cmd, which
    replaces this process and never returns. Any cancelled prompt
    (a None answer) aborts the whole flow quietly.
    """
    _ADD_NEW = "+ Add new model"
    while True:
        console.print(f"\n[bold {ACCENT}]--- Select Models ---[/]\n")
        selected_models = questionary.checkbox(
            "Models (space to select, enter to confirm):",
            choices=models + [questionary.Choice(_ADD_NEW, value=_ADD_NEW)],
            style=STYLE,
            validate=lambda x: len(x) > 0 or "Select at least one model",
        ).ask()
        # Covers both cancellation (None) and an empty selection.
        if not selected_models:
            return
        if _ADD_NEW in selected_models:
            _add_model(load_models_data())
            models = load_models()
            continue  # re-show checkbox with updated list
        break

    console.print(f"\n[bold {ACCENT}]--- Case Selection ---[/]\n")
    case_mode = questionary.select(
        "How to select cases?",
        choices=[
            questionary.Choice("All cases", value="all"),
            questionary.Choice("Case range (e.g. 1-50)", value="range"),
            questionary.Choice("Pick specific cases", value="pick"),
        ],
        style=STYLE,
    ).ask()
    if case_mode is None:
        return

    # CLI arguments handed to `clawbench batch` for the chosen strategy.
    case_args: list[str] = []

    if case_mode == "all":
        case_args = ["--all-cases"]
        case_summary = f"All ({len(cases)})"
    elif case_mode == "range":
        raw = questionary.text(
            "Range (e.g. 1-50, 100-200):",
            style=STYLE,
            validate=lambda x: bool(x.strip()) or "Enter a range",
        ).ask()
        if raw is None:
            return
        # Validate and show what matched
        matched = _parse_range_input(raw, cases)
        if not matched:
            console.print("[red]No cases matched that range.[/]")
            return
        console.print(f" Matched [green]{len(matched)}[/] cases")
        # The raw range string is forwarded verbatim; the batch CLI
        # re-parses it on its side.
        case_args = ["--case-range", raw.strip()]
        case_summary = f"Range {raw.strip()} ({len(matched)} cases)"
    else:
        # Interactive checkbox with all cases
        selected_cases = questionary.checkbox(
            "Cases (space to select):",
            choices=[questionary.Choice(_case_display(c), value=c) for c in cases],
            style=STYLE,
            validate=lambda x: len(x) > 0 or "Select at least one case",
        ).ask()
        if not selected_cases:
            return
        case_args = ["--cases"] + [f"test-cases/{c}" for c in selected_cases]
        case_summary = f"{len(selected_cases)} selected"

    recommended = _recommend_concurrent()
    concurrent = questionary.text(
        "Max concurrent jobs:",
        default=str(recommended),
        style=STYLE,
        # `bool and ... or message` yields True when valid, else the
        # error string — the shape questionary's validate expects.
        validate=lambda x: x.isdigit() and int(x) > 0 or "Enter a positive number",
    ).ask()
    if concurrent is None:
        return

    dry = questionary.confirm("Dry run first?", default=False, style=STYLE).ask()
    if dry is None:
        return

    ok = _confirm_launch({
        "Mode": "Batch run",
        "Models": ", ".join(selected_models),
        "Cases": case_summary,
        "Concurrent": concurrent,
        "Dry run": "Yes" if dry else "No",
    })
    if not ok:
        return

    # `concurrent` is already a string (questionary.text), so it can go
    # straight into the argv list.
    cmd = [
        sys.executable, "-m", "clawbench", "batch",
        "--models", *selected_models,
        *case_args,
        "--max-concurrent", concurrent,
    ]
    if dry:
        cmd.append("--dry-run")

    run_cmd(cmd)
573
+
574
+
575
+ # ---------------------------------------------------------------------------
576
+ # Mode: Human
577
+ # ---------------------------------------------------------------------------
578
+
579
def mode_human(cases: list[str]) -> None:
    """Interactive flow for a human-driven run (no agent, browser via noVNC)."""
    console.print(f"\n[bold {ACCENT}]--- Select Test Case ---[/]\n")
    case = questionary.select(
        "Case (arrow keys, or type to filter):",
        choices=cases,
        style=STYLE,
        use_search_filter=True,
        use_jk_keys=False,
    ).ask()
    # Bail on cancellation before prompting for confirmation.
    if case is None or not _confirm_launch({"Mode": "Human mode", "Case": case}):
        return

    command = [
        sys.executable, "-m", "clawbench", "run",
        case, "--human",
    ]
    run_cmd(
        command,
        hint=(
            " [dim]Tip: open the noVNC URL printed below to\n"
            " control the browser directly.[/]"
        ),
    )
605
+
606
+
607
+ # ---------------------------------------------------------------------------
608
+ # Mode: Configure models
609
+ # ---------------------------------------------------------------------------
610
+
611
def mode_configure() -> None:
    """Model-management menu: add/edit/delete entries in models.yaml.

    Re-reads the yaml on every pass so the table always reflects the
    latest edits; returns when the user cancels or picks "Back".
    """
    handlers = {
        "Add a new model": _add_model,
        "Edit a model": _edit_model,
        "Delete a model": _delete_model,
    }
    while True:
        data = load_models_data()
        _show_models_table(data)

        choices = ["Add a new model"]
        if data:
            # Edit/delete only make sense once something exists.
            choices += ["Edit a model", "Delete a model"]
        choices.append("Back to main menu")

        action = questionary.select(
            "What would you like to do?",
            choices=choices,
            style=STYLE,
        ).ask()
        if action is None or action == "Back to main menu":
            return

        handlers[action](data)
636
+
637
+
638
def _add_model(data: dict) -> None:
    """Five-step wizard that appends one model entry to models.yaml.

    Mutates *data* in place and persists it via save_models() only after
    every step completes; any cancelled prompt (a None answer) aborts
    without saving anything.
    """
    # -- Step 1: Pick provider ----------------------------------------------
    # Selecting one of the presets auto-fills base_url + api_type and shows
    # a handful of example model names. "Custom" falls back to the old
    # flow where the user types everything by hand.
    provider_choices = [
        questionary.Choice(preset["label"], value=key)
        for key, preset in PROVIDER_PRESETS.items()
    ]
    provider_choices.append(
        questionary.Choice("Custom (enter base URL + API type by hand)",
                           value="custom")
    )

    console.print(f"\n[bold {ACCENT}]--- Step 1: Provider ---[/]\n")
    provider = questionary.select(
        "Which provider?",
        choices=provider_choices,
        style=STYLE,
    ).ask()
    if provider is None:
        return

    # None for the "custom" sentinel — it has no preset entry.
    preset = PROVIDER_PRESETS.get(provider)

    # -- Step 2: Model name (with per-provider examples) --------------------
    console.print(f"\n[bold {ACCENT}]--- Step 2: Model name ---[/]\n")
    if preset:
        console.print(
            f" [dim]Examples for {preset['label'].strip()}:[/]"
        )
        for ex in preset["examples"]:
            console.print(f" [{ACCENT2}]{ex}[/]")
        console.print(
            " [dim](This string is passed verbatim to the provider as "
            "the model id, and used as the key in models.yaml.)[/]\n"
        )
    else:
        console.print(
            " [dim]Enter the exact model id your custom API expects.[/]\n"
        )

    # Reject empty names and duplicates of existing entries up front.
    name = questionary.text(
        "Model name:",
        style=STYLE,
        validate=lambda x: (
            "Name cannot be empty" if not x.strip()
            else f"'{x.strip()}' already exists" if x.strip() in data
            else True
        ),
    ).ask()
    if name is None:
        return
    name = name.strip()

    # -- Step 3: base_url + api_type (preset or manual) ---------------------
    if preset:
        base_url = preset["base_url"]
        api_type = preset["api_type"]
        console.print(
            f" [dim]Using preset: base_url={base_url} api_type={api_type}[/]"
        )
    else:
        console.print(f"\n[bold {ACCENT}]--- Step 3: Endpoint ---[/]\n")
        base_url = questionary.text(
            "Base URL:",
            style=STYLE,
            validate=lambda x: bool(x.strip()) or "URL cannot be empty",
        ).ask()
        if base_url is None:
            return
        api_type = questionary.select(
            "API type:",
            choices=API_TYPES,
            default="openai-completions",
            style=STYLE,
        ).ask()
        if api_type is None:
            return
        base_url = base_url.strip()

    # -- Step 4: API key ----------------------------------------------------
    console.print(f"\n[bold {ACCENT}]--- Step 4: API key ---[/]\n")
    api_key = questionary.text(
        "API key:",
        style=STYLE,
        validate=lambda x: bool(x.strip()) or "API key cannot be empty",
    ).ask()
    if api_key is None:
        return

    # -- Step 5: Thinking level --------------------------------------------
    console.print(f"\n[bold {ACCENT}]--- Step 5: Thinking level ---[/]\n")
    thinking_level = questionary.select(
        "Thinking level:",
        choices=THINKING_LEVELS,
        default="medium",
        style=STYLE,
    ).ask()
    if thinking_level is None:
        return

    data[name] = {
        "api_key": api_key.strip(),
        "base_url": base_url,
        "api_type": api_type,
        "thinking_level": thinking_level,
    }
    save_models(data)
    console.print(f"\n [green bold]Saved[/] {name} to {MODELS_YAML}\n")
748
+
749
+
750
def _edit_model(data: dict) -> None:
    """Prompt-driven edit of one existing model entry; persists on completion.

    Every prompt defaults to the current value so pressing Enter keeps it.
    Any cancelled prompt (None answer) aborts without touching models.yaml.
    """
    name = questionary.select(
        "Which model to edit?",
        choices=sorted(data.keys()),
        style=STYLE,
    ).ask()
    if name is None:
        return

    current = data[name]
    console.print(f"\n Editing [bold]{name}[/] — press Enter to keep current value.\n")

    # (field, prompt-factory) pairs asked in order; factories are lazy so
    # no prompt is constructed after the user cancels.
    prompts = (
        ("base_url", lambda: questionary.text(
            "Base URL:",
            default=current.get("base_url", ""),
            style=STYLE,
        )),
        ("api_type", lambda: questionary.select(
            "API type:",
            choices=API_TYPES,
            default=current.get("api_type", "openai-completions"),
            style=STYLE,
        )),
        ("api_key", lambda: questionary.text(
            "API key:",
            default=current.get("api_key", ""),
            style=STYLE,
        )),
        ("thinking_level", lambda: questionary.select(
            "Thinking level:",
            choices=THINKING_LEVELS,
            default=current.get("thinking_level", "medium"),
            style=STYLE,
        )),
    )

    answers: dict[str, str] = {}
    for field, build in prompts:
        answer = build().ask()
        if answer is None:
            return
        answers[field] = answer

    data[name] = {
        "api_key": answers["api_key"].strip(),
        "base_url": answers["base_url"].strip(),
        "api_type": answers["api_type"],
        "thinking_level": answers["thinking_level"],
    }
    save_models(data)
    console.print(f"\n [green bold]Updated[/] {name}\n")
804
+
805
+
806
def _delete_model(data: dict) -> None:
    """Pick a model, confirm, then remove it from models.yaml."""
    victim = questionary.select(
        "Which model to delete?",
        choices=sorted(data.keys()),
        style=STYLE,
    ).ask()
    if victim is None:
        return

    # `not ask()` also covers a cancelled (None) confirmation.
    if not questionary.confirm(
        f"Delete '{victim}'?", default=False, style=STYLE
    ).ask():
        return

    data.pop(victim)
    save_models(data)
    console.print(f"\n [red bold]Deleted[/] {victim}\n")
824
+
825
+
826
+ # ---------------------------------------------------------------------------
827
+ # Main
828
+ # ---------------------------------------------------------------------------
829
+
830
def _require_tty() -> None:
    """Bail out with a friendly message when stdin/stdout is not a real TTY.

    questionary/prompt_toolkit crash with a cryptic ``OSError: [Errno 22]
    Invalid argument`` when stdin is not a terminal (piped input, non-tty
    IDE terminals, CI, or calls from tools that don't allocate a pty).
    Detect this up-front and point the user at the non-interactive
    entrypoints instead.
    """
    if sys.stdin.isatty() and sys.stdout.isatty():
        return

    # Rendered top-to-bottom; empty entries become blank spacer lines.
    message = (
        "",
        "[yellow bold]![/] This TUI needs an interactive terminal.",
        "",
        " stdin/stdout is not a TTY (piped input, CI, some IDE terminals,",
        f" or tool-based invocations). The underlying [{ACCENT2}]questionary[/]",
        " library requires a real terminal to render prompts.",
        "",
        " For non-interactive use, call the CLI directly:",
        "",
        f" [{ACCENT2}]claw-bench run[/] "
        f"[{ACCENT2}]001-daily-life-food-uber-eats claude-sonnet-4-6[/]",
        "",
        f" [{ACCENT2}]claw-bench batch[/] "
        f"[{ACCENT2}]--all-models --case-range 1-50 --max-concurrent 3[/]",
        "",
        f" See [{ACCENT2}]claw-bench --help[/] for full CLI usage.",
        "",
    )
    for line in message:
        console.print(line)
    sys.exit(1)
872
+
873
+
874
+ # ---------------------------------------------------------------------------
875
+ # Container engine health check
876
+ # ---------------------------------------------------------------------------
877
+ #
878
+ # Every ClawBench run (including Human mode) launches a container via
879
+ # docker or podman. If the CLI binary is installed but the daemon /
880
+ # Linux VM isn't running, the user would otherwise only find out after
881
+ # picking a model, picking a case, confirming Launch, and waiting for a
882
+ # cryptic "unable to connect to Podman socket" error deep inside the
883
+ # build step.
884
+ #
885
+ # We short-circuit that by probing the engine at TUI startup and
886
+ # offering a one-click fix for the common cases:
887
+ #
888
+ # * podman with no machine initialized -> `podman machine init`
889
+ # + `podman machine start`
890
+ # * podman machine exists but stopped -> `podman machine start`
891
+ # * docker daemon not running on macOS -> `open -a Docker`
892
+ #
893
+ # If the fix succeeds, we continue; if the user declines or it fails,
894
+ # we let them into the TUI anyway (they may want to Configure models or
895
+ # Change theme before fixing the engine), but flag that run modes will
896
+ # fail until the engine is up.
897
+
898
+
899
def _engine_from_env_or_path() -> str | None:
    """Delegate engine detection to :mod:`clawbench.engine` (podman-first)."""
    detected = _engine.detect_engine()
    return detected
902
+
903
+
904
def _check_engine() -> tuple[str | None, str, str]:
    """Probe the container engine and classify the result.

    Wraps :func:`clawbench.engine.check_engine`, flattening its result
    into the legacy ``(engine, status, detail)`` tuple shape expected by
    the TUI call sites. See :mod:`clawbench.engine` for status codes.
    """
    probe = _engine.check_engine()
    return (probe.engine, probe.status, probe.detail)
913
+
914
+
915
+ def _diagnose_fix_failure(buf: list[str]) -> str | None:
916
+ """Scan captured command output for known failure patterns and
917
+ return a friendly hint, or None if we don't recognize the error.
918
+
919
+ These run AFTER a fix command (``podman machine init`` etc.) has
920
+ failed, so we have the full output buffer to pattern-match.
921
+ """
922
+ blob = "\n".join(buf).lower()
923
+
924
+ # Proxy misconfiguration: users in regions that need an HTTP proxy
925
+ # to reach quay.io often have a stale HTTPS_PROXY pointing at a
926
+ # port that isn't currently listening. The error looks like:
927
+ # proxyconnect tcp: dial tcp 127.0.0.1:7891: connect: connection refused
928
+ if "proxyconnect" in blob and "connection refused" in blob:
929
+ # Pull out the port we couldn't reach, if visible.
930
+ import re as _re
931
+ m = _re.search(r"dial tcp \S+?:(\d+)", blob)
932
+ port = m.group(1) if m else "?"
933
+ return (
934
+ "Your HTTP(S)_PROXY env var points at a proxy on "
935
+ f"port {port}, but nothing is listening there right now.\n\n"
936
+ "Either start the proxy tool on that port, or update your "
937
+ "shell profile (.zshrc/.bashrc) to point at the proxy port "
938
+ "that is actually running, then open a fresh terminal and "
939
+ "re-run ./run.sh.\n\n"
940
+ "If you are behind the Great Firewall, you need a working "
941
+ "proxy to reach quay.io — you can't just unset HTTPS_PROXY."
942
+ )
943
+
944
+ # DNS / generic connectivity failure (no proxy involved)
945
+ if "no such host" in blob or "could not resolve" in blob:
946
+ return (
947
+ "Network lookup failed while reaching a container registry. "
948
+ "Check your DNS / VPN / proxy settings and try again."
949
+ )
950
+
951
+ # Registry unreachable but no proxy error — probably GFW without a
952
+ # proxy configured at all.
953
+ if "quay.io" in blob and ("timeout" in blob or "i/o timeout" in blob):
954
+ return (
955
+ "Couldn't reach quay.io (the podman machine image registry). "
956
+ "If you are behind a restrictive network, set HTTPS_PROXY "
957
+ "to a working proxy and re-run."
958
+ )
959
+
960
+ return None
961
+
962
+
963
def _run_streamed(cmd: list[str], *, status_msg: str) -> int:
    """Run a long-running command under a Rich Status spinner.

    stdout and stderr are merged and captured into a buffer so the
    spinner stays clean; on failure the last ~20 lines are dumped,
    plus a diagnostic hint when the error pattern is recognized.
    Returns the process exit code.
    """
    captured: list[str] = []
    with Status(status_msg, console=console):
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,  # line-buffered so output arrives as it happens
        )
        assert proc.stdout is not None
        captured.extend(line.rstrip() for line in proc.stdout)
        exit_code = proc.wait()

    if exit_code == 0:
        return exit_code

    console.print()
    console.print(f" [red]Command failed:[/] {' '.join(cmd)}")
    for line in captured[-20:]:
        console.print(f" [dim]{line}[/]")
    hint = _diagnose_fix_failure(captured)
    if hint:
        console.print()
        console.print(Panel(
            Text(hint, style="dim"),
            title="[bold]Likely cause[/]",
        ))
    console.print()
    return exit_code
997
+
998
+
999
def _fix_engine(engine: str, status: str, detail: str) -> bool:
    """Show an actionable panel for the engine problem and offer a fix.

    Handles each status from _check_engine(): missing install (advice
    only), podman machine missing/stopped/under-provisioned (offers to
    run the fix commands), docker daemon down (offers to open Docker
    Desktop on macOS), and a catch-all for unknown probe errors.

    Returns True if the engine is now usable, False otherwise. Safe to
    call with any combination — no-op for ``ready``.
    """
    if status == "ready":
        return True

    # NOTE: the original also computed an unused `is_win` flag; removed.
    is_mac = platform.system() == "Darwin"

    if status == "not_installed":
        # Nothing we can automate — show per-OS install instructions.
        console.print()
        console.print(Panel(
            Text.assemble(
                Text("No container engine found.\n\n", style="bold"),
                Text(
                    "ClawBench runs every task inside a container, so you "
                    "need either Docker or Podman installed before any mode "
                    "(including Human mode) can work.\n\n"
                    "macOS: brew install --cask docker\n"
                    " — or —\n"
                    " brew install podman && podman machine init && podman machine start\n\n"
                    "Linux: sudo apt install podman (or docker.io)\n\n"
                    "Windows: winget install Docker.DockerDesktop\n"
                    " — or —\n"
                    " winget install RedHat.Podman && podman machine init && podman machine start",
                    style="dim",
                ),
            ),
            title="[bold]Container engine not installed[/]",
        ))
        console.print()
        return False

    if engine == "podman" and status == "podman_no_machine":
        console.print()
        console.print(Panel(
            Text.assemble(
                Text("Podman needs a Linux VM on this platform.\n\n", style="bold"),
                Text(
                    "On macOS and Windows, podman runs Linux containers "
                    "inside a small helper VM. You don't have one yet. "
                    "I can run these two commands for you now:\n\n"
                    " podman machine init\n"
                    " podman machine start\n\n"
                    "The first one downloads a ~1 GB VM image, so it "
                    "takes a few minutes.",
                    style="dim",
                ),
            ),
            title="[bold]Podman machine not initialized[/]",
        ))
        console.print()
        ok = questionary.confirm(
            "Run `podman machine init && podman machine start` now?",
            default=True, style=STYLE,
        ).ask()
        if not ok:
            return False
        if _run_streamed(
            ["podman", "machine", "init"],
            status_msg="Running podman machine init (downloads VM image, may take a few minutes)...",
        ) != 0:
            return False
        if _run_streamed(
            ["podman", "machine", "start"],
            status_msg="Starting podman machine...",
        ) != 0:
            return False
        # Re-verify
        _, new_status, _ = _check_engine()
        if new_status == "ready":
            console.print(" [green]✓[/] Podman is now running.")
            return True
        return False

    if engine == "podman" and status == "podman_machine_stopped":
        console.print()
        console.print(Panel(
            Text.assemble(
                Text("Podman machine is not running.\n\n", style="bold"),
                Text(
                    "The Linux VM that podman uses to run containers is "
                    "currently stopped. I can start it for you with:\n\n"
                    " podman machine start",
                    style="dim",
                ),
            ),
            title="[bold]Podman machine stopped[/]",
        ))
        console.print()
        ok = questionary.confirm(
            "Run `podman machine start` now?", default=True, style=STYLE,
        ).ask()
        if not ok:
            return False
        if _run_streamed(
            ["podman", "machine", "start"],
            status_msg="Starting podman machine...",
        ) != 0:
            return False
        _, new_status, _ = _check_engine()
        if new_status == "ready":
            console.print(" [green]✓[/] Podman is now running.")
            return True
        return False

    if engine == "podman" and status == "podman_low_memory":
        # `detail` carries the current VM memory in MB (as a string).
        mem_mb = int(detail) if detail.isdigit() else 0
        console.print()
        console.print(Panel(
            Text.assemble(
                Text(f"Podman machine has only {mem_mb} MB RAM.\n\n",
                     style="bold"),
                Text(
                    "ClawBench runs Chrome + an AI agent gateway inside "
                    "the container, which needs at least 4 GB RAM. With "
                    f"the current {mem_mb} MB the agent process will be "
                    "killed by the OOM killer.\n\n"
                    "I can stop the VM, increase its memory to 4 GB, and "
                    "restart it:\n\n"
                    " podman machine stop\n"
                    " podman machine set --memory 4096\n"
                    " podman machine start",
                    style="dim",
                ),
            ),
            title="[bold]Podman machine: not enough memory[/]",
        ))
        console.print()
        ok = questionary.confirm(
            "Resize podman machine to 4 GB RAM now?",
            default=True, style=STYLE,
        ).ask()
        if not ok:
            return False
        for cmd, msg in [
            (["podman", "machine", "stop"],
             "Stopping podman machine..."),
            (["podman", "machine", "set", "--memory", "4096"],
             "Setting memory to 4096 MB..."),
            (["podman", "machine", "start"],
             "Starting podman machine..."),
        ]:
            if _run_streamed(cmd, status_msg=msg) != 0:
                return False
        _, new_status, _ = _check_engine()
        if new_status == "ready":
            console.print(" [green]✓[/] Podman machine now has 4 GB RAM.")
            return True
        return False

    if engine == "docker" and status == "docker_not_running":
        console.print()
        console.print(Panel(
            Text.assemble(
                Text("Docker daemon is not running.\n\n", style="bold"),
                Text(
                    "The `docker` CLI is installed but can't reach the "
                    "daemon. On macOS/Windows this usually means Docker "
                    "Desktop isn't launched.",
                    style="dim",
                ),
            ),
            title="[bold]Docker daemon unreachable[/]",
        ))
        console.print()
        if is_mac:
            ok = questionary.confirm(
                "Open Docker Desktop now? (you'll still need to wait "
                "for it to finish starting)",
                default=True, style=STYLE,
            ).ask()
            if ok:
                subprocess.run(["open", "-a", "Docker"])
                console.print(
                    " [dim]Docker Desktop is starting — re-run ./run.sh "
                    "once its menu-bar icon stops animating.[/]"
                )
        else:
            console.print(
                " [dim]Start the Docker daemon / Docker Desktop, then "
                "re-run ./run.sh.[/]"
            )
        # Even after `open -a Docker`, the daemon takes a while to come
        # up, so we never report success from this branch.
        return False

    # unknown_error — nothing to automate
    console.print()
    console.print(Panel(
        Text.assemble(
            Text("Container engine probe failed.\n\n", style="bold"),
            Text(
                f"Engine: {engine}\n\n"
                f"{detail or '(no details)'}",
                style="dim",
            ),
        ),
        title="[bold]Engine check error[/]",
    ))
    console.print()
    return False
1202
+
1203
+
1204
def main() -> None:
    """TUI entry point: theme setup, engine health check, onboarding, menu loop."""
    # STYLE/ACCENT/ACCENT2 are module-level theme state, rebound here and
    # again in the "Change theme" branch below.
    global STYLE, ACCENT, ACCENT2

    _require_tty()

    # Theme: load saved preference, or prompt on first run.
    # Everything rendered BEFORE _pick_theme() has to look OK on both
    # light and dark terminals, so we avoid cyan/blue accents here —
    # plain bold + the terminal's default foreground is the only safe
    # combination. Once the user picks a theme, we can use accent colors
    # again via the chosen STYLE.
    #
    # We treat "no models configured" as a first-run condition too — if
    # the user hasn't finished setup yet, we want them to confirm their
    # terminal theme before we show any colored onboarding UI, even if
    # they happen to have a stale saved theme from an earlier session.
    theme = _load_saved_theme()
    models = load_models()
    cases = load_cases()
    first_run = (theme is None) or (not models)
    if first_run:
        console.print()
        console.print("[bold]Welcome to ClawBench.[/]")
        theme = _pick_theme()
    STYLE = _make_style(theme)
    # Apple HIG: Indigo for headers, Blue for inline accents
    if theme == "light":
        ACCENT, ACCENT2 = "#5856D6", "#007AFF"
    else:
        ACCENT, ACCENT2 = "#5E5CE6", "#0A84FF"

    # Engine health check: every mode (including Human) needs a working
    # docker/podman. If it's broken, offer to fix it right now rather
    # than letting the user discover the problem after picking a model
    # and waiting through the build step.
    engine, engine_status, engine_detail = _check_engine()
    if engine_status != "ready":
        _fix_engine(engine, engine_status, engine_detail)
        # Re-probe so the menu knows whether run-modes are viable.
        engine, engine_status, engine_detail = _check_engine()

    # Onboarding: if there are no models configured, this is almost
    # certainly a fresh install (or someone who copied the example yaml
    # without editing it). Don't force the user to discover the
    # "Configure models" menu item — walk them into _add_model() right
    # now. They can still skip it and fall through to Human mode, which
    # doesn't need any LLM to run.
    if not models:
        _onboard_no_models()
        models = load_models()

    # Header panel: deliberately uses no explicit color so it renders
    # legibly regardless of the user's terminal background.
    title = Text("ClawBench", style="bold")
    subtitle = Text(
        f"{len(models)} models configured | {len(cases)} test cases available",
        style="dim",
    )
    console.print()
    console.print(Panel(
        Text.assemble(title, "\n", subtitle),
    ))
    console.print()

    # Main menu loop — each mode_* helper returns here when the user
    # cancels; run_cmd-based modes exec away and never return.
    while True:
        mode = questionary.select(
            "Select mode:",
            choices=[
                questionary.Choice("Single run (one model x one case)", value="single"),
                questionary.Choice("Batch run (models x cases)", value="batch"),
                questionary.Choice("Human mode (no agent, noVNC)", value="human"),
                questionary.Choice("Configure models", value="configure"),
                questionary.Choice("Change theme", value="theme"),
                questionary.Choice("Exit", value="exit"),
            ],
            style=STYLE,
        ).ask()

        if mode is None or mode == "exit":
            console.print("\n[dim]Bye.[/]")
            return

        if mode == "theme":
            theme = _pick_theme()
            STYLE = _make_style(theme)
            if theme == "light":
                ACCENT, ACCENT2 = "#5856D6", "#007AFF"
            else:
                ACCENT, ACCENT2 = "#5E5CE6", "#0A84FF"
            continue

        if mode == "configure":
            mode_configure()
            # Reload: the user may have added/removed models.
            models = load_models()
            continue

        # Every run mode (including Human) needs a live engine. If it
        # wasn't fixable earlier, try one more time now — the user may
        # have started Docker Desktop in another window, or we may be
        # catching a transient failure.
        if mode in ("single", "batch", "human"):
            engine, engine_status, engine_detail = _check_engine()
            if engine_status != "ready":
                if not _fix_engine(engine, engine_status, engine_detail):
                    console.print()
                    console.print(
                        " [yellow]Container engine is still not "
                        "ready — can't launch this mode.[/]"
                    )
                    continue

        # Human mode is intentionally allowed with zero models — it
        # drives the browser via noVNC without any LLM at all.
        if mode == "human":
            mode_human(cases)
            continue

        # Single / batch both need at least one configured model. Instead
        # of printing an error and looping, offer to configure one now.
        if mode in ("single", "batch") and not models:
            console.print()
            console.print(
                " [dim]This mode needs at least one configured model.[/]"
            )
            add_now = questionary.confirm(
                "Add a model now?", default=True, style=STYLE
            ).ask()
            if add_now:
                data = load_models_data()
                _add_model(data)
                models = load_models()
            continue

        if mode == "single":
            mode_single(models, cases)
        elif mode == "batch":
            mode_batch(models, cases)
1341
+
1342
+
1343
def _onboard_no_models() -> None:
    """First-run guidance when ``models/models.yaml`` has zero entries.

    Shown after the theme picker but before the main menu. The user has
    three real options: add a model now, skip straight to Human mode
    (which doesn't need any LLM), or quit.
    """
    # Build the panel pieces up front so the print call stays readable.
    heading = Text("No models configured yet.\n\n", style="bold")
    body = Text(
        "ClawBench needs at least one model entry in "
        "models/models.yaml before it can run agent-mode tasks. "
        "Let's add one now — it takes about 30 seconds and you "
        "only need your API key.\n\n"
        "You can also skip this and jump straight to Human mode, "
        "which drives the browser via noVNC without any LLM.",
        style="dim",
    )
    console.print()
    console.print(Panel(
        Text.assemble(heading, body),
        title="[bold]First-run setup[/]",
    ))
    console.print()

    selection = questionary.select(
        "What would you like to do?",
        choices=[
            questionary.Choice("Add a model now (recommended)", value="add"),
            questionary.Choice("Skip — I'll use Human mode", value="skip"),
            questionary.Choice("Quit", value="quit"),
        ],
        style=STYLE,
    ).ask()

    # questionary returns None on Ctrl-C/Esc — treat that the same as Quit.
    if selection in (None, "quit"):
        console.print("\n[dim]Bye.[/]")
        sys.exit(0)

    if selection != "add":
        # "skip" — the caller's menu handles Human mode from here.
        return

    _add_model(load_models_data())
    if not load_models():
        # The add-model wizard was cancelled midway, so nothing was
        # persisted. Leave a hint that Human mode still works and that
        # models can be configured later from the menu.
        console.print()
        console.print(
            " [dim]No model was saved. You can still use Human mode "
            "from the menu, or pick 'Configure models' later.[/]"
        )
1394
+
1395
+
1396
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C anywhere in the TUI: show a short notice instead of a
        # traceback, and exit with status 0 — an aborted interactive
        # session is not an error.
        console.print("\n\n[dim]Aborted.[/]")
        raise SystemExit(0)