clawbench-cli 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (226) hide show
  1. clawbench/__init__.py +35 -0
  2. clawbench/__main__.py +8 -0
  3. clawbench/batch.py +619 -0
  4. clawbench/cli.py +397 -0
  5. clawbench/data/chrome-extension/README.md +127 -0
  6. clawbench/data/chrome-extension/background.js +50 -0
  7. clawbench/data/chrome-extension/content.js +70 -0
  8. clawbench/data/chrome-extension/manifest.json +25 -0
  9. clawbench/data/chrome-extension/setup.sh +27 -0
  10. clawbench/data/chrome-extension/stealth.js +200 -0
  11. clawbench/data/docker/Dockerfile +51 -0
  12. clawbench/data/docker/entrypoint.sh +394 -0
  13. clawbench/data/docker/setup-openclaw.sh +112 -0
  14. clawbench/data/eval/README.md +95 -0
  15. clawbench/data/eval/agentic_eval.md +53 -0
  16. clawbench/data/extension-server/.python-version +1 -0
  17. clawbench/data/extension-server/README.md +54 -0
  18. clawbench/data/extension-server/pyproject.toml +7 -0
  19. clawbench/data/extension-server/server.py +360 -0
  20. clawbench/data/extension-server/uv.lock +644 -0
  21. clawbench/data/models/model.schema.json +44 -0
  22. clawbench/data/models/models.example.yaml +16 -0
  23. clawbench/data/shared/alex_green_personal_info.json +451 -0
  24. clawbench/data/test-cases/001-daily-life-food-uber-eats/task.json +25 -0
  25. clawbench/data/test-cases/002-daily-life-food-doordash/task.json +25 -0
  26. clawbench/data/test-cases/004-daily-life-food-instacart/extra_info/grocery_list.json +36 -0
  27. clawbench/data/test-cases/004-daily-life-food-instacart/task.json +30 -0
  28. clawbench/data/test-cases/006-daily-life-food-uber-eats/task.json +24 -0
  29. clawbench/data/test-cases/007-daily-life-food-instacart/extra_info/meal_plan.json +21 -0
  30. clawbench/data/test-cases/007-daily-life-food-instacart/task.json +30 -0
  31. clawbench/data/test-cases/011-daily-life-housing-zillow/task.json +25 -0
  32. clawbench/data/test-cases/015-daily-life-housing-craigslist/extra_info/listing_details.json +26 -0
  33. clawbench/data/test-cases/015-daily-life-housing-craigslist/task.json +30 -0
  34. clawbench/data/test-cases/035-daily-life-health-medical-betterhelp/task.json +25 -0
  35. clawbench/data/test-cases/041-daily-life-pets-rover/task.json +25 -0
  36. clawbench/data/test-cases/043-daily-life-pets-rover/extra_info/pet_info.json +12 -0
  37. clawbench/data/test-cases/043-daily-life-pets-rover/task.json +30 -0
  38. clawbench/data/test-cases/045-daily-life-personal-care-booksy/task.json +25 -0
  39. clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/extra_info/address_info.json +7 -0
  40. clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/task.json +30 -0
  41. clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/extra_info/job_links.json +5 -0
  42. clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/task.json +30 -0
  43. clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/extra_info/job_links.json +5 -0
  44. clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/task.json +30 -0
  45. clawbench/data/test-cases/091-job-search-hr-job-apply-indeed/task.json +25 -0
  46. clawbench/data/test-cases/120-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  47. clawbench/data/test-cases/121-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  48. clawbench/data/test-cases/128-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  49. clawbench/data/test-cases/134-office-secretary-tasks-calendar-calendly/task.json +25 -0
  50. clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/extra_info/meeting_details.json +30 -0
  51. clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/task.json +30 -0
  52. clawbench/data/test-cases/139-office-secretary-tasks-calendar-calendly/task.json +25 -0
  53. clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/extra_info/task_list.json +29 -0
  54. clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/task.json +30 -0
  55. clawbench/data/test-cases/179-dev-tech-github-ops-github/extra_info/config.json +13 -0
  56. clawbench/data/test-cases/179-dev-tech-github-ops-github/task.json +30 -0
  57. clawbench/data/test-cases/180-dev-tech-github-ops-github/task.json +25 -0
  58. clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/extra_info/raw_results.json +47 -0
  59. clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/task.json +30 -0
  60. clawbench/data/test-cases/242-academia-research-research-tools-overleaf/task.json +25 -0
  61. clawbench/data/test-cases/246-academia-research-research-tools-zotero/task.json +25 -0
  62. clawbench/data/test-cases/247-academia-research-research-tools-semantic-scholar/task.json +25 -0
  63. clawbench/data/test-cases/265-education-learning-general-coursera/task.json +25 -0
  64. clawbench/data/test-cases/266-education-learning-general-leetcode/extra_info/solution_code.py +9 -0
  65. clawbench/data/test-cases/266-education-learning-general-leetcode/task.json +30 -0
  66. clawbench/data/test-cases/273-education-learning-general-edx/task.json +25 -0
  67. clawbench/data/test-cases/274-education-learning-general-udemy/task.json +25 -0
  68. clawbench/data/test-cases/279-travel-general-airbnb/task.json +25 -0
  69. clawbench/data/test-cases/280-travel-general-booking-com/task.json +25 -0
  70. clawbench/data/test-cases/363-entertainment-hobbies-general-ticketmaster/task.json +25 -0
  71. clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/extra_info/book_list.json +14 -0
  72. clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/task.json +30 -0
  73. clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/extra_info/event_details.json +10 -0
  74. clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/task.json +30 -0
  75. clawbench/data/test-cases/403-personal-management-account-security-1password-web/extra_info/credentials.json +34 -0
  76. clawbench/data/test-cases/403-personal-management-account-security-1password-web/task.json +30 -0
  77. clawbench/data/test-cases/413-personal-management-personal-tools-todoist/extra_info/task_list.json +52 -0
  78. clawbench/data/test-cases/413-personal-management-personal-tools-todoist/task.json +30 -0
  79. clawbench/data/test-cases/468-rating-voting-general-glassdoor/extra_info/interview_experience.json +10 -0
  80. clawbench/data/test-cases/468-rating-voting-general-glassdoor/task.json +30 -0
  81. clawbench/data/test-cases/469-rating-voting-general-tripadvisor/extra_info/review_content.json +6 -0
  82. clawbench/data/test-cases/469-rating-voting-general-tripadvisor/task.json +30 -0
  83. clawbench/data/test-cases/470-rating-voting-general-trustpilot/extra_info/review_content.json +6 -0
  84. clawbench/data/test-cases/470-rating-voting-general-trustpilot/task.json +30 -0
  85. clawbench/data/test-cases/474-rating-voting-general-capterra/task.json +25 -0
  86. clawbench/data/test-cases/475-rating-voting-general-g2/task.json +25 -0
  87. clawbench/data/test-cases/482-creation-init-general-confluence/extra_info/content.json +3 -0
  88. clawbench/data/test-cases/482-creation-init-general-confluence/task.json +30 -0
  89. clawbench/data/test-cases/483-creation-init-general-airtable/task.json +25 -0
  90. clawbench/data/test-cases/484-creation-init-general-clickup/task.json +28 -0
  91. clawbench/data/test-cases/485-creation-init-general-webflow/task.json +25 -0
  92. clawbench/data/test-cases/486-creation-init-general-mailchimp/extra_info/content.json +3 -0
  93. clawbench/data/test-cases/486-creation-init-general-mailchimp/task.json +30 -0
  94. clawbench/data/test-cases/487-creation-init-general-typeform/extra_info/survey_questions.json +85 -0
  95. clawbench/data/test-cases/487-creation-init-general-typeform/task.json +30 -0
  96. clawbench/data/test-cases/488-creation-init-general-substack/extra_info/content.json +3 -0
  97. clawbench/data/test-cases/488-creation-init-general-substack/task.json +30 -0
  98. clawbench/data/test-cases/489-creation-init-general-ghost/extra_info/content.json +3 -0
  99. clawbench/data/test-cases/489-creation-init-general-ghost/task.json +30 -0
  100. clawbench/data/test-cases/501-creation-init-general-asana/extra_info/project_description.json +8 -0
  101. clawbench/data/test-cases/501-creation-init-general-asana/task.json +33 -0
  102. clawbench/data/test-cases/529-daily-life-shopping-delivery-king-arthur-baking/task.json +25 -0
  103. clawbench/data/test-cases/533-daily-life-utilities-inmyarea/task.json +25 -0
  104. clawbench/data/test-cases/535-daily-life-home-home-depot/task.json +25 -0
  105. clawbench/data/test-cases/537-daily-life-food-crumbl/task.json +25 -0
  106. clawbench/data/test-cases/539-daily-life-health-jefit/task.json +25 -0
  107. clawbench/data/test-cases/542-daily-life-pets-wag/task.json +25 -0
  108. clawbench/data/test-cases/551-finance-investment-crypto-wallet-trezor/task.json +25 -0
  109. clawbench/data/test-cases/552-finance-investment-business-payment-plooto/task.json +25 -0
  110. clawbench/data/test-cases/555-finance-investment-insurance-insureon/task.json +25 -0
  111. clawbench/data/test-cases/559-finance-investment-crowdfunding-frontfundr/task.json +25 -0
  112. clawbench/data/test-cases/564-daily-life-event-registration-race-roster/task.json +25 -0
  113. clawbench/data/test-cases/565-job-search-hr-job-search-jopwell/task.json +25 -0
  114. clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/extra_info/listing_details.json +26 -0
  115. clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/task.json +30 -0
  116. clawbench/data/test-cases/569-job-search-hr-job-search-careerbuilder/task.json +25 -0
  117. clawbench/data/test-cases/570-job-search-hr-job-search-hired/task.json +25 -0
  118. clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/extra_info/listing_details.json +26 -0
  119. clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/task.json +30 -0
  120. clawbench/data/test-cases/576-office-secretary-tasks-reports-ftc-reportfraud/task.json +25 -0
  121. clawbench/data/test-cases/583-office-secretary-tasks-support-tickets-freshdesk/task.json +25 -0
  122. clawbench/data/test-cases/598-academia-research-legal-docs-formswift/task.json +25 -0
  123. clawbench/data/test-cases/606-education-learning-kids-courses-outschool/task.json +25 -0
  124. clawbench/data/test-cases/607-education-learning-art-courses-creativebug/task.json +25 -0
  125. clawbench/data/test-cases/609-education-learning-meditation-spirit-rock-meditation-center/task.json +25 -0
  126. clawbench/data/test-cases/615-travel-flights-spirit-airlines/task.json +25 -0
  127. clawbench/data/test-cases/618-travel-train-bus-12go-asia/task.json +25 -0
  128. clawbench/data/test-cases/625-travel-camping-outdoor-parks-canada-reservations/task.json +25 -0
  129. clawbench/data/test-cases/626-travel-bus-flixbus/task.json +25 -0
  130. clawbench/data/test-cases/627-travel-flights-momondo/task.json +25 -0
  131. clawbench/data/test-cases/632-shopping-commerce-beauty-care-olaplex/task.json +25 -0
  132. clawbench/data/test-cases/634-shopping-commerce-apparel-dooney-bourke/task.json +25 -0
  133. clawbench/data/test-cases/635-shopping-commerce-gifts-uncommon-goods/task.json +25 -0
  134. clawbench/data/test-cases/636-shopping-commerce-auto-parts-rockauto/task.json +25 -0
  135. clawbench/data/test-cases/638-shopping-commerce-print-custom-vistaprint/task.json +25 -0
  136. clawbench/data/test-cases/639-shopping-commerce-luxury-mansur-gavriel/task.json +25 -0
  137. clawbench/data/test-cases/671-entertainment-gaming-humble-bundle/task.json +25 -0
  138. clawbench/data/test-cases/672-entertainment-hobbies-anime-streaming-crunchyroll/task.json +25 -0
  139. clawbench/data/test-cases/674-entertainment-hobbies-masterclass-masterclass/task.json +25 -0
  140. clawbench/data/test-cases/676-government-civic-legal-docs-legalnature/task.json +25 -0
  141. clawbench/data/test-cases/685-personal-management-budget-mgmt-everydollar/task.json +25 -0
  142. clawbench/data/test-cases/687-personal-management-vpn-subscription-ipvanish/task.json +25 -0
  143. clawbench/data/test-cases/688-personal-management-insurance-compare-insurify/task.json +25 -0
  144. clawbench/data/test-cases/695-automation-workflows-recurring-order-stumptown-coffee/task.json +25 -0
  145. clawbench/data/test-cases/697-automation-workflows-recurring-order-bean-box/task.json +25 -0
  146. clawbench/data/test-cases/699-automation-workflows-recurring-order-mistobox/task.json +25 -0
  147. clawbench/data/test-cases/700-deletion-revocation-data-deletion-deleteme/task.json +25 -0
  148. clawbench/data/test-cases/705-rating-voting-wine-review-vivino/task.json +25 -0
  149. clawbench/data/test-cases/706-rating-voting-beer-review-beeradvocate/task.json +25 -0
  150. clawbench/data/test-cases/707-rating-voting-social-wine-untappd/task.json +25 -0
  151. clawbench/data/test-cases/708-rating-voting-professor-review-ratemyprofessors/task.json +28 -0
  152. clawbench/data/test-cases/709-rating-voting-service-review-angi/task.json +25 -0
  153. clawbench/data/test-cases/710-creation-init-interior-design-roomsketcher/task.json +25 -0
  154. clawbench/data/test-cases/711-creation-init-color-design-coolors/task.json +25 -0
  155. clawbench/data/test-cases/712-creation-init-website-create-squarespace/task.json +25 -0
  156. clawbench/data/test-cases/713-creation-init-website-build-wix/task.json +25 -0
  157. clawbench/data/test-cases/735-home-services-maintenance-house-cleaning-bark/task.json +25 -0
  158. clawbench/data/test-cases/736-home-services-maintenance-plumbing-ace-hardware/task.json +25 -0
  159. clawbench/data/test-cases/737-home-services-maintenance-kitchen-remodel-lowes/task.json +25 -0
  160. clawbench/data/test-cases/738-home-services-maintenance-equipment-install-amazon-home-services/task.json +25 -0
  161. clawbench/data/test-cases/750-automotive-vehicle-services-car-insurance-compare-kanetix/task.json +25 -0
  162. clawbench/data/test-cases/751-automotive-vehicle-services-car-lease-sixt/task.json +25 -0
  163. clawbench/data/test-cases/754-automotive-vehicle-services-used-car-listing-autotrader/task.json +25 -0
  164. clawbench/data/test-cases/763-automotive-vehicle-services-car-lease-autoslash/task.json +25 -0
  165. clawbench/data/test-cases/766-nonprofit-charity-donation-doctors-without-borders-msf/task.json +25 -0
  166. clawbench/data/test-cases/768-nonprofit-charity-community-crowdfund-ioby/task.json +25 -0
  167. clawbench/data/test-cases/770-nonprofit-charity-volunteer-apply-on-make-a-wish-foundation-website-complete-and-submit-a-volunteer-application-form-selecting-the-wish-granter-role-and-entering-city-phoenix-az/task.json +25 -0
  168. clawbench/data/test-cases/774-nonprofit-charity-nonprofit-job-apply-charity-village/task.json +25 -0
  169. clawbench/data/test-cases/776-nonprofit-charity-volunteer-signup-idealist/task.json +25 -0
  170. clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/extra_info/payment_info.json +3 -0
  171. clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/task.json +30 -0
  172. clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/extra_info/address_info.json +4 -0
  173. clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/task.json +30 -0
  174. clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/extra_info/email_info.json +3 -0
  175. clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/task.json +30 -0
  176. clawbench/data/test-cases/782-beauty-personal-care-skincare-purchase-paulas-choice/task.json +24 -0
  177. clawbench/data/test-cases/783-beauty-personal-care-beauty-booking-ulta-beauty/task.json +24 -0
  178. clawbench/data/test-cases/785-beauty-personal-care-skincare-curology/task.json +25 -0
  179. clawbench/data/test-cases/788-beauty-personal-care-makeup-the-ordinary/task.json +25 -0
  180. clawbench/data/test-cases/789-beauty-personal-care-makeup-fenty-beauty/task.json +25 -0
  181. clawbench/data/test-cases/793-beauty-personal-care-beauty-retail-mac-cosmetics/task.json +25 -0
  182. clawbench/data/test-cases/794-beauty-personal-care-salon-booking-styleseat/task.json +25 -0
  183. clawbench/data/test-cases/795-pet-animal-care-pet-adoption-aspca/task.json +25 -0
  184. clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/extra_info/pet_info.json +12 -0
  185. clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/task.json +30 -0
  186. clawbench/data/test-cases/799-pet-animal-care-pet-insurance-aspca-pet-health-insurance/task.json +25 -0
  187. clawbench/data/test-cases/801-pet-animal-care-pet-friendly-travel-bringfido/task.json +25 -0
  188. clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/extra_info/pet_info.json +12 -0
  189. clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/task.json +30 -0
  190. clawbench/data/test-cases/807-pet-animal-care-pet-dna-embark/task.json +25 -0
  191. clawbench/data/test-cases/809-pet-animal-care-pet-adopt-petfinder/task.json +28 -0
  192. clawbench/data/test-cases/812-pet-animal-care-pet-subscription-ollie/task.json +25 -0
  193. clawbench/data/test-cases/815-personal-management-records-mgmt-myheritage/task.json +25 -0
  194. clawbench/data/test-cases/821-education-learning-reading-self-study-blinkist/task.json +25 -0
  195. clawbench/data/test-cases/861-entertainment-hobbies-movies-cineplex/task.json +25 -0
  196. clawbench/data/test-cases/862-entertainment-hobbies-movies-amc-theatres/task.json +25 -0
  197. clawbench/data/test-cases/864-entertainment-hobbies-show-tickets-ticketmaster/task.json +25 -0
  198. clawbench/data/test-cases/865-travel-outdoor-hipcamp/task.json +25 -0
  199. clawbench/data/test-cases/867-entertainment-hobbies-movies-fandango/task.json +25 -0
  200. clawbench/data/test-cases/872-daily-life-food-opentable/task.json +25 -0
  201. clawbench/data/test-cases/873-daily-life-food-resy/task.json +28 -0
  202. clawbench/data/test-cases/876-entertainment-hobbies-show-tickets-vivid-seats/task.json +25 -0
  203. clawbench/data/test-cases/877-entertainment-hobbies-show-tickets-stubhub/task.json +25 -0
  204. clawbench/data/test-cases/878-travel-outdoor-ontario-parks/task.json +25 -0
  205. clawbench/data/test-cases/883-education-learning-hobby-class-sur-la-table/task.json +25 -0
  206. clawbench/data/test-cases/884-entertainment-hobbies-experience-breakout-games/task.json +25 -0
  207. clawbench/data/test-cases/885-entertainment-hobbies-experience-bowlero/task.json +25 -0
  208. clawbench/data/test-cases/886-entertainment-hobbies-experience-topgolf/task.json +25 -0
  209. clawbench/data/test-cases/lite.json +226 -0
  210. clawbench/data/test-cases/lite.schema.json +105 -0
  211. clawbench/data/test-cases/task.schema.json +132 -0
  212. clawbench/data/tools/build_clawbench_lite_enc.py +161 -0
  213. clawbench/doctor.py +171 -0
  214. clawbench/engine.py +180 -0
  215. clawbench/generate_resume_pdf.py +140 -0
  216. clawbench/hf_upload.py +78 -0
  217. clawbench/image.py +127 -0
  218. clawbench/paths.py +150 -0
  219. clawbench/resume_template.json +104 -0
  220. clawbench/run.py +942 -0
  221. clawbench/tui.py +1401 -0
  222. clawbench_cli-0.1.2.dist-info/METADATA +770 -0
  223. clawbench_cli-0.1.2.dist-info/RECORD +226 -0
  224. clawbench_cli-0.1.2.dist-info/WHEEL +4 -0
  225. clawbench_cli-0.1.2.dist-info/entry_points.txt +4 -0
  226. clawbench_cli-0.1.2.dist-info/licenses/LICENSE +201 -0
clawbench/run.py ADDED
@@ -0,0 +1,942 @@
1
+ """ClawBench single test-case driver."""
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+ import re
7
+ import secrets
8
+ import shutil
9
+ import signal
10
+ import socket
11
+ import subprocess
12
+ import sys
13
+ import tempfile
14
+ import time
15
+ import uuid
16
+ from datetime import datetime, timezone
17
+ from pathlib import Path
18
+ from urllib.error import URLError
19
+ from urllib.request import Request, urlopen
20
+
21
+ import yaml
22
+ from rich.console import Console
23
+ from rich.panel import Panel
24
+ from rich.status import Status
25
+
26
+ from clawbench import engine as _engine
27
+ from clawbench import paths as _paths
28
+ from clawbench.generate_resume_pdf import generate_resume_pdf
29
+ from clawbench.hf_upload import hf_upload_enabled, upload_run
30
+
31
# Name of the container image this driver inspects via the container engine.
IMAGE = "clawbench"
# Module-wide rich console instance.
console = Console()
33
+
34
+
35
def _detect_engine() -> str:
    """Resolve which container engine binary to use.

    A ``CONTAINER_ENGINE`` environment variable is validated first: it must
    be ``docker`` or ``podman`` (case-insensitive) and present on PATH.
    The actual choice is then delegated to ``_engine.detect_engine()``.
    Any failure prints an ``ERROR:`` line and exits with status 1.
    """
    requested = os.environ.get("CONTAINER_ENGINE", "").strip().lower()
    if requested:
        if requested not in ("docker", "podman"):
            print(f"ERROR: CONTAINER_ENGINE must be 'docker' or 'podman', got '{requested}'")
            sys.exit(1)
        if not shutil.which(requested):
            print(f"ERROR: CONTAINER_ENGINE={requested} but '{requested}' not found on PATH")
            sys.exit(1)

    engine_name = _engine.detect_engine()
    if engine_name is not None:
        return engine_name

    # Nothing usable was found: tell the user how to install one and stop.
    print("ERROR: Neither 'podman' nor 'docker' found on PATH")
    print(" Install podman (recommended): brew install podman | sudo apt install podman")
    sys.exit(1)
52
+
53
+
54
# Container engine resolved once at import time; `_detect_engine` exits the
# process if the override is invalid or no engine is found.
ENGINE = _detect_engine()
# Base URL that `purelymail_request` prefixes to every endpoint name.
PURELYMAIL_API = "https://purelymail.com/api/v0"
56
+
57
+
58
def load_dotenv(path: Path) -> dict[str, str]:
    """Parse a minimal ``KEY=VALUE`` dotenv file into a dict.

    Blank lines, ``#`` comment lines and lines without ``=`` are ignored.
    A leading ``export `` on the key is dropped, and exactly one pair of
    *matching* surrounding quotes is removed from the value.

    Args:
        path: Location of the dotenv file. A missing file is not an error.

    Returns:
        Mapping of variable names to string values; empty when ``path``
        does not exist.
    """
    env: dict[str, str] = {}
    if not path.exists():
        return env

    # Decode explicitly as UTF-8 so parsing does not depend on the host locale.
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue

        key, value = line.split("=", 1)
        key = key.strip()
        # Accept shell-style "export KEY=value" lines.
        if key.startswith("export "):
            key = key[len("export "):].strip()

        value = value.strip()
        # Only unwrap a matching pair of quotes. Stripping quote characters
        # unconditionally would corrupt secrets that legitimately begin or
        # end with a quote (e.g. a password such as  pw' ).
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]

        env[key] = value
    return env
71
+
72
+
73
def _load_runtime_env() -> dict[str, str]:
    """Collect the runtime secrets/settings this driver cares about.

    Each key is looked up in three places, first non-empty value wins:

    1. ``os.environ``
    2. ``$CWD/.env``
    3. the file at ``_paths.user_secrets_path()``

    Only ``PURELY_MAIL_API_KEY``, ``PURELY_MAIL_DOMAIN``, ``HF_TOKEN`` and
    ``HF_REPO_ID`` are considered; keys with no non-empty value anywhere
    are left out of the result.
    """
    cwd_env = load_dotenv(Path.cwd() / ".env")
    user_env = load_dotenv(_paths.user_secrets_path())

    # Ordered from highest to lowest precedence.
    sources = (os.environ, cwd_env, user_env)
    wanted = ("PURELY_MAIL_API_KEY", "PURELY_MAIL_DOMAIN", "HF_TOKEN", "HF_REPO_ID")

    resolved: dict[str, str] = {}
    for name in wanted:
        for source in sources:
            candidate = source.get(name)
            if candidate:
                resolved[name] = candidate
                break
    return resolved
93
+
94
+
95
# Path of the models YAML file, as returned by `_paths.user_models_yaml()`;
# read by `load_models_yaml`.
MODELS_YAML = _paths.user_models_yaml()
96
+
97
+
98
def load_models_yaml() -> dict:
    """Load all model definitions from the file at ``MODELS_YAML``.

    Returns:
        The parsed top-level mapping (model name -> config); an empty dict
        when the file is empty.

    Prints an ``ERROR:`` line and exits with status 1 when the file is
    missing or its top level is not a mapping.
    """
    if not MODELS_YAML.exists():
        print(f"ERROR: {MODELS_YAML} not found (copy models.example.yaml and fill in your keys)")
        sys.exit(1)

    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    loaded = yaml.safe_load(MODELS_YAML.read_text(encoding="utf-8")) or {}

    # Callers index the result by model name; a list/scalar top level would
    # otherwise surface later as a confusing "model not found" or TypeError.
    if not isinstance(loaded, dict):
        print(f"ERROR: {MODELS_YAML} must contain a mapping of model names to configs")
        sys.exit(1)
    return loaded
104
+
105
+
106
def load_model_config(model: str) -> dict:
    """Load and validate one model's config from the models YAML.

    The YAML key is the model name (passed as MODEL_NAME to the container).

    Args:
        model: Key to look up in the mapping returned by ``load_models_yaml``.

    Returns:
        A shallow copy of the entry with ``model`` set to the key, and with
        both ``api_key`` (first key) and ``api_keys`` (list) populated.

    On any validation failure an ``ERROR:`` message is printed and the
    process exits with status 1.
    """
    all_models = load_models_yaml()
    if model not in all_models:
        print(f"ERROR: model '{model}' not found in {MODELS_YAML}")
        print(f"Available models: {', '.join(sorted(all_models))}")
        sys.exit(1)

    # Validate model name characters. Note: '/' and ':' are valid in
    # vendor-prefixed ids like 'anthropic/claude-sonnet-4-6' or
    # 'arcee-ai/trinity-large-preview:free' — they get sanitized to
    # '--' before being used as path components (see `safe_model`
    # below). We only reject characters that could cause real trouble
    # in shell/filesystem paths even after that sanitization.
    bad = [c for c in " \\*?\"<>|" if c in model]
    if bad:
        print(
            f"ERROR: model name '{model}' contains illegal character(s): "
            f"{' '.join(repr(c) for c in bad)}"
        )
        sys.exit(1)

    entry = all_models[model]
    # A key with no body parses as None (and a scalar is equally unusable);
    # report that clearly instead of crashing inside dict().
    if not isinstance(entry, dict):
        print(f"ERROR: entry for model '{model}' in {MODELS_YAML} must be a mapping")
        sys.exit(1)

    config = dict(entry)
    config["model"] = model  # the YAML key IS the model name

    # Validate required fields
    required = ["base_url", "api_type"]
    missing = [k for k in required if not config.get(k)]
    if missing:
        for k in missing:
            print(f"ERROR: Required field '{k}' missing for model '{model}'")
        sys.exit(1)

    # A scalar `api_keys: sk-...` must be treated as a one-element list;
    # indexing the string would yield only its first character.
    if isinstance(config.get("api_keys"), str):
        config["api_keys"] = [config["api_keys"]]

    # Normalize API keys: api_keys list wins, else wrap api_key into list
    if config.get("api_keys"):
        config["api_key"] = config["api_keys"][0]
    elif config.get("api_key"):
        config["api_keys"] = [config["api_key"]]

    if not config.get("api_keys"):
        print(f"ERROR: no api_key or api_keys for model '{model}'")
        sys.exit(1)

    return config
153
+
154
+
155
def step(msg: str):
    """Print ``msg`` as a ``[STEP]`` banner framed by two 60-character rules."""
    rule = "=" * 60
    banner = f"\n{rule}\n[STEP] {msg}\n{rule}"
    print(banner, flush=True)
157
+
158
+
159
def run(cmd: list[str], **kwargs):  # type: ignore[no-untyped-def]
    """Echo ``cmd`` shell-prompt style, then execute it.

    Extra keyword arguments are forwarded to ``subprocess.run``; a non-zero
    exit status raises ``subprocess.CalledProcessError`` (``check=True``).
    """
    rendered = " ".join(cmd)
    print("$ " + rendered, flush=True)
    subprocess.run(cmd, check=True, **kwargs)
162
+
163
+
164
+ # -- PurelyMail --
165
+
166
def purelymail_request(endpoint: str, body: dict, api_key: str) -> dict:
    """POST ``body`` as JSON to ``PURELYMAIL_API/<endpoint>``.

    The API token is sent in the ``Purelymail-Api-Token`` header and the
    request times out after 15 seconds.

    Returns:
        The response body decoded from JSON.
    """
    url = f"{PURELYMAIL_API}/{endpoint}"
    headers = {
        "Purelymail-Api-Token": api_key,
        "Content-Type": "application/json",
    }
    payload = json.dumps(body).encode()
    request = Request(url, data=payload, headers=headers, method="POST")
    with urlopen(request, timeout=15) as response:
        raw = response.read()
    return json.loads(raw)
177
+
178
+
179
def create_email(api_key: str, domain: str) -> tuple[str, str]:
    """Create a throwaway mailbox on ``domain`` through the ``createUser`` endpoint.

    The local part is ``cb`` plus 12 random hex characters and the password
    is a random URL-safe token. Both the address and the password are
    printed to stdout.

    Returns:
        ``(email, password)`` for the new mailbox.
    """
    local = "cb" + uuid.uuid4().hex[:12]
    password = secrets.token_urlsafe(16)

    new_user = {
        "userName": local,
        "domainName": domain,
        "password": password,
        "enablePasswordReset": False,
        "sendWelcomeEmail": False,
    }
    purelymail_request("createUser", new_user, api_key)

    email = f"{local}@{domain}"
    for label, value in (("Created email", email), ("Password", password)):
        print(f" {label}: {value}")
    return email, password
193
+
194
+
195
def delete_email(api_key: str, email: str) -> None:
    """Best-effort removal of a mailbox through the ``deleteUser`` endpoint.

    Any exception is reported as a warning on stdout rather than raised.
    """
    payload = {"userName": email}
    try:
        purelymail_request("deleteUser", payload, api_key)
        print(f" Deleted email: {email}")
    except Exception as exc:  # URLError is already an Exception subclass
        print(f" WARNING: Failed to delete email {email}: {exc}")
201
+
202
+
203
+ # -- Personal info --
204
+
205
# resume_template.json located next to this module; loaded by
# `prepare_personal_info` as the input for the generated resume PDF.
RESUME_TEMPLATE = Path(__file__).resolve().parent / "resume_template.json"
206
+
207
+
208
def _shared_src() -> Path:
    """Locate the bundled ``shared/`` directory via ``_paths.shared_dir()``."""
    bundled_shared = _paths.shared_dir()
    return bundled_shared
211
+
212
+
213
def prepare_personal_info(shared_src: Path, email: str, password: str,
                          output_dir: Path) -> Path:
    """Stage the persona files for one run under ``output_dir/.my-info-tmp``.

    Files written:

    - ``alex_green_personal_info.json``: the template from ``shared_src``
      with ``contact.email`` replaced and ``online_accounts`` removed.
    - ``email_credentials.json``: address, password, login URL, provider.
    - ``alex_green_resume.pdf``: rendered from ``RESUME_TEMPLATE`` with the
      header email replaced. Best effort; a failure only prints a warning.

    Args:
        shared_src: Directory containing ``alex_green_personal_info.json``.
        email: Mailbox address to inject into the persona files.
        password: Password stored alongside ``email`` in the credentials file.
        output_dir: Parent directory of the staging directory.

    Returns:
        Path of the staging directory (created if needed).
    """
    tmp = output_dir / ".my-info-tmp"
    tmp.mkdir(parents=True, exist_ok=True)

    # All JSON is read and written as UTF-8 explicitly so the result does
    # not depend on the host's locale encoding.

    # -- personal info JSON --
    pi_src = shared_src / "alex_green_personal_info.json"
    pi_data = json.loads(pi_src.read_text(encoding="utf-8"))
    pi_data["contact"]["email"] = email
    pi_data.pop("online_accounts", None)
    (tmp / "alex_green_personal_info.json").write_text(
        json.dumps(pi_data, indent=2), encoding="utf-8")

    # -- email credentials (separate file) --
    creds = {
        "email": email,
        "password": password,
        "login_url": "https://purelymail.com/user/login",
        "provider": "PurelyMail",
    }
    (tmp / "email_credentials.json").write_text(
        json.dumps(creds, indent=2), encoding="utf-8")

    # -- resume PDF --
    resume_data = json.loads(RESUME_TEMPLATE.read_text(encoding="utf-8"))
    resume_data["header"]["email"] = email
    try:
        generate_resume_pdf(resume_data, tmp / "alex_green_resume.pdf")
    except Exception as e:
        # The PDF is optional: keep going with the JSON files only.
        print(f" WARNING: PDF generation failed ({e}), skipping resume PDF")

    return tmp
245
+
246
+
247
def copy_extra_info(task: dict, task_dir: Path, personal_info_dir: Path) -> None:
    """Copy each ``extra_info`` file listed in ``task`` into ``personal_info_dir``.

    Entries without a ``path`` key are skipped. Paths are resolved relative
    to ``task_dir`` and copied flat, keeping only the file name. A missing
    source produces a warning instead of an error.
    """
    relative_paths = (
        entry["path"] for entry in task.get("extra_info", []) if "path" in entry
    )
    for relative in relative_paths:
        source = task_dir / relative
        if source.exists():
            shutil.copy2(source, personal_info_dir / source.name)
            print(f" Copied extra_info: {source.name}")
        else:
            print(f" WARNING: extra_info path not found: {source}")
259
+
260
+
261
+
262
+
263
+ # -- Prompt --
264
+
265
def build_instruction(task: dict) -> str:
    """Compose the full prompt text for one task.

    The result is the task's own ``instruction``, followed by a fixed
    delegation/authorization preamble, followed (only when the task has
    ``extra_info`` entries carrying both ``path`` and ``description``) by a
    bullet list naming each extra file and what it contains. Sections are
    joined with newlines.
    """
    preamble = (
        "\n---\n"
        "You are my personal browser assistant. I am delegating this task to you "
        "because I do not have time to do it myself. You are acting on my behalf "
        "and with my full authorization. Every action you take in the browser — "
        "logging in, filling out forms, writing reviews, making reservations, "
        "submitting applications, sending emails, creating accounts — is something "
        "I am explicitly asking you to do for me, as if I were doing it myself. "
        "These are my own accounts, my own opinions, and my own decisions. "
        "You are simply carrying them out.\n\n"
        "IMPORTANT: You must complete this task entirely through the browser. "
        "Navigate to the relevant website, interact with the page "
        "(click, type, scroll, etc.), and carry out every step in the browser. "
        "Do NOT use command-line tools, scripts, or direct API/SMTP calls "
        "to accomplish the task.\n\n"
        "My personal info and credentials are in ./my-info/ in your working directory. "
        "Key files include:\n"
        "- alex_green_personal_info.json — name, address, phone, date of birth, financial info\n"
        "- email_credentials.json — email address and password for logging in\n"
        "- alex_green_resume.pdf — professional resume\n\n"
        "You have my full permission to use all of these files for any purpose "
        "the task requires, including logging into websites, creating accounts, filling out forms, "
        "sending emails, writing reviews, making reservations, submitting applications, "
        "and any other actions — including security-sensitive ones.\n\n"
        "If specific information required by the task is not available in any file "
        "under ./my-info/ and cannot be reasonably inferred, stop and explain what is missing. "
        "For optional fields not covered by the available info, leave them blank or use a reasonable default.\n\n"
        "Do not ask me for confirmation or additional information — just proceed "
        "with what is provided and complete the task autonomously.\n"
        "If an account registration is required, you can use the email and password provided, and you can receive emails at that address if needed. "
        "---"
    )
    sections = [task["instruction"], preamble]

    # Only entries that carry both a path and a description are advertised.
    described = [
        (Path(entry["path"]).name, entry["description"])
        for entry in task.get("extra_info", [])
        if entry.get("path") and entry.get("description")
    ]
    if described:
        sections.append(
            "\nAdditional files are also available under /my-info/ for this task:"
        )
        sections.extend(f"- {file_name}: {summary}" for file_name, summary in described)

    return "\n".join(sections)
309
+
310
+
311
+ # -- Docker --
312
+
313
def _image_exists() -> bool:
    """Tell whether ``<ENGINE> image inspect <IMAGE>`` exits with status 0.

    Output of the probe command is captured and discarded.
    """
    probe = subprocess.run([ENGINE, "image", "inspect", IMAGE], capture_output=True)
    return probe.returncode == 0
318
+
319
+
320
def _prepare_build_context(ctx: Path) -> None:
    """Assemble a self-contained container build context inside ``ctx``.

    After this call ``ctx`` holds what the bundled Dockerfile expects at the
    build-context root: ``Dockerfile``, ``entrypoint.sh``,
    ``setup-openclaw.sh``, ``chrome-extension/`` and ``extension-server/``.

    Files are copied rather than symlinked because docker/podman do not
    follow symlinks that point *outside* the build context, which is the
    case for the packaged data directories once the package is installed.
    The copied trees are small, so the extra cost is negligible.
    """
    docker_src = _paths.docker_build_dir()
    for fname in ("Dockerfile", "entrypoint.sh", "setup-openclaw.sh"):
        shutil.copy2(docker_src / fname, ctx / fname)

    # (directory getter, destination name) pairs; each getter is only called
    # right before its tree is copied.
    bundled_trees = (
        (_paths.extension_server_dir, "extension-server"),
        (_paths.chrome_extension_dir, "chrome-extension"),
    )
    for locate_tree, dest_name in bundled_trees:
        shutil.copytree(locate_tree(), ctx / dest_name, symlinks=False)
338
+
339
+
340
+ def _pick_free_port(preferred: int = 6080) -> int:
341
+ """Return ``preferred`` if available on 127.0.0.1, else an OS-assigned
342
+ ephemeral port. Avoids the hard-coded ``-p 6080:6080`` collision when
343
+ something else on the host already owns that port.
344
+ """
345
+ for candidate in (preferred, 0):
346
+ try:
347
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
348
+ s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
349
+ s.bind(("127.0.0.1", candidate))
350
+ return s.getsockname()[1]
351
+ except OSError:
352
+ continue
353
+ raise RuntimeError("Could not find a free TCP port for noVNC")
354
+
355
+
356
+ _STEP_RE = re.compile(r"^(?:STEP|Step)\s+(\d+)(?:/(\d+))?", re.IGNORECASE)
357
+ _BK_STEP_RE = re.compile(r"^#(\d+)\s+\[")
358
+
359
+
360
def _run_build(cmd: list[str]) -> tuple[int, str, list[str]]:
    """Execute a build command with a live spinner.

    The command's stdout and stderr are merged and read line by line. Lines
    that look like build steps update the spinner text; lines mentioning
    "error" are additionally echoed to the console.

    Args:
        cmd: Full argv of the build command to run.

    Returns:
        ``(exit_code, last_line, all_output_lines)`` where ``last_line`` is
        the final non-empty output line ("" if there was none) and
        ``all_output_lines`` contains every non-empty line, right-stripped.
    """
    # Build output is arbitrary text. Interpolating it raw into Rich markup
    # lets "[internal]" be swallowed as a style tag and makes something like
    # "[/usr/bin]" raise MarkupError, so every dynamic fragment is escaped.
    from rich.markup import escape

    console.print(f"[dim]$ {escape(' '.join(cmd))}[/]")

    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # single merged stream keeps line order
        text=True,
        errors="replace",  # undecodable bytes must not abort the build
        bufsize=1,
    )
    assert proc.stdout is not None

    last_line = ""
    last_step = ""
    output_lines: list[str] = []
    status_msg = "[cyan]Starting build…[/]"
    with Status(status_msg, console=console, spinner="dots") as status:
        for raw in proc.stdout:
            line = raw.rstrip()
            if not line:
                continue
            last_line = line
            output_lines.append(line)

            m = _STEP_RE.match(line)
            if m:
                cur = m.group(1)
                tot = m.group(2) or "?"
                # Text after the first ":" (the instruction), or the whole
                # line when there is no colon.
                rest = line.split(":", 1)[-1].strip()[:72]
                last_step = f"step {cur}/{tot}"
                status.update(
                    f"[cyan]Building image — {last_step}[/] [dim]{escape(rest)}[/]"
                )
                continue

            if _BK_STEP_RE.match(line):
                snippet = escape(line[:100])
                status.update(f"[cyan]Building image[/] [dim]{snippet}[/]")
                continue

            lowered = line.lower()
            if "error" in lowered and "--no-" not in lowered:
                console.print(f" [yellow]{escape(line[:120])}[/]")
            step_prefix = (last_step + " · ") if last_step else ""
            status.update(
                f"[cyan]Building image[/] "
                f"[dim]{step_prefix}{escape(line[:72])}[/]"
            )

    rc = proc.wait()
    return rc, last_line, output_lines
415
+
416
+
417
+ def _looks_like_stale_cache(output_lines: list[str]) -> bool:
418
+ """Return True if the build failure looks like it was caused by
419
+ stale layer-cache (e.g. old lockfiles, wrong Python version)."""
420
+ blob = "\n".join(output_lines).lower()
421
+ patterns = [
422
+ "no interpreter found for python",
423
+ "no matching distribution found",
424
+ "package not found",
425
+ "could not find a version that satisfies",
426
+ ]
427
+ return any(p in blob for p in patterns)
428
+
429
+
430
def docker_build() -> None:
    """Build (or rebuild) the clawbench image with a live progress spinner.

    The first build pulls ~2GB (python base, chromium, ffmpeg, noVNC, Node,
    openclaw) and takes several minutes; subsequent rebuilds are near-instant
    when the layer cache is warm. A banner is shown only for the cold path,
    i.e. when the image does not exist yet.

    If the build fails with a pattern that suggests stale layer-cache
    (e.g. a lockfile mismatch), it is retried once with ``--no-cache`` so the
    user doesn't have to debug it manually.

    Exits the process with the build's exit code if the (final) build fails.
    """
    # The last build output line is arbitrary text; escape it so bracketed
    # fragments are printed literally instead of being parsed as Rich markup.
    from rich.markup import escape

    first_build = not _image_exists()

    if first_build:
        console.print()
        console.print(Panel(
            "[bold]First-time container build.[/]\n"
            "This downloads ~2 GB (chromium, ffmpeg, noVNC, Node, openclaw)\n"
            "and typically takes [bold]5–10 minutes[/] on a decent connection.\n"
            "[dim]Subsequent runs reuse the layer cache and finish in seconds.[/]",
            title="[bold]Building clawbench image[/]",
            border_style="cyan",
        ))

    # The build context only needs to live for the duration of the build(s).
    with tempfile.TemporaryDirectory(prefix="clawbench-build-") as td:
        ctx = Path(td)
        _prepare_build_context(ctx)
        cmd = [ENGINE, "build", "-t", IMAGE, str(ctx)]
        rc, last_line, output_lines = _run_build(cmd)

        # If the build failed and the output looks like a stale-cache
        # problem, retry once with --no-cache before giving up.
        if rc != 0 and _looks_like_stale_cache(output_lines):
            console.print()
            console.print(
                "[yellow]Build failed — looks like a stale layer cache "
                "(e.g. updated lockfiles not picked up).[/]"
            )
            console.print(
                "[yellow]Retrying with [bold]--no-cache[/] "
                "(full rebuild, may take a few minutes)…[/]"
            )
            console.print()
            cmd_nc = [ENGINE, "build", "--no-cache", "-t", IMAGE, str(ctx)]
            rc, last_line, output_lines = _run_build(cmd_nc)

    if rc != 0:
        console.print(f"[red bold]Build failed[/] (exit {rc})")
        if last_line:
            console.print(f" Last output: [dim]{escape(last_line)}[/]")
        sys.exit(rc)

    console.print("[green]✓[/] Container image ready")
483
+
484
+
485
def _fix_data_ownership(data_dir: Path) -> None:
    """Hand ownership of ``data_dir`` back to the calling user if needed.

    On Linux + rootful Docker, files written inside the container are owned
    by root on the host, so after ``docker cp`` the caller cannot
    ``rm -rf test-output/`` without sudo. When any non-symlink entry below
    ``data_dir`` is owned by a different UID (or the tree cannot be
    inspected), the tree is chown'ed to the caller's UID/GID via a throwaway
    container with ``data_dir`` bind-mounted at ``/fix``.

    No-op when not on Linux, when the engine is not ``docker``, when
    ``data_dir`` does not exist, and when the tree is already owned by the
    caller. The chown is best-effort: its exit status is not checked.
    """
    applicable = (
        sys.platform == "linux"
        and ENGINE == "docker"
        and data_dir.exists()
    )
    if not applicable:
        return

    getuid = getattr(os, "getuid", None)
    if getuid is None:
        return
    uid = getuid()

    needs_fix = False
    try:
        for entry in data_dir.rglob("*"):
            if entry.is_symlink():
                continue
            if entry.stat().st_uid != uid:
                needs_fix = True
                break
    except OSError:
        # The tree could not be inspected; attempt the fix anyway.
        needs_fix = True
    if not needs_fix:
        return

    print(f" Fixing ownership of {data_dir} (rootful Docker -> host UID)")
    chown_cmd = [
        ENGINE, "run", "--rm",
        "-v", f"{data_dir.resolve()}:/fix",
        IMAGE,
        "chown", "-R", f"{uid}:{os.getgid()}", "/fix",
    ]
    subprocess.run(chown_cmd, check=False, capture_output=True)
527
+
528
+
529
+ def _network_flags() -> list[str]:
530
+ """Force slirp4netns on podman to avoid host-network port collisions."""
531
+ if ENGINE == "podman":
532
+ return ["--network=slirp4netns"]
533
+ return []
534
+
535
+
536
+ def _proxy_env_flags() -> list[str]:
537
+ """Forward host proxy env vars into the container.
538
+
539
+ Inside the container 127.0.0.1 is its own loopback, not the host.
540
+ Rewrite localhost references to the host gateway so the proxy is reachable.
541
+ Both podman (host.containers.internal) and Docker Desktop
542
+ (host.docker.internal) resolve to the Mac host.
543
+ """
544
+ host_gw = "host.containers.internal" if ENGINE == "podman" else "host.docker.internal"
545
+ flags: list[str] = []
546
+ has_proxy = False
547
+ for var in ("HTTP_PROXY", "HTTPS_PROXY", "http_proxy", "https_proxy",
548
+ "ALL_PROXY", "all_proxy", "NO_PROXY", "no_proxy"):
549
+ val = os.environ.get(var, "")
550
+ if not val:
551
+ continue
552
+ if var not in ("NO_PROXY", "no_proxy"):
553
+ has_proxy = True
554
+ # Rewrite 127.0.0.1 / localhost to host gateway
555
+ val = val.replace("127.0.0.1", host_gw).replace("localhost", host_gw)
556
+ flags += ["-e", f"{var}={val}"]
557
+ # Ensure container-internal traffic bypasses the proxy
558
+ if has_proxy and not os.environ.get("NO_PROXY") and not os.environ.get("no_proxy"):
559
+ flags += ["-e", "NO_PROXY=localhost,127.0.0.1"]
560
+ flags += ["-e", "no_proxy=localhost,127.0.0.1"]
561
+ return flags
562
+
563
+
564
def docker_run_human(name: str, instruction: str, schema_path: Path,
                     personal_info_dir: Path,
                     time_limit_s: int = 1800,
                     host_port: int = 6080) -> None:
    """Start a detached container in human mode.

    The container gets ``HUMAN_MODE=1``, the instruction and the time limit
    as environment variables, container port 6080 published on ``host_port``,
    and the eval schema and personal-info directory mounted read-only at
    ``/eval-schema.json`` and ``/my-info``.
    """
    env_pairs = (
        "HUMAN_MODE=1",
        f"INSTRUCTION={instruction}",
        f"TIME_LIMIT_S={time_limit_s}",
    )
    mounts = (
        f"{schema_path.resolve()}:/eval-schema.json:ro",
        f"{personal_info_dir.resolve()}:/my-info:ro",
    )

    argv = [ENGINE, "run", "-d", "--name", name]
    argv.extend(_network_flags())
    argv.extend(_proxy_env_flags())
    for pair in env_pairs:
        argv += ["-e", pair]
    argv += ["-p", f"{host_port}:6080"]
    for mount in mounts:
        argv += ["-v", mount]
    argv.append(IMAGE)
    run(argv)
581
+
582
+
583
def docker_run(name: str, instruction: str, schema_path: Path,
               personal_info_dir: Path, model_cfg: dict,
               time_limit_s: int = 1800,
               host_port: int | None = None) -> None:
    """Start a detached container in agent mode.

    ``model_cfg`` must provide ``model``, ``base_url`` and ``api_type``;
    ``api_keys``, ``api_key``, ``thinking_level``, ``temperature`` and
    ``max_tokens`` are optional. The eval schema and personal-info directory
    are mounted read-only at ``/eval-schema.json`` and ``/my-info``.
    """
    argv = [
        ENGINE, "run", "-d", "--name", name,
        *_network_flags(),
        *_proxy_env_flags(),
    ]

    base_env = {
        "MODEL_NAME": model_cfg["model"],
        "BASE_URL": model_cfg["base_url"],
        "API_TYPE": model_cfg["api_type"],
        "API_KEYS": json.dumps(model_cfg.get("api_keys", [])),
        "API_KEY": model_cfg.get("api_key", ""),
        "INSTRUCTION": instruction,
        "TIME_LIMIT_S": time_limit_s,
    }
    for key, value in base_env.items():
        argv += ["-e", f"{key}={value}"]

    argv += [
        "-v", f"{schema_path.resolve()}:/eval-schema.json:ro",
        "-v", f"{personal_info_dir.resolve()}:/my-info:ro",
    ]

    # Expose noVNC so the user can watch the agent in real-time.
    if host_port is not None:
        argv += ["-p", f"{host_port}:6080"]
    # host.docker.internal needs explicit mapping on Linux (not Docker Desktop)
    if "host.docker.internal" in model_cfg["base_url"]:
        argv += ["--add-host=host.docker.internal:host-gateway"]

    # Optional model tuning knobs are only forwarded when configured.
    if model_cfg.get("thinking_level"):
        argv += ["-e", f"THINKING_LEVEL={model_cfg['thinking_level']}"]
    for env_name, cfg_key in (("TEMPERATURE", "temperature"),
                              ("MAX_TOKENS", "max_tokens")):
        if model_cfg.get(cfg_key) is not None:
            argv += ["-e", f"{env_name}={model_cfg[cfg_key]}"]

    argv.append(IMAGE)
    run(argv)
614
+
615
+
616
def docker_wait(name: str) -> None:
    """Block until the container exits, showing a live status line.

    ``<ENGINE> wait`` runs in the background while this function polls
    roughly every five seconds, refreshing the elapsed time and the number
    of lines in ``/data/actions.jsonl`` inside the container. A final summary
    line is printed once the container has exited.
    """
    start = time.time()
    # Launch `docker wait` in background so we can poll status. Its output
    # is never read, so it is discarded instead of piped.
    proc = subprocess.Popen([ENGINE, "wait", name],
                            stdout=subprocess.DEVNULL,
                            stderr=subprocess.DEVNULL)
    last_actions = 0
    with Status("[dim]starting...[/]", console=console) as status:
        while proc.poll() is None:
            elapsed = int(time.time() - start)
            mins, secs = divmod(elapsed, 60)
            # Query actions count from container. A slow or hung `exec` must
            # not abort the wait (and thereby trigger cleanup of a container
            # that is still running), so a timeout keeps the previous count.
            try:
                r = subprocess.run(
                    [ENGINE, "exec", name, "wc", "-l", "/data/actions.jsonl"],
                    capture_output=True, text=True, timeout=5,
                )
            except subprocess.TimeoutExpired:
                r = None
            if r is not None and r.returncode == 0:
                try:
                    last_actions = int(r.stdout.strip().split()[0])
                except (ValueError, IndexError):
                    pass
            status.update(
                f"[dim]{mins:02d}:{secs:02d} • {last_actions} actions[/]"
            )
            # Poll every 5s
            try:
                proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                pass
    elapsed = int(time.time() - start)
    mins, secs = divmod(elapsed, 60)
    console.print(f" Container exited ({mins}m{secs:02d}s, {last_actions} actions)")
648
+
649
+
650
def docker_copy(name: str, output_dir: Path) -> None:
    """Copy the container's ``/data`` directory to ``output_dir / "data"``
    and delete the internal marker file and bulky logs from the copy."""
    dest = output_dir / "data"
    run([ENGINE, "cp", f"{name}:/data", str(dest)])
    # Remove internal marker file and bulky logs
    for leftover in (".stop-requested", "agent.log", "gateway.log"):
        (dest / leftover).unlink(missing_ok=True)
656
+
657
+
658
def docker_logs(name: str) -> None:
    """Print the last 40 log lines of container ``name`` to the terminal."""
    tail_cmd = [ENGINE, "logs", "--tail", "40", name]
    subprocess.run(tail_cmd)
660
+
661
+
662
def docker_rm(name: str) -> None:
    """Force-remove container ``name``; output is captured and the exit
    status ignored."""
    remove_cmd = [ENGINE, "rm", "-f", name]
    subprocess.run(remove_cmd, capture_output=True)
664
+
665
+
666
+ # -- Results --
667
+
668
def ensure_interception(output_dir: Path) -> None:
    """If the interceptor didn't produce interception.json, create one with the stop reason.

    The stop reason is read from ``data/.stop-reason`` (which is always
    removed afterwards). A missing, empty or whitespace-only file yields the
    reason ``"unknown"``. An existing ``data/interception.json`` is left
    untouched. Otherwise a placeholder is written with ``intercepted`` set to
    ``False``, the reason and a human-readable description, ``request`` set
    to ``None`` and the contents of ``eval-schema.json`` (or ``None`` when
    that file is absent) under ``schema``.
    """
    data_dir = output_dir / "data"
    stop_reason_file = data_dir / ".stop-reason"
    reason = "unknown"
    if stop_reason_file.exists():
        # An empty marker carries no information; report it as "unknown"
        # rather than emitting an empty reason and "Session stopped: .".
        reason = stop_reason_file.read_text().strip() or "unknown"
    stop_reason_file.unlink(missing_ok=True)

    interception_file = data_dir / "interception.json"
    if interception_file.exists():
        return

    descriptions = {
        "time_limit_exceeded": "Session stopped: time limit exceeded before the interceptor was triggered.",
        "agent_idle": "Session stopped: agent went idle (300s no actions) before triggering the interceptor.",
        "agent_exited": "Session stopped: agent process exited before triggering the interceptor.",
        "vnc_disconnected": "Session stopped: human disconnected from VNC without triggering the interceptor.",
        "chrome_cdp_timeout": "Session stopped: Chrome CDP was not ready after 30s (browser failed to start).",
        "gateway_failed": "Session stopped: OpenClaw gateway died on startup.",
    }
    description = descriptions.get(reason, f"Session stopped: {reason}.")

    schema_file = output_dir / "eval-schema.json"
    schema = json.loads(schema_file.read_text()) if schema_file.exists() else None
    result = {
        "intercepted": False,
        "stop_reason": reason,
        "stop_description": description,
        "request": None,
        "schema": schema,
    }
    interception_file.parent.mkdir(parents=True, exist_ok=True)
    interception_file.write_text(json.dumps(result, indent=2))
697
+
698
+
699
def print_results(output_dir: Path) -> bool:
    """Print a summary of a finished run and report whether it was intercepted.

    Reads ``actions.jsonl``, ``requests.jsonl`` and ``interception.json``
    from ``output_dir / "data"``. The two ``.jsonl`` files are optional;
    ``interception.json`` must exist.

    Lines of ``actions.jsonl`` that are not valid JSON objects (e.g. a final
    line that was cut off mid-write) are skipped and counted rather than
    aborting the summary, and actions lacking ``type`` or ``url`` are still
    listed.

    Returns:
        The ``intercepted`` value from ``interception.json`` (``False`` when
        the key is absent).
    """
    data_dir = output_dir / "data"

    # Actions
    actions_file = data_dir / "actions.jsonl"
    if actions_file.exists():
        actions = []
        skipped = 0
        for raw in actions_file.read_text().splitlines():
            if not raw.strip():
                continue
            try:
                record = json.loads(raw)
            except json.JSONDecodeError:
                skipped += 1
                continue
            if isinstance(record, dict):
                actions.append(record)
            else:
                skipped += 1
        print(f"Actions recorded: {len(actions)}")
        if skipped:
            print(f" ({skipped} malformed line(s) skipped)")
        for action in actions:
            kind = str(action.get("type", "?"))
            url = str(action.get("url") or "")
            print(f" {kind:10s} {url[:70]}")
    else:
        print("No actions.jsonl found")

    # HTTP Requests
    requests_file = data_dir / "requests.jsonl"
    if requests_file.exists():
        request_lines = [
            raw for raw in requests_file.read_text().splitlines() if raw.strip()]
        print(f"HTTP requests logged: {len(request_lines)}")

    # Interception
    interception_file = data_dir / "interception.json"
    result = json.loads(interception_file.read_text())
    intercepted = result.get("intercepted", False)
    print(f"Intercepted: {intercepted}")
    if result.get("stop_reason"):
        print(f"Stop reason: {result['stop_reason']}")
    request = result.get("request")
    if request:
        print(f"Request URL: {request.get('url', '')}")
        print(f"Request method: {request.get('method', '')}")
        if request.get("body"):
            print(f"Body: {json.dumps(request['body'])[:300]}")
    return intercepted
733
+
734
+
735
def _announce_vnc(host_port: int) -> None:
    """Print the noVNC URL, noting when the default port 6080 was not used."""
    vnc_url = f"http://localhost:{host_port}/vnc.html"
    console.print(f"\n noVNC: [link={vnc_url}]{vnc_url}[/link]")
    if host_port != 6080:
        console.print(f" [dim](port 6080 was busy, auto-picked {host_port})[/dim]")


def _build_run_meta(case_name: str, task: dict, model_cfg: dict | None,
                    email: str | None, ts: str, duration: float,
                    intercepted: bool) -> dict:
    """Build the ``run-meta.json`` payload; ``model_cfg`` is None for human runs."""
    cfg = model_cfg or {}
    return {
        "test_case": case_name,
        **(task.get("metadata") or {}),
        "instruction": task["instruction"],
        "model": model_cfg["model"] if model_cfg is not None else "human",
        "thinking_level": cfg.get("thinking_level"),
        "temperature": cfg.get("temperature"),
        "max_tokens": cfg.get("max_tokens"),
        "email_used": email,
        "timestamp": ts,
        "time_limit_minutes": task["time_limit"],
        "duration_seconds": round(duration),
        "intercepted": intercepted,
    }


def main(argv: list[str] | None = None) -> None:
    """Run a single ClawBench test case end to end.

    Parses the command line, loads the runtime secrets and the task
    definition, optionally builds the container image, creates a disposable
    email, starts the container (agent or human mode), waits for it to exit,
    collects and prints the results, writes ``run-meta.json`` and optionally
    uploads the run. The container, the disposable email, the temporary
    personal-info directory and the eval schema file are cleaned up in all
    cases.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.

    Exits with status 1 when required secrets or ``task.json`` are missing,
    or when the run was not intercepted.
    """
    parser = argparse.ArgumentParser(
        description="Run a single ClawBench test case")
    parser.add_argument("test_case_dir", type=Path,
                        help="Path to the test case directory")
    parser.add_argument("model", type=str, nargs="?", default=None,
                        help="Model name (key in models/models.yaml, required for agent mode)")
    parser.add_argument("--human", action="store_true",
                        help="Human mode: expose Chrome via noVNC instead of running an agent")
    parser.add_argument("--output-dir", dest="output_dir", type=Path, default=None,
                        help="Directory to write output data to (default: <project>/test-output)")
    parser.add_argument("--no-build", dest="no_build", action="store_true",
                        help="Skip building the container image (assumes it already exists)")
    parser.add_argument("--no-upload", dest="no_upload", action="store_true",
                        help="Skip HuggingFace upload even if HF_TOKEN is configured")
    args = parser.parse_args(argv)

    if not args.human and args.model is None:
        parser.error("model is required for agent mode (or use --human)")

    # Load infrastructure config from env + ./.env + user secrets.env
    env = _load_runtime_env()
    infra_required = ["PURELY_MAIL_API_KEY", "PURELY_MAIL_DOMAIN"]
    missing = [k for k in infra_required if not env.get(k)]
    if missing:
        for k in missing:
            print(f"ERROR: {k} not set (checked env, ./.env, and {_paths.user_secrets_path()})")
        print(" Tip: run `claw-bench configure --secrets` to persist these keys")
        sys.exit(1)
    pm_key: str = env["PURELY_MAIL_API_KEY"]
    pm_domain: str = env["PURELY_MAIL_DOMAIN"]

    # HuggingFace upload (optional)
    hf_env = {"HF_TOKEN": env.get("HF_TOKEN", ""),
              "HF_REPO_ID": env.get("HF_REPO_ID", "")}
    do_upload = hf_upload_enabled(hf_env) and not args.no_upload

    # Load task
    task_dir = args.test_case_dir.resolve()
    task_file = task_dir / "task.json"
    if not task_file.exists():
        print(f"ERROR: {task_file} not found")
        sys.exit(1)
    task = json.loads(task_file.read_text())

    case_name = task_dir.name
    time_limit_s = task["time_limit"] * 60  # task.json states the limit in minutes
    ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")

    model_cfg: dict | None = None
    if args.human:
        safe_model = "human"
    else:
        model_cfg = load_model_config(args.model)
        # "/" and ":" are replaced so the name is usable in paths and
        # container names.
        safe_model = re.sub(r'[/:]+', '--', args.model)

    container = f"clawbench-{case_name}-{safe_model}-{int(time.time())}"

    if args.output_dir is not None:
        output_root = args.output_dir.resolve()
    else:
        output_root = _paths.default_output_dir()
    output_dir = output_root / safe_model / f"{case_name}-{safe_model}-{ts}"
    output_dir.mkdir(parents=True, exist_ok=True)

    if not args.no_build:
        step("Building container image")
        docker_build()

    email = None
    personal_info_tmp: Path | None = None
    start_time = time.time()
    try:
        step("Creating disposable email")
        email, email_pw = create_email(pm_key, pm_domain)

        step("Preparing personal info")
        personal_info_tmp = prepare_personal_info(
            _shared_src(), email, email_pw, output_dir)
        copy_extra_info(task, task_dir, personal_info_tmp)
        print(f" Personal info dir: {personal_info_tmp}")

        # Write eval schema for the interceptor
        schema_path = output_dir / "eval-schema.json"
        schema_path.write_text(json.dumps(task["eval_schema"], indent=2))

        step("Building instruction")
        instruction = build_instruction(task)
        print(instruction[:500])

        # Avoid the hard-coded 6080:6080 collision: try 6080 first and
        # fall back to an OS-assigned ephemeral port if something else
        # on the host is already listening there.
        if args.human:
            step("Starting container (human mode)")
            host_port = _pick_free_port(6080)
            docker_run_human(container, instruction, schema_path,
                             personal_info_tmp, time_limit_s,
                             host_port=host_port)

            # Graceful stop on Ctrl+C: give container time to flush recording
            def handle_sigint(sig, frame):
                print("\nCtrl+C received, stopping container gracefully...")
                subprocess.run([ENGINE, "stop", "-t", "20", container],
                               capture_output=True)

            signal.signal(signal.SIGINT, handle_sigint)

            _announce_vnc(host_port)
            console.print(f" Task: {task['instruction'][:200]}")
            console.print(f" Email: {email} Password: {email_pw}")
            console.print(f" Time limit: {task['time_limit']} minutes")
            console.print(f" Close the noVNC tab when done.\n")

            step(f"Waiting for human (max {task['time_limit']}min)")
        else:
            step("Starting container")
            assert model_cfg is not None
            host_port = _pick_free_port(6080)
            docker_run(container, instruction, schema_path,
                       personal_info_tmp, model_cfg,
                       time_limit_s=time_limit_s,
                       host_port=host_port)

            _announce_vnc(host_port)
            console.print(f" Open the URL above to watch the agent in real-time.\n")

            step(f"Agent running (max {task['time_limit']}min)")

        docker_wait(container)

        step("Container logs")
        docker_logs(container)

        step("Copying results")
        docker_copy(container, output_dir)
        _fix_data_ownership(output_dir / "data")

        ensure_interception(output_dir)

        step("Results")
        intercepted = print_results(output_dir)

        # Write run metadata
        duration = time.time() - start_time
        meta = _build_run_meta(case_name, task, model_cfg, email, ts,
                               duration, intercepted)
        (output_dir / "run-meta.json").write_text(json.dumps(meta, indent=2))

        if do_upload:
            step("Uploading to HuggingFace")
            repo_path = f"{safe_model}/{case_name}-{safe_model}-{ts}"
            upload_run(output_dir, repo_path, hf_env)

    finally:
        step("Cleanup")
        docker_rm(container)
        if email:
            delete_email(pm_key, email)
        if personal_info_tmp and personal_info_tmp.exists():
            shutil.rmtree(personal_info_tmp, ignore_errors=True)
        (output_dir / "eval-schema.json").unlink(missing_ok=True)

    if intercepted:
        print(f"\nINTERCEPTED — results in {output_dir}")
    else:
        print(f"\nNOT INTERCEPTED — results in {output_dir}")
        sys.exit(1)


if __name__ == "__main__":
    main()