clawbench-cli 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (226) hide show
  1. clawbench/__init__.py +35 -0
  2. clawbench/__main__.py +8 -0
  3. clawbench/batch.py +619 -0
  4. clawbench/cli.py +397 -0
  5. clawbench/data/chrome-extension/README.md +127 -0
  6. clawbench/data/chrome-extension/background.js +50 -0
  7. clawbench/data/chrome-extension/content.js +70 -0
  8. clawbench/data/chrome-extension/manifest.json +25 -0
  9. clawbench/data/chrome-extension/setup.sh +27 -0
  10. clawbench/data/chrome-extension/stealth.js +200 -0
  11. clawbench/data/docker/Dockerfile +51 -0
  12. clawbench/data/docker/entrypoint.sh +394 -0
  13. clawbench/data/docker/setup-openclaw.sh +112 -0
  14. clawbench/data/eval/README.md +95 -0
  15. clawbench/data/eval/agentic_eval.md +53 -0
  16. clawbench/data/extension-server/.python-version +1 -0
  17. clawbench/data/extension-server/README.md +54 -0
  18. clawbench/data/extension-server/pyproject.toml +7 -0
  19. clawbench/data/extension-server/server.py +360 -0
  20. clawbench/data/extension-server/uv.lock +644 -0
  21. clawbench/data/models/model.schema.json +44 -0
  22. clawbench/data/models/models.example.yaml +16 -0
  23. clawbench/data/shared/alex_green_personal_info.json +451 -0
  24. clawbench/data/test-cases/001-daily-life-food-uber-eats/task.json +25 -0
  25. clawbench/data/test-cases/002-daily-life-food-doordash/task.json +25 -0
  26. clawbench/data/test-cases/004-daily-life-food-instacart/extra_info/grocery_list.json +36 -0
  27. clawbench/data/test-cases/004-daily-life-food-instacart/task.json +30 -0
  28. clawbench/data/test-cases/006-daily-life-food-uber-eats/task.json +24 -0
  29. clawbench/data/test-cases/007-daily-life-food-instacart/extra_info/meal_plan.json +21 -0
  30. clawbench/data/test-cases/007-daily-life-food-instacart/task.json +30 -0
  31. clawbench/data/test-cases/011-daily-life-housing-zillow/task.json +25 -0
  32. clawbench/data/test-cases/015-daily-life-housing-craigslist/extra_info/listing_details.json +26 -0
  33. clawbench/data/test-cases/015-daily-life-housing-craigslist/task.json +30 -0
  34. clawbench/data/test-cases/035-daily-life-health-medical-betterhelp/task.json +25 -0
  35. clawbench/data/test-cases/041-daily-life-pets-rover/task.json +25 -0
  36. clawbench/data/test-cases/043-daily-life-pets-rover/extra_info/pet_info.json +12 -0
  37. clawbench/data/test-cases/043-daily-life-pets-rover/task.json +30 -0
  38. clawbench/data/test-cases/045-daily-life-personal-care-booksy/task.json +25 -0
  39. clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/extra_info/address_info.json +7 -0
  40. clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/task.json +30 -0
  41. clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/extra_info/job_links.json +5 -0
  42. clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/task.json +30 -0
  43. clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/extra_info/job_links.json +5 -0
  44. clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/task.json +30 -0
  45. clawbench/data/test-cases/091-job-search-hr-job-apply-indeed/task.json +25 -0
  46. clawbench/data/test-cases/120-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  47. clawbench/data/test-cases/121-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  48. clawbench/data/test-cases/128-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  49. clawbench/data/test-cases/134-office-secretary-tasks-calendar-calendly/task.json +25 -0
  50. clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/extra_info/meeting_details.json +30 -0
  51. clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/task.json +30 -0
  52. clawbench/data/test-cases/139-office-secretary-tasks-calendar-calendly/task.json +25 -0
  53. clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/extra_info/task_list.json +29 -0
  54. clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/task.json +30 -0
  55. clawbench/data/test-cases/179-dev-tech-github-ops-github/extra_info/config.json +13 -0
  56. clawbench/data/test-cases/179-dev-tech-github-ops-github/task.json +30 -0
  57. clawbench/data/test-cases/180-dev-tech-github-ops-github/task.json +25 -0
  58. clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/extra_info/raw_results.json +47 -0
  59. clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/task.json +30 -0
  60. clawbench/data/test-cases/242-academia-research-research-tools-overleaf/task.json +25 -0
  61. clawbench/data/test-cases/246-academia-research-research-tools-zotero/task.json +25 -0
  62. clawbench/data/test-cases/247-academia-research-research-tools-semantic-scholar/task.json +25 -0
  63. clawbench/data/test-cases/265-education-learning-general-coursera/task.json +25 -0
  64. clawbench/data/test-cases/266-education-learning-general-leetcode/extra_info/solution_code.py +9 -0
  65. clawbench/data/test-cases/266-education-learning-general-leetcode/task.json +30 -0
  66. clawbench/data/test-cases/273-education-learning-general-edx/task.json +25 -0
  67. clawbench/data/test-cases/274-education-learning-general-udemy/task.json +25 -0
  68. clawbench/data/test-cases/279-travel-general-airbnb/task.json +25 -0
  69. clawbench/data/test-cases/280-travel-general-booking-com/task.json +25 -0
  70. clawbench/data/test-cases/363-entertainment-hobbies-general-ticketmaster/task.json +25 -0
  71. clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/extra_info/book_list.json +14 -0
  72. clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/task.json +30 -0
  73. clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/extra_info/event_details.json +10 -0
  74. clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/task.json +30 -0
  75. clawbench/data/test-cases/403-personal-management-account-security-1password-web/extra_info/credentials.json +34 -0
  76. clawbench/data/test-cases/403-personal-management-account-security-1password-web/task.json +30 -0
  77. clawbench/data/test-cases/413-personal-management-personal-tools-todoist/extra_info/task_list.json +52 -0
  78. clawbench/data/test-cases/413-personal-management-personal-tools-todoist/task.json +30 -0
  79. clawbench/data/test-cases/468-rating-voting-general-glassdoor/extra_info/interview_experience.json +10 -0
  80. clawbench/data/test-cases/468-rating-voting-general-glassdoor/task.json +30 -0
  81. clawbench/data/test-cases/469-rating-voting-general-tripadvisor/extra_info/review_content.json +6 -0
  82. clawbench/data/test-cases/469-rating-voting-general-tripadvisor/task.json +30 -0
  83. clawbench/data/test-cases/470-rating-voting-general-trustpilot/extra_info/review_content.json +6 -0
  84. clawbench/data/test-cases/470-rating-voting-general-trustpilot/task.json +30 -0
  85. clawbench/data/test-cases/474-rating-voting-general-capterra/task.json +25 -0
  86. clawbench/data/test-cases/475-rating-voting-general-g2/task.json +25 -0
  87. clawbench/data/test-cases/482-creation-init-general-confluence/extra_info/content.json +3 -0
  88. clawbench/data/test-cases/482-creation-init-general-confluence/task.json +30 -0
  89. clawbench/data/test-cases/483-creation-init-general-airtable/task.json +25 -0
  90. clawbench/data/test-cases/484-creation-init-general-clickup/task.json +28 -0
  91. clawbench/data/test-cases/485-creation-init-general-webflow/task.json +25 -0
  92. clawbench/data/test-cases/486-creation-init-general-mailchimp/extra_info/content.json +3 -0
  93. clawbench/data/test-cases/486-creation-init-general-mailchimp/task.json +30 -0
  94. clawbench/data/test-cases/487-creation-init-general-typeform/extra_info/survey_questions.json +85 -0
  95. clawbench/data/test-cases/487-creation-init-general-typeform/task.json +30 -0
  96. clawbench/data/test-cases/488-creation-init-general-substack/extra_info/content.json +3 -0
  97. clawbench/data/test-cases/488-creation-init-general-substack/task.json +30 -0
  98. clawbench/data/test-cases/489-creation-init-general-ghost/extra_info/content.json +3 -0
  99. clawbench/data/test-cases/489-creation-init-general-ghost/task.json +30 -0
  100. clawbench/data/test-cases/501-creation-init-general-asana/extra_info/project_description.json +8 -0
  101. clawbench/data/test-cases/501-creation-init-general-asana/task.json +33 -0
  102. clawbench/data/test-cases/529-daily-life-shopping-delivery-king-arthur-baking/task.json +25 -0
  103. clawbench/data/test-cases/533-daily-life-utilities-inmyarea/task.json +25 -0
  104. clawbench/data/test-cases/535-daily-life-home-home-depot/task.json +25 -0
  105. clawbench/data/test-cases/537-daily-life-food-crumbl/task.json +25 -0
  106. clawbench/data/test-cases/539-daily-life-health-jefit/task.json +25 -0
  107. clawbench/data/test-cases/542-daily-life-pets-wag/task.json +25 -0
  108. clawbench/data/test-cases/551-finance-investment-crypto-wallet-trezor/task.json +25 -0
  109. clawbench/data/test-cases/552-finance-investment-business-payment-plooto/task.json +25 -0
  110. clawbench/data/test-cases/555-finance-investment-insurance-insureon/task.json +25 -0
  111. clawbench/data/test-cases/559-finance-investment-crowdfunding-frontfundr/task.json +25 -0
  112. clawbench/data/test-cases/564-daily-life-event-registration-race-roster/task.json +25 -0
  113. clawbench/data/test-cases/565-job-search-hr-job-search-jopwell/task.json +25 -0
  114. clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/extra_info/listing_details.json +26 -0
  115. clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/task.json +30 -0
  116. clawbench/data/test-cases/569-job-search-hr-job-search-careerbuilder/task.json +25 -0
  117. clawbench/data/test-cases/570-job-search-hr-job-search-hired/task.json +25 -0
  118. clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/extra_info/listing_details.json +26 -0
  119. clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/task.json +30 -0
  120. clawbench/data/test-cases/576-office-secretary-tasks-reports-ftc-reportfraud/task.json +25 -0
  121. clawbench/data/test-cases/583-office-secretary-tasks-support-tickets-freshdesk/task.json +25 -0
  122. clawbench/data/test-cases/598-academia-research-legal-docs-formswift/task.json +25 -0
  123. clawbench/data/test-cases/606-education-learning-kids-courses-outschool/task.json +25 -0
  124. clawbench/data/test-cases/607-education-learning-art-courses-creativebug/task.json +25 -0
  125. clawbench/data/test-cases/609-education-learning-meditation-spirit-rock-meditation-center/task.json +25 -0
  126. clawbench/data/test-cases/615-travel-flights-spirit-airlines/task.json +25 -0
  127. clawbench/data/test-cases/618-travel-train-bus-12go-asia/task.json +25 -0
  128. clawbench/data/test-cases/625-travel-camping-outdoor-parks-canada-reservations/task.json +25 -0
  129. clawbench/data/test-cases/626-travel-bus-flixbus/task.json +25 -0
  130. clawbench/data/test-cases/627-travel-flights-momondo/task.json +25 -0
  131. clawbench/data/test-cases/632-shopping-commerce-beauty-care-olaplex/task.json +25 -0
  132. clawbench/data/test-cases/634-shopping-commerce-apparel-dooney-bourke/task.json +25 -0
  133. clawbench/data/test-cases/635-shopping-commerce-gifts-uncommon-goods/task.json +25 -0
  134. clawbench/data/test-cases/636-shopping-commerce-auto-parts-rockauto/task.json +25 -0
  135. clawbench/data/test-cases/638-shopping-commerce-print-custom-vistaprint/task.json +25 -0
  136. clawbench/data/test-cases/639-shopping-commerce-luxury-mansur-gavriel/task.json +25 -0
  137. clawbench/data/test-cases/671-entertainment-gaming-humble-bundle/task.json +25 -0
  138. clawbench/data/test-cases/672-entertainment-hobbies-anime-streaming-crunchyroll/task.json +25 -0
  139. clawbench/data/test-cases/674-entertainment-hobbies-masterclass-masterclass/task.json +25 -0
  140. clawbench/data/test-cases/676-government-civic-legal-docs-legalnature/task.json +25 -0
  141. clawbench/data/test-cases/685-personal-management-budget-mgmt-everydollar/task.json +25 -0
  142. clawbench/data/test-cases/687-personal-management-vpn-subscription-ipvanish/task.json +25 -0
  143. clawbench/data/test-cases/688-personal-management-insurance-compare-insurify/task.json +25 -0
  144. clawbench/data/test-cases/695-automation-workflows-recurring-order-stumptown-coffee/task.json +25 -0
  145. clawbench/data/test-cases/697-automation-workflows-recurring-order-bean-box/task.json +25 -0
  146. clawbench/data/test-cases/699-automation-workflows-recurring-order-mistobox/task.json +25 -0
  147. clawbench/data/test-cases/700-deletion-revocation-data-deletion-deleteme/task.json +25 -0
  148. clawbench/data/test-cases/705-rating-voting-wine-review-vivino/task.json +25 -0
  149. clawbench/data/test-cases/706-rating-voting-beer-review-beeradvocate/task.json +25 -0
  150. clawbench/data/test-cases/707-rating-voting-social-wine-untappd/task.json +25 -0
  151. clawbench/data/test-cases/708-rating-voting-professor-review-ratemyprofessors/task.json +28 -0
  152. clawbench/data/test-cases/709-rating-voting-service-review-angi/task.json +25 -0
  153. clawbench/data/test-cases/710-creation-init-interior-design-roomsketcher/task.json +25 -0
  154. clawbench/data/test-cases/711-creation-init-color-design-coolors/task.json +25 -0
  155. clawbench/data/test-cases/712-creation-init-website-create-squarespace/task.json +25 -0
  156. clawbench/data/test-cases/713-creation-init-website-build-wix/task.json +25 -0
  157. clawbench/data/test-cases/735-home-services-maintenance-house-cleaning-bark/task.json +25 -0
  158. clawbench/data/test-cases/736-home-services-maintenance-plumbing-ace-hardware/task.json +25 -0
  159. clawbench/data/test-cases/737-home-services-maintenance-kitchen-remodel-lowes/task.json +25 -0
  160. clawbench/data/test-cases/738-home-services-maintenance-equipment-install-amazon-home-services/task.json +25 -0
  161. clawbench/data/test-cases/750-automotive-vehicle-services-car-insurance-compare-kanetix/task.json +25 -0
  162. clawbench/data/test-cases/751-automotive-vehicle-services-car-lease-sixt/task.json +25 -0
  163. clawbench/data/test-cases/754-automotive-vehicle-services-used-car-listing-autotrader/task.json +25 -0
  164. clawbench/data/test-cases/763-automotive-vehicle-services-car-lease-autoslash/task.json +25 -0
  165. clawbench/data/test-cases/766-nonprofit-charity-donation-doctors-without-borders-msf/task.json +25 -0
  166. clawbench/data/test-cases/768-nonprofit-charity-community-crowdfund-ioby/task.json +25 -0
  167. clawbench/data/test-cases/770-nonprofit-charity-volunteer-apply-on-make-a-wish-foundation-website-complete-and-submit-a-volunteer-application-form-selecting-the-wish-granter-role-and-entering-city-phoenix-az/task.json +25 -0
  168. clawbench/data/test-cases/774-nonprofit-charity-nonprofit-job-apply-charity-village/task.json +25 -0
  169. clawbench/data/test-cases/776-nonprofit-charity-volunteer-signup-idealist/task.json +25 -0
  170. clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/extra_info/payment_info.json +3 -0
  171. clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/task.json +30 -0
  172. clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/extra_info/address_info.json +4 -0
  173. clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/task.json +30 -0
  174. clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/extra_info/email_info.json +3 -0
  175. clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/task.json +30 -0
  176. clawbench/data/test-cases/782-beauty-personal-care-skincare-purchase-paulas-choice/task.json +24 -0
  177. clawbench/data/test-cases/783-beauty-personal-care-beauty-booking-ulta-beauty/task.json +24 -0
  178. clawbench/data/test-cases/785-beauty-personal-care-skincare-curology/task.json +25 -0
  179. clawbench/data/test-cases/788-beauty-personal-care-makeup-the-ordinary/task.json +25 -0
  180. clawbench/data/test-cases/789-beauty-personal-care-makeup-fenty-beauty/task.json +25 -0
  181. clawbench/data/test-cases/793-beauty-personal-care-beauty-retail-mac-cosmetics/task.json +25 -0
  182. clawbench/data/test-cases/794-beauty-personal-care-salon-booking-styleseat/task.json +25 -0
  183. clawbench/data/test-cases/795-pet-animal-care-pet-adoption-aspca/task.json +25 -0
  184. clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/extra_info/pet_info.json +12 -0
  185. clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/task.json +30 -0
  186. clawbench/data/test-cases/799-pet-animal-care-pet-insurance-aspca-pet-health-insurance/task.json +25 -0
  187. clawbench/data/test-cases/801-pet-animal-care-pet-friendly-travel-bringfido/task.json +25 -0
  188. clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/extra_info/pet_info.json +12 -0
  189. clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/task.json +30 -0
  190. clawbench/data/test-cases/807-pet-animal-care-pet-dna-embark/task.json +25 -0
  191. clawbench/data/test-cases/809-pet-animal-care-pet-adopt-petfinder/task.json +28 -0
  192. clawbench/data/test-cases/812-pet-animal-care-pet-subscription-ollie/task.json +25 -0
  193. clawbench/data/test-cases/815-personal-management-records-mgmt-myheritage/task.json +25 -0
  194. clawbench/data/test-cases/821-education-learning-reading-self-study-blinkist/task.json +25 -0
  195. clawbench/data/test-cases/861-entertainment-hobbies-movies-cineplex/task.json +25 -0
  196. clawbench/data/test-cases/862-entertainment-hobbies-movies-amc-theatres/task.json +25 -0
  197. clawbench/data/test-cases/864-entertainment-hobbies-show-tickets-ticketmaster/task.json +25 -0
  198. clawbench/data/test-cases/865-travel-outdoor-hipcamp/task.json +25 -0
  199. clawbench/data/test-cases/867-entertainment-hobbies-movies-fandango/task.json +25 -0
  200. clawbench/data/test-cases/872-daily-life-food-opentable/task.json +25 -0
  201. clawbench/data/test-cases/873-daily-life-food-resy/task.json +28 -0
  202. clawbench/data/test-cases/876-entertainment-hobbies-show-tickets-vivid-seats/task.json +25 -0
  203. clawbench/data/test-cases/877-entertainment-hobbies-show-tickets-stubhub/task.json +25 -0
  204. clawbench/data/test-cases/878-travel-outdoor-ontario-parks/task.json +25 -0
  205. clawbench/data/test-cases/883-education-learning-hobby-class-sur-la-table/task.json +25 -0
  206. clawbench/data/test-cases/884-entertainment-hobbies-experience-breakout-games/task.json +25 -0
  207. clawbench/data/test-cases/885-entertainment-hobbies-experience-bowlero/task.json +25 -0
  208. clawbench/data/test-cases/886-entertainment-hobbies-experience-topgolf/task.json +25 -0
  209. clawbench/data/test-cases/lite.json +226 -0
  210. clawbench/data/test-cases/lite.schema.json +105 -0
  211. clawbench/data/test-cases/task.schema.json +132 -0
  212. clawbench/data/tools/build_clawbench_lite_enc.py +161 -0
  213. clawbench/doctor.py +171 -0
  214. clawbench/engine.py +180 -0
  215. clawbench/generate_resume_pdf.py +140 -0
  216. clawbench/hf_upload.py +78 -0
  217. clawbench/image.py +127 -0
  218. clawbench/paths.py +150 -0
  219. clawbench/resume_template.json +104 -0
  220. clawbench/run.py +942 -0
  221. clawbench/tui.py +1401 -0
  222. clawbench_cli-0.1.2.dist-info/METADATA +770 -0
  223. clawbench_cli-0.1.2.dist-info/RECORD +226 -0
  224. clawbench_cli-0.1.2.dist-info/WHEEL +4 -0
  225. clawbench_cli-0.1.2.dist-info/entry_points.txt +4 -0
  226. clawbench_cli-0.1.2.dist-info/licenses/LICENSE +201 -0
clawbench/__init__.py ADDED
@@ -0,0 +1,35 @@
1
+ """ClawBench: Can AI Agents Complete Everyday Online Tasks?
2
+
3
+ A benchmark of 153 everyday tasks across 144 live websites in 15 life categories.
4
+ This package provides the CLI and test driver for running the benchmark against
5
+ frontier AI agents inside an isolated Chromium container.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from importlib.metadata import PackageNotFoundError, version
11
+
12
# The project is published on PyPI under several distribution names: the
# original ``claw-bench`` / ``clawbench`` names are currently held by an
# unrelated project, so users install one of the aliases below.
# ``importlib.metadata`` can only answer for a distribution that is actually
# installed, so the candidates are probed in priority order and the first
# hit wins.
_DIST_NAMES = (
    "clawbench-eval",  # primary (README Quick Start)
    "clawbench-cli",
    "nail-clawbench",  # org-prefixed alias
    "clawbench-harness",
    "harness-bench",
    "openclawbench",
    "claw-harness",
    "claw-bench",  # original primary (blocked; left for future)
    "clawbench",  # original alias (blocked; left for future)
)

# Fallback used when none of the candidate distributions is installed
# (e.g. running from a source checkout without installing).
__version__ = "0.0.0+unknown"
for _dist in _DIST_NAMES:
    try:
        __version__ = version(_dist)
    except PackageNotFoundError:
        continue
    break

__all__ = ["__version__"]
clawbench/__main__.py ADDED
@@ -0,0 +1,8 @@
1
+ """Entry point for `python -m clawbench`."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from clawbench.cli import main
6
+
7
# Invoke the CLI only when this module is executed (``python -m clawbench``),
# never as a side effect of importing it.
if __name__ == "__main__":
    main()
clawbench/batch.py ADDED
@@ -0,0 +1,619 @@
1
+ """ClawBench batch test driver — run model x case cross-product concurrently."""
2
+
3
+ import argparse
4
+ import asyncio
5
+ import fnmatch
6
+ import itertools
7
+ import json
8
+ import os
9
+ import re
10
+ import shutil
11
+ import signal
12
+ import subprocess
13
+ import sys
14
+ import time
15
+ from dataclasses import dataclass, field
16
+ from datetime import datetime, timezone
17
+ from pathlib import Path
18
+
19
+ import yaml
20
+
21
+ from clawbench import engine as _engine
22
+ from clawbench import paths as _paths
23
+
24
+
25
def detect_engine() -> str:
    """Return the container engine name the batch driver should use.

    The ``CONTAINER_ENGINE`` environment variable is validated first: when
    set, it must be ``docker`` or ``podman`` (case-insensitive, surrounding
    whitespace ignored) and the named binary must be on ``PATH``. The actual
    selection is then delegated to :func:`clawbench.engine.detect_engine`.

    NOTE(review): the precedence (override wins, otherwise podman before
    docker) is implemented inside ``clawbench.engine`` and mirrors
    :mod:`clawbench.run` — confirm there; this function only validates.

    Exits the process with status 1 and a printed message when the override
    is malformed, names a missing binary, or no engine is found at all.
    """
    requested = os.environ.get("CONTAINER_ENGINE", "").strip().lower()
    if requested:
        if requested not in ("docker", "podman"):
            print(f"ERROR: CONTAINER_ENGINE must be 'docker' or 'podman', got '{requested}'")
            sys.exit(1)
        if not shutil.which(requested):
            print(f"ERROR: CONTAINER_ENGINE={requested} but '{requested}' not found on PATH")
            sys.exit(1)

    engine_name = _engine.detect_engine()
    if engine_name is None:
        print("ERROR: Neither 'podman' nor 'docker' found on PATH")
        sys.exit(1)
    return engine_name
44
+
45
+
46
+ # ---------------------------------------------------------------------------
47
+ # Discovery
48
+ # ---------------------------------------------------------------------------
49
+
50
# Location of the user's model definitions file. Resolved once, at import
# time, by clawbench.paths.user_models_yaml().
MODELS_YAML = _paths.user_models_yaml()
51
+
52
+
53
def load_models_yaml() -> dict:
    """Parse and return the model definitions stored at ``MODELS_YAML``.

    An empty file yields an empty dict. If the file does not exist, an
    error pointing the user at ``models.example.yaml`` is printed and the
    process exits with status 1.
    """
    if MODELS_YAML.exists():
        return yaml.safe_load(MODELS_YAML.read_text()) or {}
    print(f"ERROR: {MODELS_YAML} not found (copy models.example.yaml and fill in your keys)")
    sys.exit(1)
59
+
60
+
61
def discover_models(patterns: list[str] | None, all_models: bool) -> list[str]:
    """Resolve the model names to run, in sorted order.

    Args:
        patterns: ``fnmatch``-style globs matched against the model names
            defined in the models file; ignored when *all_models* is true.
        all_models: When true, return every defined model.

    Returns:
        The sorted list of selected model names.

    Exits with status 1 when neither selector is given or nothing matches.
    """
    models = load_models_yaml()
    if all_models:
        return sorted(models.keys())
    if not patterns:
        print("ERROR: provide --models or --all-models")
        sys.exit(1)

    selected = [
        name
        for name in sorted(models.keys())
        if any(fnmatch.fnmatch(name, pat) for pat in patterns)
    ]
    if selected:
        return selected

    print(f"ERROR: no models matched patterns: {patterns}")
    print(f"Available models: {', '.join(sorted(models))}")
    sys.exit(1)
77
+
78
+
79
+ def _case_id(d: Path) -> int | None:
80
+ """Extract the leading numeric ID from a case directory name (e.g. '042-foo' -> 42)."""
81
+ parts = d.name.split("-", 1)
82
+ try:
83
+ return int(parts[0])
84
+ except (ValueError, IndexError):
85
+ return None
86
+
87
+
88
def _expand_case_pattern(base: Path, pat: str) -> list[Path]:
    """Return the sorted candidate paths named by one ``--cases`` pattern."""
    p = Path(pat)
    if p.is_absolute():
        # Absolute paths may point outside the package; only the final
        # component is globbed, and only when the pattern contains '*'.
        return sorted(p.parent.glob(p.name)) if "*" in pat else [p]
    if pat == "test-cases" or pat.startswith("test-cases/"):
        # 'test-cases/042-*' is mapped into the bundled test-cases dir.
        # A bare 'test-cases' or 'test-cases/' means "everything": an empty
        # glob pattern is rejected by Path.glob, so fall back to '*'.
        sub = pat.partition("/")[2] or "*"
        return sorted(base.glob(sub))
    # Anything else is a glob relative to the bundled test-cases dir.
    return sorted(base.glob(pat))


def discover_cases(patterns: list[str] | None, all_cases: bool,
                   case_range: str | None = None) -> list[Path]:
    """Resolve the test-case directories to run.

    Selection precedence: *all_cases* first, then *patterns*, then
    *case_range* on its own (which starts from every bundled case). A
    directory only counts as a case when it contains ``task.json``.
    *case_range*, when given, additionally filters whatever was selected by
    the numeric prefix of the directory name.

    Args:
        patterns: Case patterns. ``test-cases/<glob>`` and bare globs are
            resolved inside the bundled test-cases directory; absolute
            paths are honoured as-is.
        all_cases: Select every bundled case.
        case_range: Inclusive ``START-END`` filter on the numeric case ID.

    Returns:
        Sorted, de-duplicated list of case directories.

    Exits with status 1 when no selector is given or nothing matches.
    """
    base = _paths.test_cases_dir()
    if all_cases or (case_range and not patterns):
        dirs = sorted(p.parent for p in base.glob("*/task.json"))
    elif patterns:
        dirs = [
            d
            for pat in patterns
            for d in _expand_case_pattern(base, pat)
            if d.is_dir() and (d / "task.json").exists()
        ]
    else:
        print("ERROR: provide --cases, --all-cases, or --case-range")
        sys.exit(1)

    # Apply numeric range filter
    if case_range:
        lo, hi = _parse_range(case_range)
        dirs = [d for d in dirs if (cid := _case_id(d)) is not None and lo <= cid <= hi]

    # Overlapping patterns can select the same directory more than once.
    dirs = sorted(set(dirs))
    if not dirs:
        print(f"ERROR: no test-case directories matched (patterns={patterns}, range={case_range})")
        sys.exit(1)
    return dirs
127
+
128
+
129
+ def _parse_range(r: str) -> tuple[int, int]:
130
+ """Parse 'START-END' into (start, end) inclusive."""
131
+ parts = r.split("-", 1)
132
+ if len(parts) != 2:
133
+ print(f"ERROR: --case-range must be START-END (e.g. 1-50), got '{r}'")
134
+ sys.exit(1)
135
+ try:
136
+ lo, hi = int(parts[0]), int(parts[1])
137
+ except ValueError:
138
+ print(f"ERROR: --case-range values must be integers, got '{r}'")
139
+ sys.exit(1)
140
+ if lo > hi:
141
+ print(f"ERROR: --case-range start must be <= end, got '{r}'")
142
+ sys.exit(1)
143
+ return lo, hi
144
+
145
+
146
+ # ---------------------------------------------------------------------------
147
+ # Job
148
+ # ---------------------------------------------------------------------------
149
+
150
+ @dataclass
151
+ class Job:
152
+ model: str
153
+ case_dir: Path
154
+ case_name: str
155
+ status: str = "pending"
156
+ duration: float = 0.0
157
+ proc: asyncio.subprocess.Process | None = field(default=None, repr=False)
158
+
159
+
160
def fmt_duration(s: float) -> str:
    """Format *s* seconds as ``<minutes>m<seconds>s`` with zero-padded seconds.

    The fractional part is truncated; minutes are not folded into hours.
    """
    whole = int(s)
    return f"{whole // 60}m{whole % 60:02d}s"
163
+
164
+
165
def ts() -> str:
    """Return the current UTC time of day as ``HH:MM:SS`` for log prefixes."""
    now_utc = datetime.now(tz=timezone.utc)
    return f"{now_utc:%H:%M:%S}"
167
+
168
+
169
+ # ---------------------------------------------------------------------------
170
+ # Async runner
171
+ # ---------------------------------------------------------------------------
172
+
173
# Shutdown flag checked by run_job before it launches work; run_job asserts
# it is not None, so it must be assigned before any job runs.
# NOTE(review): presumably created by the batch entry point once an event
# loop exists — that code is outside this view; confirm.
shutdown_event: asyncio.Event | None = None
# Subprocesses currently in flight: run_job appends each child right after
# spawning it and removes it once the child has been waited on or killed.
running_procs: list[asyncio.subprocess.Process] = []
175
+
176
+
177
+ class StartupThrottle:
178
+ """Ensure a minimum gap between consecutive container starts.
179
+
180
+ Unlike a fixed per-index stagger, this adapts dynamically: whenever a
181
+ semaphore slot frees up, the next job still waits until *min_interval*
182
+ seconds have passed since the last container launch.
183
+ """
184
+
185
+ def __init__(self, min_interval: float) -> None:
186
+ self._min_interval = min_interval
187
+ self._lock = asyncio.Lock()
188
+ self._last_start = 0.0
189
+
190
+ async def wait(self) -> None:
191
+ async with self._lock:
192
+ now = time.monotonic()
193
+ delay = self._last_start + self._min_interval - now
194
+ if delay > 0:
195
+ await asyncio.sleep(delay)
196
+ self._last_start = time.monotonic()
197
+
198
+
199
async def run_job(
    job: Job,
    sem: asyncio.Semaphore,
    throttle: StartupThrottle,
    base_output: Path,
    log_dir: Path,
    all_jobs: list[Job],
    batch_start: float,
    no_upload: bool = False,
) -> None:
    """Run one job as a ``python -m clawbench run`` subprocess and record the result.

    The job waits for a semaphore slot and for the startup throttle, then
    spawns the single-run CLI with ``--no-build`` in its own session. The
    child's combined stdout/stderr is written to
    ``<log_dir>/<case_name>-<model>.log`` and ``job.status`` is set from the
    exit code: 0 -> "passed", 1 -> "failed", anything else -> "error".

    Args:
        job: Job to execute; its ``status``, ``duration`` and ``proc``
            fields are updated in place.
        sem: Semaphore bounding how many jobs run at once.
        throttle: Enforces the minimum gap between subprocess launches.
        base_output: Passed to the child as ``--output-dir``.
        log_dir: Directory receiving the per-job log file.
        all_jobs: Every job in the batch, used only for progress output.
        batch_start: ``time.monotonic()`` value at batch start, for progress.
        no_upload: When true, ``--no-upload`` is forwarded to the child.

    Raises:
        asyncio.CancelledError: Re-raised after cleanup when the task is
            cancelled; a still-running child's process group is killed first.
    """
    # NOTE(review): nesting below was reconstructed from a flattened diff
    # view — confirm against the real file before applying.
    assert shutdown_event is not None
    try:
        async with sem:
            if shutdown_event.is_set():
                job.status = "skipped"
                return

            # Throttle container startup to avoid resource spikes
            await throttle.wait()

            # Re-check after throttle wait — Ctrl+C may have fired
            if shutdown_event.is_set():
                job.status = "skipped"
                return

            job.status = "running"
            print(f"[{ts()}] [START] {job.case_name} x {job.model}")
            print_progress(all_jobs, batch_start)

            # Model names may contain '/' or ':'; replace each run of them
            # with '--' so the name is usable inside a file name.
            safe_model = re.sub(r'[/:]+', '--', job.model)
            log_path = log_dir / f"{job.case_name}-{safe_model}.log"
            start = time.monotonic()

            proc: asyncio.subprocess.Process | None = None
            try:
                cmd_parts = [
                    sys.executable, "-m", "clawbench", "run",
                    str(job.case_dir), job.model,
                    "--output-dir", str(base_output),
                    "--no-build",
                ]
                if no_upload:
                    cmd_parts.append("--no-upload")
                # start_new_session=True puts the child in its own session,
                # so os.killpg(proc.pid, ...) below addresses the child's
                # process group rather than this driver's.
                proc = await asyncio.create_subprocess_exec(
                    *cmd_parts,
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.STDOUT,
                    start_new_session=True,
                )
                job.proc = proc
                running_procs.append(proc)
                try:
                    stdout, _ = await proc.communicate()
                finally:
                    # Runs on normal exit and on cancellation alike.
                    if proc in running_procs:
                        running_procs.remove(proc)
                    job.proc = None

                job.duration = time.monotonic() - start
                log_path.write_bytes(stdout or b"")

                # Exit-code contract of the single-run CLI as consumed
                # here: 0 = passed, 1 = failed, anything else = error.
                if proc.returncode == 0:
                    job.status = "passed"
                elif proc.returncode == 1:
                    job.status = "failed"
                else:
                    job.status = "error"
            except asyncio.CancelledError:
                job.duration = time.monotonic() - start
                # Only mark as error if a subprocess was actually running;
                # otherwise leave status for the outer handler to set "skipped".
                if proc is not None:
                    job.status = "error"
                    # Kill subprocess if still alive when we get cancelled.
                    # Use local `proc` — the inner finally already cleared job.proc.
                    if proc.returncode is None:
                        try:
                            os.killpg(proc.pid, signal.SIGKILL)
                        except (ProcessLookupError, OSError):
                            # Best effort: the group may already be gone.
                            pass
                    if proc in running_procs:
                        running_procs.remove(proc)
                raise
            except Exception as e:
                # Spawn or log-write failure: record it as an error and
                # leave a note in the job's log if that is still possible.
                job.duration = time.monotonic() - start
                job.status = "error"
                try:
                    log_path.write_text(f"batch.py: failed to run job: {e}\n")
                except OSError:
                    pass

            tag = job.status.upper()
            print(f"[{ts()}] [DONE] {job.case_name} x {job.model}: {tag} in {fmt_duration(job.duration)}")
            print_progress(all_jobs, batch_start)

    except asyncio.CancelledError:
        # Task cancelled while waiting on semaphore, throttle wait, or
        # before subprocess was created. "running" can appear here if
        # CancelledError hit after status was set but before proc started.
        if job.status not in ("passed", "failed", "error"):
            job.status = "skipped"
        raise
301
+
302
+
303
def print_progress(jobs: list[Job], start: float) -> None:
    """Write a one-line batch progress report to stderr.

    "done" counts every job that is neither pending nor running; "failed"
    combines the ``failed`` and ``error`` statuses. *start* is the
    ``time.monotonic()`` value taken when the batch began.
    """
    tally: dict[str, int] = {}
    for job in jobs:
        tally[job.status] = tally.get(job.status, 0) + 1

    running = tally.get("running", 0)
    done = len(jobs) - tally.get("pending", 0) - running
    passed = tally.get("passed", 0)
    failed = tally.get("failed", 0) + tally.get("error", 0)
    elapsed = fmt_duration(time.monotonic() - start)

    line = (
        f"[{ts()}] [BATCH] {done}/{len(jobs)} done | {running} running | "
        f"{passed} passed, {failed} failed | {elapsed} elapsed"
    )
    print(line, file=sys.stderr)
314
+
315
+
316
+ # ---------------------------------------------------------------------------
317
+ # Summary
318
+ # ---------------------------------------------------------------------------
319
+
320
def print_summary(jobs: list[Job], elapsed: float, max_concurrent: int) -> None:
    """Print the end-of-batch result table, totals and debugging hints.

    Args:
        jobs: Every job in the batch.
        elapsed: Wall-clock duration of the whole batch, in seconds.
        max_concurrent: Concurrency limit the batch ran with (echoed only).
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("BATCH SUMMARY")
    print(rule)

    # Columns are never narrower than their header labels; otherwise short
    # model/case names would push the header out of line with the rows.
    model_w = max([len("Model"), *(len(j.model) for j in jobs)])
    case_w = max([len("Case"), *(len(j.case_name) for j in jobs)])
    status_w = 7  # rows pad the status tag to 7 columns ("SKIPPED")
    header = (
        f"{'Model':<{model_w}} {'Case':<{case_w}} "
        f"{'Status':<{status_w}} Duration"
    )
    print(header)
    print("-" * len(header))
    for j in jobs:
        tag = j.status.upper()
        print(
            f"{j.model:<{model_w}} {j.case_name:<{case_w}} "
            f"{tag:<{status_w}} {fmt_duration(j.duration)}"
        )

    totals: dict[str, int] = {}
    for j in jobs:
        totals[j.status] = totals.get(j.status, 0) + 1
    # Only terminal statuses with a non-zero count are listed.
    parts = [
        f"{totals[s]} {s}"
        for s in ("passed", "failed", "error", "skipped")
        if totals.get(s)
    ]
    total_line = f"\nTotal: {len(jobs)} jobs"
    if parts:
        total_line += f" | {' | '.join(parts)}"
    print(total_line)
    print(f"Total elapsed: {fmt_duration(elapsed)} (max_concurrent={max_concurrent})")

    # For failed/error jobs, print single-run commands the user can
    # copy-paste to debug with real-time noVNC.
    bad = [j for j in jobs if j.status in ("failed", "error")]
    if bad:
        print("\nTo debug a failed case with live noVNC, re-run it as a single run:")
        for j in bad[:10]:
            print(
                f" uv run --project test-driver test-driver/run.py "
                f"{j.case_dir} {j.model}"
            )
        if len(bad) > 10:
            print(f" ... and {len(bad) - 10} more")
353
+
354
+
355
+ def print_run_stats(base_output: Path) -> None:
356
+ """Print per-run statistics from output directories."""
357
+ print(f"\n{'=' * 80}")
358
+ print("PER-RUN STATS")
359
+ print(f"{'=' * 80}")
360
+
361
+ rows = []
362
+ for model_dir in sorted(base_output.iterdir()):
363
+ if not model_dir.is_dir() or model_dir.name.startswith("batch-"):
364
+ continue
365
+ for run_dir in sorted(model_dir.iterdir()):
366
+ if not run_dir.is_dir():
367
+ continue
368
+ data = run_dir / "data"
369
+ if not data.exists():
370
+ continue
371
+
372
+ # Parse case and model from run-meta.json or dir name
373
+ meta_file = run_dir / "run-meta.json"
374
+ if meta_file.exists():
375
+ meta = json.loads(meta_file.read_text())
376
+ case = meta.get("test_case", "?")
377
+ model = meta.get("model", model_dir.name)
378
+ intercepted = meta.get("intercepted", False)
379
+ duration = meta.get("duration_seconds", 0)
380
+ else:
381
+ case = run_dir.name
382
+ model = model_dir.name
383
+ intercepted = False
384
+ duration = 0
385
+
386
+ # Count actions
387
+ actions_file = data / "actions.jsonl"
388
+ actions = sum(1 for _ in open(actions_file)) if actions_file.exists() and actions_file.stat().st_size > 0 else 0
389
+
390
+ # Count screenshots
391
+ ss_dir = data / "screenshots"
392
+ screenshots = len(list(ss_dir.iterdir())) if ss_dir.is_dir() else 0
393
+
394
+ # Recording size
395
+ rec = data / "recording.mp4"
396
+ rec_mb = rec.stat().st_size / (1024 * 1024) if rec.exists() else 0
397
+
398
+ rows.append({
399
+ "case": case, "model": model, "actions": actions,
400
+ "screenshots": screenshots, "recording_mb": rec_mb,
401
+ "duration": duration, "intercepted": intercepted,
402
+ })
403
+
404
+ if not rows:
405
+ print(" No run data found.")
406
+ return
407
+
408
+ RED = "\033[91m"
409
+ RESET = "\033[0m"
410
+
411
+ case_w = min(max(len(r["case"]) for r in rows), 50)
412
+ model_w = max(len(r["model"]) for r in rows)
413
+ header = f"{'Case':<{case_w}} {'Model':<{model_w}} Actions Screenshots Recording Duration Intercepted"
414
+ print(header)
415
+ print("-" * len(header))
416
+ for r in rows:
417
+ result = "yes" if r["intercepted"] else "no"
418
+ case = r["case"][:case_w]
419
+ # Flag abnormal runs: no actions, no screenshots, no recording, or very short duration
420
+ abnormal = (r["actions"] == 0 or r["screenshots"] == 0
421
+ or r["recording_mb"] < 0.5 or r["duration"] < 30)
422
+ line = (
423
+ f"{case:<{case_w}} {r['model']:<{model_w}} "
424
+ f"{r['actions']:>7} {r['screenshots']:>11} "
425
+ f"{r['recording_mb']:>7.1f} MB "
426
+ f"{fmt_duration(r['duration']):>8} {result}"
427
+ )
428
+ if abnormal:
429
+ print(f"{RED}{line}{RESET}")
430
+ else:
431
+ print(line)
432
+
433
+ total_pass = sum(1 for r in rows if r["intercepted"])
434
+ abnormal_count = sum(1 for r in rows if r["actions"] == 0 or r["screenshots"] == 0
435
+ or r["recording_mb"] < 0.5 or r["duration"] < 30)
436
+ print(f"\n{total_pass}/{len(rows)} intercepted", end="")
437
+ if abnormal_count:
438
+ print(f" | {RED}{abnormal_count} abnormal{RESET}")
439
+ else:
440
+ print()
441
+
442
+
443
def write_summary_json(jobs: list[Job], base_output: Path, elapsed: float,
                       max_concurrent: int, started_at: str) -> None:
    """Write ``batch-summary.json`` into *base_output*.

    The file records the start/finish timestamps, the rounded elapsed time,
    the concurrency limit, one entry per job (model, case, status, rounded
    duration) and per-status totals for passed/failed/error/skipped.

    Args:
        jobs: Every job in the batch.
        base_output: Directory that receives ``batch-summary.json``.
        elapsed: Wall-clock duration of the batch, in seconds.
        max_concurrent: Concurrency limit the batch ran with.
        started_at: Batch start timestamp, stored verbatim.
    """
    finished_at = datetime.now(timezone.utc).isoformat()

    job_entries = []
    for job in jobs:
        job_entries.append({
            "model": job.model,
            "case": job.case_name,
            "status": job.status,
            "duration_seconds": round(job.duration),
        })

    # Fixed key order; statuses outside this set are not counted.
    status_totals = dict.fromkeys(("passed", "failed", "error", "skipped"), 0)
    for job in jobs:
        if job.status in status_totals:
            status_totals[job.status] += 1

    summary = {
        "started_at": started_at,
        "finished_at": finished_at,
        "elapsed_seconds": round(elapsed),
        "max_concurrent": max_concurrent,
        "jobs": job_entries,
        "totals": status_totals,
    }
    target = base_output / "batch-summary.json"
    target.write_text(json.dumps(summary, indent=2))
466
+
467
+
468
+ # ---------------------------------------------------------------------------
469
+ # Main
470
+ # ---------------------------------------------------------------------------
471
+
472
async def async_main(args: argparse.Namespace) -> int:
    """Run the model x case job matrix and report the results.

    Steps: discover models and cases, build the job list, build the
    container image once, run every job through ``run_job`` under a
    semaphore of ``args.max_concurrent``, then print and write the summary
    and optionally upload it.

    Signal handling: the first SIGINT/SIGTERM cancels tasks whose job is not
    running, so no new jobs start; a further signal kills the process groups
    of running subprocesses and cancels all remaining tasks.

    Args:
        args: Parsed command-line options (see ``main``).

    Returns:
        1 if any job ended with status "error", otherwise 0. Also 0 when
        there are no jobs or ``args.dry_run`` is set.
    """
    global shutdown_event
    shutdown_event = asyncio.Event()
    running_procs.clear()

    models = discover_models(args.models, args.all_models)
    cases = discover_cases(args.cases, args.all_cases, args.case_range)

    # Interleave models: iterate cases in the outer loop so consecutive jobs
    # hit different API providers, reducing the chance of draining one API.
    jobs = [
        Job(model=m, case_dir=c, case_name=c.name)
        for c, m in itertools.product(cases, models)
    ]

    if not jobs:
        print("No jobs to run.")
        return 0

    print(f"Job matrix: {len(models)} model(s) x {len(cases)} case(s) = {len(jobs)} job(s)")
    for j in jobs:
        print(f" {j.case_name} x {j.model}")

    if args.dry_run:
        return 0

    # Build image once — reuse run.py's spinner/progress helper so first-time
    # builds show a clear "~2GB, 5–10min" banner and live step counter instead
    # of a wall of apt/npm output.
    engine = detect_engine()
    # Ensure child `claw-bench run` processes (and the imported helper below)
    # use the same engine as we just detected.
    os.environ["CONTAINER_ENGINE"] = engine
    from clawbench import run as _run_mod  # lazy: import after CONTAINER_ENGINE is set
    _run_mod.docker_build()

    batch_ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    out_root = Path(args.output_dir).resolve() if args.output_dir else _paths.default_output_dir()
    base_output = out_root / f"batch-{batch_ts}"
    log_dir = base_output / "batch-logs"
    log_dir.mkdir(parents=True, exist_ok=True)

    sem = asyncio.Semaphore(args.max_concurrent)
    batch_start = time.monotonic()
    started_at = datetime.now(timezone.utc).isoformat()

    # Signal handling — asyncio-native
    sigint_count = 0
    all_tasks: list[asyncio.Task] = []
    loop = asyncio.get_running_loop()

    def on_signal() -> None:
        nonlocal sigint_count
        sigint_count += 1
        shutdown_event.set()

        if sigint_count == 1:
            n_running = sum(1 for j in jobs if j.status == "running")
            print(f"\n[BATCH] Stopping — no new jobs will start. "
                  f"Waiting for {n_running} running job(s) to finish...")
            print("[BATCH] Press Ctrl+C again to kill running jobs.")
            # Cancel only non-running tasks so no new jobs start.
            # Running tasks are left alone — they'll finish naturally
            # and their run.py subprocesses will clean up containers.
            for j, t in zip(jobs, all_tasks):
                if j.status != "running" and not t.done():
                    t.cancel()
        else:
            n_running = sum(1 for p in running_procs if p.returncode is None)
            print(f"\n[BATCH] Killing {n_running} running job(s)...")
            # Iterate a copy: job tasks remove entries as they unwind.
            for proc in list(running_procs):
                try:
                    os.killpg(proc.pid, signal.SIGKILL)
                except (ProcessLookupError, OSError):
                    pass
            for t in all_tasks:
                if not t.done():
                    t.cancel()

    loop.add_signal_handler(signal.SIGINT, on_signal)
    loop.add_signal_handler(signal.SIGTERM, on_signal)

    throttle = StartupThrottle(args.stagger_delay)
    # Rebinding is visible to on_signal, which reads all_tasks at call time.
    all_tasks = [
        asyncio.create_task(
            run_job(j, sem, throttle, base_output, log_dir, jobs, batch_start,
                    no_upload=args.no_upload)
        )
        for j in jobs
    ]

    results = await asyncio.gather(*all_tasks, return_exceptions=True)

    for j, r in zip(jobs, results):
        if isinstance(r, asyncio.CancelledError):
            # Mark cancelled jobs as skipped
            if j.status == "pending":
                j.status = "skipped"
        elif isinstance(r, Exception):
            # The task crashed outside run_job's own error handling. Report
            # it and, unless the job already has a final status, count it as
            # an error so it cannot vanish from the totals and exit code.
            print(f"[BATCH] {j.case_name} x {j.model}: unexpected error: {r!r}",
                  file=sys.stderr)
            if j.status not in ("passed", "failed", "error", "skipped"):
                j.status = "error"

    # Restore default signal handling for cleanup phase
    loop.remove_signal_handler(signal.SIGINT)
    loop.remove_signal_handler(signal.SIGTERM)

    elapsed = time.monotonic() - batch_start
    print_summary(jobs, elapsed, args.max_concurrent)
    print_run_stats(base_output)
    write_summary_json(jobs, base_output, elapsed, args.max_concurrent, started_at)
    print(f"\nSummary written to {base_output / 'batch-summary.json'}")

    # Upload batch summary to HuggingFace
    if not args.no_upload:
        from clawbench.hf_upload import hf_upload_enabled, upload_file
        from clawbench.run import _load_runtime_env
        env = _load_runtime_env()
        hf_env = {"HF_TOKEN": env.get("HF_TOKEN", ""), "HF_REPO_ID": env.get("HF_REPO_ID", "")}
        if hf_upload_enabled(hf_env):
            # Colons are replaced so the timestamp is usable in a file name.
            safe_ts = started_at.replace(":", "-")
            upload_file(
                base_output / "batch-summary.json",
                f"batch-summaries/{safe_ts}-batch-summary.json",
                hf_env,
            )

    has_errors = any(j.status == "error" for j in jobs)
    return 1 if has_errors else 0
596
+
597
+
598
+ def main(argv: list[str] | None = None) -> None:
599
+ p = argparse.ArgumentParser(description="Run ClawBench model x case cross-product")
600
+ p.add_argument("--models", nargs="+", default=None, help="Model name patterns (matched against keys in models/models.yaml)")
601
+ p.add_argument("--all-models", action="store_true", help="Use all models in models/models.yaml")
602
+ p.add_argument("--cases", nargs="+", default=None, help="Glob patterns for case dirs")
603
+ p.add_argument("--all-cases", action="store_true", help="Use all test-cases/ subdirs")
604
+ p.add_argument("--case-range", default=None, help="Numeric ID range, e.g. 1-50")
605
+ p.add_argument("--max-concurrent", type=int, default=2, help="Max parallel jobs (default: 2)")
606
+ p.add_argument("--output-dir", default=None, help="Base output directory (default: ./claw-output)")
607
+ p.add_argument("--stagger-delay", type=float, default=15,
608
+ help="Min seconds between consecutive container starts — rolling start (default: 15)")
609
+ p.add_argument("--dry-run", action="store_true", help="Print job matrix without running")
610
+ p.add_argument("--no-upload", dest="no_upload", action="store_true",
611
+ help="Skip HuggingFace upload for all runs")
612
+ args = p.parse_args(argv)
613
+
614
+ rc = asyncio.run(async_main(args))
615
+ sys.exit(rc)
616
+
617
+
618
# Script entry point: run the batch CLI when this file is executed directly.
if __name__ == "__main__":
    main()