clawbench-cli 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (226) hide show
  1. clawbench/__init__.py +35 -0
  2. clawbench/__main__.py +8 -0
  3. clawbench/batch.py +619 -0
  4. clawbench/cli.py +397 -0
  5. clawbench/data/chrome-extension/README.md +127 -0
  6. clawbench/data/chrome-extension/background.js +50 -0
  7. clawbench/data/chrome-extension/content.js +70 -0
  8. clawbench/data/chrome-extension/manifest.json +25 -0
  9. clawbench/data/chrome-extension/setup.sh +27 -0
  10. clawbench/data/chrome-extension/stealth.js +200 -0
  11. clawbench/data/docker/Dockerfile +51 -0
  12. clawbench/data/docker/entrypoint.sh +394 -0
  13. clawbench/data/docker/setup-openclaw.sh +112 -0
  14. clawbench/data/eval/README.md +95 -0
  15. clawbench/data/eval/agentic_eval.md +53 -0
  16. clawbench/data/extension-server/.python-version +1 -0
  17. clawbench/data/extension-server/README.md +54 -0
  18. clawbench/data/extension-server/pyproject.toml +7 -0
  19. clawbench/data/extension-server/server.py +360 -0
  20. clawbench/data/extension-server/uv.lock +644 -0
  21. clawbench/data/models/model.schema.json +44 -0
  22. clawbench/data/models/models.example.yaml +16 -0
  23. clawbench/data/shared/alex_green_personal_info.json +451 -0
  24. clawbench/data/test-cases/001-daily-life-food-uber-eats/task.json +25 -0
  25. clawbench/data/test-cases/002-daily-life-food-doordash/task.json +25 -0
  26. clawbench/data/test-cases/004-daily-life-food-instacart/extra_info/grocery_list.json +36 -0
  27. clawbench/data/test-cases/004-daily-life-food-instacart/task.json +30 -0
  28. clawbench/data/test-cases/006-daily-life-food-uber-eats/task.json +24 -0
  29. clawbench/data/test-cases/007-daily-life-food-instacart/extra_info/meal_plan.json +21 -0
  30. clawbench/data/test-cases/007-daily-life-food-instacart/task.json +30 -0
  31. clawbench/data/test-cases/011-daily-life-housing-zillow/task.json +25 -0
  32. clawbench/data/test-cases/015-daily-life-housing-craigslist/extra_info/listing_details.json +26 -0
  33. clawbench/data/test-cases/015-daily-life-housing-craigslist/task.json +30 -0
  34. clawbench/data/test-cases/035-daily-life-health-medical-betterhelp/task.json +25 -0
  35. clawbench/data/test-cases/041-daily-life-pets-rover/task.json +25 -0
  36. clawbench/data/test-cases/043-daily-life-pets-rover/extra_info/pet_info.json +12 -0
  37. clawbench/data/test-cases/043-daily-life-pets-rover/task.json +30 -0
  38. clawbench/data/test-cases/045-daily-life-personal-care-booksy/task.json +25 -0
  39. clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/extra_info/address_info.json +7 -0
  40. clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/task.json +30 -0
  41. clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/extra_info/job_links.json +5 -0
  42. clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/task.json +30 -0
  43. clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/extra_info/job_links.json +5 -0
  44. clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/task.json +30 -0
  45. clawbench/data/test-cases/091-job-search-hr-job-apply-indeed/task.json +25 -0
  46. clawbench/data/test-cases/120-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  47. clawbench/data/test-cases/121-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  48. clawbench/data/test-cases/128-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  49. clawbench/data/test-cases/134-office-secretary-tasks-calendar-calendly/task.json +25 -0
  50. clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/extra_info/meeting_details.json +30 -0
  51. clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/task.json +30 -0
  52. clawbench/data/test-cases/139-office-secretary-tasks-calendar-calendly/task.json +25 -0
  53. clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/extra_info/task_list.json +29 -0
  54. clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/task.json +30 -0
  55. clawbench/data/test-cases/179-dev-tech-github-ops-github/extra_info/config.json +13 -0
  56. clawbench/data/test-cases/179-dev-tech-github-ops-github/task.json +30 -0
  57. clawbench/data/test-cases/180-dev-tech-github-ops-github/task.json +25 -0
  58. clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/extra_info/raw_results.json +47 -0
  59. clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/task.json +30 -0
  60. clawbench/data/test-cases/242-academia-research-research-tools-overleaf/task.json +25 -0
  61. clawbench/data/test-cases/246-academia-research-research-tools-zotero/task.json +25 -0
  62. clawbench/data/test-cases/247-academia-research-research-tools-semantic-scholar/task.json +25 -0
  63. clawbench/data/test-cases/265-education-learning-general-coursera/task.json +25 -0
  64. clawbench/data/test-cases/266-education-learning-general-leetcode/extra_info/solution_code.py +9 -0
  65. clawbench/data/test-cases/266-education-learning-general-leetcode/task.json +30 -0
  66. clawbench/data/test-cases/273-education-learning-general-edx/task.json +25 -0
  67. clawbench/data/test-cases/274-education-learning-general-udemy/task.json +25 -0
  68. clawbench/data/test-cases/279-travel-general-airbnb/task.json +25 -0
  69. clawbench/data/test-cases/280-travel-general-booking-com/task.json +25 -0
  70. clawbench/data/test-cases/363-entertainment-hobbies-general-ticketmaster/task.json +25 -0
  71. clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/extra_info/book_list.json +14 -0
  72. clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/task.json +30 -0
  73. clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/extra_info/event_details.json +10 -0
  74. clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/task.json +30 -0
  75. clawbench/data/test-cases/403-personal-management-account-security-1password-web/extra_info/credentials.json +34 -0
  76. clawbench/data/test-cases/403-personal-management-account-security-1password-web/task.json +30 -0
  77. clawbench/data/test-cases/413-personal-management-personal-tools-todoist/extra_info/task_list.json +52 -0
  78. clawbench/data/test-cases/413-personal-management-personal-tools-todoist/task.json +30 -0
  79. clawbench/data/test-cases/468-rating-voting-general-glassdoor/extra_info/interview_experience.json +10 -0
  80. clawbench/data/test-cases/468-rating-voting-general-glassdoor/task.json +30 -0
  81. clawbench/data/test-cases/469-rating-voting-general-tripadvisor/extra_info/review_content.json +6 -0
  82. clawbench/data/test-cases/469-rating-voting-general-tripadvisor/task.json +30 -0
  83. clawbench/data/test-cases/470-rating-voting-general-trustpilot/extra_info/review_content.json +6 -0
  84. clawbench/data/test-cases/470-rating-voting-general-trustpilot/task.json +30 -0
  85. clawbench/data/test-cases/474-rating-voting-general-capterra/task.json +25 -0
  86. clawbench/data/test-cases/475-rating-voting-general-g2/task.json +25 -0
  87. clawbench/data/test-cases/482-creation-init-general-confluence/extra_info/content.json +3 -0
  88. clawbench/data/test-cases/482-creation-init-general-confluence/task.json +30 -0
  89. clawbench/data/test-cases/483-creation-init-general-airtable/task.json +25 -0
  90. clawbench/data/test-cases/484-creation-init-general-clickup/task.json +28 -0
  91. clawbench/data/test-cases/485-creation-init-general-webflow/task.json +25 -0
  92. clawbench/data/test-cases/486-creation-init-general-mailchimp/extra_info/content.json +3 -0
  93. clawbench/data/test-cases/486-creation-init-general-mailchimp/task.json +30 -0
  94. clawbench/data/test-cases/487-creation-init-general-typeform/extra_info/survey_questions.json +85 -0
  95. clawbench/data/test-cases/487-creation-init-general-typeform/task.json +30 -0
  96. clawbench/data/test-cases/488-creation-init-general-substack/extra_info/content.json +3 -0
  97. clawbench/data/test-cases/488-creation-init-general-substack/task.json +30 -0
  98. clawbench/data/test-cases/489-creation-init-general-ghost/extra_info/content.json +3 -0
  99. clawbench/data/test-cases/489-creation-init-general-ghost/task.json +30 -0
  100. clawbench/data/test-cases/501-creation-init-general-asana/extra_info/project_description.json +8 -0
  101. clawbench/data/test-cases/501-creation-init-general-asana/task.json +33 -0
  102. clawbench/data/test-cases/529-daily-life-shopping-delivery-king-arthur-baking/task.json +25 -0
  103. clawbench/data/test-cases/533-daily-life-utilities-inmyarea/task.json +25 -0
  104. clawbench/data/test-cases/535-daily-life-home-home-depot/task.json +25 -0
  105. clawbench/data/test-cases/537-daily-life-food-crumbl/task.json +25 -0
  106. clawbench/data/test-cases/539-daily-life-health-jefit/task.json +25 -0
  107. clawbench/data/test-cases/542-daily-life-pets-wag/task.json +25 -0
  108. clawbench/data/test-cases/551-finance-investment-crypto-wallet-trezor/task.json +25 -0
  109. clawbench/data/test-cases/552-finance-investment-business-payment-plooto/task.json +25 -0
  110. clawbench/data/test-cases/555-finance-investment-insurance-insureon/task.json +25 -0
  111. clawbench/data/test-cases/559-finance-investment-crowdfunding-frontfundr/task.json +25 -0
  112. clawbench/data/test-cases/564-daily-life-event-registration-race-roster/task.json +25 -0
  113. clawbench/data/test-cases/565-job-search-hr-job-search-jopwell/task.json +25 -0
  114. clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/extra_info/listing_details.json +26 -0
  115. clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/task.json +30 -0
  116. clawbench/data/test-cases/569-job-search-hr-job-search-careerbuilder/task.json +25 -0
  117. clawbench/data/test-cases/570-job-search-hr-job-search-hired/task.json +25 -0
  118. clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/extra_info/listing_details.json +26 -0
  119. clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/task.json +30 -0
  120. clawbench/data/test-cases/576-office-secretary-tasks-reports-ftc-reportfraud/task.json +25 -0
  121. clawbench/data/test-cases/583-office-secretary-tasks-support-tickets-freshdesk/task.json +25 -0
  122. clawbench/data/test-cases/598-academia-research-legal-docs-formswift/task.json +25 -0
  123. clawbench/data/test-cases/606-education-learning-kids-courses-outschool/task.json +25 -0
  124. clawbench/data/test-cases/607-education-learning-art-courses-creativebug/task.json +25 -0
  125. clawbench/data/test-cases/609-education-learning-meditation-spirit-rock-meditation-center/task.json +25 -0
  126. clawbench/data/test-cases/615-travel-flights-spirit-airlines/task.json +25 -0
  127. clawbench/data/test-cases/618-travel-train-bus-12go-asia/task.json +25 -0
  128. clawbench/data/test-cases/625-travel-camping-outdoor-parks-canada-reservations/task.json +25 -0
  129. clawbench/data/test-cases/626-travel-bus-flixbus/task.json +25 -0
  130. clawbench/data/test-cases/627-travel-flights-momondo/task.json +25 -0
  131. clawbench/data/test-cases/632-shopping-commerce-beauty-care-olaplex/task.json +25 -0
  132. clawbench/data/test-cases/634-shopping-commerce-apparel-dooney-bourke/task.json +25 -0
  133. clawbench/data/test-cases/635-shopping-commerce-gifts-uncommon-goods/task.json +25 -0
  134. clawbench/data/test-cases/636-shopping-commerce-auto-parts-rockauto/task.json +25 -0
  135. clawbench/data/test-cases/638-shopping-commerce-print-custom-vistaprint/task.json +25 -0
  136. clawbench/data/test-cases/639-shopping-commerce-luxury-mansur-gavriel/task.json +25 -0
  137. clawbench/data/test-cases/671-entertainment-gaming-humble-bundle/task.json +25 -0
  138. clawbench/data/test-cases/672-entertainment-hobbies-anime-streaming-crunchyroll/task.json +25 -0
  139. clawbench/data/test-cases/674-entertainment-hobbies-masterclass-masterclass/task.json +25 -0
  140. clawbench/data/test-cases/676-government-civic-legal-docs-legalnature/task.json +25 -0
  141. clawbench/data/test-cases/685-personal-management-budget-mgmt-everydollar/task.json +25 -0
  142. clawbench/data/test-cases/687-personal-management-vpn-subscription-ipvanish/task.json +25 -0
  143. clawbench/data/test-cases/688-personal-management-insurance-compare-insurify/task.json +25 -0
  144. clawbench/data/test-cases/695-automation-workflows-recurring-order-stumptown-coffee/task.json +25 -0
  145. clawbench/data/test-cases/697-automation-workflows-recurring-order-bean-box/task.json +25 -0
  146. clawbench/data/test-cases/699-automation-workflows-recurring-order-mistobox/task.json +25 -0
  147. clawbench/data/test-cases/700-deletion-revocation-data-deletion-deleteme/task.json +25 -0
  148. clawbench/data/test-cases/705-rating-voting-wine-review-vivino/task.json +25 -0
  149. clawbench/data/test-cases/706-rating-voting-beer-review-beeradvocate/task.json +25 -0
  150. clawbench/data/test-cases/707-rating-voting-social-wine-untappd/task.json +25 -0
  151. clawbench/data/test-cases/708-rating-voting-professor-review-ratemyprofessors/task.json +28 -0
  152. clawbench/data/test-cases/709-rating-voting-service-review-angi/task.json +25 -0
  153. clawbench/data/test-cases/710-creation-init-interior-design-roomsketcher/task.json +25 -0
  154. clawbench/data/test-cases/711-creation-init-color-design-coolors/task.json +25 -0
  155. clawbench/data/test-cases/712-creation-init-website-create-squarespace/task.json +25 -0
  156. clawbench/data/test-cases/713-creation-init-website-build-wix/task.json +25 -0
  157. clawbench/data/test-cases/735-home-services-maintenance-house-cleaning-bark/task.json +25 -0
  158. clawbench/data/test-cases/736-home-services-maintenance-plumbing-ace-hardware/task.json +25 -0
  159. clawbench/data/test-cases/737-home-services-maintenance-kitchen-remodel-lowes/task.json +25 -0
  160. clawbench/data/test-cases/738-home-services-maintenance-equipment-install-amazon-home-services/task.json +25 -0
  161. clawbench/data/test-cases/750-automotive-vehicle-services-car-insurance-compare-kanetix/task.json +25 -0
  162. clawbench/data/test-cases/751-automotive-vehicle-services-car-lease-sixt/task.json +25 -0
  163. clawbench/data/test-cases/754-automotive-vehicle-services-used-car-listing-autotrader/task.json +25 -0
  164. clawbench/data/test-cases/763-automotive-vehicle-services-car-lease-autoslash/task.json +25 -0
  165. clawbench/data/test-cases/766-nonprofit-charity-donation-doctors-without-borders-msf/task.json +25 -0
  166. clawbench/data/test-cases/768-nonprofit-charity-community-crowdfund-ioby/task.json +25 -0
  167. clawbench/data/test-cases/770-nonprofit-charity-volunteer-apply-on-make-a-wish-foundation-website-complete-and-submit-a-volunteer-application-form-selecting-the-wish-granter-role-and-entering-city-phoenix-az/task.json +25 -0
  168. clawbench/data/test-cases/774-nonprofit-charity-nonprofit-job-apply-charity-village/task.json +25 -0
  169. clawbench/data/test-cases/776-nonprofit-charity-volunteer-signup-idealist/task.json +25 -0
  170. clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/extra_info/payment_info.json +3 -0
  171. clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/task.json +30 -0
  172. clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/extra_info/address_info.json +4 -0
  173. clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/task.json +30 -0
  174. clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/extra_info/email_info.json +3 -0
  175. clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/task.json +30 -0
  176. clawbench/data/test-cases/782-beauty-personal-care-skincare-purchase-paulas-choice/task.json +24 -0
  177. clawbench/data/test-cases/783-beauty-personal-care-beauty-booking-ulta-beauty/task.json +24 -0
  178. clawbench/data/test-cases/785-beauty-personal-care-skincare-curology/task.json +25 -0
  179. clawbench/data/test-cases/788-beauty-personal-care-makeup-the-ordinary/task.json +25 -0
  180. clawbench/data/test-cases/789-beauty-personal-care-makeup-fenty-beauty/task.json +25 -0
  181. clawbench/data/test-cases/793-beauty-personal-care-beauty-retail-mac-cosmetics/task.json +25 -0
  182. clawbench/data/test-cases/794-beauty-personal-care-salon-booking-styleseat/task.json +25 -0
  183. clawbench/data/test-cases/795-pet-animal-care-pet-adoption-aspca/task.json +25 -0
  184. clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/extra_info/pet_info.json +12 -0
  185. clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/task.json +30 -0
  186. clawbench/data/test-cases/799-pet-animal-care-pet-insurance-aspca-pet-health-insurance/task.json +25 -0
  187. clawbench/data/test-cases/801-pet-animal-care-pet-friendly-travel-bringfido/task.json +25 -0
  188. clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/extra_info/pet_info.json +12 -0
  189. clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/task.json +30 -0
  190. clawbench/data/test-cases/807-pet-animal-care-pet-dna-embark/task.json +25 -0
  191. clawbench/data/test-cases/809-pet-animal-care-pet-adopt-petfinder/task.json +28 -0
  192. clawbench/data/test-cases/812-pet-animal-care-pet-subscription-ollie/task.json +25 -0
  193. clawbench/data/test-cases/815-personal-management-records-mgmt-myheritage/task.json +25 -0
  194. clawbench/data/test-cases/821-education-learning-reading-self-study-blinkist/task.json +25 -0
  195. clawbench/data/test-cases/861-entertainment-hobbies-movies-cineplex/task.json +25 -0
  196. clawbench/data/test-cases/862-entertainment-hobbies-movies-amc-theatres/task.json +25 -0
  197. clawbench/data/test-cases/864-entertainment-hobbies-show-tickets-ticketmaster/task.json +25 -0
  198. clawbench/data/test-cases/865-travel-outdoor-hipcamp/task.json +25 -0
  199. clawbench/data/test-cases/867-entertainment-hobbies-movies-fandango/task.json +25 -0
  200. clawbench/data/test-cases/872-daily-life-food-opentable/task.json +25 -0
  201. clawbench/data/test-cases/873-daily-life-food-resy/task.json +28 -0
  202. clawbench/data/test-cases/876-entertainment-hobbies-show-tickets-vivid-seats/task.json +25 -0
  203. clawbench/data/test-cases/877-entertainment-hobbies-show-tickets-stubhub/task.json +25 -0
  204. clawbench/data/test-cases/878-travel-outdoor-ontario-parks/task.json +25 -0
  205. clawbench/data/test-cases/883-education-learning-hobby-class-sur-la-table/task.json +25 -0
  206. clawbench/data/test-cases/884-entertainment-hobbies-experience-breakout-games/task.json +25 -0
  207. clawbench/data/test-cases/885-entertainment-hobbies-experience-bowlero/task.json +25 -0
  208. clawbench/data/test-cases/886-entertainment-hobbies-experience-topgolf/task.json +25 -0
  209. clawbench/data/test-cases/lite.json +226 -0
  210. clawbench/data/test-cases/lite.schema.json +105 -0
  211. clawbench/data/test-cases/task.schema.json +132 -0
  212. clawbench/data/tools/build_clawbench_lite_enc.py +161 -0
  213. clawbench/doctor.py +171 -0
  214. clawbench/engine.py +180 -0
  215. clawbench/generate_resume_pdf.py +140 -0
  216. clawbench/hf_upload.py +78 -0
  217. clawbench/image.py +127 -0
  218. clawbench/paths.py +150 -0
  219. clawbench/resume_template.json +104 -0
  220. clawbench/run.py +942 -0
  221. clawbench/tui.py +1401 -0
  222. clawbench_cli-0.1.2.dist-info/METADATA +770 -0
  223. clawbench_cli-0.1.2.dist-info/RECORD +226 -0
  224. clawbench_cli-0.1.2.dist-info/WHEEL +4 -0
  225. clawbench_cli-0.1.2.dist-info/entry_points.txt +4 -0
  226. clawbench_cli-0.1.2.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,25 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 877,
5
+ "metaclass": "entertainment-hobbies",
6
+ "class": "show-tickets",
7
+ "description": "On StubHub, buy 2 NHL Toronto Maple Leafs home game tickets at Scotiabank Arena, select seats in sections 118–122 (behind the goal), rows 10 or closer",
8
+ "sites_involved": [
9
+ "stubhub.ca"
10
+ ],
11
+ "platform": "stubhub",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "On StubHub, buy 2 NHL Toronto Maple Leafs home game tickets at Scotiabank Arena, select seats in sections 118–122 (behind the goal), rows 10 or closer",
19
+ "eval_schema": {
20
+ "url_pattern": "__PLACEHOLDER_WILL_NOT_MATCH__",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": []
25
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 878,
5
+ "metaclass": "travel",
6
+ "class": "outdoor",
7
+ "description": "On Ontario Parks, reserve a waterfront campsite at Algonquin Provincial Park (Canisbay Lake campground) for 3 nights, arriving the upcoming Friday, filter for electric hookup sites",
8
+ "sites_involved": [
9
+ "reservations.ontarioparks.ca"
10
+ ],
11
+ "platform": "ontario-parks",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "On Ontario Parks, reserve a waterfront campsite at Algonquin Provincial Park (Canisbay Lake campground) for 3 nights, arriving the upcoming Friday, filter for electric hookup sites",
19
+ "eval_schema": {
20
+ "url_pattern": "__PLACEHOLDER_WILL_NOT_MATCH__",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": []
25
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 883,
5
+ "metaclass": "education-learning",
6
+ "class": "hobby-class",
7
+ "description": "On Sur La Table, register and book an upcoming weekend afternoon in-store cooking class (pasta making) at the closest location to downtown San Francisco for 1 person",
8
+ "sites_involved": [
9
+ "surlatable.com"
10
+ ],
11
+ "platform": "sur-la-table",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "On Sur La Table, register and book an upcoming weekend afternoon in-store cooking class (pasta making) at the closest location to downtown San Francisco for 1 person",
19
+ "eval_schema": {
20
+ "url_pattern": "__PLACEHOLDER_WILL_NOT_MATCH__",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": []
25
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 884,
5
+ "metaclass": "entertainment-hobbies",
6
+ "class": "experience",
7
+ "description": "On Breakout Games, book an escape room for 4 people at the nearest location to downtown Toronto, select a horror/thriller themed room, the upcoming Saturday evening time slot between 7–9 PM",
8
+ "sites_involved": [
9
+ "breakoutgames.com"
10
+ ],
11
+ "platform": "breakout-games",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "On Breakout Games, book an escape room for 4 people at the nearest location to downtown Toronto, select a horror/thriller themed room, the upcoming Saturday evening time slot between 7–9 PM",
19
+ "eval_schema": {
20
+ "url_pattern": "__PLACEHOLDER_WILL_NOT_MATCH__",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": []
25
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 885,
5
+ "metaclass": "entertainment-hobbies",
6
+ "class": "experience",
7
+ "description": "On Bowlero, reserve a bowling lane for 6 people at the nearest location to Midtown Manhattan, the upcoming Friday night at 8 PM, add the 2-hour unlimited bowling package",
8
+ "sites_involved": [
9
+ "bowlero.com"
10
+ ],
11
+ "platform": "bowlero",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "On Bowlero, reserve a bowling lane for 6 people at the nearest location to Midtown Manhattan, the upcoming Friday night at 8 PM, add the 2-hour unlimited bowling package",
19
+ "eval_schema": {
20
+ "url_pattern": "__PLACEHOLDER_WILL_NOT_MATCH__",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": []
25
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 886,
5
+ "metaclass": "entertainment-hobbies",
6
+ "class": "experience",
7
+ "description": "On Topgolf, book a bay for 4 players at the Topgolf nearest to downtown Toronto, the upcoming Saturday afternoon between 2–4 PM, select the longest available session",
8
+ "sites_involved": [
9
+ "topgolf.com"
10
+ ],
11
+ "platform": "topgolf",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "On Topgolf, book a bay for 4 players at the Topgolf nearest to downtown Toronto, the upcoming Saturday afternoon between 2–4 PM, select the longest available session",
19
+ "eval_schema": {
20
+ "url_pattern": "__PLACEHOLDER_WILL_NOT_MATCH__",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": []
25
+ }
@@ -0,0 +1,226 @@
1
+ {
2
+ "$schema": "./lite.schema.json",
3
+ "version": "0.1",
4
+ "created": "2026-04-12",
5
+ "rubric_max": 12,
6
+ "total_tasks": 20,
7
+ "notes": "ClawBench-Lite v0.1 — a 20-task curated subset of the 153-task ClawBench. Designed to match the 20-tasks-per-source convention of browser-use/benchmark (https://github.com/browser-use/benchmark) so the subset can eventually be upstreamed as a 6th source alongside WebBench, Mind2Web 2, GAIA, BrowseComp, and Custom. All Lite tasks are judged by eval/agentic_eval.md regardless of whether eval_schema.url_pattern is a concrete regex or __PLACEHOLDER_WILL_NOT_MATCH__ — tasks were selected on task quality / popularity / real-world relevance, not on url_pattern shape. Selection rubric: 4 axes × 3 points each (site popularity, task realism, difficulty, category ambassador), max 12. Binary gates: accessibility (no hard CAPTCHA / geo-lock / mandatory-phone-step-1 / payment wall before satisfiable) and soft novelty (not a clear duplicate of existing benchmark coverage at the same site). All 20 selections use distinct domains for site-level diversity. Swap history during validation: (1) shopping-commerce pool lacked household-name retailers so the slot was swapped to daily-life/housing/zillow 011; (2) zillow 011 was then dropped after recorded runs showed PerimeterX CAPTCHA walls on 4/4 models in batch-20260327 (borderline accessibility-gate violation), and the slot was reallocated to daily-life/food/instacart 007 — housing is therefore no longer a represented vertical in Lite v0.1.",
8
+ "skipped_categories": [
9
+ "nonprofit-charity",
10
+ "home-services-maintenance",
11
+ "finance-investment",
12
+ "automotive-vehicle-services",
13
+ "automation-workflows",
14
+ "government-civic",
15
+ "deletion-revocation",
16
+ "shopping-commerce",
17
+ "daily-life-housing"
18
+ ],
19
+ "tier_distribution": {
20
+ "flagship": 9,
21
+ "core": 8,
22
+ "wildcard": 3
23
+ },
24
+ "tasks": [
25
+ {
26
+ "task_id": 872,
27
+ "dir": "872-daily-life-food-opentable",
28
+ "tier": "flagship",
29
+ "metaclass": "daily-life",
30
+ "platform": "opentable",
31
+ "url_pattern_concrete": true,
32
+ "scores": { "popularity": 3, "realism": 3, "difficulty": 2, "ambassador": 3, "total": 11 },
33
+ "rationale": "OpenTable is the household-name leader for restaurant reservations; dining-out booking is canonical daily-life behavior with multi-step restaurant filtering and seating selection, and the eval_schema has a concrete url_pattern."
34
+ },
35
+ {
36
+ "task_id": 2,
37
+ "dir": "002-daily-life-food-doordash",
38
+ "tier": "flagship",
39
+ "metaclass": "daily-life",
40
+ "platform": "doordash",
41
+ "url_pattern_concrete": false,
42
+ "scores": { "popularity": 3, "realism": 3, "difficulty": 1, "ambassador": 3, "total": 10 },
43
+ "rationale": "DoorDash is the canonical food-delivery flagship on weekly real-world use; simpler flow than OpenTable but an irreplaceable everyday-life ambassador."
44
+ },
45
+ {
46
+ "task_id": 47,
47
+ "dir": "047-daily-life-personal-care-taskrabbit",
48
+ "tier": "flagship",
49
+ "metaclass": "daily-life",
50
+ "platform": "taskrabbit",
51
+ "url_pattern_concrete": true,
52
+ "scores": { "popularity": 2, "realism": 3, "difficulty": 2, "ambassador": 3, "total": 10 },
53
+ "rationale": "TaskRabbit is the vertical leader for gig-services marketplace; booking a helper is a multi-step realistic flow with date/time/service selection, plus concrete url_pattern and extra_info support."
54
+ },
55
+ {
56
+ "task_id": 372,
57
+ "dir": "372-entertainment-hobbies-general-eventbrite",
58
+ "tier": "flagship",
59
+ "metaclass": "entertainment-hobbies",
60
+ "platform": "eventbrite",
61
+ "url_pattern_concrete": true,
62
+ "scores": { "popularity": 3, "realism": 3, "difficulty": 2, "ambassador": 3, "total": 11 },
63
+ "rationale": "Eventbrite is the household-name event platform; event creation is a canonical creator-side task with multi-field form, extra_info support, and concrete url_pattern — a rare combination of popularity and eval quality."
64
+ },
65
+ {
66
+ "task_id": 369,
67
+ "dir": "369-entertainment-hobbies-general-goodreads",
68
+ "tier": "flagship",
69
+ "metaclass": "entertainment-hobbies",
70
+ "platform": "goodreads",
71
+ "url_pattern_concrete": true,
72
+ "scores": { "popularity": 3, "realism": 2, "difficulty": 2, "ambassador": 3, "total": 10 },
73
+ "rationale": "Goodreads is the household-name reading community; shelf/list creation is realistic hobby curation, with concrete url_pattern and extra_info — distinct vertical from Eventbrite and Fandango."
74
+ },
75
+ {
76
+ "task_id": 867,
77
+ "dir": "867-entertainment-hobbies-movies-fandango",
78
+ "tier": "flagship",
79
+ "metaclass": "entertainment-hobbies",
80
+ "platform": "fandango",
81
+ "url_pattern_concrete": false,
82
+ "scores": { "popularity": 3, "realism": 3, "difficulty": 2, "ambassador": 3, "total": 11 },
83
+ "rationale": "Fandango is the household-name movie-ticket flagship; buying a ticket with theater, showtime, and seat selection is canonical cinema behavior and a distinct entertainment vertical from event creation and reading lists."
84
+ },
85
+ {
86
+ "task_id": 501,
87
+ "dir": "501-creation-init-general-asana",
88
+ "tier": "flagship",
89
+ "metaclass": "creation-init",
90
+ "platform": "asana",
91
+ "url_pattern_concrete": true,
92
+ "scores": { "popularity": 2, "realism": 3, "difficulty": 2, "ambassador": 3, "total": 10 },
93
+ "rationale": "Asana is a B2B SaaS leader for project management; creating a portfolio with three sub-projects is a canonical multi-object creation task, with concrete url_pattern and extra_info."
94
+ },
95
+ {
96
+ "task_id": 486,
97
+ "dir": "486-creation-init-general-mailchimp",
98
+ "tier": "flagship",
99
+ "metaclass": "creation-init",
100
+ "platform": "mailchimp",
101
+ "url_pattern_concrete": true,
102
+ "scores": { "popularity": 3, "realism": 3, "difficulty": 2, "ambassador": 2, "total": 10 },
103
+ "rationale": "Mailchimp is the household-name email-marketing leader; authoring a campaign exercises audience selection, content editing, and concrete url_pattern evaluation — a distinct creation vertical from project tooling and website builders."
104
+ },
105
+ {
106
+ "task_id": 712,
107
+ "dir": "712-creation-init-website-create-squarespace",
108
+ "tier": "flagship",
109
+ "metaclass": "creation-init",
110
+ "platform": "squarespace",
111
+ "url_pattern_concrete": true,
112
+ "scores": { "popularity": 3, "realism": 3, "difficulty": 2, "ambassador": 3, "total": 11 },
113
+ "rationale": "Squarespace is the household-name website builder; creating a portfolio site is a canonical self-publishing task with meaningful UI depth and concrete url_pattern."
114
+ },
115
+ {
116
+ "task_id": 142,
117
+ "dir": "142-office-secretary-tasks-collab-trello",
118
+ "tier": "core",
119
+ "metaclass": "office-secretary-tasks",
120
+ "platform": "trello",
121
+ "url_pattern_concrete": true,
122
+ "scores": { "popularity": 2, "realism": 2, "difficulty": 2, "ambassador": 3, "total": 9 },
123
+ "rationale": "Trello is the vertical leader for Kanban collaboration; creating a board with lists and cards is canonical office-collaboration behavior with concrete url_pattern — strongest office-secretary task that is not email-client-specific."
124
+ },
125
+ {
126
+ "task_id": 469,
127
+ "dir": "469-rating-voting-general-tripadvisor",
128
+ "tier": "core",
129
+ "metaclass": "rating-voting",
130
+ "platform": "tripadvisor",
131
+ "url_pattern_concrete": true,
132
+ "scores": { "popularity": 3, "realism": 3, "difficulty": 2, "ambassador": 3, "total": 11 },
133
+ "rationale": "TripAdvisor is the household-name global travel-review platform; writing a restaurant review is canonical rating behavior with concrete url_pattern — a natural ambassador for user-generated-content tasks."
134
+ },
135
+ {
136
+ "task_id": 266,
137
+ "dir": "266-education-learning-general-leetcode",
138
+ "tier": "core",
139
+ "metaclass": "education-learning",
140
+ "platform": "leetcode",
141
+ "url_pattern_concrete": true,
142
+ "scores": { "popularity": 3, "realism": 3, "difficulty": 3, "ambassador": 3, "total": 12 },
143
+ "rationale": "LeetCode is the canonical coding-education platform and one of the highest-difficulty ClawBench tasks — opening a problem, writing a solution, and submitting exercises 5+ steps of dynamic UI plus code editing. One of three 12/12 tasks in Lite v0.1, alongside Airbnb and Instacart."
144
+ },
145
+ {
146
+ "task_id": 279,
147
+ "dir": "279-travel-general-airbnb",
148
+ "tier": "core",
149
+ "metaclass": "travel",
150
+ "platform": "airbnb",
151
+ "url_pattern_concrete": false,
152
+ "scores": { "popularity": 3, "realism": 3, "difficulty": 3, "ambassador": 3, "total": 12 },
153
+ "rationale": "Airbnb is the household-name home-rental platform; searching for an apartment by city/date/guests, selecting a listing, and driving to the Confirm-and-Pay page with all fields filled is one of the deepest multi-step flows in ClawBench. Chosen over Momondo (flight search) because Airbnb is a global household name and end-to-end booking is a distinctive ClawBench contribution vs existing benchmark coverage."
154
+ },
155
+ {
156
+ "task_id": 91,
157
+ "dir": "091-job-search-hr-job-apply-indeed",
158
+ "tier": "core",
159
+ "metaclass": "job-search-hr",
160
+ "platform": "indeed",
161
+ "url_pattern_concrete": true,
162
+ "scores": { "popularity": 3, "realism": 3, "difficulty": 2, "ambassador": 3, "total": 11 },
163
+ "rationale": "Indeed is the global household-name job board; applying to a posting is canonical job-seeker behavior and the strongest job-search-hr ambassador, with concrete url_pattern."
164
+ },
165
+ {
166
+ "task_id": 783,
167
+ "dir": "783-beauty-personal-care-beauty-booking-ulta-beauty",
168
+ "tier": "core",
169
+ "metaclass": "beauty-personal-care",
170
+ "platform": "ulta-beauty",
171
+ "url_pattern_concrete": false,
172
+ "scores": { "popularity": 2, "realism": 3, "difficulty": 2, "ambassador": 3, "total": 10 },
173
+ "rationale": "Ulta Beauty is the vertical leader for beauty retail and services; booking a salon appointment exercises location/stylist/time selection — a realistic multi-step everyday task that represents the beauty vertical without being purely e-commerce."
174
+ },
175
+ {
176
+ "task_id": 7,
177
+ "dir": "007-daily-life-food-instacart",
178
+ "tier": "core",
179
+ "metaclass": "daily-life",
180
+ "platform": "instacart",
181
+ "url_pattern_concrete": false,
182
+ "scores": { "popularity": 3, "realism": 3, "difficulty": 3, "ambassador": 3, "total": 12 },
183
+ "rationale": "Instacart is the household-name leader for grocery delivery in North America; purchasing all ingredients for a multi-day meal plan with recipes, auto-selected nearest store, and checkout is a 5+ step weekly real-world flow with meaningful UI depth plus extra_info (meal_plan.json) for cross-recipe ingredient matching. Occupies a distinct food vertical from DoorDash restaurant delivery. Placeholder url_pattern is acceptable under all-agentic eval. Swapped in after the planned Zillow 011 housing pick showed PerimeterX CAPTCHA walls across 4/4 models in the 03-27 recorded batch, borderline-violating the accessibility gate."
184
+ },
185
+ {
186
+ "task_id": 809,
187
+ "dir": "809-pet-animal-care-pet-adopt-petfinder",
188
+ "tier": "core",
189
+ "metaclass": "pet-animal-care",
190
+ "platform": "petfinder",
191
+ "url_pattern_concrete": true,
192
+ "scores": { "popularity": 2, "realism": 2, "difficulty": 2, "ambassador": 3, "total": 9 },
193
+ "rationale": "Petfinder is the vertical leader for pet adoption in the US; submitting an adoption inquiry is the canonical pet-animal-care task and uses a concrete GraphQL url_pattern."
194
+ },
195
+ {
196
+ "task_id": 179,
197
+ "dir": "179-dev-tech-github-ops-github",
198
+ "tier": "wildcard",
199
+ "metaclass": "dev-tech",
200
+ "platform": "github",
201
+ "url_pattern_concrete": true,
202
+ "scores": { "popularity": 3, "realism": 3, "difficulty": 2, "ambassador": 3, "total": 11 },
203
+ "rationale": "GitHub is the household-name code-hosting flagship; creating a repository with README + LICENSE + .gitignore is the canonical dev onboarding flow. Wildcard slot: dev-tech has only 2 tasks but GitHub is a first-tier brand. Concrete url_pattern (POST github.com/repositories). No other source benchmark in browser-use/benchmark covers this, so it is a distinctive ClawBench contribution."
204
+ },
205
+ {
206
+ "task_id": 215,
207
+ "dir": "215-academia-research-paper-tables-overleaf",
208
+ "tier": "wildcard",
209
+ "metaclass": "academia-research",
210
+ "platform": "overleaf",
211
+ "url_pattern_concrete": true,
212
+ "scores": { "popularity": 2, "realism": 3, "difficulty": 3, "ambassador": 3, "total": 11 },
213
+ "rationale": "Overleaf is the vertical leader for collaborative LaTeX; creating a project, writing a booktabs table, and compiling exercises multi-step code-editing UI — one of the deepest tasks in the entire benchmark, with concrete url_pattern and extra_info."
214
+ },
215
+ {
216
+ "task_id": 403,
217
+ "dir": "403-personal-management-account-security-1password-web",
218
+ "tier": "wildcard",
219
+ "metaclass": "personal-management",
220
+ "platform": "1password-web",
221
+ "url_pattern_concrete": true,
222
+ "scores": { "popularity": 3, "realism": 3, "difficulty": 2, "ambassador": 3, "total": 11 },
223
+ "rationale": "1Password is the household-name password manager; adding five login entries exercises repetitive structured data entry — canonical personal-security behavior with concrete url_pattern and extra_info credentials."
224
+ }
225
+ ]
226
+ }
@@ -0,0 +1,105 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "lite.schema.json",
4
+ "title": "ClawBench-Lite Manifest",
5
+ "description": "Schema for ClawBench-Lite — a curated 20-task subset of ClawBench. The manifest lists selected task_ids without duplicating task data; each entry references a test-cases/<dir>/task.json that must exist.",
6
+ "type": "object",
7
+ "properties": {
8
+ "$schema": true,
9
+ "version": {
10
+ "type": "string",
11
+ "description": "Semantic version of the Lite selection (e.g., \"0.1\"). Bump when tasks are added, removed, or re-scored."
12
+ },
13
+ "created": {
14
+ "type": "string",
15
+ "format": "date",
16
+ "description": "ISO date (YYYY-MM-DD) when this version of the manifest was locked."
17
+ },
18
+ "rubric_max": {
19
+ "type": "integer",
20
+ "description": "Maximum possible rubric total. For v0.1 this is 12 (4 axes × 3 points each).",
21
+ "const": 12
22
+ },
23
+ "total_tasks": {
24
+ "type": "integer",
25
+ "description": "Number of tasks in the Lite subset. Must equal the length of the tasks array.",
26
+ "minimum": 1
27
+ },
28
+ "notes": {
29
+ "type": "string",
30
+ "description": "Free-text rationale and provenance notes for this Lite version."
31
+ },
32
+ "skipped_categories": {
33
+ "type": "array",
34
+ "description": "Metaclasses (or metaclass-class verticals such as \"daily-life-housing\") intentionally excluded from Lite v0.1. Documented here so the omission reads as a curatorial choice, not an oversight.",
35
+ "items": {
36
+ "type": "string"
37
+ }
38
+ },
39
+ "tier_distribution": {
40
+ "type": "object",
41
+ "description": "Per-tier slot allocation — flagship/core/wildcard, summing to total_tasks.",
42
+ "properties": {
43
+ "flagship": { "type": "integer", "minimum": 0 },
44
+ "core": { "type": "integer", "minimum": 0 },
45
+ "wildcard": { "type": "integer", "minimum": 0 }
46
+ },
47
+ "required": ["flagship", "core", "wildcard"],
48
+ "additionalProperties": false
49
+ },
50
+ "tasks": {
51
+ "type": "array",
52
+ "description": "Selected tasks. Each entry references a task.json under test-cases/ and records its score and rationale.",
53
+ "items": {
54
+ "type": "object",
55
+ "properties": {
56
+ "task_id": {
57
+ "type": "integer",
58
+ "description": "Matches metadata.task_id in the referenced task.json"
59
+ },
60
+ "dir": {
61
+ "type": "string",
62
+ "description": "Directory name under test-cases/ (e.g., \"179-dev-tech-github-ops-github\")"
63
+ },
64
+ "tier": {
65
+ "type": "string",
66
+ "enum": ["flagship", "core", "wildcard"],
67
+ "description": "Which tier this task fills"
68
+ },
69
+ "metaclass": {
70
+ "type": "string",
71
+ "description": "Matches metadata.metaclass in the referenced task.json"
72
+ },
73
+ "platform": {
74
+ "type": "string",
75
+ "description": "Short site/brand identifier (e.g., \"github\", \"airbnb\", \"1password-web\")"
76
+ },
77
+ "url_pattern_concrete": {
78
+ "type": "boolean",
79
+ "description": "True if eval_schema.url_pattern is a real regex (machine-scorable); false if it is __PLACEHOLDER_WILL_NOT_MATCH__ (agentic-scoring only). Both are allowed in Lite v0.1 because eval is fully agentic."
80
+ },
81
+ "scores": {
82
+ "type": "object",
83
+ "properties": {
84
+ "popularity": { "type": "integer", "minimum": 0, "maximum": 3 },
85
+ "realism": { "type": "integer", "minimum": 0, "maximum": 3 },
86
+ "difficulty": { "type": "integer", "minimum": 0, "maximum": 3 },
87
+ "ambassador": { "type": "integer", "minimum": 0, "maximum": 3 },
88
+ "total": { "type": "integer", "minimum": 0, "maximum": 12 }
89
+ },
90
+ "required": ["popularity", "realism", "difficulty", "ambassador", "total"],
91
+ "additionalProperties": false
92
+ },
93
+ "rationale": {
94
+ "type": "string",
95
+ "description": "One-sentence justification for inclusion."
96
+ }
97
+ },
98
+ "required": ["task_id", "dir", "tier", "metaclass", "platform", "url_pattern_concrete", "scores", "rationale"],
99
+ "additionalProperties": false
100
+ }
101
+ }
102
+ },
103
+ "required": ["version", "created", "rubric_max", "total_tasks", "notes", "skipped_categories", "tier_distribution", "tasks"],
104
+ "additionalProperties": false
105
+ }
@@ -0,0 +1,132 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "task.schema.json",
4
+ "title": "ClawBench Test Case",
5
+ "description": "Schema for ClawBench test case task.json files",
6
+ "type": "object",
7
+ "properties": {
8
+ "$schema": true,
9
+ "metadata": {
10
+ "type": "object",
11
+ "description": "Human-readable metadata for documentation purposes (not read by the agent)",
12
+ "properties": {
13
+ "task_id": {
14
+ "type": "integer",
15
+ "description": "Unique numeric identifier for the test case"
16
+ },
17
+ "metaclass": {
18
+ "type": "string",
19
+ "description": "High-level category of the test case"
20
+ },
21
+ "class": {
22
+ "type": "string",
23
+ "description": "Granular sub-category of the test case"
24
+ },
25
+ "description": {
26
+ "type": "string",
27
+ "description": "Human-readable description of the test case"
28
+ },
29
+ "sites_involved": {
30
+ "type": "array",
31
+ "description": "Site domains involved in the test case (e.g., google.com, uber.com, github.com, etc.)",
32
+ "items": {
33
+ "type": "string"
34
+ }
35
+ },
36
+ "platform": {
37
+ "type": "string",
38
+ "description": "Platform involved in the test case (e.g., google, uber, github, etc.)"
39
+ },
40
+ "common_info": {
41
+ "type": "object",
42
+ "description": "Common information that is shared among all test cases",
43
+ "properties": {
44
+ "email_credentials": {
45
+ "const": "credentials to use the assigned disposable email account"
46
+ },
47
+ "user_info": {
48
+ "const": "alex_green_personal_info.json; the dummy user's personal information"
49
+ },
50
+ "user_resume": {
51
+ "const": "PDF resume with disposable email account injected"
52
+ }
53
+ },
54
+ "required": ["email_credentials", "user_info", "user_resume"],
55
+ "additionalProperties": false
56
+ }
57
+ },
58
+ "additionalProperties": true,
59
+ "required": [
60
+ "task_id",
61
+ "metaclass",
62
+ "class",
63
+ "description",
64
+ "sites_involved",
65
+ "platform",
66
+ "common_info"
67
+ ]
68
+ },
69
+ "instruction": {
70
+ "type": "string",
71
+ "description": "Task prompt sent to the agent"
72
+ },
73
+ "eval_schema": {
74
+ "type": "object",
75
+ "description": "Configuration for the request interceptor. The interceptor blocks HTTP requests matching the URL pattern, method, and optional body/params filters, preventing irreversible actions (checkout, submission, etc.) from reaching the server.",
76
+ "properties": {
77
+ "url_pattern": {
78
+ "type": "string",
79
+ "description": "Regex pattern the request URL must match to be blocked by the interceptor"
80
+ },
81
+ "method": {
82
+ "type": "string",
83
+ "enum": ["GET", "POST", "PUT", "PATCH", "DELETE"],
84
+ "description": "HTTP method the request must match to be blocked"
85
+ },
86
+ "body": {
87
+ "type": "object",
88
+ "description": "Key-value pairs that must match exactly in the request body. Used to disambiguate when URL + method alone isn't specific enough (e.g., same endpoint for login vs send)."
89
+ },
90
+ "params": {
91
+ "type": "object",
92
+ "description": "Key-value pairs that must match exactly in the URL query parameters. Used to disambiguate when URL + method alone isn't specific enough."
93
+ }
94
+ },
95
+ "required": ["url_pattern", "method"],
96
+ "additionalProperties": false
97
+ },
98
+ "time_limit": {
99
+ "type": "number",
100
+ "description": "Maximum time in minutes before the driver stops the container",
101
+ "minimum": 1
102
+ },
103
+ "extra_info": {
104
+ "type": "array",
105
+ "description": "Additional context injected into the agent prompt",
106
+ "items": {
107
+ "type": "object",
108
+ "properties": {
109
+ "path": {
110
+ "type": "string",
111
+ "description": "Relative path to a file in the test case directory (required for every extra_info entry)"
112
+ },
113
+ "description": {
114
+ "type": "string",
115
+ "description": "Description text injected into the agent prompt"
116
+ }
117
+ },
118
+ "required": [
119
+ "path",
120
+ "description"
121
+ ],
122
+ "additionalProperties": false
123
+ }
124
+ }
125
+ },
126
+ "required": [
127
+ "instruction",
128
+ "eval_schema",
129
+ "time_limit"
130
+ ],
131
+ "additionalProperties": false
132
+ }