clawbench-cli 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (226)
  1. clawbench/__init__.py +35 -0
  2. clawbench/__main__.py +8 -0
  3. clawbench/batch.py +619 -0
  4. clawbench/cli.py +397 -0
  5. clawbench/data/chrome-extension/README.md +127 -0
  6. clawbench/data/chrome-extension/background.js +50 -0
  7. clawbench/data/chrome-extension/content.js +70 -0
  8. clawbench/data/chrome-extension/manifest.json +25 -0
  9. clawbench/data/chrome-extension/setup.sh +27 -0
  10. clawbench/data/chrome-extension/stealth.js +200 -0
  11. clawbench/data/docker/Dockerfile +51 -0
  12. clawbench/data/docker/entrypoint.sh +394 -0
  13. clawbench/data/docker/setup-openclaw.sh +112 -0
  14. clawbench/data/eval/README.md +95 -0
  15. clawbench/data/eval/agentic_eval.md +53 -0
  16. clawbench/data/extension-server/.python-version +1 -0
  17. clawbench/data/extension-server/README.md +54 -0
  18. clawbench/data/extension-server/pyproject.toml +7 -0
  19. clawbench/data/extension-server/server.py +360 -0
  20. clawbench/data/extension-server/uv.lock +644 -0
  21. clawbench/data/models/model.schema.json +44 -0
  22. clawbench/data/models/models.example.yaml +16 -0
  23. clawbench/data/shared/alex_green_personal_info.json +451 -0
  24. clawbench/data/test-cases/001-daily-life-food-uber-eats/task.json +25 -0
  25. clawbench/data/test-cases/002-daily-life-food-doordash/task.json +25 -0
  26. clawbench/data/test-cases/004-daily-life-food-instacart/extra_info/grocery_list.json +36 -0
  27. clawbench/data/test-cases/004-daily-life-food-instacart/task.json +30 -0
  28. clawbench/data/test-cases/006-daily-life-food-uber-eats/task.json +24 -0
  29. clawbench/data/test-cases/007-daily-life-food-instacart/extra_info/meal_plan.json +21 -0
  30. clawbench/data/test-cases/007-daily-life-food-instacart/task.json +30 -0
  31. clawbench/data/test-cases/011-daily-life-housing-zillow/task.json +25 -0
  32. clawbench/data/test-cases/015-daily-life-housing-craigslist/extra_info/listing_details.json +26 -0
  33. clawbench/data/test-cases/015-daily-life-housing-craigslist/task.json +30 -0
  34. clawbench/data/test-cases/035-daily-life-health-medical-betterhelp/task.json +25 -0
  35. clawbench/data/test-cases/041-daily-life-pets-rover/task.json +25 -0
  36. clawbench/data/test-cases/043-daily-life-pets-rover/extra_info/pet_info.json +12 -0
  37. clawbench/data/test-cases/043-daily-life-pets-rover/task.json +30 -0
  38. clawbench/data/test-cases/045-daily-life-personal-care-booksy/task.json +25 -0
  39. clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/extra_info/address_info.json +7 -0
  40. clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/task.json +30 -0
  41. clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/extra_info/job_links.json +5 -0
  42. clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/task.json +30 -0
  43. clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/extra_info/job_links.json +5 -0
  44. clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/task.json +30 -0
  45. clawbench/data/test-cases/091-job-search-hr-job-apply-indeed/task.json +25 -0
  46. clawbench/data/test-cases/120-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  47. clawbench/data/test-cases/121-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  48. clawbench/data/test-cases/128-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  49. clawbench/data/test-cases/134-office-secretary-tasks-calendar-calendly/task.json +25 -0
  50. clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/extra_info/meeting_details.json +30 -0
  51. clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/task.json +30 -0
  52. clawbench/data/test-cases/139-office-secretary-tasks-calendar-calendly/task.json +25 -0
  53. clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/extra_info/task_list.json +29 -0
  54. clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/task.json +30 -0
  55. clawbench/data/test-cases/179-dev-tech-github-ops-github/extra_info/config.json +13 -0
  56. clawbench/data/test-cases/179-dev-tech-github-ops-github/task.json +30 -0
  57. clawbench/data/test-cases/180-dev-tech-github-ops-github/task.json +25 -0
  58. clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/extra_info/raw_results.json +47 -0
  59. clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/task.json +30 -0
  60. clawbench/data/test-cases/242-academia-research-research-tools-overleaf/task.json +25 -0
  61. clawbench/data/test-cases/246-academia-research-research-tools-zotero/task.json +25 -0
  62. clawbench/data/test-cases/247-academia-research-research-tools-semantic-scholar/task.json +25 -0
  63. clawbench/data/test-cases/265-education-learning-general-coursera/task.json +25 -0
  64. clawbench/data/test-cases/266-education-learning-general-leetcode/extra_info/solution_code.py +9 -0
  65. clawbench/data/test-cases/266-education-learning-general-leetcode/task.json +30 -0
  66. clawbench/data/test-cases/273-education-learning-general-edx/task.json +25 -0
  67. clawbench/data/test-cases/274-education-learning-general-udemy/task.json +25 -0
  68. clawbench/data/test-cases/279-travel-general-airbnb/task.json +25 -0
  69. clawbench/data/test-cases/280-travel-general-booking-com/task.json +25 -0
  70. clawbench/data/test-cases/363-entertainment-hobbies-general-ticketmaster/task.json +25 -0
  71. clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/extra_info/book_list.json +14 -0
  72. clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/task.json +30 -0
  73. clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/extra_info/event_details.json +10 -0
  74. clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/task.json +30 -0
  75. clawbench/data/test-cases/403-personal-management-account-security-1password-web/extra_info/credentials.json +34 -0
  76. clawbench/data/test-cases/403-personal-management-account-security-1password-web/task.json +30 -0
  77. clawbench/data/test-cases/413-personal-management-personal-tools-todoist/extra_info/task_list.json +52 -0
  78. clawbench/data/test-cases/413-personal-management-personal-tools-todoist/task.json +30 -0
  79. clawbench/data/test-cases/468-rating-voting-general-glassdoor/extra_info/interview_experience.json +10 -0
  80. clawbench/data/test-cases/468-rating-voting-general-glassdoor/task.json +30 -0
  81. clawbench/data/test-cases/469-rating-voting-general-tripadvisor/extra_info/review_content.json +6 -0
  82. clawbench/data/test-cases/469-rating-voting-general-tripadvisor/task.json +30 -0
  83. clawbench/data/test-cases/470-rating-voting-general-trustpilot/extra_info/review_content.json +6 -0
  84. clawbench/data/test-cases/470-rating-voting-general-trustpilot/task.json +30 -0
  85. clawbench/data/test-cases/474-rating-voting-general-capterra/task.json +25 -0
  86. clawbench/data/test-cases/475-rating-voting-general-g2/task.json +25 -0
  87. clawbench/data/test-cases/482-creation-init-general-confluence/extra_info/content.json +3 -0
  88. clawbench/data/test-cases/482-creation-init-general-confluence/task.json +30 -0
  89. clawbench/data/test-cases/483-creation-init-general-airtable/task.json +25 -0
  90. clawbench/data/test-cases/484-creation-init-general-clickup/task.json +28 -0
  91. clawbench/data/test-cases/485-creation-init-general-webflow/task.json +25 -0
  92. clawbench/data/test-cases/486-creation-init-general-mailchimp/extra_info/content.json +3 -0
  93. clawbench/data/test-cases/486-creation-init-general-mailchimp/task.json +30 -0
  94. clawbench/data/test-cases/487-creation-init-general-typeform/extra_info/survey_questions.json +85 -0
  95. clawbench/data/test-cases/487-creation-init-general-typeform/task.json +30 -0
  96. clawbench/data/test-cases/488-creation-init-general-substack/extra_info/content.json +3 -0
  97. clawbench/data/test-cases/488-creation-init-general-substack/task.json +30 -0
  98. clawbench/data/test-cases/489-creation-init-general-ghost/extra_info/content.json +3 -0
  99. clawbench/data/test-cases/489-creation-init-general-ghost/task.json +30 -0
  100. clawbench/data/test-cases/501-creation-init-general-asana/extra_info/project_description.json +8 -0
  101. clawbench/data/test-cases/501-creation-init-general-asana/task.json +33 -0
  102. clawbench/data/test-cases/529-daily-life-shopping-delivery-king-arthur-baking/task.json +25 -0
  103. clawbench/data/test-cases/533-daily-life-utilities-inmyarea/task.json +25 -0
  104. clawbench/data/test-cases/535-daily-life-home-home-depot/task.json +25 -0
  105. clawbench/data/test-cases/537-daily-life-food-crumbl/task.json +25 -0
  106. clawbench/data/test-cases/539-daily-life-health-jefit/task.json +25 -0
  107. clawbench/data/test-cases/542-daily-life-pets-wag/task.json +25 -0
  108. clawbench/data/test-cases/551-finance-investment-crypto-wallet-trezor/task.json +25 -0
  109. clawbench/data/test-cases/552-finance-investment-business-payment-plooto/task.json +25 -0
  110. clawbench/data/test-cases/555-finance-investment-insurance-insureon/task.json +25 -0
  111. clawbench/data/test-cases/559-finance-investment-crowdfunding-frontfundr/task.json +25 -0
  112. clawbench/data/test-cases/564-daily-life-event-registration-race-roster/task.json +25 -0
  113. clawbench/data/test-cases/565-job-search-hr-job-search-jopwell/task.json +25 -0
  114. clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/extra_info/listing_details.json +26 -0
  115. clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/task.json +30 -0
  116. clawbench/data/test-cases/569-job-search-hr-job-search-careerbuilder/task.json +25 -0
  117. clawbench/data/test-cases/570-job-search-hr-job-search-hired/task.json +25 -0
  118. clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/extra_info/listing_details.json +26 -0
  119. clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/task.json +30 -0
  120. clawbench/data/test-cases/576-office-secretary-tasks-reports-ftc-reportfraud/task.json +25 -0
  121. clawbench/data/test-cases/583-office-secretary-tasks-support-tickets-freshdesk/task.json +25 -0
  122. clawbench/data/test-cases/598-academia-research-legal-docs-formswift/task.json +25 -0
  123. clawbench/data/test-cases/606-education-learning-kids-courses-outschool/task.json +25 -0
  124. clawbench/data/test-cases/607-education-learning-art-courses-creativebug/task.json +25 -0
  125. clawbench/data/test-cases/609-education-learning-meditation-spirit-rock-meditation-center/task.json +25 -0
  126. clawbench/data/test-cases/615-travel-flights-spirit-airlines/task.json +25 -0
  127. clawbench/data/test-cases/618-travel-train-bus-12go-asia/task.json +25 -0
  128. clawbench/data/test-cases/625-travel-camping-outdoor-parks-canada-reservations/task.json +25 -0
  129. clawbench/data/test-cases/626-travel-bus-flixbus/task.json +25 -0
  130. clawbench/data/test-cases/627-travel-flights-momondo/task.json +25 -0
  131. clawbench/data/test-cases/632-shopping-commerce-beauty-care-olaplex/task.json +25 -0
  132. clawbench/data/test-cases/634-shopping-commerce-apparel-dooney-bourke/task.json +25 -0
  133. clawbench/data/test-cases/635-shopping-commerce-gifts-uncommon-goods/task.json +25 -0
  134. clawbench/data/test-cases/636-shopping-commerce-auto-parts-rockauto/task.json +25 -0
  135. clawbench/data/test-cases/638-shopping-commerce-print-custom-vistaprint/task.json +25 -0
  136. clawbench/data/test-cases/639-shopping-commerce-luxury-mansur-gavriel/task.json +25 -0
  137. clawbench/data/test-cases/671-entertainment-gaming-humble-bundle/task.json +25 -0
  138. clawbench/data/test-cases/672-entertainment-hobbies-anime-streaming-crunchyroll/task.json +25 -0
  139. clawbench/data/test-cases/674-entertainment-hobbies-masterclass-masterclass/task.json +25 -0
  140. clawbench/data/test-cases/676-government-civic-legal-docs-legalnature/task.json +25 -0
  141. clawbench/data/test-cases/685-personal-management-budget-mgmt-everydollar/task.json +25 -0
  142. clawbench/data/test-cases/687-personal-management-vpn-subscription-ipvanish/task.json +25 -0
  143. clawbench/data/test-cases/688-personal-management-insurance-compare-insurify/task.json +25 -0
  144. clawbench/data/test-cases/695-automation-workflows-recurring-order-stumptown-coffee/task.json +25 -0
  145. clawbench/data/test-cases/697-automation-workflows-recurring-order-bean-box/task.json +25 -0
  146. clawbench/data/test-cases/699-automation-workflows-recurring-order-mistobox/task.json +25 -0
  147. clawbench/data/test-cases/700-deletion-revocation-data-deletion-deleteme/task.json +25 -0
  148. clawbench/data/test-cases/705-rating-voting-wine-review-vivino/task.json +25 -0
  149. clawbench/data/test-cases/706-rating-voting-beer-review-beeradvocate/task.json +25 -0
  150. clawbench/data/test-cases/707-rating-voting-social-wine-untappd/task.json +25 -0
  151. clawbench/data/test-cases/708-rating-voting-professor-review-ratemyprofessors/task.json +28 -0
  152. clawbench/data/test-cases/709-rating-voting-service-review-angi/task.json +25 -0
  153. clawbench/data/test-cases/710-creation-init-interior-design-roomsketcher/task.json +25 -0
  154. clawbench/data/test-cases/711-creation-init-color-design-coolors/task.json +25 -0
  155. clawbench/data/test-cases/712-creation-init-website-create-squarespace/task.json +25 -0
  156. clawbench/data/test-cases/713-creation-init-website-build-wix/task.json +25 -0
  157. clawbench/data/test-cases/735-home-services-maintenance-house-cleaning-bark/task.json +25 -0
  158. clawbench/data/test-cases/736-home-services-maintenance-plumbing-ace-hardware/task.json +25 -0
  159. clawbench/data/test-cases/737-home-services-maintenance-kitchen-remodel-lowes/task.json +25 -0
  160. clawbench/data/test-cases/738-home-services-maintenance-equipment-install-amazon-home-services/task.json +25 -0
  161. clawbench/data/test-cases/750-automotive-vehicle-services-car-insurance-compare-kanetix/task.json +25 -0
  162. clawbench/data/test-cases/751-automotive-vehicle-services-car-lease-sixt/task.json +25 -0
  163. clawbench/data/test-cases/754-automotive-vehicle-services-used-car-listing-autotrader/task.json +25 -0
  164. clawbench/data/test-cases/763-automotive-vehicle-services-car-lease-autoslash/task.json +25 -0
  165. clawbench/data/test-cases/766-nonprofit-charity-donation-doctors-without-borders-msf/task.json +25 -0
  166. clawbench/data/test-cases/768-nonprofit-charity-community-crowdfund-ioby/task.json +25 -0
  167. clawbench/data/test-cases/770-nonprofit-charity-volunteer-apply-on-make-a-wish-foundation-website-complete-and-submit-a-volunteer-application-form-selecting-the-wish-granter-role-and-entering-city-phoenix-az/task.json +25 -0
  168. clawbench/data/test-cases/774-nonprofit-charity-nonprofit-job-apply-charity-village/task.json +25 -0
  169. clawbench/data/test-cases/776-nonprofit-charity-volunteer-signup-idealist/task.json +25 -0
  170. clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/extra_info/payment_info.json +3 -0
  171. clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/task.json +30 -0
  172. clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/extra_info/address_info.json +4 -0
  173. clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/task.json +30 -0
  174. clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/extra_info/email_info.json +3 -0
  175. clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/task.json +30 -0
  176. clawbench/data/test-cases/782-beauty-personal-care-skincare-purchase-paulas-choice/task.json +24 -0
  177. clawbench/data/test-cases/783-beauty-personal-care-beauty-booking-ulta-beauty/task.json +24 -0
  178. clawbench/data/test-cases/785-beauty-personal-care-skincare-curology/task.json +25 -0
  179. clawbench/data/test-cases/788-beauty-personal-care-makeup-the-ordinary/task.json +25 -0
  180. clawbench/data/test-cases/789-beauty-personal-care-makeup-fenty-beauty/task.json +25 -0
  181. clawbench/data/test-cases/793-beauty-personal-care-beauty-retail-mac-cosmetics/task.json +25 -0
  182. clawbench/data/test-cases/794-beauty-personal-care-salon-booking-styleseat/task.json +25 -0
  183. clawbench/data/test-cases/795-pet-animal-care-pet-adoption-aspca/task.json +25 -0
  184. clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/extra_info/pet_info.json +12 -0
  185. clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/task.json +30 -0
  186. clawbench/data/test-cases/799-pet-animal-care-pet-insurance-aspca-pet-health-insurance/task.json +25 -0
  187. clawbench/data/test-cases/801-pet-animal-care-pet-friendly-travel-bringfido/task.json +25 -0
  188. clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/extra_info/pet_info.json +12 -0
  189. clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/task.json +30 -0
  190. clawbench/data/test-cases/807-pet-animal-care-pet-dna-embark/task.json +25 -0
  191. clawbench/data/test-cases/809-pet-animal-care-pet-adopt-petfinder/task.json +28 -0
  192. clawbench/data/test-cases/812-pet-animal-care-pet-subscription-ollie/task.json +25 -0
  193. clawbench/data/test-cases/815-personal-management-records-mgmt-myheritage/task.json +25 -0
  194. clawbench/data/test-cases/821-education-learning-reading-self-study-blinkist/task.json +25 -0
  195. clawbench/data/test-cases/861-entertainment-hobbies-movies-cineplex/task.json +25 -0
  196. clawbench/data/test-cases/862-entertainment-hobbies-movies-amc-theatres/task.json +25 -0
  197. clawbench/data/test-cases/864-entertainment-hobbies-show-tickets-ticketmaster/task.json +25 -0
  198. clawbench/data/test-cases/865-travel-outdoor-hipcamp/task.json +25 -0
  199. clawbench/data/test-cases/867-entertainment-hobbies-movies-fandango/task.json +25 -0
  200. clawbench/data/test-cases/872-daily-life-food-opentable/task.json +25 -0
  201. clawbench/data/test-cases/873-daily-life-food-resy/task.json +28 -0
  202. clawbench/data/test-cases/876-entertainment-hobbies-show-tickets-vivid-seats/task.json +25 -0
  203. clawbench/data/test-cases/877-entertainment-hobbies-show-tickets-stubhub/task.json +25 -0
  204. clawbench/data/test-cases/878-travel-outdoor-ontario-parks/task.json +25 -0
  205. clawbench/data/test-cases/883-education-learning-hobby-class-sur-la-table/task.json +25 -0
  206. clawbench/data/test-cases/884-entertainment-hobbies-experience-breakout-games/task.json +25 -0
  207. clawbench/data/test-cases/885-entertainment-hobbies-experience-bowlero/task.json +25 -0
  208. clawbench/data/test-cases/886-entertainment-hobbies-experience-topgolf/task.json +25 -0
  209. clawbench/data/test-cases/lite.json +226 -0
  210. clawbench/data/test-cases/lite.schema.json +105 -0
  211. clawbench/data/test-cases/task.schema.json +132 -0
  212. clawbench/data/tools/build_clawbench_lite_enc.py +161 -0
  213. clawbench/doctor.py +171 -0
  214. clawbench/engine.py +180 -0
  215. clawbench/generate_resume_pdf.py +140 -0
  216. clawbench/hf_upload.py +78 -0
  217. clawbench/image.py +127 -0
  218. clawbench/paths.py +150 -0
  219. clawbench/resume_template.json +104 -0
  220. clawbench/run.py +942 -0
  221. clawbench/tui.py +1401 -0
  222. clawbench_cli-0.1.2.dist-info/METADATA +770 -0
  223. clawbench_cli-0.1.2.dist-info/RECORD +226 -0
  224. clawbench_cli-0.1.2.dist-info/WHEEL +4 -0
  225. clawbench_cli-0.1.2.dist-info/entry_points.txt +4 -0
  226. clawbench_cli-0.1.2.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,44 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "description": "Schema for a single model entry in models/models.yaml. The YAML key is the model name.",
4
+ "type": "object",
5
+ "required": ["base_url", "api_type"],
6
+ "properties": {
7
+ "api_key": {
8
+ "type": "string",
9
+ "description": "API key for the provider"
10
+ },
11
+ "api_keys": {
12
+ "type": "array",
13
+ "items": { "type": "string" },
14
+ "description": "Multiple API keys for round-robin rotation (takes precedence over api_key)"
15
+ },
16
+ "thinking_level": {
17
+ "type": "string",
18
+ "enum": ["off", "minimal", "low", "medium", "high", "xhigh", "adaptive"],
19
+ "description": "Reasoning depth for the model"
20
+ },
21
+ "temperature": {
22
+ "type": "number",
23
+ "minimum": 0,
24
+ "maximum": 2,
25
+ "description": "Sampling temperature"
26
+ },
27
+ "max_tokens": {
28
+ "type": "integer",
29
+ "minimum": 1,
30
+ "description": "Maximum output tokens"
31
+ },
32
+ "base_url": {
33
+ "type": "string",
34
+ "format": "uri",
35
+ "description": "API base URL (e.g. https://api.openai.com/v1)"
36
+ },
37
+ "api_type": {
38
+ "type": "string",
39
+ "enum": ["anthropic-messages", "openai-responses", "openai-completions", "google-generative-ai"],
40
+ "description": "API type for the provider endpoint"
41
+ }
42
+ },
43
+ "additionalProperties": false
44
+ }
@@ -0,0 +1,16 @@
1
+ # Copy to models/models.yaml and fill in your API keys.
2
+ # Each top-level key is the model name (passed as MODEL_NAME to the container).
3
+
4
+ qwen3.5-397b-a17b:
5
+ api_key: "sk-or-v1-..."
6
+ base_url: https://openrouter.ai/api/v1
7
+ api_type: openai-completions
8
+ thinking_level: medium # optional
9
+
10
+ # For multiple API keys (round-robin), use api_keys instead of api_key:
11
+ # some-model:
12
+ # api_keys:
13
+ # - "key1"
14
+ # - "key2"
15
+ # base_url: https://api.openai.com/v1
16
+ # api_type: openai-completions
@@ -0,0 +1,451 @@
1
+ {
2
+ "identity": {
3
+ "legal_name": {
4
+ "first": "Alex",
5
+ "middle": null,
6
+ "last": "Green"
7
+ },
8
+ "preferred_name": "Alex",
9
+ "date_of_birth": "1980-05-01",
10
+ "gender": "Female",
11
+ "pronouns": "she/her",
12
+ "nationality": "Canadian",
13
+ "citizenship": [
14
+ "Canada"
15
+ ],
16
+ "marital_status": "Single",
17
+ "height_cm": 168,
18
+ "eye_color": "Brown",
19
+ "mothers_maiden_name": "Campbell",
20
+ "security_questions": [
21
+ {
22
+ "question": "What was the name of your first pet?",
23
+ "answer": "Buddy"
24
+ },
25
+ {
26
+ "question": "What street did you grow up on?",
27
+ "answer": "Spadina Ave"
28
+ },
29
+ {
30
+ "question": "What is your favorite book?",
31
+ "answer": "Designing Data-Intensive Applications"
32
+ }
33
+ ],
34
+ "languages": [
35
+ {
36
+ "language": "English",
37
+ "proficiency": "Native"
38
+ },
39
+ {
40
+ "language": "French",
41
+ "proficiency": "Intermediate (B1)"
42
+ }
43
+ ]
44
+ },
45
+ "contact": {
46
+ "email": "alex.green.uoft@clawbench.cc"
47
+ },
48
+ "address": {
49
+ "home": {
50
+ "unit": "1208",
51
+ "street": "664 Spadina Ave",
52
+ "city": "Toronto",
53
+ "province": "Ontario",
54
+ "postal_code": "M5S 2H7",
55
+ "country": "Canada",
56
+ "full": "Unit 1208, 664 Spadina Ave, Toronto, ON M5S 2H7, Canada",
57
+ "residency_since": "1980-05",
58
+ "previous_addresses": []
59
+ },
60
+ "work": {
61
+ "street": "200 University Ave, Suite 1700",
62
+ "city": "Toronto",
63
+ "province": "Ontario",
64
+ "postal_code": "M5H 3C6",
65
+ "country": "Canada"
66
+ },
67
+ "mailing_same_as_home": true
68
+ },
69
+ "education": [
70
+ {
71
+ "degree": "Bachelor of Science",
72
+ "field": "Computer Science",
73
+ "university": "University of Toronto",
74
+ "campus": "St. George",
75
+ "start_date": "1998-09",
76
+ "graduation_date": "2002-06",
77
+ "gpa": "3.6/4.0",
78
+ "honors": "Dean's List (2001, 2002)",
79
+ "student_id": "1002345674"
80
+ },
81
+ {
82
+ "degree": "Master of Science",
83
+ "field": "Computer Science",
84
+ "university": "University of Toronto",
85
+ "campus": "St. George",
86
+ "start_date": "2002-09",
87
+ "graduation_date": "2004-06",
88
+ "gpa": "3.8/4.0",
89
+ "thesis": "Efficient Query Processing in Distributed Database Systems",
90
+ "student_id": "1002345674"
91
+ },
92
+ {
93
+ "degree": "Doctor of Philosophy",
94
+ "field": "Computer Science",
95
+ "university": "University of Toronto",
96
+ "campus": "St. George",
97
+ "start_date": "2004-09",
98
+ "graduation_date": "2010-06",
99
+ "dissertation": "Scalable Real-Time Data Pipeline Architectures for High-Throughput Transaction Processing",
100
+ "gpa": "3.9/4.0",
101
+ "supervisor": "Prof. Eldon Marchetti",
102
+ "student_id": "1002345674"
103
+ }
104
+ ],
105
+ "work_experience": [
106
+ {
107
+ "title": "Senior Software Engineer",
108
+ "company": "Pinecrest Technologies Inc.",
109
+ "industry": "Enterprise SaaS",
110
+ "location": "Toronto, ON",
111
+ "start_date": "2019-03",
112
+ "end_date": null,
113
+ "is_current": true,
114
+ "salary": {
115
+ "amount": 145000,
116
+ "currency": "CAD",
117
+ "period": "annual"
118
+ },
119
+ "responsibilities": [
120
+ "Lead backend team of 5 engineers building distributed data pipelines for enterprise SaaS platform",
121
+ "Design and implement RESTful APIs serving 2M+ daily requests with sub-100ms p99 latency",
122
+ "Mentor junior developers and conduct code reviews, improving team velocity by 30%"
123
+ ],
124
+ "supervisor": {
125
+ "name": "Jordan Peters",
126
+ "title": "VP Engineering",
127
+ "email": "jordan.peters@pinecresttech.com"
128
+ }
129
+ },
130
+ {
131
+ "title": "Software Engineer",
132
+ "company": "Crestridge Digital Corp.",
133
+ "industry": "FinTech",
134
+ "location": "Toronto, ON",
135
+ "start_date": "2012-06",
136
+ "end_date": "2019-02",
137
+ "is_current": false,
138
+ "responsibilities": [
139
+ "Developed real-time transaction processing systems handling $50M+ daily volume in FinTech",
140
+ "Built automated testing frameworks reducing QA cycle by 40%",
141
+ "Collaborated with product team on mobile banking features serving 500K+ users"
142
+ ],
143
+ "reason_for_leaving": "Career growth opportunity"
144
+ },
145
+ {
146
+ "title": "Software Developer",
147
+ "company": "Cedarbrook Solutions Ltd.",
148
+ "industry": "IT Consulting",
149
+ "location": "Toronto, ON",
150
+ "start_date": "2002-09",
151
+ "end_date": "2012-05",
152
+ "is_current": false,
153
+ "note": "Part-time (20 hrs/week) during MSc (2002-2004) and PhD (2004-2010); transitioned to full-time after PhD completion in 2010",
154
+ "responsibilities": [
155
+ "Full-stack web development for enterprise clients across multiple industries",
156
+ "Database administration and performance optimization for high-traffic applications",
157
+ "Part-time during graduate studies (2002–2010); full-time from 2010"
158
+ ],
159
+ "reason_for_leaving": "Seeking specialization in FinTech"
160
+ }
161
+ ],
162
+ "skills": {
163
+ "technical": [
164
+ "Python",
165
+ "Java",
166
+ "TypeScript",
167
+ "Go",
168
+ "PostgreSQL",
169
+ "Redis",
170
+ "AWS",
171
+ "Docker",
172
+ "Kubernetes",
173
+ "Terraform",
174
+ "CI/CD",
175
+ "REST API Design",
176
+ "GraphQL",
177
+ "React",
178
+ "Node.js"
179
+ ],
180
+ "certifications": [
181
+ {
182
+ "name": "AWS Solutions Architect – Associate",
183
+ "issuer": "Amazon Web Services",
184
+ "date": "2024-08",
185
+ "expiry": "2027-08",
186
+ "id": "4GHKL8N2PQRS7T9V"
187
+ },
188
+ {
189
+ "name": "Certified Kubernetes Administrator – CKA",
190
+ "issuer": "CNCF",
191
+ "date": "2025-03",
192
+ "expiry": "2027-03",
193
+ "id": "LF-k8s7g4m2n1"
194
+ }
195
+ ],
196
+ "soft_skills": [
197
+ "Team Leadership",
198
+ "Technical Mentoring",
199
+ "Cross-functional Collaboration",
200
+ "Agile/Scrum"
201
+ ]
202
+ },
203
+ "government_ids": {
204
+ "sin": "472-345-678",
205
+ "passport": {
206
+ "number": "JK456789",
207
+ "country": "Canada",
208
+ "issue_date": "2021-05-15",
209
+ "expiry_date": "2031-05-14",
210
+ "place_of_birth": {
211
+ "city": "Toronto",
212
+ "province": "Ontario",
213
+ "country": "Canada"
214
+ },
215
+ "place_of_birth_full": "Toronto, Ontario, Canada"
216
+ },
217
+ "drivers_license": {
218
+ "number": "G4567-89018-05501",
219
+ "province": "Ontario",
220
+ "class": "G",
221
+ "issue_date": "2025-10-01",
222
+ "expiry_date": "2030-10-01"
223
+ },
224
+ "health_card": {
225
+ "province": "Ontario",
226
+ "number": "6789-012-345",
227
+ "version_code": "JG",
228
+ "expiry_date": "2027-01-01"
229
+ }
230
+ },
231
+ "financial": {
232
+ "bank_accounts": [
233
+ {
234
+ "institution": "TD Canada Trust",
235
+ "institution_number": "004",
236
+ "transit_number": "10202",
237
+ "account_number": "6781234",
238
+ "type": "Chequing",
239
+ "is_primary": true
240
+ },
241
+ {
242
+ "institution": "TD Canada Trust",
243
+ "institution_number": "004",
244
+ "transit_number": "10202",
245
+ "account_number": "6785678",
246
+ "type": "Savings",
247
+ "is_primary": false
248
+ }
249
+ ],
250
+ "credit_cards": [
251
+ {
252
+ "issuer": "TD",
253
+ "type": "TD Aeroplan Visa Infinite",
254
+ "number": "4519873424604532",
255
+ "number_formatted": "4519 8734 2460 4532",
256
+ "expiry": "2028-09",
257
+ "expiry_formatted": "09/28",
258
+ "cvv": "847",
259
+ "cardholder_name": "ALEX GREEN",
260
+ "billing_address_same_as_home": true
261
+ },
262
+ {
263
+ "issuer": "CIBC",
264
+ "type": "CIBC Aventura Visa",
265
+ "number": "4732001596738901",
266
+ "number_formatted": "4732 0015 9673 8901",
267
+ "expiry": "2027-04",
268
+ "expiry_formatted": "04/27",
269
+ "cvv": "263",
270
+ "cardholder_name": "ALEX GREEN",
271
+ "billing_address_same_as_home": true
272
+ }
273
+ ],
274
+ "annual_income_cad": 145000,
275
+ "tax_filing_status": "Single",
276
+ "rrsp_contribution_room": 42000,
277
+ "tfsa_contribution_room": 64000,
278
+ "investment_accounts": [
279
+ {
280
+ "platform": "Wealthsimple",
281
+ "account_type": "TFSA",
282
+ "balance_approx": 45000
283
+ },
284
+ {
285
+ "platform": "Wealthsimple",
286
+ "account_type": "Personal (Non-registered)",
287
+ "balance_approx": 22000
288
+ },
289
+ {
290
+ "platform": "Questrade",
291
+ "account_type": "RRSP",
292
+ "balance_approx": 120000
293
+ }
294
+ ],
295
+ "investment_profile": {
296
+ "knowledge_level": "Advanced",
297
+ "investment_objectives": "Long-term growth",
298
+ "risk_tolerance": "Medium-High",
299
+ "time_horizon": "10+ years",
300
+ "employment_status": "Employed",
301
+ "years_investing": 15
302
+ },
303
+ "net_worth_range": "500K-1M",
304
+ "liquid_assets_range": "100K-250K"
305
+ },
306
+ "insurance": {
307
+ "health": {
308
+ "provider": "OHIP (Ontario Health Insurance Plan)",
309
+ "number": "6789-012-345",
310
+ "supplementary": {
311
+ "provider": "Sun Life (via employer)",
312
+ "group_number": "502341",
313
+ "member_id": "12345-A",
314
+ "coverage": [
315
+ "Dental",
316
+ "Vision",
317
+ "Prescription",
318
+ "Paramedical"
319
+ ]
320
+ }
321
+ },
322
+ "auto": {
323
+ "provider": "Aviva Canada",
324
+ "policy_number": "W20247890",
325
+ "coverage_type": "Comprehensive",
326
+ "deductible": 500,
327
+ "expiry": "2026-07-01"
328
+ },
329
+ "home_tenant": {
330
+ "provider": "Square One Insurance",
331
+ "policy_number": "SQ1-T-2025-34567",
332
+ "type": "Tenant",
333
+ "monthly_premium": 45,
334
+ "expiry": "2027-04-01"
335
+ }
336
+ },
337
+ "vehicle": {
338
+ "make": "Honda",
339
+ "model": "Civic",
340
+ "year": 2021,
341
+ "color": "Lunar Silver Metallic",
342
+ "vin": "2HGFC2F6XMH012345",
343
+ "plate": "CQXW 234",
344
+ "province": "Ontario",
345
+ "fuel_type": "Gasoline",
346
+ "odometer_km": 38000
347
+ },
348
+ "pet": {
349
+ "name": "Maple",
350
+ "species": "Dog",
351
+ "breed": "Golden Retriever",
352
+ "age_years": 4,
353
+ "weight_lbs": 65,
354
+ "sex": "Female (spayed)",
355
+ "microchip_id": "985121012345678",
356
+ "vaccinations_up_to_date": true,
357
+ "date_of_birth": "2022-03-15",
358
+ "vet": {
359
+ "name": "Harbord Veterinary Hospital",
360
+ "address": "599 Harbord St, Toronto, ON"
361
+ },
362
+ "dietary_notes": "Grain-free kibble, sensitive stomach"
363
+ },
364
+ "medical": {
365
+ "blood_type": "A+",
366
+ "allergies": [
367
+ "Penicillin",
368
+ "Shellfish"
369
+ ],
370
+ "current_medications": [
371
+ {
372
+ "name": "Levothyroxine",
373
+ "dosage": "50mcg",
374
+ "frequency": "daily",
375
+ "prescriber": "Dr. Linnea Vanderholt",
376
+ "rx_number": "RX-2024-08-33741",
377
+ "din": "02550717"
378
+ }
379
+ ],
380
+ "family_doctor": {
381
+ "name": "Dr. Linnea Vanderholt",
382
+ "clinic": "Harbord Health Centre"
383
+ },
384
+ "pharmacy": {
385
+ "name": "Shoppers Drug Mart",
386
+ "address": "360A Bloor St W, Toronto, ON"
387
+ },
388
+ "emergency_contact": {
389
+ "name": "Emily Green",
390
+ "relationship": "Sister"
391
+ }
392
+ },
393
+ "preferences": {
394
+ "dietary": {
395
+ "restrictions": [
396
+ "Shellfish allergy"
397
+ ],
398
+ "preferences": [
399
+ "Low sodium",
400
+ "Mediterranean-style"
401
+ ],
402
+ "favorite_cuisines": [
403
+ "Japanese",
404
+ "Italian",
405
+ "Thai"
406
+ ]
407
+ },
408
+ "travel": {
409
+ "seat_preference": "Window",
410
+ "meal_preference": "Regular (no shellfish)",
411
+ "hotel_preferences": [
412
+ "Non-smoking",
413
+ "High floor",
414
+ "Quiet room"
415
+ ],
416
+ "loyalty_programs": [
417
+ {
418
+ "program": "Aeroplan",
419
+ "number": "284567890"
420
+ },
421
+ {
422
+ "program": "Marriott Bonvoy",
423
+ "number": "847293156"
424
+ },
425
+ {
426
+ "program": "Airbnb",
427
+ "account_linked_to_google": true
428
+ }
429
+ ],
430
+ "passport_country": "Canada",
431
+ "known_traveller_number": "981234567",
432
+ "nexus_card": "981234567"
433
+ },
434
+ "communication": {
435
+ "preferred_language": "English",
436
+ "timezone": "America/Toronto",
437
+ "preferred_contact_method": "Email",
438
+ "notification_preferences": "Email + push, no phone calls"
439
+ },
440
+ "shopping": {
441
+ "clothing_size": {
442
+ "top": "M",
443
+ "bottom": "8",
444
+ "shoe": "US 8.5"
445
+ },
446
+ "shipping_preference": "Standard (free) when available",
447
+ "amazon_prime": true
448
+ }
449
+ },
450
+ "professional_summary": "Senior Software Engineer with 23+ years of experience in full-stack development, distributed systems, and cloud infrastructure. PhD in Computer Science from the University of Toronto. Currently leading a backend team at Pinecrest Technologies Inc., building enterprise data pipeline solutions. Previously built real-time transaction processing systems in FinTech. AWS and Kubernetes certified."
451
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 1,
5
+ "metaclass": "daily-life",
6
+ "class": "food",
7
+ "description": "On Uber Eats, order delivery: one Pad Thai, deliver to home address, note \"no peanuts\"",
8
+ "sites_involved": [
9
+ "ubereats.com"
10
+ ],
11
+ "platform": "uber-eats",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "On Uber Eats, order delivery: one Pad Thai, deliver to home address, note \"no peanuts\"",
19
+ "eval_schema": {
20
+ "url_pattern": "__PLACEHOLDER_WILL_NOT_MATCH__",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": []
25
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 2,
5
+ "metaclass": "daily-life",
6
+ "class": "food",
7
+ "description": "On DoorDash, order delivery: one Big Mac + two 10 pc McNuggets, select fastest delivery",
8
+ "sites_involved": [
9
+ "doordash.com"
10
+ ],
11
+ "platform": "doordash",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "On DoorDash, order delivery: one Big Mac + two 10 pc McNuggets, select fastest delivery",
19
+ "eval_schema": {
20
+ "url_pattern": "__PLACEHOLDER_WILL_NOT_MATCH__",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": []
25
+ }
@@ -0,0 +1,36 @@
1
+ {
2
+ "items": [
3
+ {
4
+ "name": "Whole milk (1 gallon)",
5
+ "quantity": 1,
6
+ "category": "dairy"
7
+ },
8
+ {
9
+ "name": "Large eggs (dozen)",
10
+ "quantity": 1,
11
+ "category": "dairy"
12
+ },
13
+ {
14
+ "name": "Whole wheat bread",
15
+ "quantity": 1,
16
+ "category": "bakery"
17
+ },
18
+ {
19
+ "name": "Chicken breast (boneless, skinless)",
20
+ "quantity": "2 lbs",
21
+ "category": "meat"
22
+ },
23
+ {
24
+ "name": "Broccoli (fresh)",
25
+ "quantity": "1 bunch",
26
+ "category": "produce"
27
+ },
28
+ {
29
+ "name": "Bananas",
30
+ "quantity": "1 bunch (6-7)",
31
+ "category": "produce"
32
+ }
33
+ ],
34
+ "delivery_preference": "cheapest available window",
35
+ "store_preference": "nearest"
36
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 4,
5
+ "metaclass": "daily-life",
6
+ "class": "food",
7
+ "description": "Place an Instacart order for a week's groceries: milk, eggs, whole wheat bread, chicken breast, broccoli, bananas; select the cheapest delivery window",
8
+ "sites_involved": [
9
+ "instacart.com"
10
+ ],
11
+ "platform": "instacart",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "Place an Instacart order for a week's groceries: milk, eggs, whole wheat bread, chicken breast, broccoli, bananas; select the cheapest delivery window",
19
+ "eval_schema": {
20
+ "url_pattern": "__PLACEHOLDER_WILL_NOT_MATCH__",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": [
25
+ {
26
+ "path": "extra_info/grocery_list.json",
27
+ "description": "Weekly grocery list with items, quantities, and categories"
28
+ }
29
+ ]
30
+ }
@@ -0,0 +1,24 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 6,
5
+ "metaclass": "daily-life",
6
+ "class": "food",
7
+ "description": "On Uber Eats, order a vegan delivery (labeled vegan), deliver to office address",
8
+ "sites_involved": [
9
+ "ubereats.com"
10
+ ],
11
+ "platform": "uber-eats",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "On Uber Eats, order a vegan delivery (labeled vegan), deliver to office address",
19
+ "eval_schema": {
20
+ "url_pattern": "__PLACEHOLDER_WILL_NOT_MATCH__",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30
24
+ }
@@ -0,0 +1,21 @@
1
+ {
2
+ "week_plan": [
3
+ {
4
+ "day": "Monday",
5
+ "breakfast": "Greek yogurt with granola",
6
+ "lunch": "Grilled chicken salad",
7
+ "dinner": "Salmon with steamed broccoli and brown rice"
8
+ },
9
+ {
10
+ "day": "Tuesday",
11
+ "breakfast": "Overnight oats with bananas",
12
+ "lunch": "Turkey wrap with veggies",
13
+ "dinner": "Pasta primavera with garlic bread"
14
+ }
15
+ ],
16
+ "dietary_restrictions": [
17
+ "no shellfish (allergy)",
18
+ "low sodium preferred"
19
+ ],
20
+ "servings_per_meal": 1
21
+ }