clawbench-cli 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (226)
  1. clawbench/__init__.py +35 -0
  2. clawbench/__main__.py +8 -0
  3. clawbench/batch.py +619 -0
  4. clawbench/cli.py +397 -0
  5. clawbench/data/chrome-extension/README.md +127 -0
  6. clawbench/data/chrome-extension/background.js +50 -0
  7. clawbench/data/chrome-extension/content.js +70 -0
  8. clawbench/data/chrome-extension/manifest.json +25 -0
  9. clawbench/data/chrome-extension/setup.sh +27 -0
  10. clawbench/data/chrome-extension/stealth.js +200 -0
  11. clawbench/data/docker/Dockerfile +51 -0
  12. clawbench/data/docker/entrypoint.sh +394 -0
  13. clawbench/data/docker/setup-openclaw.sh +112 -0
  14. clawbench/data/eval/README.md +95 -0
  15. clawbench/data/eval/agentic_eval.md +53 -0
  16. clawbench/data/extension-server/.python-version +1 -0
  17. clawbench/data/extension-server/README.md +54 -0
  18. clawbench/data/extension-server/pyproject.toml +7 -0
  19. clawbench/data/extension-server/server.py +360 -0
  20. clawbench/data/extension-server/uv.lock +644 -0
  21. clawbench/data/models/model.schema.json +44 -0
  22. clawbench/data/models/models.example.yaml +16 -0
  23. clawbench/data/shared/alex_green_personal_info.json +451 -0
  24. clawbench/data/test-cases/001-daily-life-food-uber-eats/task.json +25 -0
  25. clawbench/data/test-cases/002-daily-life-food-doordash/task.json +25 -0
  26. clawbench/data/test-cases/004-daily-life-food-instacart/extra_info/grocery_list.json +36 -0
  27. clawbench/data/test-cases/004-daily-life-food-instacart/task.json +30 -0
  28. clawbench/data/test-cases/006-daily-life-food-uber-eats/task.json +24 -0
  29. clawbench/data/test-cases/007-daily-life-food-instacart/extra_info/meal_plan.json +21 -0
  30. clawbench/data/test-cases/007-daily-life-food-instacart/task.json +30 -0
  31. clawbench/data/test-cases/011-daily-life-housing-zillow/task.json +25 -0
  32. clawbench/data/test-cases/015-daily-life-housing-craigslist/extra_info/listing_details.json +26 -0
  33. clawbench/data/test-cases/015-daily-life-housing-craigslist/task.json +30 -0
  34. clawbench/data/test-cases/035-daily-life-health-medical-betterhelp/task.json +25 -0
  35. clawbench/data/test-cases/041-daily-life-pets-rover/task.json +25 -0
  36. clawbench/data/test-cases/043-daily-life-pets-rover/extra_info/pet_info.json +12 -0
  37. clawbench/data/test-cases/043-daily-life-pets-rover/task.json +30 -0
  38. clawbench/data/test-cases/045-daily-life-personal-care-booksy/task.json +25 -0
  39. clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/extra_info/address_info.json +7 -0
  40. clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/task.json +30 -0
  41. clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/extra_info/job_links.json +5 -0
  42. clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/task.json +30 -0
  43. clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/extra_info/job_links.json +5 -0
  44. clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/task.json +30 -0
  45. clawbench/data/test-cases/091-job-search-hr-job-apply-indeed/task.json +25 -0
  46. clawbench/data/test-cases/120-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  47. clawbench/data/test-cases/121-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  48. clawbench/data/test-cases/128-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  49. clawbench/data/test-cases/134-office-secretary-tasks-calendar-calendly/task.json +25 -0
  50. clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/extra_info/meeting_details.json +30 -0
  51. clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/task.json +30 -0
  52. clawbench/data/test-cases/139-office-secretary-tasks-calendar-calendly/task.json +25 -0
  53. clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/extra_info/task_list.json +29 -0
  54. clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/task.json +30 -0
  55. clawbench/data/test-cases/179-dev-tech-github-ops-github/extra_info/config.json +13 -0
  56. clawbench/data/test-cases/179-dev-tech-github-ops-github/task.json +30 -0
  57. clawbench/data/test-cases/180-dev-tech-github-ops-github/task.json +25 -0
  58. clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/extra_info/raw_results.json +47 -0
  59. clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/task.json +30 -0
  60. clawbench/data/test-cases/242-academia-research-research-tools-overleaf/task.json +25 -0
  61. clawbench/data/test-cases/246-academia-research-research-tools-zotero/task.json +25 -0
  62. clawbench/data/test-cases/247-academia-research-research-tools-semantic-scholar/task.json +25 -0
  63. clawbench/data/test-cases/265-education-learning-general-coursera/task.json +25 -0
  64. clawbench/data/test-cases/266-education-learning-general-leetcode/extra_info/solution_code.py +9 -0
  65. clawbench/data/test-cases/266-education-learning-general-leetcode/task.json +30 -0
  66. clawbench/data/test-cases/273-education-learning-general-edx/task.json +25 -0
  67. clawbench/data/test-cases/274-education-learning-general-udemy/task.json +25 -0
  68. clawbench/data/test-cases/279-travel-general-airbnb/task.json +25 -0
  69. clawbench/data/test-cases/280-travel-general-booking-com/task.json +25 -0
  70. clawbench/data/test-cases/363-entertainment-hobbies-general-ticketmaster/task.json +25 -0
  71. clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/extra_info/book_list.json +14 -0
  72. clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/task.json +30 -0
  73. clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/extra_info/event_details.json +10 -0
  74. clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/task.json +30 -0
  75. clawbench/data/test-cases/403-personal-management-account-security-1password-web/extra_info/credentials.json +34 -0
  76. clawbench/data/test-cases/403-personal-management-account-security-1password-web/task.json +30 -0
  77. clawbench/data/test-cases/413-personal-management-personal-tools-todoist/extra_info/task_list.json +52 -0
  78. clawbench/data/test-cases/413-personal-management-personal-tools-todoist/task.json +30 -0
  79. clawbench/data/test-cases/468-rating-voting-general-glassdoor/extra_info/interview_experience.json +10 -0
  80. clawbench/data/test-cases/468-rating-voting-general-glassdoor/task.json +30 -0
  81. clawbench/data/test-cases/469-rating-voting-general-tripadvisor/extra_info/review_content.json +6 -0
  82. clawbench/data/test-cases/469-rating-voting-general-tripadvisor/task.json +30 -0
  83. clawbench/data/test-cases/470-rating-voting-general-trustpilot/extra_info/review_content.json +6 -0
  84. clawbench/data/test-cases/470-rating-voting-general-trustpilot/task.json +30 -0
  85. clawbench/data/test-cases/474-rating-voting-general-capterra/task.json +25 -0
  86. clawbench/data/test-cases/475-rating-voting-general-g2/task.json +25 -0
  87. clawbench/data/test-cases/482-creation-init-general-confluence/extra_info/content.json +3 -0
  88. clawbench/data/test-cases/482-creation-init-general-confluence/task.json +30 -0
  89. clawbench/data/test-cases/483-creation-init-general-airtable/task.json +25 -0
  90. clawbench/data/test-cases/484-creation-init-general-clickup/task.json +28 -0
  91. clawbench/data/test-cases/485-creation-init-general-webflow/task.json +25 -0
  92. clawbench/data/test-cases/486-creation-init-general-mailchimp/extra_info/content.json +3 -0
  93. clawbench/data/test-cases/486-creation-init-general-mailchimp/task.json +30 -0
  94. clawbench/data/test-cases/487-creation-init-general-typeform/extra_info/survey_questions.json +85 -0
  95. clawbench/data/test-cases/487-creation-init-general-typeform/task.json +30 -0
  96. clawbench/data/test-cases/488-creation-init-general-substack/extra_info/content.json +3 -0
  97. clawbench/data/test-cases/488-creation-init-general-substack/task.json +30 -0
  98. clawbench/data/test-cases/489-creation-init-general-ghost/extra_info/content.json +3 -0
  99. clawbench/data/test-cases/489-creation-init-general-ghost/task.json +30 -0
  100. clawbench/data/test-cases/501-creation-init-general-asana/extra_info/project_description.json +8 -0
  101. clawbench/data/test-cases/501-creation-init-general-asana/task.json +33 -0
  102. clawbench/data/test-cases/529-daily-life-shopping-delivery-king-arthur-baking/task.json +25 -0
  103. clawbench/data/test-cases/533-daily-life-utilities-inmyarea/task.json +25 -0
  104. clawbench/data/test-cases/535-daily-life-home-home-depot/task.json +25 -0
  105. clawbench/data/test-cases/537-daily-life-food-crumbl/task.json +25 -0
  106. clawbench/data/test-cases/539-daily-life-health-jefit/task.json +25 -0
  107. clawbench/data/test-cases/542-daily-life-pets-wag/task.json +25 -0
  108. clawbench/data/test-cases/551-finance-investment-crypto-wallet-trezor/task.json +25 -0
  109. clawbench/data/test-cases/552-finance-investment-business-payment-plooto/task.json +25 -0
  110. clawbench/data/test-cases/555-finance-investment-insurance-insureon/task.json +25 -0
  111. clawbench/data/test-cases/559-finance-investment-crowdfunding-frontfundr/task.json +25 -0
  112. clawbench/data/test-cases/564-daily-life-event-registration-race-roster/task.json +25 -0
  113. clawbench/data/test-cases/565-job-search-hr-job-search-jopwell/task.json +25 -0
  114. clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/extra_info/listing_details.json +26 -0
  115. clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/task.json +30 -0
  116. clawbench/data/test-cases/569-job-search-hr-job-search-careerbuilder/task.json +25 -0
  117. clawbench/data/test-cases/570-job-search-hr-job-search-hired/task.json +25 -0
  118. clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/extra_info/listing_details.json +26 -0
  119. clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/task.json +30 -0
  120. clawbench/data/test-cases/576-office-secretary-tasks-reports-ftc-reportfraud/task.json +25 -0
  121. clawbench/data/test-cases/583-office-secretary-tasks-support-tickets-freshdesk/task.json +25 -0
  122. clawbench/data/test-cases/598-academia-research-legal-docs-formswift/task.json +25 -0
  123. clawbench/data/test-cases/606-education-learning-kids-courses-outschool/task.json +25 -0
  124. clawbench/data/test-cases/607-education-learning-art-courses-creativebug/task.json +25 -0
  125. clawbench/data/test-cases/609-education-learning-meditation-spirit-rock-meditation-center/task.json +25 -0
  126. clawbench/data/test-cases/615-travel-flights-spirit-airlines/task.json +25 -0
  127. clawbench/data/test-cases/618-travel-train-bus-12go-asia/task.json +25 -0
  128. clawbench/data/test-cases/625-travel-camping-outdoor-parks-canada-reservations/task.json +25 -0
  129. clawbench/data/test-cases/626-travel-bus-flixbus/task.json +25 -0
  130. clawbench/data/test-cases/627-travel-flights-momondo/task.json +25 -0
  131. clawbench/data/test-cases/632-shopping-commerce-beauty-care-olaplex/task.json +25 -0
  132. clawbench/data/test-cases/634-shopping-commerce-apparel-dooney-bourke/task.json +25 -0
  133. clawbench/data/test-cases/635-shopping-commerce-gifts-uncommon-goods/task.json +25 -0
  134. clawbench/data/test-cases/636-shopping-commerce-auto-parts-rockauto/task.json +25 -0
  135. clawbench/data/test-cases/638-shopping-commerce-print-custom-vistaprint/task.json +25 -0
  136. clawbench/data/test-cases/639-shopping-commerce-luxury-mansur-gavriel/task.json +25 -0
  137. clawbench/data/test-cases/671-entertainment-gaming-humble-bundle/task.json +25 -0
  138. clawbench/data/test-cases/672-entertainment-hobbies-anime-streaming-crunchyroll/task.json +25 -0
  139. clawbench/data/test-cases/674-entertainment-hobbies-masterclass-masterclass/task.json +25 -0
  140. clawbench/data/test-cases/676-government-civic-legal-docs-legalnature/task.json +25 -0
  141. clawbench/data/test-cases/685-personal-management-budget-mgmt-everydollar/task.json +25 -0
  142. clawbench/data/test-cases/687-personal-management-vpn-subscription-ipvanish/task.json +25 -0
  143. clawbench/data/test-cases/688-personal-management-insurance-compare-insurify/task.json +25 -0
  144. clawbench/data/test-cases/695-automation-workflows-recurring-order-stumptown-coffee/task.json +25 -0
  145. clawbench/data/test-cases/697-automation-workflows-recurring-order-bean-box/task.json +25 -0
  146. clawbench/data/test-cases/699-automation-workflows-recurring-order-mistobox/task.json +25 -0
  147. clawbench/data/test-cases/700-deletion-revocation-data-deletion-deleteme/task.json +25 -0
  148. clawbench/data/test-cases/705-rating-voting-wine-review-vivino/task.json +25 -0
  149. clawbench/data/test-cases/706-rating-voting-beer-review-beeradvocate/task.json +25 -0
  150. clawbench/data/test-cases/707-rating-voting-social-wine-untappd/task.json +25 -0
  151. clawbench/data/test-cases/708-rating-voting-professor-review-ratemyprofessors/task.json +28 -0
  152. clawbench/data/test-cases/709-rating-voting-service-review-angi/task.json +25 -0
  153. clawbench/data/test-cases/710-creation-init-interior-design-roomsketcher/task.json +25 -0
  154. clawbench/data/test-cases/711-creation-init-color-design-coolors/task.json +25 -0
  155. clawbench/data/test-cases/712-creation-init-website-create-squarespace/task.json +25 -0
  156. clawbench/data/test-cases/713-creation-init-website-build-wix/task.json +25 -0
  157. clawbench/data/test-cases/735-home-services-maintenance-house-cleaning-bark/task.json +25 -0
  158. clawbench/data/test-cases/736-home-services-maintenance-plumbing-ace-hardware/task.json +25 -0
  159. clawbench/data/test-cases/737-home-services-maintenance-kitchen-remodel-lowes/task.json +25 -0
  160. clawbench/data/test-cases/738-home-services-maintenance-equipment-install-amazon-home-services/task.json +25 -0
  161. clawbench/data/test-cases/750-automotive-vehicle-services-car-insurance-compare-kanetix/task.json +25 -0
  162. clawbench/data/test-cases/751-automotive-vehicle-services-car-lease-sixt/task.json +25 -0
  163. clawbench/data/test-cases/754-automotive-vehicle-services-used-car-listing-autotrader/task.json +25 -0
  164. clawbench/data/test-cases/763-automotive-vehicle-services-car-lease-autoslash/task.json +25 -0
  165. clawbench/data/test-cases/766-nonprofit-charity-donation-doctors-without-borders-msf/task.json +25 -0
  166. clawbench/data/test-cases/768-nonprofit-charity-community-crowdfund-ioby/task.json +25 -0
  167. clawbench/data/test-cases/770-nonprofit-charity-volunteer-apply-on-make-a-wish-foundation-website-complete-and-submit-a-volunteer-application-form-selecting-the-wish-granter-role-and-entering-city-phoenix-az/task.json +25 -0
  168. clawbench/data/test-cases/774-nonprofit-charity-nonprofit-job-apply-charity-village/task.json +25 -0
  169. clawbench/data/test-cases/776-nonprofit-charity-volunteer-signup-idealist/task.json +25 -0
  170. clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/extra_info/payment_info.json +3 -0
  171. clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/task.json +30 -0
  172. clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/extra_info/address_info.json +4 -0
  173. clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/task.json +30 -0
  174. clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/extra_info/email_info.json +3 -0
  175. clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/task.json +30 -0
  176. clawbench/data/test-cases/782-beauty-personal-care-skincare-purchase-paulas-choice/task.json +24 -0
  177. clawbench/data/test-cases/783-beauty-personal-care-beauty-booking-ulta-beauty/task.json +24 -0
  178. clawbench/data/test-cases/785-beauty-personal-care-skincare-curology/task.json +25 -0
  179. clawbench/data/test-cases/788-beauty-personal-care-makeup-the-ordinary/task.json +25 -0
  180. clawbench/data/test-cases/789-beauty-personal-care-makeup-fenty-beauty/task.json +25 -0
  181. clawbench/data/test-cases/793-beauty-personal-care-beauty-retail-mac-cosmetics/task.json +25 -0
  182. clawbench/data/test-cases/794-beauty-personal-care-salon-booking-styleseat/task.json +25 -0
  183. clawbench/data/test-cases/795-pet-animal-care-pet-adoption-aspca/task.json +25 -0
  184. clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/extra_info/pet_info.json +12 -0
  185. clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/task.json +30 -0
  186. clawbench/data/test-cases/799-pet-animal-care-pet-insurance-aspca-pet-health-insurance/task.json +25 -0
  187. clawbench/data/test-cases/801-pet-animal-care-pet-friendly-travel-bringfido/task.json +25 -0
  188. clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/extra_info/pet_info.json +12 -0
  189. clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/task.json +30 -0
  190. clawbench/data/test-cases/807-pet-animal-care-pet-dna-embark/task.json +25 -0
  191. clawbench/data/test-cases/809-pet-animal-care-pet-adopt-petfinder/task.json +28 -0
  192. clawbench/data/test-cases/812-pet-animal-care-pet-subscription-ollie/task.json +25 -0
  193. clawbench/data/test-cases/815-personal-management-records-mgmt-myheritage/task.json +25 -0
  194. clawbench/data/test-cases/821-education-learning-reading-self-study-blinkist/task.json +25 -0
  195. clawbench/data/test-cases/861-entertainment-hobbies-movies-cineplex/task.json +25 -0
  196. clawbench/data/test-cases/862-entertainment-hobbies-movies-amc-theatres/task.json +25 -0
  197. clawbench/data/test-cases/864-entertainment-hobbies-show-tickets-ticketmaster/task.json +25 -0
  198. clawbench/data/test-cases/865-travel-outdoor-hipcamp/task.json +25 -0
  199. clawbench/data/test-cases/867-entertainment-hobbies-movies-fandango/task.json +25 -0
  200. clawbench/data/test-cases/872-daily-life-food-opentable/task.json +25 -0
  201. clawbench/data/test-cases/873-daily-life-food-resy/task.json +28 -0
  202. clawbench/data/test-cases/876-entertainment-hobbies-show-tickets-vivid-seats/task.json +25 -0
  203. clawbench/data/test-cases/877-entertainment-hobbies-show-tickets-stubhub/task.json +25 -0
  204. clawbench/data/test-cases/878-travel-outdoor-ontario-parks/task.json +25 -0
  205. clawbench/data/test-cases/883-education-learning-hobby-class-sur-la-table/task.json +25 -0
  206. clawbench/data/test-cases/884-entertainment-hobbies-experience-breakout-games/task.json +25 -0
  207. clawbench/data/test-cases/885-entertainment-hobbies-experience-bowlero/task.json +25 -0
  208. clawbench/data/test-cases/886-entertainment-hobbies-experience-topgolf/task.json +25 -0
  209. clawbench/data/test-cases/lite.json +226 -0
  210. clawbench/data/test-cases/lite.schema.json +105 -0
  211. clawbench/data/test-cases/task.schema.json +132 -0
  212. clawbench/data/tools/build_clawbench_lite_enc.py +161 -0
  213. clawbench/doctor.py +171 -0
  214. clawbench/engine.py +180 -0
  215. clawbench/generate_resume_pdf.py +140 -0
  216. clawbench/hf_upload.py +78 -0
  217. clawbench/image.py +127 -0
  218. clawbench/paths.py +150 -0
  219. clawbench/resume_template.json +104 -0
  220. clawbench/run.py +942 -0
  221. clawbench/tui.py +1401 -0
  222. clawbench_cli-0.1.2.dist-info/METADATA +770 -0
  223. clawbench_cli-0.1.2.dist-info/RECORD +226 -0
  224. clawbench_cli-0.1.2.dist-info/WHEEL +4 -0
  225. clawbench_cli-0.1.2.dist-info/entry_points.txt +4 -0
  226. clawbench_cli-0.1.2.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,30 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 413,
5
+ "metaclass": "personal-management",
6
+ "class": "personal-tools",
7
+ "description": "Create project \"Q2 Engineering Goals\" with 8 tasks, due dates, and priorities in Todoist",
8
+ "sites_involved": [
9
+ "todoist.com"
10
+ ],
11
+ "platform": "todoist",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "Create project \"Q2 Engineering Goals\" with 8 tasks, due dates, and priorities in Todoist",
19
+ "eval_schema": {
20
+ "url_pattern": "app\\.todoist\\.com/api/v1/sync",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": [
25
+ {
26
+ "path": "extra_info/task_list.json",
27
+ "description": "Task list with names, assignees, due dates, and priorities"
28
+ }
29
+ ]
30
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "company": "TechCorp Inc.",
3
+ "position": "Senior Software Engineer",
4
+ "date": "2026-02-15",
5
+ "interview_type": "On-site (4 rounds)",
6
+ "difficulty": "Medium-Hard",
7
+ "experience": "Positive overall. The team was friendly and the questions were relevant to the role. Had a system design round focusing on distributed caching, two coding rounds (one on graph algorithms, one on API design), and a behavioral round.",
8
+ "offer_received": true,
9
+ "would_recommend": true
10
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 468,
5
+ "metaclass": "rating-voting",
6
+ "class": "general",
7
+ "description": "Write an interview review on Glassdoor",
8
+ "sites_involved": [
9
+ "glassdoor.com"
10
+ ],
11
+ "platform": "glassdoor",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "Write an interview review on Glassdoor",
19
+ "eval_schema": {
20
+ "url_pattern": "glassdoor\\.com/(graph|api|a/interview)",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": [
25
+ {
26
+ "path": "extra_info/interview_experience.json",
27
+ "description": "Interview experience details for review submission"
28
+ }
29
+ ]
30
+ }
@@ -0,0 +1,6 @@
1
+ {
2
+ "review_text": "Excellent service and product quality. The user experience is intuitive and well-designed. Would recommend to colleagues.",
3
+ "rating": 4,
4
+ "pros": "Easy to use, great customer support, reliable service",
5
+ "cons": "Pricing could be more competitive for small teams"
6
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 469,
5
+ "metaclass": "rating-voting",
6
+ "class": "general",
7
+ "description": "Write a 4-star review for a tour on TripAdvisor",
8
+ "sites_involved": [
9
+ "tripadvisor.com"
10
+ ],
11
+ "platform": "tripadvisor",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "Write a 4-star review for a tour on TripAdvisor",
19
+ "eval_schema": {
20
+ "url_pattern": "tripadvisor\\.com/data/graphql/ids",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": [
25
+ {
26
+ "path": "extra_info/review_content.json",
27
+ "description": "Review content for submission"
28
+ }
29
+ ]
30
+ }
@@ -0,0 +1,6 @@
1
+ {
2
+ "review_text": "Excellent service and product quality. The user experience is intuitive and well-designed. Would recommend to colleagues.",
3
+ "rating": 5,
4
+ "pros": "Easy to use, great customer support, reliable service",
5
+ "cons": "Pricing could be more competitive for small teams"
6
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 470,
5
+ "metaclass": "rating-voting",
6
+ "class": "general",
7
+ "description": "Write a 5-star review for Wealthsimple on Trustpilot",
8
+ "sites_involved": [
9
+ "trustpilot.com"
10
+ ],
11
+ "platform": "trustpilot",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "Write a 5-star review for Wealthsimple on Trustpilot",
19
+ "eval_schema": {
20
+ "url_pattern": "trustpilot\\.com/api/consumer/review",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": [
25
+ {
26
+ "path": "extra_info/review_content.json",
27
+ "description": "Review content for submission"
28
+ }
29
+ ]
30
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 474,
5
+ "metaclass": "rating-voting",
6
+ "class": "general",
7
+ "description": "Write a review for Notion on Capterra with overall rating 4 stars and exactly this text: \"Flexible workspace, steep learning curve.\"",
8
+ "sites_involved": [
9
+ "capterra.com"
10
+ ],
11
+ "platform": "capterra",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "Write a review for Notion on Capterra with overall rating 4 stars and exactly this text: \"Flexible workspace, steep learning curve.\"",
19
+ "eval_schema": {
20
+ "url_pattern": "reviews\\.capterra\\.com/api/v1/products/[^/]+/partial-review",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": []
25
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 475,
5
+ "metaclass": "rating-voting",
6
+ "class": "general",
7
+ "description": "Write a 4-star review for Jira on G2 with exactly this text: \"Powerful tracking, complex UI.\"",
8
+ "sites_involved": [
9
+ "g2.com"
10
+ ],
11
+ "platform": "g2",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "Write a 4-star review for Jira on G2 with exactly this text: \"Powerful tracking, complex UI.\"",
19
+ "eval_schema": {
20
+ "url_pattern": "g2\\.com/survey_responses/.+/autosave_answers\\.json",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": []
25
+ }
@@ -0,0 +1,3 @@
1
+ {
2
+ "content": "Sharing our latest research on scaling language models \u2014 key findings on training dynamics, emergent capabilities, and practical deployment considerations. #MachineLearning #AI #Research"
3
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 482,
5
+ "metaclass": "creation-init",
6
+ "class": "general",
7
+ "description": "Create a Confluence space \"Research Wiki\" with a home page and 3 sub-pages",
8
+ "sites_involved": [
9
+ "atlassian.com/software/confluence"
10
+ ],
11
+ "platform": "confluence",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "Create a Confluence space \"Research Wiki\" with a home page and 3 sub-pages",
19
+ "eval_schema": {
20
+ "url_pattern": "atlassian\\.net/wiki/rest/create-dialog/1\\.0/space-blueprint/create-space",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": [
25
+ {
26
+ "path": "extra_info/content.json",
27
+ "description": "Content and links for the task"
28
+ }
29
+ ]
30
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 483,
5
+ "metaclass": "creation-init",
6
+ "class": "general",
7
+ "description": "Create an Airtable base \"Conference Tracker\" with schema",
8
+ "sites_involved": [
9
+ "airtable.com"
10
+ ],
11
+ "platform": "airtable",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "Create an Airtable base \"Conference Tracker\" with schema",
19
+ "eval_schema": {
20
+ "url_pattern": "airtable\\.com/v0\\.3/application/[^/]+/create",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": []
25
+ }
@@ -0,0 +1,28 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 484,
5
+ "metaclass": "creation-init",
6
+ "class": "general",
7
+ "description": "On ClickUp, create a new workspace, then create a project named \"Benchmark Development\" with at least one custom field added to the project.",
8
+ "sites_involved": [
9
+ "clickup.com"
10
+ ],
11
+ "platform": "clickup",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "On ClickUp, create a new workspace, then create a project named \"Benchmark Development\" with at least one custom field added to the project.",
19
+ "eval_schema": {
20
+ "url_pattern": "clickup\\.com/hierarchy/v1/project",
21
+ "method": "POST",
22
+ "body": {
23
+ "name": "Benchmark Development"
24
+ }
25
+ },
26
+ "time_limit": 30,
27
+ "extra_info": []
28
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 485,
5
+ "metaclass": "creation-init",
6
+ "class": "general",
7
+ "description": "Create a portfolio website project on Webflow",
8
+ "sites_involved": [
9
+ "webflow.com"
10
+ ],
11
+ "platform": "webflow",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "Create a portfolio website project on Webflow",
19
+ "eval_schema": {
20
+ "url_pattern": "webflow\\.com/api/workspaces/[^/]+/sites",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": []
25
+ }
@@ -0,0 +1,3 @@
1
+ {
2
+ "content": "Sharing our latest research on scaling language models \u2014 key findings on training dynamics, emergent capabilities, and practical deployment considerations. #MachineLearning #AI #Research"
3
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 486,
5
+ "metaclass": "creation-init",
6
+ "class": "general",
7
+ "description": "Create a Mailchimp email campaign with audience set to research newsletter",
8
+ "sites_involved": [
9
+ "mailchimp.com"
10
+ ],
11
+ "platform": "mailchimp",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "Create a Mailchimp email campaign with audience set to research newsletter",
19
+ "eval_schema": {
20
+ "url_pattern": "admin\\.mailchimp\\.com/campaigns/(wizard|send|schedule)",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": [
25
+ {
26
+ "path": "extra_info/content.json",
27
+ "description": "Content and links for the task"
28
+ }
29
+ ]
30
+ }
@@ -0,0 +1,85 @@
1
+ {
2
+ "title": "User Research Survey: Developer Productivity",
3
+ "questions": [
4
+ {
5
+ "id": 1,
6
+ "type": "multiple_choice",
7
+ "text": "What is your primary programming language?",
8
+ "options": [
9
+ "Python",
10
+ "JavaScript/TypeScript",
11
+ "Go",
12
+ "Java",
13
+ "Rust",
14
+ "Other"
15
+ ]
16
+ },
17
+ {
18
+ "id": 2,
19
+ "type": "rating",
20
+ "text": "How satisfied are you with your current IDE?",
21
+ "scale": "1-5"
22
+ },
23
+ {
24
+ "id": 3,
25
+ "type": "short_text",
26
+ "text": "What is the biggest bottleneck in your daily workflow?"
27
+ },
28
+ {
29
+ "id": 4,
30
+ "type": "multiple_choice",
31
+ "text": "How many hours per week do you spend on code reviews?",
32
+ "options": [
33
+ "0-2",
34
+ "3-5",
35
+ "6-10",
36
+ "10+"
37
+ ]
38
+ },
39
+ {
40
+ "id": 5,
41
+ "type": "yes_no",
42
+ "text": "Do you use AI-assisted coding tools?"
43
+ },
44
+ {
45
+ "id": 6,
46
+ "type": "long_text",
47
+ "text": "Describe your ideal development environment."
48
+ },
49
+ {
50
+ "id": 7,
51
+ "type": "multiple_choice",
52
+ "text": "What deployment method do you use most?",
53
+ "options": [
54
+ "Docker/K8s",
55
+ "Serverless",
56
+ "VMs",
57
+ "PaaS",
58
+ "Other"
59
+ ]
60
+ },
61
+ {
62
+ "id": 8,
63
+ "type": "rating",
64
+ "text": "How would you rate your team's CI/CD pipeline?",
65
+ "scale": "1-5"
66
+ },
67
+ {
68
+ "id": 9,
69
+ "type": "short_text",
70
+ "text": "What tool do you wish existed?"
71
+ },
72
+ {
73
+ "id": 10,
74
+ "type": "multiple_choice",
75
+ "text": "How do you prefer to learn new technologies?",
76
+ "options": [
77
+ "Documentation",
78
+ "Video tutorials",
79
+ "Hands-on projects",
80
+ "Courses",
81
+ "Peer learning"
82
+ ]
83
+ }
84
+ ]
85
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 487,
5
+ "metaclass": "creation-init",
6
+ "class": "general",
7
+ "description": "Create a Typeform user research survey with 10 questions",
8
+ "sites_involved": [
9
+ "typeform.com"
10
+ ],
11
+ "platform": "typeform",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "Create a Typeform user research survey with 10 questions",
19
+ "eval_schema": {
20
+ "url_pattern": "admin\\.typeform\\.com/bff/bob-the-builder/forms/",
21
+ "method": "PUT"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": [
25
+ {
26
+ "path": "extra_info/survey_questions.json",
27
+ "description": "Survey questions for user research with 10 questions of various types"
28
+ }
29
+ ]
30
+ }
@@ -0,0 +1,3 @@
1
+ {
2
+ "content": "Sharing our latest research on scaling language models \u2014 key findings on training dynamics, emergent capabilities, and practical deployment considerations. #MachineLearning #AI #Research"
3
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 488,
5
+ "metaclass": "creation-init",
6
+ "class": "general",
7
+ "description": "Create a Substack newsletter \"ML Research Roundup\" and publish the first issue",
8
+ "sites_involved": [
9
+ "substack.com"
10
+ ],
11
+ "platform": "substack",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "Create a Substack newsletter \"ML Research Roundup\" and publish the first issue",
19
+ "eval_schema": {
20
+ "url_pattern": "substack\\.com/api/v1/drafts/\\d+/publish",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": [
25
+ {
26
+ "path": "extra_info/content.json",
27
+ "description": "Content and links for the task"
28
+ }
29
+ ]
30
+ }
@@ -0,0 +1,3 @@
1
+ {
2
+ "content": "Sharing our latest research on scaling language models \u2014 key findings on training dynamics, emergent capabilities, and practical deployment considerations. #MachineLearning #AI #Research"
3
+ }
@@ -0,0 +1,30 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 489,
5
+ "metaclass": "creation-init",
6
+ "class": "general",
7
+ "description": "Set up a Ghost blog and publish a \"Hello World\" post",
8
+ "sites_involved": [
9
+ "ghost.org"
10
+ ],
11
+ "platform": "ghost",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "Set up a Ghost blog and publish a \"Hello World\" post",
19
+ "eval_schema": {
20
+ "url_pattern": "ghost\\.io/ghost/api/(admin|content)/posts",
21
+ "method": "PUT"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": [
25
+ {
26
+ "path": "extra_info/content.json",
27
+ "description": "Content and links for the task"
28
+ }
29
+ ]
30
+ }
@@ -0,0 +1,8 @@
1
+ {
2
+ "project_type": "bathroom renovation",
3
+ "description": "Full bathroom renovation for a standard-size condo bathroom (approx. 5x8 ft). Looking to replace the bathtub with a walk-in shower, update the vanity and mirror, install new tile flooring, and add a heated towel rack. Prefer modern minimalist style with neutral tones.",
4
+ "budget_range": "$8,000 - $15,000 CAD",
5
+ "timeline": "4-6 weeks",
6
+ "location": "Toronto, ON (condo building)",
7
+ "special_requirements": "Must comply with condo renovation rules. Need quiet hours (no work before 9am or after 5pm)."
8
+ }
@@ -0,0 +1,33 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 501,
5
+ "metaclass": "creation-init",
6
+ "class": "general",
7
+ "description": "Create an Asana portfolio \"Research Projects 2026\" with 3 sub-projects",
8
+ "sites_involved": [
9
+ "app.asana.com"
10
+ ],
11
+ "platform": "asana",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "Create an Asana portfolio \"Research Projects 2026\" with 3 sub-projects",
19
+ "eval_schema": {
20
+ "url_pattern": "app\\.asana\\.com/app/asana/-/report_execution_context_activity",
21
+ "method": "POST",
22
+ "body": {
23
+ "context_identifier": "PotGridRoot"
24
+ }
25
+ },
26
+ "time_limit": 30,
27
+ "extra_info": [
28
+ {
29
+ "path": "extra_info/project_description.json",
30
+ "description": "Project description with scope, budget, and requirements"
31
+ }
32
+ ]
33
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 529,
5
+ "metaclass": "daily-life",
6
+ "class": "shopping-delivery",
7
+ "description": "On the King Arthur Baking website, purchase a Sourdough Starter Kit, select standard size, and complete checkout",
8
+ "sites_involved": [
9
+ "kingarthurbaking.com"
10
+ ],
11
+ "platform": "king-arthur-baking",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "On the King Arthur Baking website, purchase a Sourdough Starter Kit, select standard size, and complete checkout",
19
+ "eval_schema": {
20
+ "url_pattern": "__PLACEHOLDER_WILL_NOT_MATCH__",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": []
25
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "$schema": "../task.schema.json",
3
+ "metadata": {
4
+ "task_id": 533,
5
+ "metaclass": "daily-life",
6
+ "class": "utilities",
7
+ "description": "On InMyArea, enter home address (ZIP 90210), select an internet service plan, click Order Now, then on the provider's signup page fill in personal information and complete the subscription",
8
+ "sites_involved": [
9
+ "inmyarea.com"
10
+ ],
11
+ "platform": "inmyarea",
12
+ "common_info": {
13
+ "email_credentials": "credentials to use the assigned disposable email account",
14
+ "user_info": "alex_green_personal_info.json; the dummy user's personal information",
15
+ "user_resume": "PDF resume with disposable email account injected"
16
+ }
17
+ },
18
+ "instruction": "On InMyArea, enter home address (ZIP 90210), select an internet service plan, click Order Now, then on the provider's signup page fill in personal information and complete the subscription",
19
+ "eval_schema": {
20
+ "url_pattern": "att\\.com/msapi/salesapi/wireless-sales-eapi/v2/addtocart",
21
+ "method": "POST"
22
+ },
23
+ "time_limit": 30,
24
+ "extra_info": []
25
+ }