clawbench-cli 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clawbench/__init__.py +35 -0
- clawbench/__main__.py +8 -0
- clawbench/batch.py +619 -0
- clawbench/cli.py +397 -0
- clawbench/data/chrome-extension/README.md +127 -0
- clawbench/data/chrome-extension/background.js +50 -0
- clawbench/data/chrome-extension/content.js +70 -0
- clawbench/data/chrome-extension/manifest.json +25 -0
- clawbench/data/chrome-extension/setup.sh +27 -0
- clawbench/data/chrome-extension/stealth.js +200 -0
- clawbench/data/docker/Dockerfile +51 -0
- clawbench/data/docker/entrypoint.sh +394 -0
- clawbench/data/docker/setup-openclaw.sh +112 -0
- clawbench/data/eval/README.md +95 -0
- clawbench/data/eval/agentic_eval.md +53 -0
- clawbench/data/extension-server/.python-version +1 -0
- clawbench/data/extension-server/README.md +54 -0
- clawbench/data/extension-server/pyproject.toml +7 -0
- clawbench/data/extension-server/server.py +360 -0
- clawbench/data/extension-server/uv.lock +644 -0
- clawbench/data/models/model.schema.json +44 -0
- clawbench/data/models/models.example.yaml +16 -0
- clawbench/data/shared/alex_green_personal_info.json +451 -0
- clawbench/data/test-cases/001-daily-life-food-uber-eats/task.json +25 -0
- clawbench/data/test-cases/002-daily-life-food-doordash/task.json +25 -0
- clawbench/data/test-cases/004-daily-life-food-instacart/extra_info/grocery_list.json +36 -0
- clawbench/data/test-cases/004-daily-life-food-instacart/task.json +30 -0
- clawbench/data/test-cases/006-daily-life-food-uber-eats/task.json +24 -0
- clawbench/data/test-cases/007-daily-life-food-instacart/extra_info/meal_plan.json +21 -0
- clawbench/data/test-cases/007-daily-life-food-instacart/task.json +30 -0
- clawbench/data/test-cases/011-daily-life-housing-zillow/task.json +25 -0
- clawbench/data/test-cases/015-daily-life-housing-craigslist/extra_info/listing_details.json +26 -0
- clawbench/data/test-cases/015-daily-life-housing-craigslist/task.json +30 -0
- clawbench/data/test-cases/035-daily-life-health-medical-betterhelp/task.json +25 -0
- clawbench/data/test-cases/041-daily-life-pets-rover/task.json +25 -0
- clawbench/data/test-cases/043-daily-life-pets-rover/extra_info/pet_info.json +12 -0
- clawbench/data/test-cases/043-daily-life-pets-rover/task.json +30 -0
- clawbench/data/test-cases/045-daily-life-personal-care-booksy/task.json +25 -0
- clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/extra_info/address_info.json +7 -0
- clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/task.json +30 -0
- clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/extra_info/job_links.json +5 -0
- clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/task.json +30 -0
- clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/extra_info/job_links.json +5 -0
- clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/task.json +30 -0
- clawbench/data/test-cases/091-job-search-hr-job-apply-indeed/task.json +25 -0
- clawbench/data/test-cases/120-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
- clawbench/data/test-cases/121-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
- clawbench/data/test-cases/128-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
- clawbench/data/test-cases/134-office-secretary-tasks-calendar-calendly/task.json +25 -0
- clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/extra_info/meeting_details.json +30 -0
- clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/task.json +30 -0
- clawbench/data/test-cases/139-office-secretary-tasks-calendar-calendly/task.json +25 -0
- clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/extra_info/task_list.json +29 -0
- clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/task.json +30 -0
- clawbench/data/test-cases/179-dev-tech-github-ops-github/extra_info/config.json +13 -0
- clawbench/data/test-cases/179-dev-tech-github-ops-github/task.json +30 -0
- clawbench/data/test-cases/180-dev-tech-github-ops-github/task.json +25 -0
- clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/extra_info/raw_results.json +47 -0
- clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/task.json +30 -0
- clawbench/data/test-cases/242-academia-research-research-tools-overleaf/task.json +25 -0
- clawbench/data/test-cases/246-academia-research-research-tools-zotero/task.json +25 -0
- clawbench/data/test-cases/247-academia-research-research-tools-semantic-scholar/task.json +25 -0
- clawbench/data/test-cases/265-education-learning-general-coursera/task.json +25 -0
- clawbench/data/test-cases/266-education-learning-general-leetcode/extra_info/solution_code.py +9 -0
- clawbench/data/test-cases/266-education-learning-general-leetcode/task.json +30 -0
- clawbench/data/test-cases/273-education-learning-general-edx/task.json +25 -0
- clawbench/data/test-cases/274-education-learning-general-udemy/task.json +25 -0
- clawbench/data/test-cases/279-travel-general-airbnb/task.json +25 -0
- clawbench/data/test-cases/280-travel-general-booking-com/task.json +25 -0
- clawbench/data/test-cases/363-entertainment-hobbies-general-ticketmaster/task.json +25 -0
- clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/extra_info/book_list.json +14 -0
- clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/task.json +30 -0
- clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/extra_info/event_details.json +10 -0
- clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/task.json +30 -0
- clawbench/data/test-cases/403-personal-management-account-security-1password-web/extra_info/credentials.json +34 -0
- clawbench/data/test-cases/403-personal-management-account-security-1password-web/task.json +30 -0
- clawbench/data/test-cases/413-personal-management-personal-tools-todoist/extra_info/task_list.json +52 -0
- clawbench/data/test-cases/413-personal-management-personal-tools-todoist/task.json +30 -0
- clawbench/data/test-cases/468-rating-voting-general-glassdoor/extra_info/interview_experience.json +10 -0
- clawbench/data/test-cases/468-rating-voting-general-glassdoor/task.json +30 -0
- clawbench/data/test-cases/469-rating-voting-general-tripadvisor/extra_info/review_content.json +6 -0
- clawbench/data/test-cases/469-rating-voting-general-tripadvisor/task.json +30 -0
- clawbench/data/test-cases/470-rating-voting-general-trustpilot/extra_info/review_content.json +6 -0
- clawbench/data/test-cases/470-rating-voting-general-trustpilot/task.json +30 -0
- clawbench/data/test-cases/474-rating-voting-general-capterra/task.json +25 -0
- clawbench/data/test-cases/475-rating-voting-general-g2/task.json +25 -0
- clawbench/data/test-cases/482-creation-init-general-confluence/extra_info/content.json +3 -0
- clawbench/data/test-cases/482-creation-init-general-confluence/task.json +30 -0
- clawbench/data/test-cases/483-creation-init-general-airtable/task.json +25 -0
- clawbench/data/test-cases/484-creation-init-general-clickup/task.json +28 -0
- clawbench/data/test-cases/485-creation-init-general-webflow/task.json +25 -0
- clawbench/data/test-cases/486-creation-init-general-mailchimp/extra_info/content.json +3 -0
- clawbench/data/test-cases/486-creation-init-general-mailchimp/task.json +30 -0
- clawbench/data/test-cases/487-creation-init-general-typeform/extra_info/survey_questions.json +85 -0
- clawbench/data/test-cases/487-creation-init-general-typeform/task.json +30 -0
- clawbench/data/test-cases/488-creation-init-general-substack/extra_info/content.json +3 -0
- clawbench/data/test-cases/488-creation-init-general-substack/task.json +30 -0
- clawbench/data/test-cases/489-creation-init-general-ghost/extra_info/content.json +3 -0
- clawbench/data/test-cases/489-creation-init-general-ghost/task.json +30 -0
- clawbench/data/test-cases/501-creation-init-general-asana/extra_info/project_description.json +8 -0
- clawbench/data/test-cases/501-creation-init-general-asana/task.json +33 -0
- clawbench/data/test-cases/529-daily-life-shopping-delivery-king-arthur-baking/task.json +25 -0
- clawbench/data/test-cases/533-daily-life-utilities-inmyarea/task.json +25 -0
- clawbench/data/test-cases/535-daily-life-home-home-depot/task.json +25 -0
- clawbench/data/test-cases/537-daily-life-food-crumbl/task.json +25 -0
- clawbench/data/test-cases/539-daily-life-health-jefit/task.json +25 -0
- clawbench/data/test-cases/542-daily-life-pets-wag/task.json +25 -0
- clawbench/data/test-cases/551-finance-investment-crypto-wallet-trezor/task.json +25 -0
- clawbench/data/test-cases/552-finance-investment-business-payment-plooto/task.json +25 -0
- clawbench/data/test-cases/555-finance-investment-insurance-insureon/task.json +25 -0
- clawbench/data/test-cases/559-finance-investment-crowdfunding-frontfundr/task.json +25 -0
- clawbench/data/test-cases/564-daily-life-event-registration-race-roster/task.json +25 -0
- clawbench/data/test-cases/565-job-search-hr-job-search-jopwell/task.json +25 -0
- clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/extra_info/listing_details.json +26 -0
- clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/task.json +30 -0
- clawbench/data/test-cases/569-job-search-hr-job-search-careerbuilder/task.json +25 -0
- clawbench/data/test-cases/570-job-search-hr-job-search-hired/task.json +25 -0
- clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/extra_info/listing_details.json +26 -0
- clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/task.json +30 -0
- clawbench/data/test-cases/576-office-secretary-tasks-reports-ftc-reportfraud/task.json +25 -0
- clawbench/data/test-cases/583-office-secretary-tasks-support-tickets-freshdesk/task.json +25 -0
- clawbench/data/test-cases/598-academia-research-legal-docs-formswift/task.json +25 -0
- clawbench/data/test-cases/606-education-learning-kids-courses-outschool/task.json +25 -0
- clawbench/data/test-cases/607-education-learning-art-courses-creativebug/task.json +25 -0
- clawbench/data/test-cases/609-education-learning-meditation-spirit-rock-meditation-center/task.json +25 -0
- clawbench/data/test-cases/615-travel-flights-spirit-airlines/task.json +25 -0
- clawbench/data/test-cases/618-travel-train-bus-12go-asia/task.json +25 -0
- clawbench/data/test-cases/625-travel-camping-outdoor-parks-canada-reservations/task.json +25 -0
- clawbench/data/test-cases/626-travel-bus-flixbus/task.json +25 -0
- clawbench/data/test-cases/627-travel-flights-momondo/task.json +25 -0
- clawbench/data/test-cases/632-shopping-commerce-beauty-care-olaplex/task.json +25 -0
- clawbench/data/test-cases/634-shopping-commerce-apparel-dooney-bourke/task.json +25 -0
- clawbench/data/test-cases/635-shopping-commerce-gifts-uncommon-goods/task.json +25 -0
- clawbench/data/test-cases/636-shopping-commerce-auto-parts-rockauto/task.json +25 -0
- clawbench/data/test-cases/638-shopping-commerce-print-custom-vistaprint/task.json +25 -0
- clawbench/data/test-cases/639-shopping-commerce-luxury-mansur-gavriel/task.json +25 -0
- clawbench/data/test-cases/671-entertainment-gaming-humble-bundle/task.json +25 -0
- clawbench/data/test-cases/672-entertainment-hobbies-anime-streaming-crunchyroll/task.json +25 -0
- clawbench/data/test-cases/674-entertainment-hobbies-masterclass-masterclass/task.json +25 -0
- clawbench/data/test-cases/676-government-civic-legal-docs-legalnature/task.json +25 -0
- clawbench/data/test-cases/685-personal-management-budget-mgmt-everydollar/task.json +25 -0
- clawbench/data/test-cases/687-personal-management-vpn-subscription-ipvanish/task.json +25 -0
- clawbench/data/test-cases/688-personal-management-insurance-compare-insurify/task.json +25 -0
- clawbench/data/test-cases/695-automation-workflows-recurring-order-stumptown-coffee/task.json +25 -0
- clawbench/data/test-cases/697-automation-workflows-recurring-order-bean-box/task.json +25 -0
- clawbench/data/test-cases/699-automation-workflows-recurring-order-mistobox/task.json +25 -0
- clawbench/data/test-cases/700-deletion-revocation-data-deletion-deleteme/task.json +25 -0
- clawbench/data/test-cases/705-rating-voting-wine-review-vivino/task.json +25 -0
- clawbench/data/test-cases/706-rating-voting-beer-review-beeradvocate/task.json +25 -0
- clawbench/data/test-cases/707-rating-voting-social-wine-untappd/task.json +25 -0
- clawbench/data/test-cases/708-rating-voting-professor-review-ratemyprofessors/task.json +28 -0
- clawbench/data/test-cases/709-rating-voting-service-review-angi/task.json +25 -0
- clawbench/data/test-cases/710-creation-init-interior-design-roomsketcher/task.json +25 -0
- clawbench/data/test-cases/711-creation-init-color-design-coolors/task.json +25 -0
- clawbench/data/test-cases/712-creation-init-website-create-squarespace/task.json +25 -0
- clawbench/data/test-cases/713-creation-init-website-build-wix/task.json +25 -0
- clawbench/data/test-cases/735-home-services-maintenance-house-cleaning-bark/task.json +25 -0
- clawbench/data/test-cases/736-home-services-maintenance-plumbing-ace-hardware/task.json +25 -0
- clawbench/data/test-cases/737-home-services-maintenance-kitchen-remodel-lowes/task.json +25 -0
- clawbench/data/test-cases/738-home-services-maintenance-equipment-install-amazon-home-services/task.json +25 -0
- clawbench/data/test-cases/750-automotive-vehicle-services-car-insurance-compare-kanetix/task.json +25 -0
- clawbench/data/test-cases/751-automotive-vehicle-services-car-lease-sixt/task.json +25 -0
- clawbench/data/test-cases/754-automotive-vehicle-services-used-car-listing-autotrader/task.json +25 -0
- clawbench/data/test-cases/763-automotive-vehicle-services-car-lease-autoslash/task.json +25 -0
- clawbench/data/test-cases/766-nonprofit-charity-donation-doctors-without-borders-msf/task.json +25 -0
- clawbench/data/test-cases/768-nonprofit-charity-community-crowdfund-ioby/task.json +25 -0
- clawbench/data/test-cases/770-nonprofit-charity-volunteer-apply-on-make-a-wish-foundation-website-complete-and-submit-a-volunteer-application-form-selecting-the-wish-granter-role-and-entering-city-phoenix-az/task.json +25 -0
- clawbench/data/test-cases/774-nonprofit-charity-nonprofit-job-apply-charity-village/task.json +25 -0
- clawbench/data/test-cases/776-nonprofit-charity-volunteer-signup-idealist/task.json +25 -0
- clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/extra_info/payment_info.json +3 -0
- clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/task.json +30 -0
- clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/extra_info/address_info.json +4 -0
- clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/task.json +30 -0
- clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/extra_info/email_info.json +3 -0
- clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/task.json +30 -0
- clawbench/data/test-cases/782-beauty-personal-care-skincare-purchase-paulas-choice/task.json +24 -0
- clawbench/data/test-cases/783-beauty-personal-care-beauty-booking-ulta-beauty/task.json +24 -0
- clawbench/data/test-cases/785-beauty-personal-care-skincare-curology/task.json +25 -0
- clawbench/data/test-cases/788-beauty-personal-care-makeup-the-ordinary/task.json +25 -0
- clawbench/data/test-cases/789-beauty-personal-care-makeup-fenty-beauty/task.json +25 -0
- clawbench/data/test-cases/793-beauty-personal-care-beauty-retail-mac-cosmetics/task.json +25 -0
- clawbench/data/test-cases/794-beauty-personal-care-salon-booking-styleseat/task.json +25 -0
- clawbench/data/test-cases/795-pet-animal-care-pet-adoption-aspca/task.json +25 -0
- clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/extra_info/pet_info.json +12 -0
- clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/task.json +30 -0
- clawbench/data/test-cases/799-pet-animal-care-pet-insurance-aspca-pet-health-insurance/task.json +25 -0
- clawbench/data/test-cases/801-pet-animal-care-pet-friendly-travel-bringfido/task.json +25 -0
- clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/extra_info/pet_info.json +12 -0
- clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/task.json +30 -0
- clawbench/data/test-cases/807-pet-animal-care-pet-dna-embark/task.json +25 -0
- clawbench/data/test-cases/809-pet-animal-care-pet-adopt-petfinder/task.json +28 -0
- clawbench/data/test-cases/812-pet-animal-care-pet-subscription-ollie/task.json +25 -0
- clawbench/data/test-cases/815-personal-management-records-mgmt-myheritage/task.json +25 -0
- clawbench/data/test-cases/821-education-learning-reading-self-study-blinkist/task.json +25 -0
- clawbench/data/test-cases/861-entertainment-hobbies-movies-cineplex/task.json +25 -0
- clawbench/data/test-cases/862-entertainment-hobbies-movies-amc-theatres/task.json +25 -0
- clawbench/data/test-cases/864-entertainment-hobbies-show-tickets-ticketmaster/task.json +25 -0
- clawbench/data/test-cases/865-travel-outdoor-hipcamp/task.json +25 -0
- clawbench/data/test-cases/867-entertainment-hobbies-movies-fandango/task.json +25 -0
- clawbench/data/test-cases/872-daily-life-food-opentable/task.json +25 -0
- clawbench/data/test-cases/873-daily-life-food-resy/task.json +28 -0
- clawbench/data/test-cases/876-entertainment-hobbies-show-tickets-vivid-seats/task.json +25 -0
- clawbench/data/test-cases/877-entertainment-hobbies-show-tickets-stubhub/task.json +25 -0
- clawbench/data/test-cases/878-travel-outdoor-ontario-parks/task.json +25 -0
- clawbench/data/test-cases/883-education-learning-hobby-class-sur-la-table/task.json +25 -0
- clawbench/data/test-cases/884-entertainment-hobbies-experience-breakout-games/task.json +25 -0
- clawbench/data/test-cases/885-entertainment-hobbies-experience-bowlero/task.json +25 -0
- clawbench/data/test-cases/886-entertainment-hobbies-experience-topgolf/task.json +25 -0
- clawbench/data/test-cases/lite.json +226 -0
- clawbench/data/test-cases/lite.schema.json +105 -0
- clawbench/data/test-cases/task.schema.json +132 -0
- clawbench/data/tools/build_clawbench_lite_enc.py +161 -0
- clawbench/doctor.py +171 -0
- clawbench/engine.py +180 -0
- clawbench/generate_resume_pdf.py +140 -0
- clawbench/hf_upload.py +78 -0
- clawbench/image.py +127 -0
- clawbench/paths.py +150 -0
- clawbench/resume_template.json +104 -0
- clawbench/run.py +942 -0
- clawbench/tui.py +1401 -0
- clawbench_cli-0.1.2.dist-info/METADATA +770 -0
- clawbench_cli-0.1.2.dist-info/RECORD +226 -0
- clawbench_cli-0.1.2.dist-info/WHEEL +4 -0
- clawbench_cli-0.1.2.dist-info/entry_points.txt +4 -0
- clawbench_cli-0.1.2.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
{
|
|
2
|
+
"repo_name": "openclaw-agent-benchmark",
|
|
3
|
+
"visibility": "public",
|
|
4
|
+
"license": "MIT",
|
|
5
|
+
"description": "A benchmark for evaluating browser agents on real-world web tasks",
|
|
6
|
+
"topics": [
|
|
7
|
+
"benchmark",
|
|
8
|
+
"browser-agent",
|
|
9
|
+
"evaluation"
|
|
10
|
+
],
|
|
11
|
+
"has_wiki": false,
|
|
12
|
+
"has_issues": true
|
|
13
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "../task.schema.json",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"task_id": 179,
|
|
5
|
+
"metaclass": "dev-tech",
|
|
6
|
+
"class": "github-ops",
|
|
7
|
+
"description": "Create repo \"openclaw-agent-benchmark\" on GitHub + README + MIT LICENSE + .gitignore",
|
|
8
|
+
"sites_involved": [
|
|
9
|
+
"github.com"
|
|
10
|
+
],
|
|
11
|
+
"platform": "github",
|
|
12
|
+
"common_info": {
|
|
13
|
+
"email_credentials": "credentials to use the assigned disposable email account",
|
|
14
|
+
"user_info": "alex_green_personal_info.json; the dummy user's personal information",
|
|
15
|
+
"user_resume": "PDF resume with disposable email account injected"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"instruction": "Create repo \"openclaw-agent-benchmark\" on GitHub + README + MIT LICENSE + .gitignore",
|
|
19
|
+
"eval_schema": {
|
|
20
|
+
"url_pattern": "github\\.com/repositories$",
|
|
21
|
+
"method": "POST"
|
|
22
|
+
},
|
|
23
|
+
"time_limit": 30,
|
|
24
|
+
"extra_info": [
|
|
25
|
+
{
|
|
26
|
+
"path": "extra_info/config.json",
|
|
27
|
+
"description": "Configuration details for the task"
|
|
28
|
+
}
|
|
29
|
+
]
|
|
30
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "../task.schema.json",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"task_id": 180,
|
|
5
|
+
"metaclass": "dev-tech",
|
|
6
|
+
"class": "github-ops",
|
|
7
|
+
"description": "Fork huggingface/transformers on GitHub and create branch \"fix-tokenizer-bug\"",
|
|
8
|
+
"sites_involved": [
|
|
9
|
+
"github.com"
|
|
10
|
+
],
|
|
11
|
+
"platform": "github",
|
|
12
|
+
"common_info": {
|
|
13
|
+
"email_credentials": "credentials to use the assigned disposable email account",
|
|
14
|
+
"user_info": "alex_green_personal_info.json; the dummy user's personal information",
|
|
15
|
+
"user_resume": "PDF resume with disposable email account injected"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"instruction": "Fork huggingface/transformers on GitHub and create branch \"fix-tokenizer-bug\"",
|
|
19
|
+
"eval_schema": {
|
|
20
|
+
"url_pattern": "github\\.com/[^/]+/transformers/branches$",
|
|
21
|
+
"method": "POST"
|
|
22
|
+
},
|
|
23
|
+
"time_limit": 30,
|
|
24
|
+
"extra_info": []
|
|
25
|
+
}
|
clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/extra_info/raw_results.json
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
{
|
|
2
|
+
"models": [
|
|
3
|
+
"GPT-4o",
|
|
4
|
+
"Claude 3.5 Sonnet",
|
|
5
|
+
"Gemini 1.5 Pro",
|
|
6
|
+
"Llama 3.1 70B",
|
|
7
|
+
"Mistral Large"
|
|
8
|
+
],
|
|
9
|
+
"metrics": [
|
|
10
|
+
"Accuracy",
|
|
11
|
+
"F1-Score",
|
|
12
|
+
"Latency (ms)",
|
|
13
|
+
"Cost ($/1K tokens)"
|
|
14
|
+
],
|
|
15
|
+
"results": {
|
|
16
|
+
"GPT-4o": [
|
|
17
|
+
0.923,
|
|
18
|
+
0.918,
|
|
19
|
+
342,
|
|
20
|
+
0.015
|
|
21
|
+
],
|
|
22
|
+
"Claude 3.5 Sonnet": [
|
|
23
|
+
0.931,
|
|
24
|
+
0.926,
|
|
25
|
+
298,
|
|
26
|
+
0.018
|
|
27
|
+
],
|
|
28
|
+
"Gemini 1.5 Pro": [
|
|
29
|
+
0.908,
|
|
30
|
+
0.902,
|
|
31
|
+
410,
|
|
32
|
+
0.007
|
|
33
|
+
],
|
|
34
|
+
"Llama 3.1 70B": [
|
|
35
|
+
0.887,
|
|
36
|
+
0.881,
|
|
37
|
+
520,
|
|
38
|
+
0.001
|
|
39
|
+
],
|
|
40
|
+
"Mistral Large": [
|
|
41
|
+
0.901,
|
|
42
|
+
0.895,
|
|
43
|
+
380,
|
|
44
|
+
0.008
|
|
45
|
+
]
|
|
46
|
+
}
|
|
47
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "../task.schema.json",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"task_id": 215,
|
|
5
|
+
"metaclass": "academia-research",
|
|
6
|
+
"class": "paper-tables",
|
|
7
|
+
"description": "On Overleaf, create a new project and write a LaTeX booktabs table with 5 models x 4 metrics + Average + Delta row. The project must compile without errors and display the formatted table.",
|
|
8
|
+
"sites_involved": [
|
|
9
|
+
"overleaf.com"
|
|
10
|
+
],
|
|
11
|
+
"platform": "overleaf",
|
|
12
|
+
"common_info": {
|
|
13
|
+
"email_credentials": "credentials to use the assigned disposable email account",
|
|
14
|
+
"user_info": "alex_green_personal_info.json; the dummy user's personal information",
|
|
15
|
+
"user_resume": "PDF resume with disposable email account injected"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"instruction": "On Overleaf, create a new project and write a LaTeX booktabs table with 5 models x 4 metrics + Average + Delta row. The project must compile without errors and display the formatted table.",
|
|
19
|
+
"eval_schema": {
|
|
20
|
+
"url_pattern": "overleaf\\.com/project/[a-f0-9]+/compile",
|
|
21
|
+
"method": "POST"
|
|
22
|
+
},
|
|
23
|
+
"time_limit": 30,
|
|
24
|
+
"extra_info": [
|
|
25
|
+
{
|
|
26
|
+
"path": "extra_info/raw_results.json",
|
|
27
|
+
"description": "Raw benchmark results: 5 models × 4 metrics"
|
|
28
|
+
}
|
|
29
|
+
]
|
|
30
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "../task.schema.json",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"task_id": 242,
|
|
5
|
+
"metaclass": "academia-research",
|
|
6
|
+
"class": "research-tools",
|
|
7
|
+
"description": "Overleaf: Create project \"CVPR2026_Submission\" using CVPR 2026 template",
|
|
8
|
+
"sites_involved": [
|
|
9
|
+
"overleaf.com"
|
|
10
|
+
],
|
|
11
|
+
"platform": "overleaf",
|
|
12
|
+
"common_info": {
|
|
13
|
+
"email_credentials": "credentials to use the assigned disposable email account",
|
|
14
|
+
"user_info": "alex_green_personal_info.json; the dummy user's personal information",
|
|
15
|
+
"user_resume": "PDF resume with disposable email account injected"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"instruction": "Overleaf: Create project \"CVPR2026_Submission\" using CVPR 2026 template",
|
|
19
|
+
"eval_schema": {
|
|
20
|
+
"url_pattern": "overleaf\\.com/project/[a-f0-9]+/settings",
|
|
21
|
+
"method": "POST"
|
|
22
|
+
},
|
|
23
|
+
"time_limit": 30,
|
|
24
|
+
"extra_info": []
|
|
25
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "../task.schema.json",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"task_id": 246,
|
|
5
|
+
"metaclass": "academia-research",
|
|
6
|
+
"class": "research-tools",
|
|
7
|
+
"description": "Zotero Web: Batch import papers from 5 URLs with auto metadata",
|
|
8
|
+
"sites_involved": [
|
|
9
|
+
"zotero.org"
|
|
10
|
+
],
|
|
11
|
+
"platform": "zotero",
|
|
12
|
+
"common_info": {
|
|
13
|
+
"email_credentials": "credentials to use the assigned disposable email account",
|
|
14
|
+
"user_info": "alex_green_personal_info.json; the dummy user's personal information",
|
|
15
|
+
"user_resume": "PDF resume with disposable email account injected"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"instruction": "Zotero Web: Batch import papers from 5 URLs with auto metadata",
|
|
19
|
+
"eval_schema": {
|
|
20
|
+
"url_pattern": "api\\.zotero\\.org/users/\\d+/items",
|
|
21
|
+
"method": "POST"
|
|
22
|
+
},
|
|
23
|
+
"time_limit": 30,
|
|
24
|
+
"extra_info": []
|
|
25
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "../task.schema.json",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"task_id": 247,
|
|
5
|
+
"metaclass": "academia-research",
|
|
6
|
+
"class": "research-tools",
|
|
7
|
+
"description": "Semantic Scholar: Create feed for \"LLMs\"",
|
|
8
|
+
"sites_involved": [
|
|
9
|
+
"semanticscholar.org"
|
|
10
|
+
],
|
|
11
|
+
"platform": "semantic-scholar",
|
|
12
|
+
"common_info": {
|
|
13
|
+
"email_credentials": "credentials to use the assigned disposable email account",
|
|
14
|
+
"user_info": "alex_green_personal_info.json; the dummy user's personal information",
|
|
15
|
+
"user_resume": "PDF resume with disposable email account injected"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"instruction": "Semantic Scholar: Create feed for \"LLMs\"",
|
|
19
|
+
"eval_schema": {
|
|
20
|
+
"url_pattern": "semanticscholar\\.org/api/1/alert",
|
|
21
|
+
"method": "POST"
|
|
22
|
+
},
|
|
23
|
+
"time_limit": 30,
|
|
24
|
+
"extra_info": []
|
|
25
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "../task.schema.json",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"task_id": 265,
|
|
5
|
+
"metaclass": "education-learning",
|
|
6
|
+
"class": "general",
|
|
7
|
+
"description": "On Coursera, enroll in the \"Deep Learning Specialization\" by reaching the checkout or payment page with enrollment details filled in.",
|
|
8
|
+
"sites_involved": [
|
|
9
|
+
"coursera.org"
|
|
10
|
+
],
|
|
11
|
+
"platform": "coursera",
|
|
12
|
+
"common_info": {
|
|
13
|
+
"email_credentials": "credentials to use the assigned disposable email account",
|
|
14
|
+
"user_info": "alex_green_personal_info.json; the dummy user's personal information",
|
|
15
|
+
"user_resume": "PDF resume with disposable email account injected"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"instruction": "On Coursera, enroll in the \"Deep Learning Specialization\" by reaching the checkout or payment page with enrollment details filled in.",
|
|
19
|
+
"eval_schema": {
|
|
20
|
+
"url_pattern": "coursera\\.org/api/subscriptions\\.v1/\\?action=createCart",
|
|
21
|
+
"method": "POST"
|
|
22
|
+
},
|
|
23
|
+
"time_limit": 30,
|
|
24
|
+
"extra_info": []
|
|
25
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "../task.schema.json",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"task_id": 266,
|
|
5
|
+
"metaclass": "education-learning",
|
|
6
|
+
"class": "general",
|
|
7
|
+
"description": "On LeetCode, open the \"Two Sum\" problem, write a correct Python solution in the code editor, and click the Submit button.",
|
|
8
|
+
"sites_involved": [
|
|
9
|
+
"leetcode.com"
|
|
10
|
+
],
|
|
11
|
+
"platform": "leetcode",
|
|
12
|
+
"common_info": {
|
|
13
|
+
"email_credentials": "credentials to use the assigned disposable email account",
|
|
14
|
+
"user_info": "alex_green_personal_info.json; the dummy user's personal information",
|
|
15
|
+
"user_resume": "PDF resume with disposable email account injected"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"instruction": "On LeetCode, open the \"Two Sum\" problem, write a correct Python solution in the code editor, and click the Submit button.",
|
|
19
|
+
"eval_schema": {
|
|
20
|
+
"url_pattern": "leetcode\\.com/problems/two-sum/submit",
|
|
21
|
+
"method": "POST"
|
|
22
|
+
},
|
|
23
|
+
"time_limit": 30,
|
|
24
|
+
"extra_info": [
|
|
25
|
+
{
|
|
26
|
+
"path": "extra_info/solution_code.py",
|
|
27
|
+
"description": "Python solution for the Two Sum problem"
|
|
28
|
+
}
|
|
29
|
+
]
|
|
30
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "../task.schema.json",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"task_id": 273,
|
|
5
|
+
"metaclass": "education-learning",
|
|
6
|
+
"class": "general",
|
|
7
|
+
"description": "On edX, enroll in CS50 by selecting the free audit track and completing the enrollment process until the course content or dashboard is accessible.",
|
|
8
|
+
"sites_involved": [
|
|
9
|
+
"edx.org"
|
|
10
|
+
],
|
|
11
|
+
"platform": "edx",
|
|
12
|
+
"common_info": {
|
|
13
|
+
"email_credentials": "credentials to use the assigned disposable email account",
|
|
14
|
+
"user_info": "alex_green_personal_info.json; the dummy user's personal information",
|
|
15
|
+
"user_resume": "PDF resume with disposable email account injected"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"instruction": "On edX, enroll in CS50 by selecting the free audit track and completing the enrollment process until the course content or dashboard is accessible.",
|
|
19
|
+
"eval_schema": {
|
|
20
|
+
"url_pattern": "www\\.edx\\.org/track-select/",
|
|
21
|
+
"method": "POST"
|
|
22
|
+
},
|
|
23
|
+
"time_limit": 30,
|
|
24
|
+
"extra_info": []
|
|
25
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "../task.schema.json",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"task_id": 274,
|
|
5
|
+
"metaclass": "education-learning",
|
|
6
|
+
"class": "general",
|
|
7
|
+
"description": "Udemy: Purchase \"Complete Python Bootcamp\"",
|
|
8
|
+
"sites_involved": [
|
|
9
|
+
"udemy.com"
|
|
10
|
+
],
|
|
11
|
+
"platform": "udemy",
|
|
12
|
+
"common_info": {
|
|
13
|
+
"email_credentials": "credentials to use the assigned disposable email account",
|
|
14
|
+
"user_info": "alex_green_personal_info.json; the dummy user's personal information",
|
|
15
|
+
"user_resume": "PDF resume with disposable email account injected"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"instruction": "Udemy: Purchase \"Complete Python Bootcamp\"",
|
|
19
|
+
"eval_schema": {
|
|
20
|
+
"url_pattern": "__PLACEHOLDER_WILL_NOT_MATCH__",
|
|
21
|
+
"method": "POST"
|
|
22
|
+
},
|
|
23
|
+
"time_limit": 30,
|
|
24
|
+
"extra_info": []
|
|
25
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "../task.schema.json",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"task_id": 279,
|
|
5
|
+
"metaclass": "travel",
|
|
6
|
+
"class": "general",
|
|
7
|
+
"description": "On Airbnb, search for an apartment in Shibuya, Tokyo for July 01–07 next year for 2 guests, select a listing, and proceed to the \"Confirm and pay\" page with all guest and payment details filled in.",
|
|
8
|
+
"sites_involved": [
|
|
9
|
+
"airbnb.com"
|
|
10
|
+
],
|
|
11
|
+
"platform": "airbnb",
|
|
12
|
+
"common_info": {
|
|
13
|
+
"email_credentials": "credentials to use the assigned disposable email account",
|
|
14
|
+
"user_info": "alex_green_personal_info.json; the dummy user's personal information",
|
|
15
|
+
"user_resume": "PDF resume with disposable email account injected"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"instruction": "On Airbnb, search for an apartment in Shibuya, Tokyo for July 01–07 next year for 2 guests, select a listing, and proceed to the \"Confirm and pay\" page with all guest and payment details filled in.",
|
|
19
|
+
"eval_schema": {
|
|
20
|
+
"url_pattern": "__PLACEHOLDER_WILL_NOT_MATCH__",
|
|
21
|
+
"method": "POST"
|
|
22
|
+
},
|
|
23
|
+
"time_limit": 30,
|
|
24
|
+
"extra_info": []
|
|
25
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "../task.schema.json",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"task_id": 280,
|
|
5
|
+
"metaclass": "travel",
|
|
6
|
+
"class": "general",
|
|
7
|
+
"description": "Booking.com: Book a hotel in central Paris for 3 nights next August, double room, breakfast included, free cancellation",
|
|
8
|
+
"sites_involved": [
|
|
9
|
+
"booking.com"
|
|
10
|
+
],
|
|
11
|
+
"platform": "booking-com",
|
|
12
|
+
"common_info": {
|
|
13
|
+
"email_credentials": "credentials to use the assigned disposable email account",
|
|
14
|
+
"user_info": "alex_green_personal_info.json; the dummy user's personal information",
|
|
15
|
+
"user_resume": "PDF resume with disposable email account injected"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"instruction": "Booking.com: Book a hotel in central Paris for 3 nights next August, double room, breakfast included, free cancellation",
|
|
19
|
+
"eval_schema": {
|
|
20
|
+
"url_pattern": "__PLACEHOLDER_WILL_NOT_MATCH__",
|
|
21
|
+
"method": "POST"
|
|
22
|
+
},
|
|
23
|
+
"time_limit": 30,
|
|
24
|
+
"extra_info": []
|
|
25
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "../task.schema.json",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"task_id": 363,
|
|
5
|
+
"metaclass": "entertainment-hobbies",
|
|
6
|
+
"class": "general",
|
|
7
|
+
"description": "Ticketmaster: Buy 2 tickets for an upcoming Raptors home game",
|
|
8
|
+
"sites_involved": [
|
|
9
|
+
"ticketmaster.ca"
|
|
10
|
+
],
|
|
11
|
+
"platform": "ticketmaster",
|
|
12
|
+
"common_info": {
|
|
13
|
+
"email_credentials": "credentials to use the assigned disposable email account",
|
|
14
|
+
"user_info": "alex_green_personal_info.json; the dummy user's personal information",
|
|
15
|
+
"user_resume": "PDF resume with disposable email account injected"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"instruction": "Ticketmaster: Buy 2 tickets for an upcoming Raptors home game",
|
|
19
|
+
"eval_schema": {
|
|
20
|
+
"url_pattern": "__PLACEHOLDER_WILL_NOT_MATCH__",
|
|
21
|
+
"method": "POST"
|
|
22
|
+
},
|
|
23
|
+
"time_limit": 30,
|
|
24
|
+
"extra_info": []
|
|
25
|
+
}
|
clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/extra_info/book_list.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
{
|
|
2
|
+
"books": [
|
|
3
|
+
"Deep Learning by Ian Goodfellow",
|
|
4
|
+
"Pattern Recognition and Machine Learning by Christopher Bishop",
|
|
5
|
+
"The Hundred-Page Machine Learning Book by Andriy Burkov",
|
|
6
|
+
"Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow by Aur\u00e9lien G\u00e9ron",
|
|
7
|
+
"Machine Learning: A Probabilistic Perspective by Kevin Murphy",
|
|
8
|
+
"Reinforcement Learning: An Introduction by Richard Sutton",
|
|
9
|
+
"Speech and Language Processing by Dan Jurafsky",
|
|
10
|
+
"Information Theory, Inference and Learning Algorithms by David MacKay",
|
|
11
|
+
"Probabilistic Graphical Models by Daphne Koller",
|
|
12
|
+
"The Elements of Statistical Learning by Trevor Hastie"
|
|
13
|
+
]
|
|
14
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "../task.schema.json",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"task_id": 369,
|
|
5
|
+
"metaclass": "entertainment-hobbies",
|
|
6
|
+
"class": "general",
|
|
7
|
+
"description": "Goodreads: Create a shelf \"ML Must-Reads\" and add 10 books",
|
|
8
|
+
"sites_involved": [
|
|
9
|
+
"goodreads.com"
|
|
10
|
+
],
|
|
11
|
+
"platform": "goodreads",
|
|
12
|
+
"common_info": {
|
|
13
|
+
"email_credentials": "credentials to use the assigned disposable email account",
|
|
14
|
+
"user_info": "alex_green_personal_info.json; the dummy user's personal information",
|
|
15
|
+
"user_resume": "PDF resume with disposable email account injected"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"instruction": "Goodreads: Create a shelf \"ML Must-Reads\" and add 10 books",
|
|
19
|
+
"eval_schema": {
|
|
20
|
+
"url_pattern": "goodreads\\.com/shelf/add_to_shelf",
|
|
21
|
+
"method": "POST"
|
|
22
|
+
},
|
|
23
|
+
"time_limit": 30,
|
|
24
|
+
"extra_info": [
|
|
25
|
+
{
|
|
26
|
+
"path": "extra_info/book_list.json",
|
|
27
|
+
"description": "List of 10 ML books to add to shelf"
|
|
28
|
+
}
|
|
29
|
+
]
|
|
30
|
+
}
|
clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/extra_info/event_details.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
{
|
|
2
|
+
"event_name": "ML Paper Reading Group",
|
|
3
|
+
"event_type": "free",
|
|
4
|
+
"date": "2026-04-15",
|
|
5
|
+
"time": "6:00 PM - 8:00 PM ET",
|
|
6
|
+
"location": "Online (Zoom)",
|
|
7
|
+
"description": "Weekly paper reading group focusing on recent ML research. This week: attention mechanisms and transformer architectures. All levels welcome.",
|
|
8
|
+
"capacity": 50,
|
|
9
|
+
"organizer": "Alex Green"
|
|
10
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "../task.schema.json",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"task_id": 372,
|
|
5
|
+
"metaclass": "entertainment-hobbies",
|
|
6
|
+
"class": "general",
|
|
7
|
+
"description": "Eventbrite: Create a free event \"ML Paper Reading Group\"",
|
|
8
|
+
"sites_involved": [
|
|
9
|
+
"eventbrite.com"
|
|
10
|
+
],
|
|
11
|
+
"platform": "eventbrite",
|
|
12
|
+
"common_info": {
|
|
13
|
+
"email_credentials": "credentials to use the assigned disposable email account",
|
|
14
|
+
"user_info": "alex_green_personal_info.json; the dummy user's personal information",
|
|
15
|
+
"user_resume": "PDF resume with disposable email account injected"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"instruction": "Eventbrite: Create a free event \"ML Paper Reading Group\"",
|
|
19
|
+
"eval_schema": {
|
|
20
|
+
"url_pattern": "www\\.eventbrite\\.com/api/v3/organizations/\\d+/events/auto-create",
|
|
21
|
+
"method": "POST"
|
|
22
|
+
},
|
|
23
|
+
"time_limit": 30,
|
|
24
|
+
"extra_info": [
|
|
25
|
+
{
|
|
26
|
+
"path": "extra_info/event_details.json",
|
|
27
|
+
"description": "Event details including name, date, time, location, and description"
|
|
28
|
+
}
|
|
29
|
+
]
|
|
30
|
+
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
{
|
|
2
|
+
"entries": [
|
|
3
|
+
{
|
|
4
|
+
"site": "github.com",
|
|
5
|
+
"username": "alexgreen-dev",
|
|
6
|
+
"password": "Gh$ecure2026!ax",
|
|
7
|
+
"notes": "Personal GitHub account"
|
|
8
|
+
},
|
|
9
|
+
{
|
|
10
|
+
"site": "aws.amazon.com",
|
|
11
|
+
"username": "alex.green@pinecresttech.com",
|
|
12
|
+
"password": "AwS#Pr0d2026!kx",
|
|
13
|
+
"notes": "Work AWS console"
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"site": "slack.com",
|
|
17
|
+
"username": "alex.green@pinecresttech.com",
|
|
18
|
+
"password": "Sl@ckW0rk2026!mn",
|
|
19
|
+
"notes": "Pinecrest Slack"
|
|
20
|
+
},
|
|
21
|
+
{
|
|
22
|
+
"site": "jira.atlassian.com",
|
|
23
|
+
"username": "alex.green@pinecresttech.com",
|
|
24
|
+
"password": "J1r@Pr0j2026!vz",
|
|
25
|
+
"notes": "Project management"
|
|
26
|
+
},
|
|
27
|
+
{
|
|
28
|
+
"site": "notion.so",
|
|
29
|
+
"username": "alex.green.uoft@clawbench.cc",
|
|
30
|
+
"password": "N0t10n2026!Pers",
|
|
31
|
+
"notes": "Personal Notion"
|
|
32
|
+
}
|
|
33
|
+
]
|
|
34
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "../task.schema.json",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"task_id": 403,
|
|
5
|
+
"metaclass": "personal-management",
|
|
6
|
+
"class": "account-security",
|
|
7
|
+
"description": "Add 5 login entries to 1Password",
|
|
8
|
+
"sites_involved": [
|
|
9
|
+
"1password.com"
|
|
10
|
+
],
|
|
11
|
+
"platform": "1password-web",
|
|
12
|
+
"common_info": {
|
|
13
|
+
"email_credentials": "credentials to use the assigned disposable email account",
|
|
14
|
+
"user_info": "alex_green_personal_info.json; the dummy user's personal information",
|
|
15
|
+
"user_resume": "PDF resume with disposable email account injected"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"instruction": "Add 5 login entries to 1Password",
|
|
19
|
+
"eval_schema": {
|
|
20
|
+
"url_pattern": "my\\.1password\\.com/api/v3/vault/[a-z0-9]+/\\d+/items",
|
|
21
|
+
"method": "PATCH"
|
|
22
|
+
},
|
|
23
|
+
"time_limit": 30,
|
|
24
|
+
"extra_info": [
|
|
25
|
+
{
|
|
26
|
+
"path": "extra_info/credentials.json",
|
|
27
|
+
"description": "Login credentials for 5 services to add to password manager"
|
|
28
|
+
}
|
|
29
|
+
]
|
|
30
|
+
}
|
clawbench/data/test-cases/413-personal-management-personal-tools-todoist/extra_info/task_list.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
{
|
|
2
|
+
"tasks": [
|
|
3
|
+
{
|
|
4
|
+
"name": "Define Q3 OKRs",
|
|
5
|
+
"assignee": "Alex Green",
|
|
6
|
+
"due_date": "2026-04-07",
|
|
7
|
+
"priority": "high"
|
|
8
|
+
},
|
|
9
|
+
{
|
|
10
|
+
"name": "Review architecture RFC",
|
|
11
|
+
"assignee": "Jordan Peters",
|
|
12
|
+
"due_date": "2026-04-10",
|
|
13
|
+
"priority": "high"
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"name": "Set up CI/CD pipeline",
|
|
17
|
+
"assignee": "Alex Green",
|
|
18
|
+
"due_date": "2026-04-14",
|
|
19
|
+
"priority": "medium"
|
|
20
|
+
},
|
|
21
|
+
{
|
|
22
|
+
"name": "Write integration tests",
|
|
23
|
+
"assignee": "Alex Green",
|
|
24
|
+
"due_date": "2026-04-18",
|
|
25
|
+
"priority": "medium"
|
|
26
|
+
},
|
|
27
|
+
{
|
|
28
|
+
"name": "Prepare sprint demo",
|
|
29
|
+
"assignee": "Jordan Peters",
|
|
30
|
+
"due_date": "2026-04-21",
|
|
31
|
+
"priority": "low"
|
|
32
|
+
},
|
|
33
|
+
{
|
|
34
|
+
"name": "Update documentation",
|
|
35
|
+
"assignee": "Alex Green",
|
|
36
|
+
"due_date": "2026-04-25",
|
|
37
|
+
"priority": "low"
|
|
38
|
+
},
|
|
39
|
+
{
|
|
40
|
+
"name": "Security audit review",
|
|
41
|
+
"assignee": "Alex Green",
|
|
42
|
+
"due_date": "2026-04-28",
|
|
43
|
+
"priority": "high"
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"name": "Performance benchmarking",
|
|
47
|
+
"assignee": "Alex Green",
|
|
48
|
+
"due_date": "2026-04-30",
|
|
49
|
+
"priority": "medium"
|
|
50
|
+
}
|
|
51
|
+
]
|
|
52
|
+
}
|