clawbench-cli 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clawbench/__init__.py +35 -0
- clawbench/__main__.py +8 -0
- clawbench/batch.py +619 -0
- clawbench/cli.py +397 -0
- clawbench/data/chrome-extension/README.md +127 -0
- clawbench/data/chrome-extension/background.js +50 -0
- clawbench/data/chrome-extension/content.js +70 -0
- clawbench/data/chrome-extension/manifest.json +25 -0
- clawbench/data/chrome-extension/setup.sh +27 -0
- clawbench/data/chrome-extension/stealth.js +200 -0
- clawbench/data/docker/Dockerfile +51 -0
- clawbench/data/docker/entrypoint.sh +394 -0
- clawbench/data/docker/setup-openclaw.sh +112 -0
- clawbench/data/eval/README.md +95 -0
- clawbench/data/eval/agentic_eval.md +53 -0
- clawbench/data/extension-server/.python-version +1 -0
- clawbench/data/extension-server/README.md +54 -0
- clawbench/data/extension-server/pyproject.toml +7 -0
- clawbench/data/extension-server/server.py +360 -0
- clawbench/data/extension-server/uv.lock +644 -0
- clawbench/data/models/model.schema.json +44 -0
- clawbench/data/models/models.example.yaml +16 -0
- clawbench/data/shared/alex_green_personal_info.json +451 -0
- clawbench/data/test-cases/001-daily-life-food-uber-eats/task.json +25 -0
- clawbench/data/test-cases/002-daily-life-food-doordash/task.json +25 -0
- clawbench/data/test-cases/004-daily-life-food-instacart/extra_info/grocery_list.json +36 -0
- clawbench/data/test-cases/004-daily-life-food-instacart/task.json +30 -0
- clawbench/data/test-cases/006-daily-life-food-uber-eats/task.json +24 -0
- clawbench/data/test-cases/007-daily-life-food-instacart/extra_info/meal_plan.json +21 -0
- clawbench/data/test-cases/007-daily-life-food-instacart/task.json +30 -0
- clawbench/data/test-cases/011-daily-life-housing-zillow/task.json +25 -0
- clawbench/data/test-cases/015-daily-life-housing-craigslist/extra_info/listing_details.json +26 -0
- clawbench/data/test-cases/015-daily-life-housing-craigslist/task.json +30 -0
- clawbench/data/test-cases/035-daily-life-health-medical-betterhelp/task.json +25 -0
- clawbench/data/test-cases/041-daily-life-pets-rover/task.json +25 -0
- clawbench/data/test-cases/043-daily-life-pets-rover/extra_info/pet_info.json +12 -0
- clawbench/data/test-cases/043-daily-life-pets-rover/task.json +30 -0
- clawbench/data/test-cases/045-daily-life-personal-care-booksy/task.json +25 -0
- clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/extra_info/address_info.json +7 -0
- clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/task.json +30 -0
- clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/extra_info/job_links.json +5 -0
- clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/task.json +30 -0
- clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/extra_info/job_links.json +5 -0
- clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/task.json +30 -0
- clawbench/data/test-cases/091-job-search-hr-job-apply-indeed/task.json +25 -0
- clawbench/data/test-cases/120-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
- clawbench/data/test-cases/121-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
- clawbench/data/test-cases/128-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
- clawbench/data/test-cases/134-office-secretary-tasks-calendar-calendly/task.json +25 -0
- clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/extra_info/meeting_details.json +30 -0
- clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/task.json +30 -0
- clawbench/data/test-cases/139-office-secretary-tasks-calendar-calendly/task.json +25 -0
- clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/extra_info/task_list.json +29 -0
- clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/task.json +30 -0
- clawbench/data/test-cases/179-dev-tech-github-ops-github/extra_info/config.json +13 -0
- clawbench/data/test-cases/179-dev-tech-github-ops-github/task.json +30 -0
- clawbench/data/test-cases/180-dev-tech-github-ops-github/task.json +25 -0
- clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/extra_info/raw_results.json +47 -0
- clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/task.json +30 -0
- clawbench/data/test-cases/242-academia-research-research-tools-overleaf/task.json +25 -0
- clawbench/data/test-cases/246-academia-research-research-tools-zotero/task.json +25 -0
- clawbench/data/test-cases/247-academia-research-research-tools-semantic-scholar/task.json +25 -0
- clawbench/data/test-cases/265-education-learning-general-coursera/task.json +25 -0
- clawbench/data/test-cases/266-education-learning-general-leetcode/extra_info/solution_code.py +9 -0
- clawbench/data/test-cases/266-education-learning-general-leetcode/task.json +30 -0
- clawbench/data/test-cases/273-education-learning-general-edx/task.json +25 -0
- clawbench/data/test-cases/274-education-learning-general-udemy/task.json +25 -0
- clawbench/data/test-cases/279-travel-general-airbnb/task.json +25 -0
- clawbench/data/test-cases/280-travel-general-booking-com/task.json +25 -0
- clawbench/data/test-cases/363-entertainment-hobbies-general-ticketmaster/task.json +25 -0
- clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/extra_info/book_list.json +14 -0
- clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/task.json +30 -0
- clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/extra_info/event_details.json +10 -0
- clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/task.json +30 -0
- clawbench/data/test-cases/403-personal-management-account-security-1password-web/extra_info/credentials.json +34 -0
- clawbench/data/test-cases/403-personal-management-account-security-1password-web/task.json +30 -0
- clawbench/data/test-cases/413-personal-management-personal-tools-todoist/extra_info/task_list.json +52 -0
- clawbench/data/test-cases/413-personal-management-personal-tools-todoist/task.json +30 -0
- clawbench/data/test-cases/468-rating-voting-general-glassdoor/extra_info/interview_experience.json +10 -0
- clawbench/data/test-cases/468-rating-voting-general-glassdoor/task.json +30 -0
- clawbench/data/test-cases/469-rating-voting-general-tripadvisor/extra_info/review_content.json +6 -0
- clawbench/data/test-cases/469-rating-voting-general-tripadvisor/task.json +30 -0
- clawbench/data/test-cases/470-rating-voting-general-trustpilot/extra_info/review_content.json +6 -0
- clawbench/data/test-cases/470-rating-voting-general-trustpilot/task.json +30 -0
- clawbench/data/test-cases/474-rating-voting-general-capterra/task.json +25 -0
- clawbench/data/test-cases/475-rating-voting-general-g2/task.json +25 -0
- clawbench/data/test-cases/482-creation-init-general-confluence/extra_info/content.json +3 -0
- clawbench/data/test-cases/482-creation-init-general-confluence/task.json +30 -0
- clawbench/data/test-cases/483-creation-init-general-airtable/task.json +25 -0
- clawbench/data/test-cases/484-creation-init-general-clickup/task.json +28 -0
- clawbench/data/test-cases/485-creation-init-general-webflow/task.json +25 -0
- clawbench/data/test-cases/486-creation-init-general-mailchimp/extra_info/content.json +3 -0
- clawbench/data/test-cases/486-creation-init-general-mailchimp/task.json +30 -0
- clawbench/data/test-cases/487-creation-init-general-typeform/extra_info/survey_questions.json +85 -0
- clawbench/data/test-cases/487-creation-init-general-typeform/task.json +30 -0
- clawbench/data/test-cases/488-creation-init-general-substack/extra_info/content.json +3 -0
- clawbench/data/test-cases/488-creation-init-general-substack/task.json +30 -0
- clawbench/data/test-cases/489-creation-init-general-ghost/extra_info/content.json +3 -0
- clawbench/data/test-cases/489-creation-init-general-ghost/task.json +30 -0
- clawbench/data/test-cases/501-creation-init-general-asana/extra_info/project_description.json +8 -0
- clawbench/data/test-cases/501-creation-init-general-asana/task.json +33 -0
- clawbench/data/test-cases/529-daily-life-shopping-delivery-king-arthur-baking/task.json +25 -0
- clawbench/data/test-cases/533-daily-life-utilities-inmyarea/task.json +25 -0
- clawbench/data/test-cases/535-daily-life-home-home-depot/task.json +25 -0
- clawbench/data/test-cases/537-daily-life-food-crumbl/task.json +25 -0
- clawbench/data/test-cases/539-daily-life-health-jefit/task.json +25 -0
- clawbench/data/test-cases/542-daily-life-pets-wag/task.json +25 -0
- clawbench/data/test-cases/551-finance-investment-crypto-wallet-trezor/task.json +25 -0
- clawbench/data/test-cases/552-finance-investment-business-payment-plooto/task.json +25 -0
- clawbench/data/test-cases/555-finance-investment-insurance-insureon/task.json +25 -0
- clawbench/data/test-cases/559-finance-investment-crowdfunding-frontfundr/task.json +25 -0
- clawbench/data/test-cases/564-daily-life-event-registration-race-roster/task.json +25 -0
- clawbench/data/test-cases/565-job-search-hr-job-search-jopwell/task.json +25 -0
- clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/extra_info/listing_details.json +26 -0
- clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/task.json +30 -0
- clawbench/data/test-cases/569-job-search-hr-job-search-careerbuilder/task.json +25 -0
- clawbench/data/test-cases/570-job-search-hr-job-search-hired/task.json +25 -0
- clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/extra_info/listing_details.json +26 -0
- clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/task.json +30 -0
- clawbench/data/test-cases/576-office-secretary-tasks-reports-ftc-reportfraud/task.json +25 -0
- clawbench/data/test-cases/583-office-secretary-tasks-support-tickets-freshdesk/task.json +25 -0
- clawbench/data/test-cases/598-academia-research-legal-docs-formswift/task.json +25 -0
- clawbench/data/test-cases/606-education-learning-kids-courses-outschool/task.json +25 -0
- clawbench/data/test-cases/607-education-learning-art-courses-creativebug/task.json +25 -0
- clawbench/data/test-cases/609-education-learning-meditation-spirit-rock-meditation-center/task.json +25 -0
- clawbench/data/test-cases/615-travel-flights-spirit-airlines/task.json +25 -0
- clawbench/data/test-cases/618-travel-train-bus-12go-asia/task.json +25 -0
- clawbench/data/test-cases/625-travel-camping-outdoor-parks-canada-reservations/task.json +25 -0
- clawbench/data/test-cases/626-travel-bus-flixbus/task.json +25 -0
- clawbench/data/test-cases/627-travel-flights-momondo/task.json +25 -0
- clawbench/data/test-cases/632-shopping-commerce-beauty-care-olaplex/task.json +25 -0
- clawbench/data/test-cases/634-shopping-commerce-apparel-dooney-bourke/task.json +25 -0
- clawbench/data/test-cases/635-shopping-commerce-gifts-uncommon-goods/task.json +25 -0
- clawbench/data/test-cases/636-shopping-commerce-auto-parts-rockauto/task.json +25 -0
- clawbench/data/test-cases/638-shopping-commerce-print-custom-vistaprint/task.json +25 -0
- clawbench/data/test-cases/639-shopping-commerce-luxury-mansur-gavriel/task.json +25 -0
- clawbench/data/test-cases/671-entertainment-gaming-humble-bundle/task.json +25 -0
- clawbench/data/test-cases/672-entertainment-hobbies-anime-streaming-crunchyroll/task.json +25 -0
- clawbench/data/test-cases/674-entertainment-hobbies-masterclass-masterclass/task.json +25 -0
- clawbench/data/test-cases/676-government-civic-legal-docs-legalnature/task.json +25 -0
- clawbench/data/test-cases/685-personal-management-budget-mgmt-everydollar/task.json +25 -0
- clawbench/data/test-cases/687-personal-management-vpn-subscription-ipvanish/task.json +25 -0
- clawbench/data/test-cases/688-personal-management-insurance-compare-insurify/task.json +25 -0
- clawbench/data/test-cases/695-automation-workflows-recurring-order-stumptown-coffee/task.json +25 -0
- clawbench/data/test-cases/697-automation-workflows-recurring-order-bean-box/task.json +25 -0
- clawbench/data/test-cases/699-automation-workflows-recurring-order-mistobox/task.json +25 -0
- clawbench/data/test-cases/700-deletion-revocation-data-deletion-deleteme/task.json +25 -0
- clawbench/data/test-cases/705-rating-voting-wine-review-vivino/task.json +25 -0
- clawbench/data/test-cases/706-rating-voting-beer-review-beeradvocate/task.json +25 -0
- clawbench/data/test-cases/707-rating-voting-social-wine-untappd/task.json +25 -0
- clawbench/data/test-cases/708-rating-voting-professor-review-ratemyprofessors/task.json +28 -0
- clawbench/data/test-cases/709-rating-voting-service-review-angi/task.json +25 -0
- clawbench/data/test-cases/710-creation-init-interior-design-roomsketcher/task.json +25 -0
- clawbench/data/test-cases/711-creation-init-color-design-coolors/task.json +25 -0
- clawbench/data/test-cases/712-creation-init-website-create-squarespace/task.json +25 -0
- clawbench/data/test-cases/713-creation-init-website-build-wix/task.json +25 -0
- clawbench/data/test-cases/735-home-services-maintenance-house-cleaning-bark/task.json +25 -0
- clawbench/data/test-cases/736-home-services-maintenance-plumbing-ace-hardware/task.json +25 -0
- clawbench/data/test-cases/737-home-services-maintenance-kitchen-remodel-lowes/task.json +25 -0
- clawbench/data/test-cases/738-home-services-maintenance-equipment-install-amazon-home-services/task.json +25 -0
- clawbench/data/test-cases/750-automotive-vehicle-services-car-insurance-compare-kanetix/task.json +25 -0
- clawbench/data/test-cases/751-automotive-vehicle-services-car-lease-sixt/task.json +25 -0
- clawbench/data/test-cases/754-automotive-vehicle-services-used-car-listing-autotrader/task.json +25 -0
- clawbench/data/test-cases/763-automotive-vehicle-services-car-lease-autoslash/task.json +25 -0
- clawbench/data/test-cases/766-nonprofit-charity-donation-doctors-without-borders-msf/task.json +25 -0
- clawbench/data/test-cases/768-nonprofit-charity-community-crowdfund-ioby/task.json +25 -0
- clawbench/data/test-cases/770-nonprofit-charity-volunteer-apply-on-make-a-wish-foundation-website-complete-and-submit-a-volunteer-application-form-selecting-the-wish-granter-role-and-entering-city-phoenix-az/task.json +25 -0
- clawbench/data/test-cases/774-nonprofit-charity-nonprofit-job-apply-charity-village/task.json +25 -0
- clawbench/data/test-cases/776-nonprofit-charity-volunteer-signup-idealist/task.json +25 -0
- clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/extra_info/payment_info.json +3 -0
- clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/task.json +30 -0
- clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/extra_info/address_info.json +4 -0
- clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/task.json +30 -0
- clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/extra_info/email_info.json +3 -0
- clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/task.json +30 -0
- clawbench/data/test-cases/782-beauty-personal-care-skincare-purchase-paulas-choice/task.json +24 -0
- clawbench/data/test-cases/783-beauty-personal-care-beauty-booking-ulta-beauty/task.json +24 -0
- clawbench/data/test-cases/785-beauty-personal-care-skincare-curology/task.json +25 -0
- clawbench/data/test-cases/788-beauty-personal-care-makeup-the-ordinary/task.json +25 -0
- clawbench/data/test-cases/789-beauty-personal-care-makeup-fenty-beauty/task.json +25 -0
- clawbench/data/test-cases/793-beauty-personal-care-beauty-retail-mac-cosmetics/task.json +25 -0
- clawbench/data/test-cases/794-beauty-personal-care-salon-booking-styleseat/task.json +25 -0
- clawbench/data/test-cases/795-pet-animal-care-pet-adoption-aspca/task.json +25 -0
- clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/extra_info/pet_info.json +12 -0
- clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/task.json +30 -0
- clawbench/data/test-cases/799-pet-animal-care-pet-insurance-aspca-pet-health-insurance/task.json +25 -0
- clawbench/data/test-cases/801-pet-animal-care-pet-friendly-travel-bringfido/task.json +25 -0
- clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/extra_info/pet_info.json +12 -0
- clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/task.json +30 -0
- clawbench/data/test-cases/807-pet-animal-care-pet-dna-embark/task.json +25 -0
- clawbench/data/test-cases/809-pet-animal-care-pet-adopt-petfinder/task.json +28 -0
- clawbench/data/test-cases/812-pet-animal-care-pet-subscription-ollie/task.json +25 -0
- clawbench/data/test-cases/815-personal-management-records-mgmt-myheritage/task.json +25 -0
- clawbench/data/test-cases/821-education-learning-reading-self-study-blinkist/task.json +25 -0
- clawbench/data/test-cases/861-entertainment-hobbies-movies-cineplex/task.json +25 -0
- clawbench/data/test-cases/862-entertainment-hobbies-movies-amc-theatres/task.json +25 -0
- clawbench/data/test-cases/864-entertainment-hobbies-show-tickets-ticketmaster/task.json +25 -0
- clawbench/data/test-cases/865-travel-outdoor-hipcamp/task.json +25 -0
- clawbench/data/test-cases/867-entertainment-hobbies-movies-fandango/task.json +25 -0
- clawbench/data/test-cases/872-daily-life-food-opentable/task.json +25 -0
- clawbench/data/test-cases/873-daily-life-food-resy/task.json +28 -0
- clawbench/data/test-cases/876-entertainment-hobbies-show-tickets-vivid-seats/task.json +25 -0
- clawbench/data/test-cases/877-entertainment-hobbies-show-tickets-stubhub/task.json +25 -0
- clawbench/data/test-cases/878-travel-outdoor-ontario-parks/task.json +25 -0
- clawbench/data/test-cases/883-education-learning-hobby-class-sur-la-table/task.json +25 -0
- clawbench/data/test-cases/884-entertainment-hobbies-experience-breakout-games/task.json +25 -0
- clawbench/data/test-cases/885-entertainment-hobbies-experience-bowlero/task.json +25 -0
- clawbench/data/test-cases/886-entertainment-hobbies-experience-topgolf/task.json +25 -0
- clawbench/data/test-cases/lite.json +226 -0
- clawbench/data/test-cases/lite.schema.json +105 -0
- clawbench/data/test-cases/task.schema.json +132 -0
- clawbench/data/tools/build_clawbench_lite_enc.py +161 -0
- clawbench/doctor.py +171 -0
- clawbench/engine.py +180 -0
- clawbench/generate_resume_pdf.py +140 -0
- clawbench/hf_upload.py +78 -0
- clawbench/image.py +127 -0
- clawbench/paths.py +150 -0
- clawbench/resume_template.json +104 -0
- clawbench/run.py +942 -0
- clawbench/tui.py +1401 -0
- clawbench_cli-0.1.2.dist-info/METADATA +770 -0
- clawbench_cli-0.1.2.dist-info/RECORD +226 -0
- clawbench_cli-0.1.2.dist-info/WHEEL +4 -0
- clawbench_cli-0.1.2.dist-info/entry_points.txt +4 -0
- clawbench_cli-0.1.2.dist-info/licenses/LICENSE +201 -0
clawbench/cli.py
ADDED
|
@@ -0,0 +1,397 @@
|
|
|
1
|
+
"""``claw-bench`` command-line entry point (click-based).
|
|
2
|
+
|
|
3
|
+
Design notes:
|
|
4
|
+
|
|
5
|
+
- Bare ``claw-bench`` launches the TUI. This preserves muscle memory from
|
|
6
|
+
the old ``./run.sh`` and keeps the zero-friction experience for users
|
|
7
|
+
who just typed ``pip install claw-bench`` and hit enter.
|
|
8
|
+
- Every power-user action has an explicit subcommand so scripts don't
|
|
9
|
+
need to navigate a menu (``run``, ``batch``, ``build``, ``cases``,
|
|
10
|
+
``models``, ``configure``, ``doctor``, ``version``).
|
|
11
|
+
- Subcommands are thin wrappers that delegate to the module-level
|
|
12
|
+
``main()`` functions in :mod:`clawbench.run` / :mod:`clawbench.batch`.
|
|
13
|
+
Those modules still accept argparse argv so they can be invoked
|
|
14
|
+
in-process *and* via ``python -m clawbench run ...`` from the batch
|
|
15
|
+
runner's subprocess fan-out — one code path, two callers.
|
|
16
|
+
|
|
17
|
+
Subcommand surface intentionally kept small. Every flag the TUI exposes
|
|
18
|
+
is reachable from the CLI, but we don't duplicate every internal toggle
|
|
19
|
+
that ``run.py``/``batch.py`` support as argparse args — click just
|
|
20
|
+
forwards through to them via ``extra_args``.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import os
|
|
26
|
+
import stat
|
|
27
|
+
import subprocess
|
|
28
|
+
import sys
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
|
|
31
|
+
import click
|
|
32
|
+
|
|
33
|
+
from clawbench import __version__
|
|
34
|
+
from clawbench import doctor as _doctor
|
|
35
|
+
from clawbench import paths as _paths
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
# Small helpers
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
|
|
42
|
+
def _echo_result(r: _doctor.CheckResult) -> None:
    """Print one doctor check as a colored status line, followed by its hint.

    The status tag and its color are looked up from ``r.status``; a status
    other than "ok" / "warn" / "fail" falls back to a white "[?]" tag. The
    hint (if any) is printed dimmed, one output line per hint line, and only
    for results whose status is not "ok".
    """
    # status -> (tag, color). One table keeps tag and color in lock-step.
    tag, fg = {
        "ok": ("[OK] ", "green"),
        "warn": ("[WARN]", "yellow"),
        "fail": ("[FAIL]", "red"),
    }.get(r.status, ("[?]", "white"))
    click.echo(f" {click.style(tag, fg=fg)} {r.name}: {r.detail}")
    # Nothing more to say for hint-less or passing checks.
    if not r.hint or r.status == "ok":
        return
    for hint_line in r.hint.splitlines():
        click.echo(f" {click.style(hint_line, dim=True)}")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
# Root group
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
|
|
56
|
+
@click.group(
    invoke_without_command=True,
    context_settings={"help_option_names": ["-h", "--help"]},
)
@click.version_option(__version__, "-V", "--version", prog_name="claw-bench")
@click.pass_context
def main(ctx: click.Context) -> None:
    """ClawBench — benchmark AI agents on 153 everyday web tasks.

    Run without a subcommand to launch the interactive TUI. Use
    ``claw-bench run``, ``batch``, ``build``, ``doctor``, etc. for
    scripting.
    """
    # NOTE: the docstring above is rendered by click as the ``--help`` text,
    # so it is user-facing output rather than internal documentation.
    if ctx.invoked_subcommand is not None:
        # A subcommand was named; click dispatches to it after we return.
        return
    # Bare invocation: fall through to the interactive TUI. The import is
    # deferred so subcommand runs never load the TUI module.
    from clawbench import tui as _tui
    _tui.main()
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
# tui
|
|
77
|
+
# ---------------------------------------------------------------------------
|
|
78
|
+
|
|
79
|
+
@main.command("tui")
def tui_cmd() -> None:
    """Launch the interactive TUI (default action if no subcommand given)."""
    # Explicit spelling of what the bare group invocation does. The TUI
    # module is imported lazily, only when this command actually runs.
    from clawbench import tui as _tui
    _tui.main()
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# ---------------------------------------------------------------------------
|
|
87
|
+
# run
|
|
88
|
+
# ---------------------------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
@main.command(
    "run",
    context_settings={
        "ignore_unknown_options": True,
        "allow_extra_args": True,
        "help_option_names": ["-h", "--help"],
    },
)
@click.argument("test_case_dir", type=click.Path(path_type=Path))
@click.argument("model", required=False)
@click.option("--human", is_flag=True, help="Human mode: expose Chrome via noVNC instead of running an agent.")
@click.option("--output-dir", type=click.Path(path_type=Path), default=None,
              help="Directory to write output to (default: ./claw-output).")
@click.option("--no-build", is_flag=True, help="Skip building the container image.")
@click.option("--no-upload", is_flag=True, help="Skip HuggingFace upload even if configured.")
@click.pass_context
def run_cmd(
    ctx: click.Context,
    test_case_dir: Path,
    model: str | None,
    human: bool,
    output_dir: Path | None,
    no_build: bool,
    no_upload: bool,
) -> None:
    """Run a single test case against a model (or in --human mode)."""
    from clawbench import run as _run

    # Case lookup: a path that exists on disk (absolute, or relative such as
    # ``test-cases/<name>``) is used as given. Otherwise only its final
    # component is looked up among the bundled test cases, which is how a
    # bare name like ``006-daily-life-food-uber-eats`` gets resolved. If that
    # lookup misses too, the original argument is forwarded untouched.
    case_path = test_case_dir
    if not case_path.exists():
        candidate = _paths.test_cases_dir() / test_case_dir.name
        if candidate.exists():
            case_path = candidate

    # Rebuild the argv that ``clawbench.run.main`` parses: positionals
    # first, then each switch that was set, then any pass-through args.
    forwarded: list[str] = [str(case_path)]
    if model:
        forwarded.append(model)
    switches = (
        (human, ["--human"]),
        (output_dir, ["--output-dir", str(output_dir)]),
        (no_build, ["--no-build"]),
        (no_upload, ["--no-upload"]),
    )
    for enabled, tokens in switches:
        if enabled:
            forwarded.extend(tokens)
    # Options click did not recognise are handed through verbatim.
    forwarded.extend(ctx.args)
    _run.main(forwarded)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# ---------------------------------------------------------------------------
|
|
144
|
+
# batch
|
|
145
|
+
# ---------------------------------------------------------------------------
|
|
146
|
+
|
|
147
|
+
@main.command(
    "batch",
    context_settings={
        "ignore_unknown_options": True,
        "allow_extra_args": True,
        "help_option_names": ["-h", "--help"],
    },
)
@click.option("--models", "models_", multiple=True, help="Glob(s) matching model keys in models.yaml.")
@click.option("--all-models", is_flag=True, help="Run every model in models.yaml.")
@click.option("--cases", "cases_", multiple=True, help="Glob(s) matching test-case dirs.")
@click.option("--all-cases", is_flag=True, help="Run every bundled test case.")
@click.option("--case-range", default=None, help="Numeric ID range, e.g. 1-50.")
@click.option("--max-concurrent", type=int, default=2, help="Max parallel jobs (default: 2).")
@click.option("--output-dir", type=click.Path(path_type=Path), default=None,
              help="Base output directory (default: ./claw-output).")
@click.option("--stagger-delay", type=float, default=15,
              help="Min seconds between container starts (default: 15).")
@click.option("--dry-run", is_flag=True, help="Print job matrix without running.")
@click.option("--no-upload", is_flag=True, help="Skip HuggingFace upload for all runs.")
@click.pass_context
def batch_cmd(
    ctx: click.Context,
    models_: tuple[str, ...],
    all_models: bool,
    cases_: tuple[str, ...],
    all_cases: bool,
    case_range: str | None,
    max_concurrent: int,
    output_dir: Path | None,
    stagger_delay: float,
    dry_run: bool,
    no_upload: bool,
) -> None:
    """Run a model x case cross-product concurrently."""
    from clawbench import batch as _batch

    # (include?, tokens) pairs in the exact order they are forwarded.
    # ``--max-concurrent`` and ``--stagger-delay`` are always sent, so the
    # click-side defaults (2 and 15) are what the batch module receives.
    plan = (
        (models_, ["--models", *models_]),
        (all_models, ["--all-models"]),
        (cases_, ["--cases", *cases_]),
        (all_cases, ["--all-cases"]),
        (case_range, ["--case-range", case_range]),
        (True, ["--max-concurrent", str(max_concurrent)]),
        (output_dir, ["--output-dir", str(output_dir)]),
        (True, ["--stagger-delay", str(stagger_delay)]),
        (dry_run, ["--dry-run"]),
        (no_upload, ["--no-upload"]),
    )
    forwarded: list[str] = [tok for wanted, toks in plan if wanted for tok in toks]
    # Anything click did not recognise is passed through unchanged.
    forwarded.extend(ctx.args)
    _batch.main(forwarded)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
# ---------------------------------------------------------------------------
|
|
207
|
+
# build
|
|
208
|
+
# ---------------------------------------------------------------------------
|
|
209
|
+
|
|
210
|
+
@main.command("build")
@click.option("--no-cache", is_flag=True, help="Ignore layer cache — full rebuild.")
def build_cmd(no_cache: bool) -> None:
    """Build the clawbench container image from the bundled Dockerfile."""
    from clawbench import run as _run

    if no_cache:
        # An explicit cold build is approximated by force-removing the
        # existing ``clawbench`` image before rebuilding. The removal is
        # best-effort: its output is captured and its exit status ignored,
        # and it is skipped entirely when no container engine is detected.
        # NOTE(review): whether removing the image also discards the
        # engine's layer/build cache depends on the engine — confirm.
        from clawbench.engine import detect_engine

        engine = detect_engine()
        if engine:
            remove_image = [engine, "image", "rm", "-f", "clawbench"]
            subprocess.run(remove_image, capture_output=True)
    _run.docker_build()
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
# ---------------------------------------------------------------------------
|
|
228
|
+
# cases
|
|
229
|
+
# ---------------------------------------------------------------------------
|
|
230
|
+
|
|
231
|
+
@main.command("cases")
@click.option("--category", default=None, help="Filter by category (substring match).")
def cases_cmd(category: str | None) -> None:
    """List bundled test cases (name, category, time-limit)."""
    # The docstring above is the ``--help`` text click shows for the command.
    #
    # Behaviour:
    #   * Every ``<case>/task.json`` under the bundled test-cases directory
    #     is listed, sorted by directory path.
    #   * ``--category`` keeps only cases whose category contains the given
    #     text, case-insensitively.
    #   * A task.json that cannot be read or parsed, or whose top-level value
    #     is not a JSON object, is reported as ``[unreadable: ...]`` and does
    #     not count towards the total.
    #   * Exits with status 1 when no test cases are found at all.
    import json as _json

    base = _paths.test_cases_dir()
    dirs = sorted(p.parent for p in base.glob("*/task.json"))
    if not dirs:
        click.echo("No test cases found.")
        sys.exit(1)
    # One outlier case has a 180+ char name; cap padding at 60 so the
    # common case doesn't get a wall of whitespace.
    width = min(60, max(len(d.name) for d in dirs))
    # Lowercase the filter once instead of on every iteration.
    needle = category.lower() if category else None
    shown = 0
    for d in dirs:
        try:
            # JSON is UTF-8 by specification; do not depend on the locale's
            # default encoding (which is not UTF-8 on every platform).
            task = _json.loads((d / "task.json").read_text(encoding="utf-8"))
            if not isinstance(task, dict):
                raise ValueError("top-level JSON value is not an object")
        except Exception as e:
            click.echo(f" {d.name:<{width}} [unreadable: {e}]")
            continue
        raw_cat = task.get("category", "?")
        # A null or non-string category must not crash ``.lower()`` or the
        # ``:<20`` format spec below.
        cat = "?" if raw_cat is None else str(raw_cat)
        if needle and needle not in cat.lower():
            continue
        time_limit = task.get("time_limit", "?")
        click.echo(f" {d.name:<{width}} {cat:<20} {time_limit} min")
        shown += 1
    click.echo(f"\n{shown} case(s)")
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
# ---------------------------------------------------------------------------
|
|
261
|
+
# models
|
|
262
|
+
# ---------------------------------------------------------------------------
|
|
263
|
+
|
|
264
|
+
@main.command("models")
def models_cmd() -> None:
    """List configured models from the user's models.yaml."""
    # The docstring above is the ``--help`` text click shows for the command.
    #
    # Behaviour:
    #   * Prints each top-level key of models.yaml with its ``api_type``
    #     (``?`` when the entry is not a mapping or has no api_type).
    #   * Exits with status 1 and an error on stderr when the file cannot be
    #     read or parsed, or when its top-level value is not a mapping.
    #   * An empty file prints a hint and returns normally.
    import yaml as _yaml

    path = _paths.user_models_yaml()
    try:
        # Read as UTF-8 explicitly rather than the locale default encoding.
        data = _yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    except Exception as e:
        click.echo(f"ERROR: cannot read {path}: {e}", err=True)
        sys.exit(1)
    if not isinstance(data, dict):
        # A list or scalar at the top level would otherwise crash below with
        # a bare traceback at ``data[name]``.
        click.echo(
            f"ERROR: {path} must contain a mapping of model names, "
            f"got {type(data).__name__}",
            err=True,
        )
        sys.exit(1)
    if not data:
        click.echo(f"No models configured. Edit {path} or run `claw-bench configure`.")
        return
    click.echo(f"Models configured in {path}:")
    # ``key=str`` keeps the order for ordinary string keys while tolerating
    # YAML keys of mixed types (e.g. a bare number), which plain ``sorted``
    # cannot compare.
    for name in sorted(data, key=str):
        entry = data[name]
        api = (entry.get("api_type") or "?") if isinstance(entry, dict) else "?"
        click.echo(f" {name} ({api})")
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
# ---------------------------------------------------------------------------
|
|
284
|
+
# configure
|
|
285
|
+
# ---------------------------------------------------------------------------
|
|
286
|
+
|
|
287
|
+
@main.command("configure")
@click.option("--show", is_flag=True, help="Print the config file path and exit.")
@click.option("--secrets", is_flag=True, help="Write a secrets.env file (chmod 600) interactively.")
def configure_cmd(show: bool, secrets: bool) -> None:
    """Open the user's models.yaml in $EDITOR, or manage secrets."""
    # --show    : print the three config paths and return.
    # --secrets : run the interactive secrets prompt and return.
    # neither   : launch an editor on models.yaml.
    # Passing both flags is rejected with exit status 1.
    import shlex
    import shutil

    if show and secrets:
        click.echo("ERROR: pass --show OR --secrets, not both", err=True)
        sys.exit(1)
    if show:
        click.echo(f"models.yaml: {_paths.user_models_yaml()}")
        click.echo(f"config.json: {_paths.user_config_json()}")
        click.echo(f"secrets.env: {_paths.user_secrets_path()}")
        return
    if secrets:
        _write_secrets_interactive()
        return
    # Default: $EDITOR on models.yaml (seeds it first if missing).
    path = _paths.user_models_yaml()
    editor = os.environ.get("EDITOR") or os.environ.get("VISUAL") or "vi"
    # $EDITOR commonly carries arguments ("code --wait", "emacs -nw"). Passing
    # the whole string as the executable name always fails, so split it into
    # argv. A value that already resolves to an executable as-is (for example
    # a path containing spaces) is left untouched, as is anything on Windows,
    # where POSIX shell splitting would mangle backslashes.
    if os.name == "nt" or shutil.which(editor):
        argv = [editor]
    else:
        try:
            argv = shlex.split(editor) or [editor]
        except ValueError:
            # Unbalanced quotes: fall back to the raw value.
            argv = [editor]
    click.echo(f"Opening {path} with {editor}...")
    try:
        # check=False: the editor's own exit status is deliberately ignored.
        subprocess.run([*argv, str(path)], check=False)
    except FileNotFoundError:
        click.echo(f"ERROR: editor '{editor}' not found. Set $EDITOR.", err=True)
        sys.exit(1)
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def _write_secrets_interactive() -> None:
    """Prompt for PurelyMail + optional HF keys and persist to secrets.env.

    Existing ``KEY=value`` lines in the target file are loaded first (blank
    lines, ``#`` comments and lines without ``=`` are ignored; surrounding
    quotes are stripped). Each known key is then prompted for; a blank answer
    keeps whatever was there, so we never overwrite a previously-persisted
    key with "". Keys present in the file but not prompted for are preserved.

    The file is created with owner-only permissions (0600) *before* any
    secret is written to it, instead of being written with default
    permissions and tightened afterwards. The parent directory is expected to
    exist already (mkdir ``exist_ok=True`` via :func:`_paths.user_config_dir`).
    """
    target = _paths.user_secrets_path()
    existing: dict[str, str] = {}
    if target.exists():
        # Read with the same encoding the file is written with below.
        for line in target.read_text(encoding="utf-8").splitlines():
            line = line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            k, v = line.split("=", 1)
            existing[k.strip()] = v.strip().strip('"').strip("'")
    click.echo(f"Writing to {target}")
    click.echo("Leave blank to keep the current value (or skip if unset).\n")

    keys = [
        ("PURELY_MAIL_API_KEY", "PurelyMail API key"),
        ("PURELY_MAIL_DOMAIN", "PurelyMail domain"),
        ("HF_TOKEN", "HuggingFace token (optional)"),
        ("HF_REPO_ID", "HuggingFace dataset repo id (optional)"),
    ]
    updated: dict[str, str] = dict(existing)
    for key, label in keys:
        cur = existing.get(key, "")
        # Only a redacted form of the current value is ever echoed back.
        hint = f" [current: {_redact(cur)}]" if cur else ""
        val = click.prompt(f" {label}{hint}", default="", show_default=False).strip()
        if val:
            updated[key] = val

    lines = ["# claw-bench secrets — chmod 600", ""]
    lines += [f'{k}="{v}"' for k, v in updated.items()]
    payload = "\n".join(lines) + "\n"

    owner_rw = stat.S_IRUSR | stat.S_IWUSR
    # O_CREAT with mode 0600 means a freshly created file is never readable
    # by anyone else, even momentarily.
    fd = os.open(target, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, owner_rw)
    with os.fdopen(fd, "w", encoding="utf-8") as fh:
        try:
            # A pre-existing file keeps its old mode under O_CREAT, so tighten
            # it before the secrets are written rather than after.
            target.chmod(owner_rw)
        except OSError:
            pass  # windows / non-posix — best effort
        fh.write(payload)
    click.echo(f"\nWrote {len(updated)} key(s) to {target}")
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def _redact(v: str) -> str:
|
|
358
|
+
if len(v) <= 4:
|
|
359
|
+
return "****"
|
|
360
|
+
return v[:2] + "****" + v[-2:]
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
# ---------------------------------------------------------------------------
|
|
364
|
+
# doctor
|
|
365
|
+
# ---------------------------------------------------------------------------
|
|
366
|
+
|
|
367
|
+
@main.command("doctor")
def doctor_cmd() -> None:
    """Validate engine, image, test-cases, output perms, and secrets."""
    # Runs every diagnostic, echoes each result, then prints a one-line
    # summary. Exits with status 1 if any result has status "fail".
    click.echo("claw-bench diagnostics\n")
    results = _doctor.run_all()
    for result in results:
        _echo_result(result)
    click.echo()

    n_fail = sum(1 for result in results if result.status == "fail")
    n_warn = sum(1 for result in results if result.status == "warn")

    if n_fail:
        click.echo(click.style(f"{n_fail} failing check(s). Fix and re-run.", fg="red"))
        sys.exit(1)
    if not n_warn:
        click.echo(click.style("All checks passed.", fg="green"))
        return
    # Warnings alone are reported but do not change the exit status.
    click.echo(click.style(f"{n_warn} warning(s). ClawBench should still work.", fg="yellow"))
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
# ---------------------------------------------------------------------------
|
|
387
|
+
# version (explicit subcommand in addition to --version)
|
|
388
|
+
# ---------------------------------------------------------------------------
|
|
389
|
+
|
|
390
|
+
@main.command("version")
def version_cmd() -> None:
    """Print the installed version."""
    # Echoes the module-level __version__ string on its own line.
    installed = __version__
    click.echo(installed)


# Invoke the click group when this file is executed directly as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# ClawBench Chrome Extension
|
|
2
|
+
|
|
3
|
+
This is the source code for the ClawBench Chrome Extension, which acts as the client for the ClawBench benchmarking framework.
|
|
4
|
+
|
|
5
|
+
The extension is responsible for the following tasks:
|
|
6
|
+
|
|
7
|
+
- Collect every action performed by the user/agent in the browser and send the data to the ClawBench server.
|
|
8
|
+
- Take screenshots of the browser on every action, with high-frequency events throttled.
|
|
9
|
+
- Send full screen recording chunks to the server, which can later be stitched together into an .mp4 file.
|
|
10
|
+
|
|
11
|
+
The extension should auto start when any non-built-in page is loaded, and should stop when the browser is closed. No UI or configuration is needed for the extension, as all configuration is done on the server side.
|
|
12
|
+
|
|
13
|
+
A `setup.sh` script is provided to load the extension into Chrome. Linux and macOS are supported.
|
|
14
|
+
|
|
15
|
+
## Files
|
|
16
|
+
|
|
17
|
+
| File | Description |
|
|
18
|
+
|------|-------------|
|
|
19
|
+
| `manifest.json` | Manifest V3 extension definition. Permissions: `activeTab`, `tabs`. Content scripts injected on all URLs. |
|
|
20
|
+
| `stealth.js` | Anti-bot-detection patches. Runs at `document_start` in `MAIN` world. Overrides `navigator.webdriver`, plugins, WebGL, permissions, etc. |
|
|
21
|
+
| `content.js` | Injected into every non-chrome:// page. Listens for DOM events, extracts metadata, sends to background. Runs at `document_idle` in `ISOLATED` world. |
|
|
22
|
+
| `background.js` | Service worker. Relays actions to server via HTTP POST. Captures screenshots with `chrome.tabs.captureVisibleTab`. |
|
|
23
|
+
| `setup.sh` | Detects Chrome/Chromium binary on macOS or Linux and launches with `--load-extension` and remote debugging enabled. |
|
|
24
|
+
|
|
25
|
+
## Event Capture
|
|
26
|
+
|
|
27
|
+
### Captured Events
|
|
28
|
+
|
|
29
|
+
`click`, `keydown`, `keyup`, `input`, `scroll`, `change`, `submit`, plus a synthetic `pageLoad` on each navigation.
|
|
30
|
+
|
|
31
|
+
### Throttling
|
|
32
|
+
|
|
33
|
+
High-frequency events (`scroll`, `input`) are throttled to one every 500ms. Screenshots are also throttled to one every 500ms.
|
|
34
|
+
|
|
35
|
+
### Action Payload
|
|
36
|
+
|
|
37
|
+
Each action sent to the server contains:
|
|
38
|
+
|
|
39
|
+
```json
|
|
40
|
+
{
|
|
41
|
+
"type": "click",
|
|
42
|
+
"timestamp": 1710000001234,
|
|
43
|
+
"url": "https://example.com/",
|
|
44
|
+
"target": {
|
|
45
|
+
"tagName": "BUTTON",
|
|
46
|
+
"id": "submit-btn",
|
|
47
|
+
"className": "btn primary",
|
|
48
|
+
"textContent": "Submit",
|
|
49
|
+
"xpath": "/html[1]/body[1]/form[1]/button[1]"
|
|
50
|
+
},
|
|
51
|
+
"x": 255,
|
|
52
|
+
"y": 245
|
|
53
|
+
}
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Additional fields by event type:
|
|
57
|
+
- **click**: `x`, `y` (coordinates)
|
|
58
|
+
- **keydown/keyup**: `key` (key name)
|
|
59
|
+
- **input/change**: `value` (truncated to 200 chars)
|
|
60
|
+
- **scroll**: `scrollX`, `scrollY`
|
|
61
|
+
- **pageLoad**: `title`
|
|
62
|
+
|
|
63
|
+
## Anti-Bot-Detection (Stealth)
|
|
64
|
+
|
|
65
|
+
The extension includes `stealth.js`, a content script injected at `document_start` in the `MAIN` world — meaning it runs before any page JavaScript and patches the page's actual `window`/`navigator` objects (not the extension's isolated world). This reduces the chance of being blocked by reCAPTCHA, Cloudflare Turnstile, and similar bot-detection systems.
|
|
66
|
+
|
|
67
|
+
The stealth measures are split across four layers:
|
|
68
|
+
|
|
69
|
+
### Layer 1: Chrome Launch Flags (`entrypoint.sh`)
|
|
70
|
+
|
|
71
|
+
| Flag | What it does |
|
|
72
|
+
|------|-------------|
|
|
73
|
+
| Removed `--enable-automation` | Was explicitly telling Chrome to set `navigator.webdriver = true` and show the "controlled by automated software" infobar. Removing it eliminates both signals. |
|
|
74
|
+
| Removed `--disable-gpu` | Was disabling all GPU/WebGL rendering. Sites that fingerprint WebGL would see no renderer — a strong headless signal. |
|
|
75
|
+
| `--disable-blink-features=AutomationControlled` | Tells Blink not to set `navigator.webdriver = true`, even if CDP is attached. Belt-and-suspenders with the flag removal. |
|
|
76
|
+
| `--use-gl=angle --use-angle=swiftshader` | Enables software-rendered WebGL via SwiftShader through the ANGLE backend. This makes WebGL available with realistic renderer strings without a real GPU. Trade-off: higher CPU usage since all GL operations run in software. |
|
|
77
|
+
| `--enable-webgl` | Explicitly ensures WebGL contexts can be created. |
|
|
78
|
+
| `--remote-debugging-address=127.0.0.1` | CDP was previously bound to `0.0.0.0` (all interfaces). Now only accessible internally. External access still works through the `socat` forwarder on port 9223. Prevents page JavaScript from detecting CDP by probing network ports. |
|
|
79
|
+
|
|
80
|
+
### Layer 2: Chrome Profile (`entrypoint.sh`)
|
|
81
|
+
|
|
82
|
+
An empty Chrome profile with no bookmarks, no history, and no preferences is a strong signal of a freshly-created automated browser. The entrypoint now pre-populates:
|
|
83
|
+
|
|
84
|
+
- **Preferences**: `accept_languages`, `safebrowsing`, `dns_prefetching`, `window_placement`, `skip_first_run_ui`, etc.
|
|
85
|
+
- **Bookmarks**: Three common entries (Google, YouTube, Wikipedia).
|
|
86
|
+
- **Local State**: Profile metadata with a named profile ("Person 1").
|
|
87
|
+
|
|
88
|
+
### Layer 3: JavaScript Patches (`stealth.js`)
|
|
89
|
+
|
|
90
|
+
| # | Patch | Why |
|
|
91
|
+
|---|-------|-----|
|
|
92
|
+
| 1 | `navigator.webdriver → false` | The #1 bot detection signal. Real Chrome returns `false`; automated Chrome returns `true`. Even with the Blink flag, CDP attachment can re-enable it. |
|
|
93
|
+
| 2 | `navigator.languages → ['en-US', 'en']` | Ensures consistent locale regardless of container environment. |
|
|
94
|
+
| 3 | `navigator.plugins` — fake Chrome PDF Plugin, Chrome PDF Viewer, Native Client | Headless/automated Chrome reports an empty `PluginArray` (length 0). Real Chrome always has PDF and NaCl plugins. |
|
|
95
|
+
| 4 | `navigator.mimeTypes` — fake `application/pdf` entries | Must match the fake plugins. Empty mimeTypes = headless signal. |
|
|
96
|
+
| 5 | WebGL `getParameter()` — return SwiftShader vendor/renderer | Even with SwiftShader actually running, this ensures consistent, known-good strings across Chromium versions. Intercepts `UNMASKED_VENDOR_WEBGL` (0x9245) and `UNMASKED_RENDERER_WEBGL` (0x9246). |
|
|
97
|
+
| 6 | `Permissions.query({name:'notifications'})` → `'prompt'`, `Notification.permission` → `'default'` | Automated browsers deny all permissions by default. Real browsers return `'prompt'`/`'default'` for notifications. |
|
|
98
|
+
| 7 | `window.chrome.runtime` — ensure object exists | Some bot detectors check `if (!window.chrome \|\| !window.chrome.runtime)` to distinguish headless Chrome from real Chrome. |
|
|
99
|
+
| 8 | Remove `$cdc_`/`cdc_` properties on `document` | Chromedriver injects these properties. Not used by CDP directly, but removed as a precaution. |
|
|
100
|
+
| 9 | `navigator.hardwareConcurrency` → 8 (if < 4) | Docker containers with limited CPUs may report 1-2, which is suspicious for a desktop browser. |
|
|
101
|
+
| 10 | `navigator.deviceMemory` → 8 (if < 4) | Same — low memory is suspicious for desktop. |
|
|
102
|
+
| 11 | Iframe `navigator.webdriver` patching | Advanced fingerprinters create iframes and check `navigator.webdriver` inside them to bypass page-level overrides. We hook `document.createElement('iframe')` and patch the iframe's navigator on load. |
|
|
103
|
+
|
|
104
|
+
### Layer 4: Dockerfile
|
|
105
|
+
|
|
106
|
+
`libegl1` and `libgbm1` are installed to provide the EGL and GBM libraries that Chrome's ANGLE/SwiftShader backend needs. Without them, `--use-gl=angle` silently falls back to no-GPU mode.
|
|
107
|
+
|
|
108
|
+
### Test Results
|
|
109
|
+
|
|
110
|
+
Verified against bot-detection sites (2026-03-28):
|
|
111
|
+
|
|
112
|
+
| Test | Result |
|
|
113
|
+
|------|--------|
|
|
114
|
+
| bot.sannysoft.com | 10/11 main tests pass (only "WebDriver New" orange — CDP attachment quirk; "WebDriver Advanced" passes) |
|
|
115
|
+
| intoli headless detection | "You are not Chrome headless" |
|
|
116
|
+
| Cloudflare (nowsecure.nl) | Soft Turnstile challenge (not hard-blocked) |
|
|
117
|
+
| CreepJS | Fingerprint generated without bot flag |
|
|
118
|
+
|
|
119
|
+
## Local Development
|
|
120
|
+
|
|
121
|
+
Run Chrome with the extension loaded:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
./setup.sh https://example.com
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
The server must be running on `http://localhost:7878` for the extension to send data.
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
// Base URL that action and screenshot payloads are POSTed to.
const SERVER = "http://localhost:7878";
|
|
2
|
+
// Minimum gap, in milliseconds, between two screenshot captures.
const SCREENSHOT_THROTTLE_MS = 500;
|
|
3
|
+
|
|
|
4
|
+
// Date.now() value recorded by the most recent capture attempt; 0 until the first one.
let lastScreenshot = 0;
|
|
5
|
+
|
|
6
|
+
// Auto-focus newly created tabs so the agent's working tab is always visible
chrome.tabs.onCreated.addListener((tab) => {
  // Skip tabs without a usable id. A plain truthiness check would let
  // TAB_ID_NONE (-1) through, because -1 is truthy.
  if (typeof tab.id !== "number" || tab.id === chrome.tabs.TAB_ID_NONE) return;

  // The tab may already be gone by the time the update runs; log instead of
  // leaving an unhandled promise rejection in the service worker.
  chrome.tabs.update(tab.id, { active: true }).catch((e) => {
    console.error("[clawbench] failed to focus new tab:", e);
  });
});
|
|
12
|
+
|
|
13
|
+
// Receive events from content script
chrome.runtime.onMessage.addListener((msg, sender) => {
  // Ignore anything that is not an action message (including null/undefined).
  if (!msg || msg.type !== "action") return;

  // Bring the tab where the action occurred to front so the screen recording
  // and captureVisibleTab always show the tab the agent is working on.
  const tabId = sender && sender.tab ? sender.tab.id : undefined;
  const focused =
    typeof tabId === "number"
      ? chrome.tabs.update(tabId, { active: true }).catch((e) => {
          console.error("[clawbench] failed to focus sender tab:", e);
        })
      : Promise.resolve();

  // Posting the action does not depend on which tab is visible, so send it
  // right away.
  postAction(msg.data);

  // Capture only after the activation request has settled; capturing
  // immediately can photograph the previously active tab instead.
  void focused.then(() => captureScreenshot());
});
|
|
25
|
+
|
|
26
|
+
/**
 * POST one recorded action to the server's `/api/action` endpoint as JSON.
 *
 * Never throws: network failures and non-2xx responses are both logged to the
 * console so a server problem cannot break event recording.
 *
 * @param {object} data - Action payload; serialized with JSON.stringify.
 * @returns {Promise<void>}
 */
async function postAction(data) {
  try {
    const res = await fetch(`${SERVER}/api/action`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(data),
    });
    // fetch() only rejects on network errors; an HTTP 4xx/5xx still resolves,
    // so the status has to be checked explicitly.
    if (!res.ok) {
      console.error("[clawbench] postAction failed: HTTP", res.status);
    }
  } catch (e) {
    console.error("[clawbench] postAction failed:", e);
  }
}
|
|
35
|
+
|
|
36
|
+
/**
 * Capture the visible tab as a PNG and POST it to `/api/screenshot`.
 *
 * Calls made less than SCREENSHOT_THROTTLE_MS after the previous attempt are
 * dropped. The payload is `{ timestamp, data }`, where `data` is the base64
 * image with the data-URL prefix removed. Never throws: capture errors,
 * network errors and non-2xx responses are logged to the console.
 *
 * @returns {Promise<void>}
 */
async function captureScreenshot() {
  const now = Date.now();
  if (now - lastScreenshot < SCREENSHOT_THROTTLE_MS) return;
  // Recorded before the capture starts so overlapping calls are throttled too.
  lastScreenshot = now;

  try {
    const dataUrl = await chrome.tabs.captureVisibleTab(null, { format: "png" });
    if (typeof dataUrl !== "string") {
      console.error("[clawbench] captureScreenshot failed: no image data returned");
      return;
    }
    const base64 = dataUrl.replace(/^data:image\/png;base64,/, "");
    const res = await fetch(`${SERVER}/api/screenshot`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ timestamp: now, data: base64 }),
    });
    // fetch() resolves on HTTP errors, so a rejected upload must be detected here.
    if (!res.ok) {
      console.error("[clawbench] captureScreenshot failed: HTTP", res.status);
    }
  } catch (e) {
    console.error("[clawbench] captureScreenshot failed:", e);
  }
}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
// Minimum gap, in milliseconds, between two forwarded events of the same throttled type.
const THROTTLE_MS = 500;
|
|
2
|
+
// Event type -> Date.now() of the last forwarded event of that type (throttled types only).
const lastSent = {};
|
|
3
|
+
|
|
4
|
+
/**
 * Build an absolute, index-qualified XPath for an element,
 * e.g. "/html[1]/body[1]/div[2]".
 *
 * Each step is the lower-cased tag name plus the element's 1-based position
 * among preceding siblings with the same tag name. The walk follows
 * `parentElement` until it runs out of element nodes.
 *
 * @param {*} el - Candidate node; anything that is not an element node (nodeType 1) yields "".
 * @returns {string} The XPath, or "" when `el` is missing or not an element.
 */
function getXPath(el) {
  if (!el || el.nodeType !== 1) return "";

  const segments = [];
  for (let node = el; node && node.nodeType === 1; node = node.parentElement) {
    let position = 1;
    let prev = node.previousElementSibling;
    while (prev) {
      if (prev.tagName === node.tagName) position += 1;
      prev = prev.previousElementSibling;
    }
    // Collected leaf-first; reversed once at the end.
    segments.push(`${node.tagName.toLowerCase()}[${position}]`);
  }
  return `/${segments.reverse().join("/")}`;
}
|
|
17
|
+
|
|
18
|
+
/**
 * Build the action payload sent to the background worker for one DOM event.
 *
 * Always includes `type`, `timestamp`, `url` and a `target` summary
 * (tagName, id, className, textContent cut to 100 chars, xpath). Adds `x`/`y`
 * when the event carries `clientX`, `key` when it carries a key, `value`
 * (cut to 200 chars) when the target has one, and `scrollX`/`scrollY` for
 * scroll events.
 *
 * @param {string} type - Event type name.
 * @param {object} e - The DOM event (or an object shaped like one).
 * @returns {object} JSON-serializable payload.
 */
function buildPayload(type, e) {
  const target = e.target || {};

  // Target properties are not guaranteed to be strings: a form control named
  // "id" shadows form.id with an element. Anything that is not a string is
  // reported as "" so the payload stays plain JSON and `.slice` cannot throw.
  const text = (value) => (typeof value === "string" ? value : "");

  // SVG elements expose className as an object with a `baseVal` string
  // instead of a plain string.
  const rawClass = target.className;
  const className =
    rawClass && typeof rawClass === "object" ? text(rawClass.baseVal) : text(rawClass);

  const payload = {
    type,
    timestamp: Date.now(),
    url: location.href,
    target: {
      tagName: text(target.tagName),
      id: text(target.id),
      className,
      textContent: text(target.textContent).slice(0, 100),
      xpath: getXPath(target),
    },
  };
  if (e.clientX !== undefined) {
    payload.x = e.clientX;
    payload.y = e.clientY;
  }
  if (e.key) payload.key = e.key;
  if (target.value !== undefined) payload.value = String(target.value).slice(0, 200);
  if (type === "scroll") {
    payload.scrollX = window.scrollX;
    payload.scrollY = window.scrollY;
  }
  return payload;
}
|
|
44
|
+
|
|
45
|
+
/**
 * Tell whether events of the given type are rate-limited before being sent.
 *
 * @param {string} type - Event type name.
 * @returns {boolean} true for "scroll" and "input", false for everything else.
 */
function throttled(type) {
  switch (type) {
    case "scroll":
    case "input":
      return true;
    default:
      return false;
  }
}
|
|
48
|
+
|
|
49
|
+
/**
 * Forward one DOM event to the background worker as an "action" message.
 *
 * Throttled event types are dropped when the previous event of the same type
 * was forwarded less than THROTTLE_MS ago. Messaging failures are logged
 * instead of escaping into the page's event dispatch.
 *
 * @param {string} type - Event type name.
 * @param {object} e - The DOM event.
 * @returns {void}
 */
function send(type, e) {
  if (throttled(type)) {
    const now = Date.now();
    if (lastSent[type] && now - lastSent[type] < THROTTLE_MS) return;
    lastSent[type] = now;
  }

  const message = { type: "action", data: buildPayload(type, e) };
  const report = (err) => console.error("[clawbench] sendMessage failed:", err);
  try {
    // sendMessage can throw synchronously (e.g. after the extension was
    // reloaded) or reject asynchronously (no receiver); handle both.
    const pending = chrome.runtime.sendMessage(message);
    if (pending && typeof pending.catch === "function") {
      pending.catch(report);
    }
  } catch (err) {
    report(err);
  }
}
|
|
57
|
+
|
|
58
|
+
// Register one capture-phase listener per recorded event type on the document;
// each listener forwards the event to send() tagged with its type name.
for (const evt of ["click", "keydown", "keyup", "input", "scroll", "change", "submit"]) {
  document.addEventListener(evt, (e) => send(evt, e), true);
}
|
|
61
|
+
|
|
62
|
+
// Announce the page itself: a synthetic "pageLoad" action carrying the URL and
// title as they are when this script runs.
try {
  const pending = chrome.runtime.sendMessage({
    type: "action",
    data: {
      type: "pageLoad",
      timestamp: Date.now(),
      url: location.href,
      title: document.title,
    },
  });
  // sendMessage may reject when nothing receives the message; log it rather
  // than leaving an unhandled rejection in the page's console.
  if (pending && typeof pending.catch === "function") {
    pending.catch((err) => console.error("[clawbench] pageLoad message failed:", err));
  }
} catch (err) {
  // Thrown synchronously when the extension context is no longer valid.
  console.error("[clawbench] pageLoad message failed:", err);
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"manifest_version": 3,
|
|
3
|
+
"name": "ClawBench",
|
|
4
|
+
"version": "1.0",
|
|
5
|
+
"description": "Browser action recording for benchmarking",
|
|
6
|
+
"permissions": ["activeTab", "tabs"],
|
|
7
|
+
"host_permissions": ["<all_urls>"],
|
|
8
|
+
"background": {
|
|
9
|
+
"service_worker": "background.js"
|
|
10
|
+
},
|
|
11
|
+
"content_scripts": [
|
|
12
|
+
{
|
|
13
|
+
"matches": ["<all_urls>"],
|
|
14
|
+
"js": ["stealth.js"],
|
|
15
|
+
"run_at": "document_start",
|
|
16
|
+
"world": "MAIN"
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
"matches": ["<all_urls>"],
|
|
20
|
+
"js": ["content.js"],
|
|
21
|
+
"run_at": "document_idle",
|
|
22
|
+
"all_frames": true
|
|
23
|
+
}
|
|
24
|
+
]
|
|
25
|
+
}
|