clawbench-cli 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clawbench/__init__.py +35 -0
- clawbench/__main__.py +8 -0
- clawbench/batch.py +619 -0
- clawbench/cli.py +397 -0
- clawbench/data/chrome-extension/README.md +127 -0
- clawbench/data/chrome-extension/background.js +50 -0
- clawbench/data/chrome-extension/content.js +70 -0
- clawbench/data/chrome-extension/manifest.json +25 -0
- clawbench/data/chrome-extension/setup.sh +27 -0
- clawbench/data/chrome-extension/stealth.js +200 -0
- clawbench/data/docker/Dockerfile +51 -0
- clawbench/data/docker/entrypoint.sh +394 -0
- clawbench/data/docker/setup-openclaw.sh +112 -0
- clawbench/data/eval/README.md +95 -0
- clawbench/data/eval/agentic_eval.md +53 -0
- clawbench/data/extension-server/.python-version +1 -0
- clawbench/data/extension-server/README.md +54 -0
- clawbench/data/extension-server/pyproject.toml +7 -0
- clawbench/data/extension-server/server.py +360 -0
- clawbench/data/extension-server/uv.lock +644 -0
- clawbench/data/models/model.schema.json +44 -0
- clawbench/data/models/models.example.yaml +16 -0
- clawbench/data/shared/alex_green_personal_info.json +451 -0
- clawbench/data/test-cases/001-daily-life-food-uber-eats/task.json +25 -0
- clawbench/data/test-cases/002-daily-life-food-doordash/task.json +25 -0
- clawbench/data/test-cases/004-daily-life-food-instacart/extra_info/grocery_list.json +36 -0
- clawbench/data/test-cases/004-daily-life-food-instacart/task.json +30 -0
- clawbench/data/test-cases/006-daily-life-food-uber-eats/task.json +24 -0
- clawbench/data/test-cases/007-daily-life-food-instacart/extra_info/meal_plan.json +21 -0
- clawbench/data/test-cases/007-daily-life-food-instacart/task.json +30 -0
- clawbench/data/test-cases/011-daily-life-housing-zillow/task.json +25 -0
- clawbench/data/test-cases/015-daily-life-housing-craigslist/extra_info/listing_details.json +26 -0
- clawbench/data/test-cases/015-daily-life-housing-craigslist/task.json +30 -0
- clawbench/data/test-cases/035-daily-life-health-medical-betterhelp/task.json +25 -0
- clawbench/data/test-cases/041-daily-life-pets-rover/task.json +25 -0
- clawbench/data/test-cases/043-daily-life-pets-rover/extra_info/pet_info.json +12 -0
- clawbench/data/test-cases/043-daily-life-pets-rover/task.json +30 -0
- clawbench/data/test-cases/045-daily-life-personal-care-booksy/task.json +25 -0
- clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/extra_info/address_info.json +7 -0
- clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/task.json +30 -0
- clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/extra_info/job_links.json +5 -0
- clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/task.json +30 -0
- clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/extra_info/job_links.json +5 -0
- clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/task.json +30 -0
- clawbench/data/test-cases/091-job-search-hr-job-apply-indeed/task.json +25 -0
- clawbench/data/test-cases/120-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
- clawbench/data/test-cases/121-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
- clawbench/data/test-cases/128-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
- clawbench/data/test-cases/134-office-secretary-tasks-calendar-calendly/task.json +25 -0
- clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/extra_info/meeting_details.json +30 -0
- clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/task.json +30 -0
- clawbench/data/test-cases/139-office-secretary-tasks-calendar-calendly/task.json +25 -0
- clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/extra_info/task_list.json +29 -0
- clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/task.json +30 -0
- clawbench/data/test-cases/179-dev-tech-github-ops-github/extra_info/config.json +13 -0
- clawbench/data/test-cases/179-dev-tech-github-ops-github/task.json +30 -0
- clawbench/data/test-cases/180-dev-tech-github-ops-github/task.json +25 -0
- clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/extra_info/raw_results.json +47 -0
- clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/task.json +30 -0
- clawbench/data/test-cases/242-academia-research-research-tools-overleaf/task.json +25 -0
- clawbench/data/test-cases/246-academia-research-research-tools-zotero/task.json +25 -0
- clawbench/data/test-cases/247-academia-research-research-tools-semantic-scholar/task.json +25 -0
- clawbench/data/test-cases/265-education-learning-general-coursera/task.json +25 -0
- clawbench/data/test-cases/266-education-learning-general-leetcode/extra_info/solution_code.py +9 -0
- clawbench/data/test-cases/266-education-learning-general-leetcode/task.json +30 -0
- clawbench/data/test-cases/273-education-learning-general-edx/task.json +25 -0
- clawbench/data/test-cases/274-education-learning-general-udemy/task.json +25 -0
- clawbench/data/test-cases/279-travel-general-airbnb/task.json +25 -0
- clawbench/data/test-cases/280-travel-general-booking-com/task.json +25 -0
- clawbench/data/test-cases/363-entertainment-hobbies-general-ticketmaster/task.json +25 -0
- clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/extra_info/book_list.json +14 -0
- clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/task.json +30 -0
- clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/extra_info/event_details.json +10 -0
- clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/task.json +30 -0
- clawbench/data/test-cases/403-personal-management-account-security-1password-web/extra_info/credentials.json +34 -0
- clawbench/data/test-cases/403-personal-management-account-security-1password-web/task.json +30 -0
- clawbench/data/test-cases/413-personal-management-personal-tools-todoist/extra_info/task_list.json +52 -0
- clawbench/data/test-cases/413-personal-management-personal-tools-todoist/task.json +30 -0
- clawbench/data/test-cases/468-rating-voting-general-glassdoor/extra_info/interview_experience.json +10 -0
- clawbench/data/test-cases/468-rating-voting-general-glassdoor/task.json +30 -0
- clawbench/data/test-cases/469-rating-voting-general-tripadvisor/extra_info/review_content.json +6 -0
- clawbench/data/test-cases/469-rating-voting-general-tripadvisor/task.json +30 -0
- clawbench/data/test-cases/470-rating-voting-general-trustpilot/extra_info/review_content.json +6 -0
- clawbench/data/test-cases/470-rating-voting-general-trustpilot/task.json +30 -0
- clawbench/data/test-cases/474-rating-voting-general-capterra/task.json +25 -0
- clawbench/data/test-cases/475-rating-voting-general-g2/task.json +25 -0
- clawbench/data/test-cases/482-creation-init-general-confluence/extra_info/content.json +3 -0
- clawbench/data/test-cases/482-creation-init-general-confluence/task.json +30 -0
- clawbench/data/test-cases/483-creation-init-general-airtable/task.json +25 -0
- clawbench/data/test-cases/484-creation-init-general-clickup/task.json +28 -0
- clawbench/data/test-cases/485-creation-init-general-webflow/task.json +25 -0
- clawbench/data/test-cases/486-creation-init-general-mailchimp/extra_info/content.json +3 -0
- clawbench/data/test-cases/486-creation-init-general-mailchimp/task.json +30 -0
- clawbench/data/test-cases/487-creation-init-general-typeform/extra_info/survey_questions.json +85 -0
- clawbench/data/test-cases/487-creation-init-general-typeform/task.json +30 -0
- clawbench/data/test-cases/488-creation-init-general-substack/extra_info/content.json +3 -0
- clawbench/data/test-cases/488-creation-init-general-substack/task.json +30 -0
- clawbench/data/test-cases/489-creation-init-general-ghost/extra_info/content.json +3 -0
- clawbench/data/test-cases/489-creation-init-general-ghost/task.json +30 -0
- clawbench/data/test-cases/501-creation-init-general-asana/extra_info/project_description.json +8 -0
- clawbench/data/test-cases/501-creation-init-general-asana/task.json +33 -0
- clawbench/data/test-cases/529-daily-life-shopping-delivery-king-arthur-baking/task.json +25 -0
- clawbench/data/test-cases/533-daily-life-utilities-inmyarea/task.json +25 -0
- clawbench/data/test-cases/535-daily-life-home-home-depot/task.json +25 -0
- clawbench/data/test-cases/537-daily-life-food-crumbl/task.json +25 -0
- clawbench/data/test-cases/539-daily-life-health-jefit/task.json +25 -0
- clawbench/data/test-cases/542-daily-life-pets-wag/task.json +25 -0
- clawbench/data/test-cases/551-finance-investment-crypto-wallet-trezor/task.json +25 -0
- clawbench/data/test-cases/552-finance-investment-business-payment-plooto/task.json +25 -0
- clawbench/data/test-cases/555-finance-investment-insurance-insureon/task.json +25 -0
- clawbench/data/test-cases/559-finance-investment-crowdfunding-frontfundr/task.json +25 -0
- clawbench/data/test-cases/564-daily-life-event-registration-race-roster/task.json +25 -0
- clawbench/data/test-cases/565-job-search-hr-job-search-jopwell/task.json +25 -0
- clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/extra_info/listing_details.json +26 -0
- clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/task.json +30 -0
- clawbench/data/test-cases/569-job-search-hr-job-search-careerbuilder/task.json +25 -0
- clawbench/data/test-cases/570-job-search-hr-job-search-hired/task.json +25 -0
- clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/extra_info/listing_details.json +26 -0
- clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/task.json +30 -0
- clawbench/data/test-cases/576-office-secretary-tasks-reports-ftc-reportfraud/task.json +25 -0
- clawbench/data/test-cases/583-office-secretary-tasks-support-tickets-freshdesk/task.json +25 -0
- clawbench/data/test-cases/598-academia-research-legal-docs-formswift/task.json +25 -0
- clawbench/data/test-cases/606-education-learning-kids-courses-outschool/task.json +25 -0
- clawbench/data/test-cases/607-education-learning-art-courses-creativebug/task.json +25 -0
- clawbench/data/test-cases/609-education-learning-meditation-spirit-rock-meditation-center/task.json +25 -0
- clawbench/data/test-cases/615-travel-flights-spirit-airlines/task.json +25 -0
- clawbench/data/test-cases/618-travel-train-bus-12go-asia/task.json +25 -0
- clawbench/data/test-cases/625-travel-camping-outdoor-parks-canada-reservations/task.json +25 -0
- clawbench/data/test-cases/626-travel-bus-flixbus/task.json +25 -0
- clawbench/data/test-cases/627-travel-flights-momondo/task.json +25 -0
- clawbench/data/test-cases/632-shopping-commerce-beauty-care-olaplex/task.json +25 -0
- clawbench/data/test-cases/634-shopping-commerce-apparel-dooney-bourke/task.json +25 -0
- clawbench/data/test-cases/635-shopping-commerce-gifts-uncommon-goods/task.json +25 -0
- clawbench/data/test-cases/636-shopping-commerce-auto-parts-rockauto/task.json +25 -0
- clawbench/data/test-cases/638-shopping-commerce-print-custom-vistaprint/task.json +25 -0
- clawbench/data/test-cases/639-shopping-commerce-luxury-mansur-gavriel/task.json +25 -0
- clawbench/data/test-cases/671-entertainment-gaming-humble-bundle/task.json +25 -0
- clawbench/data/test-cases/672-entertainment-hobbies-anime-streaming-crunchyroll/task.json +25 -0
- clawbench/data/test-cases/674-entertainment-hobbies-masterclass-masterclass/task.json +25 -0
- clawbench/data/test-cases/676-government-civic-legal-docs-legalnature/task.json +25 -0
- clawbench/data/test-cases/685-personal-management-budget-mgmt-everydollar/task.json +25 -0
- clawbench/data/test-cases/687-personal-management-vpn-subscription-ipvanish/task.json +25 -0
- clawbench/data/test-cases/688-personal-management-insurance-compare-insurify/task.json +25 -0
- clawbench/data/test-cases/695-automation-workflows-recurring-order-stumptown-coffee/task.json +25 -0
- clawbench/data/test-cases/697-automation-workflows-recurring-order-bean-box/task.json +25 -0
- clawbench/data/test-cases/699-automation-workflows-recurring-order-mistobox/task.json +25 -0
- clawbench/data/test-cases/700-deletion-revocation-data-deletion-deleteme/task.json +25 -0
- clawbench/data/test-cases/705-rating-voting-wine-review-vivino/task.json +25 -0
- clawbench/data/test-cases/706-rating-voting-beer-review-beeradvocate/task.json +25 -0
- clawbench/data/test-cases/707-rating-voting-social-wine-untappd/task.json +25 -0
- clawbench/data/test-cases/708-rating-voting-professor-review-ratemyprofessors/task.json +28 -0
- clawbench/data/test-cases/709-rating-voting-service-review-angi/task.json +25 -0
- clawbench/data/test-cases/710-creation-init-interior-design-roomsketcher/task.json +25 -0
- clawbench/data/test-cases/711-creation-init-color-design-coolors/task.json +25 -0
- clawbench/data/test-cases/712-creation-init-website-create-squarespace/task.json +25 -0
- clawbench/data/test-cases/713-creation-init-website-build-wix/task.json +25 -0
- clawbench/data/test-cases/735-home-services-maintenance-house-cleaning-bark/task.json +25 -0
- clawbench/data/test-cases/736-home-services-maintenance-plumbing-ace-hardware/task.json +25 -0
- clawbench/data/test-cases/737-home-services-maintenance-kitchen-remodel-lowes/task.json +25 -0
- clawbench/data/test-cases/738-home-services-maintenance-equipment-install-amazon-home-services/task.json +25 -0
- clawbench/data/test-cases/750-automotive-vehicle-services-car-insurance-compare-kanetix/task.json +25 -0
- clawbench/data/test-cases/751-automotive-vehicle-services-car-lease-sixt/task.json +25 -0
- clawbench/data/test-cases/754-automotive-vehicle-services-used-car-listing-autotrader/task.json +25 -0
- clawbench/data/test-cases/763-automotive-vehicle-services-car-lease-autoslash/task.json +25 -0
- clawbench/data/test-cases/766-nonprofit-charity-donation-doctors-without-borders-msf/task.json +25 -0
- clawbench/data/test-cases/768-nonprofit-charity-community-crowdfund-ioby/task.json +25 -0
- clawbench/data/test-cases/770-nonprofit-charity-volunteer-apply-on-make-a-wish-foundation-website-complete-and-submit-a-volunteer-application-form-selecting-the-wish-granter-role-and-entering-city-phoenix-az/task.json +25 -0
- clawbench/data/test-cases/774-nonprofit-charity-nonprofit-job-apply-charity-village/task.json +25 -0
- clawbench/data/test-cases/776-nonprofit-charity-volunteer-signup-idealist/task.json +25 -0
- clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/extra_info/payment_info.json +3 -0
- clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/task.json +30 -0
- clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/extra_info/address_info.json +4 -0
- clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/task.json +30 -0
- clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/extra_info/email_info.json +3 -0
- clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/task.json +30 -0
- clawbench/data/test-cases/782-beauty-personal-care-skincare-purchase-paulas-choice/task.json +24 -0
- clawbench/data/test-cases/783-beauty-personal-care-beauty-booking-ulta-beauty/task.json +24 -0
- clawbench/data/test-cases/785-beauty-personal-care-skincare-curology/task.json +25 -0
- clawbench/data/test-cases/788-beauty-personal-care-makeup-the-ordinary/task.json +25 -0
- clawbench/data/test-cases/789-beauty-personal-care-makeup-fenty-beauty/task.json +25 -0
- clawbench/data/test-cases/793-beauty-personal-care-beauty-retail-mac-cosmetics/task.json +25 -0
- clawbench/data/test-cases/794-beauty-personal-care-salon-booking-styleseat/task.json +25 -0
- clawbench/data/test-cases/795-pet-animal-care-pet-adoption-aspca/task.json +25 -0
- clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/extra_info/pet_info.json +12 -0
- clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/task.json +30 -0
- clawbench/data/test-cases/799-pet-animal-care-pet-insurance-aspca-pet-health-insurance/task.json +25 -0
- clawbench/data/test-cases/801-pet-animal-care-pet-friendly-travel-bringfido/task.json +25 -0
- clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/extra_info/pet_info.json +12 -0
- clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/task.json +30 -0
- clawbench/data/test-cases/807-pet-animal-care-pet-dna-embark/task.json +25 -0
- clawbench/data/test-cases/809-pet-animal-care-pet-adopt-petfinder/task.json +28 -0
- clawbench/data/test-cases/812-pet-animal-care-pet-subscription-ollie/task.json +25 -0
- clawbench/data/test-cases/815-personal-management-records-mgmt-myheritage/task.json +25 -0
- clawbench/data/test-cases/821-education-learning-reading-self-study-blinkist/task.json +25 -0
- clawbench/data/test-cases/861-entertainment-hobbies-movies-cineplex/task.json +25 -0
- clawbench/data/test-cases/862-entertainment-hobbies-movies-amc-theatres/task.json +25 -0
- clawbench/data/test-cases/864-entertainment-hobbies-show-tickets-ticketmaster/task.json +25 -0
- clawbench/data/test-cases/865-travel-outdoor-hipcamp/task.json +25 -0
- clawbench/data/test-cases/867-entertainment-hobbies-movies-fandango/task.json +25 -0
- clawbench/data/test-cases/872-daily-life-food-opentable/task.json +25 -0
- clawbench/data/test-cases/873-daily-life-food-resy/task.json +28 -0
- clawbench/data/test-cases/876-entertainment-hobbies-show-tickets-vivid-seats/task.json +25 -0
- clawbench/data/test-cases/877-entertainment-hobbies-show-tickets-stubhub/task.json +25 -0
- clawbench/data/test-cases/878-travel-outdoor-ontario-parks/task.json +25 -0
- clawbench/data/test-cases/883-education-learning-hobby-class-sur-la-table/task.json +25 -0
- clawbench/data/test-cases/884-entertainment-hobbies-experience-breakout-games/task.json +25 -0
- clawbench/data/test-cases/885-entertainment-hobbies-experience-bowlero/task.json +25 -0
- clawbench/data/test-cases/886-entertainment-hobbies-experience-topgolf/task.json +25 -0
- clawbench/data/test-cases/lite.json +226 -0
- clawbench/data/test-cases/lite.schema.json +105 -0
- clawbench/data/test-cases/task.schema.json +132 -0
- clawbench/data/tools/build_clawbench_lite_enc.py +161 -0
- clawbench/doctor.py +171 -0
- clawbench/engine.py +180 -0
- clawbench/generate_resume_pdf.py +140 -0
- clawbench/hf_upload.py +78 -0
- clawbench/image.py +127 -0
- clawbench/paths.py +150 -0
- clawbench/resume_template.json +104 -0
- clawbench/run.py +942 -0
- clawbench/tui.py +1401 -0
- clawbench_cli-0.1.2.dist-info/METADATA +770 -0
- clawbench_cli-0.1.2.dist-info/RECORD +226 -0
- clawbench_cli-0.1.2.dist-info/WHEEL +4 -0
- clawbench_cli-0.1.2.dist-info/entry_points.txt +4 -0
- clawbench_cli-0.1.2.dist-info/licenses/LICENSE +201 -0
clawbench/run.py
ADDED
|
@@ -0,0 +1,942 @@
|
|
|
1
|
+
"""ClawBench single test-case driver."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
import secrets
|
|
8
|
+
import shutil
|
|
9
|
+
import signal
|
|
10
|
+
import socket
|
|
11
|
+
import subprocess
|
|
12
|
+
import sys
|
|
13
|
+
import tempfile
|
|
14
|
+
import time
|
|
15
|
+
import uuid
|
|
16
|
+
from datetime import datetime, timezone
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from urllib.error import URLError
|
|
19
|
+
from urllib.request import Request, urlopen
|
|
20
|
+
|
|
21
|
+
import yaml
|
|
22
|
+
from rich.console import Console
|
|
23
|
+
from rich.panel import Panel
|
|
24
|
+
from rich.status import Status
|
|
25
|
+
|
|
26
|
+
from clawbench import engine as _engine
|
|
27
|
+
from clawbench import paths as _paths
|
|
28
|
+
from clawbench.generate_resume_pdf import generate_resume_pdf
|
|
29
|
+
from clawbench.hf_upload import hf_upload_enabled, upload_run
|
|
30
|
+
|
|
31
|
+
IMAGE = "clawbench"
|
|
32
|
+
console = Console()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _detect_engine() -> str:
    """Pick the container engine for this run.

    Mirrors the TUI/engine module priority (podman-first, with the
    ``CONTAINER_ENGINE`` env var winning). Exits with an actionable
    message when the override is malformed, names a missing binary,
    or no engine is installed at all.
    """
    override = os.environ.get("CONTAINER_ENGINE", "").strip().lower()
    if override:
        # Reject anything other than the two supported engines up front.
        if override not in ("docker", "podman"):
            print(f"ERROR: CONTAINER_ENGINE must be 'docker' or 'podman', got '{override}'")
            sys.exit(1)
        # An override naming an uninstalled binary is a config error, not a fallback case.
        if not shutil.which(override):
            print(f"ERROR: CONTAINER_ENGINE={override} but '{override}' not found on PATH")
            sys.exit(1)
    engine = _engine.detect_engine()
    if engine is None:
        print("ERROR: Neither 'podman' nor 'docker' found on PATH")
        print(" Install podman (recommended): brew install podman | sudo apt install podman")
        sys.exit(1)
    return engine
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
ENGINE = _detect_engine()
|
|
55
|
+
PURELYMAIL_API = "https://purelymail.com/api/v0"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def load_dotenv(path: Path) -> dict[str, str]:
    """Parse a minimal ``KEY=VALUE`` dotenv file into a dict.

    Blank lines, ``#`` comments, and lines without ``=`` are skipped.
    Values are whitespace-stripped, then surrounding double and single
    quote characters are removed. A missing file yields an empty dict.
    """
    if not path.exists():
        return {}
    entries: dict[str, str] = {}
    for raw in path.read_text().splitlines():
        stripped = raw.strip()
        if not stripped or stripped.startswith("#") or "=" not in stripped:
            continue
        key, _, value = stripped.partition("=")
        entries[key.strip()] = value.strip().strip('"').strip("'")
    return entries
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _load_runtime_env() -> dict[str, str]:
    """Build the runtime env dict from, in order of precedence:

    1. ``os.environ`` — normal env vars (highest priority).
    2. ``$CWD/.env`` — legacy source-install layout (if present).
    3. ``user_config_dir()/secrets.env`` — persisted secrets from
       ``claw-bench configure --secrets``.

    Earlier sources win; later sources only fill in keys the earlier
    ones left empty or unset. This lets ``PURELY_MAIL_API_KEY=...
    claw-bench run ...`` work without any config file while still
    honoring a persisted key.
    """
    # Ordered highest-precedence first; each supports .get(key).
    sources = (
        os.environ,
        load_dotenv(Path.cwd() / ".env"),
        load_dotenv(_paths.user_secrets_path()),
    )
    resolved: dict[str, str] = {}
    for key in ("PURELY_MAIL_API_KEY", "PURELY_MAIL_DOMAIN", "HF_TOKEN", "HF_REPO_ID"):
        for source in sources:
            value = source.get(key)
            # Empty strings are treated as "unset" and fall through,
            # matching the original `a or b or c or ""` chain.
            if value:
                resolved[key] = value
                break
    return resolved
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
MODELS_YAML = _paths.user_models_yaml()
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def load_models_yaml() -> dict:
    """Load all model definitions from models/models.yaml.

    Exits with guidance when the file has not been created yet; an
    empty or null YAML document yields ``{}`` rather than ``None``.
    """
    if MODELS_YAML.exists():
        parsed = yaml.safe_load(MODELS_YAML.read_text())
        return parsed if parsed else {}
    print(f"ERROR: {MODELS_YAML} not found (copy models.example.yaml and fill in your keys)")
    sys.exit(1)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def load_model_config(model: str) -> dict:
    """Load a model config by name from models/models.yaml.

    The YAML key is the model name (passed as MODEL_NAME to the container).
    Exits with a diagnostic when the model is unknown, its name contains
    dangerous characters, a required field is absent, or no API key is set.
    """
    catalog = load_models_yaml()
    if model not in catalog:
        print(f"ERROR: model '{model}' not found in {MODELS_YAML}")
        print(f"Available models: {', '.join(sorted(catalog))}")
        sys.exit(1)

    # Validate model name characters. Note: '/' and ':' are valid in
    # vendor-prefixed ids like 'anthropic/claude-sonnet-4-6' or
    # 'arcee-ai/trinity-large-preview:free' — they get sanitized to
    # '--' before being used as path components (see `safe_model`
    # below). Only characters that stay dangerous in shell/filesystem
    # paths even after that sanitization are rejected here.
    illegal = [ch for ch in " \\*?\"<>|" if ch in model]
    if illegal:
        print(
            f"ERROR: model name '{model}' contains illegal character(s): "
            f"{' '.join(repr(ch) for ch in illegal)}"
        )
        sys.exit(1)

    config = dict(catalog[model])
    config["model"] = model  # the YAML key IS the model name

    # Required fields — report every missing one before exiting.
    missing = [field for field in ("base_url", "api_type") if not config.get(field)]
    if missing:
        for field in missing:
            print(f"ERROR: Required field '{field}' missing for model '{model}'")
        sys.exit(1)

    # Normalize API keys: an api_keys list wins; a bare api_key is
    # wrapped into a single-element list so downstream code sees both.
    if config.get("api_keys"):
        config["api_key"] = config["api_keys"][0]
    elif config.get("api_key"):
        config["api_keys"] = [config["api_key"]]

    if not config.get("api_keys"):
        print(f"ERROR: no api_key or api_keys for model '{model}'")
        sys.exit(1)

    return config
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def step(msg: str):
    """Print a prominent banner marking a new phase of the run."""
    bar = "=" * 60
    print(f"\n{bar}\n[STEP] {msg}\n{bar}", flush=True)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def run(cmd: list[str], **kwargs):  # type: ignore[no-untyped-def]
    """Echo *cmd* shell-style, then execute it.

    Raises ``subprocess.CalledProcessError`` on a nonzero exit
    (``check=True``); extra keyword args pass through to ``subprocess.run``.
    """
    rendered = " ".join(cmd)
    print(f"$ {rendered}", flush=True)
    subprocess.run(cmd, check=True, **kwargs)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# -- PurelyMail --
|
|
165
|
+
|
|
166
|
+
def purelymail_request(endpoint: str, body: dict, api_key: str) -> dict:
    """POST *body* as JSON to a PurelyMail API endpoint and decode the reply.

    Authenticates via the ``Purelymail-Api-Token`` header; the request
    times out after 15 seconds. Network errors propagate to the caller.
    """
    payload = json.dumps(body).encode()
    headers = {
        "Purelymail-Api-Token": api_key,
        "Content-Type": "application/json",
    }
    request = Request(
        f"{PURELYMAIL_API}/{endpoint}",
        data=payload,
        headers=headers,
        method="POST",
    )
    with urlopen(request, timeout=15) as response:
        return json.loads(response.read())
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def create_email(api_key: str, domain: str) -> tuple[str, str]:
    """Provision a fresh PurelyMail mailbox; return ``(address, password)``.

    The local part is a random ``cb``-prefixed hex token so concurrent
    runs never collide; the password comes from ``secrets``.
    """
    username = f"cb{uuid.uuid4().hex[:12]}"
    password = secrets.token_urlsafe(16)
    body = {
        "userName": username,
        "domainName": domain,
        "password": password,
        "enablePasswordReset": False,
        "sendWelcomeEmail": False,
    }
    purelymail_request("createUser", body, api_key)
    address = f"{username}@{domain}"
    print(f" Created email: {address}")
    print(f" Password: {password}")
    return address, password
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def delete_email(api_key: str, email: str) -> None:
    """Best-effort deletion of a PurelyMail mailbox.

    Any failure (network error, API rejection, bad response) is logged
    as a warning and swallowed — cleanup must never abort the run.
    """
    try:
        purelymail_request("deleteUser", {"userName": email}, api_key)
        print(f" Deleted email: {email}")
    except Exception as e:
        # Fix: the original `except (URLError, Exception)` was redundant —
        # Exception already subsumes URLError, so the tuple only misled
        # readers into thinking the handler was narrower than it is.
        print(f" WARNING: Failed to delete email {email}: {e}")
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
# -- Personal info --
|
|
204
|
+
|
|
205
|
+
RESUME_TEMPLATE = Path(__file__).resolve().parent / "resume_template.json"
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _shared_src() -> Path:
    """Return the bundled ``shared/`` directory (personal-info templates)."""
    # NOTE(review): thin indirection over clawbench.paths — presumably kept
    # as a module-local seam (e.g. for patching in tests); confirm before
    # inlining at call sites.
    return _paths.shared_dir()
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def prepare_personal_info(shared_src: Path, email: str, password: str,
                          output_dir: Path) -> Path:
    """Materialize the agent's personal-info directory and return it.

    Creates ``output_dir/.my-info-tmp`` containing the personal-info
    JSON (email rewritten, ``online_accounts`` removed), a credentials
    file for the throwaway mailbox, and — best effort — a resume PDF.
    """
    info_dir = output_dir / ".my-info-tmp"
    info_dir.mkdir(parents=True, exist_ok=True)

    # Personal-info JSON: point the contact email at the throwaway
    # mailbox and drop the online_accounts section entirely.
    profile = json.loads((shared_src / "alex_green_personal_info.json").read_text())
    profile["contact"]["email"] = email
    profile.pop("online_accounts", None)
    (info_dir / "alex_green_personal_info.json").write_text(
        json.dumps(profile, indent=2))

    # Email credentials go in their own file, separate from the profile.
    credentials = {
        "email": email,
        "password": password,
        "login_url": "https://purelymail.com/user/login",
        "provider": "PurelyMail",
    }
    (info_dir / "email_credentials.json").write_text(json.dumps(credentials, indent=2))

    # Resume PDF — best effort: a render failure must not kill the run.
    resume = json.loads(RESUME_TEMPLATE.read_text())
    resume["header"]["email"] = email
    try:
        generate_resume_pdf(resume, info_dir / "alex_green_resume.pdf")
    except Exception as e:
        print(f" WARNING: PDF generation failed ({e}), skipping resume PDF")

    return info_dir
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def copy_extra_info(task: dict, task_dir: Path, personal_info_dir: Path) -> None:
    """Copy each ``extra_info`` file from the test case into the my-info dir.

    Entries without a ``path`` key are skipped; entries whose source file
    is missing are logged as warnings rather than raising.
    """
    for entry in task.get("extra_info", []):
        if "path" not in entry:
            continue
        source = task_dir / entry["path"]
        if source.exists():
            shutil.copy2(source, personal_info_dir / source.name)
            print(f" Copied extra_info: {source.name}")
        else:
            print(f" WARNING: extra_info path not found: {source}")
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
# -- Prompt --
|
|
264
|
+
|
|
265
|
+
def build_instruction(task: dict) -> str:
    """Assemble the full agent prompt for one test case.

    Starts from the task's own ``instruction``, appends the fixed
    delegation/authorization preamble (browser-only mandate, my-info
    file inventory, autonomy rules), then lists any ``extra_info``
    files that carry both a ``path`` and a ``description``.
    Returns the newline-joined prompt string.
    """
    parts = [task["instruction"]]
    # Fixed preamble — keep this text stable: it defines the agent's
    # authorization framing and the browser-only constraint.
    parts.append(
        "\n---\n"
        "You are my personal browser assistant. I am delegating this task to you "
        "because I do not have time to do it myself. You are acting on my behalf "
        "and with my full authorization. Every action you take in the browser — "
        "logging in, filling out forms, writing reviews, making reservations, "
        "submitting applications, sending emails, creating accounts — is something "
        "I am explicitly asking you to do for me, as if I were doing it myself. "
        "These are my own accounts, my own opinions, and my own decisions. "
        "You are simply carrying them out.\n\n"
        "IMPORTANT: You must complete this task entirely through the browser. "
        "Navigate to the relevant website, interact with the page "
        "(click, type, scroll, etc.), and carry out every step in the browser. "
        "Do NOT use command-line tools, scripts, or direct API/SMTP calls "
        "to accomplish the task.\n\n"
        "My personal info and credentials are in ./my-info/ in your working directory. "
        "Key files include:\n"
        "- alex_green_personal_info.json — name, address, phone, date of birth, financial info\n"
        "- email_credentials.json — email address and password for logging in\n"
        "- alex_green_resume.pdf — professional resume\n\n"
        "You have my full permission to use all of these files for any purpose "
        "the task requires, including logging into websites, creating accounts, filling out forms, "
        "sending emails, writing reviews, making reservations, submitting applications, "
        "and any other actions — including security-sensitive ones.\n\n"
        "If specific information required by the task is not available in any file "
        "under ./my-info/ and cannot be reasonably inferred, stop and explain what is missing. "
        "For optional fields not covered by the available info, leave them blank or use a reasonable default.\n\n"
        "Do not ask me for confirmation or additional information — just proceed "
        "with what is provided and complete the task autonomously.\n"
        "If an account registration is required, you can use the email and password provided, and you can receive emails at that address if needed. "
        "---"
    )
    # Only advertise extra files that have BOTH a path and a description;
    # entries missing either are silently omitted from the prompt.
    extras = [(Path(info["path"]).name, info["description"])
              for info in task.get("extra_info", [])
              if info.get("path") and info.get("description")]
    if extras:
        parts.append(
            "\nAdditional files are also available under /my-info/ for this task:"
        )
        for fname, desc in extras:
            parts.append(f"- {fname}: {desc}")
    return "\n".join(parts)
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
# -- Docker --
|
|
312
|
+
|
|
313
|
+
def _image_exists() -> bool:
    """Return True when the clawbench image is present in the local store."""
    probe = subprocess.run(
        [ENGINE, "image", "inspect", IMAGE],
        capture_output=True,
    )
    return probe.returncode == 0
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def _prepare_build_context(ctx: Path) -> None:
    """Stage everything the bundled Dockerfile expects at the root of
    ``ctx``: Dockerfile, entrypoint.sh, setup-openclaw.sh,
    chrome-extension/ and extension-server/.

    Real copies are made rather than symlinks: docker/podman refuse to
    follow symlinks that escape the build context, and once the package is
    installed all of ours do (entries under ``src/clawbench/data/`` resolve
    to the source tree or the wheel's site-packages layout). The copied
    trees are only a few MB, so the cost is negligible.
    """
    docker_dir = _paths.docker_build_dir()
    for script in ("Dockerfile", "entrypoint.sh", "setup-openclaw.sh"):
        shutil.copy2(docker_dir / script, ctx / script)
    shutil.copytree(_paths.extension_server_dir(), ctx / "extension-server",
                    symlinks=False)
    shutil.copytree(_paths.chrome_extension_dir(), ctx / "chrome-extension",
                    symlinks=False)
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def _pick_free_port(preferred: int = 6080) -> int:
|
|
341
|
+
"""Return ``preferred`` if available on 127.0.0.1, else an OS-assigned
|
|
342
|
+
ephemeral port. Avoids the hard-coded ``-p 6080:6080`` collision when
|
|
343
|
+
something else on the host already owns that port.
|
|
344
|
+
"""
|
|
345
|
+
for candidate in (preferred, 0):
|
|
346
|
+
try:
|
|
347
|
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
348
|
+
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
|
|
349
|
+
s.bind(("127.0.0.1", candidate))
|
|
350
|
+
return s.getsockname()[1]
|
|
351
|
+
except OSError:
|
|
352
|
+
continue
|
|
353
|
+
raise RuntimeError("Could not find a free TCP port for noVNC")
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
# Progress line from the classic docker/podman builder, e.g.
# "Step 3/12 : RUN ..." — group 1 = current step, group 2 = total (optional).
_STEP_RE = re.compile(r"^(?:STEP|Step)\s+(\d+)(?:/(\d+))?", re.IGNORECASE)
# BuildKit-style progress line, e.g. "#7 [stage 1/2] ..." — group 1 = vertex id.
_BK_STEP_RE = re.compile(r"^#(\d+)\s+\[")
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def _run_build(cmd: list[str]) -> tuple[int, str, list[str]]:
    """Execute a build command with a live spinner.

    Returns ``(exit_code, last_line, all_output_lines)``.
    """
    console.print(f"[dim]$ {' '.join(cmd)}[/]")

    # Merge stderr into stdout and stream line-by-line (bufsize=1 is
    # line-buffered in text mode) so the spinner updates in real time.
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
    )
    assert proc.stdout is not None

    last_line = ""  # most recent non-empty line, reported on failure
    last_step = ""  # e.g. "step 3/12", echoed alongside generic lines
    output_lines: list[str] = []
    status_msg = "[cyan]Starting build…[/]"
    with Status(status_msg, console=console, spinner="dots") as status:
        for raw in proc.stdout:
            line = raw.rstrip()
            if not line:
                continue
            last_line = line
            output_lines.append(line)

            # Classic builder progress ("Step 3/12 : RUN ...").
            m = _STEP_RE.match(line)
            if m:
                cur = m.group(1)
                tot = m.group(2) or "?"
                rest = line.split(":", 1)[-1].strip()[:72]
                last_step = f"step {cur}/{tot}"
                status.update(
                    f"[cyan]Building image — {last_step}[/] [dim]{rest}[/]"
                )
                continue

            # BuildKit-style progress ("#7 [...").
            m = _BK_STEP_RE.match(line)
            if m:
                snippet = line[:100]
                status.update(f"[cyan]Building image[/] [dim]{snippet}[/]")
                continue

            # Echo error-looking lines to the console as they stream past.
            # NOTE(review): the "--no-" exclusion presumably avoids false
            # positives from lines that echo --no-* flags — confirm intent.
            lowered = line.lower()
            if "error" in lowered and "--no-" not in lowered:
                console.print(f" [yellow]{line[:120]}[/]")
            status.update(
                f"[cyan]Building image[/] "
                f"[dim]{(last_step + ' · ') if last_step else ''}{line[:72]}[/]"
            )

    rc = proc.wait()
    return rc, last_line, output_lines
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
def _looks_like_stale_cache(output_lines: list[str]) -> bool:
|
|
418
|
+
"""Return True if the build failure looks like it was caused by
|
|
419
|
+
stale layer-cache (e.g. old lockfiles, wrong Python version)."""
|
|
420
|
+
blob = "\n".join(output_lines).lower()
|
|
421
|
+
patterns = [
|
|
422
|
+
"no interpreter found for python",
|
|
423
|
+
"no matching distribution found",
|
|
424
|
+
"package not found",
|
|
425
|
+
"could not find a version that satisfies",
|
|
426
|
+
]
|
|
427
|
+
return any(p in blob for p in patterns)
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def docker_build() -> None:
    """Build (or rebuild) the clawbench image with a live progress spinner.

    A cold build pulls ~2GB (python base, chromium, ffmpeg, noVNC, Node,
    openclaw) and takes several minutes, so a banner warns the user before
    the first build only; warm rebuilds hit the layer cache and finish in
    seconds.

    A failure whose output matches a known stale-layer-cache signature
    (e.g. a lockfile mismatch) triggers exactly one automatic retry with
    ``--no-cache`` so the user doesn't have to debug it by hand.
    """
    if not _image_exists():
        # Cold path: warn about the big download before kicking it off.
        console.print()
        console.print(Panel(
            "[bold]First-time container build.[/]\n"
            "This downloads ~2 GB (chromium, ffmpeg, noVNC, Node, openclaw)\n"
            "and typically takes [bold]5–10 minutes[/] on a decent connection.\n"
            "[dim]Subsequent runs reuse the layer cache and finish in seconds.[/]",
            title="[bold]Building clawbench image[/]",
            border_style="cyan",
        ))

    with tempfile.TemporaryDirectory(prefix="clawbench-build-") as tmp:
        build_ctx = Path(tmp)
        _prepare_build_context(build_ctx)
        exit_code, tail, log_lines = _run_build(
            [ENGINE, "build", "-t", IMAGE, str(build_ctx)])

        # Single automatic --no-cache retry when the failure looks like a
        # stale-layer-cache problem.
        if exit_code != 0 and _looks_like_stale_cache(log_lines):
            console.print()
            console.print(
                "[yellow]Build failed — looks like a stale layer cache "
                "(e.g. updated lockfiles not picked up).[/]"
            )
            console.print(
                "[yellow]Retrying with [bold]--no-cache[/] "
                "(full rebuild, may take a few minutes)…[/]"
            )
            console.print()
            exit_code, tail, log_lines = _run_build(
                [ENGINE, "build", "--no-cache", "-t", IMAGE, str(build_ctx)])

    if exit_code != 0:
        console.print(f"[red bold]Build failed[/] (exit {exit_code})")
        if tail:
            console.print(f" Last output: [dim]{tail}[/]")
        sys.exit(exit_code)

    console.print("[green]✓[/] Container image ready")
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
def _fix_data_ownership(data_dir: Path) -> None:
    """Chown ``data_dir`` back to the calling user after a rootful-Docker run.

    On Linux with rootful Docker, files written inside the container end up
    owned by root on the host, so after ``docker cp`` the caller cannot
    ``rm -rf test-output/`` without sudo. When that situation is detected,
    a throwaway container (which has the root privileges needed to chown
    anything on the bind-mounted dir) restores the caller's UID/GID.

    No-op on macOS, on rootless podman, and when the tree is already owned
    by the caller.
    """
    # Only the Linux + rootful-docker combination exhibits the problem.
    if sys.platform != "linux" or ENGINE != "docker" or not data_dir.exists():
        return
    getuid = getattr(os, "getuid", None)
    if getuid is None:  # platform without POSIX uids
        return
    uid = getuid()
    try:
        needs_fix = any(
            not entry.is_symlink() and entry.stat().st_uid != uid
            for entry in data_dir.rglob("*")
        )
    except OSError:
        # Can't even stat the tree — assume ownership is wrong.
        needs_fix = True
    if not needs_fix:
        return

    print(f" Fixing ownership of {data_dir} (rootful Docker -> host UID)")
    subprocess.run(
        [
            ENGINE, "run", "--rm",
            "-v", f"{data_dir.resolve()}:/fix",
            IMAGE,
            "chown", "-R", f"{uid}:{os.getgid()}", "/fix",
        ],
        check=False,
        capture_output=True,
    )
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
def _network_flags() -> list[str]:
    """Force slirp4netns on podman to avoid host-network port collisions."""
    return ["--network=slirp4netns"] if ENGINE == "podman" else []
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
def _proxy_env_flags() -> list[str]:
    """Forward the host's proxy environment variables into the container.

    Inside the container 127.0.0.1 is its own loopback, not the host, so
    localhost references in proxy URLs are rewritten to the host gateway
    name. Both podman (host.containers.internal) and Docker Desktop
    (host.docker.internal) resolve that name to the host machine.
    """
    host_gw = "host.containers.internal" if ENGINE == "podman" else "host.docker.internal"
    exclusion_vars = {"NO_PROXY", "no_proxy"}
    flags: list[str] = []
    saw_proxy = False
    for name in ("HTTP_PROXY", "HTTPS_PROXY", "http_proxy", "https_proxy",
                 "ALL_PROXY", "all_proxy", "NO_PROXY", "no_proxy"):
        value = os.environ.get(name, "")
        if not value:
            continue
        if name not in exclusion_vars:
            saw_proxy = True
            # Point localhost proxies at the host gateway instead.
            value = value.replace("127.0.0.1", host_gw).replace("localhost", host_gw)
        flags.extend(["-e", f"{name}={value}"])
    # If a proxy is set but no exclusion list exists, make sure
    # container-internal traffic still bypasses the proxy.
    if saw_proxy and not (os.environ.get("NO_PROXY") or os.environ.get("no_proxy")):
        flags.extend(["-e", "NO_PROXY=localhost,127.0.0.1",
                      "-e", "no_proxy=localhost,127.0.0.1"])
    return flags
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
def docker_run_human(name: str, instruction: str, schema_path: Path,
                     personal_info_dir: Path,
                     time_limit_s: int = 1800,
                     host_port: int = 6080) -> None:
    """Launch a detached container in human mode: Chrome is driven by a
    person over noVNC (published on ``host_port``) instead of an agent."""
    run([
        ENGINE, "run", "-d", "--name", name,
        *_network_flags(),
        *_proxy_env_flags(),
        "-e", "HUMAN_MODE=1",
        "-e", f"INSTRUCTION={instruction}",
        "-e", f"TIME_LIMIT_S={time_limit_s}",
        "-p", f"{host_port}:6080",
        # Read-only mounts: eval schema for the interceptor, persona files.
        "-v", f"{schema_path.resolve()}:/eval-schema.json:ro",
        "-v", f"{personal_info_dir.resolve()}:/my-info:ro",
        IMAGE,
    ])
|
|
581
|
+
|
|
582
|
+
|
|
583
|
+
def docker_run(name: str, instruction: str, schema_path: Path,
               personal_info_dir: Path, model_cfg: dict,
               time_limit_s: int = 1800,
               host_port: int | None = None) -> None:
    """Launch a detached agent-mode container configured from ``model_cfg``."""
    cmd = [
        ENGINE, "run", "-d", "--name", name,
        *_network_flags(),
        *_proxy_env_flags(),
        "-e", f"MODEL_NAME={model_cfg['model']}",
        "-e", f"BASE_URL={model_cfg['base_url']}",
        "-e", f"API_TYPE={model_cfg['api_type']}",
        "-e", f"API_KEYS={json.dumps(model_cfg.get('api_keys', []))}",
        "-e", f"API_KEY={model_cfg.get('api_key', '')}",
        "-e", f"INSTRUCTION={instruction}",
        "-e", f"TIME_LIMIT_S={time_limit_s}",
        # Read-only mounts: eval schema for the interceptor, persona files.
        "-v", f"{schema_path.resolve()}:/eval-schema.json:ro",
        "-v", f"{personal_info_dir.resolve()}:/my-info:ro",
    ]
    # Expose noVNC so the user can watch the agent in real-time.
    if host_port is not None:
        cmd.extend(["-p", f"{host_port}:6080"])
    # host.docker.internal needs explicit mapping on Linux (not Docker Desktop)
    if "host.docker.internal" in model_cfg["base_url"]:
        cmd.append("--add-host=host.docker.internal:host-gateway")
    thinking = model_cfg.get("thinking_level")
    if thinking:
        cmd.extend(["-e", f"THINKING_LEVEL={thinking}"])
    temperature = model_cfg.get("temperature")
    if temperature is not None:
        cmd.extend(["-e", f"TEMPERATURE={temperature}"])
    max_tokens = model_cfg.get("max_tokens")
    if max_tokens is not None:
        cmd.extend(["-e", f"MAX_TOKENS={max_tokens}"])
    cmd.append(IMAGE)
    run(cmd)
|
|
614
|
+
|
|
615
|
+
|
|
616
|
+
def docker_wait(name: str) -> None:
    """Block until the container exits, showing a live status line.

    A background ``docker wait`` tracks the container's lifetime while the
    foreground loop polls an in-container action counter every ~5s to feed
    the spinner.
    """
    start = time.time()
    # Launch `docker wait` in background so we can poll status
    proc = subprocess.Popen([ENGINE, "wait", name],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    last_actions = 0
    with Status("[dim]starting...[/]", console=console) as status:
        while proc.poll() is None:
            elapsed = int(time.time() - start)
            mins, secs = divmod(elapsed, 60)
            # Query actions count from container. `docker exec` can hang
            # (e.g. while the container is shutting down); a timeout there
            # previously raised TimeoutExpired and crashed the whole wait
            # loop, so catch it and keep the previous count instead.
            try:
                r = subprocess.run(
                    [ENGINE, "exec", name, "wc", "-l", "/data/actions.jsonl"],
                    capture_output=True, text=True, timeout=5,
                )
            except subprocess.TimeoutExpired:
                r = None
            if r is not None and r.returncode == 0:
                try:
                    last_actions = int(r.stdout.strip().split()[0])
                except (ValueError, IndexError):
                    pass  # unexpected `wc` output — keep the old count
            status.update(
                f"[dim]{mins:02d}:{secs:02d} • {last_actions} actions[/]"
            )
            # Poll every 5s
            try:
                proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                pass
    elapsed = int(time.time() - start)
    mins, secs = divmod(elapsed, 60)
    console.print(f" Container exited ({mins}m{secs:02d}s, {last_actions} actions)")
|
|
648
|
+
|
|
649
|
+
|
|
650
|
+
def docker_copy(name: str, output_dir: Path) -> None:
    """Copy /data out of the container, then strip internal/bulky files."""
    run([ENGINE, "cp", f"{name}:/data", str(output_dir / "data")])
    data = output_dir / "data"
    # The stop marker is internal bookkeeping; the logs are too bulky to keep.
    for leftover in (".stop-requested", "agent.log", "gateway.log"):
        (data / leftover).unlink(missing_ok=True)
|
|
656
|
+
|
|
657
|
+
|
|
658
|
+
def docker_logs(name: str) -> None:
    """Print the container's last 40 log lines to stdout (best-effort)."""
    subprocess.run([ENGINE, "logs", "--tail", "40", name])
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
def docker_rm(name: str) -> None:
    """Force-remove the container; output suppressed, missing container is ok."""
    subprocess.run([ENGINE, "rm", "-f", name], capture_output=True)
|
|
664
|
+
|
|
665
|
+
|
|
666
|
+
# -- Results --
|
|
667
|
+
|
|
668
|
+
def ensure_interception(output_dir: Path):
    """Guarantee ``data/interception.json`` exists.

    When the interceptor never fired, synthesise a placeholder record that
    carries the stop reason the container left behind in ``.stop-reason``.
    A real record produced by the interceptor is left untouched.
    """
    data_dir = output_dir / "data"
    marker = data_dir / ".stop-reason"
    reason = marker.read_text().strip() if marker.exists() else "unknown"
    marker.unlink(missing_ok=True)

    interception_file = data_dir / "interception.json"
    if interception_file.exists():
        return  # interceptor produced a real record — nothing to do

    descriptions = {
        "time_limit_exceeded": "Session stopped: time limit exceeded before the interceptor was triggered.",
        "agent_idle": "Session stopped: agent went idle (300s no actions) before triggering the interceptor.",
        "agent_exited": "Session stopped: agent process exited before triggering the interceptor.",
        "vnc_disconnected": "Session stopped: human disconnected from VNC without triggering the interceptor.",
        "chrome_cdp_timeout": "Session stopped: Chrome CDP was not ready after 30s (browser failed to start).",
        "gateway_failed": "Session stopped: OpenClaw gateway died on startup.",
    }
    schema_file = output_dir / "eval-schema.json"
    placeholder = {
        "intercepted": False,
        "stop_reason": reason,
        "stop_description": descriptions.get(reason, f"Session stopped: {reason}."),
        "request": None,
        "schema": json.loads(schema_file.read_text()) if schema_file.exists() else None,
    }
    interception_file.parent.mkdir(parents=True, exist_ok=True)
    interception_file.write_text(json.dumps(placeholder, indent=2))
|
|
697
|
+
|
|
698
|
+
|
|
699
|
+
def print_results(output_dir: Path) -> bool:
    """Summarise a finished run to stdout.

    Returns True when the interceptor fired, False otherwise.
    """
    data_dir = output_dir / "data"

    # Actions
    actions_file = data_dir / "actions.jsonl"
    if actions_file.exists():
        actions = [
            json.loads(line)
            for line in actions_file.read_text().splitlines()
            if line.strip()
        ]
        print(f"Actions recorded: {len(actions)}")
        for action in actions:
            print(f" {action['type']:10s} {action.get('url', '')[:70]}")
    else:
        print("No actions.jsonl found")

    # HTTP Requests
    requests_file = data_dir / "requests.jsonl"
    if requests_file.exists():
        logged = [
            line for line in requests_file.read_text().splitlines()
            if line.strip()
        ]
        print(f"HTTP requests logged: {len(logged)}")

    # Interception verdict
    result = json.loads((data_dir / "interception.json").read_text())
    intercepted = result.get("intercepted", False)
    print(f"Intercepted: {intercepted}")
    if result.get("stop_reason"):
        print(f"Stop reason: {result['stop_reason']}")
    request = result.get("request")
    if request:
        print(f"Request URL: {request['url']}")
        print(f"Request method: {request['method']}")
        if request.get("body"):
            print(f"Body: {json.dumps(request['body'])[:300]}")
    return intercepted
|
|
733
|
+
|
|
734
|
+
|
|
735
|
+
def main(argv: list[str] | None = None) -> None:
    """Run a single ClawBench test case end to end.

    Parses CLI args, validates infrastructure secrets, loads the task,
    builds the container image, starts a run (agent or human mode), waits
    for it to finish, collects and summarises results, optionally uploads
    them to HuggingFace, and always cleans up the container, disposable
    email, and temp files. Exits non-zero when the run was not intercepted.
    """
    parser = argparse.ArgumentParser(
        description="Run a single ClawBench test case")
    parser.add_argument("test_case_dir", type=Path,
                        help="Path to the test case directory")
    parser.add_argument("model", type=str, nargs="?", default=None,
                        help="Model name (key in models/models.yaml, required for agent mode)")
    parser.add_argument("--human", action="store_true",
                        help="Human mode: expose Chrome via noVNC instead of running an agent")
    parser.add_argument("--output-dir", dest="output_dir", type=Path, default=None,
                        help="Directory to write output data to (default: <project>/test-output)")
    parser.add_argument("--no-build", dest="no_build", action="store_true",
                        help="Skip building the container image (assumes it already exists)")
    parser.add_argument("--no-upload", dest="no_upload", action="store_true",
                        help="Skip HuggingFace upload even if HF_TOKEN is configured")
    args = parser.parse_args(argv)

    # `model` is positional-but-optional so that --human runs need not pass one.
    if not args.human and args.model is None:
        parser.error("model is required for agent mode (or use --human)")

    # Load infrastructure config from env + ./.env + user secrets.env
    env = _load_runtime_env()
    infra_required = ["PURELY_MAIL_API_KEY", "PURELY_MAIL_DOMAIN"]
    missing = [k for k in infra_required if not env.get(k)]
    if missing:
        for k in missing:
            print(f"ERROR: {k} not set (checked env, ./.env, and {_paths.user_secrets_path()})")
        print(" Tip: run `claw-bench configure --secrets` to persist these keys")
        sys.exit(1)
    pm_key: str = env["PURELY_MAIL_API_KEY"]
    pm_domain: str = env["PURELY_MAIL_DOMAIN"]

    # HuggingFace upload (optional)
    hf_env = {"HF_TOKEN": env.get("HF_TOKEN", ""),
              "HF_REPO_ID": env.get("HF_REPO_ID", "")}
    do_upload = hf_upload_enabled(hf_env) and not args.no_upload

    # Load task
    task_dir = args.test_case_dir.resolve()
    task_file = task_dir / "task.json"
    if not task_file.exists():
        print(f"ERROR: {task_file} not found")
        sys.exit(1)
    task = json.loads(task_file.read_text())

    case_name = task_dir.name
    # task["time_limit"] is in minutes; the container wants seconds.
    time_limit_s = task["time_limit"] * 60
    ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")

    model_cfg: dict | None = None
    if args.human:
        safe_model = "human"
    else:
        model_cfg = load_model_config(args.model)
        # Model names may contain "/" or ":" — not filesystem/container safe.
        safe_model = re.sub(r'[/:]+', '--', args.model)

    # Timestamp suffix keeps concurrent runs of the same case distinct.
    container = f"clawbench-{case_name}-{safe_model}-{int(time.time())}"

    if args.output_dir is not None:
        output_dir = args.output_dir.resolve() / safe_model / \
            f"{case_name}-{safe_model}-{ts}"
    else:
        output_dir = _paths.default_output_dir() / \
            safe_model / f"{case_name}-{safe_model}-{ts}"
    output_dir.mkdir(parents=True, exist_ok=True)

    if not args.no_build:
        step("Building container image")
        docker_build()

    email = None
    personal_info_tmp: Path | None = None
    start_time = time.time()
    try:
        step("Creating disposable email")
        email, email_pw = create_email(pm_key, pm_domain)

        step("Preparing personal info")
        personal_info_tmp = prepare_personal_info(
            _shared_src(), email, email_pw, output_dir)
        copy_extra_info(task, task_dir, personal_info_tmp)
        print(f" Personal info dir: {personal_info_tmp}")

        # Write eval schema for the interceptor
        schema_path = output_dir / "eval-schema.json"
        schema_path.write_text(json.dumps(task["eval_schema"], indent=2))

        step("Building instruction")
        instruction = build_instruction(task)
        print(instruction[:500])

        if args.human:
            step("Starting container (human mode)")
            # Avoid the hard-coded 6080:6080 collision: try 6080 first and
            # fall back to an OS-assigned ephemeral port if something else
            # on the host is already listening there.
            host_port = _pick_free_port(6080)
            docker_run_human(container, instruction, schema_path,
                             personal_info_tmp, time_limit_s,
                             host_port=host_port)

            # Graceful stop on Ctrl+C: give container time to flush recording
            def handle_sigint(sig, frame):
                print("\nCtrl+C received, stopping container gracefully...")
                subprocess.run([ENGINE, "stop", "-t", "20", container],
                               capture_output=True)

            signal.signal(signal.SIGINT, handle_sigint)

            vnc_url = f"http://localhost:{host_port}/vnc.html"
            console.print(f"\n noVNC: [link={vnc_url}]{vnc_url}[/link]")
            if host_port != 6080:
                console.print(f" [dim](port 6080 was busy, auto-picked {host_port})[/dim]")
            console.print(f" Task: {task['instruction'][:200]}")
            console.print(f" Email: {email} Password: {email_pw}")
            console.print(f" Time limit: {task['time_limit']} minutes")
            console.print(f" Close the noVNC tab when done.\n")

            step(f"Waiting for human (max {task['time_limit']}min)")
        else:
            step("Starting container")
            assert model_cfg is not None
            host_port = _pick_free_port(6080)
            docker_run(container, instruction, schema_path,
                       personal_info_tmp, model_cfg,
                       time_limit_s=time_limit_s,
                       host_port=host_port)

            vnc_url = f"http://localhost:{host_port}/vnc.html"
            console.print(f"\n noVNC: [link={vnc_url}]{vnc_url}[/link]")
            if host_port != 6080:
                console.print(f" [dim](port 6080 was busy, auto-picked {host_port})[/dim]")
            console.print(f" Open the URL above to watch the agent in real-time.\n")

            step(f"Agent running (max {task['time_limit']}min)")

        docker_wait(container)

        step("Container logs")
        docker_logs(container)

        step("Copying results")
        docker_copy(container, output_dir)
        _fix_data_ownership(output_dir / "data")

        # Synthesise interception.json if the interceptor never fired.
        ensure_interception(output_dir)

        step("Results")
        intercepted = print_results(output_dir)

        # Write run metadata
        duration = time.time() - start_time
        if args.human:
            meta = {
                "test_case": case_name,
                **(task.get("metadata") or {}),
                "instruction": task["instruction"],
                "model": "human",
                "thinking_level": None,
                "temperature": None,
                "max_tokens": None,
                "email_used": email,
                "timestamp": ts,
                "time_limit_minutes": task["time_limit"],
                "duration_seconds": round(duration),
                "intercepted": intercepted,
            }
        else:
            assert model_cfg is not None
            meta = {
                "test_case": case_name,
                **(task.get("metadata") or {}),
                "instruction": task["instruction"],
                "model": model_cfg["model"],
                "thinking_level": model_cfg.get("thinking_level"),
                "temperature": model_cfg.get("temperature"),
                "max_tokens": model_cfg.get("max_tokens"),
                "email_used": email,
                "timestamp": ts,
                "time_limit_minutes": task["time_limit"],
                "duration_seconds": round(duration),
                "intercepted": intercepted,
            }
        (output_dir / "run-meta.json").write_text(json.dumps(meta, indent=2))

        if do_upload:
            step("Uploading to HuggingFace")
            repo_path = f"{safe_model}/{case_name}-{safe_model}-{ts}"
            upload_run(output_dir, repo_path, hf_env)

    finally:
        # Best-effort cleanup runs even when the run above raised.
        step("Cleanup")
        docker_rm(container)
        if email:
            delete_email(pm_key, email)
        if personal_info_tmp and personal_info_tmp.exists():
            shutil.rmtree(personal_info_tmp, ignore_errors=True)
        (output_dir / "eval-schema.json").unlink(missing_ok=True)

    # Only reached when the try body completed, so `intercepted` is bound.
    if intercepted:
        print(f"\nINTERCEPTED — results in {output_dir}")
    else:
        print(f"\nNOT INTERCEPTED — results in {output_dir}")
        sys.exit(1)
|
|
939
|
+
|
|
940
|
+
|
|
941
|
+
# Allow running this module directly (`python -m clawbench.batch`).
if __name__ == "__main__":
    main()
|