clawbench-cli 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (226) hide show
  1. clawbench/__init__.py +35 -0
  2. clawbench/__main__.py +8 -0
  3. clawbench/batch.py +619 -0
  4. clawbench/cli.py +397 -0
  5. clawbench/data/chrome-extension/README.md +127 -0
  6. clawbench/data/chrome-extension/background.js +50 -0
  7. clawbench/data/chrome-extension/content.js +70 -0
  8. clawbench/data/chrome-extension/manifest.json +25 -0
  9. clawbench/data/chrome-extension/setup.sh +27 -0
  10. clawbench/data/chrome-extension/stealth.js +200 -0
  11. clawbench/data/docker/Dockerfile +51 -0
  12. clawbench/data/docker/entrypoint.sh +394 -0
  13. clawbench/data/docker/setup-openclaw.sh +112 -0
  14. clawbench/data/eval/README.md +95 -0
  15. clawbench/data/eval/agentic_eval.md +53 -0
  16. clawbench/data/extension-server/.python-version +1 -0
  17. clawbench/data/extension-server/README.md +54 -0
  18. clawbench/data/extension-server/pyproject.toml +7 -0
  19. clawbench/data/extension-server/server.py +360 -0
  20. clawbench/data/extension-server/uv.lock +644 -0
  21. clawbench/data/models/model.schema.json +44 -0
  22. clawbench/data/models/models.example.yaml +16 -0
  23. clawbench/data/shared/alex_green_personal_info.json +451 -0
  24. clawbench/data/test-cases/001-daily-life-food-uber-eats/task.json +25 -0
  25. clawbench/data/test-cases/002-daily-life-food-doordash/task.json +25 -0
  26. clawbench/data/test-cases/004-daily-life-food-instacart/extra_info/grocery_list.json +36 -0
  27. clawbench/data/test-cases/004-daily-life-food-instacart/task.json +30 -0
  28. clawbench/data/test-cases/006-daily-life-food-uber-eats/task.json +24 -0
  29. clawbench/data/test-cases/007-daily-life-food-instacart/extra_info/meal_plan.json +21 -0
  30. clawbench/data/test-cases/007-daily-life-food-instacart/task.json +30 -0
  31. clawbench/data/test-cases/011-daily-life-housing-zillow/task.json +25 -0
  32. clawbench/data/test-cases/015-daily-life-housing-craigslist/extra_info/listing_details.json +26 -0
  33. clawbench/data/test-cases/015-daily-life-housing-craigslist/task.json +30 -0
  34. clawbench/data/test-cases/035-daily-life-health-medical-betterhelp/task.json +25 -0
  35. clawbench/data/test-cases/041-daily-life-pets-rover/task.json +25 -0
  36. clawbench/data/test-cases/043-daily-life-pets-rover/extra_info/pet_info.json +12 -0
  37. clawbench/data/test-cases/043-daily-life-pets-rover/task.json +30 -0
  38. clawbench/data/test-cases/045-daily-life-personal-care-booksy/task.json +25 -0
  39. clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/extra_info/address_info.json +7 -0
  40. clawbench/data/test-cases/047-daily-life-personal-care-taskrabbit/task.json +30 -0
  41. clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/extra_info/job_links.json +5 -0
  42. clawbench/data/test-cases/086-job-search-hr-cv-autofill-greenhouse-meta/task.json +30 -0
  43. clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/extra_info/job_links.json +5 -0
  44. clawbench/data/test-cases/089-job-search-hr-cv-autofill-simplify-jobs/task.json +30 -0
  45. clawbench/data/test-cases/091-job-search-hr-job-apply-indeed/task.json +25 -0
  46. clawbench/data/test-cases/120-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  47. clawbench/data/test-cases/121-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  48. clawbench/data/test-cases/128-office-secretary-tasks-email-mgmt-purelymail/task.json +28 -0
  49. clawbench/data/test-cases/134-office-secretary-tasks-calendar-calendly/task.json +25 -0
  50. clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/extra_info/meeting_details.json +30 -0
  51. clawbench/data/test-cases/137-office-secretary-tasks-calendar-doodle/task.json +30 -0
  52. clawbench/data/test-cases/139-office-secretary-tasks-calendar-calendly/task.json +25 -0
  53. clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/extra_info/task_list.json +29 -0
  54. clawbench/data/test-cases/142-office-secretary-tasks-collab-trello/task.json +30 -0
  55. clawbench/data/test-cases/179-dev-tech-github-ops-github/extra_info/config.json +13 -0
  56. clawbench/data/test-cases/179-dev-tech-github-ops-github/task.json +30 -0
  57. clawbench/data/test-cases/180-dev-tech-github-ops-github/task.json +25 -0
  58. clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/extra_info/raw_results.json +47 -0
  59. clawbench/data/test-cases/215-academia-research-paper-tables-overleaf/task.json +30 -0
  60. clawbench/data/test-cases/242-academia-research-research-tools-overleaf/task.json +25 -0
  61. clawbench/data/test-cases/246-academia-research-research-tools-zotero/task.json +25 -0
  62. clawbench/data/test-cases/247-academia-research-research-tools-semantic-scholar/task.json +25 -0
  63. clawbench/data/test-cases/265-education-learning-general-coursera/task.json +25 -0
  64. clawbench/data/test-cases/266-education-learning-general-leetcode/extra_info/solution_code.py +9 -0
  65. clawbench/data/test-cases/266-education-learning-general-leetcode/task.json +30 -0
  66. clawbench/data/test-cases/273-education-learning-general-edx/task.json +25 -0
  67. clawbench/data/test-cases/274-education-learning-general-udemy/task.json +25 -0
  68. clawbench/data/test-cases/279-travel-general-airbnb/task.json +25 -0
  69. clawbench/data/test-cases/280-travel-general-booking-com/task.json +25 -0
  70. clawbench/data/test-cases/363-entertainment-hobbies-general-ticketmaster/task.json +25 -0
  71. clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/extra_info/book_list.json +14 -0
  72. clawbench/data/test-cases/369-entertainment-hobbies-general-goodreads/task.json +30 -0
  73. clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/extra_info/event_details.json +10 -0
  74. clawbench/data/test-cases/372-entertainment-hobbies-general-eventbrite/task.json +30 -0
  75. clawbench/data/test-cases/403-personal-management-account-security-1password-web/extra_info/credentials.json +34 -0
  76. clawbench/data/test-cases/403-personal-management-account-security-1password-web/task.json +30 -0
  77. clawbench/data/test-cases/413-personal-management-personal-tools-todoist/extra_info/task_list.json +52 -0
  78. clawbench/data/test-cases/413-personal-management-personal-tools-todoist/task.json +30 -0
  79. clawbench/data/test-cases/468-rating-voting-general-glassdoor/extra_info/interview_experience.json +10 -0
  80. clawbench/data/test-cases/468-rating-voting-general-glassdoor/task.json +30 -0
  81. clawbench/data/test-cases/469-rating-voting-general-tripadvisor/extra_info/review_content.json +6 -0
  82. clawbench/data/test-cases/469-rating-voting-general-tripadvisor/task.json +30 -0
  83. clawbench/data/test-cases/470-rating-voting-general-trustpilot/extra_info/review_content.json +6 -0
  84. clawbench/data/test-cases/470-rating-voting-general-trustpilot/task.json +30 -0
  85. clawbench/data/test-cases/474-rating-voting-general-capterra/task.json +25 -0
  86. clawbench/data/test-cases/475-rating-voting-general-g2/task.json +25 -0
  87. clawbench/data/test-cases/482-creation-init-general-confluence/extra_info/content.json +3 -0
  88. clawbench/data/test-cases/482-creation-init-general-confluence/task.json +30 -0
  89. clawbench/data/test-cases/483-creation-init-general-airtable/task.json +25 -0
  90. clawbench/data/test-cases/484-creation-init-general-clickup/task.json +28 -0
  91. clawbench/data/test-cases/485-creation-init-general-webflow/task.json +25 -0
  92. clawbench/data/test-cases/486-creation-init-general-mailchimp/extra_info/content.json +3 -0
  93. clawbench/data/test-cases/486-creation-init-general-mailchimp/task.json +30 -0
  94. clawbench/data/test-cases/487-creation-init-general-typeform/extra_info/survey_questions.json +85 -0
  95. clawbench/data/test-cases/487-creation-init-general-typeform/task.json +30 -0
  96. clawbench/data/test-cases/488-creation-init-general-substack/extra_info/content.json +3 -0
  97. clawbench/data/test-cases/488-creation-init-general-substack/task.json +30 -0
  98. clawbench/data/test-cases/489-creation-init-general-ghost/extra_info/content.json +3 -0
  99. clawbench/data/test-cases/489-creation-init-general-ghost/task.json +30 -0
  100. clawbench/data/test-cases/501-creation-init-general-asana/extra_info/project_description.json +8 -0
  101. clawbench/data/test-cases/501-creation-init-general-asana/task.json +33 -0
  102. clawbench/data/test-cases/529-daily-life-shopping-delivery-king-arthur-baking/task.json +25 -0
  103. clawbench/data/test-cases/533-daily-life-utilities-inmyarea/task.json +25 -0
  104. clawbench/data/test-cases/535-daily-life-home-home-depot/task.json +25 -0
  105. clawbench/data/test-cases/537-daily-life-food-crumbl/task.json +25 -0
  106. clawbench/data/test-cases/539-daily-life-health-jefit/task.json +25 -0
  107. clawbench/data/test-cases/542-daily-life-pets-wag/task.json +25 -0
  108. clawbench/data/test-cases/551-finance-investment-crypto-wallet-trezor/task.json +25 -0
  109. clawbench/data/test-cases/552-finance-investment-business-payment-plooto/task.json +25 -0
  110. clawbench/data/test-cases/555-finance-investment-insurance-insureon/task.json +25 -0
  111. clawbench/data/test-cases/559-finance-investment-crowdfunding-frontfundr/task.json +25 -0
  112. clawbench/data/test-cases/564-daily-life-event-registration-race-roster/task.json +25 -0
  113. clawbench/data/test-cases/565-job-search-hr-job-search-jopwell/task.json +25 -0
  114. clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/extra_info/listing_details.json +26 -0
  115. clawbench/data/test-cases/566-job-search-hr-job-search-ziprecruiter/task.json +30 -0
  116. clawbench/data/test-cases/569-job-search-hr-job-search-careerbuilder/task.json +25 -0
  117. clawbench/data/test-cases/570-job-search-hr-job-search-hired/task.json +25 -0
  118. clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/extra_info/listing_details.json +26 -0
  119. clawbench/data/test-cases/571-job-search-hr-recruitment-mgmt-workable/task.json +30 -0
  120. clawbench/data/test-cases/576-office-secretary-tasks-reports-ftc-reportfraud/task.json +25 -0
  121. clawbench/data/test-cases/583-office-secretary-tasks-support-tickets-freshdesk/task.json +25 -0
  122. clawbench/data/test-cases/598-academia-research-legal-docs-formswift/task.json +25 -0
  123. clawbench/data/test-cases/606-education-learning-kids-courses-outschool/task.json +25 -0
  124. clawbench/data/test-cases/607-education-learning-art-courses-creativebug/task.json +25 -0
  125. clawbench/data/test-cases/609-education-learning-meditation-spirit-rock-meditation-center/task.json +25 -0
  126. clawbench/data/test-cases/615-travel-flights-spirit-airlines/task.json +25 -0
  127. clawbench/data/test-cases/618-travel-train-bus-12go-asia/task.json +25 -0
  128. clawbench/data/test-cases/625-travel-camping-outdoor-parks-canada-reservations/task.json +25 -0
  129. clawbench/data/test-cases/626-travel-bus-flixbus/task.json +25 -0
  130. clawbench/data/test-cases/627-travel-flights-momondo/task.json +25 -0
  131. clawbench/data/test-cases/632-shopping-commerce-beauty-care-olaplex/task.json +25 -0
  132. clawbench/data/test-cases/634-shopping-commerce-apparel-dooney-bourke/task.json +25 -0
  133. clawbench/data/test-cases/635-shopping-commerce-gifts-uncommon-goods/task.json +25 -0
  134. clawbench/data/test-cases/636-shopping-commerce-auto-parts-rockauto/task.json +25 -0
  135. clawbench/data/test-cases/638-shopping-commerce-print-custom-vistaprint/task.json +25 -0
  136. clawbench/data/test-cases/639-shopping-commerce-luxury-mansur-gavriel/task.json +25 -0
  137. clawbench/data/test-cases/671-entertainment-gaming-humble-bundle/task.json +25 -0
  138. clawbench/data/test-cases/672-entertainment-hobbies-anime-streaming-crunchyroll/task.json +25 -0
  139. clawbench/data/test-cases/674-entertainment-hobbies-masterclass-masterclass/task.json +25 -0
  140. clawbench/data/test-cases/676-government-civic-legal-docs-legalnature/task.json +25 -0
  141. clawbench/data/test-cases/685-personal-management-budget-mgmt-everydollar/task.json +25 -0
  142. clawbench/data/test-cases/687-personal-management-vpn-subscription-ipvanish/task.json +25 -0
  143. clawbench/data/test-cases/688-personal-management-insurance-compare-insurify/task.json +25 -0
  144. clawbench/data/test-cases/695-automation-workflows-recurring-order-stumptown-coffee/task.json +25 -0
  145. clawbench/data/test-cases/697-automation-workflows-recurring-order-bean-box/task.json +25 -0
  146. clawbench/data/test-cases/699-automation-workflows-recurring-order-mistobox/task.json +25 -0
  147. clawbench/data/test-cases/700-deletion-revocation-data-deletion-deleteme/task.json +25 -0
  148. clawbench/data/test-cases/705-rating-voting-wine-review-vivino/task.json +25 -0
  149. clawbench/data/test-cases/706-rating-voting-beer-review-beeradvocate/task.json +25 -0
  150. clawbench/data/test-cases/707-rating-voting-social-wine-untappd/task.json +25 -0
  151. clawbench/data/test-cases/708-rating-voting-professor-review-ratemyprofessors/task.json +28 -0
  152. clawbench/data/test-cases/709-rating-voting-service-review-angi/task.json +25 -0
  153. clawbench/data/test-cases/710-creation-init-interior-design-roomsketcher/task.json +25 -0
  154. clawbench/data/test-cases/711-creation-init-color-design-coolors/task.json +25 -0
  155. clawbench/data/test-cases/712-creation-init-website-create-squarespace/task.json +25 -0
  156. clawbench/data/test-cases/713-creation-init-website-build-wix/task.json +25 -0
  157. clawbench/data/test-cases/735-home-services-maintenance-house-cleaning-bark/task.json +25 -0
  158. clawbench/data/test-cases/736-home-services-maintenance-plumbing-ace-hardware/task.json +25 -0
  159. clawbench/data/test-cases/737-home-services-maintenance-kitchen-remodel-lowes/task.json +25 -0
  160. clawbench/data/test-cases/738-home-services-maintenance-equipment-install-amazon-home-services/task.json +25 -0
  161. clawbench/data/test-cases/750-automotive-vehicle-services-car-insurance-compare-kanetix/task.json +25 -0
  162. clawbench/data/test-cases/751-automotive-vehicle-services-car-lease-sixt/task.json +25 -0
  163. clawbench/data/test-cases/754-automotive-vehicle-services-used-car-listing-autotrader/task.json +25 -0
  164. clawbench/data/test-cases/763-automotive-vehicle-services-car-lease-autoslash/task.json +25 -0
  165. clawbench/data/test-cases/766-nonprofit-charity-donation-doctors-without-borders-msf/task.json +25 -0
  166. clawbench/data/test-cases/768-nonprofit-charity-community-crowdfund-ioby/task.json +25 -0
  167. clawbench/data/test-cases/770-nonprofit-charity-volunteer-apply-on-make-a-wish-foundation-website-complete-and-submit-a-volunteer-application-form-selecting-the-wish-granter-role-and-entering-city-phoenix-az/task.json +25 -0
  168. clawbench/data/test-cases/774-nonprofit-charity-nonprofit-job-apply-charity-village/task.json +25 -0
  169. clawbench/data/test-cases/776-nonprofit-charity-volunteer-signup-idealist/task.json +25 -0
  170. clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/extra_info/payment_info.json +3 -0
  171. clawbench/data/test-cases/778-nonprofit-charity-donation-globalgiving/task.json +30 -0
  172. clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/extra_info/address_info.json +4 -0
  173. clawbench/data/test-cases/780-beauty-personal-care-skincare-purchase-soko-glam/task.json +30 -0
  174. clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/extra_info/email_info.json +3 -0
  175. clawbench/data/test-cases/781-beauty-personal-care-beauty-booking-bluemercury/task.json +30 -0
  176. clawbench/data/test-cases/782-beauty-personal-care-skincare-purchase-paulas-choice/task.json +24 -0
  177. clawbench/data/test-cases/783-beauty-personal-care-beauty-booking-ulta-beauty/task.json +24 -0
  178. clawbench/data/test-cases/785-beauty-personal-care-skincare-curology/task.json +25 -0
  179. clawbench/data/test-cases/788-beauty-personal-care-makeup-the-ordinary/task.json +25 -0
  180. clawbench/data/test-cases/789-beauty-personal-care-makeup-fenty-beauty/task.json +25 -0
  181. clawbench/data/test-cases/793-beauty-personal-care-beauty-retail-mac-cosmetics/task.json +25 -0
  182. clawbench/data/test-cases/794-beauty-personal-care-salon-booking-styleseat/task.json +25 -0
  183. clawbench/data/test-cases/795-pet-animal-care-pet-adoption-aspca/task.json +25 -0
  184. clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/extra_info/pet_info.json +12 -0
  185. clawbench/data/test-cases/796-pet-animal-care-pet-supplies-grooming-petsmart/task.json +30 -0
  186. clawbench/data/test-cases/799-pet-animal-care-pet-insurance-aspca-pet-health-insurance/task.json +25 -0
  187. clawbench/data/test-cases/801-pet-animal-care-pet-friendly-travel-bringfido/task.json +25 -0
  188. clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/extra_info/pet_info.json +12 -0
  189. clawbench/data/test-cases/803-pet-animal-care-pet-medical-pawp/task.json +30 -0
  190. clawbench/data/test-cases/807-pet-animal-care-pet-dna-embark/task.json +25 -0
  191. clawbench/data/test-cases/809-pet-animal-care-pet-adopt-petfinder/task.json +28 -0
  192. clawbench/data/test-cases/812-pet-animal-care-pet-subscription-ollie/task.json +25 -0
  193. clawbench/data/test-cases/815-personal-management-records-mgmt-myheritage/task.json +25 -0
  194. clawbench/data/test-cases/821-education-learning-reading-self-study-blinkist/task.json +25 -0
  195. clawbench/data/test-cases/861-entertainment-hobbies-movies-cineplex/task.json +25 -0
  196. clawbench/data/test-cases/862-entertainment-hobbies-movies-amc-theatres/task.json +25 -0
  197. clawbench/data/test-cases/864-entertainment-hobbies-show-tickets-ticketmaster/task.json +25 -0
  198. clawbench/data/test-cases/865-travel-outdoor-hipcamp/task.json +25 -0
  199. clawbench/data/test-cases/867-entertainment-hobbies-movies-fandango/task.json +25 -0
  200. clawbench/data/test-cases/872-daily-life-food-opentable/task.json +25 -0
  201. clawbench/data/test-cases/873-daily-life-food-resy/task.json +28 -0
  202. clawbench/data/test-cases/876-entertainment-hobbies-show-tickets-vivid-seats/task.json +25 -0
  203. clawbench/data/test-cases/877-entertainment-hobbies-show-tickets-stubhub/task.json +25 -0
  204. clawbench/data/test-cases/878-travel-outdoor-ontario-parks/task.json +25 -0
  205. clawbench/data/test-cases/883-education-learning-hobby-class-sur-la-table/task.json +25 -0
  206. clawbench/data/test-cases/884-entertainment-hobbies-experience-breakout-games/task.json +25 -0
  207. clawbench/data/test-cases/885-entertainment-hobbies-experience-bowlero/task.json +25 -0
  208. clawbench/data/test-cases/886-entertainment-hobbies-experience-topgolf/task.json +25 -0
  209. clawbench/data/test-cases/lite.json +226 -0
  210. clawbench/data/test-cases/lite.schema.json +105 -0
  211. clawbench/data/test-cases/task.schema.json +132 -0
  212. clawbench/data/tools/build_clawbench_lite_enc.py +161 -0
  213. clawbench/doctor.py +171 -0
  214. clawbench/engine.py +180 -0
  215. clawbench/generate_resume_pdf.py +140 -0
  216. clawbench/hf_upload.py +78 -0
  217. clawbench/image.py +127 -0
  218. clawbench/paths.py +150 -0
  219. clawbench/resume_template.json +104 -0
  220. clawbench/run.py +942 -0
  221. clawbench/tui.py +1401 -0
  222. clawbench_cli-0.1.2.dist-info/METADATA +770 -0
  223. clawbench_cli-0.1.2.dist-info/RECORD +226 -0
  224. clawbench_cli-0.1.2.dist-info/WHEEL +4 -0
  225. clawbench_cli-0.1.2.dist-info/entry_points.txt +4 -0
  226. clawbench_cli-0.1.2.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,112 @@
#!/bin/bash
# Writes the OpenClaw configuration (~/.openclaw/openclaw.json) and the
# agent auth profiles (~/.openclaw/agents/main/agent/auth-profiles.json)
# from environment variables.
set -e

# All config comes from env vars set by the test driver (sourced from models.yaml).
# BASE_URL, API_TYPE and MODEL_NAME are required. MODEL_NAME is validated too
# because it is used below to build the model id; leaving it empty would
# silently produce a config whose primary model is "api/".
missing=""
for required in BASE_URL API_TYPE MODEL_NAME; do
  # ${!required} is bash indirect expansion: the value of the variable
  # whose name is stored in $required.
  if [ -z "${!required}" ]; then
    missing="$missing $required"
  fi
done
if [ -n "$missing" ]; then
  echo "ERROR: BASE_URL, API_TYPE and MODEL_NAME must be set (missing:$missing)" >&2
  exit 1
fi

PROVIDER="api"
MODEL="$PROVIDER/$MODEL_NAME"
MODEL_ID="$MODEL_NAME"

# Matches every valid JSON number literal (and nothing containing quotes,
# commas or braces), so the values below can be spliced into JSON unquoted.
json_number_re='^-?[0-9]+(\.[0-9]+)?([eE][-+]?[0-9]+)?$'

# require_json_number NAME VALUE -- abort unless VALUE is a numeric literal.
require_json_number() {
  if ! [[ "$2" =~ $json_number_re ]]; then
    echo "ERROR: $1 must be a number, got: $2" >&2
    exit 1
  fi
}

# Build optional model parameters.
# MODEL_OPTS is a JSON fragment (each entry prefixed with ", ") that is
# appended inside the model object of openclaw.json.
MODEL_OPTS=""
if [ -n "$TEMPERATURE" ]; then
  require_json_number TEMPERATURE "$TEMPERATURE"
  MODEL_OPTS="$MODEL_OPTS, \"temperature\": $TEMPERATURE"
fi
if [ -n "$MAX_TOKENS" ]; then
  require_json_number MAX_TOKENS "$MAX_TOKENS"
  MODEL_OPTS="$MODEL_OPTS, \"maxOutputTokens\": $MAX_TOKENS"
fi

mkdir -p ~/.openclaw/agents/main/agent

# Restrict exec to safe read-only commands (allowlist mode).
# The agent cannot run curl, python, node, etc. — only ls/cat/grep and default safe bins.
# NOTE(review): `find` supports -exec/-delete; confirm that OpenClaw's safeBins
# argument policy blocks those before relying on this list being read-only.
#
# The config is serialized with json.dump instead of a shell heredoc so that
# quotes or backslashes in BASE_URL / MODEL_NAME cannot produce invalid JSON.
# Shell variables are passed explicitly through the environment because they
# are not necessarily exported; the quoted heredoc delimiter keeps the shell
# from expanding anything inside the Python source.
PROVIDER="$PROVIDER" MODEL="$MODEL" MODEL_ID="$MODEL_ID" MODEL_OPTS="$MODEL_OPTS" \
BASE_URL="$BASE_URL" API_TYPE="$API_TYPE" \
python3 - << 'PYEOF'
import json
import os

env = os.environ

# MODEL_OPTS is a pre-rendered JSON fragment such as
#   , "temperature": 0.7, "maxOutputTokens": 4096
# Splice it into a minimal object so the JSON parser validates it; a
# malformed fragment raises here instead of yielding a broken config file.
model_entry = {"id": env["MODEL_ID"], "name": env["MODEL_ID"]}
model_entry.update(
    json.loads('{"reasoning": true' + env.get("MODEL_OPTS", "") + "}")
)

config = {
    "gateway": {
        "port": 18789,
        "mode": "local",
    },
    "tools": {
        "exec": {
            "security": "allowlist",
            "safeBins": [
                "ls", "cat", "find", "file", "jq", "cut", "uniq",
                "head", "tail", "tr", "wc", "grep", "sort",
            ],
        },
    },
    "agents": {
        "defaults": {
            "workspace": "/root/workspace",
            "skipBootstrap": True,
            "model": {
                "primary": env["MODEL"],
            },
        },
    },
    "models": {
        "providers": {
            env["PROVIDER"]: {
                "baseUrl": env["BASE_URL"],
                "api": env["API_TYPE"],
                "models": [model_entry],
            },
        },
    },
    "browser": {
        "enabled": True,
        "defaultProfile": "container",
        "profiles": {
            "container": {
                "cdpUrl": "http://127.0.0.1:9222",
                "color": "#FB542B",
            },
        },
    },
}

path = os.path.expanduser("~/.openclaw/openclaw.json")
with open(path, "w") as f:
    json.dump(config, f, indent=2)
    f.write("\n")
PYEOF

# Generate auth-profiles.json with multi-key rotation support.
# Keys come from API_KEYS (a JSON array of strings) and fall back to the
# single API_KEY. PROVIDER is handed over via the environment and the heredoc
# delimiter is quoted, so no shell text is interpolated into the Python source.
PROVIDER="$PROVIDER" python3 - << 'PYEOF'
import json
import os
import sys

provider = os.environ["PROVIDER"]


def load_keys():
    """Return the list of API keys to register, possibly empty.

    API_KEYS is preferred; API_KEY is used when API_KEYS is unset, invalid,
    or contains no usable (non-empty string) entries.
    """
    raw = os.environ.get("API_KEYS", "")
    if raw:
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            print(
                "WARNING: API_KEYS is not valid JSON; falling back to API_KEY",
                file=sys.stderr,
            )
            parsed = []
        if isinstance(parsed, str):
            # A bare JSON string is one key; iterating it would otherwise
            # register every character as a separate key.
            parsed = [parsed]
        elif not isinstance(parsed, list):
            print(
                "WARNING: API_KEYS must be a JSON array of strings; "
                "falling back to API_KEY",
                file=sys.stderr,
            )
            parsed = []
        usable = [k for k in parsed if isinstance(k, str) and k]
        if usable:
            return usable
    single_key = os.environ.get("API_KEY", "")
    return [single_key] if single_key else []


keys = load_keys()

# One profile per key, named "<provider>:api-<n>" with n starting at 1;
# "order" lists the profile names in the same sequence.
profiles = {}
order = []
for i, key in enumerate(keys, 1):
    name = f"{provider}:api-{i}"
    profiles[name] = {
        "provider": provider,
        "type": "api_key",
        "key": key,
    }
    order.append(name)

result = {"profiles": profiles, "order": {provider: order}}

path = os.path.expanduser("~/.openclaw/agents/main/agent/auth-profiles.json")
# Create the file with mode 0600 from the start so the keys are never
# readable by other users, not even briefly under a permissive umask.
fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(fd, "w") as f:
    json.dump(result, f, indent=2)
# The mode passed to os.open only applies on creation; tighten a
# pre-existing file as well.
os.chmod(path, 0o600)

print(f"Auth profiles: {len(keys)} API key(s) for {provider}")
PYEOF
@@ -0,0 +1,95 @@
1
+ # ClawBench Evaluation
2
+
3
+ ClawBench evaluation is a **post-session** step: first you run agents to collect trajectories, then you evaluate those trajectories against human reference runs.
4
+
5
+ ```
6
+ Step 1: Run agents Step 2: Evaluate
7
+ (test-driver) (this directory)
8
+
9
+ ./run.sh Claude Code subagents compare
10
+ or agent traces vs human references
11
+ test-driver/batch.py under eval/agentic_eval.md rubric
12
+ │ │
13
+ ▼ ▼
14
+ test-output/ {model}-eval-results.csv
15
+ {model}/{run}/ {model}-eval-results.json
16
+ data/
17
+ actions.jsonl
18
+ requests.jsonl
19
+ screenshots/
20
+ recording.mp4
21
+ interception.json
22
+ agent-messages.jsonl
23
+ ```
24
+
25
+ ## How It Works
26
+
27
+ The evaluator is a Claude Code subagent that compares two trajectories side by side:
28
+
29
+ - **Agent trajectory** -- the five-layer recording from the AI agent's run
30
+ - **Human reference trajectory** -- the same layers (except agent messages, which only agent runs produce) recorded by a human annotator completing the task correctly
31
+
32
+ The evaluator follows a fixed rubric ([`agentic_eval.md`](agentic_eval.md)) to determine PASS or FAIL for each task. This comparative approach means the evaluator has a concrete ground truth -- it knows exactly which form fields to fill, which buttons to click, and which endpoint the final submission hits.
33
+
34
+ ## Prerequisites
35
+
36
+ - Agent run outputs in `test-output/{model}/` (produced by `test-driver/run.py` or `batch.py`)
37
+ - Human reference runs in a separate directory (same format as agent runs, minus `data/agent-messages.jsonl`)
38
+ - [Claude Code](https://docs.anthropic.com/en/docs/claude-code) installed
39
+
40
+ ## Running Evaluation
41
+
42
+ Open Claude Code at the project root and send the following prompt. Replace the three placeholders with your actual values:
43
+
44
+ - `{agent_dir}` -- path to the model's output directory (e.g., `test-output/claude-sonnet-4-6/`)
45
+ - `{human_dir}` -- path to the human reference directory (e.g., `test-output/human/`)
46
+ - `{model}` -- model name for output file naming (e.g., `claude-sonnet-4-6`)
47
+
48
+ ```
49
+ Read the evaluation rubric at eval/agentic_eval.md and follow it strictly.
50
+
51
+ Evaluate all 153 agent runs against their corresponding human reference runs.
52
+
53
+ Agent runs directory: {agent_dir}
54
+ Human reference directory: {human_dir}
55
+
56
+ Each directory contains multiple run subdirectories (one per task). Each run subdirectory contains:
57
+ - run-meta.json
58
+ - data/actions.jsonl
59
+ - data/requests.jsonl
60
+ - data/screenshots/
61
+ - data/recording.mp4
62
+ - data/interception.json
63
+ Agent runs also contain data/agent-messages.jsonl.
64
+
65
+ Dispatch 16 subagents to evaluate in parallel, each subagent handling ~10 tasks. Each subagent should:
66
+ 1. Match agent run to human run by task_id in run-meta.json
67
+ 2. Read both run-meta.json to get task instruction and context
68
+ 3. Compare the agent trajectory against the human reference trajectory
69
+ 4. Determine PASS or FAIL with justification, noting which evidence files and lines support the decision
70
+
71
+ Dispatch 3 supervisor agents to monitor the work of the 16 evaluation subagents, checking for consistency and correctness.
72
+
73
+ After all subagents complete, merge their results and output two files:
74
+ 1. {model}-eval-results.csv — columns: task_id, task_name, model, pass, brief_justification
75
+ 2. {model}-eval-results.json — detailed results per task, each entry including: task_id, task_name, model, pass, justification, and evidence references (file path and line numbers that support the verdict)
76
+ ```
77
+
78
+ ## Output
79
+
80
+ The evaluation produces two files at the project root:
81
+
82
+ | File | Format | Description |
83
+ |------|--------|-------------|
84
+ | `{model}-eval-results.csv` | CSV | Quick summary -- one row per task with PASS/FAIL and a brief justification |
85
+ | `{model}-eval-results.json` | JSON | Detailed results with full justification and evidence references (file paths + line numbers) |
86
+
87
+ ## Evaluation Rubric
88
+
89
+ The full rubric is in [`agentic_eval.md`](agentic_eval.md). Key rules:
90
+
91
+ - **Interceptor block = PASS** if all prior steps are correct (the interceptor is designed to cut the session short)
92
+ - **Payment must be attempted** -- the agent has a dummy credit card and must try to use it
93
+ - **Phone verification wall = PASS** if all prior steps are complete (the agent has no phone number)
94
+ - **CAPTCHA must be solved** -- skipping a CAPTCHA, or attempting it but failing to solve it, is FAIL
95
+ - **Email must be used** when the task requires registration or verification
@@ -0,0 +1,53 @@
1
+ # ClawBench Agentic Evaluator Rubric
2
+
3
+ This is the evaluation rubric used by the Claude Code evaluator subagent. It defines the PASS/FAIL criteria for judging whether an AI agent successfully completed a ClawBench task.
4
+
5
+ ## Role
6
+
7
+ You are an evaluator for ClawBench, a real-world browser agent benchmark. You will determine whether an AI browser agent successfully completed a given web task by comparing the agent's trajectory against a human reference trajectory and examining all available evidence.
8
+
9
+ ## Evidence
10
+
11
+ Each run directory contains five layers of synchronized data:
12
+
13
+ | Layer | File | What it tells you |
14
+ |-------|------|-------------------|
15
+ | Session replay | `data/recording.mp4` | What the agent **saw** |
16
+ | Action screenshots | `data/screenshots/*.png` | Step-by-step visual state |
17
+ | HTTP traffic | `data/requests.jsonl` | What network requests the agent **triggered** |
18
+ | Browser actions | `data/actions.jsonl` | What the agent **did** (clicks, keystrokes, scrolls) |
19
+ | Agent messages | `data/agent-messages.jsonl` | What the agent **thought** (reasoning, tool calls) |
20
+
21
+ The human reference run has the same layers (except agent messages), providing a concrete ground truth for what correct task completion looks like.
22
+
23
+ Additional metadata:
24
+ - `run-meta.json` -- run metadata including `task_id`, model, duration, and interception status
25
+ - `data/interception.json` -- whether the request interceptor blocked the final action
26
+
27
+ ## PASS/FAIL Criteria
28
+
29
+ 1. **General rule.** If the agent completed the task as specified in the instruction, PASS. Otherwise, FAIL.
30
+
31
+ 2. **Email usage.** The agent has access to a disposable email and should use it to receive verification codes or register for accounts when needed. If the agent did not use the email when the task required it, FAIL.
32
+
33
+ 3. **Payment attempts.** The agent has access to a dummy credit card. If the task requires payment, the agent must attempt to fill in payment details and click the pay/checkout button. The payment will be declined, but the agent must attempt it. If the agent skipped the payment step, FAIL.
34
+
35
+ 4. **Checkout completion.** When asked to "proceed to checkout", the agent must attempt to fill out the checkout form and click the submit/checkout button. Simply adding items to the cart is not sufficient; if the agent stops there, FAIL.
36
+
37
+ 5. **Interceptor blocks.** The testing environment has a request interceptor that blocks irreversible actions (submitting applications, sending emails, etc.). The result is recorded in `data/interception.json`. If the interceptor blocked the final action **and** all prior inputs and steps are correct, PASS.
38
+
39
+ 6. **Phone verification.** The agent has an email but no phone number. If a task requires phone verification, the agent should complete all steps up to the phone verification wall. If all prior steps are correct, PASS -- even if the agent could not proceed past phone verification.
40
+
41
+ 7. **Phone number field awareness.** If the UI shows a required phone number field and the agent recognized this, it is acceptable for the agent to not click submit. If all other inputs are correct, PASS.
42
+
43
+ 8. **CAPTCHA.** If the agent encounters a CAPTCHA, it must attempt to solve it. If it cannot solve it, FAIL.
44
+
45
+ 9. **Interceptor and page state.** If the agent performed all actions correctly but the page does not display a success/result page because the interceptor blocked the final request, PASS -- the interceptor is expected to cut the session short.
46
+
47
+ ## Judgment Format
48
+
49
+ For each task, output:
50
+ - **task_id** -- from `run-meta.json`
51
+ - **pass** -- `true` or `false`
52
+ - **justification** -- brief explanation of the verdict
53
+ - **evidence** -- specific file paths and line numbers that support the decision
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,54 @@
1
+ # ClawBench Extension Server
2
+
3
+ The ClawBench Extension Server is a Python backend server that receives data from the ClawBench Chrome Extension and processes it for benchmarking purposes. It is responsible for:
4
+
5
+ - Organizing and storing the data received from the extension in a structured format.
6
+ - Receiving user actions and storing them in a jsonl format.
7
+ - Receiving screenshots and storing them in a dedicated folder.
8
+ - Recording the virtual display with ffmpeg and finalizing the .mp4 file when the recording is stopped.
9
+
10
+ The implementation is minimal, with only the necessary level of complexity and customization.
11
+
12
+ ## Implementation
13
+
14
+ Single file: `server.py` — a FastAPI application run with uvicorn.
15
+
16
+ ### Endpoints
17
+
18
+ | Method | Path | Content-Type | Description |
19
+ |--------|------|-------------|-------------|
20
+ | GET | `/api/status` | — | Returns `{"status": "ok", "eval_interceptor_ready": <bool>}` |
21
+ | POST | `/api/action` | application/json | Appends action JSON to `actions.jsonl` |
22
+ | POST | `/api/screenshot` | application/json | Decodes base64 PNG from `{"timestamp", "data"}`, saves to `screenshots/{timestamp}.png` |
23
+ | POST | `/api/stop` | — | Signals session stop, returns session summary |
24
+ | POST | `/api/stop-recording` | — | Stops ffmpeg recording, finalizes MP4 |
25
+
26
+ ### Screen Recording
27
+
28
+ The server starts an ffmpeg process on startup that records the Xvfb virtual display (`DISPLAY=:99`) to `/data/recording.mp4` using H.264 at 15fps. On `/api/stop-recording`, the ffmpeg process is gracefully terminated with SIGINT to finalize the MP4 file. The `/api/stop` endpoint handles session bookkeeping (eval promotion, watchdog signaling) without stopping the recording, allowing a grace period to capture the final state.
29
+
30
+ ### Data Storage
31
+
32
+ All data is written to the directory specified by `CLAWBENCH_DATA_DIR` (default: `/data`):
33
+
34
+ ```
35
+ /data/
36
+ actions.jsonl # Append-only, one JSON object per line
37
+ screenshots/ # {timestamp}.png files
38
+ recording.mp4 # H.264 screen recording
39
+ ```
40
+
41
+ ### Running Locally
42
+
43
+ ```bash
44
+ cd extension-server
45
+ CLAWBENCH_DATA_DIR=./data DISPLAY=:99 uv run uvicorn server:app --host 0.0.0.0 --port 7878
46
+ ```
47
+
48
+ ### Dependencies
49
+
50
+ Defined in `pyproject.toml`:
51
+ - `fastapi[standard]` — web framework + uvicorn
52
+ - `websocket-client` — WebSocket client for CDP communication
53
+
54
+ System dependency: `ffmpeg` (for screen recording and MP4 encoding).
@@ -0,0 +1,7 @@
1
+ [project]
2
+ name = "extension-server"
3
+ version = "0.1.0"
4
+ description = "ClawBench extension server"
5
+ readme = "README.md"
6
+ requires-python = "==3.12.*"
7
+ dependencies = ["fastapi[standard]>=0.115", "websocket-client>=1.8"]
@@ -0,0 +1,360 @@
1
+ import base64
2
+ import json
3
+ import os
4
+ import re
5
+ import signal
6
+ import subprocess
7
+ import threading
8
+ import time
9
+ from contextlib import asynccontextmanager
10
+ from pathlib import Path
11
+ from urllib.parse import parse_qs, urlparse
12
+ import urllib.request
13
+
14
+ import websocket
15
+ from fastapi import FastAPI
16
+
17
# Root directory for every artifact this server writes; overridable through
# the CLAWBENCH_DATA_DIR environment variable.
DATA_DIR = Path(os.getenv("CLAWBENCH_DATA_DIR", "/data"))

# Per-session artifacts stored under DATA_DIR.
ACTIONS_FILE = DATA_DIR.joinpath("actions.jsonl")
SCREENSHOTS_DIR = DATA_DIR.joinpath("screenshots")
RECORDING_PATH = DATA_DIR.joinpath("recording.mp4")
REQUESTS_FILE = DATA_DIR.joinpath("requests.jsonl")
INTERCEPTION_FILE = DATA_DIR.joinpath("interception.json")

# Optional interception schema, read at startup when the file exists.
EVAL_SCHEMA_PATH = Path("/eval-schema.json")

# HTTP endpoint of the Chrome DevTools Protocol server.
CDP_URL = "http://127.0.0.1:9222"

# Mutable module state shared between the lifespan hook, the CDP thread and
# the HTTP endpoints.
ffmpeg_proc = None              # handle of the running ffmpeg recorder
eval_schema = None              # parsed contents of EVAL_SCHEMA_PATH
eval_interceptor_ready = False  # set once the interceptor is connected
30
+
31
+
32
+ def _const_fields_match(expected, actual):
33
+ """Check that all key-value pairs in expected match in actual data.
34
+ For list bodies (batched GraphQL), returns True if any item matches.
35
+ Returns True if all match or expected is empty/None."""
36
+ if not expected:
37
+ return True
38
+ if not actual:
39
+ return False
40
+ if isinstance(actual, list):
41
+ return any(_const_fields_match(expected, item) for item in actual)
42
+ if not isinstance(actual, dict):
43
+ return False
44
+ return all(actual.get(k) == v for k, v in expected.items())
45
+
46
+
47
# URL prefixes that are never written to the request log: this server's own
# API and browser-internal schemes.
FILTERED_PREFIXES = (
    "http://localhost:7878",
    "http://127.0.0.1:7878",
    "chrome-extension://",
    "devtools://",
    "chrome://",
)
51
+
52
+
53
+ def _parse_body(post_data):
54
+ """Parse postData string into a structured body (JSON dict, form dict, or raw string)."""
55
+ if not post_data:
56
+ return None
57
+ try:
58
+ return json.loads(post_data)
59
+ except (json.JSONDecodeError, TypeError):
60
+ try:
61
+ parsed = parse_qs(post_data, keep_blank_values=True)
62
+ if parsed:
63
+ return {k: v[0] if len(v) == 1 else v for k, v in parsed.items()}
64
+ except Exception:
65
+ pass
66
+ return post_data
67
+
68
+
69
def _log_request(log_file, params):
    """Append one ``Fetch.requestPaused`` event to the request log.

    Args:
        log_file: Writable text file object; one JSON line is written and the
            file is flushed immediately.
        params: The ``params`` object of the CDP event; must contain
            ``request`` with at least ``url`` and ``method``.

    Requests whose URL starts with one of ``FILTERED_PREFIXES`` are skipped.
    Always returns ``None``.
    """
    req = params["request"]
    url = req["url"]

    # str.startswith accepts the whole tuple of prefixes at once.
    if url.startswith(FILTERED_PREFIXES):
        return

    # Collapse single-valued query parameters from ["v"] to "v".
    query_params = {}
    for name, values in parse_qs(urlparse(url).query).items():
        query_params[name] = values[0] if len(values) == 1 else values

    record = {
        "timestamp": time.time(),
        "url": url,
        "method": req["method"],
        "headers": req.get("headers", {}),
        "body": _parse_body(req.get("postData")),
        "query_params": query_params,
        "resource_type": params.get("resourceType", "Other"),
    }
    log_file.write(f"{json.dumps(record)}\n")
    log_file.flush()
92
+
93
+
94
def start_cdp_handler(url_pattern=None, required_method=None,
                      match_body=None, match_params=None):
    """Connect to Chrome via CDP, log all requests, and optionally block one request shape.

    The function blocks until the CDP WebSocket closes, so it is meant to run
    in its own thread.

    Args:
        url_pattern: Regular expression searched in each request URL. When
            falsy, requests are only logged and always continued.
        required_method: If set, only requests with exactly this HTTP method
            can be blocked.
        match_body: Key/value pairs that must all be present in the parsed
            request body (see ``_const_fields_match``).
        match_params: Key/value pairs that must all be present in the URL
            query parameters.

    Side effects:
        Appends to ``REQUESTS_FILE``; sets the module-level
        ``eval_interceptor_ready`` flag when ``url_pattern`` is given; on a
        blocked request, writes ``INTERCEPTION_FILE`` (only if it does not
        exist yet) and POSTs to this server's ``/api/stop``.
    """
    global eval_interceptor_ready

    # Compile once instead of handing the pattern string to re.search for
    # every paused request.
    url_regex = re.compile(url_pattern) if url_pattern else None

    # Wait for Chrome CDP to be ready (up to 30 attempts, one second apart).
    # The timeout keeps a hung endpoint from stalling the thread forever and
    # the context manager closes the HTTP response.
    ws_url = None
    for _ in range(30):
        try:
            with urllib.request.urlopen(
                    f"{CDP_URL}/json/version", timeout=5) as resp:
                ws_url = json.loads(resp.read())["webSocketDebuggerUrl"]
            break
        except Exception:
            time.sleep(1)
    if not ws_url:
        print("[cdp] CDP not available, skipping handler", flush=True)
        return

    ws = websocket.create_connection(ws_url)
    msg_id = 1

    def send(method, params=None, session_id=None):
        """Send one CDP command, optionally addressed to a child session."""
        nonlocal msg_id
        msg = {"id": msg_id, "method": method, "params": params or {}}
        if session_id:
            msg["sessionId"] = session_id
        ws.send(json.dumps(msg))
        msg_id += 1

    def continue_request(request_id, session_id):
        """Let a paused request proceed unmodified."""
        send("Fetch.continueRequest", {"requestId": request_id}, session_id)

    # Auto-attach to all targets with flatten so events come on this connection.
    # waitForDebuggerOnStart=True pauses new targets until we explicitly resume
    # them, which prevents the "Debugger paused in another tab" Chrome banner
    # and ensures no requests slip through before Fetch.enable is active.
    send("Target.setAutoAttach", {
        "autoAttach": True,
        "waitForDebuggerOnStart": True,
        "flatten": True,
    })

    if url_pattern:
        eval_interceptor_ready = True
        print(f"[cdp] Interceptor connected, watching for: {url_pattern}", flush=True)
    else:
        print("[cdp] Request logger connected (no intercept pattern)", flush=True)

    # Track sessions where Fetch is enabled, and map sessions to target IDs
    # so we can bring the correct tab to front when it receives activity.
    fetch_sessions = set()
    session_to_target = {}  # sessionId -> targetId
    active_target = None    # targetId most recently brought to front
    log_file = open(REQUESTS_FILE, "a")

    try:
        while True:
            try:
                raw = ws.recv()
            except Exception:
                break
            # websocket-client hands back an empty payload for a close frame;
            # treat it as end of stream instead of feeding it to json.loads,
            # which would raise and kill the thread with a traceback.
            if not raw:
                break
            msg = json.loads(raw)
            session_id = msg.get("sessionId")
            event = msg.get("method")

            # When a new target attaches, enable Fetch then resume execution.
            # Because waitForDebuggerOnStart=True, the target is paused until
            # we call Runtime.runIfWaitingForDebugger.
            if event == "Target.attachedToTarget":
                child_session = msg["params"]["sessionId"]
                target_info = msg["params"]["targetInfo"]
                target_type = target_info["type"]
                target_id = target_info["targetId"]
                if target_type == "page":
                    session_to_target[child_session] = target_id
                if child_session not in fetch_sessions:
                    send("Fetch.enable", {
                        "patterns": [{"urlPattern": "*", "requestStage": "Request"}],
                    }, child_session)
                    fetch_sessions.add(child_session)
                    print(
                        f"[cdp] Fetch enabled on session {child_session[:12]}...", flush=True)
                # Always resume the target so it doesn't stay paused
                send("Runtime.runIfWaitingForDebugger", {}, child_session)
                continue

            if event != "Fetch.requestPaused":
                # Command responses carry an "id"; surface the failed ones.
                if "error" in msg and msg.get("id"):
                    print(f"[cdp] CDP error: {msg['error']}", flush=True)
                continue

            params = msg["params"]
            request = params["request"]
            request_url = request["url"]
            request_id = params["requestId"]

            # Auto-focus: when a page navigation (Document request) happens on a
            # background tab, bring that tab to front so the screen recording and
            # screenshots always show the tab the agent is working on.
            resource_type = params.get("resourceType", "")
            if resource_type == "Document" and session_id:
                target_id = session_to_target.get(session_id)
                if target_id and target_id != active_target:
                    send("Target.activateTarget", {"targetId": target_id})
                    active_target = target_id
                    print(f"[cdp] Auto-focused tab {target_id[:12]}... (Document request)", flush=True)

            # Log every non-internal request
            _log_request(log_file, params)

            # No intercept pattern, URL mismatch or method mismatch: let the
            # request through.
            if (url_regex is None
                    or not url_regex.search(request_url)
                    or (required_method
                        and request["method"] != required_method)):
                continue_request(request_id, session_id)
                continue

            # Parse request data for body/params matching
            parsed = urlparse(request_url)
            query_params = {k: v[0] if len(v) == 1 else v
                            for k, v in parse_qs(parsed.query).items()}
            body = _parse_body(request.get("postData"))

            if not (_const_fields_match(match_body, body)
                    and _const_fields_match(match_params, query_params)):
                continue_request(request_id, session_id)
                continue

            # All filters matched — block the request
            request_obj = {
                "url": request_url,
                "method": request["method"],
                "params": query_params,
                "body": body,
            }

            print(f"[interceptor] Blocked: {request_url[:100]}", flush=True)

            send("Fetch.failRequest", {
                "requestId": request_id, "errorReason": "BlockedByClient"}, session_id)

            # Record only the first interception and ask the server to stop
            # the session.
            if not INTERCEPTION_FILE.exists():
                result = {"intercepted": True, "request": request_obj,
                          "schema": eval_schema}
                INTERCEPTION_FILE.write_text(json.dumps(result, indent=2))
                try:
                    stop_request = urllib.request.Request(
                        "http://127.0.0.1:7878/api/stop", method="POST")
                    with urllib.request.urlopen(stop_request, timeout=10):
                        pass
                except Exception as exc:
                    # Best effort: the interception is already on disk, so
                    # report the failure instead of crashing the handler.
                    print(f"[interceptor] Stop request failed: {exc}", flush=True)
    finally:
        log_file.close()
        ws.close()
259
+
260
+
261
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Set up and tear down the server's session resources.

    Startup: creates the screenshots directory and the (possibly empty)
    action/request log files, loads the optional eval schema from
    ``EVAL_SCHEMA_PATH``, starts an ffmpeg screen recording of the X display
    named by ``DISPLAY`` (default ``:99``), and launches the CDP handler in a
    daemon thread.

    Shutdown: interrupts ffmpeg with SIGINT, and kills it if it has not exited
    within five seconds.
    """
    global ffmpeg_proc, eval_schema
    SCREENSHOTS_DIR.mkdir(parents=True, exist_ok=True)
    ACTIONS_FILE.touch(exist_ok=True)
    REQUESTS_FILE.touch(exist_ok=True)

    url_pattern = None
    required_method = None
    match_body = None
    match_params = None
    if EVAL_SCHEMA_PATH.exists():
        eval_schema = json.loads(EVAL_SCHEMA_PATH.read_text())
        # A missing or empty pattern means "log only, never intercept".
        url_pattern = eval_schema.get("url_pattern") or None
        required_method = eval_schema.get("method")
        match_body = eval_schema.get("body")
        match_params = eval_schema.get("params")

    # Start screen recording of the Xvfb display
    display = os.environ.get("DISPLAY", ":99")
    ffmpeg_proc = subprocess.Popen(
        [
            "ffmpeg", "-y",
            "-f", "x11grab",
            "-video_size", "1920x1080",
            "-framerate", "15",
            "-i", display,
            "-c:v", "libx264",
            "-preset", "ultrafast",
            "-crf", "28",
            str(RECORDING_PATH),
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    # Start CDP handler: always logs requests, optionally blocks by URL pattern + method + body/params
    threading.Thread(target=start_cdp_handler,
                     args=(url_pattern, required_method, match_body, match_params),
                     daemon=True).start()

    try:
        yield
    finally:
        # Runs even when the application context exits with an error, so the
        # recorder is not left behind.
        if ffmpeg_proc and ffmpeg_proc.poll() is None:
            ffmpeg_proc.send_signal(signal.SIGINT)
            try:
                ffmpeg_proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                # ffmpeg ignored the interrupt; force it down rather than
                # raising out of shutdown.
                ffmpeg_proc.kill()
                ffmpeg_proc.wait()
310
+
311
+ app = FastAPI(lifespan=lifespan)
312
+
313
+
314
@app.get("/api/status")
async def status():
    """Health check that also reports the module-level interceptor-ready flag."""
    payload = {"status": "ok"}
    payload["eval_interceptor_ready"] = eval_interceptor_ready
    return payload
317
+
318
+
319
@app.post("/api/action")
async def action(data: dict):
    """Append the posted action as one JSON line to the actions log."""
    with open(ACTIONS_FILE, "a") as sink:
        # print() supplies the trailing newline that terminates the record.
        print(json.dumps(data), file=sink)
    return {"status": "ok"}
324
+
325
+
326
@app.post("/api/screenshot")
async def screenshot(data: dict):
    """Decode a base64 image and store it as ``screenshots/{timestamp}.png``.

    Args:
        data: JSON body with ``data`` (base64-encoded image bytes, required)
            and ``timestamp`` (used as the file name stem, default ``0``).

    Raises:
        KeyError: If ``data`` has no ``"data"`` field.
    """
    ts = data.get("timestamp", 0)
    img_bytes = base64.b64decode(data["data"])
    # The timestamp comes from the request body. Keep only the final path
    # component so a value like "../../x" cannot write outside
    # SCREENSHOTS_DIR.
    filename = Path(f"{ts}.png").name
    (SCREENSHOTS_DIR / filename).write_bytes(img_bytes)
    return {"status": "ok"}
332
+
333
+
334
def _count_lines(path):
    """Return the number of lines in ``path``, or 0 if the file does not exist."""
    try:
        # Binary mode: only newlines are counted, no text decoding needed.
        with open(path, "rb") as f:
            return sum(1 for _ in f)
    except FileNotFoundError:
        return 0


@app.post("/api/stop")
async def stop():
    """Flag the session as stopped and return a summary of what was captured.

    Creates ``DATA_DIR/.stop-requested`` and reports how many actions,
    screenshots and requests were recorded and whether a recording file
    exists. Missing log files count as zero instead of raising; previously the
    files were opened before their existence was checked.
    """
    # Signal the entrypoint watchdog to kill the agent
    (DATA_DIR / ".stop-requested").touch()

    actions_count = _count_lines(ACTIONS_FILE)
    screenshots_count = len(list(SCREENSHOTS_DIR.glob("*.png")))
    requests_count = _count_lines(REQUESTS_FILE)

    return {
        "status": "stopped",
        "actions_count": actions_count,
        "screenshots_count": screenshots_count,
        "requests_count": requests_count,
        "has_recording": RECORDING_PATH.exists(),
    }
352
+
353
+
354
@app.post("/api/stop-recording")
async def stop_recording():
    """Stop the ffmpeg recorder and report whether a recording file exists.

    Sends SIGINT to a still-running ffmpeg process and waits up to ten seconds
    for it to exit. If it does not, the process is killed so the request still
    completes and no recorder is left running.
    """
    global ffmpeg_proc
    if ffmpeg_proc and ffmpeg_proc.poll() is None:
        ffmpeg_proc.send_signal(signal.SIGINT)
        try:
            ffmpeg_proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            # ffmpeg did not exit after the interrupt; force termination.
            ffmpeg_proc.kill()
            ffmpeg_proc.wait()
    return {"status": "recording_stopped", "has_recording": RECORDING_PATH.exists()}