ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (535) hide show
  1. agentui/.prettierrc +15 -0
  2. agentui/QUICKSTART.md +272 -0
  3. agentui/README.md +59 -0
  4. agentui/env.example +16 -0
  5. agentui/jsconfig.json +14 -0
  6. agentui/package-lock.json +4242 -0
  7. agentui/package.json +34 -0
  8. agentui/scripts/postinstall/apply-patches.mjs +260 -0
  9. agentui/src/app.css +61 -0
  10. agentui/src/app.d.ts +13 -0
  11. agentui/src/app.html +12 -0
  12. agentui/src/components/LoadingSpinner.svelte +64 -0
  13. agentui/src/components/ThemeSwitcher.svelte +159 -0
  14. agentui/src/components/index.js +4 -0
  15. agentui/src/lib/api/bots.ts +60 -0
  16. agentui/src/lib/api/chat.ts +22 -0
  17. agentui/src/lib/api/http.ts +25 -0
  18. agentui/src/lib/components/BotCard.svelte +33 -0
  19. agentui/src/lib/components/ChatBubble.svelte +63 -0
  20. agentui/src/lib/components/Toast.svelte +21 -0
  21. agentui/src/lib/config.ts +20 -0
  22. agentui/src/lib/stores/auth.svelte.ts +73 -0
  23. agentui/src/lib/stores/theme.svelte.js +64 -0
  24. agentui/src/lib/stores/toast.svelte.ts +31 -0
  25. agentui/src/lib/utils/conversation.ts +39 -0
  26. agentui/src/routes/+layout.svelte +20 -0
  27. agentui/src/routes/+page.svelte +232 -0
  28. agentui/src/routes/login/+page.svelte +200 -0
  29. agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
  30. agentui/src/routes/talk/[agentId]/+page.ts +7 -0
  31. agentui/static/README.md +1 -0
  32. agentui/svelte.config.js +11 -0
  33. agentui/tailwind.config.ts +53 -0
  34. agentui/tsconfig.json +3 -0
  35. agentui/vite.config.ts +10 -0
  36. ai_parrot-0.17.2.dist-info/METADATA +472 -0
  37. ai_parrot-0.17.2.dist-info/RECORD +535 -0
  38. ai_parrot-0.17.2.dist-info/WHEEL +6 -0
  39. ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
  40. ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
  41. ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
  42. crew-builder/.prettierrc +15 -0
  43. crew-builder/QUICKSTART.md +259 -0
  44. crew-builder/README.md +113 -0
  45. crew-builder/env.example +17 -0
  46. crew-builder/jsconfig.json +14 -0
  47. crew-builder/package-lock.json +4182 -0
  48. crew-builder/package.json +37 -0
  49. crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
  50. crew-builder/src/app.css +62 -0
  51. crew-builder/src/app.d.ts +13 -0
  52. crew-builder/src/app.html +12 -0
  53. crew-builder/src/components/LoadingSpinner.svelte +64 -0
  54. crew-builder/src/components/ThemeSwitcher.svelte +149 -0
  55. crew-builder/src/components/index.js +9 -0
  56. crew-builder/src/lib/api/bots.ts +60 -0
  57. crew-builder/src/lib/api/chat.ts +80 -0
  58. crew-builder/src/lib/api/client.ts +56 -0
  59. crew-builder/src/lib/api/crew/crew.ts +136 -0
  60. crew-builder/src/lib/api/index.ts +5 -0
  61. crew-builder/src/lib/api/o365/auth.ts +65 -0
  62. crew-builder/src/lib/auth/auth.ts +54 -0
  63. crew-builder/src/lib/components/AgentNode.svelte +43 -0
  64. crew-builder/src/lib/components/BotCard.svelte +33 -0
  65. crew-builder/src/lib/components/ChatBubble.svelte +67 -0
  66. crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
  67. crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
  68. crew-builder/src/lib/components/JsonViewer.svelte +24 -0
  69. crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
  70. crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
  71. crew-builder/src/lib/components/Toast.svelte +67 -0
  72. crew-builder/src/lib/components/Toolbar.svelte +157 -0
  73. crew-builder/src/lib/components/index.ts +10 -0
  74. crew-builder/src/lib/config.ts +8 -0
  75. crew-builder/src/lib/stores/auth.svelte.ts +228 -0
  76. crew-builder/src/lib/stores/crewStore.ts +369 -0
  77. crew-builder/src/lib/stores/theme.svelte.js +145 -0
  78. crew-builder/src/lib/stores/toast.svelte.ts +69 -0
  79. crew-builder/src/lib/utils/conversation.ts +39 -0
  80. crew-builder/src/lib/utils/markdown.ts +122 -0
  81. crew-builder/src/lib/utils/talkHistory.ts +47 -0
  82. crew-builder/src/routes/+layout.svelte +20 -0
  83. crew-builder/src/routes/+page.svelte +539 -0
  84. crew-builder/src/routes/agents/+page.svelte +247 -0
  85. crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
  86. crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
  87. crew-builder/src/routes/builder/+page.svelte +204 -0
  88. crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
  89. crew-builder/src/routes/crew/ask/+page.ts +1 -0
  90. crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
  91. crew-builder/src/routes/login/+page.svelte +197 -0
  92. crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
  93. crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
  94. crew-builder/static/README.md +1 -0
  95. crew-builder/svelte.config.js +11 -0
  96. crew-builder/tailwind.config.ts +53 -0
  97. crew-builder/tsconfig.json +3 -0
  98. crew-builder/vite.config.ts +10 -0
  99. mcp_servers/calculator_server.py +309 -0
  100. parrot/__init__.py +27 -0
  101. parrot/__pycache__/__init__.cpython-310.pyc +0 -0
  102. parrot/__pycache__/version.cpython-310.pyc +0 -0
  103. parrot/_version.py +34 -0
  104. parrot/a2a/__init__.py +48 -0
  105. parrot/a2a/client.py +658 -0
  106. parrot/a2a/discovery.py +89 -0
  107. parrot/a2a/mixin.py +257 -0
  108. parrot/a2a/models.py +376 -0
  109. parrot/a2a/server.py +770 -0
  110. parrot/agents/__init__.py +29 -0
  111. parrot/bots/__init__.py +12 -0
  112. parrot/bots/a2a_agent.py +19 -0
  113. parrot/bots/abstract.py +3139 -0
  114. parrot/bots/agent.py +1129 -0
  115. parrot/bots/basic.py +9 -0
  116. parrot/bots/chatbot.py +669 -0
  117. parrot/bots/data.py +1618 -0
  118. parrot/bots/database/__init__.py +5 -0
  119. parrot/bots/database/abstract.py +3071 -0
  120. parrot/bots/database/cache.py +286 -0
  121. parrot/bots/database/models.py +468 -0
  122. parrot/bots/database/prompts.py +154 -0
  123. parrot/bots/database/retries.py +98 -0
  124. parrot/bots/database/router.py +269 -0
  125. parrot/bots/database/sql.py +41 -0
  126. parrot/bots/db/__init__.py +6 -0
  127. parrot/bots/db/abstract.py +556 -0
  128. parrot/bots/db/bigquery.py +602 -0
  129. parrot/bots/db/cache.py +85 -0
  130. parrot/bots/db/documentdb.py +668 -0
  131. parrot/bots/db/elastic.py +1014 -0
  132. parrot/bots/db/influx.py +898 -0
  133. parrot/bots/db/mock.py +96 -0
  134. parrot/bots/db/multi.py +783 -0
  135. parrot/bots/db/prompts.py +185 -0
  136. parrot/bots/db/sql.py +1255 -0
  137. parrot/bots/db/tools.py +212 -0
  138. parrot/bots/document.py +680 -0
  139. parrot/bots/hrbot.py +15 -0
  140. parrot/bots/kb.py +170 -0
  141. parrot/bots/mcp.py +36 -0
  142. parrot/bots/orchestration/README.md +463 -0
  143. parrot/bots/orchestration/__init__.py +1 -0
  144. parrot/bots/orchestration/agent.py +155 -0
  145. parrot/bots/orchestration/crew.py +3330 -0
  146. parrot/bots/orchestration/fsm.py +1179 -0
  147. parrot/bots/orchestration/hr.py +434 -0
  148. parrot/bots/orchestration/storage/__init__.py +4 -0
  149. parrot/bots/orchestration/storage/memory.py +100 -0
  150. parrot/bots/orchestration/storage/mixin.py +119 -0
  151. parrot/bots/orchestration/verify.py +202 -0
  152. parrot/bots/product.py +204 -0
  153. parrot/bots/prompts/__init__.py +96 -0
  154. parrot/bots/prompts/agents.py +155 -0
  155. parrot/bots/prompts/data.py +216 -0
  156. parrot/bots/prompts/output_generation.py +8 -0
  157. parrot/bots/scraper/__init__.py +3 -0
  158. parrot/bots/scraper/models.py +122 -0
  159. parrot/bots/scraper/scraper.py +1173 -0
  160. parrot/bots/scraper/templates.py +115 -0
  161. parrot/bots/stores/__init__.py +5 -0
  162. parrot/bots/stores/local.py +172 -0
  163. parrot/bots/webdev.py +81 -0
  164. parrot/cli.py +17 -0
  165. parrot/clients/__init__.py +16 -0
  166. parrot/clients/base.py +1491 -0
  167. parrot/clients/claude.py +1191 -0
  168. parrot/clients/factory.py +129 -0
  169. parrot/clients/google.py +4567 -0
  170. parrot/clients/gpt.py +1975 -0
  171. parrot/clients/grok.py +432 -0
  172. parrot/clients/groq.py +986 -0
  173. parrot/clients/hf.py +582 -0
  174. parrot/clients/models.py +18 -0
  175. parrot/conf.py +395 -0
  176. parrot/embeddings/__init__.py +9 -0
  177. parrot/embeddings/base.py +157 -0
  178. parrot/embeddings/google.py +98 -0
  179. parrot/embeddings/huggingface.py +74 -0
  180. parrot/embeddings/openai.py +84 -0
  181. parrot/embeddings/processor.py +88 -0
  182. parrot/exceptions.c +13868 -0
  183. parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
  184. parrot/exceptions.pxd +22 -0
  185. parrot/exceptions.pxi +15 -0
  186. parrot/exceptions.pyx +44 -0
  187. parrot/generators/__init__.py +29 -0
  188. parrot/generators/base.py +200 -0
  189. parrot/generators/html.py +293 -0
  190. parrot/generators/react.py +205 -0
  191. parrot/generators/streamlit.py +203 -0
  192. parrot/generators/template.py +105 -0
  193. parrot/handlers/__init__.py +4 -0
  194. parrot/handlers/agent.py +861 -0
  195. parrot/handlers/agents/__init__.py +1 -0
  196. parrot/handlers/agents/abstract.py +900 -0
  197. parrot/handlers/bots.py +338 -0
  198. parrot/handlers/chat.py +915 -0
  199. parrot/handlers/creation.sql +192 -0
  200. parrot/handlers/crew/ARCHITECTURE.md +362 -0
  201. parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
  202. parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
  203. parrot/handlers/crew/__init__.py +0 -0
  204. parrot/handlers/crew/handler.py +801 -0
  205. parrot/handlers/crew/models.py +229 -0
  206. parrot/handlers/crew/redis_persistence.py +523 -0
  207. parrot/handlers/jobs/__init__.py +10 -0
  208. parrot/handlers/jobs/job.py +384 -0
  209. parrot/handlers/jobs/mixin.py +627 -0
  210. parrot/handlers/jobs/models.py +115 -0
  211. parrot/handlers/jobs/worker.py +31 -0
  212. parrot/handlers/models.py +596 -0
  213. parrot/handlers/o365_auth.py +105 -0
  214. parrot/handlers/stream.py +337 -0
  215. parrot/interfaces/__init__.py +6 -0
  216. parrot/interfaces/aws.py +143 -0
  217. parrot/interfaces/credentials.py +113 -0
  218. parrot/interfaces/database.py +27 -0
  219. parrot/interfaces/google.py +1123 -0
  220. parrot/interfaces/hierarchy.py +1227 -0
  221. parrot/interfaces/http.py +651 -0
  222. parrot/interfaces/images/__init__.py +0 -0
  223. parrot/interfaces/images/plugins/__init__.py +24 -0
  224. parrot/interfaces/images/plugins/abstract.py +58 -0
  225. parrot/interfaces/images/plugins/analisys.py +148 -0
  226. parrot/interfaces/images/plugins/classify.py +150 -0
  227. parrot/interfaces/images/plugins/classifybase.py +182 -0
  228. parrot/interfaces/images/plugins/detect.py +150 -0
  229. parrot/interfaces/images/plugins/exif.py +1103 -0
  230. parrot/interfaces/images/plugins/hash.py +52 -0
  231. parrot/interfaces/images/plugins/vision.py +104 -0
  232. parrot/interfaces/images/plugins/yolo.py +66 -0
  233. parrot/interfaces/images/plugins/zerodetect.py +197 -0
  234. parrot/interfaces/o365.py +978 -0
  235. parrot/interfaces/onedrive.py +822 -0
  236. parrot/interfaces/sharepoint.py +1435 -0
  237. parrot/interfaces/soap.py +257 -0
  238. parrot/loaders/__init__.py +8 -0
  239. parrot/loaders/abstract.py +1131 -0
  240. parrot/loaders/audio.py +199 -0
  241. parrot/loaders/basepdf.py +53 -0
  242. parrot/loaders/basevideo.py +1568 -0
  243. parrot/loaders/csv.py +409 -0
  244. parrot/loaders/docx.py +116 -0
  245. parrot/loaders/epubloader.py +316 -0
  246. parrot/loaders/excel.py +199 -0
  247. parrot/loaders/factory.py +55 -0
  248. parrot/loaders/files/__init__.py +0 -0
  249. parrot/loaders/files/abstract.py +39 -0
  250. parrot/loaders/files/html.py +26 -0
  251. parrot/loaders/files/text.py +63 -0
  252. parrot/loaders/html.py +152 -0
  253. parrot/loaders/markdown.py +442 -0
  254. parrot/loaders/pdf.py +373 -0
  255. parrot/loaders/pdfmark.py +320 -0
  256. parrot/loaders/pdftables.py +506 -0
  257. parrot/loaders/ppt.py +476 -0
  258. parrot/loaders/qa.py +63 -0
  259. parrot/loaders/splitters/__init__.py +10 -0
  260. parrot/loaders/splitters/base.py +138 -0
  261. parrot/loaders/splitters/md.py +228 -0
  262. parrot/loaders/splitters/token.py +143 -0
  263. parrot/loaders/txt.py +26 -0
  264. parrot/loaders/video.py +89 -0
  265. parrot/loaders/videolocal.py +218 -0
  266. parrot/loaders/videounderstanding.py +377 -0
  267. parrot/loaders/vimeo.py +167 -0
  268. parrot/loaders/web.py +599 -0
  269. parrot/loaders/youtube.py +504 -0
  270. parrot/manager/__init__.py +5 -0
  271. parrot/manager/manager.py +1030 -0
  272. parrot/mcp/__init__.py +28 -0
  273. parrot/mcp/adapter.py +105 -0
  274. parrot/mcp/cli.py +174 -0
  275. parrot/mcp/client.py +119 -0
  276. parrot/mcp/config.py +75 -0
  277. parrot/mcp/integration.py +842 -0
  278. parrot/mcp/oauth.py +933 -0
  279. parrot/mcp/server.py +225 -0
  280. parrot/mcp/transports/__init__.py +3 -0
  281. parrot/mcp/transports/base.py +279 -0
  282. parrot/mcp/transports/grpc_session.py +163 -0
  283. parrot/mcp/transports/http.py +312 -0
  284. parrot/mcp/transports/mcp.proto +108 -0
  285. parrot/mcp/transports/quic.py +1082 -0
  286. parrot/mcp/transports/sse.py +330 -0
  287. parrot/mcp/transports/stdio.py +309 -0
  288. parrot/mcp/transports/unix.py +395 -0
  289. parrot/mcp/transports/websocket.py +547 -0
  290. parrot/memory/__init__.py +16 -0
  291. parrot/memory/abstract.py +209 -0
  292. parrot/memory/agent.py +32 -0
  293. parrot/memory/cache.py +175 -0
  294. parrot/memory/core.py +555 -0
  295. parrot/memory/file.py +153 -0
  296. parrot/memory/mem.py +131 -0
  297. parrot/memory/redis.py +613 -0
  298. parrot/models/__init__.py +46 -0
  299. parrot/models/basic.py +118 -0
  300. parrot/models/compliance.py +208 -0
  301. parrot/models/crew.py +395 -0
  302. parrot/models/detections.py +654 -0
  303. parrot/models/generation.py +85 -0
  304. parrot/models/google.py +223 -0
  305. parrot/models/groq.py +23 -0
  306. parrot/models/openai.py +30 -0
  307. parrot/models/outputs.py +285 -0
  308. parrot/models/responses.py +938 -0
  309. parrot/notifications/__init__.py +743 -0
  310. parrot/openapi/__init__.py +3 -0
  311. parrot/openapi/components.yaml +641 -0
  312. parrot/openapi/config.py +322 -0
  313. parrot/outputs/__init__.py +32 -0
  314. parrot/outputs/formats/__init__.py +108 -0
  315. parrot/outputs/formats/altair.py +359 -0
  316. parrot/outputs/formats/application.py +122 -0
  317. parrot/outputs/formats/base.py +351 -0
  318. parrot/outputs/formats/bokeh.py +356 -0
  319. parrot/outputs/formats/card.py +424 -0
  320. parrot/outputs/formats/chart.py +436 -0
  321. parrot/outputs/formats/d3.py +255 -0
  322. parrot/outputs/formats/echarts.py +310 -0
  323. parrot/outputs/formats/generators/__init__.py +0 -0
  324. parrot/outputs/formats/generators/abstract.py +61 -0
  325. parrot/outputs/formats/generators/panel.py +145 -0
  326. parrot/outputs/formats/generators/streamlit.py +86 -0
  327. parrot/outputs/formats/generators/terminal.py +63 -0
  328. parrot/outputs/formats/holoviews.py +310 -0
  329. parrot/outputs/formats/html.py +147 -0
  330. parrot/outputs/formats/jinja2.py +46 -0
  331. parrot/outputs/formats/json.py +87 -0
  332. parrot/outputs/formats/map.py +933 -0
  333. parrot/outputs/formats/markdown.py +172 -0
  334. parrot/outputs/formats/matplotlib.py +237 -0
  335. parrot/outputs/formats/mixins/__init__.py +0 -0
  336. parrot/outputs/formats/mixins/emaps.py +855 -0
  337. parrot/outputs/formats/plotly.py +341 -0
  338. parrot/outputs/formats/seaborn.py +310 -0
  339. parrot/outputs/formats/table.py +397 -0
  340. parrot/outputs/formats/template_report.py +138 -0
  341. parrot/outputs/formats/yaml.py +125 -0
  342. parrot/outputs/formatter.py +152 -0
  343. parrot/outputs/templates/__init__.py +95 -0
  344. parrot/pipelines/__init__.py +0 -0
  345. parrot/pipelines/abstract.py +210 -0
  346. parrot/pipelines/detector.py +124 -0
  347. parrot/pipelines/models.py +90 -0
  348. parrot/pipelines/planogram.py +3002 -0
  349. parrot/pipelines/table.sql +97 -0
  350. parrot/plugins/__init__.py +106 -0
  351. parrot/plugins/importer.py +80 -0
  352. parrot/py.typed +0 -0
  353. parrot/registry/__init__.py +18 -0
  354. parrot/registry/registry.py +594 -0
  355. parrot/scheduler/__init__.py +1189 -0
  356. parrot/scheduler/models.py +60 -0
  357. parrot/security/__init__.py +16 -0
  358. parrot/security/prompt_injection.py +268 -0
  359. parrot/security/security_events.sql +25 -0
  360. parrot/services/__init__.py +1 -0
  361. parrot/services/mcp/__init__.py +8 -0
  362. parrot/services/mcp/config.py +13 -0
  363. parrot/services/mcp/server.py +295 -0
  364. parrot/services/o365_remote_auth.py +235 -0
  365. parrot/stores/__init__.py +7 -0
  366. parrot/stores/abstract.py +352 -0
  367. parrot/stores/arango.py +1090 -0
  368. parrot/stores/bigquery.py +1377 -0
  369. parrot/stores/cache.py +106 -0
  370. parrot/stores/empty.py +10 -0
  371. parrot/stores/faiss_store.py +1157 -0
  372. parrot/stores/kb/__init__.py +9 -0
  373. parrot/stores/kb/abstract.py +68 -0
  374. parrot/stores/kb/cache.py +165 -0
  375. parrot/stores/kb/doc.py +325 -0
  376. parrot/stores/kb/hierarchy.py +346 -0
  377. parrot/stores/kb/local.py +457 -0
  378. parrot/stores/kb/prompt.py +28 -0
  379. parrot/stores/kb/redis.py +659 -0
  380. parrot/stores/kb/store.py +115 -0
  381. parrot/stores/kb/user.py +374 -0
  382. parrot/stores/models.py +59 -0
  383. parrot/stores/pgvector.py +3 -0
  384. parrot/stores/postgres.py +2853 -0
  385. parrot/stores/utils/__init__.py +0 -0
  386. parrot/stores/utils/chunking.py +197 -0
  387. parrot/telemetry/__init__.py +3 -0
  388. parrot/telemetry/mixin.py +111 -0
  389. parrot/template/__init__.py +3 -0
  390. parrot/template/engine.py +259 -0
  391. parrot/tools/__init__.py +23 -0
  392. parrot/tools/abstract.py +644 -0
  393. parrot/tools/agent.py +363 -0
  394. parrot/tools/arangodbsearch.py +537 -0
  395. parrot/tools/arxiv_tool.py +188 -0
  396. parrot/tools/calculator/__init__.py +3 -0
  397. parrot/tools/calculator/operations/__init__.py +38 -0
  398. parrot/tools/calculator/operations/calculus.py +80 -0
  399. parrot/tools/calculator/operations/statistics.py +76 -0
  400. parrot/tools/calculator/tool.py +150 -0
  401. parrot/tools/cloudwatch.py +988 -0
  402. parrot/tools/codeinterpreter/__init__.py +127 -0
  403. parrot/tools/codeinterpreter/executor.py +371 -0
  404. parrot/tools/codeinterpreter/internals.py +473 -0
  405. parrot/tools/codeinterpreter/models.py +643 -0
  406. parrot/tools/codeinterpreter/prompts.py +224 -0
  407. parrot/tools/codeinterpreter/tool.py +664 -0
  408. parrot/tools/company_info/__init__.py +6 -0
  409. parrot/tools/company_info/tool.py +1138 -0
  410. parrot/tools/correlationanalysis.py +437 -0
  411. parrot/tools/database/abstract.py +286 -0
  412. parrot/tools/database/bq.py +115 -0
  413. parrot/tools/database/cache.py +284 -0
  414. parrot/tools/database/models.py +95 -0
  415. parrot/tools/database/pg.py +343 -0
  416. parrot/tools/databasequery.py +1159 -0
  417. parrot/tools/db.py +1800 -0
  418. parrot/tools/ddgo.py +370 -0
  419. parrot/tools/decorators.py +271 -0
  420. parrot/tools/dftohtml.py +282 -0
  421. parrot/tools/document.py +549 -0
  422. parrot/tools/ecs.py +819 -0
  423. parrot/tools/edareport.py +368 -0
  424. parrot/tools/elasticsearch.py +1049 -0
  425. parrot/tools/employees.py +462 -0
  426. parrot/tools/epson/__init__.py +96 -0
  427. parrot/tools/excel.py +683 -0
  428. parrot/tools/file/__init__.py +13 -0
  429. parrot/tools/file/abstract.py +76 -0
  430. parrot/tools/file/gcs.py +378 -0
  431. parrot/tools/file/local.py +284 -0
  432. parrot/tools/file/s3.py +511 -0
  433. parrot/tools/file/tmp.py +309 -0
  434. parrot/tools/file/tool.py +501 -0
  435. parrot/tools/file_reader.py +129 -0
  436. parrot/tools/flowtask/__init__.py +19 -0
  437. parrot/tools/flowtask/tool.py +761 -0
  438. parrot/tools/gittoolkit.py +508 -0
  439. parrot/tools/google/__init__.py +18 -0
  440. parrot/tools/google/base.py +169 -0
  441. parrot/tools/google/tools.py +1251 -0
  442. parrot/tools/googlelocation.py +5 -0
  443. parrot/tools/googleroutes.py +5 -0
  444. parrot/tools/googlesearch.py +5 -0
  445. parrot/tools/googlesitesearch.py +5 -0
  446. parrot/tools/googlevoice.py +2 -0
  447. parrot/tools/gvoice.py +695 -0
  448. parrot/tools/ibisworld/README.md +225 -0
  449. parrot/tools/ibisworld/__init__.py +11 -0
  450. parrot/tools/ibisworld/tool.py +366 -0
  451. parrot/tools/jiratoolkit.py +1718 -0
  452. parrot/tools/manager.py +1098 -0
  453. parrot/tools/math.py +152 -0
  454. parrot/tools/metadata.py +476 -0
  455. parrot/tools/msteams.py +1621 -0
  456. parrot/tools/msword.py +635 -0
  457. parrot/tools/multidb.py +580 -0
  458. parrot/tools/multistoresearch.py +369 -0
  459. parrot/tools/networkninja.py +167 -0
  460. parrot/tools/nextstop/__init__.py +4 -0
  461. parrot/tools/nextstop/base.py +286 -0
  462. parrot/tools/nextstop/employee.py +733 -0
  463. parrot/tools/nextstop/store.py +462 -0
  464. parrot/tools/notification.py +435 -0
  465. parrot/tools/o365/__init__.py +42 -0
  466. parrot/tools/o365/base.py +295 -0
  467. parrot/tools/o365/bundle.py +522 -0
  468. parrot/tools/o365/events.py +554 -0
  469. parrot/tools/o365/mail.py +992 -0
  470. parrot/tools/o365/onedrive.py +497 -0
  471. parrot/tools/o365/sharepoint.py +641 -0
  472. parrot/tools/openapi_toolkit.py +904 -0
  473. parrot/tools/openweather.py +527 -0
  474. parrot/tools/pdfprint.py +1001 -0
  475. parrot/tools/powerbi.py +518 -0
  476. parrot/tools/powerpoint.py +1113 -0
  477. parrot/tools/pricestool.py +146 -0
  478. parrot/tools/products/__init__.py +246 -0
  479. parrot/tools/prophet_tool.py +171 -0
  480. parrot/tools/pythonpandas.py +630 -0
  481. parrot/tools/pythonrepl.py +910 -0
  482. parrot/tools/qsource.py +436 -0
  483. parrot/tools/querytoolkit.py +395 -0
  484. parrot/tools/quickeda.py +827 -0
  485. parrot/tools/resttool.py +553 -0
  486. parrot/tools/retail/__init__.py +0 -0
  487. parrot/tools/retail/bby.py +528 -0
  488. parrot/tools/sandboxtool.py +703 -0
  489. parrot/tools/sassie/__init__.py +352 -0
  490. parrot/tools/scraping/__init__.py +7 -0
  491. parrot/tools/scraping/docs/select.md +466 -0
  492. parrot/tools/scraping/documentation.md +1278 -0
  493. parrot/tools/scraping/driver.py +436 -0
  494. parrot/tools/scraping/models.py +576 -0
  495. parrot/tools/scraping/options.py +85 -0
  496. parrot/tools/scraping/orchestrator.py +517 -0
  497. parrot/tools/scraping/readme.md +740 -0
  498. parrot/tools/scraping/tool.py +3115 -0
  499. parrot/tools/seasonaldetection.py +642 -0
  500. parrot/tools/shell_tool/__init__.py +5 -0
  501. parrot/tools/shell_tool/actions.py +408 -0
  502. parrot/tools/shell_tool/engine.py +155 -0
  503. parrot/tools/shell_tool/models.py +322 -0
  504. parrot/tools/shell_tool/tool.py +442 -0
  505. parrot/tools/site_search.py +214 -0
  506. parrot/tools/textfile.py +418 -0
  507. parrot/tools/think.py +378 -0
  508. parrot/tools/toolkit.py +298 -0
  509. parrot/tools/webapp_tool.py +187 -0
  510. parrot/tools/whatif.py +1279 -0
  511. parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
  512. parrot/tools/workday/__init__.py +6 -0
  513. parrot/tools/workday/models.py +1389 -0
  514. parrot/tools/workday/tool.py +1293 -0
  515. parrot/tools/yfinance_tool.py +306 -0
  516. parrot/tools/zipcode.py +217 -0
  517. parrot/utils/__init__.py +2 -0
  518. parrot/utils/helpers.py +73 -0
  519. parrot/utils/parsers/__init__.py +5 -0
  520. parrot/utils/parsers/toml.c +12078 -0
  521. parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
  522. parrot/utils/parsers/toml.pyx +21 -0
  523. parrot/utils/toml.py +11 -0
  524. parrot/utils/types.cpp +20936 -0
  525. parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
  526. parrot/utils/types.pyx +213 -0
  527. parrot/utils/uv.py +11 -0
  528. parrot/version.py +10 -0
  529. parrot/yaml-rs/Cargo.lock +350 -0
  530. parrot/yaml-rs/Cargo.toml +19 -0
  531. parrot/yaml-rs/pyproject.toml +19 -0
  532. parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
  533. parrot/yaml-rs/src/lib.rs +222 -0
  534. requirements/docker-compose.yml +24 -0
  535. requirements/requirements-dev.txt +21 -0
@@ -0,0 +1,3115 @@
1
+ """
2
+ WebScrapingTool for AI-Parrot
3
+ Combines Selenium/Playwright automation with LLM-directed scraping
4
+ """
5
+ from pathlib import Path
6
+ import random
7
+ import sys
8
+ from typing import Dict, List, Any, Optional, Union, Literal
9
+ import select
10
+ import time
11
+ import asyncio
12
+ import logging
13
+ import base64
14
+ import re
15
+ import json
16
+ import contextlib
17
+ from urllib.parse import urlparse, urljoin
18
+ from lxml import html as lxml_html
19
+ import aiofiles
20
+ from pydantic import BaseModel, Field
21
+ from bs4 import BeautifulSoup
22
+ # Selenium imports
23
+ try:
24
+ from seleniumwire import webdriver
25
+ except ImportError:
26
+ from selenium import webdriver
27
+ from selenium.webdriver.chrome.options import Options
28
+ from selenium.webdriver.common.by import By
29
+ from selenium.webdriver.common.keys import Keys
30
+ from selenium.webdriver.support.ui import WebDriverWait
31
+ from selenium.webdriver.support import expected_conditions as EC
32
+ from selenium.common.exceptions import NoSuchElementException, TimeoutException
33
+ # For Playwright alternative
34
+ try:
35
+ from playwright.async_api import async_playwright, Page, Browser
36
+ PLAYWRIGHT_AVAILABLE = True
37
+ except ImportError:
38
+ PLAYWRIGHT_AVAILABLE = False
39
+ from ..abstract import AbstractTool
40
+ from .driver import SeleniumSetup
41
+ from .models import (
42
+ BrowserAction,
43
+ Navigate,
44
+ Click,
45
+ Fill,
46
+ Select,
47
+ Evaluate,
48
+ PressKey,
49
+ Refresh,
50
+ Back,
51
+ Wait,
52
+ Scroll,
53
+ Authenticate,
54
+ GetCookies,
55
+ SetCookies,
56
+ GetText,
57
+ GetHTML,
58
+ Screenshot,
59
+ WaitForDownload,
60
+ UploadFile,
61
+ AwaitHuman,
62
+ AwaitKeyPress,
63
+ AwaitBrowserEvent,
64
+ Loop,
65
+ ScrapingStep,
66
+ ScrapingSelector,
67
+ ScrapingResult,
68
+ Conditional
69
+ )
70
+
71
+
72
class WebScrapingToolArgs(BaseModel):
    """Arguments schema for WebScrapingTool."""
    # NOTE: the class docstring above is kept verbatim on purpose; pydantic
    # surfaces a model's docstring as the description of the generated schema.

    # Required: the ordered list of browser steps to run.
    steps: List[Dict[str, Any]] = Field(
        ...,
        description="List of navigation and interaction steps. Each step should have 'action' and 'description'",
    )

    # Optional: what to extract once the steps have been executed.
    selectors: Optional[List[Dict[str, Any]]] = Field(
        None,
        description="Content selectors for extraction. Each selector should have 'name', 'selector', and optional 'extract_type', 'multiple'",
    )

    # Optional: prefix used to resolve relative links (empty string by default).
    base_url: Optional[str] = Field(
        "",
        description="Base URL for relative links",
    )

    # Optional: per-call overrides for the browser configuration.
    browser_config: Optional[Dict[str, Any]] = Field(
        None,
        description="Any Selenium configuration overrides (e.g., headless, mobile, browser type)",
    )

    # Flag: capture the whole page content (off by default).
    full_page: bool = Field(
        False,
        description="Whether to capture full page content",
    )

    # Flag: run the browser without a visible window (on by default).
    headless: bool = Field(
        True,
        description="Whether to run the browser in headless mode",
    )
97
+
98
+
99
+ class WebScrapingTool(AbstractTool):
100
+ """
101
+ Advanced web scraping tool with LLM integration support.
102
+
103
+ Features:
104
+ - Support for both Selenium and Playwright
105
+ - Step-by-step navigation instructions
106
+ - Flexible content extraction
107
+ - Intermediate result storage
108
+ - Error handling and retry logic
109
+
110
+ Supported Actions:
111
+ * Navigation: navigate, back, refresh
112
+ * Interaction: click, fill, press_key, scroll
113
+ * Data Extraction: get_text, get_html, get_cookies
114
+ * Authentication: authenticate
115
+ * File Operations: upload_file, wait_for_download, screenshot
116
+ * State Management: set_cookies
117
+ * Waiting: wait, await_human, await_keypress, await_browser_event
118
+ * Evaluation: evaluate
119
+ * Control Flow: loop
120
+ """
121
+
122
+ name = "WebScrapingTool"
123
+ description = """Execute automated web scraping with JSON-based, step-by-step navigation and content extraction.
124
+
125
+ IMPORTANT: This tool requires a 'steps' parameter (not 'actions'!) containing a list of navigation/interaction steps.
126
+
127
+ Example usage:
128
+ {
129
+ "steps": [
130
+ {"action": "navigate", "url": "https://example.com/login", "description": "Navigate to login page"},
131
+ {"action": "fill", "selector": "#email", "selector_type": "css", "value": "user@example.com", "description": "Fill email field"},
132
+ {"action": "fill", "selector": "#password", "selector_type": "css", "value": "password123", "description": "Fill password field"},
133
+ {"action": "click", "selector": "button[type='submit']", "selector_type": "css", "description": "Click login button"},
134
+ {"action": "navigate", "url": "https://example.com/dashboard", "description": "Navigate to dashboard"}
135
+ ],
136
+ "selectors": [ // Optional - if omitted, returns full page HTML
137
+ {"name": "title", "selector": "h1", "selector_type": "css"},
138
+ {"name": "content", "selector": ".main-content", "selector_type": "css"}
139
+ ],
140
+ "full_page": true // Optional - set to true to capture full page content when no selectors provided
141
+ }
142
+
143
+ Each step must include:
144
+ - "action": The action type (required)
145
+ - "description": Why this step is needed (required for clarity)
146
+ - Additional fields depending on action type (e.g., "url" for navigate, "selector" for click/fill)
147
+
148
+ Pair every selector with a `selector_type` (`css`, `xpath`, or `text`). Keep waits explicit via `condition_type` (`simple`, `selector`, `url_is`, `url_contains`, `title_contains`, or `custom`).
149
+
150
+ Supported actions:
151
+ - Navigation: navigate, back, refresh
152
+ - Interaction: click, fill, press_key, scroll, select
153
+ - Data Extraction: get_text, get_html, get_cookies
154
+ - Authentication: authenticate (include method, selectors, credentials)
155
+ - File Operations: upload_file, wait_for_download, screenshot
156
+ - State Management: set_cookies
157
+ - Waiting: wait, await_human, await_keypress, await_browser_event
158
+ - Evaluation: evaluate
159
+ - Control Flow: loop
160
+
161
+ If no selectors are provided and full_page is False, the tool will still return the complete HTML body of the final page for your reference."""
162
+ args_schema = WebScrapingToolArgs
163
+
164
def __init__(
    self,
    browser: Literal['chrome', 'firefox', 'edge', 'safari', 'undetected'] = 'chrome',
    driver_type: Literal['selenium', 'playwright'] = 'selenium',
    full_page: bool = False,
    headless: bool = True,
    mobile: bool = False,
    mobile_device: Optional[str] = None,
    browser_binary: Optional[str] = None,
    driver_binary: Optional[str] = None,
    auto_install: bool = True,
    **kwargs
):
    """
    Store the scraping configuration; no browser is started here.

    Args:
        browser: Browser flavour recorded in ``browser_config['browser']``.
        driver_type: Automation backend, ``'selenium'`` or ``'playwright'``.
        full_page: Stored as ``self._full_page``.
        headless: Recorded in ``browser_config['headless']``.
        mobile: Recorded in ``browser_config['mobile']``.
        mobile_device: Recorded in ``browser_config['mobile_device']``.
        browser_binary: Recorded in ``browser_config['browser_binary']``.
        driver_binary: Recorded in ``browser_config['driver_binary']``.
        auto_install: Recorded in ``browser_config['auto_install']``.
        **kwargs: Forwarded unchanged to the parent constructor AND merged
            into ``browser_config``. The following keys are also read here:
            ``overlay_housekeeping`` (default ``True``),
            ``default_timeout`` (default ``10``),
            ``retry_attempts`` (default ``3``) and
            ``delay_between_actions`` (default ``1``).

    Side effects:
        Sets the ``urllib3.connectionpool`` logger level to ``ERROR``.
    """
    super().__init__(**kwargs)
    self.driver_type = driver_type
    # Browser configuration
    # kwargs are spread last, so a key passed via kwargs overrides nothing
    # above only because those names are bound as explicit parameters.
    self.browser_config = {
        'browser': browser,
        'headless': headless,
        'mobile': mobile,
        'mobile_device': mobile_device,
        'browser_binary': browser_binary,
        'driver_binary': driver_binary,
        'auto_install': auto_install,
        **kwargs
    }
    # Driver handles start out empty.
    self.driver = None
    self.browser = None  # For Playwright
    self.page = None  # For Playwright
    self._full_page: bool = full_page
    self.results: List[ScrapingResult] = []
    # Allow turning overlay housekeeping on/off (default ON)
    self.overlay_housekeeping: bool = kwargs.get('overlay_housekeeping', True)
    # Configuration
    self.default_timeout = kwargs.get('default_timeout', 10)
    self.retry_attempts = kwargs.get('retry_attempts', 3)
    self.delay_between_actions = kwargs.get('delay_between_actions', 1)
    # extracted cookies and headers from Driver
    self.extracted_cookies: Dict[str, str] = {}
    self.extracted_headers: Dict[str, str] = {}
    # Starts as None, so the annotation must be Optional (was plain `str`).
    self.extracted_authorization: Optional[str] = None
    # Silence urllib3 connection-pool messages below ERROR.
    logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
206
+
207
+ async def _execute(
208
+ self,
209
+ steps: List[Dict[str, Any]],
210
+ selectors: Optional[List[Dict[str, Any]]] = None,
211
+ base_url: str = "",
212
+ browser_config: Optional[Dict[str, Any]] = None,
213
+ **kwargs
214
+ ) -> Dict[str, Any]:
215
+ """
216
+ Execute the web scraping workflow.
217
+
218
+ Args:
219
+ steps: List of navigation/interaction steps
220
+ selectors: List of content selectors to do extraction
221
+ base_url: Base URL for relative links
222
+
223
+ Returns:
224
+ Dictionary with scraping results
225
+ """
226
+ self.results = []
227
+
228
+ try:
229
+ await self.initialize_driver(
230
+ config_overrides=browser_config
231
+ )
232
+
233
+ # Convert dictionaries to dataclasses
234
+ scraping_steps = [ScrapingStep.from_dict(step) for step in steps]
235
+ scraping_selectors = [ScrapingSelector(**sel) for sel in selectors] if selectors else None
236
+
237
+ # Execute scraping workflow
238
+ results = await self.execute_scraping_workflow(
239
+ scraping_steps,
240
+ scraping_selectors,
241
+ base_url
242
+ )
243
+ successful_scrapes = len([r for r in results if r.success])
244
+ return {
245
+ "status": "success" if successful_scrapes > 0 else "failed",
246
+ "result": [
247
+ {
248
+ "url": r.url,
249
+ "extracted_data": r.extracted_data,
250
+ "metadata": r.metadata,
251
+ "success": r.success,
252
+ "error_message": r.error_message,
253
+ "content": r.content
254
+ } for r in results
255
+ ],
256
+ "metadata": {
257
+ "total_pages_scraped": len(results),
258
+ "successful_scrapes": successful_scrapes,
259
+ "browser_used": self.selenium_setup.browser,
260
+ "mobile_mode": self.selenium_setup.mobile,
261
+ }
262
+ }
263
+
264
+ except Exception as e:
265
+ self.logger.error(f"Scraping execution failed: {str(e)}")
266
+ return {
267
+ "status": "error",
268
+ "error": str(e),
269
+ "result": [],
270
+ "metadata": {
271
+ "browser_used": self.browser_config.get('browser', 'unknown'),
272
+ }
273
+ }
274
+
275
async def initialize_driver(self, config_overrides: Optional[Dict[str, Any]] = None):
    """
    Set up the backend named by ``self.driver_type``.

    Args:
        config_overrides: Optional settings passed to the Selenium setup;
            the Playwright setup is called without them.

    Raises:
        ValueError: If the driver type is neither 'selenium' nor an
            available 'playwright'.
    """
    chosen = self.driver_type
    if chosen == 'selenium':
        await self._setup_selenium(config_overrides)
        return
    if chosen == 'playwright' and PLAYWRIGHT_AVAILABLE:
        await self._setup_playwright()
        return
    raise ValueError(
        f"Driver type '{self.driver_type}' not supported or not available"
    )
285
+
286
async def _get_selenium_driver(self, config: Dict[str, Any]) -> webdriver.Chrome:
    """Build a ``SeleniumSetup`` from ``config``, keep it on the instance and return its driver."""
    setup = SeleniumSetup(**config)
    # Kept on the instance so later code can read the effective settings.
    self.selenium_setup = setup
    driver = await self.selenium_setup.get_driver()
    return driver
291
+
292
+ async def _setup_selenium(self, config_overrides: Optional[Dict[str, Any]] = None):
293
+ """Setup Selenium WebDriver"""
294
+ final_config = self.browser_config.copy()
295
+ if config_overrides:
296
+ final_config.update(config_overrides)
297
+ self.driver = await self._get_selenium_driver(final_config)
298
+ # Attempt to capture from performance logs first
299
+ try:
300
+ # turn on CDP Network domain
301
+ self.driver.execute_cdp_cmd("Network.enable", {})
302
+ except Exception: # pragma: no cover - command may not exist
303
+ pass
304
+ return self.driver
305
+
306
async def _setup_playwright(self):
    """
    Start Playwright, launch Chromium and open a 1920x1080 page.

    The Playwright handle is kept on ``self._playwright`` so it can be
    stopped later. If launching the browser or opening the page fails,
    everything started here is shut down again before the error is
    re-raised, so no driver process is left behind.

    Raises:
        ImportError: If Playwright is not installed.
    """
    if not PLAYWRIGHT_AVAILABLE:
        raise ImportError("Playwright is not installed. Install with: pip install playwright")

    playwright = await async_playwright().start()
    browser = None
    try:
        browser = await playwright.chromium.launch(
            headless=self.browser_config.get('headless', True)
        )
        page = await browser.new_page()
        await page.set_viewport_size({"width": 1920, "height": 1080})
    except Exception:
        # Undo the partial start-up; teardown problems must not hide the
        # original failure.
        if browser is not None:
            try:
                await browser.close()
            except Exception as close_error:
                self.logger.debug(f"Playwright browser close failed: {close_error}")
        try:
            await playwright.stop()
        except Exception as stop_error:
            self.logger.debug(f"Playwright stop failed: {stop_error}")
        raise

    self._playwright = playwright
    self.browser = browser
    self.page = page
317
+
318
async def execute_scraping_workflow(
    self,
    steps: List[ScrapingStep],
    selectors: Optional[List[ScrapingSelector]] = None,
    base_url: str = ""
) -> List[ScrapingResult]:
    """
    Execute a complete scraping workflow.

    Steps run in order. A failed 'navigate' or 'authenticate' step (or a
    step that raises TimeoutError) stops the step loop. Content is then
    extracted from the current page, headers/authorization/cookies are
    collected, and ``cleanup`` always runs at the end.

    Args:
        steps: List of navigation/interaction steps.
        selectors: List of content selectors to extract; when omitted the
            full page content is extracted instead.
        base_url: Base URL for relative links.

    Returns:
        List of ScrapingResult objects (``self.results``).
    """
    self.results = []
    critical_actions = ('navigate', 'authenticate')

    try:
        # Execute each step in sequence
        for i, step in enumerate(steps):
            # Only the description is logged: a full step dump may carry
            # credentials (the old debug print wrote them to stdout).
            self.logger.info(f"Executing step {i+1}/{len(steps)}: {step.description}")
            try:
                success = await self._execute_step(step, base_url)
            except TimeoutError:
                self.logger.error(f"Step timed out: {step.description}")
                break

            if not success:
                # The action is an object exposing get_action_type(), so
                # comparing it directly to strings never matched; resolve
                # the type name first (plain strings are still accepted).
                type_getter = getattr(step.action, 'get_action_type', None)
                action_type = type_getter() if callable(type_getter) else step.action
                if action_type in critical_actions:
                    # Critical steps - abort if they fail
                    self.logger.error(
                        f"Critical step failed: {step.description}"
                    )
                    break

            # Add delay between actions
            await asyncio.sleep(self.delay_between_actions)

        # Extract content: targeted when selectors were given, otherwise
        # the full page so the caller always gets the HTML for reference.
        current_url = await self._get_current_url()
        if selectors:
            result = await self._extract_content(current_url, selectors)
        else:
            result = await self._extract_full_content(current_url)
        if result:
            self.results.append(result)

        # and extract the headers, authorization and cookies
        try:
            self.extracted_headers = self._extract_headers()
            self.extracted_authorization = self._extract_authorization()
            self.extracted_cookies = self._collect_cookies()
        except Exception as e:
            self.logger.error(
                f"Error extracting headers, authorization, or cookies: {str(e)}"
            )

    except Exception as e:
        self.logger.error(f"Scraping workflow failed: {str(e)}")
        # Create error result
        error_result = ScrapingResult(
            url="",
            content="",
            bs_soup=BeautifulSoup("", 'html.parser'),
            success=False,
            error_message=str(e)
        )
        self.results.append(error_result)

    finally:
        await self.cleanup()

    return self.results
398
+
399
+ async def _execute_step(self, step: ScrapingStep, base_url: str = "", args: dict = None) -> bool:
400
+ """Execute a single scraping step with a hard timeout per step."""
401
+ action = step.action
402
+ action_type = action.get_action_type()
403
+ result = None
404
+ try:
405
+ if action_type == 'navigate':
406
+ result = await self._navigate_to(action, base_url)
407
+ elif action_type == 'click':
408
+ result = await self._click(
409
+ action,
410
+ timeout=action.timeout or self.default_timeout
411
+ )
412
+ elif action_type == 'fill':
413
+ result = await self._fill(action)
414
+ elif action_type == 'select':
415
+ result = await self._select(action)
416
+ elif action_type == 'evaluate':
417
+ result = await self._evaluate_js(action)
418
+ elif action_type == 'await_human':
419
+ result = await self._await_human(action)
420
+ elif action_type == 'press_key':
421
+ result = await self._press_key(action)
422
+ elif action_type == 'refresh':
423
+ result = await self._handle_refresh(action)
424
+ elif action_type == 'back':
425
+ result = await self._handle_back(action)
426
+ elif action_type == 'get_cookies':
427
+ result = await self._get_cookies(action)
428
+ elif action_type == 'set_cookies':
429
+ result = await self._set_cookies(action)
430
+ elif action_type == 'get_text':
431
+ result = await self._get_text(action)
432
+ elif action_type == 'get_html':
433
+ result = await self._get_html(action, args)
434
+ elif action_type == 'screenshot':
435
+ result = await self._take_screenshot(action)
436
+ elif action_type == 'wait_for_download':
437
+ result = await self._wait_for_download(action)
438
+ elif action_type == 'upload_file':
439
+ result = await self._upload_file(action)
440
+ elif action_type == 'await_keypress':
441
+ try:
442
+ result = await self._await_keypress(action)
443
+ except TimeoutError:
444
+ raise
445
+ elif action_type == 'await_browser_event':
446
+ try:
447
+ result = await self._await_browser_event(action)
448
+ except TimeoutError:
449
+ raise
450
+ elif action_type == 'wait':
451
+ result = await self._wait_for_condition(
452
+ action,
453
+ step.action.timeout or self.default_timeout
454
+ )
455
+ elif action_type == 'scroll':
456
+ result = await self._scroll_page(action)
457
+ elif action_type == 'authenticate':
458
+ result = await self._handle_authentication(action)
459
+ elif action_type == 'loop':
460
+ result = await self._exec_loop(action, base_url)
461
+ elif action_type == 'conditional':
462
+ result = await self._exec_conditional(action, base_url, args)
463
+ else:
464
+ self.logger.warning(f"Unknown action: {step.action}")
465
+ return False
466
+ return result
467
+ except asyncio.TimeoutError:
468
+ self.logger.error(f"Step timed out: {step.description or step.action}")
469
+ return False
470
+ except Exception as e:
471
+ self.logger.error(f"Step execution failed: {step.action} - {str(e)}")
472
+ return False
473
+
474
+ async def _select_option(
475
+ self,
476
+ selector: str,
477
+ value: Optional[str] = None,
478
+ text: Optional[str] = None,
479
+ index: Optional[int] = None,
480
+ by: str = 'value',
481
+ blur_after: bool = True,
482
+ wait_after_select: Optional[str] = None,
483
+ wait_timeout: int = 2
484
+ ) -> bool:
485
+ """Select an option from a dropdown/select element"""
486
+
487
+ if self.driver_type == 'selenium':
488
+ from selenium.webdriver.support.ui import Select as SeleniumSelect
489
+
490
+ loop = asyncio.get_running_loop()
491
+
492
+ def select_sync():
493
+ # Wait for select element to be present
494
+ element = WebDriverWait(
495
+ self.driver,
496
+ self.default_timeout,
497
+ poll_frequency=0.25
498
+ ).until(
499
+ EC.presence_of_element_located((By.CSS_SELECTOR, selector))
500
+ )
501
+
502
+ # Create Select object
503
+ select = SeleniumSelect(element)
504
+
505
+ # Perform selection based on method
506
+ if by == 'value':
507
+ select.select_by_value(value)
508
+ elif by == 'text':
509
+ select.select_by_visible_text(text)
510
+ elif by == 'index':
511
+ select.select_by_index(index)
512
+
513
+ # Trigger blur/change events if requested
514
+ if blur_after:
515
+ # Trigger change event
516
+ self.driver.execute_script(
517
+ "arguments[0].dispatchEvent(new Event('change', { bubbles: true }));",
518
+ element
519
+ )
520
+ # Trigger blur event
521
+ self.driver.execute_script(
522
+ "arguments[0].blur();",
523
+ element
524
+ )
525
+
526
+ # Wait for post-select element if specified
527
+ if wait_after_select:
528
+ try:
529
+ WebDriverWait(self.driver, wait_timeout).until(
530
+ EC.presence_of_element_located((By.CSS_SELECTOR, wait_after_select))
531
+ )
532
+ self.logger.debug(f"Post-select element found: {wait_after_select}")
533
+ except TimeoutException:
534
+ self.logger.warning(
535
+ f"Post-select wait timed out: {wait_after_select}"
536
+ )
537
+
538
+ await loop.run_in_executor(None, select_sync)
539
+ return True
540
+
541
+ else: # Playwright
542
+ # Playwright has built-in select support
543
+ if by == 'value':
544
+ await self.page.select_option(selector, value=value)
545
+ elif by == 'text':
546
+ await self.page.select_option(selector, label=text)
547
+ elif by == 'index':
548
+ await self.page.select_option(selector, index=index)
549
+
550
+ # Trigger blur/change events if requested
551
+ if blur_after:
552
+ await self.page.evaluate(f"""
553
+ const select = document.querySelector('{selector}');
554
+ select.dispatchEvent(new Event('change', {{ bubbles: true }}));
555
+ select.blur();
556
+ """)
557
+
558
+ # Wait for post-select element if specified
559
+ if wait_after_select:
560
+ try:
561
+ await self.page.wait_for_selector(
562
+ wait_after_select,
563
+ timeout=wait_timeout * 1000
564
+ )
565
+ self.logger.debug(f"Post-select element found: {wait_after_select}")
566
+ except Exception:
567
+ self.logger.warning(
568
+ f"Post-select wait timed out: {wait_after_select}"
569
+ )
570
+
571
+ return True
572
+
573
+
574
+ async def _select(self, action: Select):
575
+ """Handle select action"""
576
+ return await self._select_option(
577
+ selector=action.selector,
578
+ value=action.value,
579
+ text=action.text,
580
+ index=action.index,
581
+ by=action.by,
582
+ blur_after=action.blur_after,
583
+ wait_after_select=action.wait_after_select,
584
+ wait_timeout=action.wait_timeout
585
+ )
586
+
587
+ async def _evaluate_js(self, action: Evaluate) -> Any:
588
+ """Handle Evaluate action"""
589
+ script = action.script
590
+
591
+ # Load script from file if specified
592
+ if action.script_file:
593
+ with open(action.script_file, 'r') as f:
594
+ script = f.read()
595
+
596
+ if not script:
597
+ self.logger.warning(
598
+ "No script provided for Evaluate action"
599
+ )
600
+ return False
601
+
602
+ if self.driver_type == 'selenium':
603
+ loop = asyncio.get_running_loop()
604
+ result = await loop.run_in_executor(
605
+ None,
606
+ lambda: self.driver.execute_script(script, *action.args)
607
+ )
608
+ else: # Playwright
609
+ result = await self.page.evaluate(script, *action.args)
610
+
611
+ return result if action.return_value else True
612
+
613
+ async def _press_key(self, action: PressKey) -> bool:
614
+ """Handle PressKey action"""
615
+ # Focus on target element if specified
616
+ if action.target:
617
+ if self.driver_type == 'selenium':
618
+ element = self.driver.find_element(By.CSS_SELECTOR, action.target)
619
+ element.click()
620
+ else:
621
+ await self.page.focus(action.target)
622
+
623
+ # Press keys
624
+ for key in action.keys:
625
+ if self.driver_type == 'selenium':
626
+ key_obj = getattr(Keys, key.upper(), key)
627
+ if action.target:
628
+ element.send_keys(key_obj)
629
+ else:
630
+ self.driver.switch_to.active_element.send_keys(key_obj)
631
+ else: # Playwright
632
+ await self.page.keyboard.press(key)
633
+
634
+ return True
635
+
636
+ async def _handle_refresh(self, action: Refresh) -> bool:
637
+ """Handle Refresh action"""
638
+ if self.driver_type == 'selenium':
639
+ loop = asyncio.get_running_loop()
640
+ if action.hard:
641
+ await loop.run_in_executor(
642
+ None,
643
+ lambda: self.driver.execute_script("location.reload(true)")
644
+ )
645
+ else:
646
+ await loop.run_in_executor(None, self.driver.refresh)
647
+ else: # Playwright
648
+ await self.page.reload(wait_until='domcontentloaded')
649
+
650
+ return True
651
+
652
+ async def _handle_back(self, action: Back) -> bool:
653
+ """Handle Back action"""
654
+ for _ in range(action.steps):
655
+ if self.driver_type == 'selenium':
656
+ loop = asyncio.get_running_loop()
657
+ await loop.run_in_executor(None, self.driver.back)
658
+ else: # Playwright
659
+ await self.page.go_back()
660
+
661
+ return True
662
+
663
+ async def _post_navigate_housekeeping(self):
664
+ """Best-effort, non-blocking overlay dismissal. Never stalls navigation."""
665
+ selectors = [
666
+ ".c-close-icon",
667
+ "button#attn-overlay-close",
668
+ "button[aria-label*='Close']",
669
+ "button[aria-label*='close']",
670
+ "button[aria-label*='Dismiss']",
671
+ "#onetrust-accept-btn-handler",
672
+ ".oci-accept-button",
673
+ ]
674
+
675
+ if self.driver_type == 'selenium':
676
+ loop = asyncio.get_running_loop()
677
+
678
+ def quick_dismiss():
679
+ clicked = 0
680
+ for sel in selectors:
681
+ try:
682
+ # No waits—instant check
683
+ els = self.driver.find_elements(By.CSS_SELECTOR, sel)
684
+ if not els:
685
+ continue
686
+ # Try first two matches at most
687
+ for el in els[:2]:
688
+ try:
689
+ el.click()
690
+ clicked += 1
691
+ except Exception:
692
+ try:
693
+ self.driver.execute_script("arguments[0].scrollIntoView({block:'center'});", el)
694
+ self.driver.execute_script("arguments[0].click();", el)
695
+ clicked += 1
696
+ except Exception:
697
+ continue
698
+ except Exception:
699
+ continue
700
+ return clicked
701
+
702
+ # Run quickly in executor; don't care about result
703
+ try:
704
+ await asyncio.wait_for(
705
+ loop.run_in_executor(None, quick_dismiss), timeout=1.0
706
+ )
707
+ except Exception:
708
+ pass
709
+
710
+ else:
711
+ # Playwright: tiny timeouts; ignore errors
712
+ for sel in selectors:
713
+ try:
714
+ await self.page.click(sel, timeout=300) # 0.3s max per selector
715
+ except Exception:
716
+ continue
717
+
718
+ def _session_alive(self) -> bool:
719
+ """Cheap ping to confirm the driver session is alive."""
720
+ try:
721
+ # current_url is a lightweight call; will raise if session is gone
722
+ _ = self.driver.current_url if self.driver_type == 'selenium' else self.page.url
723
+ return True
724
+ except Exception:
725
+ return False
726
+
727
+ async def _navigate_to(self, action: Navigate, base_url: str):
728
+ url = urljoin(base_url, action.url) if base_url else action.url
729
+ if self.driver_type == 'selenium':
730
+ loop = asyncio.get_running_loop()
731
+ await loop.run_in_executor(None, self.driver.get, url)
732
+ if self.overlay_housekeeping:
733
+ try:
734
+ current = self.driver.current_url
735
+ host = (urlparse(current).hostname or "").lower()
736
+ # TODO create a whitelist of hosts where overlays are common
737
+ if host and any(x in host for x in ['bestbuy', 'amazon', 'ebay', 'walmart', 'target']):
738
+ try:
739
+ await asyncio.wait_for(
740
+ self._post_navigate_housekeeping(), timeout=1.25
741
+ )
742
+ except Exception:
743
+ pass
744
+ except Exception:
745
+ pass
746
+ else:
747
+ await self.page.goto(url, wait_until='domcontentloaded')
748
+ if self.overlay_housekeeping:
749
+ try:
750
+ await asyncio.wait_for(self._post_navigate_housekeeping(), timeout=1.25)
751
+ except Exception:
752
+ pass
753
+ return True
754
+
755
def js_click(self, driver, element):
    """
    Click ``element`` through JavaScript after scrolling it into view.

    Returns:
        True when both scripts ran, False if either raised.
    """
    scripts = (
        "arguments[0].scrollIntoView({block: 'center'});",
        "arguments[0].click();",
    )
    try:
        for script in scripts:
            driver.execute_script(script, element)
    except Exception:
        return False
    return True
762
+
763
async def _click_element(
    self,
    selector: str,
    timeout: Optional[int] = None
):
    """
    Click an element located by CSS selector (Selenium).

    The element is awaited and clicked in the default executor so the
    event loop is not blocked. A failed native click falls back to a
    JavaScript click.

    Args:
        selector: CSS selector of the element to click.
        timeout: Seconds to wait for the element; defaults to
            ``self.default_timeout``.

    Returns:
        True when the element was clicked, False when it was not found or
        both click attempts failed.
    """
    wait_seconds = timeout or self.default_timeout

    def click_sync() -> bool:
        try:
            el = WebDriverWait(
                self.driver,
                wait_seconds,
                poll_frequency=0.25
            ).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, selector)
                )
            )
        except Exception as exc:
            # Nothing to fall back on when the element never appeared.
            self.logger.debug(f"Element not found for click: {selector} ({exc})")
            return False

        try:
            el.click()
            return True
        except Exception:
            # fallback to JS click; js_click reports failure by returning False
            return bool(self.js_click(self.driver, el))

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, click_sync)
787
+
788
+ async def _click(self, action: Click, timeout: Optional[int] = None) -> bool:
789
+ """
790
+ Enhanced click method supporting CSS, XPath, and text-based selection.
791
+
792
+ Args:
793
+ action: Click action with selector and options
794
+ timeout: Optional timeout override
795
+
796
+ Returns:
797
+ bool: True if click successful
798
+ """
799
+ selector = action.selector
800
+ selector_type = action.selector_type
801
+ timeout = timeout or action.timeout or self.default_timeout
802
+
803
+ if self.driver_type == 'selenium':
804
+ loop = asyncio.get_running_loop()
805
+
806
+ def click_sync():
807
+ # Determine the locator strategy based on selector_type
808
+ if selector_type == 'xpath':
809
+ by_type = By.XPATH
810
+ locator = selector
811
+ elif selector_type == 'text':
812
+ # Convert text search to XPath
813
+ # Supports exact match, contains, and case-insensitive
814
+ if selector.startswith('='):
815
+ # Exact match: =Filters
816
+ text = selector[1:]
817
+ by_type = By.XPATH
818
+ locator = f"//*[normalize-space(text())='{text}']"
819
+ elif selector.startswith('~'):
820
+ # Case-insensitive contains: ~filters
821
+ text = selector[1:].lower()
822
+ by_type = By.XPATH
823
+ locator = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text}')]"
824
+ else:
825
+ # Default: contains (case-sensitive)
826
+ by_type = By.XPATH
827
+ locator = f"//*[contains(text(), '{selector}')]"
828
+ else: # css (default)
829
+ by_type = By.CSS_SELECTOR
830
+ locator = selector
831
+
832
+ self.logger.debug(f"Clicking element: {by_type}='{locator}'")
833
+
834
+ wait = WebDriverWait(
835
+ self.driver,
836
+ timeout,
837
+ poll_frequency=0.25
838
+ )
839
+
840
+ # Wait for element to be present
841
+ try:
842
+ element = wait.until(
843
+ EC.presence_of_element_located((by_type, locator))
844
+ )
845
+ except Exception as e:
846
+ self.logger.error(f"Element not found: {by_type}='{locator}'")
847
+ raise
848
+
849
+ # Try regular click first
850
+ try:
851
+ # Wait for element to be clickable
852
+ element = wait.until(
853
+ EC.element_to_be_clickable((by_type, locator))
854
+ )
855
+ element.click()
856
+ self.logger.debug(f"Click performed on: {locator}")
857
+ except Exception:
858
+ # Fallback to JS click
859
+ try:
860
+ self.logger.debug("Regular click failed, trying JS click")
861
+ self.js_click(self.driver, element)
862
+ except Exception as e:
863
+ self.logger.error(f"Both click methods failed: {str(e)}")
864
+ raise
865
+
866
+ # Handle post-click waiting
867
+ if action.no_wait:
868
+ self.logger.debug("no_wait=True, skipping post-click wait")
869
+ return True
870
+ elif action.wait_after_click:
871
+ # Wait for specified element to appear
872
+ try:
873
+ WebDriverWait(
874
+ self.driver,
875
+ action.wait_timeout or self.default_timeout,
876
+ poll_frequency=0.25
877
+ ).until(
878
+ EC.presence_of_element_located(
879
+ (By.CSS_SELECTOR, action.wait_after_click)
880
+ )
881
+ )
882
+ self.logger.debug(f"Post-click element found: {action.wait_after_click}")
883
+ except Exception:
884
+ self.logger.warning(
885
+ f"Post-click wait element not found: {action.wait_after_click}"
886
+ )
887
+ else:
888
+ # Default: small sleep to allow any navigation/JS to start
889
+ time.sleep(0.5)
890
+
891
+ return True
892
+
893
+ await loop.run_in_executor(None, click_sync)
894
+ return True
895
+
896
+ else: # Playwright
897
+ if selector_type == 'xpath':
898
+ # Playwright supports XPath directly
899
+ await self.page.click(f"xpath={selector}", timeout=timeout * 1000)
900
+ elif selector_type == 'text':
901
+ # Playwright has native text selection
902
+ if selector.startswith('='):
903
+ # Exact text match
904
+ text = selector[1:]
905
+ await self.page.click(f"text={text}", timeout=timeout * 1000)
906
+ elif selector.startswith('~'):
907
+ # Case-insensitive (Playwright doesn't have built-in, use XPath)
908
+ text = selector[1:].lower()
909
+ xpath = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text}')]"
910
+ await self.page.click(f"xpath={xpath}", timeout=timeout * 1000)
911
+ else:
912
+ # Contains (partial match)
913
+ await self.page.click(f"text={selector}", timeout=timeout * 1000)
914
+ else:
915
+ # CSS selector
916
+ await self.page.click(selector, timeout=timeout * 1000)
917
+
918
+ # Handle post-click waiting for Playwright
919
+ if action.no_wait:
920
+ self.logger.debug("no_wait=True, skipping post-click wait")
921
+ elif action.wait_after_click:
922
+ try:
923
+ await self.page.wait_for_selector(
924
+ action.wait_after_click,
925
+ timeout=(action.wait_timeout or self.default_timeout) * 1000
926
+ )
927
+ self.logger.debug(f"Post-click element found: {action.wait_after_click}")
928
+ except Exception:
929
+ self.logger.warning(
930
+ f"Post-click wait timed out: {action.wait_after_click}"
931
+ )
932
+
933
+ return True
934
+
935
+ async def _fill_element(
936
+ self,
937
+ selector: Any,
938
+ value: str,
939
+ selector_type: str = 'css',
940
+ clear_first: bool = False,
941
+ press_enter: bool = False
942
+ ) -> bool:
943
+ """Fill an input element"""
944
+ if self.driver_type == 'selenium':
945
+ loop = asyncio.get_running_loop()
946
+ def fill_sync():
947
+ if selector_type == 'xpath':
948
+ by_type = By.XPATH
949
+ locator = selector
950
+ elif selector_type == 'text':
951
+ # Convert text to XPath for form fields
952
+ by_type = By.XPATH
953
+ if selector.startswith('='):
954
+ text = selector[1:]
955
+ # Find input with label containing text
956
+ locator = f"//label[contains(text(), '{text}')]/following-sibling::input | //input[@placeholder='{text}']"
957
+ else:
958
+ locator = f"//label[contains(text(), '{selector}')]/following-sibling::input | //input[@placeholder='{selector}']"
959
+ else:
960
+ by_type = By.CSS_SELECTOR
961
+ locator = selector
962
+ element = WebDriverWait(
963
+ self.driver,
964
+ self.default_timeout,
965
+ poll_frequency=0.25
966
+ ).until(
967
+ EC.presence_of_element_located((by_type, locator))
968
+ )
969
+ if clear_first:
970
+ element.clear()
971
+ element.send_keys(value)
972
+ if press_enter:
973
+ element.send_keys(Keys.ENTER)
974
+ await loop.run_in_executor(None, fill_sync)
975
+ return True
976
+ else: # Playwright
977
+ if selector_type == 'xpath':
978
+ await self.page.fill(f"xpath={selector}", value)
979
+ elif selector_type == 'text':
980
+ # Playwright text selector for inputs
981
+ if selector.startswith('='):
982
+ text = selector[1:]
983
+ await self.page.fill(f"text={text}", value)
984
+ else:
985
+ await self.page.fill(f"text={selector}", value)
986
+ else:
987
+ await self.page.fill(selector, value)
988
+
989
+ if press_enter:
990
+ await self.page.keyboard.press('Enter')
991
+
992
+ return True
993
+
994
+ async def _fill(self, action: Fill):
995
+ """Fill an input element"""
996
+ selector = action.selector
997
+ value = action.value
998
+ clear_first = action.clear_first
999
+ press_enter = action.press_enter
1000
+ selector_type = getattr(action, 'selector_type', 'css')
1001
+ return await self._fill_element(
1002
+ selector, value,
1003
+ selector_type=selector_type,
1004
+ clear_first=clear_first,
1005
+ press_enter=press_enter
1006
+ )
1007
+
1008
+ async def _wait_for_condition(self, action: Wait, timeout: int = 5):
1009
+ """
1010
+ Wait for a specific condition to be met.
1011
+ Handles multiple selectors separated by commas.
1012
+ """
1013
+ condition = action.condition
1014
+ if self.driver_type == 'selenium':
1015
+ loop = asyncio.get_running_loop()
1016
+
1017
+ def wait_sync():
1018
+ # Fail fast if session died
1019
+ try:
1020
+ _ = self.driver.current_url
1021
+ except Exception as e:
1022
+ raise RuntimeError(
1023
+ f"Selenium session not alive: {e}"
1024
+ ) from e
1025
+ if action.condition_type == 'simple':
1026
+ # do a simple wait of N.seconds:
1027
+ time.sleep(int(timeout))
1028
+ return True
1029
+ elif action.condition_type == 'url_contains':
1030
+ WebDriverWait(self.driver, timeout, poll_frequency=0.25).until(
1031
+ EC.url_contains(condition)
1032
+ )
1033
+ self.logger.debug(f"URL contains: {condition}")
1034
+ return True
1035
+ elif action.condition_type == 'url_is':
1036
+ WebDriverWait(self.driver, timeout, poll_frequency=0.25).until(
1037
+ EC.url_to_be(condition)
1038
+ )
1039
+ self.logger.debug(f"URL is: {condition}")
1040
+ return True
1041
+ elif action.condition_type == 'selector':
1042
+ # Check if selector is present.
1043
+ selectors = [s.strip() for s in condition.split(',')]
1044
+ for selector in selectors:
1045
+ try:
1046
+ WebDriverWait(self.driver, timeout, poll_frequency=0.25).until(
1047
+ EC.presence_of_element_located((By.CSS_SELECTOR, selector))
1048
+ )
1049
+ self.logger.debug(f"Element found: {selector}")
1050
+ return True
1051
+ except TimeoutException:
1052
+ if selector == selectors[-1]: # Last selector
1053
+ raise TimeoutException(f"None of the selectors found: {selectors}")
1054
+ continue # Try next selector
1055
+
1056
+ # Handle prefixed conditions
1057
+ if condition.startswith('presence_of_element_located:'):
1058
+ selectors_str = condition.split(':', 1)[1]
1059
+ selectors = [s.strip() for s in selectors_str.split(',')]
1060
+
1061
+ # Try each selector until one works
1062
+ for selector in selectors:
1063
+ try:
1064
+ WebDriverWait(self.driver, timeout, poll_frequency=0.25).until(
1065
+ EC.presence_of_element_located((By.CSS_SELECTOR, selector))
1066
+ )
1067
+ self.logger.debug(f"Element found: {selector}")
1068
+ return True # IMPORTANT: Return immediately when found
1069
+ except TimeoutException:
1070
+ if selector == selectors[-1]: # Last selector
1071
+ raise TimeoutException(f"None of the selectors found: {selectors}")
1072
+ continue # Try next selector
1073
+
1074
+ elif condition.startswith('element_to_be_clickable:'):
1075
+ selectors_str = condition.split(':', 1)[1]
1076
+ selectors = [s.strip() for s in selectors_str.split(',')]
1077
+
1078
+ for selector in selectors:
1079
+ try:
1080
+ WebDriverWait(self.driver, timeout).until(
1081
+ EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
1082
+ )
1083
+ self.logger.debug(f"Clickable element found: {selector}")
1084
+ return True # Return immediately
1085
+ except TimeoutException:
1086
+ if selector == selectors[-1]:
1087
+ raise TimeoutException(f"None of the selectors clickable: {selectors}")
1088
+ continue
1089
+
1090
+ elif condition.startswith('text_to_be_present:'):
1091
+ text = condition.split(':', 1)[1]
1092
+ WebDriverWait(self.driver, timeout, poll_frequency=0.25).until(
1093
+ EC.text_to_be_present_in_element((By.TAG_NAME, "body"), text)
1094
+ )
1095
+ self.logger.debug(f"Text found: {text}")
1096
+ return True # Return immediately
1097
+
1098
+ elif condition.startswith('invisibility_of_element:'):
1099
+ selector = condition.split(':', 1)[1]
1100
+ WebDriverWait(self.driver, timeout).until(
1101
+ EC.invisibility_of_element_located((By.CSS_SELECTOR, selector))
1102
+ )
1103
+ self.logger.debug(f"Element invisible: {selector}")
1104
+ return True # Return immediately
1105
+
1106
+ else:
1107
+ # DEFAULT: Plain CSS selector(s) - use fast JS polling
1108
+ selectors = [s.strip() for s in condition.split(',')]
1109
+ deadline = time.monotonic() + timeout
1110
+ while time.monotonic() < deadline:
1111
+ for selector in selectors:
1112
+ try:
1113
+ count = self.driver.execute_script(
1114
+ "return document.querySelectorAll(arguments[0]).length;",
1115
+ selector
1116
+ )
1117
+ if isinstance(count, int) and count > 0:
1118
+ self.logger.debug(f"Element found via JS: {selector}")
1119
+ return True # Return immediately when found
1120
+ except Exception:
1121
+ pass
1122
+ time.sleep(0.15) # Small delay before retry
1123
+ # Timeout reached
1124
+ raise TimeoutException(f"Timeout waiting for selectors: {selectors}")
1125
+
1126
+ # Execute and return result
1127
+ result = await loop.run_in_executor(None, wait_sync)
1128
+ return result
1129
+
1130
+ else: # Playwright
1131
+ if condition.startswith('presence_of_element_located:'):
1132
+ selectors_str = condition.replace('presence_of_element_located:', '')
1133
+ selectors = [s.strip() for s in selectors_str.split(',')]
1134
+
1135
+ # Try each selector
1136
+ for selector in selectors:
1137
+ try:
1138
+ await self.page.wait_for_selector(selector, timeout=timeout * 1000)
1139
+ self.logger.debug(f"Playwright found: {selector}")
1140
+ return True
1141
+ except Exception:
1142
+ if selector == selectors[-1]:
1143
+ raise
1144
+ continue
1145
+
1146
+ elif condition.startswith('text_to_be_present:'):
1147
+ text = condition.replace('text_to_be_present:', '')
1148
+ await self.page.wait_for_function(
1149
+ f"document.body.textContent.includes('{text}')",
1150
+ timeout=timeout * 1000
1151
+ )
1152
+ return True
1153
+
1154
+ else:
1155
+ # Try multiple selectors if comma-separated
1156
+ selectors = [s.strip() for s in condition.split(',')]
1157
+ for selector in selectors:
1158
+ try:
1159
+ await self.page.wait_for_selector(selector, timeout=timeout * 1000)
1160
+ return True
1161
+ except Exception:
1162
+ if selector == selectors[-1]:
1163
+ raise
1164
+ continue
1165
+
1166
+ return True
1167
+
1168
async def _get_text(self, action: GetText) -> bool:
    """
    Collect the text of elements matching a CSS selector on the current page.

    The page markup is fetched from the active driver, parsed with
    BeautifulSoup, and the outcome is appended to ``self.results`` as a
    ``ScrapingResult`` keyed by ``action.extract_name``.

    Args:
        action: GetText action; ``selector``, ``multiple`` and
            ``extract_name`` are read from it.

    Returns:
        bool: True when the page was processed (also when nothing matched,
        in which case the stored result carries ``success=False``); False
        when an exception occurred, after an error result has been stored.
    """
    try:
        page_url = await self._get_current_url()

        # Selenium access is pushed to the default executor; Playwright is awaited.
        if self.driver_type != 'selenium':
            html = await self.page.content()
        else:
            html = await asyncio.get_running_loop().run_in_executor(
                None, lambda: self.driver.page_source
            )

        parsed = BeautifulSoup(html, 'html.parser')
        matches = parsed.select(action.selector)

        # None marks "nothing matched"; otherwise a list or a single string.
        text_value = None
        if matches:
            if action.multiple:
                text_value = [node.get_text(strip=True) for node in matches]
            else:
                text_value = matches[0].get_text(strip=True)
        else:
            self.logger.warning(f"No elements found for selector: {action.selector}")

        self.results.append(
            ScrapingResult(
                url=page_url,
                content=html,
                bs_soup=parsed,
                extracted_data={action.extract_name: text_value},
                metadata={
                    "selector": action.selector,
                    "multiple": action.multiple,
                    "elements_found": len(matches)
                },
                timestamp=str(time.time()),
                success=text_value is not None
            )
        )
        self.logger.info(
            f"Extracted text from {len(matches)} element(s) using selector: {action.selector}"
        )
        return True

    except Exception as exc:
        self.logger.error(f"GetText action failed: {str(exc)}")
        # Record the failure so callers inspecting self.results can see it.
        has_browser = hasattr(self, 'driver') or hasattr(self, 'page')
        failure_url = await self._get_current_url() if has_browser else ""
        self.results.append(
            ScrapingResult(
                url=failure_url,
                content="",
                bs_soup=BeautifulSoup("", 'html.parser'),
                extracted_data={action.extract_name: None},
                success=False,
                error_message=str(exc),
                timestamp=str(time.time())
            )
        )
        return False
1241
+
1242
+
1243
async def _get_html(self, action: GetHTML, args: dict) -> bool:
    """
    Collect the HTML of elements matching a CSS or XPath selector.

    With ``action.multiple`` set, one ``ScrapingResult`` is appended to
    ``self.results`` per matching element; otherwise a single result holding
    the first match is appended. When nothing matches only a warning is
    logged.

    Args:
        action: GetHTML action; ``selector``, ``multiple``, ``extract_name``
            and the optional ``selector_type`` ('css' by default) are read.
        args: Optional mapping; its ``data`` and ``iteration`` entries are
            copied into the metadata of per-element results.

    Returns:
        bool: True when the page was processed; False when an exception
        occurred, after an error result has been stored.
    """
    try:
        page_url = await self._get_current_url()

        # Selenium access is pushed to the default executor; Playwright is awaited.
        if self.driver_type != 'selenium':
            html = await self.page.content()
        else:
            html = await asyncio.get_running_loop().run_in_executor(
                None, lambda: self.driver.page_source
            )

        document = BeautifulSoup(html, 'html.parser')
        selector_kind = getattr(action, 'selector_type', 'css')

        if selector_kind == 'xpath':
            # lxml evaluates the XPath; each hit is re-parsed so that both
            # selector kinds hand back BeautifulSoup objects.
            xpath_hits = lxml_html.fromstring(html).xpath(action.selector)
            nodes = [
                BeautifulSoup(lxml_html.tostring(hit, encoding='unicode'), 'html.parser')
                for hit in xpath_hits
            ]
        else:
            nodes = document.select(action.selector)

        if not nodes:
            self.logger.warning(f"No elements found for selector: {action.selector}")
        elif action.multiple:
            for node in nodes:
                if isinstance(node, BeautifulSoup):
                    node_soup = node
                else:
                    node_soup = BeautifulSoup(str(node), 'html.parser')
                payload = args.get('data', {}) if args else {}
                self.results.append(
                    ScrapingResult(
                        url=page_url,
                        content=html,
                        bs_soup=node_soup,
                        extracted_data={action.extract_name: str(node)},
                        metadata={
                            "selector": action.selector,
                            "selector_type": selector_kind,
                            "multiple": action.multiple,
                            "iteration": (args or {}).get("iteration"),
                            "data": payload,
                        },
                        timestamp=str(time.time()),
                        success=True
                    )
                )
        else:
            first_html = str(nodes[0])
            self.results.append(
                ScrapingResult(
                    url=page_url,
                    content=html,
                    bs_soup=document,
                    extracted_data={action.extract_name: first_html},
                    metadata={
                        "selector": action.selector,
                        "selector_type": selector_kind,
                        "multiple": action.multiple,
                        "elements_found": len(nodes)
                    },
                    timestamp=str(time.time()),
                    success=first_html is not None
                )
            )

        self.logger.info(
            f"Extracted HTML from {len(nodes)} element(s) using selector: {action.selector}"
        )
        return True

    except Exception as exc:
        self.logger.error(f"GetHTML action failed: {str(exc)}")
        # Record the failure so callers inspecting self.results can see it.
        has_browser = hasattr(self, 'driver') or hasattr(self, 'page')
        failure_url = await self._get_current_url() if has_browser else ""
        self.results.append(
            ScrapingResult(
                url=failure_url,
                content="",
                bs_soup=BeautifulSoup("", 'html.parser'),
                extracted_data={action.extract_name: None},
                success=False,
                error_message=str(exc),
                timestamp=str(time.time())
            )
        )
        return False
1352
+
1353
+
1354
async def _take_screenshot(self, action: Screenshot) -> bool:
    """
    Take a screenshot of the page or of a single element.

    Selenium: the image is captured in the default executor, optionally
    written to ``output_path / action.get_filename()``, and the method
    returns immediately (the base64 string when ``action.return_base64`` is
    set, otherwise True) without appending to ``self.results``.

    Playwright: the image is captured, optionally written to disk, and a
    ``ScrapingResult`` describing it is appended to ``self.results``.

    Args:
        action: Screenshot action; ``output_path``, ``selector``,
            ``full_page``, ``return_base64`` and ``get_filename()`` are used.

    Returns:
        bool: True on success (or a base64 string on the Selenium path when
        requested); False when an exception occurred, after an error result
        has been stored.
    """
    try:
        screenshot_data = None
        output_path = action.output_path
        if isinstance(output_path, str):
            output_path = Path(output_path).resolve()
        screenshot_name = action.get_filename()

        if self.driver_type == 'selenium':
            loop = asyncio.get_running_loop()

            def take_screenshot_sync():
                if action.selector:
                    # Screenshot of a specific element
                    element = self.driver.find_element(By.CSS_SELECTOR, action.selector)
                    return element.screenshot_as_png
                # NOTE: the original issued the same call for both values of
                # action.full_page, so that flag has no effect on this path.
                return self.driver.get_screenshot_as_png()

            screenshot_bytes = await loop.run_in_executor(None, take_screenshot_sync)

            # Only touch the filesystem when a destination was supplied;
            # previously a missing output_path raised AttributeError here.
            if output_path:
                filename = output_path.joinpath(screenshot_name)
                async with aiofiles.open(filename, 'wb') as f:
                    await f.write(screenshot_bytes)
                self.logger.info(f"Screenshot saved to: {filename}")

            if action.return_base64:
                return base64.b64encode(screenshot_bytes).decode('utf-8')

            return True

        else:  # Playwright
            screenshot_options = {}

            if action.full_page:
                screenshot_options['full_page'] = True

            if action.selector:
                # Screenshot of a specific element
                element = self.page.locator(action.selector)
                screenshot_bytes = await element.screenshot(**screenshot_options)
            else:
                # Page screenshot
                screenshot_bytes = await self.page.screenshot(**screenshot_options)

            if output_path:
                # The Selenium branch treats output_path as a directory. Do the
                # same here when it is one, instead of trying to open the
                # directory for writing; a plain file path is still honoured.
                destination = Path(output_path)
                if destination.is_dir():
                    destination = destination.joinpath(screenshot_name)
                # aiofiles keeps the write off the event loop, as on the Selenium path.
                async with aiofiles.open(destination, 'wb') as f:
                    await f.write(screenshot_bytes)
                self.logger.info(f"Screenshot saved to: {destination}")

            if action.return_base64:
                screenshot_data = base64.b64encode(screenshot_bytes).decode('utf-8')
            else:
                screenshot_data = True

        # Create ScrapingResult with screenshot data (Playwright path only;
        # the Selenium branch has already returned).
        current_url = await self._get_current_url()

        result = ScrapingResult(
            url=current_url,
            content="",  # No HTML content for screenshots
            bs_soup=BeautifulSoup("", 'html.parser'),
            extracted_data={
                "screenshot": screenshot_data if action.return_base64 else output_path,
                "screenshot_base64": screenshot_data if action.return_base64 else None
            },
            metadata={
                "selector": action.selector,
                "full_page": action.full_page,
                "output_path": output_path,
                "returned_base64": action.return_base64
            },
            timestamp=str(time.time()),
            success=True
        )

        self.results.append(result)
        self.logger.info(
            f"Screenshot taken: {'element ' + action.selector if action.selector else 'full page'}"
        )

        return True

    except Exception as e:
        self.logger.error(f"Screenshot action failed: {str(e)}")
        # Record the failure so callers inspecting self.results can see it.
        error_result = ScrapingResult(
            url=await self._get_current_url() if hasattr(self, 'driver') or hasattr(self, 'page') else "",
            content="",
            bs_soup=BeautifulSoup("", 'html.parser'),
            extracted_data={"screenshot": None},
            success=False,
            error_message=str(e),
            timestamp=str(time.time())
        )
        self.results.append(error_result)
        return False
1472
+
1473
async def _scroll_page(self, action: Scroll):
    """
    Scroll the page, or a scrollable element, as described by ``action``.

    Selenium precedence (unchanged): a ``direction`` of
    top/bottom/up/down wins, then a bare ``amount``, then scrolling
    ``selector`` into view. Directional scrolls act on the element matched
    by ``selector`` when one is given, otherwise on the window.

    Playwright precedence (unchanged): bottom, top, ``amount``, then
    scrolling ``selector`` into view; "up"/"down" without amount or selector
    now scroll by a default step instead of failing.

    Args:
        action: Scroll action; ``direction``, ``amount``, ``selector`` and
            ``smooth`` are read.
    """
    direction = action.direction
    amount = action.amount
    selector = action.selector
    default_step = 300

    if self.driver_type == 'selenium':
        behavior = 'smooth' if action.smooth else 'auto'
        loop = asyncio.get_running_loop()

        # Values are passed through `arguments` rather than spliced into the
        # script, so selectors containing quotes cannot break the JS.
        # Returns false when a selector was given but matched nothing.
        directional_js = (
            "var sel = arguments[0], dir = arguments[1],"
            " step = arguments[2], behavior = arguments[3];"
            "var el = sel ? document.querySelector(sel) : null;"
            "if (sel && !el) { return false; }"
            "var target = el || window;"
            "if (dir === 'top') {"
            "  target.scrollTo({top: 0, behavior: behavior});"
            "} else if (dir === 'bottom') {"
            # window has no scrollHeight; use the document's scrolling element.
            "  var doc = document.scrollingElement || document.documentElement;"
            "  target.scrollTo({top: (el || doc).scrollHeight, behavior: behavior});"
            "} else {"
            "  target.scrollBy({top: dir === 'up' ? -step : step, behavior: behavior});"
            "}"
            "return true;"
        )

        def scroll_sync():
            if direction in ("top", "bottom", "up", "down"):
                # The original only built and returned the JS text here and
                # never ran it, so directional scrolling silently did nothing.
                found = self.driver.execute_script(
                    directional_js, selector, direction, amount or default_step, behavior
                )
                if found is False:
                    self.logger.warning(
                        f"Element not found for scrolling: {action.selector}"
                    )
            elif amount:
                self.driver.execute_script("window.scrollBy(0, arguments[0]);", amount)
            elif selector:
                # Scroll to element
                try:
                    element = self.driver.find_element(By.CSS_SELECTOR, selector)
                    self.driver.execute_script("arguments[0].scrollIntoView();", element)
                except NoSuchElementException:
                    self.logger.warning(
                        f"Element not found for scrolling: {action.selector}"
                    )

        await loop.run_in_executor(None, scroll_sync)
    else:  # Playwright
        if direction == "bottom":
            await self.page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        elif direction == "top":
            await self.page.evaluate("window.scrollTo(0, 0)")
        elif amount:
            # Honour "up" by negating the offset (previously always scrolled down).
            offset = -amount if direction == "up" else amount
            await self.page.evaluate(f"window.scrollBy(0, {offset})")
        elif selector:
            # Scroll to element
            try:
                await self.page.locator(selector).scroll_into_view_if_needed()
            except Exception:
                # `except Exception` (not bare) so task cancellation still propagates.
                self.logger.warning(f"Element not found for scrolling: {action.selector}")
        elif direction in ("up", "down"):
            offset = -default_step if direction == "up" else default_step
            await self.page.evaluate(f"window.scrollBy(0, {offset})")
        else:
            # Nothing to scroll to; keep the warning the original produced here.
            self.logger.warning(f"Element not found for scrolling: {action.selector}")
1516
+
1517
async def _get_cookies(self, action: GetCookies) -> Dict[str, Any]:
    """
    Read the browser's cookies, optionally narrowed by name and/or domain.

    Args:
        action: GetCookies action; ``names`` (collection of cookie names to
            keep) and ``domain`` (substring that must occur in the cookie's
            domain) are both optional filters.

    Returns:
        A dict with a single ``"cookies"`` key holding the remaining cookies.
    """
    if self.driver_type != 'selenium':
        jar = await self.page.context.cookies()
    else:
        jar = await asyncio.get_running_loop().run_in_executor(
            None, self.driver.get_cookies
        )

    wanted_names = action.names
    wanted_domain = action.domain

    def keep(cookie) -> bool:
        # Name filter first, then domain, mirroring the two-pass original.
        if wanted_names and cookie.get('name') not in wanted_names:
            return False
        if wanted_domain and wanted_domain not in cookie.get('domain', ''):
            return False
        return True

    # Leave the driver's own list untouched when no filter is requested.
    if wanted_names or wanted_domain:
        jar = [cookie for cookie in jar if keep(cookie)]

    self.logger.info(f"Retrieved {len(jar)} cookies")
    return {"cookies": jar}
1535
+
1536
async def _set_cookies(self, action: SetCookies) -> bool:
    """
    Install the cookies listed in ``action.cookies`` into the browser.

    Selenium cookies are added one at a time through the default executor;
    Playwright receives the whole collection in a single call.

    Returns:
        bool: always True once every cookie has been handed to the driver.
    """
    pending = action.cookies

    if self.driver_type != 'selenium':
        await self.page.context.add_cookies(pending)
    else:
        event_loop = asyncio.get_running_loop()
        for entry in pending:
            # run_in_executor forwards positional arguments to the callable.
            await event_loop.run_in_executor(None, self.driver.add_cookie, entry)

    self.logger.info(f"Set {len(action.cookies)} cookies")
    return True
1550
+
1551
async def _handle_authentication(self, action: Authenticate) -> bool:
    """
    Authenticate the browser session.

    Two flows are supported:

    * ``action.method == 'bearer'``: build a header from
      ``action.header_value_format`` / ``action.token`` and register it as
      an extra HTTP header (Selenium via the CDP command
      ``Network.setExtraHTTPHeaders``, Playwright via
      ``page.set_extra_http_headers``).
    * anything else: form login, filling the username and password fields
      and clicking the submit control.

    Args:
        action: Authenticate action describing the method and credentials.

    Returns:
        bool: True when the flow completed, False when it could not be
        started (missing token/credentials or unsupported driver). The form
        flow used to return None in both cases; it now reports a boolean
        like the bearer flow.

    Raises:
        Exception: any error raised while filling or submitting the form is
        logged and re-raised.
    """
    if action.method == 'bearer':
        if not action.token:
            self.logger.error("Bearer token authentication requires a 'token' value.")
            return False

        # Construct the header from the provided format and token
        header_value = action.header_value_format.format(action.token)
        headers = {action.header_name: header_value}

        if self.driver_type == 'selenium':
            # Selenium has no header API of its own; the CDP command is only
            # exposed by drivers that provide execute_cdp_cmd.
            if not hasattr(self.driver, 'execute_cdp_cmd'):
                self.logger.error(
                    "Bearer token injection for Selenium is only supported on Chromium-based browsers."
                )
                return False
            self.logger.info(f"Setting extra HTTP headers for Selenium session: {list(headers.keys())}")
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(
                None,
                lambda: self.driver.execute_cdp_cmd(
                    'Network.setExtraHTTPHeaders', {'headers': headers}
                )
            )

        elif self.driver_type == 'playwright' and PLAYWRIGHT_AVAILABLE:
            self.logger.info(f"Setting extra HTTP headers for Playwright session: {list(headers.keys())}")
            await self.page.set_extra_http_headers(headers)

        else:
            self.logger.error(f"Bearer token authentication is not implemented for driver type: {self.driver_type}")
            return False

        self.logger.info("Bearer token authentication configured. All subsequent requests will include the specified header.")
        return True

    # Form login (the only other flow implemented so far)
    username = action.username
    password = action.password
    username_selector = action.username_selector or '#username'
    password_selector = action.password_selector or '#password'
    submit_selector = action.submit_selector or 'input[type="submit"], button[type="submit"]'

    if not username or not password:
        self.logger.error(
            "Authentication requires username and password"
        )
        return False

    try:
        # Fill username
        await self._fill_element(username_selector, username, press_enter=action.enter_on_username)
        await asyncio.sleep(0.5)

        # Fill password
        await self._fill_element(password_selector, password)
        await asyncio.sleep(0.5)

        # Submit form
        await self._click_element(submit_selector)

        # Fixed pause to let the post-login navigation settle
        await asyncio.sleep(2)

        self.logger.info("Authentication completed")
        return True

    except Exception as e:
        self.logger.error(f"Authentication failed: {str(e)}")
        raise
1622
+
1623
async def _await_browser_event(self, action: AwaitBrowserEvent) -> bool:
    """
    Pause automation until a user triggers a browser-side event.

    Config (put in step.wait_condition or step.target as dict; a plain
    string is treated as the key combo):
    - key_combo: one of ["ctrl_enter", "cmd_enter", "alt_shift_s"] (default: "ctrl_enter")
    - show_overlay_button: bool (default False) -> injects a floating "Resume" button
    - local_storage_key: str (default "__scrapeResume")
    - predicate_js: str (optional) -> JS function body returning a boolean; if true, resume
    - custom_event_name: str (default "scrape-resume") -> dispatching it resumes

    Any of these will resume:
    1) Pressing the configured key combo in the page
    2) Clicking the optional overlay "Resume" button
    3) Dispatching the custom event: window.dispatchEvent(new Event('scrape-resume'))
    4) Setting localStorage[local_storage_key] = "1"
    5) predicate_js evaluates to true

    Returns:
        bool: True once a resume signal was observed.

    Raises:
        TimeoutError: if no signal arrives within ``action.timeout`` seconds
            (300 when unset).
    """
    import json  # only needed here, to embed the config safely in the injected JS

    cfg = action.wait_condition or action.target or {}
    if isinstance(cfg, str):
        cfg = {"key_combo": cfg}

    key_combo = (cfg.get("key_combo") or "ctrl_enter").lower()
    show_overlay = bool(cfg.get("show_overlay_button", False))
    ls_key = cfg.get("local_storage_key", "__scrapeResume")
    predicate_js = cfg.get("predicate_js")  # e.g., "return !!document.querySelector('.dashboard');"
    custom_event = cfg.get("custom_event_name", "scrape-resume")
    timeout = int(action.timeout or 300)

    # The listener is a plain JS function taking its settings as one object.
    # The settings are JSON-encoded, so quotes in a key or event name cannot
    # break out of the script.
    listener_fn = """function(cfg) {
        if (window.__scrapeSignal && window.__scrapeSignal._bound) return 0;
        window.__scrapeSignal = window.__scrapeSignal || { ready:false, _bound:false };
        function signal() {
            try { localStorage.setItem(cfg.lsKey, '1'); } catch(e) {}
            window.__scrapeSignal.ready = true;
            // Remove the button once a signal has been given
            var btn = document.getElementById('__scrapeResumeBtn');
            if (btn) { btn.remove(); }
        }

        // Key combos
        window.addEventListener('keydown', function(e) {
            try {
                var k = cfg.keyCombo;
                if (k === 'ctrl_enter' && (e.ctrlKey || e.metaKey) && e.key === 'Enter') { e.preventDefault(); signal(); }
                else if (k === 'cmd_enter' && e.metaKey && e.key === 'Enter') { e.preventDefault(); signal(); }
                else if (k === 'alt_shift_s' && e.altKey && e.shiftKey && (e.key.toLowerCase() === 's')) { e.preventDefault(); signal(); }
            } catch(_e) {}
        }, true);

        // Custom DOM event
        try {
            window.addEventListener(cfg.customEvent, function() { signal(); }, false);
        } catch(_e) {}

        // Optional overlay button with green background
        if (cfg.showOverlay) {
            try {
                if (!document.getElementById('__scrapeResumeBtn')) {
                    var btn = document.createElement('button');
                    btn.id = '__scrapeResumeBtn';
                    btn.textContent = 'Resume scraping';
                    Object.assign(btn.style, {
                        position: 'fixed',
                        right: '16px',
                        bottom: '16px',
                        zIndex: 2147483647,
                        padding: '10px 14px',
                        fontSize: '14px',
                        borderRadius: '8px',
                        border: 'none',
                        cursor: 'pointer',
                        background: '#10b981',
                        color: '#fff',
                        boxShadow: '0 4px 12px rgba(0,0,0,0.2)'
                    });
                    btn.addEventListener('click', function(e) { e.preventDefault(); signal(); });
                    document.body.appendChild(btn);
                }
            } catch(_e) {}
        }

        window.__scrapeSignal._bound = true;
        return 1;
    }"""
    listener_cfg = json.dumps({
        "lsKey": ls_key,
        "keyCombo": key_combo,
        "customEvent": custom_event,
        "showOverlay": show_overlay,
    })
    inject_call = "(" + listener_fn + ")(" + listener_cfg + ")"

    # Every probe below is best-effort: the page may be navigating while we
    # poll, so a failed probe just means "not ready yet" and is retried.
    def _selenium_ready() -> bool:
        try:
            self.driver.execute_script(inject_call + ";")
        except Exception:
            pass
        if predicate_js:
            try:
                if bool(self.driver.execute_script(predicate_js)):
                    return True
            except Exception:
                pass
        try:
            # localStorage flag
            val = self.driver.execute_script(
                "try{return localStorage.getItem(arguments[0])}catch(e){return null}",
                ls_key,
            )
            if val == "1":
                return True
        except Exception:
            pass
        try:
            # in-memory flag
            ready = self.driver.execute_script(
                "return !!(window.__scrapeSignal && window.__scrapeSignal.ready);"
            )
            if bool(ready):
                return True
        except Exception:
            pass
        return False

    def _selenium_reset() -> None:
        self.driver.execute_script(
            "try{localStorage.removeItem(arguments[0])}catch(e){}", ls_key
        )
        self.driver.execute_script(
            "if(window.__scrapeSignal){window.__scrapeSignal.ready=false}"
        )

    async def _playwright_ready() -> bool:
        # page.evaluate is a coroutine here (this class awaits self.page
        # elsewhere). The original called it un-awaited in a worker thread,
        # and the resulting coroutine object was truthy, so the wait ended
        # immediately. Scripts are wrapped as functions because a bare
        # top-level `return` is not a valid expression for evaluate().
        try:
            await self.page.evaluate("() => " + inject_call)
        except Exception:
            pass
        if predicate_js:
            try:
                if bool(await self.page.evaluate("() => {\n" + predicate_js + "\n}")):
                    return True
            except Exception:
                pass
        try:
            val = await self.page.evaluate(
                "(k) => { try { return localStorage.getItem(k); } catch (e) { return null; } }",
                ls_key,
            )
            if val == "1":
                return True
        except Exception:
            pass
        try:
            ready = await self.page.evaluate(
                "() => !!(window.__scrapeSignal && window.__scrapeSignal.ready)"
            )
            if bool(ready):
                return True
        except Exception:
            pass
        return False

    loop = asyncio.get_running_loop()
    uses_selenium = self.driver_type == 'selenium'
    self.logger.info(
        "🛑 Awaiting browser event: press the configured key combo in the page, click the floating button, dispatch the custom event, or set the localStorage flag to resume."
    )

    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if uses_selenium:
            signalled = await loop.run_in_executor(None, _selenium_ready)
        else:
            signalled = await _playwright_ready()

        if signalled:
            # Clear both flags so future waits don't auto-trigger
            try:
                if uses_selenium:
                    await loop.run_in_executor(None, _selenium_reset)
                else:
                    await self.page.evaluate(
                        "(k) => { try { localStorage.removeItem(k); } catch (e) {}"
                        " if (window.__scrapeSignal) { window.__scrapeSignal.ready = false; } }",
                        ls_key,
                    )
            except Exception:
                pass
            self.logger.info("✅ Browser event received. Resuming automation.")
            return True
        await asyncio.sleep(0.3)

    raise TimeoutError("await_browser_event timed out.")
1795
+
1796
async def _await_human(self, action: AwaitHuman):
    """
    Let a human drive the already-open browser, then resume when a condition is met.

    The condition comes from ``action.condition_type`` + ``action.target``
    ('selector', 'url_contains' or 'title_contains'), or, for any other
    condition type, from a dict in ``action.wait_condition`` /
    ``action.target`` that may contain:
    - selector: CSS selector to appear (presence)
    - url_contains: substring expected in current URL
    - title_contains: substring expected in document.title
    A plain string in that position is treated as a selector. All supplied
    conditions must hold at the same time.

    Returns:
        None. Returns early (after logging an error) when no condition was
        supplied.

    Raises:
        TimeoutError: if the condition is not met within ``action.timeout``
            seconds (300 when unset).
    """
    timeout = int(action.timeout or 300)
    selector = None
    url_contains = None
    title_contains = None

    if action.condition_type == 'selector':
        selector = action.target
    elif action.condition_type == 'url_contains':
        url_contains = action.target
    elif action.condition_type == 'title_contains':
        title_contains = action.target
    else:
        # Default: expect a dict in target or wait_condition
        cond = action.wait_condition or action.target or {}
        if isinstance(cond, str):
            cond = {"selector": cond}
        selector = cond.get("selector")
        # The documented url/title keys were previously ignored here.
        url_contains = cond.get("url_contains")
        title_contains = cond.get("title_contains")

    if not (selector or url_contains or title_contains):
        self.logger.error("await_human requires at least one condition (selector, url_contains, title_contains)")
        return

    loop = asyncio.get_running_loop()

    def _check_selenium() -> bool:
        # Runs in the executor; any driver error counts as "not yet".
        try:
            cur_url = self.driver.current_url
            cur_title = self.driver.title
            if url_contains and (url_contains not in cur_url):
                return False
            if title_contains and (title_contains not in cur_title):
                return False
            if selector:
                count = self.driver.execute_script(
                    "return document.querySelectorAll(arguments[0]).length;", selector
                )
                if int(count) <= 0:
                    return False
            return True
        except Exception:
            return False

    async def _check_playwright() -> bool:
        # Page methods are coroutines here (this class awaits self.page
        # elsewhere). The original called query_selector un-awaited in a
        # worker thread, so the truthy coroutine object satisfied the check
        # on the first poll.
        try:
            if url_contains and (url_contains not in self.page.url):
                return False
            if title_contains and (title_contains not in await self.page.title()):
                return False
            if selector and not await self.page.query_selector(selector):
                return False
            return True
        except Exception:
            return False

    self.logger.info(
        f"🛑 {action.message} in the browser window..."
    )
    self.logger.info(
        "ℹ️ I’ll resume automatically when the expected page/element is present."
    )

    uses_selenium = self.driver_type == 'selenium'
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if uses_selenium:
            ok = await loop.run_in_executor(None, _check_selenium)
        else:
            ok = await _check_playwright()
        if ok:
            self.logger.info(
                "✅ Human step condition satisfied. Resuming automation."
            )
            return
        await asyncio.sleep(0.5)

    raise TimeoutError(
        "await_human timed out waiting for the specified condition."
    )
1884
+
1885
async def _await_keypress(self, action: AwaitKeyPress):
    """
    Block the automation until the operator submits a line on the console.

    Standard input is polled with ``select`` in half-second slices; a line is
    accepted when ``action.key`` is None or equals the stripped input.
    Handy when the page offers no reliable selector to wait on.

    Raises:
        TimeoutError: when no accepted line arrives within ``action.timeout``
            seconds (300 when unset).
    """
    wait_seconds = int(action.timeout or 300)
    wanted = action.key
    banner = action.message or "Press ENTER to continue..."

    self.logger.info(f"🛑 {banner}")

    event_loop = asyncio.get_running_loop()
    began = time.monotonic()

    while True:
        if not (time.monotonic() - began < wait_seconds):
            break

        # select() runs in the executor so the event loop stays responsive.
        readable, _w, _x = await event_loop.run_in_executor(
            None, select.select, [sys.stdin], [], [], 0.5
        )
        if not readable:
            continue

        try:
            typed = sys.stdin.readline().strip()
            if wanted is None or typed == wanted:
                self.logger.info("✅ Continuing after keypress.")
                return
        except Exception:
            # Unreadable input is ignored; keep polling until the deadline.
            pass

    raise TimeoutError("await_keypress timed out.")
1911
+
1912
async def _wait_for_download(self, action: WaitForDownload) -> bool:
    """
    Wait for a file download to complete.

    A directory is watched for a new file whose size is non-zero and stays
    unchanged across a half-second interval. The file can then be moved
    (``action.move_to``), is recorded in ``self.results`` and can finally be
    deleted (``action.delete_after``).

    Args:
        action: WaitForDownload action; ``download_path``, ``timeout``,
            ``filename_pattern``, ``move_to`` and ``delete_after`` are read.
            A ``timeout`` of None falls back to 300 seconds, the fallback
            the other wait helpers in this class use.

    Returns:
        bool: True if a download was detected and recorded; False on
        timeout or on any error (which is logged).
    """
    import shutil  # local import: only needed for the optional move step

    try:
        # Determine download directory
        if action.download_path:
            download_dir = Path(action.download_path)
        elif self.driver_type == 'selenium':
            # Ask the browser first; fall back to the user's Downloads folder
            # if the driver cannot answer.
            try:
                prefs = self.driver.execute_cdp_cmd(
                    'Page.getDownloadInfo', {}
                )
                download_dir = Path(prefs.get('behavior', {}).get('downloadPath', '.'))
            except Exception:
                download_dir = Path.home() / 'Downloads'
        else:  # Playwright
            download_dir = Path.cwd() / 'downloads'

        if not download_dir.exists():
            download_dir.mkdir(parents=True, exist_ok=True)

        self.logger.info(f"Monitoring for downloads in: {download_dir}")

        # Snapshot the directory so only files created afterwards count.
        initial_files = set(download_dir.glob('*'))

        # A None timeout used to raise TypeError in the loop comparison.
        timeout = 300 if action.timeout is None else action.timeout
        # Monotonic clock: immune to wall-clock adjustments during the wait.
        deadline = time.monotonic() + timeout
        downloaded_file = None
        temp_markers = ('.tmp', '.crdownload', '.part', '.download')

        while downloaded_file is None and time.monotonic() < deadline:
            new_files = set(download_dir.glob('*')) - initial_files

            # Filter by pattern if specified
            if action.filename_pattern:
                candidates = [
                    f for f in new_files
                    if f.match(action.filename_pattern)
                ]
            else:
                candidates = list(new_files)

            for file_path in candidates:
                # Skip temporary files browsers use for in-progress downloads
                suffix = file_path.suffix.lower()
                if any(marker in suffix for marker in temp_markers):
                    continue

                # A file whose size is still changing is still being written.
                # Only filesystem errors are skipped (e.g. the file vanished
                # between listing and stat). The previous bare `except:` also
                # swallowed task cancellation raised inside the sleep.
                try:
                    size1 = file_path.stat().st_size
                    await asyncio.sleep(0.5)
                    size2 = file_path.stat().st_size
                except OSError:
                    continue

                if size1 == size2 and size1 > 0:
                    downloaded_file = file_path
                    break

            if downloaded_file is None:
                await asyncio.sleep(1)

        if not downloaded_file:
            self.logger.error(
                f"Download not detected within {timeout} seconds"
            )
            return False

        self.logger.info(f"Download complete: {downloaded_file.name}")

        # Move file if requested
        if action.move_to:
            move_to_path = Path(action.move_to)
            if move_to_path.is_dir():
                final_path = move_to_path / downloaded_file.name
            else:
                final_path = move_to_path

            final_path.parent.mkdir(parents=True, exist_ok=True)
            # shutil.move also works when source and target are on different
            # filesystems, where a plain rename raises OSError.
            shutil.move(str(downloaded_file), str(final_path))
            self.logger.info(f"Moved download to: {final_path}")
            downloaded_file = final_path

        # Store download info in results
        current_url = await self._get_current_url()
        result = ScrapingResult(
            url=current_url,
            content="",
            bs_soup=BeautifulSoup("", 'html.parser'),
            extracted_data={
                "downloaded_file": str(downloaded_file),
                "file_name": downloaded_file.name,
                "file_size": downloaded_file.stat().st_size
            },
            metadata={
                "download_path": str(download_dir),
                "filename_pattern": action.filename_pattern,
                "moved_to": action.move_to
            },
            timestamp=str(time.time()),
            success=True
        )
        self.results.append(result)

        # Delete file if requested
        if action.delete_after:
            downloaded_file.unlink()
            self.logger.info(f"Deleted file: {downloaded_file.name}")

        return True

    except Exception as e:
        self.logger.error(f"WaitForDownload action failed: {str(e)}")
        return False
2044
+
2045
+
2046
+ async def _upload_file(self, action: UploadFile) -> bool:
2047
+ """
2048
+ Upload a file to a file input element.
2049
+
2050
+ Args:
2051
+ action: UploadFile action with file path and selector
2052
+
2053
+ Returns:
2054
+ bool: True if upload successful
2055
+ """
2056
+ try:
2057
+ # Determine file paths
2058
+ if action.multiple_files and action.file_paths:
2059
+ file_paths = [Path(fp).resolve() for fp in action.file_paths]
2060
+ else:
2061
+ file_paths = [Path(action.file_path).resolve()]
2062
+
2063
+ # Verify files exist
2064
+ for file_path in file_paths:
2065
+ if not file_path.exists():
2066
+ self.logger.error(f"File not found: {file_path}")
2067
+ return False
2068
+
2069
+ self.logger.info(f"Uploading {len(file_paths)} file(s)")
2070
+
2071
+ if self.driver_type == 'selenium':
2072
+ loop = asyncio.get_running_loop()
2073
+
2074
+ def upload_sync():
2075
+ # Find the file input element
2076
+ file_input = WebDriverWait(
2077
+ self.driver,
2078
+ action.timeout or self.default_timeout
2079
+ ).until(
2080
+ EC.presence_of_element_located(
2081
+ (By.CSS_SELECTOR, action.selector)
2082
+ )
2083
+ )
2084
+
2085
+ # Send file paths to input
2086
+ if len(file_paths) == 1:
2087
+ file_input.send_keys(str(file_paths[0]))
2088
+ else:
2089
+ # Multiple files - join with newline
2090
+ file_input.send_keys('\n'.join(str(fp) for fp in file_paths))
2091
+
2092
+ self.logger.info("File(s) uploaded successfully")
2093
+
2094
+ # Wait for post-upload element if specified
2095
+ if action.wait_after_upload:
2096
+ try:
2097
+ WebDriverWait(
2098
+ self.driver,
2099
+ action.wait_timeout
2100
+ ).until(
2101
+ EC.presence_of_element_located(
2102
+ (By.CSS_SELECTOR, action.wait_after_upload)
2103
+ )
2104
+ )
2105
+ self.logger.info(
2106
+ f"Post-upload element found: {action.wait_after_upload}"
2107
+ )
2108
+ except Exception as e:
2109
+ self.logger.warning(
2110
+ f"Post-upload wait timed out: {action.wait_after_upload}"
2111
+ )
2112
+
2113
+ await loop.run_in_executor(None, upload_sync)
2114
+
2115
+ else: # Playwright
2116
+ # For Playwright, set the files directly
2117
+ if len(file_paths) == 1:
2118
+ await self.page.set_input_files(action.selector, str(file_paths[0]))
2119
+ else:
2120
+ await self.page.set_input_files(
2121
+ action.selector,
2122
+ [str(fp) for fp in file_paths]
2123
+ )
2124
+
2125
+ self.logger.info("File(s) uploaded successfully")
2126
+
2127
+ # Wait for post-upload element if specified
2128
+ if action.wait_after_upload:
2129
+ try:
2130
+ await self.page.wait_for_selector(
2131
+ action.wait_after_upload,
2132
+ timeout=action.wait_timeout * 1000
2133
+ )
2134
+ self.logger.info(
2135
+ f"Post-upload element found: {action.wait_after_upload}"
2136
+ )
2137
+ except Exception:
2138
+ self.logger.warning(
2139
+ f"Post-upload wait timed out: {action.wait_after_upload}"
2140
+ )
2141
+
2142
+ # Store upload info in results
2143
+ current_url = await self._get_current_url()
2144
+ result = ScrapingResult(
2145
+ url=current_url,
2146
+ content="",
2147
+ bs_soup=BeautifulSoup("", 'html.parser'),
2148
+ extracted_data={
2149
+ "uploaded_files": [fp.name for fp in file_paths],
2150
+ "file_count": len(file_paths)
2151
+ },
2152
+ metadata={
2153
+ "selector": action.selector,
2154
+ "file_paths": [str(fp) for fp in file_paths],
2155
+ "multiple_files": action.multiple_files
2156
+ },
2157
+ timestamp=str(time.time()),
2158
+ success=True
2159
+ )
2160
+ self.results.append(result)
2161
+
2162
+ return True
2163
+
2164
+ except Exception as e:
2165
+ self.logger.error(f"UploadFile action failed: {str(e)}")
2166
+ return False
2167
+
2168
+ async def _exec_conditional(
2169
+ self,
2170
+ action: Conditional,
2171
+ base_url: str = "",
2172
+ args: Optional[dict] = None
2173
+ ) -> bool:
2174
+ """Handle Conditional action - execute actions based on a condition."""
2175
+
2176
+ CONDITION_TYPES = {
2177
+ 'exists': lambda element, expected: element is not None,
2178
+ 'not_exists': lambda element, expected: element is None,
2179
+ 'text_contains': lambda element, expected: expected in (element.text if element else ''),
2180
+ 'text_equals': lambda element, expected: (element.text if element else '') == expected,
2181
+ 'attribute_equals': lambda element, expected: element.get_attribute(expected['attr']) == expected['value'] if element else False,
2182
+ }
2183
+
2184
+ target = action.target
2185
+ target_type = action.target_type or 'css'
2186
+ condition_type = action.condition_type
2187
+ expected_value = action.expected_value
2188
+ timeout = action.timeout or 5
2189
+
2190
+ self.logger.info(
2191
+ f"Evaluating conditional: {condition_type} on {target_type}='{target}' with value '*{expected_value}*'"
2192
+ )
2193
+
2194
+ # Find the element
2195
+ element = None
2196
+ if self.driver_type == 'selenium':
2197
+ loop = asyncio.get_running_loop()
2198
+
2199
+ def find_element_sync():
2200
+ try:
2201
+ # Determine locator type
2202
+ if target_type == 'xpath':
2203
+ by_type = By.XPATH
2204
+ else: # css
2205
+ by_type = By.CSS_SELECTOR
2206
+
2207
+ # Try to find element with timeout
2208
+ try:
2209
+ el = WebDriverWait(
2210
+ self.driver,
2211
+ timeout,
2212
+ poll_frequency=0.25
2213
+ ).until(
2214
+ EC.presence_of_element_located((by_type, target))
2215
+ )
2216
+ return el
2217
+ except (TimeoutException, NoSuchElementException):
2218
+ return None
2219
+ except Exception as e:
2220
+ self.logger.debug(f"Error finding element: {str(e)}")
2221
+ return None
2222
+
2223
+ element = await loop.run_in_executor(None, find_element_sync)
2224
+
2225
+ else: # Playwright
2226
+ try:
2227
+ if target_type == 'xpath':
2228
+ selector = f"xpath={target}"
2229
+ else:
2230
+ selector = target
2231
+
2232
+ element = await self.page.wait_for_selector(
2233
+ selector,
2234
+ timeout=timeout * 1000,
2235
+ state='attached'
2236
+ )
2237
+ except Exception:
2238
+ element = None
2239
+
2240
+ # Evaluate condition
2241
+ condition_func = CONDITION_TYPES.get(condition_type)
2242
+ if not condition_func:
2243
+ self.logger.error(f"Unknown condition type: {condition_type}")
2244
+ return False
2245
+
2246
+ # For attribute_equals, expected_value should be a dict
2247
+ if condition_type == 'attribute_equals' and isinstance(expected_value, str):
2248
+ # Try to parse as "attr=value"
2249
+ if '=' in expected_value:
2250
+ attr, val = expected_value.split('=', 1)
2251
+ expected_value = {'attr': attr.strip(), 'value': val.strip()}
2252
+
2253
+ try:
2254
+ condition_result = condition_func(element, expected_value)
2255
+ except Exception as e:
2256
+ self.logger.error(f"Error evaluating condition: {str(e)}")
2257
+ condition_result = False
2258
+
2259
+ self.logger.notice(
2260
+ f"Condition result: {condition_result}"
2261
+ )
2262
+
2263
+ # Determine which actions to execute
2264
+ actions_to_execute = (
2265
+ action.actions_if_true if condition_result
2266
+ else (action.actions_if_false or [])
2267
+ )
2268
+
2269
+ if not actions_to_execute:
2270
+ self.logger.info(
2271
+ f"No actions to execute for condition result: {condition_result}"
2272
+ )
2273
+ return True
2274
+
2275
+ self.logger.info(
2276
+ f"Executing {len(actions_to_execute)} action(s) based on condition result"
2277
+ )
2278
+
2279
+ # Execute the actions
2280
+ all_success = True
2281
+ for sub_action in actions_to_execute:
2282
+ step = ScrapingStep(action=sub_action)
2283
+ success = await self._execute_step(step, base_url, args)
2284
+
2285
+ if not success:
2286
+ self.logger.warning(
2287
+ f"Conditional sub-action failed: {sub_action.description}"
2288
+ )
2289
+ all_success = False
2290
+ # Continue executing remaining actions even if one fails
2291
+
2292
+ return all_success
2293
+
2294
+ async def _exec_loop(self, action: Loop, base_url: str) -> bool:
2295
+ """Handle Loop action - execute actions repeatedly.
2296
+
2297
+ Supports:
2298
+ - Fixed iterations
2299
+ - Iterating over a list of values
2300
+ - Template variable substitution
2301
+
2302
+ Template Variables:
2303
+ - {i}, {index}, {iteration} - Current iteration number
2304
+ - {i+1} - 1-based iteration (useful for page numbers)
2305
+ - {i-1}, {i*2}, etc. - Arithmetic expressions
2306
+ - {value} - Current value from values list
2307
+
2308
+ Example:
2309
+ Loop with iterations=3, start_index=1:
2310
+ - First iteration: {i} -> 1, {i+1} -> 2
2311
+ - Second iteration: {i} -> 2, {i+1} -> 3
2312
+ - Third iteration: {i} -> 3, {i+1} -> 4
2313
+ """
2314
+ iteration = 0
2315
+ start_index = action.start_index
2316
+ value_name = action.value_name
2317
+
2318
+ if action.values:
2319
+ max_iter = len(action.values)
2320
+ self.logger.info(
2321
+ f"Starting loop over {max_iter} values, start_index={start_index}"
2322
+ )
2323
+ else:
2324
+ max_iter = action.iterations or action.max_iterations
2325
+ self.logger.info(
2326
+ f"Starting loop: {max_iter} iterations, start_index={start_index}"
2327
+ )
2328
+
2329
+ while iteration < max_iter:
2330
+ display_index = start_index + iteration
2331
+ # Get current value if iterating over values
2332
+ current_value = action.values[iteration] if action.values else None
2333
+
2334
+ # Check condition if provided
2335
+ if action.condition:
2336
+ should_continue = await self._evaluate_condition(action.condition)
2337
+ if not should_continue:
2338
+ break
2339
+
2340
+ # Execute all actions in the loop
2341
+ for loop_action in action.actions:
2342
+ # Substitute template variables in the action
2343
+ if action.do_replace:
2344
+ sub_action = self._substitute_action_vars(
2345
+ loop_action,
2346
+ iteration,
2347
+ start_index,
2348
+ current_value
2349
+ )
2350
+ else:
2351
+ sub_action = loop_action
2352
+
2353
+ step = ScrapingStep(action=sub_action)
2354
+ args = {
2355
+ "iteration": iteration,
2356
+ "data": {
2357
+ "index": display_index,
2358
+ value_name: current_value
2359
+ }
2360
+ }
2361
+ success = await self._execute_step(step, base_url, args)
2362
+
2363
+ if not success and action.break_on_error:
2364
+ self.logger.warning(f"Loop stopped at iteration {iteration} due to error")
2365
+ return False
2366
+
2367
+ iteration += 1
2368
+
2369
+ # Break if we've reached specified iterations
2370
+ if action.iterations and iteration >= action.iterations:
2371
+ break
2372
+ # do a small delay (random) between iterations
2373
+ await asyncio.sleep(random.uniform(0.1, 0.5))
2374
+
2375
+ self.logger.info(f"Loop completed {iteration} iterations")
2376
+ return True
2377
+
2378
+ async def _evaluate_condition(self, condition: str) -> bool:
2379
+ """Evaluate a JavaScript condition"""
2380
+ if self.driver_type == 'selenium':
2381
+ loop = asyncio.get_running_loop()
2382
+ result = await loop.run_in_executor(
2383
+ None,
2384
+ lambda: self.driver.execute_script(f"return Boolean({condition})")
2385
+ )
2386
+ else: # Playwright
2387
+ result = await self.page.evaluate(f"() => Boolean({condition})")
2388
+
2389
+ return bool(result)
2390
+
2391
async def _extract_content(
    self,
    url: str,
    selectors: List[ScrapingSelector]
) -> ScrapingResult:
    """Extract content from the current page for each given selector.

    Args:
        url: URL recorded on the returned result.
        selectors: Selector configurations; each result is stored under the
            selector's ``name``.

    Returns:
        ScrapingResult holding the page source, its parsed soup and the
        per-selector data (``None`` for a selector whose extraction raised).
    """
    # Fetch the page source from whichever driver is active
    if self.driver_type == 'selenium':
        html = await asyncio.get_running_loop().run_in_executor(
            None, lambda: self.driver.page_source
        )
    else:  # Playwright
        html = await self.page.content()

    soup = BeautifulSoup(html, 'html.parser')

    # One entry per selector; a failing selector yields None instead of
    # aborting the whole extraction
    harvested = {}
    for spec in selectors:
        try:
            harvested[spec.name] = await self._extract_by_selector(soup, spec)
        except Exception as e:
            self.logger.warning(f"Failed to extract {spec.name}: {str(e)}")
            harvested[spec.name] = None

    return ScrapingResult(
        url=url,
        content=html,
        bs_soup=soup,
        extracted_data=harvested,
        timestamp=str(time.time())
    )
2424
+
2425
async def _extract_full_content(self, url: str) -> ScrapingResult:
    """Extract full page content when no selectors are provided.

    Args:
        url: URL recorded on the returned result.

    Returns:
        ScrapingResult whose ``extracted_data`` contains ``title``,
        ``body_text``, ``links`` (href values of anchors) and ``images``
        (src values of img tags).
    """
    # Get page source
    if self.driver_type == 'selenium':
        loop = asyncio.get_running_loop()
        page_source = await loop.run_in_executor(None, lambda: self.driver.page_source)
    else:  # Playwright
        page_source = await self.page.content()

    # Parse with BeautifulSoup
    soup = BeautifulSoup(page_source, 'html.parser')

    # Tag.string is None for an empty <title> or one with several children;
    # fall back to "" so the title is always a string
    title = (soup.title.string or "") if soup.title else ""

    # Extract basic page information
    extracted_data = {
        "title": title,
        "body_text": soup.get_text(strip=True),
        "links": [a.get('href') for a in soup.find_all('a', href=True)],
        "images": [img.get('src') for img in soup.find_all('img', src=True)]
    }

    return ScrapingResult(
        url=url,
        content=page_source,
        bs_soup=soup,
        extracted_data=extracted_data,
        timestamp=str(time.time())
    )
2452
+
2453
+ async def _extract_by_selector(
2454
+ self,
2455
+ soup: BeautifulSoup,
2456
+ selector_config: ScrapingSelector
2457
+ ) -> Union[str, List[str], Dict[str, Any]]:
2458
+ """Extract content using a specific selector configuration"""
2459
+ if selector_config.selector_type == 'css':
2460
+ elements = soup.select(selector_config.selector)
2461
+ elif selector_config.selector_type == 'xpath':
2462
+ # BeautifulSoup doesn't support XPath, you'd need lxml here
2463
+ # For now, fallback to CSS
2464
+ elements = soup.select(selector_config.selector)
2465
+ else: # tag
2466
+ elements = soup.find_all(selector_config.selector)
2467
+
2468
+ if not elements:
2469
+ return None if not selector_config.multiple else []
2470
+
2471
+ # Extract content based on type
2472
+ extracted = []
2473
+ for element in elements:
2474
+ if selector_config.extract_type == 'text':
2475
+ content = element.get_text(strip=True)
2476
+ elif selector_config.extract_type == 'html':
2477
+ content = str(element)
2478
+ elif selector_config.extract_type == 'attribute':
2479
+ content = element.get(selector_config.attribute, '')
2480
+ else:
2481
+ content = element.get_text(strip=True)
2482
+
2483
+ extracted.append(content)
2484
+
2485
+ return extracted if selector_config.multiple else extracted[0] if extracted else None
2486
+
2487
+ async def _get_current_url(self) -> str:
2488
+ """Get current page URL"""
2489
+ if self.driver_type == 'selenium':
2490
+ loop = asyncio.get_running_loop()
2491
+ return await loop.run_in_executor(None, lambda: self.driver.current_url)
2492
+ else: # Playwright
2493
+ return self.page.url
2494
+
2495
async def cleanup(self):
    """Release the browser: quit the Selenium driver or close the browser.

    Errors are logged and swallowed so cleanup never raises.
    """
    try:
        if self.driver_type == 'selenium' and self.driver:
            # driver.quit blocks, so it runs in the default executor
            await asyncio.get_running_loop().run_in_executor(
                None, self.driver.quit
            )
        elif self.browser:
            await self.browser.close()
    except Exception as e:
        self.logger.error(f"Cleanup failed: {str(e)}")
2505
+
2506
def get_tool_schema(self) -> Dict[str, Any]:
    """
    Define the tool schema for LLM interaction.

    Returns a function-calling schema named ``web_scraping_tool`` whose
    parameters document every available step action and its fields, the
    legacy content selectors, the base URL and browser overrides.
    """

    def field(kind, description, *, enum=None, items=None):
        # Key order mirrors the hand-written layout: type, enum,
        # description, items
        spec = {"type": kind}
        if enum is not None:
            spec["enum"] = enum
        spec["description"] = description
        if items is not None:
            spec["items"] = items
        return spec

    def text(description, enum=None):
        return field("string", description, enum=enum)

    def flag(description):
        return field("boolean", description)

    def number(description):
        return field("integer", description)

    def text_list(description):
        return field("array", description, items={"type": "string"})

    action_names = [
        "navigate",
        "click",
        "fill",
        "evaluate",
        "press_key",
        "refresh",
        "back",
        "scroll",
        "get_cookies",
        "set_cookies",
        "wait",
        "authenticate",
        "await_human",
        "await_keypress",
        "await_browser_event",
        "loop",
        "get_text",
        "get_html",
        "screenshot",
        "wait_for_download",
        "upload_file"
    ]

    cookie_item = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "value": {"type": "string"},
            "domain": {"type": "string"},
            "path": {"type": "string"},
            "secure": {"type": "boolean"},
            "httpOnly": {"type": "boolean"}
        }
    }

    step_properties = {
        "action": text("Type of action to perform", enum=action_names),
        "description": text("Human-readable description of what this action does"),
        "timeout": number("Maximum time to wait for action completion (seconds)"),

        # Navigate action
        "url": text("URL to navigate to (for 'navigate' action)"),

        # Click action
        "selector": text("CSS selector for element (for 'click', 'fill', 'get_text', 'get_html', 'screenshot', 'upload_file' actions)"),
        "click_type": text("Type of click (for 'click' action)", enum=["single", "double", "right"]),
        "wait_after_click": text("CSS selector of element to wait for after clicking (for 'click' action)"),
        "wait_timeout": number("Timeout for post-click wait in seconds (for 'click' action)"),
        "no_wait": flag("Skip waiting after click (for 'click' action)"),

        # Fill action
        "value": text("Text value to enter (for 'fill' action)"),
        "clear_first": flag("Clear existing content before filling (for 'fill' action)"),
        "press_enter": flag("Press Enter after filling (for 'fill' action)"),

        # Evaluate action
        "script": text("JavaScript code to execute (for 'evaluate' action)"),
        "script_file": text("Path to JavaScript file to execute (for 'evaluate' action)"),
        "args": text_list("Arguments to pass to script (for 'evaluate' action)"),
        "return_value": flag("Whether to return script result (for 'evaluate' action)"),

        # PressKey action
        "keys": text_list("Keys to press, e.g., ['Tab', 'Enter'] (for 'press_key' action)"),
        "sequential": flag("Press keys sequentially vs as combination (for 'press_key' action)"),
        "target": text("CSS selector to focus before pressing keys (for 'press_key' action)"),

        # Refresh action
        "hard": flag("Perform hard refresh clearing cache (for 'refresh' action)"),

        # Back action
        "steps": number("Number of steps to go back in history (for 'back' action)"),

        # Scroll action
        "direction": text("Scroll direction (for 'scroll' action)", enum=["up", "down", "top", "bottom"]),
        "amount": number("Pixels to scroll (for 'scroll' action)"),
        "smooth": flag("Use smooth scrolling animation (for 'scroll' action)"),

        # GetCookies action
        "names": text_list("Specific cookie names to retrieve (for 'get_cookies' action)"),
        "domain": text("Filter cookies by domain (for 'get_cookies' action)"),

        # SetCookies action
        "cookies": field(
            "array",
            "List of cookie objects to set (for 'set_cookies' action)",
            items=cookie_item
        ),

        # Wait action
        "condition": text("Condition value - CSS selector, URL substring, etc. (for 'wait' action)"),
        "condition_type": text(
            "Type of condition to wait for (for 'wait' action)",
            enum=["selector", "url_contains", "title_contains", "custom"]
        ),
        "custom_script": text("JavaScript returning boolean for custom wait (for 'wait' action)"),

        # Authenticate action
        "method": text(
            "Authentication method (for 'authenticate' action)",
            enum=["form", "basic", "oauth", "custom"]
        ),
        "username": text("Username or email (for 'authenticate' action)"),
        "enter_on_username": flag("Press Enter after filling username (for multi-step logins, 'authenticate' action)"),
        "password": text("Password (for 'authenticate' action)"),
        "username_selector": text("CSS selector for username field (for 'authenticate' action)"),
        "password_selector": text("CSS selector for password field (for 'authenticate' action)"),
        "submit_selector": text("CSS selector for submit button (for 'authenticate' action)"),

        # AwaitHuman action
        "message": text("Message to display while waiting (for 'await_human', 'await_keypress' actions)"),

        # AwaitKeyPress action
        "expected_key": text("Specific key to wait for (for 'await_keypress' action)"),

        # AwaitBrowserEvent action
        "wait_condition": field("object", "Condition configuration for browser event (for 'await_browser_event' action)"),

        # Loop action
        "actions": field(
            "array",
            "List of actions to repeat (for 'loop' action)",
            items={"type": "object"}
        ),
        "iterations": number("Number of times to repeat (for 'loop' action)"),
        "break_on_error": flag("Stop loop if action fails (for 'loop' action)"),
        "max_iterations": number("Safety limit for condition-based loops (for 'loop' action)"),

        # GetText action
        "multiple": flag("Extract from all matching elements (for 'get_text', 'get_html' actions)"),
        "extract_name": text("Name for extracted data in results (for 'get_text', 'get_html' actions)"),

        # Screenshot action
        "full_page": flag("Capture full scrollable page (for 'screenshot' action)"),
        "output_path": text("File path to save screenshot (for 'screenshot' action)"),
        "return_base64": flag("Return screenshot as base64 (for 'screenshot' action)"),

        # WaitForDownload action
        "filename_pattern": text("Filename pattern to match, e.g., '*.pdf' (for 'wait_for_download' action)"),
        "download_path": text("Directory to monitor for downloads (for 'wait_for_download' action)"),
        "move_to": text("Path to move downloaded file (for 'wait_for_download' action)"),
        "delete_after": flag("Delete file after detection (for 'wait_for_download' action)"),

        # UploadFile action
        "file_path": text("Path to file to upload (for 'upload_file' action)"),
        "wait_after_upload": text("CSS selector to wait for after upload (for 'upload_file' action)"),
        "multiple_files": flag("Whether uploading multiple files (for 'upload_file' action)"),
        "file_paths": text_list("List of file paths for multiple uploads (for 'upload_file' action)")
    }

    selector_properties = {
        "name": text("Friendly name for the extracted content"),
        "selector": text("CSS selector for the content"),
        "selector_type": text("Type of selector", enum=["css", "xpath", "tag"]),
        "extract_type": text("What to extract from matched elements", enum=["text", "html", "attribute"]),
        "attribute": text("Attribute name (when extract_type is 'attribute')"),
        "multiple": flag("Extract from all matching elements")
    }

    browser_properties = {
        "browser": text("Browser to use", enum=["chrome", "firefox", "edge", "safari", "undetected"]),
        "headless": flag("Run browser in headless mode"),
        "mobile": flag("Emulate mobile device"),
        "mobile_device": text("Specific mobile device to emulate")
    }

    tool_description = """Execute automated web scraping with step-by-step navigation and content extraction.
Supports navigation, interaction, authentication, content extraction, screenshots, file uploads, and download monitoring.
Works with both Selenium and Playwright drivers."""

    return {
        "type": "function",
        "function": {
            "name": "web_scraping_tool",
            "description": tool_description,
            "parameters": {
                "type": "object",
                "properties": {
                    "steps": field(
                        "array",
                        "List of navigation and interaction steps to execute in sequence",
                        items={
                            "type": "object",
                            "required": ["action"],
                            "properties": step_properties
                        }
                    ),
                    "selectors": field(
                        "array",
                        "Content selectors for extraction (legacy - prefer using get_text/get_html actions)",
                        items={
                            "type": "object",
                            "required": ["name", "selector"],
                            "properties": selector_properties
                        }
                    ),
                    "base_url": text("Base URL for resolving relative links"),
                    "browser_config": {
                        "type": "object",
                        "description": "Browser configuration overrides",
                        "properties": browser_properties
                    }
                },
                "required": ["steps"]
            }
        }
    }
2910
+
2911
+ def _substitute_template_vars(
2912
+ self,
2913
+ value: Any,
2914
+ iteration: int,
2915
+ start_index: int = 0,
2916
+ current_value: Any = None
2917
+ ) -> Any:
2918
+ """
2919
+ Recursively substitute template variables in strings.
2920
+
2921
+ Supported variables:
2922
+ - {i}, {index}, {iteration} - Current iteration (0-based by default)
2923
+ - {i+1}, {index+1}, {iteration+1} - Current iteration + 1 (1-based)
2924
+ - {i-1}, {index-1} - Current iteration - 1
2925
+ - {value} - Current value from values list (if provided)
2926
+ - Any arithmetic expression: {i*2}, {i+5}, etc.
2927
+
2928
+ Args:
2929
+ value: Value to substitute (can be str, dict, list, or other)
2930
+ iteration: Current iteration number (internal, 0-based counter)
2931
+ start_index: Starting index for display (default 0)
2932
+ current_value: Current value from the values list (if iterating over values)
2933
+
2934
+ Returns:
2935
+ Value with substituted variables
2936
+ """
2937
+ if isinstance(value, str):
2938
+ # Actual index to expose to user (respects start_index)
2939
+ actual_index = start_index + iteration
2940
+
2941
+ # Replace simple variables first
2942
+ value = value.replace('{i}', str(actual_index))
2943
+ value = value.replace('{index}', str(actual_index))
2944
+ value = value.replace('{iteration}', str(actual_index))
2945
+
2946
+ # Replace {value} with current value from list
2947
+ if current_value is not None:
2948
+ value = value.replace('{value}', str(current_value))
2949
+
2950
+ # Handle arithmetic expressions like {i+1}, {i-1}, {i*2}, etc.
2951
+ def eval_expr(match):
2952
+ expr = match.group(1)
2953
+ # Replace variable names with actual value
2954
+ expr = expr.replace('i', str(actual_index))
2955
+ expr = expr.replace('index', str(actual_index))
2956
+ expr = expr.replace('iteration', str(actual_index))
2957
+ try:
2958
+ # Safe evaluation of arithmetic
2959
+ result = eval(expr, {"__builtins__": {}}, {})
2960
+ return str(result)
2961
+ except:
2962
+ # If evaluation fails, return original
2963
+ return match.group(0)
2964
+
2965
+ # Pattern to match {expression} where expression contains i/index/iteration
2966
+ pattern = r'\{([^}]*(?:i|index|iteration)[^}]*)\}'
2967
+ value = re.sub(pattern, eval_expr, value)
2968
+
2969
+ return value
2970
+
2971
+ elif isinstance(value, dict):
2972
+ return {k: self._substitute_template_vars(v, iteration, start_index, current_value) for k, v in value.items()}
2973
+
2974
+ elif isinstance(value, list):
2975
+ return [self._substitute_template_vars(item, iteration, start_index, current_value) for item in value]
2976
+ else:
2977
+ # Return as-is for other types (int, bool, None, etc.)
2978
+ return value
2979
+
2980
+ def _substitute_action_vars(
2981
+ self,
2982
+ action: BrowserAction,
2983
+ iteration: int,
2984
+ start_index: int = 0,
2985
+ current_value: Any = None
2986
+ ) -> BrowserAction:
2987
+ """
2988
+ Create a copy of the action with template variables substituted.
2989
+
2990
+ Args:
2991
+ action: Original action
2992
+ iteration: Current iteration number (0-based internally)
2993
+ start_index: Starting index for display
2994
+ current_value: Current value from values list (if provided)
2995
+
2996
+ Returns:
2997
+ New action instance with substituted values
2998
+ """
2999
+ # Get the action as a dictionary
3000
+ action_dict = action.model_dump()
3001
+
3002
+ # Substitute variables in all string fields
3003
+ substituted_dict = self._substitute_template_vars(
3004
+ action_dict,
3005
+ iteration,
3006
+ start_index,
3007
+ current_value
3008
+ )
3009
+
3010
+ # Create new action instance from substituted dict
3011
+ action_class = type(action)
3012
+ return action_class(**substituted_dict)
3013
+
3014
+ def _collect_cookies(self) -> Dict[str, str]:
3015
+ if not self.driver:
3016
+ raise RuntimeError(
3017
+ "Selenium driver not available after scraping flow"
3018
+ )
3019
+ cookies: Dict[str, str] = {}
3020
+ with contextlib.suppress(Exception):
3021
+ cookies = self.driver.execute_cdp_cmd("Network.getAllCookies", {})["cookies"]
3022
+ if not cookies:
3023
+ for cookie in self.driver.get_cookies():
3024
+ name = cookie.get("name")
3025
+ if name:
3026
+ cookies[name] = cookie.get("value", "")
3027
+ return cookies
3028
+
3029
+ def _extract_headers(self) -> Dict[str, str]:
3030
+ headers: Dict[str, str] = {}
3031
+ if not self.driver:
3032
+ return headers
3033
+
3034
+ # for Selenium Wire, this path:
3035
+ try:
3036
+ for req in self.driver.requests:
3037
+ for key, value in req.headers.items():
3038
+ headers[key] = value
3039
+ return headers
3040
+ except Exception:
3041
+ pass
3042
+
3043
+ try:
3044
+ performance_logs = self.driver.get_log("performance")
3045
+ except Exception:
3046
+ performance_logs = []
3047
+
3048
+ for entry in reversed(performance_logs):
3049
+ try:
3050
+ message = json.loads(entry.get("message", "{}"))
3051
+ log = message.get("message", {})
3052
+ if log.get("method") != "Network.requestWillBeSent":
3053
+ continue
3054
+ req_headers = log.get("params", {}).get("request", {}).get("headers", {})
3055
+ for key, value in req_headers.items():
3056
+ if key not in headers:
3057
+ headers[key] = value
3058
+ except (ValueError, TypeError):
3059
+ continue
3060
+
3061
+ return headers
3062
+
3063
+ def _extract_authorization(self) -> Optional[str]:
3064
+ if not self.driver:
3065
+ return None
3066
+
3067
+ # Check first if Authorization is in headers:
3068
+ if 'Authorization' in self.extracted_headers:
3069
+ return self.extracted_headers['Authorization']
3070
+ if 'authorization' in self.extracted_headers:
3071
+ return self.extracted_headers['authorization']
3072
+
3073
+ # Attempt to capture from performance logs first
3074
+ try:
3075
+ self.driver.execute_cdp_cmd("Network.enable", {})
3076
+ except Exception: # pragma: no cover - command may not exist
3077
+ pass
3078
+
3079
+ try:
3080
+ performance_logs = self.driver.get_log("performance")
3081
+ except Exception:
3082
+ performance_logs = []
3083
+
3084
+ for entry in reversed(performance_logs):
3085
+ try:
3086
+ message = json.loads(entry.get("message", "{}"))
3087
+ log = message.get("message", {})
3088
+ if log.get("method") != "Network.requestWillBeSent":
3089
+ continue
3090
+ headers = log.get("params", {}).get("request", {}).get("headers", {})
3091
+ authorization = headers.get("Authorization") or headers.get("authorization")
3092
+ if authorization:
3093
+ return authorization
3094
+ except (ValueError, TypeError):
3095
+ continue
3096
+
3097
+ # Fallback: check localStorage/sessionStorage for tokens
3098
+ script_templates = [
3099
+ "return window.sessionStorage.getItem('authorization');",
3100
+ "return window.localStorage.getItem('authorization');",
3101
+ "return window.sessionStorage.getItem('authToken');",
3102
+ "return window.localStorage.getItem('authToken');",
3103
+ "return window.localStorage.getItem('token');",
3104
+ ]
3105
+ for script in script_templates:
3106
+ try:
3107
+ token = self.driver.execute_script(script)
3108
+ except Exception:
3109
+ token = None
3110
+ if token:
3111
+ if not token.lower().startswith("bearer"):
3112
+ token = f"Bearer {token}".strip()
3113
+ return token
3114
+
3115
+ return None