ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (535)
  1. agentui/.prettierrc +15 -0
  2. agentui/QUICKSTART.md +272 -0
  3. agentui/README.md +59 -0
  4. agentui/env.example +16 -0
  5. agentui/jsconfig.json +14 -0
  6. agentui/package-lock.json +4242 -0
  7. agentui/package.json +34 -0
  8. agentui/scripts/postinstall/apply-patches.mjs +260 -0
  9. agentui/src/app.css +61 -0
  10. agentui/src/app.d.ts +13 -0
  11. agentui/src/app.html +12 -0
  12. agentui/src/components/LoadingSpinner.svelte +64 -0
  13. agentui/src/components/ThemeSwitcher.svelte +159 -0
  14. agentui/src/components/index.js +4 -0
  15. agentui/src/lib/api/bots.ts +60 -0
  16. agentui/src/lib/api/chat.ts +22 -0
  17. agentui/src/lib/api/http.ts +25 -0
  18. agentui/src/lib/components/BotCard.svelte +33 -0
  19. agentui/src/lib/components/ChatBubble.svelte +63 -0
  20. agentui/src/lib/components/Toast.svelte +21 -0
  21. agentui/src/lib/config.ts +20 -0
  22. agentui/src/lib/stores/auth.svelte.ts +73 -0
  23. agentui/src/lib/stores/theme.svelte.js +64 -0
  24. agentui/src/lib/stores/toast.svelte.ts +31 -0
  25. agentui/src/lib/utils/conversation.ts +39 -0
  26. agentui/src/routes/+layout.svelte +20 -0
  27. agentui/src/routes/+page.svelte +232 -0
  28. agentui/src/routes/login/+page.svelte +200 -0
  29. agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
  30. agentui/src/routes/talk/[agentId]/+page.ts +7 -0
  31. agentui/static/README.md +1 -0
  32. agentui/svelte.config.js +11 -0
  33. agentui/tailwind.config.ts +53 -0
  34. agentui/tsconfig.json +3 -0
  35. agentui/vite.config.ts +10 -0
  36. ai_parrot-0.17.2.dist-info/METADATA +472 -0
  37. ai_parrot-0.17.2.dist-info/RECORD +535 -0
  38. ai_parrot-0.17.2.dist-info/WHEEL +6 -0
  39. ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
  40. ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
  41. ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
  42. crew-builder/.prettierrc +15 -0
  43. crew-builder/QUICKSTART.md +259 -0
  44. crew-builder/README.md +113 -0
  45. crew-builder/env.example +17 -0
  46. crew-builder/jsconfig.json +14 -0
  47. crew-builder/package-lock.json +4182 -0
  48. crew-builder/package.json +37 -0
  49. crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
  50. crew-builder/src/app.css +62 -0
  51. crew-builder/src/app.d.ts +13 -0
  52. crew-builder/src/app.html +12 -0
  53. crew-builder/src/components/LoadingSpinner.svelte +64 -0
  54. crew-builder/src/components/ThemeSwitcher.svelte +149 -0
  55. crew-builder/src/components/index.js +9 -0
  56. crew-builder/src/lib/api/bots.ts +60 -0
  57. crew-builder/src/lib/api/chat.ts +80 -0
  58. crew-builder/src/lib/api/client.ts +56 -0
  59. crew-builder/src/lib/api/crew/crew.ts +136 -0
  60. crew-builder/src/lib/api/index.ts +5 -0
  61. crew-builder/src/lib/api/o365/auth.ts +65 -0
  62. crew-builder/src/lib/auth/auth.ts +54 -0
  63. crew-builder/src/lib/components/AgentNode.svelte +43 -0
  64. crew-builder/src/lib/components/BotCard.svelte +33 -0
  65. crew-builder/src/lib/components/ChatBubble.svelte +67 -0
  66. crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
  67. crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
  68. crew-builder/src/lib/components/JsonViewer.svelte +24 -0
  69. crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
  70. crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
  71. crew-builder/src/lib/components/Toast.svelte +67 -0
  72. crew-builder/src/lib/components/Toolbar.svelte +157 -0
  73. crew-builder/src/lib/components/index.ts +10 -0
  74. crew-builder/src/lib/config.ts +8 -0
  75. crew-builder/src/lib/stores/auth.svelte.ts +228 -0
  76. crew-builder/src/lib/stores/crewStore.ts +369 -0
  77. crew-builder/src/lib/stores/theme.svelte.js +145 -0
  78. crew-builder/src/lib/stores/toast.svelte.ts +69 -0
  79. crew-builder/src/lib/utils/conversation.ts +39 -0
  80. crew-builder/src/lib/utils/markdown.ts +122 -0
  81. crew-builder/src/lib/utils/talkHistory.ts +47 -0
  82. crew-builder/src/routes/+layout.svelte +20 -0
  83. crew-builder/src/routes/+page.svelte +539 -0
  84. crew-builder/src/routes/agents/+page.svelte +247 -0
  85. crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
  86. crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
  87. crew-builder/src/routes/builder/+page.svelte +204 -0
  88. crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
  89. crew-builder/src/routes/crew/ask/+page.ts +1 -0
  90. crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
  91. crew-builder/src/routes/login/+page.svelte +197 -0
  92. crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
  93. crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
  94. crew-builder/static/README.md +1 -0
  95. crew-builder/svelte.config.js +11 -0
  96. crew-builder/tailwind.config.ts +53 -0
  97. crew-builder/tsconfig.json +3 -0
  98. crew-builder/vite.config.ts +10 -0
  99. mcp_servers/calculator_server.py +309 -0
  100. parrot/__init__.py +27 -0
  101. parrot/__pycache__/__init__.cpython-310.pyc +0 -0
  102. parrot/__pycache__/version.cpython-310.pyc +0 -0
  103. parrot/_version.py +34 -0
  104. parrot/a2a/__init__.py +48 -0
  105. parrot/a2a/client.py +658 -0
  106. parrot/a2a/discovery.py +89 -0
  107. parrot/a2a/mixin.py +257 -0
  108. parrot/a2a/models.py +376 -0
  109. parrot/a2a/server.py +770 -0
  110. parrot/agents/__init__.py +29 -0
  111. parrot/bots/__init__.py +12 -0
  112. parrot/bots/a2a_agent.py +19 -0
  113. parrot/bots/abstract.py +3139 -0
  114. parrot/bots/agent.py +1129 -0
  115. parrot/bots/basic.py +9 -0
  116. parrot/bots/chatbot.py +669 -0
  117. parrot/bots/data.py +1618 -0
  118. parrot/bots/database/__init__.py +5 -0
  119. parrot/bots/database/abstract.py +3071 -0
  120. parrot/bots/database/cache.py +286 -0
  121. parrot/bots/database/models.py +468 -0
  122. parrot/bots/database/prompts.py +154 -0
  123. parrot/bots/database/retries.py +98 -0
  124. parrot/bots/database/router.py +269 -0
  125. parrot/bots/database/sql.py +41 -0
  126. parrot/bots/db/__init__.py +6 -0
  127. parrot/bots/db/abstract.py +556 -0
  128. parrot/bots/db/bigquery.py +602 -0
  129. parrot/bots/db/cache.py +85 -0
  130. parrot/bots/db/documentdb.py +668 -0
  131. parrot/bots/db/elastic.py +1014 -0
  132. parrot/bots/db/influx.py +898 -0
  133. parrot/bots/db/mock.py +96 -0
  134. parrot/bots/db/multi.py +783 -0
  135. parrot/bots/db/prompts.py +185 -0
  136. parrot/bots/db/sql.py +1255 -0
  137. parrot/bots/db/tools.py +212 -0
  138. parrot/bots/document.py +680 -0
  139. parrot/bots/hrbot.py +15 -0
  140. parrot/bots/kb.py +170 -0
  141. parrot/bots/mcp.py +36 -0
  142. parrot/bots/orchestration/README.md +463 -0
  143. parrot/bots/orchestration/__init__.py +1 -0
  144. parrot/bots/orchestration/agent.py +155 -0
  145. parrot/bots/orchestration/crew.py +3330 -0
  146. parrot/bots/orchestration/fsm.py +1179 -0
  147. parrot/bots/orchestration/hr.py +434 -0
  148. parrot/bots/orchestration/storage/__init__.py +4 -0
  149. parrot/bots/orchestration/storage/memory.py +100 -0
  150. parrot/bots/orchestration/storage/mixin.py +119 -0
  151. parrot/bots/orchestration/verify.py +202 -0
  152. parrot/bots/product.py +204 -0
  153. parrot/bots/prompts/__init__.py +96 -0
  154. parrot/bots/prompts/agents.py +155 -0
  155. parrot/bots/prompts/data.py +216 -0
  156. parrot/bots/prompts/output_generation.py +8 -0
  157. parrot/bots/scraper/__init__.py +3 -0
  158. parrot/bots/scraper/models.py +122 -0
  159. parrot/bots/scraper/scraper.py +1173 -0
  160. parrot/bots/scraper/templates.py +115 -0
  161. parrot/bots/stores/__init__.py +5 -0
  162. parrot/bots/stores/local.py +172 -0
  163. parrot/bots/webdev.py +81 -0
  164. parrot/cli.py +17 -0
  165. parrot/clients/__init__.py +16 -0
  166. parrot/clients/base.py +1491 -0
  167. parrot/clients/claude.py +1191 -0
  168. parrot/clients/factory.py +129 -0
  169. parrot/clients/google.py +4567 -0
  170. parrot/clients/gpt.py +1975 -0
  171. parrot/clients/grok.py +432 -0
  172. parrot/clients/groq.py +986 -0
  173. parrot/clients/hf.py +582 -0
  174. parrot/clients/models.py +18 -0
  175. parrot/conf.py +395 -0
  176. parrot/embeddings/__init__.py +9 -0
  177. parrot/embeddings/base.py +157 -0
  178. parrot/embeddings/google.py +98 -0
  179. parrot/embeddings/huggingface.py +74 -0
  180. parrot/embeddings/openai.py +84 -0
  181. parrot/embeddings/processor.py +88 -0
  182. parrot/exceptions.c +13868 -0
  183. parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
  184. parrot/exceptions.pxd +22 -0
  185. parrot/exceptions.pxi +15 -0
  186. parrot/exceptions.pyx +44 -0
  187. parrot/generators/__init__.py +29 -0
  188. parrot/generators/base.py +200 -0
  189. parrot/generators/html.py +293 -0
  190. parrot/generators/react.py +205 -0
  191. parrot/generators/streamlit.py +203 -0
  192. parrot/generators/template.py +105 -0
  193. parrot/handlers/__init__.py +4 -0
  194. parrot/handlers/agent.py +861 -0
  195. parrot/handlers/agents/__init__.py +1 -0
  196. parrot/handlers/agents/abstract.py +900 -0
  197. parrot/handlers/bots.py +338 -0
  198. parrot/handlers/chat.py +915 -0
  199. parrot/handlers/creation.sql +192 -0
  200. parrot/handlers/crew/ARCHITECTURE.md +362 -0
  201. parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
  202. parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
  203. parrot/handlers/crew/__init__.py +0 -0
  204. parrot/handlers/crew/handler.py +801 -0
  205. parrot/handlers/crew/models.py +229 -0
  206. parrot/handlers/crew/redis_persistence.py +523 -0
  207. parrot/handlers/jobs/__init__.py +10 -0
  208. parrot/handlers/jobs/job.py +384 -0
  209. parrot/handlers/jobs/mixin.py +627 -0
  210. parrot/handlers/jobs/models.py +115 -0
  211. parrot/handlers/jobs/worker.py +31 -0
  212. parrot/handlers/models.py +596 -0
  213. parrot/handlers/o365_auth.py +105 -0
  214. parrot/handlers/stream.py +337 -0
  215. parrot/interfaces/__init__.py +6 -0
  216. parrot/interfaces/aws.py +143 -0
  217. parrot/interfaces/credentials.py +113 -0
  218. parrot/interfaces/database.py +27 -0
  219. parrot/interfaces/google.py +1123 -0
  220. parrot/interfaces/hierarchy.py +1227 -0
  221. parrot/interfaces/http.py +651 -0
  222. parrot/interfaces/images/__init__.py +0 -0
  223. parrot/interfaces/images/plugins/__init__.py +24 -0
  224. parrot/interfaces/images/plugins/abstract.py +58 -0
  225. parrot/interfaces/images/plugins/analisys.py +148 -0
  226. parrot/interfaces/images/plugins/classify.py +150 -0
  227. parrot/interfaces/images/plugins/classifybase.py +182 -0
  228. parrot/interfaces/images/plugins/detect.py +150 -0
  229. parrot/interfaces/images/plugins/exif.py +1103 -0
  230. parrot/interfaces/images/plugins/hash.py +52 -0
  231. parrot/interfaces/images/plugins/vision.py +104 -0
  232. parrot/interfaces/images/plugins/yolo.py +66 -0
  233. parrot/interfaces/images/plugins/zerodetect.py +197 -0
  234. parrot/interfaces/o365.py +978 -0
  235. parrot/interfaces/onedrive.py +822 -0
  236. parrot/interfaces/sharepoint.py +1435 -0
  237. parrot/interfaces/soap.py +257 -0
  238. parrot/loaders/__init__.py +8 -0
  239. parrot/loaders/abstract.py +1131 -0
  240. parrot/loaders/audio.py +199 -0
  241. parrot/loaders/basepdf.py +53 -0
  242. parrot/loaders/basevideo.py +1568 -0
  243. parrot/loaders/csv.py +409 -0
  244. parrot/loaders/docx.py +116 -0
  245. parrot/loaders/epubloader.py +316 -0
  246. parrot/loaders/excel.py +199 -0
  247. parrot/loaders/factory.py +55 -0
  248. parrot/loaders/files/__init__.py +0 -0
  249. parrot/loaders/files/abstract.py +39 -0
  250. parrot/loaders/files/html.py +26 -0
  251. parrot/loaders/files/text.py +63 -0
  252. parrot/loaders/html.py +152 -0
  253. parrot/loaders/markdown.py +442 -0
  254. parrot/loaders/pdf.py +373 -0
  255. parrot/loaders/pdfmark.py +320 -0
  256. parrot/loaders/pdftables.py +506 -0
  257. parrot/loaders/ppt.py +476 -0
  258. parrot/loaders/qa.py +63 -0
  259. parrot/loaders/splitters/__init__.py +10 -0
  260. parrot/loaders/splitters/base.py +138 -0
  261. parrot/loaders/splitters/md.py +228 -0
  262. parrot/loaders/splitters/token.py +143 -0
  263. parrot/loaders/txt.py +26 -0
  264. parrot/loaders/video.py +89 -0
  265. parrot/loaders/videolocal.py +218 -0
  266. parrot/loaders/videounderstanding.py +377 -0
  267. parrot/loaders/vimeo.py +167 -0
  268. parrot/loaders/web.py +599 -0
  269. parrot/loaders/youtube.py +504 -0
  270. parrot/manager/__init__.py +5 -0
  271. parrot/manager/manager.py +1030 -0
  272. parrot/mcp/__init__.py +28 -0
  273. parrot/mcp/adapter.py +105 -0
  274. parrot/mcp/cli.py +174 -0
  275. parrot/mcp/client.py +119 -0
  276. parrot/mcp/config.py +75 -0
  277. parrot/mcp/integration.py +842 -0
  278. parrot/mcp/oauth.py +933 -0
  279. parrot/mcp/server.py +225 -0
  280. parrot/mcp/transports/__init__.py +3 -0
  281. parrot/mcp/transports/base.py +279 -0
  282. parrot/mcp/transports/grpc_session.py +163 -0
  283. parrot/mcp/transports/http.py +312 -0
  284. parrot/mcp/transports/mcp.proto +108 -0
  285. parrot/mcp/transports/quic.py +1082 -0
  286. parrot/mcp/transports/sse.py +330 -0
  287. parrot/mcp/transports/stdio.py +309 -0
  288. parrot/mcp/transports/unix.py +395 -0
  289. parrot/mcp/transports/websocket.py +547 -0
  290. parrot/memory/__init__.py +16 -0
  291. parrot/memory/abstract.py +209 -0
  292. parrot/memory/agent.py +32 -0
  293. parrot/memory/cache.py +175 -0
  294. parrot/memory/core.py +555 -0
  295. parrot/memory/file.py +153 -0
  296. parrot/memory/mem.py +131 -0
  297. parrot/memory/redis.py +613 -0
  298. parrot/models/__init__.py +46 -0
  299. parrot/models/basic.py +118 -0
  300. parrot/models/compliance.py +208 -0
  301. parrot/models/crew.py +395 -0
  302. parrot/models/detections.py +654 -0
  303. parrot/models/generation.py +85 -0
  304. parrot/models/google.py +223 -0
  305. parrot/models/groq.py +23 -0
  306. parrot/models/openai.py +30 -0
  307. parrot/models/outputs.py +285 -0
  308. parrot/models/responses.py +938 -0
  309. parrot/notifications/__init__.py +743 -0
  310. parrot/openapi/__init__.py +3 -0
  311. parrot/openapi/components.yaml +641 -0
  312. parrot/openapi/config.py +322 -0
  313. parrot/outputs/__init__.py +32 -0
  314. parrot/outputs/formats/__init__.py +108 -0
  315. parrot/outputs/formats/altair.py +359 -0
  316. parrot/outputs/formats/application.py +122 -0
  317. parrot/outputs/formats/base.py +351 -0
  318. parrot/outputs/formats/bokeh.py +356 -0
  319. parrot/outputs/formats/card.py +424 -0
  320. parrot/outputs/formats/chart.py +436 -0
  321. parrot/outputs/formats/d3.py +255 -0
  322. parrot/outputs/formats/echarts.py +310 -0
  323. parrot/outputs/formats/generators/__init__.py +0 -0
  324. parrot/outputs/formats/generators/abstract.py +61 -0
  325. parrot/outputs/formats/generators/panel.py +145 -0
  326. parrot/outputs/formats/generators/streamlit.py +86 -0
  327. parrot/outputs/formats/generators/terminal.py +63 -0
  328. parrot/outputs/formats/holoviews.py +310 -0
  329. parrot/outputs/formats/html.py +147 -0
  330. parrot/outputs/formats/jinja2.py +46 -0
  331. parrot/outputs/formats/json.py +87 -0
  332. parrot/outputs/formats/map.py +933 -0
  333. parrot/outputs/formats/markdown.py +172 -0
  334. parrot/outputs/formats/matplotlib.py +237 -0
  335. parrot/outputs/formats/mixins/__init__.py +0 -0
  336. parrot/outputs/formats/mixins/emaps.py +855 -0
  337. parrot/outputs/formats/plotly.py +341 -0
  338. parrot/outputs/formats/seaborn.py +310 -0
  339. parrot/outputs/formats/table.py +397 -0
  340. parrot/outputs/formats/template_report.py +138 -0
  341. parrot/outputs/formats/yaml.py +125 -0
  342. parrot/outputs/formatter.py +152 -0
  343. parrot/outputs/templates/__init__.py +95 -0
  344. parrot/pipelines/__init__.py +0 -0
  345. parrot/pipelines/abstract.py +210 -0
  346. parrot/pipelines/detector.py +124 -0
  347. parrot/pipelines/models.py +90 -0
  348. parrot/pipelines/planogram.py +3002 -0
  349. parrot/pipelines/table.sql +97 -0
  350. parrot/plugins/__init__.py +106 -0
  351. parrot/plugins/importer.py +80 -0
  352. parrot/py.typed +0 -0
  353. parrot/registry/__init__.py +18 -0
  354. parrot/registry/registry.py +594 -0
  355. parrot/scheduler/__init__.py +1189 -0
  356. parrot/scheduler/models.py +60 -0
  357. parrot/security/__init__.py +16 -0
  358. parrot/security/prompt_injection.py +268 -0
  359. parrot/security/security_events.sql +25 -0
  360. parrot/services/__init__.py +1 -0
  361. parrot/services/mcp/__init__.py +8 -0
  362. parrot/services/mcp/config.py +13 -0
  363. parrot/services/mcp/server.py +295 -0
  364. parrot/services/o365_remote_auth.py +235 -0
  365. parrot/stores/__init__.py +7 -0
  366. parrot/stores/abstract.py +352 -0
  367. parrot/stores/arango.py +1090 -0
  368. parrot/stores/bigquery.py +1377 -0
  369. parrot/stores/cache.py +106 -0
  370. parrot/stores/empty.py +10 -0
  371. parrot/stores/faiss_store.py +1157 -0
  372. parrot/stores/kb/__init__.py +9 -0
  373. parrot/stores/kb/abstract.py +68 -0
  374. parrot/stores/kb/cache.py +165 -0
  375. parrot/stores/kb/doc.py +325 -0
  376. parrot/stores/kb/hierarchy.py +346 -0
  377. parrot/stores/kb/local.py +457 -0
  378. parrot/stores/kb/prompt.py +28 -0
  379. parrot/stores/kb/redis.py +659 -0
  380. parrot/stores/kb/store.py +115 -0
  381. parrot/stores/kb/user.py +374 -0
  382. parrot/stores/models.py +59 -0
  383. parrot/stores/pgvector.py +3 -0
  384. parrot/stores/postgres.py +2853 -0
  385. parrot/stores/utils/__init__.py +0 -0
  386. parrot/stores/utils/chunking.py +197 -0
  387. parrot/telemetry/__init__.py +3 -0
  388. parrot/telemetry/mixin.py +111 -0
  389. parrot/template/__init__.py +3 -0
  390. parrot/template/engine.py +259 -0
  391. parrot/tools/__init__.py +23 -0
  392. parrot/tools/abstract.py +644 -0
  393. parrot/tools/agent.py +363 -0
  394. parrot/tools/arangodbsearch.py +537 -0
  395. parrot/tools/arxiv_tool.py +188 -0
  396. parrot/tools/calculator/__init__.py +3 -0
  397. parrot/tools/calculator/operations/__init__.py +38 -0
  398. parrot/tools/calculator/operations/calculus.py +80 -0
  399. parrot/tools/calculator/operations/statistics.py +76 -0
  400. parrot/tools/calculator/tool.py +150 -0
  401. parrot/tools/cloudwatch.py +988 -0
  402. parrot/tools/codeinterpreter/__init__.py +127 -0
  403. parrot/tools/codeinterpreter/executor.py +371 -0
  404. parrot/tools/codeinterpreter/internals.py +473 -0
  405. parrot/tools/codeinterpreter/models.py +643 -0
  406. parrot/tools/codeinterpreter/prompts.py +224 -0
  407. parrot/tools/codeinterpreter/tool.py +664 -0
  408. parrot/tools/company_info/__init__.py +6 -0
  409. parrot/tools/company_info/tool.py +1138 -0
  410. parrot/tools/correlationanalysis.py +437 -0
  411. parrot/tools/database/abstract.py +286 -0
  412. parrot/tools/database/bq.py +115 -0
  413. parrot/tools/database/cache.py +284 -0
  414. parrot/tools/database/models.py +95 -0
  415. parrot/tools/database/pg.py +343 -0
  416. parrot/tools/databasequery.py +1159 -0
  417. parrot/tools/db.py +1800 -0
  418. parrot/tools/ddgo.py +370 -0
  419. parrot/tools/decorators.py +271 -0
  420. parrot/tools/dftohtml.py +282 -0
  421. parrot/tools/document.py +549 -0
  422. parrot/tools/ecs.py +819 -0
  423. parrot/tools/edareport.py +368 -0
  424. parrot/tools/elasticsearch.py +1049 -0
  425. parrot/tools/employees.py +462 -0
  426. parrot/tools/epson/__init__.py +96 -0
  427. parrot/tools/excel.py +683 -0
  428. parrot/tools/file/__init__.py +13 -0
  429. parrot/tools/file/abstract.py +76 -0
  430. parrot/tools/file/gcs.py +378 -0
  431. parrot/tools/file/local.py +284 -0
  432. parrot/tools/file/s3.py +511 -0
  433. parrot/tools/file/tmp.py +309 -0
  434. parrot/tools/file/tool.py +501 -0
  435. parrot/tools/file_reader.py +129 -0
  436. parrot/tools/flowtask/__init__.py +19 -0
  437. parrot/tools/flowtask/tool.py +761 -0
  438. parrot/tools/gittoolkit.py +508 -0
  439. parrot/tools/google/__init__.py +18 -0
  440. parrot/tools/google/base.py +169 -0
  441. parrot/tools/google/tools.py +1251 -0
  442. parrot/tools/googlelocation.py +5 -0
  443. parrot/tools/googleroutes.py +5 -0
  444. parrot/tools/googlesearch.py +5 -0
  445. parrot/tools/googlesitesearch.py +5 -0
  446. parrot/tools/googlevoice.py +2 -0
  447. parrot/tools/gvoice.py +695 -0
  448. parrot/tools/ibisworld/README.md +225 -0
  449. parrot/tools/ibisworld/__init__.py +11 -0
  450. parrot/tools/ibisworld/tool.py +366 -0
  451. parrot/tools/jiratoolkit.py +1718 -0
  452. parrot/tools/manager.py +1098 -0
  453. parrot/tools/math.py +152 -0
  454. parrot/tools/metadata.py +476 -0
  455. parrot/tools/msteams.py +1621 -0
  456. parrot/tools/msword.py +635 -0
  457. parrot/tools/multidb.py +580 -0
  458. parrot/tools/multistoresearch.py +369 -0
  459. parrot/tools/networkninja.py +167 -0
  460. parrot/tools/nextstop/__init__.py +4 -0
  461. parrot/tools/nextstop/base.py +286 -0
  462. parrot/tools/nextstop/employee.py +733 -0
  463. parrot/tools/nextstop/store.py +462 -0
  464. parrot/tools/notification.py +435 -0
  465. parrot/tools/o365/__init__.py +42 -0
  466. parrot/tools/o365/base.py +295 -0
  467. parrot/tools/o365/bundle.py +522 -0
  468. parrot/tools/o365/events.py +554 -0
  469. parrot/tools/o365/mail.py +992 -0
  470. parrot/tools/o365/onedrive.py +497 -0
  471. parrot/tools/o365/sharepoint.py +641 -0
  472. parrot/tools/openapi_toolkit.py +904 -0
  473. parrot/tools/openweather.py +527 -0
  474. parrot/tools/pdfprint.py +1001 -0
  475. parrot/tools/powerbi.py +518 -0
  476. parrot/tools/powerpoint.py +1113 -0
  477. parrot/tools/pricestool.py +146 -0
  478. parrot/tools/products/__init__.py +246 -0
  479. parrot/tools/prophet_tool.py +171 -0
  480. parrot/tools/pythonpandas.py +630 -0
  481. parrot/tools/pythonrepl.py +910 -0
  482. parrot/tools/qsource.py +436 -0
  483. parrot/tools/querytoolkit.py +395 -0
  484. parrot/tools/quickeda.py +827 -0
  485. parrot/tools/resttool.py +553 -0
  486. parrot/tools/retail/__init__.py +0 -0
  487. parrot/tools/retail/bby.py +528 -0
  488. parrot/tools/sandboxtool.py +703 -0
  489. parrot/tools/sassie/__init__.py +352 -0
  490. parrot/tools/scraping/__init__.py +7 -0
  491. parrot/tools/scraping/docs/select.md +466 -0
  492. parrot/tools/scraping/documentation.md +1278 -0
  493. parrot/tools/scraping/driver.py +436 -0
  494. parrot/tools/scraping/models.py +576 -0
  495. parrot/tools/scraping/options.py +85 -0
  496. parrot/tools/scraping/orchestrator.py +517 -0
  497. parrot/tools/scraping/readme.md +740 -0
  498. parrot/tools/scraping/tool.py +3115 -0
  499. parrot/tools/seasonaldetection.py +642 -0
  500. parrot/tools/shell_tool/__init__.py +5 -0
  501. parrot/tools/shell_tool/actions.py +408 -0
  502. parrot/tools/shell_tool/engine.py +155 -0
  503. parrot/tools/shell_tool/models.py +322 -0
  504. parrot/tools/shell_tool/tool.py +442 -0
  505. parrot/tools/site_search.py +214 -0
  506. parrot/tools/textfile.py +418 -0
  507. parrot/tools/think.py +378 -0
  508. parrot/tools/toolkit.py +298 -0
  509. parrot/tools/webapp_tool.py +187 -0
  510. parrot/tools/whatif.py +1279 -0
  511. parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
  512. parrot/tools/workday/__init__.py +6 -0
  513. parrot/tools/workday/models.py +1389 -0
  514. parrot/tools/workday/tool.py +1293 -0
  515. parrot/tools/yfinance_tool.py +306 -0
  516. parrot/tools/zipcode.py +217 -0
  517. parrot/utils/__init__.py +2 -0
  518. parrot/utils/helpers.py +73 -0
  519. parrot/utils/parsers/__init__.py +5 -0
  520. parrot/utils/parsers/toml.c +12078 -0
  521. parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
  522. parrot/utils/parsers/toml.pyx +21 -0
  523. parrot/utils/toml.py +11 -0
  524. parrot/utils/types.cpp +20936 -0
  525. parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
  526. parrot/utils/types.pyx +213 -0
  527. parrot/utils/uv.py +11 -0
  528. parrot/version.py +10 -0
  529. parrot/yaml-rs/Cargo.lock +350 -0
  530. parrot/yaml-rs/Cargo.toml +19 -0
  531. parrot/yaml-rs/pyproject.toml +19 -0
  532. parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
  533. parrot/yaml-rs/src/lib.rs +222 -0
  534. requirements/docker-compose.yml +24 -0
  535. requirements/requirements-dev.txt +21 -0
parrot/bots/scraper/scraper.py
@@ -0,0 +1,1173 @@
+ """
+ ScrapingAgent for AI-Parrot
+ LLM-powered agent that makes intelligent decisions about web scraping
+ Updated to better integrate with current WebScrapingTool architecture
+ """
+ from typing import Dict, List, Any, Optional, Literal
+ import json
+ import re
+ import logging
+ from datetime import datetime
+ from urllib.parse import urlparse
+ from bs4 import BeautifulSoup
+ from ..abstract import AbstractBot
+ from ...tools.scraping import (
+     WebScrapingTool,
+     ScrapingStep,
+     ScrapingSelector,
+     ScrapingResult
+ )
+ from .templates import (
+     BESTBUY_TEMPLATE,
+     AMAZON_TEMPLATE,
+     EBAY_TEMPLATE
+ )
+ from .models import (
+     ScrapingPlanSchema
+ )
+
+
+ class ScrapingAgent(AbstractBot):
+     """
+     Intelligent web scraping agent that uses LLM to:
+     - Analyze web pages and determine optimal scraping strategies
+     - Generate navigation steps based on page structure
+     - Adapt selectors based on content analysis
+     - Handle dynamic content and authentication flows
+     - Recommend optimal browser configurations
+     """
+
+     def __init__(
+         self,
+         name: str = "WebScrapingAgent",
+         browser: Literal['chrome', 'firefox', 'edge', 'safari', 'undetected'] = 'chrome',
+         driver_type: Literal['selenium', 'playwright'] = 'selenium',
+         headless: bool = True,
+         mobile: bool = False,
+         mobile_device: Optional[str] = None,
+         auto_install: bool = True,
+         **kwargs
+     ):
+         # Enhanced system prompt for web scraping
+         system_prompt = self._build_scraping_system_prompt()
+
+         super().__init__(
+             name=name,
+             system_prompt=system_prompt,
+             **kwargs
+         )
+
+         # Store browser configuration for dynamic adjustments
+         self.browser_config = {
+             'browser': browser,
+             'driver_type': driver_type,
+             'headless': headless,
+             'mobile': mobile,
+             'mobile_device': mobile_device,
+             'auto_install': auto_install,
+             **kwargs
+         }
+
+         # Initialize scraping tool with configuration
+         self.scraping_tool = WebScrapingTool(**self.browser_config)
+         self.tool_manager.register_tool(self.scraping_tool)
+         self.logger = logging.getLogger(f"AI-Parrot.ScrapingAgent")
+
+         # Scraping context and memory
+         self.scraping_history: List[Dict[str, Any]] = []
+         self.site_knowledge: Dict[str, Dict[str, Any]] = {}
+
+         # Site-specific templates and guidance
+         self.scraping_templates = self._initialize_templates()
+
+         # Browser capability knowledge
+         self.browser_capabilities = {
+             'chrome': {
+                 'mobile_emulation': True,
+                 'undetected_mode': True,
+                 'performance_options': True,
+                 'best_for': ['SPA', 'heavy_js', 'mobile_testing']
+             },
+             'firefox': {
+                 'mobile_emulation': False,
+                 'undetected_mode': False,
+                 'performance_options': True,
+                 'best_for': ['privacy', 'legacy_sites', 'debugging']
+             },
+             'edge': {
+                 'mobile_emulation': True,
+                 'undetected_mode': False,
+                 'performance_options': True,
+                 'best_for': ['enterprise', 'windows_specific']
+             },
+             'safari': {
+                 'mobile_emulation': False,
+                 'undetected_mode': False,
+                 'performance_options': False,
+                 'best_for': ['apple_ecosystem', 'webkit_testing']
+             },
+             'undetected': {
+                 'mobile_emulation': True,
+                 'undetected_mode': True,
+                 'performance_options': True,
+                 'best_for': ['anti_bot', 'stealth_scraping', 'protected_sites']
+             }
+         }
+
+     def _initialize_templates(self) -> Dict[str, Dict[str, Any]]:
+         """Initialize site-specific scraping templates and guidance"""
+         return {
+             'bestbuy.com': BESTBUY_TEMPLATE,
+             'amazon.com': AMAZON_TEMPLATE,
+             'ebay.com': EBAY_TEMPLATE,
+             'generic_ecommerce': {
+                 'search_steps': [
+                     {
+                         'action': 'navigate',
+                         'target': '{url}',
+                         'description': 'Navigate to target site'
+                     },
+                     {
+                         'action': 'fill',
+                         'target': 'input[type="search"], input[name*="search"], input[placeholder*="search"]',
+                         'value': '{search_term}',
+                         'description': 'Fill most common search input patterns'
+                     },
+                     {
+                         'action': 'click',
+                         'target': 'button[type="submit"], input[type="submit"], .search-button',
+                         'description': 'Click search button'
+                     }
+                 ],
+                 'product_selectors': [
+                     {
+                         'name': 'products',
+                         'selector': '.product, .item, .listing',
+                         'extract_type': 'html',
+                         'multiple': True
+                     }
+                 ],
+                 'guidance': 'Generic e-commerce patterns. May need site-specific adjustments.'
+             }
+         }
+
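Custom templates can be registered at runtime through add_scraping_template, defined further down in this file. A minimal sketch, assuming a hypothetical shop.example.com domain with placeholder selectors and mirroring the generic_ecommerce structure above (any LLM configuration expected by AbstractBot via **kwargs is omitted):

    agent = ScrapingAgent(name="WebScrapingAgent", browser="chrome", headless=True)
    agent.add_scraping_template("shop.example.com", {
        'search_steps': [
            {'action': 'navigate', 'target': '{url}', 'description': 'Open the site'},
            {'action': 'fill', 'target': 'input[name="q"]', 'value': '{search_term}',
             'description': 'Fill the search box (placeholder selector)'},
            {'action': 'click', 'target': 'button[type="submit"]', 'description': 'Submit the search'},
        ],
        'product_selectors': [
            {'name': 'products', 'selector': '.product-card', 'extract_type': 'html', 'multiple': True},
        ],
        'guidance': 'Hypothetical template; selectors are illustrative only.',
    })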
+     def _build_scraping_system_prompt(self) -> str:
+         """Build specialized system prompt for web scraping tasks"""
+         return """You are an expert web scraping agent with advanced capabilities in:
+
+ 1. **Web Page Analysis**: Analyzing HTML structure, identifying key elements, and understanding page layouts
+ 2. **Navigation Strategy**: Creating step-by-step navigation plans for complex user journeys
+ 3. **Content Extraction**: Determining optimal selectors for extracting specific data
+ 4. **Error Handling**: Adapting to dynamic content, handling timeouts, and recovering from failures
+ 5. **Authentication**: Managing login flows, sessions, and security measures
+ 6. **Browser Optimization**: Recommending optimal browser configurations based on target sites
+
+ **Available Browser Options:**
+ - chrome: Default, best performance, mobile emulation, wide compatibility
+ - firefox: Good privacy, stable, good for debugging
+ - edge: Enterprise-friendly, good performance
+ - safari: Apple ecosystem, webkit testing
+ - undetected: Anti-detection features, stealth scraping
+
+ **Core Responsibilities:**
+ - Analyze user scraping requirements and website structure
+ - Generate detailed navigation steps (ScrapingStep objects)
+ - Create precise content selectors (ScrapingSelector objects)
+ - Recommend optimal browser configuration for target sites
+ - Adapt strategies based on scraping results and feedback
+ - Provide insights about scraped content and suggest improvements
+
+ **Available Actions:**
+ - navigate: Go to a specific URL
+ - click: Click on elements (buttons, links, etc.)
+ - fill: Fill form fields with data
+ - wait: Wait for specific conditions or elements
+ - scroll: Scroll to load dynamic content
+ - authenticate: Handle login/authentication flows
+ - await_human: Pause automation; a human completes login/SSO/MFA in the browser. Resume when a selector/URL/title condition is met.
+ - await_keypress: Pause until the operator presses ENTER in the console.
+ - await_browser_event: Wait for a real page event (keyboard/overlay button/custom event/localStorage/predicate)
+
+ **Selector Types:**
+ - CSS selectors: Standard CSS syntax (.class, #id, element[attribute])
+ - XPath: For complex element selection
+ - Tag-based: Direct HTML tag selection
+
+ **Browser Configuration Recommendations:**
+ - Use 'undetected' browser for sites with anti-bot protection
+ - Use 'chrome' with mobile=True for mobile-responsive testing
+ - Use 'firefox' for sites that work better with Gecko engine
+ - Enable headless=False for debugging complex interactions
+ - Use custom user agents and mobile devices for specific testing
+
+ **Best Practices:**
+ - Always provide detailed descriptions for each step
+ - Use specific, robust selectors that are less likely to break
+ - Include appropriate wait conditions for dynamic content
+ - Plan authentication flows carefully with proper error handling
+ - Consider mobile responsiveness and different viewport sizes
+ - Recommend browser configuration based on site characteristics
+
+ When given a scraping task, analyze the requirements thoroughly and create a comprehensive plan that maximizes success while being respectful of website resources and terms of service.
+ """
+
+     async def analyze_scraping_request(
+         self,
+         request: Dict[str, Any]
+     ) -> Dict[str, Any]:
+         """
+         Analyze a scraping request and generate an execution plan with browser recommendations
+
+         Args:
+             request: Dictionary containing:
+                 - target_url: URL to scrape
+                 - objective: What data to extract
+                 - authentication: Login details if needed
+                 - constraints: Rate limiting, ethical guidelines
+                 - preferred_browser: Optional browser preference
+                 - use_template: Whether to use site-specific templates (default: True)
+
+         Returns:
+             Dictionary with execution plan including steps, selectors, and browser config
+         """
+         target_url = request.get('target_url', '')
+         objective = request.get('objective', 'General content extraction')
+         use_template = request.get('use_template', True)
+         steps = request.get('steps', [])
+
+         # Check for site-specific templates
+         template_guidance = ""
+         suggested_steps = []
+         suggested_selectors = []
+
+         if use_template and target_url:
+             domain = self._extract_domain(target_url)
+             if domain:
+                 # Check for exact domain match
+                 template = self.scraping_templates.get(domain)
+                 if not template:
+                     # Check for partial domain matches
+                     for template_domain, template_data in self.scraping_templates.items():
+                         if template_domain in domain or domain in template_domain:
+                             template = template_data
+                             break
+
+                 if template:
+                     template_guidance = f"\n\n**MANDATORY TEMPLATE FOR {domain.upper()}:**"
+                     template_guidance += "\n**IMPORTANT:** These selectors are VERIFIED and TESTED. You MUST use these exact values.\n"
+                     # Customize template steps with actual search term
+                     if 'search_steps' in template and any(term in objective.lower() for term in ['search', 'product', 'find', 'extract']):
+                         search_term = self._extract_search_term_from_objective(objective)
+                         suggested_steps = self._customize_template_steps(
+                             template['search_steps'], {
+                                 'search_term': search_term,
+                                 'url': target_url
+                             }
+                         )
+                         template_guidance += f"\n\n**SUGGESTED STEPS (customized for '{search_term}'):**\n"
+                         for i, step in enumerate(suggested_steps):
+                             template_guidance += f"{i+1}. {step['action']}: {step.get('description', step['target'])}\n"
+
+                     if 'product_selectors' in template:
+                         suggested_selectors = template['product_selectors']
+                         template_guidance += f"\n\n** SELECTORS:**\n"
+                         for sel in suggested_selectors:
+                             template_guidance += f"- {sel['name']}: {sel['selector']}\n"
+                     template_guidance += "\n⚠️ CRITICAL: Use the exact 'target' values above. Do not substitute with '#gh-search-input' or other guesses.\n"
+                 elif steps:
+                     # use suggested steps from user:
+                     template_guidance += f"\n\n**SUGGESTED STEPS:**\n"
+                     for step in steps:
+                         template_guidance += f"- {step}\n"
+
+         prompt = f"""
+ Analyze this web scraping request and create a comprehensive execution plan:
+
+ **Target URL:** {target_url}
+ **Objective:** {objective}
+ **Authentication Required:** {request.get('authentication', {}).get('required', False)}
+ **Special Requirements:** {request.get('constraints', 'None')}
+ **Current Browser Config:** {json.dumps(self.browser_config, indent=2)}
+
+ {template_guidance}
+
+ Please provide:
+ 1. A detailed analysis of the scraping challenge
+ 2. Recommended browser configuration (browser type, mobile mode, headless, etc.)
+ 3. Step-by-step navigation plan (as JSON array of ScrapingStep objects)
+ 4. Content extraction selectors (as JSON array of ScrapingSelector objects)
+ 5. Risk assessment and mitigation strategies
+ 6. Expected challenges and fallback options
+
+ **Browser Capabilities Available:**
+ {json.dumps(self.browser_capabilities, indent=2)}
+
+ **CRITICAL INSTRUCTIONS:**
+ 1. For 'navigate' actions: target MUST be a complete URL starting with http:// or https://
+ 2. For 'click', 'fill', 'wait' actions: target MUST be a CSS selector (e.g., '#id', '.class', 'button[type="submit"]')
+ 3. NEVER use natural language descriptions as targets (e.g., "the search box" is WRONG, "#search-input" is CORRECT)
+ 4. If template steps are provided above, use those EXACT targets - they are proven to work
+ 5. Steps must be in logical order: navigate → wait → fill → click → wait for results
+ 6. Never invent or hallucinate details about the page structure or content.
+
+ Provide your response as a structured plan following the ScrapingPlanSchema.
+ """
+
+         async with self._llm as client:
+             response = await client.ask(
+                 prompt=prompt,
+                 system_prompt=self.system_prompt_template,
+                 model=self._llm_model,
+                 max_tokens=self._max_tokens,
+                 temperature=self._llm_temp,
+                 use_tools=True,
+                 structured_output=ScrapingPlanSchema
+             )
+
+         if isinstance(response.output, ScrapingPlanSchema):
+             response = response.output
+             merged_steps = []
+             for i, template_step in enumerate(suggested_steps):
+                 merged = template_step.copy()
+                 # If LLM generated a corresponding step, take its metadata
+                 if i < len(response.steps):
+                     llm_step = response.steps[i].model_dump()
+                     # Keep template's target (proven to work)
+                     # But use LLM's wait_condition and description if present
+                     if llm_step.get('wait_condition'):
+                         merged['wait_condition'] = llm_step['wait_condition']
+                     if llm_step.get('description') and len(llm_step['description']) > len(merged.get('description', '')):
+                         merged['description'] = llm_step['description']
+                     # Use higher timeout if LLM suggests it
+                     if llm_step.get('timeout', 10) > merged.get('timeout', 10):
+                         merged['timeout'] = llm_step['timeout']
+                 merged_steps.append(merged)
+             plan = {
+                 'steps': merged_steps,
+                 'selectors': suggested_selectors or [sel.model_dump() for sel in response.selectors],
+                 'browser_config': response.browser_config.model_dump(),
+                 'analysis': response.analysis,
+                 'risks': response.risks,
+                 'fallback_strategy': response.fallback_strategy,
+                 'parsed_successfully': True,
+                 'used_template': True
+             }
+         else:
+             # Fallback if structured output not available
+             content = self._safe_extract_text(response)
+             plan = self._parse_scraping_plan(content)
+
+         # If LLM didn't generate steps but we have template suggestions, use them as fallback
+         if not plan.get('steps') and suggested_steps:
+             self.logger.info("Using template steps as fallback")
+             plan['steps'] = suggested_steps
+
+         if not plan.get('selectors') and suggested_selectors:
+             self.logger.info("Using template selectors as fallback")
+             plan['selectors'] = suggested_selectors
+
+         # Store this request in our knowledge base
+         site_domain = self._extract_domain(target_url)
+         if site_domain:
+             self.site_knowledge[site_domain] = {
+                 'last_analyzed': datetime.now().isoformat(),
+                 'request': request,
+                 'plan': plan,
+                 'success_rate': 0.0,  # Will be updated based on results
+                 'recommended_config': plan.get('browser_config', {}),
+                 'used_template': bool(template_guidance)
+             }
+
+         return plan
+
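A minimal request sketch for analyze_scraping_request, using only the keys documented in the docstring above (URL, objective, and constraint values are illustrative):

    async def build_plan(agent: ScrapingAgent) -> dict:
        request = {
            'target_url': 'https://www.example.com',  # illustrative URL
            'objective': 'Extract product names and prices for "wireless headphones"',
            'authentication': {'required': False},
            'constraints': 'Respect robots.txt and rate limits',
            'use_template': True,
        }
        # Returns a dict with 'steps', 'selectors', 'browser_config',
        # 'analysis', 'risks' and 'fallback_strategy' keys.
        return await agent.analyze_scraping_request(request)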
+     def _extract_search_term_from_objective(self, objective: str) -> str:
+         """Extract search term from objective description"""
+         # Look for product names, quotes, or specific terms
+         # Try to find quoted terms first
+         quoted_match = re.search(r'"([^"]+)"', objective)
+         if quoted_match:
+             return quoted_match.group(1)
+
+         # Look for "for X" pattern
+         for_match = re.search(r'\bfor\s+([^,\.]+)', objective, re.IGNORECASE)
+         if for_match:
+             return for_match.group(1).strip()
+
+         # Look for product-like terms (words with numbers, proper nouns)
+         product_match = re.search(r'\b([A-Z][a-z]*(?:\s+[A-Z0-9][a-z0-9]*)*(?:\s+\d+\w*)*)\b', objective)
+         if product_match:
+             return product_match.group(1)
+
+         # Fallback: take last few words that might be product name
+         words = objective.split()
+         if len(words) >= 3:
+             return ' '.join(words[-3:])
+         elif len(words) >= 2:
+             return ' '.join(words[-2:])
+         else:
+             return words[-1] if words else "product"
+
+     def _customize_template_steps(self, template_steps: List[Dict], variables: Dict[str, str]) -> List[Dict]:
+         """Customize template steps with actual values"""
+         customized_steps = []
+         for step in template_steps:
+             customized_step = step.copy()
+
+             # Replace variables in target and value fields
+             if 'target' in customized_step:
+                 for var, value in variables.items():
+                     customized_step['target'] = customized_step['target'].replace(f'{{{var}}}', value)
+
+             if 'value' in customized_step and customized_step['value']:
+                 for var, value in variables.items():
+                     customized_step['value'] = customized_step['value'].replace(f'{{{var}}}', value)
+
+             customized_steps.append(customized_step)
+
+         return customized_steps
+
+     def add_scraping_template(self, domain: str, template: Dict[str, Any]):
+         """Add or update a scraping template for a specific domain"""
+         self.scraping_templates[domain] = template
+         self.logger.info(f"Added scraping template for {domain}")
+
+     async def execute_intelligent_scraping(
+         self,
+         request: Dict[str, Any],
+         adaptive_config: bool = True
+     ) -> List[ScrapingResult]:
+         """
+         Execute intelligent scraping with LLM-driven adaptations and browser optimization
+
+         Args:
+             request: Scraping request dictionary
+             adaptive_config: Whether to adapt browser configuration based on LLM recommendations
+
+         Returns:
+             List of ScrapingResult objects
+         """
+         self.logger.info(
+             f"Starting intelligent scraping for: {request.get('target_url')}"
+         )
+
+         try:
+             # Step 1: Analyze and plan
+             plan = await self.analyze_scraping_request(request)
+             # some sanitization
+             plan = self._sanitize_plan(plan, request)
+             self.logger.debug(
+                 "Plan steps: %s", json.dumps(plan["steps"], indent=2)
+             )
+             self.logger.debug(
+                 "Sanitized selectors: %s", json.dumps(plan["selectors"], indent=2)
+             )
+
+             if not plan.get('steps'):
+                 self.logger.error("No scraping plan generated")
+                 return [ScrapingResult(
+                     url=request.get('target_url', ''),
+                     content='',
+                     bs_soup=BeautifulSoup('', 'html.parser'),
+                     success=False,
+                     error_message="No scraping plan could be generated"
+                 )]
+
+             # Step 2: Adapt browser configuration if recommended and allowed
+             if adaptive_config and plan.get('browser_config'):
+                 await self._adapt_browser_configuration(plan['browser_config'])
+
+             # Step 3: Ensure scraping tool is properly initialized
+             if not hasattr(self.scraping_tool, 'driver') or self.scraping_tool.driver is None:
+                 await self.scraping_tool.initialize_driver()
+
+             # Step 4: Execute initial scraping
+             steps = [self._create_scraping_step(step) for step in plan['steps']]
+             selectors = [self._create_scraping_selector(sel) for sel in plan.get('selectors', [])]
+
+             results = await self.scraping_tool.execute_scraping_workflow(
+                 steps=steps,
+                 selectors=selectors,
+                 base_url=request.get('base_url', '')
+             )
+
+             # Step 5: Analyze results and adapt if necessary
+             if results and not all(r.success for r in results):
+                 self.logger.info("Some scraping attempts failed, attempting recovery")
+                 results = await self._attempt_recovery(request, results, plan)
+
+             # Step 6: Post-process and enhance results
+             enhanced_results = await self._enhance_results(results, request)
+
+             # Step 7: Update site knowledge
+             self._update_site_knowledge(request, enhanced_results)
+
+             return enhanced_results
+
+         except Exception as e:
+             self.logger.error(f"Intelligent scraping failed: {str(e)}")
+             return [ScrapingResult(
+                 url=request.get('target_url', ''),
+                 content='',
+                 bs_soup=BeautifulSoup('', 'html.parser'),
+                 success=False,
+                 error_message=f"Scraping failed: {str(e)}"
+             )]
+
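An end-to-end sketch of the workflow above; it assumes an already-constructed agent and an illustrative URL, and reads only fields that ScrapingResult is shown to expose elsewhere in this file (success, url, extracted_data, error_message):

    import asyncio

    async def run_scrape(agent: ScrapingAgent) -> None:
        results = await agent.execute_intelligent_scraping(
            {'target_url': 'https://www.example.com', 'objective': 'Collect article headlines'},
            adaptive_config=True,
        )
        for r in results:
            if r.success:
                print(r.url, r.extracted_data)
            else:
                print('failed:', r.error_message)

    # asyncio.run(run_scrape(agent))  # with an agent constructed as in __init__ above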
+     async def _adapt_browser_configuration(self, recommended_config: Dict[str, Any]):
+         """
+         Adapt browser configuration based on LLM recommendations
+         """
+         changes_made = []
+
+         for key, value in recommended_config.items():
+             if key in self.browser_config and self.browser_config[key] != value:
+                 old_value = self.browser_config[key]
+                 self.browser_config[key] = value
+                 changes_made.append(f"{key}: {old_value} -> {value}")
+
+         if changes_made:
+             self.logger.info(f"Adapting browser config: {', '.join(changes_made)}")
+
+             # Reinitialize scraping tool with new configuration
+             await self._reinitialize_scraping_tool()
+
+     async def _reinitialize_scraping_tool(self):
+         """Safely reinitialize the scraping tool with new configuration"""
+         try:
+             # Clean up existing tool
+             if hasattr(self.scraping_tool, 'cleanup'):
+                 await self.scraping_tool.cleanup()
+
+             # Create new tool with updated config
+             self.scraping_tool = WebScrapingTool(**self.browser_config)
+
+             # Re-register the tool
+             if hasattr(self.tool_manager, 'unregister_tool'):
+                 self.tool_manager.unregister_tool('WebScrapingTool')
+             self.tool_manager.register_tool(self.scraping_tool)
+
+         except Exception as e:
+             self.logger.warning(
+                 f"Failed to reinitialize scraping tool: {e}"
+             )
+
+     def _normalize_action(self, action: Optional[str]) -> str:
+         return (action or 'navigate').strip().lower()
+
+     def _normalize_target(self, target: Any) -> str:
+         # Accept dicts like {"url": "..."} or {"selector": "..."} or lists
+         if isinstance(target, dict):
+             target = target.get('url') or target.get('selector') or target.get('text') or ''
+         elif isinstance(target, (list, tuple)) and target:
+             target = target[0]
+         target = '' if target is None else str(target).strip()
+         # Basic URL rescue: if it looks like a domain, prefix https://
+         if target and (' ' not in target) and ('.' in target) and not target.startswith(('http://','https://','#','/')):
+             target = f'https://{target}'
+         return target
+
+     def _normalize_value(self, value: Any) -> Optional[str]:
+         return None if value is None else str(value)
+
+     def _create_scraping_step(self, step_data: Dict[str, Any]) -> ScrapingStep:
+         return ScrapingStep(
+             action=self._normalize_action(step_data.get('action')),
+             target=self._normalize_target(step_data.get('target', '')),
+             value=self._normalize_value(step_data.get('value')),
+             wait_condition=step_data.get('wait_condition'),
+             timeout=step_data.get('timeout', 10),
+             description=step_data.get('description', '')
+         )
+
+     def _create_scraping_selector(self, selector_data: Dict[str, Any]) -> ScrapingSelector:
+         """Create ScrapingSelector object from dictionary, handling missing/odd fields"""
+         name = selector_data.get('name', 'unnamed')
+         selector = selector_data.get('selector', 'body')
+         selector_type = selector_data.get('selector_type', 'css')
+         extract_type = selector_data.get('extract_type', 'text')
+         attribute = selector_data.get('attribute')
+         multiple = selector_data.get('multiple', False)
+
+         return ScrapingSelector(
+             name=str(name),
+             selector=str(selector),
+             selector_type=str(selector_type),
+             extract_type=str(extract_type),
+             attribute=(str(attribute) if attribute is not None else None),
+             multiple=bool(multiple)
+         )
+
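The normalization helpers above are tolerant of loosely structured LLM output. A small sketch of what _create_scraping_step does with a messy step dict (values illustrative; the behavior follows _normalize_action and _normalize_target above):

    raw_step = {
        'action': ' Navigate ',                 # mixed case and whitespace -> 'navigate'
        'target': {'url': 'www.example.com'},   # dict target -> 'https://www.example.com'
        'timeout': 20,
        'description': 'Open the landing page',
    }
    step = agent._create_scraping_step(raw_step)  # returns a ScrapingStep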
+     async def recommend_browser_for_site(self, url: str) -> Dict[str, Any]:
+         """
+         Analyze a site and recommend optimal browser configuration
+         """
+         domain = self._extract_domain(url)
+
+         # Check if we have prior knowledge
+         if domain in self.site_knowledge:
+             stored_config = self.site_knowledge[domain].get('recommended_config', {})
+             if stored_config:
+                 return {
+                     'source': 'historical_data',
+                     'config': stored_config,
+                     'confidence': 'high',
+                     'reason': 'Based on previous successful scraping'
+                 }
+
+         # Use LLM to analyze the site
+         analysis_prompt = f"""
+ Analyze this website and recommend the optimal browser configuration for scraping:
+
+ **URL:** {url}
+ **Available Browsers:** {list(self.browser_capabilities.keys())}
+ **Browser Capabilities:** {json.dumps(self.browser_capabilities, indent=2)}
+
+ Please analyze the site characteristics and recommend:
+ 1. Best browser choice (chrome, firefox, edge, safari, undetected)
+ 2. Whether to use headless mode
+ 3. Whether mobile emulation would be useful
+ 4. Any special configuration options
+ 5. Reasoning for your recommendations
+
+ Consider factors like:
+ - Site complexity (SPA, heavy JavaScript, etc.)
+ - Anti-bot protection
+ - Mobile responsiveness
+ - Authentication requirements
+ - Known compatibility issues
+
+ Provide your recommendation as a JSON object with configuration parameters.
+ """
+
+         try:
+             async with self._llm as client:
+                 response = await client.ask(
+                     prompt=analysis_prompt,
+                     system_prompt=self.system_prompt_template,
+                     model=self._llm_model,
+                     max_tokens=self._max_tokens,
+                     temperature=self._llm_temp,
+                     use_tools=True,
+                 )
+
+             # Parse recommendation from response
+             content = self._safe_extract_text(response)
+             recommendation = self._parse_browser_recommendation(content)
+
+             return {
+                 'source': 'llm_analysis',
+                 'config': recommendation,
+                 'confidence': 'medium',
+                 'reason': 'Based on LLM analysis of site characteristics',
+                 'full_analysis': content
+             }
+
+         except Exception as e:
+             self.logger.warning(f"Failed to get browser recommendation: {str(e)}")
+             return {
+                 'source': 'fallback',
+                 'config': {'browser': 'chrome', 'headless': True},
+                 'confidence': 'low',
+                 'reason': 'Default fallback configuration'
+             }
+
+     def _parse_browser_recommendation(self, llm_response: str) -> Dict[str, Any]:
+         """Parse browser configuration recommendation from LLM response"""
+         try:
+             # Try to extract JSON from response
+             json_match = re.search(r'```json\s*(\{.*?\})\s*```', llm_response, re.DOTALL)
+             if json_match:
+                 return json.loads(json_match.group(1))
+
+             # Fallback: extract configuration from text
+             config = {}
+
+             # Extract browser type
+             for browser, _ in self.browser_capabilities.items():
+                 if browser.lower() in llm_response.lower():
+                     config['browser'] = browser
+                     break
+
+             # Extract headless recommendation
+             if 'headless' in llm_response.lower():
+                 config['headless'] = 'false' not in llm_response.lower()
+
+             # Extract mobile recommendation
+             if 'mobile' in llm_response.lower():
+                 config['mobile'] = 'true' in llm_response.lower()
+
+             return config if config else {'browser': 'chrome', 'headless': True}
+
+         except Exception as e:
+             self.logger.error(f"Failed to parse browser recommendation: {str(e)}")
+             return {'browser': 'chrome', 'headless': True}
+
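_parse_browser_recommendation first looks for a fenced JSON object in the reply and otherwise falls back to keyword matching, so the public entry point can be used roughly as follows (URL illustrative; the fallback config matches the one hard-coded above):

    async def pick_browser(agent: ScrapingAgent) -> dict:
        rec = await agent.recommend_browser_for_site('https://www.example.com')
        # rec['source'] is 'historical_data', 'llm_analysis' or 'fallback';
        # rec['config'] is e.g. {'browser': 'chrome', 'headless': True} on fallback.
        return rec['config']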
705
+ async def _attempt_recovery(
706
+ self,
707
+ request: Dict[str, Any],
708
+ failed_results: List[ScrapingResult],
709
+ original_plan: Dict[str, Any]
710
+ ) -> List[ScrapingResult]:
711
+ """
712
+ Attempt to recover from failed scraping using LLM analysis
713
+ """
714
+ # Analyze failures
715
+ failure_analysis = []
716
+ for result in failed_results:
717
+ if not result.success:
718
+ failure_analysis.append({
719
+ 'url': result.url,
720
+ 'error': result.error_message,
721
+ 'content_available': bool(result.content)
722
+ })
723
+
724
+ recovery_prompt = f"""
725
+ The initial scraping attempt had some failures. Please analyze and suggest recovery strategies:
726
+
727
+ **Original Request:** {json.dumps(request, indent=2)}
728
+ **Failed Results:** {json.dumps(failure_analysis, indent=2)}
729
+ **Original Plan:** {json.dumps(original_plan, indent=2)}
730
+ **Current Browser Config:** {json.dumps(self.browser_config, indent=2)}
731
+
732
+ Please suggest:
733
+ 1. Modified navigation steps to address the failures
734
+ 2. Alternative selectors that might be more robust
735
+ 3. Browser configuration changes that might help
736
+ 4. Additional wait conditions or timing adjustments
737
+ 5. Any authentication issues to address
738
+
739
+ Provide a recovery plan in the same format as before, including any browser config changes.
740
+ """
741
+
742
+ async with self._llm as client:
743
+ recovery_response = await client.ask(
744
+ prompt=recovery_prompt,
745
+ system_prompt=self.system_prompt_template,
746
+ model=self._llm_model,
747
+ max_tokens=self._max_tokens,
748
+ temperature=self._llm_temp,
749
+ use_tools=True,
750
+ )
751
+
752
+ recovery_plan = self._parse_scraping_plan(self._safe_extract_text(recovery_response))
753
+
754
+ if recovery_plan.get('steps'):
755
+ self.logger.info("Executing recovery plan")
756
+
757
+ # Apply any browser configuration changes
758
+ if recovery_plan.get('browser_config'):
759
+ await self._adapt_browser_configuration(recovery_plan['browser_config'])
760
+
761
+ recovery_steps = [self._create_scraping_step(step) for step in recovery_plan['steps']]
762
+ recovery_selectors = [self._create_scraping_selector(sel) for sel in recovery_plan.get('selectors', [])]
763
+
764
+ recovery_results = await self.scraping_tool.execute_scraping_workflow(
765
+ steps=recovery_steps,
766
+ selectors=recovery_selectors,
767
+ base_url=request.get('base_url', '')
768
+ )
769
+
770
+ # Combine successful results from both attempts
771
+ combined_results = []
772
+ for original, recovery in zip(failed_results, recovery_results):
773
+ if recovery.success:
774
+ combined_results.append(recovery)
775
+ elif original.success:
776
+ combined_results.append(original)
777
+ else:
778
+ combined_results.append(recovery) # Keep the latest attempt
779
+
780
+ return combined_results
781
+
782
+ return failed_results
783
+
784
+ async def _enhance_results(
785
+ self,
786
+ results: List[ScrapingResult],
787
+ request: Dict[str, Any]
788
+ ) -> List[ScrapingResult]:
789
+ """
790
+ Enhance scraping results with LLM-powered content analysis
791
+ """
792
+ for result in results:
793
+ if result.success and result.extracted_data:
794
+ # Analyze content relevance and quality
795
+ analysis_prompt = f"""
796
+ Analyze this scraped content for relevance and quality:
797
+
798
+ **Original Objective:** {request.get('objective', 'General extraction')}
799
+ **Extracted Data:** {json.dumps(result.extracted_data, indent=2, default=str)}
800
+ **URL:** {result.url}
801
+
802
+ Please provide:
803
+ 1. Content quality score (1-10)
804
+ 2. Relevance to objective (1-10)
805
+ 3. Key insights or important information found
806
+ 4. Suggestions for improving extraction
807
+ 5. Data cleaning or formatting recommendations
808
+
809
+ Keep your analysis concise but comprehensive.
810
+ """
811
+
812
+ try:
813
+ async with self._llm as client:
814
+ analysis_response = await client.ask(
815
+ prompt=analysis_prompt,
816
+ system_prompt=self.system_prompt_template,
817
+ model=self._llm_model,
818
+ max_tokens=self._max_tokens,
819
+ temperature=self._llm_temp,
820
+ use_tools=True,
821
+ )
822
+ content = self._safe_extract_text(analysis_response)
823
+ # Add analysis to metadata
824
+ result.metadata.update({
825
+ 'llm_analysis': content,
826
+ 'analysis_timestamp': datetime.now().isoformat(),
827
+ 'enhanced': True,
828
+ 'browser_config_used': self.browser_config.copy()
829
+ })
830
+ except Exception as e:
831
+ self.logger.warning(f"Content analysis failed: {str(e)}")
832
+
833
+ return results
834
+
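After enhancement, each successful result carries the analysis in its metadata. A sketch of the enriched shape, using a plain dict as a stand-in for the result's metadata (field names mirror the update above; the analysis text and browser config values are hypothetical):

```python
from datetime import datetime

metadata = {}  # stand-in for result.metadata on a successful ScrapingResult
metadata.update({
    "llm_analysis": "Quality 8/10, relevance 9/10; prices extracted cleanly ...",
    "analysis_timestamp": datetime.now().isoformat(),
    "enhanced": True,
    "browser_config_used": {"browser": "chromium", "headless": True},  # hypothetical config
})

assert metadata["enhanced"] is True
```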
835
+ def _looks_like_url(self, s: str) -> bool:
836
+ try:
837
+ s = (s or "").strip()
838
+ if not s:
839
+ return False
840
+ return s.startswith(("http://", "https://")) or ('.' in s and ' ' not in s)
841
+ except Exception:
842
+ return False
843
+
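A standalone mirror of the URL heuristic above, handy for spot-checking edge cases (this reimplements the private method for illustration rather than instantiating the agent):

```python
def looks_like_url(s: str) -> bool:
    # Same rule as the method above: explicit scheme, or a dot with no spaces.
    s = (s or "").strip()
    if not s:
        return False
    return s.startswith(("http://", "https://")) or ('.' in s and ' ' not in s)

assert looks_like_url("https://example.com/page")
assert looks_like_url("example.com")                  # bare domains pass the dot check
assert not looks_like_url("click the login button")   # prose contains spaces
assert not looks_like_url("")
```

The dot check is deliberately permissive, so strings such as `v1.2` also pass; downstream sanitisation only replaces targets that fail the check.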
844
+ def _coerce_list_of_dicts(self, maybe_list):
845
+ if maybe_list is None:
846
+ return []
847
+ if isinstance(maybe_list, dict):
848
+ out = []
849
+ for k, v in maybe_list.items():
850
+ if isinstance(v, dict):
851
+ vv = v.copy()
852
+ vv.setdefault("name", k)
853
+ out.append(vv)
854
+ else:
855
+ out.append({"name": str(k), "selector": str(v)})
856
+ return out
857
+ if isinstance(maybe_list, (list, tuple, set)):
858
+ out = []
859
+ for item in maybe_list:
860
+ out.append(item if isinstance(item, dict) else {"selector": str(item)})
861
+ return out
862
+ return [{"selector": str(maybe_list)}]
863
+
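The coercion helper accepts a mapping, a sequence, a scalar, or `None` and always hands back a list of dicts. A standalone mirror of its behaviour with the expected shapes (illustrative only):

```python
def coerce_list_of_dicts(maybe_list):
    # Standalone mirror of the helper above, for illustration only.
    if maybe_list is None:
        return []
    if isinstance(maybe_list, dict):
        out = []
        for k, v in maybe_list.items():
            if isinstance(v, dict):
                vv = v.copy()
                vv.setdefault("name", k)
                out.append(vv)
            else:
                out.append({"name": str(k), "selector": str(v)})
        return out
    if isinstance(maybe_list, (list, tuple, set)):
        return [item if isinstance(item, dict) else {"selector": str(item)} for item in maybe_list]
    return [{"selector": str(maybe_list)}]

assert coerce_list_of_dicts(None) == []
assert coerce_list_of_dicts({"title": "h1.article"}) == [{"name": "title", "selector": "h1.article"}]
assert coerce_list_of_dicts(["h1", ".summary"]) == [{"selector": "h1"}, {"selector": ".summary"}]
assert coerce_list_of_dicts("div.content") == [{"selector": "div.content"}]
```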
864
+ def _sanitize_steps(self, steps_raw, request_url: str) -> list[dict]:
865
+ allowed = {"navigate", "click", "fill", "wait", "scroll", "authenticate", "await_human", "await_keypress", "await_browser_event"}
866
+ steps: list[dict] = []
867
+ for s in self._coerce_list_of_dicts(steps_raw):
868
+ action = self._normalize_action(s.get("action"))
869
+ if action not in allowed:
870
+ continue
871
+ target = self._normalize_target(s.get("target"))
872
+ value = self._normalize_value(s.get("value"))
873
+
874
+ # If navigate target isn't a real URL, force it to request_url
875
+ if action == "navigate" and (not target or not self._looks_like_url(target)):
876
+ target = request_url or target
877
+
878
+ # For non-navigate actions, ensure target is a plausible CSS selector
879
+ if action in {"click", "fill", "wait"}:
880
+ # pick the first of comma-separated list if present
881
+ if target and "," in target:
882
+ target = target.split(",")[0].strip()
883
+ # reject blatant prose targets
884
+ if target and (len(target) > 150 or " the " in target.lower()):
885
+ target = "" # cleared: prose descriptions are not usable selectors
886
+
887
+ steps.append({
888
+ "action": action,
889
+ "target": target or "",
890
+ "value": value,
891
+ "wait_condition": s.get("wait_condition"),
892
+ "timeout": s.get("timeout", 10),
893
+ "description": s.get("description", "")
894
+ })
895
+
896
+ # Ensure we start with a valid navigate
897
+ has_nav = any(st["action"] == "navigate" for st in steps)
898
+ if not has_nav and request_url:
899
+ steps.insert(0, {
900
+ "action": "navigate",
901
+ "target": request_url,
902
+ "value": None,
903
+ "wait_condition": None,
904
+ "timeout": 15,
905
+ "description": "Navigate to target URL"
906
+ })
907
+ else:
908
+ for st in steps:
909
+ if st["action"] == "navigate":
910
+ if not self._looks_like_url(st["target"]) and request_url:
911
+ st["target"] = request_url
912
+ break
913
+ return steps
914
+
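For a sense of what the sanitiser guards against, here is a plan an LLM might emit and the reduced form the method aims to produce (hypothetical data; the exact output also depends on the `_normalize_*` helpers defined elsewhere in the class, and each kept step additionally carries `value`, `wait_condition`, `timeout` and `description` keys):

```python
request_url = "https://shop.example.com/item/42"

raw_steps = [
    {"action": "navigate", "target": "the product page"},           # target is prose, not a URL
    {"action": "click", "target": ".buy-now, button.add-to-cart"},  # comma-separated selector list
    {"action": "screenshot", "target": "body"},                     # action not in the allowed set
]

# Expected outcome (abbreviated to the interesting keys):
#   [{"action": "navigate", "target": "https://shop.example.com/item/42", ...},
#    {"action": "click", "target": ".buy-now", ...}]
# The prose navigate target is replaced by request_url, the click target is trimmed to the
# first selector, and the unsupported "screenshot" step is dropped. If no navigate step
# survived at all, one pointing at request_url would be inserted at position 0.
```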
915
+ def _sanitize_selectors(self, selectors_raw) -> list[dict]:
916
+ cleaned: list[dict] = []
917
+ bad_prefixes = (".0", "#0") # guard against things like ".0.0.1"
918
+ ip_like = re.compile(r'^\d{1,3}(?:\.\d{1,3}){3}$')
919
+
920
+ for sel in self._coerce_list_of_dicts(selectors_raw):
921
+ selector = sel.get("selector") or sel.get("css") or sel.get("target")
922
+ name = sel.get("name") or selector
923
+ if not selector:
924
+ continue
925
+ selector = str(selector).strip()
926
+ name = str(name)
927
+
928
+ # Drop IPs or clearly invalid CSS like ".0.0.1"
929
+ if selector.startswith(bad_prefixes) or ip_like.match(selector):
930
+ continue
931
+ # Very weak CSS plausibility check
932
+ if not any(ch in selector for ch in ('.', '#', '[', '>', ':')) and ' ' not in selector:
933
+ # allow a handful of common tag-only selectors via an explicit whitelist
934
+ if selector.lower() not in {"a", "h1", "h2", "h3", "p", "span", "div"}:
935
+ continue
936
+
937
+ cleaned.append({
938
+ "name": name,
939
+ "selector": selector,
940
+ "selector_type": str(sel.get("selector_type", "css")),
941
+ "extract_type": str(sel.get("extract_type", "text")),
942
+ "attribute": (str(sel["attribute"]) if sel.get("attribute") is not None else None),
943
+ "multiple": bool(sel.get("multiple", True))
944
+ })
945
+ return cleaned
946
+
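Examples of selectors that survive or get dropped under the rules above (hypothetical candidates; kept entries are normalised with `selector_type='css'`, `extract_type='text'` and `multiple=True` defaults):

```python
candidates = [
    {"name": "article_title", "selector": "h2.post-title"},  # kept: contains '.'
    {"name": "summary", "selector": "p"},                    # kept: whitelisted bare tag
    {"name": "host", "selector": "127.0.0.1"},                # dropped: matches the IP guard
    {"name": "body_text", "selector": "maincontent"},         # dropped: bare word, not whitelisted
]
# Only "article_title" and "summary" make it into the cleaned list.
```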
947
+ def _sanitize_plan(self, plan: dict, request: dict) -> dict:
948
+ url = request.get("target_url") or request.get("base_url") or ""
949
+ plan = dict(plan or {})
950
+ plan["steps"] = self._sanitize_steps(plan.get("steps") or [], url)
951
+ plan["selectors"] = self._sanitize_selectors(plan.get("selectors") or [])
952
+ bcfg = plan.get("browser_config")
953
+ if not isinstance(bcfg, dict):
954
+ bcfg = {}
955
+ plan["browser_config"] = bcfg
956
+ return plan
957
+
958
+ def _parse_scraping_plan(self, llm_response: str) -> Dict[str, Any]:
959
+ """
960
+ Parse LLM response to extract structured scraping plan
961
+ """
962
+ try:
963
+ plan = {
964
+ 'steps': [],
965
+ 'selectors': [],
966
+ 'browser_config': {},
967
+ 'analysis': llm_response,
968
+ 'parsed_successfully': False
969
+ }
970
+
971
+ # Extract JSON sections from the response
972
+ json_blocks = re.findall(r'```json\s*(\{.*?\}|\[.*?\])\s*```', llm_response, re.DOTALL)
973
+
974
+ for block in json_blocks:
975
+ try:
976
+ parsed = json.loads(block)
977
+ if isinstance(parsed, list):
978
+ # Could be steps or selectors
979
+ if parsed and isinstance(parsed[0], dict) and 'action' in parsed[0]:
980
+ plan['steps'] = parsed
981
+ elif parsed and isinstance(parsed[0], dict) and 'selector' in parsed[0]:
982
+ plan['selectors'] = parsed
983
+ elif isinstance(parsed, dict):
984
+ # Could be browser config
985
+ if any(key in parsed for key in ['browser', 'headless', 'mobile']):
986
+ plan['browser_config'] = parsed
987
+ except json.JSONDecodeError:
988
+ continue
989
+
990
+ # Fallback: try to extract from text
991
+ if not plan['steps']:
992
+ plan['steps'] = self._extract_steps_from_text(llm_response)
993
+
994
+ if not plan['selectors']:
995
+ plan['selectors'] = self._extract_selectors_from_text(llm_response)
996
+
997
+ plan['parsed_successfully'] = bool(plan['steps'] or plan['selectors'])
998
+ return plan
999
+
1000
+ except Exception as e:
1001
+ self.logger.error(f"Failed to parse scraping plan: {str(e)}")
1002
+ return {
1003
+ 'steps': [],
1004
+ 'selectors': [],
1005
+ 'browser_config': {},
1006
+ 'analysis': llm_response,
1007
+ 'parsed_successfully': False,
1008
+ 'parse_error': str(e)
1009
+ }
1010
+
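A quick standalone check of the fenced-JSON extraction the parser relies on (this mirrors only the regex; classifying the parsed blocks into steps, selectors, or browser config happens in the method above). The fence string is assembled at runtime purely to keep literal backtick runs out of this snippet:

```python
import json
import re

fence = "`" * 3  # assembled at runtime only to avoid literal backtick runs in this snippet
reply = (
    "Here is the plan.\n"
    + fence + "json\n"
    + '[{"action": "navigate", "target": "https://example.com"}]\n'
    + fence + "\n"
)

pattern = fence + r'json\s*(\{.*?\}|\[.*?\])\s*' + fence
blocks = re.findall(pattern, reply, re.DOTALL)
steps = json.loads(blocks[0])
assert steps[0]["action"] == "navigate"
```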
1011
+ def _extract_steps_from_text(self, text: str) -> List[Dict[str, Any]]:
1012
+ """Fallback method to extract steps from unstructured text"""
1013
+ steps = []
1014
+
1015
+ # Look for step patterns in text
1016
+ step_patterns = [
1017
+ r'navigate to (.*?)[\n\.]',
1018
+ r'click on (.*?)[\n\.]',
1019
+ r'fill (.*?) with (.*?)[\n\.]',
1020
+ r'wait for (.*?)[\n\.]',
1021
+ r'scroll to (.*?)[\n\.]'
1022
+ ]
1023
+
1024
+ actions = ['navigate', 'click', 'fill', 'wait', 'scroll']
1025
+
1026
+ for i, pattern in enumerate(step_patterns):
1027
+ matches = re.findall(pattern, text, re.IGNORECASE)
1028
+ for match in matches:
1029
+ if isinstance(match, tuple):
1030
+ # For fill action
1031
+ steps.append({
1032
+ 'action': actions[i],
1033
+ 'target': match[0].strip(),
1034
+ 'value': match[1].strip() if len(match) > 1 else None,
1035
+ 'description': f"{actions[i].title()} {match[0].strip()}"
1036
+ })
1037
+ else:
1038
+ steps.append({
1039
+ 'action': actions[i],
1040
+ 'target': match.strip(),
1041
+ 'description': f"{actions[i].title()} {match.strip()}"
1042
+ })
1043
+
1044
+ return steps
1045
+
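The prose fallback above is intentionally rough; note in particular that the non-greedy match stops at the first period, so dotted URLs get truncated (the structured JSON path is the primary mechanism). A standalone illustration:

```python
import re

text = "First, navigate to https://example.com/products. Then click on the search box."

print(re.findall(r'navigate to (.*?)[\n\.]', text, re.IGNORECASE))
# -> ['https://example']  (the match stops at the '.' inside the domain)

print(re.findall(r'click on (.*?)[\n\.]', text, re.IGNORECASE))
# -> ['the search box']
```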
1046
+ def _extract_selectors_from_text(self, text: str) -> List[Dict[str, Any]]:
1047
+ """Fallback method to extract selectors from unstructured text"""
1048
+ selectors = []
1049
+
1050
+ # Look for selector patterns
1051
+ css_selectors = re.findall(r'[\.#][\w-]+(?:\s*[\.#][\w-]+)*', text)
1052
+
1053
+ for i, selector in enumerate(css_selectors):
1054
+ selectors.append({
1055
+ 'name': f'selector_{i+1}',
1056
+ 'selector': selector.strip(),
1057
+ 'selector_type': 'css',
1058
+ 'extract_type': 'text'
1059
+ })
1060
+
1061
+ return selectors
1062
+
1063
+ def _extract_domain(self, url: str) -> Optional[str]:
1064
+ """Extract domain from URL"""
1065
+ try:
1066
+ parsed = urlparse(url)
1067
+ return parsed.netloc if parsed.netloc else None
1068
+ except Exception:
1069
+ return None
1070
+
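`urlparse` only fills in `netloc` when the URL carries a scheme (or a leading `//`), which is why the helper returns `None` for bare strings such as `example.com`; a quick check:

```python
from urllib.parse import urlparse

assert urlparse("https://shop.example.com/item/42").netloc == "shop.example.com"
assert urlparse("example.com").netloc == ""  # no scheme: netloc is empty, so the helper returns None
```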
1071
+ def _update_site_knowledge(
1072
+ self,
1073
+ request: Dict[str, Any],
1074
+ results: List[ScrapingResult]
1075
+ ):
1076
+ """Update our knowledge base about specific sites"""
1077
+ domain = self._extract_domain(request.get('target_url', ''))
1078
+ if domain and domain in self.site_knowledge:
1079
+ successful_results = [r for r in results if r.success]
1080
+ success_rate = len(successful_results) / len(results) if results else 0.0
1081
+
1082
+ self.site_knowledge[domain].update({
1083
+ 'success_rate': success_rate,
1084
+ 'last_scrape': datetime.now().isoformat(),
1085
+ 'total_attempts': self.site_knowledge[domain].get('total_attempts', 0) + 1,
1086
+ 'last_successful_config': self.browser_config.copy() if success_rate > 0.5 else None
1087
+ })
1088
+
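The per-domain bookkeeping above only updates entries that already exist in `self.site_knowledge`. A small sketch of the arithmetic and the fields it maintains (hypothetical data):

```python
from datetime import datetime

site_knowledge = {"shop.example.com": {"total_attempts": 3}}  # pre-existing entry (hypothetical)
results_success = [True, True, False]                          # outcome of the latest run

success_rate = sum(results_success) / len(results_success) if results_success else 0.0
site_knowledge["shop.example.com"].update({
    "success_rate": success_rate,                              # ~0.67 for this run
    "last_scrape": datetime.now().isoformat(),
    "total_attempts": site_knowledge["shop.example.com"].get("total_attempts", 0) + 1,
    "last_successful_config": {"headless": True} if success_rate > 0.5 else None,
})

assert site_knowledge["shop.example.com"]["total_attempts"] == 4
```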
1089
+ async def get_site_recommendations(self, url: str) -> Dict[str, Any]:
1090
+ """Get comprehensive recommendations for scraping a specific site"""
1091
+ domain = self._extract_domain(url)
1092
+ recommendations = {
1093
+ 'domain': domain,
1094
+ 'browser_recommendation': None,
1095
+ 'scraping_strategy': None,
1096
+ 'historical_data': None
1097
+ }
1098
+
1099
+ # Get browser recommendation
1100
+ browser_rec = await self.recommend_browser_for_site(url)
1101
+ recommendations['browser_recommendation'] = browser_rec
1102
+
1103
+ # Get historical data if available
1104
+ if domain in self.site_knowledge:
1105
+ knowledge = self.site_knowledge[domain]
1106
+ recommendations['historical_data'] = {
1107
+ 'success_rate': knowledge.get('success_rate', 0.0),
1108
+ 'last_successful_scrape': knowledge.get('last_scrape'),
1109
+ 'total_attempts': knowledge.get('total_attempts', 0),
1110
+ 'last_successful_config': knowledge.get('last_successful_config')
1111
+ }
1112
+
1113
+ # Generate comprehensive strategy recommendations
1114
+ strategy_prompt = f"""
1115
+ Provide comprehensive scraping strategy recommendations for this site:
1116
+
1117
+ **Domain:** {domain}
1118
+ **URL:** {url}
1119
+ **Browser Recommendation:** {json.dumps(browser_rec, indent=2)}
1120
+ **Historical Data:** {json.dumps(recommendations.get('historical_data', {}), indent=2)}
1121
+
1122
+ Please suggest:
1123
+ 1. Overall scraping approach and strategy
1124
+ 2. Timing and rate limiting recommendations
1125
+ 3. Common challenges and how to handle them
1126
+ 4. Authentication strategies if needed
1127
+ 5. Content extraction best practices
1128
+ 6. Error handling and recovery strategies
1129
+ """
1130
+
1131
+ try:
1132
+ async with self._llm as client:
1133
+ strategy_response = await client.ask(
1134
+ prompt=strategy_prompt,
1135
+ system_prompt=self.system_prompt_template,
1136
+ model=self._llm_model,
1137
+ max_tokens=self._max_tokens,
1138
+ temperature=self._llm_temp,
1139
+ use_tools=True,
1140
+ )
1141
+ recommendations['scraping_strategy'] = self._safe_extract_text(strategy_response)
1142
+ except Exception as e:
1143
+ self.logger.warning(f"Failed to generate strategy recommendations: {str(e)}")
1144
+ recommendations['scraping_strategy'] = "Unable to generate strategy recommendations"
1145
+
1146
+ return recommendations
1147
+
1148
+ async def cleanup(self):
1149
+ """Clean up resources"""
1150
+ if hasattr(self.scraping_tool, 'cleanup'):
1151
+ await self.scraping_tool.cleanup()
1152
+
1153
+ def get_available_templates(self) -> Dict[str, str]:
1154
+ """Get list of available scraping templates"""
1155
+ return {domain: template.get('guidance', 'No guidance available')
1156
+ for domain, template in self.scraping_templates.items()}
1157
+
1158
+ def get_template_for_url(self, url: str) -> Optional[Dict[str, Any]]:
1159
+ """Get the best matching template for a given URL"""
1160
+ domain = self._extract_domain(url)
1161
+ if not domain:
1162
+ return None
1163
+
1164
+ # Check for exact match
1165
+ if domain in self.scraping_templates:
1166
+ return self.scraping_templates[domain]
1167
+
1168
+ # Check for partial matches
1169
+ for template_domain, template_data in self.scraping_templates.items():
1170
+ if template_domain in domain or domain in template_domain:
1171
+ return template_data
1172
+
1173
+ return None