ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (535) hide show
  1. agentui/.prettierrc +15 -0
  2. agentui/QUICKSTART.md +272 -0
  3. agentui/README.md +59 -0
  4. agentui/env.example +16 -0
  5. agentui/jsconfig.json +14 -0
  6. agentui/package-lock.json +4242 -0
  7. agentui/package.json +34 -0
  8. agentui/scripts/postinstall/apply-patches.mjs +260 -0
  9. agentui/src/app.css +61 -0
  10. agentui/src/app.d.ts +13 -0
  11. agentui/src/app.html +12 -0
  12. agentui/src/components/LoadingSpinner.svelte +64 -0
  13. agentui/src/components/ThemeSwitcher.svelte +159 -0
  14. agentui/src/components/index.js +4 -0
  15. agentui/src/lib/api/bots.ts +60 -0
  16. agentui/src/lib/api/chat.ts +22 -0
  17. agentui/src/lib/api/http.ts +25 -0
  18. agentui/src/lib/components/BotCard.svelte +33 -0
  19. agentui/src/lib/components/ChatBubble.svelte +63 -0
  20. agentui/src/lib/components/Toast.svelte +21 -0
  21. agentui/src/lib/config.ts +20 -0
  22. agentui/src/lib/stores/auth.svelte.ts +73 -0
  23. agentui/src/lib/stores/theme.svelte.js +64 -0
  24. agentui/src/lib/stores/toast.svelte.ts +31 -0
  25. agentui/src/lib/utils/conversation.ts +39 -0
  26. agentui/src/routes/+layout.svelte +20 -0
  27. agentui/src/routes/+page.svelte +232 -0
  28. agentui/src/routes/login/+page.svelte +200 -0
  29. agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
  30. agentui/src/routes/talk/[agentId]/+page.ts +7 -0
  31. agentui/static/README.md +1 -0
  32. agentui/svelte.config.js +11 -0
  33. agentui/tailwind.config.ts +53 -0
  34. agentui/tsconfig.json +3 -0
  35. agentui/vite.config.ts +10 -0
  36. ai_parrot-0.17.2.dist-info/METADATA +472 -0
  37. ai_parrot-0.17.2.dist-info/RECORD +535 -0
  38. ai_parrot-0.17.2.dist-info/WHEEL +6 -0
  39. ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
  40. ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
  41. ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
  42. crew-builder/.prettierrc +15 -0
  43. crew-builder/QUICKSTART.md +259 -0
  44. crew-builder/README.md +113 -0
  45. crew-builder/env.example +17 -0
  46. crew-builder/jsconfig.json +14 -0
  47. crew-builder/package-lock.json +4182 -0
  48. crew-builder/package.json +37 -0
  49. crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
  50. crew-builder/src/app.css +62 -0
  51. crew-builder/src/app.d.ts +13 -0
  52. crew-builder/src/app.html +12 -0
  53. crew-builder/src/components/LoadingSpinner.svelte +64 -0
  54. crew-builder/src/components/ThemeSwitcher.svelte +149 -0
  55. crew-builder/src/components/index.js +9 -0
  56. crew-builder/src/lib/api/bots.ts +60 -0
  57. crew-builder/src/lib/api/chat.ts +80 -0
  58. crew-builder/src/lib/api/client.ts +56 -0
  59. crew-builder/src/lib/api/crew/crew.ts +136 -0
  60. crew-builder/src/lib/api/index.ts +5 -0
  61. crew-builder/src/lib/api/o365/auth.ts +65 -0
  62. crew-builder/src/lib/auth/auth.ts +54 -0
  63. crew-builder/src/lib/components/AgentNode.svelte +43 -0
  64. crew-builder/src/lib/components/BotCard.svelte +33 -0
  65. crew-builder/src/lib/components/ChatBubble.svelte +67 -0
  66. crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
  67. crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
  68. crew-builder/src/lib/components/JsonViewer.svelte +24 -0
  69. crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
  70. crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
  71. crew-builder/src/lib/components/Toast.svelte +67 -0
  72. crew-builder/src/lib/components/Toolbar.svelte +157 -0
  73. crew-builder/src/lib/components/index.ts +10 -0
  74. crew-builder/src/lib/config.ts +8 -0
  75. crew-builder/src/lib/stores/auth.svelte.ts +228 -0
  76. crew-builder/src/lib/stores/crewStore.ts +369 -0
  77. crew-builder/src/lib/stores/theme.svelte.js +145 -0
  78. crew-builder/src/lib/stores/toast.svelte.ts +69 -0
  79. crew-builder/src/lib/utils/conversation.ts +39 -0
  80. crew-builder/src/lib/utils/markdown.ts +122 -0
  81. crew-builder/src/lib/utils/talkHistory.ts +47 -0
  82. crew-builder/src/routes/+layout.svelte +20 -0
  83. crew-builder/src/routes/+page.svelte +539 -0
  84. crew-builder/src/routes/agents/+page.svelte +247 -0
  85. crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
  86. crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
  87. crew-builder/src/routes/builder/+page.svelte +204 -0
  88. crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
  89. crew-builder/src/routes/crew/ask/+page.ts +1 -0
  90. crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
  91. crew-builder/src/routes/login/+page.svelte +197 -0
  92. crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
  93. crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
  94. crew-builder/static/README.md +1 -0
  95. crew-builder/svelte.config.js +11 -0
  96. crew-builder/tailwind.config.ts +53 -0
  97. crew-builder/tsconfig.json +3 -0
  98. crew-builder/vite.config.ts +10 -0
  99. mcp_servers/calculator_server.py +309 -0
  100. parrot/__init__.py +27 -0
  101. parrot/__pycache__/__init__.cpython-310.pyc +0 -0
  102. parrot/__pycache__/version.cpython-310.pyc +0 -0
  103. parrot/_version.py +34 -0
  104. parrot/a2a/__init__.py +48 -0
  105. parrot/a2a/client.py +658 -0
  106. parrot/a2a/discovery.py +89 -0
  107. parrot/a2a/mixin.py +257 -0
  108. parrot/a2a/models.py +376 -0
  109. parrot/a2a/server.py +770 -0
  110. parrot/agents/__init__.py +29 -0
  111. parrot/bots/__init__.py +12 -0
  112. parrot/bots/a2a_agent.py +19 -0
  113. parrot/bots/abstract.py +3139 -0
  114. parrot/bots/agent.py +1129 -0
  115. parrot/bots/basic.py +9 -0
  116. parrot/bots/chatbot.py +669 -0
  117. parrot/bots/data.py +1618 -0
  118. parrot/bots/database/__init__.py +5 -0
  119. parrot/bots/database/abstract.py +3071 -0
  120. parrot/bots/database/cache.py +286 -0
  121. parrot/bots/database/models.py +468 -0
  122. parrot/bots/database/prompts.py +154 -0
  123. parrot/bots/database/retries.py +98 -0
  124. parrot/bots/database/router.py +269 -0
  125. parrot/bots/database/sql.py +41 -0
  126. parrot/bots/db/__init__.py +6 -0
  127. parrot/bots/db/abstract.py +556 -0
  128. parrot/bots/db/bigquery.py +602 -0
  129. parrot/bots/db/cache.py +85 -0
  130. parrot/bots/db/documentdb.py +668 -0
  131. parrot/bots/db/elastic.py +1014 -0
  132. parrot/bots/db/influx.py +898 -0
  133. parrot/bots/db/mock.py +96 -0
  134. parrot/bots/db/multi.py +783 -0
  135. parrot/bots/db/prompts.py +185 -0
  136. parrot/bots/db/sql.py +1255 -0
  137. parrot/bots/db/tools.py +212 -0
  138. parrot/bots/document.py +680 -0
  139. parrot/bots/hrbot.py +15 -0
  140. parrot/bots/kb.py +170 -0
  141. parrot/bots/mcp.py +36 -0
  142. parrot/bots/orchestration/README.md +463 -0
  143. parrot/bots/orchestration/__init__.py +1 -0
  144. parrot/bots/orchestration/agent.py +155 -0
  145. parrot/bots/orchestration/crew.py +3330 -0
  146. parrot/bots/orchestration/fsm.py +1179 -0
  147. parrot/bots/orchestration/hr.py +434 -0
  148. parrot/bots/orchestration/storage/__init__.py +4 -0
  149. parrot/bots/orchestration/storage/memory.py +100 -0
  150. parrot/bots/orchestration/storage/mixin.py +119 -0
  151. parrot/bots/orchestration/verify.py +202 -0
  152. parrot/bots/product.py +204 -0
  153. parrot/bots/prompts/__init__.py +96 -0
  154. parrot/bots/prompts/agents.py +155 -0
  155. parrot/bots/prompts/data.py +216 -0
  156. parrot/bots/prompts/output_generation.py +8 -0
  157. parrot/bots/scraper/__init__.py +3 -0
  158. parrot/bots/scraper/models.py +122 -0
  159. parrot/bots/scraper/scraper.py +1173 -0
  160. parrot/bots/scraper/templates.py +115 -0
  161. parrot/bots/stores/__init__.py +5 -0
  162. parrot/bots/stores/local.py +172 -0
  163. parrot/bots/webdev.py +81 -0
  164. parrot/cli.py +17 -0
  165. parrot/clients/__init__.py +16 -0
  166. parrot/clients/base.py +1491 -0
  167. parrot/clients/claude.py +1191 -0
  168. parrot/clients/factory.py +129 -0
  169. parrot/clients/google.py +4567 -0
  170. parrot/clients/gpt.py +1975 -0
  171. parrot/clients/grok.py +432 -0
  172. parrot/clients/groq.py +986 -0
  173. parrot/clients/hf.py +582 -0
  174. parrot/clients/models.py +18 -0
  175. parrot/conf.py +395 -0
  176. parrot/embeddings/__init__.py +9 -0
  177. parrot/embeddings/base.py +157 -0
  178. parrot/embeddings/google.py +98 -0
  179. parrot/embeddings/huggingface.py +74 -0
  180. parrot/embeddings/openai.py +84 -0
  181. parrot/embeddings/processor.py +88 -0
  182. parrot/exceptions.c +13868 -0
  183. parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
  184. parrot/exceptions.pxd +22 -0
  185. parrot/exceptions.pxi +15 -0
  186. parrot/exceptions.pyx +44 -0
  187. parrot/generators/__init__.py +29 -0
  188. parrot/generators/base.py +200 -0
  189. parrot/generators/html.py +293 -0
  190. parrot/generators/react.py +205 -0
  191. parrot/generators/streamlit.py +203 -0
  192. parrot/generators/template.py +105 -0
  193. parrot/handlers/__init__.py +4 -0
  194. parrot/handlers/agent.py +861 -0
  195. parrot/handlers/agents/__init__.py +1 -0
  196. parrot/handlers/agents/abstract.py +900 -0
  197. parrot/handlers/bots.py +338 -0
  198. parrot/handlers/chat.py +915 -0
  199. parrot/handlers/creation.sql +192 -0
  200. parrot/handlers/crew/ARCHITECTURE.md +362 -0
  201. parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
  202. parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
  203. parrot/handlers/crew/__init__.py +0 -0
  204. parrot/handlers/crew/handler.py +801 -0
  205. parrot/handlers/crew/models.py +229 -0
  206. parrot/handlers/crew/redis_persistence.py +523 -0
  207. parrot/handlers/jobs/__init__.py +10 -0
  208. parrot/handlers/jobs/job.py +384 -0
  209. parrot/handlers/jobs/mixin.py +627 -0
  210. parrot/handlers/jobs/models.py +115 -0
  211. parrot/handlers/jobs/worker.py +31 -0
  212. parrot/handlers/models.py +596 -0
  213. parrot/handlers/o365_auth.py +105 -0
  214. parrot/handlers/stream.py +337 -0
  215. parrot/interfaces/__init__.py +6 -0
  216. parrot/interfaces/aws.py +143 -0
  217. parrot/interfaces/credentials.py +113 -0
  218. parrot/interfaces/database.py +27 -0
  219. parrot/interfaces/google.py +1123 -0
  220. parrot/interfaces/hierarchy.py +1227 -0
  221. parrot/interfaces/http.py +651 -0
  222. parrot/interfaces/images/__init__.py +0 -0
  223. parrot/interfaces/images/plugins/__init__.py +24 -0
  224. parrot/interfaces/images/plugins/abstract.py +58 -0
  225. parrot/interfaces/images/plugins/analisys.py +148 -0
  226. parrot/interfaces/images/plugins/classify.py +150 -0
  227. parrot/interfaces/images/plugins/classifybase.py +182 -0
  228. parrot/interfaces/images/plugins/detect.py +150 -0
  229. parrot/interfaces/images/plugins/exif.py +1103 -0
  230. parrot/interfaces/images/plugins/hash.py +52 -0
  231. parrot/interfaces/images/plugins/vision.py +104 -0
  232. parrot/interfaces/images/plugins/yolo.py +66 -0
  233. parrot/interfaces/images/plugins/zerodetect.py +197 -0
  234. parrot/interfaces/o365.py +978 -0
  235. parrot/interfaces/onedrive.py +822 -0
  236. parrot/interfaces/sharepoint.py +1435 -0
  237. parrot/interfaces/soap.py +257 -0
  238. parrot/loaders/__init__.py +8 -0
  239. parrot/loaders/abstract.py +1131 -0
  240. parrot/loaders/audio.py +199 -0
  241. parrot/loaders/basepdf.py +53 -0
  242. parrot/loaders/basevideo.py +1568 -0
  243. parrot/loaders/csv.py +409 -0
  244. parrot/loaders/docx.py +116 -0
  245. parrot/loaders/epubloader.py +316 -0
  246. parrot/loaders/excel.py +199 -0
  247. parrot/loaders/factory.py +55 -0
  248. parrot/loaders/files/__init__.py +0 -0
  249. parrot/loaders/files/abstract.py +39 -0
  250. parrot/loaders/files/html.py +26 -0
  251. parrot/loaders/files/text.py +63 -0
  252. parrot/loaders/html.py +152 -0
  253. parrot/loaders/markdown.py +442 -0
  254. parrot/loaders/pdf.py +373 -0
  255. parrot/loaders/pdfmark.py +320 -0
  256. parrot/loaders/pdftables.py +506 -0
  257. parrot/loaders/ppt.py +476 -0
  258. parrot/loaders/qa.py +63 -0
  259. parrot/loaders/splitters/__init__.py +10 -0
  260. parrot/loaders/splitters/base.py +138 -0
  261. parrot/loaders/splitters/md.py +228 -0
  262. parrot/loaders/splitters/token.py +143 -0
  263. parrot/loaders/txt.py +26 -0
  264. parrot/loaders/video.py +89 -0
  265. parrot/loaders/videolocal.py +218 -0
  266. parrot/loaders/videounderstanding.py +377 -0
  267. parrot/loaders/vimeo.py +167 -0
  268. parrot/loaders/web.py +599 -0
  269. parrot/loaders/youtube.py +504 -0
  270. parrot/manager/__init__.py +5 -0
  271. parrot/manager/manager.py +1030 -0
  272. parrot/mcp/__init__.py +28 -0
  273. parrot/mcp/adapter.py +105 -0
  274. parrot/mcp/cli.py +174 -0
  275. parrot/mcp/client.py +119 -0
  276. parrot/mcp/config.py +75 -0
  277. parrot/mcp/integration.py +842 -0
  278. parrot/mcp/oauth.py +933 -0
  279. parrot/mcp/server.py +225 -0
  280. parrot/mcp/transports/__init__.py +3 -0
  281. parrot/mcp/transports/base.py +279 -0
  282. parrot/mcp/transports/grpc_session.py +163 -0
  283. parrot/mcp/transports/http.py +312 -0
  284. parrot/mcp/transports/mcp.proto +108 -0
  285. parrot/mcp/transports/quic.py +1082 -0
  286. parrot/mcp/transports/sse.py +330 -0
  287. parrot/mcp/transports/stdio.py +309 -0
  288. parrot/mcp/transports/unix.py +395 -0
  289. parrot/mcp/transports/websocket.py +547 -0
  290. parrot/memory/__init__.py +16 -0
  291. parrot/memory/abstract.py +209 -0
  292. parrot/memory/agent.py +32 -0
  293. parrot/memory/cache.py +175 -0
  294. parrot/memory/core.py +555 -0
  295. parrot/memory/file.py +153 -0
  296. parrot/memory/mem.py +131 -0
  297. parrot/memory/redis.py +613 -0
  298. parrot/models/__init__.py +46 -0
  299. parrot/models/basic.py +118 -0
  300. parrot/models/compliance.py +208 -0
  301. parrot/models/crew.py +395 -0
  302. parrot/models/detections.py +654 -0
  303. parrot/models/generation.py +85 -0
  304. parrot/models/google.py +223 -0
  305. parrot/models/groq.py +23 -0
  306. parrot/models/openai.py +30 -0
  307. parrot/models/outputs.py +285 -0
  308. parrot/models/responses.py +938 -0
  309. parrot/notifications/__init__.py +743 -0
  310. parrot/openapi/__init__.py +3 -0
  311. parrot/openapi/components.yaml +641 -0
  312. parrot/openapi/config.py +322 -0
  313. parrot/outputs/__init__.py +32 -0
  314. parrot/outputs/formats/__init__.py +108 -0
  315. parrot/outputs/formats/altair.py +359 -0
  316. parrot/outputs/formats/application.py +122 -0
  317. parrot/outputs/formats/base.py +351 -0
  318. parrot/outputs/formats/bokeh.py +356 -0
  319. parrot/outputs/formats/card.py +424 -0
  320. parrot/outputs/formats/chart.py +436 -0
  321. parrot/outputs/formats/d3.py +255 -0
  322. parrot/outputs/formats/echarts.py +310 -0
  323. parrot/outputs/formats/generators/__init__.py +0 -0
  324. parrot/outputs/formats/generators/abstract.py +61 -0
  325. parrot/outputs/formats/generators/panel.py +145 -0
  326. parrot/outputs/formats/generators/streamlit.py +86 -0
  327. parrot/outputs/formats/generators/terminal.py +63 -0
  328. parrot/outputs/formats/holoviews.py +310 -0
  329. parrot/outputs/formats/html.py +147 -0
  330. parrot/outputs/formats/jinja2.py +46 -0
  331. parrot/outputs/formats/json.py +87 -0
  332. parrot/outputs/formats/map.py +933 -0
  333. parrot/outputs/formats/markdown.py +172 -0
  334. parrot/outputs/formats/matplotlib.py +237 -0
  335. parrot/outputs/formats/mixins/__init__.py +0 -0
  336. parrot/outputs/formats/mixins/emaps.py +855 -0
  337. parrot/outputs/formats/plotly.py +341 -0
  338. parrot/outputs/formats/seaborn.py +310 -0
  339. parrot/outputs/formats/table.py +397 -0
  340. parrot/outputs/formats/template_report.py +138 -0
  341. parrot/outputs/formats/yaml.py +125 -0
  342. parrot/outputs/formatter.py +152 -0
  343. parrot/outputs/templates/__init__.py +95 -0
  344. parrot/pipelines/__init__.py +0 -0
  345. parrot/pipelines/abstract.py +210 -0
  346. parrot/pipelines/detector.py +124 -0
  347. parrot/pipelines/models.py +90 -0
  348. parrot/pipelines/planogram.py +3002 -0
  349. parrot/pipelines/table.sql +97 -0
  350. parrot/plugins/__init__.py +106 -0
  351. parrot/plugins/importer.py +80 -0
  352. parrot/py.typed +0 -0
  353. parrot/registry/__init__.py +18 -0
  354. parrot/registry/registry.py +594 -0
  355. parrot/scheduler/__init__.py +1189 -0
  356. parrot/scheduler/models.py +60 -0
  357. parrot/security/__init__.py +16 -0
  358. parrot/security/prompt_injection.py +268 -0
  359. parrot/security/security_events.sql +25 -0
  360. parrot/services/__init__.py +1 -0
  361. parrot/services/mcp/__init__.py +8 -0
  362. parrot/services/mcp/config.py +13 -0
  363. parrot/services/mcp/server.py +295 -0
  364. parrot/services/o365_remote_auth.py +235 -0
  365. parrot/stores/__init__.py +7 -0
  366. parrot/stores/abstract.py +352 -0
  367. parrot/stores/arango.py +1090 -0
  368. parrot/stores/bigquery.py +1377 -0
  369. parrot/stores/cache.py +106 -0
  370. parrot/stores/empty.py +10 -0
  371. parrot/stores/faiss_store.py +1157 -0
  372. parrot/stores/kb/__init__.py +9 -0
  373. parrot/stores/kb/abstract.py +68 -0
  374. parrot/stores/kb/cache.py +165 -0
  375. parrot/stores/kb/doc.py +325 -0
  376. parrot/stores/kb/hierarchy.py +346 -0
  377. parrot/stores/kb/local.py +457 -0
  378. parrot/stores/kb/prompt.py +28 -0
  379. parrot/stores/kb/redis.py +659 -0
  380. parrot/stores/kb/store.py +115 -0
  381. parrot/stores/kb/user.py +374 -0
  382. parrot/stores/models.py +59 -0
  383. parrot/stores/pgvector.py +3 -0
  384. parrot/stores/postgres.py +2853 -0
  385. parrot/stores/utils/__init__.py +0 -0
  386. parrot/stores/utils/chunking.py +197 -0
  387. parrot/telemetry/__init__.py +3 -0
  388. parrot/telemetry/mixin.py +111 -0
  389. parrot/template/__init__.py +3 -0
  390. parrot/template/engine.py +259 -0
  391. parrot/tools/__init__.py +23 -0
  392. parrot/tools/abstract.py +644 -0
  393. parrot/tools/agent.py +363 -0
  394. parrot/tools/arangodbsearch.py +537 -0
  395. parrot/tools/arxiv_tool.py +188 -0
  396. parrot/tools/calculator/__init__.py +3 -0
  397. parrot/tools/calculator/operations/__init__.py +38 -0
  398. parrot/tools/calculator/operations/calculus.py +80 -0
  399. parrot/tools/calculator/operations/statistics.py +76 -0
  400. parrot/tools/calculator/tool.py +150 -0
  401. parrot/tools/cloudwatch.py +988 -0
  402. parrot/tools/codeinterpreter/__init__.py +127 -0
  403. parrot/tools/codeinterpreter/executor.py +371 -0
  404. parrot/tools/codeinterpreter/internals.py +473 -0
  405. parrot/tools/codeinterpreter/models.py +643 -0
  406. parrot/tools/codeinterpreter/prompts.py +224 -0
  407. parrot/tools/codeinterpreter/tool.py +664 -0
  408. parrot/tools/company_info/__init__.py +6 -0
  409. parrot/tools/company_info/tool.py +1138 -0
  410. parrot/tools/correlationanalysis.py +437 -0
  411. parrot/tools/database/abstract.py +286 -0
  412. parrot/tools/database/bq.py +115 -0
  413. parrot/tools/database/cache.py +284 -0
  414. parrot/tools/database/models.py +95 -0
  415. parrot/tools/database/pg.py +343 -0
  416. parrot/tools/databasequery.py +1159 -0
  417. parrot/tools/db.py +1800 -0
  418. parrot/tools/ddgo.py +370 -0
  419. parrot/tools/decorators.py +271 -0
  420. parrot/tools/dftohtml.py +282 -0
  421. parrot/tools/document.py +549 -0
  422. parrot/tools/ecs.py +819 -0
  423. parrot/tools/edareport.py +368 -0
  424. parrot/tools/elasticsearch.py +1049 -0
  425. parrot/tools/employees.py +462 -0
  426. parrot/tools/epson/__init__.py +96 -0
  427. parrot/tools/excel.py +683 -0
  428. parrot/tools/file/__init__.py +13 -0
  429. parrot/tools/file/abstract.py +76 -0
  430. parrot/tools/file/gcs.py +378 -0
  431. parrot/tools/file/local.py +284 -0
  432. parrot/tools/file/s3.py +511 -0
  433. parrot/tools/file/tmp.py +309 -0
  434. parrot/tools/file/tool.py +501 -0
  435. parrot/tools/file_reader.py +129 -0
  436. parrot/tools/flowtask/__init__.py +19 -0
  437. parrot/tools/flowtask/tool.py +761 -0
  438. parrot/tools/gittoolkit.py +508 -0
  439. parrot/tools/google/__init__.py +18 -0
  440. parrot/tools/google/base.py +169 -0
  441. parrot/tools/google/tools.py +1251 -0
  442. parrot/tools/googlelocation.py +5 -0
  443. parrot/tools/googleroutes.py +5 -0
  444. parrot/tools/googlesearch.py +5 -0
  445. parrot/tools/googlesitesearch.py +5 -0
  446. parrot/tools/googlevoice.py +2 -0
  447. parrot/tools/gvoice.py +695 -0
  448. parrot/tools/ibisworld/README.md +225 -0
  449. parrot/tools/ibisworld/__init__.py +11 -0
  450. parrot/tools/ibisworld/tool.py +366 -0
  451. parrot/tools/jiratoolkit.py +1718 -0
  452. parrot/tools/manager.py +1098 -0
  453. parrot/tools/math.py +152 -0
  454. parrot/tools/metadata.py +476 -0
  455. parrot/tools/msteams.py +1621 -0
  456. parrot/tools/msword.py +635 -0
  457. parrot/tools/multidb.py +580 -0
  458. parrot/tools/multistoresearch.py +369 -0
  459. parrot/tools/networkninja.py +167 -0
  460. parrot/tools/nextstop/__init__.py +4 -0
  461. parrot/tools/nextstop/base.py +286 -0
  462. parrot/tools/nextstop/employee.py +733 -0
  463. parrot/tools/nextstop/store.py +462 -0
  464. parrot/tools/notification.py +435 -0
  465. parrot/tools/o365/__init__.py +42 -0
  466. parrot/tools/o365/base.py +295 -0
  467. parrot/tools/o365/bundle.py +522 -0
  468. parrot/tools/o365/events.py +554 -0
  469. parrot/tools/o365/mail.py +992 -0
  470. parrot/tools/o365/onedrive.py +497 -0
  471. parrot/tools/o365/sharepoint.py +641 -0
  472. parrot/tools/openapi_toolkit.py +904 -0
  473. parrot/tools/openweather.py +527 -0
  474. parrot/tools/pdfprint.py +1001 -0
  475. parrot/tools/powerbi.py +518 -0
  476. parrot/tools/powerpoint.py +1113 -0
  477. parrot/tools/pricestool.py +146 -0
  478. parrot/tools/products/__init__.py +246 -0
  479. parrot/tools/prophet_tool.py +171 -0
  480. parrot/tools/pythonpandas.py +630 -0
  481. parrot/tools/pythonrepl.py +910 -0
  482. parrot/tools/qsource.py +436 -0
  483. parrot/tools/querytoolkit.py +395 -0
  484. parrot/tools/quickeda.py +827 -0
  485. parrot/tools/resttool.py +553 -0
  486. parrot/tools/retail/__init__.py +0 -0
  487. parrot/tools/retail/bby.py +528 -0
  488. parrot/tools/sandboxtool.py +703 -0
  489. parrot/tools/sassie/__init__.py +352 -0
  490. parrot/tools/scraping/__init__.py +7 -0
  491. parrot/tools/scraping/docs/select.md +466 -0
  492. parrot/tools/scraping/documentation.md +1278 -0
  493. parrot/tools/scraping/driver.py +436 -0
  494. parrot/tools/scraping/models.py +576 -0
  495. parrot/tools/scraping/options.py +85 -0
  496. parrot/tools/scraping/orchestrator.py +517 -0
  497. parrot/tools/scraping/readme.md +740 -0
  498. parrot/tools/scraping/tool.py +3115 -0
  499. parrot/tools/seasonaldetection.py +642 -0
  500. parrot/tools/shell_tool/__init__.py +5 -0
  501. parrot/tools/shell_tool/actions.py +408 -0
  502. parrot/tools/shell_tool/engine.py +155 -0
  503. parrot/tools/shell_tool/models.py +322 -0
  504. parrot/tools/shell_tool/tool.py +442 -0
  505. parrot/tools/site_search.py +214 -0
  506. parrot/tools/textfile.py +418 -0
  507. parrot/tools/think.py +378 -0
  508. parrot/tools/toolkit.py +298 -0
  509. parrot/tools/webapp_tool.py +187 -0
  510. parrot/tools/whatif.py +1279 -0
  511. parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
  512. parrot/tools/workday/__init__.py +6 -0
  513. parrot/tools/workday/models.py +1389 -0
  514. parrot/tools/workday/tool.py +1293 -0
  515. parrot/tools/yfinance_tool.py +306 -0
  516. parrot/tools/zipcode.py +217 -0
  517. parrot/utils/__init__.py +2 -0
  518. parrot/utils/helpers.py +73 -0
  519. parrot/utils/parsers/__init__.py +5 -0
  520. parrot/utils/parsers/toml.c +12078 -0
  521. parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
  522. parrot/utils/parsers/toml.pyx +21 -0
  523. parrot/utils/toml.py +11 -0
  524. parrot/utils/types.cpp +20936 -0
  525. parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
  526. parrot/utils/types.pyx +213 -0
  527. parrot/utils/uv.py +11 -0
  528. parrot/version.py +10 -0
  529. parrot/yaml-rs/Cargo.lock +350 -0
  530. parrot/yaml-rs/Cargo.toml +19 -0
  531. parrot/yaml-rs/pyproject.toml +19 -0
  532. parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
  533. parrot/yaml-rs/src/lib.rs +222 -0
  534. requirements/docker-compose.yml +24 -0
  535. requirements/requirements-dev.txt +21 -0
parrot/loaders/web.py ADDED
@@ -0,0 +1,599 @@
1
+ import asyncio
2
+ import time
3
+ from typing import Union, List, Optional, Tuple, Dict, Any
4
+ from bs4 import BeautifulSoup, NavigableString
5
+ from markdownify import MarkdownConverter
6
+ from webdriver_manager.chrome import ChromeDriverManager
7
+ from webdriver_manager.firefox import GeckoDriverManager
8
+ from selenium import webdriver
9
+ from selenium.webdriver.chrome.service import Service as ChromeService
10
+ from selenium.webdriver.firefox.service import Service as FirefoxService
11
+ from selenium.webdriver.chrome.options import Options as ChromeOptions
12
+ from selenium.webdriver.firefox.options import Options as FirefoxOptions
13
+ from selenium.webdriver.common.by import By
14
+ from selenium.webdriver.support.ui import WebDriverWait
15
+ from selenium.webdriver.support import expected_conditions as EC
16
+ from navconfig.logging import logging
17
+ from .abstract import AbstractLoader
18
+ from ..stores.models import Document
19
+
20
+
21
# Cap the log level of chatty third-party loggers at WARNING.
for _noisy_logger in ('selenium.webdriver', 'WDM', 'matplotlib'):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)
del _noisy_logger  # keep the module namespace identical to the unrolled form


# User-Agent string used when the caller does not supply one.
DEFAULT_UA = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/122.0.0.0 Safari/537.36",
    )
)
31
+
32
+
33
class WebDriverPool:
    """Async pool of Selenium WebDriver instances.

    Drivers are created lazily, up to ``max_drivers``. Idle drivers wait in
    ``self.pool`` (an ``asyncio.Queue``); ``self.active_drivers`` tracks every
    live driver, whether it is currently checked out or idle in the queue.
    Blocking Selenium calls (create, clean, quit) run in the default executor
    so they do not stall the event loop.
    """

    def __init__(self, max_drivers: int = 3, browser: str = "chrome", **driver_kwargs):
        """Configure the pool; no browser is started here.

        Args:
            max_drivers: Upper bound on simultaneously live drivers.
            browser: ``"firefox"`` selects Firefox; any other value selects
                Chrome. Compared case-insensitively.
            **driver_kwargs: Optional ``user_agent`` and ``page_load_strategy``
                settings read by ``_create_driver``.
        """
        self.max_drivers = max_drivers
        self.browser = browser.lower()
        self.driver_kwargs = driver_kwargs
        self.pool = asyncio.Queue(maxsize=max_drivers)
        self.active_drivers = set()
        self.lock = asyncio.Lock()
        self.logger = logging.getLogger(self.__class__.__name__)

    async def get_driver(self) -> webdriver:
        """Get a driver from the pool or create a new one.

        Returns an idle driver if one is queued; otherwise creates a driver
        while the pool is under ``max_drivers``; otherwise waits until a
        driver is handed back through ``return_driver``.
        """
        try:
            # Fast path: reuse an idle driver without touching the lock.
            driver = self.pool.get_nowait()
            self.logger.debug("Reusing driver from pool")
            return driver
        except asyncio.QueueEmpty:
            pass

        # The lock serialises the capacity check together with the creation,
        # so concurrent callers cannot exceed ``max_drivers``.
        async with self.lock:
            if len(self.active_drivers) < self.max_drivers:
                loop = asyncio.get_running_loop()
                driver = await loop.run_in_executor(None, self._create_driver)
                self.active_drivers.add(driver)
                self.logger.debug(f"Created new driver. Active: {len(self.active_drivers)}")
                return driver

        # At capacity: wait for a returned driver *outside* the lock, so that
        # _destroy_driver() and close_all() can still acquire it meanwhile.
        self.logger.debug("Waiting for available driver")
        return await self.pool.get()

    def _create_driver(self) -> webdriver:
        """Create a new headless WebDriver instance synchronously.

        Blocking call (driver-manager install plus browser start-up); meant
        to be run in an executor.
        """
        page_load_strategy = self.driver_kwargs.get('page_load_strategy', 'normal')

        if self.browser == "firefox":
            options = FirefoxOptions()
            options.add_argument("-headless")

            # Firefox only overrides the user agent when one was supplied.
            user_agent = self.driver_kwargs.get('user_agent')
            if user_agent:
                options.set_preference("general.useragent.override", user_agent)

            # Apply the strategy on the options object; previously it was
            # written to a capabilities dict that was never passed on.
            options.page_load_strategy = page_load_strategy

            service = FirefoxService(GeckoDriverManager().install())
            return webdriver.Firefox(service=service, options=options)

        # Any browser value other than "firefox" falls back to Chrome.
        options = ChromeOptions()
        for arg in (
            "--headless=new",
            "--enable-automation",
            "--lang=en",
            "--disable-extensions",
            "--disable-gpu",
            "--no-sandbox",
            "--disable-dev-shm-usage",
        ):
            options.add_argument(arg)

        user_agent = self.driver_kwargs.get('user_agent', DEFAULT_UA)
        if user_agent:
            options.add_argument(f"user-agent={user_agent}")

        options.page_load_strategy = page_load_strategy

        service = ChromeService(ChromeDriverManager().install())
        return webdriver.Chrome(service=service, options=options)

    async def return_driver(self, driver: webdriver):
        """Return a driver to the pool after cleaning it.

        If cleaning or re-queueing raises, the driver is destroyed instead.
        """
        try:
            # Clean the driver (clear cookies, navigate to blank page, etc.)
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, self._clean_driver, driver)

            # Return to pool
            await self.pool.put(driver)
            self.logger.debug("Returned cleaned driver to pool")
        except Exception as e:
            self.logger.error(f"Error returning driver to pool: {e}")
            await self._destroy_driver(driver)

    def _clean_driver(self, driver: webdriver):
        """Reset driver state synchronously (best effort).

        Failures are logged as warnings and not re-raised.
        """
        try:
            driver.delete_all_cookies()
            driver.execute_script("window.localStorage.clear();")
            driver.execute_script("window.sessionStorage.clear();")
            driver.get("about:blank")
        except Exception as e:
            self.logger.warning(f"Error cleaning driver: {e}")

    async def _destroy_driver(self, driver: webdriver):
        """Quit a driver and remove it from the active set.

        Acquires ``self.lock``; callers must not hold it, because
        ``asyncio.Lock`` is not reentrant.
        """
        try:
            await asyncio.get_running_loop().run_in_executor(None, driver.quit)
        except Exception as e:
            self.logger.error(f"Error quitting driver: {e}")
        finally:
            async with self.lock:
                self.active_drivers.discard(driver)

    async def close_all(self):
        """Quit every driver owned by the pool, idle or checked out."""
        # Snapshot and reset the bookkeeping under the lock, then release it
        # before quitting: _destroy_driver() takes the same lock, so calling
        # it while holding the lock would deadlock.
        async with self.lock:
            doomed = set(self.active_drivers)
            while True:
                try:
                    doomed.add(self.pool.get_nowait())
                except asyncio.QueueEmpty:
                    break
            self.active_drivers.clear()

        if doomed:
            # The set ensures each driver is quit exactly once.
            await asyncio.gather(
                *(self._destroy_driver(driver) for driver in doomed),
                return_exceptions=True,
            )

        self.logger.info("Closed all WebDriver instances")
167
+
168
+
169
+ class WebLoader(AbstractLoader):
170
+ """Load web pages and extract HTML + Markdown + structured bits (videos/nav/tables)."""
171
+
172
def __init__(
    self,
    source_type: str = 'website',
    *,
    browser: str = "chrome",
    timeout: int = 60,
    page_load_strategy: str = "normal",
    user_agent: Optional[str] = DEFAULT_UA,
    max_drivers: int = 3,
    driver_pool: Optional[WebDriverPool] = None,
    **kwargs
):
    """Set up the loader and its WebDriver pool.

    Args:
        source_type: Forwarded to the parent loader.
        browser: Browser name; stored lower-cased.
        timeout: Stored on the instance as ``self.timeout``.
        page_load_strategy: Page-load strategy handed to the pool.
        user_agent: User-Agent string handed to the pool.
        max_drivers: Size limit for a pool created by this loader.
        driver_pool: Existing pool to share. When omitted, the loader
            builds its own pool and marks it as owned.
        **kwargs: Forwarded to the parent loader.
    """
    super().__init__(source_type=source_type, **kwargs)

    # Browser / session settings.
    self.browser = browser.lower()
    self.timeout = timeout
    self.user_agent = user_agent
    self.page_load_strategy = page_load_strategy
    self.max_drivers = max_drivers

    # No driver is assigned at construction time.
    self.driver = None

    # The pool is "owned" only when this loader had to build it itself.
    self._own_pool = not driver_pool
    if self._own_pool:
        driver_pool = WebDriverPool(
            max_drivers=max_drivers,
            browser=browser,
            page_load_strategy=page_load_strategy,
            user_agent=user_agent
        )
    self.driver_pool = driver_pool
206
+
207
async def open(self):
    """Initialize resources - called by AbstractLoader's __aenter__.

    Only logs a debug message; the driver pool needs no extra setup.
    """
    self.logger.debug("Opening WebLoader")
212
+
213
async def close(self):
    """Clean up resources - called by AbstractLoader's __aexit__.

    Only a pool that this loader created itself is shut down; a pool that
    was handed in by the caller is left untouched.
    """
    self.logger.debug("Closing WebLoader")
    if not self._own_pool:
        return
    pool = self.driver_pool
    if pool:
        await pool.close_all()
218
+
219
def md(self, soup: BeautifulSoup, **options) -> str:
    """Render a parsed document as Markdown.

    Args:
        soup: The parsed document to convert.
        **options: Passed unchanged to the ``MarkdownConverter`` constructor.

    Returns:
        The Markdown produced by the converter's ``convert_soup``.
    """
    converter = MarkdownConverter(**options)
    return converter.convert_soup(soup)
222
+
223
+ def _text(self, node: Any) -> str:
224
+ """Extract text content from a node."""
225
+ if node is None:
226
+ return ""
227
+ if isinstance(node, NavigableString):
228
+ return str(node).strip()
229
+ return node.get_text(" ", strip=True)
230
+
231
+ def _collect_video_links(self, soup: BeautifulSoup) -> List[str]:
232
+ """Extract video links from the page."""
233
+ items: List[str] = []
234
+
235
+ # iframes (YouTube/Vimeo/etc.)
236
+ for iframe in soup.find_all("iframe"):
237
+ src = iframe.get("src")
238
+ if not src:
239
+ continue
240
+ items.append(f"Video Link: {src}")
241
+
242
+ # <video> and <source>
243
+ for video in soup.find_all("video"):
244
+ src = video.get("src")
245
+ if src:
246
+ items.append(f"Video Link: {src}")
247
+ for source in video.find_all("source"):
248
+ s = source.get("src")
249
+ if s:
250
+ items.append(f"Video Source: {s}")
251
+
252
+ # Deduplicate while preserving order
253
+ seen = set()
254
+ result = []
255
+ for x in items:
256
+ if x not in seen:
257
+ result.append(x)
258
+ seen.add(x)
259
+ return result
260
+
261
def _collect_navbars(self, soup: BeautifulSoup) -> List[str]:
    """Extract navigation menus as Markdown bullet lists.

    Every ``<nav>`` element is rendered first. Only when none of them
    yields any text are the common menu containers (``role='navigation'``,
    ``.navbar``, ``.menu``, ``.nav``) tried instead.

    Returns:
        One ``"Navigation:\\n..."`` string per region that produced output.
    """

    def bullet(label: str, target: str, indent: str = "") -> str:
        # An anchor whose href is empty is rendered as plain text.
        if target:
            return f"{indent}- {label} (Link: {target})"
        return f"{indent}- {label}"

    def item_line(item, indent: str = ""):
        # Prefer the first link inside the item; otherwise use its text.
        anchor = item.find("a", href=True)
        if anchor:
            return bullet(self._text(anchor), anchor.get("href", ""), indent)
        label = self._text(item)
        return f"{indent}- {label}" if label else None

    def render(region) -> str:
        rendered = []
        lists = region.find_all(["ul", "ol"], recursive=True)

        if not lists:
            # Fallback: collect links directly under the region.
            for anchor in region.find_all("a", href=True):
                label = self._text(anchor)
                target = anchor.get("href", "")
                if label or target:
                    rendered.append(bullet(label, target))
            return "\n".join(rendered)

        for listing in lists:
            for item in listing.find_all("li", recursive=False):
                top_level = item_line(item)
                if top_level is not None:
                    rendered.append(top_level)

                # One level of nested lists directly under this item.
                for sub_list in item.find_all(["ul", "ol"], recursive=False):
                    for sub_item in sub_list.find_all("li", recursive=False):
                        nested = item_line(sub_item, " ")
                        if nested is not None:
                            rendered.append(nested)
        return "\n".join(rendered)

    def collect(regions) -> List[str]:
        sections: List[str] = []
        for region in regions:
            listing = render(region)
            if listing.strip():
                sections.append("Navigation:\n" + listing)
        return sections

    nav_texts = collect(soup.find_all("nav"))
    if not nav_texts:
        # Common menu containers if no <nav> produced anything.
        nav_texts = collect(soup.select("[role='navigation'], .navbar, .menu, .nav"))
    return nav_texts
317
+
318
def _table_to_markdown(self, table) -> str:
    """Convert a <table> to GitHub-flavored Markdown.

    Headers come from the ``<th>`` cells of ``<thead>``; if there are none,
    the cells of the first ``<tr>`` are used, and that row is then excluded
    from the data rows so it is not emitted twice. If there are still no
    headers, generic ``Col N`` names are generated from the first data row.
    Data rows are padded or truncated to the header width.

    Args:
        table: The table element to convert.

    Returns:
        The Markdown text (optionally preceded by a ``Table: <caption>``
        line), or an empty string when the table has no usable content.
    """
    # Caption
    caption_el = table.find("caption")
    caption = self._text(caption_el) if caption_el else ""

    # Headers
    headers = []
    header_row = None  # the <tr> consumed as header, if any
    thead = table.find("thead")
    if thead:
        headers = [self._text(th) for th in thead.find_all("th")]

    # If no thead headers, try the first row as headers
    if not headers:
        first_row = table.find("tr")
        if first_row:
            headers = [self._text(c) for c in first_row.find_all(["th", "td"])]
            if headers:
                header_row = first_row

    # Rows
    rows = []
    for tr in table.find_all("tr"):
        # Identity check: the row already used as header must not be
        # repeated as the first data row.
        if tr is header_row:
            continue
        cells = tr.find_all(["td"])
        if not cells:
            continue
        rows.append([self._text(td) for td in cells])

    if not headers and rows:
        headers = [f"Col {i+1}" for i in range(len(rows[0]))]

    # Normalize column count
    ncol = len(headers)
    norm_rows = []
    for r in rows:
        if len(r) < ncol:
            r = r + [""] * (ncol - len(r))
        elif len(r) > ncol:
            r = r[:ncol]
        norm_rows.append(r)

    def esc(cell: str) -> str:
        # A literal pipe would end the cell and a line break would end the
        # row, so both are neutralised.
        flat = (cell or "").replace("\r", " ").replace("\n", " ")
        return flat.replace("|", "\\|").strip()

    md = []
    if caption:
        md.append(f"Table: {caption}\n")
    if headers:
        md.append("| " + " | ".join(esc(h) for h in headers) + " |")
        md.append("| " + " | ".join("---" for _ in headers) + " |")
    for r in norm_rows:
        md.append("| " + " | ".join(esc(c) for c in r) + " |")
    return "\n".join(md).strip()
372
+
373
+ def _collect_tables(self, soup: BeautifulSoup, max_tables: int = 25) -> List[str]:
374
+ """Extract tables as Markdown."""
375
+ out = []
376
+ for i, table in enumerate(soup.find_all("table")):
377
+ if i >= max_tables:
378
+ break
379
+ try:
380
+ out.append(self._table_to_markdown(table))
381
+ except Exception:
382
+ continue
383
+ return out
384
+
385
def _fetch_page_sync(self, driver: webdriver, url: str, args: dict) -> str:
    """Load *url* in *driver* (blocking) and return the page source.

    Recognised keys in *args*:
        ``locator``: element locator to wait for after navigation
            (defaults to the ``<body>`` tag).
        ``accept_cookies``: locator of a button to click once it becomes
            clickable; failures to find or click it are ignored.
        ``sleep_after``: seconds to sleep before reading the page source.

    Raises:
        Exception: whatever navigation or the initial wait raised, after
            it has been logged.
    """
    ready_locator = args.get('locator', (By.TAG_NAME, 'body'))
    waiter = WebDriverWait(driver, self.timeout)
    cookie_locator = args.get('accept_cookies', False)
    pause = args.get('sleep_after', 0)

    try:
        driver.get(url)
        waiter.until(EC.presence_of_element_located(ready_locator))
    except Exception as exc:
        self.logger.error(f"Failed to load {url}: {exc}")
        raise

    if cookie_locator:
        # Best effort: a missing or unclickable banner is not an error.
        try:
            waiter.until(EC.element_to_be_clickable(cookie_locator)).click()
        except Exception:
            pass

    if pause:
        time.sleep(float(pause))

    return driver.page_source
411
+
412
def clean_html(
    self,
    html: str,
    tags: List[str],
    objects: Optional[List[Dict[str, Dict[str, Any]]]] = None,
    *,
    parse_videos: bool = True,
    parse_navs: bool = True,
    parse_tables: bool = True
) -> Tuple[List[str], str, str]:
    """Clean and extract content from HTML.

    Args:
        html: Raw HTML of the page.
        tags: Tag names whose text is collected as individual fragments.
        objects: Optional custom extraction rules. Each entry is a
            single-item mapping ``{tag_name: attrs}``. When ``attrs``
            contains ``parse_list``, the first matching container is
            rendered as categorised lists; otherwise the links, lists and
            text of every matching element are collected. The rule
            dictionaries are read but never modified, so the same
            configuration can be reused for several pages.
        parse_videos: Also collect video links.
        parse_navs: Also collect navigation menus.
        parse_tables: Also collect tables.

    Returns:
        A tuple ``(content, md_text, page_title)``: the list of text
        fragments, the Markdown of the whole page, and the page title
        (empty string when none was found).
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Remove script/style/link early
    for el in soup(["script", "style", "link", "noscript"]):
        el.decompose()

    # Title: <title> first, then the og:title meta tag
    page_title = ""
    try:
        if soup.title and soup.title.string:
            page_title = soup.title.string.strip()
        if not page_title:
            og = soup.find("meta", property="og:title")
            if og and og.get("content"):
                page_title = og["content"].strip()
    except Exception:
        page_title = ""

    # Full-page Markdown (taken before custom objects rewrite any links)
    md_text = self.md(soup)

    content: List[str] = []

    # Paragraphs/headers/sections
    for p in soup.find_all(tags):
        text = ' '.join(p.get_text(" ", strip=True).split())
        if text:
            content.append(text)

    # Videos
    if parse_videos:
        content.extend(self._collect_video_links(soup))

    # Navbars
    if parse_navs:
        content.extend(self._collect_navbars(soup))

    # Tables
    if parse_tables:
        content.extend(self._collect_tables(soup))

    # Custom objects
    for obj in objects or []:
        (tag_name, rule), = obj.items()
        # Work on a copy so the caller's configuration is left intact.
        attrs = dict(rule or {})

        if 'parse_list' in attrs:
            parse_list = attrs.pop('parse_list') or {}
            container = soup.find(tag_name, attrs=attrs)
            if not container:
                continue
            name_type = parse_list.get('type', 'List')
            params = parse_list.get('find', [])
            child_name = params[0] if params else 'ul'
            child_attrs = params[1] if len(params) > 1 else {}

            pieces = []
            for block in container.find_all(child_name, attrs=child_attrs):
                title_el = block.find('span', class_='title')
                title = title_el.get_text(strip=True) if title_el else ''
                lists = block.find_all('ul')
                if not lists:
                    continue
                if title:
                    pieces.append(f"\nCategory: {title}\n{name_type}:\n")
                for ul in lists:
                    items = [f"- {li.get_text(strip=True)}" for li in ul.select('li')]
                    pieces.append('\n'.join(items))
                    pieces.append("\n")

            structured_text = ''.join(pieces).strip()
            if structured_text:
                content.append(structured_text)
        else:
            for block in soup.find_all(tag_name, attrs=attrs):
                # Inline each link target next to its text.
                for link in block.find_all('a'):
                    link_text = link.get_text(strip=True)
                    href = link.get('href', '')
                    formatted = f"{link_text} (Link: {href})" if href else link_text
                    link.replace_with(formatted)

                for ul in block.find_all('ul'):
                    items = [li.get_text(strip=True) for li in ul.select('li')]
                    if items:
                        content.append('\n'.join(items))

                cleaned_text = ' '.join(block.get_text().split())
                if cleaned_text:
                    content.append(cleaned_text)

    return (content, md_text, page_title)
511
+
512
+ def _normalize_url_args(self, address, kwargs):
513
+ """Normalize URL and arguments from different input formats."""
514
+ if isinstance(address, str):
515
+ url = address
516
+ args = dict(kwargs) if kwargs else {}
517
+ return url, args
518
+
519
+ if isinstance(address, dict):
520
+ (url, args), = address.items()
521
+ args = dict(args or {})
522
+ if kwargs:
523
+ args.update(kwargs)
524
+ return url, args
525
+
526
+ raise TypeError(f"Unsupported address type for WebLoader: {type(address)}")
527
+
528
async def _load(self, address: Union[str, dict], **kwargs) -> List[Document]:
    """Load a single web page and turn it into documents.

    A driver is borrowed from the pool, the page is fetched in the default
    executor (the fetch itself is blocking), and the HTML is cleaned. The
    driver is always handed back to the pool, even when loading fails.

    Args:
        address: A URL string, or a single-item mapping ``{url: args}``.
        **kwargs: Extra per-call arguments merged into the page arguments.
            Keys read here: ``tags``, ``objects``, ``parse_videos``,
            ``parse_navs``, ``parse_tables`` and ``source_type``.

    Returns:
        One document holding the full-page Markdown (when non-empty),
        followed by one document per extracted text fragment.

    Raises:
        Exception: any error raised while fetching or processing the page,
            after it has been logged.
    """
    url, args = self._normalize_url_args(address, kwargs)
    self.logger.info(f'Loading URL: {url}')

    # Get driver from pool
    driver = await self.driver_pool.get_driver()

    try:
        # Fetch page content in executor; inside a coroutine the running
        # loop is the one to use.
        loop = asyncio.get_running_loop()
        html_content = await loop.run_in_executor(
            None, self._fetch_page_sync, driver, url, args
        )

        # Process content
        extract_tags = args.get('tags', ['p', 'title', 'h1', 'h2', 'section', 'article'])
        objects = args.get('objects', [])
        parse_videos = args.get('parse_videos', True)
        parse_navs = args.get('parse_navs', True)
        parse_tables = args.get('parse_tables', True)
        source_type = args.get('source_type', self._source_type)

        content, md_text, page_title = self.clean_html(
            html_content,
            extract_tags,
            objects,
            parse_videos=parse_videos,
            parse_navs=parse_navs,
            parse_tables=parse_tables
        )

        # Fall back to the URL when the page has no usable title.
        if not page_title:
            page_title = url

        metadata = {
            "source": url,
            "url": url,
            "filename": page_title,
            "source_type": source_type,
            "type": "webpage",
            "document_meta": {
                "language": "en",
                "title": page_title,
            },
        }

        docs: List[Document] = []
        if md_text:
            docs.append(
                Document(
                    page_content=md_text,
                    metadata={**metadata, "content_kind": "markdown_full"}
                )
            )

        docs.extend(
            Document(
                page_content=chunk,
                metadata={**metadata, "content_kind": "fragment"}
            )
            for chunk in content
            if chunk and isinstance(chunk, str)
        )

        return docs

    except Exception as exc:
        self.logger.error(f"Failed to load {url}: {exc}")
        raise
    finally:
        # Return driver to pool
        await self.driver_pool.return_driver(driver)