ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (535)
  1. agentui/.prettierrc +15 -0
  2. agentui/QUICKSTART.md +272 -0
  3. agentui/README.md +59 -0
  4. agentui/env.example +16 -0
  5. agentui/jsconfig.json +14 -0
  6. agentui/package-lock.json +4242 -0
  7. agentui/package.json +34 -0
  8. agentui/scripts/postinstall/apply-patches.mjs +260 -0
  9. agentui/src/app.css +61 -0
  10. agentui/src/app.d.ts +13 -0
  11. agentui/src/app.html +12 -0
  12. agentui/src/components/LoadingSpinner.svelte +64 -0
  13. agentui/src/components/ThemeSwitcher.svelte +159 -0
  14. agentui/src/components/index.js +4 -0
  15. agentui/src/lib/api/bots.ts +60 -0
  16. agentui/src/lib/api/chat.ts +22 -0
  17. agentui/src/lib/api/http.ts +25 -0
  18. agentui/src/lib/components/BotCard.svelte +33 -0
  19. agentui/src/lib/components/ChatBubble.svelte +63 -0
  20. agentui/src/lib/components/Toast.svelte +21 -0
  21. agentui/src/lib/config.ts +20 -0
  22. agentui/src/lib/stores/auth.svelte.ts +73 -0
  23. agentui/src/lib/stores/theme.svelte.js +64 -0
  24. agentui/src/lib/stores/toast.svelte.ts +31 -0
  25. agentui/src/lib/utils/conversation.ts +39 -0
  26. agentui/src/routes/+layout.svelte +20 -0
  27. agentui/src/routes/+page.svelte +232 -0
  28. agentui/src/routes/login/+page.svelte +200 -0
  29. agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
  30. agentui/src/routes/talk/[agentId]/+page.ts +7 -0
  31. agentui/static/README.md +1 -0
  32. agentui/svelte.config.js +11 -0
  33. agentui/tailwind.config.ts +53 -0
  34. agentui/tsconfig.json +3 -0
  35. agentui/vite.config.ts +10 -0
  36. ai_parrot-0.17.2.dist-info/METADATA +472 -0
  37. ai_parrot-0.17.2.dist-info/RECORD +535 -0
  38. ai_parrot-0.17.2.dist-info/WHEEL +6 -0
  39. ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
  40. ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
  41. ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
  42. crew-builder/.prettierrc +15 -0
  43. crew-builder/QUICKSTART.md +259 -0
  44. crew-builder/README.md +113 -0
  45. crew-builder/env.example +17 -0
  46. crew-builder/jsconfig.json +14 -0
  47. crew-builder/package-lock.json +4182 -0
  48. crew-builder/package.json +37 -0
  49. crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
  50. crew-builder/src/app.css +62 -0
  51. crew-builder/src/app.d.ts +13 -0
  52. crew-builder/src/app.html +12 -0
  53. crew-builder/src/components/LoadingSpinner.svelte +64 -0
  54. crew-builder/src/components/ThemeSwitcher.svelte +149 -0
  55. crew-builder/src/components/index.js +9 -0
  56. crew-builder/src/lib/api/bots.ts +60 -0
  57. crew-builder/src/lib/api/chat.ts +80 -0
  58. crew-builder/src/lib/api/client.ts +56 -0
  59. crew-builder/src/lib/api/crew/crew.ts +136 -0
  60. crew-builder/src/lib/api/index.ts +5 -0
  61. crew-builder/src/lib/api/o365/auth.ts +65 -0
  62. crew-builder/src/lib/auth/auth.ts +54 -0
  63. crew-builder/src/lib/components/AgentNode.svelte +43 -0
  64. crew-builder/src/lib/components/BotCard.svelte +33 -0
  65. crew-builder/src/lib/components/ChatBubble.svelte +67 -0
  66. crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
  67. crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
  68. crew-builder/src/lib/components/JsonViewer.svelte +24 -0
  69. crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
  70. crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
  71. crew-builder/src/lib/components/Toast.svelte +67 -0
  72. crew-builder/src/lib/components/Toolbar.svelte +157 -0
  73. crew-builder/src/lib/components/index.ts +10 -0
  74. crew-builder/src/lib/config.ts +8 -0
  75. crew-builder/src/lib/stores/auth.svelte.ts +228 -0
  76. crew-builder/src/lib/stores/crewStore.ts +369 -0
  77. crew-builder/src/lib/stores/theme.svelte.js +145 -0
  78. crew-builder/src/lib/stores/toast.svelte.ts +69 -0
  79. crew-builder/src/lib/utils/conversation.ts +39 -0
  80. crew-builder/src/lib/utils/markdown.ts +122 -0
  81. crew-builder/src/lib/utils/talkHistory.ts +47 -0
  82. crew-builder/src/routes/+layout.svelte +20 -0
  83. crew-builder/src/routes/+page.svelte +539 -0
  84. crew-builder/src/routes/agents/+page.svelte +247 -0
  85. crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
  86. crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
  87. crew-builder/src/routes/builder/+page.svelte +204 -0
  88. crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
  89. crew-builder/src/routes/crew/ask/+page.ts +1 -0
  90. crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
  91. crew-builder/src/routes/login/+page.svelte +197 -0
  92. crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
  93. crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
  94. crew-builder/static/README.md +1 -0
  95. crew-builder/svelte.config.js +11 -0
  96. crew-builder/tailwind.config.ts +53 -0
  97. crew-builder/tsconfig.json +3 -0
  98. crew-builder/vite.config.ts +10 -0
  99. mcp_servers/calculator_server.py +309 -0
  100. parrot/__init__.py +27 -0
  101. parrot/__pycache__/__init__.cpython-310.pyc +0 -0
  102. parrot/__pycache__/version.cpython-310.pyc +0 -0
  103. parrot/_version.py +34 -0
  104. parrot/a2a/__init__.py +48 -0
  105. parrot/a2a/client.py +658 -0
  106. parrot/a2a/discovery.py +89 -0
  107. parrot/a2a/mixin.py +257 -0
  108. parrot/a2a/models.py +376 -0
  109. parrot/a2a/server.py +770 -0
  110. parrot/agents/__init__.py +29 -0
  111. parrot/bots/__init__.py +12 -0
  112. parrot/bots/a2a_agent.py +19 -0
  113. parrot/bots/abstract.py +3139 -0
  114. parrot/bots/agent.py +1129 -0
  115. parrot/bots/basic.py +9 -0
  116. parrot/bots/chatbot.py +669 -0
  117. parrot/bots/data.py +1618 -0
  118. parrot/bots/database/__init__.py +5 -0
  119. parrot/bots/database/abstract.py +3071 -0
  120. parrot/bots/database/cache.py +286 -0
  121. parrot/bots/database/models.py +468 -0
  122. parrot/bots/database/prompts.py +154 -0
  123. parrot/bots/database/retries.py +98 -0
  124. parrot/bots/database/router.py +269 -0
  125. parrot/bots/database/sql.py +41 -0
  126. parrot/bots/db/__init__.py +6 -0
  127. parrot/bots/db/abstract.py +556 -0
  128. parrot/bots/db/bigquery.py +602 -0
  129. parrot/bots/db/cache.py +85 -0
  130. parrot/bots/db/documentdb.py +668 -0
  131. parrot/bots/db/elastic.py +1014 -0
  132. parrot/bots/db/influx.py +898 -0
  133. parrot/bots/db/mock.py +96 -0
  134. parrot/bots/db/multi.py +783 -0
  135. parrot/bots/db/prompts.py +185 -0
  136. parrot/bots/db/sql.py +1255 -0
  137. parrot/bots/db/tools.py +212 -0
  138. parrot/bots/document.py +680 -0
  139. parrot/bots/hrbot.py +15 -0
  140. parrot/bots/kb.py +170 -0
  141. parrot/bots/mcp.py +36 -0
  142. parrot/bots/orchestration/README.md +463 -0
  143. parrot/bots/orchestration/__init__.py +1 -0
  144. parrot/bots/orchestration/agent.py +155 -0
  145. parrot/bots/orchestration/crew.py +3330 -0
  146. parrot/bots/orchestration/fsm.py +1179 -0
  147. parrot/bots/orchestration/hr.py +434 -0
  148. parrot/bots/orchestration/storage/__init__.py +4 -0
  149. parrot/bots/orchestration/storage/memory.py +100 -0
  150. parrot/bots/orchestration/storage/mixin.py +119 -0
  151. parrot/bots/orchestration/verify.py +202 -0
  152. parrot/bots/product.py +204 -0
  153. parrot/bots/prompts/__init__.py +96 -0
  154. parrot/bots/prompts/agents.py +155 -0
  155. parrot/bots/prompts/data.py +216 -0
  156. parrot/bots/prompts/output_generation.py +8 -0
  157. parrot/bots/scraper/__init__.py +3 -0
  158. parrot/bots/scraper/models.py +122 -0
  159. parrot/bots/scraper/scraper.py +1173 -0
  160. parrot/bots/scraper/templates.py +115 -0
  161. parrot/bots/stores/__init__.py +5 -0
  162. parrot/bots/stores/local.py +172 -0
  163. parrot/bots/webdev.py +81 -0
  164. parrot/cli.py +17 -0
  165. parrot/clients/__init__.py +16 -0
  166. parrot/clients/base.py +1491 -0
  167. parrot/clients/claude.py +1191 -0
  168. parrot/clients/factory.py +129 -0
  169. parrot/clients/google.py +4567 -0
  170. parrot/clients/gpt.py +1975 -0
  171. parrot/clients/grok.py +432 -0
  172. parrot/clients/groq.py +986 -0
  173. parrot/clients/hf.py +582 -0
  174. parrot/clients/models.py +18 -0
  175. parrot/conf.py +395 -0
  176. parrot/embeddings/__init__.py +9 -0
  177. parrot/embeddings/base.py +157 -0
  178. parrot/embeddings/google.py +98 -0
  179. parrot/embeddings/huggingface.py +74 -0
  180. parrot/embeddings/openai.py +84 -0
  181. parrot/embeddings/processor.py +88 -0
  182. parrot/exceptions.c +13868 -0
  183. parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
  184. parrot/exceptions.pxd +22 -0
  185. parrot/exceptions.pxi +15 -0
  186. parrot/exceptions.pyx +44 -0
  187. parrot/generators/__init__.py +29 -0
  188. parrot/generators/base.py +200 -0
  189. parrot/generators/html.py +293 -0
  190. parrot/generators/react.py +205 -0
  191. parrot/generators/streamlit.py +203 -0
  192. parrot/generators/template.py +105 -0
  193. parrot/handlers/__init__.py +4 -0
  194. parrot/handlers/agent.py +861 -0
  195. parrot/handlers/agents/__init__.py +1 -0
  196. parrot/handlers/agents/abstract.py +900 -0
  197. parrot/handlers/bots.py +338 -0
  198. parrot/handlers/chat.py +915 -0
  199. parrot/handlers/creation.sql +192 -0
  200. parrot/handlers/crew/ARCHITECTURE.md +362 -0
  201. parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
  202. parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
  203. parrot/handlers/crew/__init__.py +0 -0
  204. parrot/handlers/crew/handler.py +801 -0
  205. parrot/handlers/crew/models.py +229 -0
  206. parrot/handlers/crew/redis_persistence.py +523 -0
  207. parrot/handlers/jobs/__init__.py +10 -0
  208. parrot/handlers/jobs/job.py +384 -0
  209. parrot/handlers/jobs/mixin.py +627 -0
  210. parrot/handlers/jobs/models.py +115 -0
  211. parrot/handlers/jobs/worker.py +31 -0
  212. parrot/handlers/models.py +596 -0
  213. parrot/handlers/o365_auth.py +105 -0
  214. parrot/handlers/stream.py +337 -0
  215. parrot/interfaces/__init__.py +6 -0
  216. parrot/interfaces/aws.py +143 -0
  217. parrot/interfaces/credentials.py +113 -0
  218. parrot/interfaces/database.py +27 -0
  219. parrot/interfaces/google.py +1123 -0
  220. parrot/interfaces/hierarchy.py +1227 -0
  221. parrot/interfaces/http.py +651 -0
  222. parrot/interfaces/images/__init__.py +0 -0
  223. parrot/interfaces/images/plugins/__init__.py +24 -0
  224. parrot/interfaces/images/plugins/abstract.py +58 -0
  225. parrot/interfaces/images/plugins/analisys.py +148 -0
  226. parrot/interfaces/images/plugins/classify.py +150 -0
  227. parrot/interfaces/images/plugins/classifybase.py +182 -0
  228. parrot/interfaces/images/plugins/detect.py +150 -0
  229. parrot/interfaces/images/plugins/exif.py +1103 -0
  230. parrot/interfaces/images/plugins/hash.py +52 -0
  231. parrot/interfaces/images/plugins/vision.py +104 -0
  232. parrot/interfaces/images/plugins/yolo.py +66 -0
  233. parrot/interfaces/images/plugins/zerodetect.py +197 -0
  234. parrot/interfaces/o365.py +978 -0
  235. parrot/interfaces/onedrive.py +822 -0
  236. parrot/interfaces/sharepoint.py +1435 -0
  237. parrot/interfaces/soap.py +257 -0
  238. parrot/loaders/__init__.py +8 -0
  239. parrot/loaders/abstract.py +1131 -0
  240. parrot/loaders/audio.py +199 -0
  241. parrot/loaders/basepdf.py +53 -0
  242. parrot/loaders/basevideo.py +1568 -0
  243. parrot/loaders/csv.py +409 -0
  244. parrot/loaders/docx.py +116 -0
  245. parrot/loaders/epubloader.py +316 -0
  246. parrot/loaders/excel.py +199 -0
  247. parrot/loaders/factory.py +55 -0
  248. parrot/loaders/files/__init__.py +0 -0
  249. parrot/loaders/files/abstract.py +39 -0
  250. parrot/loaders/files/html.py +26 -0
  251. parrot/loaders/files/text.py +63 -0
  252. parrot/loaders/html.py +152 -0
  253. parrot/loaders/markdown.py +442 -0
  254. parrot/loaders/pdf.py +373 -0
  255. parrot/loaders/pdfmark.py +320 -0
  256. parrot/loaders/pdftables.py +506 -0
  257. parrot/loaders/ppt.py +476 -0
  258. parrot/loaders/qa.py +63 -0
  259. parrot/loaders/splitters/__init__.py +10 -0
  260. parrot/loaders/splitters/base.py +138 -0
  261. parrot/loaders/splitters/md.py +228 -0
  262. parrot/loaders/splitters/token.py +143 -0
  263. parrot/loaders/txt.py +26 -0
  264. parrot/loaders/video.py +89 -0
  265. parrot/loaders/videolocal.py +218 -0
  266. parrot/loaders/videounderstanding.py +377 -0
  267. parrot/loaders/vimeo.py +167 -0
  268. parrot/loaders/web.py +599 -0
  269. parrot/loaders/youtube.py +504 -0
  270. parrot/manager/__init__.py +5 -0
  271. parrot/manager/manager.py +1030 -0
  272. parrot/mcp/__init__.py +28 -0
  273. parrot/mcp/adapter.py +105 -0
  274. parrot/mcp/cli.py +174 -0
  275. parrot/mcp/client.py +119 -0
  276. parrot/mcp/config.py +75 -0
  277. parrot/mcp/integration.py +842 -0
  278. parrot/mcp/oauth.py +933 -0
  279. parrot/mcp/server.py +225 -0
  280. parrot/mcp/transports/__init__.py +3 -0
  281. parrot/mcp/transports/base.py +279 -0
  282. parrot/mcp/transports/grpc_session.py +163 -0
  283. parrot/mcp/transports/http.py +312 -0
  284. parrot/mcp/transports/mcp.proto +108 -0
  285. parrot/mcp/transports/quic.py +1082 -0
  286. parrot/mcp/transports/sse.py +330 -0
  287. parrot/mcp/transports/stdio.py +309 -0
  288. parrot/mcp/transports/unix.py +395 -0
  289. parrot/mcp/transports/websocket.py +547 -0
  290. parrot/memory/__init__.py +16 -0
  291. parrot/memory/abstract.py +209 -0
  292. parrot/memory/agent.py +32 -0
  293. parrot/memory/cache.py +175 -0
  294. parrot/memory/core.py +555 -0
  295. parrot/memory/file.py +153 -0
  296. parrot/memory/mem.py +131 -0
  297. parrot/memory/redis.py +613 -0
  298. parrot/models/__init__.py +46 -0
  299. parrot/models/basic.py +118 -0
  300. parrot/models/compliance.py +208 -0
  301. parrot/models/crew.py +395 -0
  302. parrot/models/detections.py +654 -0
  303. parrot/models/generation.py +85 -0
  304. parrot/models/google.py +223 -0
  305. parrot/models/groq.py +23 -0
  306. parrot/models/openai.py +30 -0
  307. parrot/models/outputs.py +285 -0
  308. parrot/models/responses.py +938 -0
  309. parrot/notifications/__init__.py +743 -0
  310. parrot/openapi/__init__.py +3 -0
  311. parrot/openapi/components.yaml +641 -0
  312. parrot/openapi/config.py +322 -0
  313. parrot/outputs/__init__.py +32 -0
  314. parrot/outputs/formats/__init__.py +108 -0
  315. parrot/outputs/formats/altair.py +359 -0
  316. parrot/outputs/formats/application.py +122 -0
  317. parrot/outputs/formats/base.py +351 -0
  318. parrot/outputs/formats/bokeh.py +356 -0
  319. parrot/outputs/formats/card.py +424 -0
  320. parrot/outputs/formats/chart.py +436 -0
  321. parrot/outputs/formats/d3.py +255 -0
  322. parrot/outputs/formats/echarts.py +310 -0
  323. parrot/outputs/formats/generators/__init__.py +0 -0
  324. parrot/outputs/formats/generators/abstract.py +61 -0
  325. parrot/outputs/formats/generators/panel.py +145 -0
  326. parrot/outputs/formats/generators/streamlit.py +86 -0
  327. parrot/outputs/formats/generators/terminal.py +63 -0
  328. parrot/outputs/formats/holoviews.py +310 -0
  329. parrot/outputs/formats/html.py +147 -0
  330. parrot/outputs/formats/jinja2.py +46 -0
  331. parrot/outputs/formats/json.py +87 -0
  332. parrot/outputs/formats/map.py +933 -0
  333. parrot/outputs/formats/markdown.py +172 -0
  334. parrot/outputs/formats/matplotlib.py +237 -0
  335. parrot/outputs/formats/mixins/__init__.py +0 -0
  336. parrot/outputs/formats/mixins/emaps.py +855 -0
  337. parrot/outputs/formats/plotly.py +341 -0
  338. parrot/outputs/formats/seaborn.py +310 -0
  339. parrot/outputs/formats/table.py +397 -0
  340. parrot/outputs/formats/template_report.py +138 -0
  341. parrot/outputs/formats/yaml.py +125 -0
  342. parrot/outputs/formatter.py +152 -0
  343. parrot/outputs/templates/__init__.py +95 -0
  344. parrot/pipelines/__init__.py +0 -0
  345. parrot/pipelines/abstract.py +210 -0
  346. parrot/pipelines/detector.py +124 -0
  347. parrot/pipelines/models.py +90 -0
  348. parrot/pipelines/planogram.py +3002 -0
  349. parrot/pipelines/table.sql +97 -0
  350. parrot/plugins/__init__.py +106 -0
  351. parrot/plugins/importer.py +80 -0
  352. parrot/py.typed +0 -0
  353. parrot/registry/__init__.py +18 -0
  354. parrot/registry/registry.py +594 -0
  355. parrot/scheduler/__init__.py +1189 -0
  356. parrot/scheduler/models.py +60 -0
  357. parrot/security/__init__.py +16 -0
  358. parrot/security/prompt_injection.py +268 -0
  359. parrot/security/security_events.sql +25 -0
  360. parrot/services/__init__.py +1 -0
  361. parrot/services/mcp/__init__.py +8 -0
  362. parrot/services/mcp/config.py +13 -0
  363. parrot/services/mcp/server.py +295 -0
  364. parrot/services/o365_remote_auth.py +235 -0
  365. parrot/stores/__init__.py +7 -0
  366. parrot/stores/abstract.py +352 -0
  367. parrot/stores/arango.py +1090 -0
  368. parrot/stores/bigquery.py +1377 -0
  369. parrot/stores/cache.py +106 -0
  370. parrot/stores/empty.py +10 -0
  371. parrot/stores/faiss_store.py +1157 -0
  372. parrot/stores/kb/__init__.py +9 -0
  373. parrot/stores/kb/abstract.py +68 -0
  374. parrot/stores/kb/cache.py +165 -0
  375. parrot/stores/kb/doc.py +325 -0
  376. parrot/stores/kb/hierarchy.py +346 -0
  377. parrot/stores/kb/local.py +457 -0
  378. parrot/stores/kb/prompt.py +28 -0
  379. parrot/stores/kb/redis.py +659 -0
  380. parrot/stores/kb/store.py +115 -0
  381. parrot/stores/kb/user.py +374 -0
  382. parrot/stores/models.py +59 -0
  383. parrot/stores/pgvector.py +3 -0
  384. parrot/stores/postgres.py +2853 -0
  385. parrot/stores/utils/__init__.py +0 -0
  386. parrot/stores/utils/chunking.py +197 -0
  387. parrot/telemetry/__init__.py +3 -0
  388. parrot/telemetry/mixin.py +111 -0
  389. parrot/template/__init__.py +3 -0
  390. parrot/template/engine.py +259 -0
  391. parrot/tools/__init__.py +23 -0
  392. parrot/tools/abstract.py +644 -0
  393. parrot/tools/agent.py +363 -0
  394. parrot/tools/arangodbsearch.py +537 -0
  395. parrot/tools/arxiv_tool.py +188 -0
  396. parrot/tools/calculator/__init__.py +3 -0
  397. parrot/tools/calculator/operations/__init__.py +38 -0
  398. parrot/tools/calculator/operations/calculus.py +80 -0
  399. parrot/tools/calculator/operations/statistics.py +76 -0
  400. parrot/tools/calculator/tool.py +150 -0
  401. parrot/tools/cloudwatch.py +988 -0
  402. parrot/tools/codeinterpreter/__init__.py +127 -0
  403. parrot/tools/codeinterpreter/executor.py +371 -0
  404. parrot/tools/codeinterpreter/internals.py +473 -0
  405. parrot/tools/codeinterpreter/models.py +643 -0
  406. parrot/tools/codeinterpreter/prompts.py +224 -0
  407. parrot/tools/codeinterpreter/tool.py +664 -0
  408. parrot/tools/company_info/__init__.py +6 -0
  409. parrot/tools/company_info/tool.py +1138 -0
  410. parrot/tools/correlationanalysis.py +437 -0
  411. parrot/tools/database/abstract.py +286 -0
  412. parrot/tools/database/bq.py +115 -0
  413. parrot/tools/database/cache.py +284 -0
  414. parrot/tools/database/models.py +95 -0
  415. parrot/tools/database/pg.py +343 -0
  416. parrot/tools/databasequery.py +1159 -0
  417. parrot/tools/db.py +1800 -0
  418. parrot/tools/ddgo.py +370 -0
  419. parrot/tools/decorators.py +271 -0
  420. parrot/tools/dftohtml.py +282 -0
  421. parrot/tools/document.py +549 -0
  422. parrot/tools/ecs.py +819 -0
  423. parrot/tools/edareport.py +368 -0
  424. parrot/tools/elasticsearch.py +1049 -0
  425. parrot/tools/employees.py +462 -0
  426. parrot/tools/epson/__init__.py +96 -0
  427. parrot/tools/excel.py +683 -0
  428. parrot/tools/file/__init__.py +13 -0
  429. parrot/tools/file/abstract.py +76 -0
  430. parrot/tools/file/gcs.py +378 -0
  431. parrot/tools/file/local.py +284 -0
  432. parrot/tools/file/s3.py +511 -0
  433. parrot/tools/file/tmp.py +309 -0
  434. parrot/tools/file/tool.py +501 -0
  435. parrot/tools/file_reader.py +129 -0
  436. parrot/tools/flowtask/__init__.py +19 -0
  437. parrot/tools/flowtask/tool.py +761 -0
  438. parrot/tools/gittoolkit.py +508 -0
  439. parrot/tools/google/__init__.py +18 -0
  440. parrot/tools/google/base.py +169 -0
  441. parrot/tools/google/tools.py +1251 -0
  442. parrot/tools/googlelocation.py +5 -0
  443. parrot/tools/googleroutes.py +5 -0
  444. parrot/tools/googlesearch.py +5 -0
  445. parrot/tools/googlesitesearch.py +5 -0
  446. parrot/tools/googlevoice.py +2 -0
  447. parrot/tools/gvoice.py +695 -0
  448. parrot/tools/ibisworld/README.md +225 -0
  449. parrot/tools/ibisworld/__init__.py +11 -0
  450. parrot/tools/ibisworld/tool.py +366 -0
  451. parrot/tools/jiratoolkit.py +1718 -0
  452. parrot/tools/manager.py +1098 -0
  453. parrot/tools/math.py +152 -0
  454. parrot/tools/metadata.py +476 -0
  455. parrot/tools/msteams.py +1621 -0
  456. parrot/tools/msword.py +635 -0
  457. parrot/tools/multidb.py +580 -0
  458. parrot/tools/multistoresearch.py +369 -0
  459. parrot/tools/networkninja.py +167 -0
  460. parrot/tools/nextstop/__init__.py +4 -0
  461. parrot/tools/nextstop/base.py +286 -0
  462. parrot/tools/nextstop/employee.py +733 -0
  463. parrot/tools/nextstop/store.py +462 -0
  464. parrot/tools/notification.py +435 -0
  465. parrot/tools/o365/__init__.py +42 -0
  466. parrot/tools/o365/base.py +295 -0
  467. parrot/tools/o365/bundle.py +522 -0
  468. parrot/tools/o365/events.py +554 -0
  469. parrot/tools/o365/mail.py +992 -0
  470. parrot/tools/o365/onedrive.py +497 -0
  471. parrot/tools/o365/sharepoint.py +641 -0
  472. parrot/tools/openapi_toolkit.py +904 -0
  473. parrot/tools/openweather.py +527 -0
  474. parrot/tools/pdfprint.py +1001 -0
  475. parrot/tools/powerbi.py +518 -0
  476. parrot/tools/powerpoint.py +1113 -0
  477. parrot/tools/pricestool.py +146 -0
  478. parrot/tools/products/__init__.py +246 -0
  479. parrot/tools/prophet_tool.py +171 -0
  480. parrot/tools/pythonpandas.py +630 -0
  481. parrot/tools/pythonrepl.py +910 -0
  482. parrot/tools/qsource.py +436 -0
  483. parrot/tools/querytoolkit.py +395 -0
  484. parrot/tools/quickeda.py +827 -0
  485. parrot/tools/resttool.py +553 -0
  486. parrot/tools/retail/__init__.py +0 -0
  487. parrot/tools/retail/bby.py +528 -0
  488. parrot/tools/sandboxtool.py +703 -0
  489. parrot/tools/sassie/__init__.py +352 -0
  490. parrot/tools/scraping/__init__.py +7 -0
  491. parrot/tools/scraping/docs/select.md +466 -0
  492. parrot/tools/scraping/documentation.md +1278 -0
  493. parrot/tools/scraping/driver.py +436 -0
  494. parrot/tools/scraping/models.py +576 -0
  495. parrot/tools/scraping/options.py +85 -0
  496. parrot/tools/scraping/orchestrator.py +517 -0
  497. parrot/tools/scraping/readme.md +740 -0
  498. parrot/tools/scraping/tool.py +3115 -0
  499. parrot/tools/seasonaldetection.py +642 -0
  500. parrot/tools/shell_tool/__init__.py +5 -0
  501. parrot/tools/shell_tool/actions.py +408 -0
  502. parrot/tools/shell_tool/engine.py +155 -0
  503. parrot/tools/shell_tool/models.py +322 -0
  504. parrot/tools/shell_tool/tool.py +442 -0
  505. parrot/tools/site_search.py +214 -0
  506. parrot/tools/textfile.py +418 -0
  507. parrot/tools/think.py +378 -0
  508. parrot/tools/toolkit.py +298 -0
  509. parrot/tools/webapp_tool.py +187 -0
  510. parrot/tools/whatif.py +1279 -0
  511. parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
  512. parrot/tools/workday/__init__.py +6 -0
  513. parrot/tools/workday/models.py +1389 -0
  514. parrot/tools/workday/tool.py +1293 -0
  515. parrot/tools/yfinance_tool.py +306 -0
  516. parrot/tools/zipcode.py +217 -0
  517. parrot/utils/__init__.py +2 -0
  518. parrot/utils/helpers.py +73 -0
  519. parrot/utils/parsers/__init__.py +5 -0
  520. parrot/utils/parsers/toml.c +12078 -0
  521. parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
  522. parrot/utils/parsers/toml.pyx +21 -0
  523. parrot/utils/toml.py +11 -0
  524. parrot/utils/types.cpp +20936 -0
  525. parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
  526. parrot/utils/types.pyx +213 -0
  527. parrot/utils/uv.py +11 -0
  528. parrot/version.py +10 -0
  529. parrot/yaml-rs/Cargo.lock +350 -0
  530. parrot/yaml-rs/Cargo.toml +19 -0
  531. parrot/yaml-rs/pyproject.toml +19 -0
  532. parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
  533. parrot/yaml-rs/src/lib.rs +222 -0
  534. requirements/docker-compose.yml +24 -0
  535. requirements/requirements-dev.txt +21 -0
parrot/loaders/abstract.py
@@ -0,0 +1,1131 @@
+ from __future__ import annotations
+ from typing import Generator, Union, List, Any, Optional, TypeVar
+ from collections.abc import Callable
+ from abc import ABC, abstractmethod
+ from datetime import datetime
+ import uuid
+ from pathlib import Path, PosixPath, PurePath
+ import asyncio
+ import pandas as pd
+ from navconfig.logging import logging
+ from navigator.libs.json import JSONContent  # pylint: disable=E0611
+ from ..stores.models import Document
+ ## AI Models:
+ from ..models.google import GoogleModel
+ from ..models.groq import GroqModel
+ from ..clients.factory import LLMFactory
+ from .splitters import (
+     TokenTextSplitter,
+     MarkdownTextSplitter
+ )
+ from ..stores.utils.chunking import LateChunkingProcessor
+ from ..conf import (
+     DEFAULT_LLM_MODEL,
+     DEFAULT_LLM_TEMPERATURE,
+     DEFAULT_GROQ_MODEL,
+     CUDA_DEFAULT_DEVICE,
+     CUDA_DEFAULT_DEVICE_NUMBER
+ )
+
+
+ T = TypeVar('T')
+
+
+ class AbstractLoader(ABC):
+     """
+     Base class for all loaders.
+     Loaders are responsible for loading data from various sources.
+     """
+     extensions: List[str] = ['.*']
+     skip_directories: List[str] = []
+
+     def __init__(
+         self,
+         source: Optional[Union[str, Path, List[Union[str, Path]]]] = None,
+         *,
+         tokenizer: Union[str, Callable] = None,
+         text_splitter: Union[str, Callable] = None,
+         source_type: str = 'file',
+         **kwargs
+     ):
+         """
+         Initialize the AbstractLoader.
+
+         Args:
+             source: Path, URL, or list of paths/URLs to load from
+             tokenizer: Tokenizer to use (string model name or callable)
+             text_splitter: Text splitter to use
+             source_type: Type of source ('file', 'url', etc.)
+             **kwargs: Additional keyword arguments for configuration
+         """
+         self.chunk_size: int = kwargs.get('chunk_size', 800)
+         self.chunk_overlap: int = kwargs.get('chunk_overlap', 100)
+         self.token_size: int = kwargs.get('token_size', 20)
+         self.semaphore = asyncio.Semaphore(kwargs.get('semaphore', 10))
+         self.extensions = kwargs.get('extensions', self.extensions)
+         self.skip_directories = kwargs.get(
+             'skip_directories',
+             self.skip_directories
+         )
+         self.encoding = kwargs.get('encoding', 'utf-8')
+         self._source_type = source_type
+         self._recursive: bool = kwargs.get('recursive', False)
+         self.category: str = kwargs.get('category', 'document')
+         self.doctype: str = kwargs.get('doctype', 'text')
+         # Chunking configuration
+         self._use_markdown_splitter: bool = kwargs.get('use_markdown_splitter', True)
+         self._use_huggingface_splitter: bool = kwargs.get('use_huggingface_splitter', False)
+         self._auto_detect_content_type: bool = kwargs.get('auto_detect_content_type', True)
+
+         # Advanced features
+         self._summarization = kwargs.get('summarization', False)
+         self._summary_model: Optional[Any] = kwargs.get('summary_model', None)
+         self._use_summary_pipeline: bool = kwargs.get('use_summary_pipeline', False)
+         self._use_translation_pipeline: bool = kwargs.get('use_translation_pipeline', False)
+         self._translation = kwargs.get('translation', False)
+
+         # Handle source/path initialization
+         self.path = None
+         if source is not None:
+             self.path = source
+         elif 'path' in kwargs:
+             self.path = kwargs['path']
+
+         # Normalize path if it's a string
+         if self.path is not None and isinstance(self.path, str):
+             self.path = Path(self.path).resolve()
+         elif self.path is not None and isinstance(self.path, (Path, PurePath)):
+             self.path = Path(self.path).resolve()
+
+         # Tokenizer
+         self.tokenizer = tokenizer
+         # Text Splitter
+         self.text_splitter = kwargs.get('text_splitter', None)
+         self.markdown_splitter = kwargs.get('markdown_splitter', None)
+
+         # Initialize text splitter based on configuration
+         self._setup_text_splitters(tokenizer, text_splitter, kwargs)
+
+         # Summarization Model:
+         self.summarization_model = kwargs.get('summarizer', None)
+         # LLM (if required)
+         self._setup_llm(kwargs)
+         # Logger
+         self.logger = logging.getLogger(
+             f"Parrot.Loaders.{self.__class__.__name__}"
+         )
+         # JSON encoder:
+         self._encoder = JSONContent()
+         # Use CUDA if available:
+         self._setup_device(kwargs)
+
+     def _get_token_splitter(
+         self,
+         model_name: str = "gpt-3.5-turbo",
+         chunk_size: int = 4000,
+         chunk_overlap: int = 200
+     ) -> TokenTextSplitter:
+         """Create a TokenTextSplitter with common settings"""
+         if self.text_splitter:
+             return self.text_splitter
+         return TokenTextSplitter(
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             model_name=model_name
+         )
+
+     def _get_markdown_splitter(
+         self,
+         chunk_size: int = 4000,
+         chunk_overlap: int = 200,
+         strip_headers: bool = False
+     ) -> MarkdownTextSplitter:
+         """Create a MarkdownTextSplitter with common settings"""
+         if self.text_splitter:
+             return self.text_splitter
+         return MarkdownTextSplitter(
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             strip_headers=strip_headers
+         )
+
+     def _create_hf_token_splitter(
+         self,
+         model_name: str,
+         chunk_size: int = 4000,
+         chunk_overlap: int = 200
+     ) -> TokenTextSplitter:
+         """Create a TokenTextSplitter using a HuggingFace Tokenizer"""
+         from transformers import AutoTokenizer
+         tokenizer = AutoTokenizer.from_pretrained(model_name)
+         return TokenTextSplitter(
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             tokenizer=tokenizer
+         )
+
+     def _setup_text_splitters(self, tokenizer, text_splitter, kwargs):
+         """Initialize text splitters based on configuration."""
+         # Always create a markdown splitter
+         self.markdown_splitter = self._get_markdown_splitter(
+             chunk_size=self.chunk_size,
+             chunk_overlap=self.chunk_overlap
+         )
+
+         # Choose primary text splitter based on configuration
+         if self._use_markdown_splitter:
+             self.text_splitter = text_splitter or self.markdown_splitter
+         else:
+             if self._use_huggingface_splitter:
+                 self.text_splitter = self._create_hf_token_splitter(
+                     model_name=kwargs.get('model_name', 'gpt-3.5-turbo'),
+                     chunk_size=self.chunk_size,
+                     chunk_overlap=self.chunk_overlap
+                 )
+             else:
+                 # Default to TokenTextSplitter
+                 if isinstance(tokenizer, str):
+                     self.text_splitter = self._get_token_splitter(
+                         model_name=tokenizer,
+                         chunk_size=self.chunk_size,
+                         chunk_overlap=self.chunk_overlap
+                     )
+                 elif callable(tokenizer):
+                     self.text_splitter = TokenTextSplitter(
+                         chunk_size=self.chunk_size,
+                         chunk_overlap=self.chunk_overlap,
+                         tokenizer_function=tokenizer
+                     )
+                 else:
+                     # Use default TokenTextSplitter
+                     self.text_splitter = TokenTextSplitter(
+                         chunk_size=self.chunk_size,
+                         chunk_overlap=self.chunk_overlap,
+                         model_name=kwargs.get('model_name', 'gpt-3.5-turbo')
+                     )
+
+     def _setup_llm(self, kwargs):
+         """Initialize LLM if required."""
+         self._use_llm = kwargs.get('use_llm', False)
+         self._llm_model = kwargs.get('llm_model', None)
+         self._llm_model_kwargs = kwargs.get('model_kwargs', {})
+         self._llm = kwargs.get('llm', None)
+         if self._use_llm:
+             self._llm = self.get_default_llm(
+                 model=self._llm_model,
+                 model_kwargs=self._llm_model_kwargs,
+             )
+
+     def get_default_llm(
+         self,
+         model: str = None,
+         model_kwargs: dict = None,
+         use_groq: bool = False,
+         use_openai: bool = False
+     ) -> Any:
+         """Return an AI Client instance."""
+         if not model_kwargs:
+             model_kwargs = {
+                 "temperature": DEFAULT_LLM_TEMPERATURE,
+                 "top_k": 30,
+                 "top_p": 0.5,
+             }
+         if use_groq:
+             return LLMFactory.create(
+                 llm=f"groq:{model or DEFAULT_GROQ_MODEL}" if model else "groq",
+                 model_kwargs=model_kwargs
+             )
+         elif use_openai:
+             return LLMFactory.create(
+                 llm=f"openai:{model}" if model else "openai",
+                 model_kwargs=model_kwargs
+             )
+         return LLMFactory.create(
+             llm=model or DEFAULT_LLM_MODEL,
+             model_kwargs=model_kwargs
+         )
+
+     def _setup_device(self, kwargs):
+         """Initialize device configuration."""
+         self.device_name = kwargs.get('device', CUDA_DEFAULT_DEVICE)
+         self.cuda_number = kwargs.get('cuda_number', CUDA_DEFAULT_DEVICE_NUMBER)
+         self._device = None
+
+     def _get_device(
+         self,
+         device_type: str = None,
+         cuda_number: int = 0
+     ):
+         """
+         Get device configuration for Torch and transformers.
+
+         Returns:
+             tuple: (pipeline_device_idx, torch_device, dtype)
+                 - pipeline_device_idx: int for HuggingFace pipeline (-1 for CPU, 0+ for GPU)
+                 - torch_device: torch.device object for model loading
+                 - dtype: torch data type for model weights
+         """
+         import torch
+         # Default values for CPU usage
+         pipeline_idx = -1  # This is what HuggingFace pipeline expects for CPU
+         torch_dev = torch.device("cpu")
+         dtype = torch.float32
+
+         # Check if we're forcing CPU usage globally
+         if CUDA_DEFAULT_DEVICE == 'cpu' or device_type == 'cpu':
+             # CPU is explicitly requested
+             return -1, torch.device('cpu'), torch.float32
+
+         # Check for CUDA availability and use it if possible
+         if torch.cuda.is_available():
+             # For GPU, pipeline wants an integer index
+             pipeline_idx = cuda_number  # 0 for first GPU, 1 for second, etc.
+             torch_dev = torch.device(f"cuda:{cuda_number}")
+
+             # Choose the best dtype for this GPU
+             if torch.cuda.is_bf16_supported():
+                 dtype = torch.bfloat16
+             else:
+                 dtype = torch.float16
+
+             return pipeline_idx, torch_dev, dtype
+
+         # Check for Apple Silicon GPU (MPS)
+         if torch.backends.mps.is_available():
+             # MPS is tricky - HuggingFace pipelines don't always support it well
+             # We return "mps" as a string for pipeline, and torch.device for model
+             # Note: You might need to handle this specially in your pipeline code
+             return "mps", torch.device("mps"), torch.float32
+
+         # Fallback to CPU if nothing else is available
+         return -1, torch.device("cpu"), torch.float32
+
+     def clear_cuda(self):
+         self.tokenizer = None  # Reset the tokenizer
+         self.text_splitter = None  # Reset the text splitter
+         try:
+             import torch
+             torch.cuda.synchronize()  # Wait for all kernels to finish
+             torch.cuda.empty_cache()  # Clear unused memory
+         except Exception as e:
+             self.logger.warning(f"Error clearing CUDA memory: {e}")
+
+     async def __aenter__(self):
+         """Open the loader if it has an open method."""
+         # Check if the loader has an open method and call it
+         if hasattr(self, "open"):
+             await self.open()
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         """Close the loader if it has a close method."""
+         if hasattr(self, "close"):
+             await self.close()
+         return True
+
+     def supported_extensions(self):
+         """Get the supported file extensions."""
+         return self.extensions
+
+     def _detect_content_type(self, document: Document) -> str:
+         """
+         Auto-detect content type based on document metadata and content.
+
+         Args:
+             document: Document to analyze
+
+         Returns:
+             Content type string ('markdown', 'code', 'text', etc.)
+         """
+         if not self._auto_detect_content_type:
+             return 'text'
+
+         # Check metadata for hints
+         metadata = document.metadata or {}
+         filename = metadata.get('filename', '').lower()
+         source_type = metadata.get('source_type', '').lower()
+
+         # File extension based detection
+         if filename.endswith(('.md', '.markdown')):
+             return 'markdown'
+         elif filename.endswith(('.py', '.pyx', '.js', '.java', '.cpp', '.c', '.go', '.rs')):
+             return 'code'
+         elif filename.endswith(('.html', '.htm', '.xml')):
+             return 'html'
+         elif source_type in ['markdown', 'md']:
+             return 'markdown'
+
+         # Content based detection
+         content = document.page_content[:1000].lower()  # Check first 1000 chars
+
+         # Simple heuristics for markdown
+         markdown_indicators = ['#', '```', '**', '*', '[', '](', '|', '---']
+         markdown_score = sum(1 for indicator in markdown_indicators if indicator in content)
+
+         if markdown_score >= 3:  # If multiple markdown indicators found
+             return 'markdown'
+
+         # Default to text
+         return 'text'
+
+     def _select_splitter_for_content(self, content_type: str):
+         """
+         Select the appropriate text splitter based on content type.
+
+         Args:
+             content_type: Detected or specified content type
+
+         Returns:
+             Appropriate text splitter
+         """
+         if content_type == 'markdown':
+             return self.markdown_splitter
+         elif content_type == 'code':
+             # Use token splitter with smaller chunks for code
+             return TokenTextSplitter(
+                 chunk_size=min(self.chunk_size, 2048),
+                 chunk_overlap=self.chunk_overlap,
+                 model_name='gpt-3.5-turbo'
+             )
+         else:
+             # Default to the configured text splitter
+             return self.text_splitter
+
+     def is_valid_path(self, path: Union[str, Path]) -> bool:
+         """Check if a path is valid."""
+         if self.extensions == '*':
+             return True
+         if isinstance(path, str):
+             path = Path(path)
+         if not path.exists():
+             return False
+         if path.is_dir() and path.name in self.skip_directories:
+             return False
+         if path.is_file():
+             if path.suffix not in self.extensions:
+                 return False
+             if path.name.startswith("."):
+                 return False
+             # check if file is empty
+             if path.stat().st_size == 0:
+                 return False
+             # check if file is inside of skip directories:
+             for skip_dir in self.skip_directories:
+                 if path.is_relative_to(skip_dir):
+                     return False
+         return True
+
+     @abstractmethod
+     async def _load(self, source: Union[str, PurePath], **kwargs) -> List[Document]:
+         """Load a single data/url/file from a source and return it as a list of Langchain Documents.
+
+         Args:
+             source (str): The source of the data.
+
+         Returns:
+             List[Document]: A list of Langchain Documents.
+         """
+         pass
+
+     async def from_path(
+         self,
+         path: Union[str, Path],
+         recursive: bool = False,
+         **kwargs
+     ) -> List[asyncio.Task]:
+         """
+         Load data from a path.
+         """
+         tasks = []
+         if isinstance(path, str):
+             path = PurePath(path)
+         if path.is_dir():
+             for ext in self.extensions:
+                 glob_method = path.rglob if recursive else path.glob
+                 # Use glob to find all files with the specified extension
+                 for item in glob_method(f'*{ext}'):
+                     # Check if the item is a directory and if it should be skipped
+                     if set(item.parts).isdisjoint(self.skip_directories):
+                         if self.is_valid_path(item):
+                             tasks.append(
+                                 asyncio.create_task(self._load(item, **kwargs))
+                             )
+         elif path.is_file():
+             if self.is_valid_path(path):
+                 tasks.append(
+                     asyncio.create_task(self._load(path, **kwargs))
+                 )
+         else:
+             self.logger.warning(
+                 f"Path {path} is not valid."
+             )
+         return tasks
+
+     async def from_url(
+         self,
+         url: Union[str, List[str]],
+         **kwargs
+     ) -> List[asyncio.Task]:
+         """
+         Load data from a URL.
+         """
+         tasks = []
+         if isinstance(url, str):
+             url = [url]
+         for item in url:
+             tasks.append(
+                 asyncio.create_task(self._load(item, **kwargs))
+             )
+         return tasks
+
+     async def from_dataframe(
+         self,
+         source: pd.DataFrame,
+         **kwargs
+     ) -> List[asyncio.Task]:
+         """
+         Load data from a pandas DataFrame.
+         """
+         tasks = []
+         if isinstance(source, pd.DataFrame):
+             tasks.append(
+                 asyncio.create_task(self._load(source, **kwargs))
+             )
+         else:
+             self.logger.warning(
+                 f"Source {source} is not a valid pandas DataFrame."
+             )
+         return tasks
+
+     def chunkify(self, lst: List[T], n: int = 50) -> Generator[List[T], None, None]:
+         """Split a List of objects into chunks of size n.
+
+         Args:
+             lst: The list to split into chunks
+             n: The maximum size of each chunk
+
+         Yields:
+             List[T]: Chunks of the original list, each of size at most n
+         """
+         for i in range(0, len(lst), n):
+             yield lst[i:i + n]
+
+     async def _async_map(self, func: Callable, iterable: list) -> list:
+         """Run a function on a list of items asynchronously."""
+         async def async_func(item):
+             async with self.semaphore:
+                 return await func(item)
+
+         tasks = [async_func(item) for item in iterable]
+         return await asyncio.gather(*tasks)
+
+     async def _load_tasks(self, tasks: list) -> list:
+         """Load a list of tasks asynchronously."""
+         results = []
+
+         if not tasks:
+             return results
+
+         # Create a controlled task function to limit concurrency
+         async def controlled_task(task):
+             async with self.semaphore:
+                 try:
+                     return await task
+                 except Exception as e:
+                     self.logger.error(f"Task error: {e}")
+                     return e
+
+         for chunk in self.chunkify(tasks, self.chunk_size):
+             # Wrap each task with semaphore control
+             controlled_tasks = [controlled_task(task) for task in chunk]
+             result = await asyncio.gather(*controlled_tasks, return_exceptions=True)
+             if result:
+                 for res in result:
+                     if isinstance(res, Exception):
+                         # Handle the exception
+                         self.logger.error(f"Error loading {res}")
+                     else:
+                         # Handle both single documents and lists of documents
+                         if isinstance(res, list):
+                             results.extend(res)
+                         else:
+                             results.append(res)
+         return results
+
+     async def load(
+         self,
+         source: Optional[Any] = None,
+         split_documents: bool = True,
+         late_chunking: bool = False,
+         vector_store=None,
+         store_full_document: bool = True,
+         auto_detect_content_type: bool = None,
+         **kwargs
+     ) -> List[Document]:
+         """
+         Load data from a source and return it as a list of Documents.
+
+         The source can be:
+         - None: Uses self.path attribute if available
+         - Path or str: Treated as file path or directory
+         - List[str/Path]: Treated as list of file paths
+         - URL string: Treated as a URL
+         - List of URLs: Treated as list of URLs
+
+         Args:
+             source (Optional[Any]): The source of the data.
+             split_documents (bool): Whether to split documents into chunks, defaults to True
+             late_chunking (bool): Whether to use late chunking strategy
+             vector_store: Vector store instance (required for late chunking)
+             store_full_document (bool): Whether to store full documents alongside chunks
+             auto_detect_content_type (bool): Override auto-detection setting
+             **kwargs: Additional keyword arguments
+
+         Returns:
+             List[Document]: A list of Documents (chunked if requested).
+         """
+         tasks = []
+         # If no source is provided, use self.path
+         if source is None:
+             if self.path is None:
+                 raise ValueError(
+                     "No source provided and self.path is not set. "
+                     "Please provide a source parameter or set path during initialization."
+                 )
+             source = self.path
+
+         if isinstance(source, (str, Path, PosixPath, PurePath)):
+             # Check if it's a URL
+             if isinstance(source, str) and (
+                 source.startswith('http://') or source.startswith('https://')
+             ):
+                 tasks = await self.from_url(source, **kwargs)
+             else:
+                 # Assume it's a file path or directory
+                 tasks = await self.from_path(
+                     source,
+                     recursive=self._recursive,
+                     **kwargs
+                 )
+         elif isinstance(source, list):
+             # Check if it's a list of URLs or paths
+             if all(
+                 isinstance(item, str) and (
+                     item.startswith('http://') or item.startswith('https://')
+                 ) for item in source
+             ):
+                 tasks = await self.from_url(source, **kwargs)
+             else:
+                 # Assume it's a list of file paths
+                 path_tasks = []
+                 for path in source:
+                     path_tasks.extend(
+                         await self.from_path(path, recursive=self._recursive, **kwargs)
+                     )
+                 tasks = path_tasks
+         elif isinstance(source, pd.DataFrame):
+             tasks = await self.from_dataframe(source, **kwargs)
+         else:
+             raise ValueError(
+                 f"Unsupported source type: {type(source)}"
+             )
+         # Load tasks and get raw documents
+         documents = []
+         if tasks:
+             results = await self._load_tasks(tasks)
+             documents = results
+
+         # Apply chunking if requested
+         if split_documents and documents:
+             self.logger.debug(
+                 f"Splitting {len(documents)} documents into chunks..."
+             )
+
+             if late_chunking and vector_store is None:
+                 raise ValueError(
+                     "Vector store is required when using late_chunking=True"
+                 )
+
+             documents = await self.chunk_documents(
+                 documents=documents,
+                 use_late_chunking=late_chunking,
+                 vector_store=vector_store,
+                 store_full_document=store_full_document,
+                 auto_detect_content_type=auto_detect_content_type
+             )
+
+             self.logger.debug(
+                 f"Document chunking complete: {len(documents)} final documents"
+             )
+
+         return documents
+
+     def create_metadata(
+         self,
+         path: Union[str, PurePath],
+         doctype: str = 'document',
+         source_type: str = 'source',
+         doc_metadata: Optional[dict] = None,
+         **kwargs
+     ):
+         if not doc_metadata:
+             doc_metadata = {}
+         if isinstance(path, PurePath):
+             origin = path.name
+             url = f'file://{path.name}'
+             filename = path
+         else:
+             origin = path
+             url = path
+             filename = f'file://{path}'
+         metadata = {
+             "url": url,
+             "source": origin,
+             "filename": str(filename),
+             "type": doctype,
+             "source_type": source_type or self._source_type,
+             "created_at": datetime.now().strftime("%Y-%m-%d, %H:%M:%S"),
+             "category": self.category,
+             "document_meta": {
+                 **doc_metadata
+             },
+             **kwargs
+         }
+         return metadata
+
+     def create_document(
+         self,
+         content: Any,
+         path: Union[str, PurePath],
+         metadata: Optional[dict] = None,
+         **kwargs
+     ) -> Document:
+         """Create a Langchain Document from the content.
+         Args:
+             content (Any): The content to create the document from.
+         Returns:
+             Document: A Langchain Document.
+         """
+         if metadata:
+             _meta = metadata
+         else:
+             _meta = self.create_metadata(
+                 path=path,
+                 doctype=self.doctype,
+                 source_type=self._source_type,
+                 **kwargs
+             )
+         return Document(
+             page_content=content,
+             metadata=_meta
+         )
+
+     async def summary_from_text(
+         self,
+         text: str,
+         max_length: int = 500,
+         min_length: int = 50
+     ) -> str:
+         """
+         Get a summary of a text.
+         """
+         if not text:
+             return ''
+         try:
+             summarizer = self.get_summarization_model()
+             if self._use_summary_pipeline:
+                 # Use Huggingface pipeline
+                 content = summarizer(
+                     text,
+                     max_length=max_length,
+                     min_length=min_length,
+                     do_sample=False,
+                     truncation=True
+                 )
+                 return content[0].get('summary_text', '')
+             # Use Summarize Method from GroqClient
+             system_prompt = f"""
+             Your job is to produce a final summary from the following text and identify the main theme.
+             - The summary should be concise and to the point.
+             - The summary should be no longer than {max_length} characters and no less than {min_length} characters.
+             - The summary should be in a single paragraph.
+             """
+             summary = await summarizer.summarize_text(
+                 text=text,
+                 model=GroqModel.LLAMA_3_3_70B_VERSATILE,
+                 system_prompt=system_prompt,
+                 temperature=0.1,
+                 max_tokens=1000,
+                 top_p=0.5
+             )
+             return summary.output
+         except Exception as e:
+             self.logger.error(
+                 f'ERROR on summary_from_text: {e}'
+             )
+             return ""
+
+     def get_summarization_model(
+         self,
+         model_name: str = 'facebook/bart-large-cnn'
+     ):
+         if not self._summary_model:
+             if self._use_summary_pipeline:
+                 from transformers import (
+                     AutoModelForSeq2SeqLM,
+                     AutoTokenizer,
+                     pipeline
+                 )
+                 _, pipe_dev, torch_dtype = self._get_device()
+                 summarize_model = AutoModelForSeq2SeqLM.from_pretrained(
+                     model_name,
+                 )
+                 summarize_tokenizer = AutoTokenizer.from_pretrained(
+                     model_name,
+                     padding_side="left"
+                 )
+                 self._summary_model = pipeline(
+                     "summarization",
+                     model=summarize_model,
+                     tokenizer=summarize_tokenizer,
+                     device=pipe_dev,  # 0 for CUDA, mps device, or -1
+                     torch_dtype=torch_dtype if pipe_dev != -1 else None,
+                 )
+             else:
+                 # Use Groq for Summarization:
+                 self._summary_model = LLMFactory.create(
+                     llm=f"groq:{GroqModel.LLAMA_3_3_70B_VERSATILE}",
+                     model_kwargs={
+                         "temperature": 0.1,
+                         "top_p": 0.5,
+                     }
+                 )
+         return self._summary_model
+
+     def translate_text(
+         self,
+         text: str,
+         source_lang: str = None,
+         target_lang: str = "es"
+     ) -> str:
+         """
+         Translate text from source language to target language.
+
+         Args:
+             text: Text to translate
+             source_lang: Source language code (default: None)
+             target_lang: Target language code (default: 'es')
+
+         Returns:
+             Translated text
+         """
+         if not text:
+             return ''
+         try:
+             translator = self.get_translation_model(source_lang, target_lang)
+             if self._use_translation_pipeline:
+                 # Use Huggingface pipeline
+                 content = translator(
+                     text,
+                     max_length=len(text) * 2,  # Allow for expansion in target language
+                     truncation=True
+                 )
+                 return content[0].get('translation_text', '')
+             else:
+                 # Use LLM for translation
+                 translation = translator.translate_text(
+                     text=text,
+                     source_lang=source_lang,
+                     target_lang=target_lang,
+                     model=GoogleModel.GEMINI_2_5_FLASH_LITE_PREVIEW,
+                     temperature=0.1,
+                     max_tokens=1000
+                 )
+                 return translation.get('text', '')
+         except Exception as e:
+             self.logger.error(f'ERROR on translate_text: {e}')
+             return ""
+
+     def get_translation_model(
+         self,
+         source_lang: str = "en",
+         target_lang: str = "es",
+         model_name: str = None
+     ):
+         """
+         Get or create a translation model.
+
+         Args:
+             source_lang: Source language code
+             target_lang: Target language code
+             model_name: Optional model name override
+
+         Returns:
+             Translation model/chain
+         """
+         # Create a cache key for the language pair
+         cache_key = f"{source_lang}_{target_lang}"
+
+         # Check if we already have a model for this language pair
+         if not hasattr(self, '_translation_models'):
+             self._translation_models = {}
+
+         if cache_key not in self._translation_models:
+             if self._use_translation_pipeline:
+                 from transformers import (
+                     AutoModelForSeq2SeqLM,
+                     AutoTokenizer,
+                     pipeline
+                 )
+                 # Select appropriate model based on language pair if not specified
+                 if model_name is None:
+                     if source_lang == "en" and target_lang in ["es", "fr", "de", "it", "pt", "ru"]:
+                         model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"
+                     elif source_lang in ["es", "fr", "de", "it", "pt"] and target_lang == "en":
+                         model_name = "Helsinki-NLP/opus-mt-ROMANCE-en"
+                     else:
+                         # Default to a specific model for the language pair
+                         model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
+
+                 try:
+                     translate_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+                     translate_tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+                     self._translation_models[cache_key] = pipeline(
+                         "translation",
+                         model=translate_model,
+                         tokenizer=translate_tokenizer
+                     )
+                 except Exception as e:
+                     self.logger.error(
+                         f"Error loading translation model {model_name}: {e}"
+                     )
+                     # Fallback to using LLM for translation
+                     self._use_translation_pipeline = False
+
+             if not self._use_translation_pipeline:
+                 # Use LLM for translation
+                 translation_model = self.get_default_llm(
+                     model=GoogleModel.GEMINI_2_5_FLASH_LITE_PREVIEW
+                 )
+                 self._translation_models[cache_key] = translation_model
+
+         return self._translation_models[cache_key]
+
+     def create_translated_document(
+         self,
+         content: str,
+         metadata: dict,
+         source_lang: str = "en",
+         target_lang: str = "es"
+     ) -> Document:
+         """
+         Create a document with translated content.
+
+         Args:
+             content: Original content
+             metadata: Document metadata
+             source_lang: Source language code
+             target_lang: Target language code
+
+         Returns:
+             Document with translated content
+         """
+         translated_content = self.translate_text(content, source_lang, target_lang)
+
+         # Clone the metadata and add translation info
+         translation_metadata = metadata.copy()
+         translation_metadata.update({
+             "original_language": source_lang,
+             "language": target_lang,
+             "is_translation": True
+         })
+
+         return Document(
+             page_content=translated_content,
+             metadata=translation_metadata
+         )
+
+     def saving_file(self, filename: PurePath, data: Any):
+         """Save data to a file.
+
+         Args:
+             filename (PurePath): The path to the file.
+             data (Any): The data to save.
+         """
+         with open(filename, 'wb') as f:
+             f.write(data)
+             f.flush()
+         print(f':: Saved File on {filename}')
+
+     async def chunk_documents(
+         self,
+         documents: List[Document],
+         use_late_chunking: bool = False,
+         vector_store=None,
+         store_full_document: bool = True,
+         auto_detect_content_type: bool = None
+     ) -> List[Document]:
+         """
+         Chunk documents using the configured text splitter or late chunking strategy.
+
+         Args:
+             documents: List of documents to chunk
+             use_late_chunking: Whether to use late chunking strategy
+             vector_store: Vector store instance (required for late chunking)
+             store_full_document: Whether to store full documents alongside chunks (late chunking only)
+             auto_detect_content_type: Override auto-detection setting
+
+         Returns:
+             List of chunked documents
+         """
+         if use_late_chunking:
+             return await self._chunk_with_late_chunking(
+                 documents, vector_store, store_full_document
+             )
+         else:
+             return self._chunk_with_text_splitter(
+                 documents, auto_detect_content_type
+             )
+
+     def _chunk_with_text_splitter(
+         self,
+         documents: List[Document],
+         auto_detect_content_type: bool = None
+     ) -> List[Document]:
+         """
+         Chunk documents using regular text splitters.
+
+         Args:
+             documents: List of documents to chunk
+             auto_detect_content_type: Override auto-detection setting
+
+         Returns:
+             List of chunked documents
+         """
+         chunked_docs = []
+         detect_content = auto_detect_content_type if auto_detect_content_type is not None else self._auto_detect_content_type  # noqa
+
+         for doc in documents:
+             try:
+                 # Detect content type and select appropriate splitter
+                 if detect_content:
+                     content_type = self._detect_content_type(doc)
+                     splitter = self._select_splitter_for_content(content_type)
+                     # self.logger.debug(f"Detected content type: {content_type} for document")
+                 else:
+                     content_type = 'text'
+                     splitter = self.text_splitter
+
+                 # Create chunks using the selected splitter
+                 chunks = splitter.create_chunks(
+                     text=doc.page_content,
+                     metadata=doc.metadata
+                 )
+
+                 # Convert chunks to Document objects
+                 for chunk in chunks:
+                     chunked_doc = Document(
+                         page_content=chunk.text,
+                         metadata={
+                             **chunk.metadata,
+                             'chunk_id': chunk.chunk_id,
+                             'token_count': chunk.token_count,
+                             'start_position': chunk.start_position,
+                             'end_position': chunk.end_position,
+                             'content_type': content_type,
+                             'splitter_type': splitter.__class__.__name__,
+                             'is_chunk': True,
+                             'parent_document_id': doc.metadata.get('document_id', f"doc_{uuid.uuid4().hex[:8]}")
+                         }
+                     )
+                     chunked_docs.append(chunked_doc)
+
+             except Exception as e:
+                 self.logger.error(f"Error chunking document: {e}")
+                 # Fall back to adding the original document
+                 chunked_docs.append(doc)
+
+         self.logger.info(f"Chunked {len(documents)} documents into {len(chunked_docs)} chunks")
+         return chunked_docs
+
+     async def _chunk_with_late_chunking(
+         self,
+         documents: List[Document],
+         vector_store=None,
+         store_full_document: bool = True
+     ) -> List[Document]:
+         """
+         Chunk documents using late chunking strategy.
+
+         Args:
+             documents: List of documents to chunk
+             vector_store: Vector store instance (required)
+             store_full_document: Whether to store full documents alongside chunks
+
+         Returns:
+             List of chunked documents (and optionally full documents)
+         """
+         if LateChunkingProcessor is None:
+             self.logger.warning(
+                 "LateChunkingProcessor not available, falling back to regular chunking"
+             )
+             return self._chunk_with_text_splitter(documents)
+
+         if vector_store is None:
+             raise ValueError("Vector store is required for late chunking strategy")
+
+         chunked_docs = []
+
+         # Initialize late chunking processor
+         chunking_processor = LateChunkingProcessor(
+             vector_store=vector_store,
+             chunk_size=self.chunk_size,
+             chunk_overlap=self.chunk_overlap
+         )
+
+         for doc_idx, document in enumerate(documents):
+             try:
+                 document_id = document.metadata.get('document_id', f"doc_{doc_idx:06d}_{uuid.uuid4().hex[:8]}")
+
+                 # Process document with late chunking
+                 _, chunk_infos = await chunking_processor.process_document_late_chunking(
+                     document_text=document.page_content,
+                     document_id=document_id,
+                     metadata=document.metadata
+                 )
+
+                 # Store full document if requested
+                 if store_full_document:
+                     full_doc_metadata = {
+                         **(document.metadata or {}),
+                         'document_id': document_id,
+                         'is_full_document': True,
+                         'total_chunks': len(chunk_infos),
+                         'document_type': 'parent',
+                         'chunking_strategy': 'late_chunking'
+                     }
+
+                     full_doc = Document(
+                         page_content=document.page_content,
+                         metadata=full_doc_metadata
+                     )
+                     chunked_docs.append(full_doc)
+
+                 # Add all chunks as documents
+                 for chunk_info in chunk_infos:
+                     chunk_doc = Document(
+                         page_content=chunk_info.chunk_text,
+                         metadata=chunk_info.metadata
+                     )
+                     chunked_docs.append(chunk_doc)
+
+             except Exception as e:
+                 self.logger.error(f"Error in late chunking for document {doc_idx}: {e}")
+                 # Fall back to adding the original document
+                 chunked_docs.append(document)
+
+         self.logger.info(
+             f"Late chunking processed {len(documents)} documents into {len(chunked_docs)} items"
+         )
+         return chunked_docs
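
For orientation, the hunk above adds `parrot/loaders/abstract.py`: a concrete loader implements the `_load()` coroutine and inherits source discovery, semaphore-bounded concurrency, and chunking from `load()`. A minimal sketch of a subclass, based only on the API visible in this diff (the `TextFileLoader` name and the `./docs` path are hypothetical, for illustration only):

import asyncio
from pathlib import PurePath
from typing import List, Union

from parrot.loaders.abstract import AbstractLoader
from parrot.stores.models import Document

class TextFileLoader(AbstractLoader):
    """Hypothetical plain-text loader."""
    extensions: List[str] = ['.txt']

    async def _load(self, source: Union[str, PurePath], **kwargs) -> List[Document]:
        # Read one file and wrap it via create_document(), which attaches
        # the metadata dict built by create_metadata().
        with open(source, 'r', encoding=self.encoding) as fh:
            content = fh.read()
        return [self.create_document(content=content, path=source)]

async def main():
    # load() globs the directory for *.txt, schedules _load() per file under
    # the semaphore, then splits the results with the configured splitter.
    loader = TextFileLoader(source='./docs', recursive=True, chunk_size=800)
    chunks = await loader.load(split_documents=True)
    print(f"{len(chunks)} chunk documents")

asyncio.run(main())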