ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (535) hide show
  1. agentui/.prettierrc +15 -0
  2. agentui/QUICKSTART.md +272 -0
  3. agentui/README.md +59 -0
  4. agentui/env.example +16 -0
  5. agentui/jsconfig.json +14 -0
  6. agentui/package-lock.json +4242 -0
  7. agentui/package.json +34 -0
  8. agentui/scripts/postinstall/apply-patches.mjs +260 -0
  9. agentui/src/app.css +61 -0
  10. agentui/src/app.d.ts +13 -0
  11. agentui/src/app.html +12 -0
  12. agentui/src/components/LoadingSpinner.svelte +64 -0
  13. agentui/src/components/ThemeSwitcher.svelte +159 -0
  14. agentui/src/components/index.js +4 -0
  15. agentui/src/lib/api/bots.ts +60 -0
  16. agentui/src/lib/api/chat.ts +22 -0
  17. agentui/src/lib/api/http.ts +25 -0
  18. agentui/src/lib/components/BotCard.svelte +33 -0
  19. agentui/src/lib/components/ChatBubble.svelte +63 -0
  20. agentui/src/lib/components/Toast.svelte +21 -0
  21. agentui/src/lib/config.ts +20 -0
  22. agentui/src/lib/stores/auth.svelte.ts +73 -0
  23. agentui/src/lib/stores/theme.svelte.js +64 -0
  24. agentui/src/lib/stores/toast.svelte.ts +31 -0
  25. agentui/src/lib/utils/conversation.ts +39 -0
  26. agentui/src/routes/+layout.svelte +20 -0
  27. agentui/src/routes/+page.svelte +232 -0
  28. agentui/src/routes/login/+page.svelte +200 -0
  29. agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
  30. agentui/src/routes/talk/[agentId]/+page.ts +7 -0
  31. agentui/static/README.md +1 -0
  32. agentui/svelte.config.js +11 -0
  33. agentui/tailwind.config.ts +53 -0
  34. agentui/tsconfig.json +3 -0
  35. agentui/vite.config.ts +10 -0
  36. ai_parrot-0.17.2.dist-info/METADATA +472 -0
  37. ai_parrot-0.17.2.dist-info/RECORD +535 -0
  38. ai_parrot-0.17.2.dist-info/WHEEL +6 -0
  39. ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
  40. ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
  41. ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
  42. crew-builder/.prettierrc +15 -0
  43. crew-builder/QUICKSTART.md +259 -0
  44. crew-builder/README.md +113 -0
  45. crew-builder/env.example +17 -0
  46. crew-builder/jsconfig.json +14 -0
  47. crew-builder/package-lock.json +4182 -0
  48. crew-builder/package.json +37 -0
  49. crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
  50. crew-builder/src/app.css +62 -0
  51. crew-builder/src/app.d.ts +13 -0
  52. crew-builder/src/app.html +12 -0
  53. crew-builder/src/components/LoadingSpinner.svelte +64 -0
  54. crew-builder/src/components/ThemeSwitcher.svelte +149 -0
  55. crew-builder/src/components/index.js +9 -0
  56. crew-builder/src/lib/api/bots.ts +60 -0
  57. crew-builder/src/lib/api/chat.ts +80 -0
  58. crew-builder/src/lib/api/client.ts +56 -0
  59. crew-builder/src/lib/api/crew/crew.ts +136 -0
  60. crew-builder/src/lib/api/index.ts +5 -0
  61. crew-builder/src/lib/api/o365/auth.ts +65 -0
  62. crew-builder/src/lib/auth/auth.ts +54 -0
  63. crew-builder/src/lib/components/AgentNode.svelte +43 -0
  64. crew-builder/src/lib/components/BotCard.svelte +33 -0
  65. crew-builder/src/lib/components/ChatBubble.svelte +67 -0
  66. crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
  67. crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
  68. crew-builder/src/lib/components/JsonViewer.svelte +24 -0
  69. crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
  70. crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
  71. crew-builder/src/lib/components/Toast.svelte +67 -0
  72. crew-builder/src/lib/components/Toolbar.svelte +157 -0
  73. crew-builder/src/lib/components/index.ts +10 -0
  74. crew-builder/src/lib/config.ts +8 -0
  75. crew-builder/src/lib/stores/auth.svelte.ts +228 -0
  76. crew-builder/src/lib/stores/crewStore.ts +369 -0
  77. crew-builder/src/lib/stores/theme.svelte.js +145 -0
  78. crew-builder/src/lib/stores/toast.svelte.ts +69 -0
  79. crew-builder/src/lib/utils/conversation.ts +39 -0
  80. crew-builder/src/lib/utils/markdown.ts +122 -0
  81. crew-builder/src/lib/utils/talkHistory.ts +47 -0
  82. crew-builder/src/routes/+layout.svelte +20 -0
  83. crew-builder/src/routes/+page.svelte +539 -0
  84. crew-builder/src/routes/agents/+page.svelte +247 -0
  85. crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
  86. crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
  87. crew-builder/src/routes/builder/+page.svelte +204 -0
  88. crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
  89. crew-builder/src/routes/crew/ask/+page.ts +1 -0
  90. crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
  91. crew-builder/src/routes/login/+page.svelte +197 -0
  92. crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
  93. crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
  94. crew-builder/static/README.md +1 -0
  95. crew-builder/svelte.config.js +11 -0
  96. crew-builder/tailwind.config.ts +53 -0
  97. crew-builder/tsconfig.json +3 -0
  98. crew-builder/vite.config.ts +10 -0
  99. mcp_servers/calculator_server.py +309 -0
  100. parrot/__init__.py +27 -0
  101. parrot/__pycache__/__init__.cpython-310.pyc +0 -0
  102. parrot/__pycache__/version.cpython-310.pyc +0 -0
  103. parrot/_version.py +34 -0
  104. parrot/a2a/__init__.py +48 -0
  105. parrot/a2a/client.py +658 -0
  106. parrot/a2a/discovery.py +89 -0
  107. parrot/a2a/mixin.py +257 -0
  108. parrot/a2a/models.py +376 -0
  109. parrot/a2a/server.py +770 -0
  110. parrot/agents/__init__.py +29 -0
  111. parrot/bots/__init__.py +12 -0
  112. parrot/bots/a2a_agent.py +19 -0
  113. parrot/bots/abstract.py +3139 -0
  114. parrot/bots/agent.py +1129 -0
  115. parrot/bots/basic.py +9 -0
  116. parrot/bots/chatbot.py +669 -0
  117. parrot/bots/data.py +1618 -0
  118. parrot/bots/database/__init__.py +5 -0
  119. parrot/bots/database/abstract.py +3071 -0
  120. parrot/bots/database/cache.py +286 -0
  121. parrot/bots/database/models.py +468 -0
  122. parrot/bots/database/prompts.py +154 -0
  123. parrot/bots/database/retries.py +98 -0
  124. parrot/bots/database/router.py +269 -0
  125. parrot/bots/database/sql.py +41 -0
  126. parrot/bots/db/__init__.py +6 -0
  127. parrot/bots/db/abstract.py +556 -0
  128. parrot/bots/db/bigquery.py +602 -0
  129. parrot/bots/db/cache.py +85 -0
  130. parrot/bots/db/documentdb.py +668 -0
  131. parrot/bots/db/elastic.py +1014 -0
  132. parrot/bots/db/influx.py +898 -0
  133. parrot/bots/db/mock.py +96 -0
  134. parrot/bots/db/multi.py +783 -0
  135. parrot/bots/db/prompts.py +185 -0
  136. parrot/bots/db/sql.py +1255 -0
  137. parrot/bots/db/tools.py +212 -0
  138. parrot/bots/document.py +680 -0
  139. parrot/bots/hrbot.py +15 -0
  140. parrot/bots/kb.py +170 -0
  141. parrot/bots/mcp.py +36 -0
  142. parrot/bots/orchestration/README.md +463 -0
  143. parrot/bots/orchestration/__init__.py +1 -0
  144. parrot/bots/orchestration/agent.py +155 -0
  145. parrot/bots/orchestration/crew.py +3330 -0
  146. parrot/bots/orchestration/fsm.py +1179 -0
  147. parrot/bots/orchestration/hr.py +434 -0
  148. parrot/bots/orchestration/storage/__init__.py +4 -0
  149. parrot/bots/orchestration/storage/memory.py +100 -0
  150. parrot/bots/orchestration/storage/mixin.py +119 -0
  151. parrot/bots/orchestration/verify.py +202 -0
  152. parrot/bots/product.py +204 -0
  153. parrot/bots/prompts/__init__.py +96 -0
  154. parrot/bots/prompts/agents.py +155 -0
  155. parrot/bots/prompts/data.py +216 -0
  156. parrot/bots/prompts/output_generation.py +8 -0
  157. parrot/bots/scraper/__init__.py +3 -0
  158. parrot/bots/scraper/models.py +122 -0
  159. parrot/bots/scraper/scraper.py +1173 -0
  160. parrot/bots/scraper/templates.py +115 -0
  161. parrot/bots/stores/__init__.py +5 -0
  162. parrot/bots/stores/local.py +172 -0
  163. parrot/bots/webdev.py +81 -0
  164. parrot/cli.py +17 -0
  165. parrot/clients/__init__.py +16 -0
  166. parrot/clients/base.py +1491 -0
  167. parrot/clients/claude.py +1191 -0
  168. parrot/clients/factory.py +129 -0
  169. parrot/clients/google.py +4567 -0
  170. parrot/clients/gpt.py +1975 -0
  171. parrot/clients/grok.py +432 -0
  172. parrot/clients/groq.py +986 -0
  173. parrot/clients/hf.py +582 -0
  174. parrot/clients/models.py +18 -0
  175. parrot/conf.py +395 -0
  176. parrot/embeddings/__init__.py +9 -0
  177. parrot/embeddings/base.py +157 -0
  178. parrot/embeddings/google.py +98 -0
  179. parrot/embeddings/huggingface.py +74 -0
  180. parrot/embeddings/openai.py +84 -0
  181. parrot/embeddings/processor.py +88 -0
  182. parrot/exceptions.c +13868 -0
  183. parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
  184. parrot/exceptions.pxd +22 -0
  185. parrot/exceptions.pxi +15 -0
  186. parrot/exceptions.pyx +44 -0
  187. parrot/generators/__init__.py +29 -0
  188. parrot/generators/base.py +200 -0
  189. parrot/generators/html.py +293 -0
  190. parrot/generators/react.py +205 -0
  191. parrot/generators/streamlit.py +203 -0
  192. parrot/generators/template.py +105 -0
  193. parrot/handlers/__init__.py +4 -0
  194. parrot/handlers/agent.py +861 -0
  195. parrot/handlers/agents/__init__.py +1 -0
  196. parrot/handlers/agents/abstract.py +900 -0
  197. parrot/handlers/bots.py +338 -0
  198. parrot/handlers/chat.py +915 -0
  199. parrot/handlers/creation.sql +192 -0
  200. parrot/handlers/crew/ARCHITECTURE.md +362 -0
  201. parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
  202. parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
  203. parrot/handlers/crew/__init__.py +0 -0
  204. parrot/handlers/crew/handler.py +801 -0
  205. parrot/handlers/crew/models.py +229 -0
  206. parrot/handlers/crew/redis_persistence.py +523 -0
  207. parrot/handlers/jobs/__init__.py +10 -0
  208. parrot/handlers/jobs/job.py +384 -0
  209. parrot/handlers/jobs/mixin.py +627 -0
  210. parrot/handlers/jobs/models.py +115 -0
  211. parrot/handlers/jobs/worker.py +31 -0
  212. parrot/handlers/models.py +596 -0
  213. parrot/handlers/o365_auth.py +105 -0
  214. parrot/handlers/stream.py +337 -0
  215. parrot/interfaces/__init__.py +6 -0
  216. parrot/interfaces/aws.py +143 -0
  217. parrot/interfaces/credentials.py +113 -0
  218. parrot/interfaces/database.py +27 -0
  219. parrot/interfaces/google.py +1123 -0
  220. parrot/interfaces/hierarchy.py +1227 -0
  221. parrot/interfaces/http.py +651 -0
  222. parrot/interfaces/images/__init__.py +0 -0
  223. parrot/interfaces/images/plugins/__init__.py +24 -0
  224. parrot/interfaces/images/plugins/abstract.py +58 -0
  225. parrot/interfaces/images/plugins/analisys.py +148 -0
  226. parrot/interfaces/images/plugins/classify.py +150 -0
  227. parrot/interfaces/images/plugins/classifybase.py +182 -0
  228. parrot/interfaces/images/plugins/detect.py +150 -0
  229. parrot/interfaces/images/plugins/exif.py +1103 -0
  230. parrot/interfaces/images/plugins/hash.py +52 -0
  231. parrot/interfaces/images/plugins/vision.py +104 -0
  232. parrot/interfaces/images/plugins/yolo.py +66 -0
  233. parrot/interfaces/images/plugins/zerodetect.py +197 -0
  234. parrot/interfaces/o365.py +978 -0
  235. parrot/interfaces/onedrive.py +822 -0
  236. parrot/interfaces/sharepoint.py +1435 -0
  237. parrot/interfaces/soap.py +257 -0
  238. parrot/loaders/__init__.py +8 -0
  239. parrot/loaders/abstract.py +1131 -0
  240. parrot/loaders/audio.py +199 -0
  241. parrot/loaders/basepdf.py +53 -0
  242. parrot/loaders/basevideo.py +1568 -0
  243. parrot/loaders/csv.py +409 -0
  244. parrot/loaders/docx.py +116 -0
  245. parrot/loaders/epubloader.py +316 -0
  246. parrot/loaders/excel.py +199 -0
  247. parrot/loaders/factory.py +55 -0
  248. parrot/loaders/files/__init__.py +0 -0
  249. parrot/loaders/files/abstract.py +39 -0
  250. parrot/loaders/files/html.py +26 -0
  251. parrot/loaders/files/text.py +63 -0
  252. parrot/loaders/html.py +152 -0
  253. parrot/loaders/markdown.py +442 -0
  254. parrot/loaders/pdf.py +373 -0
  255. parrot/loaders/pdfmark.py +320 -0
  256. parrot/loaders/pdftables.py +506 -0
  257. parrot/loaders/ppt.py +476 -0
  258. parrot/loaders/qa.py +63 -0
  259. parrot/loaders/splitters/__init__.py +10 -0
  260. parrot/loaders/splitters/base.py +138 -0
  261. parrot/loaders/splitters/md.py +228 -0
  262. parrot/loaders/splitters/token.py +143 -0
  263. parrot/loaders/txt.py +26 -0
  264. parrot/loaders/video.py +89 -0
  265. parrot/loaders/videolocal.py +218 -0
  266. parrot/loaders/videounderstanding.py +377 -0
  267. parrot/loaders/vimeo.py +167 -0
  268. parrot/loaders/web.py +599 -0
  269. parrot/loaders/youtube.py +504 -0
  270. parrot/manager/__init__.py +5 -0
  271. parrot/manager/manager.py +1030 -0
  272. parrot/mcp/__init__.py +28 -0
  273. parrot/mcp/adapter.py +105 -0
  274. parrot/mcp/cli.py +174 -0
  275. parrot/mcp/client.py +119 -0
  276. parrot/mcp/config.py +75 -0
  277. parrot/mcp/integration.py +842 -0
  278. parrot/mcp/oauth.py +933 -0
  279. parrot/mcp/server.py +225 -0
  280. parrot/mcp/transports/__init__.py +3 -0
  281. parrot/mcp/transports/base.py +279 -0
  282. parrot/mcp/transports/grpc_session.py +163 -0
  283. parrot/mcp/transports/http.py +312 -0
  284. parrot/mcp/transports/mcp.proto +108 -0
  285. parrot/mcp/transports/quic.py +1082 -0
  286. parrot/mcp/transports/sse.py +330 -0
  287. parrot/mcp/transports/stdio.py +309 -0
  288. parrot/mcp/transports/unix.py +395 -0
  289. parrot/mcp/transports/websocket.py +547 -0
  290. parrot/memory/__init__.py +16 -0
  291. parrot/memory/abstract.py +209 -0
  292. parrot/memory/agent.py +32 -0
  293. parrot/memory/cache.py +175 -0
  294. parrot/memory/core.py +555 -0
  295. parrot/memory/file.py +153 -0
  296. parrot/memory/mem.py +131 -0
  297. parrot/memory/redis.py +613 -0
  298. parrot/models/__init__.py +46 -0
  299. parrot/models/basic.py +118 -0
  300. parrot/models/compliance.py +208 -0
  301. parrot/models/crew.py +395 -0
  302. parrot/models/detections.py +654 -0
  303. parrot/models/generation.py +85 -0
  304. parrot/models/google.py +223 -0
  305. parrot/models/groq.py +23 -0
  306. parrot/models/openai.py +30 -0
  307. parrot/models/outputs.py +285 -0
  308. parrot/models/responses.py +938 -0
  309. parrot/notifications/__init__.py +743 -0
  310. parrot/openapi/__init__.py +3 -0
  311. parrot/openapi/components.yaml +641 -0
  312. parrot/openapi/config.py +322 -0
  313. parrot/outputs/__init__.py +32 -0
  314. parrot/outputs/formats/__init__.py +108 -0
  315. parrot/outputs/formats/altair.py +359 -0
  316. parrot/outputs/formats/application.py +122 -0
  317. parrot/outputs/formats/base.py +351 -0
  318. parrot/outputs/formats/bokeh.py +356 -0
  319. parrot/outputs/formats/card.py +424 -0
  320. parrot/outputs/formats/chart.py +436 -0
  321. parrot/outputs/formats/d3.py +255 -0
  322. parrot/outputs/formats/echarts.py +310 -0
  323. parrot/outputs/formats/generators/__init__.py +0 -0
  324. parrot/outputs/formats/generators/abstract.py +61 -0
  325. parrot/outputs/formats/generators/panel.py +145 -0
  326. parrot/outputs/formats/generators/streamlit.py +86 -0
  327. parrot/outputs/formats/generators/terminal.py +63 -0
  328. parrot/outputs/formats/holoviews.py +310 -0
  329. parrot/outputs/formats/html.py +147 -0
  330. parrot/outputs/formats/jinja2.py +46 -0
  331. parrot/outputs/formats/json.py +87 -0
  332. parrot/outputs/formats/map.py +933 -0
  333. parrot/outputs/formats/markdown.py +172 -0
  334. parrot/outputs/formats/matplotlib.py +237 -0
  335. parrot/outputs/formats/mixins/__init__.py +0 -0
  336. parrot/outputs/formats/mixins/emaps.py +855 -0
  337. parrot/outputs/formats/plotly.py +341 -0
  338. parrot/outputs/formats/seaborn.py +310 -0
  339. parrot/outputs/formats/table.py +397 -0
  340. parrot/outputs/formats/template_report.py +138 -0
  341. parrot/outputs/formats/yaml.py +125 -0
  342. parrot/outputs/formatter.py +152 -0
  343. parrot/outputs/templates/__init__.py +95 -0
  344. parrot/pipelines/__init__.py +0 -0
  345. parrot/pipelines/abstract.py +210 -0
  346. parrot/pipelines/detector.py +124 -0
  347. parrot/pipelines/models.py +90 -0
  348. parrot/pipelines/planogram.py +3002 -0
  349. parrot/pipelines/table.sql +97 -0
  350. parrot/plugins/__init__.py +106 -0
  351. parrot/plugins/importer.py +80 -0
  352. parrot/py.typed +0 -0
  353. parrot/registry/__init__.py +18 -0
  354. parrot/registry/registry.py +594 -0
  355. parrot/scheduler/__init__.py +1189 -0
  356. parrot/scheduler/models.py +60 -0
  357. parrot/security/__init__.py +16 -0
  358. parrot/security/prompt_injection.py +268 -0
  359. parrot/security/security_events.sql +25 -0
  360. parrot/services/__init__.py +1 -0
  361. parrot/services/mcp/__init__.py +8 -0
  362. parrot/services/mcp/config.py +13 -0
  363. parrot/services/mcp/server.py +295 -0
  364. parrot/services/o365_remote_auth.py +235 -0
  365. parrot/stores/__init__.py +7 -0
  366. parrot/stores/abstract.py +352 -0
  367. parrot/stores/arango.py +1090 -0
  368. parrot/stores/bigquery.py +1377 -0
  369. parrot/stores/cache.py +106 -0
  370. parrot/stores/empty.py +10 -0
  371. parrot/stores/faiss_store.py +1157 -0
  372. parrot/stores/kb/__init__.py +9 -0
  373. parrot/stores/kb/abstract.py +68 -0
  374. parrot/stores/kb/cache.py +165 -0
  375. parrot/stores/kb/doc.py +325 -0
  376. parrot/stores/kb/hierarchy.py +346 -0
  377. parrot/stores/kb/local.py +457 -0
  378. parrot/stores/kb/prompt.py +28 -0
  379. parrot/stores/kb/redis.py +659 -0
  380. parrot/stores/kb/store.py +115 -0
  381. parrot/stores/kb/user.py +374 -0
  382. parrot/stores/models.py +59 -0
  383. parrot/stores/pgvector.py +3 -0
  384. parrot/stores/postgres.py +2853 -0
  385. parrot/stores/utils/__init__.py +0 -0
  386. parrot/stores/utils/chunking.py +197 -0
  387. parrot/telemetry/__init__.py +3 -0
  388. parrot/telemetry/mixin.py +111 -0
  389. parrot/template/__init__.py +3 -0
  390. parrot/template/engine.py +259 -0
  391. parrot/tools/__init__.py +23 -0
  392. parrot/tools/abstract.py +644 -0
  393. parrot/tools/agent.py +363 -0
  394. parrot/tools/arangodbsearch.py +537 -0
  395. parrot/tools/arxiv_tool.py +188 -0
  396. parrot/tools/calculator/__init__.py +3 -0
  397. parrot/tools/calculator/operations/__init__.py +38 -0
  398. parrot/tools/calculator/operations/calculus.py +80 -0
  399. parrot/tools/calculator/operations/statistics.py +76 -0
  400. parrot/tools/calculator/tool.py +150 -0
  401. parrot/tools/cloudwatch.py +988 -0
  402. parrot/tools/codeinterpreter/__init__.py +127 -0
  403. parrot/tools/codeinterpreter/executor.py +371 -0
  404. parrot/tools/codeinterpreter/internals.py +473 -0
  405. parrot/tools/codeinterpreter/models.py +643 -0
  406. parrot/tools/codeinterpreter/prompts.py +224 -0
  407. parrot/tools/codeinterpreter/tool.py +664 -0
  408. parrot/tools/company_info/__init__.py +6 -0
  409. parrot/tools/company_info/tool.py +1138 -0
  410. parrot/tools/correlationanalysis.py +437 -0
  411. parrot/tools/database/abstract.py +286 -0
  412. parrot/tools/database/bq.py +115 -0
  413. parrot/tools/database/cache.py +284 -0
  414. parrot/tools/database/models.py +95 -0
  415. parrot/tools/database/pg.py +343 -0
  416. parrot/tools/databasequery.py +1159 -0
  417. parrot/tools/db.py +1800 -0
  418. parrot/tools/ddgo.py +370 -0
  419. parrot/tools/decorators.py +271 -0
  420. parrot/tools/dftohtml.py +282 -0
  421. parrot/tools/document.py +549 -0
  422. parrot/tools/ecs.py +819 -0
  423. parrot/tools/edareport.py +368 -0
  424. parrot/tools/elasticsearch.py +1049 -0
  425. parrot/tools/employees.py +462 -0
  426. parrot/tools/epson/__init__.py +96 -0
  427. parrot/tools/excel.py +683 -0
  428. parrot/tools/file/__init__.py +13 -0
  429. parrot/tools/file/abstract.py +76 -0
  430. parrot/tools/file/gcs.py +378 -0
  431. parrot/tools/file/local.py +284 -0
  432. parrot/tools/file/s3.py +511 -0
  433. parrot/tools/file/tmp.py +309 -0
  434. parrot/tools/file/tool.py +501 -0
  435. parrot/tools/file_reader.py +129 -0
  436. parrot/tools/flowtask/__init__.py +19 -0
  437. parrot/tools/flowtask/tool.py +761 -0
  438. parrot/tools/gittoolkit.py +508 -0
  439. parrot/tools/google/__init__.py +18 -0
  440. parrot/tools/google/base.py +169 -0
  441. parrot/tools/google/tools.py +1251 -0
  442. parrot/tools/googlelocation.py +5 -0
  443. parrot/tools/googleroutes.py +5 -0
  444. parrot/tools/googlesearch.py +5 -0
  445. parrot/tools/googlesitesearch.py +5 -0
  446. parrot/tools/googlevoice.py +2 -0
  447. parrot/tools/gvoice.py +695 -0
  448. parrot/tools/ibisworld/README.md +225 -0
  449. parrot/tools/ibisworld/__init__.py +11 -0
  450. parrot/tools/ibisworld/tool.py +366 -0
  451. parrot/tools/jiratoolkit.py +1718 -0
  452. parrot/tools/manager.py +1098 -0
  453. parrot/tools/math.py +152 -0
  454. parrot/tools/metadata.py +476 -0
  455. parrot/tools/msteams.py +1621 -0
  456. parrot/tools/msword.py +635 -0
  457. parrot/tools/multidb.py +580 -0
  458. parrot/tools/multistoresearch.py +369 -0
  459. parrot/tools/networkninja.py +167 -0
  460. parrot/tools/nextstop/__init__.py +4 -0
  461. parrot/tools/nextstop/base.py +286 -0
  462. parrot/tools/nextstop/employee.py +733 -0
  463. parrot/tools/nextstop/store.py +462 -0
  464. parrot/tools/notification.py +435 -0
  465. parrot/tools/o365/__init__.py +42 -0
  466. parrot/tools/o365/base.py +295 -0
  467. parrot/tools/o365/bundle.py +522 -0
  468. parrot/tools/o365/events.py +554 -0
  469. parrot/tools/o365/mail.py +992 -0
  470. parrot/tools/o365/onedrive.py +497 -0
  471. parrot/tools/o365/sharepoint.py +641 -0
  472. parrot/tools/openapi_toolkit.py +904 -0
  473. parrot/tools/openweather.py +527 -0
  474. parrot/tools/pdfprint.py +1001 -0
  475. parrot/tools/powerbi.py +518 -0
  476. parrot/tools/powerpoint.py +1113 -0
  477. parrot/tools/pricestool.py +146 -0
  478. parrot/tools/products/__init__.py +246 -0
  479. parrot/tools/prophet_tool.py +171 -0
  480. parrot/tools/pythonpandas.py +630 -0
  481. parrot/tools/pythonrepl.py +910 -0
  482. parrot/tools/qsource.py +436 -0
  483. parrot/tools/querytoolkit.py +395 -0
  484. parrot/tools/quickeda.py +827 -0
  485. parrot/tools/resttool.py +553 -0
  486. parrot/tools/retail/__init__.py +0 -0
  487. parrot/tools/retail/bby.py +528 -0
  488. parrot/tools/sandboxtool.py +703 -0
  489. parrot/tools/sassie/__init__.py +352 -0
  490. parrot/tools/scraping/__init__.py +7 -0
  491. parrot/tools/scraping/docs/select.md +466 -0
  492. parrot/tools/scraping/documentation.md +1278 -0
  493. parrot/tools/scraping/driver.py +436 -0
  494. parrot/tools/scraping/models.py +576 -0
  495. parrot/tools/scraping/options.py +85 -0
  496. parrot/tools/scraping/orchestrator.py +517 -0
  497. parrot/tools/scraping/readme.md +740 -0
  498. parrot/tools/scraping/tool.py +3115 -0
  499. parrot/tools/seasonaldetection.py +642 -0
  500. parrot/tools/shell_tool/__init__.py +5 -0
  501. parrot/tools/shell_tool/actions.py +408 -0
  502. parrot/tools/shell_tool/engine.py +155 -0
  503. parrot/tools/shell_tool/models.py +322 -0
  504. parrot/tools/shell_tool/tool.py +442 -0
  505. parrot/tools/site_search.py +214 -0
  506. parrot/tools/textfile.py +418 -0
  507. parrot/tools/think.py +378 -0
  508. parrot/tools/toolkit.py +298 -0
  509. parrot/tools/webapp_tool.py +187 -0
  510. parrot/tools/whatif.py +1279 -0
  511. parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
  512. parrot/tools/workday/__init__.py +6 -0
  513. parrot/tools/workday/models.py +1389 -0
  514. parrot/tools/workday/tool.py +1293 -0
  515. parrot/tools/yfinance_tool.py +306 -0
  516. parrot/tools/zipcode.py +217 -0
  517. parrot/utils/__init__.py +2 -0
  518. parrot/utils/helpers.py +73 -0
  519. parrot/utils/parsers/__init__.py +5 -0
  520. parrot/utils/parsers/toml.c +12078 -0
  521. parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
  522. parrot/utils/parsers/toml.pyx +21 -0
  523. parrot/utils/toml.py +11 -0
  524. parrot/utils/types.cpp +20936 -0
  525. parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
  526. parrot/utils/types.pyx +213 -0
  527. parrot/utils/uv.py +11 -0
  528. parrot/version.py +10 -0
  529. parrot/yaml-rs/Cargo.lock +350 -0
  530. parrot/yaml-rs/Cargo.toml +19 -0
  531. parrot/yaml-rs/pyproject.toml +19 -0
  532. parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
  533. parrot/yaml-rs/src/lib.rs +222 -0
  534. requirements/docker-compose.yml +24 -0
  535. requirements/requirements-dev.txt +21 -0
parrot/loaders/ppt.py ADDED
@@ -0,0 +1,476 @@
1
+ from typing import List, Union, Optional, Literal
2
+ from pathlib import PurePath
3
+ from collections.abc import Callable
4
+ import re
5
+ from ..stores.models import Document
6
+ from .abstract import AbstractLoader
7
+
8
+ # Optional dependencies
9
+ try:
10
+ from markitdown import MarkItDown
11
+ MARKITDOWN_AVAILABLE = True
12
+ except ImportError:
13
+ MARKITDOWN_AVAILABLE = False
14
+
15
+ try:
16
+ from pptx import Presentation
17
+ PPTX_AVAILABLE = True
18
+ except ImportError:
19
+ PPTX_AVAILABLE = False
20
+
21
+
22
+ class PowerPointLoader(AbstractLoader):
23
+ """
24
+ Enhanced PowerPoint loader with multiple backends.
25
+
26
+ Supports:
27
+ 1. MarkItDown backend for rich markdown extraction (primary)
28
+ 2. python-pptx backend for detailed control and fallback
29
+
30
+ Features:
31
+ - Slide-by-slide processing with proper markdown formatting
32
+ - Automatic slide title detection
33
+ - Bullet point preservation
34
+ - Slide notes extraction
35
+ - Image-only slide detection and filtering
36
+ - Configurable output formats
37
+ """
38
+
39
+ extensions: List[str] = ['.pptx', '.ppt']
40
+
41
def __init__(
    self,
    source: Optional[Union[str, PurePath, List[PurePath]]] = None,
    *,
    tokenizer: Union[str, Callable] = None,
    text_splitter: Union[str, Callable] = None,
    source_type: str = 'file',
    backend: str = "auto",  # "markitdown", "pptx", "auto"
    output_format: Literal["markdown", "plain"] = "markdown",
    skip_image_only_slides: bool = True,
    skip_empty_slides: bool = True,
    extract_slide_notes: bool = True,
    preserve_slide_structure: bool = True,
    min_slide_content_length: int = 10,
    clean_whitespace: bool = True,
    merge_consecutive_headers: bool = True,
    **kwargs
):
    """
    Configure the loader and initialize its processing backend.

    ``source``, ``tokenizer``, ``text_splitter``, ``source_type`` and any
    extra ``kwargs`` are forwarded unchanged to the parent initializer.
    The remaining keyword-only options are stored on the instance under
    the same names; this method does not interpret them, except for
    ``backend``, which is resolved through ``_select_backend``.

    Args:
        source: File path(s) to load.
        tokenizer: Tokenizer name or callable, passed to the parent.
        text_splitter: Splitter name or callable, passed to the parent.
        source_type: Source kind, passed to the parent.
        backend: Requested backend ("markitdown", "pptx" or "auto").
        output_format: "markdown" or "plain".
        skip_image_only_slides: Stored processing flag.
        skip_empty_slides: Stored processing flag.
        extract_slide_notes: Stored processing flag.
        preserve_slide_structure: Stored processing flag.
        min_slide_content_length: Minimum section length kept as a slide.
        clean_whitespace: Whether ``_clean_content`` collapses whitespace.
        merge_consecutive_headers: Stored processing flag.
    """
    super().__init__(
        source,
        tokenizer=tokenizer,
        text_splitter=text_splitter,
        source_type=source_type,
        **kwargs
    )

    # Resolve the backend right after the parent initializer has run,
    # because selection may log through self.logger.
    self.backend = self._select_backend(backend)
    self.output_format = output_format

    # Slide filtering / processing switches
    self.skip_image_only_slides = skip_image_only_slides
    self.skip_empty_slides = skip_empty_slides
    self.extract_slide_notes = extract_slide_notes
    self.preserve_slide_structure = preserve_slide_structure
    self.min_slide_content_length = min_slide_content_length

    # Text post-processing switches
    self.clean_whitespace = clean_whitespace
    self.merge_consecutive_headers = merge_consecutive_headers

    # Backend objects are created last, once self.backend is known.
    self._setup_backend()
95
+
96
def _select_backend(self, preferred: str) -> str:
    """
    Resolve the backend name to use for PowerPoint processing.

    Args:
        preferred: "markitdown", "pptx" or "auto".

    Returns:
        "markitdown" or "pptx".

    Raises:
        ImportError: when neither optional library could be imported.
    """
    # An explicit request is honoured only if its library is importable.
    if preferred == "markitdown" and MARKITDOWN_AVAILABLE:
        return "markitdown"
    if preferred == "pptx" and PPTX_AVAILABLE:
        return "pptx"

    if preferred != "auto":
        # Unknown or unavailable choice: warn, then retry in "auto" mode.
        self.logger.warning(
            f"Backend '{preferred}' not available, falling back"
        )
        return self._select_backend("auto")

    # "auto": MarkItDown is tried first, python-pptx second.
    if MARKITDOWN_AVAILABLE:
        return "markitdown"
    if PPTX_AVAILABLE:
        return "pptx"
    raise ImportError(
        "No PowerPoint processing backend available. Install 'markitdown' or 'python-pptx'"
    )
116
+
117
def _setup_backend(self):
    """
    Prepare whatever the selected backend needs and log the choice.

    For the "markitdown" backend a ``MarkItDown`` converter is created and
    stored as ``self.md_converter``; any other backend value only logs
    that python-pptx is in use.
    """
    if self.backend != "markitdown":
        self.logger.info("Using python-pptx backend for PowerPoint processing")
        return

    self.md_converter = MarkItDown()
    self.logger.info("Using MarkItDown backend for PowerPoint processing")
124
+
125
def _clean_content(self, content: str) -> str:
    """
    Normalize whitespace in ``content``.

    Falsy input yields "". When ``self.clean_whitespace`` is set, every
    line has its runs of whitespace collapsed to single spaces (leading
    and trailing whitespace on the line is dropped) while line breaks are
    kept. The result is always stripped at both ends.
    """
    if not content:
        return ""

    if self.clean_whitespace:
        # Work line by line so newlines survive the collapse.
        content = '\n'.join(
            ' '.join(row.split()) for row in content.split('\n')
        )

    return content.strip()
140
+
141
def _extract_slides_from_markdown(self, markdown_content: str) -> List[dict]:
    """
    Split MarkItDown markdown into per-slide dictionaries.

    Separator patterns are tried in order; the first one that actually
    splits the text into more than one non-empty piece is used. If none
    does, the whole (unstripped) text is treated as a single section.

    Sections shorter than ``self.min_slide_content_length`` are dropped,
    but ``slide_number`` still reflects the section's position.

    Returns:
        A list of dicts with keys ``slide_number``, ``title``,
        ``content``, ``full_content`` and ``has_title``.
    """
    separators = (
        r'\n(?=#{1,2}\s)',   # Level 1-2 headers (typical slide titles)
        r'\n---+\n',         # Horizontal rules
        r'\n\*{3,}\n',       # Multiple asterisks
        r'(?:\n\s*){3,}',    # Multiple blank lines
    )

    sections = [markdown_content]
    for separator in separators:
        pieces = [
            piece.strip()
            for piece in re.split(separator, markdown_content)
            if piece.strip()
        ]
        # Only accept a separator that produced a real split.
        if len(pieces) > 1:
            sections = pieces
            break

    slides = []
    for number, section in enumerate(sections, start=1):
        if len(section) < self.min_slide_content_length:
            continue

        # A title is a 1-3 level header at the very start of the section.
        header = re.match(r'^(#{1,3})\s*(.+)$', section, re.MULTILINE)
        if header is None:
            title = f"Slide {number}"
            body = section.strip()
        else:
            title = header.group(2)
            # Body is whatever follows the header line, if anything.
            line_end = section.find('\n', header.end())
            body = "" if line_end == -1 else section[line_end:].strip()

        slides.append({
            "slide_number": number,
            "title": title,
            "content": body,
            "full_content": section,
            "has_title": header is not None
        })

    return slides
190
+
191
def _process_markitdown_content(self, path: Union[str, PurePath]) -> List[dict]:
    """
    Convert a PowerPoint file with MarkItDown and split it into slides.

    Any exception raised during conversion or splitting is logged and
    turned into an empty list, as is an empty conversion result.

    Args:
        path: Location of the presentation; passed to the converter as str.

    Returns:
        The slide dictionaries from ``_extract_slides_from_markdown``,
        or ``[]`` on empty output or failure.
    """
    try:
        converted = self.md_converter.convert(str(path))
        if not (converted and converted.text_content):
            self.logger.warning("MarkItDown returned empty content")
            return []

        extracted = self._extract_slides_from_markdown(converted.text_content)
        self.logger.info(f"MarkItDown extracted {len(extracted)} slides")
        return extracted
    except Exception as e:
        # Best-effort: report the failure and let the caller see "no slides".
        self.logger.error(f"MarkItDown processing failed: {e}")
        return []
208
+
209
+ # Original python-pptx methods (preserved as fallback)
210
def extract_slide_text(self, slide):
    """
    Return the text of every shape on ``slide`` as one string.

    Shapes without a ``text`` attribute, or whose text is only
    whitespace, are ignored. Each remaining text is stripped and the
    pieces are joined with a blank line between them.
    """
    pieces = [
        shape.text.strip()
        for shape in slide.shapes
        if hasattr(shape, "text") and shape.text.strip()
    ]
    return "\n\n".join(pieces).strip()
217
+
218
def slide_has_text(self, slide) -> bool:
    """
    Report whether any shape on ``slide`` carries non-whitespace text.

    Shapes lacking a ``text`` attribute are skipped.
    """
    return any(
        hasattr(shape, "text") and bool(shape.text.strip())
        for shape in slide.shapes
    )
224
+
225
def slide_has_images_only(self, slide) -> bool:
    """
    Return True when ``slide`` has at least one picture and no text.

    The scan stops with False at the first shape that has
    non-whitespace text; otherwise the result is whether a picture
    shape was seen.
    """
    picture_type = 13  # PICTURE shape type in python-pptx
    found_picture = False

    for shape in slide.shapes:
        # shape_type is read for every shape, before its text is checked.
        if shape.shape_type == picture_type:
            found_picture = True
        if hasattr(shape, "text") and shape.text.strip():
            return False

    return found_picture
234
+
235
def _extract_slide_title(self, slide) -> str:
    """Extract a title for a python-pptx slide object.

    The title placeholder is preferred. When it is missing, blank, or cannot
    be read, the first shape carrying non-blank text is used instead, but only
    if that text looks like a title (fewer than 100 characters, single line).

    Args:
        slide: Slide object exposing ``shapes`` (and ``shapes.title``).

    Returns:
        The stripped title text, or an empty string when nothing suitable
        is found.
    """
    # Try to get title from title placeholder
    try:
        placeholder = slide.shapes.title
        if placeholder and placeholder.text.strip():
            return placeholder.text.strip()
    except Exception:
        # Best effort only: fall through to the heuristic below. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate instead of being silently swallowed.
        pass

    # Look for first text shape that looks like a title
    for shape in slide.shapes:
        if hasattr(shape, "text") and shape.text.strip():
            text = shape.text.strip()
            # Simple heuristic: short text, single line, likely a title
            if len(text) < 100 and '\n' not in text:
                return text
            # Only the first text-bearing shape is ever considered.
            break

    return ""
254
+
255
def _format_slide_as_markdown(self, slide_data: dict, slide_text: str, slide_notes: str = "") -> str:
    """Format slide content as markdown.

    Args:
        slide_data: Slide dictionary; reads ``title``, ``has_title``,
            ``slide_number`` and ``full_content``.
        slide_text: Body text of the slide.
        slide_notes: Speaker notes; emitted only when ``self.extract_slide_notes``
            is truthy.

    Returns:
        The heading, body and optional ``## Notes`` section joined by blank lines.
    """
    markdown_parts = []

    # Heading: the real title if there is one, otherwise a numbered placeholder
    # (unless the slide claims to have a title that is simply empty).
    if slide_data.get("title"):
        markdown_parts.append(f"# {slide_data['title']}")
    elif not slide_data.get("has_title", False):
        markdown_parts.append(f"# Slide {slide_data['slide_number']}")

    if slide_text:
        # Plain text is only converted when no ready-made markdown exists.
        if self.output_format == "markdown" and not slide_data.get("full_content"):
            content_lines = []
            for raw_line in slide_text.split('\n'):
                line = raw_line.strip()
                if not line:
                    continue
                if line.startswith(('•', '-')):
                    # Normalise existing bullet markers to "- ".
                    content_lines.append(f"- {line[1:].strip()}")
                elif raw_line.startswith((' ', '\t')):
                    # Indentation has to be tested on the unstripped line;
                    # testing the stripped one made this branch unreachable.
                    content_lines.append(f"- {line}")
                else:
                    content_lines.append(line)
            markdown_parts.append('\n'.join(content_lines))
        else:
            markdown_parts.append(slide_text)

    if slide_notes and self.extract_slide_notes:
        markdown_parts.append("## Notes")
        markdown_parts.append(slide_notes)

    return '\n\n'.join(markdown_parts)
291
+
292
def _process_pptx_content(self, path: Union[str, PurePath]) -> List[dict]:
    """Read a presentation with python-pptx and describe each usable slide.

    Args:
        path: Location of the PowerPoint file; passed to ``Presentation`` as ``str``.

    Returns:
        One dictionary per retained slide with the keys ``slide_number``,
        ``slide_id``, ``title``, ``content``, ``notes`` and ``has_title``.
        An empty list is returned when reading fails (the error is logged).

    Raises:
        ImportError: when python-pptx is not available.
    """
    if not PPTX_AVAILABLE:
        raise ImportError("python-pptx not available for fallback processing")

    try:
        presentation = Presentation(str(path))
        total = len(presentation.slides)
        collected = []

        for index, slide in enumerate(presentation.slides):
            number = index + 1

            # Picture-only slides are dropped when the loader is configured so.
            if self.skip_image_only_slides and self.slide_has_images_only(slide):
                self.logger.debug(f"Slide {number}/{total}: only images, skipping.")
                continue

            body = self.extract_slide_text(slide)

            # Slides with no text, or less text than the configured minimum,
            # are dropped when empty-slide skipping is enabled.
            if self.skip_empty_slides and (not body or len(body) < self.min_slide_content_length):
                self.logger.debug(f"Slide {number}/{total}: no sufficient text content, skipping.")
                continue

            heading = self._extract_slide_title(slide)

            # Speaker notes are read only on request and only if the slide has them.
            notes = ""
            if self.extract_slide_notes and slide.has_notes_slide and slide.notes_slide.notes_text_frame:
                notes = slide.notes_slide.notes_text_frame.text.strip()

            collected.append({
                "slide_number": number,
                "slide_id": slide.slide_id,
                "title": heading,
                "content": body,
                "notes": notes,
                "has_title": bool(heading),
            })

        self.logger.info(f"python-pptx extracted {len(collected)} slides from {total} total slides")
        return collected

    except Exception as e:
        self.logger.error(f"python-pptx processing failed: {e}")
        return []
339
+
340
async def _load(self, path: Union[str, PurePath, List[PurePath]], **kwargs) -> List[Document]:
    """
    Turn a PowerPoint presentation into one Document per usable slide.

    The configured backend is tried first; when it is "markitdown" and yields
    no slides, python-pptx is used as a fallback if it is available.

    Args:
        path: Path to the PowerPoint file

    Returns:
        List of Document objects, one per slide whose cleaned content reaches
        ``min_slide_content_length``; empty when no slide could be extracted.
    """
    self.logger.info(f"Loading PowerPoint file: {path}")
    docs = []

    markitdown_selected = self.backend == "markitdown"
    if markitdown_selected:
        slides_data = self._process_markitdown_content(path)
        if not slides_data and PPTX_AVAILABLE:
            self.logger.info("MarkItDown failed or returned no slides, falling back to python-pptx")
            slides_data = self._process_pptx_content(path)
    else:
        slides_data = self._process_pptx_content(path)

    if not slides_data:
        self.logger.warning(f"No slides extracted from {path}")
        return docs

    for slide_data in slides_data:
        if self.output_format == "markdown":
            if markitdown_selected and slide_data.get("full_content"):
                # MarkItDown already produced markdown for this slide.
                content = slide_data["full_content"]
            else:
                content = self._format_slide_as_markdown(
                    slide_data,
                    slide_data.get("content", ""),
                    slide_data.get("notes", "")
                )
        else:
            # Plain text: labelled title, body, then labelled notes.
            pieces = []
            if slide_data.get("title"):
                pieces.append(f"Title: {slide_data['title']}")
            if slide_data.get("content"):
                pieces.append(slide_data["content"])
            if slide_data.get("notes") and self.extract_slide_notes:
                pieces.append(f"Notes: {slide_data['notes']}")
            content = "\n\n".join(pieces)

        content = self._clean_content(content)

        # Slides that end up empty or too short produce no document.
        if not content or len(content) < self.min_slide_content_length:
            continue

        slide_meta = {
            "slide_number": slide_data["slide_number"],
            "slide_title": slide_data.get("title", ""),
            "has_notes": bool(slide_data.get("notes", "")),
            "content_length": len(content),
        }
        # Only the python-pptx backend supplies a slide id.
        if "slide_id" in slide_data:
            slide_meta["slide_id"] = slide_data["slide_id"]

        metadata = self.create_metadata(
            path=path,
            doctype="pptx",
            source_type="powerpoint",
            doc_metadata={
                **slide_meta,
                "extraction_backend": self.backend,
                "output_format": self.output_format,
            },
        )

        if self.preserve_slide_structure:
            # Prefix the content with a small header describing its origin.
            file_name = path.name if hasattr(path, 'name') else str(path).split('/')[-1]
            header_lines = [
                f"File Name: {file_name}",
                f"Slide Number: {slide_data['slide_number']}",
                "Document Type: pptx",
                "Source Type: powerpoint",
            ]
            if slide_data.get("slide_id"):
                header_lines.append(f"Slide ID: {slide_data['slide_id']}")
            full_content = "\n".join(header_lines) + "\n======\n\n" + content
        else:
            full_content = content

        docs.append(
            self.create_document(
                content=full_content,
                path=path,
                metadata=metadata
            )
        )

    self.logger.info(f"Created {len(docs)} documents from PowerPoint slides")
    return docs
451
+
452
def get_supported_backends(self) -> List[str]:
    """List the extraction backends whose availability flag is set.

    "markitdown" is listed before "pptx" when both are available.
    """
    candidates = (
        ("markitdown", MARKITDOWN_AVAILABLE),
        ("pptx", PPTX_AVAILABLE),
    )
    return [name for name, available in candidates if available]
462
+
463
def get_backend_info(self) -> dict:
    """Describe the loader's backend configuration.

    Returns:
        A dictionary with the selected backend, the backends reported by
        ``get_supported_backends``, the output format, and a ``settings``
        sub-dictionary mirroring the slide-handling options.
    """
    option_names = (
        "skip_image_only_slides",
        "skip_empty_slides",
        "extract_slide_notes",
        "preserve_slide_structure",
        "min_slide_content_length",
    )
    settings = {option: getattr(self, option) for option in option_names}

    info = {"current_backend": self.backend}
    info["available_backends"] = self.get_supported_backends()
    info["output_format"] = self.output_format
    info["settings"] = settings
    return info
parrot/loaders/qa.py ADDED
@@ -0,0 +1,63 @@
1
+
2
+ from pathlib import PurePath
3
+ from typing import List
4
+ import pandas as pd
5
+ from ..stores.models import Document
6
+ from .abstract import AbstractLoader
7
+
8
+
9
class QAFileLoader(AbstractLoader):
    """
    Question-and-answer loader: an Excel sheet converted to Parrot Documents.

    Every row of the sheet becomes one Document whose text combines the
    question and the answer found in the configured columns.

    Keyword arguments consumed by the constructor:
        columns: two-item sequence ``[question, answer]`` of column names
            (default ``['Question', 'Answer']``).
        question_column: name of the question column; overrides ``columns[0]``.
        answer_column: name of the answer column; overrides ``columns[1]``.
        doctype: document type recorded in the metadata (default ``'qa'``).
    """
    extensions: List[str] = ['.xlsx']
    chunk_size = 1024
    _source_type = 'QA-File'

    def __init__(
        self,
        *args,
        **kwargs
    ):
        self._columns = kwargs.pop('columns', ['Question', 'Answer'])
        # ``columns`` supplies the defaults so existing callers behave as
        # before; the explicit per-column options, previously stored but never
        # read, now take precedence when given.
        self._question_col = kwargs.pop('question_column', self._columns[0])
        self._answer_col = kwargs.pop('answer_column', self._columns[1])
        self.doctype = kwargs.pop('doctype', 'qa')
        super().__init__(*args, **kwargs)

    async def _load(self, path: PurePath, **kwargs) -> List[Document]:
        """Read the workbook at ``path`` and build one Document per row.

        Args:
            path: Location of the ``.xlsx`` file.

        Returns:
            List of Documents in sheet order.

        Raises:
            ValueError: when the question or answer column is missing.
        """
        df = pd.read_excel(path, header=0, engine='openpyxl')
        # Strip spaces from column names; non-string headers are kept as-is
        # (the ``.str`` accessor cannot process them).
        df.columns = [
            col.strip() if isinstance(col, str) else col
            for col in df.columns
        ]
        q = self._question_col
        a = self._answer_col
        docs = []
        # Both columns must exist before any row is processed.
        if q not in df.columns or a not in df.columns:
            raise ValueError(
                f"Columns {q} and {a} must be present in the DataFrame."
            )
        for idx, row in df.iterrows():
            qs = row[q]
            answer = row[a]
            document_meta = {
                "question": qs,
                "answer": answer,
            }
            metadata = self.create_metadata(
                path=path,
                doctype=self.doctype,
                source_type=self._source_type,
                doc_metadata=document_meta,
                type="FAQ",
                question=qs,
                answer=answer,
            )
            doc = Document(
                page_content=f"{idx}. Question: {qs}: Answer: {answer}",
                metadata=metadata,
            )
            docs.append(doc)
        return docs
@@ -0,0 +1,10 @@
1
+ from .base import BaseTextSplitter
2
+ from .md import MarkdownTextSplitter
3
+ from .token import TokenTextSplitter
4
+
5
+
6
# Names re-exported as the public interface of this package.
__all__ = ('BaseTextSplitter', 'MarkdownTextSplitter', 'TokenTextSplitter')
@@ -0,0 +1,138 @@
1
+ import re
2
+ import uuid
3
+ from abc import ABC, abstractmethod
4
+ from typing import List, Dict, Any, Optional
5
+ from dataclasses import dataclass
6
+
7
+
8
@dataclass
class TextChunk:
    """Represents a chunk of text with metadata"""
    # The chunk's own text.
    text: str
    # Offsets of the chunk inside the text it was split from.
    start_position: int
    end_position: int
    # Size of the chunk as measured by the splitter's token counter.
    token_count: int
    # Caller metadata merged with per-chunk bookkeeping.
    metadata: Dict[str, Any]
    # Identifier assigned when the chunk is created; optional.
    chunk_id: Optional[str] = None


class BaseTextSplitter(ABC):
    """Base class for all text splitters.

    Subclasses provide ``split_text`` (how text is cut) and ``_count_tokens``
    (how size is measured). This class supplies chunk construction with
    position metadata and a size-bounded merge helper with overlap.
    """

    def __init__(
        self,
        chunk_size: int = 4000,
        chunk_overlap: int = 200,
        keep_separator: bool = True,
        add_start_index: bool = True
    ):
        """Store the splitter configuration.

        Args:
            chunk_size: Upper bound used by ``_merge_splits``, in the unit
                returned by ``_count_tokens``.
            chunk_overlap: Amount of trailing content repeated at the start
                of the next merged chunk, in the same unit.
            keep_separator: Stored for subclasses; not read in this class.
            add_start_index: When true, ``create_chunks`` records
                ``start_index``/``end_index`` in each chunk's metadata.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.keep_separator = keep_separator
        self.add_start_index = add_start_index

    @abstractmethod
    def split_text(self, text: str) -> List[str]:
        """Split text into chunks"""
        pass

    def create_chunks(
        self,
        text: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> List[TextChunk]:
        """Create TextChunk objects with metadata.

        Args:
            text: The text to split.
            metadata: Optional metadata copied into every chunk.

        Returns:
            One ``TextChunk`` per piece returned by ``split_text``, each with
            its located position in ``text``, token count and a generated id.
        """
        text_chunks = self.split_text(text)
        total_chunks = len(text_chunks)
        chunks = []
        current_position = 0

        for i, chunk_text in enumerate(text_chunks):
            # Find the actual position in the original text
            start_pos = text.find(chunk_text, current_position)
            if start_pos == -1:
                # Chunk text not found verbatim: fall back to the search offset.
                start_pos = current_position

            end_pos = start_pos + len(chunk_text)

            chunk_metadata = {
                **(metadata or {}),
                'chunk_index': i,
                'total_chunks': total_chunks,
                'splitter_type': self.__class__.__name__
            }

            if self.add_start_index:
                chunk_metadata['start_index'] = start_pos
                chunk_metadata['end_index'] = end_pos

            chunks.append(
                TextChunk(
                    text=chunk_text,
                    start_position=start_pos,
                    end_position=end_pos,
                    token_count=self._count_tokens(chunk_text),
                    metadata=chunk_metadata,
                    chunk_id=f"chunk_{i:04d}_{uuid.uuid4().hex[:8]}"
                )
            )

            # Resume the search inside the overlap region, but never at or
            # before the start of a non-empty chunk. Without the lower bound a
            # chunk shorter than chunk_overlap made the offset go backwards or
            # negative, and str.find treats a negative start as an offset from
            # the END of the text, yielding wrong or negative positions.
            earliest_next = start_pos + 1 if chunk_text else start_pos
            current_position = max(earliest_next, end_pos - self.chunk_overlap)

        return chunks

    @abstractmethod
    def _count_tokens(self, text: str) -> int:
        """Count tokens in text"""
        pass

    def _merge_splits(self, splits: List[str], separator: str) -> List[str]:
        """Merge splits with overlap handling.

        Splits are accumulated until adding the next one would exceed
        ``chunk_size``; the accumulated group is then joined with
        ``separator`` and a new group is started from the overlap tail of
        the previous one. Separator length is not counted towards the size.

        Args:
            splits: Pieces to merge, in order.
            separator: String placed between pieces of the same group.

        Returns:
            The non-empty joined groups.
        """
        if not splits:
            return []

        docs = []
        current_doc = []
        current_length = 0

        for split in splits:
            split_len = self._count_tokens(split)

            if current_length + split_len > self.chunk_size and current_doc:
                # Emit the group gathered so far.
                doc = separator.join(current_doc)
                if doc:
                    docs.append(doc)

                # Start the next group with the overlap tail plus this split.
                overlap_splits = self._get_overlap_splits(current_doc, separator)
                current_doc = overlap_splits + [split]
                current_length = sum(self._count_tokens(s) for s in current_doc)
            else:
                current_doc.append(split)
                current_length += split_len

        # Add final document
        if current_doc:
            doc = separator.join(current_doc)
            if doc:
                docs.append(doc)

        return docs

    def _get_overlap_splits(self, splits: List[str], separator: str) -> List[str]:
        """Get the trailing splits that fit within ``chunk_overlap``.

        Args:
            splits: Pieces of the group that was just emitted.
            separator: Accepted for signature compatibility; not used here.

        Returns:
            The longest run of trailing splits, in original order, whose
            combined token count does not exceed ``chunk_overlap``.
        """
        if not splits or self.chunk_overlap == 0:
            return []

        tail = []
        overlap_length = 0

        # Walk backwards from the end, stopping at the first split that
        # would push the overlap over budget.
        for split in reversed(splits):
            split_len = self._count_tokens(split)
            if overlap_length + split_len > self.chunk_overlap:
                break
            tail.append(split)
            overlap_length += split_len

        tail.reverse()
        return tail