ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (535) hide show
  1. agentui/.prettierrc +15 -0
  2. agentui/QUICKSTART.md +272 -0
  3. agentui/README.md +59 -0
  4. agentui/env.example +16 -0
  5. agentui/jsconfig.json +14 -0
  6. agentui/package-lock.json +4242 -0
  7. agentui/package.json +34 -0
  8. agentui/scripts/postinstall/apply-patches.mjs +260 -0
  9. agentui/src/app.css +61 -0
  10. agentui/src/app.d.ts +13 -0
  11. agentui/src/app.html +12 -0
  12. agentui/src/components/LoadingSpinner.svelte +64 -0
  13. agentui/src/components/ThemeSwitcher.svelte +159 -0
  14. agentui/src/components/index.js +4 -0
  15. agentui/src/lib/api/bots.ts +60 -0
  16. agentui/src/lib/api/chat.ts +22 -0
  17. agentui/src/lib/api/http.ts +25 -0
  18. agentui/src/lib/components/BotCard.svelte +33 -0
  19. agentui/src/lib/components/ChatBubble.svelte +63 -0
  20. agentui/src/lib/components/Toast.svelte +21 -0
  21. agentui/src/lib/config.ts +20 -0
  22. agentui/src/lib/stores/auth.svelte.ts +73 -0
  23. agentui/src/lib/stores/theme.svelte.js +64 -0
  24. agentui/src/lib/stores/toast.svelte.ts +31 -0
  25. agentui/src/lib/utils/conversation.ts +39 -0
  26. agentui/src/routes/+layout.svelte +20 -0
  27. agentui/src/routes/+page.svelte +232 -0
  28. agentui/src/routes/login/+page.svelte +200 -0
  29. agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
  30. agentui/src/routes/talk/[agentId]/+page.ts +7 -0
  31. agentui/static/README.md +1 -0
  32. agentui/svelte.config.js +11 -0
  33. agentui/tailwind.config.ts +53 -0
  34. agentui/tsconfig.json +3 -0
  35. agentui/vite.config.ts +10 -0
  36. ai_parrot-0.17.2.dist-info/METADATA +472 -0
  37. ai_parrot-0.17.2.dist-info/RECORD +535 -0
  38. ai_parrot-0.17.2.dist-info/WHEEL +6 -0
  39. ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
  40. ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
  41. ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
  42. crew-builder/.prettierrc +15 -0
  43. crew-builder/QUICKSTART.md +259 -0
  44. crew-builder/README.md +113 -0
  45. crew-builder/env.example +17 -0
  46. crew-builder/jsconfig.json +14 -0
  47. crew-builder/package-lock.json +4182 -0
  48. crew-builder/package.json +37 -0
  49. crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
  50. crew-builder/src/app.css +62 -0
  51. crew-builder/src/app.d.ts +13 -0
  52. crew-builder/src/app.html +12 -0
  53. crew-builder/src/components/LoadingSpinner.svelte +64 -0
  54. crew-builder/src/components/ThemeSwitcher.svelte +149 -0
  55. crew-builder/src/components/index.js +9 -0
  56. crew-builder/src/lib/api/bots.ts +60 -0
  57. crew-builder/src/lib/api/chat.ts +80 -0
  58. crew-builder/src/lib/api/client.ts +56 -0
  59. crew-builder/src/lib/api/crew/crew.ts +136 -0
  60. crew-builder/src/lib/api/index.ts +5 -0
  61. crew-builder/src/lib/api/o365/auth.ts +65 -0
  62. crew-builder/src/lib/auth/auth.ts +54 -0
  63. crew-builder/src/lib/components/AgentNode.svelte +43 -0
  64. crew-builder/src/lib/components/BotCard.svelte +33 -0
  65. crew-builder/src/lib/components/ChatBubble.svelte +67 -0
  66. crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
  67. crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
  68. crew-builder/src/lib/components/JsonViewer.svelte +24 -0
  69. crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
  70. crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
  71. crew-builder/src/lib/components/Toast.svelte +67 -0
  72. crew-builder/src/lib/components/Toolbar.svelte +157 -0
  73. crew-builder/src/lib/components/index.ts +10 -0
  74. crew-builder/src/lib/config.ts +8 -0
  75. crew-builder/src/lib/stores/auth.svelte.ts +228 -0
  76. crew-builder/src/lib/stores/crewStore.ts +369 -0
  77. crew-builder/src/lib/stores/theme.svelte.js +145 -0
  78. crew-builder/src/lib/stores/toast.svelte.ts +69 -0
  79. crew-builder/src/lib/utils/conversation.ts +39 -0
  80. crew-builder/src/lib/utils/markdown.ts +122 -0
  81. crew-builder/src/lib/utils/talkHistory.ts +47 -0
  82. crew-builder/src/routes/+layout.svelte +20 -0
  83. crew-builder/src/routes/+page.svelte +539 -0
  84. crew-builder/src/routes/agents/+page.svelte +247 -0
  85. crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
  86. crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
  87. crew-builder/src/routes/builder/+page.svelte +204 -0
  88. crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
  89. crew-builder/src/routes/crew/ask/+page.ts +1 -0
  90. crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
  91. crew-builder/src/routes/login/+page.svelte +197 -0
  92. crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
  93. crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
  94. crew-builder/static/README.md +1 -0
  95. crew-builder/svelte.config.js +11 -0
  96. crew-builder/tailwind.config.ts +53 -0
  97. crew-builder/tsconfig.json +3 -0
  98. crew-builder/vite.config.ts +10 -0
  99. mcp_servers/calculator_server.py +309 -0
  100. parrot/__init__.py +27 -0
  101. parrot/__pycache__/__init__.cpython-310.pyc +0 -0
  102. parrot/__pycache__/version.cpython-310.pyc +0 -0
  103. parrot/_version.py +34 -0
  104. parrot/a2a/__init__.py +48 -0
  105. parrot/a2a/client.py +658 -0
  106. parrot/a2a/discovery.py +89 -0
  107. parrot/a2a/mixin.py +257 -0
  108. parrot/a2a/models.py +376 -0
  109. parrot/a2a/server.py +770 -0
  110. parrot/agents/__init__.py +29 -0
  111. parrot/bots/__init__.py +12 -0
  112. parrot/bots/a2a_agent.py +19 -0
  113. parrot/bots/abstract.py +3139 -0
  114. parrot/bots/agent.py +1129 -0
  115. parrot/bots/basic.py +9 -0
  116. parrot/bots/chatbot.py +669 -0
  117. parrot/bots/data.py +1618 -0
  118. parrot/bots/database/__init__.py +5 -0
  119. parrot/bots/database/abstract.py +3071 -0
  120. parrot/bots/database/cache.py +286 -0
  121. parrot/bots/database/models.py +468 -0
  122. parrot/bots/database/prompts.py +154 -0
  123. parrot/bots/database/retries.py +98 -0
  124. parrot/bots/database/router.py +269 -0
  125. parrot/bots/database/sql.py +41 -0
  126. parrot/bots/db/__init__.py +6 -0
  127. parrot/bots/db/abstract.py +556 -0
  128. parrot/bots/db/bigquery.py +602 -0
  129. parrot/bots/db/cache.py +85 -0
  130. parrot/bots/db/documentdb.py +668 -0
  131. parrot/bots/db/elastic.py +1014 -0
  132. parrot/bots/db/influx.py +898 -0
  133. parrot/bots/db/mock.py +96 -0
  134. parrot/bots/db/multi.py +783 -0
  135. parrot/bots/db/prompts.py +185 -0
  136. parrot/bots/db/sql.py +1255 -0
  137. parrot/bots/db/tools.py +212 -0
  138. parrot/bots/document.py +680 -0
  139. parrot/bots/hrbot.py +15 -0
  140. parrot/bots/kb.py +170 -0
  141. parrot/bots/mcp.py +36 -0
  142. parrot/bots/orchestration/README.md +463 -0
  143. parrot/bots/orchestration/__init__.py +1 -0
  144. parrot/bots/orchestration/agent.py +155 -0
  145. parrot/bots/orchestration/crew.py +3330 -0
  146. parrot/bots/orchestration/fsm.py +1179 -0
  147. parrot/bots/orchestration/hr.py +434 -0
  148. parrot/bots/orchestration/storage/__init__.py +4 -0
  149. parrot/bots/orchestration/storage/memory.py +100 -0
  150. parrot/bots/orchestration/storage/mixin.py +119 -0
  151. parrot/bots/orchestration/verify.py +202 -0
  152. parrot/bots/product.py +204 -0
  153. parrot/bots/prompts/__init__.py +96 -0
  154. parrot/bots/prompts/agents.py +155 -0
  155. parrot/bots/prompts/data.py +216 -0
  156. parrot/bots/prompts/output_generation.py +8 -0
  157. parrot/bots/scraper/__init__.py +3 -0
  158. parrot/bots/scraper/models.py +122 -0
  159. parrot/bots/scraper/scraper.py +1173 -0
  160. parrot/bots/scraper/templates.py +115 -0
  161. parrot/bots/stores/__init__.py +5 -0
  162. parrot/bots/stores/local.py +172 -0
  163. parrot/bots/webdev.py +81 -0
  164. parrot/cli.py +17 -0
  165. parrot/clients/__init__.py +16 -0
  166. parrot/clients/base.py +1491 -0
  167. parrot/clients/claude.py +1191 -0
  168. parrot/clients/factory.py +129 -0
  169. parrot/clients/google.py +4567 -0
  170. parrot/clients/gpt.py +1975 -0
  171. parrot/clients/grok.py +432 -0
  172. parrot/clients/groq.py +986 -0
  173. parrot/clients/hf.py +582 -0
  174. parrot/clients/models.py +18 -0
  175. parrot/conf.py +395 -0
  176. parrot/embeddings/__init__.py +9 -0
  177. parrot/embeddings/base.py +157 -0
  178. parrot/embeddings/google.py +98 -0
  179. parrot/embeddings/huggingface.py +74 -0
  180. parrot/embeddings/openai.py +84 -0
  181. parrot/embeddings/processor.py +88 -0
  182. parrot/exceptions.c +13868 -0
  183. parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
  184. parrot/exceptions.pxd +22 -0
  185. parrot/exceptions.pxi +15 -0
  186. parrot/exceptions.pyx +44 -0
  187. parrot/generators/__init__.py +29 -0
  188. parrot/generators/base.py +200 -0
  189. parrot/generators/html.py +293 -0
  190. parrot/generators/react.py +205 -0
  191. parrot/generators/streamlit.py +203 -0
  192. parrot/generators/template.py +105 -0
  193. parrot/handlers/__init__.py +4 -0
  194. parrot/handlers/agent.py +861 -0
  195. parrot/handlers/agents/__init__.py +1 -0
  196. parrot/handlers/agents/abstract.py +900 -0
  197. parrot/handlers/bots.py +338 -0
  198. parrot/handlers/chat.py +915 -0
  199. parrot/handlers/creation.sql +192 -0
  200. parrot/handlers/crew/ARCHITECTURE.md +362 -0
  201. parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
  202. parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
  203. parrot/handlers/crew/__init__.py +0 -0
  204. parrot/handlers/crew/handler.py +801 -0
  205. parrot/handlers/crew/models.py +229 -0
  206. parrot/handlers/crew/redis_persistence.py +523 -0
  207. parrot/handlers/jobs/__init__.py +10 -0
  208. parrot/handlers/jobs/job.py +384 -0
  209. parrot/handlers/jobs/mixin.py +627 -0
  210. parrot/handlers/jobs/models.py +115 -0
  211. parrot/handlers/jobs/worker.py +31 -0
  212. parrot/handlers/models.py +596 -0
  213. parrot/handlers/o365_auth.py +105 -0
  214. parrot/handlers/stream.py +337 -0
  215. parrot/interfaces/__init__.py +6 -0
  216. parrot/interfaces/aws.py +143 -0
  217. parrot/interfaces/credentials.py +113 -0
  218. parrot/interfaces/database.py +27 -0
  219. parrot/interfaces/google.py +1123 -0
  220. parrot/interfaces/hierarchy.py +1227 -0
  221. parrot/interfaces/http.py +651 -0
  222. parrot/interfaces/images/__init__.py +0 -0
  223. parrot/interfaces/images/plugins/__init__.py +24 -0
  224. parrot/interfaces/images/plugins/abstract.py +58 -0
  225. parrot/interfaces/images/plugins/analisys.py +148 -0
  226. parrot/interfaces/images/plugins/classify.py +150 -0
  227. parrot/interfaces/images/plugins/classifybase.py +182 -0
  228. parrot/interfaces/images/plugins/detect.py +150 -0
  229. parrot/interfaces/images/plugins/exif.py +1103 -0
  230. parrot/interfaces/images/plugins/hash.py +52 -0
  231. parrot/interfaces/images/plugins/vision.py +104 -0
  232. parrot/interfaces/images/plugins/yolo.py +66 -0
  233. parrot/interfaces/images/plugins/zerodetect.py +197 -0
  234. parrot/interfaces/o365.py +978 -0
  235. parrot/interfaces/onedrive.py +822 -0
  236. parrot/interfaces/sharepoint.py +1435 -0
  237. parrot/interfaces/soap.py +257 -0
  238. parrot/loaders/__init__.py +8 -0
  239. parrot/loaders/abstract.py +1131 -0
  240. parrot/loaders/audio.py +199 -0
  241. parrot/loaders/basepdf.py +53 -0
  242. parrot/loaders/basevideo.py +1568 -0
  243. parrot/loaders/csv.py +409 -0
  244. parrot/loaders/docx.py +116 -0
  245. parrot/loaders/epubloader.py +316 -0
  246. parrot/loaders/excel.py +199 -0
  247. parrot/loaders/factory.py +55 -0
  248. parrot/loaders/files/__init__.py +0 -0
  249. parrot/loaders/files/abstract.py +39 -0
  250. parrot/loaders/files/html.py +26 -0
  251. parrot/loaders/files/text.py +63 -0
  252. parrot/loaders/html.py +152 -0
  253. parrot/loaders/markdown.py +442 -0
  254. parrot/loaders/pdf.py +373 -0
  255. parrot/loaders/pdfmark.py +320 -0
  256. parrot/loaders/pdftables.py +506 -0
  257. parrot/loaders/ppt.py +476 -0
  258. parrot/loaders/qa.py +63 -0
  259. parrot/loaders/splitters/__init__.py +10 -0
  260. parrot/loaders/splitters/base.py +138 -0
  261. parrot/loaders/splitters/md.py +228 -0
  262. parrot/loaders/splitters/token.py +143 -0
  263. parrot/loaders/txt.py +26 -0
  264. parrot/loaders/video.py +89 -0
  265. parrot/loaders/videolocal.py +218 -0
  266. parrot/loaders/videounderstanding.py +377 -0
  267. parrot/loaders/vimeo.py +167 -0
  268. parrot/loaders/web.py +599 -0
  269. parrot/loaders/youtube.py +504 -0
  270. parrot/manager/__init__.py +5 -0
  271. parrot/manager/manager.py +1030 -0
  272. parrot/mcp/__init__.py +28 -0
  273. parrot/mcp/adapter.py +105 -0
  274. parrot/mcp/cli.py +174 -0
  275. parrot/mcp/client.py +119 -0
  276. parrot/mcp/config.py +75 -0
  277. parrot/mcp/integration.py +842 -0
  278. parrot/mcp/oauth.py +933 -0
  279. parrot/mcp/server.py +225 -0
  280. parrot/mcp/transports/__init__.py +3 -0
  281. parrot/mcp/transports/base.py +279 -0
  282. parrot/mcp/transports/grpc_session.py +163 -0
  283. parrot/mcp/transports/http.py +312 -0
  284. parrot/mcp/transports/mcp.proto +108 -0
  285. parrot/mcp/transports/quic.py +1082 -0
  286. parrot/mcp/transports/sse.py +330 -0
  287. parrot/mcp/transports/stdio.py +309 -0
  288. parrot/mcp/transports/unix.py +395 -0
  289. parrot/mcp/transports/websocket.py +547 -0
  290. parrot/memory/__init__.py +16 -0
  291. parrot/memory/abstract.py +209 -0
  292. parrot/memory/agent.py +32 -0
  293. parrot/memory/cache.py +175 -0
  294. parrot/memory/core.py +555 -0
  295. parrot/memory/file.py +153 -0
  296. parrot/memory/mem.py +131 -0
  297. parrot/memory/redis.py +613 -0
  298. parrot/models/__init__.py +46 -0
  299. parrot/models/basic.py +118 -0
  300. parrot/models/compliance.py +208 -0
  301. parrot/models/crew.py +395 -0
  302. parrot/models/detections.py +654 -0
  303. parrot/models/generation.py +85 -0
  304. parrot/models/google.py +223 -0
  305. parrot/models/groq.py +23 -0
  306. parrot/models/openai.py +30 -0
  307. parrot/models/outputs.py +285 -0
  308. parrot/models/responses.py +938 -0
  309. parrot/notifications/__init__.py +743 -0
  310. parrot/openapi/__init__.py +3 -0
  311. parrot/openapi/components.yaml +641 -0
  312. parrot/openapi/config.py +322 -0
  313. parrot/outputs/__init__.py +32 -0
  314. parrot/outputs/formats/__init__.py +108 -0
  315. parrot/outputs/formats/altair.py +359 -0
  316. parrot/outputs/formats/application.py +122 -0
  317. parrot/outputs/formats/base.py +351 -0
  318. parrot/outputs/formats/bokeh.py +356 -0
  319. parrot/outputs/formats/card.py +424 -0
  320. parrot/outputs/formats/chart.py +436 -0
  321. parrot/outputs/formats/d3.py +255 -0
  322. parrot/outputs/formats/echarts.py +310 -0
  323. parrot/outputs/formats/generators/__init__.py +0 -0
  324. parrot/outputs/formats/generators/abstract.py +61 -0
  325. parrot/outputs/formats/generators/panel.py +145 -0
  326. parrot/outputs/formats/generators/streamlit.py +86 -0
  327. parrot/outputs/formats/generators/terminal.py +63 -0
  328. parrot/outputs/formats/holoviews.py +310 -0
  329. parrot/outputs/formats/html.py +147 -0
  330. parrot/outputs/formats/jinja2.py +46 -0
  331. parrot/outputs/formats/json.py +87 -0
  332. parrot/outputs/formats/map.py +933 -0
  333. parrot/outputs/formats/markdown.py +172 -0
  334. parrot/outputs/formats/matplotlib.py +237 -0
  335. parrot/outputs/formats/mixins/__init__.py +0 -0
  336. parrot/outputs/formats/mixins/emaps.py +855 -0
  337. parrot/outputs/formats/plotly.py +341 -0
  338. parrot/outputs/formats/seaborn.py +310 -0
  339. parrot/outputs/formats/table.py +397 -0
  340. parrot/outputs/formats/template_report.py +138 -0
  341. parrot/outputs/formats/yaml.py +125 -0
  342. parrot/outputs/formatter.py +152 -0
  343. parrot/outputs/templates/__init__.py +95 -0
  344. parrot/pipelines/__init__.py +0 -0
  345. parrot/pipelines/abstract.py +210 -0
  346. parrot/pipelines/detector.py +124 -0
  347. parrot/pipelines/models.py +90 -0
  348. parrot/pipelines/planogram.py +3002 -0
  349. parrot/pipelines/table.sql +97 -0
  350. parrot/plugins/__init__.py +106 -0
  351. parrot/plugins/importer.py +80 -0
  352. parrot/py.typed +0 -0
  353. parrot/registry/__init__.py +18 -0
  354. parrot/registry/registry.py +594 -0
  355. parrot/scheduler/__init__.py +1189 -0
  356. parrot/scheduler/models.py +60 -0
  357. parrot/security/__init__.py +16 -0
  358. parrot/security/prompt_injection.py +268 -0
  359. parrot/security/security_events.sql +25 -0
  360. parrot/services/__init__.py +1 -0
  361. parrot/services/mcp/__init__.py +8 -0
  362. parrot/services/mcp/config.py +13 -0
  363. parrot/services/mcp/server.py +295 -0
  364. parrot/services/o365_remote_auth.py +235 -0
  365. parrot/stores/__init__.py +7 -0
  366. parrot/stores/abstract.py +352 -0
  367. parrot/stores/arango.py +1090 -0
  368. parrot/stores/bigquery.py +1377 -0
  369. parrot/stores/cache.py +106 -0
  370. parrot/stores/empty.py +10 -0
  371. parrot/stores/faiss_store.py +1157 -0
  372. parrot/stores/kb/__init__.py +9 -0
  373. parrot/stores/kb/abstract.py +68 -0
  374. parrot/stores/kb/cache.py +165 -0
  375. parrot/stores/kb/doc.py +325 -0
  376. parrot/stores/kb/hierarchy.py +346 -0
  377. parrot/stores/kb/local.py +457 -0
  378. parrot/stores/kb/prompt.py +28 -0
  379. parrot/stores/kb/redis.py +659 -0
  380. parrot/stores/kb/store.py +115 -0
  381. parrot/stores/kb/user.py +374 -0
  382. parrot/stores/models.py +59 -0
  383. parrot/stores/pgvector.py +3 -0
  384. parrot/stores/postgres.py +2853 -0
  385. parrot/stores/utils/__init__.py +0 -0
  386. parrot/stores/utils/chunking.py +197 -0
  387. parrot/telemetry/__init__.py +3 -0
  388. parrot/telemetry/mixin.py +111 -0
  389. parrot/template/__init__.py +3 -0
  390. parrot/template/engine.py +259 -0
  391. parrot/tools/__init__.py +23 -0
  392. parrot/tools/abstract.py +644 -0
  393. parrot/tools/agent.py +363 -0
  394. parrot/tools/arangodbsearch.py +537 -0
  395. parrot/tools/arxiv_tool.py +188 -0
  396. parrot/tools/calculator/__init__.py +3 -0
  397. parrot/tools/calculator/operations/__init__.py +38 -0
  398. parrot/tools/calculator/operations/calculus.py +80 -0
  399. parrot/tools/calculator/operations/statistics.py +76 -0
  400. parrot/tools/calculator/tool.py +150 -0
  401. parrot/tools/cloudwatch.py +988 -0
  402. parrot/tools/codeinterpreter/__init__.py +127 -0
  403. parrot/tools/codeinterpreter/executor.py +371 -0
  404. parrot/tools/codeinterpreter/internals.py +473 -0
  405. parrot/tools/codeinterpreter/models.py +643 -0
  406. parrot/tools/codeinterpreter/prompts.py +224 -0
  407. parrot/tools/codeinterpreter/tool.py +664 -0
  408. parrot/tools/company_info/__init__.py +6 -0
  409. parrot/tools/company_info/tool.py +1138 -0
  410. parrot/tools/correlationanalysis.py +437 -0
  411. parrot/tools/database/abstract.py +286 -0
  412. parrot/tools/database/bq.py +115 -0
  413. parrot/tools/database/cache.py +284 -0
  414. parrot/tools/database/models.py +95 -0
  415. parrot/tools/database/pg.py +343 -0
  416. parrot/tools/databasequery.py +1159 -0
  417. parrot/tools/db.py +1800 -0
  418. parrot/tools/ddgo.py +370 -0
  419. parrot/tools/decorators.py +271 -0
  420. parrot/tools/dftohtml.py +282 -0
  421. parrot/tools/document.py +549 -0
  422. parrot/tools/ecs.py +819 -0
  423. parrot/tools/edareport.py +368 -0
  424. parrot/tools/elasticsearch.py +1049 -0
  425. parrot/tools/employees.py +462 -0
  426. parrot/tools/epson/__init__.py +96 -0
  427. parrot/tools/excel.py +683 -0
  428. parrot/tools/file/__init__.py +13 -0
  429. parrot/tools/file/abstract.py +76 -0
  430. parrot/tools/file/gcs.py +378 -0
  431. parrot/tools/file/local.py +284 -0
  432. parrot/tools/file/s3.py +511 -0
  433. parrot/tools/file/tmp.py +309 -0
  434. parrot/tools/file/tool.py +501 -0
  435. parrot/tools/file_reader.py +129 -0
  436. parrot/tools/flowtask/__init__.py +19 -0
  437. parrot/tools/flowtask/tool.py +761 -0
  438. parrot/tools/gittoolkit.py +508 -0
  439. parrot/tools/google/__init__.py +18 -0
  440. parrot/tools/google/base.py +169 -0
  441. parrot/tools/google/tools.py +1251 -0
  442. parrot/tools/googlelocation.py +5 -0
  443. parrot/tools/googleroutes.py +5 -0
  444. parrot/tools/googlesearch.py +5 -0
  445. parrot/tools/googlesitesearch.py +5 -0
  446. parrot/tools/googlevoice.py +2 -0
  447. parrot/tools/gvoice.py +695 -0
  448. parrot/tools/ibisworld/README.md +225 -0
  449. parrot/tools/ibisworld/__init__.py +11 -0
  450. parrot/tools/ibisworld/tool.py +366 -0
  451. parrot/tools/jiratoolkit.py +1718 -0
  452. parrot/tools/manager.py +1098 -0
  453. parrot/tools/math.py +152 -0
  454. parrot/tools/metadata.py +476 -0
  455. parrot/tools/msteams.py +1621 -0
  456. parrot/tools/msword.py +635 -0
  457. parrot/tools/multidb.py +580 -0
  458. parrot/tools/multistoresearch.py +369 -0
  459. parrot/tools/networkninja.py +167 -0
  460. parrot/tools/nextstop/__init__.py +4 -0
  461. parrot/tools/nextstop/base.py +286 -0
  462. parrot/tools/nextstop/employee.py +733 -0
  463. parrot/tools/nextstop/store.py +462 -0
  464. parrot/tools/notification.py +435 -0
  465. parrot/tools/o365/__init__.py +42 -0
  466. parrot/tools/o365/base.py +295 -0
  467. parrot/tools/o365/bundle.py +522 -0
  468. parrot/tools/o365/events.py +554 -0
  469. parrot/tools/o365/mail.py +992 -0
  470. parrot/tools/o365/onedrive.py +497 -0
  471. parrot/tools/o365/sharepoint.py +641 -0
  472. parrot/tools/openapi_toolkit.py +904 -0
  473. parrot/tools/openweather.py +527 -0
  474. parrot/tools/pdfprint.py +1001 -0
  475. parrot/tools/powerbi.py +518 -0
  476. parrot/tools/powerpoint.py +1113 -0
  477. parrot/tools/pricestool.py +146 -0
  478. parrot/tools/products/__init__.py +246 -0
  479. parrot/tools/prophet_tool.py +171 -0
  480. parrot/tools/pythonpandas.py +630 -0
  481. parrot/tools/pythonrepl.py +910 -0
  482. parrot/tools/qsource.py +436 -0
  483. parrot/tools/querytoolkit.py +395 -0
  484. parrot/tools/quickeda.py +827 -0
  485. parrot/tools/resttool.py +553 -0
  486. parrot/tools/retail/__init__.py +0 -0
  487. parrot/tools/retail/bby.py +528 -0
  488. parrot/tools/sandboxtool.py +703 -0
  489. parrot/tools/sassie/__init__.py +352 -0
  490. parrot/tools/scraping/__init__.py +7 -0
  491. parrot/tools/scraping/docs/select.md +466 -0
  492. parrot/tools/scraping/documentation.md +1278 -0
  493. parrot/tools/scraping/driver.py +436 -0
  494. parrot/tools/scraping/models.py +576 -0
  495. parrot/tools/scraping/options.py +85 -0
  496. parrot/tools/scraping/orchestrator.py +517 -0
  497. parrot/tools/scraping/readme.md +740 -0
  498. parrot/tools/scraping/tool.py +3115 -0
  499. parrot/tools/seasonaldetection.py +642 -0
  500. parrot/tools/shell_tool/__init__.py +5 -0
  501. parrot/tools/shell_tool/actions.py +408 -0
  502. parrot/tools/shell_tool/engine.py +155 -0
  503. parrot/tools/shell_tool/models.py +322 -0
  504. parrot/tools/shell_tool/tool.py +442 -0
  505. parrot/tools/site_search.py +214 -0
  506. parrot/tools/textfile.py +418 -0
  507. parrot/tools/think.py +378 -0
  508. parrot/tools/toolkit.py +298 -0
  509. parrot/tools/webapp_tool.py +187 -0
  510. parrot/tools/whatif.py +1279 -0
  511. parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
  512. parrot/tools/workday/__init__.py +6 -0
  513. parrot/tools/workday/models.py +1389 -0
  514. parrot/tools/workday/tool.py +1293 -0
  515. parrot/tools/yfinance_tool.py +306 -0
  516. parrot/tools/zipcode.py +217 -0
  517. parrot/utils/__init__.py +2 -0
  518. parrot/utils/helpers.py +73 -0
  519. parrot/utils/parsers/__init__.py +5 -0
  520. parrot/utils/parsers/toml.c +12078 -0
  521. parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
  522. parrot/utils/parsers/toml.pyx +21 -0
  523. parrot/utils/toml.py +11 -0
  524. parrot/utils/types.cpp +20936 -0
  525. parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
  526. parrot/utils/types.pyx +213 -0
  527. parrot/utils/uv.py +11 -0
  528. parrot/version.py +10 -0
  529. parrot/yaml-rs/Cargo.lock +350 -0
  530. parrot/yaml-rs/Cargo.toml +19 -0
  531. parrot/yaml-rs/pyproject.toml +19 -0
  532. parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
  533. parrot/yaml-rs/src/lib.rs +222 -0
  534. requirements/docker-compose.yml +24 -0
  535. requirements/requirements-dev.txt +21 -0
@@ -0,0 +1,63 @@
1
+ import aiofiles
2
+ from .abstract import FilePlugin
3
+
4
class TextFile(FilePlugin):
    """
    Asynchronous reader for a single text file.

    The underlying handle is created with ``aiofiles`` and kept on the
    instance until :meth:`close` is called.
    """
    def __init__(self, path: str, encoding: str = 'utf-8'):
        """
        Record where the file lives and how to decode it.

        The file itself is not opened here; see :meth:`open`.

        Args:
            path: Path to the text file.
            encoding: File encoding (default: utf-8)
        """
        super().__init__()
        self.path = path
        self.encoding = encoding
        # No handle yet; ``open()`` fills this in and ``close()`` resets it.
        self._file = None

    async def open(self):
        """
        Open the file for reading and keep the handle on the instance.

        Any failure is logged and then re-raised unchanged.
        """
        try:
            self._file = await aiofiles.open(
                self.path,
                mode='r',
                encoding=self.encoding,
            )
            self.logger.debug(
                f"Successfully opened file: {self.path}"
            )
        except Exception as e:
            self.logger.error(f"Error opening file {self.path}: {str(e)}")
            raise

    async def close(self):
        """
        Close the file handle if one is currently held.

        Does nothing when no handle is held. Failures are logged and
        re-raised; the stored handle is dropped either way.
        """
        if self._file is None:
            return

        try:
            await self._file.close()
            self.logger.debug(f"Successfully closed file: {self.path}")
        except Exception as e:
            self.logger.error(f"Error closing file {self.path}: {str(e)}")
            raise
        finally:
            # Forget the handle whether or not closing succeeded.
            self._file = None

    async def read(self) -> str:
        """
        Read and return the file content, opening the file first if needed.

        Returns:
            Content of the text file as a string.
        """
        if self._file is None:
            await self.open()

        try:
            return await self._file.read()
        except Exception as e:
            self.logger.error(f"Error reading file {self.path}: {str(e)}")
            raise
parrot/loaders/html.py ADDED
@@ -0,0 +1,152 @@
1
+ from typing import Union, List, Callable, Any
2
+ from datetime import datetime
3
+ from pathlib import PurePath
4
+ from markdownify import markdownify as md
5
+ from ..stores.models import Document
6
+ from .abstract import AbstractLoader
7
+ from .files.html import HTMLFile
8
+
9
+
10
class HTMLLoader(AbstractLoader):
    """
    Loader for HTML files to convert into Parrot Documents.

    Processes HTML files, extracts relevant content, converts to Markdown,
    and associates metadata with each document.
    """

    extensions: List[str] = ['.html', '.htm']

    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'html',
        language: str = "eng",
        chunk_size: int = 1024,
        chunk_overlap: int = 10,
        **kwargs
    ):
        """
        Initialize the HTMLLoader.

        Args:
            path: Path of the HTML source, forwarded to the base loader.
            tokenizer: Tokenizer forwarded to the base loader.
            text_splitter: Text splitter forwarded to the base loader.
            source_type: Source type label forwarded to the base loader.
            language: Language code forwarded to the base loader.
            chunk_size: Chunk size handed to the markdown splitter.
            chunk_overlap: Chunk overlap handed to the markdown splitter.
            **kwargs: Extra options for the base loader. An optional
                ``elements`` entry (a list of ``{tag: css_class}`` mappings)
                is removed from ``kwargs`` and used to restrict extraction
                to the matching elements.
        """
        # Must be popped before calling the base class so it is not forwarded.
        self.elements: list = kwargs.pop('elements', [])
        super().__init__(
            path=path,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            language=language,
            **kwargs
        )
        # Initialize markdown splitter
        self._splitter = self._get_markdown_splitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

    async def _load(self, path: Union[str, PurePath, List[PurePath]], **kwargs) -> List[Document]:
        """
        Load an HTML file.

        For every extracted element this produces, in order: one document
        holding the element converted to Markdown, one document holding the
        summary (only when a summary could be generated), and one document
        per chunk of the element's plain text.

        Args:
            path (Path): The path to the HTML file.

        Returns:
            list: A list of Parrot Documents.

        Raises:
            ValueError: If neither a <body> nor a usable top-level element
                is found in the parsed HTML.
        """
        docs = []
        # ``path`` may arrive as a plain string; only PurePath has ``.name``.
        file_name = PurePath(path).name if isinstance(path, str) else path.name
        async with HTMLFile(path) as file:
            # The second item returned by read() is not needed here.
            soup, _ = await file.read()
            # Prefer the <body>; fall back to the whole parsed tree.
            top_element = soup.body or soup
            if not top_element:
                raise ValueError(
                    "The HTML file does not contain a <body> or Top element tag."
                )

            extracted_elements = []
            if self.elements:
                # Extract content from specific elements; each entry maps a
                # tag name to a CSS class (a leading '.' is tolerated).
                for element in self.elements:
                    for tag, selector in element.items():
                        extracted_elements.extend(
                            top_element.find_all(tag, class_=selector.lstrip('.'))
                        )
            if not extracted_elements:
                # Nothing requested or nothing matched: process everything.
                extracted_elements = [top_element]

            # Process each extracted element
            for elem in extracted_elements:
                # Get the plain text content
                text = elem.get_text(separator="\n", strip=True)

                # Generate a summary for the extracted text; a failure here
                # is logged and simply skips the summary document.
                try:
                    summary = self.summary_from_text(text)
                except Exception as e:
                    if self.logger:
                        self.logger.error(f"Error generating summary: {e}")
                    summary = None

                # Create document-level context
                document_context = f"File Name: {file_name}\n"
                document_context += f"Document Type: {self.doctype}\n"
                document_context += f"Source Type: {self._source_type}\n"
                document_context += f"Element: {elem.name}\n"

                # Convert the element to Markdown for better structure
                markdown_content = md(str(elem))

                # Metadata preparation
                document_meta = self.create_metadata(
                    path=path,
                    doctype=self.doctype,
                    source_type=self._source_type,
                    doc_metadata={
                        "type": "html",
                        "category": self.category,
                    }
                )

                # Create a single Document with the full element content
                document = Document(
                    page_content=document_context + markdown_content,
                    metadata=document_meta
                )
                docs.append(document)

                # Create a document from summary (if any):
                if summary:
                    document = Document(
                        page_content=summary,
                        metadata={
                            **document_meta,
                            "source": str(path),
                            "timestamp": datetime.now().isoformat(),
                        }
                    )
                    docs.append(document)

                # splitting the content:
                try:
                    chunks = self._splitter.split_text(text)
                    self.logger.info(f"Split document into {len(chunks)} chunks")
                except Exception as e:
                    self.logger.error(
                        f"Failed to split text: {e}"
                    )
                    # Fallback: use the entire text as one chunk
                    chunks = [text]
                for chunk in chunks:
                    # Each chunk gets its own copy of the metadata.
                    _idx = {
                        **document_meta
                    }
                    docs.append(
                        Document(
                            page_content=document_context + chunk,
                            metadata=_idx
                        )
                    )
        # Return the collected documents (previously an empty list was
        # returned, discarding everything built above).
        return docs
@@ -0,0 +1,442 @@
1
+ from collections.abc import Callable
2
+ from typing import List, Optional, Union
3
+ import re
4
+ from pathlib import Path, PurePath
5
+ from markitdown import MarkItDown
6
+ from ..stores.models import Document
7
+ from .abstract import AbstractLoader
8
+
9
+
10
+ class MarkdownLoader(AbstractLoader):
11
+ """
12
+ Universal Document Loader using MarkItDown library.
13
+
14
+ Converts various document formats to markdown and returns Document objects.
15
+ Supports:
16
+ - PDF files
17
+ - PowerPoint presentations (.pptx, .ppt)
18
+ - Word documents (.docx, .doc)
19
+ - Excel spreadsheets (.xlsx, .xls, .csv)
20
+ - HTML files
21
+ - Text-based formats (CSV, JSON, XML)
22
+ - Images with OCR (if enabled)
23
+ - Audio files (if enabled)
24
+ """
25
+
26
# Supported extensions based on MarkItDown capabilities.
# The value is a set literal, so the annotation is ``set[str]`` rather than
# ``List[str]``; ``validate_file_support`` only performs membership tests
# against it.
extensions: set[str] = {
    '.pdf', '.docx', '.doc', '.pptx', '.ppt', '.xlsx', '.xls',
    '.csv', '.html', '.htm', '.xml', '.json', '.txt', '.md',
    '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff',  # Images (with OCR)
    '.mp3', '.wav', '.m4a', '.flac',  # Audio (with transcription)
}
33
+
34
def __init__(
    self,
    source: Optional[Union[str, Path, List[Union[str, Path]]]] = None,
    *,
    tokenizer: Union[str, Callable] = None,
    text_splitter: Union[str, Callable] = None,
    source_type: str = 'file',
    enable_plugins: bool = True,
    enable_ocr: bool = False,
    enable_audio: bool = False,
    use_chapters: bool = False,
    use_sections: bool = False,
    merge_consecutive_headers: bool = True,
    min_section_length: int = 50,
    **kwargs
):
    """
    Configure the loader and build its MarkItDown converter.

    Args:
        source: Path or list of paths to load from.
        tokenizer: Forwarded to the parent initializer.
        text_splitter: Forwarded to the parent initializer.
        source_type: Forwarded to the parent initializer and also kept
            on ``self._source_type``.
        enable_plugins: Passed to ``MarkItDown(enable_plugins=...)``.
        enable_ocr: Stored as ``self.enable_ocr``.
        enable_audio: Stored as ``self.enable_audio``.
        use_chapters: When set, ``_load`` splits the markdown by headers
            (H1/H2 open a section).
        use_sections: When set, ``_load`` splits the markdown by headers
            (H1-H4 open a section).
        merge_consecutive_headers: When set, the header line is kept as
            the first line of each section's content.
        min_section_length: Sections whose stripped content is shorter
            than this many characters are dropped.
        **kwargs: Additional arguments passed to the parent initializer.
    """
    parent_options = {
        'tokenizer': tokenizer,
        'text_splitter': text_splitter,
        'source_type': source_type,
    }
    super().__init__(source, **parent_options, **kwargs)

    # Identification of what this loader produces.
    self.doctype = 'markdown'
    self._source_type = source_type

    # Converter feature switches.
    self.enable_plugins = enable_plugins
    self.enable_ocr = enable_ocr
    self.enable_audio = enable_audio

    # Section-splitting behaviour.
    self.use_chapters = use_chapters
    self.use_sections = use_sections
    self.merge_consecutive_headers = merge_consecutive_headers
    self.min_section_length = min_section_length

    # The converter is created last because it reads self.enable_plugins.
    self._setup_markitdown()
87
+
88
def _setup_markitdown(self):
    """
    Create the MarkItDown converter and store it on ``self.md_converter``.

    Any exception raised while doing so is logged and then re-raised
    unchanged.
    """
    try:
        converter = MarkItDown(enable_plugins=self.enable_plugins)
        self.md_converter = converter
        self.logger.info("MarkItDown converter initialized successfully")
    except Exception as e:
        self.logger.error(f"Failed to initialize MarkItDown: {e}")
        raise
96
+
97
+ def _detect_document_type(self, path: PurePath) -> str:
98
+ """Detect the type of document based on file extension."""
99
+ suffix = path.suffix.lower()
100
+
101
+ type_mapping = {
102
+ '.pdf': 'pdf',
103
+ '.docx': 'word', '.doc': 'word',
104
+ '.pptx': 'powerpoint', '.ppt': 'powerpoint',
105
+ '.xlsx': 'excel', '.xls': 'excel',
106
+ '.csv': 'csv',
107
+ '.html': 'html', '.htm': 'html',
108
+ '.xml': 'xml',
109
+ '.json': 'json',
110
+ '.txt': 'text', '.md': 'markdown',
111
+ '.png': 'image', '.jpg': 'image', '.jpeg': 'image',
112
+ '.gif': 'image', '.bmp': 'image', '.tiff': 'image',
113
+ '.mp3': 'audio', '.wav': 'audio', '.m4a': 'audio', '.flac': 'audio'
114
+ }
115
+
116
+ return type_mapping.get(suffix, 'unknown')
117
+
118
+ def _extract_sections_from_markdown(self, md_text: str) -> List[dict]:
119
+ """
120
+ Extract sections from markdown text based on headers.
121
+
122
+ Args:
123
+ md_text: Markdown text content
124
+
125
+ Returns:
126
+ List of section dictionaries with 'title', 'content', 'level', and 'section_number'
127
+ """
128
+ sections = []
129
+ lines = md_text.split('\n')
130
+ current_section = None
131
+ current_content = []
132
+ section_counter = 0
133
+
134
+ for line in lines:
135
+ # Check if line is a header
136
+ header_match = re.match(r'^(#{1,6})\s+(.+)$', line.strip())
137
+
138
+ if header_match:
139
+ # Save previous section if it exists
140
+ if current_section and current_content:
141
+ content = '\n'.join(current_content).strip()
142
+ if len(content) >= self.min_section_length:
143
+ current_section['content'] = content
144
+ sections.append(current_section)
145
+
146
+ # Start new section
147
+ level = len(header_match.group(1))
148
+ title = header_match.group(2).strip()
149
+ section_counter += 1
150
+
151
+ # Determine if this should be included based on settings
152
+ include_section = False
153
+ if self.use_chapters and level <= 2: # H1 and H2 for chapters
154
+ include_section = True
155
+ elif self.use_sections and level <= 4: # H1-H4 for sections
156
+ include_section = True
157
+ elif not self.use_chapters and not self.use_sections:
158
+ include_section = True # Include all if no specific setting
159
+
160
+ if include_section:
161
+ current_section = {
162
+ 'title': title,
163
+ 'level': level,
164
+ 'section_number': section_counter,
165
+ 'header_line': line
166
+ }
167
+ current_content = []
168
+
169
+ # Include the header in content if merging
170
+ if self.merge_consecutive_headers:
171
+ current_content.append(line)
172
+ else:
173
+ current_section = None
174
+ current_content = []
175
+ else:
176
+ # Add line to current section content
177
+ if current_section is not None:
178
+ current_content.append(line)
179
+
180
+ # Handle the last section
181
+ if current_section and current_content:
182
+ content = '\n'.join(current_content).strip()
183
+ if len(content) >= self.min_section_length:
184
+ current_section['content'] = content
185
+ sections.append(current_section)
186
+
187
+ return sections
188
+
189
+ def _clean_markdown_content(self, content: str) -> str:
190
+ """
191
+ Clean and normalize markdown content.
192
+
193
+ Args:
194
+ content: Raw markdown content
195
+
196
+ Returns:
197
+ Cleaned markdown content
198
+ """
199
+ if not content:
200
+ return ""
201
+
202
+ # Remove excessive blank lines
203
+ content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
204
+
205
+ # Remove trailing whitespace from lines
206
+ lines = [line.rstrip() for line in content.split('\n')]
207
+ content = '\n'.join(lines)
208
+
209
+ # Ensure proper spacing around headers
210
+ content = re.sub(r'(^|\n)(#{1,6}\s+[^\n]+)(\n)', r'\1\n\2\n\n', content)
211
+
212
+ return content.strip()
213
+
214
+ def _extract_metadata_from_markdown(self, md_text: str, file_path: PurePath) -> dict:
215
+ """
216
+ Extract metadata from markdown content and file.
217
+
218
+ Args:
219
+ md_text: Markdown text content
220
+ file_path: Path to the source file
221
+
222
+ Returns:
223
+ Dictionary containing extracted metadata
224
+ """
225
+ metadata = {}
226
+
227
+ # Extract frontmatter if present
228
+ frontmatter_match = re.match(r'^---\n(.*?)\n---\n', md_text, re.DOTALL)
229
+ if frontmatter_match:
230
+ try:
231
+ import yaml
232
+ frontmatter = yaml.safe_load(frontmatter_match.group(1))
233
+ if isinstance(frontmatter, dict):
234
+ metadata.update(frontmatter)
235
+ except (ImportError, yaml.YAMLError):
236
+ self.logger.warning("Could not parse frontmatter metadata")
237
+
238
+ # Extract title from first header if not in frontmatter
239
+ if 'title' not in metadata:
240
+ title_match = re.search(r'^#\s+(.+)$', md_text, re.MULTILINE)
241
+ if title_match:
242
+ metadata['title'] = title_match.group(1).strip()
243
+
244
+ # Count various elements
245
+ metadata.update({
246
+ 'word_count': len(md_text.split()),
247
+ 'header_count': len(re.findall(r'^#{1,6}\s+', md_text, re.MULTILINE)),
248
+ 'table_count': len(re.findall(r'^\|.*\|$', md_text, re.MULTILINE)),
249
+ 'code_block_count': len(re.findall(r'```', md_text)) // 2,
250
+ 'link_count': len(re.findall(r'\[.*?\]\(.*?\)', md_text)),
251
+ 'image_count': len(re.findall(r'!\[.*?\]\(.*?\)', md_text))
252
+ })
253
+
254
+ return metadata
255
+
256
async def _load(self, path: PurePath, **kwargs) -> List[Document]:
    """
    Convert one file to markdown with MarkItDown and wrap it in Documents.

    When ``use_chapters`` or ``use_sections`` is set, one Document is
    produced per extracted section; if no section is found, or neither
    flag is set, the whole markdown becomes a single Document. When
    ``self._summarization`` is truthy a summary Document is appended.

    Args:
        path: Path to the file to load
        **kwargs: Additional arguments (not used here)

    Returns:
        List of Document objects (empty when the converter yields no text)

    Raises:
        Exception: any error raised during processing is logged and
            re-raised unchanged.
    """
    self.logger.info(f"Loading file with MarkItDown: {path}")
    docs = []

    try:
        # Convert file to markdown using MarkItDown
        result = self.md_converter.convert(str(path))
        if not result or not result.text_content:
            self.logger.warning(f"No content extracted from {path}")
            return docs

        md_text = self._clean_markdown_content(result.text_content)

        # Facts shared by every Document produced from this file.
        doc_type = self._detect_document_type(path)
        extracted_metadata = self._extract_metadata_from_markdown(md_text, path)

        sections = []
        if self.use_chapters or self.use_sections:
            sections = self._extract_sections_from_markdown(md_text)
            self.logger.info(f"Extracted {len(sections)} sections from {path}")
            if not sections:
                self.logger.info(f"No sections found in {path}, treating as single document")

        if sections:
            section_type = "chapter" if self.use_chapters else "section"
            file_fields = {
                "filename": path.name,
                "file_path": str(path),
                "document_type": doc_type,
            }
            for section in sections:
                # extracted_metadata is unpacked last, so its keys win on clashes.
                section_fields = {
                    **file_fields,
                    "section_title": section['title'],
                    "section_number": section['section_number'],
                    "header_level": section['level'],
                    "content_type": section_type,
                    "extracted_metadata": extracted_metadata,
                    **extracted_metadata
                }
                section_meta = self.create_metadata(
                    path=path,
                    doctype="markdown",
                    source_type=f"markitdown_{section_type}",
                    doc_metadata=section_fields,
                )
                section_doc = self.create_document(
                    content=section['content'],
                    path=path,
                    metadata=section_meta
                )
                docs.append(section_doc)
        else:
            # Either splitting is disabled or it produced nothing.
            self._create_single_document(docs, md_text, path, doc_type, extracted_metadata)

        # Generate summary if enabled
        if self._summarization and docs:
            full_text = "\n\n".join(doc.page_content for doc in docs)
            summary = await self.summary_from_text(full_text)

            if summary:
                summary_fields = {
                    "summary_for_sections": len(docs),
                    "document_type": doc_type,
                    **extracted_metadata
                }
                summary_meta = self.create_metadata(
                    path=path,
                    doctype="markdown",
                    source_type="markitdown_summary",
                    doc_metadata=summary_fields
                )
                summary_doc = self.create_document(
                    content=f"SUMMARY:\n\n{summary}",
                    path=path,
                    metadata=summary_meta
                )
                docs.append(summary_doc)

    except Exception as e:
        self.logger.error(f"Error processing {path} with MarkItDown: {e}")
        # Could optionally fall back to reading as plain text
        raise

    return docs
360
+
361
def _create_single_document(
    self,
    docs: List[Document],
    md_text: str,
    path: PurePath,
    doc_type: str,
    extracted_metadata: dict
):
    """
    Append one Document holding the complete markdown text to ``docs``.

    Args:
        docs: List that receives the new Document (mutated in place)
        md_text: Full markdown content of the file
        path: Path of the source file
        doc_type: Label produced by ``_detect_document_type``
        extracted_metadata: Metadata dict; stored under
            'extracted_metadata' and also merged into the top level
    """
    # extracted_metadata is unpacked last, so its keys win on clashes.
    fields = {
        "filename": path.name,
        "file_path": str(path),
        "document_type": doc_type,
        "content_type": "full_document",
        "extracted_metadata": extracted_metadata,
    }
    fields.update(extracted_metadata)

    full_meta = self.create_metadata(
        path=path,
        doctype="markdown",
        source_type="markitdown_full",
        doc_metadata=fields,
    )
    full_doc = self.create_document(
        content=md_text,
        path=path,
        metadata=full_meta
    )
    docs.append(full_doc)
393
+
394
def get_supported_formats(self) -> dict:
    """
    Describe the supported file formats grouped by category.

    Returns:
        A freshly built dictionary mapping each category name to the list
        of file extensions in that category
    """
    return dict(
        documents=['.pdf', '.docx', '.doc'],
        presentations=['.pptx', '.ppt'],
        spreadsheets=['.xlsx', '.xls', '.csv'],
        web=['.html', '.htm'],
        data=['.xml', '.json'],
        text=['.txt', '.md'],
        images=['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'],
        audio=['.mp3', '.wav', '.m4a', '.flac'],
    )
411
+
412
def validate_file_support(self, path: Union[str, Path]) -> bool:
    """
    Tell whether a file's extension is listed in ``self.extensions``.

    Args:
        path: File path to check, as a string or a path object

    Returns:
        True if the lower-cased suffix is in ``self.extensions``,
        False otherwise
    """
    candidate = Path(path) if isinstance(path, str) else path
    suffix = candidate.suffix.lower()
    return suffix in self.extensions
426
+
427
async def convert_to_markdown(self, path: Union[str, Path]) -> str:
    """
    Convert one file to cleaned markdown text.

    Errors are not propagated: any exception is logged and an empty
    string is returned instead.

    Args:
        path: Path to file to convert

    Returns:
        Cleaned markdown content, or "" when the converter returns a falsy
        result or an error occurs
    """
    try:
        result = self.md_converter.convert(str(path))
        if not result:
            return ""
        return self._clean_markdown_content(result.text_content)
    except Exception as e:
        self.logger.error(f"Error converting {path} to markdown: {e}")
        return ""