ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (535) hide show
  1. agentui/.prettierrc +15 -0
  2. agentui/QUICKSTART.md +272 -0
  3. agentui/README.md +59 -0
  4. agentui/env.example +16 -0
  5. agentui/jsconfig.json +14 -0
  6. agentui/package-lock.json +4242 -0
  7. agentui/package.json +34 -0
  8. agentui/scripts/postinstall/apply-patches.mjs +260 -0
  9. agentui/src/app.css +61 -0
  10. agentui/src/app.d.ts +13 -0
  11. agentui/src/app.html +12 -0
  12. agentui/src/components/LoadingSpinner.svelte +64 -0
  13. agentui/src/components/ThemeSwitcher.svelte +159 -0
  14. agentui/src/components/index.js +4 -0
  15. agentui/src/lib/api/bots.ts +60 -0
  16. agentui/src/lib/api/chat.ts +22 -0
  17. agentui/src/lib/api/http.ts +25 -0
  18. agentui/src/lib/components/BotCard.svelte +33 -0
  19. agentui/src/lib/components/ChatBubble.svelte +63 -0
  20. agentui/src/lib/components/Toast.svelte +21 -0
  21. agentui/src/lib/config.ts +20 -0
  22. agentui/src/lib/stores/auth.svelte.ts +73 -0
  23. agentui/src/lib/stores/theme.svelte.js +64 -0
  24. agentui/src/lib/stores/toast.svelte.ts +31 -0
  25. agentui/src/lib/utils/conversation.ts +39 -0
  26. agentui/src/routes/+layout.svelte +20 -0
  27. agentui/src/routes/+page.svelte +232 -0
  28. agentui/src/routes/login/+page.svelte +200 -0
  29. agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
  30. agentui/src/routes/talk/[agentId]/+page.ts +7 -0
  31. agentui/static/README.md +1 -0
  32. agentui/svelte.config.js +11 -0
  33. agentui/tailwind.config.ts +53 -0
  34. agentui/tsconfig.json +3 -0
  35. agentui/vite.config.ts +10 -0
  36. ai_parrot-0.17.2.dist-info/METADATA +472 -0
  37. ai_parrot-0.17.2.dist-info/RECORD +535 -0
  38. ai_parrot-0.17.2.dist-info/WHEEL +6 -0
  39. ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
  40. ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
  41. ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
  42. crew-builder/.prettierrc +15 -0
  43. crew-builder/QUICKSTART.md +259 -0
  44. crew-builder/README.md +113 -0
  45. crew-builder/env.example +17 -0
  46. crew-builder/jsconfig.json +14 -0
  47. crew-builder/package-lock.json +4182 -0
  48. crew-builder/package.json +37 -0
  49. crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
  50. crew-builder/src/app.css +62 -0
  51. crew-builder/src/app.d.ts +13 -0
  52. crew-builder/src/app.html +12 -0
  53. crew-builder/src/components/LoadingSpinner.svelte +64 -0
  54. crew-builder/src/components/ThemeSwitcher.svelte +149 -0
  55. crew-builder/src/components/index.js +9 -0
  56. crew-builder/src/lib/api/bots.ts +60 -0
  57. crew-builder/src/lib/api/chat.ts +80 -0
  58. crew-builder/src/lib/api/client.ts +56 -0
  59. crew-builder/src/lib/api/crew/crew.ts +136 -0
  60. crew-builder/src/lib/api/index.ts +5 -0
  61. crew-builder/src/lib/api/o365/auth.ts +65 -0
  62. crew-builder/src/lib/auth/auth.ts +54 -0
  63. crew-builder/src/lib/components/AgentNode.svelte +43 -0
  64. crew-builder/src/lib/components/BotCard.svelte +33 -0
  65. crew-builder/src/lib/components/ChatBubble.svelte +67 -0
  66. crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
  67. crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
  68. crew-builder/src/lib/components/JsonViewer.svelte +24 -0
  69. crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
  70. crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
  71. crew-builder/src/lib/components/Toast.svelte +67 -0
  72. crew-builder/src/lib/components/Toolbar.svelte +157 -0
  73. crew-builder/src/lib/components/index.ts +10 -0
  74. crew-builder/src/lib/config.ts +8 -0
  75. crew-builder/src/lib/stores/auth.svelte.ts +228 -0
  76. crew-builder/src/lib/stores/crewStore.ts +369 -0
  77. crew-builder/src/lib/stores/theme.svelte.js +145 -0
  78. crew-builder/src/lib/stores/toast.svelte.ts +69 -0
  79. crew-builder/src/lib/utils/conversation.ts +39 -0
  80. crew-builder/src/lib/utils/markdown.ts +122 -0
  81. crew-builder/src/lib/utils/talkHistory.ts +47 -0
  82. crew-builder/src/routes/+layout.svelte +20 -0
  83. crew-builder/src/routes/+page.svelte +539 -0
  84. crew-builder/src/routes/agents/+page.svelte +247 -0
  85. crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
  86. crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
  87. crew-builder/src/routes/builder/+page.svelte +204 -0
  88. crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
  89. crew-builder/src/routes/crew/ask/+page.ts +1 -0
  90. crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
  91. crew-builder/src/routes/login/+page.svelte +197 -0
  92. crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
  93. crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
  94. crew-builder/static/README.md +1 -0
  95. crew-builder/svelte.config.js +11 -0
  96. crew-builder/tailwind.config.ts +53 -0
  97. crew-builder/tsconfig.json +3 -0
  98. crew-builder/vite.config.ts +10 -0
  99. mcp_servers/calculator_server.py +309 -0
  100. parrot/__init__.py +27 -0
  101. parrot/__pycache__/__init__.cpython-310.pyc +0 -0
  102. parrot/__pycache__/version.cpython-310.pyc +0 -0
  103. parrot/_version.py +34 -0
  104. parrot/a2a/__init__.py +48 -0
  105. parrot/a2a/client.py +658 -0
  106. parrot/a2a/discovery.py +89 -0
  107. parrot/a2a/mixin.py +257 -0
  108. parrot/a2a/models.py +376 -0
  109. parrot/a2a/server.py +770 -0
  110. parrot/agents/__init__.py +29 -0
  111. parrot/bots/__init__.py +12 -0
  112. parrot/bots/a2a_agent.py +19 -0
  113. parrot/bots/abstract.py +3139 -0
  114. parrot/bots/agent.py +1129 -0
  115. parrot/bots/basic.py +9 -0
  116. parrot/bots/chatbot.py +669 -0
  117. parrot/bots/data.py +1618 -0
  118. parrot/bots/database/__init__.py +5 -0
  119. parrot/bots/database/abstract.py +3071 -0
  120. parrot/bots/database/cache.py +286 -0
  121. parrot/bots/database/models.py +468 -0
  122. parrot/bots/database/prompts.py +154 -0
  123. parrot/bots/database/retries.py +98 -0
  124. parrot/bots/database/router.py +269 -0
  125. parrot/bots/database/sql.py +41 -0
  126. parrot/bots/db/__init__.py +6 -0
  127. parrot/bots/db/abstract.py +556 -0
  128. parrot/bots/db/bigquery.py +602 -0
  129. parrot/bots/db/cache.py +85 -0
  130. parrot/bots/db/documentdb.py +668 -0
  131. parrot/bots/db/elastic.py +1014 -0
  132. parrot/bots/db/influx.py +898 -0
  133. parrot/bots/db/mock.py +96 -0
  134. parrot/bots/db/multi.py +783 -0
  135. parrot/bots/db/prompts.py +185 -0
  136. parrot/bots/db/sql.py +1255 -0
  137. parrot/bots/db/tools.py +212 -0
  138. parrot/bots/document.py +680 -0
  139. parrot/bots/hrbot.py +15 -0
  140. parrot/bots/kb.py +170 -0
  141. parrot/bots/mcp.py +36 -0
  142. parrot/bots/orchestration/README.md +463 -0
  143. parrot/bots/orchestration/__init__.py +1 -0
  144. parrot/bots/orchestration/agent.py +155 -0
  145. parrot/bots/orchestration/crew.py +3330 -0
  146. parrot/bots/orchestration/fsm.py +1179 -0
  147. parrot/bots/orchestration/hr.py +434 -0
  148. parrot/bots/orchestration/storage/__init__.py +4 -0
  149. parrot/bots/orchestration/storage/memory.py +100 -0
  150. parrot/bots/orchestration/storage/mixin.py +119 -0
  151. parrot/bots/orchestration/verify.py +202 -0
  152. parrot/bots/product.py +204 -0
  153. parrot/bots/prompts/__init__.py +96 -0
  154. parrot/bots/prompts/agents.py +155 -0
  155. parrot/bots/prompts/data.py +216 -0
  156. parrot/bots/prompts/output_generation.py +8 -0
  157. parrot/bots/scraper/__init__.py +3 -0
  158. parrot/bots/scraper/models.py +122 -0
  159. parrot/bots/scraper/scraper.py +1173 -0
  160. parrot/bots/scraper/templates.py +115 -0
  161. parrot/bots/stores/__init__.py +5 -0
  162. parrot/bots/stores/local.py +172 -0
  163. parrot/bots/webdev.py +81 -0
  164. parrot/cli.py +17 -0
  165. parrot/clients/__init__.py +16 -0
  166. parrot/clients/base.py +1491 -0
  167. parrot/clients/claude.py +1191 -0
  168. parrot/clients/factory.py +129 -0
  169. parrot/clients/google.py +4567 -0
  170. parrot/clients/gpt.py +1975 -0
  171. parrot/clients/grok.py +432 -0
  172. parrot/clients/groq.py +986 -0
  173. parrot/clients/hf.py +582 -0
  174. parrot/clients/models.py +18 -0
  175. parrot/conf.py +395 -0
  176. parrot/embeddings/__init__.py +9 -0
  177. parrot/embeddings/base.py +157 -0
  178. parrot/embeddings/google.py +98 -0
  179. parrot/embeddings/huggingface.py +74 -0
  180. parrot/embeddings/openai.py +84 -0
  181. parrot/embeddings/processor.py +88 -0
  182. parrot/exceptions.c +13868 -0
  183. parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
  184. parrot/exceptions.pxd +22 -0
  185. parrot/exceptions.pxi +15 -0
  186. parrot/exceptions.pyx +44 -0
  187. parrot/generators/__init__.py +29 -0
  188. parrot/generators/base.py +200 -0
  189. parrot/generators/html.py +293 -0
  190. parrot/generators/react.py +205 -0
  191. parrot/generators/streamlit.py +203 -0
  192. parrot/generators/template.py +105 -0
  193. parrot/handlers/__init__.py +4 -0
  194. parrot/handlers/agent.py +861 -0
  195. parrot/handlers/agents/__init__.py +1 -0
  196. parrot/handlers/agents/abstract.py +900 -0
  197. parrot/handlers/bots.py +338 -0
  198. parrot/handlers/chat.py +915 -0
  199. parrot/handlers/creation.sql +192 -0
  200. parrot/handlers/crew/ARCHITECTURE.md +362 -0
  201. parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
  202. parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
  203. parrot/handlers/crew/__init__.py +0 -0
  204. parrot/handlers/crew/handler.py +801 -0
  205. parrot/handlers/crew/models.py +229 -0
  206. parrot/handlers/crew/redis_persistence.py +523 -0
  207. parrot/handlers/jobs/__init__.py +10 -0
  208. parrot/handlers/jobs/job.py +384 -0
  209. parrot/handlers/jobs/mixin.py +627 -0
  210. parrot/handlers/jobs/models.py +115 -0
  211. parrot/handlers/jobs/worker.py +31 -0
  212. parrot/handlers/models.py +596 -0
  213. parrot/handlers/o365_auth.py +105 -0
  214. parrot/handlers/stream.py +337 -0
  215. parrot/interfaces/__init__.py +6 -0
  216. parrot/interfaces/aws.py +143 -0
  217. parrot/interfaces/credentials.py +113 -0
  218. parrot/interfaces/database.py +27 -0
  219. parrot/interfaces/google.py +1123 -0
  220. parrot/interfaces/hierarchy.py +1227 -0
  221. parrot/interfaces/http.py +651 -0
  222. parrot/interfaces/images/__init__.py +0 -0
  223. parrot/interfaces/images/plugins/__init__.py +24 -0
  224. parrot/interfaces/images/plugins/abstract.py +58 -0
  225. parrot/interfaces/images/plugins/analisys.py +148 -0
  226. parrot/interfaces/images/plugins/classify.py +150 -0
  227. parrot/interfaces/images/plugins/classifybase.py +182 -0
  228. parrot/interfaces/images/plugins/detect.py +150 -0
  229. parrot/interfaces/images/plugins/exif.py +1103 -0
  230. parrot/interfaces/images/plugins/hash.py +52 -0
  231. parrot/interfaces/images/plugins/vision.py +104 -0
  232. parrot/interfaces/images/plugins/yolo.py +66 -0
  233. parrot/interfaces/images/plugins/zerodetect.py +197 -0
  234. parrot/interfaces/o365.py +978 -0
  235. parrot/interfaces/onedrive.py +822 -0
  236. parrot/interfaces/sharepoint.py +1435 -0
  237. parrot/interfaces/soap.py +257 -0
  238. parrot/loaders/__init__.py +8 -0
  239. parrot/loaders/abstract.py +1131 -0
  240. parrot/loaders/audio.py +199 -0
  241. parrot/loaders/basepdf.py +53 -0
  242. parrot/loaders/basevideo.py +1568 -0
  243. parrot/loaders/csv.py +409 -0
  244. parrot/loaders/docx.py +116 -0
  245. parrot/loaders/epubloader.py +316 -0
  246. parrot/loaders/excel.py +199 -0
  247. parrot/loaders/factory.py +55 -0
  248. parrot/loaders/files/__init__.py +0 -0
  249. parrot/loaders/files/abstract.py +39 -0
  250. parrot/loaders/files/html.py +26 -0
  251. parrot/loaders/files/text.py +63 -0
  252. parrot/loaders/html.py +152 -0
  253. parrot/loaders/markdown.py +442 -0
  254. parrot/loaders/pdf.py +373 -0
  255. parrot/loaders/pdfmark.py +320 -0
  256. parrot/loaders/pdftables.py +506 -0
  257. parrot/loaders/ppt.py +476 -0
  258. parrot/loaders/qa.py +63 -0
  259. parrot/loaders/splitters/__init__.py +10 -0
  260. parrot/loaders/splitters/base.py +138 -0
  261. parrot/loaders/splitters/md.py +228 -0
  262. parrot/loaders/splitters/token.py +143 -0
  263. parrot/loaders/txt.py +26 -0
  264. parrot/loaders/video.py +89 -0
  265. parrot/loaders/videolocal.py +218 -0
  266. parrot/loaders/videounderstanding.py +377 -0
  267. parrot/loaders/vimeo.py +167 -0
  268. parrot/loaders/web.py +599 -0
  269. parrot/loaders/youtube.py +504 -0
  270. parrot/manager/__init__.py +5 -0
  271. parrot/manager/manager.py +1030 -0
  272. parrot/mcp/__init__.py +28 -0
  273. parrot/mcp/adapter.py +105 -0
  274. parrot/mcp/cli.py +174 -0
  275. parrot/mcp/client.py +119 -0
  276. parrot/mcp/config.py +75 -0
  277. parrot/mcp/integration.py +842 -0
  278. parrot/mcp/oauth.py +933 -0
  279. parrot/mcp/server.py +225 -0
  280. parrot/mcp/transports/__init__.py +3 -0
  281. parrot/mcp/transports/base.py +279 -0
  282. parrot/mcp/transports/grpc_session.py +163 -0
  283. parrot/mcp/transports/http.py +312 -0
  284. parrot/mcp/transports/mcp.proto +108 -0
  285. parrot/mcp/transports/quic.py +1082 -0
  286. parrot/mcp/transports/sse.py +330 -0
  287. parrot/mcp/transports/stdio.py +309 -0
  288. parrot/mcp/transports/unix.py +395 -0
  289. parrot/mcp/transports/websocket.py +547 -0
  290. parrot/memory/__init__.py +16 -0
  291. parrot/memory/abstract.py +209 -0
  292. parrot/memory/agent.py +32 -0
  293. parrot/memory/cache.py +175 -0
  294. parrot/memory/core.py +555 -0
  295. parrot/memory/file.py +153 -0
  296. parrot/memory/mem.py +131 -0
  297. parrot/memory/redis.py +613 -0
  298. parrot/models/__init__.py +46 -0
  299. parrot/models/basic.py +118 -0
  300. parrot/models/compliance.py +208 -0
  301. parrot/models/crew.py +395 -0
  302. parrot/models/detections.py +654 -0
  303. parrot/models/generation.py +85 -0
  304. parrot/models/google.py +223 -0
  305. parrot/models/groq.py +23 -0
  306. parrot/models/openai.py +30 -0
  307. parrot/models/outputs.py +285 -0
  308. parrot/models/responses.py +938 -0
  309. parrot/notifications/__init__.py +743 -0
  310. parrot/openapi/__init__.py +3 -0
  311. parrot/openapi/components.yaml +641 -0
  312. parrot/openapi/config.py +322 -0
  313. parrot/outputs/__init__.py +32 -0
  314. parrot/outputs/formats/__init__.py +108 -0
  315. parrot/outputs/formats/altair.py +359 -0
  316. parrot/outputs/formats/application.py +122 -0
  317. parrot/outputs/formats/base.py +351 -0
  318. parrot/outputs/formats/bokeh.py +356 -0
  319. parrot/outputs/formats/card.py +424 -0
  320. parrot/outputs/formats/chart.py +436 -0
  321. parrot/outputs/formats/d3.py +255 -0
  322. parrot/outputs/formats/echarts.py +310 -0
  323. parrot/outputs/formats/generators/__init__.py +0 -0
  324. parrot/outputs/formats/generators/abstract.py +61 -0
  325. parrot/outputs/formats/generators/panel.py +145 -0
  326. parrot/outputs/formats/generators/streamlit.py +86 -0
  327. parrot/outputs/formats/generators/terminal.py +63 -0
  328. parrot/outputs/formats/holoviews.py +310 -0
  329. parrot/outputs/formats/html.py +147 -0
  330. parrot/outputs/formats/jinja2.py +46 -0
  331. parrot/outputs/formats/json.py +87 -0
  332. parrot/outputs/formats/map.py +933 -0
  333. parrot/outputs/formats/markdown.py +172 -0
  334. parrot/outputs/formats/matplotlib.py +237 -0
  335. parrot/outputs/formats/mixins/__init__.py +0 -0
  336. parrot/outputs/formats/mixins/emaps.py +855 -0
  337. parrot/outputs/formats/plotly.py +341 -0
  338. parrot/outputs/formats/seaborn.py +310 -0
  339. parrot/outputs/formats/table.py +397 -0
  340. parrot/outputs/formats/template_report.py +138 -0
  341. parrot/outputs/formats/yaml.py +125 -0
  342. parrot/outputs/formatter.py +152 -0
  343. parrot/outputs/templates/__init__.py +95 -0
  344. parrot/pipelines/__init__.py +0 -0
  345. parrot/pipelines/abstract.py +210 -0
  346. parrot/pipelines/detector.py +124 -0
  347. parrot/pipelines/models.py +90 -0
  348. parrot/pipelines/planogram.py +3002 -0
  349. parrot/pipelines/table.sql +97 -0
  350. parrot/plugins/__init__.py +106 -0
  351. parrot/plugins/importer.py +80 -0
  352. parrot/py.typed +0 -0
  353. parrot/registry/__init__.py +18 -0
  354. parrot/registry/registry.py +594 -0
  355. parrot/scheduler/__init__.py +1189 -0
  356. parrot/scheduler/models.py +60 -0
  357. parrot/security/__init__.py +16 -0
  358. parrot/security/prompt_injection.py +268 -0
  359. parrot/security/security_events.sql +25 -0
  360. parrot/services/__init__.py +1 -0
  361. parrot/services/mcp/__init__.py +8 -0
  362. parrot/services/mcp/config.py +13 -0
  363. parrot/services/mcp/server.py +295 -0
  364. parrot/services/o365_remote_auth.py +235 -0
  365. parrot/stores/__init__.py +7 -0
  366. parrot/stores/abstract.py +352 -0
  367. parrot/stores/arango.py +1090 -0
  368. parrot/stores/bigquery.py +1377 -0
  369. parrot/stores/cache.py +106 -0
  370. parrot/stores/empty.py +10 -0
  371. parrot/stores/faiss_store.py +1157 -0
  372. parrot/stores/kb/__init__.py +9 -0
  373. parrot/stores/kb/abstract.py +68 -0
  374. parrot/stores/kb/cache.py +165 -0
  375. parrot/stores/kb/doc.py +325 -0
  376. parrot/stores/kb/hierarchy.py +346 -0
  377. parrot/stores/kb/local.py +457 -0
  378. parrot/stores/kb/prompt.py +28 -0
  379. parrot/stores/kb/redis.py +659 -0
  380. parrot/stores/kb/store.py +115 -0
  381. parrot/stores/kb/user.py +374 -0
  382. parrot/stores/models.py +59 -0
  383. parrot/stores/pgvector.py +3 -0
  384. parrot/stores/postgres.py +2853 -0
  385. parrot/stores/utils/__init__.py +0 -0
  386. parrot/stores/utils/chunking.py +197 -0
  387. parrot/telemetry/__init__.py +3 -0
  388. parrot/telemetry/mixin.py +111 -0
  389. parrot/template/__init__.py +3 -0
  390. parrot/template/engine.py +259 -0
  391. parrot/tools/__init__.py +23 -0
  392. parrot/tools/abstract.py +644 -0
  393. parrot/tools/agent.py +363 -0
  394. parrot/tools/arangodbsearch.py +537 -0
  395. parrot/tools/arxiv_tool.py +188 -0
  396. parrot/tools/calculator/__init__.py +3 -0
  397. parrot/tools/calculator/operations/__init__.py +38 -0
  398. parrot/tools/calculator/operations/calculus.py +80 -0
  399. parrot/tools/calculator/operations/statistics.py +76 -0
  400. parrot/tools/calculator/tool.py +150 -0
  401. parrot/tools/cloudwatch.py +988 -0
  402. parrot/tools/codeinterpreter/__init__.py +127 -0
  403. parrot/tools/codeinterpreter/executor.py +371 -0
  404. parrot/tools/codeinterpreter/internals.py +473 -0
  405. parrot/tools/codeinterpreter/models.py +643 -0
  406. parrot/tools/codeinterpreter/prompts.py +224 -0
  407. parrot/tools/codeinterpreter/tool.py +664 -0
  408. parrot/tools/company_info/__init__.py +6 -0
  409. parrot/tools/company_info/tool.py +1138 -0
  410. parrot/tools/correlationanalysis.py +437 -0
  411. parrot/tools/database/abstract.py +286 -0
  412. parrot/tools/database/bq.py +115 -0
  413. parrot/tools/database/cache.py +284 -0
  414. parrot/tools/database/models.py +95 -0
  415. parrot/tools/database/pg.py +343 -0
  416. parrot/tools/databasequery.py +1159 -0
  417. parrot/tools/db.py +1800 -0
  418. parrot/tools/ddgo.py +370 -0
  419. parrot/tools/decorators.py +271 -0
  420. parrot/tools/dftohtml.py +282 -0
  421. parrot/tools/document.py +549 -0
  422. parrot/tools/ecs.py +819 -0
  423. parrot/tools/edareport.py +368 -0
  424. parrot/tools/elasticsearch.py +1049 -0
  425. parrot/tools/employees.py +462 -0
  426. parrot/tools/epson/__init__.py +96 -0
  427. parrot/tools/excel.py +683 -0
  428. parrot/tools/file/__init__.py +13 -0
  429. parrot/tools/file/abstract.py +76 -0
  430. parrot/tools/file/gcs.py +378 -0
  431. parrot/tools/file/local.py +284 -0
  432. parrot/tools/file/s3.py +511 -0
  433. parrot/tools/file/tmp.py +309 -0
  434. parrot/tools/file/tool.py +501 -0
  435. parrot/tools/file_reader.py +129 -0
  436. parrot/tools/flowtask/__init__.py +19 -0
  437. parrot/tools/flowtask/tool.py +761 -0
  438. parrot/tools/gittoolkit.py +508 -0
  439. parrot/tools/google/__init__.py +18 -0
  440. parrot/tools/google/base.py +169 -0
  441. parrot/tools/google/tools.py +1251 -0
  442. parrot/tools/googlelocation.py +5 -0
  443. parrot/tools/googleroutes.py +5 -0
  444. parrot/tools/googlesearch.py +5 -0
  445. parrot/tools/googlesitesearch.py +5 -0
  446. parrot/tools/googlevoice.py +2 -0
  447. parrot/tools/gvoice.py +695 -0
  448. parrot/tools/ibisworld/README.md +225 -0
  449. parrot/tools/ibisworld/__init__.py +11 -0
  450. parrot/tools/ibisworld/tool.py +366 -0
  451. parrot/tools/jiratoolkit.py +1718 -0
  452. parrot/tools/manager.py +1098 -0
  453. parrot/tools/math.py +152 -0
  454. parrot/tools/metadata.py +476 -0
  455. parrot/tools/msteams.py +1621 -0
  456. parrot/tools/msword.py +635 -0
  457. parrot/tools/multidb.py +580 -0
  458. parrot/tools/multistoresearch.py +369 -0
  459. parrot/tools/networkninja.py +167 -0
  460. parrot/tools/nextstop/__init__.py +4 -0
  461. parrot/tools/nextstop/base.py +286 -0
  462. parrot/tools/nextstop/employee.py +733 -0
  463. parrot/tools/nextstop/store.py +462 -0
  464. parrot/tools/notification.py +435 -0
  465. parrot/tools/o365/__init__.py +42 -0
  466. parrot/tools/o365/base.py +295 -0
  467. parrot/tools/o365/bundle.py +522 -0
  468. parrot/tools/o365/events.py +554 -0
  469. parrot/tools/o365/mail.py +992 -0
  470. parrot/tools/o365/onedrive.py +497 -0
  471. parrot/tools/o365/sharepoint.py +641 -0
  472. parrot/tools/openapi_toolkit.py +904 -0
  473. parrot/tools/openweather.py +527 -0
  474. parrot/tools/pdfprint.py +1001 -0
  475. parrot/tools/powerbi.py +518 -0
  476. parrot/tools/powerpoint.py +1113 -0
  477. parrot/tools/pricestool.py +146 -0
  478. parrot/tools/products/__init__.py +246 -0
  479. parrot/tools/prophet_tool.py +171 -0
  480. parrot/tools/pythonpandas.py +630 -0
  481. parrot/tools/pythonrepl.py +910 -0
  482. parrot/tools/qsource.py +436 -0
  483. parrot/tools/querytoolkit.py +395 -0
  484. parrot/tools/quickeda.py +827 -0
  485. parrot/tools/resttool.py +553 -0
  486. parrot/tools/retail/__init__.py +0 -0
  487. parrot/tools/retail/bby.py +528 -0
  488. parrot/tools/sandboxtool.py +703 -0
  489. parrot/tools/sassie/__init__.py +352 -0
  490. parrot/tools/scraping/__init__.py +7 -0
  491. parrot/tools/scraping/docs/select.md +466 -0
  492. parrot/tools/scraping/documentation.md +1278 -0
  493. parrot/tools/scraping/driver.py +436 -0
  494. parrot/tools/scraping/models.py +576 -0
  495. parrot/tools/scraping/options.py +85 -0
  496. parrot/tools/scraping/orchestrator.py +517 -0
  497. parrot/tools/scraping/readme.md +740 -0
  498. parrot/tools/scraping/tool.py +3115 -0
  499. parrot/tools/seasonaldetection.py +642 -0
  500. parrot/tools/shell_tool/__init__.py +5 -0
  501. parrot/tools/shell_tool/actions.py +408 -0
  502. parrot/tools/shell_tool/engine.py +155 -0
  503. parrot/tools/shell_tool/models.py +322 -0
  504. parrot/tools/shell_tool/tool.py +442 -0
  505. parrot/tools/site_search.py +214 -0
  506. parrot/tools/textfile.py +418 -0
  507. parrot/tools/think.py +378 -0
  508. parrot/tools/toolkit.py +298 -0
  509. parrot/tools/webapp_tool.py +187 -0
  510. parrot/tools/whatif.py +1279 -0
  511. parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
  512. parrot/tools/workday/__init__.py +6 -0
  513. parrot/tools/workday/models.py +1389 -0
  514. parrot/tools/workday/tool.py +1293 -0
  515. parrot/tools/yfinance_tool.py +306 -0
  516. parrot/tools/zipcode.py +217 -0
  517. parrot/utils/__init__.py +2 -0
  518. parrot/utils/helpers.py +73 -0
  519. parrot/utils/parsers/__init__.py +5 -0
  520. parrot/utils/parsers/toml.c +12078 -0
  521. parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
  522. parrot/utils/parsers/toml.pyx +21 -0
  523. parrot/utils/toml.py +11 -0
  524. parrot/utils/types.cpp +20936 -0
  525. parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
  526. parrot/utils/types.pyx +213 -0
  527. parrot/utils/uv.py +11 -0
  528. parrot/version.py +10 -0
  529. parrot/yaml-rs/Cargo.lock +350 -0
  530. parrot/yaml-rs/Cargo.toml +19 -0
  531. parrot/yaml-rs/pyproject.toml +19 -0
  532. parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
  533. parrot/yaml-rs/src/lib.rs +222 -0
  534. requirements/docker-compose.yml +24 -0
  535. requirements/requirements-dev.txt +21 -0
parrot/loaders/pdf.py ADDED
@@ -0,0 +1,373 @@
1
+ from collections.abc import Callable
2
+ from typing import List, Optional, Union
3
+ import re
4
+ from pathlib import Path, PurePath
5
+ import fitz
6
+ import pymupdf4llm
7
+ from ..stores.models import Document
8
+ from .abstract import AbstractLoader
9
+
10
+ class PDFLoader(AbstractLoader):
11
+ """
12
+ Advanced PDF Loader using PyMuPDF (fitz).
13
+ - Skips image-only pages.
14
+ - Combines title-only pages with next content page.
15
+ - Preserves tables as text for chatbot/RAG KB usage.
16
+ - Returns a Parrot Document per logical page.
17
+ - Supports chapter-based splitting for markdown output.
18
+ """
19
+
20
+ extensions: List[str] = {'.pdf'}
21
+
22
def __init__(
    self,
    source: Optional[Union[str, Path, List[Union[str, Path]]]] = None,
    *,
    tokenizer: Union[str, Callable] = None,
    text_splitter: Union[str, Callable] = None,
    source_type: str = 'file',
    as_markdown: bool = False,
    use_chapters: bool = False,
    use_pages: bool = False,
    **kwargs
):
    """
    Configure the PDF loader.

    Args:
        source: Path(s) of the PDF input, forwarded to the parent loader.
        tokenizer: Tokenizer name or callable, forwarded to the parent loader.
        text_splitter: Splitter name or callable, forwarded to the parent loader.
        source_type: Kind of source, forwarded to the parent and kept locally.
        as_markdown: Stored flag selecting the markdown conversion path.
        use_chapters: Stored flag; split markdown output into chapters.
        use_pages: Stored flag; split markdown output into pages.
        **kwargs: Any extra options, passed through to the parent loader.
    """
    # Options the parent loader receives by keyword.
    loader_options = dict(
        tokenizer=tokenizer,
        text_splitter=text_splitter,
        source_type=source_type,
    )
    super().__init__(source, **loader_options, **kwargs)

    # Loader identity.
    self.doctype = 'pdf'
    self._source_type = source_type

    # Output-shaping switches.
    self.as_markdown = as_markdown
    self.use_chapters = use_chapters
    self.use_pages = use_pages
46
+
47
def is_title_only(self, text: str, min_len: int = 5, max_len: int = 50) -> bool:
    """
    Tell whether ``text`` is a single short line, i.e. looks like a bare title.

    Only the number of non-blank lines and the length of that one line are
    inspected.

    Args:
        text: Page text to classify.
        min_len: Smallest accepted line length (inclusive).
        max_len: Largest accepted line length (inclusive).

    Returns:
        True when exactly one non-blank line exists and its length lies
        within ``[min_len, max_len]``; False otherwise.
    """
    non_blank = [row for row in text.strip().split('\n') if row.strip()]
    if len(non_blank) != 1:
        return False
    return min_len <= len(non_blank[0]) <= max_len
53
+
54
def is_image_only(self, page: fitz.Page) -> bool:
    """
    Tell whether a page carries images but no extractable text.

    Args:
        page: Page object exposing ``get_text`` and ``get_images``.

    Returns:
        False as soon as any non-whitespace text is found; otherwise True
        only if the page lists at least one image.
    """
    visible_text = page.get_text("text").strip()
    # Images are only looked up when the page has no text at all.
    return False if visible_text else len(page.get_images(full=True)) > 0
62
+
63
def is_table_like(self, text: str) -> bool:
    """
    Naive table detector based on per-line column hints.

    A non-blank line counts as tabular when it contains a pipe, a tab, or
    more than three whitespace-separated tokens.

    Args:
        text: Text to inspect.

    Returns:
        True when there are more than two non-blank lines and the tabular
        ones outnumber half of them (integer division); False otherwise.
    """
    rows = [row for row in text.split('\n') if row.strip()]
    # Two lines or fewer can never qualify as a table.
    if len(rows) <= 2:
        return False

    tabular = 0
    for row in rows:
        if '|' in row or '\t' in row or len(row.split()) > 3:
            tabular += 1
    return tabular > len(rows) // 2
70
+
71
def extract_table(self, page: fitz.Page) -> Optional[str]:
    """
    Render a page's plain text as a best-effort markdown table.

    Each non-blank line is handled independently:
    - tab-separated lines become ``| a | b |`` rows;
    - lines already containing ``|`` are kept as they are;
    - lines with columns separated by runs of two or more spaces become
      rows when they yield more than two cells;
    - anything else is kept as plain text.

    When the first two output lines hold the same number of pipes and
    describe at least two columns, a markdown header separator is inserted
    after the first line.

    Args:
        page: Page object exposing ``get_text("text")``.

    Returns:
        The reformatted text joined with newlines, or None when the page
        has no non-blank text. Note that non-tabular text is returned too,
        unchanged.
    """
    raw_text = page.get_text("text")
    lines = [ln.strip() for ln in raw_text.split('\n') if ln.strip()]
    if not lines:
        return None

    table_lines = []
    for line in lines:
        if '\t' in line:
            cells = [c.strip() for c in line.split('\t')]
            table_lines.append("| " + " | ".join(cells) + " |")
        elif '|' in line:
            table_lines.append(line)
        else:
            # Columns are separated by runs of spaces; single spaces are
            # ordinary word gaps and must not split a sentence into cells.
            cells = [c.strip() for c in re.split(r' {2,}', line) if c.strip()]
            if len(cells) > 2:
                table_lines.append("| " + " | ".join(cells) + " |")
            else:
                table_lines.append(line)

    # Insert a header separator when the first two rows line up.
    if len(table_lines) > 1 and table_lines[0].count('|') == table_lines[1].count('|'):
        ncols = table_lines[0].count('|') - 1
        if ncols > 1:
            header_sep = "| " + " | ".join(['---'] * ncols) + " |"
            table_lines.insert(1, header_sep)
    return "\n".join(table_lines)
103
+
104
def extract_chapters_from_markdown(self, md_text: str) -> List[dict]:
    """
    Split markdown text into chapters.

    The text is cut at horizontal rules (a line of five or more dashes).
    For each non-empty section the title is taken from, in order:
    a leading ``**bold**`` run, a leading ``#``-style header, or the first
    line of the section (cut to 50 characters plus ``...`` only when it is
    actually longer). In the last case the content keeps the whole section,
    first line included.

    Sections whose content is shorter than 10 characters are logged and
    skipped.

    Args:
        md_text: Markdown text to split.

    Returns:
        List of dicts with 'title', 'content' and 'chapter_number' keys;
        ``chapter_number`` counts only the chapters that were kept,
        starting at 1.
    """
    chapters = []

    for section in re.split(r'\n-----+\n', md_text):
        section = section.strip()
        if not section:
            continue

        bold_match = re.match(r'^\*\*([^*]+)\*\*', section)
        header_match = None
        if bold_match is None:
            header_match = re.match(r'^(#{1,6})\s*(.+?)$', section, re.MULTILINE)

        if bold_match:
            title = bold_match.group(1).strip()
            # Content is whatever follows the bold title
            content = re.sub(r'^\*\*[^*]+\*\*\s*', '', section, count=1).strip()
        elif header_match:
            title = header_match.group(2).strip()
            # Content is whatever follows the header line
            content = re.sub(
                r'^#{1,6}\s*.+?$', '', section, count=1, flags=re.MULTILINE
            ).strip()
        else:
            # No clear title: fall back to the first line. The section is
            # stripped and non-empty, so that line is never blank.
            first_line = section.split('\n', 1)[0].strip()
            if len(first_line) > 50:
                title = first_line[:50] + "..."
            else:
                title = first_line
            content = section

        # Skip if content is too short (less than 10 characters)
        if len(content.strip()) < 10:
            self.logger.info(f"Skipping chapter '{title}' - content too short")
            continue

        chapters.append({
            'title': title,
            'content': content,
            'chapter_number': len(chapters) + 1
        })

    return chapters
156
+
157
def extract_pages_from_markdown(self, md_text: str) -> List[dict]:
    """
    Split markdown text into pages.

    The text is cut at horizontal rules (a line of five or more dashes).
    Sections shorter than 10 characters are dropped. Within the first
    three lines of a section, the first line that is entirely a
    ``**bold**`` run or a ``#``-style header supplies the title, and the
    content starts on the line after it (any lines before the title are
    not part of the content). Without such a line the title is
    ``Page <n>`` and the content is the whole section. Pages whose content
    is shorter than 10 characters are dropped.

    Args:
        md_text: Markdown text to split.

    Returns:
        List of dicts with 'title', 'content' and 'page_number' keys;
        ``page_number`` is the 1-based position of the section among all
        rule-separated sections, including the dropped ones.
    """
    pages = []

    sections = re.split(r'\n-----+\n', md_text)
    for page_number, section in enumerate(sections, start=1):
        section = section.strip()
        if len(section) < 10:
            continue

        lines = section.split('\n')
        title = None
        content_start = 0

        # Look for a bold title or header among the first 3 lines
        for j, line in enumerate(lines[:3]):
            candidate = line.strip()
            found = (
                re.match(r'^\*\*([^*]+)\*\*$', candidate)
                or re.match(r'^#{1,6}\s*(.+?)$', candidate)
            )
            if found:
                # Strip so titles are normalized like chapter titles
                title = found.group(1).strip()
                content_start = j + 1
                break

        if not title:
            title = f"Page {page_number}"

        # Get content after title
        content = '\n'.join(lines[content_start:]).strip()
        if len(content) < 10:
            continue

        pages.append({
            'title': title,
            'content': content,
            'page_number': page_number
        })

    return pages
212
+
213
async def _load(self, path: PurePath, **kwargs) -> List[Document]:
    """
    Load a single PDF file and return its content as Documents.

    With ``self.as_markdown`` set, the file is converted with pymupdf4llm
    and emitted as one Document per chapter (``self.use_chapters``), one
    per page section (``self.use_pages``), or as a single Document.
    Otherwise text is extracted page by page: image-only pages are
    skipped, a title-only page is held back and prepended to the next
    emitted page, and table-like pages use the extracted table when one
    is available.

    A final "SUMMARY" Document is appended when ``summary_from_text``
    returns a non-empty value.

    Args:
        path: Location of the PDF file.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        The list of Documents built from the file.
    """
    self.logger.info(f"Loading PDF file: {path}")
    docs = []
    all_text = []  # page texts handed to the summarizer
    # NOTE(review): only the plain-text branch appends to ``all_text``; in
    # markdown mode the summarizer receives an empty string. Confirm
    # whether that is intended.

    # The context manager guarantees the document is closed even when a
    # conversion or extraction step raises.
    with fitz.open(str(path)) as doc:
        if self.as_markdown:
            # Pass a plain string path, as the sibling markdown loader does.
            md_text = pymupdf4llm.to_markdown(str(path))
            if self.use_chapters:
                # Split by chapters
                chapters = self.extract_chapters_from_markdown(md_text)
                self.logger.info(f"Found {len(chapters)} chapters")
                for chapter in chapters:
                    document_meta = {
                        "filename": path.name,
                        "file_path": str(path),
                        "chapter_title": chapter['title'],
                        "chapter_number": chapter['chapter_number'],
                        "content_type": "chapter"
                    }
                    meta = self.create_metadata(
                        path=path,
                        doctype="pdf",
                        source_type="pdf_chapter",
                        doc_metadata=document_meta,
                    )
                    # Combine title and content
                    full_content = f"# {chapter['title']}\n\n{chapter['content']}"
                    docs.append(
                        self.create_document(
                            content=full_content,
                            path=path,
                            metadata=meta
                        )
                    )
            elif self.use_pages:
                # Split by pages
                pages = self.extract_pages_from_markdown(md_text)
                self.logger.info(f"Found {len(pages)} pages")
                for page in pages:
                    document_meta = {
                        "filename": path.name,
                        "file_path": str(path),
                        "page_title": page['title'],
                        "page_number": page['page_number'],
                        "content_type": "page"
                    }
                    meta = self.create_metadata(
                        path=path,
                        doctype="pdf",
                        source_type="pdf_page",
                        doc_metadata=document_meta,
                    )
                    # Combine title and content
                    full_content = f"## {page['title']}\n\n{page['content']}"
                    docs.append(
                        self.create_document(
                            content=full_content,
                            path=path,
                            metadata=meta
                        )
                    )
            else:
                # Return whole markdown as single document
                document_meta = {
                    "filename": path.name,
                    "file_path": str(path),
                    "content_type": "full_document"
                }
                meta = self.create_metadata(
                    path=path,
                    doctype="pdf",
                    source_type="pdf_markdown",
                    doc_metadata=document_meta,
                )
                docs.append(
                    self.create_document(
                        content=md_text,
                        path=path,
                        metadata=meta
                    )
                )
        else:
            # Use the default page-based text extraction
            pending_title = None
            for i, page in enumerate(doc):
                page_text = page.get_text("text").strip()
                if self.is_image_only(page):
                    self.logger.info(f"Page {i+1}: image-only, skipping.")
                    continue

                # Title-only page: store to prepend to next content
                if self.is_title_only(page_text):
                    self.logger.info(f"Page {i+1}: title-only, saving for next page.")
                    pending_title = page_text
                    continue

                # Table page: prefer the extracted table, fall back to
                # the raw page text when no table could be extracted.
                body = page_text
                if self.is_table_like(page_text):
                    table_md = self.extract_table(page)
                    if table_md:
                        body = table_md
                content = (pending_title + '\n\n' if pending_title else '') + body
                pending_title = None

                document_meta = {
                    "filename": path.name,
                    "file_path": str(path),
                    "page_number": i + 1,
                }
                meta = self.create_metadata(
                    path=path,
                    doctype="pdf",
                    source_type="pdf",
                    doc_metadata=document_meta,
                )
                if len(content) < 10:
                    self.logger.warning(
                        f"Page {i+1} content too short, skipping."
                    )
                    continue
                docs.append(
                    self.create_document(
                        content=content,
                        path=path,
                        metadata=meta
                    )
                )
                all_text.append(content)

    # --- Summarization step ---
    full_text = "\n\n".join(all_text)
    summary = await self.summary_from_text(full_text)
    if summary:
        summary_meta = self.create_metadata(
            path=path,
            doctype=self.doctype,
            source_type=self._source_type,
            doc_metadata={
                "summary_for_pages": len(docs),
            }
        )
        docs.append(
            self.create_document(
                content=f"SUMMARY:\n\n{summary}",
                path=path,
                metadata=summary_meta
            )
        )
    return docs
@@ -0,0 +1,320 @@
1
+ from typing import Any, Union, List
2
+ import logging
3
+ from collections.abc import Callable
4
+ from pathlib import PurePath
5
+ import fitz
6
+ from ..stores.models import Document
7
+ from .basepdf import BasePDF
8
+ # Option 1: Use MarkItDown (Microsoft's universal document converter)
9
+ try:
10
+ from markitdown import MarkItDown
11
+ MARKITDOWN_AVAILABLE = True
12
+ except ImportError:
13
+ MARKITDOWN_AVAILABLE = False
14
+
15
+ # Option 2: Use pymupdf4llm (updated PyMuPDF library)
16
+ try:
17
+ import pymupdf4llm
18
+ PYMUPDF4LLM_AVAILABLE = True
19
+ except ImportError:
20
+ PYMUPDF4LLM_AVAILABLE = False
21
+
22
+
23
# Set pdfminer's logger to INFO. ``Logger.setLevel`` returns None, so the
# logger object is bound first and configured on a separate line; chaining
# the two calls would leave ``logger`` set to None.
logger = logging.getLogger('pdfminer')
logger.setLevel(logging.INFO)
24
+
25
class PDFMarkdownLoader(BasePDF):
    """
    Loader for PDF files whose content is converted to markdown.

    This loader supports multiple backends for PDF to markdown conversion:
    1. MarkItDown (Microsoft's universal document converter)
    2. pymupdf4llm (PyMuPDF's markdown converter)
    3. Fallback manual conversion using PyMuPDF

    The converted markdown is split into chunks and returned as Documents,
    preceded by a summary Document when a summary could be generated.
    """

    # NOTE(review): the value is a set although the annotation says
    # List[str]; left unchanged because the base class's expectations are
    # not visible here.
    extensions: List[str] = {'.pdf'}

    # PyMuPDF span "flags" bits used by the manual converter.
    _FLAG_ITALIC = 2 ** 1
    _FLAG_BOLD = 2 ** 4

    def __init__(
        self,
        source: Union[str, PurePath, List[PurePath]],
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'pdf',
        language: str = "eng",
        markdown_backend: str = "auto",  # "markitdown", "pymupdf4llm", "manual", "auto"
        chunk_size: int = 1024,
        chunk_overlap: int = 10,
        preserve_tables: bool = True,
        extract_images: bool = False,
        **kwargs
    ):
        """
        Configure the loader and pick the conversion backend.

        Args:
            source: File or files to load; forwarded to the base class.
            tokenizer: Forwarded to the base class.
            text_splitter: Forwarded to the base class.
            source_type: Forwarded to the base class.
            language: Stored on the instance as ``_language``.
            markdown_backend: Preferred backend; "auto" picks the first
                available of markitdown, pymupdf4llm, manual.
            chunk_size: Chunk size passed to the markdown splitter.
            chunk_overlap: Chunk overlap passed to the markdown splitter.
            preserve_tables: When True, the manual backend also emits
                tables found on each page.
            extract_images: Stored on the instance; not read in this class.
            **kwargs: Forwarded to the base class.
        """
        super().__init__(
            source=source,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            **kwargs
        )
        self._language = language
        self.markdown_backend = self._select_backend(markdown_backend)
        self.preserve_tables = preserve_tables
        self.extract_images = extract_images

        # Initialize markdown splitter
        self._splitter = self._get_markdown_splitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

        # Initialize conversion backend
        self._setup_conversion_backend()

    def _select_backend(self, preferred: str) -> str:
        """
        Select the best available backend for PDF to markdown conversion.

        Args:
            preferred: "auto", "markitdown", "pymupdf4llm" or "manual".

        Returns:
            The preferred backend when it is available; otherwise the
            first available one in the order markitdown, pymupdf4llm,
            manual (a warning is logged in that case).
        """
        available = {
            "markitdown": MARKITDOWN_AVAILABLE,
            "pymupdf4llm": PYMUPDF4LLM_AVAILABLE,
            "manual": True,  # only needs PyMuPDF, which is a hard import
        }
        if preferred == "auto":
            for name in ("markitdown", "pymupdf4llm", "manual"):
                if available[name]:
                    return name
        if available.get(preferred):
            return preferred
        # Unknown or uninstalled backend: fall back to whatever is available
        self.logger.warning(f"Preferred backend '{preferred}' not available, using fallback")
        return self._select_backend("auto")

    def _setup_conversion_backend(self):
        """Initialize the selected conversion backend and log the choice."""
        if self.markdown_backend == "markitdown":
            self.md_converter = MarkItDown()
            self.logger.info("Using MarkItDown backend for PDF to markdown conversion")
        elif self.markdown_backend == "pymupdf4llm":
            self.logger.info("Using pymupdf4llm backend for PDF to markdown conversion")
        else:
            self.logger.info("Using manual PyMuPDF backend for PDF to markdown conversion")

    def _convert_to_markdown_markitdown(self, path: Union[str, PurePath]) -> str:
        """
        Convert PDF to markdown using MarkItDown.

        Falls back to the manual converter when MarkItDown raises.
        """
        try:
            result = self.md_converter.convert(str(path))
            # Guard against a missing result or a None text payload so
            # callers can always treat the return value as a string.
            return (result.text_content or "") if result else ""
        except Exception as e:
            self.logger.error(f"MarkItDown conversion failed: {e}")
            return self._convert_to_markdown_manual(path)

    def _convert_to_markdown_pymupdf4llm(self, path: Union[str, PurePath]) -> str:
        """
        Convert PDF to markdown using pymupdf4llm.

        Falls back to the manual converter when pymupdf4llm raises.
        """
        try:
            return pymupdf4llm.to_markdown(str(path))
        except Exception as e:
            self.logger.error(f"pymupdf4llm conversion failed: {e}")
            return self._convert_to_markdown_manual(path)

    def _span_to_markdown(self, span: dict) -> str:
        """Render one PyMuPDF text span with basic markdown markup."""
        text = span["text"]
        font_size = span.get("size", 12)
        flags = span.get("flags", 0)

        # Basic formatting based on font properties
        if font_size > 16:
            text = f"# {text}"
        elif font_size > 14:
            text = f"## {text}"
        elif font_size > 12:
            text = f"### {text}"

        if flags & self._FLAG_BOLD:
            text = f"**{text}**"
        if flags & self._FLAG_ITALIC:
            text = f"*{text}*"
        return text

    def _page_tables_as_markdown(self, page) -> List[str]:
        """Return the markdown rendering of every table found on ``page``."""
        rendered = []
        try:
            tables = page.find_tables()
        except Exception as e:
            # A failing table finder must not discard the page text.
            self.logger.debug(f"Failed to extract table: {e}")
            return rendered
        for table in tables:
            try:
                table_data = table.extract()
                if table_data:
                    markdown_table = self._format_table_as_markdown(table_data)
                    if markdown_table:
                        rendered.append(markdown_table)
            except Exception as e:
                self.logger.debug(f"Failed to extract table: {e}")
        return rendered

    def _convert_to_markdown_manual(self, path: Union[str, PurePath]) -> str:
        """
        Fallback manual conversion using PyMuPDF with basic markdown formatting.

        Text blocks are emitted in page order, one paragraph per block;
        when ``self.preserve_tables`` is set, the tables found on a page
        are appended after that page's text.

        Returns:
            The markdown text, or "" when the conversion fails (the error
            is logged).
        """
        try:
            markdown_text = []
            # The context manager closes the document even on errors.
            with fitz.open(str(path)) as doc:
                # Iterating a document yields its pages directly.
                for page in doc:
                    # Extract text blocks with formatting
                    for block in page.get_text("dict")["blocks"]:
                        if "lines" not in block:
                            continue  # block carries no text lines
                        block_text = []
                        for line in block["lines"]:
                            line_text = "".join(
                                self._span_to_markdown(span)
                                for span in line["spans"]
                            )
                            if line_text.strip():
                                block_text.append(line_text)
                        if block_text:
                            markdown_text.append("\n".join(block_text))

                    # Extract tables if requested
                    if self.preserve_tables:
                        markdown_text.extend(self._page_tables_as_markdown(page))

            return "\n\n".join(markdown_text)

        except Exception as e:
            self.logger.error(f"Manual PDF conversion failed: {e}")
            return ""

    def _format_table_as_markdown(self, table_data: List[List[str]]) -> str:
        """
        Convert table data to markdown format.

        The first row is used as the header. ``None`` cells become empty
        strings; newlines inside a cell are replaced by spaces and ``|``
        is escaped so that every table row stays on one markdown line.

        Args:
            table_data: Rows of cells, header row first.

        Returns:
            The markdown table, or "" when ``table_data`` is empty.
        """
        if not table_data:
            return ""

        def render_row(cells) -> str:
            cleaned = (
                "" if cell is None
                else str(cell).replace("\n", " ").replace("|", "\\|")
                for cell in cells
            )
            return f"| {' | '.join(cleaned)} |"

        header = table_data[0]
        markdown_rows = [
            render_row(header),
            f"| {' | '.join('---' for _ in header)} |",
        ]
        markdown_rows.extend(render_row(row) for row in table_data[1:])
        return "\n".join(markdown_rows)

    async def _load(self, path: Union[str, PurePath, List[PurePath]], **kwargs) -> List[Document]:
        """
        Load a PDF file and convert to markdown format.

        Args:
            path (Union[str, PurePath, List[PurePath]]): The path to the PDF file.

        Returns:
            List[Document]: A list of AI-Parrot Documents: an optional
            summary Document followed by one Document per markdown chunk.
            Empty when no markdown could be extracted.
        """
        self.logger.info(f"Loading PDF file: {path}")
        docs = []

        # Convert to markdown using selected backend
        if self.markdown_backend == "markitdown":
            md_text = self._convert_to_markdown_markitdown(path)
        elif self.markdown_backend == "pymupdf4llm":
            md_text = self._convert_to_markdown_pymupdf4llm(path)
        else:
            md_text = self._convert_to_markdown_manual(path)

        if not md_text or not md_text.strip():
            self.logger.warning(f"No markdown content extracted from {path}")
            return docs

        # Extract PDF metadata
        try:
            with fitz.open(str(path)) as pdf:
                # Fall back to {} so the .get() calls below are always safe.
                pdf_metadata = pdf.metadata or {}  # pylint: disable=E1101 # noqa: E1101
        except Exception as e:
            self.logger.warning(
                f"Could not extract PDF metadata: {e}"
            )
            pdf_metadata = {}

        # Generate summary if enabled
        try:
            summary = await self.summary_from_text(md_text)
        except Exception as e:
            self.logger.warning(
                f"Summary generation failed: {e}"
            )
            summary = ''

        # ``path`` may be a plain string, which has no ``name`` attribute.
        has_name = hasattr(path, 'name')

        # Create base metadata
        base_metadata = {
            "url": '',
            "filename": path.name if has_name else str(path).rsplit('/', maxsplit=1)[-1],
            "source": str(path.name if has_name else path),
            "type": 'pdf',
            "data": {},
            "category": self.category,
            "source_type": self._source_type,
            "conversion_backend": self.markdown_backend,
            "document_meta": {
                "title": pdf_metadata.get("title", ""),
                "creationDate": pdf_metadata.get("creationDate", ""),
                "author": pdf_metadata.get("author", ""),
            }
        }

        # Add summary document if available
        if summary:
            docs.append(
                Document(
                    page_content=summary,
                    metadata={
                        **base_metadata,
                        "content_type": "summary"
                    }
                )
            )

        # Split markdown content into chunks
        try:
            chunks = self._splitter.split_text(md_text)
            self.logger.info(f"Split document into {len(chunks)} chunks")
        except Exception as e:
            self.logger.error(
                f"Failed to split text: {e}"
            )
            # Fallback: use the entire text as one chunk
            chunks = [md_text]

        # Create documents for each chunk
        total_chunks = len(chunks)
        for chunk_index, chunk in enumerate(chunks):
            docs.append(
                Document(
                    page_content=chunk,
                    metadata={
                        **base_metadata,
                        "content_type": "chunk",
                        "chunk_index": chunk_index,
                        "total_chunks": total_chunks
                    }
                )
            )

        return docs

    def get_supported_backends(self) -> List[str]:
        """Get list of available conversion backends ("manual" is always included)."""
        backends = ["manual"]  # Always available

        if MARKITDOWN_AVAILABLE:
            backends.append("markitdown")
        if PYMUPDF4LLM_AVAILABLE:
            backends.append("pymupdf4llm")

        return backends