ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentui/.prettierrc +15 -0
- agentui/QUICKSTART.md +272 -0
- agentui/README.md +59 -0
- agentui/env.example +16 -0
- agentui/jsconfig.json +14 -0
- agentui/package-lock.json +4242 -0
- agentui/package.json +34 -0
- agentui/scripts/postinstall/apply-patches.mjs +260 -0
- agentui/src/app.css +61 -0
- agentui/src/app.d.ts +13 -0
- agentui/src/app.html +12 -0
- agentui/src/components/LoadingSpinner.svelte +64 -0
- agentui/src/components/ThemeSwitcher.svelte +159 -0
- agentui/src/components/index.js +4 -0
- agentui/src/lib/api/bots.ts +60 -0
- agentui/src/lib/api/chat.ts +22 -0
- agentui/src/lib/api/http.ts +25 -0
- agentui/src/lib/components/BotCard.svelte +33 -0
- agentui/src/lib/components/ChatBubble.svelte +63 -0
- agentui/src/lib/components/Toast.svelte +21 -0
- agentui/src/lib/config.ts +20 -0
- agentui/src/lib/stores/auth.svelte.ts +73 -0
- agentui/src/lib/stores/theme.svelte.js +64 -0
- agentui/src/lib/stores/toast.svelte.ts +31 -0
- agentui/src/lib/utils/conversation.ts +39 -0
- agentui/src/routes/+layout.svelte +20 -0
- agentui/src/routes/+page.svelte +232 -0
- agentui/src/routes/login/+page.svelte +200 -0
- agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
- agentui/src/routes/talk/[agentId]/+page.ts +7 -0
- agentui/static/README.md +1 -0
- agentui/svelte.config.js +11 -0
- agentui/tailwind.config.ts +53 -0
- agentui/tsconfig.json +3 -0
- agentui/vite.config.ts +10 -0
- ai_parrot-0.17.2.dist-info/METADATA +472 -0
- ai_parrot-0.17.2.dist-info/RECORD +535 -0
- ai_parrot-0.17.2.dist-info/WHEEL +6 -0
- ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
- ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
- ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
- crew-builder/.prettierrc +15 -0
- crew-builder/QUICKSTART.md +259 -0
- crew-builder/README.md +113 -0
- crew-builder/env.example +17 -0
- crew-builder/jsconfig.json +14 -0
- crew-builder/package-lock.json +4182 -0
- crew-builder/package.json +37 -0
- crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
- crew-builder/src/app.css +62 -0
- crew-builder/src/app.d.ts +13 -0
- crew-builder/src/app.html +12 -0
- crew-builder/src/components/LoadingSpinner.svelte +64 -0
- crew-builder/src/components/ThemeSwitcher.svelte +149 -0
- crew-builder/src/components/index.js +9 -0
- crew-builder/src/lib/api/bots.ts +60 -0
- crew-builder/src/lib/api/chat.ts +80 -0
- crew-builder/src/lib/api/client.ts +56 -0
- crew-builder/src/lib/api/crew/crew.ts +136 -0
- crew-builder/src/lib/api/index.ts +5 -0
- crew-builder/src/lib/api/o365/auth.ts +65 -0
- crew-builder/src/lib/auth/auth.ts +54 -0
- crew-builder/src/lib/components/AgentNode.svelte +43 -0
- crew-builder/src/lib/components/BotCard.svelte +33 -0
- crew-builder/src/lib/components/ChatBubble.svelte +67 -0
- crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
- crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
- crew-builder/src/lib/components/JsonViewer.svelte +24 -0
- crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
- crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
- crew-builder/src/lib/components/Toast.svelte +67 -0
- crew-builder/src/lib/components/Toolbar.svelte +157 -0
- crew-builder/src/lib/components/index.ts +10 -0
- crew-builder/src/lib/config.ts +8 -0
- crew-builder/src/lib/stores/auth.svelte.ts +228 -0
- crew-builder/src/lib/stores/crewStore.ts +369 -0
- crew-builder/src/lib/stores/theme.svelte.js +145 -0
- crew-builder/src/lib/stores/toast.svelte.ts +69 -0
- crew-builder/src/lib/utils/conversation.ts +39 -0
- crew-builder/src/lib/utils/markdown.ts +122 -0
- crew-builder/src/lib/utils/talkHistory.ts +47 -0
- crew-builder/src/routes/+layout.svelte +20 -0
- crew-builder/src/routes/+page.svelte +539 -0
- crew-builder/src/routes/agents/+page.svelte +247 -0
- crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
- crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
- crew-builder/src/routes/builder/+page.svelte +204 -0
- crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
- crew-builder/src/routes/crew/ask/+page.ts +1 -0
- crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
- crew-builder/src/routes/login/+page.svelte +197 -0
- crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
- crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
- crew-builder/static/README.md +1 -0
- crew-builder/svelte.config.js +11 -0
- crew-builder/tailwind.config.ts +53 -0
- crew-builder/tsconfig.json +3 -0
- crew-builder/vite.config.ts +10 -0
- mcp_servers/calculator_server.py +309 -0
- parrot/__init__.py +27 -0
- parrot/__pycache__/__init__.cpython-310.pyc +0 -0
- parrot/__pycache__/version.cpython-310.pyc +0 -0
- parrot/_version.py +34 -0
- parrot/a2a/__init__.py +48 -0
- parrot/a2a/client.py +658 -0
- parrot/a2a/discovery.py +89 -0
- parrot/a2a/mixin.py +257 -0
- parrot/a2a/models.py +376 -0
- parrot/a2a/server.py +770 -0
- parrot/agents/__init__.py +29 -0
- parrot/bots/__init__.py +12 -0
- parrot/bots/a2a_agent.py +19 -0
- parrot/bots/abstract.py +3139 -0
- parrot/bots/agent.py +1129 -0
- parrot/bots/basic.py +9 -0
- parrot/bots/chatbot.py +669 -0
- parrot/bots/data.py +1618 -0
- parrot/bots/database/__init__.py +5 -0
- parrot/bots/database/abstract.py +3071 -0
- parrot/bots/database/cache.py +286 -0
- parrot/bots/database/models.py +468 -0
- parrot/bots/database/prompts.py +154 -0
- parrot/bots/database/retries.py +98 -0
- parrot/bots/database/router.py +269 -0
- parrot/bots/database/sql.py +41 -0
- parrot/bots/db/__init__.py +6 -0
- parrot/bots/db/abstract.py +556 -0
- parrot/bots/db/bigquery.py +602 -0
- parrot/bots/db/cache.py +85 -0
- parrot/bots/db/documentdb.py +668 -0
- parrot/bots/db/elastic.py +1014 -0
- parrot/bots/db/influx.py +898 -0
- parrot/bots/db/mock.py +96 -0
- parrot/bots/db/multi.py +783 -0
- parrot/bots/db/prompts.py +185 -0
- parrot/bots/db/sql.py +1255 -0
- parrot/bots/db/tools.py +212 -0
- parrot/bots/document.py +680 -0
- parrot/bots/hrbot.py +15 -0
- parrot/bots/kb.py +170 -0
- parrot/bots/mcp.py +36 -0
- parrot/bots/orchestration/README.md +463 -0
- parrot/bots/orchestration/__init__.py +1 -0
- parrot/bots/orchestration/agent.py +155 -0
- parrot/bots/orchestration/crew.py +3330 -0
- parrot/bots/orchestration/fsm.py +1179 -0
- parrot/bots/orchestration/hr.py +434 -0
- parrot/bots/orchestration/storage/__init__.py +4 -0
- parrot/bots/orchestration/storage/memory.py +100 -0
- parrot/bots/orchestration/storage/mixin.py +119 -0
- parrot/bots/orchestration/verify.py +202 -0
- parrot/bots/product.py +204 -0
- parrot/bots/prompts/__init__.py +96 -0
- parrot/bots/prompts/agents.py +155 -0
- parrot/bots/prompts/data.py +216 -0
- parrot/bots/prompts/output_generation.py +8 -0
- parrot/bots/scraper/__init__.py +3 -0
- parrot/bots/scraper/models.py +122 -0
- parrot/bots/scraper/scraper.py +1173 -0
- parrot/bots/scraper/templates.py +115 -0
- parrot/bots/stores/__init__.py +5 -0
- parrot/bots/stores/local.py +172 -0
- parrot/bots/webdev.py +81 -0
- parrot/cli.py +17 -0
- parrot/clients/__init__.py +16 -0
- parrot/clients/base.py +1491 -0
- parrot/clients/claude.py +1191 -0
- parrot/clients/factory.py +129 -0
- parrot/clients/google.py +4567 -0
- parrot/clients/gpt.py +1975 -0
- parrot/clients/grok.py +432 -0
- parrot/clients/groq.py +986 -0
- parrot/clients/hf.py +582 -0
- parrot/clients/models.py +18 -0
- parrot/conf.py +395 -0
- parrot/embeddings/__init__.py +9 -0
- parrot/embeddings/base.py +157 -0
- parrot/embeddings/google.py +98 -0
- parrot/embeddings/huggingface.py +74 -0
- parrot/embeddings/openai.py +84 -0
- parrot/embeddings/processor.py +88 -0
- parrot/exceptions.c +13868 -0
- parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/exceptions.pxd +22 -0
- parrot/exceptions.pxi +15 -0
- parrot/exceptions.pyx +44 -0
- parrot/generators/__init__.py +29 -0
- parrot/generators/base.py +200 -0
- parrot/generators/html.py +293 -0
- parrot/generators/react.py +205 -0
- parrot/generators/streamlit.py +203 -0
- parrot/generators/template.py +105 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/agent.py +861 -0
- parrot/handlers/agents/__init__.py +1 -0
- parrot/handlers/agents/abstract.py +900 -0
- parrot/handlers/bots.py +338 -0
- parrot/handlers/chat.py +915 -0
- parrot/handlers/creation.sql +192 -0
- parrot/handlers/crew/ARCHITECTURE.md +362 -0
- parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
- parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
- parrot/handlers/crew/__init__.py +0 -0
- parrot/handlers/crew/handler.py +801 -0
- parrot/handlers/crew/models.py +229 -0
- parrot/handlers/crew/redis_persistence.py +523 -0
- parrot/handlers/jobs/__init__.py +10 -0
- parrot/handlers/jobs/job.py +384 -0
- parrot/handlers/jobs/mixin.py +627 -0
- parrot/handlers/jobs/models.py +115 -0
- parrot/handlers/jobs/worker.py +31 -0
- parrot/handlers/models.py +596 -0
- parrot/handlers/o365_auth.py +105 -0
- parrot/handlers/stream.py +337 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/aws.py +143 -0
- parrot/interfaces/credentials.py +113 -0
- parrot/interfaces/database.py +27 -0
- parrot/interfaces/google.py +1123 -0
- parrot/interfaces/hierarchy.py +1227 -0
- parrot/interfaces/http.py +651 -0
- parrot/interfaces/images/__init__.py +0 -0
- parrot/interfaces/images/plugins/__init__.py +24 -0
- parrot/interfaces/images/plugins/abstract.py +58 -0
- parrot/interfaces/images/plugins/analisys.py +148 -0
- parrot/interfaces/images/plugins/classify.py +150 -0
- parrot/interfaces/images/plugins/classifybase.py +182 -0
- parrot/interfaces/images/plugins/detect.py +150 -0
- parrot/interfaces/images/plugins/exif.py +1103 -0
- parrot/interfaces/images/plugins/hash.py +52 -0
- parrot/interfaces/images/plugins/vision.py +104 -0
- parrot/interfaces/images/plugins/yolo.py +66 -0
- parrot/interfaces/images/plugins/zerodetect.py +197 -0
- parrot/interfaces/o365.py +978 -0
- parrot/interfaces/onedrive.py +822 -0
- parrot/interfaces/sharepoint.py +1435 -0
- parrot/interfaces/soap.py +257 -0
- parrot/loaders/__init__.py +8 -0
- parrot/loaders/abstract.py +1131 -0
- parrot/loaders/audio.py +199 -0
- parrot/loaders/basepdf.py +53 -0
- parrot/loaders/basevideo.py +1568 -0
- parrot/loaders/csv.py +409 -0
- parrot/loaders/docx.py +116 -0
- parrot/loaders/epubloader.py +316 -0
- parrot/loaders/excel.py +199 -0
- parrot/loaders/factory.py +55 -0
- parrot/loaders/files/__init__.py +0 -0
- parrot/loaders/files/abstract.py +39 -0
- parrot/loaders/files/html.py +26 -0
- parrot/loaders/files/text.py +63 -0
- parrot/loaders/html.py +152 -0
- parrot/loaders/markdown.py +442 -0
- parrot/loaders/pdf.py +373 -0
- parrot/loaders/pdfmark.py +320 -0
- parrot/loaders/pdftables.py +506 -0
- parrot/loaders/ppt.py +476 -0
- parrot/loaders/qa.py +63 -0
- parrot/loaders/splitters/__init__.py +10 -0
- parrot/loaders/splitters/base.py +138 -0
- parrot/loaders/splitters/md.py +228 -0
- parrot/loaders/splitters/token.py +143 -0
- parrot/loaders/txt.py +26 -0
- parrot/loaders/video.py +89 -0
- parrot/loaders/videolocal.py +218 -0
- parrot/loaders/videounderstanding.py +377 -0
- parrot/loaders/vimeo.py +167 -0
- parrot/loaders/web.py +599 -0
- parrot/loaders/youtube.py +504 -0
- parrot/manager/__init__.py +5 -0
- parrot/manager/manager.py +1030 -0
- parrot/mcp/__init__.py +28 -0
- parrot/mcp/adapter.py +105 -0
- parrot/mcp/cli.py +174 -0
- parrot/mcp/client.py +119 -0
- parrot/mcp/config.py +75 -0
- parrot/mcp/integration.py +842 -0
- parrot/mcp/oauth.py +933 -0
- parrot/mcp/server.py +225 -0
- parrot/mcp/transports/__init__.py +3 -0
- parrot/mcp/transports/base.py +279 -0
- parrot/mcp/transports/grpc_session.py +163 -0
- parrot/mcp/transports/http.py +312 -0
- parrot/mcp/transports/mcp.proto +108 -0
- parrot/mcp/transports/quic.py +1082 -0
- parrot/mcp/transports/sse.py +330 -0
- parrot/mcp/transports/stdio.py +309 -0
- parrot/mcp/transports/unix.py +395 -0
- parrot/mcp/transports/websocket.py +547 -0
- parrot/memory/__init__.py +16 -0
- parrot/memory/abstract.py +209 -0
- parrot/memory/agent.py +32 -0
- parrot/memory/cache.py +175 -0
- parrot/memory/core.py +555 -0
- parrot/memory/file.py +153 -0
- parrot/memory/mem.py +131 -0
- parrot/memory/redis.py +613 -0
- parrot/models/__init__.py +46 -0
- parrot/models/basic.py +118 -0
- parrot/models/compliance.py +208 -0
- parrot/models/crew.py +395 -0
- parrot/models/detections.py +654 -0
- parrot/models/generation.py +85 -0
- parrot/models/google.py +223 -0
- parrot/models/groq.py +23 -0
- parrot/models/openai.py +30 -0
- parrot/models/outputs.py +285 -0
- parrot/models/responses.py +938 -0
- parrot/notifications/__init__.py +743 -0
- parrot/openapi/__init__.py +3 -0
- parrot/openapi/components.yaml +641 -0
- parrot/openapi/config.py +322 -0
- parrot/outputs/__init__.py +32 -0
- parrot/outputs/formats/__init__.py +108 -0
- parrot/outputs/formats/altair.py +359 -0
- parrot/outputs/formats/application.py +122 -0
- parrot/outputs/formats/base.py +351 -0
- parrot/outputs/formats/bokeh.py +356 -0
- parrot/outputs/formats/card.py +424 -0
- parrot/outputs/formats/chart.py +436 -0
- parrot/outputs/formats/d3.py +255 -0
- parrot/outputs/formats/echarts.py +310 -0
- parrot/outputs/formats/generators/__init__.py +0 -0
- parrot/outputs/formats/generators/abstract.py +61 -0
- parrot/outputs/formats/generators/panel.py +145 -0
- parrot/outputs/formats/generators/streamlit.py +86 -0
- parrot/outputs/formats/generators/terminal.py +63 -0
- parrot/outputs/formats/holoviews.py +310 -0
- parrot/outputs/formats/html.py +147 -0
- parrot/outputs/formats/jinja2.py +46 -0
- parrot/outputs/formats/json.py +87 -0
- parrot/outputs/formats/map.py +933 -0
- parrot/outputs/formats/markdown.py +172 -0
- parrot/outputs/formats/matplotlib.py +237 -0
- parrot/outputs/formats/mixins/__init__.py +0 -0
- parrot/outputs/formats/mixins/emaps.py +855 -0
- parrot/outputs/formats/plotly.py +341 -0
- parrot/outputs/formats/seaborn.py +310 -0
- parrot/outputs/formats/table.py +397 -0
- parrot/outputs/formats/template_report.py +138 -0
- parrot/outputs/formats/yaml.py +125 -0
- parrot/outputs/formatter.py +152 -0
- parrot/outputs/templates/__init__.py +95 -0
- parrot/pipelines/__init__.py +0 -0
- parrot/pipelines/abstract.py +210 -0
- parrot/pipelines/detector.py +124 -0
- parrot/pipelines/models.py +90 -0
- parrot/pipelines/planogram.py +3002 -0
- parrot/pipelines/table.sql +97 -0
- parrot/plugins/__init__.py +106 -0
- parrot/plugins/importer.py +80 -0
- parrot/py.typed +0 -0
- parrot/registry/__init__.py +18 -0
- parrot/registry/registry.py +594 -0
- parrot/scheduler/__init__.py +1189 -0
- parrot/scheduler/models.py +60 -0
- parrot/security/__init__.py +16 -0
- parrot/security/prompt_injection.py +268 -0
- parrot/security/security_events.sql +25 -0
- parrot/services/__init__.py +1 -0
- parrot/services/mcp/__init__.py +8 -0
- parrot/services/mcp/config.py +13 -0
- parrot/services/mcp/server.py +295 -0
- parrot/services/o365_remote_auth.py +235 -0
- parrot/stores/__init__.py +7 -0
- parrot/stores/abstract.py +352 -0
- parrot/stores/arango.py +1090 -0
- parrot/stores/bigquery.py +1377 -0
- parrot/stores/cache.py +106 -0
- parrot/stores/empty.py +10 -0
- parrot/stores/faiss_store.py +1157 -0
- parrot/stores/kb/__init__.py +9 -0
- parrot/stores/kb/abstract.py +68 -0
- parrot/stores/kb/cache.py +165 -0
- parrot/stores/kb/doc.py +325 -0
- parrot/stores/kb/hierarchy.py +346 -0
- parrot/stores/kb/local.py +457 -0
- parrot/stores/kb/prompt.py +28 -0
- parrot/stores/kb/redis.py +659 -0
- parrot/stores/kb/store.py +115 -0
- parrot/stores/kb/user.py +374 -0
- parrot/stores/models.py +59 -0
- parrot/stores/pgvector.py +3 -0
- parrot/stores/postgres.py +2853 -0
- parrot/stores/utils/__init__.py +0 -0
- parrot/stores/utils/chunking.py +197 -0
- parrot/telemetry/__init__.py +3 -0
- parrot/telemetry/mixin.py +111 -0
- parrot/template/__init__.py +3 -0
- parrot/template/engine.py +259 -0
- parrot/tools/__init__.py +23 -0
- parrot/tools/abstract.py +644 -0
- parrot/tools/agent.py +363 -0
- parrot/tools/arangodbsearch.py +537 -0
- parrot/tools/arxiv_tool.py +188 -0
- parrot/tools/calculator/__init__.py +3 -0
- parrot/tools/calculator/operations/__init__.py +38 -0
- parrot/tools/calculator/operations/calculus.py +80 -0
- parrot/tools/calculator/operations/statistics.py +76 -0
- parrot/tools/calculator/tool.py +150 -0
- parrot/tools/cloudwatch.py +988 -0
- parrot/tools/codeinterpreter/__init__.py +127 -0
- parrot/tools/codeinterpreter/executor.py +371 -0
- parrot/tools/codeinterpreter/internals.py +473 -0
- parrot/tools/codeinterpreter/models.py +643 -0
- parrot/tools/codeinterpreter/prompts.py +224 -0
- parrot/tools/codeinterpreter/tool.py +664 -0
- parrot/tools/company_info/__init__.py +6 -0
- parrot/tools/company_info/tool.py +1138 -0
- parrot/tools/correlationanalysis.py +437 -0
- parrot/tools/database/abstract.py +286 -0
- parrot/tools/database/bq.py +115 -0
- parrot/tools/database/cache.py +284 -0
- parrot/tools/database/models.py +95 -0
- parrot/tools/database/pg.py +343 -0
- parrot/tools/databasequery.py +1159 -0
- parrot/tools/db.py +1800 -0
- parrot/tools/ddgo.py +370 -0
- parrot/tools/decorators.py +271 -0
- parrot/tools/dftohtml.py +282 -0
- parrot/tools/document.py +549 -0
- parrot/tools/ecs.py +819 -0
- parrot/tools/edareport.py +368 -0
- parrot/tools/elasticsearch.py +1049 -0
- parrot/tools/employees.py +462 -0
- parrot/tools/epson/__init__.py +96 -0
- parrot/tools/excel.py +683 -0
- parrot/tools/file/__init__.py +13 -0
- parrot/tools/file/abstract.py +76 -0
- parrot/tools/file/gcs.py +378 -0
- parrot/tools/file/local.py +284 -0
- parrot/tools/file/s3.py +511 -0
- parrot/tools/file/tmp.py +309 -0
- parrot/tools/file/tool.py +501 -0
- parrot/tools/file_reader.py +129 -0
- parrot/tools/flowtask/__init__.py +19 -0
- parrot/tools/flowtask/tool.py +761 -0
- parrot/tools/gittoolkit.py +508 -0
- parrot/tools/google/__init__.py +18 -0
- parrot/tools/google/base.py +169 -0
- parrot/tools/google/tools.py +1251 -0
- parrot/tools/googlelocation.py +5 -0
- parrot/tools/googleroutes.py +5 -0
- parrot/tools/googlesearch.py +5 -0
- parrot/tools/googlesitesearch.py +5 -0
- parrot/tools/googlevoice.py +2 -0
- parrot/tools/gvoice.py +695 -0
- parrot/tools/ibisworld/README.md +225 -0
- parrot/tools/ibisworld/__init__.py +11 -0
- parrot/tools/ibisworld/tool.py +366 -0
- parrot/tools/jiratoolkit.py +1718 -0
- parrot/tools/manager.py +1098 -0
- parrot/tools/math.py +152 -0
- parrot/tools/metadata.py +476 -0
- parrot/tools/msteams.py +1621 -0
- parrot/tools/msword.py +635 -0
- parrot/tools/multidb.py +580 -0
- parrot/tools/multistoresearch.py +369 -0
- parrot/tools/networkninja.py +167 -0
- parrot/tools/nextstop/__init__.py +4 -0
- parrot/tools/nextstop/base.py +286 -0
- parrot/tools/nextstop/employee.py +733 -0
- parrot/tools/nextstop/store.py +462 -0
- parrot/tools/notification.py +435 -0
- parrot/tools/o365/__init__.py +42 -0
- parrot/tools/o365/base.py +295 -0
- parrot/tools/o365/bundle.py +522 -0
- parrot/tools/o365/events.py +554 -0
- parrot/tools/o365/mail.py +992 -0
- parrot/tools/o365/onedrive.py +497 -0
- parrot/tools/o365/sharepoint.py +641 -0
- parrot/tools/openapi_toolkit.py +904 -0
- parrot/tools/openweather.py +527 -0
- parrot/tools/pdfprint.py +1001 -0
- parrot/tools/powerbi.py +518 -0
- parrot/tools/powerpoint.py +1113 -0
- parrot/tools/pricestool.py +146 -0
- parrot/tools/products/__init__.py +246 -0
- parrot/tools/prophet_tool.py +171 -0
- parrot/tools/pythonpandas.py +630 -0
- parrot/tools/pythonrepl.py +910 -0
- parrot/tools/qsource.py +436 -0
- parrot/tools/querytoolkit.py +395 -0
- parrot/tools/quickeda.py +827 -0
- parrot/tools/resttool.py +553 -0
- parrot/tools/retail/__init__.py +0 -0
- parrot/tools/retail/bby.py +528 -0
- parrot/tools/sandboxtool.py +703 -0
- parrot/tools/sassie/__init__.py +352 -0
- parrot/tools/scraping/__init__.py +7 -0
- parrot/tools/scraping/docs/select.md +466 -0
- parrot/tools/scraping/documentation.md +1278 -0
- parrot/tools/scraping/driver.py +436 -0
- parrot/tools/scraping/models.py +576 -0
- parrot/tools/scraping/options.py +85 -0
- parrot/tools/scraping/orchestrator.py +517 -0
- parrot/tools/scraping/readme.md +740 -0
- parrot/tools/scraping/tool.py +3115 -0
- parrot/tools/seasonaldetection.py +642 -0
- parrot/tools/shell_tool/__init__.py +5 -0
- parrot/tools/shell_tool/actions.py +408 -0
- parrot/tools/shell_tool/engine.py +155 -0
- parrot/tools/shell_tool/models.py +322 -0
- parrot/tools/shell_tool/tool.py +442 -0
- parrot/tools/site_search.py +214 -0
- parrot/tools/textfile.py +418 -0
- parrot/tools/think.py +378 -0
- parrot/tools/toolkit.py +298 -0
- parrot/tools/webapp_tool.py +187 -0
- parrot/tools/whatif.py +1279 -0
- parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
- parrot/tools/workday/__init__.py +6 -0
- parrot/tools/workday/models.py +1389 -0
- parrot/tools/workday/tool.py +1293 -0
- parrot/tools/yfinance_tool.py +306 -0
- parrot/tools/zipcode.py +217 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/helpers.py +73 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.c +12078 -0
- parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/parsers/toml.pyx +21 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpp +20936 -0
- parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/types.pyx +213 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- parrot/yaml-rs/Cargo.lock +350 -0
- parrot/yaml-rs/Cargo.toml +19 -0
- parrot/yaml-rs/pyproject.toml +19 -0
- parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
- parrot/yaml-rs/src/lib.rs +222 -0
- requirements/docker-compose.yml +24 -0
- requirements/requirements-dev.txt +21 -0
parrot/loaders/pdf.py
ADDED
|
@@ -0,0 +1,373 @@
|
|
|
1
|
+
from collections.abc import Callable
|
|
2
|
+
from typing import List, Optional, Union
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path, PurePath
|
|
5
|
+
import fitz
|
|
6
|
+
import pymupdf4llm
|
|
7
|
+
from ..stores.models import Document
|
|
8
|
+
from .abstract import AbstractLoader
|
|
9
|
+
|
|
10
|
+
class PDFLoader(AbstractLoader):
|
|
11
|
+
"""
|
|
12
|
+
Advanced PDF Loader using PyMuPDF (fitz).
|
|
13
|
+
- Skips image-only pages.
|
|
14
|
+
- Combines title-only pages with next content page.
|
|
15
|
+
- Preserves tables as text for chatbot/RAG KB usage.
|
|
16
|
+
- Returns a Parrot Document per logical page.
|
|
17
|
+
- Supports chapter-based splitting for markdown output.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
extensions: List[str] = {'.pdf'}
|
|
21
|
+
|
|
22
|
+
def __init__(
    self,
    source: Optional[Union[str, Path, List[Union[str, Path]]]] = None,
    *,
    tokenizer: Optional[Union[str, Callable]] = None,
    text_splitter: Optional[Union[str, Callable]] = None,
    source_type: str = 'file',
    as_markdown: bool = False,
    use_chapters: bool = False,
    use_pages: bool = False,
    **kwargs
):
    """Set up the PDF loader and remember its output options.

    Args:
        source: A path, or list of paths, handed to the base loader.
        tokenizer: Forwarded unchanged to the base loader; may be None.
        text_splitter: Forwarded unchanged to the base loader; may be None.
        source_type: Forwarded to the base loader and also kept on
            ``self._source_type``.
        as_markdown: When true, ``_load`` converts the PDF to markdown
            with ``pymupdf4llm`` before building documents.
        use_chapters: With ``as_markdown``, emit one document per chapter.
            Checked before ``use_pages``.
        use_pages: With ``as_markdown``, emit one document per page
            section when ``use_chapters`` is false.
        **kwargs: Passed through to the base loader untouched.
    """
    super().__init__(
        source,
        tokenizer=tokenizer,
        text_splitter=text_splitter,
        source_type=source_type,
        **kwargs
    )
    self.doctype = 'pdf'
    self._source_type = source_type
    # Output-shaping switches read later by _load.
    self.as_markdown = as_markdown
    self.use_chapters = use_chapters
    self.use_pages = use_pages
|
|
46
|
+
|
|
47
|
+
def is_title_only(self, text: str, min_len: int = 5, max_len: int = 50) -> bool:
    """Tell whether ``text`` is a single short line, i.e. reads like a title.

    Only the count of non-blank lines and the length of that one line are
    examined; nothing about fonts or layout is inspected.

    Args:
        text: Raw page text.
        min_len: Smallest accepted length of the line (inclusive).
        max_len: Largest accepted length of the line (inclusive).

    Returns:
        True when exactly one non-blank line exists and its length lies in
        ``[min_len, max_len]``; False otherwise.
    """
    non_blank = [row for row in text.strip().split('\n') if row.strip()]
    if len(non_blank) != 1:
        return False
    only_line = non_blank[0]
    return min_len <= len(only_line) <= max_len
|
|
53
|
+
|
|
54
|
+
def is_image_only(self, page: fitz.Page) -> bool:
    """Tell whether a page carries images but no extractable text.

    Args:
        page: The page to inspect; its ``get_text`` and ``get_images``
            methods are used.

    Returns:
        True only when the stripped plain-text extraction is empty and at
        least one image is listed for the page.
    """
    visible_text = page.get_text("text").strip()
    # Short-circuit keeps the image lookup off pages that already have text.
    return not visible_text and len(page.get_images(full=True)) > 0
|
|
62
|
+
|
|
63
|
+
def is_table_like(self, text: str) -> bool:
    """Rough guess at whether ``text`` is tabular.

    A non-blank line counts as columnar when it contains a pipe, a tab, or
    more than three whitespace-separated tokens.

    Args:
        text: Raw text to classify.

    Returns:
        True when there are more than two non-blank lines and strictly more
        than half of them (integer division) are columnar.
    """
    rows = [row for row in text.split('\n') if row.strip()]
    if not rows:
        return False

    def looks_columnar(row: str) -> bool:
        return '|' in row or '\t' in row or len(row.split()) > 3

    columnar = len([row for row in rows if looks_columnar(row)])
    return len(rows) > 2 and columnar > len(rows) // 2
|
|
70
|
+
|
|
71
|
+
def extract_table(self, page: fitz.Page) -> Optional[str]:
    """Render a page's plain text as a best-effort markdown table.

    Each non-blank line is handled on its own:
      * tab-separated lines become ``| a | b |`` rows;
      * lines that already contain ``|`` are kept verbatim;
      * lines with three or more cells separated by runs of two or more
        spaces become rows; any other line is kept verbatim.

    A ``| --- | --- |`` separator is inserted after the first line when the
    first two lines carry the same number of pipes and describe at least
    two columns.

    Args:
        page: The page whose ``get_text("text")`` output is reformatted.

    Returns:
        The reformatted text joined with newlines, or None when the page
        has no non-blank text lines.
    """
    text = page.get_text("text")
    rows = [row.strip() for row in text.split('\n') if row.strip()]
    if not rows:
        return None

    table_lines = []
    for row in rows:
        if '\t' in row:
            cells = [cell.strip() for cell in row.split('\t')]
            table_lines.append("| " + " | ".join(cells) + " |")
        elif '|' in row:
            table_lines.append(row)
        else:
            # Only runs of 2+ spaces separate columns; a single space is
            # ordinary word spacing and must not turn prose into a row.
            cells = [cell.strip() for cell in re.split(r' {2,}', row) if cell.strip()]
            if len(cells) > 2:
                table_lines.append("| " + " | ".join(cells) + " |")
            else:
                table_lines.append(row)

    # Add a markdown header separator when the first two rows line up.
    if len(table_lines) > 1 and table_lines[0].count('|') == table_lines[1].count('|'):
        ncols = table_lines[0].count('|') - 1
        if ncols > 1:
            header_sep = "| " + " | ".join(['---'] * ncols) + " |"
            table_lines.insert(1, header_sep)
    return "\n".join(table_lines)
|
|
103
|
+
|
|
104
|
+
def extract_chapters_from_markdown(self, md_text: str) -> List[dict]:
    """Split markdown into chapters at horizontal rules.

    The text is cut at lines of five or more dashes. For each non-empty
    section the title is taken, in order of preference, from:
      1. a leading ``**bold**`` run,
      2. a leading ``#``..``######`` header line,
      3. the section's first line, cut to 50 characters with ``...``
         appended only when it was actually shortened.

    In cases 1 and 2 the title is removed from the content; in case 3 the
    content is the whole section. Sections whose content is shorter than
    10 characters are logged and dropped.

    Args:
        md_text: Markdown text to split.

    Returns:
        A list of dicts with ``title``, ``content`` and ``chapter_number``
        keys; numbering is 1-based and counts only the chapters kept.
    """
    chapters = []

    for raw_section in re.split(r'\n-----+\n', md_text):
        section = raw_section.strip()
        if not section:
            continue

        bold_title = re.match(r'^\*\*([^*]+)\*\*', section)
        if bold_title:
            title = bold_title.group(1).strip()
            content = section[bold_title.end():].strip()
        else:
            header = re.match(r'^(#{1,6})\s*(.+?)$', section, re.MULTILINE)
            if header:
                title = header.group(2).strip()
                content = section[header.end():].strip()
            else:
                # No explicit title: fall back to the first line, marking
                # it with an ellipsis only if it really was truncated.
                first_line = section.split('\n', 1)[0]
                title = first_line if len(first_line) <= 50 else first_line[:50] + "..."
                content = section

        # Skip if content is too short (less than 10 characters)
        if len(content) < 10:
            self.logger.info(f"Skipping chapter '{title}' - content too short")
            continue

        chapters.append({
            'title': title,
            'content': content,
            'chapter_number': len(chapters) + 1
        })

    return chapters
|
|
156
|
+
|
|
157
|
+
def extract_pages_from_markdown(self, md_text: str) -> List[dict]:
    """Split markdown into page sections at horizontal rules.

    The text is cut at lines of five or more dashes. Sections shorter than
    10 characters are ignored. Within each remaining section the first
    three lines are scanned for a line that is entirely ``**bold**`` or a
    ``#``..``######`` header; the first such line becomes the title and
    the content is everything after it (lines before it are not kept).
    When no title line is found the title is ``Page N`` and the content is
    the whole section. Sections whose content is shorter than 10
    characters are dropped.

    Args:
        md_text: Markdown text to split.

    Returns:
        A list of dicts with ``title``, ``content`` and ``page_number``
        keys. ``page_number`` is the 1-based position of the section in
        the split, so skipped sections leave gaps in the numbering.
    """
    pages = []

    for index, raw_section in enumerate(re.split(r'\n-----+\n', md_text)):
        section = raw_section.strip()
        if len(section) < 10:
            continue

        lines = section.split('\n')
        title = None
        content_start = 0

        # Look for a bold title or header within the first 3 lines.
        for offset, raw_line in enumerate(lines[:3]):
            line = raw_line.strip()
            found = (
                re.match(r'^\*\*([^*]+)\*\*$', line)
                or re.match(r'^#{1,6}\s*(.+?)$', line)
            )
            if found:
                title = found.group(1)
                content_start = offset + 1
                break

        if not title:
            title = f"Page {index + 1}"

        content = '\n'.join(lines[content_start:]).strip()
        if len(content) < 10:
            continue

        pages.append({
            'title': title,
            'content': content,
            'page_number': index + 1
        })

    return pages
|
|
212
|
+
|
|
213
|
+
async def _load(self, path: PurePath, **kwargs) -> List[Document]:
    """
    Load a PDF file and return its content as a list of Documents.

    The splitting strategy depends on the loader flags read here:

    * ``as_markdown`` and ``use_chapters``: one Document per chapter
      returned by ``extract_chapters_from_markdown``.
    * ``as_markdown`` and ``use_pages``: one Document per page returned by
      ``extract_pages_from_markdown``.
    * ``as_markdown`` only: the whole markdown text as a single Document.
    * otherwise: one Document per PDF page from plain-text extraction.
      Image-only pages are skipped, a title-only page is prepended to the
      next page with content, and table-like pages are passed through
      ``extract_table``.

    If ``summary_from_text`` returns a summary for the collected text, it
    is appended as a final Document. In markdown mode the converted
    markdown is used as the summary input (it used to be an empty string).

    Args:
        path: Path to the PDF file.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        The Documents built from the file.
    """
    self.logger.info(f"Loading PDF file: {path}")
    docs = []
    all_text = []  # text collected for the summary step

    def add_document(content, source_type, document_meta):
        # Build the metadata and append one Document to the result list.
        meta = self.create_metadata(
            path=path,
            doctype="pdf",
            source_type=source_type,
            doc_metadata=document_meta,
        )
        docs.append(
            self.create_document(
                content=content,
                path=path,
                metadata=meta
            )
        )

    doc = fitz.open(str(path))
    try:
        if self.as_markdown:
            md_text = pymupdf4llm.to_markdown(path)
            # Feed the markdown to the summary step as well.
            all_text.append(md_text)
            if self.use_chapters:
                # Split by chapters
                chapters = self.extract_chapters_from_markdown(md_text)
                self.logger.info(f"Found {len(chapters)} chapters")
                for chapter in chapters:
                    add_document(
                        f"# {chapter['title']}\n\n{chapter['content']}",
                        "pdf_chapter",
                        {
                            "filename": path.name,
                            "file_path": str(path),
                            "chapter_title": chapter['title'],
                            "chapter_number": chapter['chapter_number'],
                            "content_type": "chapter"
                        },
                    )
            elif self.use_pages:
                # Split by pages
                pages = self.extract_pages_from_markdown(md_text)
                self.logger.info(f"Found {len(pages)} pages")
                for page in pages:
                    add_document(
                        f"## {page['title']}\n\n{page['content']}",
                        "pdf_page",
                        {
                            "filename": path.name,
                            "file_path": str(path),
                            "page_title": page['title'],
                            "page_number": page['page_number'],
                            "content_type": "page"
                        },
                    )
            else:
                # Return whole markdown as single document
                add_document(
                    md_text,
                    "pdf_markdown",
                    {
                        "filename": path.name,
                        "file_path": str(path),
                        "content_type": "full_document"
                    },
                )
        else:
            # Default: plain-text extraction, one document per page
            pending_title = None
            for i, page in enumerate(doc):
                page_text = page.get_text("text").strip()
                if self.is_image_only(page):
                    self.logger.info(f"Page {i+1}: image-only, skipping.")
                    continue

                # Title-only page: keep it to prepend to the next content page
                if self.is_title_only(page_text):
                    self.logger.info(f"Page {i+1}: title-only, saving for next page.")
                    pending_title = page_text
                    continue

                # Table page: prefer the extracted table, fall back to raw text
                body = page_text
                if self.is_table_like(page_text):
                    body = self.extract_table(page) or page_text
                content = (pending_title + '\n\n' if pending_title else '') + body
                pending_title = None

                if len(content) < 10:
                    self.logger.warning(
                        f"Page {i+1} content too short, skipping."
                    )
                    continue

                add_document(
                    content,
                    "pdf",
                    {
                        "filename": path.name,
                        "file_path": str(path),
                        "page_number": i + 1,
                    },
                )
                all_text.append(content)
    finally:
        # Always release the PDF handle, even when extraction fails.
        doc.close()

    # --- Summarization step ---
    full_text = "\n\n".join(all_text)
    summary = await self.summary_from_text(full_text)
    if summary:
        summary_meta = self.create_metadata(
            path=path,
            doctype=self.doctype,
            source_type=self._source_type,
            doc_metadata={
                "summary_for_pages": len(docs),
            }
        )
        docs.append(
            self.create_document(
                content=f"SUMMARY:\n\n{summary}",
                path=path,
                metadata=summary_meta
            )
        )
    return docs
|
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
from typing import Any, Union, List
|
|
2
|
+
import logging
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
from pathlib import PurePath
|
|
5
|
+
import fitz
|
|
6
|
+
from ..stores.models import Document
|
|
7
|
+
from .basepdf import BasePDF
|
|
8
|
+
# Option 1: Use MarkItDown (Microsoft's universal document converter)
|
|
9
|
+
try:
|
|
10
|
+
from markitdown import MarkItDown
|
|
11
|
+
MARKITDOWN_AVAILABLE = True
|
|
12
|
+
except ImportError:
|
|
13
|
+
MARKITDOWN_AVAILABLE = False
|
|
14
|
+
|
|
15
|
+
# Option 2: Use pymupdf4llm (updated PyMuPDF library)
|
|
16
|
+
try:
|
|
17
|
+
import pymupdf4llm
|
|
18
|
+
PYMUPDF4LLM_AVAILABLE = True
|
|
19
|
+
except ImportError:
|
|
20
|
+
PYMUPDF4LLM_AVAILABLE = False
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ``Logger.setLevel`` returns None, so bind the logger first and set its
# level afterwards; chaining both calls left ``logger`` bound to None.
logger = logging.getLogger('pdfminer')
logger.setLevel(logging.INFO)
|
|
24
|
+
|
|
25
|
+
class PDFMarkdownLoader(BasePDF):
|
|
26
|
+
"""
|
|
27
|
+
Loader for PDF files that converts their content to markdown.
|
|
28
|
+
|
|
29
|
+
This loader supports multiple backends for PDF to markdown conversion:
|
|
30
|
+
1. MarkItDown (Microsoft's universal document converter)
|
|
31
|
+
2. pymupdf4llm (PyMuPDF's markdown converter)
|
|
32
|
+
3. Fallback manual conversion using PyMuPDF
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
extensions: List[str] = {'.pdf'}
|
|
36
|
+
|
|
37
|
+
def __init__(
    self,
    source: Union[str, PurePath, List[PurePath]],
    tokenizer: Callable[..., Any] = None,
    text_splitter: Callable[..., Any] = None,
    source_type: str = 'pdf',
    language: str = "eng",
    markdown_backend: str = "auto",  # "markitdown", "pymupdf4llm", "manual", "auto"
    chunk_size: int = 1024,
    chunk_overlap: int = 10,
    preserve_tables: bool = True,
    extract_images: bool = False,
    **kwargs
):
    """
    Configure the loader and pick the PDF-to-markdown backend.

    Args:
        source: File path(s) handed to the base loader.
        tokenizer: Tokenizer handed to the base loader.
        text_splitter: Text splitter handed to the base loader.
        source_type: Source type label handed to the base loader.
        language: Language code stored on the instance.
        markdown_backend: Requested backend; resolved through
            ``_select_backend``.
        chunk_size: Chunk size given to the markdown splitter.
        chunk_overlap: Chunk overlap given to the markdown splitter.
        preserve_tables: Whether the manual backend also extracts tables.
        extract_images: Stored on the instance.
        **kwargs: Extra options forwarded to the base loader.
    """
    base_options = {
        "source": source,
        "tokenizer": tokenizer,
        "text_splitter": text_splitter,
        "source_type": source_type,
    }
    super().__init__(**base_options, **kwargs)

    # Plain configuration values
    self._language = language
    self.preserve_tables = preserve_tables
    self.extract_images = extract_images

    # Resolve the requested backend against what is installed
    self.markdown_backend = self._select_backend(markdown_backend)

    # Markdown-aware splitter used to chunk the converted text
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    self._splitter = self._get_markdown_splitter(**splitter_options)

    # Prepare the converter object (if the backend needs one)
    self._setup_conversion_backend()
|
|
71
|
+
|
|
72
|
+
def _select_backend(self, preferred: str) -> str:
|
|
73
|
+
"""Select the best available backend for PDF to markdown conversion."""
|
|
74
|
+
if preferred == "auto":
|
|
75
|
+
if MARKITDOWN_AVAILABLE:
|
|
76
|
+
return "markitdown"
|
|
77
|
+
elif PYMUPDF4LLM_AVAILABLE:
|
|
78
|
+
return "pymupdf4llm"
|
|
79
|
+
else:
|
|
80
|
+
return "manual"
|
|
81
|
+
elif preferred == "markitdown" and MARKITDOWN_AVAILABLE:
|
|
82
|
+
return "markitdown"
|
|
83
|
+
elif preferred == "pymupdf4llm" and PYMUPDF4LLM_AVAILABLE:
|
|
84
|
+
return "pymupdf4llm"
|
|
85
|
+
elif preferred == "manual":
|
|
86
|
+
return "manual"
|
|
87
|
+
else:
|
|
88
|
+
# Fallback to available backend
|
|
89
|
+
self.logger.warning(f"Preferred backend '{preferred}' not available, using fallback")
|
|
90
|
+
return self._select_backend("auto")
|
|
91
|
+
|
|
92
|
+
def _setup_conversion_backend(self):
|
|
93
|
+
"""Initialize the selected conversion backend."""
|
|
94
|
+
if self.markdown_backend == "markitdown":
|
|
95
|
+
self.md_converter = MarkItDown()
|
|
96
|
+
self.logger.info("Using MarkItDown backend for PDF to markdown conversion")
|
|
97
|
+
elif self.markdown_backend == "pymupdf4llm":
|
|
98
|
+
self.logger.info("Using pymupdf4llm backend for PDF to markdown conversion")
|
|
99
|
+
else:
|
|
100
|
+
self.logger.info("Using manual PyMuPDF backend for PDF to markdown conversion")
|
|
101
|
+
|
|
102
|
+
def _convert_to_markdown_markitdown(self, path: Union[str, PurePath]) -> str:
|
|
103
|
+
"""Convert PDF to markdown using MarkItDown."""
|
|
104
|
+
try:
|
|
105
|
+
result = self.md_converter.convert(str(path))
|
|
106
|
+
return result.text_content if result else ""
|
|
107
|
+
except Exception as e:
|
|
108
|
+
self.logger.error(f"MarkItDown conversion failed: {e}")
|
|
109
|
+
return self._convert_to_markdown_manual(path)
|
|
110
|
+
|
|
111
|
+
def _convert_to_markdown_pymupdf4llm(self, path: Union[str, PurePath]) -> str:
    """
    Render the PDF at ``path`` as markdown through pymupdf4llm.

    Any exception raised by the conversion is logged and the manual
    PyMuPDF conversion is used instead.
    """
    try:
        markdown = pymupdf4llm.to_markdown(str(path))
    except Exception as exc:
        self.logger.error(f"pymupdf4llm conversion failed: {exc}")
        markdown = self._convert_to_markdown_manual(path)
    return markdown
|
|
118
|
+
|
|
119
|
+
def _convert_to_markdown_manual(self, path: Union[str, PurePath]) -> str:
    """
    Fallback manual conversion using PyMuPDF with basic markdown formatting.

    Each text line is rebuilt from its spans: bold and italic spans are
    wrapped in ``**`` / ``*``, and the line gets a ``#``, ``##`` or ``###``
    prefix when its largest span is bigger than 16, 14 or 12 points. Lines
    of a block are joined with newlines and blocks with blank lines. When
    ``self.preserve_tables`` is set, tables found on each page are appended
    as markdown tables.

    Args:
        path: Path to the PDF file.

    Returns:
        The markdown text, or an empty string when the conversion fails
        (the error is logged).
    """
    markdown_text = []
    try:
        doc = fitz.open(str(path))
        try:
            # Iterating the document yields the pages themselves; the
            # previous ``enumerate`` loop used a page object as an index.
            for page in doc:
                # Extract text blocks with formatting
                for block in page.get_text("dict")["blocks"]:
                    if "lines" not in block:
                        continue

                    block_text = []
                    for line in block["lines"]:
                        pieces = []
                        largest_size = 0
                        for span in line["spans"]:
                            text = span["text"]
                            flags = span.get("flags", 0)
                            largest_size = max(largest_size, span.get("size", 12))

                            # Never wrap whitespace-only spans in emphasis
                            if text.strip():
                                if flags & 2**4:  # Bold flag (bit 4)
                                    text = f"**{text}**"
                                if flags & 2**1:  # Italic flag (bit 1)
                                    text = f"*{text}*"
                            pieces.append(text)

                        line_text = "".join(pieces)
                        if not line_text.strip():
                            continue

                        # One heading marker per line, placed before any
                        # emphasis so the result stays valid markdown.
                        if largest_size > 16:
                            line_text = f"# {line_text}"
                        elif largest_size > 14:
                            line_text = f"## {line_text}"
                        elif largest_size > 12:
                            line_text = f"### {line_text}"
                        block_text.append(line_text)

                    if block_text:
                        markdown_text.append("\n".join(block_text))

                # Extract tables if requested
                if self.preserve_tables:
                    for table in page.find_tables():
                        try:
                            table_data = table.extract()
                            if table_data:
                                markdown_table = self._format_table_as_markdown(table_data)
                                if markdown_table:
                                    markdown_text.append(markdown_table)
                        except Exception as e:
                            # A broken table must not discard the page text
                            self.logger.debug(f"Failed to extract table: {e}")
        finally:
            # Release the PDF handle even when extraction fails midway
            doc.close()

        return "\n\n".join(markdown_text)

    except Exception as e:
        self.logger.error(f"Manual PDF conversion failed: {e}")
        return ""
|
|
184
|
+
|
|
185
|
+
def _format_table_as_markdown(self, table_data: List[List[str]]) -> str:
|
|
186
|
+
"""Convert table data to markdown format."""
|
|
187
|
+
if not table_data or len(table_data) < 1:
|
|
188
|
+
return ""
|
|
189
|
+
|
|
190
|
+
markdown_rows = []
|
|
191
|
+
|
|
192
|
+
# Header row
|
|
193
|
+
header_row = " | ".join(str(cell) if cell else "" for cell in table_data[0])
|
|
194
|
+
markdown_rows.append(f"| {header_row} |")
|
|
195
|
+
|
|
196
|
+
# Separator row
|
|
197
|
+
separator = " | ".join("---" for _ in table_data[0])
|
|
198
|
+
markdown_rows.append(f"| {separator} |")
|
|
199
|
+
|
|
200
|
+
# Data rows
|
|
201
|
+
for row in table_data[1:]:
|
|
202
|
+
data_row = " | ".join(str(cell) if cell else "" for cell in row)
|
|
203
|
+
markdown_rows.append(f"| {data_row} |")
|
|
204
|
+
|
|
205
|
+
return "\n".join(markdown_rows)
|
|
206
|
+
|
|
207
|
+
async def _load(self, path: Union[str, PurePath, List[PurePath]], **kwargs) -> List[Document]:
    """
    Load a PDF file and convert to markdown format.

    The file is converted with the backend in ``self.markdown_backend``,
    the markdown is split with ``self._splitter`` and one Document is
    created per chunk. When ``summary_from_text`` returns a summary, a
    summary Document is placed before the chunks. Failures while reading
    the PDF metadata, generating the summary or splitting the text are
    logged and do not abort the load.

    Args:
        path (Union[str, PurePath, List[PurePath]]): The path to the PDF file.

    Returns:
        List[Document]: A list of AI-Parrot Documents; empty when no
        markdown content could be extracted.
    """
    self.logger.info(f"Loading PDF file: {path}")
    docs = []

    # Convert to markdown using selected backend
    if self.markdown_backend == "markitdown":
        md_text = self._convert_to_markdown_markitdown(path)
    elif self.markdown_backend == "pymupdf4llm":
        md_text = self._convert_to_markdown_pymupdf4llm(path)
    else:
        md_text = self._convert_to_markdown_manual(path)

    # A converter may hand back None as well as blank text
    if not md_text or not md_text.strip():
        self.logger.warning(f"No markdown content extracted from {path}")
        return docs

    # Extract PDF metadata
    try:
        pdf = fitz.open(str(path))
        try:
            # ``metadata`` can be None; fall back to an empty mapping
            pdf_metadata = pdf.metadata or {}  # pylint: disable=E1101 # noqa: E1101
        finally:
            pdf.close()
    except Exception as e:
        self.logger.warning(
            f"Could not extract PDF metadata: {e}"
        )
        pdf_metadata = {}

    # Generate summary if enabled
    try:
        summary = await self.summary_from_text(md_text)
    except Exception as e:
        self.logger.warning(
            f"Summary generation failed: {e}"
        )
        summary = ''

    has_name = hasattr(path, 'name')
    filename = path.name if has_name else str(path).rsplit('/', maxsplit=1)[-1]
    source = str(path.name if has_name else path)

    def build_metadata(content_type, **extra):
        # Build a fresh dict (including the nested ones) for every
        # document so that documents never share mutable metadata.
        return {
            "url": '',
            "filename": filename,
            "source": source,
            "type": 'pdf',
            "data": {},
            "category": self.category,
            "source_type": self._source_type,
            "conversion_backend": self.markdown_backend,
            "document_meta": {
                "title": pdf_metadata.get("title", ""),
                "creationDate": pdf_metadata.get("creationDate", ""),
                "author": pdf_metadata.get("author", ""),
            },
            "content_type": content_type,
            **extra,
        }

    # Add summary document if available
    if summary:
        docs.append(
            Document(
                page_content=summary,
                metadata=build_metadata("summary")
            )
        )

    # Split markdown content into chunks
    try:
        chunks = self._splitter.split_text(md_text)
        self.logger.info(f"Split document into {len(chunks)} chunks")
    except Exception as e:
        self.logger.error(
            f"Failed to split text: {e}"
        )
        # Fallback: use the entire text as one chunk
        chunks = [md_text]

    # Create documents for each chunk
    total_chunks = len(chunks)
    for chunk_index, chunk in enumerate(chunks):
        docs.append(
            Document(
                page_content=chunk,
                metadata=build_metadata(
                    "chunk",
                    chunk_index=chunk_index,
                    total_chunks=total_chunks,
                )
            )
        )

    return docs
|
|
310
|
+
|
|
311
|
+
def get_supported_backends(self) -> List[str]:
    """
    List the conversion backends usable in this environment.

    "manual" is always present; "markitdown" and "pymupdf4llm" follow, in
    that order, when their import succeeded at module load.
    """
    optional_backends = (
        ("markitdown", MARKITDOWN_AVAILABLE),
        ("pymupdf4llm", PYMUPDF4LLM_AVAILABLE),
    )
    return ["manual"] + [
        name for name, available in optional_backends if available
    ]
|