ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentui/.prettierrc +15 -0
- agentui/QUICKSTART.md +272 -0
- agentui/README.md +59 -0
- agentui/env.example +16 -0
- agentui/jsconfig.json +14 -0
- agentui/package-lock.json +4242 -0
- agentui/package.json +34 -0
- agentui/scripts/postinstall/apply-patches.mjs +260 -0
- agentui/src/app.css +61 -0
- agentui/src/app.d.ts +13 -0
- agentui/src/app.html +12 -0
- agentui/src/components/LoadingSpinner.svelte +64 -0
- agentui/src/components/ThemeSwitcher.svelte +159 -0
- agentui/src/components/index.js +4 -0
- agentui/src/lib/api/bots.ts +60 -0
- agentui/src/lib/api/chat.ts +22 -0
- agentui/src/lib/api/http.ts +25 -0
- agentui/src/lib/components/BotCard.svelte +33 -0
- agentui/src/lib/components/ChatBubble.svelte +63 -0
- agentui/src/lib/components/Toast.svelte +21 -0
- agentui/src/lib/config.ts +20 -0
- agentui/src/lib/stores/auth.svelte.ts +73 -0
- agentui/src/lib/stores/theme.svelte.js +64 -0
- agentui/src/lib/stores/toast.svelte.ts +31 -0
- agentui/src/lib/utils/conversation.ts +39 -0
- agentui/src/routes/+layout.svelte +20 -0
- agentui/src/routes/+page.svelte +232 -0
- agentui/src/routes/login/+page.svelte +200 -0
- agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
- agentui/src/routes/talk/[agentId]/+page.ts +7 -0
- agentui/static/README.md +1 -0
- agentui/svelte.config.js +11 -0
- agentui/tailwind.config.ts +53 -0
- agentui/tsconfig.json +3 -0
- agentui/vite.config.ts +10 -0
- ai_parrot-0.17.2.dist-info/METADATA +472 -0
- ai_parrot-0.17.2.dist-info/RECORD +535 -0
- ai_parrot-0.17.2.dist-info/WHEEL +6 -0
- ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
- ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
- ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
- crew-builder/.prettierrc +15 -0
- crew-builder/QUICKSTART.md +259 -0
- crew-builder/README.md +113 -0
- crew-builder/env.example +17 -0
- crew-builder/jsconfig.json +14 -0
- crew-builder/package-lock.json +4182 -0
- crew-builder/package.json +37 -0
- crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
- crew-builder/src/app.css +62 -0
- crew-builder/src/app.d.ts +13 -0
- crew-builder/src/app.html +12 -0
- crew-builder/src/components/LoadingSpinner.svelte +64 -0
- crew-builder/src/components/ThemeSwitcher.svelte +149 -0
- crew-builder/src/components/index.js +9 -0
- crew-builder/src/lib/api/bots.ts +60 -0
- crew-builder/src/lib/api/chat.ts +80 -0
- crew-builder/src/lib/api/client.ts +56 -0
- crew-builder/src/lib/api/crew/crew.ts +136 -0
- crew-builder/src/lib/api/index.ts +5 -0
- crew-builder/src/lib/api/o365/auth.ts +65 -0
- crew-builder/src/lib/auth/auth.ts +54 -0
- crew-builder/src/lib/components/AgentNode.svelte +43 -0
- crew-builder/src/lib/components/BotCard.svelte +33 -0
- crew-builder/src/lib/components/ChatBubble.svelte +67 -0
- crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
- crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
- crew-builder/src/lib/components/JsonViewer.svelte +24 -0
- crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
- crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
- crew-builder/src/lib/components/Toast.svelte +67 -0
- crew-builder/src/lib/components/Toolbar.svelte +157 -0
- crew-builder/src/lib/components/index.ts +10 -0
- crew-builder/src/lib/config.ts +8 -0
- crew-builder/src/lib/stores/auth.svelte.ts +228 -0
- crew-builder/src/lib/stores/crewStore.ts +369 -0
- crew-builder/src/lib/stores/theme.svelte.js +145 -0
- crew-builder/src/lib/stores/toast.svelte.ts +69 -0
- crew-builder/src/lib/utils/conversation.ts +39 -0
- crew-builder/src/lib/utils/markdown.ts +122 -0
- crew-builder/src/lib/utils/talkHistory.ts +47 -0
- crew-builder/src/routes/+layout.svelte +20 -0
- crew-builder/src/routes/+page.svelte +539 -0
- crew-builder/src/routes/agents/+page.svelte +247 -0
- crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
- crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
- crew-builder/src/routes/builder/+page.svelte +204 -0
- crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
- crew-builder/src/routes/crew/ask/+page.ts +1 -0
- crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
- crew-builder/src/routes/login/+page.svelte +197 -0
- crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
- crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
- crew-builder/static/README.md +1 -0
- crew-builder/svelte.config.js +11 -0
- crew-builder/tailwind.config.ts +53 -0
- crew-builder/tsconfig.json +3 -0
- crew-builder/vite.config.ts +10 -0
- mcp_servers/calculator_server.py +309 -0
- parrot/__init__.py +27 -0
- parrot/__pycache__/__init__.cpython-310.pyc +0 -0
- parrot/__pycache__/version.cpython-310.pyc +0 -0
- parrot/_version.py +34 -0
- parrot/a2a/__init__.py +48 -0
- parrot/a2a/client.py +658 -0
- parrot/a2a/discovery.py +89 -0
- parrot/a2a/mixin.py +257 -0
- parrot/a2a/models.py +376 -0
- parrot/a2a/server.py +770 -0
- parrot/agents/__init__.py +29 -0
- parrot/bots/__init__.py +12 -0
- parrot/bots/a2a_agent.py +19 -0
- parrot/bots/abstract.py +3139 -0
- parrot/bots/agent.py +1129 -0
- parrot/bots/basic.py +9 -0
- parrot/bots/chatbot.py +669 -0
- parrot/bots/data.py +1618 -0
- parrot/bots/database/__init__.py +5 -0
- parrot/bots/database/abstract.py +3071 -0
- parrot/bots/database/cache.py +286 -0
- parrot/bots/database/models.py +468 -0
- parrot/bots/database/prompts.py +154 -0
- parrot/bots/database/retries.py +98 -0
- parrot/bots/database/router.py +269 -0
- parrot/bots/database/sql.py +41 -0
- parrot/bots/db/__init__.py +6 -0
- parrot/bots/db/abstract.py +556 -0
- parrot/bots/db/bigquery.py +602 -0
- parrot/bots/db/cache.py +85 -0
- parrot/bots/db/documentdb.py +668 -0
- parrot/bots/db/elastic.py +1014 -0
- parrot/bots/db/influx.py +898 -0
- parrot/bots/db/mock.py +96 -0
- parrot/bots/db/multi.py +783 -0
- parrot/bots/db/prompts.py +185 -0
- parrot/bots/db/sql.py +1255 -0
- parrot/bots/db/tools.py +212 -0
- parrot/bots/document.py +680 -0
- parrot/bots/hrbot.py +15 -0
- parrot/bots/kb.py +170 -0
- parrot/bots/mcp.py +36 -0
- parrot/bots/orchestration/README.md +463 -0
- parrot/bots/orchestration/__init__.py +1 -0
- parrot/bots/orchestration/agent.py +155 -0
- parrot/bots/orchestration/crew.py +3330 -0
- parrot/bots/orchestration/fsm.py +1179 -0
- parrot/bots/orchestration/hr.py +434 -0
- parrot/bots/orchestration/storage/__init__.py +4 -0
- parrot/bots/orchestration/storage/memory.py +100 -0
- parrot/bots/orchestration/storage/mixin.py +119 -0
- parrot/bots/orchestration/verify.py +202 -0
- parrot/bots/product.py +204 -0
- parrot/bots/prompts/__init__.py +96 -0
- parrot/bots/prompts/agents.py +155 -0
- parrot/bots/prompts/data.py +216 -0
- parrot/bots/prompts/output_generation.py +8 -0
- parrot/bots/scraper/__init__.py +3 -0
- parrot/bots/scraper/models.py +122 -0
- parrot/bots/scraper/scraper.py +1173 -0
- parrot/bots/scraper/templates.py +115 -0
- parrot/bots/stores/__init__.py +5 -0
- parrot/bots/stores/local.py +172 -0
- parrot/bots/webdev.py +81 -0
- parrot/cli.py +17 -0
- parrot/clients/__init__.py +16 -0
- parrot/clients/base.py +1491 -0
- parrot/clients/claude.py +1191 -0
- parrot/clients/factory.py +129 -0
- parrot/clients/google.py +4567 -0
- parrot/clients/gpt.py +1975 -0
- parrot/clients/grok.py +432 -0
- parrot/clients/groq.py +986 -0
- parrot/clients/hf.py +582 -0
- parrot/clients/models.py +18 -0
- parrot/conf.py +395 -0
- parrot/embeddings/__init__.py +9 -0
- parrot/embeddings/base.py +157 -0
- parrot/embeddings/google.py +98 -0
- parrot/embeddings/huggingface.py +74 -0
- parrot/embeddings/openai.py +84 -0
- parrot/embeddings/processor.py +88 -0
- parrot/exceptions.c +13868 -0
- parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/exceptions.pxd +22 -0
- parrot/exceptions.pxi +15 -0
- parrot/exceptions.pyx +44 -0
- parrot/generators/__init__.py +29 -0
- parrot/generators/base.py +200 -0
- parrot/generators/html.py +293 -0
- parrot/generators/react.py +205 -0
- parrot/generators/streamlit.py +203 -0
- parrot/generators/template.py +105 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/agent.py +861 -0
- parrot/handlers/agents/__init__.py +1 -0
- parrot/handlers/agents/abstract.py +900 -0
- parrot/handlers/bots.py +338 -0
- parrot/handlers/chat.py +915 -0
- parrot/handlers/creation.sql +192 -0
- parrot/handlers/crew/ARCHITECTURE.md +362 -0
- parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
- parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
- parrot/handlers/crew/__init__.py +0 -0
- parrot/handlers/crew/handler.py +801 -0
- parrot/handlers/crew/models.py +229 -0
- parrot/handlers/crew/redis_persistence.py +523 -0
- parrot/handlers/jobs/__init__.py +10 -0
- parrot/handlers/jobs/job.py +384 -0
- parrot/handlers/jobs/mixin.py +627 -0
- parrot/handlers/jobs/models.py +115 -0
- parrot/handlers/jobs/worker.py +31 -0
- parrot/handlers/models.py +596 -0
- parrot/handlers/o365_auth.py +105 -0
- parrot/handlers/stream.py +337 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/aws.py +143 -0
- parrot/interfaces/credentials.py +113 -0
- parrot/interfaces/database.py +27 -0
- parrot/interfaces/google.py +1123 -0
- parrot/interfaces/hierarchy.py +1227 -0
- parrot/interfaces/http.py +651 -0
- parrot/interfaces/images/__init__.py +0 -0
- parrot/interfaces/images/plugins/__init__.py +24 -0
- parrot/interfaces/images/plugins/abstract.py +58 -0
- parrot/interfaces/images/plugins/analisys.py +148 -0
- parrot/interfaces/images/plugins/classify.py +150 -0
- parrot/interfaces/images/plugins/classifybase.py +182 -0
- parrot/interfaces/images/plugins/detect.py +150 -0
- parrot/interfaces/images/plugins/exif.py +1103 -0
- parrot/interfaces/images/plugins/hash.py +52 -0
- parrot/interfaces/images/plugins/vision.py +104 -0
- parrot/interfaces/images/plugins/yolo.py +66 -0
- parrot/interfaces/images/plugins/zerodetect.py +197 -0
- parrot/interfaces/o365.py +978 -0
- parrot/interfaces/onedrive.py +822 -0
- parrot/interfaces/sharepoint.py +1435 -0
- parrot/interfaces/soap.py +257 -0
- parrot/loaders/__init__.py +8 -0
- parrot/loaders/abstract.py +1131 -0
- parrot/loaders/audio.py +199 -0
- parrot/loaders/basepdf.py +53 -0
- parrot/loaders/basevideo.py +1568 -0
- parrot/loaders/csv.py +409 -0
- parrot/loaders/docx.py +116 -0
- parrot/loaders/epubloader.py +316 -0
- parrot/loaders/excel.py +199 -0
- parrot/loaders/factory.py +55 -0
- parrot/loaders/files/__init__.py +0 -0
- parrot/loaders/files/abstract.py +39 -0
- parrot/loaders/files/html.py +26 -0
- parrot/loaders/files/text.py +63 -0
- parrot/loaders/html.py +152 -0
- parrot/loaders/markdown.py +442 -0
- parrot/loaders/pdf.py +373 -0
- parrot/loaders/pdfmark.py +320 -0
- parrot/loaders/pdftables.py +506 -0
- parrot/loaders/ppt.py +476 -0
- parrot/loaders/qa.py +63 -0
- parrot/loaders/splitters/__init__.py +10 -0
- parrot/loaders/splitters/base.py +138 -0
- parrot/loaders/splitters/md.py +228 -0
- parrot/loaders/splitters/token.py +143 -0
- parrot/loaders/txt.py +26 -0
- parrot/loaders/video.py +89 -0
- parrot/loaders/videolocal.py +218 -0
- parrot/loaders/videounderstanding.py +377 -0
- parrot/loaders/vimeo.py +167 -0
- parrot/loaders/web.py +599 -0
- parrot/loaders/youtube.py +504 -0
- parrot/manager/__init__.py +5 -0
- parrot/manager/manager.py +1030 -0
- parrot/mcp/__init__.py +28 -0
- parrot/mcp/adapter.py +105 -0
- parrot/mcp/cli.py +174 -0
- parrot/mcp/client.py +119 -0
- parrot/mcp/config.py +75 -0
- parrot/mcp/integration.py +842 -0
- parrot/mcp/oauth.py +933 -0
- parrot/mcp/server.py +225 -0
- parrot/mcp/transports/__init__.py +3 -0
- parrot/mcp/transports/base.py +279 -0
- parrot/mcp/transports/grpc_session.py +163 -0
- parrot/mcp/transports/http.py +312 -0
- parrot/mcp/transports/mcp.proto +108 -0
- parrot/mcp/transports/quic.py +1082 -0
- parrot/mcp/transports/sse.py +330 -0
- parrot/mcp/transports/stdio.py +309 -0
- parrot/mcp/transports/unix.py +395 -0
- parrot/mcp/transports/websocket.py +547 -0
- parrot/memory/__init__.py +16 -0
- parrot/memory/abstract.py +209 -0
- parrot/memory/agent.py +32 -0
- parrot/memory/cache.py +175 -0
- parrot/memory/core.py +555 -0
- parrot/memory/file.py +153 -0
- parrot/memory/mem.py +131 -0
- parrot/memory/redis.py +613 -0
- parrot/models/__init__.py +46 -0
- parrot/models/basic.py +118 -0
- parrot/models/compliance.py +208 -0
- parrot/models/crew.py +395 -0
- parrot/models/detections.py +654 -0
- parrot/models/generation.py +85 -0
- parrot/models/google.py +223 -0
- parrot/models/groq.py +23 -0
- parrot/models/openai.py +30 -0
- parrot/models/outputs.py +285 -0
- parrot/models/responses.py +938 -0
- parrot/notifications/__init__.py +743 -0
- parrot/openapi/__init__.py +3 -0
- parrot/openapi/components.yaml +641 -0
- parrot/openapi/config.py +322 -0
- parrot/outputs/__init__.py +32 -0
- parrot/outputs/formats/__init__.py +108 -0
- parrot/outputs/formats/altair.py +359 -0
- parrot/outputs/formats/application.py +122 -0
- parrot/outputs/formats/base.py +351 -0
- parrot/outputs/formats/bokeh.py +356 -0
- parrot/outputs/formats/card.py +424 -0
- parrot/outputs/formats/chart.py +436 -0
- parrot/outputs/formats/d3.py +255 -0
- parrot/outputs/formats/echarts.py +310 -0
- parrot/outputs/formats/generators/__init__.py +0 -0
- parrot/outputs/formats/generators/abstract.py +61 -0
- parrot/outputs/formats/generators/panel.py +145 -0
- parrot/outputs/formats/generators/streamlit.py +86 -0
- parrot/outputs/formats/generators/terminal.py +63 -0
- parrot/outputs/formats/holoviews.py +310 -0
- parrot/outputs/formats/html.py +147 -0
- parrot/outputs/formats/jinja2.py +46 -0
- parrot/outputs/formats/json.py +87 -0
- parrot/outputs/formats/map.py +933 -0
- parrot/outputs/formats/markdown.py +172 -0
- parrot/outputs/formats/matplotlib.py +237 -0
- parrot/outputs/formats/mixins/__init__.py +0 -0
- parrot/outputs/formats/mixins/emaps.py +855 -0
- parrot/outputs/formats/plotly.py +341 -0
- parrot/outputs/formats/seaborn.py +310 -0
- parrot/outputs/formats/table.py +397 -0
- parrot/outputs/formats/template_report.py +138 -0
- parrot/outputs/formats/yaml.py +125 -0
- parrot/outputs/formatter.py +152 -0
- parrot/outputs/templates/__init__.py +95 -0
- parrot/pipelines/__init__.py +0 -0
- parrot/pipelines/abstract.py +210 -0
- parrot/pipelines/detector.py +124 -0
- parrot/pipelines/models.py +90 -0
- parrot/pipelines/planogram.py +3002 -0
- parrot/pipelines/table.sql +97 -0
- parrot/plugins/__init__.py +106 -0
- parrot/plugins/importer.py +80 -0
- parrot/py.typed +0 -0
- parrot/registry/__init__.py +18 -0
- parrot/registry/registry.py +594 -0
- parrot/scheduler/__init__.py +1189 -0
- parrot/scheduler/models.py +60 -0
- parrot/security/__init__.py +16 -0
- parrot/security/prompt_injection.py +268 -0
- parrot/security/security_events.sql +25 -0
- parrot/services/__init__.py +1 -0
- parrot/services/mcp/__init__.py +8 -0
- parrot/services/mcp/config.py +13 -0
- parrot/services/mcp/server.py +295 -0
- parrot/services/o365_remote_auth.py +235 -0
- parrot/stores/__init__.py +7 -0
- parrot/stores/abstract.py +352 -0
- parrot/stores/arango.py +1090 -0
- parrot/stores/bigquery.py +1377 -0
- parrot/stores/cache.py +106 -0
- parrot/stores/empty.py +10 -0
- parrot/stores/faiss_store.py +1157 -0
- parrot/stores/kb/__init__.py +9 -0
- parrot/stores/kb/abstract.py +68 -0
- parrot/stores/kb/cache.py +165 -0
- parrot/stores/kb/doc.py +325 -0
- parrot/stores/kb/hierarchy.py +346 -0
- parrot/stores/kb/local.py +457 -0
- parrot/stores/kb/prompt.py +28 -0
- parrot/stores/kb/redis.py +659 -0
- parrot/stores/kb/store.py +115 -0
- parrot/stores/kb/user.py +374 -0
- parrot/stores/models.py +59 -0
- parrot/stores/pgvector.py +3 -0
- parrot/stores/postgres.py +2853 -0
- parrot/stores/utils/__init__.py +0 -0
- parrot/stores/utils/chunking.py +197 -0
- parrot/telemetry/__init__.py +3 -0
- parrot/telemetry/mixin.py +111 -0
- parrot/template/__init__.py +3 -0
- parrot/template/engine.py +259 -0
- parrot/tools/__init__.py +23 -0
- parrot/tools/abstract.py +644 -0
- parrot/tools/agent.py +363 -0
- parrot/tools/arangodbsearch.py +537 -0
- parrot/tools/arxiv_tool.py +188 -0
- parrot/tools/calculator/__init__.py +3 -0
- parrot/tools/calculator/operations/__init__.py +38 -0
- parrot/tools/calculator/operations/calculus.py +80 -0
- parrot/tools/calculator/operations/statistics.py +76 -0
- parrot/tools/calculator/tool.py +150 -0
- parrot/tools/cloudwatch.py +988 -0
- parrot/tools/codeinterpreter/__init__.py +127 -0
- parrot/tools/codeinterpreter/executor.py +371 -0
- parrot/tools/codeinterpreter/internals.py +473 -0
- parrot/tools/codeinterpreter/models.py +643 -0
- parrot/tools/codeinterpreter/prompts.py +224 -0
- parrot/tools/codeinterpreter/tool.py +664 -0
- parrot/tools/company_info/__init__.py +6 -0
- parrot/tools/company_info/tool.py +1138 -0
- parrot/tools/correlationanalysis.py +437 -0
- parrot/tools/database/abstract.py +286 -0
- parrot/tools/database/bq.py +115 -0
- parrot/tools/database/cache.py +284 -0
- parrot/tools/database/models.py +95 -0
- parrot/tools/database/pg.py +343 -0
- parrot/tools/databasequery.py +1159 -0
- parrot/tools/db.py +1800 -0
- parrot/tools/ddgo.py +370 -0
- parrot/tools/decorators.py +271 -0
- parrot/tools/dftohtml.py +282 -0
- parrot/tools/document.py +549 -0
- parrot/tools/ecs.py +819 -0
- parrot/tools/edareport.py +368 -0
- parrot/tools/elasticsearch.py +1049 -0
- parrot/tools/employees.py +462 -0
- parrot/tools/epson/__init__.py +96 -0
- parrot/tools/excel.py +683 -0
- parrot/tools/file/__init__.py +13 -0
- parrot/tools/file/abstract.py +76 -0
- parrot/tools/file/gcs.py +378 -0
- parrot/tools/file/local.py +284 -0
- parrot/tools/file/s3.py +511 -0
- parrot/tools/file/tmp.py +309 -0
- parrot/tools/file/tool.py +501 -0
- parrot/tools/file_reader.py +129 -0
- parrot/tools/flowtask/__init__.py +19 -0
- parrot/tools/flowtask/tool.py +761 -0
- parrot/tools/gittoolkit.py +508 -0
- parrot/tools/google/__init__.py +18 -0
- parrot/tools/google/base.py +169 -0
- parrot/tools/google/tools.py +1251 -0
- parrot/tools/googlelocation.py +5 -0
- parrot/tools/googleroutes.py +5 -0
- parrot/tools/googlesearch.py +5 -0
- parrot/tools/googlesitesearch.py +5 -0
- parrot/tools/googlevoice.py +2 -0
- parrot/tools/gvoice.py +695 -0
- parrot/tools/ibisworld/README.md +225 -0
- parrot/tools/ibisworld/__init__.py +11 -0
- parrot/tools/ibisworld/tool.py +366 -0
- parrot/tools/jiratoolkit.py +1718 -0
- parrot/tools/manager.py +1098 -0
- parrot/tools/math.py +152 -0
- parrot/tools/metadata.py +476 -0
- parrot/tools/msteams.py +1621 -0
- parrot/tools/msword.py +635 -0
- parrot/tools/multidb.py +580 -0
- parrot/tools/multistoresearch.py +369 -0
- parrot/tools/networkninja.py +167 -0
- parrot/tools/nextstop/__init__.py +4 -0
- parrot/tools/nextstop/base.py +286 -0
- parrot/tools/nextstop/employee.py +733 -0
- parrot/tools/nextstop/store.py +462 -0
- parrot/tools/notification.py +435 -0
- parrot/tools/o365/__init__.py +42 -0
- parrot/tools/o365/base.py +295 -0
- parrot/tools/o365/bundle.py +522 -0
- parrot/tools/o365/events.py +554 -0
- parrot/tools/o365/mail.py +992 -0
- parrot/tools/o365/onedrive.py +497 -0
- parrot/tools/o365/sharepoint.py +641 -0
- parrot/tools/openapi_toolkit.py +904 -0
- parrot/tools/openweather.py +527 -0
- parrot/tools/pdfprint.py +1001 -0
- parrot/tools/powerbi.py +518 -0
- parrot/tools/powerpoint.py +1113 -0
- parrot/tools/pricestool.py +146 -0
- parrot/tools/products/__init__.py +246 -0
- parrot/tools/prophet_tool.py +171 -0
- parrot/tools/pythonpandas.py +630 -0
- parrot/tools/pythonrepl.py +910 -0
- parrot/tools/qsource.py +436 -0
- parrot/tools/querytoolkit.py +395 -0
- parrot/tools/quickeda.py +827 -0
- parrot/tools/resttool.py +553 -0
- parrot/tools/retail/__init__.py +0 -0
- parrot/tools/retail/bby.py +528 -0
- parrot/tools/sandboxtool.py +703 -0
- parrot/tools/sassie/__init__.py +352 -0
- parrot/tools/scraping/__init__.py +7 -0
- parrot/tools/scraping/docs/select.md +466 -0
- parrot/tools/scraping/documentation.md +1278 -0
- parrot/tools/scraping/driver.py +436 -0
- parrot/tools/scraping/models.py +576 -0
- parrot/tools/scraping/options.py +85 -0
- parrot/tools/scraping/orchestrator.py +517 -0
- parrot/tools/scraping/readme.md +740 -0
- parrot/tools/scraping/tool.py +3115 -0
- parrot/tools/seasonaldetection.py +642 -0
- parrot/tools/shell_tool/__init__.py +5 -0
- parrot/tools/shell_tool/actions.py +408 -0
- parrot/tools/shell_tool/engine.py +155 -0
- parrot/tools/shell_tool/models.py +322 -0
- parrot/tools/shell_tool/tool.py +442 -0
- parrot/tools/site_search.py +214 -0
- parrot/tools/textfile.py +418 -0
- parrot/tools/think.py +378 -0
- parrot/tools/toolkit.py +298 -0
- parrot/tools/webapp_tool.py +187 -0
- parrot/tools/whatif.py +1279 -0
- parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
- parrot/tools/workday/__init__.py +6 -0
- parrot/tools/workday/models.py +1389 -0
- parrot/tools/workday/tool.py +1293 -0
- parrot/tools/yfinance_tool.py +306 -0
- parrot/tools/zipcode.py +217 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/helpers.py +73 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.c +12078 -0
- parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/parsers/toml.pyx +21 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpp +20936 -0
- parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/types.pyx +213 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- parrot/yaml-rs/Cargo.lock +350 -0
- parrot/yaml-rs/Cargo.toml +19 -0
- parrot/yaml-rs/pyproject.toml +19 -0
- parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
- parrot/yaml-rs/src/lib.rs +222 -0
- requirements/docker-compose.yml +24 -0
- requirements/requirements-dev.txt +21 -0
parrot/loaders/ppt.py
ADDED
|
@@ -0,0 +1,476 @@
|
|
|
1
|
+
from typing import List, Union, Optional, Literal
|
|
2
|
+
from pathlib import PurePath
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
import re
|
|
5
|
+
from ..stores.models import Document
|
|
6
|
+
from .abstract import AbstractLoader
|
|
7
|
+
|
|
8
|
+
# Optional dependencies
|
|
9
|
+
try:
|
|
10
|
+
from markitdown import MarkItDown
|
|
11
|
+
MARKITDOWN_AVAILABLE = True
|
|
12
|
+
except ImportError:
|
|
13
|
+
MARKITDOWN_AVAILABLE = False
|
|
14
|
+
|
|
15
|
+
try:
|
|
16
|
+
from pptx import Presentation
|
|
17
|
+
PPTX_AVAILABLE = True
|
|
18
|
+
except ImportError:
|
|
19
|
+
PPTX_AVAILABLE = False
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class PowerPointLoader(AbstractLoader):
|
|
23
|
+
"""
|
|
24
|
+
Enhanced PowerPoint loader with multiple backends.
|
|
25
|
+
|
|
26
|
+
Supports:
|
|
27
|
+
1. MarkItDown backend for rich markdown extraction (primary)
|
|
28
|
+
2. python-pptx backend for detailed control and fallback
|
|
29
|
+
|
|
30
|
+
Features:
|
|
31
|
+
- Slide-by-slide processing with proper markdown formatting
|
|
32
|
+
- Automatic slide title detection
|
|
33
|
+
- Bullet point preservation
|
|
34
|
+
- Slide notes extraction
|
|
35
|
+
- Image-only slide detection and filtering
|
|
36
|
+
- Configurable output formats
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
extensions: List[str] = ['.pptx', '.ppt']
|
|
40
|
+
|
|
41
|
+
def __init__(
    self,
    source: Optional[Union[str, PurePath, List[PurePath]]] = None,
    *,
    tokenizer: Union[str, Callable] = None,
    text_splitter: Union[str, Callable] = None,
    source_type: str = 'file',
    backend: str = "auto",
    output_format: Literal["markdown", "plain"] = "markdown",
    skip_image_only_slides: bool = True,
    skip_empty_slides: bool = True,
    extract_slide_notes: bool = True,
    preserve_slide_structure: bool = True,
    min_slide_content_length: int = 10,
    clean_whitespace: bool = True,
    merge_consecutive_headers: bool = True,
    **kwargs
):
    """
    Configure the loader and initialise the extraction backend.

    Args:
        source: Forwarded positionally to the parent loader.
        tokenizer: Forwarded to the parent loader.
        text_splitter: Forwarded to the parent loader.
        source_type: Forwarded to the parent loader.
        backend: Requested backend ("markitdown", "pptx" or "auto");
            resolved through ``_select_backend``.
        output_format: "markdown" or "plain"; consulted when slide text
            is formatted as markdown.
        skip_image_only_slides: Stored as the attribute of the same name.
        skip_empty_slides: Stored as the attribute of the same name.
        extract_slide_notes: When true, slide notes are appended to the
            formatted slide output.
        preserve_slide_structure: Stored as the attribute of the same name.
        min_slide_content_length: Markdown sections shorter than this many
            characters are not turned into slides.
        clean_whitespace: When true, ``_clean_content`` collapses runs of
            whitespace inside each line.
        merge_consecutive_headers: Stored as the attribute of the same name.
        **kwargs: Forwarded to the parent loader.
    """
    # The parent initialiser runs first: backend selection below logs
    # through self.logger.
    super().__init__(
        source,
        tokenizer=tokenizer,
        text_splitter=text_splitter,
        source_type=source_type,
        **kwargs
    )

    # Backend and output format
    self.backend = self._select_backend(backend)
    self.output_format = output_format

    # Slide filtering / processing switches
    self.skip_image_only_slides = skip_image_only_slides
    self.skip_empty_slides = skip_empty_slides
    self.extract_slide_notes = extract_slide_notes
    self.preserve_slide_structure = preserve_slide_structure
    self.min_slide_content_length = min_slide_content_length

    # Text post-processing switches
    self.clean_whitespace = clean_whitespace
    self.merge_consecutive_headers = merge_consecutive_headers

    # Everything is configured; bring the chosen backend up.
    self._setup_backend()
|
|
95
|
+
|
|
96
|
+
def _select_backend(self, preferred: str) -> str:
    """
    Resolve the requested backend name to one that is importable.

    Args:
        preferred: "markitdown", "pptx" or "auto".

    Returns:
        "markitdown" or "pptx".

    Raises:
        ImportError: If neither backend library could be imported.
    """
    if preferred != "auto":
        # An explicit request is honoured only when its library is present.
        if preferred == "markitdown" and MARKITDOWN_AVAILABLE:
            return "markitdown"
        if preferred == "pptx" and PPTX_AVAILABLE:
            return "pptx"
        self.logger.warning(
            f"Backend '{preferred}' not available, falling back"
        )
        return self._select_backend("auto")

    # Automatic selection: MarkItDown first, python-pptx second.
    if MARKITDOWN_AVAILABLE:
        return "markitdown"
    if PPTX_AVAILABLE:
        return "pptx"
    raise ImportError(
        "No PowerPoint processing backend available. Install 'markitdown' or 'python-pptx'"
    )
|
|
116
|
+
|
|
117
|
+
def _setup_backend(self):
|
|
118
|
+
"""Initialize the selected backend."""
|
|
119
|
+
if self.backend == "markitdown":
|
|
120
|
+
self.md_converter = MarkItDown()
|
|
121
|
+
self.logger.info("Using MarkItDown backend for PowerPoint processing")
|
|
122
|
+
else:
|
|
123
|
+
self.logger.info("Using python-pptx backend for PowerPoint processing")
|
|
124
|
+
|
|
125
|
+
def _clean_content(self, content: str) -> str:
|
|
126
|
+
"""Clean and normalize content."""
|
|
127
|
+
if not content:
|
|
128
|
+
return ""
|
|
129
|
+
|
|
130
|
+
if self.clean_whitespace:
|
|
131
|
+
# Normalize whitespace while preserving markdown structure
|
|
132
|
+
lines = content.split('\n')
|
|
133
|
+
cleaned_lines = []
|
|
134
|
+
for line in lines:
|
|
135
|
+
cleaned_line = ' '.join(line.split())
|
|
136
|
+
cleaned_lines.append(cleaned_line)
|
|
137
|
+
content = '\n'.join(cleaned_lines)
|
|
138
|
+
|
|
139
|
+
return content.strip()
|
|
140
|
+
|
|
141
|
+
def _extract_slides_from_markdown(self, markdown_content: str) -> List[dict]:
|
|
142
|
+
"""Extract individual slides from MarkItDown markdown output."""
|
|
143
|
+
slides = []
|
|
144
|
+
|
|
145
|
+
# Split by slide separators (MarkItDown typically uses headers or page breaks)
|
|
146
|
+
# Try multiple patterns for slide separation
|
|
147
|
+
slide_patterns = [
|
|
148
|
+
r'\n(?=#{1,2}\s)', # Level 1-2 headers (typical slide titles)
|
|
149
|
+
r'\n---+\n', # Horizontal rules
|
|
150
|
+
r'\n\*{3,}\n', # Multiple asterisks
|
|
151
|
+
r'(?:\n\s*){3,}', # Multiple blank lines
|
|
152
|
+
]
|
|
153
|
+
|
|
154
|
+
slide_sections = [markdown_content] # Start with full content
|
|
155
|
+
|
|
156
|
+
for pattern in slide_patterns:
|
|
157
|
+
new_sections = []
|
|
158
|
+
for section in slide_sections:
|
|
159
|
+
parts = re.split(pattern, section)
|
|
160
|
+
new_sections.extend([part.strip() for part in parts if part.strip()])
|
|
161
|
+
if len(new_sections) > len(slide_sections):
|
|
162
|
+
slide_sections = new_sections
|
|
163
|
+
break
|
|
164
|
+
|
|
165
|
+
# Process each section as a potential slide
|
|
166
|
+
for i, section in enumerate(slide_sections):
|
|
167
|
+
if len(section) < self.min_slide_content_length:
|
|
168
|
+
continue
|
|
169
|
+
|
|
170
|
+
# Extract title (first header if present)
|
|
171
|
+
title_match = re.match(r'^(#{1,3})\s*(.+)$', section, re.MULTILINE)
|
|
172
|
+
title = title_match.group(2) if title_match else f"Slide {i+1}"
|
|
173
|
+
|
|
174
|
+
# Extract content (everything after title or full content if no title)
|
|
175
|
+
if title_match:
|
|
176
|
+
content_start = section.find('\n', title_match.end())
|
|
177
|
+
content = section[content_start:].strip() if content_start != -1 else ""
|
|
178
|
+
else:
|
|
179
|
+
content = section.strip()
|
|
180
|
+
|
|
181
|
+
slides.append({
|
|
182
|
+
"slide_number": i + 1,
|
|
183
|
+
"title": title,
|
|
184
|
+
"content": content,
|
|
185
|
+
"full_content": section,
|
|
186
|
+
"has_title": bool(title_match)
|
|
187
|
+
})
|
|
188
|
+
|
|
189
|
+
return slides
|
|
190
|
+
|
|
191
|
+
def _process_markitdown_content(self, path: Union[str, PurePath]) -> List[dict]:
|
|
192
|
+
"""Process PowerPoint using MarkItDown backend."""
|
|
193
|
+
try:
|
|
194
|
+
result = self.md_converter.convert(str(path))
|
|
195
|
+
if not result or not result.text_content:
|
|
196
|
+
self.logger.warning("MarkItDown returned empty content")
|
|
197
|
+
return []
|
|
198
|
+
|
|
199
|
+
markdown_content = result.text_content
|
|
200
|
+
slides = self._extract_slides_from_markdown(markdown_content)
|
|
201
|
+
|
|
202
|
+
self.logger.info(f"MarkItDown extracted {len(slides)} slides")
|
|
203
|
+
return slides
|
|
204
|
+
|
|
205
|
+
except Exception as e:
|
|
206
|
+
self.logger.error(f"MarkItDown processing failed: {e}")
|
|
207
|
+
return []
|
|
208
|
+
|
|
209
|
+
# Original python-pptx methods (preserved as fallback)
|
|
210
|
+
def extract_slide_text(self, slide):
    """
    Collect the text of every shape on a slide into one string.

    Shapes without a ``text`` attribute, or whose text is blank, are
    ignored. The remaining texts are stripped and joined with blank
    lines.
    """
    pieces = [
        shape.text.strip()
        for shape in slide.shapes
        if hasattr(shape, "text") and shape.text.strip()
    ]
    return "\n\n".join(pieces).strip()
|
|
217
|
+
|
|
218
|
+
def slide_has_text(self, slide) -> bool:
    """Return True when at least one shape on the slide carries non-blank text."""
    return any(
        hasattr(shape, "text") and bool(shape.text.strip())
        for shape in slide.shapes
    )
|
|
224
|
+
|
|
225
|
+
def slide_has_images_only(self, slide) -> bool:
    """
    Tell whether a slide consists of pictures without any text.

    Returns False as soon as a shape with non-blank text is met;
    otherwise returns whether at least one picture shape was seen.
    """
    picture_shape_type = 13  # PICTURE shape type in python-pptx
    picture_found = False

    for shp in slide.shapes:
        if shp.shape_type == picture_shape_type:
            picture_found = True
        carries_text = hasattr(shp, "text") and shp.text.strip()
        if carries_text:
            # Any text at all disqualifies the slide.
            return False

    return picture_found
|
|
234
|
+
|
|
235
|
+
def _extract_slide_title(self, slide) -> str:
|
|
236
|
+
"""Extract slide title from python-pptx slide object."""
|
|
237
|
+
# Try to get title from title placeholder
|
|
238
|
+
try:
|
|
239
|
+
if slide.shapes.title and slide.shapes.title.text.strip():
|
|
240
|
+
return slide.shapes.title.text.strip()
|
|
241
|
+
except:
|
|
242
|
+
pass
|
|
243
|
+
|
|
244
|
+
# Look for first text shape that looks like a title
|
|
245
|
+
for shape in slide.shapes:
|
|
246
|
+
if hasattr(shape, "text") and shape.text.strip():
|
|
247
|
+
text = shape.text.strip()
|
|
248
|
+
# Simple heuristic: short text, single line, likely a title
|
|
249
|
+
if len(text) < 100 and '\n' not in text:
|
|
250
|
+
return text
|
|
251
|
+
break
|
|
252
|
+
|
|
253
|
+
return ""
|
|
254
|
+
|
|
255
|
+
def _format_slide_as_markdown(self, slide_data: dict, slide_text: str, slide_notes: str = "") -> str:
|
|
256
|
+
"""Format slide content as markdown."""
|
|
257
|
+
markdown_parts = []
|
|
258
|
+
|
|
259
|
+
# Add title
|
|
260
|
+
if slide_data.get("title"):
|
|
261
|
+
markdown_parts.append(f"# {slide_data['title']}")
|
|
262
|
+
elif not slide_data.get("has_title", False):
|
|
263
|
+
markdown_parts.append(f"# Slide {slide_data['slide_number']}")
|
|
264
|
+
|
|
265
|
+
# Add main content
|
|
266
|
+
if slide_text:
|
|
267
|
+
# Convert plain text to markdown if needed
|
|
268
|
+
if self.output_format == "markdown" and not slide_data.get("full_content"):
|
|
269
|
+
# Basic markdown conversion for bullet points
|
|
270
|
+
content_lines = []
|
|
271
|
+
for line in slide_text.split('\n'):
|
|
272
|
+
line = line.strip()
|
|
273
|
+
if line:
|
|
274
|
+
# Convert indented text to bullet points
|
|
275
|
+
if line.startswith('•') or line.startswith('-'):
|
|
276
|
+
content_lines.append(f"- {line[1:].strip()}")
|
|
277
|
+
elif line.startswith(' ') or line.startswith('\t'):
|
|
278
|
+
content_lines.append(f"- {line.strip()}")
|
|
279
|
+
else:
|
|
280
|
+
content_lines.append(line)
|
|
281
|
+
markdown_parts.append('\n'.join(content_lines))
|
|
282
|
+
else:
|
|
283
|
+
markdown_parts.append(slide_text)
|
|
284
|
+
|
|
285
|
+
# Add notes if present
|
|
286
|
+
if slide_notes and self.extract_slide_notes:
|
|
287
|
+
markdown_parts.append("## Notes")
|
|
288
|
+
markdown_parts.append(slide_notes)
|
|
289
|
+
|
|
290
|
+
return '\n\n'.join(markdown_parts)
|
|
291
|
+
|
|
292
|
+
def _process_pptx_content(self, path: Union[str, PurePath]) -> List[dict]:
    """Read a presentation with python-pptx and describe each kept slide.

    Slides are dropped when they are image-only (if
    ``skip_image_only_slides`` is set) or when their text is missing or
    shorter than ``min_slide_content_length`` (if ``skip_empty_slides`` is
    set).

    Args:
        path: Location of the PowerPoint file.

    Returns:
        One dict per kept slide with the keys ``slide_number``,
        ``slide_id``, ``title``, ``content``, ``notes`` and ``has_title``.
        An empty list is returned when processing raises; the error is
        logged.

    Raises:
        ImportError: If python-pptx is not available.
    """
    if not PPTX_AVAILABLE:
        raise ImportError("python-pptx not available for fallback processing")

    try:
        presentation = Presentation(str(path))
        total = len(presentation.slides)
        extracted = []

        for position, slide in enumerate(presentation.slides, start=1):
            # Optionally drop slides that only contain pictures.
            if self.skip_image_only_slides and self.slide_has_images_only(slide):
                self.logger.debug(f"Slide {position}/{total}: only images, skipping.")
                continue

            body = self.extract_slide_text(slide)

            # Optionally drop slides with no (or too little) text.
            if self.skip_empty_slides and not (
                body and len(body) >= self.min_slide_content_length
            ):
                self.logger.debug(f"Slide {position}/{total}: no sufficient text content, skipping.")
                continue

            title = self._extract_slide_title(slide)

            # Speaker notes are read only when enabled and present.
            notes = ""
            if self.extract_slide_notes and slide.has_notes_slide:
                if slide.notes_slide.notes_text_frame:
                    notes = slide.notes_slide.notes_text_frame.text.strip()

            extracted.append({
                "slide_number": position,
                "slide_id": slide.slide_id,
                "title": title,
                "content": body,
                "notes": notes,
                "has_title": bool(title)
            })

        self.logger.info(f"python-pptx extracted {len(extracted)} slides from {total} total slides")
        return extracted

    except Exception as e:
        self.logger.error(f"python-pptx processing failed: {e}")
        return []
|
|
339
|
+
|
|
340
|
+
async def _load(self, path: Union[str, PurePath, List[PurePath]], **kwargs) -> List[Document]:
    """
    Load PowerPoint presentation with enhanced markdown support.

    With the "markitdown" backend, MarkItDown is tried first and
    python-pptx is used as a fallback when it yields no slides and
    python-pptx is available. Any other backend value goes straight to
    python-pptx. Note that the ``extraction_backend`` metadata always
    reports ``self.backend``, even when the fallback produced the slides.

    Args:
        path: Path to the PowerPoint file
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        List of Document objects, one per slide. Slides whose cleaned
        content is empty or shorter than ``min_slide_content_length`` are
        skipped. The list is empty when no slide could be extracted.
    """
    self.logger.info(f"Loading PowerPoint file: {path}")
    docs = []

    # Try primary backend
    if self.backend == "markitdown":
        slides_data = self._process_markitdown_content(path)

        # Fallback to python-pptx if MarkItDown fails or returns no slides
        if not slides_data and PPTX_AVAILABLE:
            self.logger.info("MarkItDown failed or returned no slides, falling back to python-pptx")
            slides_data = self._process_pptx_content(path)
    else:
        slides_data = self._process_pptx_content(path)

    if not slides_data:
        self.logger.warning(f"No slides extracted from {path}")
        return docs

    as_markdown = self.output_format == "markdown"
    # The file name does not change between slides; compute it once.
    file_name = path.name if hasattr(path, 'name') else str(path).split('/')[-1]

    # Create documents for each slide
    for slide_data in slides_data:
        if as_markdown and self.backend == "markitdown" and slide_data.get("full_content"):
            # MarkItDown already produced markdown for this slide.
            content = slide_data["full_content"]
        elif as_markdown:
            content = self._format_slide_as_markdown(
                slide_data,
                slide_data.get("content", ""),
                slide_data.get("notes", "")
            )
        else:
            # Plain text format
            parts = []
            if slide_data.get("title"):
                parts.append(f"Title: {slide_data['title']}")
            if slide_data.get("content"):
                parts.append(slide_data["content"])
            if slide_data.get("notes") and self.extract_slide_notes:
                parts.append(f"Notes: {slide_data['notes']}")
            content = "\n\n".join(parts)

        content = self._clean_content(content)

        if not content or len(content) < self.min_slide_content_length:
            continue

        # Create metadata
        slide_meta = {
            "slide_number": slide_data["slide_number"],
            "slide_title": slide_data.get("title", ""),
            "has_notes": bool(slide_data.get("notes", "")),
            "content_length": len(content),
        }

        # Add backend-specific metadata
        if "slide_id" in slide_data:
            slide_meta["slide_id"] = slide_data["slide_id"]

        metadata = self.create_metadata(
            path=path,
            doctype="pptx",
            source_type="powerpoint",
            doc_metadata={
                **slide_meta,
                "extraction_backend": self.backend,
                "output_format": self.output_format,
            },
        )

        # Prepend a context header if preserve_slide_structure is True
        if self.preserve_slide_structure:
            context_parts = [
                f"File Name: {file_name}",
                f"Slide Number: {slide_data['slide_number']}",
                "Document Type: pptx",
                "Source Type: powerpoint",
            ]

            if slide_data.get("slide_id"):
                context_parts.append(f"Slide ID: {slide_data['slide_id']}")

            context_str = "\n".join(context_parts) + "\n======\n\n"
            full_content = context_str + content
        else:
            full_content = content

        doc = self.create_document(
            content=full_content,
            path=path,
            metadata=metadata
        )
        docs.append(doc)

    self.logger.info(f"Created {len(docs)} documents from PowerPoint slides")
    return docs
|
|
451
|
+
|
|
452
|
+
def get_supported_backends(self) -> List[str]:
    """List the backend names whose availability flag is set.

    "markitdown" comes first, followed by "pptx".
    """
    candidates = (
        ("markitdown", MARKITDOWN_AVAILABLE),
        ("pptx", PPTX_AVAILABLE),
    )
    return [name for name, available in candidates if available]
|
|
462
|
+
|
|
463
|
+
def get_backend_info(self) -> dict:
    """Summarise the loader's backend configuration.

    Returns:
        A dict with the configured backend, the backends reported by
        ``get_supported_backends()``, the output format and a nested
        ``settings`` dict of the slide-handling options.
    """
    setting_names = (
        "skip_image_only_slides",
        "skip_empty_slides",
        "extract_slide_notes",
        "preserve_slide_structure",
        "min_slide_content_length",
    )
    info = {"current_backend": self.backend}
    info["available_backends"] = self.get_supported_backends()
    info["output_format"] = self.output_format
    info["settings"] = {name: getattr(self, name) for name in setting_names}
    return info
|
parrot/loaders/qa.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
|
|
2
|
+
from pathlib import PurePath
|
|
3
|
+
from typing import List
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from ..stores.models import Document
|
|
6
|
+
from .abstract import AbstractLoader
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class QAFileLoader(AbstractLoader):
    """
    Question and Answers File based on Excel, converted to Parrot Documents.

    Each spreadsheet row becomes one Document whose content combines the
    question and the answer of that row.

    Keyword Args:
        columns: Sequence whose first two items name the question and the
            answer column. Defaults to ``['Question', 'Answer']``.
        question_column: Name of the question column; when given it takes
            precedence over ``columns[0]``.
        answer_column: Name of the answer column; when given it takes
            precedence over ``columns[1]``.
        doctype: Document type stored in the metadata. Defaults to ``'qa'``.
    """
    extensions: List[str] = ['.xlsx']
    chunk_size = 1024
    _source_type = 'QA-File'

    def __init__(
        self,
        *args,
        **kwargs
    ):
        columns = list(kwargs.pop('columns', None) or ['Question', 'Answer'])
        question_col = kwargs.pop('question_column', None)
        answer_col = kwargs.pop('answer_column', None)
        # An explicit question_column/answer_column wins; otherwise the
        # names come from `columns`, falling back to the defaults.
        if question_col is None:
            question_col = columns[0] if columns else 'Question'
        if answer_col is None:
            answer_col = columns[1] if len(columns) > 1 else 'Answer'
        self._question_col = question_col
        self._answer_col = answer_col
        # Keep `_columns` consistent with the resolved column names.
        self._columns = [question_col, answer_col, *columns[2:]]
        self.doctype = kwargs.pop('doctype', 'qa')
        super().__init__(*args, **kwargs)

    async def _load(self, path: PurePath, **kwargs) -> List[Document]:
        """Read the Excel file at ``path`` and build one Document per row.

        Args:
            path: Location of the ``.xlsx`` file.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            One Document per spreadsheet row, in row order.

        Raises:
            ValueError: If the question or answer column is missing.
        """
        # NOTE(review): read_excel is a synchronous call inside a coroutine.
        df = pd.read_excel(path, header=0, engine='openpyxl')
        # Strip surrounding spaces from the column names.
        df.columns = df.columns.str.strip()
        q = self._question_col
        a = self._answer_col
        if q not in df.columns or a not in df.columns:
            raise ValueError(
                f"Columns {q} and {a} must be present in the DataFrame."
            )
        docs = []
        for idx, row in df.iterrows():
            qs = row[q]
            answer = row[a]
            document_meta = {
                "question": qs,
                "answer": answer,
            }
            metadata = self.create_metadata(
                path=path,
                doctype=self.doctype,
                source_type=self._source_type,
                doc_metadata=document_meta,
                type="FAQ",
                question=qs,
                answer=answer,
            )
            doc = Document(
                page_content=f"{idx}. Question: {qs}: Answer: {answer}",
                metadata=metadata,
            )
            docs.append(doc)
        return docs
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import uuid
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import List, Dict, Any, Optional
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class TextChunk:
    """Represents a chunk of text with metadata.

    Instances are built by ``BaseTextSplitter.create_chunks`` in this file.
    """
    # The chunk's text.
    text: str
    # Index in the source text where the chunk starts.
    start_position: int
    # start_position plus the length of the chunk text.
    end_position: int
    # Size of the chunk as reported by the splitter's _count_tokens().
    token_count: int
    # Caller-supplied metadata merged with per-chunk entries.
    metadata: Dict[str, Any]
    # Optional identifier for the chunk; None when not assigned.
    chunk_id: Optional[str] = None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class BaseTextSplitter(ABC):
    """Base class for all text splitters.

    Subclasses implement ``split_text`` and ``_count_tokens``. This class
    provides chunk-object creation and overlap-aware merging of splits.
    """

    def __init__(
        self,
        chunk_size: int = 4000,
        chunk_overlap: int = 200,
        keep_separator: bool = True,
        add_start_index: bool = True
    ):
        """Store the splitter configuration.

        Args:
            chunk_size: Maximum chunk size, measured with ``_count_tokens``.
            chunk_overlap: Amount of trailing content repeated at the start
                of the next chunk.
            keep_separator: Stored for subclasses; not used in this class.
            add_start_index: When True, ``create_chunks`` adds
                ``start_index``/``end_index`` to each chunk's metadata.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.keep_separator = keep_separator
        self.add_start_index = add_start_index

    @abstractmethod
    def split_text(self, text: str) -> List[str]:
        """Split text into chunks"""
        pass

    def create_chunks(
        self,
        text: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> List[TextChunk]:
        """Create TextChunk objects with metadata.

        Args:
            text: Text to split.
            metadata: Optional entries copied into every chunk's metadata.

        Returns:
            One TextChunk per string returned by ``split_text``, in order.
            Each chunk's metadata also holds ``chunk_index``,
            ``total_chunks`` and ``splitter_type``.
        """
        text_chunks = self.split_text(text)
        total_chunks = len(text_chunks)
        chunks = []
        current_position = 0

        for i, chunk_text in enumerate(text_chunks):
            # Find the actual position in the original text; when the chunk
            # cannot be located, fall back to the search cursor.
            start_pos = text.find(chunk_text, current_position)
            if start_pos == -1:
                start_pos = current_position

            end_pos = start_pos + len(chunk_text)

            chunk_metadata = {
                **(metadata or {}),
                'chunk_index': i,
                'total_chunks': total_chunks,
                'splitter_type': self.__class__.__name__
            }

            if self.add_start_index:
                chunk_metadata['start_index'] = start_pos
                chunk_metadata['end_index'] = end_pos

            chunk = TextChunk(
                text=chunk_text,
                start_position=start_pos,
                end_position=end_pos,
                token_count=self._count_tokens(chunk_text),
                metadata=chunk_metadata,
                chunk_id=f"chunk_{i:04d}_{uuid.uuid4().hex[:8]}"
            )

            chunks.append(chunk)
            # Step back by the overlap so an overlapping next chunk is still
            # found, but never before this chunk's start. An unclamped
            # cursor can become negative, and str.find() would then treat
            # it as an offset from the end of the text.
            current_position = max(start_pos, end_pos - self.chunk_overlap)

        return chunks

    @abstractmethod
    def _count_tokens(self, text: str) -> int:
        """Count tokens in text"""
        pass

    def _merge_splits(self, splits: List[str], separator: str) -> List[str]:
        """Merge splits with overlap handling.

        Splits are accumulated until adding the next one would exceed
        ``chunk_size``; the accumulated splits are then joined with
        ``separator`` and a new group starts with the overlap splits.

        Args:
            splits: Text pieces to merge, in order.
            separator: String placed between pieces when joining.

        Returns:
            The merged, non-empty documents.
        """
        if not splits:
            return []

        docs = []
        current_doc = []
        current_length = 0

        for split in splits:
            split_len = self._count_tokens(split)

            if current_length + split_len > self.chunk_size and current_doc:
                # Emit the accumulated group.
                doc = separator.join(current_doc)
                if doc:
                    docs.append(doc)

                # Start the next group with the overlap from this one.
                overlap_splits = self._get_overlap_splits(current_doc, separator)
                current_doc = overlap_splits + [split]
                current_length = sum(self._count_tokens(s) for s in current_doc)
            else:
                current_doc.append(split)
                current_length += split_len

        # Emit whatever is left.
        if current_doc:
            doc = separator.join(current_doc)
            if doc:
                docs.append(doc)

        return docs

    def _get_overlap_splits(self, splits: List[str], separator: str) -> List[str]:
        """Get splits for overlap.

        Walks ``splits`` from the end and keeps trailing pieces while their
        combined token count stays within ``chunk_overlap``.

        Args:
            splits: Pieces of the group that was just emitted.
            separator: Unused here; kept for interface compatibility.

        Returns:
            The kept trailing pieces in their original order.
        """
        if not splits or self.chunk_overlap == 0:
            return []

        tail = []
        overlap_length = 0

        # Collect from the end, then restore the original order once.
        for split in reversed(splits):
            split_len = self._count_tokens(split)
            if overlap_length + split_len > self.chunk_overlap:
                break
            tail.append(split)
            overlap_length += split_len

        tail.reverse()
        return tail
|