ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentui/.prettierrc +15 -0
- agentui/QUICKSTART.md +272 -0
- agentui/README.md +59 -0
- agentui/env.example +16 -0
- agentui/jsconfig.json +14 -0
- agentui/package-lock.json +4242 -0
- agentui/package.json +34 -0
- agentui/scripts/postinstall/apply-patches.mjs +260 -0
- agentui/src/app.css +61 -0
- agentui/src/app.d.ts +13 -0
- agentui/src/app.html +12 -0
- agentui/src/components/LoadingSpinner.svelte +64 -0
- agentui/src/components/ThemeSwitcher.svelte +159 -0
- agentui/src/components/index.js +4 -0
- agentui/src/lib/api/bots.ts +60 -0
- agentui/src/lib/api/chat.ts +22 -0
- agentui/src/lib/api/http.ts +25 -0
- agentui/src/lib/components/BotCard.svelte +33 -0
- agentui/src/lib/components/ChatBubble.svelte +63 -0
- agentui/src/lib/components/Toast.svelte +21 -0
- agentui/src/lib/config.ts +20 -0
- agentui/src/lib/stores/auth.svelte.ts +73 -0
- agentui/src/lib/stores/theme.svelte.js +64 -0
- agentui/src/lib/stores/toast.svelte.ts +31 -0
- agentui/src/lib/utils/conversation.ts +39 -0
- agentui/src/routes/+layout.svelte +20 -0
- agentui/src/routes/+page.svelte +232 -0
- agentui/src/routes/login/+page.svelte +200 -0
- agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
- agentui/src/routes/talk/[agentId]/+page.ts +7 -0
- agentui/static/README.md +1 -0
- agentui/svelte.config.js +11 -0
- agentui/tailwind.config.ts +53 -0
- agentui/tsconfig.json +3 -0
- agentui/vite.config.ts +10 -0
- ai_parrot-0.17.2.dist-info/METADATA +472 -0
- ai_parrot-0.17.2.dist-info/RECORD +535 -0
- ai_parrot-0.17.2.dist-info/WHEEL +6 -0
- ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
- ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
- ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
- crew-builder/.prettierrc +15 -0
- crew-builder/QUICKSTART.md +259 -0
- crew-builder/README.md +113 -0
- crew-builder/env.example +17 -0
- crew-builder/jsconfig.json +14 -0
- crew-builder/package-lock.json +4182 -0
- crew-builder/package.json +37 -0
- crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
- crew-builder/src/app.css +62 -0
- crew-builder/src/app.d.ts +13 -0
- crew-builder/src/app.html +12 -0
- crew-builder/src/components/LoadingSpinner.svelte +64 -0
- crew-builder/src/components/ThemeSwitcher.svelte +149 -0
- crew-builder/src/components/index.js +9 -0
- crew-builder/src/lib/api/bots.ts +60 -0
- crew-builder/src/lib/api/chat.ts +80 -0
- crew-builder/src/lib/api/client.ts +56 -0
- crew-builder/src/lib/api/crew/crew.ts +136 -0
- crew-builder/src/lib/api/index.ts +5 -0
- crew-builder/src/lib/api/o365/auth.ts +65 -0
- crew-builder/src/lib/auth/auth.ts +54 -0
- crew-builder/src/lib/components/AgentNode.svelte +43 -0
- crew-builder/src/lib/components/BotCard.svelte +33 -0
- crew-builder/src/lib/components/ChatBubble.svelte +67 -0
- crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
- crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
- crew-builder/src/lib/components/JsonViewer.svelte +24 -0
- crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
- crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
- crew-builder/src/lib/components/Toast.svelte +67 -0
- crew-builder/src/lib/components/Toolbar.svelte +157 -0
- crew-builder/src/lib/components/index.ts +10 -0
- crew-builder/src/lib/config.ts +8 -0
- crew-builder/src/lib/stores/auth.svelte.ts +228 -0
- crew-builder/src/lib/stores/crewStore.ts +369 -0
- crew-builder/src/lib/stores/theme.svelte.js +145 -0
- crew-builder/src/lib/stores/toast.svelte.ts +69 -0
- crew-builder/src/lib/utils/conversation.ts +39 -0
- crew-builder/src/lib/utils/markdown.ts +122 -0
- crew-builder/src/lib/utils/talkHistory.ts +47 -0
- crew-builder/src/routes/+layout.svelte +20 -0
- crew-builder/src/routes/+page.svelte +539 -0
- crew-builder/src/routes/agents/+page.svelte +247 -0
- crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
- crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
- crew-builder/src/routes/builder/+page.svelte +204 -0
- crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
- crew-builder/src/routes/crew/ask/+page.ts +1 -0
- crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
- crew-builder/src/routes/login/+page.svelte +197 -0
- crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
- crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
- crew-builder/static/README.md +1 -0
- crew-builder/svelte.config.js +11 -0
- crew-builder/tailwind.config.ts +53 -0
- crew-builder/tsconfig.json +3 -0
- crew-builder/vite.config.ts +10 -0
- mcp_servers/calculator_server.py +309 -0
- parrot/__init__.py +27 -0
- parrot/__pycache__/__init__.cpython-310.pyc +0 -0
- parrot/__pycache__/version.cpython-310.pyc +0 -0
- parrot/_version.py +34 -0
- parrot/a2a/__init__.py +48 -0
- parrot/a2a/client.py +658 -0
- parrot/a2a/discovery.py +89 -0
- parrot/a2a/mixin.py +257 -0
- parrot/a2a/models.py +376 -0
- parrot/a2a/server.py +770 -0
- parrot/agents/__init__.py +29 -0
- parrot/bots/__init__.py +12 -0
- parrot/bots/a2a_agent.py +19 -0
- parrot/bots/abstract.py +3139 -0
- parrot/bots/agent.py +1129 -0
- parrot/bots/basic.py +9 -0
- parrot/bots/chatbot.py +669 -0
- parrot/bots/data.py +1618 -0
- parrot/bots/database/__init__.py +5 -0
- parrot/bots/database/abstract.py +3071 -0
- parrot/bots/database/cache.py +286 -0
- parrot/bots/database/models.py +468 -0
- parrot/bots/database/prompts.py +154 -0
- parrot/bots/database/retries.py +98 -0
- parrot/bots/database/router.py +269 -0
- parrot/bots/database/sql.py +41 -0
- parrot/bots/db/__init__.py +6 -0
- parrot/bots/db/abstract.py +556 -0
- parrot/bots/db/bigquery.py +602 -0
- parrot/bots/db/cache.py +85 -0
- parrot/bots/db/documentdb.py +668 -0
- parrot/bots/db/elastic.py +1014 -0
- parrot/bots/db/influx.py +898 -0
- parrot/bots/db/mock.py +96 -0
- parrot/bots/db/multi.py +783 -0
- parrot/bots/db/prompts.py +185 -0
- parrot/bots/db/sql.py +1255 -0
- parrot/bots/db/tools.py +212 -0
- parrot/bots/document.py +680 -0
- parrot/bots/hrbot.py +15 -0
- parrot/bots/kb.py +170 -0
- parrot/bots/mcp.py +36 -0
- parrot/bots/orchestration/README.md +463 -0
- parrot/bots/orchestration/__init__.py +1 -0
- parrot/bots/orchestration/agent.py +155 -0
- parrot/bots/orchestration/crew.py +3330 -0
- parrot/bots/orchestration/fsm.py +1179 -0
- parrot/bots/orchestration/hr.py +434 -0
- parrot/bots/orchestration/storage/__init__.py +4 -0
- parrot/bots/orchestration/storage/memory.py +100 -0
- parrot/bots/orchestration/storage/mixin.py +119 -0
- parrot/bots/orchestration/verify.py +202 -0
- parrot/bots/product.py +204 -0
- parrot/bots/prompts/__init__.py +96 -0
- parrot/bots/prompts/agents.py +155 -0
- parrot/bots/prompts/data.py +216 -0
- parrot/bots/prompts/output_generation.py +8 -0
- parrot/bots/scraper/__init__.py +3 -0
- parrot/bots/scraper/models.py +122 -0
- parrot/bots/scraper/scraper.py +1173 -0
- parrot/bots/scraper/templates.py +115 -0
- parrot/bots/stores/__init__.py +5 -0
- parrot/bots/stores/local.py +172 -0
- parrot/bots/webdev.py +81 -0
- parrot/cli.py +17 -0
- parrot/clients/__init__.py +16 -0
- parrot/clients/base.py +1491 -0
- parrot/clients/claude.py +1191 -0
- parrot/clients/factory.py +129 -0
- parrot/clients/google.py +4567 -0
- parrot/clients/gpt.py +1975 -0
- parrot/clients/grok.py +432 -0
- parrot/clients/groq.py +986 -0
- parrot/clients/hf.py +582 -0
- parrot/clients/models.py +18 -0
- parrot/conf.py +395 -0
- parrot/embeddings/__init__.py +9 -0
- parrot/embeddings/base.py +157 -0
- parrot/embeddings/google.py +98 -0
- parrot/embeddings/huggingface.py +74 -0
- parrot/embeddings/openai.py +84 -0
- parrot/embeddings/processor.py +88 -0
- parrot/exceptions.c +13868 -0
- parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/exceptions.pxd +22 -0
- parrot/exceptions.pxi +15 -0
- parrot/exceptions.pyx +44 -0
- parrot/generators/__init__.py +29 -0
- parrot/generators/base.py +200 -0
- parrot/generators/html.py +293 -0
- parrot/generators/react.py +205 -0
- parrot/generators/streamlit.py +203 -0
- parrot/generators/template.py +105 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/agent.py +861 -0
- parrot/handlers/agents/__init__.py +1 -0
- parrot/handlers/agents/abstract.py +900 -0
- parrot/handlers/bots.py +338 -0
- parrot/handlers/chat.py +915 -0
- parrot/handlers/creation.sql +192 -0
- parrot/handlers/crew/ARCHITECTURE.md +362 -0
- parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
- parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
- parrot/handlers/crew/__init__.py +0 -0
- parrot/handlers/crew/handler.py +801 -0
- parrot/handlers/crew/models.py +229 -0
- parrot/handlers/crew/redis_persistence.py +523 -0
- parrot/handlers/jobs/__init__.py +10 -0
- parrot/handlers/jobs/job.py +384 -0
- parrot/handlers/jobs/mixin.py +627 -0
- parrot/handlers/jobs/models.py +115 -0
- parrot/handlers/jobs/worker.py +31 -0
- parrot/handlers/models.py +596 -0
- parrot/handlers/o365_auth.py +105 -0
- parrot/handlers/stream.py +337 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/aws.py +143 -0
- parrot/interfaces/credentials.py +113 -0
- parrot/interfaces/database.py +27 -0
- parrot/interfaces/google.py +1123 -0
- parrot/interfaces/hierarchy.py +1227 -0
- parrot/interfaces/http.py +651 -0
- parrot/interfaces/images/__init__.py +0 -0
- parrot/interfaces/images/plugins/__init__.py +24 -0
- parrot/interfaces/images/plugins/abstract.py +58 -0
- parrot/interfaces/images/plugins/analisys.py +148 -0
- parrot/interfaces/images/plugins/classify.py +150 -0
- parrot/interfaces/images/plugins/classifybase.py +182 -0
- parrot/interfaces/images/plugins/detect.py +150 -0
- parrot/interfaces/images/plugins/exif.py +1103 -0
- parrot/interfaces/images/plugins/hash.py +52 -0
- parrot/interfaces/images/plugins/vision.py +104 -0
- parrot/interfaces/images/plugins/yolo.py +66 -0
- parrot/interfaces/images/plugins/zerodetect.py +197 -0
- parrot/interfaces/o365.py +978 -0
- parrot/interfaces/onedrive.py +822 -0
- parrot/interfaces/sharepoint.py +1435 -0
- parrot/interfaces/soap.py +257 -0
- parrot/loaders/__init__.py +8 -0
- parrot/loaders/abstract.py +1131 -0
- parrot/loaders/audio.py +199 -0
- parrot/loaders/basepdf.py +53 -0
- parrot/loaders/basevideo.py +1568 -0
- parrot/loaders/csv.py +409 -0
- parrot/loaders/docx.py +116 -0
- parrot/loaders/epubloader.py +316 -0
- parrot/loaders/excel.py +199 -0
- parrot/loaders/factory.py +55 -0
- parrot/loaders/files/__init__.py +0 -0
- parrot/loaders/files/abstract.py +39 -0
- parrot/loaders/files/html.py +26 -0
- parrot/loaders/files/text.py +63 -0
- parrot/loaders/html.py +152 -0
- parrot/loaders/markdown.py +442 -0
- parrot/loaders/pdf.py +373 -0
- parrot/loaders/pdfmark.py +320 -0
- parrot/loaders/pdftables.py +506 -0
- parrot/loaders/ppt.py +476 -0
- parrot/loaders/qa.py +63 -0
- parrot/loaders/splitters/__init__.py +10 -0
- parrot/loaders/splitters/base.py +138 -0
- parrot/loaders/splitters/md.py +228 -0
- parrot/loaders/splitters/token.py +143 -0
- parrot/loaders/txt.py +26 -0
- parrot/loaders/video.py +89 -0
- parrot/loaders/videolocal.py +218 -0
- parrot/loaders/videounderstanding.py +377 -0
- parrot/loaders/vimeo.py +167 -0
- parrot/loaders/web.py +599 -0
- parrot/loaders/youtube.py +504 -0
- parrot/manager/__init__.py +5 -0
- parrot/manager/manager.py +1030 -0
- parrot/mcp/__init__.py +28 -0
- parrot/mcp/adapter.py +105 -0
- parrot/mcp/cli.py +174 -0
- parrot/mcp/client.py +119 -0
- parrot/mcp/config.py +75 -0
- parrot/mcp/integration.py +842 -0
- parrot/mcp/oauth.py +933 -0
- parrot/mcp/server.py +225 -0
- parrot/mcp/transports/__init__.py +3 -0
- parrot/mcp/transports/base.py +279 -0
- parrot/mcp/transports/grpc_session.py +163 -0
- parrot/mcp/transports/http.py +312 -0
- parrot/mcp/transports/mcp.proto +108 -0
- parrot/mcp/transports/quic.py +1082 -0
- parrot/mcp/transports/sse.py +330 -0
- parrot/mcp/transports/stdio.py +309 -0
- parrot/mcp/transports/unix.py +395 -0
- parrot/mcp/transports/websocket.py +547 -0
- parrot/memory/__init__.py +16 -0
- parrot/memory/abstract.py +209 -0
- parrot/memory/agent.py +32 -0
- parrot/memory/cache.py +175 -0
- parrot/memory/core.py +555 -0
- parrot/memory/file.py +153 -0
- parrot/memory/mem.py +131 -0
- parrot/memory/redis.py +613 -0
- parrot/models/__init__.py +46 -0
- parrot/models/basic.py +118 -0
- parrot/models/compliance.py +208 -0
- parrot/models/crew.py +395 -0
- parrot/models/detections.py +654 -0
- parrot/models/generation.py +85 -0
- parrot/models/google.py +223 -0
- parrot/models/groq.py +23 -0
- parrot/models/openai.py +30 -0
- parrot/models/outputs.py +285 -0
- parrot/models/responses.py +938 -0
- parrot/notifications/__init__.py +743 -0
- parrot/openapi/__init__.py +3 -0
- parrot/openapi/components.yaml +641 -0
- parrot/openapi/config.py +322 -0
- parrot/outputs/__init__.py +32 -0
- parrot/outputs/formats/__init__.py +108 -0
- parrot/outputs/formats/altair.py +359 -0
- parrot/outputs/formats/application.py +122 -0
- parrot/outputs/formats/base.py +351 -0
- parrot/outputs/formats/bokeh.py +356 -0
- parrot/outputs/formats/card.py +424 -0
- parrot/outputs/formats/chart.py +436 -0
- parrot/outputs/formats/d3.py +255 -0
- parrot/outputs/formats/echarts.py +310 -0
- parrot/outputs/formats/generators/__init__.py +0 -0
- parrot/outputs/formats/generators/abstract.py +61 -0
- parrot/outputs/formats/generators/panel.py +145 -0
- parrot/outputs/formats/generators/streamlit.py +86 -0
- parrot/outputs/formats/generators/terminal.py +63 -0
- parrot/outputs/formats/holoviews.py +310 -0
- parrot/outputs/formats/html.py +147 -0
- parrot/outputs/formats/jinja2.py +46 -0
- parrot/outputs/formats/json.py +87 -0
- parrot/outputs/formats/map.py +933 -0
- parrot/outputs/formats/markdown.py +172 -0
- parrot/outputs/formats/matplotlib.py +237 -0
- parrot/outputs/formats/mixins/__init__.py +0 -0
- parrot/outputs/formats/mixins/emaps.py +855 -0
- parrot/outputs/formats/plotly.py +341 -0
- parrot/outputs/formats/seaborn.py +310 -0
- parrot/outputs/formats/table.py +397 -0
- parrot/outputs/formats/template_report.py +138 -0
- parrot/outputs/formats/yaml.py +125 -0
- parrot/outputs/formatter.py +152 -0
- parrot/outputs/templates/__init__.py +95 -0
- parrot/pipelines/__init__.py +0 -0
- parrot/pipelines/abstract.py +210 -0
- parrot/pipelines/detector.py +124 -0
- parrot/pipelines/models.py +90 -0
- parrot/pipelines/planogram.py +3002 -0
- parrot/pipelines/table.sql +97 -0
- parrot/plugins/__init__.py +106 -0
- parrot/plugins/importer.py +80 -0
- parrot/py.typed +0 -0
- parrot/registry/__init__.py +18 -0
- parrot/registry/registry.py +594 -0
- parrot/scheduler/__init__.py +1189 -0
- parrot/scheduler/models.py +60 -0
- parrot/security/__init__.py +16 -0
- parrot/security/prompt_injection.py +268 -0
- parrot/security/security_events.sql +25 -0
- parrot/services/__init__.py +1 -0
- parrot/services/mcp/__init__.py +8 -0
- parrot/services/mcp/config.py +13 -0
- parrot/services/mcp/server.py +295 -0
- parrot/services/o365_remote_auth.py +235 -0
- parrot/stores/__init__.py +7 -0
- parrot/stores/abstract.py +352 -0
- parrot/stores/arango.py +1090 -0
- parrot/stores/bigquery.py +1377 -0
- parrot/stores/cache.py +106 -0
- parrot/stores/empty.py +10 -0
- parrot/stores/faiss_store.py +1157 -0
- parrot/stores/kb/__init__.py +9 -0
- parrot/stores/kb/abstract.py +68 -0
- parrot/stores/kb/cache.py +165 -0
- parrot/stores/kb/doc.py +325 -0
- parrot/stores/kb/hierarchy.py +346 -0
- parrot/stores/kb/local.py +457 -0
- parrot/stores/kb/prompt.py +28 -0
- parrot/stores/kb/redis.py +659 -0
- parrot/stores/kb/store.py +115 -0
- parrot/stores/kb/user.py +374 -0
- parrot/stores/models.py +59 -0
- parrot/stores/pgvector.py +3 -0
- parrot/stores/postgres.py +2853 -0
- parrot/stores/utils/__init__.py +0 -0
- parrot/stores/utils/chunking.py +197 -0
- parrot/telemetry/__init__.py +3 -0
- parrot/telemetry/mixin.py +111 -0
- parrot/template/__init__.py +3 -0
- parrot/template/engine.py +259 -0
- parrot/tools/__init__.py +23 -0
- parrot/tools/abstract.py +644 -0
- parrot/tools/agent.py +363 -0
- parrot/tools/arangodbsearch.py +537 -0
- parrot/tools/arxiv_tool.py +188 -0
- parrot/tools/calculator/__init__.py +3 -0
- parrot/tools/calculator/operations/__init__.py +38 -0
- parrot/tools/calculator/operations/calculus.py +80 -0
- parrot/tools/calculator/operations/statistics.py +76 -0
- parrot/tools/calculator/tool.py +150 -0
- parrot/tools/cloudwatch.py +988 -0
- parrot/tools/codeinterpreter/__init__.py +127 -0
- parrot/tools/codeinterpreter/executor.py +371 -0
- parrot/tools/codeinterpreter/internals.py +473 -0
- parrot/tools/codeinterpreter/models.py +643 -0
- parrot/tools/codeinterpreter/prompts.py +224 -0
- parrot/tools/codeinterpreter/tool.py +664 -0
- parrot/tools/company_info/__init__.py +6 -0
- parrot/tools/company_info/tool.py +1138 -0
- parrot/tools/correlationanalysis.py +437 -0
- parrot/tools/database/abstract.py +286 -0
- parrot/tools/database/bq.py +115 -0
- parrot/tools/database/cache.py +284 -0
- parrot/tools/database/models.py +95 -0
- parrot/tools/database/pg.py +343 -0
- parrot/tools/databasequery.py +1159 -0
- parrot/tools/db.py +1800 -0
- parrot/tools/ddgo.py +370 -0
- parrot/tools/decorators.py +271 -0
- parrot/tools/dftohtml.py +282 -0
- parrot/tools/document.py +549 -0
- parrot/tools/ecs.py +819 -0
- parrot/tools/edareport.py +368 -0
- parrot/tools/elasticsearch.py +1049 -0
- parrot/tools/employees.py +462 -0
- parrot/tools/epson/__init__.py +96 -0
- parrot/tools/excel.py +683 -0
- parrot/tools/file/__init__.py +13 -0
- parrot/tools/file/abstract.py +76 -0
- parrot/tools/file/gcs.py +378 -0
- parrot/tools/file/local.py +284 -0
- parrot/tools/file/s3.py +511 -0
- parrot/tools/file/tmp.py +309 -0
- parrot/tools/file/tool.py +501 -0
- parrot/tools/file_reader.py +129 -0
- parrot/tools/flowtask/__init__.py +19 -0
- parrot/tools/flowtask/tool.py +761 -0
- parrot/tools/gittoolkit.py +508 -0
- parrot/tools/google/__init__.py +18 -0
- parrot/tools/google/base.py +169 -0
- parrot/tools/google/tools.py +1251 -0
- parrot/tools/googlelocation.py +5 -0
- parrot/tools/googleroutes.py +5 -0
- parrot/tools/googlesearch.py +5 -0
- parrot/tools/googlesitesearch.py +5 -0
- parrot/tools/googlevoice.py +2 -0
- parrot/tools/gvoice.py +695 -0
- parrot/tools/ibisworld/README.md +225 -0
- parrot/tools/ibisworld/__init__.py +11 -0
- parrot/tools/ibisworld/tool.py +366 -0
- parrot/tools/jiratoolkit.py +1718 -0
- parrot/tools/manager.py +1098 -0
- parrot/tools/math.py +152 -0
- parrot/tools/metadata.py +476 -0
- parrot/tools/msteams.py +1621 -0
- parrot/tools/msword.py +635 -0
- parrot/tools/multidb.py +580 -0
- parrot/tools/multistoresearch.py +369 -0
- parrot/tools/networkninja.py +167 -0
- parrot/tools/nextstop/__init__.py +4 -0
- parrot/tools/nextstop/base.py +286 -0
- parrot/tools/nextstop/employee.py +733 -0
- parrot/tools/nextstop/store.py +462 -0
- parrot/tools/notification.py +435 -0
- parrot/tools/o365/__init__.py +42 -0
- parrot/tools/o365/base.py +295 -0
- parrot/tools/o365/bundle.py +522 -0
- parrot/tools/o365/events.py +554 -0
- parrot/tools/o365/mail.py +992 -0
- parrot/tools/o365/onedrive.py +497 -0
- parrot/tools/o365/sharepoint.py +641 -0
- parrot/tools/openapi_toolkit.py +904 -0
- parrot/tools/openweather.py +527 -0
- parrot/tools/pdfprint.py +1001 -0
- parrot/tools/powerbi.py +518 -0
- parrot/tools/powerpoint.py +1113 -0
- parrot/tools/pricestool.py +146 -0
- parrot/tools/products/__init__.py +246 -0
- parrot/tools/prophet_tool.py +171 -0
- parrot/tools/pythonpandas.py +630 -0
- parrot/tools/pythonrepl.py +910 -0
- parrot/tools/qsource.py +436 -0
- parrot/tools/querytoolkit.py +395 -0
- parrot/tools/quickeda.py +827 -0
- parrot/tools/resttool.py +553 -0
- parrot/tools/retail/__init__.py +0 -0
- parrot/tools/retail/bby.py +528 -0
- parrot/tools/sandboxtool.py +703 -0
- parrot/tools/sassie/__init__.py +352 -0
- parrot/tools/scraping/__init__.py +7 -0
- parrot/tools/scraping/docs/select.md +466 -0
- parrot/tools/scraping/documentation.md +1278 -0
- parrot/tools/scraping/driver.py +436 -0
- parrot/tools/scraping/models.py +576 -0
- parrot/tools/scraping/options.py +85 -0
- parrot/tools/scraping/orchestrator.py +517 -0
- parrot/tools/scraping/readme.md +740 -0
- parrot/tools/scraping/tool.py +3115 -0
- parrot/tools/seasonaldetection.py +642 -0
- parrot/tools/shell_tool/__init__.py +5 -0
- parrot/tools/shell_tool/actions.py +408 -0
- parrot/tools/shell_tool/engine.py +155 -0
- parrot/tools/shell_tool/models.py +322 -0
- parrot/tools/shell_tool/tool.py +442 -0
- parrot/tools/site_search.py +214 -0
- parrot/tools/textfile.py +418 -0
- parrot/tools/think.py +378 -0
- parrot/tools/toolkit.py +298 -0
- parrot/tools/webapp_tool.py +187 -0
- parrot/tools/whatif.py +1279 -0
- parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
- parrot/tools/workday/__init__.py +6 -0
- parrot/tools/workday/models.py +1389 -0
- parrot/tools/workday/tool.py +1293 -0
- parrot/tools/yfinance_tool.py +306 -0
- parrot/tools/zipcode.py +217 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/helpers.py +73 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.c +12078 -0
- parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/parsers/toml.pyx +21 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpp +20936 -0
- parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/types.pyx +213 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- parrot/yaml-rs/Cargo.lock +350 -0
- parrot/yaml-rs/Cargo.toml +19 -0
- parrot/yaml-rs/pyproject.toml +19 -0
- parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
- parrot/yaml-rs/src/lib.rs +222 -0
- requirements/docker-compose.yml +24 -0
- requirements/requirements-dev.txt +21 -0
parrot/loaders/csv.py
ADDED
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
from typing import List, Union, Optional, Dict, Any
|
|
2
|
+
from collections.abc import Callable
|
|
3
|
+
from pathlib import PurePath
|
|
4
|
+
import json
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from ..stores.models import Document
|
|
7
|
+
from .abstract import AbstractLoader
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class CSVLoader(AbstractLoader):
|
|
11
|
+
"""
|
|
12
|
+
CSV Loader that creates one JSON Document per row using pandas.
|
|
13
|
+
|
|
14
|
+
This loader reads CSV files with pandas and converts each row into a separate
|
|
15
|
+
Document with JSON content. Perfect for creating searchable knowledge bases
|
|
16
|
+
from tabular data where each row represents an entity or record.
|
|
17
|
+
|
|
18
|
+
Features:
|
|
19
|
+
- One document per CSV row
|
|
20
|
+
- JSON serialization of row data
|
|
21
|
+
- Configurable pandas read options
|
|
22
|
+
- Row indexing and metadata
|
|
23
|
+
- Header preservation
|
|
24
|
+
- Data type inference
|
|
25
|
+
- Error handling for malformed data
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
extensions: List[str] = ['.csv', '.tsv']
|
|
29
|
+
|
|
30
|
+
def __init__(
    self,
    source: Optional[Union[pd.DataFrame, PurePath, List[PurePath]]] = None,
    *,
    tokenizer: Union[str, Callable] = None,
    text_splitter: Union[str, Callable] = None,
    source_type: str = 'file',

    # CSV reading options (pandas parameters)
    separator: Optional[str] = None,  # Auto-detect if None
    encoding: str = 'utf-8',
    header: Union[int, List[int], str] = 0,  # Use first row as header
    index_col: Union[int, str, List, bool] = False,
    usecols: Optional[List] = None,
    dtype: Optional[Dict] = None,

    # Data processing options
    skip_empty_rows: bool = True,
    skip_na_rows: bool = False,  # Keep rows with some NaN values
    fill_na_value: Optional[str] = None,  # Fill NaN with this value

    # JSON formatting
    json_ensure_ascii: bool = False,
    json_indent: Optional[int] = 2,

    # Row filtering
    max_rows: Optional[int] = None,
    skip_rows: Optional[int] = None,

    # Content options
    include_row_index: bool = True,
    include_headers_in_content: bool = False,
    row_prefix: str = "Row",

    **kwargs
):
    """Configure the loader; no file is read at construction time.

    Args:
        source: DataFrame, path, or list of paths handed to the base loader.
        tokenizer: Forwarded unchanged to the base loader.
        text_splitter: Forwarded unchanged to the base loader.
        source_type: Forwarded unchanged to the base loader.
        separator: Column delimiter; when None it is auto-detected per file.
        encoding: Text encoding used to sample and to read the file.
        header: Passed to pandas as ``header``.
        index_col: Passed to pandas as ``index_col``.
        usecols: Passed to pandas as ``usecols`` when not None.
        dtype: Passed to pandas as ``dtype`` when not None.
        skip_empty_rows: Skip rows that are empty once cleaned.
        skip_na_rows: Skip rows in which every value is missing.
        fill_na_value: Replacement for missing values; None keeps them
            as JSON ``null``.
        json_ensure_ascii: ``ensure_ascii`` argument for ``json.dumps``.
        json_indent: ``indent`` argument for ``json.dumps``.
        max_rows: Passed to pandas as ``nrows`` when not None.
        skip_rows: Passed to pandas as ``skiprows`` when not None.
        include_row_index: Add ``row_index`` to each JSON document.
        include_headers_in_content: Add ``headers`` and ``row_name`` to
            each JSON document.
        row_prefix: Prefix used to build ``row_name``.
        **kwargs: Extra options forwarded to the base loader.
    """
    super().__init__(
        source,
        tokenizer=tokenizer,
        text_splitter=text_splitter,
        source_type=source_type,
        **kwargs
    )

    # How the file is parsed by pandas.
    self.separator = separator
    self.encoding = encoding
    self.header = header
    self.index_col = index_col
    self.usecols = usecols
    self.dtype = dtype

    # Which rows are read from the file.
    self.max_rows = max_rows
    self.skip_rows = skip_rows

    # How rows are filtered and missing values are treated.
    self.skip_empty_rows = skip_empty_rows
    self.skip_na_rows = skip_na_rows
    self.fill_na_value = fill_na_value

    # How each row is rendered as a JSON document.
    self.json_ensure_ascii = json_ensure_ascii
    self.json_indent = json_indent
    self.include_row_index = include_row_index
    self.include_headers_in_content = include_headers_in_content
    self.row_prefix = row_prefix
|
|
99
|
+
|
|
100
|
+
def _detect_separator(self, file_path: str, sample_size: int = 1024) -> str:
|
|
101
|
+
"""Auto-detect CSV separator by sampling the file."""
|
|
102
|
+
try:
|
|
103
|
+
with open(file_path, 'r', encoding=self.encoding) as f:
|
|
104
|
+
sample = f.read(sample_size)
|
|
105
|
+
|
|
106
|
+
# Try common separators
|
|
107
|
+
separators = [',', ';', '\t', '|']
|
|
108
|
+
separator_counts = {}
|
|
109
|
+
|
|
110
|
+
for sep in separators:
|
|
111
|
+
# Count occurrences in first few lines
|
|
112
|
+
lines = sample.split('\n')[:3]
|
|
113
|
+
count = sum(line.count(sep) for line in lines)
|
|
114
|
+
if count > 0:
|
|
115
|
+
separator_counts[sep] = count
|
|
116
|
+
|
|
117
|
+
if separator_counts:
|
|
118
|
+
# Return separator with highest count
|
|
119
|
+
return max(separator_counts.keys(), key=separator_counts.get)
|
|
120
|
+
else:
|
|
121
|
+
return ',' # Default fallback
|
|
122
|
+
|
|
123
|
+
except Exception as e:
|
|
124
|
+
self.logger.warning(f"Could not auto-detect separator: {e}, using comma")
|
|
125
|
+
return ','
|
|
126
|
+
|
|
127
|
+
def _prepare_pandas_kwargs(self, path: str) -> Dict[str, Any]:
|
|
128
|
+
"""Prepare pandas read_csv arguments."""
|
|
129
|
+
kwargs = {
|
|
130
|
+
'encoding': self.encoding,
|
|
131
|
+
'header': self.header,
|
|
132
|
+
'index_col': self.index_col,
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
# Auto-detect separator if not specified
|
|
136
|
+
if self.separator is None:
|
|
137
|
+
kwargs['sep'] = self._detect_separator(path)
|
|
138
|
+
else:
|
|
139
|
+
kwargs['sep'] = self.separator
|
|
140
|
+
|
|
141
|
+
# Add optional parameters
|
|
142
|
+
if self.usecols is not None:
|
|
143
|
+
kwargs['usecols'] = self.usecols
|
|
144
|
+
if self.dtype is not None:
|
|
145
|
+
kwargs['dtype'] = self.dtype
|
|
146
|
+
if self.max_rows is not None:
|
|
147
|
+
kwargs['nrows'] = self.max_rows
|
|
148
|
+
if self.skip_rows is not None:
|
|
149
|
+
kwargs['skiprows'] = self.skip_rows
|
|
150
|
+
|
|
151
|
+
# Handle empty rows
|
|
152
|
+
if self.skip_empty_rows:
|
|
153
|
+
kwargs['skip_blank_lines'] = True
|
|
154
|
+
|
|
155
|
+
return kwargs
|
|
156
|
+
|
|
157
|
+
def _clean_row_data(self, row_dict: Dict[str, Any]) -> Dict[str, Any]:
|
|
158
|
+
"""Clean and process row data."""
|
|
159
|
+
cleaned = {}
|
|
160
|
+
|
|
161
|
+
for key, value in row_dict.items():
|
|
162
|
+
# Clean column names (keys)
|
|
163
|
+
clean_key = str(key).strip()
|
|
164
|
+
|
|
165
|
+
# Handle NaN values
|
|
166
|
+
if pd.isna(value):
|
|
167
|
+
if self.fill_na_value is not None:
|
|
168
|
+
clean_value = self.fill_na_value
|
|
169
|
+
else:
|
|
170
|
+
clean_value = None
|
|
171
|
+
else:
|
|
172
|
+
# Convert to appropriate type for JSON serialization
|
|
173
|
+
if isinstance(value, (pd.Timestamp, pd.Period)):
|
|
174
|
+
clean_value = str(value)
|
|
175
|
+
elif isinstance(value, (int, float, str, bool)):
|
|
176
|
+
clean_value = value
|
|
177
|
+
else:
|
|
178
|
+
clean_value = str(value)
|
|
179
|
+
|
|
180
|
+
cleaned[clean_key] = clean_value
|
|
181
|
+
|
|
182
|
+
return cleaned
|
|
183
|
+
|
|
184
|
+
def _should_skip_row(self, row_dict: Dict[str, Any]) -> bool:
|
|
185
|
+
"""Determine if a row should be skipped."""
|
|
186
|
+
# Skip if all values are NaN and skip_na_rows is True
|
|
187
|
+
if self.skip_na_rows:
|
|
188
|
+
non_null_values = [v for v in row_dict.values() if not pd.isna(v) and v is not None]
|
|
189
|
+
if not non_null_values:
|
|
190
|
+
return True
|
|
191
|
+
|
|
192
|
+
# Skip if row is effectively empty after cleaning
|
|
193
|
+
if self.skip_empty_rows:
|
|
194
|
+
cleaned = self._clean_row_data(row_dict)
|
|
195
|
+
non_empty_values = [v for v in cleaned.values() if v is not None and str(v).strip()]
|
|
196
|
+
if not non_empty_values:
|
|
197
|
+
return True
|
|
198
|
+
|
|
199
|
+
return False
|
|
200
|
+
|
|
201
|
+
def _format_row_as_json(self, row_dict: Dict[str, Any], row_index: int, headers: List[str]) -> str:
|
|
202
|
+
"""Convert row data to JSON string."""
|
|
203
|
+
cleaned_data = self._clean_row_data(row_dict)
|
|
204
|
+
|
|
205
|
+
# Create structured JSON document
|
|
206
|
+
json_doc = {
|
|
207
|
+
"data": cleaned_data
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
# Add row metadata if requested
|
|
211
|
+
if self.include_row_index:
|
|
212
|
+
json_doc["row_index"] = row_index
|
|
213
|
+
|
|
214
|
+
# Add headers information if requested
|
|
215
|
+
if self.include_headers_in_content:
|
|
216
|
+
json_doc["headers"] = headers
|
|
217
|
+
json_doc["row_name"] = f"{self.row_prefix} {row_index + 1}"
|
|
218
|
+
|
|
219
|
+
return json.dumps(
|
|
220
|
+
json_doc,
|
|
221
|
+
ensure_ascii=self.json_ensure_ascii,
|
|
222
|
+
indent=self.json_indent,
|
|
223
|
+
default=str # Handle any remaining non-serializable objects
|
|
224
|
+
)
|
|
225
|
+
|
|
226
|
+
async def _load(self, path: Union[str, PurePath, List[PurePath], pd.DataFrame], **kwargs) -> List[Document]:
    """
    Load CSV data and create one Document per row.

    Args:
        path: Path to the CSV file. An already-loaded pandas DataFrame is
            also accepted and is used directly instead of being read from
            disk.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        List of Document objects, one per row that was not skipped. The list
        is empty when the data has no rows.

    Raises:
        Exception: any error raised while reading or processing the data is
            logged and re-raised unchanged.
    """
    from_dataframe = isinstance(path, pd.DataFrame)
    if from_dataframe:
        # Short label for log messages; interpolating the frame itself would
        # dump its whole repr into the log.
        source_label = f"DataFrame with {len(path)} rows and {len(path.columns)} columns"
        self.logger.info(f"Loading CSV From {source_label}")
    else:
        source_label = str(path)
        if isinstance(path, PurePath):
            self.logger.info(f"Loading CSV From: {path}")
    docs = []

    try:
        if from_dataframe:
            # Nothing is parsed from disk, so there are no reader options to
            # prepare (and the frame must not be stringified into a "path").
            pandas_kwargs = {}
            df = path
        else:
            # Prepare pandas arguments and read the CSV
            pandas_kwargs = self._prepare_pandas_kwargs(str(path))
            df = pd.read_csv(str(path), **pandas_kwargs)

        if df.empty:
            self.logger.warning(f"CSV file {source_label} is empty or has no valid data")
            return docs

        self.logger.info(f"Loaded CSV with {len(df)} rows and {len(df.columns)} columns")

        # Get column headers
        headers = df.columns.tolist()

        # File-level metadata; the same dict object is attached to every row.
        csv_metadata = {
            "total_rows": len(df),
            "total_columns": len(df.columns),
            "column_headers": headers,
            "data_types": df.dtypes.astype(str).to_dict(),
            "separator_used": pandas_kwargs.get('sep', ','),
            "encoding_used": self.encoding,
        }

        # Record which columns are numeric, if any
        numeric_columns = df.select_dtypes(include=['number']).columns.tolist()
        if numeric_columns:
            csv_metadata["numeric_columns"] = numeric_columns
        csv_metadata["has_numeric_data"] = bool(numeric_columns)

        # Process each row
        processed_rows = 0
        skipped_rows = 0

        # row_index is the positional index, independent of the frame's own index.
        for row_index, (_, row) in enumerate(df.iterrows()):
            row_dict = row.to_dict()

            # Skip row if it meets skip criteria
            if self._should_skip_row(row_dict):
                skipped_rows += 1
                continue

            # Convert row to JSON
            json_content = self._format_row_as_json(row_dict, row_index, headers)

            # Count null cells once; the non-null count follows from it.
            empty_columns = sum(1 for v in row_dict.values() if pd.isna(v))
            row_metadata = {
                "row_index": row_index,
                "row_number": row_index + 1,  # Human-friendly numbering
                "column_count": len(row_dict) - empty_columns,
                "empty_columns": empty_columns,
            }

            # Create document metadata
            metadata = self.create_metadata(
                path=path,
                doctype="csv_row",
                source_type="csv",
                doc_metadata={
                    **row_metadata,
                    "csv_info": csv_metadata,
                    "content_type": "application/json",
                    "processing_options": {
                        "skip_empty_rows": self.skip_empty_rows,
                        "skip_na_rows": self.skip_na_rows,
                        "fill_na_value": self.fill_na_value,
                    }
                },
            )

            # Create document
            docs.append(
                Document(
                    page_content=json_content,
                    metadata=metadata
                )
            )
            processed_rows += 1

        self.logger.info(f"Processed {processed_rows} rows, skipped {skipped_rows} rows")

    except Exception as e:
        self.logger.error(f"Failed to load CSV file {source_label}: {e}")
        raise

    return docs
|
|
336
|
+
|
|
337
|
+
def get_csv_info(self, path: Union[str, PurePath]) -> Dict[str, Any]:
    """
    Get information about a CSV file without loading all data.

    Args:
        path: Path to the CSV file.

    Returns:
        Dict with ``total_rows``, ``total_columns``, ``column_headers``,
        ``data_types``, ``separator_detected``, ``sample_data`` (up to three
        records) and ``numeric_columns``. Column information is derived from
        a sample of at most five rows. On any failure the error is logged
        and ``{"error": <message>}`` is returned instead of raising.
    """
    try:
        pandas_kwargs = self._prepare_pandas_kwargs(str(path))

        # Read just the first few rows to get info. The cap is merged into
        # the prepared options rather than passed separately: if they already
        # carry "nrows", passing it twice raises TypeError (duplicate keyword).
        sample_kwargs = dict(pandas_kwargs)
        configured_nrows = sample_kwargs.get("nrows")
        sample_kwargs["nrows"] = 5 if configured_nrows is None else min(5, configured_nrows)
        sample_df = pd.read_csv(str(path), **sample_kwargs)

        # Count physical lines instead of parsing the whole file.
        # NOTE: quoted fields that contain line breaks are counted as extra rows.
        with open(str(path), 'r', encoding=self.encoding) as f:
            line_count = sum(1 for _ in f)
        # One line is the header when header == 0; never report a negative count.
        total_rows = max(0, line_count - (1 if self.header == 0 else 0))

        return {
            "total_rows": total_rows,
            "total_columns": len(sample_df.columns),
            "column_headers": sample_df.columns.tolist(),
            "data_types": sample_df.dtypes.astype(str).to_dict(),
            "separator_detected": pandas_kwargs.get('sep', ','),
            "sample_data": sample_df.head(3).to_dict('records'),
            "numeric_columns": sample_df.select_dtypes(include=['number']).columns.tolist(),
        }

    except Exception as e:
        self.logger.error(f"Could not analyze CSV file {path}: {e}")
        return {"error": str(e)}
|
|
362
|
+
|
|
363
|
+
def estimate_documents_count(self, path: Union[str, PurePath]) -> int:
    """
    Estimate how many documents will be created from a CSV file.

    Args:
        path: Path to the CSV file.

    Returns:
        Estimated number of row documents. Returns 0 when the file cannot be
        analyzed or the estimate cannot be computed.
    """
    try:
        info = self.get_csv_info(path)
        if "error" in info:
            return 0

        total_rows = info["total_rows"]

        # Apply filtering estimates. Leading rows are dropped first and the
        # row cap is applied to what remains.
        # NOTE(review): assumes skip_rows / max_rows are forwarded to pandas
        # as skiprows / nrows, where skipping happens before the nrows limit
        # -- confirm against _prepare_pandas_kwargs.
        if self.skip_rows is not None:
            total_rows = max(0, total_rows - self.skip_rows)

        if self.max_rows is not None:
            total_rows = min(total_rows, self.max_rows)

        # Rough estimate for empty row filtering (assume 5% empty rows)
        if self.skip_empty_rows:
            total_rows = int(total_rows * 0.95)

        return total_rows

    except Exception as e:
        # Best-effort estimate: report the problem instead of hiding it,
        # but keep returning 0 so callers are unaffected.
        self.logger.warning(f"Could not estimate document count for {path}: {e}")
        return 0
|
|
387
|
+
|
|
388
|
+
def get_configuration_summary(self) -> Dict[str, Any]:
    """
    Report the loader's current settings.

    Returns:
        Dict with three sections: ``csv_options`` (how the file is read),
        ``processing`` (row filtering and NA handling) and ``output``
        (how each row is rendered).
    """
    # A falsy separator is reported as "auto-detect".
    reading = {
        "separator": self.separator or "auto-detect",
        "encoding": self.encoding,
        "header": self.header,
        "max_rows": self.max_rows,
        "skip_rows": self.skip_rows,
    }
    filtering = {
        "skip_empty_rows": self.skip_empty_rows,
        "skip_na_rows": self.skip_na_rows,
        "fill_na_value": self.fill_na_value,
    }
    rendering = {
        "json_indent": self.json_indent,
        "include_row_index": self.include_row_index,
        "include_headers_in_content": self.include_headers_in_content,
        "row_prefix": self.row_prefix,
    }
    return {"csv_options": reading, "processing": filtering, "output": rendering}
|
parrot/loaders/docx.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
from pathlib import PurePath
|
|
3
|
+
import re
|
|
4
|
+
import mammoth
|
|
5
|
+
import docx
|
|
6
|
+
from markdownify import markdownify as md
|
|
7
|
+
from ..stores.models import Document
|
|
8
|
+
from .abstract import AbstractLoader
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class MSWordLoader(AbstractLoader):
    """
    Load Microsoft Docx as Parrot Documents.

    The file is converted to Markdown text, split with
    ``self.markdown_splitter``, and every chunk becomes one Document that is
    prefixed with a short header describing the source file.
    """
    # NOTE(review): '.doc' is registered here, but every read goes through
    # docx.Document -- confirm legacy binary .doc files are really supported.
    extensions: List[str] = ['.doc', '.docx']

    @staticmethod
    def _table_to_markdown(table) -> str:
        """Render one table as a single contiguous block of Markdown rows."""
        rows = []
        for row in table.rows:
            # Flatten whitespace (a line break inside a cell would end the
            # Markdown row) and escape pipes so they are not read as
            # column separators.
            cells = [
                " ".join(cell.text.split()).replace("|", "\\|")
                for cell in row.cells
            ]
            rows.append("| " + " | ".join(cells) + " |")
        # Add header separator if more than 1 row
        if len(rows) > 1:
            rows.insert(1, "| " + " | ".join(['---'] * len(table.columns)) + " |")
        return "\n".join(rows)

    def docx_to_markdown(self, docx_path):
        """Convert a docx file to Markdown text.

        Headings become ``#`` headings, paragraphs whose style name starts
        with "list" become ``-`` items, other non-empty paragraphs are kept
        as plain text. Tables are appended after all paragraphs, so their
        original position in the document is not preserved.

        Args:
            docx_path: Source document, passed as-is to ``docx.Document``.

        Returns:
            str: The Markdown text after the markdownify post-processing pass.
        """
        doc = docx.Document(docx_path)
        md_lines = []

        # Parse paragraphs and basic styles
        for para in doc.paragraphs:
            text = para.text.strip()
            if not text:
                continue
            style = (getattr(para.style, "name", None) or "").lower()
            if "heading" in style:
                # The heading level is taken from the digits in the style
                # name (default 1) and clamped to 1-6, the range Markdown
                # headings support.
                digits = re.sub(r"[^\d]", "", style) or "1"
                level = min(max(int(digits), 1), 6)
                md_lines.append(f"{'#' * level} {text}")
            elif style.startswith("list"):
                md_lines.append(f"- {text}")
            else:
                md_lines.append(text)

        # Parse tables. Each table is added as ONE entry: the entries are
        # joined with blank lines below, and a blank line between rows would
        # break the Markdown table.
        for table in doc.tables:
            table_md = self._table_to_markdown(table)
            if table_md:
                md_lines.append(table_md)

        # Join blocks with blank lines
        markdown_text = "\n\n".join(md_lines)
        # Optionally, use markdownify to post-process (if any HTML remains)
        # NOTE(review): the input here is Markdown/plain text rather than
        # HTML -- confirm this pass keeps line breaks intact for the
        # markdownify version in use.
        return md(markdown_text)

    def extract_text(self, path):
        """Extract text from a docx file.

        Args:
            path (Path): The source of the data.

        Returns:
            str: The text of every paragraph, joined with newlines.
        """
        doc = docx.Document(str(path))
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)

    async def _load(self, path: PurePath, **kwargs) -> List[Document]:
        """Load data from a source and return it as a Document.

        Args:
            path (Path): The source of the data.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            List[Document]: One Document per Markdown chunk of the file.
        """
        self.logger.info(
            f"Loading Word file: {path}"
        )
        docs = []
        word_doc = docx.Document(str(path))
        properties = word_doc.core_properties
        md_text = self.docx_to_markdown(path)
        document_meta = {
            "author": properties.author,
            "version": properties.version,
            "title": properties.title,
        }
        metadata = self.create_metadata(
            path=path,
            doctype=self.doctype,
            source_type=self._source_type,
            doc_metadata=document_meta
        )
        # Create document-level context
        document_context = f"File Name: {path.name}\n"
        document_context += f"Document Type: {self.doctype}\n"
        document_context += f"Source Type: {self._source_type}\n"
        document_context += "======\n"
        # splitting the content:
        for chunk in self.markdown_splitter.split_text(md_text):
            # Each chunk gets its own shallow copy of the shared metadata.
            chunk_metadata = {
                **metadata
            }
            docs.append(
                self.create_document(
                    content=document_context + chunk,
                    path=path,
                    metadata=chunk_metadata
                )
            )
        return docs
|