ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentui/.prettierrc +15 -0
- agentui/QUICKSTART.md +272 -0
- agentui/README.md +59 -0
- agentui/env.example +16 -0
- agentui/jsconfig.json +14 -0
- agentui/package-lock.json +4242 -0
- agentui/package.json +34 -0
- agentui/scripts/postinstall/apply-patches.mjs +260 -0
- agentui/src/app.css +61 -0
- agentui/src/app.d.ts +13 -0
- agentui/src/app.html +12 -0
- agentui/src/components/LoadingSpinner.svelte +64 -0
- agentui/src/components/ThemeSwitcher.svelte +159 -0
- agentui/src/components/index.js +4 -0
- agentui/src/lib/api/bots.ts +60 -0
- agentui/src/lib/api/chat.ts +22 -0
- agentui/src/lib/api/http.ts +25 -0
- agentui/src/lib/components/BotCard.svelte +33 -0
- agentui/src/lib/components/ChatBubble.svelte +63 -0
- agentui/src/lib/components/Toast.svelte +21 -0
- agentui/src/lib/config.ts +20 -0
- agentui/src/lib/stores/auth.svelte.ts +73 -0
- agentui/src/lib/stores/theme.svelte.js +64 -0
- agentui/src/lib/stores/toast.svelte.ts +31 -0
- agentui/src/lib/utils/conversation.ts +39 -0
- agentui/src/routes/+layout.svelte +20 -0
- agentui/src/routes/+page.svelte +232 -0
- agentui/src/routes/login/+page.svelte +200 -0
- agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
- agentui/src/routes/talk/[agentId]/+page.ts +7 -0
- agentui/static/README.md +1 -0
- agentui/svelte.config.js +11 -0
- agentui/tailwind.config.ts +53 -0
- agentui/tsconfig.json +3 -0
- agentui/vite.config.ts +10 -0
- ai_parrot-0.17.2.dist-info/METADATA +472 -0
- ai_parrot-0.17.2.dist-info/RECORD +535 -0
- ai_parrot-0.17.2.dist-info/WHEEL +6 -0
- ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
- ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
- ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
- crew-builder/.prettierrc +15 -0
- crew-builder/QUICKSTART.md +259 -0
- crew-builder/README.md +113 -0
- crew-builder/env.example +17 -0
- crew-builder/jsconfig.json +14 -0
- crew-builder/package-lock.json +4182 -0
- crew-builder/package.json +37 -0
- crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
- crew-builder/src/app.css +62 -0
- crew-builder/src/app.d.ts +13 -0
- crew-builder/src/app.html +12 -0
- crew-builder/src/components/LoadingSpinner.svelte +64 -0
- crew-builder/src/components/ThemeSwitcher.svelte +149 -0
- crew-builder/src/components/index.js +9 -0
- crew-builder/src/lib/api/bots.ts +60 -0
- crew-builder/src/lib/api/chat.ts +80 -0
- crew-builder/src/lib/api/client.ts +56 -0
- crew-builder/src/lib/api/crew/crew.ts +136 -0
- crew-builder/src/lib/api/index.ts +5 -0
- crew-builder/src/lib/api/o365/auth.ts +65 -0
- crew-builder/src/lib/auth/auth.ts +54 -0
- crew-builder/src/lib/components/AgentNode.svelte +43 -0
- crew-builder/src/lib/components/BotCard.svelte +33 -0
- crew-builder/src/lib/components/ChatBubble.svelte +67 -0
- crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
- crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
- crew-builder/src/lib/components/JsonViewer.svelte +24 -0
- crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
- crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
- crew-builder/src/lib/components/Toast.svelte +67 -0
- crew-builder/src/lib/components/Toolbar.svelte +157 -0
- crew-builder/src/lib/components/index.ts +10 -0
- crew-builder/src/lib/config.ts +8 -0
- crew-builder/src/lib/stores/auth.svelte.ts +228 -0
- crew-builder/src/lib/stores/crewStore.ts +369 -0
- crew-builder/src/lib/stores/theme.svelte.js +145 -0
- crew-builder/src/lib/stores/toast.svelte.ts +69 -0
- crew-builder/src/lib/utils/conversation.ts +39 -0
- crew-builder/src/lib/utils/markdown.ts +122 -0
- crew-builder/src/lib/utils/talkHistory.ts +47 -0
- crew-builder/src/routes/+layout.svelte +20 -0
- crew-builder/src/routes/+page.svelte +539 -0
- crew-builder/src/routes/agents/+page.svelte +247 -0
- crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
- crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
- crew-builder/src/routes/builder/+page.svelte +204 -0
- crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
- crew-builder/src/routes/crew/ask/+page.ts +1 -0
- crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
- crew-builder/src/routes/login/+page.svelte +197 -0
- crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
- crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
- crew-builder/static/README.md +1 -0
- crew-builder/svelte.config.js +11 -0
- crew-builder/tailwind.config.ts +53 -0
- crew-builder/tsconfig.json +3 -0
- crew-builder/vite.config.ts +10 -0
- mcp_servers/calculator_server.py +309 -0
- parrot/__init__.py +27 -0
- parrot/__pycache__/__init__.cpython-310.pyc +0 -0
- parrot/__pycache__/version.cpython-310.pyc +0 -0
- parrot/_version.py +34 -0
- parrot/a2a/__init__.py +48 -0
- parrot/a2a/client.py +658 -0
- parrot/a2a/discovery.py +89 -0
- parrot/a2a/mixin.py +257 -0
- parrot/a2a/models.py +376 -0
- parrot/a2a/server.py +770 -0
- parrot/agents/__init__.py +29 -0
- parrot/bots/__init__.py +12 -0
- parrot/bots/a2a_agent.py +19 -0
- parrot/bots/abstract.py +3139 -0
- parrot/bots/agent.py +1129 -0
- parrot/bots/basic.py +9 -0
- parrot/bots/chatbot.py +669 -0
- parrot/bots/data.py +1618 -0
- parrot/bots/database/__init__.py +5 -0
- parrot/bots/database/abstract.py +3071 -0
- parrot/bots/database/cache.py +286 -0
- parrot/bots/database/models.py +468 -0
- parrot/bots/database/prompts.py +154 -0
- parrot/bots/database/retries.py +98 -0
- parrot/bots/database/router.py +269 -0
- parrot/bots/database/sql.py +41 -0
- parrot/bots/db/__init__.py +6 -0
- parrot/bots/db/abstract.py +556 -0
- parrot/bots/db/bigquery.py +602 -0
- parrot/bots/db/cache.py +85 -0
- parrot/bots/db/documentdb.py +668 -0
- parrot/bots/db/elastic.py +1014 -0
- parrot/bots/db/influx.py +898 -0
- parrot/bots/db/mock.py +96 -0
- parrot/bots/db/multi.py +783 -0
- parrot/bots/db/prompts.py +185 -0
- parrot/bots/db/sql.py +1255 -0
- parrot/bots/db/tools.py +212 -0
- parrot/bots/document.py +680 -0
- parrot/bots/hrbot.py +15 -0
- parrot/bots/kb.py +170 -0
- parrot/bots/mcp.py +36 -0
- parrot/bots/orchestration/README.md +463 -0
- parrot/bots/orchestration/__init__.py +1 -0
- parrot/bots/orchestration/agent.py +155 -0
- parrot/bots/orchestration/crew.py +3330 -0
- parrot/bots/orchestration/fsm.py +1179 -0
- parrot/bots/orchestration/hr.py +434 -0
- parrot/bots/orchestration/storage/__init__.py +4 -0
- parrot/bots/orchestration/storage/memory.py +100 -0
- parrot/bots/orchestration/storage/mixin.py +119 -0
- parrot/bots/orchestration/verify.py +202 -0
- parrot/bots/product.py +204 -0
- parrot/bots/prompts/__init__.py +96 -0
- parrot/bots/prompts/agents.py +155 -0
- parrot/bots/prompts/data.py +216 -0
- parrot/bots/prompts/output_generation.py +8 -0
- parrot/bots/scraper/__init__.py +3 -0
- parrot/bots/scraper/models.py +122 -0
- parrot/bots/scraper/scraper.py +1173 -0
- parrot/bots/scraper/templates.py +115 -0
- parrot/bots/stores/__init__.py +5 -0
- parrot/bots/stores/local.py +172 -0
- parrot/bots/webdev.py +81 -0
- parrot/cli.py +17 -0
- parrot/clients/__init__.py +16 -0
- parrot/clients/base.py +1491 -0
- parrot/clients/claude.py +1191 -0
- parrot/clients/factory.py +129 -0
- parrot/clients/google.py +4567 -0
- parrot/clients/gpt.py +1975 -0
- parrot/clients/grok.py +432 -0
- parrot/clients/groq.py +986 -0
- parrot/clients/hf.py +582 -0
- parrot/clients/models.py +18 -0
- parrot/conf.py +395 -0
- parrot/embeddings/__init__.py +9 -0
- parrot/embeddings/base.py +157 -0
- parrot/embeddings/google.py +98 -0
- parrot/embeddings/huggingface.py +74 -0
- parrot/embeddings/openai.py +84 -0
- parrot/embeddings/processor.py +88 -0
- parrot/exceptions.c +13868 -0
- parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/exceptions.pxd +22 -0
- parrot/exceptions.pxi +15 -0
- parrot/exceptions.pyx +44 -0
- parrot/generators/__init__.py +29 -0
- parrot/generators/base.py +200 -0
- parrot/generators/html.py +293 -0
- parrot/generators/react.py +205 -0
- parrot/generators/streamlit.py +203 -0
- parrot/generators/template.py +105 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/agent.py +861 -0
- parrot/handlers/agents/__init__.py +1 -0
- parrot/handlers/agents/abstract.py +900 -0
- parrot/handlers/bots.py +338 -0
- parrot/handlers/chat.py +915 -0
- parrot/handlers/creation.sql +192 -0
- parrot/handlers/crew/ARCHITECTURE.md +362 -0
- parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
- parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
- parrot/handlers/crew/__init__.py +0 -0
- parrot/handlers/crew/handler.py +801 -0
- parrot/handlers/crew/models.py +229 -0
- parrot/handlers/crew/redis_persistence.py +523 -0
- parrot/handlers/jobs/__init__.py +10 -0
- parrot/handlers/jobs/job.py +384 -0
- parrot/handlers/jobs/mixin.py +627 -0
- parrot/handlers/jobs/models.py +115 -0
- parrot/handlers/jobs/worker.py +31 -0
- parrot/handlers/models.py +596 -0
- parrot/handlers/o365_auth.py +105 -0
- parrot/handlers/stream.py +337 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/aws.py +143 -0
- parrot/interfaces/credentials.py +113 -0
- parrot/interfaces/database.py +27 -0
- parrot/interfaces/google.py +1123 -0
- parrot/interfaces/hierarchy.py +1227 -0
- parrot/interfaces/http.py +651 -0
- parrot/interfaces/images/__init__.py +0 -0
- parrot/interfaces/images/plugins/__init__.py +24 -0
- parrot/interfaces/images/plugins/abstract.py +58 -0
- parrot/interfaces/images/plugins/analisys.py +148 -0
- parrot/interfaces/images/plugins/classify.py +150 -0
- parrot/interfaces/images/plugins/classifybase.py +182 -0
- parrot/interfaces/images/plugins/detect.py +150 -0
- parrot/interfaces/images/plugins/exif.py +1103 -0
- parrot/interfaces/images/plugins/hash.py +52 -0
- parrot/interfaces/images/plugins/vision.py +104 -0
- parrot/interfaces/images/plugins/yolo.py +66 -0
- parrot/interfaces/images/plugins/zerodetect.py +197 -0
- parrot/interfaces/o365.py +978 -0
- parrot/interfaces/onedrive.py +822 -0
- parrot/interfaces/sharepoint.py +1435 -0
- parrot/interfaces/soap.py +257 -0
- parrot/loaders/__init__.py +8 -0
- parrot/loaders/abstract.py +1131 -0
- parrot/loaders/audio.py +199 -0
- parrot/loaders/basepdf.py +53 -0
- parrot/loaders/basevideo.py +1568 -0
- parrot/loaders/csv.py +409 -0
- parrot/loaders/docx.py +116 -0
- parrot/loaders/epubloader.py +316 -0
- parrot/loaders/excel.py +199 -0
- parrot/loaders/factory.py +55 -0
- parrot/loaders/files/__init__.py +0 -0
- parrot/loaders/files/abstract.py +39 -0
- parrot/loaders/files/html.py +26 -0
- parrot/loaders/files/text.py +63 -0
- parrot/loaders/html.py +152 -0
- parrot/loaders/markdown.py +442 -0
- parrot/loaders/pdf.py +373 -0
- parrot/loaders/pdfmark.py +320 -0
- parrot/loaders/pdftables.py +506 -0
- parrot/loaders/ppt.py +476 -0
- parrot/loaders/qa.py +63 -0
- parrot/loaders/splitters/__init__.py +10 -0
- parrot/loaders/splitters/base.py +138 -0
- parrot/loaders/splitters/md.py +228 -0
- parrot/loaders/splitters/token.py +143 -0
- parrot/loaders/txt.py +26 -0
- parrot/loaders/video.py +89 -0
- parrot/loaders/videolocal.py +218 -0
- parrot/loaders/videounderstanding.py +377 -0
- parrot/loaders/vimeo.py +167 -0
- parrot/loaders/web.py +599 -0
- parrot/loaders/youtube.py +504 -0
- parrot/manager/__init__.py +5 -0
- parrot/manager/manager.py +1030 -0
- parrot/mcp/__init__.py +28 -0
- parrot/mcp/adapter.py +105 -0
- parrot/mcp/cli.py +174 -0
- parrot/mcp/client.py +119 -0
- parrot/mcp/config.py +75 -0
- parrot/mcp/integration.py +842 -0
- parrot/mcp/oauth.py +933 -0
- parrot/mcp/server.py +225 -0
- parrot/mcp/transports/__init__.py +3 -0
- parrot/mcp/transports/base.py +279 -0
- parrot/mcp/transports/grpc_session.py +163 -0
- parrot/mcp/transports/http.py +312 -0
- parrot/mcp/transports/mcp.proto +108 -0
- parrot/mcp/transports/quic.py +1082 -0
- parrot/mcp/transports/sse.py +330 -0
- parrot/mcp/transports/stdio.py +309 -0
- parrot/mcp/transports/unix.py +395 -0
- parrot/mcp/transports/websocket.py +547 -0
- parrot/memory/__init__.py +16 -0
- parrot/memory/abstract.py +209 -0
- parrot/memory/agent.py +32 -0
- parrot/memory/cache.py +175 -0
- parrot/memory/core.py +555 -0
- parrot/memory/file.py +153 -0
- parrot/memory/mem.py +131 -0
- parrot/memory/redis.py +613 -0
- parrot/models/__init__.py +46 -0
- parrot/models/basic.py +118 -0
- parrot/models/compliance.py +208 -0
- parrot/models/crew.py +395 -0
- parrot/models/detections.py +654 -0
- parrot/models/generation.py +85 -0
- parrot/models/google.py +223 -0
- parrot/models/groq.py +23 -0
- parrot/models/openai.py +30 -0
- parrot/models/outputs.py +285 -0
- parrot/models/responses.py +938 -0
- parrot/notifications/__init__.py +743 -0
- parrot/openapi/__init__.py +3 -0
- parrot/openapi/components.yaml +641 -0
- parrot/openapi/config.py +322 -0
- parrot/outputs/__init__.py +32 -0
- parrot/outputs/formats/__init__.py +108 -0
- parrot/outputs/formats/altair.py +359 -0
- parrot/outputs/formats/application.py +122 -0
- parrot/outputs/formats/base.py +351 -0
- parrot/outputs/formats/bokeh.py +356 -0
- parrot/outputs/formats/card.py +424 -0
- parrot/outputs/formats/chart.py +436 -0
- parrot/outputs/formats/d3.py +255 -0
- parrot/outputs/formats/echarts.py +310 -0
- parrot/outputs/formats/generators/__init__.py +0 -0
- parrot/outputs/formats/generators/abstract.py +61 -0
- parrot/outputs/formats/generators/panel.py +145 -0
- parrot/outputs/formats/generators/streamlit.py +86 -0
- parrot/outputs/formats/generators/terminal.py +63 -0
- parrot/outputs/formats/holoviews.py +310 -0
- parrot/outputs/formats/html.py +147 -0
- parrot/outputs/formats/jinja2.py +46 -0
- parrot/outputs/formats/json.py +87 -0
- parrot/outputs/formats/map.py +933 -0
- parrot/outputs/formats/markdown.py +172 -0
- parrot/outputs/formats/matplotlib.py +237 -0
- parrot/outputs/formats/mixins/__init__.py +0 -0
- parrot/outputs/formats/mixins/emaps.py +855 -0
- parrot/outputs/formats/plotly.py +341 -0
- parrot/outputs/formats/seaborn.py +310 -0
- parrot/outputs/formats/table.py +397 -0
- parrot/outputs/formats/template_report.py +138 -0
- parrot/outputs/formats/yaml.py +125 -0
- parrot/outputs/formatter.py +152 -0
- parrot/outputs/templates/__init__.py +95 -0
- parrot/pipelines/__init__.py +0 -0
- parrot/pipelines/abstract.py +210 -0
- parrot/pipelines/detector.py +124 -0
- parrot/pipelines/models.py +90 -0
- parrot/pipelines/planogram.py +3002 -0
- parrot/pipelines/table.sql +97 -0
- parrot/plugins/__init__.py +106 -0
- parrot/plugins/importer.py +80 -0
- parrot/py.typed +0 -0
- parrot/registry/__init__.py +18 -0
- parrot/registry/registry.py +594 -0
- parrot/scheduler/__init__.py +1189 -0
- parrot/scheduler/models.py +60 -0
- parrot/security/__init__.py +16 -0
- parrot/security/prompt_injection.py +268 -0
- parrot/security/security_events.sql +25 -0
- parrot/services/__init__.py +1 -0
- parrot/services/mcp/__init__.py +8 -0
- parrot/services/mcp/config.py +13 -0
- parrot/services/mcp/server.py +295 -0
- parrot/services/o365_remote_auth.py +235 -0
- parrot/stores/__init__.py +7 -0
- parrot/stores/abstract.py +352 -0
- parrot/stores/arango.py +1090 -0
- parrot/stores/bigquery.py +1377 -0
- parrot/stores/cache.py +106 -0
- parrot/stores/empty.py +10 -0
- parrot/stores/faiss_store.py +1157 -0
- parrot/stores/kb/__init__.py +9 -0
- parrot/stores/kb/abstract.py +68 -0
- parrot/stores/kb/cache.py +165 -0
- parrot/stores/kb/doc.py +325 -0
- parrot/stores/kb/hierarchy.py +346 -0
- parrot/stores/kb/local.py +457 -0
- parrot/stores/kb/prompt.py +28 -0
- parrot/stores/kb/redis.py +659 -0
- parrot/stores/kb/store.py +115 -0
- parrot/stores/kb/user.py +374 -0
- parrot/stores/models.py +59 -0
- parrot/stores/pgvector.py +3 -0
- parrot/stores/postgres.py +2853 -0
- parrot/stores/utils/__init__.py +0 -0
- parrot/stores/utils/chunking.py +197 -0
- parrot/telemetry/__init__.py +3 -0
- parrot/telemetry/mixin.py +111 -0
- parrot/template/__init__.py +3 -0
- parrot/template/engine.py +259 -0
- parrot/tools/__init__.py +23 -0
- parrot/tools/abstract.py +644 -0
- parrot/tools/agent.py +363 -0
- parrot/tools/arangodbsearch.py +537 -0
- parrot/tools/arxiv_tool.py +188 -0
- parrot/tools/calculator/__init__.py +3 -0
- parrot/tools/calculator/operations/__init__.py +38 -0
- parrot/tools/calculator/operations/calculus.py +80 -0
- parrot/tools/calculator/operations/statistics.py +76 -0
- parrot/tools/calculator/tool.py +150 -0
- parrot/tools/cloudwatch.py +988 -0
- parrot/tools/codeinterpreter/__init__.py +127 -0
- parrot/tools/codeinterpreter/executor.py +371 -0
- parrot/tools/codeinterpreter/internals.py +473 -0
- parrot/tools/codeinterpreter/models.py +643 -0
- parrot/tools/codeinterpreter/prompts.py +224 -0
- parrot/tools/codeinterpreter/tool.py +664 -0
- parrot/tools/company_info/__init__.py +6 -0
- parrot/tools/company_info/tool.py +1138 -0
- parrot/tools/correlationanalysis.py +437 -0
- parrot/tools/database/abstract.py +286 -0
- parrot/tools/database/bq.py +115 -0
- parrot/tools/database/cache.py +284 -0
- parrot/tools/database/models.py +95 -0
- parrot/tools/database/pg.py +343 -0
- parrot/tools/databasequery.py +1159 -0
- parrot/tools/db.py +1800 -0
- parrot/tools/ddgo.py +370 -0
- parrot/tools/decorators.py +271 -0
- parrot/tools/dftohtml.py +282 -0
- parrot/tools/document.py +549 -0
- parrot/tools/ecs.py +819 -0
- parrot/tools/edareport.py +368 -0
- parrot/tools/elasticsearch.py +1049 -0
- parrot/tools/employees.py +462 -0
- parrot/tools/epson/__init__.py +96 -0
- parrot/tools/excel.py +683 -0
- parrot/tools/file/__init__.py +13 -0
- parrot/tools/file/abstract.py +76 -0
- parrot/tools/file/gcs.py +378 -0
- parrot/tools/file/local.py +284 -0
- parrot/tools/file/s3.py +511 -0
- parrot/tools/file/tmp.py +309 -0
- parrot/tools/file/tool.py +501 -0
- parrot/tools/file_reader.py +129 -0
- parrot/tools/flowtask/__init__.py +19 -0
- parrot/tools/flowtask/tool.py +761 -0
- parrot/tools/gittoolkit.py +508 -0
- parrot/tools/google/__init__.py +18 -0
- parrot/tools/google/base.py +169 -0
- parrot/tools/google/tools.py +1251 -0
- parrot/tools/googlelocation.py +5 -0
- parrot/tools/googleroutes.py +5 -0
- parrot/tools/googlesearch.py +5 -0
- parrot/tools/googlesitesearch.py +5 -0
- parrot/tools/googlevoice.py +2 -0
- parrot/tools/gvoice.py +695 -0
- parrot/tools/ibisworld/README.md +225 -0
- parrot/tools/ibisworld/__init__.py +11 -0
- parrot/tools/ibisworld/tool.py +366 -0
- parrot/tools/jiratoolkit.py +1718 -0
- parrot/tools/manager.py +1098 -0
- parrot/tools/math.py +152 -0
- parrot/tools/metadata.py +476 -0
- parrot/tools/msteams.py +1621 -0
- parrot/tools/msword.py +635 -0
- parrot/tools/multidb.py +580 -0
- parrot/tools/multistoresearch.py +369 -0
- parrot/tools/networkninja.py +167 -0
- parrot/tools/nextstop/__init__.py +4 -0
- parrot/tools/nextstop/base.py +286 -0
- parrot/tools/nextstop/employee.py +733 -0
- parrot/tools/nextstop/store.py +462 -0
- parrot/tools/notification.py +435 -0
- parrot/tools/o365/__init__.py +42 -0
- parrot/tools/o365/base.py +295 -0
- parrot/tools/o365/bundle.py +522 -0
- parrot/tools/o365/events.py +554 -0
- parrot/tools/o365/mail.py +992 -0
- parrot/tools/o365/onedrive.py +497 -0
- parrot/tools/o365/sharepoint.py +641 -0
- parrot/tools/openapi_toolkit.py +904 -0
- parrot/tools/openweather.py +527 -0
- parrot/tools/pdfprint.py +1001 -0
- parrot/tools/powerbi.py +518 -0
- parrot/tools/powerpoint.py +1113 -0
- parrot/tools/pricestool.py +146 -0
- parrot/tools/products/__init__.py +246 -0
- parrot/tools/prophet_tool.py +171 -0
- parrot/tools/pythonpandas.py +630 -0
- parrot/tools/pythonrepl.py +910 -0
- parrot/tools/qsource.py +436 -0
- parrot/tools/querytoolkit.py +395 -0
- parrot/tools/quickeda.py +827 -0
- parrot/tools/resttool.py +553 -0
- parrot/tools/retail/__init__.py +0 -0
- parrot/tools/retail/bby.py +528 -0
- parrot/tools/sandboxtool.py +703 -0
- parrot/tools/sassie/__init__.py +352 -0
- parrot/tools/scraping/__init__.py +7 -0
- parrot/tools/scraping/docs/select.md +466 -0
- parrot/tools/scraping/documentation.md +1278 -0
- parrot/tools/scraping/driver.py +436 -0
- parrot/tools/scraping/models.py +576 -0
- parrot/tools/scraping/options.py +85 -0
- parrot/tools/scraping/orchestrator.py +517 -0
- parrot/tools/scraping/readme.md +740 -0
- parrot/tools/scraping/tool.py +3115 -0
- parrot/tools/seasonaldetection.py +642 -0
- parrot/tools/shell_tool/__init__.py +5 -0
- parrot/tools/shell_tool/actions.py +408 -0
- parrot/tools/shell_tool/engine.py +155 -0
- parrot/tools/shell_tool/models.py +322 -0
- parrot/tools/shell_tool/tool.py +442 -0
- parrot/tools/site_search.py +214 -0
- parrot/tools/textfile.py +418 -0
- parrot/tools/think.py +378 -0
- parrot/tools/toolkit.py +298 -0
- parrot/tools/webapp_tool.py +187 -0
- parrot/tools/whatif.py +1279 -0
- parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
- parrot/tools/workday/__init__.py +6 -0
- parrot/tools/workday/models.py +1389 -0
- parrot/tools/workday/tool.py +1293 -0
- parrot/tools/yfinance_tool.py +306 -0
- parrot/tools/zipcode.py +217 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/helpers.py +73 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.c +12078 -0
- parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/parsers/toml.pyx +21 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpp +20936 -0
- parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/types.pyx +213 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- parrot/yaml-rs/Cargo.lock +350 -0
- parrot/yaml-rs/Cargo.toml +19 -0
- parrot/yaml-rs/pyproject.toml +19 -0
- parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
- parrot/yaml-rs/src/lib.rs +222 -0
- requirements/docker-compose.yml +24 -0
- requirements/requirements-dev.txt +21 -0
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
from typing import List, Optional, Union, Tuple, Dict
|
|
2
|
+
from pathlib import PurePath
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
from ..stores.models import Document
|
|
5
|
+
from .abstract import AbstractLoader
|
|
6
|
+
|
|
7
|
+
# Optional deps: install via
|
|
8
|
+
# pip install ebooklib beautifulsoup4 markdownify
|
|
9
|
+
# Resolve the optional ebooklib dependency and the ITEM_DOCUMENT type code.
# The item-type constants are defined on the top-level ``ebooklib`` package,
# so that is the first place to look; ``ebooklib.epub`` is kept as a
# secondary lookup and 9 (the known constant value) as the final fallback.
try:
    import ebooklib
    from ebooklib import epub
except Exception:
    # Deliberately broad: any failure while importing the optional dependency
    # is reported as "not available" (EpubLoader raises ImportError on init).
    EBOOKLIB_AVAILABLE = False
    # Keep the name defined so module-level references never raise NameError.
    ITEM_DOCUMENT = 9  # Known constant value
else:
    EBOOKLIB_AVAILABLE = True
    ITEM_DOCUMENT = getattr(ebooklib, "ITEM_DOCUMENT", None)
    if ITEM_DOCUMENT is None:
        ITEM_DOCUMENT = getattr(epub, "ITEM_DOCUMENT", 9)
|
|
21
|
+
|
|
22
|
+
try:
|
|
23
|
+
from bs4 import BeautifulSoup
|
|
24
|
+
BS4_AVAILABLE = True
|
|
25
|
+
except Exception:
|
|
26
|
+
BS4_AVAILABLE = False
|
|
27
|
+
|
|
28
|
+
try:
|
|
29
|
+
from markdownify import MarkdownConverter
|
|
30
|
+
MD_AVAILABLE = True
|
|
31
|
+
except Exception:
|
|
32
|
+
MD_AVAILABLE = False
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class EpubLoader(AbstractLoader):
    """
    EPUB loader that extracts clean Markdown (or plain text) from chapters/sections.

    Features:
    - Per-chapter documents with titles from TOC/HTML
    - Optional full-book document (merged)
    - Clean Markdown conversion (lists, headers, links)
    - Skips non-document items (css, images, fonts)
    - Configurable minimum content length

    Markdown output needs the optional ``markdownify`` package; when it is not
    installed the loader emits plain text instead, and the ``output_format``
    metadata field reports ``"text"`` accordingly.
    """

    extensions: List[str] = ['.epub']

    def __init__(
        self,
        source: Optional[Union[str, PurePath, List[PurePath]]] = None,
        *,
        tokenizer: Optional[Union[str, Callable]] = None,
        text_splitter: Optional[Union[str, Callable]] = None,
        source_type: str = 'file',

        # Output controls
        as_markdown: bool = True,            # emit markdown instead of plain text
        per_chapter: bool = True,            # True => one Document per chapter; False => single full-book doc
        include_toc_document: bool = False,  # optional separate TOC document
        min_section_length: int = 50,        # drop tiny/empty sections

        # Markdown conversion tuning
        heading_style: str = "ATX",          # for markdownify; "ATX" => # Heading
        strip_whitespace: bool = True,

        **kwargs
    ):
        """
        Configure the loader.

        Args:
            source: File path(s) handed to the base loader.
            tokenizer: Forwarded to the base loader.
            text_splitter: Forwarded to the base loader.
            source_type: Forwarded to the base loader.
            as_markdown: Convert chapters to Markdown when markdownify is
                installed; otherwise plain text is produced.
            per_chapter: One Document per section when True, a single merged
                Document when False.
            include_toc_document: Also emit a Document listing the TOC entries.
            min_section_length: Sections whose extracted text is shorter than
                this many characters are dropped.
            heading_style: ``heading_style`` option passed to markdownify.
            strip_whitespace: Trim trailing spaces and collapse blank-line runs.
            **kwargs: Forwarded to the base loader.

        Raises:
            ImportError: If ebooklib or beautifulsoup4 is not installed.
        """
        super().__init__(
            source,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            **kwargs
        )
        self.doctype = 'epub'
        self._source_type = 'ebook'

        # Options
        self.as_markdown = as_markdown
        self.per_chapter = per_chapter
        self.include_toc_document = include_toc_document
        self.min_section_length = int(min_section_length)
        self.strip_whitespace = bool(strip_whitespace)
        self.heading_style = heading_style

        # sanity checks: both packages are mandatory, markdownify is optional
        missing = [
            name
            for name, available in (
                ("ebooklib", EBOOKLIB_AVAILABLE),
                ("beautifulsoup4", BS4_AVAILABLE),
            )
            if not available
        ]
        if missing:
            raise ImportError(
                f"EpubLoader requires {', '.join(missing)}. "
                "Install with: pip install ebooklib beautifulsoup4"
            )

    def _output_format(self) -> str:
        """Return the format actually produced: "markdown" or "text"."""
        # Markdown is only emitted when requested AND markdownify is importable;
        # reporting the requested format alone would mislabel plain-text output.
        return "markdown" if (self.as_markdown and MD_AVAILABLE) else "text"

    def _html_to_markdown(self, html: str) -> str:
        """Convert XHTML chapter html to Markdown (fallback to plain text)."""
        soup = BeautifulSoup(html, "html.parser")

        # remove scripts/styles
        for bad in soup(["script", "style", "noscript"]):
            bad.decompose()

        if MD_AVAILABLE and self.as_markdown:
            md = MarkdownConverter(
                heading_style=self.heading_style,
                strip=['style', 'script', 'noscript']
            ).convert_soup(soup)
            return self._clean(md)

        # plain text fallback
        text = soup.get_text("\n", strip=True)
        return self._clean(text)

    def _clean(self, text: str) -> str:
        """
        Normalize whitespace in extracted text.

        Returns "" for empty input. When ``strip_whitespace`` is enabled,
        trailing spaces are removed from every line and each run of blank
        lines is collapsed to a single blank line. The result is always
        stripped of leading/trailing whitespace.
        """
        if not text:
            return ""
        if self.strip_whitespace:
            cleaned: List[str] = []
            previous_blank = False
            for raw in text.splitlines():
                line = raw.rstrip()
                if line.strip():
                    cleaned.append(line)
                    previous_blank = False
                elif not previous_blank:
                    # keep only the first blank line of a run
                    cleaned.append("")
                    previous_blank = True
            text = "\n".join(cleaned)
        return text.strip()

    def _flatten_toc(self, toc) -> List[Tuple[str, str]]:
        """
        Flatten ebooklib TOC into a list of (href, title) entries.

        The TOC may contain nested lists/tuples; any leaf object exposing both
        a string ``href`` and a ``title`` contributes one entry. Fragments
        (``#...``) are removed from the href. Leaves without a usable href
        or title are skipped.
        """
        flat: List[Tuple[str, str]] = []

        def _walk(node) -> None:
            if isinstance(node, (list, tuple)):
                for child in node:
                    _walk(child)
                return
            href = getattr(node, "href", None)
            title = getattr(node, "title", None)
            # Explicit type check instead of a blanket try/except: a
            # non-string href cannot be split and is simply not an entry.
            if not isinstance(href, str) or not href or not title:
                return
            base = href.split("#", 1)[0]
            if base:
                # a fragment-only href ("#x") names no file; an empty key
                # would wrongly match items that have no file name
                flat.append((base, str(title)))

        _walk(toc)
        return flat

    def _toc_title_lookup(self, book: "epub.EpubBook") -> Dict[str, str]:
        """
        Build a mapping from href→title using TOC (best effort).
        Keys are hrefs without fragments; values are strings.
        Returns an empty mapping if the TOC cannot be processed.
        """
        try:
            # dict() keeps the last title when an href appears more than once
            return dict(self._flatten_toc(getattr(book, "toc", None) or []))
        except Exception as exc:
            # Best effort: titles fall back to HTML/filename, but say why.
            self.logger.warning(f"Could not read EPUB table of contents: {exc}")
            return {}

    def _iter_document_items(self, book: "epub.EpubBook"):
        """
        Yield (order_idx, item) for spine items that are HTML documents.

        Spine entries may be ``(idref, ...)`` tuples or plain idref strings;
        entries that do not resolve to a document item are skipped. If the
        spine yields nothing, every document item of the book is yielded in
        the order returned by ``get_items_of_type``.
        """
        id_to_item = {it.get_id(): it for it in book.get_items()}
        order = 0
        for entry in (book.spine or []):
            if isinstance(entry, str):
                idref = entry
            elif isinstance(entry, tuple) and entry and isinstance(entry[0], str):
                idref = entry[0]
            else:
                continue
            item = id_to_item.get(idref)
            if item is None or item.get_type() != ITEM_DOCUMENT:
                continue
            yield order, item
            order += 1

        if order == 0:
            # spine was empty or unusable: fall back to all document items
            yield from enumerate(book.get_items_of_type(ITEM_DOCUMENT))

    def _derive_title_from_html(self, html: str) -> Optional[str]:
        """Return the <title> text, else the first non-empty h1/h2/h3, else None."""
        soup = BeautifulSoup(html, "html.parser")
        # Try <title>
        if soup.title and soup.title.string:
            t = soup.title.string.strip()
            if t:
                return t
        # Try first heading
        for tag in ["h1", "h2", "h3"]:
            h = soup.find(tag)
            if h:
                heading = h.get_text(strip=True)
                if heading:
                    return heading
        return None

    async def _load(self, path: PurePath, **kwargs) -> List[Document]:
        """
        Load an EPUB file into Parrot Documents.

        Returns:
        - Per-chapter Documents (default), or
        - Single full-book Document if per_chapter=False

        A TOC Document is prepended when ``include_toc_document`` is set and
        the book has TOC entries. If the file cannot be opened, the error is
        logged and an empty list is returned.
        """
        self.logger.info(f"Loading EPUB file: {path}")

        docs: List[Document] = []
        try:
            book = epub.read_epub(str(path))
        except Exception as e:
            self.logger.error(f"Failed to open EPUB {path}: {e}")
            return docs

        toc_map = self._toc_title_lookup(book)
        output_format = self._output_format()

        # Optionally create a separate TOC document
        if self.include_toc_document and toc_map:
            toc_lines = ["# Table of Contents"]
            toc_lines.extend(
                f"- {title} (Link: {href})" for href, title in toc_map.items()
            )
            toc_meta = self.create_metadata(
                path=path,
                doctype="epub",
                source_type="epub_toc",
                doc_metadata={
                    "content_type": "toc",
                    "entries": len(toc_map)
                },
            )
            docs.append(self.create_document("\n".join(toc_lines), path, toc_meta))

        file_name = path.name if hasattr(path, 'name') else str(path)

        # Collect per-chapter or full text
        all_sections = []
        for order_idx, item in self._iter_document_items(book):
            try:
                html = item.get_content().decode("utf-8", errors="ignore")
            except Exception:
                # unreadable item: skip it, keep loading the rest of the book
                continue

            content = self._html_to_markdown(html)

            if len(content) < self.min_section_length:
                # skip boilerplate/empty stubs
                continue

            # Derive title from TOC → HTML <title>/heading → filename → ordinal
            href = getattr(item, "file_name", "") or ""
            title = (
                toc_map.get(href)
                or self._derive_title_from_html(html)
                or PurePath(href).name
                or f"Section {order_idx+1}"
            )

            # Track for full-book option
            all_sections.append((order_idx, title, content, href))

            # Per-chapter Document
            if self.per_chapter:
                section_meta = self.create_metadata(
                    path=path,
                    doctype="epub",
                    source_type="epub_section",
                    doc_metadata={
                        "section_order": order_idx + 1,
                        "section_title": title,
                        "href": href,
                        "content_type": "chapter",
                        "output_format": output_format,
                        "min_section_length": self.min_section_length
                    },
                )

                # Prepend a lightweight context header to the section body
                context = [
                    f"File Name: {file_name}",
                    f"Section: {order_idx + 1}",
                    f"Title: {title}",
                    "Document Type: epub",
                    "Source Type: ebook",
                ]
                full_content = "\n".join(context) + "\n======\n\n" + content

                docs.append(self.create_document(full_content, path, section_meta))

        if not all_sections:
            self.logger.warning(f"No textual sections extracted from {path}")
            return docs

        # Full-book Document (if requested)
        if not self.per_chapter:
            merged = [
                f"# {title}\n\n{content}\n"
                for _, title, content, _ in all_sections
            ]
            book_text = "\n\n".join(merged).strip()

            full_meta = self.create_metadata(
                path=path,
                doctype="epub",
                source_type="epub_full",
                doc_metadata={
                    "sections": len(all_sections),
                    "content_type": "full_document",
                    "output_format": output_format,
                },
            )
            docs.append(self.create_document(book_text, path, full_meta))

        return docs
|
parrot/loaders/excel.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
# parrot/loaders/excel.py
|
|
2
|
+
from collections.abc import Callable
from datetime import date
from pathlib import PurePath
from typing import List, Optional, Union, Literal, Dict

import pandas as pd
from navigator.libs.json import JSONContent

from ..stores.models import Document
from .abstract import AbstractLoader
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ExcelLoader(AbstractLoader):
    """
    Excel loader that converts an Excel workbook (or DataFrame) into per-row Documents.

    - One Document per row per sheet (rows with all-empty values are skipped
      while ``drop_empty_rows`` is enabled).
    - Works for .xlsx / .xlsm / .xls files (pandas engine auto-detects).
    - Also accepts a pandas.DataFrame (sheet='DataFrame'); the same row
      filters (``drop_empty_rows``, ``max_rows``) are applied to it.
    - Output formats: markdown (default), plain, or json.
    """

    extensions: List[str] = ['.xlsx', '.xlsm', '.xls']

    def __init__(
        self,
        source: Optional[Union[str, PurePath, List[PurePath]]] = None,
        *,
        tokenizer: Union[str, Callable] = None,
        text_splitter: Union[str, Callable] = None,
        source_type: str = 'file',

        sheets: Optional[Union[str, int, List[Union[str, int]]]] = None,
        header: Union[int, List[int], None] = 0,
        usecols: Optional[Union[str, List[Union[int, str]]]] = None,
        drop_empty_rows: bool = True,
        max_rows: Optional[int] = None,
        date_format: str = "%Y-%m-%d",
        output_format: Literal["markdown", "plain", "json"] = "markdown",
        min_row_length: int = 1,  # skip rows with < N non-empty fields
        title_column: Optional[str] = None,

        **kwargs
    ):
        """
        Configure the loader.

        Args:
            source: Path(s) forwarded to the base loader.
            tokenizer: Forwarded to the base loader.
            text_splitter: Forwarded to the base loader.
            source_type: Forwarded to the base loader and echoed in the
                per-row context header.
            sheets: Passed to ``pd.read_excel`` as ``sheet_name``; ``None``
                reads every sheet.
            header: Passed to ``pd.read_excel`` as ``header``.
            usecols: Passed to ``pd.read_excel`` as ``usecols``.
            drop_empty_rows: Drop rows whose cells are all null.
            max_rows: Keep at most this many rows per sheet (after dropping
                empty rows).
            date_format: ``strftime`` pattern applied to date-like cells.
            output_format: Rendering of the row body.
            min_row_length: Minimum number of non-empty cells a row needs;
                the filter is only active for values greater than 1.
            title_column: Column whose value is added as ``Title:`` to the
                context header when non-empty.
            **kwargs: Forwarded to the base loader.
        """
        super().__init__(
            source,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            **kwargs
        )
        self.doctype = 'excel'
        self._source_type = source_type
        self.sheets = sheets
        self.header = header
        self.usecols = usecols
        self.drop_empty_rows = drop_empty_rows
        self.max_rows = max_rows
        self.date_format = date_format
        self.output_format = output_format
        self.min_row_length = int(min_row_length)
        self.title_column = title_column

    @staticmethod
    def _is_missing(v) -> bool:
        """Return True when *v* is a scalar null (None, NaN, NaT, pd.NA)."""
        # pd.isna() on a list-like cell returns an array whose truth value is
        # ambiguous, so only scalars are ever treated as missing.
        return bool(pd.api.types.is_scalar(v) and pd.isna(v))

    def _stringify(self, v):
        """Render a single cell value as text ("" for missing values)."""
        if self._is_missing(v):
            return ""
        # ``date`` is the common base of datetime.date, datetime.datetime and
        # pd.Timestamp, so every date-like cell honours ``date_format`` rather
        # than only pd.Timestamp instances.
        if isinstance(v, date):
            return v.strftime(self.date_format)
        return str(v)

    def _row_to_text(self, row: Dict[str, object]) -> str:
        """Render a single row dict to text in the chosen output_format."""
        if self.output_format == "json":
            return JSONContent.dumps(row, indent=2)

        items = [(k, self._stringify(v)) for k, v in row.items()]
        if self.output_format == "plain":
            # key: value per line
            return "\n".join(f"{k}: {v}" for k, v in items if v != "")

        # markdown: list of **key**: value
        return "\n".join(f"- **{k}**: {v}" for k, v in items if v != "")

    def _row_nonempty_count(self, row: Dict[str, object]) -> int:
        """Count the cells of *row* that are neither missing nor blank."""
        return sum(
            1
            for v in row.values()
            if not self._is_missing(v) and str(v).strip() != ""
        )

    def _prepare_frame(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply the row filters and stringify column labels without mutating *df*."""
        if self.drop_empty_rows:
            df = df.dropna(how="all")
        if self.max_rows is not None:
            df = df.head(self.max_rows)
        # set_axis returns a new frame, so a caller-supplied DataFrame keeps
        # its original column labels.
        return df.set_axis([str(c) for c in df.columns], axis=1)

    async def _load(self, source: Union[PurePath, str, pd.DataFrame], **kwargs) -> List[Document]:
        """
        Load an Excel file (or DataFrame) and return per-row Documents.

        Returns an empty list when the workbook cannot be read; the error is
        logged rather than raised.
        """
        docs: List[Document] = []

        # Case A: already a DataFrame (from AbstractLoader.from_dataframe)
        # (sheet name is synthetic: "DataFrame")
        if isinstance(source, pd.DataFrame):
            sheet_name = "DataFrame"
            frame = self._prepare_frame(source)
            if frame.empty:
                self.logger.info(f"Sheet '{sheet_name}' is empty; skipping.")
                return docs
            docs.extend(
                await self._docs_from_dataframe(frame, sheet_name, path_hint="dataframe")
            )
            return docs

        # Case B: excel path
        path = PurePath(source) if not isinstance(source, PurePath) else source
        self.logger.info(f"Loading Excel file: {path}")

        # Read one or multiple sheets
        try:
            # If sheets=None -> pd returns dict of DataFrames (all sheets)
            # If sheets is a single name/index -> returns a DataFrame
            xls = pd.read_excel(
                str(path),
                sheet_name=self.sheets if self.sheets is not None else None,
                header=self.header,
                usecols=self.usecols,
                dtype=object  # keep as objects → stringify ourselves
            )
        except Exception as e:
            self.logger.error(f"Failed to read Excel {path}: {e}")
            return docs

        # Normalize to dict[str, DataFrame]
        if isinstance(xls, pd.DataFrame):
            frames = {"Sheet1" if self.sheets is None else str(self.sheets): xls}
        else:
            # dict of {sheet_name: df}
            frames = {str(k): v for k, v in xls.items()}

        for sheet_name, df in frames.items():
            df = self._prepare_frame(df)
            if df.empty:
                self.logger.info(f"Sheet '{sheet_name}' is empty; skipping.")
                continue
            docs.extend(await self._docs_from_dataframe(df, sheet_name, path_hint=path))

        return docs

    async def _docs_from_dataframe(
        self,
        df: pd.DataFrame,
        sheet_name: str,
        path_hint: Union[str, PurePath]
    ) -> List[Document]:
        """
        Convert a DataFrame into per-row Documents.

        The ``Row`` / ``row_index`` value is the 1-based position of the row
        inside *df* as passed in (i.e. after any filtering done by the
        caller), not the row number in the original spreadsheet.
        """
        docs: List[Document] = []

        # Loop-invariant: the display name of the source
        file_name = path_hint.name if hasattr(path_hint, 'name') else str(path_hint)

        # Convert to records for easy iteration
        records = df.to_dict(orient="records")

        for i, row in enumerate(records, start=1):
            if self.min_row_length > 1 and self._row_nonempty_count(row) < self.min_row_length:
                continue

            content_body = self._row_to_text(row)

            # Context header (aligns with PDF/PPT style: header + "======")
            title_val = None
            if self.title_column and self.title_column in row:
                title_val = self._stringify(row[self.title_column]).strip() or None

            context = [
                f"File Name: {file_name}",
                f"Sheet: {sheet_name}",
                f"Row: {i}",
                "Document Type: excel",
                f"Source Type: {self._source_type}",
            ]
            if title_val:
                context.append(f"Title: {title_val}")

            full_content = "\n".join(context) + "\n======\n\n" + content_body

            # Metadata
            doc_meta = {
                "filename": file_name,
                "file_path": str(path_hint),
                "sheet": sheet_name,
                "row_index": i,
                # fresh list per document so metadata dicts do not share state
                "columns": list(df.columns),
                "content_type": "row",
                "output_format": self.output_format,
            }

            meta = self.create_metadata(
                path=path_hint,
                doctype="excel",
                source_type="excel_row",
                doc_metadata=doc_meta,
            )

            docs.append(
                self.create_document(full_content, path_hint, meta)
            )

        return docs
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
####
|
|
2
|
+
# Copyright 2023 Jesus Lara.
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
#
|
|
5
|
+
# Loaders.
|
|
6
|
+
# Open, extract and load data from different sources.
|
|
7
|
+
#####
|
|
8
|
+
from .pdf import PDFLoader
|
|
9
|
+
from .txt import TextLoader
|
|
10
|
+
from .docx import MSWordLoader
|
|
11
|
+
from .qa import QAFileLoader
|
|
12
|
+
from .html import HTMLLoader
|
|
13
|
+
from .pdfmark import PDFMarkdownLoader
|
|
14
|
+
from .pdftables import PDFTablesLoader
|
|
15
|
+
from .csv import CSVLoader
|
|
16
|
+
from .youtube import YoutubeLoader
|
|
17
|
+
from .web import WebLoader
|
|
18
|
+
from .ppt import PowerPointLoader
|
|
19
|
+
from .markdown import MarkdownLoader
|
|
20
|
+
from .epubloader import EpubLoader
|
|
21
|
+
from .excel import ExcelLoader
|
|
22
|
+
# from .video import VideoLoader
|
|
23
|
+
from .videolocal import VideoLocalLoader
|
|
24
|
+
from .videounderstanding import VideoUnderstandingLoader
|
|
25
|
+
# from .vimeo import VimeoLoader
|
|
26
|
+
from .audio import AudioLoader
|
|
27
|
+
|
|
28
|
+
# Registry of loader classes keyed by suffix string. Suffixes served by the
# same loader are grouped; insertion order matches the flat listing this
# replaces.
AVAILABLE_LOADERS = {
    '.pdf': PDFLoader,
    '.txt': TextLoader,
    '.docx': MSWordLoader,
    '.qa': QAFileLoader,
    **dict.fromkeys(('.xlsx', '.xlsm', '.xls'), ExcelLoader),
    '.html': HTMLLoader,
    '.pdfmd': PDFMarkdownLoader,
    '.pdftables': PDFTablesLoader,
    '.csv': CSVLoader,
    '.youtube': YoutubeLoader,
    '.web': WebLoader,
    **dict.fromkeys(('.ppt', '.pptx'), PowerPointLoader),
    **dict.fromkeys(('.md', '.json', '.xml'), MarkdownLoader),
    '.epub': EpubLoader,
    **dict.fromkeys(('.mp3', '.wav'), AudioLoader),
    **dict.fromkeys(
        ('.avi', '.mp4', '.webm', '.mov', '.mkv'),
        VideoUnderstandingLoader,
    ),
}
|
|
File without changes
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from typing import Optional, Any
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from navconfig.logging import logging
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class FilePlugin(ABC):
    """
    FilePlugin is a base class for opening files.

    It provides a common interface for opening all kinds of files.
    Subclasses must implement the `read` coroutine. They may also define
    `open` and `close` coroutines; when present, these are awaited on entry
    to and exit from ``async with`` respectively.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the plugin and create its class-specific logger.

        Positional and keyword arguments are accepted but not used here.
        """
        self.logger = logging.getLogger(
            f'parrot.FileLoader.{self.__class__.__name__}'
        )

    async def __aenter__(self):
        """Await ``self.open()`` if the subclass defines it, then return self."""
        if hasattr(self, "open"):
            await self.open()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """
        Await ``self.close()`` if the subclass defines it.

        Returns False so that an exception raised inside the ``async with``
        body propagates to the caller; returning a truthy value here would
        silently swallow every error.
        """
        if hasattr(self, "close"):
            await self.close()
        return False

    @abstractmethod
    async def read(self):
        """
        Return the content of the file, need to be implemented in the subclass.
        """
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from bs4 import BeautifulSoup
|
|
2
|
+
from .text import TextFile
|
|
3
|
+
|
|
4
|
+
class HTMLFile(TextFile):
    """
    A class to handle HTML files asynchronously.
    """
    async def read(self) -> tuple:
        """
        Asynchronously read and parse the content of the html file.

        Opens the file first when ``self._file`` is still None.

        Returns:
            A ``(soup, content)`` pair: the BeautifulSoup object built with
            the ``html.parser`` backend, and the raw content it was parsed
            from.

        Raises:
            Exception: any error raised while reading or parsing is logged
                and re-raised unchanged.
        """
        if self._file is None:
            await self.open()

        try:
            content = await self._file.read()
            soup = BeautifulSoup(content, 'html.parser')
            return soup, content
        except Exception as e:
            self.logger.error(
                f"Error reading HTML file {self.path}: {e}"
            )
            raise
|