ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentui/.prettierrc +15 -0
- agentui/QUICKSTART.md +272 -0
- agentui/README.md +59 -0
- agentui/env.example +16 -0
- agentui/jsconfig.json +14 -0
- agentui/package-lock.json +4242 -0
- agentui/package.json +34 -0
- agentui/scripts/postinstall/apply-patches.mjs +260 -0
- agentui/src/app.css +61 -0
- agentui/src/app.d.ts +13 -0
- agentui/src/app.html +12 -0
- agentui/src/components/LoadingSpinner.svelte +64 -0
- agentui/src/components/ThemeSwitcher.svelte +159 -0
- agentui/src/components/index.js +4 -0
- agentui/src/lib/api/bots.ts +60 -0
- agentui/src/lib/api/chat.ts +22 -0
- agentui/src/lib/api/http.ts +25 -0
- agentui/src/lib/components/BotCard.svelte +33 -0
- agentui/src/lib/components/ChatBubble.svelte +63 -0
- agentui/src/lib/components/Toast.svelte +21 -0
- agentui/src/lib/config.ts +20 -0
- agentui/src/lib/stores/auth.svelte.ts +73 -0
- agentui/src/lib/stores/theme.svelte.js +64 -0
- agentui/src/lib/stores/toast.svelte.ts +31 -0
- agentui/src/lib/utils/conversation.ts +39 -0
- agentui/src/routes/+layout.svelte +20 -0
- agentui/src/routes/+page.svelte +232 -0
- agentui/src/routes/login/+page.svelte +200 -0
- agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
- agentui/src/routes/talk/[agentId]/+page.ts +7 -0
- agentui/static/README.md +1 -0
- agentui/svelte.config.js +11 -0
- agentui/tailwind.config.ts +53 -0
- agentui/tsconfig.json +3 -0
- agentui/vite.config.ts +10 -0
- ai_parrot-0.17.2.dist-info/METADATA +472 -0
- ai_parrot-0.17.2.dist-info/RECORD +535 -0
- ai_parrot-0.17.2.dist-info/WHEEL +6 -0
- ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
- ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
- ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
- crew-builder/.prettierrc +15 -0
- crew-builder/QUICKSTART.md +259 -0
- crew-builder/README.md +113 -0
- crew-builder/env.example +17 -0
- crew-builder/jsconfig.json +14 -0
- crew-builder/package-lock.json +4182 -0
- crew-builder/package.json +37 -0
- crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
- crew-builder/src/app.css +62 -0
- crew-builder/src/app.d.ts +13 -0
- crew-builder/src/app.html +12 -0
- crew-builder/src/components/LoadingSpinner.svelte +64 -0
- crew-builder/src/components/ThemeSwitcher.svelte +149 -0
- crew-builder/src/components/index.js +9 -0
- crew-builder/src/lib/api/bots.ts +60 -0
- crew-builder/src/lib/api/chat.ts +80 -0
- crew-builder/src/lib/api/client.ts +56 -0
- crew-builder/src/lib/api/crew/crew.ts +136 -0
- crew-builder/src/lib/api/index.ts +5 -0
- crew-builder/src/lib/api/o365/auth.ts +65 -0
- crew-builder/src/lib/auth/auth.ts +54 -0
- crew-builder/src/lib/components/AgentNode.svelte +43 -0
- crew-builder/src/lib/components/BotCard.svelte +33 -0
- crew-builder/src/lib/components/ChatBubble.svelte +67 -0
- crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
- crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
- crew-builder/src/lib/components/JsonViewer.svelte +24 -0
- crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
- crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
- crew-builder/src/lib/components/Toast.svelte +67 -0
- crew-builder/src/lib/components/Toolbar.svelte +157 -0
- crew-builder/src/lib/components/index.ts +10 -0
- crew-builder/src/lib/config.ts +8 -0
- crew-builder/src/lib/stores/auth.svelte.ts +228 -0
- crew-builder/src/lib/stores/crewStore.ts +369 -0
- crew-builder/src/lib/stores/theme.svelte.js +145 -0
- crew-builder/src/lib/stores/toast.svelte.ts +69 -0
- crew-builder/src/lib/utils/conversation.ts +39 -0
- crew-builder/src/lib/utils/markdown.ts +122 -0
- crew-builder/src/lib/utils/talkHistory.ts +47 -0
- crew-builder/src/routes/+layout.svelte +20 -0
- crew-builder/src/routes/+page.svelte +539 -0
- crew-builder/src/routes/agents/+page.svelte +247 -0
- crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
- crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
- crew-builder/src/routes/builder/+page.svelte +204 -0
- crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
- crew-builder/src/routes/crew/ask/+page.ts +1 -0
- crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
- crew-builder/src/routes/login/+page.svelte +197 -0
- crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
- crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
- crew-builder/static/README.md +1 -0
- crew-builder/svelte.config.js +11 -0
- crew-builder/tailwind.config.ts +53 -0
- crew-builder/tsconfig.json +3 -0
- crew-builder/vite.config.ts +10 -0
- mcp_servers/calculator_server.py +309 -0
- parrot/__init__.py +27 -0
- parrot/__pycache__/__init__.cpython-310.pyc +0 -0
- parrot/__pycache__/version.cpython-310.pyc +0 -0
- parrot/_version.py +34 -0
- parrot/a2a/__init__.py +48 -0
- parrot/a2a/client.py +658 -0
- parrot/a2a/discovery.py +89 -0
- parrot/a2a/mixin.py +257 -0
- parrot/a2a/models.py +376 -0
- parrot/a2a/server.py +770 -0
- parrot/agents/__init__.py +29 -0
- parrot/bots/__init__.py +12 -0
- parrot/bots/a2a_agent.py +19 -0
- parrot/bots/abstract.py +3139 -0
- parrot/bots/agent.py +1129 -0
- parrot/bots/basic.py +9 -0
- parrot/bots/chatbot.py +669 -0
- parrot/bots/data.py +1618 -0
- parrot/bots/database/__init__.py +5 -0
- parrot/bots/database/abstract.py +3071 -0
- parrot/bots/database/cache.py +286 -0
- parrot/bots/database/models.py +468 -0
- parrot/bots/database/prompts.py +154 -0
- parrot/bots/database/retries.py +98 -0
- parrot/bots/database/router.py +269 -0
- parrot/bots/database/sql.py +41 -0
- parrot/bots/db/__init__.py +6 -0
- parrot/bots/db/abstract.py +556 -0
- parrot/bots/db/bigquery.py +602 -0
- parrot/bots/db/cache.py +85 -0
- parrot/bots/db/documentdb.py +668 -0
- parrot/bots/db/elastic.py +1014 -0
- parrot/bots/db/influx.py +898 -0
- parrot/bots/db/mock.py +96 -0
- parrot/bots/db/multi.py +783 -0
- parrot/bots/db/prompts.py +185 -0
- parrot/bots/db/sql.py +1255 -0
- parrot/bots/db/tools.py +212 -0
- parrot/bots/document.py +680 -0
- parrot/bots/hrbot.py +15 -0
- parrot/bots/kb.py +170 -0
- parrot/bots/mcp.py +36 -0
- parrot/bots/orchestration/README.md +463 -0
- parrot/bots/orchestration/__init__.py +1 -0
- parrot/bots/orchestration/agent.py +155 -0
- parrot/bots/orchestration/crew.py +3330 -0
- parrot/bots/orchestration/fsm.py +1179 -0
- parrot/bots/orchestration/hr.py +434 -0
- parrot/bots/orchestration/storage/__init__.py +4 -0
- parrot/bots/orchestration/storage/memory.py +100 -0
- parrot/bots/orchestration/storage/mixin.py +119 -0
- parrot/bots/orchestration/verify.py +202 -0
- parrot/bots/product.py +204 -0
- parrot/bots/prompts/__init__.py +96 -0
- parrot/bots/prompts/agents.py +155 -0
- parrot/bots/prompts/data.py +216 -0
- parrot/bots/prompts/output_generation.py +8 -0
- parrot/bots/scraper/__init__.py +3 -0
- parrot/bots/scraper/models.py +122 -0
- parrot/bots/scraper/scraper.py +1173 -0
- parrot/bots/scraper/templates.py +115 -0
- parrot/bots/stores/__init__.py +5 -0
- parrot/bots/stores/local.py +172 -0
- parrot/bots/webdev.py +81 -0
- parrot/cli.py +17 -0
- parrot/clients/__init__.py +16 -0
- parrot/clients/base.py +1491 -0
- parrot/clients/claude.py +1191 -0
- parrot/clients/factory.py +129 -0
- parrot/clients/google.py +4567 -0
- parrot/clients/gpt.py +1975 -0
- parrot/clients/grok.py +432 -0
- parrot/clients/groq.py +986 -0
- parrot/clients/hf.py +582 -0
- parrot/clients/models.py +18 -0
- parrot/conf.py +395 -0
- parrot/embeddings/__init__.py +9 -0
- parrot/embeddings/base.py +157 -0
- parrot/embeddings/google.py +98 -0
- parrot/embeddings/huggingface.py +74 -0
- parrot/embeddings/openai.py +84 -0
- parrot/embeddings/processor.py +88 -0
- parrot/exceptions.c +13868 -0
- parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/exceptions.pxd +22 -0
- parrot/exceptions.pxi +15 -0
- parrot/exceptions.pyx +44 -0
- parrot/generators/__init__.py +29 -0
- parrot/generators/base.py +200 -0
- parrot/generators/html.py +293 -0
- parrot/generators/react.py +205 -0
- parrot/generators/streamlit.py +203 -0
- parrot/generators/template.py +105 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/agent.py +861 -0
- parrot/handlers/agents/__init__.py +1 -0
- parrot/handlers/agents/abstract.py +900 -0
- parrot/handlers/bots.py +338 -0
- parrot/handlers/chat.py +915 -0
- parrot/handlers/creation.sql +192 -0
- parrot/handlers/crew/ARCHITECTURE.md +362 -0
- parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
- parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
- parrot/handlers/crew/__init__.py +0 -0
- parrot/handlers/crew/handler.py +801 -0
- parrot/handlers/crew/models.py +229 -0
- parrot/handlers/crew/redis_persistence.py +523 -0
- parrot/handlers/jobs/__init__.py +10 -0
- parrot/handlers/jobs/job.py +384 -0
- parrot/handlers/jobs/mixin.py +627 -0
- parrot/handlers/jobs/models.py +115 -0
- parrot/handlers/jobs/worker.py +31 -0
- parrot/handlers/models.py +596 -0
- parrot/handlers/o365_auth.py +105 -0
- parrot/handlers/stream.py +337 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/aws.py +143 -0
- parrot/interfaces/credentials.py +113 -0
- parrot/interfaces/database.py +27 -0
- parrot/interfaces/google.py +1123 -0
- parrot/interfaces/hierarchy.py +1227 -0
- parrot/interfaces/http.py +651 -0
- parrot/interfaces/images/__init__.py +0 -0
- parrot/interfaces/images/plugins/__init__.py +24 -0
- parrot/interfaces/images/plugins/abstract.py +58 -0
- parrot/interfaces/images/plugins/analisys.py +148 -0
- parrot/interfaces/images/plugins/classify.py +150 -0
- parrot/interfaces/images/plugins/classifybase.py +182 -0
- parrot/interfaces/images/plugins/detect.py +150 -0
- parrot/interfaces/images/plugins/exif.py +1103 -0
- parrot/interfaces/images/plugins/hash.py +52 -0
- parrot/interfaces/images/plugins/vision.py +104 -0
- parrot/interfaces/images/plugins/yolo.py +66 -0
- parrot/interfaces/images/plugins/zerodetect.py +197 -0
- parrot/interfaces/o365.py +978 -0
- parrot/interfaces/onedrive.py +822 -0
- parrot/interfaces/sharepoint.py +1435 -0
- parrot/interfaces/soap.py +257 -0
- parrot/loaders/__init__.py +8 -0
- parrot/loaders/abstract.py +1131 -0
- parrot/loaders/audio.py +199 -0
- parrot/loaders/basepdf.py +53 -0
- parrot/loaders/basevideo.py +1568 -0
- parrot/loaders/csv.py +409 -0
- parrot/loaders/docx.py +116 -0
- parrot/loaders/epubloader.py +316 -0
- parrot/loaders/excel.py +199 -0
- parrot/loaders/factory.py +55 -0
- parrot/loaders/files/__init__.py +0 -0
- parrot/loaders/files/abstract.py +39 -0
- parrot/loaders/files/html.py +26 -0
- parrot/loaders/files/text.py +63 -0
- parrot/loaders/html.py +152 -0
- parrot/loaders/markdown.py +442 -0
- parrot/loaders/pdf.py +373 -0
- parrot/loaders/pdfmark.py +320 -0
- parrot/loaders/pdftables.py +506 -0
- parrot/loaders/ppt.py +476 -0
- parrot/loaders/qa.py +63 -0
- parrot/loaders/splitters/__init__.py +10 -0
- parrot/loaders/splitters/base.py +138 -0
- parrot/loaders/splitters/md.py +228 -0
- parrot/loaders/splitters/token.py +143 -0
- parrot/loaders/txt.py +26 -0
- parrot/loaders/video.py +89 -0
- parrot/loaders/videolocal.py +218 -0
- parrot/loaders/videounderstanding.py +377 -0
- parrot/loaders/vimeo.py +167 -0
- parrot/loaders/web.py +599 -0
- parrot/loaders/youtube.py +504 -0
- parrot/manager/__init__.py +5 -0
- parrot/manager/manager.py +1030 -0
- parrot/mcp/__init__.py +28 -0
- parrot/mcp/adapter.py +105 -0
- parrot/mcp/cli.py +174 -0
- parrot/mcp/client.py +119 -0
- parrot/mcp/config.py +75 -0
- parrot/mcp/integration.py +842 -0
- parrot/mcp/oauth.py +933 -0
- parrot/mcp/server.py +225 -0
- parrot/mcp/transports/__init__.py +3 -0
- parrot/mcp/transports/base.py +279 -0
- parrot/mcp/transports/grpc_session.py +163 -0
- parrot/mcp/transports/http.py +312 -0
- parrot/mcp/transports/mcp.proto +108 -0
- parrot/mcp/transports/quic.py +1082 -0
- parrot/mcp/transports/sse.py +330 -0
- parrot/mcp/transports/stdio.py +309 -0
- parrot/mcp/transports/unix.py +395 -0
- parrot/mcp/transports/websocket.py +547 -0
- parrot/memory/__init__.py +16 -0
- parrot/memory/abstract.py +209 -0
- parrot/memory/agent.py +32 -0
- parrot/memory/cache.py +175 -0
- parrot/memory/core.py +555 -0
- parrot/memory/file.py +153 -0
- parrot/memory/mem.py +131 -0
- parrot/memory/redis.py +613 -0
- parrot/models/__init__.py +46 -0
- parrot/models/basic.py +118 -0
- parrot/models/compliance.py +208 -0
- parrot/models/crew.py +395 -0
- parrot/models/detections.py +654 -0
- parrot/models/generation.py +85 -0
- parrot/models/google.py +223 -0
- parrot/models/groq.py +23 -0
- parrot/models/openai.py +30 -0
- parrot/models/outputs.py +285 -0
- parrot/models/responses.py +938 -0
- parrot/notifications/__init__.py +743 -0
- parrot/openapi/__init__.py +3 -0
- parrot/openapi/components.yaml +641 -0
- parrot/openapi/config.py +322 -0
- parrot/outputs/__init__.py +32 -0
- parrot/outputs/formats/__init__.py +108 -0
- parrot/outputs/formats/altair.py +359 -0
- parrot/outputs/formats/application.py +122 -0
- parrot/outputs/formats/base.py +351 -0
- parrot/outputs/formats/bokeh.py +356 -0
- parrot/outputs/formats/card.py +424 -0
- parrot/outputs/formats/chart.py +436 -0
- parrot/outputs/formats/d3.py +255 -0
- parrot/outputs/formats/echarts.py +310 -0
- parrot/outputs/formats/generators/__init__.py +0 -0
- parrot/outputs/formats/generators/abstract.py +61 -0
- parrot/outputs/formats/generators/panel.py +145 -0
- parrot/outputs/formats/generators/streamlit.py +86 -0
- parrot/outputs/formats/generators/terminal.py +63 -0
- parrot/outputs/formats/holoviews.py +310 -0
- parrot/outputs/formats/html.py +147 -0
- parrot/outputs/formats/jinja2.py +46 -0
- parrot/outputs/formats/json.py +87 -0
- parrot/outputs/formats/map.py +933 -0
- parrot/outputs/formats/markdown.py +172 -0
- parrot/outputs/formats/matplotlib.py +237 -0
- parrot/outputs/formats/mixins/__init__.py +0 -0
- parrot/outputs/formats/mixins/emaps.py +855 -0
- parrot/outputs/formats/plotly.py +341 -0
- parrot/outputs/formats/seaborn.py +310 -0
- parrot/outputs/formats/table.py +397 -0
- parrot/outputs/formats/template_report.py +138 -0
- parrot/outputs/formats/yaml.py +125 -0
- parrot/outputs/formatter.py +152 -0
- parrot/outputs/templates/__init__.py +95 -0
- parrot/pipelines/__init__.py +0 -0
- parrot/pipelines/abstract.py +210 -0
- parrot/pipelines/detector.py +124 -0
- parrot/pipelines/models.py +90 -0
- parrot/pipelines/planogram.py +3002 -0
- parrot/pipelines/table.sql +97 -0
- parrot/plugins/__init__.py +106 -0
- parrot/plugins/importer.py +80 -0
- parrot/py.typed +0 -0
- parrot/registry/__init__.py +18 -0
- parrot/registry/registry.py +594 -0
- parrot/scheduler/__init__.py +1189 -0
- parrot/scheduler/models.py +60 -0
- parrot/security/__init__.py +16 -0
- parrot/security/prompt_injection.py +268 -0
- parrot/security/security_events.sql +25 -0
- parrot/services/__init__.py +1 -0
- parrot/services/mcp/__init__.py +8 -0
- parrot/services/mcp/config.py +13 -0
- parrot/services/mcp/server.py +295 -0
- parrot/services/o365_remote_auth.py +235 -0
- parrot/stores/__init__.py +7 -0
- parrot/stores/abstract.py +352 -0
- parrot/stores/arango.py +1090 -0
- parrot/stores/bigquery.py +1377 -0
- parrot/stores/cache.py +106 -0
- parrot/stores/empty.py +10 -0
- parrot/stores/faiss_store.py +1157 -0
- parrot/stores/kb/__init__.py +9 -0
- parrot/stores/kb/abstract.py +68 -0
- parrot/stores/kb/cache.py +165 -0
- parrot/stores/kb/doc.py +325 -0
- parrot/stores/kb/hierarchy.py +346 -0
- parrot/stores/kb/local.py +457 -0
- parrot/stores/kb/prompt.py +28 -0
- parrot/stores/kb/redis.py +659 -0
- parrot/stores/kb/store.py +115 -0
- parrot/stores/kb/user.py +374 -0
- parrot/stores/models.py +59 -0
- parrot/stores/pgvector.py +3 -0
- parrot/stores/postgres.py +2853 -0
- parrot/stores/utils/__init__.py +0 -0
- parrot/stores/utils/chunking.py +197 -0
- parrot/telemetry/__init__.py +3 -0
- parrot/telemetry/mixin.py +111 -0
- parrot/template/__init__.py +3 -0
- parrot/template/engine.py +259 -0
- parrot/tools/__init__.py +23 -0
- parrot/tools/abstract.py +644 -0
- parrot/tools/agent.py +363 -0
- parrot/tools/arangodbsearch.py +537 -0
- parrot/tools/arxiv_tool.py +188 -0
- parrot/tools/calculator/__init__.py +3 -0
- parrot/tools/calculator/operations/__init__.py +38 -0
- parrot/tools/calculator/operations/calculus.py +80 -0
- parrot/tools/calculator/operations/statistics.py +76 -0
- parrot/tools/calculator/tool.py +150 -0
- parrot/tools/cloudwatch.py +988 -0
- parrot/tools/codeinterpreter/__init__.py +127 -0
- parrot/tools/codeinterpreter/executor.py +371 -0
- parrot/tools/codeinterpreter/internals.py +473 -0
- parrot/tools/codeinterpreter/models.py +643 -0
- parrot/tools/codeinterpreter/prompts.py +224 -0
- parrot/tools/codeinterpreter/tool.py +664 -0
- parrot/tools/company_info/__init__.py +6 -0
- parrot/tools/company_info/tool.py +1138 -0
- parrot/tools/correlationanalysis.py +437 -0
- parrot/tools/database/abstract.py +286 -0
- parrot/tools/database/bq.py +115 -0
- parrot/tools/database/cache.py +284 -0
- parrot/tools/database/models.py +95 -0
- parrot/tools/database/pg.py +343 -0
- parrot/tools/databasequery.py +1159 -0
- parrot/tools/db.py +1800 -0
- parrot/tools/ddgo.py +370 -0
- parrot/tools/decorators.py +271 -0
- parrot/tools/dftohtml.py +282 -0
- parrot/tools/document.py +549 -0
- parrot/tools/ecs.py +819 -0
- parrot/tools/edareport.py +368 -0
- parrot/tools/elasticsearch.py +1049 -0
- parrot/tools/employees.py +462 -0
- parrot/tools/epson/__init__.py +96 -0
- parrot/tools/excel.py +683 -0
- parrot/tools/file/__init__.py +13 -0
- parrot/tools/file/abstract.py +76 -0
- parrot/tools/file/gcs.py +378 -0
- parrot/tools/file/local.py +284 -0
- parrot/tools/file/s3.py +511 -0
- parrot/tools/file/tmp.py +309 -0
- parrot/tools/file/tool.py +501 -0
- parrot/tools/file_reader.py +129 -0
- parrot/tools/flowtask/__init__.py +19 -0
- parrot/tools/flowtask/tool.py +761 -0
- parrot/tools/gittoolkit.py +508 -0
- parrot/tools/google/__init__.py +18 -0
- parrot/tools/google/base.py +169 -0
- parrot/tools/google/tools.py +1251 -0
- parrot/tools/googlelocation.py +5 -0
- parrot/tools/googleroutes.py +5 -0
- parrot/tools/googlesearch.py +5 -0
- parrot/tools/googlesitesearch.py +5 -0
- parrot/tools/googlevoice.py +2 -0
- parrot/tools/gvoice.py +695 -0
- parrot/tools/ibisworld/README.md +225 -0
- parrot/tools/ibisworld/__init__.py +11 -0
- parrot/tools/ibisworld/tool.py +366 -0
- parrot/tools/jiratoolkit.py +1718 -0
- parrot/tools/manager.py +1098 -0
- parrot/tools/math.py +152 -0
- parrot/tools/metadata.py +476 -0
- parrot/tools/msteams.py +1621 -0
- parrot/tools/msword.py +635 -0
- parrot/tools/multidb.py +580 -0
- parrot/tools/multistoresearch.py +369 -0
- parrot/tools/networkninja.py +167 -0
- parrot/tools/nextstop/__init__.py +4 -0
- parrot/tools/nextstop/base.py +286 -0
- parrot/tools/nextstop/employee.py +733 -0
- parrot/tools/nextstop/store.py +462 -0
- parrot/tools/notification.py +435 -0
- parrot/tools/o365/__init__.py +42 -0
- parrot/tools/o365/base.py +295 -0
- parrot/tools/o365/bundle.py +522 -0
- parrot/tools/o365/events.py +554 -0
- parrot/tools/o365/mail.py +992 -0
- parrot/tools/o365/onedrive.py +497 -0
- parrot/tools/o365/sharepoint.py +641 -0
- parrot/tools/openapi_toolkit.py +904 -0
- parrot/tools/openweather.py +527 -0
- parrot/tools/pdfprint.py +1001 -0
- parrot/tools/powerbi.py +518 -0
- parrot/tools/powerpoint.py +1113 -0
- parrot/tools/pricestool.py +146 -0
- parrot/tools/products/__init__.py +246 -0
- parrot/tools/prophet_tool.py +171 -0
- parrot/tools/pythonpandas.py +630 -0
- parrot/tools/pythonrepl.py +910 -0
- parrot/tools/qsource.py +436 -0
- parrot/tools/querytoolkit.py +395 -0
- parrot/tools/quickeda.py +827 -0
- parrot/tools/resttool.py +553 -0
- parrot/tools/retail/__init__.py +0 -0
- parrot/tools/retail/bby.py +528 -0
- parrot/tools/sandboxtool.py +703 -0
- parrot/tools/sassie/__init__.py +352 -0
- parrot/tools/scraping/__init__.py +7 -0
- parrot/tools/scraping/docs/select.md +466 -0
- parrot/tools/scraping/documentation.md +1278 -0
- parrot/tools/scraping/driver.py +436 -0
- parrot/tools/scraping/models.py +576 -0
- parrot/tools/scraping/options.py +85 -0
- parrot/tools/scraping/orchestrator.py +517 -0
- parrot/tools/scraping/readme.md +740 -0
- parrot/tools/scraping/tool.py +3115 -0
- parrot/tools/seasonaldetection.py +642 -0
- parrot/tools/shell_tool/__init__.py +5 -0
- parrot/tools/shell_tool/actions.py +408 -0
- parrot/tools/shell_tool/engine.py +155 -0
- parrot/tools/shell_tool/models.py +322 -0
- parrot/tools/shell_tool/tool.py +442 -0
- parrot/tools/site_search.py +214 -0
- parrot/tools/textfile.py +418 -0
- parrot/tools/think.py +378 -0
- parrot/tools/toolkit.py +298 -0
- parrot/tools/webapp_tool.py +187 -0
- parrot/tools/whatif.py +1279 -0
- parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
- parrot/tools/workday/__init__.py +6 -0
- parrot/tools/workday/models.py +1389 -0
- parrot/tools/workday/tool.py +1293 -0
- parrot/tools/yfinance_tool.py +306 -0
- parrot/tools/zipcode.py +217 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/helpers.py +73 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.c +12078 -0
- parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/parsers/toml.pyx +21 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpp +20936 -0
- parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/types.pyx +213 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- parrot/yaml-rs/Cargo.lock +350 -0
- parrot/yaml-rs/Cargo.toml +19 -0
- parrot/yaml-rs/pyproject.toml +19 -0
- parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
- parrot/yaml-rs/src/lib.rs +222 -0
- requirements/docker-compose.yml +24 -0
- requirements/requirements-dev.txt +21 -0
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import aiofiles
|
|
2
|
+
from .abstract import FilePlugin
|
|
3
|
+
|
|
4
|
+
class TextFile(FilePlugin):
    """
    Asynchronous reader for a single text file, backed by ``aiofiles``.
    """

    def __init__(self, path: str, encoding: str = 'utf-8'):
        """
        Record where the file lives and how to decode it.

        The file itself is not opened here; see ``open``.

        Args:
            path: Path to the text file.
            encoding: Encoding used when opening the file (default: utf-8).
        """
        super().__init__()
        self.path = path
        self.encoding = encoding
        # Handle returned by aiofiles; None whenever no file is open.
        self._file = None

    async def open(self):
        """
        Open the file in text read mode and keep the handle on the instance.

        Any failure is logged and then re-raised unchanged.
        """
        try:
            self._file = await aiofiles.open(
                self.path,
                mode='r',
                encoding=self.encoding
            )
            self.logger.debug(f"Successfully opened file: {self.path}")
        except Exception as err:
            self.logger.error(f"Error opening file {self.path}: {str(err)}")
            raise

    async def close(self):
        """
        Close the file if one is open; do nothing otherwise.

        The stored handle is cleared even when closing fails, and the
        failure is logged and re-raised.
        """
        if self._file is None:
            return
        try:
            await self._file.close()
            self.logger.debug(f"Successfully closed file: {self.path}")
        except Exception as err:
            self.logger.error(f"Error closing file {self.path}: {str(err)}")
            raise
        finally:
            self._file = None

    async def read(self) -> str:
        """
        Read the file content, opening the file first when necessary.

        Returns:
            Whatever the underlying handle's ``read()`` yields (text, since
            the file is opened in mode 'r').

        Read failures are logged and re-raised.
        """
        if self._file is None:
            await self.open()

        try:
            return await self._file.read()
        except Exception as err:
            self.logger.error(f"Error reading file {self.path}: {str(err)}")
            raise
|
parrot/loaders/html.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
from typing import Union, List, Callable, Any
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from pathlib import PurePath
|
|
4
|
+
from markdownify import markdownify as md
|
|
5
|
+
from ..stores.models import Document
|
|
6
|
+
from .abstract import AbstractLoader
|
|
7
|
+
from .files.html import HTMLFile
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class HTMLLoader(AbstractLoader):
    """
    Loader for HTML files to convert into Parrot Documents.

    Processes HTML files, extracts relevant content, converts to Markdown,
    and associates metadata with each document.
    """

    extensions: List[str] = ['.html', '.htm']

    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'html',
        language: str = "eng",
        chunk_size: int = 1024,
        chunk_overlap: int = 10,
        **kwargs
    ):
        """
        Initialize the HTMLLoader.

        Args:
            path: Path of the HTML file to load.
            tokenizer: Forwarded to the base loader.
            text_splitter: Forwarded to the base loader.
            source_type: Forwarded to the base loader (default: 'html').
            language: Forwarded to the base loader (default: 'eng').
            chunk_size: Chunk size handed to the markdown splitter.
            chunk_overlap: Chunk overlap handed to the markdown splitter.
            **kwargs: Remaining options for the base loader. The optional
                ``elements`` entry is consumed here: a list of dicts mapping
                a tag name to a CSS class (a leading '.' is tolerated) that
                restricts extraction to the matching elements.
        """
        # Taken out of kwargs before they are forwarded to the base class.
        self.elements: list = kwargs.pop('elements', [])
        super().__init__(
            path=path,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            language=language,
            **kwargs
        )
        # Initialize markdown splitter
        self._splitter = self._get_markdown_splitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

    async def _load(self, path: Union[str, PurePath, List[PurePath]], **kwargs) -> List[Document]:
        """
        Load an HTML file and turn it into Parrot Documents.

        For every extracted element (the configured ``elements`` matches, or
        the whole body/document when nothing matches) this produces:
        one Document with the element converted to Markdown, one Document
        with the summary (when a summary could be generated), and one
        Document per text chunk produced by the splitter.

        Args:
            path: The path to the HTML file.

        Returns:
            list: A list of Parrot Documents.

        Raises:
            ValueError: If no top-level element can be determined.
        """
        docs = []
        # Accept plain string paths as well: only PurePath objects expose `.name`.
        file_name = PurePath(path).name if isinstance(path, str) else path.name

        async with HTMLFile(path) as file:
            # The second item returned by read() is not needed here.
            soup, _ = await file.read()
            # Prefer the <body>; fall back to the whole parsed document.
            top_element = soup.body or soup
            if not top_element:
                raise ValueError(
                    "The HTML file does not contain a <body> or Top element tag."
                )

            extracted_elements = []
            if self.elements:
                # Extract content from specific elements
                for element in self.elements:
                    for tag, selector in element.items():
                        extracted_elements.extend(
                            top_element.find_all(tag, class_=selector.lstrip('.'))
                        )
            if not extracted_elements:
                # Nothing requested or nothing matched: process everything.
                extracted_elements = [top_element]

            # Process each extracted element
            for elem in extracted_elements:
                # Get the plain text content
                text = elem.get_text(separator="\n", strip=True)

                # Summary generation is best-effort: a failure is logged
                # and the summary document is simply skipped.
                try:
                    summary = self.summary_from_text(text)
                except Exception as e:
                    if self.logger:
                        self.logger.error(f"Error generating summary: {e}")
                    summary = None

                # Context header prepended to every content document.
                document_context = f"File Name: {file_name}\n"
                document_context += f"Document Type: {self.doctype}\n"
                document_context += f"Source Type: {self._source_type}\n"
                document_context += f"Element: {elem.name}\n"

                # Convert the element to Markdown for better structure
                markdown_content = md(str(elem))

                # Metadata preparation
                document_meta = self.create_metadata(
                    path=path,
                    doctype=self.doctype,
                    source_type=self._source_type,
                    doc_metadata={
                        "type": "html",
                        "category": self.category,
                    }
                )

                # One Document holding the full Markdown of the element
                docs.append(
                    Document(
                        page_content=document_context + markdown_content,
                        metadata=document_meta
                    )
                )

                # Create a document from summary (if any):
                if summary:
                    docs.append(
                        Document(
                            page_content=summary,
                            metadata={
                                **document_meta,
                                "source": str(path),
                                "timestamp": datetime.now().isoformat(),
                            }
                        )
                    )

                # splitting the content:
                try:
                    chunks = self._splitter.split_text(text)
                    self.logger.info(f"Split document into {len(chunks)} chunks")
                except Exception as e:
                    self.logger.error(
                        f"Failed to split text: {e}"
                    )
                    # Fallback: use the entire text as one chunk
                    chunks = [text]

                # Each chunk gets its own copy of the metadata dict.
                docs.extend(
                    Document(
                        page_content=document_context + chunk,
                        metadata={**document_meta}
                    )
                    for chunk in chunks
                )

        # Hand back everything collected; the original returned an empty
        # list here, discarding all generated documents.
        return docs
|
|
@@ -0,0 +1,442 @@
|
|
|
1
|
+
from collections.abc import Callable
|
|
2
|
+
from typing import List, Optional, Union
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path, PurePath
|
|
5
|
+
from markitdown import MarkItDown
|
|
6
|
+
from ..stores.models import Document
|
|
7
|
+
from .abstract import AbstractLoader
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class MarkdownLoader(AbstractLoader):
|
|
11
|
+
"""
|
|
12
|
+
Universal Document Loader using MarkItDown library.
|
|
13
|
+
|
|
14
|
+
Converts various document formats to markdown and returns Document objects.
|
|
15
|
+
Supports:
|
|
16
|
+
- PDF files
|
|
17
|
+
- PowerPoint presentations (.pptx, .ppt)
|
|
18
|
+
- Word documents (.docx, .doc)
|
|
19
|
+
- Excel spreadsheets (.xlsx, .xls, .csv)
|
|
20
|
+
- HTML files
|
|
21
|
+
- Text-based formats (CSV, JSON, XML)
|
|
22
|
+
- Images with OCR (if enabled)
|
|
23
|
+
- Audio files (if enabled)
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
# Supported extensions based on MarkItDown capabilities
|
|
27
|
+
extensions: List[str] = {
|
|
28
|
+
'.pdf', '.docx', '.doc', '.pptx', '.ppt', '.xlsx', '.xls',
|
|
29
|
+
'.csv', '.html', '.htm', '.xml', '.json', '.txt', '.md',
|
|
30
|
+
'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', # Images (with OCR)
|
|
31
|
+
'.mp3', '.wav', '.m4a', '.flac' # Audio (with transcription)
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
def __init__(
    self,
    source: Optional[Union[str, Path, List[Union[str, Path]]]] = None,
    *,
    tokenizer: Union[str, Callable] = None,
    text_splitter: Union[str, Callable] = None,
    source_type: str = 'file',
    enable_plugins: bool = True,
    enable_ocr: bool = False,
    enable_audio: bool = False,
    use_chapters: bool = False,
    use_sections: bool = False,
    merge_consecutive_headers: bool = True,
    min_section_length: int = 50,
    **kwargs
):
    """
    Set up the loader and create its MarkItDown converter.

    Args:
        source: Path or list of paths to load from
        tokenizer: Tokenizer to use for text processing
        text_splitter: Text splitter to use
        source_type: Type of source ('file', 'url', etc.)
        enable_plugins: Passed to MarkItDown as ``enable_plugins``
        enable_ocr: Stored flag for OCR on images
        enable_audio: Stored flag for audio transcription
        use_chapters: Split by chapters/major sections
        use_sections: Split by all sections
        merge_consecutive_headers: Merge consecutive headers with their content
        min_section_length: Minimum length for a section to be considered valid
        **kwargs: Additional arguments passed to AbstractLoader
    """
    super().__init__(
        source,
        tokenizer=tokenizer,
        text_splitter=text_splitter,
        source_type=source_type,
        **kwargs
    )

    # Identity of the documents this loader emits.
    self.doctype = 'markdown'
    self._source_type = source_type

    # Conversion feature switches.
    self.enable_plugins = enable_plugins
    self.enable_ocr = enable_ocr
    self.enable_audio = enable_audio

    # Section-splitting behaviour.
    self.use_chapters = use_chapters
    self.use_sections = use_sections
    self.merge_consecutive_headers = merge_consecutive_headers
    self.min_section_length = min_section_length

    # Must run last: the converter setup reads self.enable_plugins.
    self._setup_markitdown()
|
|
87
|
+
|
|
88
|
+
def _setup_markitdown(self):
    """
    Build the MarkItDown converter and store it on ``self.md_converter``.

    The converter is created with ``enable_plugins`` taken from this loader.
    Any exception raised while building it is logged and then re-raised.
    """
    try:
        converter = MarkItDown(enable_plugins=self.enable_plugins)
        self.md_converter = converter
        self.logger.info("MarkItDown converter initialized successfully")
    except Exception as exc:
        # Log for diagnostics, but let the caller see the original failure.
        self.logger.error(f"Failed to initialize MarkItDown: {exc}")
        raise
|
|
96
|
+
|
|
97
|
+
def _detect_document_type(self, path: PurePath) -> str:
|
|
98
|
+
"""Detect the type of document based on file extension."""
|
|
99
|
+
suffix = path.suffix.lower()
|
|
100
|
+
|
|
101
|
+
type_mapping = {
|
|
102
|
+
'.pdf': 'pdf',
|
|
103
|
+
'.docx': 'word', '.doc': 'word',
|
|
104
|
+
'.pptx': 'powerpoint', '.ppt': 'powerpoint',
|
|
105
|
+
'.xlsx': 'excel', '.xls': 'excel',
|
|
106
|
+
'.csv': 'csv',
|
|
107
|
+
'.html': 'html', '.htm': 'html',
|
|
108
|
+
'.xml': 'xml',
|
|
109
|
+
'.json': 'json',
|
|
110
|
+
'.txt': 'text', '.md': 'markdown',
|
|
111
|
+
'.png': 'image', '.jpg': 'image', '.jpeg': 'image',
|
|
112
|
+
'.gif': 'image', '.bmp': 'image', '.tiff': 'image',
|
|
113
|
+
'.mp3': 'audio', '.wav': 'audio', '.m4a': 'audio', '.flac': 'audio'
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
return type_mapping.get(suffix, 'unknown')
|
|
117
|
+
|
|
118
|
+
def _extract_sections_from_markdown(self, md_text: str) -> List[dict]:
|
|
119
|
+
"""
|
|
120
|
+
Extract sections from markdown text based on headers.
|
|
121
|
+
|
|
122
|
+
Args:
|
|
123
|
+
md_text: Markdown text content
|
|
124
|
+
|
|
125
|
+
Returns:
|
|
126
|
+
List of section dictionaries with 'title', 'content', 'level', and 'section_number'
|
|
127
|
+
"""
|
|
128
|
+
sections = []
|
|
129
|
+
lines = md_text.split('\n')
|
|
130
|
+
current_section = None
|
|
131
|
+
current_content = []
|
|
132
|
+
section_counter = 0
|
|
133
|
+
|
|
134
|
+
for line in lines:
|
|
135
|
+
# Check if line is a header
|
|
136
|
+
header_match = re.match(r'^(#{1,6})\s+(.+)$', line.strip())
|
|
137
|
+
|
|
138
|
+
if header_match:
|
|
139
|
+
# Save previous section if it exists
|
|
140
|
+
if current_section and current_content:
|
|
141
|
+
content = '\n'.join(current_content).strip()
|
|
142
|
+
if len(content) >= self.min_section_length:
|
|
143
|
+
current_section['content'] = content
|
|
144
|
+
sections.append(current_section)
|
|
145
|
+
|
|
146
|
+
# Start new section
|
|
147
|
+
level = len(header_match.group(1))
|
|
148
|
+
title = header_match.group(2).strip()
|
|
149
|
+
section_counter += 1
|
|
150
|
+
|
|
151
|
+
# Determine if this should be included based on settings
|
|
152
|
+
include_section = False
|
|
153
|
+
if self.use_chapters and level <= 2: # H1 and H2 for chapters
|
|
154
|
+
include_section = True
|
|
155
|
+
elif self.use_sections and level <= 4: # H1-H4 for sections
|
|
156
|
+
include_section = True
|
|
157
|
+
elif not self.use_chapters and not self.use_sections:
|
|
158
|
+
include_section = True # Include all if no specific setting
|
|
159
|
+
|
|
160
|
+
if include_section:
|
|
161
|
+
current_section = {
|
|
162
|
+
'title': title,
|
|
163
|
+
'level': level,
|
|
164
|
+
'section_number': section_counter,
|
|
165
|
+
'header_line': line
|
|
166
|
+
}
|
|
167
|
+
current_content = []
|
|
168
|
+
|
|
169
|
+
# Include the header in content if merging
|
|
170
|
+
if self.merge_consecutive_headers:
|
|
171
|
+
current_content.append(line)
|
|
172
|
+
else:
|
|
173
|
+
current_section = None
|
|
174
|
+
current_content = []
|
|
175
|
+
else:
|
|
176
|
+
# Add line to current section content
|
|
177
|
+
if current_section is not None:
|
|
178
|
+
current_content.append(line)
|
|
179
|
+
|
|
180
|
+
# Handle the last section
|
|
181
|
+
if current_section and current_content:
|
|
182
|
+
content = '\n'.join(current_content).strip()
|
|
183
|
+
if len(content) >= self.min_section_length:
|
|
184
|
+
current_section['content'] = content
|
|
185
|
+
sections.append(current_section)
|
|
186
|
+
|
|
187
|
+
return sections
|
|
188
|
+
|
|
189
|
+
def _clean_markdown_content(self, content: str) -> str:
|
|
190
|
+
"""
|
|
191
|
+
Clean and normalize markdown content.
|
|
192
|
+
|
|
193
|
+
Args:
|
|
194
|
+
content: Raw markdown content
|
|
195
|
+
|
|
196
|
+
Returns:
|
|
197
|
+
Cleaned markdown content
|
|
198
|
+
"""
|
|
199
|
+
if not content:
|
|
200
|
+
return ""
|
|
201
|
+
|
|
202
|
+
# Remove excessive blank lines
|
|
203
|
+
content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
|
|
204
|
+
|
|
205
|
+
# Remove trailing whitespace from lines
|
|
206
|
+
lines = [line.rstrip() for line in content.split('\n')]
|
|
207
|
+
content = '\n'.join(lines)
|
|
208
|
+
|
|
209
|
+
# Ensure proper spacing around headers
|
|
210
|
+
content = re.sub(r'(^|\n)(#{1,6}\s+[^\n]+)(\n)', r'\1\n\2\n\n', content)
|
|
211
|
+
|
|
212
|
+
return content.strip()
|
|
213
|
+
|
|
214
|
+
def _extract_metadata_from_markdown(self, md_text: str, file_path: PurePath) -> dict:
|
|
215
|
+
"""
|
|
216
|
+
Extract metadata from markdown content and file.
|
|
217
|
+
|
|
218
|
+
Args:
|
|
219
|
+
md_text: Markdown text content
|
|
220
|
+
file_path: Path to the source file
|
|
221
|
+
|
|
222
|
+
Returns:
|
|
223
|
+
Dictionary containing extracted metadata
|
|
224
|
+
"""
|
|
225
|
+
metadata = {}
|
|
226
|
+
|
|
227
|
+
# Extract frontmatter if present
|
|
228
|
+
frontmatter_match = re.match(r'^---\n(.*?)\n---\n', md_text, re.DOTALL)
|
|
229
|
+
if frontmatter_match:
|
|
230
|
+
try:
|
|
231
|
+
import yaml
|
|
232
|
+
frontmatter = yaml.safe_load(frontmatter_match.group(1))
|
|
233
|
+
if isinstance(frontmatter, dict):
|
|
234
|
+
metadata.update(frontmatter)
|
|
235
|
+
except (ImportError, yaml.YAMLError):
|
|
236
|
+
self.logger.warning("Could not parse frontmatter metadata")
|
|
237
|
+
|
|
238
|
+
# Extract title from first header if not in frontmatter
|
|
239
|
+
if 'title' not in metadata:
|
|
240
|
+
title_match = re.search(r'^#\s+(.+)$', md_text, re.MULTILINE)
|
|
241
|
+
if title_match:
|
|
242
|
+
metadata['title'] = title_match.group(1).strip()
|
|
243
|
+
|
|
244
|
+
# Count various elements
|
|
245
|
+
metadata.update({
|
|
246
|
+
'word_count': len(md_text.split()),
|
|
247
|
+
'header_count': len(re.findall(r'^#{1,6}\s+', md_text, re.MULTILINE)),
|
|
248
|
+
'table_count': len(re.findall(r'^\|.*\|$', md_text, re.MULTILINE)),
|
|
249
|
+
'code_block_count': len(re.findall(r'```', md_text)) // 2,
|
|
250
|
+
'link_count': len(re.findall(r'\[.*?\]\(.*?\)', md_text)),
|
|
251
|
+
'image_count': len(re.findall(r'!\[.*?\]\(.*?\)', md_text))
|
|
252
|
+
})
|
|
253
|
+
|
|
254
|
+
return metadata
|
|
255
|
+
|
|
256
|
+
async def _load(self, path: PurePath, **kwargs) -> List[Document]:
    """
    Convert one file to markdown with MarkItDown and wrap it in Documents.

    The converted text is cleaned, then either split into one Document per
    extracted section (when ``use_chapters`` or ``use_sections`` is set and
    at least one section was found) or returned as a single full-document
    Document. When summarization is enabled and a summary is produced, it
    is appended as a final Document.

    Args:
        path: Path to the file to load
        **kwargs: Additional arguments (not used by this method)

    Returns:
        List of Document objects; empty when the conversion yields no text

    Raises:
        Exception: Any error raised while processing is logged and re-raised
    """
    self.logger.info(f"Loading file with MarkItDown: {path}")
    documents = []

    try:
        # Convert file to markdown using MarkItDown
        conversion = self.md_converter.convert(str(path))
        if not (conversion and conversion.text_content):
            self.logger.warning(f"No content extracted from {path}")
            return documents

        markdown = self._clean_markdown_content(conversion.text_content)

        # Metadata shared by every Document produced from this file
        doc_type = self._detect_document_type(path)
        extracted_metadata = self._extract_metadata_from_markdown(markdown, path)

        # Only look for sections when a split mode was requested
        sections = []
        if self.use_chapters or self.use_sections:
            sections = self._extract_sections_from_markdown(markdown)
            self.logger.info(f"Extracted {len(sections)} sections from {path}")
            if not sections:
                self.logger.info(f"No sections found in {path}, treating as single document")

        if sections:
            section_type = "chapter" if self.use_chapters else "section"
            for section in sections:
                section_meta = {
                    "filename": path.name,
                    "file_path": str(path),
                    "document_type": doc_type,
                    "section_title": section['title'],
                    "section_number": section['section_number'],
                    "header_level": section['level'],
                    "content_type": section_type,
                    "extracted_metadata": extracted_metadata,
                }
                # Extracted keys are also flattened in and win on clashes.
                section_meta.update(extracted_metadata)

                metadata = self.create_metadata(
                    path=path,
                    doctype="markdown",
                    source_type=f"markitdown_{section_type}",
                    doc_metadata=section_meta,
                )
                section_doc = self.create_document(
                    content=section['content'],
                    path=path,
                    metadata=metadata
                )
                documents.append(section_doc)
        else:
            # Whole markdown as a single document
            self._create_single_document(
                documents, markdown, path, doc_type, extracted_metadata
            )

        # Generate summary if enabled
        if self._summarization and documents:
            combined_text = "\n\n".join([item.page_content for item in documents])
            summary = await self.summary_from_text(combined_text)

            if summary:
                summary_info = {
                    "summary_for_sections": len(documents),
                    "document_type": doc_type,
                }
                summary_info.update(extracted_metadata)

                summary_meta = self.create_metadata(
                    path=path,
                    doctype="markdown",
                    source_type="markitdown_summary",
                    doc_metadata=summary_info
                )
                summary_doc = self.create_document(
                    content=f"SUMMARY:\n\n{summary}",
                    path=path,
                    metadata=summary_meta
                )
                documents.append(summary_doc)

    except Exception as exc:
        self.logger.error(f"Error processing {path} with MarkItDown: {exc}")
        # Could optionally fall back to reading as plain text
        raise

    return documents
|
|
360
|
+
|
|
361
|
+
def _create_single_document(
    self,
    docs: List[Document],
    md_text: str,
    path: PurePath,
    doc_type: str,
    extracted_metadata: dict
):
    """
    Append one full-document Document for the markdown text to ``docs``.

    Args:
        docs: List that receives the new Document (modified in place)
        md_text: Markdown text used as the document content
        path: Path to the source file
        doc_type: Document type label stored in the metadata
        extracted_metadata: Metadata stored both nested under
            'extracted_metadata' and flattened into the top level
    """
    full_doc_meta = {
        "filename": path.name,
        "file_path": str(path),
        "document_type": doc_type,
        "content_type": "full_document",
        "extracted_metadata": extracted_metadata,
    }
    # Flattened keys come last, so they win over the fixed keys on clashes.
    full_doc_meta.update(extracted_metadata)

    metadata = self.create_metadata(
        path=path,
        doctype="markdown",
        source_type="markitdown_full",
        doc_metadata=full_doc_meta,
    )
    document = self.create_document(
        content=md_text,
        path=path,
        metadata=metadata
    )
    docs.append(document)
|
|
393
|
+
|
|
394
|
+
def get_supported_formats(self) -> dict:
    """
    Describe the file formats this loader lists as supported.

    Returns:
        A new dictionary mapping each format category to its list of file
        extensions (lower-case, with the leading dot)
    """
    return dict(
        documents=['.pdf', '.docx', '.doc'],
        presentations=['.pptx', '.ppt'],
        spreadsheets=['.xlsx', '.xls', '.csv'],
        web=['.html', '.htm'],
        data=['.xml', '.json'],
        text=['.txt', '.md'],
        images=['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'],
        audio=['.mp3', '.wav', '.m4a', '.flac'],
    )
|
|
411
|
+
|
|
412
|
+
def validate_file_support(self, path: Union[str, Path]) -> bool:
    """
    Tell whether a file's extension is one this loader accepts.

    Args:
        path: File path to check, as a string or a path object

    Returns:
        True if the lower-cased suffix is in ``self.extensions``,
        False otherwise
    """
    candidate = Path(path) if isinstance(path, str) else path
    suffix = candidate.suffix.lower()
    return suffix in self.extensions
|
|
426
|
+
|
|
427
|
+
async def convert_to_markdown(self, path: Union[str, Path]) -> str:
    """
    Convert a single file to cleaned markdown text.

    This is best-effort: any exception raised during conversion or cleaning
    is logged and an empty string is returned instead of propagating.

    Args:
        path: Path to file to convert

    Returns:
        Cleaned markdown content as string, or "" when the converter
        returns nothing or an error occurs
    """
    try:
        converted = self.md_converter.convert(str(path))
        if not converted:
            return ""
        return self._clean_markdown_content(converted.text_content)
    except Exception as exc:
        self.logger.error(f"Error converting {path} to markdown: {exc}")
        return ""
|