ai-parrot 0.17.2__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentui/.prettierrc +15 -0
- agentui/QUICKSTART.md +272 -0
- agentui/README.md +59 -0
- agentui/env.example +16 -0
- agentui/jsconfig.json +14 -0
- agentui/package-lock.json +4242 -0
- agentui/package.json +34 -0
- agentui/scripts/postinstall/apply-patches.mjs +260 -0
- agentui/src/app.css +61 -0
- agentui/src/app.d.ts +13 -0
- agentui/src/app.html +12 -0
- agentui/src/components/LoadingSpinner.svelte +64 -0
- agentui/src/components/ThemeSwitcher.svelte +159 -0
- agentui/src/components/index.js +4 -0
- agentui/src/lib/api/bots.ts +60 -0
- agentui/src/lib/api/chat.ts +22 -0
- agentui/src/lib/api/http.ts +25 -0
- agentui/src/lib/components/BotCard.svelte +33 -0
- agentui/src/lib/components/ChatBubble.svelte +63 -0
- agentui/src/lib/components/Toast.svelte +21 -0
- agentui/src/lib/config.ts +20 -0
- agentui/src/lib/stores/auth.svelte.ts +73 -0
- agentui/src/lib/stores/theme.svelte.js +64 -0
- agentui/src/lib/stores/toast.svelte.ts +31 -0
- agentui/src/lib/utils/conversation.ts +39 -0
- agentui/src/routes/+layout.svelte +20 -0
- agentui/src/routes/+page.svelte +232 -0
- agentui/src/routes/login/+page.svelte +200 -0
- agentui/src/routes/talk/[agentId]/+page.svelte +297 -0
- agentui/src/routes/talk/[agentId]/+page.ts +7 -0
- agentui/static/README.md +1 -0
- agentui/svelte.config.js +11 -0
- agentui/tailwind.config.ts +53 -0
- agentui/tsconfig.json +3 -0
- agentui/vite.config.ts +10 -0
- ai_parrot-0.17.2.dist-info/METADATA +472 -0
- ai_parrot-0.17.2.dist-info/RECORD +535 -0
- ai_parrot-0.17.2.dist-info/WHEEL +6 -0
- ai_parrot-0.17.2.dist-info/entry_points.txt +2 -0
- ai_parrot-0.17.2.dist-info/licenses/LICENSE +21 -0
- ai_parrot-0.17.2.dist-info/top_level.txt +6 -0
- crew-builder/.prettierrc +15 -0
- crew-builder/QUICKSTART.md +259 -0
- crew-builder/README.md +113 -0
- crew-builder/env.example +17 -0
- crew-builder/jsconfig.json +14 -0
- crew-builder/package-lock.json +4182 -0
- crew-builder/package.json +37 -0
- crew-builder/scripts/postinstall/apply-patches.mjs +260 -0
- crew-builder/src/app.css +62 -0
- crew-builder/src/app.d.ts +13 -0
- crew-builder/src/app.html +12 -0
- crew-builder/src/components/LoadingSpinner.svelte +64 -0
- crew-builder/src/components/ThemeSwitcher.svelte +149 -0
- crew-builder/src/components/index.js +9 -0
- crew-builder/src/lib/api/bots.ts +60 -0
- crew-builder/src/lib/api/chat.ts +80 -0
- crew-builder/src/lib/api/client.ts +56 -0
- crew-builder/src/lib/api/crew/crew.ts +136 -0
- crew-builder/src/lib/api/index.ts +5 -0
- crew-builder/src/lib/api/o365/auth.ts +65 -0
- crew-builder/src/lib/auth/auth.ts +54 -0
- crew-builder/src/lib/components/AgentNode.svelte +43 -0
- crew-builder/src/lib/components/BotCard.svelte +33 -0
- crew-builder/src/lib/components/ChatBubble.svelte +67 -0
- crew-builder/src/lib/components/ConfigPanel.svelte +278 -0
- crew-builder/src/lib/components/JsonTreeNode.svelte +76 -0
- crew-builder/src/lib/components/JsonViewer.svelte +24 -0
- crew-builder/src/lib/components/MarkdownEditor.svelte +48 -0
- crew-builder/src/lib/components/ThemeToggle.svelte +36 -0
- crew-builder/src/lib/components/Toast.svelte +67 -0
- crew-builder/src/lib/components/Toolbar.svelte +157 -0
- crew-builder/src/lib/components/index.ts +10 -0
- crew-builder/src/lib/config.ts +8 -0
- crew-builder/src/lib/stores/auth.svelte.ts +228 -0
- crew-builder/src/lib/stores/crewStore.ts +369 -0
- crew-builder/src/lib/stores/theme.svelte.js +145 -0
- crew-builder/src/lib/stores/toast.svelte.ts +69 -0
- crew-builder/src/lib/utils/conversation.ts +39 -0
- crew-builder/src/lib/utils/markdown.ts +122 -0
- crew-builder/src/lib/utils/talkHistory.ts +47 -0
- crew-builder/src/routes/+layout.svelte +20 -0
- crew-builder/src/routes/+page.svelte +539 -0
- crew-builder/src/routes/agents/+page.svelte +247 -0
- crew-builder/src/routes/agents/[agentId]/+page.svelte +288 -0
- crew-builder/src/routes/agents/[agentId]/+page.ts +7 -0
- crew-builder/src/routes/builder/+page.svelte +204 -0
- crew-builder/src/routes/crew/ask/+page.svelte +1052 -0
- crew-builder/src/routes/crew/ask/+page.ts +1 -0
- crew-builder/src/routes/integrations/o365/+page.svelte +304 -0
- crew-builder/src/routes/login/+page.svelte +197 -0
- crew-builder/src/routes/talk/[agentId]/+page.svelte +487 -0
- crew-builder/src/routes/talk/[agentId]/+page.ts +7 -0
- crew-builder/static/README.md +1 -0
- crew-builder/svelte.config.js +11 -0
- crew-builder/tailwind.config.ts +53 -0
- crew-builder/tsconfig.json +3 -0
- crew-builder/vite.config.ts +10 -0
- mcp_servers/calculator_server.py +309 -0
- parrot/__init__.py +27 -0
- parrot/__pycache__/__init__.cpython-310.pyc +0 -0
- parrot/__pycache__/version.cpython-310.pyc +0 -0
- parrot/_version.py +34 -0
- parrot/a2a/__init__.py +48 -0
- parrot/a2a/client.py +658 -0
- parrot/a2a/discovery.py +89 -0
- parrot/a2a/mixin.py +257 -0
- parrot/a2a/models.py +376 -0
- parrot/a2a/server.py +770 -0
- parrot/agents/__init__.py +29 -0
- parrot/bots/__init__.py +12 -0
- parrot/bots/a2a_agent.py +19 -0
- parrot/bots/abstract.py +3139 -0
- parrot/bots/agent.py +1129 -0
- parrot/bots/basic.py +9 -0
- parrot/bots/chatbot.py +669 -0
- parrot/bots/data.py +1618 -0
- parrot/bots/database/__init__.py +5 -0
- parrot/bots/database/abstract.py +3071 -0
- parrot/bots/database/cache.py +286 -0
- parrot/bots/database/models.py +468 -0
- parrot/bots/database/prompts.py +154 -0
- parrot/bots/database/retries.py +98 -0
- parrot/bots/database/router.py +269 -0
- parrot/bots/database/sql.py +41 -0
- parrot/bots/db/__init__.py +6 -0
- parrot/bots/db/abstract.py +556 -0
- parrot/bots/db/bigquery.py +602 -0
- parrot/bots/db/cache.py +85 -0
- parrot/bots/db/documentdb.py +668 -0
- parrot/bots/db/elastic.py +1014 -0
- parrot/bots/db/influx.py +898 -0
- parrot/bots/db/mock.py +96 -0
- parrot/bots/db/multi.py +783 -0
- parrot/bots/db/prompts.py +185 -0
- parrot/bots/db/sql.py +1255 -0
- parrot/bots/db/tools.py +212 -0
- parrot/bots/document.py +680 -0
- parrot/bots/hrbot.py +15 -0
- parrot/bots/kb.py +170 -0
- parrot/bots/mcp.py +36 -0
- parrot/bots/orchestration/README.md +463 -0
- parrot/bots/orchestration/__init__.py +1 -0
- parrot/bots/orchestration/agent.py +155 -0
- parrot/bots/orchestration/crew.py +3330 -0
- parrot/bots/orchestration/fsm.py +1179 -0
- parrot/bots/orchestration/hr.py +434 -0
- parrot/bots/orchestration/storage/__init__.py +4 -0
- parrot/bots/orchestration/storage/memory.py +100 -0
- parrot/bots/orchestration/storage/mixin.py +119 -0
- parrot/bots/orchestration/verify.py +202 -0
- parrot/bots/product.py +204 -0
- parrot/bots/prompts/__init__.py +96 -0
- parrot/bots/prompts/agents.py +155 -0
- parrot/bots/prompts/data.py +216 -0
- parrot/bots/prompts/output_generation.py +8 -0
- parrot/bots/scraper/__init__.py +3 -0
- parrot/bots/scraper/models.py +122 -0
- parrot/bots/scraper/scraper.py +1173 -0
- parrot/bots/scraper/templates.py +115 -0
- parrot/bots/stores/__init__.py +5 -0
- parrot/bots/stores/local.py +172 -0
- parrot/bots/webdev.py +81 -0
- parrot/cli.py +17 -0
- parrot/clients/__init__.py +16 -0
- parrot/clients/base.py +1491 -0
- parrot/clients/claude.py +1191 -0
- parrot/clients/factory.py +129 -0
- parrot/clients/google.py +4567 -0
- parrot/clients/gpt.py +1975 -0
- parrot/clients/grok.py +432 -0
- parrot/clients/groq.py +986 -0
- parrot/clients/hf.py +582 -0
- parrot/clients/models.py +18 -0
- parrot/conf.py +395 -0
- parrot/embeddings/__init__.py +9 -0
- parrot/embeddings/base.py +157 -0
- parrot/embeddings/google.py +98 -0
- parrot/embeddings/huggingface.py +74 -0
- parrot/embeddings/openai.py +84 -0
- parrot/embeddings/processor.py +88 -0
- parrot/exceptions.c +13868 -0
- parrot/exceptions.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/exceptions.pxd +22 -0
- parrot/exceptions.pxi +15 -0
- parrot/exceptions.pyx +44 -0
- parrot/generators/__init__.py +29 -0
- parrot/generators/base.py +200 -0
- parrot/generators/html.py +293 -0
- parrot/generators/react.py +205 -0
- parrot/generators/streamlit.py +203 -0
- parrot/generators/template.py +105 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/agent.py +861 -0
- parrot/handlers/agents/__init__.py +1 -0
- parrot/handlers/agents/abstract.py +900 -0
- parrot/handlers/bots.py +338 -0
- parrot/handlers/chat.py +915 -0
- parrot/handlers/creation.sql +192 -0
- parrot/handlers/crew/ARCHITECTURE.md +362 -0
- parrot/handlers/crew/README_BOTMANAGER_PERSISTENCE.md +303 -0
- parrot/handlers/crew/README_REDIS_PERSISTENCE.md +366 -0
- parrot/handlers/crew/__init__.py +0 -0
- parrot/handlers/crew/handler.py +801 -0
- parrot/handlers/crew/models.py +229 -0
- parrot/handlers/crew/redis_persistence.py +523 -0
- parrot/handlers/jobs/__init__.py +10 -0
- parrot/handlers/jobs/job.py +384 -0
- parrot/handlers/jobs/mixin.py +627 -0
- parrot/handlers/jobs/models.py +115 -0
- parrot/handlers/jobs/worker.py +31 -0
- parrot/handlers/models.py +596 -0
- parrot/handlers/o365_auth.py +105 -0
- parrot/handlers/stream.py +337 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/aws.py +143 -0
- parrot/interfaces/credentials.py +113 -0
- parrot/interfaces/database.py +27 -0
- parrot/interfaces/google.py +1123 -0
- parrot/interfaces/hierarchy.py +1227 -0
- parrot/interfaces/http.py +651 -0
- parrot/interfaces/images/__init__.py +0 -0
- parrot/interfaces/images/plugins/__init__.py +24 -0
- parrot/interfaces/images/plugins/abstract.py +58 -0
- parrot/interfaces/images/plugins/analisys.py +148 -0
- parrot/interfaces/images/plugins/classify.py +150 -0
- parrot/interfaces/images/plugins/classifybase.py +182 -0
- parrot/interfaces/images/plugins/detect.py +150 -0
- parrot/interfaces/images/plugins/exif.py +1103 -0
- parrot/interfaces/images/plugins/hash.py +52 -0
- parrot/interfaces/images/plugins/vision.py +104 -0
- parrot/interfaces/images/plugins/yolo.py +66 -0
- parrot/interfaces/images/plugins/zerodetect.py +197 -0
- parrot/interfaces/o365.py +978 -0
- parrot/interfaces/onedrive.py +822 -0
- parrot/interfaces/sharepoint.py +1435 -0
- parrot/interfaces/soap.py +257 -0
- parrot/loaders/__init__.py +8 -0
- parrot/loaders/abstract.py +1131 -0
- parrot/loaders/audio.py +199 -0
- parrot/loaders/basepdf.py +53 -0
- parrot/loaders/basevideo.py +1568 -0
- parrot/loaders/csv.py +409 -0
- parrot/loaders/docx.py +116 -0
- parrot/loaders/epubloader.py +316 -0
- parrot/loaders/excel.py +199 -0
- parrot/loaders/factory.py +55 -0
- parrot/loaders/files/__init__.py +0 -0
- parrot/loaders/files/abstract.py +39 -0
- parrot/loaders/files/html.py +26 -0
- parrot/loaders/files/text.py +63 -0
- parrot/loaders/html.py +152 -0
- parrot/loaders/markdown.py +442 -0
- parrot/loaders/pdf.py +373 -0
- parrot/loaders/pdfmark.py +320 -0
- parrot/loaders/pdftables.py +506 -0
- parrot/loaders/ppt.py +476 -0
- parrot/loaders/qa.py +63 -0
- parrot/loaders/splitters/__init__.py +10 -0
- parrot/loaders/splitters/base.py +138 -0
- parrot/loaders/splitters/md.py +228 -0
- parrot/loaders/splitters/token.py +143 -0
- parrot/loaders/txt.py +26 -0
- parrot/loaders/video.py +89 -0
- parrot/loaders/videolocal.py +218 -0
- parrot/loaders/videounderstanding.py +377 -0
- parrot/loaders/vimeo.py +167 -0
- parrot/loaders/web.py +599 -0
- parrot/loaders/youtube.py +504 -0
- parrot/manager/__init__.py +5 -0
- parrot/manager/manager.py +1030 -0
- parrot/mcp/__init__.py +28 -0
- parrot/mcp/adapter.py +105 -0
- parrot/mcp/cli.py +174 -0
- parrot/mcp/client.py +119 -0
- parrot/mcp/config.py +75 -0
- parrot/mcp/integration.py +842 -0
- parrot/mcp/oauth.py +933 -0
- parrot/mcp/server.py +225 -0
- parrot/mcp/transports/__init__.py +3 -0
- parrot/mcp/transports/base.py +279 -0
- parrot/mcp/transports/grpc_session.py +163 -0
- parrot/mcp/transports/http.py +312 -0
- parrot/mcp/transports/mcp.proto +108 -0
- parrot/mcp/transports/quic.py +1082 -0
- parrot/mcp/transports/sse.py +330 -0
- parrot/mcp/transports/stdio.py +309 -0
- parrot/mcp/transports/unix.py +395 -0
- parrot/mcp/transports/websocket.py +547 -0
- parrot/memory/__init__.py +16 -0
- parrot/memory/abstract.py +209 -0
- parrot/memory/agent.py +32 -0
- parrot/memory/cache.py +175 -0
- parrot/memory/core.py +555 -0
- parrot/memory/file.py +153 -0
- parrot/memory/mem.py +131 -0
- parrot/memory/redis.py +613 -0
- parrot/models/__init__.py +46 -0
- parrot/models/basic.py +118 -0
- parrot/models/compliance.py +208 -0
- parrot/models/crew.py +395 -0
- parrot/models/detections.py +654 -0
- parrot/models/generation.py +85 -0
- parrot/models/google.py +223 -0
- parrot/models/groq.py +23 -0
- parrot/models/openai.py +30 -0
- parrot/models/outputs.py +285 -0
- parrot/models/responses.py +938 -0
- parrot/notifications/__init__.py +743 -0
- parrot/openapi/__init__.py +3 -0
- parrot/openapi/components.yaml +641 -0
- parrot/openapi/config.py +322 -0
- parrot/outputs/__init__.py +32 -0
- parrot/outputs/formats/__init__.py +108 -0
- parrot/outputs/formats/altair.py +359 -0
- parrot/outputs/formats/application.py +122 -0
- parrot/outputs/formats/base.py +351 -0
- parrot/outputs/formats/bokeh.py +356 -0
- parrot/outputs/formats/card.py +424 -0
- parrot/outputs/formats/chart.py +436 -0
- parrot/outputs/formats/d3.py +255 -0
- parrot/outputs/formats/echarts.py +310 -0
- parrot/outputs/formats/generators/__init__.py +0 -0
- parrot/outputs/formats/generators/abstract.py +61 -0
- parrot/outputs/formats/generators/panel.py +145 -0
- parrot/outputs/formats/generators/streamlit.py +86 -0
- parrot/outputs/formats/generators/terminal.py +63 -0
- parrot/outputs/formats/holoviews.py +310 -0
- parrot/outputs/formats/html.py +147 -0
- parrot/outputs/formats/jinja2.py +46 -0
- parrot/outputs/formats/json.py +87 -0
- parrot/outputs/formats/map.py +933 -0
- parrot/outputs/formats/markdown.py +172 -0
- parrot/outputs/formats/matplotlib.py +237 -0
- parrot/outputs/formats/mixins/__init__.py +0 -0
- parrot/outputs/formats/mixins/emaps.py +855 -0
- parrot/outputs/formats/plotly.py +341 -0
- parrot/outputs/formats/seaborn.py +310 -0
- parrot/outputs/formats/table.py +397 -0
- parrot/outputs/formats/template_report.py +138 -0
- parrot/outputs/formats/yaml.py +125 -0
- parrot/outputs/formatter.py +152 -0
- parrot/outputs/templates/__init__.py +95 -0
- parrot/pipelines/__init__.py +0 -0
- parrot/pipelines/abstract.py +210 -0
- parrot/pipelines/detector.py +124 -0
- parrot/pipelines/models.py +90 -0
- parrot/pipelines/planogram.py +3002 -0
- parrot/pipelines/table.sql +97 -0
- parrot/plugins/__init__.py +106 -0
- parrot/plugins/importer.py +80 -0
- parrot/py.typed +0 -0
- parrot/registry/__init__.py +18 -0
- parrot/registry/registry.py +594 -0
- parrot/scheduler/__init__.py +1189 -0
- parrot/scheduler/models.py +60 -0
- parrot/security/__init__.py +16 -0
- parrot/security/prompt_injection.py +268 -0
- parrot/security/security_events.sql +25 -0
- parrot/services/__init__.py +1 -0
- parrot/services/mcp/__init__.py +8 -0
- parrot/services/mcp/config.py +13 -0
- parrot/services/mcp/server.py +295 -0
- parrot/services/o365_remote_auth.py +235 -0
- parrot/stores/__init__.py +7 -0
- parrot/stores/abstract.py +352 -0
- parrot/stores/arango.py +1090 -0
- parrot/stores/bigquery.py +1377 -0
- parrot/stores/cache.py +106 -0
- parrot/stores/empty.py +10 -0
- parrot/stores/faiss_store.py +1157 -0
- parrot/stores/kb/__init__.py +9 -0
- parrot/stores/kb/abstract.py +68 -0
- parrot/stores/kb/cache.py +165 -0
- parrot/stores/kb/doc.py +325 -0
- parrot/stores/kb/hierarchy.py +346 -0
- parrot/stores/kb/local.py +457 -0
- parrot/stores/kb/prompt.py +28 -0
- parrot/stores/kb/redis.py +659 -0
- parrot/stores/kb/store.py +115 -0
- parrot/stores/kb/user.py +374 -0
- parrot/stores/models.py +59 -0
- parrot/stores/pgvector.py +3 -0
- parrot/stores/postgres.py +2853 -0
- parrot/stores/utils/__init__.py +0 -0
- parrot/stores/utils/chunking.py +197 -0
- parrot/telemetry/__init__.py +3 -0
- parrot/telemetry/mixin.py +111 -0
- parrot/template/__init__.py +3 -0
- parrot/template/engine.py +259 -0
- parrot/tools/__init__.py +23 -0
- parrot/tools/abstract.py +644 -0
- parrot/tools/agent.py +363 -0
- parrot/tools/arangodbsearch.py +537 -0
- parrot/tools/arxiv_tool.py +188 -0
- parrot/tools/calculator/__init__.py +3 -0
- parrot/tools/calculator/operations/__init__.py +38 -0
- parrot/tools/calculator/operations/calculus.py +80 -0
- parrot/tools/calculator/operations/statistics.py +76 -0
- parrot/tools/calculator/tool.py +150 -0
- parrot/tools/cloudwatch.py +988 -0
- parrot/tools/codeinterpreter/__init__.py +127 -0
- parrot/tools/codeinterpreter/executor.py +371 -0
- parrot/tools/codeinterpreter/internals.py +473 -0
- parrot/tools/codeinterpreter/models.py +643 -0
- parrot/tools/codeinterpreter/prompts.py +224 -0
- parrot/tools/codeinterpreter/tool.py +664 -0
- parrot/tools/company_info/__init__.py +6 -0
- parrot/tools/company_info/tool.py +1138 -0
- parrot/tools/correlationanalysis.py +437 -0
- parrot/tools/database/abstract.py +286 -0
- parrot/tools/database/bq.py +115 -0
- parrot/tools/database/cache.py +284 -0
- parrot/tools/database/models.py +95 -0
- parrot/tools/database/pg.py +343 -0
- parrot/tools/databasequery.py +1159 -0
- parrot/tools/db.py +1800 -0
- parrot/tools/ddgo.py +370 -0
- parrot/tools/decorators.py +271 -0
- parrot/tools/dftohtml.py +282 -0
- parrot/tools/document.py +549 -0
- parrot/tools/ecs.py +819 -0
- parrot/tools/edareport.py +368 -0
- parrot/tools/elasticsearch.py +1049 -0
- parrot/tools/employees.py +462 -0
- parrot/tools/epson/__init__.py +96 -0
- parrot/tools/excel.py +683 -0
- parrot/tools/file/__init__.py +13 -0
- parrot/tools/file/abstract.py +76 -0
- parrot/tools/file/gcs.py +378 -0
- parrot/tools/file/local.py +284 -0
- parrot/tools/file/s3.py +511 -0
- parrot/tools/file/tmp.py +309 -0
- parrot/tools/file/tool.py +501 -0
- parrot/tools/file_reader.py +129 -0
- parrot/tools/flowtask/__init__.py +19 -0
- parrot/tools/flowtask/tool.py +761 -0
- parrot/tools/gittoolkit.py +508 -0
- parrot/tools/google/__init__.py +18 -0
- parrot/tools/google/base.py +169 -0
- parrot/tools/google/tools.py +1251 -0
- parrot/tools/googlelocation.py +5 -0
- parrot/tools/googleroutes.py +5 -0
- parrot/tools/googlesearch.py +5 -0
- parrot/tools/googlesitesearch.py +5 -0
- parrot/tools/googlevoice.py +2 -0
- parrot/tools/gvoice.py +695 -0
- parrot/tools/ibisworld/README.md +225 -0
- parrot/tools/ibisworld/__init__.py +11 -0
- parrot/tools/ibisworld/tool.py +366 -0
- parrot/tools/jiratoolkit.py +1718 -0
- parrot/tools/manager.py +1098 -0
- parrot/tools/math.py +152 -0
- parrot/tools/metadata.py +476 -0
- parrot/tools/msteams.py +1621 -0
- parrot/tools/msword.py +635 -0
- parrot/tools/multidb.py +580 -0
- parrot/tools/multistoresearch.py +369 -0
- parrot/tools/networkninja.py +167 -0
- parrot/tools/nextstop/__init__.py +4 -0
- parrot/tools/nextstop/base.py +286 -0
- parrot/tools/nextstop/employee.py +733 -0
- parrot/tools/nextstop/store.py +462 -0
- parrot/tools/notification.py +435 -0
- parrot/tools/o365/__init__.py +42 -0
- parrot/tools/o365/base.py +295 -0
- parrot/tools/o365/bundle.py +522 -0
- parrot/tools/o365/events.py +554 -0
- parrot/tools/o365/mail.py +992 -0
- parrot/tools/o365/onedrive.py +497 -0
- parrot/tools/o365/sharepoint.py +641 -0
- parrot/tools/openapi_toolkit.py +904 -0
- parrot/tools/openweather.py +527 -0
- parrot/tools/pdfprint.py +1001 -0
- parrot/tools/powerbi.py +518 -0
- parrot/tools/powerpoint.py +1113 -0
- parrot/tools/pricestool.py +146 -0
- parrot/tools/products/__init__.py +246 -0
- parrot/tools/prophet_tool.py +171 -0
- parrot/tools/pythonpandas.py +630 -0
- parrot/tools/pythonrepl.py +910 -0
- parrot/tools/qsource.py +436 -0
- parrot/tools/querytoolkit.py +395 -0
- parrot/tools/quickeda.py +827 -0
- parrot/tools/resttool.py +553 -0
- parrot/tools/retail/__init__.py +0 -0
- parrot/tools/retail/bby.py +528 -0
- parrot/tools/sandboxtool.py +703 -0
- parrot/tools/sassie/__init__.py +352 -0
- parrot/tools/scraping/__init__.py +7 -0
- parrot/tools/scraping/docs/select.md +466 -0
- parrot/tools/scraping/documentation.md +1278 -0
- parrot/tools/scraping/driver.py +436 -0
- parrot/tools/scraping/models.py +576 -0
- parrot/tools/scraping/options.py +85 -0
- parrot/tools/scraping/orchestrator.py +517 -0
- parrot/tools/scraping/readme.md +740 -0
- parrot/tools/scraping/tool.py +3115 -0
- parrot/tools/seasonaldetection.py +642 -0
- parrot/tools/shell_tool/__init__.py +5 -0
- parrot/tools/shell_tool/actions.py +408 -0
- parrot/tools/shell_tool/engine.py +155 -0
- parrot/tools/shell_tool/models.py +322 -0
- parrot/tools/shell_tool/tool.py +442 -0
- parrot/tools/site_search.py +214 -0
- parrot/tools/textfile.py +418 -0
- parrot/tools/think.py +378 -0
- parrot/tools/toolkit.py +298 -0
- parrot/tools/webapp_tool.py +187 -0
- parrot/tools/whatif.py +1279 -0
- parrot/tools/workday/MULTI_WSDL_EXAMPLE.md +249 -0
- parrot/tools/workday/__init__.py +6 -0
- parrot/tools/workday/models.py +1389 -0
- parrot/tools/workday/tool.py +1293 -0
- parrot/tools/yfinance_tool.py +306 -0
- parrot/tools/zipcode.py +217 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/helpers.py +73 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.c +12078 -0
- parrot/utils/parsers/toml.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/parsers/toml.pyx +21 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpp +20936 -0
- parrot/utils/types.cpython-310-x86_64-linux-gnu.so +0 -0
- parrot/utils/types.pyx +213 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- parrot/yaml-rs/Cargo.lock +350 -0
- parrot/yaml-rs/Cargo.toml +19 -0
- parrot/yaml-rs/pyproject.toml +19 -0
- parrot/yaml-rs/python/yaml_rs/__init__.py +81 -0
- parrot/yaml-rs/src/lib.rs +222 -0
- requirements/docker-compose.yml +24 -0
- requirements/requirements-dev.txt +21 -0
|
@@ -0,0 +1,1131 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Generator, Union, List, Any, Optional, TypeVar
|
|
3
|
+
from collections.abc import Callable
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
import uuid
|
|
7
|
+
from pathlib import Path, PosixPath, PurePath
|
|
8
|
+
import asyncio
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from navconfig.logging import logging
|
|
11
|
+
from navigator.libs.json import JSONContent # pylint: disable=E0611
|
|
12
|
+
from ..stores.models import Document
|
|
13
|
+
## AI Models:
|
|
14
|
+
from ..models.google import GoogleModel
|
|
15
|
+
from ..models.groq import GroqModel
|
|
16
|
+
from ..clients.factory import LLMFactory
|
|
17
|
+
from .splitters import (
|
|
18
|
+
TokenTextSplitter,
|
|
19
|
+
MarkdownTextSplitter
|
|
20
|
+
)
|
|
21
|
+
from ..stores.utils.chunking import LateChunkingProcessor
|
|
22
|
+
from ..conf import (
|
|
23
|
+
DEFAULT_LLM_MODEL,
|
|
24
|
+
DEFAULT_LLM_TEMPERATURE,
|
|
25
|
+
DEFAULT_GROQ_MODEL,
|
|
26
|
+
CUDA_DEFAULT_DEVICE,
|
|
27
|
+
CUDA_DEFAULT_DEVICE_NUMBER
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
T = TypeVar('T')
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class AbstractLoader(ABC):
|
|
35
|
+
"""
|
|
36
|
+
Base class for all loaders.
|
|
37
|
+
Loaders are responsible for loading data from various sources.
|
|
38
|
+
"""
|
|
39
|
+
extensions: List[str] = ['.*']
|
|
40
|
+
skip_directories: List[str] = []
|
|
41
|
+
|
|
42
|
+
def __init__(
    self,
    source: Optional[Union[str, Path, List[Union[str, Path]]]] = None,
    *,
    tokenizer: Union[str, Callable] = None,
    text_splitter: Union[str, Callable] = None,
    source_type: str = 'file',
    **kwargs
):
    """
    Initialize the AbstractLoader.

    Args:
        source: Path, URL, or list of paths/URLs to load from
        tokenizer: Tokenizer to use (string model name or callable)
        text_splitter: Text splitter to use
        source_type: Type of source ('file', 'url', etc.)
        **kwargs: Additional keyword arguments for configuration
    """
    # Chunking parameters (defaults preserved from previous revision).
    self.chunk_size: int = kwargs.get('chunk_size', 800)
    self.chunk_overlap: int = kwargs.get('chunk_overlap', 100)
    self.token_size: int = kwargs.get('token_size', 20)
    # Bounds concurrent work done by subclasses.
    self.semaphore = asyncio.Semaphore(kwargs.get('semaphore', 10))
    self.extensions = kwargs.get('extensions', self.extensions)
    self.skip_directories = kwargs.get(
        'skip_directories',
        self.skip_directories
    )
    self.encoding = kwargs.get('encoding', 'utf-8')
    self._source_type = source_type
    self._recursive: bool = kwargs.get('recursive', False)
    self.category: str = kwargs.get('category', 'document')
    self.doctype: str = kwargs.get('doctype', 'text')
    # Chunking configuration
    self._use_markdown_splitter: bool = kwargs.get('use_markdown_splitter', True)
    self._use_huggingface_splitter: bool = kwargs.get('use_huggingface_splitter', False)
    self._auto_detect_content_type: bool = kwargs.get('auto_detect_content_type', True)

    # Advanced features
    self._summarization = kwargs.get('summarization', False)
    self._summary_model: Optional[Any] = kwargs.get('summary_model', None)
    self._use_summary_pipeline: bool = kwargs.get('use_summary_pipeline', False)
    self._use_translation_pipeline: bool = kwargs.get('use_translation_pipeline', False)
    self._translation = kwargs.get('translation', False)

    # Source/path initialization ('path' kwarg kept for backward compatibility).
    self.path = source if source is not None else kwargs.get('path')

    # Normalize a single path-like value to an absolute Path; lists (and None)
    # are kept as-is.
    # Fix: the original had two identical branches, one for str and one for
    # Path/PurePath, both doing Path(self.path).resolve() — collapsed here.
    if isinstance(self.path, (str, Path, PurePath)):
        self.path = Path(self.path).resolve()

    # Tokenizer
    self.tokenizer = tokenizer
    # Splitters start unset; _setup_text_splitters() selects the real ones.
    # Fix made explicit: the explicit `text_splitter` keyword parameter never
    # lands in **kwargs, so the original kwargs.get('text_splitter') always
    # yielded None here.
    self.text_splitter = None
    self.markdown_splitter = kwargs.get('markdown_splitter', None)

    # Initialize text splitter based on configuration
    self._setup_text_splitters(tokenizer, text_splitter, kwargs)

    # Summarization Model:
    self.summarization_model = kwargs.get('summarizer', None)
    # LLM (if required)
    self._setup_llm(kwargs)
    # Logger
    self.logger = logging.getLogger(
        f"Parrot.Loaders.{self.__class__.__name__}"
    )
    # JSON encoder:
    self._encoder = JSONContent()
    # Use CUDA if available:
    self._setup_device(kwargs)
|
|
121
|
+
|
|
122
|
+
def _get_token_splitter(
    self,
    model_name: str = "gpt-3.5-turbo",
    chunk_size: int = 4000,
    chunk_overlap: int = 200
) -> TokenTextSplitter:
    """Return the configured splitter, or build a fresh TokenTextSplitter.

    An explicitly configured ``self.text_splitter`` always takes
    precedence; otherwise a new splitter is created from the given
    settings.
    """
    configured = self.text_splitter
    if configured:
        return configured
    return TokenTextSplitter(
        model_name=model_name,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
|
|
136
|
+
|
|
137
|
+
def _get_markdown_splitter(
    self,
    chunk_size: int = 4000,
    chunk_overlap: int = 200,
    strip_headers: bool = False
) -> MarkdownTextSplitter:
    """Return the configured splitter, or build a fresh MarkdownTextSplitter.

    An explicitly configured ``self.text_splitter`` always takes
    precedence; otherwise a new markdown-aware splitter is created from
    the given settings.
    """
    configured = self.text_splitter
    if configured:
        return configured
    return MarkdownTextSplitter(
        strip_headers=strip_headers,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
|
|
151
|
+
|
|
152
|
+
def _create_hf_token_splitter(
    self,
    model_name: str,
    chunk_size: int = 4000,
    chunk_overlap: int = 200
) -> TokenTextSplitter:
    """Build a TokenTextSplitter backed by a HuggingFace tokenizer."""
    # Imported lazily: transformers is a heavy dependency and is only
    # needed when a HuggingFace splitter is explicitly requested.
    from transformers import AutoTokenizer
    hf_tokenizer = AutoTokenizer.from_pretrained(model_name)
    return TokenTextSplitter(
        tokenizer=hf_tokenizer,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
|
|
166
|
+
|
|
167
|
+
def _setup_text_splitters(self, tokenizer, text_splitter, kwargs):
    """Select the primary text splitter from the loader configuration.

    A markdown splitter is always built. The primary ``self.text_splitter``
    is then chosen in this order: the explicit ``text_splitter`` argument
    (when markdown mode is on), a HuggingFace-backed token splitter, or a
    plain TokenTextSplitter whose tokenizer depends on what was supplied.
    """
    # The markdown splitter is always available, whatever the primary choice.
    self.markdown_splitter = self._get_markdown_splitter(
        chunk_size=self.chunk_size,
        chunk_overlap=self.chunk_overlap,
    )

    if self._use_markdown_splitter:
        # An explicit splitter wins; otherwise fall back to the markdown one.
        self.text_splitter = text_splitter or self.markdown_splitter
        return

    if self._use_huggingface_splitter:
        self.text_splitter = self._create_hf_token_splitter(
            model_name=kwargs.get('model_name', 'gpt-3.5-turbo'),
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        return

    # Default: a TokenTextSplitter — its tokenizer depends on the argument.
    if isinstance(tokenizer, str):
        # A string names a model; delegate so a pre-set splitter can win.
        self.text_splitter = self._get_token_splitter(
            model_name=tokenizer,
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
    elif callable(tokenizer):
        # A callable is used directly as the tokenizer function.
        self.text_splitter = TokenTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            tokenizer_function=tokenizer,
        )
    else:
        # No tokenizer given: use the default model name.
        self.text_splitter = TokenTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            model_name=kwargs.get('model_name', 'gpt-3.5-turbo'),
        )
|
|
206
|
+
|
|
207
|
+
def _setup_llm(self, kwargs):
    """Read LLM-related options and instantiate a client when requested."""
    self._use_llm = kwargs.get('use_llm', False)
    self._llm_model = kwargs.get('llm_model', None)
    self._llm_model_kwargs = kwargs.get('model_kwargs', {})
    self._llm = kwargs.get('llm', None)
    if not self._use_llm:
        return
    # use_llm=True replaces any pre-built 'llm' passed via kwargs.
    self._llm = self.get_default_llm(
        model=self._llm_model,
        model_kwargs=self._llm_model_kwargs,
    )
|
|
218
|
+
|
|
219
|
+
def get_default_llm(
    self,
    model: str = None,
    model_kwargs: dict = None,
    use_groq: bool = False,
    use_openai: bool = False
) -> Any:
    """Return an AI client instance built via the LLMFactory.

    Args:
        model: Model identifier; falls back to the provider default
            (or DEFAULT_LLM_MODEL for the generic path) when omitted.
        model_kwargs: Generation parameters; a conservative default
            (temperature/top_k/top_p) is used when empty or None.
        use_groq: Route the request through the Groq provider.
        use_openai: Route the request through the OpenAI provider.

    Returns:
        A client instance as produced by ``LLMFactory.create``.
    """
    if not model_kwargs:
        model_kwargs = {
            "temperature": DEFAULT_LLM_TEMPERATURE,
            "top_k": 30,
            "top_p": 0.5,
        }
    if use_groq:
        # Fix: the original wrote f"groq:{model or DEFAULT_GROQ_MODEL}" inside
        # a branch only reached when `model` is truthy, so the fallback was
        # dead code. Simplified to mirror the openai branch; behavior unchanged.
        return LLMFactory.create(
            llm=f"groq:{model}" if model else "groq",
            model_kwargs=model_kwargs
        )
    if use_openai:
        return LLMFactory.create(
            llm=f"openai:{model}" if model else "openai",
            model_kwargs=model_kwargs
        )
    return LLMFactory.create(
        llm=model or DEFAULT_LLM_MODEL,
        model_kwargs=model_kwargs
    )
|
|
247
|
+
|
|
248
|
+
def _setup_device(self, kwargs):
    """Capture the requested compute device; resolution happens lazily."""
    self.device_name, self.cuda_number = (
        kwargs.get('device', CUDA_DEFAULT_DEVICE),
        kwargs.get('cuda_number', CUDA_DEFAULT_DEVICE_NUMBER),
    )
    # Resolved on demand by _get_device().
    self._device = None
|
|
253
|
+
|
|
254
|
+
def _get_device(
    self,
    device_type: str = None,
    cuda_number: int = 0
):
    """
    Get device configuration for Torch and transformers.

    Args:
        device_type: When 'cpu', CPU is forced regardless of available
            accelerators. Any other value falls through to auto-detection.
        cuda_number: Index of the CUDA device to use when CUDA is available.

    Returns:
        tuple: (pipeline_device_idx, torch_device, dtype)
        - pipeline_device_idx: int for HuggingFace pipeline (-1 for CPU, 0+ for GPU),
          or the string "mps" on Apple Silicon.
        - torch_device: torch.device object for model loading
        - dtype: torch data type for model weights
    """
    # Imported lazily so CPU-only deployments don't pay for torch at import time.
    import torch
    # Default values for CPU usage
    pipeline_idx = -1  # This is what HuggingFace pipeline expects for CPU
    torch_dev = torch.device("cpu")
    dtype = torch.float32

    # Check if we're forcing CPU usage globally (module constant) or per call
    if CUDA_DEFAULT_DEVICE == 'cpu' or device_type == 'cpu':
        # CPU is explicitly requested
        return -1, torch.device('cpu'), torch.float32

    # Check for CUDA availability and use it if possible
    if torch.cuda.is_available():
        # For GPU, pipeline wants an integer index
        pipeline_idx = cuda_number  # 0 for first GPU, 1 for second, etc.
        torch_dev = torch.device(f"cuda:{cuda_number}")

        # Choose the best dtype for this GPU: bf16 when the hardware
        # supports it, otherwise fp16.
        if torch.cuda.is_bf16_supported():
            dtype = torch.bfloat16
        else:
            dtype = torch.float16

        return pipeline_idx, torch_dev, dtype

    # Check for Apple Silicon GPU (MPS)
    if torch.backends.mps.is_available():
        # MPS is tricky - HuggingFace pipelines don't always support it well
        # We return "mps" as a string for pipeline, and torch.device for model
        # Note: callers may need to handle the string index specially.
        return "mps", torch.device("mps"), torch.float32

    # Fallback to CPU if nothing else is available
    return -1, torch.device("cpu"), torch.float32
|
|
302
|
+
|
|
303
|
+
def clear_cuda(self):
    """Drop tokenizer/splitter references and release cached CUDA memory."""
    # Release our own references first so their tensors become collectable.
    self.tokenizer = None
    self.text_splitter = None
    try:
        import torch

        # Wait for in-flight kernels, then hand cached blocks back.
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
    except Exception as e:
        # Best effort: CPU-only builds (or missing torch) land here.
        self.logger.warning(f"Error clearing CUDA memory: {e}")
|
|
312
|
+
|
|
313
|
+
async def __aenter__(self):
|
|
314
|
+
"""Open the loader if it has an open method."""
|
|
315
|
+
# Check if the loader has an open method and call it
|
|
316
|
+
if hasattr(self, "open"):
|
|
317
|
+
await self.open()
|
|
318
|
+
return self
|
|
319
|
+
|
|
320
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
321
|
+
"""Close the loader if it has a close method."""
|
|
322
|
+
if hasattr(self, "close"):
|
|
323
|
+
await self.close()
|
|
324
|
+
return True
|
|
325
|
+
|
|
326
|
+
def supported_extensions(self):
    """Return the collection of file extensions this loader accepts."""
    return self.extensions
|
|
329
|
+
|
|
330
|
+
def _detect_content_type(self, document: Document) -> str:
    """
    Guess a document's content type from its metadata and leading content.

    Args:
        document: Document to analyze

    Returns:
        Content type string: 'markdown', 'code', 'html' or 'text'.
    """
    if not self._auto_detect_content_type:
        return 'text'

    meta = document.metadata or {}
    fname = meta.get('filename', '').lower()
    src_type = meta.get('source_type', '').lower()

    # Extension/metadata hints, checked in priority order.
    if fname.endswith(('.md', '.markdown')):
        return 'markdown'
    if fname.endswith(('.py', '.pyx', '.js', '.java', '.cpp', '.c', '.go', '.rs')):
        return 'code'
    if fname.endswith(('.html', '.htm', '.xml')):
        return 'html'
    if src_type in ('markdown', 'md'):
        return 'markdown'

    # Content sniffing: count markdown markers in the first 1000 chars.
    head = document.page_content[:1000].lower()
    indicators = ('#', '```', '**', '*', '[', '](', '|', '---')
    hits = sum(marker in head for marker in indicators)

    # Several distinct markers present → call it markdown, else plain text.
    return 'markdown' if hits >= 3 else 'text'
|
|
370
|
+
|
|
371
|
+
def _select_splitter_for_content(self, content_type: str):
|
|
372
|
+
"""
|
|
373
|
+
Select the appropriate text splitter based on content type.
|
|
374
|
+
|
|
375
|
+
Args:
|
|
376
|
+
content_type: Detected or specified content type
|
|
377
|
+
|
|
378
|
+
Returns:
|
|
379
|
+
Appropriate text splitter
|
|
380
|
+
"""
|
|
381
|
+
if content_type == 'markdown':
|
|
382
|
+
return self.markdown_splitter
|
|
383
|
+
elif content_type == 'code':
|
|
384
|
+
# Use token splitter with smaller chunks for code
|
|
385
|
+
return TokenTextSplitter(
|
|
386
|
+
chunk_size=min(self.chunk_size, 2048),
|
|
387
|
+
chunk_overlap=self.chunk_overlap,
|
|
388
|
+
model_name='gpt-3.5-turbo'
|
|
389
|
+
)
|
|
390
|
+
else:
|
|
391
|
+
# Default to the configured text splitter
|
|
392
|
+
return self.text_splitter
|
|
393
|
+
|
|
394
|
+
def is_valid_path(self, path: Union[str, Path]) -> bool:
    """Check whether *path* should be processed by this loader.

    A path is rejected when it does not exist, is a hidden or empty
    file, has an unsupported extension, or lives inside (or is) one of
    the configured skip directories. When ``self.extensions`` is '*'
    every path is accepted without touching the filesystem.
    """
    if self.extensions == '*':
        return True
    if isinstance(path, str):
        path = Path(path)
    if not path.exists():
        return False
    if path.is_dir() and path.name in self.skip_directories:
        return False
    if path.is_file():
        if path.suffix not in self.extensions:
            return False
        if path.name.startswith("."):
            return False
        # Skip empty files.
        if path.stat().st_size == 0:
            return False
        # Bug fix: `path.is_relative_to(skip_dir)` never matched absolute
        # paths against bare directory names like 'node_modules'; test the
        # path components instead, mirroring the check in from_path().
        if not set(path.parts).isdisjoint(self.skip_directories):
            return False
    return True
|
|
417
|
+
|
|
418
|
+
@abstractmethod
async def _load(self, source: Union[str, PurePath], **kwargs) -> List[Document]:
    """Load a single data/url/file from a source and return it as a Langchain Document.

    Subclasses implement the actual retrieval/parsing; ``load()`` fans
    out to this coroutine once per resolved source (file, URL, or
    DataFrame).

    Args:
        source (str): The source of the data.
        **kwargs: Loader-specific options forwarded from ``load()``.

    Returns:
        List[Document]: A list of Langchain Documents.
    """
    pass
|
|
429
|
+
|
|
430
|
+
async def from_path(
    self,
    path: Union[str, Path],
    recursive: bool = False,
    **kwargs
) -> List[asyncio.Task]:
    """
    Create load tasks for a file, or for every matching file in a directory.

    Args:
        path: File or directory to load.
        recursive: When True, descend into subdirectories.
        **kwargs: Forwarded to ``_load()``.

    Returns:
        List of scheduled asyncio tasks, one per accepted file.
    """
    tasks = []
    if isinstance(path, str):
        # Bug fix: this used PurePath(), which has no is_dir()/glob();
        # a concrete Path is required for filesystem inspection.
        path = Path(path)
    if path.is_dir():
        for ext in self.extensions:
            glob_method = path.rglob if recursive else path.glob
            # Use glob to find all files with the specified extension
            for item in glob_method(f'*{ext}'):
                # Skip anything located inside a skip directory
                if set(item.parts).isdisjoint(self.skip_directories):
                    if self.is_valid_path(item):
                        tasks.append(
                            asyncio.create_task(self._load(item, **kwargs))
                        )
    elif path.is_file():
        if self.is_valid_path(path):
            tasks.append(
                asyncio.create_task(self._load(path, **kwargs))
            )
    else:
        self.logger.warning(
            f"Path {path} is not valid."
        )
    return tasks
|
|
463
|
+
|
|
464
|
+
async def from_url(
    self,
    url: Union[str, List[str]],
    **kwargs
) -> List[asyncio.Task]:
    """
    Schedule one load task per URL.

    A single URL string is treated as a one-element list.
    """
    urls = [url] if isinstance(url, str) else url
    return [
        asyncio.create_task(self._load(entry, **kwargs))
        for entry in urls
    ]
|
|
480
|
+
|
|
481
|
+
async def from_dataframe(
    self,
    source: pd.DataFrame,
    **kwargs
) -> List[asyncio.Task]:
    """
    Schedule a load task for a pandas DataFrame source.

    Non-DataFrame inputs are logged as a warning and produce no tasks.
    """
    if not isinstance(source, pd.DataFrame):
        self.logger.warning(
            f"Source {source} is not a valid pandas DataFrame."
        )
        return []
    return [asyncio.create_task(self._load(source, **kwargs))]
|
|
499
|
+
|
|
500
|
+
def chunkify(self, lst: List[T], n: int = 50) -> Generator[List[T], None, None]:
    """Yield successive slices of *lst*, each holding at most *n* items.

    Args:
        lst: The list to split into chunks
        n: The maximum size of each chunk

    Yields:
        List[T]: Chunks of the original list, each of size at most n
    """
    start = 0
    total = len(lst)
    while start < total:
        yield lst[start:start + n]
        start += n
|
|
512
|
+
|
|
513
|
+
async def _async_map(self, func: Callable, iterable: list) -> list:
|
|
514
|
+
"""Run a function on a list of items asynchronously."""
|
|
515
|
+
async def async_func(item):
|
|
516
|
+
async with self.semaphore:
|
|
517
|
+
return await func(item)
|
|
518
|
+
|
|
519
|
+
tasks = [async_func(item) for item in iterable]
|
|
520
|
+
return await asyncio.gather(*tasks)
|
|
521
|
+
|
|
522
|
+
async def _load_tasks(self, tasks: list) -> list:
|
|
523
|
+
"""Load a list of tasks asynchronously."""
|
|
524
|
+
results = []
|
|
525
|
+
|
|
526
|
+
if not tasks:
|
|
527
|
+
return results
|
|
528
|
+
|
|
529
|
+
# Create a controlled task function to limit concurrency
|
|
530
|
+
async def controlled_task(task):
|
|
531
|
+
async with self.semaphore:
|
|
532
|
+
try:
|
|
533
|
+
return await task
|
|
534
|
+
except Exception as e:
|
|
535
|
+
self.logger.error(f"Task error: {e}")
|
|
536
|
+
return e
|
|
537
|
+
|
|
538
|
+
for chunk in self.chunkify(tasks, self.chunk_size):
|
|
539
|
+
# Wrap each task with semaphore control
|
|
540
|
+
controlled_tasks = [controlled_task(task) for task in chunk]
|
|
541
|
+
result = await asyncio.gather(*controlled_tasks, return_exceptions=True)
|
|
542
|
+
if result:
|
|
543
|
+
for res in result:
|
|
544
|
+
if isinstance(res, Exception):
|
|
545
|
+
# Handle the exception
|
|
546
|
+
self.logger.error(f"Error loading {res}")
|
|
547
|
+
else:
|
|
548
|
+
# Handle both single documents and lists of documents
|
|
549
|
+
if isinstance(res, list):
|
|
550
|
+
results.extend(res)
|
|
551
|
+
else:
|
|
552
|
+
results.append(res)
|
|
553
|
+
return results
|
|
554
|
+
|
|
555
|
+
async def load(
    self,
    source: Optional[Any] = None,
    split_documents: bool = True,
    late_chunking: bool = False,
    vector_store=None,
    store_full_document: bool = True,
    auto_detect_content_type: bool = None,
    **kwargs
) -> List[Document]:
    """
    Load data from a source and return it as a list of Documents.

    The source can be:
    - None: Uses self.path attribute if available
    - Path or str: Treated as file path or directory
    - List[str/Path]: Treated as list of file paths
    - URL string: Treated as a URL
    - List of URLs: Treated as list of URLs (only when *every* element is a URL)
    - pandas DataFrame

    Args:
        source (Optional[Any]): The source of the data.
        split_documents (bool): Whether to split documents into chunks, defaults to True
        late_chunking (bool): Whether to use late chunking strategy
        vector_store: Vector store instance (required for late chunking)
        store_full_document (bool): Whether to store full documents alongside chunks
        auto_detect_content_type (bool): Override auto-detection setting
        **kwargs: Additional keyword arguments forwarded to the per-source loaders

    Returns:
        List[Document]: A list of Documents (chunked if requested).

    Raises:
        ValueError: when no source is available, the source type is
            unsupported, or late_chunking is requested without a vector store.
    """
    tasks = []
    # If no source is provided, use self.path
    if source is None:
        if self.path is None:
            raise ValueError(
                "No source provided and self.path is not set. "
                "Please provide a source parameter or set path during initialization."
            )
        source = self.path

    if isinstance(source, (str, Path, PosixPath, PurePath)):
        # Check if it's a URL (only plain strings can be URLs here)
        if isinstance(source, str) and (
            source.startswith('http://') or source.startswith('https://')
        ):
            tasks = await self.from_url(source, **kwargs)
        else:
            # Assume it's a file path or directory
            tasks = await self.from_path(
                source,
                recursive=self._recursive,
                **kwargs
            )
    elif isinstance(source, list):
        # Check if it's a list of URLs or paths: URL handling only when
        # every element looks like an http(s) URL, otherwise treat the
        # whole list as file paths.
        if all(
            isinstance(item, str) and (
                item.startswith('http://') or item.startswith('https://')
            ) for item in source
        ):
            tasks = await self.from_url(source, **kwargs)
        else:
            # Assume it's a list of file paths
            path_tasks = []
            for path in source:
                path_tasks.extend(
                    await self.from_path(path, recursive=self._recursive, **kwargs)
                )
            tasks = path_tasks
    elif isinstance(source, pd.DataFrame):
        tasks = await self.from_dataframe(source, **kwargs)
    else:
        raise ValueError(
            f"Unsupported source type: {type(source)}"
        )
    # Load tasks and get raw documents
    documents = []
    if tasks:
        results = await self._load_tasks(tasks)
        documents = results

    # Apply chunking if requested
    if split_documents and documents:
        self.logger.debug(
            f"Splitting {len(documents)} documents into chunks..."
        )

        # Late chunking needs somewhere to persist embeddings/chunks.
        if late_chunking and vector_store is None:
            raise ValueError(
                "Vector store is required when using late_chunking=True"
            )

        documents = await self.chunk_documents(
            documents=documents,
            use_late_chunking=late_chunking,
            vector_store=vector_store,
            store_full_document=store_full_document,
            auto_detect_content_type=auto_detect_content_type
        )

        self.logger.debug(
            f"Document chunking complete: {len(documents)} final documents"
        )

    return documents
|
|
662
|
+
|
|
663
|
+
def create_metadata(
    self,
    path: Union[str, PurePath],
    doctype: str = 'document',
    source_type: str = 'source',
    doc_metadata: Optional[dict] = None,
    **kwargs
):
    """Build the standard metadata dict attached to loaded documents.

    NOTE(review): for string paths the `url`/`filename` values look
    swapped relative to the PurePath branch (the 'file://' prefix moves
    from url to filename) — confirm against callers before changing.
    """
    doc_metadata = doc_metadata or {}
    if isinstance(path, PurePath):
        origin, url, filename = path.name, f'file://{path.name}', path
    else:
        origin, url, filename = path, path, f'file://{path}'
    return {
        "url": url,
        "source": origin,
        "filename": str(filename),
        "type": doctype,
        "source_type": source_type or self._source_type,
        "created_at": datetime.now().strftime("%Y-%m-%d, %H:%M:%S"),
        "category": self.category,
        "document_meta": {
            **doc_metadata
        },
        **kwargs
    }
|
|
695
|
+
|
|
696
|
+
def create_document(
    self,
    content: Any,
    path: Union[str, PurePath],
    metadata: Optional[dict] = None,
    **kwargs
) -> Document:
    """Wrap *content* in a Langchain Document.

    Args:
        content (Any): The content to create the document from.
        path: Source path used to derive default metadata.
        metadata: Pre-built metadata; when omitted, defaults are built
            from *path* via ``create_metadata()``.

    Returns:
        Document: A Langchain Document.
    """
    doc_meta = metadata if metadata else self.create_metadata(
        path=path,
        doctype=self.doctype,
        source_type=self._source_type,
        **kwargs
    )
    return Document(
        page_content=content,
        metadata=doc_meta
    )
|
|
722
|
+
|
|
723
|
+
async def summary_from_text(
    self,
    text: str,
    max_length: int = 500,
    min_length: int = 50
) -> str:
    """
    Produce a summary of *text*.

    Uses the HuggingFace summarization pipeline when enabled, otherwise
    delegates to the Groq client. Returns '' for empty input or on error.
    """
    if not text:
        return ''
    try:
        summarizer = self.get_summarization_model()
        if self._use_summary_pipeline:
            # Use Huggingface pipeline
            result = summarizer(
                text,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                truncation=True
            )
            return result[0].get('summary_text', '')
        # Use Summarize Method from GroqClient
        system_prompt = f"""
        Your job is to produce a final summary from the following text and identify the main theme.
        - The summary should be concise and to the point.
        - The summary should be no longer than {max_length} characters and no less than {min_length} characters.
        - The summary should be in a single paragraph.
        """
        summary = await summarizer.summarize_text(
            text=text,
            model=GroqModel.LLAMA_3_3_70B_VERSATILE,
            system_prompt=system_prompt,
            temperature=0.1,
            max_tokens=1000,
            top_p=0.5
        )
        return summary.output
    except Exception as e:
        self.logger.error(
            f'ERROR on summary_from_text: {e}'
        )
        return ""
|
|
767
|
+
|
|
768
|
+
def get_summarization_model(
    self,
    model_name: str = 'facebook/bart-large-cnn'
):
    """Return (and lazily build) the summarization backend.

    When ``self._use_summary_pipeline`` is true, a HuggingFace
    summarization pipeline is created from *model_name*; otherwise a
    Groq LLM client is built through ``LLMFactory``. The instance is
    cached on ``self._summary_model``, so *model_name* only has an
    effect on the first call.
    """
    if not self._summary_model:
        if self._use_summary_pipeline:
            # Imported lazily: transformers is heavy and only needed here.
            from transformers import (
                AutoModelForSeq2SeqLM,
                AutoTokenizer,
                pipeline
            )
            # NOTE(review): _get_device() returns (pipeline_idx, torch_device,
            # dtype); the pipeline index is discarded here, so `pipe_dev`
            # is the torch.device and `pipe_dev != -1` below is always
            # True — confirm whether the first element was intended.
            _, pipe_dev, torch_dtype = self._get_device()
            summarize_model = AutoModelForSeq2SeqLM.from_pretrained(
                model_name,
            )
            summarize_tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                padding_side="left"
            )
            self._summary_model = pipeline(
                "summarization",
                model=summarize_model,
                tokenizer=summarize_tokenizer,
                device=pipe_dev,  # 0 for CUDA, mps device, or -1
                torch_dtype=torch_dtype if pipe_dev != -1 else None,
            )
        else:
            # Use Groq for Summarization:
            self._summary_model = LLMFactory.create(
                llm=f"groq:{GroqModel.LLAMA_3_3_70B_VERSATILE}",
                model_kwargs={
                    "temperature": 0.1,
                    "top_p": 0.5,
                }
            )
    return self._summary_model
|
|
804
|
+
|
|
805
|
+
def translate_text(
    self,
    text: str,
    source_lang: str = None,
    target_lang: str = "es"
) -> str:
    """
    Translate text from source language to target language.

    Args:
        text: Text to translate
        source_lang: Source language code (defaults to 'en' when omitted)
        target_lang: Target language code (default: 'es')

    Returns:
        Translated text, or '' for empty input or on error.
    """
    if not text:
        return ''
    # Bug fix: a None source_lang previously leaked into the model cache
    # key and the Helsinki-NLP model name ("opus-mt-None-es"); normalize
    # to the documented default instead.
    source_lang = source_lang or "en"
    try:
        translator = self.get_translation_model(source_lang, target_lang)
        if self._use_translation_pipeline:
            # Use Huggingface pipeline
            content = translator(
                text,
                max_length=len(text) * 2,  # Allow for expansion in target language
                truncation=True
            )
            return content[0].get('translation_text', '')
        else:
            # Use LLM for translation
            translation = translator.translate_text(
                text=text,
                source_lang=source_lang,
                target_lang=target_lang,
                model=GoogleModel.GEMINI_2_5_FLASH_LITE_PREVIEW,
                temperature=0.1,
                max_tokens=1000
            )
            return translation.get('text', '')
    except Exception as e:
        self.logger.error(f'ERROR on translate_text: {e}')
        return ""
|
|
848
|
+
|
|
849
|
+
def get_translation_model(
    self,
    source_lang: str = "en",
    target_lang: str = "es",
    model_name: str = None
):
    """
    Get or create a translation model, cached per language pair.

    Args:
        source_lang: Source language code
        target_lang: Target language code
        model_name: Optional model name override

    Returns:
        Translation model/chain: a HuggingFace translation pipeline when
        the pipeline path is enabled, otherwise an LLM client.
    """
    # Create a cache key for the language pair
    cache_key = f"{source_lang}_{target_lang}"

    # Lazily create the per-instance cache of translation models
    if not hasattr(self, '_translation_models'):
        self._translation_models = {}

    if cache_key not in self._translation_models:
        if self._use_translation_pipeline:
            from transformers import (
                AutoModelForSeq2SeqLM,
                AutoTokenizer,
                pipeline
            )
            # Select appropriate model based on language pair if not specified
            if model_name is None:
                if source_lang == "en" and target_lang in ["es", "fr", "de", "it", "pt", "ru"]:
                    # NOTE(review): the ROMANCE model covers es/fr/it/pt;
                    # 'de' and 'ru' are not Romance languages — confirm
                    # these pairs actually work with this checkpoint.
                    model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"
                elif source_lang in ["es", "fr", "de", "it", "pt"] and target_lang == "en":
                    model_name = "Helsinki-NLP/opus-mt-ROMANCE-en"
                else:
                    # Default to a specific model for the language pair
                    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"

            try:
                translate_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
                translate_tokenizer = AutoTokenizer.from_pretrained(model_name)

                self._translation_models[cache_key] = pipeline(
                    "translation",
                    model=translate_model,
                    tokenizer=translate_tokenizer
                )
            except Exception as e:
                self.logger.error(
                    f"Error loading translation model {model_name}: {e}"
                )
                # Fallback to using LLM for translation.
                # NOTE(review): this flips the flag for the whole instance,
                # not just this language pair — all later calls take the
                # LLM path. Confirm that is intended.
                self._use_translation_pipeline = False

        # Runs both when the pipeline path is disabled and after a failed
        # pipeline load (the flag was just flipped above).
        if not self._use_translation_pipeline:
            # Use LLM for translation
            translation_model = self.get_default_llm(
                model=GoogleModel.GEMINI_2_5_FLASH_LITE_PREVIEW
            )
            self._translation_models[cache_key] = translation_model

    return self._translation_models[cache_key]
|
|
914
|
+
|
|
915
|
+
def create_translated_document(
    self,
    content: str,
    metadata: dict,
    source_lang: str = "en",
    target_lang: str = "es"
) -> Document:
    """
    Build a Document whose content is translated into *target_lang*.

    The original metadata is copied and annotated with the language pair
    and an ``is_translation`` marker.

    Args:
        content: Original content
        metadata: Document metadata
        source_lang: Source language code
        target_lang: Target language code

    Returns:
        Document with translated content
    """
    translated = self.translate_text(content, source_lang, target_lang)

    # Copy-and-annotate: never mutate the caller's metadata dict.
    annotated = {
        **metadata,
        "original_language": source_lang,
        "language": target_lang,
        "is_translation": True,
    }

    return Document(
        page_content=translated,
        metadata=annotated
    )
|
|
948
|
+
|
|
949
|
+
def saving_file(self, filename: PurePath, data: Any):
    """Save binary data to a file.

    Args:
        filename (PurePath): The path to the file.
        data (Any): The bytes to write.
    """
    # `with` guarantees the handle is closed even if write fails.
    with open(filename, 'wb') as f:
        f.write(data)
        f.flush()
    # Bug fix: the message was an f-string with no placeholder and never
    # reported which file was written.
    print(f':: Saved File on {filename}')
|
|
960
|
+
|
|
961
|
+
async def chunk_documents(
    self,
    documents: List[Document],
    use_late_chunking: bool = False,
    vector_store=None,
    store_full_document: bool = True,
    auto_detect_content_type: bool = None
) -> List[Document]:
    """
    Split documents into chunks.

    Dispatches to the late-chunking processor when requested, otherwise
    to the configured text splitters.

    Args:
        documents: List of documents to chunk
        use_late_chunking: Whether to use late chunking strategy
        vector_store: Vector store instance (required for late chunking)
        store_full_document: Whether to also store full documents (late chunking only)
        auto_detect_content_type: Override auto-detection setting

    Returns:
        List of chunked documents
    """
    if use_late_chunking:
        return await self._chunk_with_late_chunking(
            documents, vector_store, store_full_document
        )
    return self._chunk_with_text_splitter(
        documents, auto_detect_content_type
    )
|
|
990
|
+
|
|
991
|
+
def _chunk_with_text_splitter(
    self,
    documents: List[Document],
    auto_detect_content_type: bool = None
) -> List[Document]:
    """
    Chunk documents using the regular text splitters.

    Args:
        documents: List of documents to chunk
        auto_detect_content_type: Override auto-detection setting

    Returns:
        List of chunked documents; a document that fails to split is
        passed through unchanged.
    """
    chunked_docs = []
    detect_content = (
        auto_detect_content_type
        if auto_detect_content_type is not None
        else self._auto_detect_content_type
    )

    for doc in documents:
        try:
            # Detect content type and select appropriate splitter
            if detect_content:
                content_type = self._detect_content_type(doc)
                splitter = self._select_splitter_for_content(content_type)
            else:
                content_type = 'text'
                splitter = self.text_splitter

            # Create chunks using the selected splitter
            chunks = splitter.create_chunks(
                text=doc.page_content,
                metadata=doc.metadata
            )

            # Bug fix: the fallback parent id was previously generated
            # *inside* the chunk loop, so every chunk of the same document
            # received a different parent_document_id. Compute it once.
            parent_id = doc.metadata.get(
                'document_id', f"doc_{uuid.uuid4().hex[:8]}"
            )

            # Convert chunks to Document objects
            for chunk in chunks:
                chunked_docs.append(Document(
                    page_content=chunk.text,
                    metadata={
                        **chunk.metadata,
                        'chunk_id': chunk.chunk_id,
                        'token_count': chunk.token_count,
                        'start_position': chunk.start_position,
                        'end_position': chunk.end_position,
                        'content_type': content_type,
                        'splitter_type': splitter.__class__.__name__,
                        'is_chunk': True,
                        'parent_document_id': parent_id
                    }
                ))

        except Exception as e:
            self.logger.error(f"Error chunking document: {e}")
            # Fall back to adding the original document
            chunked_docs.append(doc)

    self.logger.info(f"Chunked {len(documents)} documents into {len(chunked_docs)} chunks")
    return chunked_docs
|
|
1051
|
+
|
|
1052
|
+
async def _chunk_with_late_chunking(
    self,
    documents: List[Document],
    vector_store=None,
    store_full_document: bool = True
) -> List[Document]:
    """
    Apply the late-chunking strategy to a batch of documents.

    Args:
        documents: List of documents to chunk
        vector_store: Vector store instance (required)
        store_full_document: Whether to store full documents alongside chunks

    Returns:
        List of chunked documents (and optionally full documents)
    """
    # Without the processor class, degrade gracefully to the plain splitter path.
    if LateChunkingProcessor is None:
        self.logger.warning(
            "LateChunkingProcessor not available, falling back to regular chunking"
        )
        return self._chunk_with_text_splitter(documents)

    if vector_store is None:
        raise ValueError("Vector store is required for late chunking strategy")

    # Single processor instance is reused for every document in the batch.
    processor = LateChunkingProcessor(
        vector_store=vector_store,
        chunk_size=self.chunk_size,
        chunk_overlap=self.chunk_overlap
    )

    results: List[Document] = []

    for idx, doc in enumerate(documents):
        try:
            # Reuse an existing document_id if the caller supplied one;
            # otherwise synthesize a stable-looking id from index + random hex.
            doc_id = doc.metadata.get('document_id', f"doc_{idx:06d}_{uuid.uuid4().hex[:8]}")

            # Run the late-chunking pass; only the chunk descriptors are needed here.
            _, infos = await processor.process_document_late_chunking(
                document_text=doc.page_content,
                document_id=doc_id,
                metadata=doc.metadata
            )

            if store_full_document:
                # Emit the parent document first, tagged so retrieval code
                # can distinguish it from its chunks.
                results.append(
                    Document(
                        page_content=doc.page_content,
                        metadata={
                            **(doc.metadata or {}),
                            'document_id': doc_id,
                            'is_full_document': True,
                            'total_chunks': len(infos),
                            'document_type': 'parent',
                            'chunking_strategy': 'late_chunking'
                        }
                    )
                )

            # Then every chunk, carrying the metadata the processor produced.
            results.extend(
                Document(page_content=info.chunk_text, metadata=info.metadata)
                for info in infos
            )

        except Exception as e:
            self.logger.error(f"Error in late chunking for document {idx}: {e}")
            # Best-effort fallback: keep the unchunked original in the output.
            results.append(doc)

    self.logger.info(
        f"Late chunking processed {len(documents)} documents into {len(results)} items"
    )
    return results
|