@mseep/open-computer-use 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.coderabbit.yaml +25 -0
- package/.dockerignore +95 -0
- package/.env.example +137 -0
- package/.githooks/pre-commit +68 -0
- package/.github/CODEOWNERS +125 -0
- package/.github/ISSUE_TEMPLATE/adr-proposal.md +41 -0
- package/.github/ISSUE_TEMPLATE/bug-report.md +49 -0
- package/.github/ISSUE_TEMPLATE/component-proposal.md +38 -0
- package/.github/ISSUE_TEMPLATE/config.yml +15 -0
- package/.github/ISSUE_TEMPLATE/dependency-proposal.md +59 -0
- package/.github/ISSUE_TEMPLATE/feature_request.md +15 -0
- package/.github/ISSUE_TEMPLATE/nfr-proposal.md +44 -0
- package/.github/PULL_REQUEST_TEMPLATE.md +15 -0
- package/.github/codeql/codeql-config.yml +11 -0
- package/.github/codeql/extensions/security-models/python-sanitizers.model.yml +17 -0
- package/.github/codeql/extensions/security-models/qlpack.yml +7 -0
- package/.github/dependabot.yml +23 -0
- package/.github/security-exceptions.yml +23 -0
- package/.github/workflows/build.yml +420 -0
- package/.github/workflows/codeql.yml +33 -0
- package/.github/workflows/contracts-lint.yml +90 -0
- package/.github/workflows/docs-lint.yml +151 -0
- package/.github/workflows/helm.yml +131 -0
- package/.github/workflows/identity-lint.yml +30 -0
- package/.github/workflows/release-chart.yml +177 -0
- package/.github/workflows/release.yml +95 -0
- package/.github/workflows/security.yml +332 -0
- package/.github/workflows/stale.yml +31 -0
- package/.github/workflows/supply-chain.yml +242 -0
- package/.gitleaks.toml +53 -0
- package/.markdownlint.yaml +51 -0
- package/.semgrepignore +85 -0
- package/.vale/styles/Architecture/ap13-data-class-substrate.yml +12 -0
- package/.vale/styles/Architecture/banned-phrases.yml +23 -0
- package/.vale/styles/Architecture/banned-vocab.yml +23 -0
- package/.vale/styles/Architecture/marketing-tone.yml +19 -0
- package/.vale.ini +18 -0
- package/CHANGELOG.md +411 -0
- package/CLAUDE.md +218 -0
- package/CONTRIBUTING.md +82 -0
- package/Dockerfile +676 -0
- package/LICENSE +98 -0
- package/LICENSE-APACHE +202 -0
- package/LICENSE-MIT +21 -0
- package/NOTICE +36 -0
- package/README.md +516 -0
- package/SECURITY.md +45 -0
- package/THIRD-PARTY-LICENSES.md +14 -0
- package/apt-packages.txt +108 -0
- package/computer-use-server/.dockerignore +13 -0
- package/computer-use-server/Dockerfile +44 -0
- package/computer-use-server/README.md +84 -0
- package/computer-use-server/app.py +1544 -0
- package/computer-use-server/bin/list-subagent-models +449 -0
- package/computer-use-server/cli-defaults/README.md +31 -0
- package/computer-use-server/cli-defaults/codex.json +7 -0
- package/computer-use-server/cli-defaults/opencode.json +18 -0
- package/computer-use-server/cli_adapters/__init__.py +46 -0
- package/computer-use-server/cli_adapters/claude.py +163 -0
- package/computer-use-server/cli_adapters/codex.py +163 -0
- package/computer-use-server/cli_adapters/opencode.py +169 -0
- package/computer-use-server/cli_adapters/result.py +34 -0
- package/computer-use-server/cli_runtime.py +316 -0
- package/computer-use-server/context_vars.py +24 -0
- package/computer-use-server/docker_manager.py +1100 -0
- package/computer-use-server/docs_html.py +12 -0
- package/computer-use-server/mcp_resources.py +170 -0
- package/computer-use-server/mcp_tools.py +1430 -0
- package/computer-use-server/requirements.txt +17 -0
- package/computer-use-server/security.py +50 -0
- package/computer-use-server/skill_manager.py +664 -0
- package/computer-use-server/static/browser-viewer.js +445 -0
- package/computer-use-server/static/chart.umd.js +14 -0
- package/computer-use-server/static/docs.html +203 -0
- package/computer-use-server/static/github-dark.min.css +10 -0
- package/computer-use-server/static/github.min.css +10 -0
- package/computer-use-server/static/highlight.min.js +1213 -0
- package/computer-use-server/static/highlightjs-line-numbers.min.js +1 -0
- package/computer-use-server/static/icons.js +74 -0
- package/computer-use-server/static/jszip.min.js +13 -0
- package/computer-use-server/static/katex/auto-render.min.js +1 -0
- package/computer-use-server/static/katex/fonts/KaTeX_AMS-Regular.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_AMS-Regular.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_AMS-Regular.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Caligraphic-Bold.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Caligraphic-Bold.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Caligraphic-Bold.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Caligraphic-Regular.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Caligraphic-Regular.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Caligraphic-Regular.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Fraktur-Bold.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Fraktur-Bold.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Fraktur-Bold.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Fraktur-Regular.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Fraktur-Regular.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Fraktur-Regular.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Main-Bold.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Main-Bold.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Main-Bold.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Main-BoldItalic.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Main-BoldItalic.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Main-BoldItalic.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Main-Italic.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Main-Italic.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Main-Italic.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Main-Regular.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Main-Regular.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Main-Regular.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Math-BoldItalic.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Math-BoldItalic.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Math-BoldItalic.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Math-Italic.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Math-Italic.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Math-Italic.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_SansSerif-Bold.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_SansSerif-Bold.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_SansSerif-Bold.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_SansSerif-Italic.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_SansSerif-Italic.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_SansSerif-Italic.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_SansSerif-Regular.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_SansSerif-Regular.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_SansSerif-Regular.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Script-Regular.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Script-Regular.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Script-Regular.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Size1-Regular.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Size1-Regular.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Size1-Regular.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Size2-Regular.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Size2-Regular.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Size2-Regular.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Size3-Regular.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Size3-Regular.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Size3-Regular.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Size4-Regular.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Size4-Regular.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Size4-Regular.woff2 +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Typewriter-Regular.ttf +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Typewriter-Regular.woff +0 -0
- package/computer-use-server/static/katex/fonts/KaTeX_Typewriter-Regular.woff2 +0 -0
- package/computer-use-server/static/katex/katex.min.css +1 -0
- package/computer-use-server/static/katex/katex.min.js +1 -0
- package/computer-use-server/static/locale.js +242 -0
- package/computer-use-server/static/mammoth.browser.min.js +21 -0
- package/computer-use-server/static/marked.min.js +6 -0
- package/computer-use-server/static/mermaid.min.js +2811 -0
- package/computer-use-server/static/pdf.min.js +22 -0
- package/computer-use-server/static/pdf.worker.min.js +22 -0
- package/computer-use-server/static/pptxviewjs.min.js +1 -0
- package/computer-use-server/static/preact-htm.min.js +1 -0
- package/computer-use-server/static/preview.css +1030 -0
- package/computer-use-server/static/preview.js +1522 -0
- package/computer-use-server/static/xlsx.full.min.js +22 -0
- package/computer-use-server/static/xterm-addon-fit.min.js +2 -0
- package/computer-use-server/static/xterm-addon-web-links.min.js +2 -0
- package/computer-use-server/static/xterm.css +218 -0
- package/computer-use-server/static/xterm.min.js +2 -0
- package/computer-use-server/system_prompt.py +761 -0
- package/computer-use-server/uploads.py +82 -0
- package/contracts/README.md +53 -0
- package/contracts/audit/audit-fanin.asyncapi.yaml +407 -0
- package/contracts/exec/exec-channel.schema.json +240 -0
- package/contracts/mcp/2025-06-18/ocu-constraints.schema.json +178 -0
- package/contracts/storage/file-artifact-api.schema.json +390 -0
- package/contracts/storage/file-ops.schema.json +217 -0
- package/contracts/storage/mount-config.schema.json +197 -0
- package/cron/Dockerfile +15 -0
- package/cron/cleanup-quick.sh +21 -0
- package/cron/cleanup.sh +127 -0
- package/data/outputs/.gitkeep +0 -0
- package/data/uploads/.gitkeep +0 -0
- package/docker-compose.test.yml +54 -0
- package/docker-compose.webui.yml +77 -0
- package/docker-compose.yml +96 -0
- package/docs/CLOUD.md +29 -0
- package/docs/COMPARISON.md +128 -0
- package/docs/DOCKER.md +469 -0
- package/docs/DYNAMIC-SKILLS.md +77 -0
- package/docs/FEATURES.md +100 -0
- package/docs/INSTALL.md +111 -0
- package/docs/KNOWN-BUGS.md +86 -0
- package/docs/MCP.md +320 -0
- package/docs/SCREENSHOTS.md +39 -0
- package/docs/SKILLS-USER-GUIDE.md +86 -0
- package/docs/SKILLS.md +483 -0
- package/docs/TERMINAL-TAB.md +56 -0
- package/docs/architecture/02-trust-boundaries.md +224 -0
- package/docs/architecture/03-c4-context.md +61 -0
- package/docs/architecture/04-bounded-contexts.md +119 -0
- package/docs/architecture/05-c4-container.md +88 -0
- package/docs/architecture/06-threat-model.md +172 -0
- package/docs/architecture/08-contracts.md +105 -0
- package/docs/architecture/MANIFESTO.md +38 -0
- package/docs/architecture/PROCESS.md +64 -0
- package/docs/architecture/README.md +37 -0
- package/docs/architecture/adr/0000-template.md +65 -0
- package/docs/architecture/adr/0001-layer-0-gate-legacy-exclusion.md +75 -0
- package/docs/architecture/adr/0002-session-view-descriptor.md +57 -0
- package/docs/architecture/adr/0003-sandbox-runtime-tier-ladder.md +63 -0
- package/docs/architecture/adr/0004-operator-authentication-substrate.md +63 -0
- package/docs/architecture/adr/0005-egress-credential-delivery-envoy-sds.md +62 -0
- package/docs/architecture/adr/0006-egress-forward-proxy-substrate.md +65 -0
- package/docs/architecture/adr/0007-egress-auth-mechanism.md +72 -0
- package/docs/architecture/adr/0008-session-egress-attribution.md +59 -0
- package/docs/architecture/adr/0009-audit-pipeline-pluggable-by-contract.md +76 -0
- package/docs/architecture/adr/0010-storage-backend-pluggable-adapter.md +60 -0
- package/docs/architecture/adr/0011-storage-egress-lane.md +67 -0
- package/docs/architecture/adr/0012-implementation-language.md +67 -0
- package/docs/architecture/adr/0020-sandbox-image-provisioning.md +82 -0
- package/docs/architecture/adr/README.md +53 -0
- package/docs/architecture/compliance/.gitkeep +0 -0
- package/docs/architecture/components/00-overview.md +42 -0
- package/docs/architecture/components/0000-template.md +50 -0
- package/docs/architecture/components/01-mcp-gateway.md +80 -0
- package/docs/architecture/components/02-control-operator-api.md +80 -0
- package/docs/architecture/components/04-storage-broker.md +104 -0
- package/docs/architecture/components/05-session-sandbox.md +93 -0
- package/docs/architecture/components/06-egress-trust-edge.md +95 -0
- package/docs/architecture/components/07-audit-pipeline.md +110 -0
- package/docs/architecture/diagrams/.gitkeep +0 -0
- package/docs/architecture/diagrams/02-trust-boundaries.mmd +111 -0
- package/docs/architecture/diagrams/06-threat-model.mmd +41 -0
- package/docs/architecture/diagrams/08-contracts.mmd +47 -0
- package/docs/architecture/diagrams/c4-container.mmd +59 -0
- package/docs/architecture/diagrams/c4-context.mmd +46 -0
- package/docs/architecture/glossary.md +172 -0
- package/docs/architecture/manifesto/.gitkeep +0 -0
- package/docs/architecture/manifesto/01-audience-and-buyer.md +57 -0
- package/docs/architecture/manifesto/02-nfrs.md +325 -0
- package/docs/architecture/manifesto/03-non-negotiables.md +35 -0
- package/docs/architecture/manifesto/04-non-goals.md +23 -0
- package/docs/architecture/manifesto/05-licensing-posture.md +61 -0
- package/docs/architecture/manifesto/06-starter-mode-policy.md +49 -0
- package/docs/architecture/manifesto/07-governance.md +60 -0
- package/docs/architecture/primitives-backlog.md +51 -0
- package/docs/architecture.svg +117 -0
- package/docs/claude-code-gateway.md +173 -0
- package/docs/cli-config-templates.md +240 -0
- package/docs/data-flow.svg +72 -0
- package/docs/demo-landing-page.gif +0 -0
- package/docs/demo-qwen-trending.gif +0 -0
- package/docs/dynamic-skills.svg +77 -0
- package/docs/file-flow.svg +126 -0
- package/docs/future-architecture/README.md +152 -0
- package/docs/future-architecture/adr/0001-control-plane-language-go.md +80 -0
- package/docs/future-architecture/adr/0002-guest-agent-language-go.md +84 -0
- package/docs/future-architecture/adr/0003-docker-poc-first-then-k8s.md +37 -0
- package/docs/future-architecture/adr/0004-pluggable-runtime-via-runtimeclass.md +34 -0
- package/docs/future-architecture/adr/0005-mcp-as-control-plane-gateway.md +34 -0
- package/docs/future-architecture/adr/0006-no-agpl-no-bsl-dependencies.md +41 -0
- package/docs/future-architecture/adr/0007-superseded-by-future-architecture.md +37 -0
- package/docs/future-architecture/adr/0008-internal-grpc-external-rest-mcp.md +106 -0
- package/docs/future-architecture/adr/0009-external-protocol-dialects.md +94 -0
- package/docs/future-architecture/adr/0010-lambda-as-inspiration-not-runtime.md +86 -0
- package/docs/future-architecture/adr/0011-kata-as-first-class-dind-runtime.md +84 -0
- package/docs/future-architecture/antipatterns.md +552 -0
- package/docs/future-architecture/architecture/01-layers.md +109 -0
- package/docs/future-architecture/architecture/02-layer4-control-plane.md +122 -0
- package/docs/future-architecture/architecture/03-layer3-providers.md +174 -0
- package/docs/future-architecture/architecture/04-layer2-runtimes.md +114 -0
- package/docs/future-architecture/architecture/04b-credential-broker.md +153 -0
- package/docs/future-architecture/architecture/05-layer1-guest-agent.md +138 -0
- package/docs/future-architecture/architecture/06-storage.md +134 -0
- package/docs/future-architecture/architecture/07-security.md +194 -0
- package/docs/future-architecture/architecture/08-networking.md +149 -0
- package/docs/future-architecture/architecture/09-templates.md +122 -0
- package/docs/future-architecture/architecture/10-observability.md +121 -0
- package/docs/future-architecture/design-notes.md +72 -0
- package/docs/future-architecture/gaps.md +281 -0
- package/docs/future-architecture/phase-template.md +123 -0
- package/docs/future-architecture/references.md +225 -0
- package/docs/future-architecture/research/01-kata-containers.md +100 -0
- package/docs/future-architecture/research/02-e2b-infra.md +133 -0
- package/docs/future-architecture/research/03-coder.md +115 -0
- package/docs/future-architecture/research/04-cloud-hypervisor.md +99 -0
- package/docs/future-architecture/research/05-firecracker.md +114 -0
- package/docs/future-architecture/research/06-agent-sandbox.md +142 -0
- package/docs/future-architecture/research/07-chromedp.md +78 -0
- package/docs/future-architecture/research/08-microsandbox.md +78 -0
- package/docs/future-architecture/research/09-agentbox.md +135 -0
- package/docs/future-architecture/research/10-sysbox.md +100 -0
- package/docs/future-architecture/research/11-firecracker-containerd.md +93 -0
- package/docs/future-architecture/research/12-docker-socket-proxy.md +59 -0
- package/docs/future-architecture/research/14-e2b-desktop-and-surf.md +107 -0
- package/docs/future-architecture/research/18-open-webui-terminals-observed.md +135 -0
- package/docs/future-architecture/research/bank-buyer.md +96 -0
- package/docs/future-architecture/research/enthusiast-audience.md +106 -0
- package/docs/future-architecture/research/proof-uipath-anthropic-2026-05.md +76 -0
- package/docs/future-architecture/research/widemoat-thesis-advisor.md +124 -0
- package/docs/future-architecture/roadmap.md +438 -0
- package/docs/kata-runtime.md +267 -0
- package/docs/kubernetes.md +86 -0
- package/docs/logo.png +0 -0
- package/docs/multi-cli.md +161 -0
- package/docs/openwebui-filter.md +134 -0
- package/docs/roadmap/implementation-roadmap.md +104 -0
- package/docs/sandbox-contents.svg +229 -0
- package/docs/screenshots/01-create-document.png +0 -0
- package/docs/screenshots/02-file-preview.png +0 -0
- package/docs/screenshots/03-browser-viewer.png +0 -0
- package/docs/screenshots/04-sub-agent-terminal.png +0 -0
- package/docs/screenshots/05-chat-overview.png +0 -0
- package/docs/screenshots/06-sub-agent-dashboard.png +0 -0
- package/docs/screenshots/07-frontend-design-skill.png +0 -0
- package/docs/screenshots/08-pptx-skill.png +0 -0
- package/docs/screenshots/09-skill-creator.png +0 -0
- package/docs/screenshots/10-data-chart.png +0 -0
- package/docs/shared-browser.svg +102 -0
- package/docs/system-prompt.md +113 -0
- package/docs/terminal-flow.svg +69 -0
- package/examples/helm/README.md +20 -0
- package/examples/helm/standalone/values.yaml +49 -0
- package/examples/helm/with-open-webui/README.md +99 -0
- package/examples/helm/with-open-webui/values-computer-use.yaml +32 -0
- package/examples/helm/with-open-webui/values-open-webui.yaml +67 -0
- package/fonts/NotoEmoji-Regular.ttf +0 -0
- package/helm/computer-use-server/.helmignore +17 -0
- package/helm/computer-use-server/Chart.yaml +32 -0
- package/helm/computer-use-server/README.md +211 -0
- package/helm/computer-use-server/templates/NOTES.txt +66 -0
- package/helm/computer-use-server/templates/_helpers.tpl +115 -0
- package/helm/computer-use-server/templates/configmap-dind-init.yaml +82 -0
- package/helm/computer-use-server/templates/configmap.yaml +18 -0
- package/helm/computer-use-server/templates/deployment.yaml +248 -0
- package/helm/computer-use-server/templates/ingress.yaml +38 -0
- package/helm/computer-use-server/templates/networkpolicy.yaml +50 -0
- package/helm/computer-use-server/templates/pdb.yaml +16 -0
- package/helm/computer-use-server/templates/pvc-data.yaml +20 -0
- package/helm/computer-use-server/templates/pvc-skills-cache.yaml +20 -0
- package/helm/computer-use-server/templates/pvc-user-data.yaml +20 -0
- package/helm/computer-use-server/templates/pvc-var-lib-docker.yaml +27 -0
- package/helm/computer-use-server/templates/secret.yaml +23 -0
- package/helm/computer-use-server/templates/service.yaml +22 -0
- package/helm/computer-use-server/templates/serviceaccount.yaml +15 -0
- package/helm/computer-use-server/templates/tests/test-health.yaml +23 -0
- package/helm/computer-use-server/values.schema.json +183 -0
- package/helm/computer-use-server/values.yaml +297 -0
- package/lychee.toml +36 -0
- package/openwebui/Dockerfile +52 -0
- package/openwebui/README.md +38 -0
- package/openwebui/functions/README.md +48 -0
- package/openwebui/functions/computer_link_filter.py +487 -0
- package/openwebui/init.sh +305 -0
- package/openwebui/patches/README.md +44 -0
- package/openwebui/patches/fix_artifacts_auto_show.py +441 -0
- package/openwebui/patches/fix_attached_files_position.py +87 -0
- package/openwebui/patches/fix_large_tool_args.py +156 -0
- package/openwebui/patches/fix_large_tool_results.py +289 -0
- package/openwebui/patches/fix_preview_url_detection.py +230 -0
- package/openwebui/patches/fix_skip_embedding_chat_files.py +229 -0
- package/openwebui/patches/fix_skip_rag_files_native_fc.py +100 -0
- package/openwebui/patches/fix_tool_loop_errors.py +510 -0
- package/package.json +39 -0
- package/requirements.txt +112 -0
- package/scripts/check-config.sh +141 -0
- package/scripts/docs-lint/ai-slop-detector.sh +202 -0
- package/scripts/docs-lint/architecture-tree-whitelist.sh +131 -0
- package/scripts/docs-lint/ascii-diagram-detector.sh +58 -0
- package/scripts/docs-lint/front-matter-validator.sh +97 -0
- package/scripts/docs-lint/gitignored-ref-detector.sh +122 -0
- package/scripts/docs-lint/identity-email-detector.sh +48 -0
- package/scripts/docs-lint/test-linters.sh +354 -0
- package/scripts/docs-lint/wc-budget.sh +61 -0
- package/scripts/githooks/pre-push +75 -0
- package/server.json +13 -0
- package/settings-wrapper/Dockerfile +9 -0
- package/settings-wrapper/README.md +119 -0
- package/settings-wrapper/app.py +113 -0
- package/settings-wrapper/requirements.txt +2 -0
- package/settings-wrapper/skills.json +25 -0
- package/skills/README.md +46 -0
- package/skills/examples/algorithmic-art/SKILL.md +405 -0
- package/skills/examples/algorithmic-art/templates/generator_template.js +223 -0
- package/skills/examples/algorithmic-art/templates/viewer.html +601 -0
- package/skills/examples/artifacts-builder/SKILL.md +74 -0
- package/skills/examples/artifacts-builder/scripts/bundle-artifact.sh +54 -0
- package/skills/examples/artifacts-builder/scripts/init-artifact.sh +322 -0
- package/skills/examples/artifacts-builder/scripts/shadcn-components.tar.gz +0 -0
- package/skills/examples/canvas-design/LICENSE.txt +202 -0
- package/skills/examples/canvas-design/SKILL.md +130 -0
- package/skills/examples/canvas-design/canvas-fonts/ArsenalSC-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/ArsenalSC-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/BigShoulders-Bold.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/BigShoulders-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/BigShoulders-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/Boldonse-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/Boldonse-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/BricolageGrotesque-Bold.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/BricolageGrotesque-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/BricolageGrotesque-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/CrimsonPro-Bold.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/CrimsonPro-Italic.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/CrimsonPro-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/CrimsonPro-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/DMMono-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/DMMono-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/EricaOne-OFL.txt +94 -0
- package/skills/examples/canvas-design/canvas-fonts/EricaOne-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/GeistMono-Bold.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/GeistMono-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/GeistMono-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/Gloock-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/Gloock-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/IBMPlexMono-Bold.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/IBMPlexMono-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/IBMPlexMono-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/IBMPlexSerif-Bold.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/IBMPlexSerif-BoldItalic.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/IBMPlexSerif-Italic.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/IBMPlexSerif-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/InstrumentSans-Bold.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/InstrumentSans-BoldItalic.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/InstrumentSans-Italic.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/InstrumentSans-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/InstrumentSans-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/InstrumentSerif-Italic.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/InstrumentSerif-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/Italiana-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/Italiana-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/JetBrainsMono-Bold.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/JetBrainsMono-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/JetBrainsMono-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/Jura-Light.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/Jura-Medium.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/Jura-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/LibreBaskerville-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/LibreBaskerville-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/Lora-Bold.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/Lora-BoldItalic.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/Lora-Italic.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/Lora-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/Lora-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/NationalPark-Bold.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/NationalPark-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/NationalPark-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/NothingYouCouldDo-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/NothingYouCouldDo-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/Outfit-Bold.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/Outfit-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/Outfit-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/PixelifySans-Medium.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/PixelifySans-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/PoiretOne-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/PoiretOne-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/RedHatMono-Bold.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/RedHatMono-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/RedHatMono-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/Silkscreen-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/Silkscreen-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/SmoochSans-Medium.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/SmoochSans-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/Tektur-Medium.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/Tektur-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/Tektur-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/WorkSans-Bold.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/WorkSans-BoldItalic.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/WorkSans-Italic.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/WorkSans-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/WorkSans-Regular.ttf +0 -0
- package/skills/examples/canvas-design/canvas-fonts/YoungSerif-OFL.txt +93 -0
- package/skills/examples/canvas-design/canvas-fonts/YoungSerif-Regular.ttf +0 -0
- package/skills/examples/copy-editing/SKILL.md +447 -0
- package/skills/examples/copy-editing/evals/evals.json +89 -0
- package/skills/examples/copy-editing/references/plain-english-alternatives.md +394 -0
- package/skills/examples/internal-comms/LICENSE.txt +202 -0
- package/skills/examples/internal-comms/SKILL.md +32 -0
- package/skills/examples/internal-comms/examples/3p-updates.md +47 -0
- package/skills/examples/internal-comms/examples/company-newsletter.md +65 -0
- package/skills/examples/internal-comms/examples/faq-answers.md +30 -0
- package/skills/examples/internal-comms/examples/general-comms.md +16 -0
- package/skills/examples/mcp-builder/SKILL.md +328 -0
- package/skills/examples/mcp-builder/reference/evaluation.md +602 -0
- package/skills/examples/mcp-builder/reference/mcp_best_practices.md +915 -0
- package/skills/examples/mcp-builder/reference/node_mcp_server.md +916 -0
- package/skills/examples/mcp-builder/reference/python_mcp_server.md +752 -0
- package/skills/examples/mcp-builder/scripts/connections.py +151 -0
- package/skills/examples/mcp-builder/scripts/evaluation.py +373 -0
- package/skills/examples/mcp-builder/scripts/example_evaluation.xml +22 -0
- package/skills/examples/mcp-builder/scripts/requirements.txt +2 -0
- package/skills/examples/product-marketing-context/SKILL.md +241 -0
- package/skills/examples/product-marketing-context/evals/evals.json +85 -0
- package/skills/examples/single-cell-rna-qc/SKILL.md +175 -0
- package/skills/examples/single-cell-rna-qc/references/scverse_qc_guidelines.md +186 -0
- package/skills/examples/single-cell-rna-qc/scripts/qc_analysis.py +232 -0
- package/skills/examples/single-cell-rna-qc/scripts/qc_core.py +233 -0
- package/skills/examples/single-cell-rna-qc/scripts/qc_plotting.py +235 -0
- package/skills/examples/skill-creator/SKILL.md +355 -0
- package/skills/examples/skill-creator/references/output-patterns.md +82 -0
- package/skills/examples/skill-creator/references/workflows.md +28 -0
- package/skills/examples/skill-creator/scripts/init_skill.py +303 -0
- package/skills/examples/skill-creator/scripts/package_skill.py +110 -0
- package/skills/examples/skill-creator/scripts/quick_validate.py +95 -0
- package/skills/examples/slack-gif-creator/SKILL.md +254 -0
- package/skills/examples/slack-gif-creator/core/easing.py +234 -0
- package/skills/examples/slack-gif-creator/core/frame_composer.py +176 -0
- package/skills/examples/slack-gif-creator/core/gif_builder.py +269 -0
- package/skills/examples/slack-gif-creator/core/validators.py +136 -0
- package/skills/examples/slack-gif-creator/requirements.txt +4 -0
- package/skills/examples/social-content/SKILL.md +278 -0
- package/skills/examples/social-content/evals/evals.json +92 -0
- package/skills/examples/social-content/references/platforms.md +170 -0
- package/skills/examples/social-content/references/post-templates.md +177 -0
- package/skills/examples/social-content/references/reverse-engineering.md +195 -0
- package/skills/examples/theme-factory/SKILL.md +59 -0
- package/skills/examples/theme-factory/theme-showcase.pdf +0 -0
- package/skills/examples/theme-factory/themes/arctic-frost.md +19 -0
- package/skills/examples/theme-factory/themes/botanical-garden.md +19 -0
- package/skills/examples/theme-factory/themes/desert-rose.md +19 -0
- package/skills/examples/theme-factory/themes/forest-canopy.md +19 -0
- package/skills/examples/theme-factory/themes/golden-hour.md +19 -0
- package/skills/examples/theme-factory/themes/midnight-galaxy.md +19 -0
- package/skills/examples/theme-factory/themes/modern-minimalist.md +19 -0
- package/skills/examples/theme-factory/themes/ocean-depths.md +19 -0
- package/skills/examples/theme-factory/themes/sunset-boulevard.md +19 -0
- package/skills/examples/theme-factory/themes/tech-innovation.md +19 -0
- package/skills/examples/web-artifacts-builder/LICENSE.txt +202 -0
- package/skills/examples/web-artifacts-builder/SKILL.md +74 -0
- package/skills/examples/web-artifacts-builder/scripts/bundle-artifact.sh +54 -0
- package/skills/examples/web-artifacts-builder/scripts/init-artifact.sh +322 -0
- package/skills/examples/web-artifacts-builder/scripts/shadcn-components.tar.gz +0 -0
- package/skills/examples/writing-skills/SKILL.md +655 -0
- package/skills/examples/writing-skills/anthropic-best-practices.md +1150 -0
- package/skills/examples/writing-skills/examples/CLAUDE_MD_TESTING.md +189 -0
- package/skills/examples/writing-skills/graphviz-conventions.dot +172 -0
- package/skills/examples/writing-skills/persuasion-principles.md +187 -0
- package/skills/examples/writing-skills/render-graphs.js +168 -0
- package/skills/examples/writing-skills/testing-skills-with-subagents.md +384 -0
- package/skills/public/describe-image/SKILL.md +105 -0
- package/skills/public/describe-image/scripts/describe.py +389 -0
- package/skills/public/doc-coauthoring/SKILL.md +375 -0
- package/skills/public/docx/LICENSE.txt +30 -0
- package/skills/public/docx/SKILL.md +199 -0
- package/skills/public/docx/docx-js.md +350 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/skills/public/docx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/skills/public/docx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/skills/public/docx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/skills/public/docx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/skills/public/docx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/skills/public/docx/ooxml/schemas/mce/mc.xsd +75 -0
- package/skills/public/docx/ooxml/schemas/microsoft/wml-2010.xsd +560 -0
- package/skills/public/docx/ooxml/schemas/microsoft/wml-2012.xsd +67 -0
- package/skills/public/docx/ooxml/schemas/microsoft/wml-2018.xsd +14 -0
- package/skills/public/docx/ooxml/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/skills/public/docx/ooxml/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/skills/public/docx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/skills/public/docx/ooxml/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/skills/public/docx/ooxml/scripts/pack.py +159 -0
- package/skills/public/docx/ooxml/scripts/unpack.py +29 -0
- package/skills/public/docx/ooxml/scripts/validate.py +69 -0
- package/skills/public/docx/ooxml/scripts/validation/__init__.py +15 -0
- package/skills/public/docx/ooxml/scripts/validation/base.py +951 -0
- package/skills/public/docx/ooxml/scripts/validation/docx.py +274 -0
- package/skills/public/docx/ooxml/scripts/validation/pptx.py +315 -0
- package/skills/public/docx/ooxml/scripts/validation/redlining.py +279 -0
- package/skills/public/docx/ooxml.md +632 -0
- package/skills/public/docx/scripts/__init__.py +1 -0
- package/skills/public/docx/scripts/document.py +1292 -0
- package/skills/public/docx/scripts/templates/comments.xml +3 -0
- package/skills/public/docx/scripts/templates/commentsExtended.xml +3 -0
- package/skills/public/docx/scripts/templates/commentsExtensible.xml +3 -0
- package/skills/public/docx/scripts/templates/commentsIds.xml +3 -0
- package/skills/public/docx/scripts/templates/people.xml +3 -0
- package/skills/public/docx/scripts/utilities.py +374 -0
- package/skills/public/file-reading/LICENSE.txt +30 -0
- package/skills/public/file-reading/SKILL.md +350 -0
- package/skills/public/frontend-design/LICENSE.txt +177 -0
- package/skills/public/frontend-design/SKILL.md +42 -0
- package/skills/public/gitlab-explorer/SKILL.md +174 -0
- package/skills/public/gitlab-explorer/references/git-commands.md +323 -0
- package/skills/public/gitlab-explorer/references/glab-commands.md +282 -0
- package/skills/public/gitlab-explorer/scripts/check_gitlab_auth.sh +109 -0
- package/skills/public/pdf/FORMS.md +205 -0
- package/skills/public/pdf/REFERENCE.md +612 -0
- package/skills/public/pdf/SKILL.md +364 -0
- package/skills/public/pdf/scripts/check_bounding_boxes.py +70 -0
- package/skills/public/pdf/scripts/check_bounding_boxes_test.py +226 -0
- package/skills/public/pdf/scripts/check_fillable_fields.py +12 -0
- package/skills/public/pdf/scripts/convert_pdf_to_images.py +35 -0
- package/skills/public/pdf/scripts/create_validation_image.py +41 -0
- package/skills/public/pdf/scripts/extract_form_field_info.py +152 -0
- package/skills/public/pdf/scripts/fill_fillable_fields.py +114 -0
- package/skills/public/pdf/scripts/fill_pdf_form_with_annotations.py +108 -0
- package/skills/public/pdf-reading/LICENSE.txt +30 -0
- package/skills/public/pdf-reading/REFERENCE.md +196 -0
- package/skills/public/pdf-reading/SKILL.md +305 -0
- package/skills/public/playwright-cli/SKILL.md +278 -0
- package/skills/public/playwright-cli/references/request-mocking.md +87 -0
- package/skills/public/playwright-cli/references/running-code.md +232 -0
- package/skills/public/playwright-cli/references/session-management.md +169 -0
- package/skills/public/playwright-cli/references/storage-state.md +275 -0
- package/skills/public/playwright-cli/references/test-generation.md +88 -0
- package/skills/public/playwright-cli/references/tracing.md +139 -0
- package/skills/public/playwright-cli/references/video-recording.md +43 -0
- package/skills/public/pptx/LICENSE.txt +30 -0
- package/skills/public/pptx/SKILL.md +484 -0
- package/skills/public/pptx/css.md +335 -0
- package/skills/public/pptx/html2pptx.md +893 -0
- package/skills/public/pptx/html2pptx.tgz +0 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/skills/public/pptx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/skills/public/pptx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/skills/public/pptx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/skills/public/pptx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/skills/public/pptx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/skills/public/pptx/ooxml/schemas/mce/mc.xsd +75 -0
- package/skills/public/pptx/ooxml/schemas/microsoft/wml-2010.xsd +560 -0
- package/skills/public/pptx/ooxml/schemas/microsoft/wml-2012.xsd +67 -0
- package/skills/public/pptx/ooxml/schemas/microsoft/wml-2018.xsd +14 -0
- package/skills/public/pptx/ooxml/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/skills/public/pptx/ooxml/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/skills/public/pptx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/skills/public/pptx/ooxml/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/skills/public/pptx/ooxml/scripts/pack.py +159 -0
- package/skills/public/pptx/ooxml/scripts/unpack.py +29 -0
- package/skills/public/pptx/ooxml/scripts/validate.py +69 -0
- package/skills/public/pptx/ooxml/scripts/validation/__init__.py +15 -0
- package/skills/public/pptx/ooxml/scripts/validation/base.py +951 -0
- package/skills/public/pptx/ooxml/scripts/validation/docx.py +274 -0
- package/skills/public/pptx/ooxml/scripts/validation/pptx.py +315 -0
- package/skills/public/pptx/ooxml/scripts/validation/redlining.py +279 -0
- package/skills/public/pptx/ooxml.md +427 -0
- package/skills/public/pptx/scripts/inventory.py +1020 -0
- package/skills/public/pptx/scripts/rearrange.py +231 -0
- package/skills/public/pptx/scripts/replace.py +385 -0
- package/skills/public/pptx/scripts/thumbnail.py +450 -0
- package/skills/public/skill-creator/SKILL.md +356 -0
- package/skills/public/skill-creator/references/output-patterns.md +82 -0
- package/skills/public/skill-creator/references/workflows.md +28 -0
- package/skills/public/skill-creator/scripts/init_skill.py +303 -0
- package/skills/public/skill-creator/scripts/package_skill.py +110 -0
- package/skills/public/skill-creator/scripts/quick_validate.py +95 -0
- package/skills/public/sub-agent/SKILL.md +186 -0
- package/skills/public/sub-agent/references/security-review.md +153 -0
- package/skills/public/sub-agent/references/usage.md +207 -0
- package/skills/public/sub-agent/scripts/list_subagent_models.sh +22 -0
- package/skills/public/test-driven-development/SKILL.md +371 -0
- package/skills/public/test-driven-development/testing-anti-patterns.md +299 -0
- package/skills/public/webapp-testing/LICENSE.txt +202 -0
- package/skills/public/webapp-testing/SKILL.md +96 -0
- package/skills/public/webapp-testing/examples/console_logging.py +35 -0
- package/skills/public/webapp-testing/examples/element_discovery.py +40 -0
- package/skills/public/webapp-testing/examples/static_html_automation.py +33 -0
- package/skills/public/webapp-testing/scripts/with_server.py +106 -0
- package/skills/public/xlsx/LICENSE.txt +30 -0
- package/skills/public/xlsx/SKILL.md +316 -0
- package/skills/public/xlsx/preview_data.py +93 -0
- package/skills/public/xlsx/recalc.py +178 -0
- package/tests/README.md +42 -0
- package/tests/fixtures/cli/claude_v0.9.2.0_argv.json +46 -0
- package/tests/fixtures/cli/claude_v0.9.2.0_stdout.json +32 -0
- package/tests/fixtures/cli/codex_run.jsonl +4 -0
- package/tests/fixtures/cli/opencode_run.jsonl +6 -0
- package/tests/integration/README.md +56 -0
- package/tests/integration/conftest.py +280 -0
- package/tests/integration/pytest.ini +13 -0
- package/tests/integration/test_mcp_auth.py +85 -0
- package/tests/integration/test_mcp_tools.py +101 -0
- package/tests/integration/test_workspace_lifecycle.py +125 -0
- package/tests/orchestrator/mock_llm_server.py +343 -0
- package/tests/orchestrator/test_cli_adapters.py +566 -0
- package/tests/orchestrator/test_cli_adapters_live.py +527 -0
- package/tests/orchestrator/test_cli_runtime.py +451 -0
- package/tests/orchestrator/test_docker_manager.py +302 -0
- package/tests/orchestrator/test_dynamic_instructions.py +69 -0
- package/tests/orchestrator/test_mcp_resources.py +140 -0
- package/tests/orchestrator/test_mcp_tools.py +224 -0
- package/tests/orchestrator/test_passthrough_isolation.py +201 -0
- package/tests/orchestrator/test_readme_in_container.py +76 -0
- package/tests/orchestrator/test_render_cache.py +84 -0
- package/tests/orchestrator/test_runtime_cli_endpoint.py +108 -0
- package/tests/orchestrator/test_single_user_mode.py +212 -0
- package/tests/orchestrator/test_startup_warnings.py +123 -0
- package/tests/orchestrator/test_sub_agent_dispatch.py +327 -0
- package/tests/orchestrator/test_subagent_claude_compat.py +367 -0
- package/tests/orchestrator/test_system_prompt_endpoint.py +191 -0
- package/tests/orchestrator/test_tool_descriptions.py +52 -0
- package/tests/orchestrator/test_view_image.py +201 -0
- package/tests/patches/conftest.py +30 -0
- package/tests/patches/fixtures/__init__.py +10 -0
- package/tests/patches/fixtures/middleware_v0.9.1.py +5057 -0
- package/tests/patches/fixtures/middleware_v0.9.2.py +5120 -0
- package/tests/patches/fixtures/retrieval_v0.9.1.py +2684 -0
- package/tests/patches/fixtures/retrieval_v0.9.2.py +2700 -0
- package/tests/patches/test_fix_attached_files_position.py +118 -0
- package/tests/patches/test_fix_large_tool_args.py +130 -0
- package/tests/patches/test_fix_large_tool_results.py +531 -0
- package/tests/patches/test_fix_skip_embedding_chat_files.py +160 -0
- package/tests/patches/test_fix_skip_rag_files_native_fc.py +120 -0
- package/tests/patches/test_fix_tool_loop_errors.py +128 -0
- package/tests/security/test_path_traversal_app.py +132 -0
- package/tests/security/test_path_traversal_docker.py +36 -0
- package/tests/security/test_path_traversal_settings.py +87 -0
- package/tests/security/test_safe_path_util.py +166 -0
- package/tests/security/test_xss_preview.py +46 -0
- package/tests/test-default-model-resolution.py +136 -0
- package/tests/test-docker-image.sh +358 -0
- package/tests/test-list-subagent-models.sh +421 -0
- package/tests/test-mcp-endpoint-live.sh +92 -0
- package/tests/test-mcp-native-surface.sh +213 -0
- package/tests/test-no-cyrillic.sh +135 -0
- package/tests/test-opencode-error-mapping.py +130 -0
- package/tests/test-pr88-skills.sh +305 -0
- package/tests/test-project-structure.sh +202 -0
- package/tests/test-single-user-mode.sh +269 -0
- package/tests/test-skill-no-hardcoded-models.sh +65 -0
- package/tests/test-subagent-cli-surface.py +137 -0
- package/tests/test-subagent-runtime.sh +109 -0
- package/tests/test_codex_toml_converter.py +204 -0
- package/tests/test_default_resolver_no_legacy_global.py +159 -0
- package/tests/test_filter.py +648 -0
- package/tests/test_init_sh_unchanged.sh +49 -0
- package/tests/test_opencode_alias_map_drop.py +144 -0
- package/tests/test_requirements.py +91 -0
- package/tests/test_subagent_docstring.py +193 -0
- package/tests/test_tools.py +34 -0
- package/vendor/extract-text/README.md +46 -0
- package/vendor/extract-text/extract-text +0 -0
|
@@ -0,0 +1,2684 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import mimetypes
|
|
4
|
+
import os
|
|
5
|
+
import shutil
|
|
6
|
+
import asyncio
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
import uuid
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Iterator, List, Optional, Sequence, Union
|
|
13
|
+
|
|
14
|
+
from fastapi import (
|
|
15
|
+
Depends,
|
|
16
|
+
FastAPI,
|
|
17
|
+
Query,
|
|
18
|
+
File,
|
|
19
|
+
Form,
|
|
20
|
+
HTTPException,
|
|
21
|
+
UploadFile,
|
|
22
|
+
Request,
|
|
23
|
+
status,
|
|
24
|
+
APIRouter,
|
|
25
|
+
)
|
|
26
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
27
|
+
from fastapi.concurrency import run_in_threadpool
|
|
28
|
+
from pydantic import BaseModel
|
|
29
|
+
import tiktoken
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
from langchain_text_splitters import (
|
|
33
|
+
RecursiveCharacterTextSplitter,
|
|
34
|
+
TokenTextSplitter,
|
|
35
|
+
MarkdownHeaderTextSplitter,
|
|
36
|
+
)
|
|
37
|
+
from langchain_core.documents import Document
|
|
38
|
+
|
|
39
|
+
from open_webui.models.files import FileModel, FileUpdateForm, Files
|
|
40
|
+
from open_webui.utils.access_control.files import has_access_to_file
|
|
41
|
+
from open_webui.models.knowledge import Knowledges
|
|
42
|
+
from open_webui.storage.provider import Storage
|
|
43
|
+
from open_webui.internal.db import get_async_db, get_async_session
|
|
44
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
from open_webui.retrieval.vector.factory import VECTOR_DB_CLIENT
|
|
48
|
+
from open_webui.retrieval.vector.async_client import ASYNC_VECTOR_DB_CLIENT
|
|
49
|
+
|
|
50
|
+
# Document loaders
|
|
51
|
+
|
|
52
|
+
from open_webui.retrieval.loaders.youtube import YoutubeLoader
|
|
53
|
+
|
|
54
|
+
# Web search engines
|
|
55
|
+
from open_webui.retrieval.web.main import SearchResult
|
|
56
|
+
from open_webui.retrieval.web.utils import get_web_loader
|
|
57
|
+
from open_webui.retrieval.web.ollama import search_ollama_cloud
|
|
58
|
+
from open_webui.retrieval.web.perplexity_search import search_perplexity_search
|
|
59
|
+
from open_webui.retrieval.web.brave import search_brave
|
|
60
|
+
from open_webui.retrieval.web.kagi import search_kagi
|
|
61
|
+
from open_webui.retrieval.web.mojeek import search_mojeek
|
|
62
|
+
from open_webui.retrieval.web.bocha import search_bocha
|
|
63
|
+
from open_webui.retrieval.web.duckduckgo import search_duckduckgo
|
|
64
|
+
from open_webui.retrieval.web.google_pse import search_google_pse
|
|
65
|
+
from open_webui.retrieval.web.jina_search import search_jina
|
|
66
|
+
from open_webui.retrieval.web.searchapi import search_searchapi
|
|
67
|
+
from open_webui.retrieval.web.serpapi import search_serpapi
|
|
68
|
+
from open_webui.retrieval.web.searxng import search_searxng
|
|
69
|
+
from open_webui.retrieval.web.yacy import search_yacy
|
|
70
|
+
from open_webui.retrieval.web.serper import search_serper
|
|
71
|
+
from open_webui.retrieval.web.serply import search_serply
|
|
72
|
+
from open_webui.retrieval.web.serpstack import search_serpstack
|
|
73
|
+
from open_webui.retrieval.web.tavily import search_tavily
|
|
74
|
+
from open_webui.retrieval.web.bing import search_bing
|
|
75
|
+
from open_webui.retrieval.web.azure import search_azure
|
|
76
|
+
from open_webui.retrieval.web.exa import search_exa
|
|
77
|
+
from open_webui.retrieval.web.perplexity import search_perplexity
|
|
78
|
+
from open_webui.retrieval.web.sougou import search_sougou
|
|
79
|
+
from open_webui.retrieval.web.firecrawl import search_firecrawl
|
|
80
|
+
from open_webui.retrieval.web.external import search_external
|
|
81
|
+
from open_webui.retrieval.web.yandex import search_yandex
|
|
82
|
+
from open_webui.retrieval.web.ydc import search_youcom
|
|
83
|
+
|
|
84
|
+
from open_webui.retrieval.utils import (
|
|
85
|
+
build_loader_from_config,
|
|
86
|
+
filter_accessible_collections,
|
|
87
|
+
get_content_from_url,
|
|
88
|
+
get_embedding_function,
|
|
89
|
+
get_reranking_function,
|
|
90
|
+
get_model_path,
|
|
91
|
+
query_collection,
|
|
92
|
+
query_collection_with_hybrid_search,
|
|
93
|
+
query_doc,
|
|
94
|
+
query_doc_with_hybrid_search,
|
|
95
|
+
)
|
|
96
|
+
from open_webui.retrieval.vector.utils import filter_metadata
|
|
97
|
+
from open_webui.utils.misc import (
|
|
98
|
+
calculate_sha256_string,
|
|
99
|
+
sanitize_text_for_db,
|
|
100
|
+
)
|
|
101
|
+
from open_webui.utils.auth import get_admin_user, get_verified_user
|
|
102
|
+
from open_webui.utils.access_control import has_permission
|
|
103
|
+
|
|
104
|
+
from open_webui.config import (
|
|
105
|
+
ENV,
|
|
106
|
+
RAG_EMBEDDING_MODEL_AUTO_UPDATE,
|
|
107
|
+
RAG_EMBEDDING_MODEL_TRUST_REMOTE_CODE,
|
|
108
|
+
RAG_RERANKING_MODEL_AUTO_UPDATE,
|
|
109
|
+
RAG_RERANKING_MODEL_TRUST_REMOTE_CODE,
|
|
110
|
+
UPLOAD_DIR,
|
|
111
|
+
DEFAULT_LOCALE,
|
|
112
|
+
RAG_EMBEDDING_CONTENT_PREFIX,
|
|
113
|
+
RAG_EMBEDDING_QUERY_PREFIX,
|
|
114
|
+
)
|
|
115
|
+
from open_webui.env import (
|
|
116
|
+
DEVICE_TYPE,
|
|
117
|
+
DOCKER,
|
|
118
|
+
RAG_EMBEDDING_TIMEOUT,
|
|
119
|
+
SENTENCE_TRANSFORMERS_BACKEND,
|
|
120
|
+
SENTENCE_TRANSFORMERS_MODEL_KWARGS,
|
|
121
|
+
SENTENCE_TRANSFORMERS_CROSS_ENCODER_BACKEND,
|
|
122
|
+
SENTENCE_TRANSFORMERS_CROSS_ENCODER_MODEL_KWARGS,
|
|
123
|
+
SENTENCE_TRANSFORMERS_CROSS_ENCODER_SIGMOID_ACTIVATION_FUNCTION,
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
from open_webui.constants import ERROR_MESSAGES
|
|
127
|
+
|
|
128
|
+
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
|
|
129
|
+
|
|
130
|
+
##########################################
|
|
131
|
+
#
|
|
132
|
+
# Utility functions
|
|
133
|
+
# Give us this day our relevant chunks, and lead us
|
|
134
|
+
# not into hallucination, but deliver us from noise.
|
|
135
|
+
#
|
|
136
|
+
##########################################
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def get_ef(
    engine: str,
    embedding_model: str,
    auto_update: bool = RAG_EMBEDDING_MODEL_AUTO_UPDATE,
):
    """Load the local SentenceTransformer embedding model, if one applies.

    A model is only loaded when ``embedding_model`` is truthy and ``engine``
    is the empty string; every other combination yields ``None``.

    Args:
        engine: Embedding engine name; only ``''`` triggers a local load.
        embedding_model: Model identifier, resolved through ``get_model_path``.
        auto_update: Forwarded to ``get_model_path``.

    Returns:
        The ``SentenceTransformer`` instance, or ``None`` when no local model
        applies or when constructing it raised (the error is logged and
        swallowed, not propagated).
    """
    if not embedding_model or engine != '':
        return None

    # Imported lazily, and outside the try: an ImportError still propagates.
    from sentence_transformers import SentenceTransformer

    try:
        return SentenceTransformer(
            get_model_path(embedding_model, auto_update),
            device=DEVICE_TYPE,
            trust_remote_code=RAG_EMBEDDING_MODEL_TRUST_REMOTE_CODE,
            backend=SENTENCE_TRANSFORMERS_BACKEND,
            model_kwargs=SENTENCE_TRANSFORMERS_MODEL_KWARGS,
        )
    except Exception as e:
        log.error(f'Error loading SentenceTransformer: {e}')
        return None
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def get_rf(
    engine: str = '',
    reranking_model: Optional[str] = None,
    external_reranker_url: str = '',
    external_reranker_api_key: str = '',
    external_reranker_timeout: str = '',
    auto_update: bool = RAG_RERANKING_MODEL_AUTO_UPDATE,
):
    """Build the reranking function for the configured reranking model.

    Selection order when ``reranking_model`` is set:
      1. model name contains 'jinaai/jina-colbert-v2' -> ``ColBERT``
      2. ``engine == 'external'``                     -> ``ExternalReranker``
      3. anything else                                -> ``sentence_transformers.CrossEncoder``

    Args:
        engine: Reranking engine name; 'external' selects the external reranker.
        reranking_model: Model identifier; falsy means "no reranker".
        external_reranker_url: URL handed to ``ExternalReranker``.
        external_reranker_api_key: API key handed to ``ExternalReranker``.
        external_reranker_timeout: Timeout as a string; empty means ``None``.
        auto_update: Forwarded to ``get_model_path``.

    Returns:
        The reranker instance, or ``None`` when ``reranking_model`` is falsy.

    Raises:
        ValueError: If ``external_reranker_timeout`` is non-empty but not an
            integer literal (raised before any branch is chosen).
        Exception: If constructing the selected reranker fails; the original
            error is logged and a generic ``Exception`` is raised instead.
    """
    # Convert timeout string to int or None (system default). This runs before
    # the guards below, so a malformed timeout raises on every code path.
    timeout_value = None
    if external_reranker_timeout:
        timeout_value = int(external_reranker_timeout)

    if not reranking_model:
        return None

    colbert_model_names = ['jinaai/jina-colbert-v2']
    if any(name in reranking_model for name in colbert_model_names):
        try:
            from open_webui.retrieval.models.colbert import ColBERT

            return ColBERT(
                get_model_path(reranking_model, auto_update),
                env='docker' if DOCKER else None,
            )
        except Exception as e:
            log.error(f'ColBERT: {e}')
            raise Exception(ERROR_MESSAGES.DEFAULT(e))

    if engine == 'external':
        try:
            from open_webui.retrieval.models.external import ExternalReranker

            return ExternalReranker(
                url=external_reranker_url,
                api_key=external_reranker_api_key,
                model=reranking_model,
                timeout=timeout_value,
            )
        except Exception as e:
            log.error(f'ExternalReranking: {e}')
            raise Exception(ERROR_MESSAGES.DEFAULT(e))

    # Local cross-encoder path. These imports sit outside the try, so a
    # missing package surfaces as ImportError rather than the generic error.
    import sentence_transformers
    import torch

    try:
        rf = sentence_transformers.CrossEncoder(
            get_model_path(reranking_model, auto_update),
            device=DEVICE_TYPE,
            trust_remote_code=RAG_RERANKING_MODEL_TRUST_REMOTE_CODE,
            backend=SENTENCE_TRANSFORMERS_CROSS_ENCODER_BACKEND,
            model_kwargs=SENTENCE_TRANSFORMERS_CROSS_ENCODER_MODEL_KWARGS,
            activation_fn=(
                torch.nn.Sigmoid()
                if SENTENCE_TRANSFORMERS_CROSS_ENCODER_SIGMOID_ACTIVATION_FUNCTION
                else None
            ),
        )
    except Exception as e:
        log.error(f'CrossEncoder: {e}')
        raise Exception(ERROR_MESSAGES.DEFAULT('CrossEncoder error'))

    # Safely adjust pad_token_id if missing, as some models do not have this
    # in their config. Best effort: any failure here is only logged.
    try:
        inner_model = getattr(rf, 'model', None)
        if inner_model and hasattr(inner_model, 'config'):
            model_config = inner_model.config
            if getattr(model_config, 'pad_token_id', None) is None:
                # Fall back to eos_token_id when available
                eos = getattr(model_config, 'eos_token_id', None)
                if eos is None:
                    log.warning('Neither pad_token_id nor eos_token_id present in model config')
                else:
                    model_config.pad_token_id = eos
                    log.debug(f'Missing pad_token_id detected; set to eos_token_id={eos}')
    except Exception as e2:
        log.warning(f'Failed to adjust pad_token_id on CrossEncoder: {e2}')

    return rf
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
##########################################
|
|
241
|
+
#
|
|
242
|
+
# API routes
|
|
243
|
+
#
|
|
244
|
+
##########################################
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
# Router on which this module's endpoints are registered via @router.get / @router.post.
router = APIRouter()
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
# Request body carrying an optional target collection name.
class CollectionNameForm(BaseModel):
    # May be omitted; defaults to None.
    collection_name: Optional[str] = None
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
# Request body for URL processing: a required URL plus the optional
# collection_name inherited from CollectionNameForm.
class ProcessUrlForm(CollectionNameForm):
    # Required.
    url: str
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
# Request body holding one or more search query strings.
class SearchForm(BaseModel):
    # Required list of query strings.
    queries: List[str]
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
@router.get('/')
async def get_status(request: Request):
    # Returns a constant status flag together with the current chunking,
    # template and embedding settings read from the application config.
    # NOTE(review): unlike the '/embedding' route, this handler declares no
    # auth dependency, so RAG_TEMPLATE is readable by any caller -- confirm
    # that is intended.
    settings = request.app.state.config
    exposed_settings = (
        'CHUNK_SIZE',
        'CHUNK_OVERLAP',
        'RAG_TEMPLATE',
        'RAG_EMBEDDING_ENGINE',
        'RAG_EMBEDDING_MODEL',
        'RAG_RERANKING_MODEL',
        'RAG_EMBEDDING_BATCH_SIZE',
        'ENABLE_ASYNC_EMBEDDING',
        'RAG_EMBEDDING_CONCURRENT_REQUESTS',
    )

    # 'status' goes first so the key order of the response is unchanged.
    payload = {'status': True}
    for setting_name in exposed_settings:
        payload[setting_name] = getattr(settings, setting_name)
    return payload
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
@router.get('/embedding')
async def get_embedding_config(request: Request, user=Depends(get_admin_user)):
    # Returns the current embedding configuration, including the URL and API
    # key of each provider. The handler depends on get_admin_user.
    cfg = request.app.state.config

    openai_config = {
        'url': cfg.RAG_OPENAI_API_BASE_URL,
        'key': cfg.RAG_OPENAI_API_KEY,
    }
    ollama_config = {
        'url': cfg.RAG_OLLAMA_BASE_URL,
        'key': cfg.RAG_OLLAMA_API_KEY,
    }
    azure_openai_config = {
        'url': cfg.RAG_AZURE_OPENAI_BASE_URL,
        'key': cfg.RAG_AZURE_OPENAI_API_KEY,
        'version': cfg.RAG_AZURE_OPENAI_API_VERSION,
    }

    return {
        'status': True,
        'RAG_EMBEDDING_ENGINE': cfg.RAG_EMBEDDING_ENGINE,
        'RAG_EMBEDDING_MODEL': cfg.RAG_EMBEDDING_MODEL,
        'RAG_EMBEDDING_BATCH_SIZE': cfg.RAG_EMBEDDING_BATCH_SIZE,
        'ENABLE_ASYNC_EMBEDDING': cfg.ENABLE_ASYNC_EMBEDDING,
        'RAG_EMBEDDING_CONCURRENT_REQUESTS': cfg.RAG_EMBEDDING_CONCURRENT_REQUESTS,
        'openai_config': openai_config,
        'ollama_config': ollama_config,
        'azure_openai_config': azure_openai_config,
    }
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
class OpenAIConfigForm(BaseModel):
    # Connection details for the OpenAI embedding provider.
    # Neither field declares a default.
    url: str  # stored as RAG_OPENAI_API_BASE_URL by the embedding update route
    key: str  # stored as RAG_OPENAI_API_KEY by the embedding update route
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
class OllamaConfigForm(BaseModel):
    # Connection details for the Ollama embedding provider.
    # Neither field declares a default.
    url: str  # stored as RAG_OLLAMA_BASE_URL by the embedding update route
    key: str  # stored as RAG_OLLAMA_API_KEY by the embedding update route
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
class AzureOpenAIConfigForm(BaseModel):
    # Connection details for the Azure OpenAI embedding provider.
    # No field declares a default.
    url: str  # stored as RAG_AZURE_OPENAI_BASE_URL by the embedding update route
    key: str  # stored as RAG_AZURE_OPENAI_API_KEY by the embedding update route
    version: str  # stored as RAG_AZURE_OPENAI_API_VERSION by the embedding update route
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
class EmbeddingModelUpdateForm(BaseModel):
    # Request body for POST /embedding/update.

    # Optional per-provider connection blocks; a block left as None keeps the
    # provider settings that are already stored.
    openai_config: Optional[OpenAIConfigForm] = None
    ollama_config: Optional[OllamaConfigForm] = None
    azure_openai_config: Optional[AzureOpenAIConfigForm] = None

    # Engine and model have no default.
    RAG_EMBEDDING_ENGINE: str
    RAG_EMBEDDING_MODEL: str

    # Batching / concurrency options with their fallback values.
    RAG_EMBEDDING_BATCH_SIZE: Optional[int] = 1
    ENABLE_ASYNC_EMBEDDING: Optional[bool] = True
    RAG_EMBEDDING_CONCURRENT_REQUESTS: Optional[int] = 0
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def unload_embedding_model(request: Request):
    """Drop the internal embedding model and release the memory it held.

    Acts only when ``RAG_EMBEDDING_ENGINE`` is the empty string; for any other
    engine value the function returns without touching application state.

    Args:
        request: Incoming request; its ``app.state`` holds the model handles.
    """
    if request.app.state.config.RAG_EMBEDDING_ENGINE != '':
        return

    # Remove the references first so the collector can actually reclaim the model.
    request.app.state.ef = None
    request.app.state.EMBEDDING_FUNCTION = None

    import gc

    gc.collect()

    if DEVICE_TYPE != 'cuda':
        return

    # torch is imported lazily, only on the CUDA path.
    import torch

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
@router.post('/embedding/update')
async def update_embedding_config(request: Request, form_data: EmbeddingModelUpdateForm, user=Depends(get_admin_user)):
    """Store new embedding settings and rebuild the embedding function from them.

    The current model is unloaded first, then the submitted values are written to
    ``request.app.state.config`` and ``app.state.ef`` / ``app.state.EMBEDDING_FUNCTION``
    are recreated.

    Args:
        request: Incoming request giving access to ``app.state``.
        form_data: New engine/model/batching options and optional provider blocks.
        user: Unused in the body; attaches the ``get_admin_user`` dependency.

    Returns:
        The same structure that GET /embedding produces, reflecting the stored values.

    Raises:
        HTTPException: 500 with ``ERROR_MESSAGES.DEFAULT(e)`` if any step after the
            unload fails.
    """
    log.info(
        f'Updating embedding model: {request.app.state.config.RAG_EMBEDDING_MODEL} to {form_data.RAG_EMBEDDING_MODEL}'
    )
    # Called before any setting is overwritten, so it sees the engine that is still configured.
    unload_embedding_model(request)

    try:
        cfg = request.app.state.config

        cfg.RAG_EMBEDDING_ENGINE = form_data.RAG_EMBEDDING_ENGINE
        cfg.RAG_EMBEDDING_MODEL = form_data.RAG_EMBEDDING_MODEL
        cfg.RAG_EMBEDDING_BATCH_SIZE = form_data.RAG_EMBEDDING_BATCH_SIZE
        cfg.ENABLE_ASYNC_EMBEDDING = form_data.ENABLE_ASYNC_EMBEDDING
        cfg.RAG_EMBEDDING_CONCURRENT_REQUESTS = form_data.RAG_EMBEDDING_CONCURRENT_REQUESTS

        engine = cfg.RAG_EMBEDDING_ENGINE

        # Provider blocks are only applied when a remote engine is selected.
        if engine in ('ollama', 'openai', 'azure_openai'):
            openai_form = form_data.openai_config
            if openai_form is not None:
                cfg.RAG_OPENAI_API_BASE_URL = openai_form.url
                cfg.RAG_OPENAI_API_KEY = openai_form.key

            ollama_form = form_data.ollama_config
            if ollama_form is not None:
                cfg.RAG_OLLAMA_BASE_URL = ollama_form.url
                cfg.RAG_OLLAMA_API_KEY = ollama_form.key

            azure_form = form_data.azure_openai_config
            if azure_form is not None:
                cfg.RAG_AZURE_OPENAI_BASE_URL = azure_form.url
                cfg.RAG_AZURE_OPENAI_API_KEY = azure_form.key
                cfg.RAG_AZURE_OPENAI_API_VERSION = azure_form.version

        request.app.state.ef = get_ef(engine, cfg.RAG_EMBEDDING_MODEL)

        # Pick the endpoint and credential for the selected engine; every engine
        # other than 'openai' and 'ollama' falls through to the Azure values.
        if engine == 'openai':
            api_url = cfg.RAG_OPENAI_API_BASE_URL
            api_key = cfg.RAG_OPENAI_API_KEY
        elif engine == 'ollama':
            api_url = cfg.RAG_OLLAMA_BASE_URL
            api_key = cfg.RAG_OLLAMA_API_KEY
        else:
            api_url = cfg.RAG_AZURE_OPENAI_BASE_URL
            api_key = cfg.RAG_AZURE_OPENAI_API_KEY

        azure_api_version = None
        if engine == 'azure_openai':
            azure_api_version = cfg.RAG_AZURE_OPENAI_API_VERSION

        request.app.state.EMBEDDING_FUNCTION = get_embedding_function(
            engine,
            cfg.RAG_EMBEDDING_MODEL,
            request.app.state.ef,
            api_url,
            api_key,
            cfg.RAG_EMBEDDING_BATCH_SIZE,
            azure_api_version=azure_api_version,
            enable_async=cfg.ENABLE_ASYNC_EMBEDDING,
            concurrent_requests=cfg.RAG_EMBEDDING_CONCURRENT_REQUESTS,
        )

        # Echo back what is now stored, in the same layout as GET /embedding.
        response = {'status': True}
        response.update(
            (key, getattr(cfg, key))
            for key in (
                'RAG_EMBEDDING_ENGINE',
                'RAG_EMBEDDING_MODEL',
                'RAG_EMBEDDING_BATCH_SIZE',
                'ENABLE_ASYNC_EMBEDDING',
                'RAG_EMBEDDING_CONCURRENT_REQUESTS',
            )
        )
        response['openai_config'] = {
            'url': cfg.RAG_OPENAI_API_BASE_URL,
            'key': cfg.RAG_OPENAI_API_KEY,
        }
        response['ollama_config'] = {
            'url': cfg.RAG_OLLAMA_BASE_URL,
            'key': cfg.RAG_OLLAMA_API_KEY,
        }
        response['azure_openai_config'] = {
            'url': cfg.RAG_AZURE_OPENAI_BASE_URL,
            'key': cfg.RAG_AZURE_OPENAI_API_KEY,
            'version': cfg.RAG_AZURE_OPENAI_API_VERSION,
        }
        return response
    except Exception as e:
        log.exception(f'Problem updating embedding model: {e}')
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
@router.get('/config')
async def get_rag_config(request: Request, user=Depends(get_admin_user)):
    """Return the full RAG configuration, with web-search settings nested under ``web``.

    Each response key is named after the attribute it is read from. All values come
    from ``request.app.state.config`` except ``YOUTUBE_LOADER_TRANSLATION``, which is
    read from ``request.app.state`` directly. ``user`` is unused in the body; it
    attaches the ``get_admin_user`` dependency.
    """
    cfg = request.app.state.config

    top_level_keys = (
        # RAG settings
        'RAG_TEMPLATE',
        'TOP_K',
        'BYPASS_EMBEDDING_AND_RETRIEVAL',
        'RAG_FULL_CONTEXT',
        # Hybrid search settings
        'ENABLE_RAG_HYBRID_SEARCH',
        'ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS',
        'TOP_K_RERANKER',
        'RELEVANCE_THRESHOLD',
        'HYBRID_BM25_WEIGHT',
        # Content extraction settings
        'CONTENT_EXTRACTION_ENGINE',
        'PDF_EXTRACT_IMAGES',
        'PDF_LOADER_MODE',
        'DATALAB_MARKER_API_KEY',
        'DATALAB_MARKER_API_BASE_URL',
        'DATALAB_MARKER_ADDITIONAL_CONFIG',
        'DATALAB_MARKER_SKIP_CACHE',
        'DATALAB_MARKER_FORCE_OCR',
        'DATALAB_MARKER_PAGINATE',
        'DATALAB_MARKER_STRIP_EXISTING_OCR',
        'DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION',
        'DATALAB_MARKER_FORMAT_LINES',
        'DATALAB_MARKER_USE_LLM',
        'DATALAB_MARKER_OUTPUT_FORMAT',
        'EXTERNAL_DOCUMENT_LOADER_URL',
        'EXTERNAL_DOCUMENT_LOADER_API_KEY',
        'TIKA_SERVER_URL',
        'DOCLING_SERVER_URL',
        'DOCLING_API_KEY',
        'DOCLING_PARAMS',
        'DOCUMENT_INTELLIGENCE_ENDPOINT',
        'DOCUMENT_INTELLIGENCE_KEY',
        'DOCUMENT_INTELLIGENCE_MODEL',
        'MISTRAL_OCR_API_BASE_URL',
        'MISTRAL_OCR_API_KEY',
        # MinerU settings
        'MINERU_API_MODE',
        'MINERU_API_URL',
        'MINERU_API_KEY',
        'MINERU_API_TIMEOUT',
        'MINERU_PARAMS',
        # Reranking settings
        'RAG_RERANKING_MODEL',
        'RAG_RERANKING_ENGINE',
        'RAG_RERANKING_BATCH_SIZE',
        'RAG_EXTERNAL_RERANKER_URL',
        'RAG_EXTERNAL_RERANKER_API_KEY',
        'RAG_EXTERNAL_RERANKER_TIMEOUT',
        # Chunking settings
        'TEXT_SPLITTER',
        'ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER',
        'CHUNK_SIZE',
        'CHUNK_MIN_SIZE_TARGET',
        'CHUNK_OVERLAP',
        # File upload settings
        'FILE_MAX_SIZE',
        'FILE_MAX_COUNT',
        'FILE_IMAGE_COMPRESSION_WIDTH',
        'FILE_IMAGE_COMPRESSION_HEIGHT',
        'ALLOWED_FILE_EXTENSIONS',
        # Integration settings
        'ENABLE_GOOGLE_DRIVE_INTEGRATION',
        'ENABLE_ONEDRIVE_INTEGRATION',
    )

    # Web settings are split around YOUTUBE_LOADER_TRANSLATION so that the nested
    # dict keeps its established key order.
    web_keys_head = (
        'ENABLE_WEB_SEARCH',
        'WEB_SEARCH_ENGINE',
        'WEB_SEARCH_TRUST_ENV',
        'WEB_SEARCH_RESULT_COUNT',
        'WEB_SEARCH_CONCURRENT_REQUESTS',
        'WEB_FETCH_MAX_CONTENT_LENGTH',
        'WEB_LOADER_CONCURRENT_REQUESTS',
        'WEB_SEARCH_DOMAIN_FILTER_LIST',
        'BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL',
        'BYPASS_WEB_SEARCH_WEB_LOADER',
        'OLLAMA_CLOUD_WEB_SEARCH_API_KEY',
        'SEARXNG_QUERY_URL',
        'SEARXNG_LANGUAGE',
        'YACY_QUERY_URL',
        'YACY_USERNAME',
        'YACY_PASSWORD',
        'GOOGLE_PSE_API_KEY',
        'GOOGLE_PSE_ENGINE_ID',
        'BRAVE_SEARCH_API_KEY',
        'KAGI_SEARCH_API_KEY',
        'MOJEEK_SEARCH_API_KEY',
        'BOCHA_SEARCH_API_KEY',
        'SERPSTACK_API_KEY',
        'SERPSTACK_HTTPS',
        'SERPER_API_KEY',
        'SERPLY_API_KEY',
        'DDGS_BACKEND',
        'TAVILY_API_KEY',
        'SEARCHAPI_API_KEY',
        'SEARCHAPI_ENGINE',
        'SERPAPI_API_KEY',
        'SERPAPI_ENGINE',
        'JINA_API_KEY',
        'JINA_API_BASE_URL',
        'BING_SEARCH_V7_ENDPOINT',
        'BING_SEARCH_V7_SUBSCRIPTION_KEY',
        'EXA_API_KEY',
        'PERPLEXITY_API_KEY',
        'PERPLEXITY_MODEL',
        'PERPLEXITY_SEARCH_CONTEXT_USAGE',
        'PERPLEXITY_SEARCH_API_URL',
        'SOUGOU_API_SID',
        'SOUGOU_API_SK',
        'WEB_LOADER_ENGINE',
        'WEB_LOADER_TIMEOUT',
        'ENABLE_WEB_LOADER_SSL_VERIFICATION',
        'PLAYWRIGHT_WS_URL',
        'PLAYWRIGHT_TIMEOUT',
        'FIRECRAWL_API_KEY',
        'FIRECRAWL_API_BASE_URL',
        'FIRECRAWL_TIMEOUT',
        'TAVILY_EXTRACT_DEPTH',
        'EXTERNAL_WEB_SEARCH_URL',
        'EXTERNAL_WEB_SEARCH_API_KEY',
        'EXTERNAL_WEB_LOADER_URL',
        'EXTERNAL_WEB_LOADER_API_KEY',
        'YOUTUBE_LOADER_LANGUAGE',
        'YOUTUBE_LOADER_PROXY_URL',
    )
    web_keys_tail = (
        'YANDEX_WEB_SEARCH_URL',
        'YANDEX_WEB_SEARCH_API_KEY',
        'YANDEX_WEB_SEARCH_CONFIG',
        'YOUCOM_API_KEY',
    )

    response = {'status': True}
    response.update((key, getattr(cfg, key)) for key in top_level_keys)

    web = {key: getattr(cfg, key) for key in web_keys_head}
    # NOTE(review): this one value lives on app.state itself, not on app.state.config.
    # Kept as-is; confirm that is intentional before moving it.
    web['YOUTUBE_LOADER_TRANSLATION'] = request.app.state.YOUTUBE_LOADER_TRANSLATION
    web.update((key, getattr(cfg, key)) for key in web_keys_tail)

    # Web search settings
    response['web'] = web
    return response
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
class WebConfig(BaseModel):
    # Web-search portion of the RAG config update payload (the ``web`` field of
    # ConfigForm). Every field is optional; all default to None except
    # WEB_SEARCH_DOMAIN_FILTER_LIST, which defaults to an empty list.

    # General web search behaviour
    ENABLE_WEB_SEARCH: Optional[bool] = None
    WEB_SEARCH_ENGINE: Optional[str] = None
    WEB_SEARCH_TRUST_ENV: Optional[bool] = None
    WEB_SEARCH_RESULT_COUNT: Optional[int] = None
    WEB_SEARCH_CONCURRENT_REQUESTS: Optional[int] = None
    WEB_SEARCH_DOMAIN_FILTER_LIST: Optional[List[str]] = []
    WEB_FETCH_MAX_CONTENT_LENGTH: Optional[int] = None
    WEB_LOADER_CONCURRENT_REQUESTS: Optional[int] = None
    BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL: Optional[bool] = None
    BYPASS_WEB_SEARCH_WEB_LOADER: Optional[bool] = None

    # Search provider endpoints and credentials
    OLLAMA_CLOUD_WEB_SEARCH_API_KEY: Optional[str] = None
    SEARXNG_QUERY_URL: Optional[str] = None
    SEARXNG_LANGUAGE: Optional[str] = None
    YACY_QUERY_URL: Optional[str] = None
    YACY_USERNAME: Optional[str] = None
    YACY_PASSWORD: Optional[str] = None
    GOOGLE_PSE_API_KEY: Optional[str] = None
    GOOGLE_PSE_ENGINE_ID: Optional[str] = None
    BRAVE_SEARCH_API_KEY: Optional[str] = None
    KAGI_SEARCH_API_KEY: Optional[str] = None
    MOJEEK_SEARCH_API_KEY: Optional[str] = None
    BOCHA_SEARCH_API_KEY: Optional[str] = None
    SERPSTACK_API_KEY: Optional[str] = None
    SERPSTACK_HTTPS: Optional[bool] = None
    SERPER_API_KEY: Optional[str] = None
    SERPLY_API_KEY: Optional[str] = None
    DDGS_BACKEND: Optional[str] = None
    TAVILY_API_KEY: Optional[str] = None
    SEARCHAPI_API_KEY: Optional[str] = None
    SEARCHAPI_ENGINE: Optional[str] = None
    SERPAPI_API_KEY: Optional[str] = None
    SERPAPI_ENGINE: Optional[str] = None
    JINA_API_KEY: Optional[str] = None
    JINA_API_BASE_URL: Optional[str] = None
    BING_SEARCH_V7_ENDPOINT: Optional[str] = None
    BING_SEARCH_V7_SUBSCRIPTION_KEY: Optional[str] = None
    EXA_API_KEY: Optional[str] = None
    PERPLEXITY_API_KEY: Optional[str] = None
    PERPLEXITY_MODEL: Optional[str] = None
    PERPLEXITY_SEARCH_CONTEXT_USAGE: Optional[str] = None
    PERPLEXITY_SEARCH_API_URL: Optional[str] = None
    SOUGOU_API_SID: Optional[str] = None
    SOUGOU_API_SK: Optional[str] = None

    # Web loader settings (note: the two *_TIMEOUT fields below are typed as str,
    # while PLAYWRIGHT_TIMEOUT is an int)
    WEB_LOADER_ENGINE: Optional[str] = None
    WEB_LOADER_TIMEOUT: Optional[str] = None
    ENABLE_WEB_LOADER_SSL_VERIFICATION: Optional[bool] = None
    PLAYWRIGHT_WS_URL: Optional[str] = None
    PLAYWRIGHT_TIMEOUT: Optional[int] = None
    FIRECRAWL_API_KEY: Optional[str] = None
    FIRECRAWL_API_BASE_URL: Optional[str] = None
    FIRECRAWL_TIMEOUT: Optional[str] = None
    TAVILY_EXTRACT_DEPTH: Optional[str] = None

    # External search / loader services
    EXTERNAL_WEB_SEARCH_URL: Optional[str] = None
    EXTERNAL_WEB_SEARCH_API_KEY: Optional[str] = None
    EXTERNAL_WEB_LOADER_URL: Optional[str] = None
    EXTERNAL_WEB_LOADER_API_KEY: Optional[str] = None

    # YouTube loader
    YOUTUBE_LOADER_LANGUAGE: Optional[List[str]] = None
    YOUTUBE_LOADER_PROXY_URL: Optional[str] = None
    YOUTUBE_LOADER_TRANSLATION: Optional[str] = None

    # Yandex and You.com
    YANDEX_WEB_SEARCH_URL: Optional[str] = None
    YANDEX_WEB_SEARCH_API_KEY: Optional[str] = None
    YANDEX_WEB_SEARCH_CONFIG: Optional[str] = None
    YOUCOM_API_KEY: Optional[str] = None
|
|
644
|
+
|
|
645
|
+
|
|
646
|
+
class ConfigForm(BaseModel):
    # Request body for POST /config/update. Every field is optional and defaults
    # to None; the update route keeps the stored value for any field left as None.

    # RAG settings
    RAG_TEMPLATE: Optional[str] = None
    TOP_K: Optional[int] = None
    BYPASS_EMBEDDING_AND_RETRIEVAL: Optional[bool] = None
    RAG_FULL_CONTEXT: Optional[bool] = None

    # Hybrid search settings
    ENABLE_RAG_HYBRID_SEARCH: Optional[bool] = None
    ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS: Optional[bool] = None
    TOP_K_RERANKER: Optional[int] = None
    RELEVANCE_THRESHOLD: Optional[float] = None
    HYBRID_BM25_WEIGHT: Optional[float] = None

    # Content extraction settings
    CONTENT_EXTRACTION_ENGINE: Optional[str] = None
    PDF_EXTRACT_IMAGES: Optional[bool] = None
    PDF_LOADER_MODE: Optional[str] = None

    # Datalab Marker
    DATALAB_MARKER_API_KEY: Optional[str] = None
    DATALAB_MARKER_API_BASE_URL: Optional[str] = None
    DATALAB_MARKER_ADDITIONAL_CONFIG: Optional[str] = None
    DATALAB_MARKER_SKIP_CACHE: Optional[bool] = None
    DATALAB_MARKER_FORCE_OCR: Optional[bool] = None
    DATALAB_MARKER_PAGINATE: Optional[bool] = None
    DATALAB_MARKER_STRIP_EXISTING_OCR: Optional[bool] = None
    DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION: Optional[bool] = None
    DATALAB_MARKER_FORMAT_LINES: Optional[bool] = None
    DATALAB_MARKER_USE_LLM: Optional[bool] = None
    DATALAB_MARKER_OUTPUT_FORMAT: Optional[str] = None

    # External document loader
    EXTERNAL_DOCUMENT_LOADER_URL: Optional[str] = None
    EXTERNAL_DOCUMENT_LOADER_API_KEY: Optional[str] = None

    # Tika, Docling, Document Intelligence and Mistral OCR
    TIKA_SERVER_URL: Optional[str] = None
    DOCLING_SERVER_URL: Optional[str] = None
    DOCLING_API_KEY: Optional[str] = None
    DOCLING_PARAMS: Optional[dict] = None
    DOCUMENT_INTELLIGENCE_ENDPOINT: Optional[str] = None
    DOCUMENT_INTELLIGENCE_KEY: Optional[str] = None
    DOCUMENT_INTELLIGENCE_MODEL: Optional[str] = None
    MISTRAL_OCR_API_BASE_URL: Optional[str] = None
    MISTRAL_OCR_API_KEY: Optional[str] = None

    # MinerU settings
    MINERU_API_MODE: Optional[str] = None
    MINERU_API_URL: Optional[str] = None
    MINERU_API_KEY: Optional[str] = None
    MINERU_API_TIMEOUT: Optional[str] = None
    MINERU_PARAMS: Optional[dict] = None

    # Reranking settings
    RAG_RERANKING_MODEL: Optional[str] = None
    RAG_RERANKING_ENGINE: Optional[str] = None
    RAG_RERANKING_BATCH_SIZE: Optional[int] = None
    RAG_EXTERNAL_RERANKER_URL: Optional[str] = None
    RAG_EXTERNAL_RERANKER_API_KEY: Optional[str] = None
    RAG_EXTERNAL_RERANKER_TIMEOUT: Optional[str] = None

    # Chunking settings
    TEXT_SPLITTER: Optional[str] = None
    ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER: Optional[bool] = None
    CHUNK_SIZE: Optional[int] = None
    CHUNK_MIN_SIZE_TARGET: Optional[int] = None
    CHUNK_OVERLAP: Optional[int] = None

    # File upload settings (the numeric limits accept either an int or a str)
    FILE_MAX_SIZE: Optional[Union[int, str]] = None
    FILE_MAX_COUNT: Optional[Union[int, str]] = None
    FILE_IMAGE_COMPRESSION_WIDTH: Optional[Union[int, str]] = None
    FILE_IMAGE_COMPRESSION_HEIGHT: Optional[Union[int, str]] = None
    ALLOWED_FILE_EXTENSIONS: Optional[List[str]] = None

    # Integration settings
    ENABLE_GOOGLE_DRIVE_INTEGRATION: Optional[bool] = None
    ENABLE_ONEDRIVE_INTEGRATION: Optional[bool] = None

    # Web search settings
    web: Optional[WebConfig] = None
|
|
725
|
+
|
|
726
|
+
|
|
727
|
+
@router.post('/config/update')
|
|
728
|
+
async def update_rag_config(request: Request, form_data: ConfigForm, user=Depends(get_admin_user)):
|
|
729
|
+
# RAG settings
|
|
730
|
+
request.app.state.config.RAG_TEMPLATE = (
|
|
731
|
+
form_data.RAG_TEMPLATE if form_data.RAG_TEMPLATE is not None else request.app.state.config.RAG_TEMPLATE
|
|
732
|
+
)
|
|
733
|
+
request.app.state.config.TOP_K = form_data.TOP_K if form_data.TOP_K is not None else request.app.state.config.TOP_K
|
|
734
|
+
request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL = (
|
|
735
|
+
form_data.BYPASS_EMBEDDING_AND_RETRIEVAL
|
|
736
|
+
if form_data.BYPASS_EMBEDDING_AND_RETRIEVAL is not None
|
|
737
|
+
else request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL
|
|
738
|
+
)
|
|
739
|
+
request.app.state.config.RAG_FULL_CONTEXT = (
|
|
740
|
+
form_data.RAG_FULL_CONTEXT
|
|
741
|
+
if form_data.RAG_FULL_CONTEXT is not None
|
|
742
|
+
else request.app.state.config.RAG_FULL_CONTEXT
|
|
743
|
+
)
|
|
744
|
+
|
|
745
|
+
# Hybrid search settings
|
|
746
|
+
request.app.state.config.ENABLE_RAG_HYBRID_SEARCH = (
|
|
747
|
+
form_data.ENABLE_RAG_HYBRID_SEARCH
|
|
748
|
+
if form_data.ENABLE_RAG_HYBRID_SEARCH is not None
|
|
749
|
+
else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH
|
|
750
|
+
)
|
|
751
|
+
request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS = (
|
|
752
|
+
form_data.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS
|
|
753
|
+
if form_data.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS is not None
|
|
754
|
+
else request.app.state.config.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS
|
|
755
|
+
)
|
|
756
|
+
|
|
757
|
+
request.app.state.config.TOP_K_RERANKER = (
|
|
758
|
+
form_data.TOP_K_RERANKER if form_data.TOP_K_RERANKER is not None else request.app.state.config.TOP_K_RERANKER
|
|
759
|
+
)
|
|
760
|
+
request.app.state.config.RELEVANCE_THRESHOLD = (
|
|
761
|
+
form_data.RELEVANCE_THRESHOLD
|
|
762
|
+
if form_data.RELEVANCE_THRESHOLD is not None
|
|
763
|
+
else request.app.state.config.RELEVANCE_THRESHOLD
|
|
764
|
+
)
|
|
765
|
+
request.app.state.config.HYBRID_BM25_WEIGHT = (
|
|
766
|
+
form_data.HYBRID_BM25_WEIGHT
|
|
767
|
+
if form_data.HYBRID_BM25_WEIGHT is not None
|
|
768
|
+
else request.app.state.config.HYBRID_BM25_WEIGHT
|
|
769
|
+
)
|
|
770
|
+
|
|
771
|
+
# Content extraction settings
|
|
772
|
+
request.app.state.config.CONTENT_EXTRACTION_ENGINE = (
|
|
773
|
+
form_data.CONTENT_EXTRACTION_ENGINE
|
|
774
|
+
if form_data.CONTENT_EXTRACTION_ENGINE is not None
|
|
775
|
+
else request.app.state.config.CONTENT_EXTRACTION_ENGINE
|
|
776
|
+
)
|
|
777
|
+
request.app.state.config.PDF_EXTRACT_IMAGES = (
|
|
778
|
+
form_data.PDF_EXTRACT_IMAGES
|
|
779
|
+
if form_data.PDF_EXTRACT_IMAGES is not None
|
|
780
|
+
else request.app.state.config.PDF_EXTRACT_IMAGES
|
|
781
|
+
)
|
|
782
|
+
request.app.state.config.PDF_LOADER_MODE = (
|
|
783
|
+
form_data.PDF_LOADER_MODE if form_data.PDF_LOADER_MODE is not None else request.app.state.config.PDF_LOADER_MODE
|
|
784
|
+
)
|
|
785
|
+
request.app.state.config.DATALAB_MARKER_API_KEY = (
|
|
786
|
+
form_data.DATALAB_MARKER_API_KEY
|
|
787
|
+
if form_data.DATALAB_MARKER_API_KEY is not None
|
|
788
|
+
else request.app.state.config.DATALAB_MARKER_API_KEY
|
|
789
|
+
)
|
|
790
|
+
request.app.state.config.DATALAB_MARKER_API_BASE_URL = (
|
|
791
|
+
form_data.DATALAB_MARKER_API_BASE_URL
|
|
792
|
+
if form_data.DATALAB_MARKER_API_BASE_URL is not None
|
|
793
|
+
else request.app.state.config.DATALAB_MARKER_API_BASE_URL
|
|
794
|
+
)
|
|
795
|
+
request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG = (
|
|
796
|
+
form_data.DATALAB_MARKER_ADDITIONAL_CONFIG
|
|
797
|
+
if form_data.DATALAB_MARKER_ADDITIONAL_CONFIG is not None
|
|
798
|
+
else request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG
|
|
799
|
+
)
|
|
800
|
+
request.app.state.config.DATALAB_MARKER_SKIP_CACHE = (
|
|
801
|
+
form_data.DATALAB_MARKER_SKIP_CACHE
|
|
802
|
+
if form_data.DATALAB_MARKER_SKIP_CACHE is not None
|
|
803
|
+
else request.app.state.config.DATALAB_MARKER_SKIP_CACHE
|
|
804
|
+
)
|
|
805
|
+
request.app.state.config.DATALAB_MARKER_FORCE_OCR = (
|
|
806
|
+
form_data.DATALAB_MARKER_FORCE_OCR
|
|
807
|
+
if form_data.DATALAB_MARKER_FORCE_OCR is not None
|
|
808
|
+
else request.app.state.config.DATALAB_MARKER_FORCE_OCR
|
|
809
|
+
)
|
|
810
|
+
request.app.state.config.DATALAB_MARKER_PAGINATE = (
|
|
811
|
+
form_data.DATALAB_MARKER_PAGINATE
|
|
812
|
+
if form_data.DATALAB_MARKER_PAGINATE is not None
|
|
813
|
+
else request.app.state.config.DATALAB_MARKER_PAGINATE
|
|
814
|
+
)
|
|
815
|
+
request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR = (
|
|
816
|
+
form_data.DATALAB_MARKER_STRIP_EXISTING_OCR
|
|
817
|
+
if form_data.DATALAB_MARKER_STRIP_EXISTING_OCR is not None
|
|
818
|
+
else request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR
|
|
819
|
+
)
|
|
820
|
+
request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = (
|
|
821
|
+
form_data.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION
|
|
822
|
+
if form_data.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION is not None
|
|
823
|
+
else request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION
|
|
824
|
+
)
|
|
825
|
+
request.app.state.config.DATALAB_MARKER_FORMAT_LINES = (
|
|
826
|
+
form_data.DATALAB_MARKER_FORMAT_LINES
|
|
827
|
+
if form_data.DATALAB_MARKER_FORMAT_LINES is not None
|
|
828
|
+
else request.app.state.config.DATALAB_MARKER_FORMAT_LINES
|
|
829
|
+
)
|
|
830
|
+
request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = (
|
|
831
|
+
form_data.DATALAB_MARKER_OUTPUT_FORMAT
|
|
832
|
+
if form_data.DATALAB_MARKER_OUTPUT_FORMAT is not None
|
|
833
|
+
else request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT
|
|
834
|
+
)
|
|
835
|
+
request.app.state.config.DATALAB_MARKER_USE_LLM = (
|
|
836
|
+
form_data.DATALAB_MARKER_USE_LLM
|
|
837
|
+
if form_data.DATALAB_MARKER_USE_LLM is not None
|
|
838
|
+
else request.app.state.config.DATALAB_MARKER_USE_LLM
|
|
839
|
+
)
|
|
840
|
+
request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL = (
|
|
841
|
+
form_data.EXTERNAL_DOCUMENT_LOADER_URL
|
|
842
|
+
if form_data.EXTERNAL_DOCUMENT_LOADER_URL is not None
|
|
843
|
+
else request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL
|
|
844
|
+
)
|
|
845
|
+
request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY = (
|
|
846
|
+
form_data.EXTERNAL_DOCUMENT_LOADER_API_KEY
|
|
847
|
+
if form_data.EXTERNAL_DOCUMENT_LOADER_API_KEY is not None
|
|
848
|
+
else request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY
|
|
849
|
+
)
|
|
850
|
+
request.app.state.config.TIKA_SERVER_URL = (
|
|
851
|
+
form_data.TIKA_SERVER_URL if form_data.TIKA_SERVER_URL is not None else request.app.state.config.TIKA_SERVER_URL
|
|
852
|
+
)
|
|
853
|
+
request.app.state.config.DOCLING_SERVER_URL = (
|
|
854
|
+
form_data.DOCLING_SERVER_URL
|
|
855
|
+
if form_data.DOCLING_SERVER_URL is not None
|
|
856
|
+
else request.app.state.config.DOCLING_SERVER_URL
|
|
857
|
+
)
|
|
858
|
+
request.app.state.config.DOCLING_API_KEY = (
|
|
859
|
+
form_data.DOCLING_API_KEY if form_data.DOCLING_API_KEY is not None else request.app.state.config.DOCLING_API_KEY
|
|
860
|
+
)
|
|
861
|
+
request.app.state.config.DOCLING_PARAMS = (
|
|
862
|
+
form_data.DOCLING_PARAMS if form_data.DOCLING_PARAMS is not None else request.app.state.config.DOCLING_PARAMS
|
|
863
|
+
)
|
|
864
|
+
request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
|
|
865
|
+
form_data.DOCUMENT_INTELLIGENCE_ENDPOINT
|
|
866
|
+
if form_data.DOCUMENT_INTELLIGENCE_ENDPOINT is not None
|
|
867
|
+
else request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT
|
|
868
|
+
)
|
|
869
|
+
request.app.state.config.DOCUMENT_INTELLIGENCE_KEY = (
|
|
870
|
+
form_data.DOCUMENT_INTELLIGENCE_KEY
|
|
871
|
+
if form_data.DOCUMENT_INTELLIGENCE_KEY is not None
|
|
872
|
+
else request.app.state.config.DOCUMENT_INTELLIGENCE_KEY
|
|
873
|
+
)
|
|
874
|
+
request.app.state.config.DOCUMENT_INTELLIGENCE_MODEL = (
|
|
875
|
+
form_data.DOCUMENT_INTELLIGENCE_MODEL
|
|
876
|
+
if form_data.DOCUMENT_INTELLIGENCE_MODEL is not None
|
|
877
|
+
else request.app.state.config.DOCUMENT_INTELLIGENCE_MODEL
|
|
878
|
+
)
|
|
879
|
+
|
|
880
|
+
request.app.state.config.MISTRAL_OCR_API_BASE_URL = (
|
|
881
|
+
form_data.MISTRAL_OCR_API_BASE_URL
|
|
882
|
+
if form_data.MISTRAL_OCR_API_BASE_URL is not None
|
|
883
|
+
else request.app.state.config.MISTRAL_OCR_API_BASE_URL
|
|
884
|
+
)
|
|
885
|
+
request.app.state.config.MISTRAL_OCR_API_KEY = (
|
|
886
|
+
form_data.MISTRAL_OCR_API_KEY
|
|
887
|
+
if form_data.MISTRAL_OCR_API_KEY is not None
|
|
888
|
+
else request.app.state.config.MISTRAL_OCR_API_KEY
|
|
889
|
+
)
|
|
890
|
+
|
|
891
|
+
# MinerU settings
|
|
892
|
+
request.app.state.config.MINERU_API_MODE = (
|
|
893
|
+
form_data.MINERU_API_MODE if form_data.MINERU_API_MODE is not None else request.app.state.config.MINERU_API_MODE
|
|
894
|
+
)
|
|
895
|
+
request.app.state.config.MINERU_API_URL = (
|
|
896
|
+
form_data.MINERU_API_URL if form_data.MINERU_API_URL is not None else request.app.state.config.MINERU_API_URL
|
|
897
|
+
)
|
|
898
|
+
request.app.state.config.MINERU_API_KEY = (
|
|
899
|
+
form_data.MINERU_API_KEY if form_data.MINERU_API_KEY is not None else request.app.state.config.MINERU_API_KEY
|
|
900
|
+
)
|
|
901
|
+
request.app.state.config.MINERU_API_TIMEOUT = (
|
|
902
|
+
form_data.MINERU_API_TIMEOUT
|
|
903
|
+
if form_data.MINERU_API_TIMEOUT is not None
|
|
904
|
+
else request.app.state.config.MINERU_API_TIMEOUT
|
|
905
|
+
)
|
|
906
|
+
request.app.state.config.MINERU_PARAMS = (
|
|
907
|
+
form_data.MINERU_PARAMS if form_data.MINERU_PARAMS is not None else request.app.state.config.MINERU_PARAMS
|
|
908
|
+
)
|
|
909
|
+
|
|
910
|
+
# Reranking settings
|
|
911
|
+
if request.app.state.config.RAG_RERANKING_ENGINE == '':
|
|
912
|
+
# Unloading the internal reranker and clear VRAM memory
|
|
913
|
+
request.app.state.rf = None
|
|
914
|
+
request.app.state.RERANKING_FUNCTION = None
|
|
915
|
+
import gc
|
|
916
|
+
|
|
917
|
+
gc.collect()
|
|
918
|
+
if DEVICE_TYPE == 'cuda':
|
|
919
|
+
import torch
|
|
920
|
+
|
|
921
|
+
if torch.cuda.is_available():
|
|
922
|
+
torch.cuda.empty_cache()
|
|
923
|
+
request.app.state.config.RAG_RERANKING_ENGINE = (
|
|
924
|
+
form_data.RAG_RERANKING_ENGINE
|
|
925
|
+
if form_data.RAG_RERANKING_ENGINE is not None
|
|
926
|
+
else request.app.state.config.RAG_RERANKING_ENGINE
|
|
927
|
+
)
|
|
928
|
+
|
|
929
|
+
request.app.state.config.RAG_EXTERNAL_RERANKER_URL = (
|
|
930
|
+
form_data.RAG_EXTERNAL_RERANKER_URL
|
|
931
|
+
if form_data.RAG_EXTERNAL_RERANKER_URL is not None
|
|
932
|
+
else request.app.state.config.RAG_EXTERNAL_RERANKER_URL
|
|
933
|
+
)
|
|
934
|
+
|
|
935
|
+
request.app.state.config.RAG_EXTERNAL_RERANKER_API_KEY = (
|
|
936
|
+
form_data.RAG_EXTERNAL_RERANKER_API_KEY
|
|
937
|
+
if form_data.RAG_EXTERNAL_RERANKER_API_KEY is not None
|
|
938
|
+
else request.app.state.config.RAG_EXTERNAL_RERANKER_API_KEY
|
|
939
|
+
)
|
|
940
|
+
|
|
941
|
+
request.app.state.config.RAG_EXTERNAL_RERANKER_TIMEOUT = (
|
|
942
|
+
form_data.RAG_EXTERNAL_RERANKER_TIMEOUT
|
|
943
|
+
if form_data.RAG_EXTERNAL_RERANKER_TIMEOUT is not None
|
|
944
|
+
else request.app.state.config.RAG_EXTERNAL_RERANKER_TIMEOUT
|
|
945
|
+
)
|
|
946
|
+
|
|
947
|
+
request.app.state.config.RAG_RERANKING_BATCH_SIZE = (
|
|
948
|
+
form_data.RAG_RERANKING_BATCH_SIZE
|
|
949
|
+
if form_data.RAG_RERANKING_BATCH_SIZE is not None
|
|
950
|
+
else request.app.state.config.RAG_RERANKING_BATCH_SIZE
|
|
951
|
+
)
|
|
952
|
+
|
|
953
|
+
log.info(
|
|
954
|
+
f'Updating reranking model: {request.app.state.config.RAG_RERANKING_MODEL} to {form_data.RAG_RERANKING_MODEL}'
|
|
955
|
+
)
|
|
956
|
+
try:
|
|
957
|
+
request.app.state.config.RAG_RERANKING_MODEL = (
|
|
958
|
+
form_data.RAG_RERANKING_MODEL
|
|
959
|
+
if form_data.RAG_RERANKING_MODEL is not None
|
|
960
|
+
else request.app.state.config.RAG_RERANKING_MODEL
|
|
961
|
+
)
|
|
962
|
+
|
|
963
|
+
try:
|
|
964
|
+
if (
|
|
965
|
+
request.app.state.config.ENABLE_RAG_HYBRID_SEARCH
|
|
966
|
+
and not request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL
|
|
967
|
+
):
|
|
968
|
+
request.app.state.rf = get_rf(
|
|
969
|
+
request.app.state.config.RAG_RERANKING_ENGINE,
|
|
970
|
+
request.app.state.config.RAG_RERANKING_MODEL,
|
|
971
|
+
request.app.state.config.RAG_EXTERNAL_RERANKER_URL,
|
|
972
|
+
request.app.state.config.RAG_EXTERNAL_RERANKER_API_KEY,
|
|
973
|
+
request.app.state.config.RAG_EXTERNAL_RERANKER_TIMEOUT,
|
|
974
|
+
)
|
|
975
|
+
|
|
976
|
+
request.app.state.RERANKING_FUNCTION = get_reranking_function(
|
|
977
|
+
request.app.state.config.RAG_RERANKING_ENGINE,
|
|
978
|
+
request.app.state.config.RAG_RERANKING_MODEL,
|
|
979
|
+
request.app.state.rf,
|
|
980
|
+
reranking_batch_size=request.app.state.config.RAG_RERANKING_BATCH_SIZE,
|
|
981
|
+
)
|
|
982
|
+
except Exception as e:
|
|
983
|
+
log.error(f'Error loading reranking model: {e}')
|
|
984
|
+
request.app.state.config.ENABLE_RAG_HYBRID_SEARCH = False
|
|
985
|
+
except Exception as e:
|
|
986
|
+
log.exception(f'Problem updating reranking model: {e}')
|
|
987
|
+
raise HTTPException(
|
|
988
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
989
|
+
detail=ERROR_MESSAGES.DEFAULT(e),
|
|
990
|
+
)
|
|
991
|
+
|
|
992
|
+
# Chunking settings
|
|
993
|
+
request.app.state.config.TEXT_SPLITTER = (
|
|
994
|
+
form_data.TEXT_SPLITTER if form_data.TEXT_SPLITTER is not None else request.app.state.config.TEXT_SPLITTER
|
|
995
|
+
)
|
|
996
|
+
request.app.state.config.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER = (
|
|
997
|
+
form_data.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER
|
|
998
|
+
if form_data.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER is not None
|
|
999
|
+
else request.app.state.config.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER
|
|
1000
|
+
)
|
|
1001
|
+
request.app.state.config.CHUNK_SIZE = (
|
|
1002
|
+
form_data.CHUNK_SIZE if form_data.CHUNK_SIZE is not None else request.app.state.config.CHUNK_SIZE
|
|
1003
|
+
)
|
|
1004
|
+
request.app.state.config.CHUNK_MIN_SIZE_TARGET = (
|
|
1005
|
+
form_data.CHUNK_MIN_SIZE_TARGET
|
|
1006
|
+
if form_data.CHUNK_MIN_SIZE_TARGET is not None
|
|
1007
|
+
else request.app.state.config.CHUNK_MIN_SIZE_TARGET
|
|
1008
|
+
)
|
|
1009
|
+
request.app.state.config.CHUNK_OVERLAP = (
|
|
1010
|
+
form_data.CHUNK_OVERLAP if form_data.CHUNK_OVERLAP is not None else request.app.state.config.CHUNK_OVERLAP
|
|
1011
|
+
)
|
|
1012
|
+
|
|
1013
|
+
# File upload settings
|
|
1014
|
+
# Empty string means "clear to None" (unlimited/no compression),
|
|
1015
|
+
# None means "don't change", int means "set to this value"
|
|
1016
|
+
if form_data.FILE_MAX_SIZE is not None:
|
|
1017
|
+
request.app.state.config.FILE_MAX_SIZE = None if form_data.FILE_MAX_SIZE == '' else form_data.FILE_MAX_SIZE
|
|
1018
|
+
if form_data.FILE_MAX_COUNT is not None:
|
|
1019
|
+
request.app.state.config.FILE_MAX_COUNT = None if form_data.FILE_MAX_COUNT == '' else form_data.FILE_MAX_COUNT
|
|
1020
|
+
if form_data.FILE_IMAGE_COMPRESSION_WIDTH is not None:
|
|
1021
|
+
request.app.state.config.FILE_IMAGE_COMPRESSION_WIDTH = (
|
|
1022
|
+
None if form_data.FILE_IMAGE_COMPRESSION_WIDTH == '' else form_data.FILE_IMAGE_COMPRESSION_WIDTH
|
|
1023
|
+
)
|
|
1024
|
+
if form_data.FILE_IMAGE_COMPRESSION_HEIGHT is not None:
|
|
1025
|
+
request.app.state.config.FILE_IMAGE_COMPRESSION_HEIGHT = (
|
|
1026
|
+
None if form_data.FILE_IMAGE_COMPRESSION_HEIGHT == '' else form_data.FILE_IMAGE_COMPRESSION_HEIGHT
|
|
1027
|
+
)
|
|
1028
|
+
|
|
1029
|
+
request.app.state.config.ALLOWED_FILE_EXTENSIONS = (
|
|
1030
|
+
form_data.ALLOWED_FILE_EXTENSIONS
|
|
1031
|
+
if form_data.ALLOWED_FILE_EXTENSIONS is not None
|
|
1032
|
+
else request.app.state.config.ALLOWED_FILE_EXTENSIONS
|
|
1033
|
+
)
|
|
1034
|
+
|
|
1035
|
+
# Integration settings
|
|
1036
|
+
request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION = (
|
|
1037
|
+
form_data.ENABLE_GOOGLE_DRIVE_INTEGRATION
|
|
1038
|
+
if form_data.ENABLE_GOOGLE_DRIVE_INTEGRATION is not None
|
|
1039
|
+
else request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION
|
|
1040
|
+
)
|
|
1041
|
+
request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION = (
|
|
1042
|
+
form_data.ENABLE_ONEDRIVE_INTEGRATION
|
|
1043
|
+
if form_data.ENABLE_ONEDRIVE_INTEGRATION is not None
|
|
1044
|
+
else request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION
|
|
1045
|
+
)
|
|
1046
|
+
|
|
1047
|
+
if form_data.web is not None:
|
|
1048
|
+
# Web search settings
|
|
1049
|
+
request.app.state.config.ENABLE_WEB_SEARCH = form_data.web.ENABLE_WEB_SEARCH
|
|
1050
|
+
request.app.state.config.WEB_SEARCH_ENGINE = form_data.web.WEB_SEARCH_ENGINE
|
|
1051
|
+
request.app.state.config.WEB_SEARCH_TRUST_ENV = form_data.web.WEB_SEARCH_TRUST_ENV
|
|
1052
|
+
request.app.state.config.WEB_SEARCH_RESULT_COUNT = form_data.web.WEB_SEARCH_RESULT_COUNT
|
|
1053
|
+
request.app.state.config.WEB_SEARCH_CONCURRENT_REQUESTS = form_data.web.WEB_SEARCH_CONCURRENT_REQUESTS
|
|
1054
|
+
request.app.state.config.WEB_FETCH_MAX_CONTENT_LENGTH = form_data.web.WEB_FETCH_MAX_CONTENT_LENGTH
|
|
1055
|
+
request.app.state.config.WEB_LOADER_CONCURRENT_REQUESTS = form_data.web.WEB_LOADER_CONCURRENT_REQUESTS
|
|
1056
|
+
request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST = form_data.web.WEB_SEARCH_DOMAIN_FILTER_LIST
|
|
1057
|
+
request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL = (
|
|
1058
|
+
form_data.web.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL
|
|
1059
|
+
)
|
|
1060
|
+
request.app.state.config.BYPASS_WEB_SEARCH_WEB_LOADER = form_data.web.BYPASS_WEB_SEARCH_WEB_LOADER
|
|
1061
|
+
request.app.state.config.OLLAMA_CLOUD_WEB_SEARCH_API_KEY = form_data.web.OLLAMA_CLOUD_WEB_SEARCH_API_KEY
|
|
1062
|
+
request.app.state.config.SEARXNG_QUERY_URL = form_data.web.SEARXNG_QUERY_URL
|
|
1063
|
+
request.app.state.config.SEARXNG_LANGUAGE = form_data.web.SEARXNG_LANGUAGE
|
|
1064
|
+
request.app.state.config.YACY_QUERY_URL = form_data.web.YACY_QUERY_URL
|
|
1065
|
+
request.app.state.config.YACY_USERNAME = form_data.web.YACY_USERNAME
|
|
1066
|
+
request.app.state.config.YACY_PASSWORD = form_data.web.YACY_PASSWORD
|
|
1067
|
+
request.app.state.config.GOOGLE_PSE_API_KEY = form_data.web.GOOGLE_PSE_API_KEY
|
|
1068
|
+
request.app.state.config.GOOGLE_PSE_ENGINE_ID = form_data.web.GOOGLE_PSE_ENGINE_ID
|
|
1069
|
+
request.app.state.config.BRAVE_SEARCH_API_KEY = form_data.web.BRAVE_SEARCH_API_KEY
|
|
1070
|
+
request.app.state.config.KAGI_SEARCH_API_KEY = form_data.web.KAGI_SEARCH_API_KEY
|
|
1071
|
+
request.app.state.config.MOJEEK_SEARCH_API_KEY = form_data.web.MOJEEK_SEARCH_API_KEY
|
|
1072
|
+
request.app.state.config.BOCHA_SEARCH_API_KEY = form_data.web.BOCHA_SEARCH_API_KEY
|
|
1073
|
+
request.app.state.config.SERPSTACK_API_KEY = form_data.web.SERPSTACK_API_KEY
|
|
1074
|
+
request.app.state.config.SERPSTACK_HTTPS = form_data.web.SERPSTACK_HTTPS
|
|
1075
|
+
request.app.state.config.SERPER_API_KEY = form_data.web.SERPER_API_KEY
|
|
1076
|
+
request.app.state.config.SERPLY_API_KEY = form_data.web.SERPLY_API_KEY
|
|
1077
|
+
request.app.state.config.DDGS_BACKEND = form_data.web.DDGS_BACKEND
|
|
1078
|
+
request.app.state.config.TAVILY_API_KEY = form_data.web.TAVILY_API_KEY
|
|
1079
|
+
request.app.state.config.SEARCHAPI_API_KEY = form_data.web.SEARCHAPI_API_KEY
|
|
1080
|
+
request.app.state.config.SEARCHAPI_ENGINE = form_data.web.SEARCHAPI_ENGINE
|
|
1081
|
+
request.app.state.config.SERPAPI_API_KEY = form_data.web.SERPAPI_API_KEY
|
|
1082
|
+
request.app.state.config.SERPAPI_ENGINE = form_data.web.SERPAPI_ENGINE
|
|
1083
|
+
request.app.state.config.JINA_API_KEY = form_data.web.JINA_API_KEY
|
|
1084
|
+
request.app.state.config.JINA_API_BASE_URL = form_data.web.JINA_API_BASE_URL
|
|
1085
|
+
request.app.state.config.BING_SEARCH_V7_ENDPOINT = form_data.web.BING_SEARCH_V7_ENDPOINT
|
|
1086
|
+
request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY = form_data.web.BING_SEARCH_V7_SUBSCRIPTION_KEY
|
|
1087
|
+
request.app.state.config.EXA_API_KEY = form_data.web.EXA_API_KEY
|
|
1088
|
+
request.app.state.config.PERPLEXITY_API_KEY = form_data.web.PERPLEXITY_API_KEY
|
|
1089
|
+
request.app.state.config.PERPLEXITY_MODEL = form_data.web.PERPLEXITY_MODEL
|
|
1090
|
+
request.app.state.config.PERPLEXITY_SEARCH_CONTEXT_USAGE = form_data.web.PERPLEXITY_SEARCH_CONTEXT_USAGE
|
|
1091
|
+
request.app.state.config.PERPLEXITY_SEARCH_API_URL = form_data.web.PERPLEXITY_SEARCH_API_URL
|
|
1092
|
+
request.app.state.config.SOUGOU_API_SID = form_data.web.SOUGOU_API_SID
|
|
1093
|
+
request.app.state.config.SOUGOU_API_SK = form_data.web.SOUGOU_API_SK
|
|
1094
|
+
|
|
1095
|
+
# Web loader settings
|
|
1096
|
+
request.app.state.config.WEB_LOADER_ENGINE = form_data.web.WEB_LOADER_ENGINE
|
|
1097
|
+
request.app.state.config.WEB_LOADER_TIMEOUT = form_data.web.WEB_LOADER_TIMEOUT
|
|
1098
|
+
|
|
1099
|
+
request.app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = form_data.web.ENABLE_WEB_LOADER_SSL_VERIFICATION
|
|
1100
|
+
request.app.state.config.PLAYWRIGHT_WS_URL = form_data.web.PLAYWRIGHT_WS_URL
|
|
1101
|
+
request.app.state.config.PLAYWRIGHT_TIMEOUT = form_data.web.PLAYWRIGHT_TIMEOUT
|
|
1102
|
+
request.app.state.config.FIRECRAWL_API_KEY = form_data.web.FIRECRAWL_API_KEY
|
|
1103
|
+
request.app.state.config.FIRECRAWL_API_BASE_URL = form_data.web.FIRECRAWL_API_BASE_URL
|
|
1104
|
+
request.app.state.config.FIRECRAWL_TIMEOUT = form_data.web.FIRECRAWL_TIMEOUT
|
|
1105
|
+
request.app.state.config.EXTERNAL_WEB_SEARCH_URL = form_data.web.EXTERNAL_WEB_SEARCH_URL
|
|
1106
|
+
request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY = form_data.web.EXTERNAL_WEB_SEARCH_API_KEY
|
|
1107
|
+
request.app.state.config.EXTERNAL_WEB_LOADER_URL = form_data.web.EXTERNAL_WEB_LOADER_URL
|
|
1108
|
+
request.app.state.config.EXTERNAL_WEB_LOADER_API_KEY = form_data.web.EXTERNAL_WEB_LOADER_API_KEY
|
|
1109
|
+
request.app.state.config.TAVILY_EXTRACT_DEPTH = form_data.web.TAVILY_EXTRACT_DEPTH
|
|
1110
|
+
request.app.state.config.YOUTUBE_LOADER_LANGUAGE = form_data.web.YOUTUBE_LOADER_LANGUAGE
|
|
1111
|
+
request.app.state.config.YOUTUBE_LOADER_PROXY_URL = form_data.web.YOUTUBE_LOADER_PROXY_URL
|
|
1112
|
+
request.app.state.YOUTUBE_LOADER_TRANSLATION = form_data.web.YOUTUBE_LOADER_TRANSLATION
|
|
1113
|
+
request.app.state.config.YANDEX_WEB_SEARCH_URL = form_data.web.YANDEX_WEB_SEARCH_URL
|
|
1114
|
+
request.app.state.config.YANDEX_WEB_SEARCH_API_KEY = form_data.web.YANDEX_WEB_SEARCH_API_KEY
|
|
1115
|
+
request.app.state.config.YANDEX_WEB_SEARCH_CONFIG = form_data.web.YANDEX_WEB_SEARCH_CONFIG
|
|
1116
|
+
request.app.state.config.YOUCOM_API_KEY = form_data.web.YOUCOM_API_KEY
|
|
1117
|
+
|
|
1118
|
+
return {
|
|
1119
|
+
'status': True,
|
|
1120
|
+
# RAG settings
|
|
1121
|
+
'RAG_TEMPLATE': request.app.state.config.RAG_TEMPLATE,
|
|
1122
|
+
'TOP_K': request.app.state.config.TOP_K,
|
|
1123
|
+
'BYPASS_EMBEDDING_AND_RETRIEVAL': request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL,
|
|
1124
|
+
'RAG_FULL_CONTEXT': request.app.state.config.RAG_FULL_CONTEXT,
|
|
1125
|
+
# Hybrid search settings
|
|
1126
|
+
'ENABLE_RAG_HYBRID_SEARCH': request.app.state.config.ENABLE_RAG_HYBRID_SEARCH,
|
|
1127
|
+
'TOP_K_RERANKER': request.app.state.config.TOP_K_RERANKER,
|
|
1128
|
+
'RELEVANCE_THRESHOLD': request.app.state.config.RELEVANCE_THRESHOLD,
|
|
1129
|
+
'HYBRID_BM25_WEIGHT': request.app.state.config.HYBRID_BM25_WEIGHT,
|
|
1130
|
+
# Content extraction settings
|
|
1131
|
+
'CONTENT_EXTRACTION_ENGINE': request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
|
1132
|
+
'PDF_EXTRACT_IMAGES': request.app.state.config.PDF_EXTRACT_IMAGES,
|
|
1133
|
+
'PDF_LOADER_MODE': request.app.state.config.PDF_LOADER_MODE,
|
|
1134
|
+
'DATALAB_MARKER_API_KEY': request.app.state.config.DATALAB_MARKER_API_KEY,
|
|
1135
|
+
'DATALAB_MARKER_API_BASE_URL': request.app.state.config.DATALAB_MARKER_API_BASE_URL,
|
|
1136
|
+
'DATALAB_MARKER_ADDITIONAL_CONFIG': request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
|
|
1137
|
+
'DATALAB_MARKER_SKIP_CACHE': request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
|
|
1138
|
+
'DATALAB_MARKER_FORCE_OCR': request.app.state.config.DATALAB_MARKER_FORCE_OCR,
|
|
1139
|
+
'DATALAB_MARKER_PAGINATE': request.app.state.config.DATALAB_MARKER_PAGINATE,
|
|
1140
|
+
'DATALAB_MARKER_STRIP_EXISTING_OCR': request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR,
|
|
1141
|
+
'DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION': request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
|
|
1142
|
+
'DATALAB_MARKER_USE_LLM': request.app.state.config.DATALAB_MARKER_USE_LLM,
|
|
1143
|
+
'DATALAB_MARKER_OUTPUT_FORMAT': request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT,
|
|
1144
|
+
'EXTERNAL_DOCUMENT_LOADER_URL': request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,
|
|
1145
|
+
'EXTERNAL_DOCUMENT_LOADER_API_KEY': request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY,
|
|
1146
|
+
'TIKA_SERVER_URL': request.app.state.config.TIKA_SERVER_URL,
|
|
1147
|
+
'DOCLING_SERVER_URL': request.app.state.config.DOCLING_SERVER_URL,
|
|
1148
|
+
'DOCLING_API_KEY': request.app.state.config.DOCLING_API_KEY,
|
|
1149
|
+
'DOCLING_PARAMS': request.app.state.config.DOCLING_PARAMS,
|
|
1150
|
+
'DOCUMENT_INTELLIGENCE_ENDPOINT': request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
|
|
1151
|
+
'DOCUMENT_INTELLIGENCE_KEY': request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
|
|
1152
|
+
'DOCUMENT_INTELLIGENCE_MODEL': request.app.state.config.DOCUMENT_INTELLIGENCE_MODEL,
|
|
1153
|
+
'MISTRAL_OCR_API_BASE_URL': request.app.state.config.MISTRAL_OCR_API_BASE_URL,
|
|
1154
|
+
'MISTRAL_OCR_API_KEY': request.app.state.config.MISTRAL_OCR_API_KEY,
|
|
1155
|
+
# MinerU settings
|
|
1156
|
+
'MINERU_API_MODE': request.app.state.config.MINERU_API_MODE,
|
|
1157
|
+
'MINERU_API_URL': request.app.state.config.MINERU_API_URL,
|
|
1158
|
+
'MINERU_API_KEY': request.app.state.config.MINERU_API_KEY,
|
|
1159
|
+
'MINERU_API_TIMEOUT': request.app.state.config.MINERU_API_TIMEOUT,
|
|
1160
|
+
'MINERU_PARAMS': request.app.state.config.MINERU_PARAMS,
|
|
1161
|
+
# Reranking settings
|
|
1162
|
+
'RAG_RERANKING_MODEL': request.app.state.config.RAG_RERANKING_MODEL,
|
|
1163
|
+
'RAG_RERANKING_ENGINE': request.app.state.config.RAG_RERANKING_ENGINE,
|
|
1164
|
+
'RAG_EXTERNAL_RERANKER_URL': request.app.state.config.RAG_EXTERNAL_RERANKER_URL,
|
|
1165
|
+
'RAG_EXTERNAL_RERANKER_API_KEY': request.app.state.config.RAG_EXTERNAL_RERANKER_API_KEY,
|
|
1166
|
+
'RAG_EXTERNAL_RERANKER_TIMEOUT': request.app.state.config.RAG_EXTERNAL_RERANKER_TIMEOUT,
|
|
1167
|
+
# Chunking settings
|
|
1168
|
+
'TEXT_SPLITTER': request.app.state.config.TEXT_SPLITTER,
|
|
1169
|
+
'CHUNK_SIZE': request.app.state.config.CHUNK_SIZE,
|
|
1170
|
+
'CHUNK_MIN_SIZE_TARGET': request.app.state.config.CHUNK_MIN_SIZE_TARGET,
|
|
1171
|
+
'ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER': request.app.state.config.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER,
|
|
1172
|
+
'CHUNK_OVERLAP': request.app.state.config.CHUNK_OVERLAP,
|
|
1173
|
+
# File upload settings
|
|
1174
|
+
'FILE_MAX_SIZE': request.app.state.config.FILE_MAX_SIZE,
|
|
1175
|
+
'FILE_MAX_COUNT': request.app.state.config.FILE_MAX_COUNT,
|
|
1176
|
+
'FILE_IMAGE_COMPRESSION_WIDTH': request.app.state.config.FILE_IMAGE_COMPRESSION_WIDTH,
|
|
1177
|
+
'FILE_IMAGE_COMPRESSION_HEIGHT': request.app.state.config.FILE_IMAGE_COMPRESSION_HEIGHT,
|
|
1178
|
+
'ALLOWED_FILE_EXTENSIONS': request.app.state.config.ALLOWED_FILE_EXTENSIONS,
|
|
1179
|
+
# Integration settings
|
|
1180
|
+
'ENABLE_GOOGLE_DRIVE_INTEGRATION': request.app.state.config.ENABLE_GOOGLE_DRIVE_INTEGRATION,
|
|
1181
|
+
'ENABLE_ONEDRIVE_INTEGRATION': request.app.state.config.ENABLE_ONEDRIVE_INTEGRATION,
|
|
1182
|
+
# Web search settings
|
|
1183
|
+
'web': {
|
|
1184
|
+
'ENABLE_WEB_SEARCH': request.app.state.config.ENABLE_WEB_SEARCH,
|
|
1185
|
+
'WEB_SEARCH_ENGINE': request.app.state.config.WEB_SEARCH_ENGINE,
|
|
1186
|
+
'WEB_SEARCH_TRUST_ENV': request.app.state.config.WEB_SEARCH_TRUST_ENV,
|
|
1187
|
+
'WEB_SEARCH_RESULT_COUNT': request.app.state.config.WEB_SEARCH_RESULT_COUNT,
|
|
1188
|
+
'WEB_SEARCH_CONCURRENT_REQUESTS': request.app.state.config.WEB_SEARCH_CONCURRENT_REQUESTS,
|
|
1189
|
+
'WEB_FETCH_MAX_CONTENT_LENGTH': request.app.state.config.WEB_FETCH_MAX_CONTENT_LENGTH,
|
|
1190
|
+
'WEB_LOADER_CONCURRENT_REQUESTS': request.app.state.config.WEB_LOADER_CONCURRENT_REQUESTS,
|
|
1191
|
+
'WEB_SEARCH_DOMAIN_FILTER_LIST': request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST,
|
|
1192
|
+
'BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL': request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL,
|
|
1193
|
+
'BYPASS_WEB_SEARCH_WEB_LOADER': request.app.state.config.BYPASS_WEB_SEARCH_WEB_LOADER,
|
|
1194
|
+
'OLLAMA_CLOUD_WEB_SEARCH_API_KEY': request.app.state.config.OLLAMA_CLOUD_WEB_SEARCH_API_KEY,
|
|
1195
|
+
'SEARXNG_QUERY_URL': request.app.state.config.SEARXNG_QUERY_URL,
|
|
1196
|
+
'SEARXNG_LANGUAGE': request.app.state.config.SEARXNG_LANGUAGE,
|
|
1197
|
+
'YACY_QUERY_URL': request.app.state.config.YACY_QUERY_URL,
|
|
1198
|
+
'YACY_USERNAME': request.app.state.config.YACY_USERNAME,
|
|
1199
|
+
'YACY_PASSWORD': request.app.state.config.YACY_PASSWORD,
|
|
1200
|
+
'GOOGLE_PSE_API_KEY': request.app.state.config.GOOGLE_PSE_API_KEY,
|
|
1201
|
+
'GOOGLE_PSE_ENGINE_ID': request.app.state.config.GOOGLE_PSE_ENGINE_ID,
|
|
1202
|
+
'BRAVE_SEARCH_API_KEY': request.app.state.config.BRAVE_SEARCH_API_KEY,
|
|
1203
|
+
'KAGI_SEARCH_API_KEY': request.app.state.config.KAGI_SEARCH_API_KEY,
|
|
1204
|
+
'MOJEEK_SEARCH_API_KEY': request.app.state.config.MOJEEK_SEARCH_API_KEY,
|
|
1205
|
+
'BOCHA_SEARCH_API_KEY': request.app.state.config.BOCHA_SEARCH_API_KEY,
|
|
1206
|
+
'SERPSTACK_API_KEY': request.app.state.config.SERPSTACK_API_KEY,
|
|
1207
|
+
'SERPSTACK_HTTPS': request.app.state.config.SERPSTACK_HTTPS,
|
|
1208
|
+
'SERPER_API_KEY': request.app.state.config.SERPER_API_KEY,
|
|
1209
|
+
'SERPLY_API_KEY': request.app.state.config.SERPLY_API_KEY,
|
|
1210
|
+
'TAVILY_API_KEY': request.app.state.config.TAVILY_API_KEY,
|
|
1211
|
+
'SEARCHAPI_API_KEY': request.app.state.config.SEARCHAPI_API_KEY,
|
|
1212
|
+
'SEARCHAPI_ENGINE': request.app.state.config.SEARCHAPI_ENGINE,
|
|
1213
|
+
'SERPAPI_API_KEY': request.app.state.config.SERPAPI_API_KEY,
|
|
1214
|
+
'SERPAPI_ENGINE': request.app.state.config.SERPAPI_ENGINE,
|
|
1215
|
+
'JINA_API_KEY': request.app.state.config.JINA_API_KEY,
|
|
1216
|
+
'JINA_API_BASE_URL': request.app.state.config.JINA_API_BASE_URL,
|
|
1217
|
+
'BING_SEARCH_V7_ENDPOINT': request.app.state.config.BING_SEARCH_V7_ENDPOINT,
|
|
1218
|
+
'BING_SEARCH_V7_SUBSCRIPTION_KEY': request.app.state.config.BING_SEARCH_V7_SUBSCRIPTION_KEY,
|
|
1219
|
+
'EXA_API_KEY': request.app.state.config.EXA_API_KEY,
|
|
1220
|
+
'PERPLEXITY_API_KEY': request.app.state.config.PERPLEXITY_API_KEY,
|
|
1221
|
+
'PERPLEXITY_MODEL': request.app.state.config.PERPLEXITY_MODEL,
|
|
1222
|
+
'PERPLEXITY_SEARCH_CONTEXT_USAGE': request.app.state.config.PERPLEXITY_SEARCH_CONTEXT_USAGE,
|
|
1223
|
+
'PERPLEXITY_SEARCH_API_URL': request.app.state.config.PERPLEXITY_SEARCH_API_URL,
|
|
1224
|
+
'SOUGOU_API_SID': request.app.state.config.SOUGOU_API_SID,
|
|
1225
|
+
'SOUGOU_API_SK': request.app.state.config.SOUGOU_API_SK,
|
|
1226
|
+
'WEB_LOADER_ENGINE': request.app.state.config.WEB_LOADER_ENGINE,
|
|
1227
|
+
'WEB_LOADER_TIMEOUT': request.app.state.config.WEB_LOADER_TIMEOUT,
|
|
1228
|
+
'ENABLE_WEB_LOADER_SSL_VERIFICATION': request.app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION,
|
|
1229
|
+
'PLAYWRIGHT_WS_URL': request.app.state.config.PLAYWRIGHT_WS_URL,
|
|
1230
|
+
'PLAYWRIGHT_TIMEOUT': request.app.state.config.PLAYWRIGHT_TIMEOUT,
|
|
1231
|
+
'FIRECRAWL_API_KEY': request.app.state.config.FIRECRAWL_API_KEY,
|
|
1232
|
+
'FIRECRAWL_API_BASE_URL': request.app.state.config.FIRECRAWL_API_BASE_URL,
|
|
1233
|
+
'FIRECRAWL_TIMEOUT': request.app.state.config.FIRECRAWL_TIMEOUT,
|
|
1234
|
+
'TAVILY_EXTRACT_DEPTH': request.app.state.config.TAVILY_EXTRACT_DEPTH,
|
|
1235
|
+
'EXTERNAL_WEB_SEARCH_URL': request.app.state.config.EXTERNAL_WEB_SEARCH_URL,
|
|
1236
|
+
'EXTERNAL_WEB_SEARCH_API_KEY': request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY,
|
|
1237
|
+
'EXTERNAL_WEB_LOADER_URL': request.app.state.config.EXTERNAL_WEB_LOADER_URL,
|
|
1238
|
+
'EXTERNAL_WEB_LOADER_API_KEY': request.app.state.config.EXTERNAL_WEB_LOADER_API_KEY,
|
|
1239
|
+
'YOUTUBE_LOADER_LANGUAGE': request.app.state.config.YOUTUBE_LOADER_LANGUAGE,
|
|
1240
|
+
'YOUTUBE_LOADER_PROXY_URL': request.app.state.config.YOUTUBE_LOADER_PROXY_URL,
|
|
1241
|
+
'YOUTUBE_LOADER_TRANSLATION': request.app.state.YOUTUBE_LOADER_TRANSLATION,
|
|
1242
|
+
'YANDEX_WEB_SEARCH_URL': request.app.state.config.YANDEX_WEB_SEARCH_URL,
|
|
1243
|
+
'YANDEX_WEB_SEARCH_API_KEY': request.app.state.config.YANDEX_WEB_SEARCH_API_KEY,
|
|
1244
|
+
'YANDEX_WEB_SEARCH_CONFIG': request.app.state.config.YANDEX_WEB_SEARCH_CONFIG,
|
|
1245
|
+
'YOUCOM_API_KEY': request.app.state.config.YOUCOM_API_KEY,
|
|
1246
|
+
},
|
|
1247
|
+
}
|
|
1248
|
+
|
|
1249
|
+
|
|
1250
|
+
####################################
|
|
1251
|
+
#
|
|
1252
|
+
# Document process and retrieval
|
|
1253
|
+
#
|
|
1254
|
+
####################################
|
|
1255
|
+
|
|
1256
|
+
|
|
1257
|
+
def can_merge_chunks(a: Document, b: Document) -> bool:
    """
    Return whether two chunks may be combined into a single chunk.

    Chunks are mergeable only when their ``source`` metadata values are equal
    (two chunks that both lack ``source`` compare equal). When both chunks also
    carry a non-``None`` ``file_id``, those ids must match as well; a missing
    ``file_id`` on either side does not block the merge.
    """
    if a.metadata.get('source') != b.metadata.get('source'):
        return False

    a_file_id = a.metadata.get('file_id')
    b_file_id = b.metadata.get('file_id')

    # Only compare file ids when both sides actually have one.
    if a_file_id is not None and b_file_id is not None:
        return a_file_id == b_file_id

    return True
|
|
1268
|
+
|
|
1269
|
+
|
|
1270
|
+
def merge_docs_to_target_size(
    request: Request,
    chunks: list[Document],
) -> list[Document]:
    """
    Best-effort normalization of chunk sizes.

    Walks ``chunks`` in order and greedily appends each chunk to the one
    currently being built while that one is still smaller than
    ``CHUNK_MIN_SIZE_TARGET``, the combined text stays within ``CHUNK_SIZE``,
    and ``can_merge_chunks`` allows it. Merged pieces are joined with a blank
    line and carry a copy of the metadata of the first chunk of the group.

    Sizes are measured in characters, or in tokens of the configured
    ``TIKTOKEN_ENCODING_NAME`` when ``TEXT_SPLITTER`` is ``'token'``.

    Args:
        request: Request whose ``app.state.config`` supplies the settings.
        chunks: Chunks in document order.

    Returns:
        ``chunks`` itself, untouched, when no positive minimum target is
        configured; otherwise a new list of newly built ``Document`` objects.
    """
    config = request.app.state.config
    min_chunk_size_target = config.CHUNK_MIN_SIZE_TARGET
    max_chunk_size = config.CHUNK_SIZE

    # Merging is opt-in: an unset or non-positive target disables it.
    if not min_chunk_size_target or min_chunk_size_target <= 0:
        return chunks

    if config.TEXT_SPLITTER == 'token':
        encoding = tiktoken.get_encoding(str(config.TIKTOKEN_ENCODING_NAME))

        def measure_chunk_size(text: str) -> int:
            return len(encoding.encode(text))

    else:
        measure_chunk_size = len

    processed_chunks: list[Document] = []

    current_chunk: Document | None = None
    current_content: str = ''

    for next_chunk in chunks:
        if current_chunk is None:
            current_chunk = next_chunk
            current_content = next_chunk.page_content
            continue  # First chunk initialization

        proposed_content = f'{current_content}\n\n{next_chunk.page_content}'

        can_merge = (
            can_merge_chunks(current_chunk, next_chunk)
            and measure_chunk_size(current_content) < min_chunk_size_target
            and measure_chunk_size(proposed_content) <= max_chunk_size
        )

        if can_merge:
            current_content = proposed_content
        else:
            # Flush the group built so far and start a new one at next_chunk.
            processed_chunks.append(
                Document(
                    page_content=current_content,
                    metadata={**current_chunk.metadata},
                )
            )
            current_chunk = next_chunk
            current_content = next_chunk.page_content

    # Flush the trailing group (absent only when ``chunks`` was empty).
    if current_chunk is not None:
        processed_chunks.append(
            Document(
                page_content=current_content,
                metadata={**current_chunk.metadata},
            )
        )

    return processed_chunks
|
|
1332
|
+
|
|
1333
|
+
|
|
1334
|
+
def save_docs_to_vector_db(
    request: Request,
    docs,
    collection_name,
    metadata: Optional[dict] = None,
    overwrite: bool = False,
    split: bool = True,
    add: bool = False,
    user=None,
) -> bool:
    """
    Chunk, embed and store documents in a vector DB collection.

    Steps, in order:
      1. If ``metadata`` carries a ``hash``, reject the call when the
         collection already holds that hash under a different ``file_id``.
      2. When ``split`` is true, optionally split on markdown headers and
         then apply the configured character/token splitter.
      3. Embed the sanitized chunk texts on the application's main event loop
         and insert them, with merged metadata, into the collection.

    Args:
        request: Request giving access to ``app.state`` (config, the ``ef``
            embedding backend and ``main_loop``).
        docs: Documents to store; each exposes ``page_content`` and ``metadata``.
        collection_name: Target collection.
        metadata: Extra metadata written onto every stored chunk; its keys
            override same-named per-document keys.
        overwrite: Delete an existing collection before inserting.
        split: Run the chunking step.
        add: Insert into an already existing collection.
        user: Passed through to the embedding function.

    Returns:
        ``True`` once the items were inserted, and also when the collection
        already exists and neither ``overwrite`` nor ``add`` is set (nothing
        is written in that case).

    Raises:
        ValueError: Duplicate content, unknown ``TEXT_SPLITTER`` value, or no
            documents left to store.
        Exception: Anything raised while embedding or writing is logged and
            re-raised unchanged.
    """
    from concurrent.futures import TimeoutError as FutureTimeoutError

    config = request.app.state.config

    def _get_docs_info(docs: list[Document]) -> str:
        """Return a comma-separated list of names identifying ``docs`` (used for logging)."""
        docs_info = set()

        for doc in docs:
            doc_metadata = getattr(doc, 'metadata', None) or {}
            # First non-empty of name / title / source identifies the document.
            doc_name = doc_metadata.get('name') or doc_metadata.get('title') or doc_metadata.get('source')
            if doc_name:
                docs_info.add(str(doc_name))

        # Sorted so the log line is stable across runs.
        return ', '.join(sorted(docs_info))

    log.debug(f'save_docs_to_vector_db: document {_get_docs_info(docs)} {collection_name}')

    # Check if entries with the same hash (metadata.hash) already exist
    if metadata and 'hash' in metadata:
        result = VECTOR_DB_CLIENT.query(
            collection_name=collection_name,
            filter={'hash': metadata['hash']},
        )

        if result is not None and result.ids and len(result.ids) > 0:
            existing_doc_ids = result.ids[0]
            if existing_doc_ids:
                # Same file_id: this is a re-add/reindex of the same file - allow it.
                # Different file_id: this is duplicate content - block it.
                existing_file_id = None
                if result.metadatas and result.metadatas[0]:
                    existing_file_id = result.metadatas[0][0].get('file_id')

                if existing_file_id != metadata.get('file_id'):
                    log.info(f'Document with hash {metadata["hash"]} already exists')
                    raise ValueError(ERROR_MESSAGES.DUPLICATE_CONTENT)

    if split:
        if config.ENABLE_MARKDOWN_HEADER_TEXT_SPLITTER:
            log.info('Using markdown header text splitter')
            # Define headers to split on - covering most common markdown header levels
            markdown_splitter = MarkdownHeaderTextSplitter(
                headers_to_split_on=[
                    ('#', 'Header 1'),
                    ('##', 'Header 2'),
                    ('###', 'Header 3'),
                    ('####', 'Header 4'),
                    ('#####', 'Header 5'),
                    ('######', 'Header 6'),
                ],
                strip_headers=False,  # Keep headers in content for context
            )

            split_docs = []
            for doc in docs:
                # Each section keeps a copy of its parent document's metadata.
                split_docs.extend(
                    [
                        Document(
                            page_content=split_chunk.page_content,
                            metadata={**doc.metadata},
                        )
                        for split_chunk in markdown_splitter.split_text(doc.page_content)
                    ]
                )

            docs = split_docs
            # NOTE(review): small-section merging is applied to the markdown
            # split output only - confirm this nesting is the intended one.
            if config.CHUNK_MIN_SIZE_TARGET > 0:
                docs = merge_docs_to_target_size(request, docs)

        if config.TEXT_SPLITTER in ['', 'character']:
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=config.CHUNK_SIZE,
                chunk_overlap=config.CHUNK_OVERLAP,
                add_start_index=True,
            )
            docs = text_splitter.split_documents(docs)
        elif config.TEXT_SPLITTER == 'token':
            log.info(f'Using token text splitter: {config.TIKTOKEN_ENCODING_NAME}')

            # Result is discarded; presumably this validates/loads the
            # encoding before the splitter is built - TODO confirm.
            tiktoken.get_encoding(str(config.TIKTOKEN_ENCODING_NAME))
            text_splitter = TokenTextSplitter(
                encoding_name=str(config.TIKTOKEN_ENCODING_NAME),
                chunk_size=config.CHUNK_SIZE,
                chunk_overlap=config.CHUNK_OVERLAP,
                add_start_index=True,
            )
            docs = text_splitter.split_documents(docs)
        else:
            raise ValueError(ERROR_MESSAGES.DEFAULT('Invalid text splitter'))

    if len(docs) == 0:
        raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)

    texts = [sanitize_text_for_db(doc.page_content) for doc in docs]
    metadatas = [
        {
            **doc.metadata,
            **(metadata if metadata else {}),
            # Record which embedding setup produced the stored vectors.
            'embedding_config': {
                'engine': config.RAG_EMBEDDING_ENGINE,
                'model': config.RAG_EMBEDDING_MODEL,
            },
        }
        for doc in docs
    ]

    try:
        if VECTOR_DB_CLIENT.has_collection(collection_name=collection_name):
            log.info(f'collection {collection_name} already exists')

            if overwrite:
                VECTOR_DB_CLIENT.delete_collection(collection_name=collection_name)
                log.info(f'deleting existing collection {collection_name}')
            elif add is False:
                log.info(f'collection {collection_name} already exists, overwrite is False and add is False')
                return True

        log.info(f'generating embeddings for {collection_name}')

        # Pick the endpoint/credentials matching the configured engine; any
        # engine other than openai/ollama falls back to the Azure settings.
        engine = config.RAG_EMBEDDING_ENGINE
        if engine == 'openai':
            embedding_url = config.RAG_OPENAI_API_BASE_URL
            embedding_key = config.RAG_OPENAI_API_KEY
        elif engine == 'ollama':
            embedding_url = config.RAG_OLLAMA_BASE_URL
            embedding_key = config.RAG_OLLAMA_API_KEY
        else:
            embedding_url = config.RAG_AZURE_OPENAI_BASE_URL
            embedding_key = config.RAG_AZURE_OPENAI_API_KEY

        embedding_function = get_embedding_function(
            engine,
            config.RAG_EMBEDDING_MODEL,
            request.app.state.ef,
            embedding_url,
            embedding_key,
            config.RAG_EMBEDDING_BATCH_SIZE,
            azure_api_version=(config.RAG_AZURE_OPENAI_API_VERSION if engine == 'azure_openai' else None),
            enable_async=config.ENABLE_ASYNC_EMBEDDING,
            concurrent_requests=config.RAG_EMBEDDING_CONCURRENT_REQUESTS,
        )

        # Run async embedding in sync context using the main event loop
        # This allows the main loop to stay responsive to health checks during long operations
        embedding_timeout = RAG_EMBEDDING_TIMEOUT

        future = asyncio.run_coroutine_threadsafe(
            embedding_function(
                # Newlines are flattened to spaces for embedding only; the
                # stored text keeps them.
                [text.replace('\n', ' ') for text in texts],
                prefix=RAG_EMBEDDING_CONTENT_PREFIX,
                user=user,
            ),
            request.app.state.main_loop,
        )
        try:
            embeddings = future.result(timeout=embedding_timeout)
        except FutureTimeoutError:
            # Do not leave the abandoned coroutine running on the main loop.
            future.cancel()
            raise
        log.info(f'embeddings generated {len(embeddings)} for {len(texts)} items')

        items = [
            {
                'id': str(uuid.uuid4()),
                'text': text,
                'vector': embeddings[idx],
                'metadata': metadatas[idx],
            }
            for idx, text in enumerate(texts)
        ]

        log.info(f'adding to collection {collection_name}')
        VECTOR_DB_CLIENT.insert(
            collection_name=collection_name,
            items=items,
        )

        log.info(f'added {len(items)} items to collection {collection_name}')
        return True
    except Exception as e:
        log.exception(e)
        raise
|
|
1532
|
+
|
|
1533
|
+
|
|
1534
|
+
class ProcessFileForm(BaseModel):
    """Request body for ``POST /process/file``."""

    # ID of the stored file to process.
    file_id: str
    # Optional replacement text; when non-empty it is indexed instead of
    # the stored file.
    content: Optional[str] = None
    # Optional target collection; process_file falls back to
    # ``file-{file.id}`` when this is None.
    collection_name: Optional[str] = None
|
|
1538
|
+
|
|
1539
|
+
|
|
1540
|
+
def _file_doc_metadata(file, base: Optional[dict] = None) -> dict:
    """Build vector-store metadata for ``file`` layered over ``base``.

    ``base`` defaults to the file's own ``meta`` (an absent ``meta`` is
    treated as empty). The identifying keys always override ``base``.
    """
    if base is None:
        base = file.meta or {}
    return {
        **base,
        'name': file.filename,
        'created_by': file.user_id,
        'file_id': file.id,
        'source': file.filename,
    }


def _stored_file_content(file) -> str:
    """Return the text stored on the file record, or '' when there is none."""
    return (file.data or {}).get('content', '')


@router.post('/process/file')
async def process_file(
    request: Request,
    form_data: ProcessFileForm,
    user=Depends(get_verified_user),
    db: AsyncSession = Depends(get_async_session),
):
    """
    Process a file and save its content to the vector database.

    The text to index comes from one of three places, checked in order:

    1. ``form_data.content``: caller-supplied text. The file's own
       ``file-{id}`` collection is dropped first (best effort).
    2. ``form_data.collection_name``: the file is being added to another
       collection. Chunks already stored under ``file-{id}`` are reused,
       falling back to the content stored on the file record.
    3. Otherwise the file at ``file.path`` is run through the configured
       loader, falling back to the stored content when there is no path.

    Note: granular session management is used to prevent connection pool exhaustion.
    The session is committed before external API calls, and updates use a fresh session.

    Returns:
        dict with ``status``, ``collection_name`` (None when embedding is
        bypassed), ``filename`` and ``content``.

    Raises:
        HTTPException: 404 when the file is not found (non-admins can only
            see their own files); 400 when processing fails, in which case
            the file is marked ``failed`` and its hash is cleared.
    """
    if user.role == 'admin':
        file = await Files.get_file_by_id(form_data.file_id, db=db)
    else:
        file = await Files.get_file_by_id_and_user_id(form_data.file_id, user.id, db=db)

    if not file:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=ERROR_MESSAGES.NOT_FOUND)

    try:
        own_collection = f'file-{file.id}'

        # NOTE(review): unlike process_text/process_web in this file, a
        # caller-supplied collection_name is not passed through
        # _validate_collection_access here - confirm that callers enforce
        # write access before relying on this endpoint.
        collection_name = form_data.collection_name
        if collection_name is None:
            collection_name = own_collection

        if form_data.content:
            # Update the content in the file
            # Usage: /files/{file_id}/data/content/update, /files/ (audio file upload pipeline)
            try:
                # /files/{file_id}/data/content/update
                await ASYNC_VECTOR_DB_CLIENT.delete_collection(collection_name=own_collection)
            except Exception as delete_error:
                # Audio file upload pipeline: there may be nothing to delete
                # yet, so this stays best effort but is no longer silent.
                log.debug(f'could not delete collection {own_collection}: {delete_error}')

            docs = [
                Document(
                    page_content=form_data.content.replace('<br/>', '\n'),
                    metadata=_file_doc_metadata(file),
                )
            ]
            text_content = form_data.content
        elif form_data.collection_name:
            # Check if the file has already been processed and save the content
            # Usage: /knowledge/{id}/file/add, /knowledge/{id}/file/update
            result = await ASYNC_VECTOR_DB_CLIENT.query(
                collection_name=own_collection, filter={'file_id': file.id}
            )

            if result is not None and len(result.ids[0]) > 0:
                docs = [
                    Document(
                        page_content=result.documents[0][idx],
                        metadata=result.metadatas[0][idx],
                    )
                    for idx, _ in enumerate(result.ids[0])
                ]
            else:
                docs = [
                    Document(
                        page_content=_stored_file_content(file),
                        metadata=_file_doc_metadata(file),
                    )
                ]

            text_content = _stored_file_content(file)
        else:
            # Process the file and save the content
            # Usage: /files/
            file_path = file.path
            if file_path:
                file_path = await asyncio.to_thread(Storage.get_file, file_path)
                loader = build_loader_from_config(request)
                loader.user = user
                loaded_docs = await loader.aload(
                    file.filename, (file.meta or {}).get('content_type'), file_path
                )
                docs = [
                    Document(
                        page_content=doc.page_content,
                        metadata=_file_doc_metadata(file, filter_metadata(doc.metadata)),
                    )
                    for doc in loaded_docs
                ]
            else:
                docs = [
                    Document(
                        page_content=_stored_file_content(file),
                        metadata=_file_doc_metadata(file),
                    )
                ]
            text_content = ' '.join(doc.page_content for doc in docs)

        log.debug(f'text_content: {text_content}')
        await Files.update_file_data_by_id(
            file.id,
            {'content': text_content},
            db=db,
        )
        content_hash = calculate_sha256_string(text_content)

        if request.app.state.config.BYPASS_EMBEDDING_AND_RETRIEVAL:
            await Files.update_file_data_by_id(file.id, {'status': 'completed'}, db=db)
            await Files.update_file_hash_by_id(file.id, content_hash, db=db)
            return {
                'status': True,
                'collection_name': None,
                'filename': file.filename,
                'content': text_content,
            }

        # Commit any pending changes before the slow embedding step.
        # Note: file is already a Pydantic model (not ORM), so no expunge needed.
        await db.commit()

        # External embedding API takes time (5-60s+).
        # Subsequent updates use fresh async sessions.
        # NOTE: save_docs_to_vector_db is a sync function that
        # calls asyncio.run_coroutine_threadsafe(..., main_loop).result()
        # which blocks the calling thread. We MUST run it in a
        # worker thread to avoid deadlocking the event loop.
        result = await run_in_threadpool(
            save_docs_to_vector_db,
            request,
            docs=docs,
            collection_name=collection_name,
            metadata={
                'file_id': file.id,
                'name': file.filename,
                'hash': content_hash,
            },
            add=bool(form_data.collection_name),
            user=user,
        )
        log.info(f'added {len(docs)} items to collection {collection_name}')

        if not result:
            raise Exception('Error saving document to vector database')

        # Fresh session for the final update.
        async with get_async_db() as session:
            await Files.update_file_metadata_by_id(
                file.id,
                {
                    'collection_name': collection_name,
                },
                db=session,
            )
            await Files.update_file_data_by_id(
                file.id,
                {'status': 'completed'},
                db=session,
            )
            await Files.update_file_hash_by_id(file.id, content_hash, db=session)

        return {
            'status': True,
            'collection_name': collection_name,
            'filename': file.filename,
            'content': text_content,
        }
    except Exception as e:
        log.exception(e)
        try:
            # Fresh session for error status update.
            async with get_async_db() as session:
                await Files.update_file_data_by_id(
                    file.id,
                    {'status': 'failed'},
                    db=session,
                )
                # Clear the hash so the file can be re-uploaded after fixing the issue
                await Files.update_file_hash_by_id(file.id, None, db=session)
        except Exception as status_error:
            # Do not let a bookkeeping failure hide the original error.
            log.exception(status_error)

        if 'No pandoc was found' in str(e):
            detail = ERROR_MESSAGES.PANDOC_NOT_INSTALLED
        else:
            detail = str(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=detail,
        ) from e
|
|
1757
|
+
|
|
1758
|
+
|
|
1759
|
+
class ProcessTextForm(BaseModel):
    """Request body for ``POST /process/text``."""

    # Stored as the ``name`` metadata of the resulting document.
    name: str
    # Raw text to index.
    content: str
    # Optional target collection; when None, process_text derives a name
    # from ``calculate_sha256_string(content)``.
    collection_name: Optional[str] = None
|
|
1763
|
+
|
|
1764
|
+
|
|
1765
|
+
@router.post('/process/text')
async def process_text(
    request: Request,
    form_data: ProcessTextForm,
    user=Depends(get_verified_user),
):
    """
    Index a piece of raw text as a single document.

    When no collection is named, one is derived from
    ``calculate_sha256_string(content)``; a caller-supplied collection is
    checked for write access first.

    Raises:
        HTTPException: 403 from the access check, or 500 when
            save_docs_to_vector_db reports failure.
    """
    text_content = form_data.content

    if form_data.collection_name is not None:
        collection_name = form_data.collection_name
        await _validate_collection_access([collection_name], user, access_type='write')
    else:
        # NOTE(review): process_web truncates its derived name to 63
        # characters; this one is used untruncated - confirm that is intended.
        collection_name = calculate_sha256_string(text_content)

    document = Document(
        page_content=text_content,
        metadata={'name': form_data.name, 'created_by': user.id},
    )
    log.debug(f'text_content: {text_content}')

    saved = await run_in_threadpool(save_docs_to_vector_db, request, [document], collection_name, user=user)
    if not saved:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=ERROR_MESSAGES.DEFAULT(),
        )

    return {
        'status': True,
        'collection_name': collection_name,
        'content': text_content,
    }
|
|
1798
|
+
|
|
1799
|
+
|
|
1800
|
+
@router.post('/process/youtube')
@router.post('/process/web')
async def process_web(
    request: Request,
    form_data: ProcessUrlForm,
    process: bool = Query(True, description='Whether to process and save the content'),
    overwrite: bool = Query(True, description='Whether to overwrite existing collection'),
    user=Depends(get_verified_user),
):
    """
    Fetch the content behind ``form_data.url`` and optionally index it.

    With ``process`` false only the fetched content is returned. Otherwise
    the documents are saved to ``form_data.collection_name`` (write access
    is checked) or to a collection named after the URL hash, unless
    BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL is set, in which case nothing
    is stored and ``collection_name`` is None in the response.

    Raises:
        HTTPException: any HTTP error raised while handling the request
            (e.g. 403 from the access check) is propagated unchanged; every
            other failure is reported as 400.
    """
    try:
        content, docs = await run_in_threadpool(get_content_from_url, request, form_data.url)
        log.debug(f'text_content: {content}')

        if not process:
            return {
                'status': True,
                'content': content,
            }

        collection_name = form_data.collection_name
        if not collection_name:
            collection_name = calculate_sha256_string(form_data.url)[:63]
        else:
            await _validate_collection_access([collection_name], user, access_type='write')

        if not request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL:
            await run_in_threadpool(
                save_docs_to_vector_db,
                request,
                docs,
                collection_name,
                overwrite=overwrite,
                add=(not overwrite),
                user=user,
            )
        else:
            collection_name = None

        return {
            'status': True,
            'collection_name': collection_name,
            'filename': form_data.url,
            'file': {
                'data': {
                    'content': content,
                },
                'meta': {
                    'name': form_data.url,
                    'source': form_data.url,
                },
            },
        }
    except HTTPException:
        # HTTPException is an Exception subclass; without this clause the
        # 403 from the access check would be re-wrapped as a generic 400.
        raise
    except Exception as e:
        log.exception(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )
|
|
1858
|
+
|
|
1859
|
+
|
|
1860
|
+
def search_web(request: Request, engine: str, query: str, user=None) -> list[SearchResult]:
    """Run ``query`` against the search backend named by ``engine``.

    The backend is chosen solely by ``engine``; its credentials and options
    are read from ``request.app.state.config``. Backends that need a key or
    URL raise when it is missing, except where noted in the branch.

    Args:
        request: Current request; supplies the app config and is forwarded
            to the ``external`` and ``yandex`` backends.
        engine (str): Backend identifier, e.g. ``'searxng'`` or ``'brave'``.
        query (str): The query to search for
        user: Forwarded to the backends that accept it.

    Raises:
        Exception: when the selected backend's required setting is empty,
            or when ``engine`` matches no known backend.
    """
    config = request.app.state.config
    count = config.WEB_SEARCH_RESULT_COUNT
    domain_filter = config.WEB_SEARCH_DOMAIN_FILTER_LIST

    # TODO: add playwright to search the web
    if engine == 'ollama_cloud':
        return search_ollama_cloud(
            'https://ollama.com',
            config.OLLAMA_CLOUD_WEB_SEARCH_API_KEY,
            query,
            count,
            domain_filter,
        )
    elif engine == 'perplexity_search':
        if not config.PERPLEXITY_API_KEY:
            raise Exception('No PERPLEXITY_API_KEY found in environment variables')
        return search_perplexity_search(
            config.PERPLEXITY_API_KEY,
            query,
            count,
            domain_filter,
            config.PERPLEXITY_SEARCH_API_URL,
            user,
        )
    elif engine == 'searxng':
        if not config.SEARXNG_QUERY_URL:
            raise Exception('No SEARXNG_QUERY_URL found in environment variables')
        return search_searxng(
            config.SEARXNG_QUERY_URL,
            query,
            count,
            domain_filter,
            language=config.SEARXNG_LANGUAGE,
        )
    elif engine == 'yacy':
        if not config.YACY_QUERY_URL:
            raise Exception('No YACY_QUERY_URL found in environment variables')
        return search_yacy(
            config.YACY_QUERY_URL,
            config.YACY_USERNAME,
            config.YACY_PASSWORD,
            query,
            count,
            domain_filter,
        )
    elif engine == 'google_pse':
        if not (config.GOOGLE_PSE_API_KEY and config.GOOGLE_PSE_ENGINE_ID):
            raise Exception('No GOOGLE_PSE_API_KEY or GOOGLE_PSE_ENGINE_ID found in environment variables')
        return search_google_pse(
            config.GOOGLE_PSE_API_KEY,
            config.GOOGLE_PSE_ENGINE_ID,
            query,
            count,
            domain_filter,
            referer=config.WEBUI_URL,
        )
    elif engine == 'brave':
        if not config.BRAVE_SEARCH_API_KEY:
            raise Exception('No BRAVE_SEARCH_API_KEY found in environment variables')
        return search_brave(config.BRAVE_SEARCH_API_KEY, query, count, domain_filter)
    elif engine == 'kagi':
        if not config.KAGI_SEARCH_API_KEY:
            raise Exception('No KAGI_SEARCH_API_KEY found in environment variables')
        return search_kagi(config.KAGI_SEARCH_API_KEY, query, count, domain_filter)
    elif engine == 'mojeek':
        if not config.MOJEEK_SEARCH_API_KEY:
            raise Exception('No MOJEEK_SEARCH_API_KEY found in environment variables')
        return search_mojeek(config.MOJEEK_SEARCH_API_KEY, query, count, domain_filter)
    elif engine == 'bocha':
        if not config.BOCHA_SEARCH_API_KEY:
            raise Exception('No BOCHA_SEARCH_API_KEY found in environment variables')
        return search_bocha(config.BOCHA_SEARCH_API_KEY, query, count, domain_filter)
    elif engine == 'serpstack':
        if not config.SERPSTACK_API_KEY:
            raise Exception('No SERPSTACK_API_KEY found in environment variables')
        return search_serpstack(
            config.SERPSTACK_API_KEY,
            query,
            count,
            domain_filter,
            https_enabled=config.SERPSTACK_HTTPS,
        )
    elif engine == 'serper':
        if not config.SERPER_API_KEY:
            raise Exception('No SERPER_API_KEY found in environment variables')
        return search_serper(config.SERPER_API_KEY, query, count, domain_filter)
    elif engine == 'serply':
        if not config.SERPLY_API_KEY:
            raise Exception('No SERPLY_API_KEY found in environment variables')
        return search_serply(
            config.SERPLY_API_KEY,
            query,
            count,
            filter_list=domain_filter,
        )
    elif engine == 'duckduckgo':
        return search_duckduckgo(
            query,
            count,
            domain_filter,
            concurrent_requests=config.WEB_SEARCH_CONCURRENT_REQUESTS,
            backend=config.DDGS_BACKEND,
        )
    elif engine == 'tavily':
        if not config.TAVILY_API_KEY:
            raise Exception('No TAVILY_API_KEY found in environment variables')
        return search_tavily(config.TAVILY_API_KEY, query, count, domain_filter)
    elif engine == 'exa':
        # The original chain carried a second, unreachable 'exa' branch
        # further down without the key check; this is the only live one.
        if not config.EXA_API_KEY:
            raise Exception('No EXA_API_KEY found in environment variables')
        return search_exa(config.EXA_API_KEY, query, count, domain_filter)
    elif engine == 'searchapi':
        if not config.SEARCHAPI_API_KEY:
            raise Exception('No SEARCHAPI_API_KEY found in environment variables')
        return search_searchapi(
            config.SEARCHAPI_API_KEY,
            config.SEARCHAPI_ENGINE,
            query,
            count,
            domain_filter,
        )
    elif engine == 'serpapi':
        if not config.SERPAPI_API_KEY:
            raise Exception('No SERPAPI_API_KEY found in environment variables')
        return search_serpapi(
            config.SERPAPI_API_KEY,
            config.SERPAPI_ENGINE,
            query,
            count,
            domain_filter,
        )
    elif engine == 'jina':
        # No key check and no domain filter for this backend.
        return search_jina(
            config.JINA_API_KEY,
            query,
            count,
            config.JINA_API_BASE_URL,
        )
    elif engine == 'bing':
        return search_bing(
            config.BING_SEARCH_V7_SUBSCRIPTION_KEY,
            config.BING_SEARCH_V7_ENDPOINT,
            str(DEFAULT_LOCALE),
            query,
            count,
            domain_filter,
        )
    elif engine == 'azure':
        if not (
            config.AZURE_AI_SEARCH_API_KEY
            and config.AZURE_AI_SEARCH_ENDPOINT
            and config.AZURE_AI_SEARCH_INDEX_NAME
        ):
            raise Exception(
                'AZURE_AI_SEARCH_API_KEY, AZURE_AI_SEARCH_ENDPOINT, and AZURE_AI_SEARCH_INDEX_NAME are required for Azure AI Search'
            )
        return search_azure(
            config.AZURE_AI_SEARCH_API_KEY,
            config.AZURE_AI_SEARCH_ENDPOINT,
            config.AZURE_AI_SEARCH_INDEX_NAME,
            query,
            count,
            domain_filter,
        )
    elif engine == 'perplexity':
        return search_perplexity(
            config.PERPLEXITY_API_KEY,
            query,
            count,
            domain_filter,
            model=config.PERPLEXITY_MODEL,
            search_context_usage=config.PERPLEXITY_SEARCH_CONTEXT_USAGE,
        )
    elif engine == 'sougou':
        if not (config.SOUGOU_API_SID and config.SOUGOU_API_SK):
            raise Exception('No SOUGOU_API_SID or SOUGOU_API_SK found in environment variables')
        return search_sougou(
            config.SOUGOU_API_SID,
            config.SOUGOU_API_SK,
            query,
            count,
            domain_filter,
        )
    elif engine == 'firecrawl':
        return search_firecrawl(
            config.FIRECRAWL_API_BASE_URL,
            config.FIRECRAWL_API_KEY,
            query,
            count,
            domain_filter,
        )
    elif engine == 'external':
        return search_external(
            request,
            config.EXTERNAL_WEB_SEARCH_URL,
            config.EXTERNAL_WEB_SEARCH_API_KEY,
            query,
            count,
            domain_filter,
            user=user,
        )
    elif engine == 'yandex':
        return search_yandex(
            request,
            config.YANDEX_WEB_SEARCH_URL,
            config.YANDEX_WEB_SEARCH_API_KEY,
            config.YANDEX_WEB_SEARCH_CONFIG,
            query,
            count,
            domain_filter,
            user=user,
        )
    elif engine == 'youcom':
        return search_youcom(config.YOUCOM_API_KEY, query, count, domain_filter)
    else:
        # Reached for any engine name not handled above.
        raise Exception('No search engine API key found in environment variables')
|
|
2160
|
+
|
|
2161
|
+
|
|
2162
|
+
@router.post('/process/web/search')
async def process_web_search(request: Request, form_data: SearchForm, user=Depends(get_verified_user)):
    """
    Run every query in ``form_data.queries`` through the configured search
    engine, load the resulting pages and index them in one collection.

    Steps:
      1. Reject the call unless web search is enabled and the user is an
         admin or holds the ``features.web_search`` permission.
      2. Search all queries, optionally bounded by
         WEB_SEARCH_CONCURRENT_REQUESTS, and collect de-duplicated links.
      3. Build documents from result snippets (BYPASS_WEB_SEARCH_WEB_LOADER)
         or by loading the URLs with the web loader.
      4. Return the documents directly
         (BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL) or save them to a
         ``web-search-...`` collection. A save failure is only logged.

    Raises:
        HTTPException: 403 when not permitted, 400 when searching or
            loading fails, 404 when no result carries a link.
    """
    config = request.app.state.config

    if not config.ENABLE_WEB_SEARCH:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail=ERROR_MESSAGES.ACCESS_PROHIBITED,
        )

    if user.role != 'admin' and not await has_permission(
        user.id, 'features.web_search', config.USER_PERMISSIONS
    ):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail=ERROR_MESSAGES.ACCESS_PROHIBITED,
        )

    urls = []
    result_items = []

    try:
        # Use the module logger like the rest of this handler.
        log.debug(f'trying to web search with {config.WEB_SEARCH_ENGINE, form_data.queries}')

        # Use semaphore to limit concurrent requests based on WEB_SEARCH_CONCURRENT_REQUESTS
        # 0 or None = unlimited (previous behavior), positive number = limited concurrency
        # Set to 1 for sequential execution (rate-limited APIs like Brave free tier)
        concurrent_limit = config.WEB_SEARCH_CONCURRENT_REQUESTS

        if concurrent_limit:
            # Limited concurrency with semaphore
            semaphore = asyncio.Semaphore(concurrent_limit)

            async def search_query_with_semaphore(query):
                async with semaphore:
                    return await run_in_threadpool(
                        search_web,
                        request,
                        config.WEB_SEARCH_ENGINE,
                        query,
                        user,
                    )

            search_tasks = [search_query_with_semaphore(query) for query in form_data.queries]
        else:
            # Unlimited parallel execution (previous behavior)
            search_tasks = [
                run_in_threadpool(
                    search_web,
                    request,
                    config.WEB_SEARCH_ENGINE,
                    query,
                    user,
                )
                for query in form_data.queries
            ]

        search_results = await asyncio.gather(*search_tasks)

        for result in search_results:
            if result:
                for item in result:
                    if item and item.link:
                        result_items.append(item)
                        urls.append(item.link)

        # De-duplicate while keeping first-seen order.
        urls = list(dict.fromkeys(urls))
        log.debug(f'urls: {urls}')

    except Exception as e:
        log.exception(e)

        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.WEB_SEARCH_ERROR(e),
        )

    if not urls:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.DEFAULT('No results found from web search'),
        )

    try:
        if config.BYPASS_WEB_SEARCH_WEB_LOADER:
            # The emptiness guard must come before the inner ``for``;
            # placed after it, a None result is iterated (TypeError)
            # before the guard is ever evaluated.
            search_results = [item for result in search_results if result for item in result]

            docs = [
                Document(
                    page_content=result.snippet,
                    metadata={
                        'source': result.link,
                        'title': result.title,
                        'snippet': result.snippet,
                        'link': result.link,
                    },
                )
                for result in search_results
                if hasattr(result, 'snippet') and result.snippet is not None
            ]
        else:
            loader = get_web_loader(
                urls,
                verify_ssl=config.ENABLE_WEB_LOADER_SSL_VERIFICATION,
                requests_per_second=config.WEB_LOADER_CONCURRENT_REQUESTS,
                trust_env=config.WEB_SEARCH_TRUST_ENV,
            )
            docs = await loader.aload()

            urls = [
                doc.metadata.get('source') for doc in docs if doc.metadata.get('source')
            ]  # only keep the urls returned by the loader
            loaded_urls = set(urls)
            result_items = [
                dict(item) for item in result_items if item.link in loaded_urls
            ]  # only keep the search results that have been loaded

        if config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL:
            return {
                'status': True,
                'collection_name': None,
                'filenames': urls,
                'items': result_items,
                'docs': [
                    {
                        'content': doc.page_content,
                        'metadata': doc.metadata,
                    }
                    for doc in docs
                ],
                'loaded_count': len(docs),
            }

        # Create a single collection for all documents
        collection_name = f'web-search-{calculate_sha256_string("-".join(form_data.queries))}'[:63]

        try:
            await run_in_threadpool(
                save_docs_to_vector_db,
                request,
                docs,
                collection_name,
                overwrite=True,
                user=user,
            )
        except Exception as e:
            # Best effort: the response is still returned when saving fails.
            log.debug(f'error saving docs: {e}')

        return {
            'status': True,
            'collection_names': [collection_name],
            'items': result_items,
            'filenames': urls,
            'loaded_count': len(docs),
        }
    except Exception as e:
        log.exception(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )
|
|
2320
|
+
|
|
2321
|
+
|
|
2322
|
+
async def _validate_collection_access(collection_names: list[str], user, access_type: str = 'read') -> None:
    """
    Reject the request with 403 unless the user may access every named collection.

    The decision itself is made by the shared filter_accessible_collections
    helper; any requested name that is missing from its result counts as denied.
    """
    wanted = set(collection_names)
    permitted = await filter_accessible_collections(wanted, user, access_type=access_type)

    # Anything asked for but not granted is enough to refuse the whole request.
    if wanted - permitted:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail=ERROR_MESSAGES.ACCESS_PROHIBITED,
        )
|
|
2336
|
+
|
|
2337
|
+
|
|
2338
|
+
class QueryDocForm(BaseModel):
    # Request body for POST /query/doc.
    #
    # The optional tuning fields (k, k_reranker, r, hybrid_bm25_weight) fall
    # back to the server configuration in the handler when they are left
    # unset. `hybrid` left as None means "use hybrid search if the server has
    # it enabled"; an explicit False forces the plain embedding lookup.
    collection_name: str
    query: str
    k: Optional[int] = None
    k_reranker: Optional[int] = None
    r: Optional[float] = None
    hybrid: Optional[bool] = None
    # query_doc_handler reads this attribute on the hybrid-search path. It was
    # previously undeclared, so the attribute access raised AttributeError and
    # every hybrid /query/doc request was answered with a 400. Declaring it
    # (optional, default None) mirrors QueryCollectionsForm and keeps existing
    # request payloads valid.
    hybrid_bm25_weight: Optional[float] = None
|
|
2345
|
+
|
|
2346
|
+
|
|
2347
|
+
@router.post('/query/doc')
async def query_doc_handler(
    request: Request,
    form_data: QueryDocForm,
    user=Depends(get_verified_user),
):
    # Query a single collection: via hybrid search when it is enabled in the
    # config and not switched off by the request, otherwise via a plain
    # embedding lookup. The access check runs first and its HTTPException
    # propagates untouched; any failure during the query itself is logged and
    # reported as a 400.
    await _validate_collection_access([form_data.collection_name], user)

    try:
        state = request.app.state
        cfg = state.config

        # An explicit `hybrid=False` opts out; None defers to the server setting.
        hybrid_declined = form_data.hybrid is not None and not form_data.hybrid
        if not cfg.ENABLE_RAG_HYBRID_SEARCH or hybrid_declined:
            query_embedding = await state.EMBEDDING_FUNCTION(
                form_data.query, prefix=RAG_EMBEDDING_QUERY_PREFIX, user=user
            )
            # query_doc wraps a blocking VECTOR_DB_CLIENT.search call;
            # run it in a worker thread so the event loop stays responsive.
            return await asyncio.to_thread(
                query_doc,
                collection_name=form_data.collection_name,
                query_embedding=query_embedding,
                k=form_data.k or cfg.TOP_K,
                user=user,
            )

        collection_result = await ASYNC_VECTOR_DB_CLIENT.get(collection_name=form_data.collection_name)

        def embed(query, prefix):
            return state.EMBEDDING_FUNCTION(query, prefix=prefix, user=user)

        top_k = form_data.k or cfg.TOP_K

        # Only hand over a reranker when one is configured on the app state.
        if state.RERANKING_FUNCTION:

            def rerank(query, documents):
                return state.RERANKING_FUNCTION(query, documents, user=user)

        else:
            rerank = None

        return await query_doc_with_hybrid_search(
            collection_name=form_data.collection_name,
            collection_result=collection_result,
            query=form_data.query,
            embedding_function=embed,
            k=top_k,
            reranking_function=rerank,
            k_reranker=form_data.k_reranker or cfg.TOP_K_RERANKER,
            r=form_data.r or cfg.RELEVANCE_THRESHOLD,
            hybrid_bm25_weight=form_data.hybrid_bm25_weight or cfg.HYBRID_BM25_WEIGHT,
            user=user,
        )
    except Exception as e:
        log.exception(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )
|
|
2402
|
+
|
|
2403
|
+
|
|
2404
|
+
class QueryCollectionsForm(BaseModel):
    # Request body for POST /query/collection.

    # Collections to search and the query text to run against them.
    collection_names: list[str]
    query: str

    # Optional per-request overrides; the handler substitutes the matching
    # server configuration value when one of these is not supplied.
    k: Optional[int] = None
    k_reranker: Optional[int] = None
    r: Optional[float] = None
    hybrid: Optional[bool] = None
    hybrid_bm25_weight: Optional[float] = None
    enable_enriched_texts: Optional[bool] = None
|
|
2413
|
+
|
|
2414
|
+
|
|
2415
|
+
@router.post('/query/collection')
async def query_collection_handler(
    request: Request,
    form_data: QueryCollectionsForm,
    user=Depends(get_verified_user),
):
    # Run one query across several collections. Hybrid search is used when the
    # config enables it and the request does not opt out; otherwise the plain
    # query_collection path is taken. The access check runs first and its
    # HTTPException propagates untouched; failures while querying are logged
    # and reported as a 400.
    await _validate_collection_access(form_data.collection_names, user)

    try:
        state = request.app.state
        cfg = state.config

        def embed(query, prefix):
            return state.EMBEDDING_FUNCTION(query, prefix=prefix, user=user)

        # An explicit `hybrid=False` opts out; None defers to the server setting.
        hybrid_declined = form_data.hybrid is not None and not form_data.hybrid
        if not cfg.ENABLE_RAG_HYBRID_SEARCH or hybrid_declined:
            return await query_collection(
                request,
                collection_names=form_data.collection_names,
                queries=[form_data.query],
                embedding_function=embed,
                k=form_data.k or cfg.TOP_K,
            )

        top_k = form_data.k or cfg.TOP_K

        # Only hand over a reranker when one is configured on the app state.
        if state.RERANKING_FUNCTION:

            def rerank(query, documents):
                return state.RERANKING_FUNCTION(query, documents, user=user)

        else:
            rerank = None

        k_reranker = form_data.k_reranker or cfg.TOP_K_RERANKER
        relevance = form_data.r or cfg.RELEVANCE_THRESHOLD
        bm25_weight = form_data.hybrid_bm25_weight or cfg.HYBRID_BM25_WEIGHT

        # Unlike the values above, this flag is a real boolean: an explicit
        # False must win over the configured default, hence the None test.
        if form_data.enable_enriched_texts is not None:
            enriched = form_data.enable_enriched_texts
        else:
            enriched = cfg.ENABLE_RAG_HYBRID_SEARCH_ENRICHED_TEXTS

        return await query_collection_with_hybrid_search(
            collection_names=form_data.collection_names,
            queries=[form_data.query],
            embedding_function=embed,
            k=top_k,
            reranking_function=rerank,
            k_reranker=k_reranker,
            r=relevance,
            hybrid_bm25_weight=bm25_weight,
            enable_enriched_texts=enriched,
        )

    except Exception as e:
        log.exception(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )
|
|
2467
|
+
|
|
2468
|
+
|
|
2469
|
+
####################################
|
|
2470
|
+
#
|
|
2471
|
+
# Vector DB operations
|
|
2472
|
+
#
|
|
2473
|
+
####################################
|
|
2474
|
+
|
|
2475
|
+
|
|
2476
|
+
class DeleteForm(BaseModel):
    # Request body for POST /delete: the collection to prune and the id of the
    # file whose stored hash selects the vector entries to remove.
    collection_name: str
    file_id: str
|
|
2479
|
+
|
|
2480
|
+
|
|
2481
|
+
@router.post('/delete')
async def delete_entries_from_collection(
    form_data: DeleteForm,
    user=Depends(get_admin_user),
    db: AsyncSession = Depends(get_async_session),
):
    # Delete the vector entries of one file from a collection, matching on the
    # file's stored hash. Returns {'status': True} after the delete call,
    # {'status': False} when the collection is missing or an unexpected error
    # occurs, and raises 404/400 for an unknown file or a file without a hash.
    try:
        collection_exists = await ASYNC_VECTOR_DB_CLIENT.has_collection(collection_name=form_data.collection_name)
        if not collection_exists:
            return {'status': False}

        file_record = await Files.get_file_by_id(form_data.file_id, db=db)
        if not file_record:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=ERROR_MESSAGES.NOT_FOUND,
            )

        file_hash = file_record.hash

        # Never send `filter={'hash': None}` to the backend: how a null filter
        # value matches is backend-dependent (some ignore the key, some match
        # every row whose metadata lacks `hash`), so it could delete unrelated
        # entries. Files without a hash are typically unprocessed / failed /
        # legacy records that cannot be targeted by hash anyway.
        if file_hash is None:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.DEFAULT('File has no hash; cannot delete vector entries by hash.'),
            )

        # History: this call used to pass `metadata=`, which is not a
        # parameter of `VectorDBBase.delete` or of any backend implementation.
        # The resulting TypeError was swallowed by the generic handler below,
        # so the endpoint answered `{'status': False}` for every request.
        # `filter` is the argument that actually performs the deletion.
        await ASYNC_VECTOR_DB_CLIENT.delete(
            collection_name=form_data.collection_name,
            filter={'hash': file_hash},
        )
        return {'status': True}
    except HTTPException:
        # The 404/400 raised above are meaningful to the caller and must not
        # be flattened into `{'status': False}`.
        raise
    except Exception as e:
        log.exception(e)
        return {'status': False}
|
|
2531
|
+
|
|
2532
|
+
|
|
2533
|
+
@router.post('/reset/db')
async def reset_vector_db(
    user=Depends(get_admin_user),
    db: AsyncSession = Depends(get_async_session),
):
    # Gated by the get_admin_user dependency. Resets the vector DB client
    # first, then removes all knowledge records through the Knowledges model.
    await ASYNC_VECTOR_DB_CLIENT.reset()
    await Knowledges.delete_all_knowledge(db=db)
|
|
2537
|
+
|
|
2538
|
+
|
|
2539
|
+
@router.post('/reset/uploads')
async def reset_upload_dir(user=Depends(get_admin_user)) -> bool:
    # Best-effort wipe of everything inside UPLOAD_DIR. Failures are logged,
    # never raised, and the endpoint always answers True.
    folder = f'{UPLOAD_DIR}'
    try:
        if not os.path.exists(folder):
            log.warning(f'The directory {folder} does not exist')
        else:
            for filename in os.listdir(folder):
                file_path = os.path.join(folder, filename)
                # One entry failing to delete must not stop the rest.
                try:
                    if os.path.isfile(file_path) or os.path.islink(file_path):
                        # Regular files and symlinks are unlinked directly.
                        os.unlink(file_path)
                    elif os.path.isdir(file_path):
                        # Real directories are removed with their contents.
                        shutil.rmtree(file_path)
                except Exception as e:
                    log.exception(f'Failed to delete {file_path}. Reason: {e}')
    except Exception as e:
        log.exception(f'Failed to process the directory {folder}. Reason: {e}')
    return True
|
|
2560
|
+
|
|
2561
|
+
|
|
2562
|
+
if ENV == 'dev':

    # Registered only when ENV is 'dev': returns the embedding of the text
    # taken from the URL path, produced with the query prefix.
    @router.get('/ef/{text}')
    async def get_embeddings(request: Request, text: Optional[str] = 'Hello World!'):
        embedding = await request.app.state.EMBEDDING_FUNCTION(text, prefix=RAG_EMBEDDING_QUERY_PREFIX)
        return {'result': embedding}
|
|
2567
|
+
|
|
2568
|
+
|
|
2569
|
+
class BatchProcessFilesForm(BaseModel):
    # Request body for POST /process/files/batch: the files to embed and the
    # collection that receives all of them.
    files: list[FileModel]
    collection_name: str
|
|
2572
|
+
|
|
2573
|
+
|
|
2574
|
+
class BatchProcessFilesResult(BaseModel):
    # Outcome for one file of a batch. The batch handler sets `status` to
    # 'prepared', 'completed' or 'failed'; `error` carries the failure text.
    file_id: str
    status: str
    error: Optional[str] = None
|
|
2578
|
+
|
|
2579
|
+
|
|
2580
|
+
class BatchProcessFilesResponse(BaseModel):
    # Response of POST /process/files/batch: per-file outcomes for the files
    # that reached the embedding step, plus a separate list of failures.
    results: list[BatchProcessFilesResult]
    errors: list[BatchProcessFilesResult]
|
|
2583
|
+
|
|
2584
|
+
|
|
2585
|
+
@router.post('/process/files/batch')
async def process_files_batch(
    request: Request,
    form_data: BatchProcessFilesForm,
    user=Depends(get_verified_user),
    db=None,
) -> BatchProcessFilesResponse:
    """
    Embed the text content of several files into one collection in a single batch.

    NOTE: Depends(get_async_session) is deliberately not used here.
    save_docs_to_vector_db() makes external embedding API calls that can take
    5-60+ seconds for batch operations. The database work that follows the
    embedding step (Files.update_file_by_id) manages its own short-lived
    sessions.
    """

    target_collection = form_data.collection_name

    prepared: list[BatchProcessFilesResult] = []
    failures: list[BatchProcessFilesResult] = []
    pending_updates: list[FileUpdateForm] = []

    # Phase 1: build one Document per file that passes the checks.
    batch_docs: list[Document] = []

    for file in form_data.files:
        try:
            # The file must exist in the database and belong to the requesting
            # user, unless that user is an admin.
            stored_file = await Files.get_file_by_id(file.id, db=db)
            if not stored_file:
                failures.append(
                    BatchProcessFilesResult(
                        file_id=file.id,
                        status='failed',
                        error='File not found',
                    )
                )
                continue

            owner_or_admin = stored_file.user_id == user.id or user.role == 'admin'
            if not owner_or_admin:
                failures.append(
                    BatchProcessFilesResult(
                        file_id=file.id,
                        status='failed',
                        error='Permission denied: not file owner',
                    )
                )
                continue

            text_content = file.data.get('content', '')
            doc_metadata = {
                **file.meta,
                'name': file.filename,
                'created_by': file.user_id,
                'file_id': file.id,
                'source': file.filename,
            }
            batch_docs.append(
                Document(
                    page_content=text_content.replace('<br/>', '\n'),
                    metadata=doc_metadata,
                )
            )

            # Queue the matching file update; it is applied only after the
            # vector DB save below has succeeded.
            pending_updates.append(
                FileUpdateForm(
                    hash=calculate_sha256_string(text_content),
                    data={'content': text_content},
                )
            )
            prepared.append(BatchProcessFilesResult(file_id=file.id, status='prepared'))

        except Exception as e:
            log.error(f'process_files_batch: Error processing file {file.id}: {str(e)}')
            failures.append(BatchProcessFilesResult(file_id=file.id, status='failed', error=str(e)))

    # Phase 2: store every prepared document with one call, then update files.
    if batch_docs:
        try:
            await run_in_threadpool(
                save_docs_to_vector_db,
                request,
                batch_docs,
                target_collection,
                add=True,
                user=user,
            )

            # pending_updates and prepared were appended together, so they
            # pair up position by position.
            for update_form, outcome in zip(pending_updates, prepared):
                await Files.update_file_by_id(id=outcome.file_id, form_data=update_form, db=db)
                outcome.status = 'completed'

        except Exception as e:
            log.error(f'process_files_batch: Error saving documents to vector DB: {str(e)}')
            # A failure here marks every prepared file as failed and also
            # records it in the error list.
            for outcome in prepared:
                outcome.status = 'failed'
                failures.append(BatchProcessFilesResult(file_id=outcome.file_id, status='failed', error=str(e)))

    return BatchProcessFilesResponse(results=prepared, errors=failures)
|