langflow-base-nightly 0.5.0.dev30-py3-none-any.whl → 0.5.0.dev31-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langflow/api/router.py +2 -0
- langflow/api/v1/__init__.py +2 -0
- langflow/api/v1/knowledge_bases.py +437 -0
- langflow/base/data/kb_utils.py +104 -0
- langflow/components/data/__init__.py +4 -0
- langflow/components/data/kb_ingest.py +585 -0
- langflow/components/data/kb_retrieval.py +254 -0
- langflow/frontend/assets/{SlackIcon-D2PxMQjX.js → SlackIcon-Bikuxo8x.js} +1 -1
- langflow/frontend/assets/{Wikipedia-BNM0lBPs.js → Wikipedia-B6aCFf5-.js} +1 -1
- langflow/frontend/assets/{Wolfram-COQyGyeC.js → Wolfram-CekL_M-a.js} +1 -1
- langflow/frontend/assets/{index-CTpfN0Cy.js → index-09CVJwsY.js} +1 -1
- langflow/frontend/assets/{index-DWUG3nTC.js → index-1MEYR1La.js} +1 -1
- langflow/frontend/assets/{index-Ds9y6kEK.js → index-2vQdFIK_.js} +1 -1
- langflow/frontend/assets/{index-DRdKSzTn.js → index-4Tl3Nxdo.js} +1 -1
- langflow/frontend/assets/{index-O_vPh7iD.js → index-5G402gB8.js} +1 -1
- langflow/frontend/assets/{index-D15h4ir2.js → index-5hW8VleF.js} +1 -1
- langflow/frontend/assets/{index-BydnMWnM.js → index-6GWpsedd.js} +1 -1
- langflow/frontend/assets/{index-4vIU43o6.js → index-7x3wNZ-4.js} +1 -1
- langflow/frontend/assets/{index-DrFpyu9Z.js → index-9gkURvG2.js} +1 -1
- langflow/frontend/assets/{index-DRe5h2N_.js → index-AOX7bbjJ.js} +1 -1
- langflow/frontend/assets/{index-fJyq3ZWN.js → index-B20KmxhS.js} +1 -1
- langflow/frontend/assets/{index-D_sHnnuS.js → index-B2EmwqKj.js} +1 -1
- langflow/frontend/assets/{index-DEc_2ba8.js → index-B4AtFbkN.js} +1 -1
- langflow/frontend/assets/{index-D_zQiboE.js → index-B4xLpgbM.js} +1 -1
- langflow/frontend/assets/{index-Db8Xgs-K.js → index-B9KRIJFi.js} +1 -1
- langflow/frontend/assets/{index-BzCZNz2f.js → index-B9uOBe6Y.js} +1 -1
- langflow/frontend/assets/{index-pFTvwRsJ.js → index-BDmbsLY2.js} +1 -1
- langflow/frontend/assets/{index-CGef2axA.js → index-BIKbxmIh.js} +1 -1
- langflow/frontend/assets/{index-BTl_mLju.js → index-BIjUtp6d.js} +1 -1
- langflow/frontend/assets/{index-Jze67eTW.js → index-BJIsQS8D.js} +1 -1
- langflow/frontend/assets/{index-DV-gdr7l.js → index-BO4fl1uU.js} +1 -1
- langflow/frontend/assets/{index-BUVmswbg.js → index-BRE8A4Q_.js} +1 -1
- langflow/frontend/assets/{index-CTzWsu8S.js → index-BRNhftot.js} +1 -1
- langflow/frontend/assets/{index-DFYBo38q.js → index-BRizlHaN.js} +1 -1
- langflow/frontend/assets/{index-DbPP5vss.js → index-BRwkzs92.js} +1 -1
- langflow/frontend/assets/{index-BzE7oL1n.js → index-BZCt_UnJ.js} +1 -1
- langflow/frontend/assets/{index-BhRSkpxu.js → index-B_ytx_iA.js} +1 -1
- langflow/frontend/assets/{index-ByCunkn4.js → index-BcqeL_f4.js} +1 -1
- langflow/frontend/assets/{index-CAAZbdRp.js → index-Bgd7yLoW.js} +1 -1
- langflow/frontend/assets/{index-DpDbxNdQ.js → index-BlRTHXW5.js} +1 -1
- langflow/frontend/assets/{index-jXSPQ_JS.js → index-BllNr21U.js} +1 -1
- langflow/frontend/assets/{index-fpMcQS2L.js → index-Bm7a2vMS.js} +1 -1
- langflow/frontend/assets/{index-BFQzmLDT.js → index-Bn4HAVDG.js} +1 -1
- langflow/frontend/assets/{index-D8EpAMC3.js → index-BwlYjc56.js} +1 -1
- langflow/frontend/assets/{index-BcCN9mpu.js → index-BzCjyHto.js} +1 -1
- langflow/frontend/assets/{index-D6-jZ4sc.js → index-C3RZz8WE.js} +1 -1
- langflow/frontend/assets/{index-D66JmFlL.js → index-C69gdJqw.js} +1 -1
- langflow/frontend/assets/{index-pYD0BTGu.js → index-C6P0vvSP.js} +1 -1
- langflow/frontend/assets/{index-CIjw_ZkP.js → index-C7wDSVVH.js} +1 -1
- langflow/frontend/assets/{index-BCTEK38J.js → index-CAzSTGAM.js} +1 -1
- langflow/frontend/assets/{index-8FjgS_Vj.js → index-CEn_71Wk.js} +1 -1
- langflow/frontend/assets/{index-BFiCUM5l.js → index-CGVDXKtN.js} +1 -1
- langflow/frontend/assets/{index-BIH2K0v8.js → index-CIYzjH2y.js} +1 -1
- langflow/frontend/assets/{index-gM8j2Wvk.js → index-COqjpsdy.js} +1 -1
- langflow/frontend/assets/{index-2q8IFBNP.js → index-CP0tFKwN.js} +1 -1
- langflow/frontend/assets/{index-CXpZa4H9.js → index-CPIdMJkX.js} +1 -1
- langflow/frontend/assets/{index-B-YjnRWx.js → index-CSRizl2S.js} +1 -1
- langflow/frontend/assets/{index-DFo0yfS5.js → index-CUe1ivTn.js} +1 -1
- langflow/frontend/assets/{index-C2x5hzgY.js → index-CVphnxXi.js} +1 -1
- langflow/frontend/assets/{index-Bz3QnhLZ.js → index-CY6LUi4V.js} +1 -1
- langflow/frontend/assets/{index-Cq6gk34q.js → index-C_2G2ZqJ.js} +1 -1
- langflow/frontend/assets/{index-CSXUVElo.js → index-C_K6Tof7.js} +1 -1
- langflow/frontend/assets/{index-1D7jZ8vz.js → index-C_UkF-RJ.js} +1 -1
- langflow/frontend/assets/{index-BVGZcHHC.js → index-Cbwk3f-p.js} +1 -1
- langflow/frontend/assets/{index-kiqvo0Zi.js → index-CdwjD4IX.js} +1 -1
- langflow/frontend/assets/{index-BNy3Al2s.js → index-CgbINWS8.js} +1 -1
- langflow/frontend/assets/{index-BXJpd9hg.js → index-CglSqvB5.js} +1 -1
- langflow/frontend/assets/{index-D9CF_54p.js → index-CmiRgF_-.js} +1 -1
- langflow/frontend/assets/{index-ez1EW657.js → index-Cp7Pmn03.js} +1 -1
- langflow/frontend/assets/{index-aypzjPzG.js → index-Cq30cQcP.js} +1 -1
- langflow/frontend/assets/index-CqS7zir1.css +1 -0
- langflow/frontend/assets/{index-DKv0y9Dp.js → index-Cr2oy5K2.js} +1 -1
- langflow/frontend/assets/{index-DrfwVxtD.js → index-Crq_yhkG.js} +1 -1
- langflow/frontend/assets/{index-CzJzRS6i.js → index-Cs_jt3dj.js} +1 -1
- langflow/frontend/assets/{index-DO0mS8FQ.js → index-Cy-ZEfWh.js} +1 -1
- langflow/frontend/assets/{index-Q0bwuTZY.js → index-Cyk3aCmP.js} +1 -1
- langflow/frontend/assets/{index-DToZROdu.js → index-D-HTZ68O.js} +1 -1
- langflow/frontend/assets/{index-C0AEZF1v.js → index-D1RgjMON.js} +1 -1
- langflow/frontend/assets/{index-DilRRF2S.js → index-D29n5mus.js} +1 -1
- langflow/frontend/assets/{index-CKLOrtrx.js → index-D2nHdRne.js} +1 -1
- langflow/frontend/assets/{index-sfFDGjjd.js → index-D7Vx6mgS.js} +1 -1
- langflow/frontend/assets/{index-BAHhLqW9.js → index-D7nFs6oq.js} +1 -1
- langflow/frontend/assets/{index-C7jY4x98.js → index-DAJafn16.js} +1 -1
- langflow/frontend/assets/{index-BefwTGbP.js → index-DDcpxWU4.js} +1 -1
- langflow/frontend/assets/{index-CTZ9iXFr.js → index-DEuXrfXH.js} +1 -1
- langflow/frontend/assets/{index-DFfr0xSt.js → index-DF0oWRdd.js} +1 -1
- langflow/frontend/assets/{index-Bh5pQAZC.js → index-DI0zAExi.js} +1 -1
- langflow/frontend/assets/{index-CG-Suo0F.js → index-DJs6FoYC.js} +1 -1
- langflow/frontend/assets/{index-dvTTQhKz.js → index-DNS4La1f.js} +1 -1
- langflow/frontend/assets/{index-nLDaeeZg.js → index-DOI0ceS-.js} +1 -1
- langflow/frontend/assets/{index-DakdEtbq.js → index-DOb9c2bf.js} +1 -1
- langflow/frontend/assets/{index-CEVnRp4_.js → index-DS4F_Phe.js} +1 -1
- langflow/frontend/assets/{index-DGRg2M1l.js → index-DTJX3yQa.js} +1 -1
- langflow/frontend/assets/{index-BjAsd-Vo.js → index-DVV_etfW.js} +1 -1
- langflow/frontend/assets/{index-BrIuZD2A.js → index-DX_InNVT.js} +1 -1
- langflow/frontend/assets/{index-jG-zLXRN.js → index-DbmqjLy6.js} +1 -1
- langflow/frontend/assets/{index-DSvOFGJR.js → index-Dc0p1Oxl.js} +1 -1
- langflow/frontend/assets/{index-87GFtXu5.js → index-DkJCCraf.js} +1 -1
- langflow/frontend/assets/{index-BXidWkLM.js → index-DlMAYATX.js} +1 -1
- langflow/frontend/assets/{index-sbTxhltT.js → index-DmaQAn3K.js} +1 -1
- langflow/frontend/assets/{index-DkC5vMvx.js → index-DmvjdU1N.js} +1 -1
- langflow/frontend/assets/{index-CSUglByd.js → index-DnusMCK1.js} +1 -1
- langflow/frontend/assets/{index-DZOTHXs0.js → index-DoFlaGDx.js} +1 -1
- langflow/frontend/assets/{index-CZkMjaa8.js → index-DqDQk0Cu.js} +1 -1
- langflow/frontend/assets/{index-lc10GnwG.js → index-DrvRK4_i.js} +1 -1
- langflow/frontend/assets/{index-BNm-yAYc.js → index-DtCsjX48.js} +1 -1
- langflow/frontend/assets/{index-BeLnhfG-.js → index-Dy7ehgeV.js} +1 -1
- langflow/frontend/assets/{index-RGG9hk9J.js → index-Dz0r9Idb.js} +1 -1
- langflow/frontend/assets/{index-Bcq2yA-p.js → index-DzDNhMMW.js} +1 -1
- langflow/frontend/assets/{index-P3f-GeAm.js → index-FYcoJPMP.js} +1 -1
- langflow/frontend/assets/{index-DQwvl_Rp.js → index-Iamzh9ZT.js} +1 -1
- langflow/frontend/assets/{index-Cy6n8tA9.js → index-J0pvFqLk.js} +1 -1
- langflow/frontend/assets/{index-D1XTMye3.js → index-J98sU-1p.js} +1 -1
- langflow/frontend/assets/{index-BZ0rL0tK.js → index-JHCxbvlW.js} +1 -1
- langflow/frontend/assets/{index-DmSH63k1.js → index-KnS52ylc.js} +1 -1
- langflow/frontend/assets/{index-WGZ88ShH.js → index-L7FKc9QN.js} +1 -1
- langflow/frontend/assets/{index-BIoFnUtx.js → index-RveG4dl9.js} +1 -1
- langflow/frontend/assets/{index-BDdkPrzu.js → index-T2jJOG85.js} +1 -1
- langflow/frontend/assets/{index-2839k6WO.js → index-TRyDa01A.js} +1 -1
- langflow/frontend/assets/{index-DvOdMz35.js → index-U7J1YiWE.js} +1 -1
- langflow/frontend/assets/{index-DzUx1-Bl.js → index-UI2ws3qp.js} +1984 -1984
- langflow/frontend/assets/{index-8Fx5I2fx.js → index-VO-pk-Hg.js} +1 -1
- langflow/frontend/assets/{index-e-RKmhti.js → index-_3qag0I4.js} +1 -1
- langflow/frontend/assets/{index-X67tRPXo.js → index-dfaj9-hY.js} +1 -1
- langflow/frontend/assets/{index-CHexGuNQ.js → index-eJwu5YEi.js} +1 -1
- langflow/frontend/assets/{index-Dz5YIK1W.js → index-in188l0A.js} +1 -1
- langflow/frontend/assets/{index-CTwkLLMr.js → index-pkOi9P45.js} +1 -1
- langflow/frontend/assets/{index-D6BaTmee.js → index-qXcoVIRo.js} +1 -1
- langflow/frontend/assets/{index-euS8RcNY.js → index-xVx59Op-.js} +1 -1
- langflow/frontend/assets/{index-C4WueQ4k.js → index-yIh6-LZT.js} +1 -1
- langflow/frontend/assets/lazyIconImports-kvf_Kak2.js +2 -0
- langflow/frontend/assets/{use-post-add-user-CA-_peAV.js → use-post-add-user-Bt6vZvvT.js} +1 -1
- langflow/frontend/index.html +2 -2
- langflow/initial_setup/starter_projects/Knowledge Ingestion.json +1052 -0
- langflow/initial_setup/starter_projects/Knowledge Retrieval.json +707 -0
- langflow/services/settings/base.py +3 -0
- {langflow_base_nightly-0.5.0.dev30.dist-info → langflow_base_nightly-0.5.0.dev31.dist-info}/METADATA +2 -1
- {langflow_base_nightly-0.5.0.dev30.dist-info → langflow_base_nightly-0.5.0.dev31.dist-info}/RECORD +140 -134
- langflow/frontend/assets/index-DIcdzk44.css +0 -1
- langflow/frontend/assets/lazyIconImports-lnczjBhY.js +0 -2
- {langflow_base_nightly-0.5.0.dev30.dist-info → langflow_base_nightly-0.5.0.dev31.dist-info}/WHEEL +0 -0
- {langflow_base_nightly-0.5.0.dev30.dist-info → langflow_base_nightly-0.5.0.dev31.dist-info}/entry_points.txt +0 -0
langflow/api/router.py
CHANGED
```diff
@@ -8,6 +8,7 @@ from langflow.api.v1 import (
     files_router,
     flows_router,
     folders_router,
+    knowledge_bases_router,
     login_router,
     mcp_projects_router,
     mcp_router,
@@ -45,6 +46,7 @@ router_v1.include_router(monitor_router)
 router_v1.include_router(folders_router)
 router_v1.include_router(projects_router)
 router_v1.include_router(starter_projects_router)
+router_v1.include_router(knowledge_bases_router)
 router_v1.include_router(mcp_router)
 router_v1.include_router(voice_mode_router)
 router_v1.include_router(mcp_projects_router)
```
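For orientation: the new `knowledge_bases_router` is declared with `prefix="/knowledge_bases"` (see the new module later in this diff), so including it on `router_v1` places its endpoints under the v1 API path. A minimal, illustrative FastAPI sketch of how the prefixes compose, assuming the usual `/api/v1` mount (the mount itself is not part of this diff):

```python
# Illustrative sketch only; assumes router_v1 is ultimately served under /api/v1.
from fastapi import APIRouter, FastAPI

knowledge_bases_router = APIRouter(tags=["Knowledge Bases"], prefix="/knowledge_bases")


@knowledge_bases_router.get("/")
async def list_knowledge_bases() -> list[dict]:
    return []


router_v1 = APIRouter(prefix="/v1")
router_v1.include_router(knowledge_bases_router)

app = FastAPI()
app.include_router(router_v1, prefix="/api")
# Composed route: GET /api/v1/knowledge_bases/
```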
langflow/api/v1/__init__.py
CHANGED
```diff
@@ -4,6 +4,7 @@ from langflow.api.v1.endpoints import router as endpoints_router
 from langflow.api.v1.files import router as files_router
 from langflow.api.v1.flows import router as flows_router
 from langflow.api.v1.folders import router as folders_router
+from langflow.api.v1.knowledge_bases import router as knowledge_bases_router
 from langflow.api.v1.login import router as login_router
 from langflow.api.v1.mcp import router as mcp_router
 from langflow.api.v1.mcp_projects import router as mcp_projects_router
@@ -23,6 +24,7 @@ __all__ = [
     "files_router",
     "flows_router",
     "folders_router",
+    "knowledge_bases_router",
     "login_router",
     "mcp_projects_router",
     "mcp_router",
```
langflow/api/v1/knowledge_bases.py
ADDED
```python
import json
import shutil
from http import HTTPStatus
from pathlib import Path

import pandas as pd
from fastapi import APIRouter, HTTPException
from langchain_chroma import Chroma
from loguru import logger
from pydantic import BaseModel

from langflow.services.deps import get_settings_service

router = APIRouter(tags=["Knowledge Bases"], prefix="/knowledge_bases")


settings = get_settings_service().settings
knowledge_directory = settings.knowledge_bases_dir
if not knowledge_directory:
    msg = "Knowledge bases directory is not set in the settings."
    raise ValueError(msg)
KNOWLEDGE_BASES_DIR = Path(knowledge_directory).expanduser()


class KnowledgeBaseInfo(BaseModel):
    id: str
    name: str
    embedding_provider: str | None = "Unknown"
    embedding_model: str | None = "Unknown"
    size: int = 0
    words: int = 0
    characters: int = 0
    chunks: int = 0
    avg_chunk_size: float = 0.0


class BulkDeleteRequest(BaseModel):
    kb_names: list[str]


def get_kb_root_path() -> Path:
    """Get the knowledge bases root path."""
    return KNOWLEDGE_BASES_DIR


def get_directory_size(path: Path) -> int:
    """Calculate the total size of all files in a directory."""
    total_size = 0
    try:
        for file_path in path.rglob("*"):
            if file_path.is_file():
                total_size += file_path.stat().st_size
    except (OSError, PermissionError):
        pass
    return total_size


def detect_embedding_provider(kb_path: Path) -> str:
    """Detect the embedding provider from config files and directory structure."""
    # Provider patterns to check for
    provider_patterns = {
        "OpenAI": ["openai", "text-embedding-ada", "text-embedding-3"],
        "HuggingFace": ["sentence-transformers", "huggingface", "bert-"],
        "Cohere": ["cohere", "embed-english", "embed-multilingual"],
        "Google": ["palm", "gecko", "google"],
        "Chroma": ["chroma"],
    }

    # Check JSON config files for provider information
    for config_file in kb_path.glob("*.json"):
        try:
            with config_file.open("r", encoding="utf-8") as f:
                config_data = json.load(f)
            if not isinstance(config_data, dict):
                continue

            config_str = json.dumps(config_data).lower()

            # Check for explicit provider fields first
            provider_fields = ["embedding_provider", "provider", "embedding_model_provider"]
            for field in provider_fields:
                if field in config_data:
                    provider_value = str(config_data[field]).lower()
                    for provider, patterns in provider_patterns.items():
                        if any(pattern in provider_value for pattern in patterns):
                            return provider

            # Check for model name patterns
            for provider, patterns in provider_patterns.items():
                if any(pattern in config_str for pattern in patterns):
                    return provider

        except (OSError, json.JSONDecodeError) as _:
            logger.exception("Error reading config file '%s'", config_file)
            continue

    # Fallback to directory structure
    if (kb_path / "chroma").exists():
        return "Chroma"
    if (kb_path / "vectors.npy").exists():
        return "Local"

    return "Unknown"


def detect_embedding_model(kb_path: Path) -> str:
    """Detect the embedding model from config files."""
    # First check the embedding metadata file (most accurate)
    metadata_file = kb_path / "embedding_metadata.json"
    if metadata_file.exists():
        try:
            with metadata_file.open("r", encoding="utf-8") as f:
                metadata = json.load(f)
            if isinstance(metadata, dict) and "embedding_model" in metadata:
                # Check for embedding model field
                model_value = str(metadata.get("embedding_model", "unknown"))
                if model_value and model_value.lower() != "unknown":
                    return model_value
        except (OSError, json.JSONDecodeError) as _:
            logger.exception("Error reading embedding metadata file '%s'", metadata_file)

    # Check other JSON config files for model information
    for config_file in kb_path.glob("*.json"):
        # Skip the embedding metadata file since we already checked it
        if config_file.name == "embedding_metadata.json":
            continue

        try:
            with config_file.open("r", encoding="utf-8") as f:
                config_data = json.load(f)
            if not isinstance(config_data, dict):
                continue

            # Check for explicit model fields first and return the actual model name
            model_fields = ["embedding_model", "model", "embedding_model_name", "model_name"]
            for field in model_fields:
                if field in config_data:
                    model_value = str(config_data[field])
                    if model_value and model_value.lower() != "unknown":
                        return model_value

            # Check for OpenAI specific model names
            if "openai" in json.dumps(config_data).lower():
                openai_models = ["text-embedding-ada-002", "text-embedding-3-small", "text-embedding-3-large"]
                config_str = json.dumps(config_data).lower()
                for model in openai_models:
                    if model in config_str:
                        return model

            # Check for HuggingFace model names (usually in model field)
            if "model" in config_data:
                model_name = str(config_data["model"])
                # Common HuggingFace embedding models
                hf_patterns = ["sentence-transformers", "all-MiniLM", "all-mpnet", "multi-qa"]
                if any(pattern in model_name for pattern in hf_patterns):
                    return model_name

        except (OSError, json.JSONDecodeError) as _:
            logger.exception("Error reading config file '%s'", config_file)
            continue

    return "Unknown"


def get_text_columns(df: pd.DataFrame, schema_data: list | None = None) -> list[str]:
    """Get the text columns to analyze for word/character counts."""
    # First try schema-defined text columns
    if schema_data:
        text_columns = [
            col["column_name"]
            for col in schema_data
            if col.get("vectorize", False) and col.get("data_type") == "string"
        ]
        if text_columns:
            return [col for col in text_columns if col in df.columns]

    # Fallback to common text column names
    common_names = ["text", "content", "document", "chunk"]
    text_columns = [col for col in df.columns if col.lower() in common_names]
    if text_columns:
        return text_columns

    # Last resort: all string columns
    return [col for col in df.columns if df[col].dtype == "object"]


def calculate_text_metrics(df: pd.DataFrame, text_columns: list[str]) -> tuple[int, int]:
    """Calculate total words and characters from text columns."""
    total_words = 0
    total_characters = 0

    for col in text_columns:
        if col not in df.columns:
            continue

        text_series = df[col].astype(str).fillna("")
        total_characters += text_series.str.len().sum()
        total_words += text_series.str.split().str.len().sum()

    return int(total_words), int(total_characters)


def get_kb_metadata(kb_path: Path) -> dict:
    """Extract metadata from a knowledge base directory."""
    metadata: dict[str, float | int | str] = {
        "chunks": 0,
        "words": 0,
        "characters": 0,
        "avg_chunk_size": 0.0,
        "embedding_provider": "Unknown",
        "embedding_model": "Unknown",
    }

    try:
        # First check embedding metadata file for accurate provider and model info
        metadata_file = kb_path / "embedding_metadata.json"
        if metadata_file.exists():
            try:
                with metadata_file.open("r", encoding="utf-8") as f:
                    embedding_metadata = json.load(f)
                if isinstance(embedding_metadata, dict):
                    if "embedding_provider" in embedding_metadata:
                        metadata["embedding_provider"] = embedding_metadata["embedding_provider"]
                    if "embedding_model" in embedding_metadata:
                        metadata["embedding_model"] = embedding_metadata["embedding_model"]
            except (OSError, json.JSONDecodeError) as _:
                logger.exception("Error reading embedding metadata file '%s'", metadata_file)

        # Fallback to detection if not found in metadata file
        if metadata["embedding_provider"] == "Unknown":
            metadata["embedding_provider"] = detect_embedding_provider(kb_path)
        if metadata["embedding_model"] == "Unknown":
            metadata["embedding_model"] = detect_embedding_model(kb_path)

        # Read schema for text column information
        schema_data = None
        schema_file = kb_path / "schema.json"
        if schema_file.exists():
            try:
                with schema_file.open("r", encoding="utf-8") as f:
                    schema_data = json.load(f)
                if not isinstance(schema_data, list):
                    schema_data = None
            except (ValueError, TypeError, OSError) as _:
                logger.exception("Error reading schema file '%s'", schema_file)

        # Create vector store
        chroma = Chroma(
            persist_directory=str(kb_path),
            collection_name=kb_path.name,
        )

        # Access the raw collection
        collection = chroma._collection

        # Fetch all documents and metadata
        results = collection.get(include=["documents", "metadatas"])

        # Convert to pandas DataFrame
        source_chunks = pd.DataFrame(
            {
                "document": results["documents"],
                "metadata": results["metadatas"],
            }
        )

        # Process the source data for metadata
        try:
            metadata["chunks"] = len(source_chunks)

            # Get text columns and calculate metrics
            text_columns = get_text_columns(source_chunks, schema_data)
            if text_columns:
                words, characters = calculate_text_metrics(source_chunks, text_columns)
                metadata["words"] = words
                metadata["characters"] = characters

                # Calculate average chunk size
                if int(metadata["chunks"]) > 0:
                    metadata["avg_chunk_size"] = round(int(characters) / int(metadata["chunks"]), 1)

        except (OSError, ValueError, TypeError) as _:
            logger.exception("Error processing Chroma DB '%s'", kb_path.name)

    except (OSError, ValueError, TypeError) as _:
        logger.exception("Error processing knowledge base directory '%s'", kb_path)

    return metadata


@router.get("", status_code=HTTPStatus.OK)
@router.get("/", status_code=HTTPStatus.OK)
async def list_knowledge_bases() -> list[KnowledgeBaseInfo]:
    """List all available knowledge bases."""
    try:
        kb_root_path = get_kb_root_path()

        if not kb_root_path.exists():
            return []

        knowledge_bases = []

        for kb_dir in kb_root_path.iterdir():
            if not kb_dir.is_dir() or kb_dir.name.startswith("."):
                continue

            try:
                # Get size of the directory
                size = get_directory_size(kb_dir)

                # Get metadata from KB files
                metadata = get_kb_metadata(kb_dir)

                kb_info = KnowledgeBaseInfo(
                    id=kb_dir.name,
                    name=kb_dir.name.replace("_", " ").replace("-", " ").title(),
                    embedding_provider=metadata["embedding_provider"],
                    embedding_model=metadata["embedding_model"],
                    size=size,
                    words=metadata["words"],
                    characters=metadata["characters"],
                    chunks=metadata["chunks"],
                    avg_chunk_size=metadata["avg_chunk_size"],
                )

                knowledge_bases.append(kb_info)

            except OSError as _:
                # Log the exception and skip directories that can't be read
                logger.exception("Error reading knowledge base directory '%s'", kb_dir)
                continue

        # Sort by name alphabetically
        knowledge_bases.sort(key=lambda x: x.name)

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error listing knowledge bases: {e!s}") from e
    else:
        return knowledge_bases


@router.get("/{kb_name}", status_code=HTTPStatus.OK)
async def get_knowledge_base(kb_name: str) -> KnowledgeBaseInfo:
    """Get detailed information about a specific knowledge base."""
    try:
        kb_root_path = get_kb_root_path()
        kb_path = kb_root_path / kb_name

        if not kb_path.exists() or not kb_path.is_dir():
            raise HTTPException(status_code=404, detail=f"Knowledge base '{kb_name}' not found")

        # Get size of the directory
        size = get_directory_size(kb_path)

        # Get metadata from KB files
        metadata = get_kb_metadata(kb_path)

        return KnowledgeBaseInfo(
            id=kb_name,
            name=kb_name.replace("_", " ").replace("-", " ").title(),
            embedding_provider=metadata["embedding_provider"],
            embedding_model=metadata["embedding_model"],
            size=size,
            words=metadata["words"],
            characters=metadata["characters"],
            chunks=metadata["chunks"],
            avg_chunk_size=metadata["avg_chunk_size"],
        )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error getting knowledge base '{kb_name}': {e!s}") from e


@router.delete("/{kb_name}", status_code=HTTPStatus.OK)
async def delete_knowledge_base(kb_name: str) -> dict[str, str]:
    """Delete a specific knowledge base."""
    try:
        kb_root_path = get_kb_root_path()
        kb_path = kb_root_path / kb_name

        if not kb_path.exists() or not kb_path.is_dir():
            raise HTTPException(status_code=404, detail=f"Knowledge base '{kb_name}' not found")

        # Delete the entire knowledge base directory
        shutil.rmtree(kb_path)

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error deleting knowledge base '{kb_name}': {e!s}") from e
    else:
        return {"message": f"Knowledge base '{kb_name}' deleted successfully"}


@router.delete("", status_code=HTTPStatus.OK)
@router.delete("/", status_code=HTTPStatus.OK)
async def delete_knowledge_bases_bulk(request: BulkDeleteRequest) -> dict[str, object]:
    """Delete multiple knowledge bases."""
    try:
        kb_root_path = get_kb_root_path()
        deleted_count = 0
        not_found_kbs = []

        for kb_name in request.kb_names:
            kb_path = kb_root_path / kb_name

            if not kb_path.exists() or not kb_path.is_dir():
                not_found_kbs.append(kb_name)
                continue

            try:
                # Delete the entire knowledge base directory
                shutil.rmtree(kb_path)
                deleted_count += 1
            except (OSError, PermissionError) as e:
                logger.exception("Error deleting knowledge base '%s': %s", kb_name, e)
                # Continue with other deletions even if one fails

        if not_found_kbs and deleted_count == 0:
            raise HTTPException(status_code=404, detail=f"Knowledge bases not found: {', '.join(not_found_kbs)}")

        result = {
            "message": f"Successfully deleted {deleted_count} knowledge base(s)",
            "deleted_count": deleted_count,
        }

        if not_found_kbs:
            result["not_found"] = ", ".join(not_found_kbs)

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error deleting knowledge bases: {e!s}") from e
    else:
        return result
```
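The module above is a filesystem-backed API: each knowledge base is a directory under `knowledge_bases_dir`, and the endpoints list, inspect, and delete those directories. A minimal sketch of calling the new endpoints with `httpx`, assuming a local Langflow instance on the default port with the v1 API served at `/api/v1` and authentication disabled (none of which is specified by this diff):

```python
import httpx

# Assumed base URL; adjust host, port, and auth headers for your deployment.
BASE = "http://localhost:7860/api/v1/knowledge_bases"

with httpx.Client() as client:
    # GET /knowledge_bases -> list of KnowledgeBaseInfo objects
    kbs = client.get(BASE).json()
    print([kb["id"] for kb in kbs])

    if kbs:
        # GET /knowledge_bases/{kb_name} -> KnowledgeBaseInfo
        detail = client.get(f"{BASE}/{kbs[0]['id']}").json()
        print(detail["chunks"], detail["embedding_model"], detail["avg_chunk_size"])

    # Bulk delete is a DELETE on the collection root with a JSON body
    # (httpx needs .request() to send a body with DELETE):
    # client.request("DELETE", BASE, json={"kb_names": ["my_kb"]})
```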
langflow/base/data/kb_utils.py
ADDED
```python
import math
from collections import Counter


def compute_tfidf(documents: list[str], query_terms: list[str]) -> list[float]:
    """Compute TF-IDF scores for query terms across a collection of documents.

    Args:
        documents: List of document strings
        query_terms: List of query terms to score

    Returns:
        List of TF-IDF scores for each document
    """
    # Tokenize documents (simple whitespace splitting)
    tokenized_docs = [doc.lower().split() for doc in documents]
    n_docs = len(documents)

    # Calculate document frequency for each term
    document_frequencies = {}
    for term in query_terms:
        document_frequencies[term] = sum(1 for doc in tokenized_docs if term.lower() in doc)

    scores = []

    for doc_tokens in tokenized_docs:
        doc_score = 0.0
        doc_length = len(doc_tokens)
        term_counts = Counter(doc_tokens)

        for term in query_terms:
            term_lower = term.lower()

            # Term frequency (TF)
            tf = term_counts[term_lower] / doc_length if doc_length > 0 else 0

            # Inverse document frequency (IDF)
            idf = math.log(n_docs / document_frequencies[term]) if document_frequencies[term] > 0 else 0

            # TF-IDF score
            doc_score += tf * idf

        scores.append(doc_score)

    return scores


def compute_bm25(documents: list[str], query_terms: list[str], k1: float = 1.2, b: float = 0.75) -> list[float]:
    """Compute BM25 scores for query terms across a collection of documents.

    Args:
        documents: List of document strings
        query_terms: List of query terms to score
        k1: Controls term frequency scaling (default: 1.2)
        b: Controls document length normalization (default: 0.75)

    Returns:
        List of BM25 scores for each document
    """
    # Tokenize documents
    tokenized_docs = [doc.lower().split() for doc in documents]
    n_docs = len(documents)

    # Calculate average document length
    avg_doc_length = sum(len(doc) for doc in tokenized_docs) / n_docs if n_docs > 0 else 0

    # Handle edge case where all documents are empty
    if avg_doc_length == 0:
        return [0.0] * n_docs

    # Calculate document frequency for each term
    document_frequencies = {}
    for term in query_terms:
        document_frequencies[term] = sum(1 for doc in tokenized_docs if term.lower() in doc)

    scores = []

    for doc_tokens in tokenized_docs:
        doc_score = 0.0
        doc_length = len(doc_tokens)
        term_counts = Counter(doc_tokens)

        for term in query_terms:
            term_lower = term.lower()

            # Term frequency in document
            tf = term_counts[term_lower]

            # Inverse document frequency (IDF)
            # Use standard BM25 IDF formula that ensures non-negative values
            idf = math.log(n_docs / document_frequencies[term]) if document_frequencies[term] > 0 else 0

            # BM25 score calculation
            numerator = tf * (k1 + 1)
            denominator = tf + k1 * (1 - b + b * (doc_length / avg_doc_length))

            # Handle division by zero when tf=0 and k1=0
            term_score = 0 if denominator == 0 else idf * (numerator / denominator)

            doc_score += term_score

        scores.append(doc_score)

    return scores
```
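`kb_utils` has no Langflow dependencies, so the scoring helpers can be exercised directly. A quick usage sketch with a made-up corpus (the sample documents and query terms are illustrative only):

```python
from langflow.base.data.kb_utils import compute_bm25, compute_tfidf

docs = [
    "langflow builds visual ai flows",
    "knowledge bases store document chunks",
    "bm25 ranks documents against query terms",
]
query_terms = ["document", "chunks"]

# One score per document, in the same order as `docs`.
print(compute_tfidf(docs, query_terms))
print(compute_bm25(docs, query_terms, k1=1.2, b=0.75))
```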
langflow/components/data/__init__.py
CHANGED
```diff
@@ -3,6 +3,8 @@ from .csv_to_data import CSVToDataComponent
 from .directory import DirectoryComponent
 from .file import FileComponent
 from .json_to_data import JSONToDataComponent
+from .kb_ingest import KBIngestionComponent
+from .kb_retrieval import KBRetrievalComponent
 from .news_search import NewsSearchComponent
 from .rss import RSSReaderComponent
 from .sql_executor import SQLComponent
@@ -16,6 +18,8 @@ __all__ = [
     "DirectoryComponent",
     "FileComponent",
     "JSONToDataComponent",
+    "KBIngestionComponent",
+    "KBRetrievalComponent",
     "NewsSearchComponent",
     "RSSReaderComponent",
     "SQLComponent",
```
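With the `__init__.py` change above, the two new components are exported alongside the existing data components. A minimal import check (their inputs and outputs are defined in `kb_ingest.py` and `kb_retrieval.py`, which are new in this release but not reproduced in this section):

```python
from langflow.components.data import KBIngestionComponent, KBRetrievalComponent

print(KBIngestionComponent.__name__, KBRetrievalComponent.__name__)
```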