langflow-base-nightly 0.5.0.dev30__py3-none-any.whl → 0.5.0.dev31__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to their respective public registries. It is provided for informational purposes only.
Files changed (142)
  1. langflow/api/router.py +2 -0
  2. langflow/api/v1/__init__.py +2 -0
  3. langflow/api/v1/knowledge_bases.py +437 -0
  4. langflow/base/data/kb_utils.py +104 -0
  5. langflow/components/data/__init__.py +4 -0
  6. langflow/components/data/kb_ingest.py +585 -0
  7. langflow/components/data/kb_retrieval.py +254 -0
  8. langflow/frontend/assets/{SlackIcon-D2PxMQjX.js → SlackIcon-Bikuxo8x.js} +1 -1
  9. langflow/frontend/assets/{Wikipedia-BNM0lBPs.js → Wikipedia-B6aCFf5-.js} +1 -1
  10. langflow/frontend/assets/{Wolfram-COQyGyeC.js → Wolfram-CekL_M-a.js} +1 -1
  11. langflow/frontend/assets/{index-CTpfN0Cy.js → index-09CVJwsY.js} +1 -1
  12. langflow/frontend/assets/{index-DWUG3nTC.js → index-1MEYR1La.js} +1 -1
  13. langflow/frontend/assets/{index-Ds9y6kEK.js → index-2vQdFIK_.js} +1 -1
  14. langflow/frontend/assets/{index-DRdKSzTn.js → index-4Tl3Nxdo.js} +1 -1
  15. langflow/frontend/assets/{index-O_vPh7iD.js → index-5G402gB8.js} +1 -1
  16. langflow/frontend/assets/{index-D15h4ir2.js → index-5hW8VleF.js} +1 -1
  17. langflow/frontend/assets/{index-BydnMWnM.js → index-6GWpsedd.js} +1 -1
  18. langflow/frontend/assets/{index-4vIU43o6.js → index-7x3wNZ-4.js} +1 -1
  19. langflow/frontend/assets/{index-DrFpyu9Z.js → index-9gkURvG2.js} +1 -1
  20. langflow/frontend/assets/{index-DRe5h2N_.js → index-AOX7bbjJ.js} +1 -1
  21. langflow/frontend/assets/{index-fJyq3ZWN.js → index-B20KmxhS.js} +1 -1
  22. langflow/frontend/assets/{index-D_sHnnuS.js → index-B2EmwqKj.js} +1 -1
  23. langflow/frontend/assets/{index-DEc_2ba8.js → index-B4AtFbkN.js} +1 -1
  24. langflow/frontend/assets/{index-D_zQiboE.js → index-B4xLpgbM.js} +1 -1
  25. langflow/frontend/assets/{index-Db8Xgs-K.js → index-B9KRIJFi.js} +1 -1
  26. langflow/frontend/assets/{index-BzCZNz2f.js → index-B9uOBe6Y.js} +1 -1
  27. langflow/frontend/assets/{index-pFTvwRsJ.js → index-BDmbsLY2.js} +1 -1
  28. langflow/frontend/assets/{index-CGef2axA.js → index-BIKbxmIh.js} +1 -1
  29. langflow/frontend/assets/{index-BTl_mLju.js → index-BIjUtp6d.js} +1 -1
  30. langflow/frontend/assets/{index-Jze67eTW.js → index-BJIsQS8D.js} +1 -1
  31. langflow/frontend/assets/{index-DV-gdr7l.js → index-BO4fl1uU.js} +1 -1
  32. langflow/frontend/assets/{index-BUVmswbg.js → index-BRE8A4Q_.js} +1 -1
  33. langflow/frontend/assets/{index-CTzWsu8S.js → index-BRNhftot.js} +1 -1
  34. langflow/frontend/assets/{index-DFYBo38q.js → index-BRizlHaN.js} +1 -1
  35. langflow/frontend/assets/{index-DbPP5vss.js → index-BRwkzs92.js} +1 -1
  36. langflow/frontend/assets/{index-BzE7oL1n.js → index-BZCt_UnJ.js} +1 -1
  37. langflow/frontend/assets/{index-BhRSkpxu.js → index-B_ytx_iA.js} +1 -1
  38. langflow/frontend/assets/{index-ByCunkn4.js → index-BcqeL_f4.js} +1 -1
  39. langflow/frontend/assets/{index-CAAZbdRp.js → index-Bgd7yLoW.js} +1 -1
  40. langflow/frontend/assets/{index-DpDbxNdQ.js → index-BlRTHXW5.js} +1 -1
  41. langflow/frontend/assets/{index-jXSPQ_JS.js → index-BllNr21U.js} +1 -1
  42. langflow/frontend/assets/{index-fpMcQS2L.js → index-Bm7a2vMS.js} +1 -1
  43. langflow/frontend/assets/{index-BFQzmLDT.js → index-Bn4HAVDG.js} +1 -1
  44. langflow/frontend/assets/{index-D8EpAMC3.js → index-BwlYjc56.js} +1 -1
  45. langflow/frontend/assets/{index-BcCN9mpu.js → index-BzCjyHto.js} +1 -1
  46. langflow/frontend/assets/{index-D6-jZ4sc.js → index-C3RZz8WE.js} +1 -1
  47. langflow/frontend/assets/{index-D66JmFlL.js → index-C69gdJqw.js} +1 -1
  48. langflow/frontend/assets/{index-pYD0BTGu.js → index-C6P0vvSP.js} +1 -1
  49. langflow/frontend/assets/{index-CIjw_ZkP.js → index-C7wDSVVH.js} +1 -1
  50. langflow/frontend/assets/{index-BCTEK38J.js → index-CAzSTGAM.js} +1 -1
  51. langflow/frontend/assets/{index-8FjgS_Vj.js → index-CEn_71Wk.js} +1 -1
  52. langflow/frontend/assets/{index-BFiCUM5l.js → index-CGVDXKtN.js} +1 -1
  53. langflow/frontend/assets/{index-BIH2K0v8.js → index-CIYzjH2y.js} +1 -1
  54. langflow/frontend/assets/{index-gM8j2Wvk.js → index-COqjpsdy.js} +1 -1
  55. langflow/frontend/assets/{index-2q8IFBNP.js → index-CP0tFKwN.js} +1 -1
  56. langflow/frontend/assets/{index-CXpZa4H9.js → index-CPIdMJkX.js} +1 -1
  57. langflow/frontend/assets/{index-B-YjnRWx.js → index-CSRizl2S.js} +1 -1
  58. langflow/frontend/assets/{index-DFo0yfS5.js → index-CUe1ivTn.js} +1 -1
  59. langflow/frontend/assets/{index-C2x5hzgY.js → index-CVphnxXi.js} +1 -1
  60. langflow/frontend/assets/{index-Bz3QnhLZ.js → index-CY6LUi4V.js} +1 -1
  61. langflow/frontend/assets/{index-Cq6gk34q.js → index-C_2G2ZqJ.js} +1 -1
  62. langflow/frontend/assets/{index-CSXUVElo.js → index-C_K6Tof7.js} +1 -1
  63. langflow/frontend/assets/{index-1D7jZ8vz.js → index-C_UkF-RJ.js} +1 -1
  64. langflow/frontend/assets/{index-BVGZcHHC.js → index-Cbwk3f-p.js} +1 -1
  65. langflow/frontend/assets/{index-kiqvo0Zi.js → index-CdwjD4IX.js} +1 -1
  66. langflow/frontend/assets/{index-BNy3Al2s.js → index-CgbINWS8.js} +1 -1
  67. langflow/frontend/assets/{index-BXJpd9hg.js → index-CglSqvB5.js} +1 -1
  68. langflow/frontend/assets/{index-D9CF_54p.js → index-CmiRgF_-.js} +1 -1
  69. langflow/frontend/assets/{index-ez1EW657.js → index-Cp7Pmn03.js} +1 -1
  70. langflow/frontend/assets/{index-aypzjPzG.js → index-Cq30cQcP.js} +1 -1
  71. langflow/frontend/assets/index-CqS7zir1.css +1 -0
  72. langflow/frontend/assets/{index-DKv0y9Dp.js → index-Cr2oy5K2.js} +1 -1
  73. langflow/frontend/assets/{index-DrfwVxtD.js → index-Crq_yhkG.js} +1 -1
  74. langflow/frontend/assets/{index-CzJzRS6i.js → index-Cs_jt3dj.js} +1 -1
  75. langflow/frontend/assets/{index-DO0mS8FQ.js → index-Cy-ZEfWh.js} +1 -1
  76. langflow/frontend/assets/{index-Q0bwuTZY.js → index-Cyk3aCmP.js} +1 -1
  77. langflow/frontend/assets/{index-DToZROdu.js → index-D-HTZ68O.js} +1 -1
  78. langflow/frontend/assets/{index-C0AEZF1v.js → index-D1RgjMON.js} +1 -1
  79. langflow/frontend/assets/{index-DilRRF2S.js → index-D29n5mus.js} +1 -1
  80. langflow/frontend/assets/{index-CKLOrtrx.js → index-D2nHdRne.js} +1 -1
  81. langflow/frontend/assets/{index-sfFDGjjd.js → index-D7Vx6mgS.js} +1 -1
  82. langflow/frontend/assets/{index-BAHhLqW9.js → index-D7nFs6oq.js} +1 -1
  83. langflow/frontend/assets/{index-C7jY4x98.js → index-DAJafn16.js} +1 -1
  84. langflow/frontend/assets/{index-BefwTGbP.js → index-DDcpxWU4.js} +1 -1
  85. langflow/frontend/assets/{index-CTZ9iXFr.js → index-DEuXrfXH.js} +1 -1
  86. langflow/frontend/assets/{index-DFfr0xSt.js → index-DF0oWRdd.js} +1 -1
  87. langflow/frontend/assets/{index-Bh5pQAZC.js → index-DI0zAExi.js} +1 -1
  88. langflow/frontend/assets/{index-CG-Suo0F.js → index-DJs6FoYC.js} +1 -1
  89. langflow/frontend/assets/{index-dvTTQhKz.js → index-DNS4La1f.js} +1 -1
  90. langflow/frontend/assets/{index-nLDaeeZg.js → index-DOI0ceS-.js} +1 -1
  91. langflow/frontend/assets/{index-DakdEtbq.js → index-DOb9c2bf.js} +1 -1
  92. langflow/frontend/assets/{index-CEVnRp4_.js → index-DS4F_Phe.js} +1 -1
  93. langflow/frontend/assets/{index-DGRg2M1l.js → index-DTJX3yQa.js} +1 -1
  94. langflow/frontend/assets/{index-BjAsd-Vo.js → index-DVV_etfW.js} +1 -1
  95. langflow/frontend/assets/{index-BrIuZD2A.js → index-DX_InNVT.js} +1 -1
  96. langflow/frontend/assets/{index-jG-zLXRN.js → index-DbmqjLy6.js} +1 -1
  97. langflow/frontend/assets/{index-DSvOFGJR.js → index-Dc0p1Oxl.js} +1 -1
  98. langflow/frontend/assets/{index-87GFtXu5.js → index-DkJCCraf.js} +1 -1
  99. langflow/frontend/assets/{index-BXidWkLM.js → index-DlMAYATX.js} +1 -1
  100. langflow/frontend/assets/{index-sbTxhltT.js → index-DmaQAn3K.js} +1 -1
  101. langflow/frontend/assets/{index-DkC5vMvx.js → index-DmvjdU1N.js} +1 -1
  102. langflow/frontend/assets/{index-CSUglByd.js → index-DnusMCK1.js} +1 -1
  103. langflow/frontend/assets/{index-DZOTHXs0.js → index-DoFlaGDx.js} +1 -1
  104. langflow/frontend/assets/{index-CZkMjaa8.js → index-DqDQk0Cu.js} +1 -1
  105. langflow/frontend/assets/{index-lc10GnwG.js → index-DrvRK4_i.js} +1 -1
  106. langflow/frontend/assets/{index-BNm-yAYc.js → index-DtCsjX48.js} +1 -1
  107. langflow/frontend/assets/{index-BeLnhfG-.js → index-Dy7ehgeV.js} +1 -1
  108. langflow/frontend/assets/{index-RGG9hk9J.js → index-Dz0r9Idb.js} +1 -1
  109. langflow/frontend/assets/{index-Bcq2yA-p.js → index-DzDNhMMW.js} +1 -1
  110. langflow/frontend/assets/{index-P3f-GeAm.js → index-FYcoJPMP.js} +1 -1
  111. langflow/frontend/assets/{index-DQwvl_Rp.js → index-Iamzh9ZT.js} +1 -1
  112. langflow/frontend/assets/{index-Cy6n8tA9.js → index-J0pvFqLk.js} +1 -1
  113. langflow/frontend/assets/{index-D1XTMye3.js → index-J98sU-1p.js} +1 -1
  114. langflow/frontend/assets/{index-BZ0rL0tK.js → index-JHCxbvlW.js} +1 -1
  115. langflow/frontend/assets/{index-DmSH63k1.js → index-KnS52ylc.js} +1 -1
  116. langflow/frontend/assets/{index-WGZ88ShH.js → index-L7FKc9QN.js} +1 -1
  117. langflow/frontend/assets/{index-BIoFnUtx.js → index-RveG4dl9.js} +1 -1
  118. langflow/frontend/assets/{index-BDdkPrzu.js → index-T2jJOG85.js} +1 -1
  119. langflow/frontend/assets/{index-2839k6WO.js → index-TRyDa01A.js} +1 -1
  120. langflow/frontend/assets/{index-DvOdMz35.js → index-U7J1YiWE.js} +1 -1
  121. langflow/frontend/assets/{index-DzUx1-Bl.js → index-UI2ws3qp.js} +1984 -1984
  122. langflow/frontend/assets/{index-8Fx5I2fx.js → index-VO-pk-Hg.js} +1 -1
  123. langflow/frontend/assets/{index-e-RKmhti.js → index-_3qag0I4.js} +1 -1
  124. langflow/frontend/assets/{index-X67tRPXo.js → index-dfaj9-hY.js} +1 -1
  125. langflow/frontend/assets/{index-CHexGuNQ.js → index-eJwu5YEi.js} +1 -1
  126. langflow/frontend/assets/{index-Dz5YIK1W.js → index-in188l0A.js} +1 -1
  127. langflow/frontend/assets/{index-CTwkLLMr.js → index-pkOi9P45.js} +1 -1
  128. langflow/frontend/assets/{index-D6BaTmee.js → index-qXcoVIRo.js} +1 -1
  129. langflow/frontend/assets/{index-euS8RcNY.js → index-xVx59Op-.js} +1 -1
  130. langflow/frontend/assets/{index-C4WueQ4k.js → index-yIh6-LZT.js} +1 -1
  131. langflow/frontend/assets/lazyIconImports-kvf_Kak2.js +2 -0
  132. langflow/frontend/assets/{use-post-add-user-CA-_peAV.js → use-post-add-user-Bt6vZvvT.js} +1 -1
  133. langflow/frontend/index.html +2 -2
  134. langflow/initial_setup/starter_projects/Knowledge Ingestion.json +1052 -0
  135. langflow/initial_setup/starter_projects/Knowledge Retrieval.json +707 -0
  136. langflow/services/settings/base.py +3 -0
  137. {langflow_base_nightly-0.5.0.dev30.dist-info → langflow_base_nightly-0.5.0.dev31.dist-info}/METADATA +2 -1
  138. {langflow_base_nightly-0.5.0.dev30.dist-info → langflow_base_nightly-0.5.0.dev31.dist-info}/RECORD +140 -134
  139. langflow/frontend/assets/index-DIcdzk44.css +0 -1
  140. langflow/frontend/assets/lazyIconImports-lnczjBhY.js +0 -2
  141. {langflow_base_nightly-0.5.0.dev30.dist-info → langflow_base_nightly-0.5.0.dev31.dist-info}/WHEEL +0 -0
  142. {langflow_base_nightly-0.5.0.dev30.dist-info → langflow_base_nightly-0.5.0.dev31.dist-info}/entry_points.txt +0 -0
langflow/api/router.py CHANGED
@@ -8,6 +8,7 @@ from langflow.api.v1 import (
     files_router,
     flows_router,
     folders_router,
+    knowledge_bases_router,
     login_router,
     mcp_projects_router,
     mcp_router,
@@ -45,6 +46,7 @@ router_v1.include_router(monitor_router)
 router_v1.include_router(folders_router)
 router_v1.include_router(projects_router)
 router_v1.include_router(starter_projects_router)
+router_v1.include_router(knowledge_bases_router)
 router_v1.include_router(mcp_router)
 router_v1.include_router(voice_mode_router)
 router_v1.include_router(mcp_projects_router)
langflow/api/v1/__init__.py CHANGED
@@ -4,6 +4,7 @@ from langflow.api.v1.endpoints import router as endpoints_router
 from langflow.api.v1.files import router as files_router
 from langflow.api.v1.flows import router as flows_router
 from langflow.api.v1.folders import router as folders_router
+from langflow.api.v1.knowledge_bases import router as knowledge_bases_router
 from langflow.api.v1.login import router as login_router
 from langflow.api.v1.mcp import router as mcp_router
 from langflow.api.v1.mcp_projects import router as mcp_projects_router
@@ -23,6 +24,7 @@ __all__ = [
     "files_router",
     "flows_router",
     "folders_router",
+    "knowledge_bases_router",
     "login_router",
     "mcp_projects_router",
     "mcp_router",
langflow/api/v1/knowledge_bases.py ADDED
@@ -0,0 +1,437 @@
+import json
+import shutil
+from http import HTTPStatus
+from pathlib import Path
+
+import pandas as pd
+from fastapi import APIRouter, HTTPException
+from langchain_chroma import Chroma
+from loguru import logger
+from pydantic import BaseModel
+
+from langflow.services.deps import get_settings_service
+
+router = APIRouter(tags=["Knowledge Bases"], prefix="/knowledge_bases")
+
+
+settings = get_settings_service().settings
+knowledge_directory = settings.knowledge_bases_dir
+if not knowledge_directory:
+    msg = "Knowledge bases directory is not set in the settings."
+    raise ValueError(msg)
+KNOWLEDGE_BASES_DIR = Path(knowledge_directory).expanduser()
+
+
+class KnowledgeBaseInfo(BaseModel):
+    id: str
+    name: str
+    embedding_provider: str | None = "Unknown"
+    embedding_model: str | None = "Unknown"
+    size: int = 0
+    words: int = 0
+    characters: int = 0
+    chunks: int = 0
+    avg_chunk_size: float = 0.0
+
+
+class BulkDeleteRequest(BaseModel):
+    kb_names: list[str]
+
+
+def get_kb_root_path() -> Path:
+    """Get the knowledge bases root path."""
+    return KNOWLEDGE_BASES_DIR
+
+
+def get_directory_size(path: Path) -> int:
+    """Calculate the total size of all files in a directory."""
+    total_size = 0
+    try:
+        for file_path in path.rglob("*"):
+            if file_path.is_file():
+                total_size += file_path.stat().st_size
+    except (OSError, PermissionError):
+        pass
+    return total_size
+
+
+def detect_embedding_provider(kb_path: Path) -> str:
+    """Detect the embedding provider from config files and directory structure."""
+    # Provider patterns to check for
+    provider_patterns = {
+        "OpenAI": ["openai", "text-embedding-ada", "text-embedding-3"],
+        "HuggingFace": ["sentence-transformers", "huggingface", "bert-"],
+        "Cohere": ["cohere", "embed-english", "embed-multilingual"],
+        "Google": ["palm", "gecko", "google"],
+        "Chroma": ["chroma"],
+    }
+
+    # Check JSON config files for provider information
+    for config_file in kb_path.glob("*.json"):
+        try:
+            with config_file.open("r", encoding="utf-8") as f:
+                config_data = json.load(f)
+            if not isinstance(config_data, dict):
+                continue
+
+            config_str = json.dumps(config_data).lower()
+
+            # Check for explicit provider fields first
+            provider_fields = ["embedding_provider", "provider", "embedding_model_provider"]
+            for field in provider_fields:
+                if field in config_data:
+                    provider_value = str(config_data[field]).lower()
+                    for provider, patterns in provider_patterns.items():
+                        if any(pattern in provider_value for pattern in patterns):
+                            return provider
+
+            # Check for model name patterns
+            for provider, patterns in provider_patterns.items():
+                if any(pattern in config_str for pattern in patterns):
+                    return provider
+
+        except (OSError, json.JSONDecodeError) as _:
+            logger.exception("Error reading config file '%s'", config_file)
+            continue
+
+    # Fallback to directory structure
+    if (kb_path / "chroma").exists():
+        return "Chroma"
+    if (kb_path / "vectors.npy").exists():
+        return "Local"
+
+    return "Unknown"
+
+
+def detect_embedding_model(kb_path: Path) -> str:
+    """Detect the embedding model from config files."""
+    # First check the embedding metadata file (most accurate)
+    metadata_file = kb_path / "embedding_metadata.json"
+    if metadata_file.exists():
+        try:
+            with metadata_file.open("r", encoding="utf-8") as f:
+                metadata = json.load(f)
+            if isinstance(metadata, dict) and "embedding_model" in metadata:
+                # Check for embedding model field
+                model_value = str(metadata.get("embedding_model", "unknown"))
+                if model_value and model_value.lower() != "unknown":
+                    return model_value
+        except (OSError, json.JSONDecodeError) as _:
+            logger.exception("Error reading embedding metadata file '%s'", metadata_file)
+
+    # Check other JSON config files for model information
+    for config_file in kb_path.glob("*.json"):
+        # Skip the embedding metadata file since we already checked it
+        if config_file.name == "embedding_metadata.json":
+            continue
+
+        try:
+            with config_file.open("r", encoding="utf-8") as f:
+                config_data = json.load(f)
+            if not isinstance(config_data, dict):
+                continue
+
+            # Check for explicit model fields first and return the actual model name
+            model_fields = ["embedding_model", "model", "embedding_model_name", "model_name"]
+            for field in model_fields:
+                if field in config_data:
+                    model_value = str(config_data[field])
+                    if model_value and model_value.lower() != "unknown":
+                        return model_value
+
+            # Check for OpenAI specific model names
+            if "openai" in json.dumps(config_data).lower():
+                openai_models = ["text-embedding-ada-002", "text-embedding-3-small", "text-embedding-3-large"]
+                config_str = json.dumps(config_data).lower()
+                for model in openai_models:
+                    if model in config_str:
+                        return model
+
+            # Check for HuggingFace model names (usually in model field)
+            if "model" in config_data:
+                model_name = str(config_data["model"])
+                # Common HuggingFace embedding models
+                hf_patterns = ["sentence-transformers", "all-MiniLM", "all-mpnet", "multi-qa"]
+                if any(pattern in model_name for pattern in hf_patterns):
+                    return model_name
+
+        except (OSError, json.JSONDecodeError) as _:
+            logger.exception("Error reading config file '%s'", config_file)
+            continue
+
+    return "Unknown"
+
+
+def get_text_columns(df: pd.DataFrame, schema_data: list | None = None) -> list[str]:
+    """Get the text columns to analyze for word/character counts."""
+    # First try schema-defined text columns
+    if schema_data:
+        text_columns = [
+            col["column_name"]
+            for col in schema_data
+            if col.get("vectorize", False) and col.get("data_type") == "string"
+        ]
+        if text_columns:
+            return [col for col in text_columns if col in df.columns]
+
+    # Fallback to common text column names
+    common_names = ["text", "content", "document", "chunk"]
+    text_columns = [col for col in df.columns if col.lower() in common_names]
+    if text_columns:
+        return text_columns
+
+    # Last resort: all string columns
+    return [col for col in df.columns if df[col].dtype == "object"]
+
+
+def calculate_text_metrics(df: pd.DataFrame, text_columns: list[str]) -> tuple[int, int]:
+    """Calculate total words and characters from text columns."""
+    total_words = 0
+    total_characters = 0
+
+    for col in text_columns:
+        if col not in df.columns:
+            continue
+
+        text_series = df[col].astype(str).fillna("")
+        total_characters += text_series.str.len().sum()
+        total_words += text_series.str.split().str.len().sum()
+
+    return int(total_words), int(total_characters)
+
+
+def get_kb_metadata(kb_path: Path) -> dict:
+    """Extract metadata from a knowledge base directory."""
+    metadata: dict[str, float | int | str] = {
+        "chunks": 0,
+        "words": 0,
+        "characters": 0,
+        "avg_chunk_size": 0.0,
+        "embedding_provider": "Unknown",
+        "embedding_model": "Unknown",
+    }
+
+    try:
+        # First check embedding metadata file for accurate provider and model info
+        metadata_file = kb_path / "embedding_metadata.json"
+        if metadata_file.exists():
+            try:
+                with metadata_file.open("r", encoding="utf-8") as f:
+                    embedding_metadata = json.load(f)
+                if isinstance(embedding_metadata, dict):
+                    if "embedding_provider" in embedding_metadata:
+                        metadata["embedding_provider"] = embedding_metadata["embedding_provider"]
+                    if "embedding_model" in embedding_metadata:
+                        metadata["embedding_model"] = embedding_metadata["embedding_model"]
+            except (OSError, json.JSONDecodeError) as _:
+                logger.exception("Error reading embedding metadata file '%s'", metadata_file)
+
+        # Fallback to detection if not found in metadata file
+        if metadata["embedding_provider"] == "Unknown":
+            metadata["embedding_provider"] = detect_embedding_provider(kb_path)
+        if metadata["embedding_model"] == "Unknown":
+            metadata["embedding_model"] = detect_embedding_model(kb_path)
+
+        # Read schema for text column information
+        schema_data = None
+        schema_file = kb_path / "schema.json"
+        if schema_file.exists():
+            try:
+                with schema_file.open("r", encoding="utf-8") as f:
+                    schema_data = json.load(f)
+                if not isinstance(schema_data, list):
+                    schema_data = None
+            except (ValueError, TypeError, OSError) as _:
+                logger.exception("Error reading schema file '%s'", schema_file)
+
+        # Create vector store
+        chroma = Chroma(
+            persist_directory=str(kb_path),
+            collection_name=kb_path.name,
+        )
+
+        # Access the raw collection
+        collection = chroma._collection
+
+        # Fetch all documents and metadata
+        results = collection.get(include=["documents", "metadatas"])
+
+        # Convert to pandas DataFrame
+        source_chunks = pd.DataFrame(
+            {
+                "document": results["documents"],
+                "metadata": results["metadatas"],
+            }
+        )
+
+        # Process the source data for metadata
+        try:
+            metadata["chunks"] = len(source_chunks)
+
+            # Get text columns and calculate metrics
+            text_columns = get_text_columns(source_chunks, schema_data)
+            if text_columns:
+                words, characters = calculate_text_metrics(source_chunks, text_columns)
+                metadata["words"] = words
+                metadata["characters"] = characters
+
+                # Calculate average chunk size
+                if int(metadata["chunks"]) > 0:
+                    metadata["avg_chunk_size"] = round(int(characters) / int(metadata["chunks"]), 1)
+
+        except (OSError, ValueError, TypeError) as _:
+            logger.exception("Error processing Chroma DB '%s'", kb_path.name)
+
+    except (OSError, ValueError, TypeError) as _:
+        logger.exception("Error processing knowledge base directory '%s'", kb_path)
+
+    return metadata
+
+
+@router.get("", status_code=HTTPStatus.OK)
+@router.get("/", status_code=HTTPStatus.OK)
+async def list_knowledge_bases() -> list[KnowledgeBaseInfo]:
+    """List all available knowledge bases."""
+    try:
+        kb_root_path = get_kb_root_path()
+
+        if not kb_root_path.exists():
+            return []
+
+        knowledge_bases = []
+
+        for kb_dir in kb_root_path.iterdir():
+            if not kb_dir.is_dir() or kb_dir.name.startswith("."):
+                continue
+
+            try:
+                # Get size of the directory
+                size = get_directory_size(kb_dir)
+
+                # Get metadata from KB files
+                metadata = get_kb_metadata(kb_dir)
+
+                kb_info = KnowledgeBaseInfo(
+                    id=kb_dir.name,
+                    name=kb_dir.name.replace("_", " ").replace("-", " ").title(),
+                    embedding_provider=metadata["embedding_provider"],
+                    embedding_model=metadata["embedding_model"],
+                    size=size,
+                    words=metadata["words"],
+                    characters=metadata["characters"],
+                    chunks=metadata["chunks"],
+                    avg_chunk_size=metadata["avg_chunk_size"],
+                )
+
+                knowledge_bases.append(kb_info)
+
+            except OSError as _:
+                # Log the exception and skip directories that can't be read
+                logger.exception("Error reading knowledge base directory '%s'", kb_dir)
+                continue
+
+        # Sort by name alphabetically
+        knowledge_bases.sort(key=lambda x: x.name)
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error listing knowledge bases: {e!s}") from e
+    else:
+        return knowledge_bases
+
+
+@router.get("/{kb_name}", status_code=HTTPStatus.OK)
+async def get_knowledge_base(kb_name: str) -> KnowledgeBaseInfo:
+    """Get detailed information about a specific knowledge base."""
+    try:
+        kb_root_path = get_kb_root_path()
+        kb_path = kb_root_path / kb_name
+
+        if not kb_path.exists() or not kb_path.is_dir():
+            raise HTTPException(status_code=404, detail=f"Knowledge base '{kb_name}' not found")
+
+        # Get size of the directory
+        size = get_directory_size(kb_path)
+
+        # Get metadata from KB files
+        metadata = get_kb_metadata(kb_path)
+
+        return KnowledgeBaseInfo(
+            id=kb_name,
+            name=kb_name.replace("_", " ").replace("-", " ").title(),
+            embedding_provider=metadata["embedding_provider"],
+            embedding_model=metadata["embedding_model"],
+            size=size,
+            words=metadata["words"],
+            characters=metadata["characters"],
+            chunks=metadata["chunks"],
+            avg_chunk_size=metadata["avg_chunk_size"],
+        )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error getting knowledge base '{kb_name}': {e!s}") from e
+
+
+@router.delete("/{kb_name}", status_code=HTTPStatus.OK)
+async def delete_knowledge_base(kb_name: str) -> dict[str, str]:
+    """Delete a specific knowledge base."""
+    try:
+        kb_root_path = get_kb_root_path()
+        kb_path = kb_root_path / kb_name
+
+        if not kb_path.exists() or not kb_path.is_dir():
+            raise HTTPException(status_code=404, detail=f"Knowledge base '{kb_name}' not found")
+
+        # Delete the entire knowledge base directory
+        shutil.rmtree(kb_path)
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error deleting knowledge base '{kb_name}': {e!s}") from e
+    else:
+        return {"message": f"Knowledge base '{kb_name}' deleted successfully"}
+
+
+@router.delete("", status_code=HTTPStatus.OK)
+@router.delete("/", status_code=HTTPStatus.OK)
+async def delete_knowledge_bases_bulk(request: BulkDeleteRequest) -> dict[str, object]:
+    """Delete multiple knowledge bases."""
+    try:
+        kb_root_path = get_kb_root_path()
+        deleted_count = 0
+        not_found_kbs = []
+
+        for kb_name in request.kb_names:
+            kb_path = kb_root_path / kb_name
+
+            if not kb_path.exists() or not kb_path.is_dir():
+                not_found_kbs.append(kb_name)
+                continue
+
+            try:
+                # Delete the entire knowledge base directory
+                shutil.rmtree(kb_path)
+                deleted_count += 1
+            except (OSError, PermissionError) as e:
+                logger.exception("Error deleting knowledge base '%s': %s", kb_name, e)
+                # Continue with other deletions even if one fails
+
+        if not_found_kbs and deleted_count == 0:
+            raise HTTPException(status_code=404, detail=f"Knowledge bases not found: {', '.join(not_found_kbs)}")
+
+        result = {
+            "message": f"Successfully deleted {deleted_count} knowledge base(s)",
+            "deleted_count": deleted_count,
+        }
+
+        if not_found_kbs:
+            result["not_found"] = ", ".join(not_found_kbs)
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error deleting knowledge bases: {e!s}") from e
+    else:
+        return result
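
A minimal client sketch for the new endpoints, assuming Langflow's default `/api/v1` prefix and a local server on port 7860; authentication, if enabled, is omitted, and the base URL and `my_kb` name are placeholders, while the paths and payloads come from the routes above:

```python
import httpx

BASE = "http://localhost:7860/api/v1/knowledge_bases"  # assumed host and port

with httpx.Client() as client:
    # GET /knowledge_bases -> list of KnowledgeBaseInfo objects
    for kb in client.get(BASE).json():
        print(kb["id"], kb["chunks"], kb["embedding_model"])

    # GET /knowledge_bases/{kb_name} -> single KnowledgeBaseInfo (404 if missing)
    detail = client.get(f"{BASE}/my_kb").json()  # "my_kb" is a placeholder name

    # DELETE /knowledge_bases with a BulkDeleteRequest body
    resp = client.request("DELETE", BASE, json={"kb_names": ["old_kb_1", "old_kb_2"]})
    print(resp.json()["message"])
```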
langflow/base/data/kb_utils.py ADDED
@@ -0,0 +1,104 @@
+import math
+from collections import Counter
+
+
+def compute_tfidf(documents: list[str], query_terms: list[str]) -> list[float]:
+    """Compute TF-IDF scores for query terms across a collection of documents.
+
+    Args:
+        documents: List of document strings
+        query_terms: List of query terms to score
+
+    Returns:
+        List of TF-IDF scores for each document
+    """
+    # Tokenize documents (simple whitespace splitting)
+    tokenized_docs = [doc.lower().split() for doc in documents]
+    n_docs = len(documents)
+
+    # Calculate document frequency for each term
+    document_frequencies = {}
+    for term in query_terms:
+        document_frequencies[term] = sum(1 for doc in tokenized_docs if term.lower() in doc)
+
+    scores = []
+
+    for doc_tokens in tokenized_docs:
+        doc_score = 0.0
+        doc_length = len(doc_tokens)
+        term_counts = Counter(doc_tokens)
+
+        for term in query_terms:
+            term_lower = term.lower()
+
+            # Term frequency (TF)
+            tf = term_counts[term_lower] / doc_length if doc_length > 0 else 0
+
+            # Inverse document frequency (IDF)
+            idf = math.log(n_docs / document_frequencies[term]) if document_frequencies[term] > 0 else 0
+
+            # TF-IDF score
+            doc_score += tf * idf
+
+        scores.append(doc_score)
+
+    return scores
+
+
+def compute_bm25(documents: list[str], query_terms: list[str], k1: float = 1.2, b: float = 0.75) -> list[float]:
+    """Compute BM25 scores for query terms across a collection of documents.
+
+    Args:
+        documents: List of document strings
+        query_terms: List of query terms to score
+        k1: Controls term frequency scaling (default: 1.2)
+        b: Controls document length normalization (default: 0.75)
+
+    Returns:
+        List of BM25 scores for each document
+    """
+    # Tokenize documents
+    tokenized_docs = [doc.lower().split() for doc in documents]
+    n_docs = len(documents)
+
+    # Calculate average document length
+    avg_doc_length = sum(len(doc) for doc in tokenized_docs) / n_docs if n_docs > 0 else 0
+
+    # Handle edge case where all documents are empty
+    if avg_doc_length == 0:
+        return [0.0] * n_docs
+
+    # Calculate document frequency for each term
+    document_frequencies = {}
+    for term in query_terms:
+        document_frequencies[term] = sum(1 for doc in tokenized_docs if term.lower() in doc)
+
+    scores = []
+
+    for doc_tokens in tokenized_docs:
+        doc_score = 0.0
+        doc_length = len(doc_tokens)
+        term_counts = Counter(doc_tokens)
+
+        for term in query_terms:
+            term_lower = term.lower()
+
+            # Term frequency in document
+            tf = term_counts[term_lower]
+
+            # Inverse document frequency (IDF)
+            # Use standard BM25 IDF formula that ensures non-negative values
+            idf = math.log(n_docs / document_frequencies[term]) if document_frequencies[term] > 0 else 0
+
+            # BM25 score calculation
+            numerator = tf * (k1 + 1)
+            denominator = tf + k1 * (1 - b + b * (doc_length / avg_doc_length))
+
+            # Handle division by zero when tf=0 and k1=0
+            term_score = 0 if denominator == 0 else idf * (numerator / denominator)
+
+            doc_score += term_score
+
+        scores.append(doc_score)
+
+    return scores
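
A quick sketch exercising the two helpers above on toy data. Note that both use simple whitespace tokenization and the plain log(N/df) IDF variant, so a term that appears in every document contributes zero to every score:

```python
from langflow.base.data.kb_utils import compute_bm25, compute_tfidf

docs = [
    "the quick brown fox",
    "the quick blue hare",
    "lazy dogs sleep all day",
]
query = ["quick", "fox"]

# Doc 0 matches both terms, doc 1 only "quick", doc 2 neither,
# so both scorers rank doc 0 > doc 1 > doc 2.
print(compute_tfidf(docs, query))
print(compute_bm25(docs, query))  # k1=1.2, b=0.75 defaults from the diff
```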
langflow/components/data/__init__.py CHANGED
@@ -3,6 +3,8 @@ from .csv_to_data import CSVToDataComponent
 from .directory import DirectoryComponent
 from .file import FileComponent
 from .json_to_data import JSONToDataComponent
+from .kb_ingest import KBIngestionComponent
+from .kb_retrieval import KBRetrievalComponent
 from .news_search import NewsSearchComponent
 from .rss import RSSReaderComponent
 from .sql_executor import SQLComponent
@@ -16,6 +18,8 @@ __all__ = [
     "DirectoryComponent",
     "FileComponent",
     "JSONToDataComponent",
+    "KBIngestionComponent",
+    "KBRetrievalComponent",
     "NewsSearchComponent",
     "RSSReaderComponent",
     "SQLComponent",