langflow-base-nightly 0.5.0.dev30__py3-none-any.whl → 0.5.0.dev32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. langflow/__main__.py +130 -30
  2. langflow/api/router.py +2 -0
  3. langflow/api/v1/__init__.py +2 -0
  4. langflow/api/v1/knowledge_bases.py +437 -0
  5. langflow/base/data/kb_utils.py +104 -0
  6. langflow/components/data/__init__.py +4 -0
  7. langflow/components/data/kb_ingest.py +585 -0
  8. langflow/components/data/kb_retrieval.py +254 -0
  9. langflow/components/processing/save_file.py +1 -1
  10. langflow/frontend/assets/{SlackIcon-D2PxMQjX.js → SlackIcon-Bikuxo8x.js} +1 -1
  11. langflow/frontend/assets/{Wikipedia-BNM0lBPs.js → Wikipedia-B6aCFf5-.js} +1 -1
  12. langflow/frontend/assets/{Wolfram-COQyGyeC.js → Wolfram-CekL_M-a.js} +1 -1
  13. langflow/frontend/assets/{index-CTpfN0Cy.js → index-09CVJwsY.js} +1 -1
  14. langflow/frontend/assets/{index-DWUG3nTC.js → index-1MEYR1La.js} +1 -1
  15. langflow/frontend/assets/{index-Ds9y6kEK.js → index-2vQdFIK_.js} +1 -1
  16. langflow/frontend/assets/{index-DRdKSzTn.js → index-4Tl3Nxdo.js} +1 -1
  17. langflow/frontend/assets/{index-O_vPh7iD.js → index-5G402gB8.js} +1 -1
  18. langflow/frontend/assets/{index-D15h4ir2.js → index-5hW8VleF.js} +1 -1
  19. langflow/frontend/assets/{index-BydnMWnM.js → index-6GWpsedd.js} +1 -1
  20. langflow/frontend/assets/{index-4vIU43o6.js → index-7x3wNZ-4.js} +1 -1
  21. langflow/frontend/assets/{index-DrFpyu9Z.js → index-9gkURvG2.js} +1 -1
  22. langflow/frontend/assets/{index-DRe5h2N_.js → index-AOX7bbjJ.js} +1 -1
  23. langflow/frontend/assets/{index-fJyq3ZWN.js → index-B20KmxhS.js} +1 -1
  24. langflow/frontend/assets/{index-D_sHnnuS.js → index-B2EmwqKj.js} +1 -1
  25. langflow/frontend/assets/{index-DEc_2ba8.js → index-B4AtFbkN.js} +1 -1
  26. langflow/frontend/assets/{index-D_zQiboE.js → index-B4xLpgbM.js} +1 -1
  27. langflow/frontend/assets/{index-Db8Xgs-K.js → index-B9KRIJFi.js} +1 -1
  28. langflow/frontend/assets/{index-BzCZNz2f.js → index-B9uOBe6Y.js} +1 -1
  29. langflow/frontend/assets/{index-pFTvwRsJ.js → index-BDmbsLY2.js} +1 -1
  30. langflow/frontend/assets/{index-CGef2axA.js → index-BIKbxmIh.js} +1 -1
  31. langflow/frontend/assets/{index-BTl_mLju.js → index-BIjUtp6d.js} +1 -1
  32. langflow/frontend/assets/{index-Jze67eTW.js → index-BJIsQS8D.js} +1 -1
  33. langflow/frontend/assets/{index-DV-gdr7l.js → index-BO4fl1uU.js} +1 -1
  34. langflow/frontend/assets/{index-BUVmswbg.js → index-BRE8A4Q_.js} +1 -1
  35. langflow/frontend/assets/{index-CTzWsu8S.js → index-BRNhftot.js} +1 -1
  36. langflow/frontend/assets/{index-DFYBo38q.js → index-BRizlHaN.js} +1 -1
  37. langflow/frontend/assets/{index-DbPP5vss.js → index-BRwkzs92.js} +1 -1
  38. langflow/frontend/assets/{index-BzE7oL1n.js → index-BZCt_UnJ.js} +1 -1
  39. langflow/frontend/assets/{index-BhRSkpxu.js → index-B_ytx_iA.js} +1 -1
  40. langflow/frontend/assets/{index-ByCunkn4.js → index-BcqeL_f4.js} +1 -1
  41. langflow/frontend/assets/{index-CAAZbdRp.js → index-Bgd7yLoW.js} +1 -1
  42. langflow/frontend/assets/{index-DpDbxNdQ.js → index-BlRTHXW5.js} +1 -1
  43. langflow/frontend/assets/{index-jXSPQ_JS.js → index-BllNr21U.js} +1 -1
  44. langflow/frontend/assets/{index-fpMcQS2L.js → index-Bm7a2vMS.js} +1 -1
  45. langflow/frontend/assets/{index-BFQzmLDT.js → index-Bn4HAVDG.js} +1 -1
  46. langflow/frontend/assets/{index-D8EpAMC3.js → index-BwlYjc56.js} +1 -1
  47. langflow/frontend/assets/{index-BcCN9mpu.js → index-BzCjyHto.js} +1 -1
  48. langflow/frontend/assets/{index-D6-jZ4sc.js → index-C3RZz8WE.js} +1 -1
  49. langflow/frontend/assets/{index-D66JmFlL.js → index-C69gdJqw.js} +1 -1
  50. langflow/frontend/assets/{index-pYD0BTGu.js → index-C6P0vvSP.js} +1 -1
  51. langflow/frontend/assets/{index-CIjw_ZkP.js → index-C7wDSVVH.js} +1 -1
  52. langflow/frontend/assets/{index-BCTEK38J.js → index-CAzSTGAM.js} +1 -1
  53. langflow/frontend/assets/{index-8FjgS_Vj.js → index-CEn_71Wk.js} +1 -1
  54. langflow/frontend/assets/{index-BFiCUM5l.js → index-CGVDXKtN.js} +1 -1
  55. langflow/frontend/assets/{index-BIH2K0v8.js → index-CIYzjH2y.js} +1 -1
  56. langflow/frontend/assets/{index-gM8j2Wvk.js → index-COqjpsdy.js} +1 -1
  57. langflow/frontend/assets/{index-2q8IFBNP.js → index-CP0tFKwN.js} +1 -1
  58. langflow/frontend/assets/{index-CXpZa4H9.js → index-CPIdMJkX.js} +1 -1
  59. langflow/frontend/assets/{index-B-YjnRWx.js → index-CSRizl2S.js} +1 -1
  60. langflow/frontend/assets/{index-DFo0yfS5.js → index-CUe1ivTn.js} +1 -1
  61. langflow/frontend/assets/{index-C2x5hzgY.js → index-CVphnxXi.js} +1 -1
  62. langflow/frontend/assets/{index-Bz3QnhLZ.js → index-CY6LUi4V.js} +1 -1
  63. langflow/frontend/assets/{index-Cq6gk34q.js → index-C_2G2ZqJ.js} +1 -1
  64. langflow/frontend/assets/{index-CSXUVElo.js → index-C_K6Tof7.js} +1 -1
  65. langflow/frontend/assets/{index-1D7jZ8vz.js → index-C_UkF-RJ.js} +1 -1
  66. langflow/frontend/assets/{index-BVGZcHHC.js → index-Cbwk3f-p.js} +1 -1
  67. langflow/frontend/assets/{index-kiqvo0Zi.js → index-CdwjD4IX.js} +1 -1
  68. langflow/frontend/assets/{index-BNy3Al2s.js → index-CgbINWS8.js} +1 -1
  69. langflow/frontend/assets/{index-BXJpd9hg.js → index-CglSqvB5.js} +1 -1
  70. langflow/frontend/assets/{index-D9CF_54p.js → index-CmiRgF_-.js} +1 -1
  71. langflow/frontend/assets/{index-ez1EW657.js → index-Cp7Pmn03.js} +1 -1
  72. langflow/frontend/assets/{index-aypzjPzG.js → index-Cq30cQcP.js} +1 -1
  73. langflow/frontend/assets/index-CqS7zir1.css +1 -0
  74. langflow/frontend/assets/{index-DKv0y9Dp.js → index-Cr2oy5K2.js} +1 -1
  75. langflow/frontend/assets/{index-DrfwVxtD.js → index-Crq_yhkG.js} +1 -1
  76. langflow/frontend/assets/{index-CzJzRS6i.js → index-Cs_jt3dj.js} +1 -1
  77. langflow/frontend/assets/{index-DO0mS8FQ.js → index-Cy-ZEfWh.js} +1 -1
  78. langflow/frontend/assets/{index-Q0bwuTZY.js → index-Cyk3aCmP.js} +1 -1
  79. langflow/frontend/assets/{index-DToZROdu.js → index-D-HTZ68O.js} +1 -1
  80. langflow/frontend/assets/{index-C0AEZF1v.js → index-D1RgjMON.js} +1 -1
  81. langflow/frontend/assets/{index-DilRRF2S.js → index-D29n5mus.js} +1 -1
  82. langflow/frontend/assets/{index-CKLOrtrx.js → index-D2nHdRne.js} +1 -1
  83. langflow/frontend/assets/{index-sfFDGjjd.js → index-D7Vx6mgS.js} +1 -1
  84. langflow/frontend/assets/{index-BAHhLqW9.js → index-D7nFs6oq.js} +1 -1
  85. langflow/frontend/assets/{index-C7jY4x98.js → index-DAJafn16.js} +1 -1
  86. langflow/frontend/assets/{index-BefwTGbP.js → index-DDcpxWU4.js} +1 -1
  87. langflow/frontend/assets/{index-CTZ9iXFr.js → index-DEuXrfXH.js} +1 -1
  88. langflow/frontend/assets/{index-DFfr0xSt.js → index-DF0oWRdd.js} +1 -1
  89. langflow/frontend/assets/{index-Bh5pQAZC.js → index-DI0zAExi.js} +1 -1
  90. langflow/frontend/assets/{index-CG-Suo0F.js → index-DJs6FoYC.js} +1 -1
  91. langflow/frontend/assets/{index-dvTTQhKz.js → index-DNS4La1f.js} +1 -1
  92. langflow/frontend/assets/{index-nLDaeeZg.js → index-DOI0ceS-.js} +1 -1
  93. langflow/frontend/assets/{index-DakdEtbq.js → index-DOb9c2bf.js} +1 -1
  94. langflow/frontend/assets/{index-CEVnRp4_.js → index-DS4F_Phe.js} +1 -1
  95. langflow/frontend/assets/{index-DGRg2M1l.js → index-DTJX3yQa.js} +1 -1
  96. langflow/frontend/assets/{index-BjAsd-Vo.js → index-DVV_etfW.js} +1 -1
  97. langflow/frontend/assets/{index-BrIuZD2A.js → index-DX_InNVT.js} +1 -1
  98. langflow/frontend/assets/{index-jG-zLXRN.js → index-DbmqjLy6.js} +1 -1
  99. langflow/frontend/assets/{index-DSvOFGJR.js → index-Dc0p1Oxl.js} +1 -1
  100. langflow/frontend/assets/{index-87GFtXu5.js → index-DkJCCraf.js} +1 -1
  101. langflow/frontend/assets/{index-BXidWkLM.js → index-DlMAYATX.js} +1 -1
  102. langflow/frontend/assets/{index-sbTxhltT.js → index-DmaQAn3K.js} +1 -1
  103. langflow/frontend/assets/{index-DkC5vMvx.js → index-DmvjdU1N.js} +1 -1
  104. langflow/frontend/assets/{index-CSUglByd.js → index-DnusMCK1.js} +1 -1
  105. langflow/frontend/assets/{index-DZOTHXs0.js → index-DoFlaGDx.js} +1 -1
  106. langflow/frontend/assets/{index-CZkMjaa8.js → index-DqDQk0Cu.js} +1 -1
  107. langflow/frontend/assets/{index-lc10GnwG.js → index-DrvRK4_i.js} +1 -1
  108. langflow/frontend/assets/{index-BNm-yAYc.js → index-DtCsjX48.js} +1 -1
  109. langflow/frontend/assets/{index-BeLnhfG-.js → index-Dy7ehgeV.js} +1 -1
  110. langflow/frontend/assets/{index-RGG9hk9J.js → index-Dz0r9Idb.js} +1 -1
  111. langflow/frontend/assets/{index-Bcq2yA-p.js → index-DzDNhMMW.js} +1 -1
  112. langflow/frontend/assets/{index-P3f-GeAm.js → index-FYcoJPMP.js} +1 -1
  113. langflow/frontend/assets/{index-DQwvl_Rp.js → index-Iamzh9ZT.js} +1 -1
  114. langflow/frontend/assets/{index-Cy6n8tA9.js → index-J0pvFqLk.js} +1 -1
  115. langflow/frontend/assets/{index-D1XTMye3.js → index-J98sU-1p.js} +1 -1
  116. langflow/frontend/assets/{index-BZ0rL0tK.js → index-JHCxbvlW.js} +1 -1
  117. langflow/frontend/assets/{index-DmSH63k1.js → index-KnS52ylc.js} +1 -1
  118. langflow/frontend/assets/{index-WGZ88ShH.js → index-L7FKc9QN.js} +1 -1
  119. langflow/frontend/assets/{index-BIoFnUtx.js → index-RveG4dl9.js} +1 -1
  120. langflow/frontend/assets/{index-BDdkPrzu.js → index-T2jJOG85.js} +1 -1
  121. langflow/frontend/assets/{index-2839k6WO.js → index-TRyDa01A.js} +1 -1
  122. langflow/frontend/assets/{index-DvOdMz35.js → index-U7J1YiWE.js} +1 -1
  123. langflow/frontend/assets/{index-DzUx1-Bl.js → index-UI2ws3qp.js} +1984 -1984
  124. langflow/frontend/assets/{index-8Fx5I2fx.js → index-VO-pk-Hg.js} +1 -1
  125. langflow/frontend/assets/{index-e-RKmhti.js → index-_3qag0I4.js} +1 -1
  126. langflow/frontend/assets/{index-X67tRPXo.js → index-dfaj9-hY.js} +1 -1
  127. langflow/frontend/assets/{index-CHexGuNQ.js → index-eJwu5YEi.js} +1 -1
  128. langflow/frontend/assets/{index-Dz5YIK1W.js → index-in188l0A.js} +1 -1
  129. langflow/frontend/assets/{index-CTwkLLMr.js → index-pkOi9P45.js} +1 -1
  130. langflow/frontend/assets/{index-D6BaTmee.js → index-qXcoVIRo.js} +1 -1
  131. langflow/frontend/assets/{index-euS8RcNY.js → index-xVx59Op-.js} +1 -1
  132. langflow/frontend/assets/{index-C4WueQ4k.js → index-yIh6-LZT.js} +1 -1
  133. langflow/frontend/assets/lazyIconImports-kvf_Kak2.js +2 -0
  134. langflow/frontend/assets/{use-post-add-user-CA-_peAV.js → use-post-add-user-Bt6vZvvT.js} +1 -1
  135. langflow/frontend/index.html +2 -2
  136. langflow/initial_setup/starter_projects/Knowledge Ingestion.json +1052 -0
  137. langflow/initial_setup/starter_projects/Knowledge Retrieval.json +709 -0
  138. langflow/initial_setup/starter_projects/News Aggregator.json +4 -4
  139. langflow/services/database/models/user/crud.py +7 -0
  140. langflow/services/settings/auth.py +14 -1
  141. langflow/services/settings/base.py +3 -0
  142. langflow/services/utils.py +8 -3
  143. {langflow_base_nightly-0.5.0.dev30.dist-info → langflow_base_nightly-0.5.0.dev32.dist-info}/METADATA +2 -1
  144. {langflow_base_nightly-0.5.0.dev30.dist-info → langflow_base_nightly-0.5.0.dev32.dist-info}/RECORD +146 -140
  145. langflow/frontend/assets/index-DIcdzk44.css +0 -1
  146. langflow/frontend/assets/lazyIconImports-lnczjBhY.js +0 -2
  147. {langflow_base_nightly-0.5.0.dev30.dist-info → langflow_base_nightly-0.5.0.dev32.dist-info}/WHEEL +0 -0
  148. {langflow_base_nightly-0.5.0.dev30.dist-info → langflow_base_nightly-0.5.0.dev32.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,585 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ import re
6
+ import uuid
7
+ from dataclasses import asdict, dataclass, field
8
+ from datetime import datetime, timezone
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ import pandas as pd
13
+ from cryptography.fernet import InvalidToken
14
+ from langchain_chroma import Chroma
15
+ from loguru import logger
16
+
17
+ from langflow.base.models.openai_constants import OPENAI_EMBEDDING_MODEL_NAMES
18
+ from langflow.custom import Component
19
+ from langflow.io import BoolInput, DataFrameInput, DropdownInput, IntInput, Output, SecretStrInput, StrInput, TableInput
20
+ from langflow.schema.data import Data
21
+ from langflow.schema.dotdict import dotdict # noqa: TC001
22
+ from langflow.schema.table import EditMode
23
+ from langflow.services.auth.utils import decrypt_api_key, encrypt_api_key
24
+ from langflow.services.deps import get_settings_service
25
+
26
# Non-OpenAI embedding models offered in the model dropdown (OpenAI names are
# imported from langflow.base.models.openai_constants above).
HUGGINGFACE_MODEL_NAMES = ["sentence-transformers/all-MiniLM-L6-v2", "sentence-transformers/all-mpnet-base-v2"]
COHERE_MODEL_NAMES = ["embed-english-v3.0", "embed-multilingual-v3.0"]

# Resolve the knowledge-base root directory at import time and fail fast when it
# is not configured — every method of the component below depends on this path.
settings = get_settings_service().settings
knowledge_directory = settings.knowledge_bases_dir
if not knowledge_directory:
    msg = "Knowledge bases directory is not set in the settings."
    raise ValueError(msg)
KNOWLEDGE_BASES_ROOT_PATH = Path(knowledge_directory).expanduser()
35
+
36
+
37
class KBIngestionComponent(Component):
    """Create or append to Langflow Knowledge from a DataFrame."""

    # ------ UI metadata ---------------------------------------------------
    display_name = "Knowledge Ingestion"
    description = "Create or update knowledge in Langflow."
    icon = "database"
    name = "KBIngestion"

    @dataclass
    class NewKnowledgeBaseInput:
        # Payload for the "create new knowledge" dialog attached to the
        # knowledge_base dropdown below; serialized with asdict() when the
        # inputs list is built.
        functionality: str = "create"
        fields: dict[str, dict] = field(
            default_factory=lambda: {
                "data": {
                    "node": {
                        "name": "create_knowledge_base",
                        "description": "Create new knowledge in Langflow.",
                        "display_name": "Create new knowledge",
                        "field_order": ["01_new_kb_name", "02_embedding_model", "03_api_key"],
                        "template": {
                            "01_new_kb_name": StrInput(
                                name="new_kb_name",
                                display_name="Knowledge Name",
                                info="Name of the new knowledge to create.",
                                required=True,
                            ),
                            "02_embedding_model": DropdownInput(
                                name="embedding_model",
                                display_name="Model Name",
                                info="Select the embedding model to use for this knowledge base.",
                                required=True,
                                options=OPENAI_EMBEDDING_MODEL_NAMES + HUGGINGFACE_MODEL_NAMES + COHERE_MODEL_NAMES,
                                # One icon entry per option, in the same order as `options`.
                                options_metadata=[{"icon": "OpenAI"} for _ in OPENAI_EMBEDDING_MODEL_NAMES]
                                + [{"icon": "HuggingFace"} for _ in HUGGINGFACE_MODEL_NAMES]
                                + [{"icon": "Cohere"} for _ in COHERE_MODEL_NAMES],
                            ),
                            "03_api_key": SecretStrInput(
                                name="api_key",
                                display_name="API Key",
                                info="Provider API key for embedding model",
                                required=True,
                                load_from_db=True,
                            ),
                        },
                    },
                }
            }
        )

    # ------ Inputs --------------------------------------------------------
    inputs = [
        DropdownInput(
            name="knowledge_base",
            display_name="Knowledge",
            info="Select the knowledge to load data from.",
            required=True,
            # Options are computed once at class-definition time from the KB
            # root; the refresh button re-populates them via update_build_config.
            options=[
                str(d.name) for d in KNOWLEDGE_BASES_ROOT_PATH.iterdir() if not d.name.startswith(".") and d.is_dir()
            ]
            if KNOWLEDGE_BASES_ROOT_PATH.exists()
            else [],
            refresh_button=True,
            dialog_inputs=asdict(NewKnowledgeBaseInput()),
        ),
        DataFrameInput(
            name="input_df",
            display_name="Data",
            info="Table with all original columns (already chunked / processed).",
            required=True,
        ),
        TableInput(
            name="column_config",
            display_name="Column Configuration",
            info="Configure column behavior for the knowledge base.",
            required=True,
            table_schema=[
                {
                    "name": "column_name",
                    "display_name": "Column Name",
                    "type": "str",
                    "description": "Name of the column in the source DataFrame",
                    "edit_mode": EditMode.INLINE,
                },
                {
                    "name": "vectorize",
                    "display_name": "Vectorize",
                    "type": "boolean",
                    "description": "Create embeddings for this column",
                    "default": False,
                    "edit_mode": EditMode.INLINE,
                },
                {
                    "name": "identifier",
                    "display_name": "Identifier",
                    "type": "boolean",
                    "description": "Use this column as unique identifier",
                    "default": False,
                    "edit_mode": EditMode.INLINE,
                },
            ],
            value=[
                {
                    "column_name": "text",
                    "vectorize": True,
                    "identifier": False,
                }
            ],
        ),
        IntInput(
            name="chunk_size",
            display_name="Chunk Size",
            info="Batch size for processing embeddings",
            advanced=True,
            value=1000,
        ),
        SecretStrInput(
            name="api_key",
            display_name="Embedding Provider API Key",
            info="API key for the embedding provider to generate embeddings.",
            advanced=True,
            required=False,
        ),
        BoolInput(
            name="allow_duplicates",
            display_name="Allow Duplicates",
            info="Allow duplicate rows in the knowledge base",
            advanced=True,
            value=False,
        ),
    ]

    # ------ Outputs -------------------------------------------------------
    outputs = [Output(display_name="DataFrame", name="dataframe", method="build_kb_info")]
171
+
172
+ # ------ Internal helpers ---------------------------------------------
173
+ def _get_kb_root(self) -> Path:
174
+ """Return the root directory for knowledge bases."""
175
+ return KNOWLEDGE_BASES_ROOT_PATH
176
+
177
+ def _validate_column_config(self, df_source: pd.DataFrame) -> list[dict[str, Any]]:
178
+ """Validate column configuration using Structured Output patterns."""
179
+ if not self.column_config:
180
+ msg = "Column configuration cannot be empty"
181
+ raise ValueError(msg)
182
+
183
+ # Convert table input to list of dicts (similar to Structured Output)
184
+ config_list = self.column_config if isinstance(self.column_config, list) else []
185
+
186
+ # Validate column names exist in DataFrame
187
+ df_columns = set(df_source.columns)
188
+ for config in config_list:
189
+ col_name = config.get("column_name")
190
+ if col_name not in df_columns and not self.silent_errors:
191
+ msg = f"Column '{col_name}' not found in DataFrame. Available columns: {sorted(df_columns)}"
192
+ self.log(f"Warning: {msg}")
193
+ raise ValueError(msg)
194
+
195
+ return config_list
196
+
197
+ def _get_embedding_provider(self, embedding_model: str) -> str:
198
+ """Get embedding provider by matching model name to lists."""
199
+ if embedding_model in OPENAI_EMBEDDING_MODEL_NAMES:
200
+ return "OpenAI"
201
+ if embedding_model in HUGGINGFACE_MODEL_NAMES:
202
+ return "HuggingFace"
203
+ if embedding_model in COHERE_MODEL_NAMES:
204
+ return "Cohere"
205
+ return "Custom"
206
+
207
+ def _build_embeddings(self, embedding_model: str, api_key: str):
208
+ """Build embedding model using provider patterns."""
209
+ # Get provider by matching model name to lists
210
+ provider = self._get_embedding_provider(embedding_model)
211
+
212
+ # Validate provider and model
213
+ if provider == "OpenAI":
214
+ from langchain_openai import OpenAIEmbeddings
215
+
216
+ if not api_key:
217
+ msg = "OpenAI API key is required when using OpenAI provider"
218
+ raise ValueError(msg)
219
+ return OpenAIEmbeddings(
220
+ model=embedding_model,
221
+ api_key=api_key,
222
+ chunk_size=self.chunk_size,
223
+ )
224
+ if provider == "HuggingFace":
225
+ from langchain_huggingface import HuggingFaceEmbeddings
226
+
227
+ return HuggingFaceEmbeddings(
228
+ model=embedding_model,
229
+ )
230
+ if provider == "Cohere":
231
+ from langchain_cohere import CohereEmbeddings
232
+
233
+ if not api_key:
234
+ msg = "Cohere API key is required when using Cohere provider"
235
+ raise ValueError(msg)
236
+ return CohereEmbeddings(
237
+ model=embedding_model,
238
+ cohere_api_key=api_key,
239
+ )
240
+ if provider == "Custom":
241
+ # For custom embedding models, we would need additional configuration
242
+ msg = "Custom embedding models not yet supported"
243
+ raise NotImplementedError(msg)
244
+ msg = f"Unknown provider: {provider}"
245
+ raise ValueError(msg)
246
+
247
+ def _build_embedding_metadata(self, embedding_model, api_key) -> dict[str, Any]:
248
+ """Build embedding model metadata."""
249
+ # Get provider by matching model name to lists
250
+ embedding_provider = self._get_embedding_provider(embedding_model)
251
+
252
+ api_key_to_save = None
253
+ if api_key and hasattr(api_key, "get_secret_value"):
254
+ api_key_to_save = api_key.get_secret_value()
255
+ elif isinstance(api_key, str):
256
+ api_key_to_save = api_key
257
+
258
+ encrypted_api_key = None
259
+ if api_key_to_save:
260
+ settings_service = get_settings_service()
261
+ try:
262
+ encrypted_api_key = encrypt_api_key(api_key_to_save, settings_service=settings_service)
263
+ except (TypeError, ValueError) as e:
264
+ self.log(f"Could not encrypt API key: {e}")
265
+ logger.error(f"Could not encrypt API key: {e}")
266
+
267
+ return {
268
+ "embedding_provider": embedding_provider,
269
+ "embedding_model": embedding_model,
270
+ "api_key": encrypted_api_key,
271
+ "api_key_used": bool(api_key),
272
+ "chunk_size": self.chunk_size,
273
+ "created_at": datetime.now(timezone.utc).isoformat(),
274
+ }
275
+
276
+ def _save_embedding_metadata(self, kb_path: Path, embedding_model: str, api_key: str) -> None:
277
+ """Save embedding model metadata."""
278
+ embedding_metadata = self._build_embedding_metadata(embedding_model, api_key)
279
+ metadata_path = kb_path / "embedding_metadata.json"
280
+ metadata_path.write_text(json.dumps(embedding_metadata, indent=2))
281
+
282
+ def _save_kb_files(
283
+ self,
284
+ kb_path: Path,
285
+ config_list: list[dict[str, Any]],
286
+ ) -> None:
287
+ """Save KB files using File Component storage patterns."""
288
+ try:
289
+ # Create directory (following File Component patterns)
290
+ kb_path.mkdir(parents=True, exist_ok=True)
291
+
292
+ # Save column configuration
293
+ # Only do this if the file doesn't exist already
294
+ cfg_path = kb_path / "schema.json"
295
+ if not cfg_path.exists():
296
+ cfg_path.write_text(json.dumps(config_list, indent=2))
297
+
298
+ except Exception as e:
299
+ if not self.silent_errors:
300
+ raise
301
+ self.log(f"Error saving KB files: {e}")
302
+
303
+ def _build_column_metadata(self, config_list: list[dict[str, Any]], df_source: pd.DataFrame) -> dict[str, Any]:
304
+ """Build detailed column metadata."""
305
+ metadata: dict[str, Any] = {
306
+ "total_columns": len(df_source.columns),
307
+ "mapped_columns": len(config_list),
308
+ "unmapped_columns": len(df_source.columns) - len(config_list),
309
+ "columns": [],
310
+ "summary": {"vectorized_columns": [], "identifier_columns": []},
311
+ }
312
+
313
+ for config in config_list:
314
+ col_name = config.get("column_name")
315
+ vectorize = config.get("vectorize") == "True" or config.get("vectorize") is True
316
+ identifier = config.get("identifier") == "True" or config.get("identifier") is True
317
+
318
+ # Add to columns list
319
+ metadata["columns"].append(
320
+ {
321
+ "name": col_name,
322
+ "vectorize": vectorize,
323
+ "identifier": identifier,
324
+ }
325
+ )
326
+
327
+ # Update summary
328
+ if vectorize:
329
+ metadata["summary"]["vectorized_columns"].append(col_name)
330
+ if identifier:
331
+ metadata["summary"]["identifier_columns"].append(col_name)
332
+
333
+ return metadata
334
+
335
+ def _create_vector_store(
336
+ self, df_source: pd.DataFrame, config_list: list[dict[str, Any]], embedding_model: str, api_key: str
337
+ ) -> None:
338
+ """Create vector store following Local DB component pattern."""
339
+ try:
340
+ # Set up vector store directory
341
+ base_dir = self._get_kb_root()
342
+
343
+ vector_store_dir = base_dir / self.knowledge_base
344
+ vector_store_dir.mkdir(parents=True, exist_ok=True)
345
+
346
+ # Create embeddings model
347
+ embedding_function = self._build_embeddings(embedding_model, api_key)
348
+
349
+ # Convert DataFrame to Data objects (following Local DB pattern)
350
+ data_objects = self._convert_df_to_data_objects(df_source, config_list)
351
+
352
+ # Create vector store
353
+ chroma = Chroma(
354
+ persist_directory=str(vector_store_dir),
355
+ embedding_function=embedding_function,
356
+ collection_name=self.knowledge_base,
357
+ )
358
+
359
+ # Convert Data objects to LangChain Documents
360
+ documents = []
361
+ for data_obj in data_objects:
362
+ doc = data_obj.to_lc_document()
363
+ documents.append(doc)
364
+
365
+ # Add documents to vector store
366
+ if documents:
367
+ chroma.add_documents(documents)
368
+ self.log(f"Added {len(documents)} documents to vector store '{self.knowledge_base}'")
369
+
370
+ except Exception as e:
371
+ if not self.silent_errors:
372
+ raise
373
+ self.log(f"Error creating vector store: {e}")
374
+
375
+ def _convert_df_to_data_objects(self, df_source: pd.DataFrame, config_list: list[dict[str, Any]]) -> list[Data]:
376
+ """Convert DataFrame to Data objects for vector store."""
377
+ data_objects: list[Data] = []
378
+
379
+ # Set up vector store directory
380
+ base_dir = self._get_kb_root()
381
+
382
+ # If we don't allow duplicates, we need to get the existing hashes
383
+ chroma = Chroma(
384
+ persist_directory=str(base_dir / self.knowledge_base),
385
+ collection_name=self.knowledge_base,
386
+ )
387
+
388
+ # Get all documents and their metadata
389
+ all_docs = chroma.get()
390
+
391
+ # Extract all _id values from metadata
392
+ id_list = [metadata.get("_id") for metadata in all_docs["metadatas"] if metadata.get("_id")]
393
+
394
+ # Get column roles
395
+ content_cols = []
396
+ identifier_cols = []
397
+
398
+ for config in config_list:
399
+ col_name = config.get("column_name")
400
+ vectorize = config.get("vectorize") == "True" or config.get("vectorize") is True
401
+ identifier = config.get("identifier") == "True" or config.get("identifier") is True
402
+
403
+ if vectorize:
404
+ content_cols.append(col_name)
405
+ elif identifier:
406
+ identifier_cols.append(col_name)
407
+
408
+ # Convert each row to a Data object
409
+ for _, row in df_source.iterrows():
410
+ # Build content text from vectorized columns using list comprehension
411
+ content_parts = [str(row[col]) for col in content_cols if col in row and pd.notna(row[col])]
412
+
413
+ page_content = " ".join(content_parts)
414
+
415
+ # Build metadata from NON-vectorized columns only (simple key-value pairs)
416
+ data_dict = {
417
+ "text": page_content, # Main content for vectorization
418
+ }
419
+
420
+ # Add metadata columns as simple key-value pairs
421
+ for col in df_source.columns:
422
+ if col not in content_cols and col in row and pd.notna(row[col]):
423
+ # Convert to simple types for Chroma metadata
424
+ value = row[col]
425
+ data_dict[col] = str(value) # Convert complex types to string
426
+
427
+ # Hash the page_content for unique ID
428
+ page_content_hash = hashlib.sha256(page_content.encode()).hexdigest()
429
+ data_dict["_id"] = page_content_hash
430
+
431
+ # If duplicates are disallowed, and hash exists, prevent adding this row
432
+ if not self.allow_duplicates and page_content_hash in id_list:
433
+ self.log(f"Skipping duplicate row with hash {page_content_hash}")
434
+ continue
435
+
436
+ # Create Data object - everything except "text" becomes metadata
437
+ data_obj = Data(data=data_dict)
438
+ data_objects.append(data_obj)
439
+
440
+ return data_objects
441
+
442
+ def is_valid_collection_name(self, name, min_length: int = 3, max_length: int = 63) -> bool:
443
+ """Validates collection name against conditions 1-3.
444
+
445
+ 1. Contains 3-63 characters
446
+ 2. Starts and ends with alphanumeric character
447
+ 3. Contains only alphanumeric characters, underscores, or hyphens.
448
+
449
+ Args:
450
+ name (str): Collection name to validate
451
+ min_length (int): Minimum length of the name
452
+ max_length (int): Maximum length of the name
453
+
454
+ Returns:
455
+ bool: True if valid, False otherwise
456
+ """
457
+ # Check length (condition 1)
458
+ if not (min_length <= len(name) <= max_length):
459
+ return False
460
+
461
+ # Check start/end with alphanumeric (condition 2)
462
+ if not (name[0].isalnum() and name[-1].isalnum()):
463
+ return False
464
+
465
+ # Check allowed characters (condition 3)
466
+ return re.match(r"^[a-zA-Z0-9_-]+$", name) is not None
467
+
468
+ # ---------------------------------------------------------------------
469
+ # OUTPUT METHODS
470
+ # ---------------------------------------------------------------------
471
+ def build_kb_info(self) -> Data:
472
+ """Main ingestion routine → returns a dict with KB metadata."""
473
+ try:
474
+ # Get source DataFrame
475
+ df_source: pd.DataFrame = self.input_df
476
+
477
+ # Validate column configuration (using Structured Output patterns)
478
+ config_list = self._validate_column_config(df_source)
479
+ column_metadata = self._build_column_metadata(config_list, df_source)
480
+
481
+ # Prepare KB folder (using File Component patterns)
482
+ kb_root = self._get_kb_root()
483
+ kb_path = kb_root / self.knowledge_base
484
+
485
+ # Read the embedding info from the knowledge base folder
486
+ metadata_path = kb_path / "embedding_metadata.json"
487
+
488
+ # If the API key is not provided, try to read it from the metadata file
489
+ if metadata_path.exists():
490
+ settings_service = get_settings_service()
491
+ metadata = json.loads(metadata_path.read_text())
492
+ embedding_model = metadata.get("embedding_model")
493
+ try:
494
+ api_key = decrypt_api_key(metadata["api_key"], settings_service)
495
+ except (InvalidToken, TypeError, ValueError) as e:
496
+ logger.error(f"Could not decrypt API key. Please provide it manually. Error: {e}")
497
+
498
+ # Check if a custom API key was provided, update metadata if so
499
+ if self.api_key:
500
+ api_key = self.api_key
501
+ self._save_embedding_metadata(
502
+ kb_path=kb_path,
503
+ embedding_model=embedding_model,
504
+ api_key=api_key,
505
+ )
506
+
507
+ # Create vector store following Local DB component pattern
508
+ self._create_vector_store(df_source, config_list, embedding_model=embedding_model, api_key=api_key)
509
+
510
+ # Save KB files (using File Component storage patterns)
511
+ self._save_kb_files(kb_path, config_list)
512
+
513
+ # Build metadata response
514
+ meta: dict[str, Any] = {
515
+ "kb_id": str(uuid.uuid4()),
516
+ "kb_name": self.knowledge_base,
517
+ "rows": len(df_source),
518
+ "column_metadata": column_metadata,
519
+ "path": str(kb_path),
520
+ "config_columns": len(config_list),
521
+ "timestamp": datetime.now(tz=timezone.utc).isoformat(),
522
+ }
523
+
524
+ # Set status message
525
+ self.status = f"✅ KB **{self.knowledge_base}** saved · {len(df_source)} chunks."
526
+
527
+ return Data(data=meta)
528
+
529
+ except Exception as e:
530
+ if not self.silent_errors:
531
+ raise
532
+ self.log(f"Error in KB ingestion: {e}")
533
+ self.status = f"❌ KB ingestion failed: {e}"
534
+ return Data(data={"error": str(e), "kb_name": self.knowledge_base})
535
+
536
+ def _get_knowledge_bases(self) -> list[str]:
537
+ """Retrieve a list of available knowledge bases.
538
+
539
+ Returns:
540
+ A list of knowledge base names.
541
+ """
542
+ # Return the list of directories in the knowledge base root path
543
+ kb_root_path = self._get_kb_root()
544
+
545
+ if not kb_root_path.exists():
546
+ return []
547
+
548
+ return [str(d.name) for d in kb_root_path.iterdir() if not d.name.startswith(".") and d.is_dir()]
549
+
550
    def update_build_config(self, build_config: dotdict, field_value: Any, field_name: str | None = None) -> dotdict:
        """Update build configuration based on provider selection.

        Handles two cases for the `knowledge_base` field: (1) a dict payload
        from the "create new knowledge" dialog, which validates the name,
        smoke-tests the API key, creates the KB directory and saves its
        embedding metadata; (2) any refresh, which re-reads the KB options
        from disk and clears a stale selection.
        """
        # Create a new knowledge base
        if field_name == "knowledge_base":
            if isinstance(field_value, dict) and "01_new_kb_name" in field_value:
                # Validate the knowledge base name - Make sure it follows these rules:
                if not self.is_valid_collection_name(field_value["01_new_kb_name"]):
                    msg = f"Invalid knowledge base name: {field_value['01_new_kb_name']}"
                    raise ValueError(msg)

                # We need to test the API Key one time against the embedding model
                embed_model = self._build_embeddings(
                    embedding_model=field_value["02_embedding_model"], api_key=field_value["03_api_key"]
                )

                # Try to generate a dummy embedding to validate the API key
                # (raises on an invalid key before any directory is created).
                embed_model.embed_query("test")

                # Create the new knowledge base directory
                kb_path = KNOWLEDGE_BASES_ROOT_PATH / field_value["01_new_kb_name"]
                kb_path.mkdir(parents=True, exist_ok=True)

                # Save the embedding metadata
                build_config["knowledge_base"]["value"] = field_value["01_new_kb_name"]
                self._save_embedding_metadata(
                    kb_path=kb_path,
                    embedding_model=field_value["02_embedding_model"],
                    api_key=field_value["03_api_key"],
                )

            # Update the knowledge base options dynamically
            build_config["knowledge_base"]["options"] = self._get_knowledge_bases()
            if build_config["knowledge_base"]["value"] not in build_config["knowledge_base"]["options"]:
                build_config["knowledge_base"]["value"] = None

        return build_config