lfx-nightly 0.1.12.dev14__py3-none-any.whl → 0.1.12.dev16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lfx-nightly might be problematic.

Files changed (130)
  1. lfx/base/agents/events.py +40 -29
  2. lfx/base/constants.py +1 -1
  3. lfx/base/data/docling_utils.py +43 -8
  4. lfx/base/data/utils.py +3 -3
  5. lfx/base/knowledge_bases/__init__.py +3 -0
  6. lfx/base/knowledge_bases/knowledge_base_utils.py +137 -0
  7. lfx/base/models/anthropic_constants.py +3 -1
  8. lfx/base/models/model_input_constants.py +1 -1
  9. lfx/base/vectorstores/vector_store_connection_decorator.py +1 -1
  10. lfx/components/agentql/agentql_api.py +1 -1
  11. lfx/components/agents/agent.py +62 -17
  12. lfx/components/agents/mcp_component.py +11 -1
  13. lfx/components/aiml/aiml.py +4 -1
  14. lfx/components/amazon/amazon_bedrock_converse.py +196 -0
  15. lfx/components/amazon/amazon_bedrock_model.py +5 -1
  16. lfx/components/azure/azure_openai.py +1 -1
  17. lfx/components/azure/azure_openai_embeddings.py +1 -1
  18. lfx/components/chroma/chroma.py +4 -2
  19. lfx/components/clickhouse/clickhouse.py +1 -1
  20. lfx/components/confluence/confluence.py +1 -1
  21. lfx/components/crewai/crewai.py +1 -0
  22. lfx/components/crewai/hierarchical_crew.py +1 -0
  23. lfx/components/crewai/hierarchical_task.py +1 -0
  24. lfx/components/crewai/sequential_crew.py +1 -0
  25. lfx/components/crewai/sequential_task.py +1 -0
  26. lfx/components/crewai/sequential_task_agent.py +1 -0
  27. lfx/components/data/api_request.py +13 -3
  28. lfx/components/data/csv_to_data.py +1 -0
  29. lfx/components/data/file.py +71 -25
  30. lfx/components/data/json_to_data.py +1 -0
  31. lfx/components/datastax/astra_db.py +2 -1
  32. lfx/components/datastax/astra_vectorize.py +3 -5
  33. lfx/components/datastax/astradb_tool.py +5 -1
  34. lfx/components/datastax/astradb_vectorstore.py +8 -1
  35. lfx/components/deactivated/chat_litellm_model.py +1 -1
  36. lfx/components/deactivated/metal.py +1 -1
  37. lfx/components/docling/docling_inline.py +23 -9
  38. lfx/components/elastic/elasticsearch.py +1 -1
  39. lfx/components/elastic/opensearch.py +1 -1
  40. lfx/components/embeddings/similarity.py +1 -0
  41. lfx/components/embeddings/text_embedder.py +1 -0
  42. lfx/components/firecrawl/firecrawl_crawl_api.py +1 -1
  43. lfx/components/firecrawl/firecrawl_extract_api.py +1 -1
  44. lfx/components/firecrawl/firecrawl_map_api.py +1 -1
  45. lfx/components/firecrawl/firecrawl_scrape_api.py +1 -1
  46. lfx/components/google/gmail.py +1 -0
  47. lfx/components/google/google_generative_ai_embeddings.py +1 -1
  48. lfx/components/helpers/memory.py +8 -6
  49. lfx/components/helpers/output_parser.py +1 -0
  50. lfx/components/helpers/store_message.py +1 -0
  51. lfx/components/huggingface/huggingface.py +3 -1
  52. lfx/components/huggingface/huggingface_inference_api.py +1 -1
  53. lfx/components/ibm/watsonx.py +1 -1
  54. lfx/components/ibm/watsonx_embeddings.py +1 -1
  55. lfx/components/icosacomputing/combinatorial_reasoner.py +1 -1
  56. lfx/components/input_output/chat.py +0 -27
  57. lfx/components/input_output/chat_output.py +3 -27
  58. lfx/components/knowledge_bases/__init__.py +34 -0
  59. lfx/components/knowledge_bases/ingestion.py +686 -0
  60. lfx/components/knowledge_bases/retrieval.py +256 -0
  61. lfx/components/langchain_utilities/langchain_hub.py +1 -1
  62. lfx/components/langwatch/langwatch.py +1 -1
  63. lfx/components/logic/conditional_router.py +40 -3
  64. lfx/components/logic/data_conditional_router.py +1 -0
  65. lfx/components/logic/flow_tool.py +2 -1
  66. lfx/components/logic/pass_message.py +1 -0
  67. lfx/components/logic/sub_flow.py +2 -1
  68. lfx/components/milvus/milvus.py +1 -1
  69. lfx/components/olivya/olivya.py +1 -1
  70. lfx/components/processing/alter_metadata.py +1 -0
  71. lfx/components/processing/combine_text.py +1 -0
  72. lfx/components/processing/create_data.py +1 -0
  73. lfx/components/processing/data_to_dataframe.py +1 -0
  74. lfx/components/processing/extract_key.py +1 -0
  75. lfx/components/processing/filter_data.py +1 -0
  76. lfx/components/processing/filter_data_values.py +1 -0
  77. lfx/components/processing/json_cleaner.py +1 -0
  78. lfx/components/processing/merge_data.py +1 -0
  79. lfx/components/processing/message_to_data.py +1 -0
  80. lfx/components/processing/parse_data.py +1 -0
  81. lfx/components/processing/parse_dataframe.py +1 -0
  82. lfx/components/processing/parse_json_data.py +1 -0
  83. lfx/components/processing/python_repl_core.py +2 -2
  84. lfx/components/processing/regex.py +1 -0
  85. lfx/components/processing/select_data.py +1 -0
  86. lfx/components/processing/structured_output.py +7 -3
  87. lfx/components/processing/update_data.py +1 -0
  88. lfx/components/prototypes/__init__.py +8 -7
  89. lfx/components/qdrant/qdrant.py +1 -1
  90. lfx/components/redis/redis_chat.py +1 -1
  91. lfx/components/tools/__init__.py +0 -6
  92. lfx/components/tools/calculator.py +2 -1
  93. lfx/components/tools/python_code_structured_tool.py +1 -0
  94. lfx/components/tools/python_repl.py +2 -1
  95. lfx/components/tools/search_api.py +2 -1
  96. lfx/components/tools/serp_api.py +2 -1
  97. lfx/components/tools/tavily_search_tool.py +1 -0
  98. lfx/components/tools/wikidata_api.py +2 -1
  99. lfx/components/tools/wikipedia_api.py +2 -1
  100. lfx/components/tools/yahoo_finance.py +2 -1
  101. lfx/components/twelvelabs/video_embeddings.py +1 -1
  102. lfx/components/upstash/upstash.py +1 -1
  103. lfx/components/vectorstores/astradb_graph.py +8 -1
  104. lfx/components/vectorstores/local_db.py +1 -0
  105. lfx/components/vectorstores/weaviate.py +1 -1
  106. lfx/components/wolframalpha/wolfram_alpha_api.py +1 -1
  107. lfx/components/zep/zep.py +2 -1
  108. lfx/custom/attributes.py +1 -0
  109. lfx/custom/validate.py +1 -1
  110. lfx/graph/graph/base.py +61 -4
  111. lfx/inputs/inputs.py +1 -0
  112. lfx/log/logger.py +31 -11
  113. lfx/schema/message.py +6 -1
  114. lfx/schema/schema.py +4 -0
  115. lfx/services/__init__.py +3 -0
  116. lfx/services/mcp_composer/__init__.py +6 -0
  117. lfx/services/mcp_composer/factory.py +16 -0
  118. lfx/services/mcp_composer/service.py +599 -0
  119. lfx/services/schema.py +1 -0
  120. lfx/services/settings/auth.py +18 -15
  121. lfx/services/settings/base.py +38 -0
  122. lfx/services/settings/constants.py +4 -1
  123. lfx/services/settings/feature_flags.py +0 -1
  124. lfx/template/frontend_node/base.py +2 -0
  125. lfx/utils/image.py +1 -1
  126. {lfx_nightly-0.1.12.dev14.dist-info → lfx_nightly-0.1.12.dev16.dist-info}/METADATA +1 -1
  127. {lfx_nightly-0.1.12.dev14.dist-info → lfx_nightly-0.1.12.dev16.dist-info}/RECORD +129 -121
  128. lfx/components/datastax/astradb.py +0 -1285
  129. {lfx_nightly-0.1.12.dev14.dist-info → lfx_nightly-0.1.12.dev16.dist-info}/WHEEL +0 -0
  130. {lfx_nightly-0.1.12.dev14.dist-info → lfx_nightly-0.1.12.dev16.dist-info}/entry_points.txt +0 -0
lfx/components/knowledge_bases/ingestion.py (new file)
@@ -0,0 +1,686 @@
+ from __future__ import annotations
+
+ import asyncio
+ import contextlib
+ import hashlib
+ import json
+ import re
+ import uuid
+ from dataclasses import asdict, dataclass, field
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any
+
+ import pandas as pd
+ from cryptography.fernet import InvalidToken
+ from langchain_chroma import Chroma
+ from langflow.services.auth.utils import decrypt_api_key, encrypt_api_key
+ from langflow.services.database.models.user.crud import get_user_by_id
+
+ from lfx.base.knowledge_bases.knowledge_base_utils import get_knowledge_bases
+ from lfx.base.models.openai_constants import OPENAI_EMBEDDING_MODEL_NAMES
+ from lfx.components.processing.converter import convert_to_dataframe
+ from lfx.custom import Component
+ from lfx.io import (
+     BoolInput,
+     DropdownInput,
+     HandleInput,
+     IntInput,
+     Output,
+     SecretStrInput,
+     StrInput,
+     TableInput,
+ )
+ from lfx.schema.data import Data
+ from lfx.schema.table import EditMode
+ from lfx.services.deps import (
+     get_settings_service,
+     get_variable_service,
+     session_scope,
+ )
+
+ if TYPE_CHECKING:
+     from lfx.schema.dataframe import DataFrame
+
+ HUGGINGFACE_MODEL_NAMES = [
+     "sentence-transformers/all-MiniLM-L6-v2",
+     "sentence-transformers/all-mpnet-base-v2",
+ ]
+ COHERE_MODEL_NAMES = ["embed-english-v3.0", "embed-multilingual-v3.0"]
+
+ settings = get_settings_service().settings
+ knowledge_directory = settings.knowledge_bases_dir
+ if not knowledge_directory:
+     msg = "Knowledge bases directory is not set in the settings."
+     raise ValueError(msg)
+ KNOWLEDGE_BASES_ROOT_PATH = Path(knowledge_directory).expanduser()
+
+
59
+ class KnowledgeIngestionComponent(Component):
+     """Create or append to Langflow Knowledge from a DataFrame."""
+
+     # ------ UI metadata ---------------------------------------------------
+     display_name = "Knowledge Ingestion"
+     description = "Create or update knowledge in Langflow."
+     icon = "upload"
+     name = "KnowledgeIngestion"
+
+     def __init__(self, *args, **kwargs) -> None:
+         super().__init__(*args, **kwargs)
+         self._cached_kb_path: Path | None = None
+
+     @dataclass
+     class NewKnowledgeBaseInput:
+         functionality: str = "create"
+         fields: dict[str, dict] = field(
+             default_factory=lambda: {
+                 "data": {
+                     "node": {
+                         "name": "create_knowledge_base",
+                         "description": "Create new knowledge in Langflow.",
+                         "display_name": "Create new knowledge",
+                         "field_order": [
+                             "01_new_kb_name",
+                             "02_embedding_model",
+                             "03_api_key",
+                         ],
+                         "template": {
+                             "01_new_kb_name": StrInput(
+                                 name="new_kb_name",
+                                 display_name="Knowledge Name",
+                                 info="Name of the new knowledge to create.",
+                                 required=True,
+                             ),
+                             "02_embedding_model": DropdownInput(
+                                 name="embedding_model",
+                                 display_name="Choose Embedding",
+                                 info="Select the embedding model to use for this knowledge base.",
+                                 required=True,
+                                 options=OPENAI_EMBEDDING_MODEL_NAMES + HUGGINGFACE_MODEL_NAMES + COHERE_MODEL_NAMES,
+                                 options_metadata=[{"icon": "OpenAI"} for _ in OPENAI_EMBEDDING_MODEL_NAMES]
+                                 + [{"icon": "HuggingFace"} for _ in HUGGINGFACE_MODEL_NAMES]
+                                 + [{"icon": "Cohere"} for _ in COHERE_MODEL_NAMES],
+                             ),
+                             "03_api_key": SecretStrInput(
+                                 name="api_key",
+                                 display_name="API Key",
+                                 info="Provider API key for embedding model",
+                                 required=True,
+                                 load_from_db=False,
+                             ),
+                         },
+                     },
+                 }
+             }
+         )
+
+     # ------ Inputs --------------------------------------------------------
+     inputs = [
+         DropdownInput(
+             name="knowledge_base",
+             display_name="Knowledge",
+             info="Select the knowledge to load data from.",
+             required=True,
+             options=[],
+             refresh_button=True,
+             real_time_refresh=True,
+             dialog_inputs=asdict(NewKnowledgeBaseInput()),
+         ),
+         HandleInput(
+             name="input_df",
+             display_name="Input",
+             info=(
+                 "Table with all original columns (already chunked / processed). "
+                 "Accepts Data or DataFrame. If Data is provided, it is converted to a DataFrame automatically."
+             ),
+             input_types=["Data", "DataFrame"],
+             required=True,
+         ),
+         TableInput(
+             name="column_config",
+             display_name="Column Configuration",
+             info="Configure column behavior for the knowledge base.",
+             required=True,
+             table_schema=[
+                 {
+                     "name": "column_name",
+                     "display_name": "Column Name",
+                     "type": "str",
+                     "description": "Name of the column in the source DataFrame",
+                     "edit_mode": EditMode.INLINE,
+                 },
+                 {
+                     "name": "vectorize",
+                     "display_name": "Vectorize",
+                     "type": "boolean",
+                     "description": "Create embeddings for this column",
+                     "default": False,
+                     "edit_mode": EditMode.INLINE,
+                 },
+                 {
+                     "name": "identifier",
+                     "display_name": "Identifier",
+                     "type": "boolean",
+                     "description": "Use this column as unique identifier",
+                     "default": False,
+                     "edit_mode": EditMode.INLINE,
+                 },
+             ],
+             value=[
+                 {
+                     "column_name": "text",
+                     "vectorize": True,
+                     "identifier": True,
+                 },
+             ],
+         ),
+         IntInput(
+             name="chunk_size",
+             display_name="Chunk Size",
+             info="Batch size for processing embeddings",
+             advanced=True,
+             value=1000,
+         ),
+         SecretStrInput(
+             name="api_key",
+             display_name="Embedding Provider API Key",
+             info="API key for the embedding provider to generate embeddings.",
+             advanced=True,
+             required=False,
+         ),
+         BoolInput(
+             name="allow_duplicates",
+             display_name="Allow Duplicates",
+             info="Allow duplicate rows in the knowledge base",
+             advanced=True,
+             value=False,
+         ),
+     ]
+
+     # ------ Outputs -------------------------------------------------------
+     outputs = [Output(display_name="Results", name="dataframe_output", method="build_kb_info")]
+
203
+     # ------ Internal helpers ---------------------------------------------
+     def _get_kb_root(self) -> Path:
+         """Return the root directory for knowledge bases."""
+         return KNOWLEDGE_BASES_ROOT_PATH
+
+     def _validate_column_config(self, df_source: pd.DataFrame) -> list[dict[str, Any]]:
+         """Validate column configuration using Structured Output patterns."""
+         if not self.column_config:
+             msg = "Column configuration cannot be empty"
+             raise ValueError(msg)
+
+         # Convert table input to list of dicts (similar to Structured Output)
+         config_list = self.column_config if isinstance(self.column_config, list) else []
+
+         # Validate column names exist in DataFrame
+         df_columns = set(df_source.columns)
+         for config in config_list:
+             col_name = config.get("column_name")
+             if col_name not in df_columns:
+                 msg = f"Column '{col_name}' not found in DataFrame. Available columns: {sorted(df_columns)}"
+                 raise ValueError(msg)
+
+         return config_list
+
+     def _get_embedding_provider(self, embedding_model: str) -> str:
+         """Get embedding provider by matching model name to lists."""
+         if embedding_model in OPENAI_EMBEDDING_MODEL_NAMES:
+             return "OpenAI"
+         if embedding_model in HUGGINGFACE_MODEL_NAMES:
+             return "HuggingFace"
+         if embedding_model in COHERE_MODEL_NAMES:
+             return "Cohere"
+         return "Custom"
+
+     def _build_embeddings(self, embedding_model: str, api_key: str):
+         """Build embedding model using provider patterns."""
+         # Get provider by matching model name to lists
+         provider = self._get_embedding_provider(embedding_model)
+
+         # Validate provider and model
+         if provider == "OpenAI":
+             from langchain_openai import OpenAIEmbeddings
+
+             if not api_key:
+                 msg = "OpenAI API key is required when using OpenAI provider"
+                 raise ValueError(msg)
+             return OpenAIEmbeddings(
+                 model=embedding_model,
+                 api_key=api_key,
+                 chunk_size=self.chunk_size,
+             )
+         if provider == "HuggingFace":
+             from langchain_huggingface import HuggingFaceEmbeddings
+
+             return HuggingFaceEmbeddings(
+                 model=embedding_model,
+             )
+         if provider == "Cohere":
+             from langchain_cohere import CohereEmbeddings
+
+             if not api_key:
+                 msg = "Cohere API key is required when using Cohere provider"
+                 raise ValueError(msg)
+             return CohereEmbeddings(
+                 model=embedding_model,
+                 cohere_api_key=api_key,
+             )
+         if provider == "Custom":
+             # For custom embedding models, we would need additional configuration
+             msg = "Custom embedding models not yet supported"
+             raise NotImplementedError(msg)
+         msg = f"Unknown provider: {provider}"
+         raise ValueError(msg)
+
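Provider selection above is driven entirely by model-name membership in the three constant lists, so any unrecognized name falls through to "Custom" and _build_embeddings raises NotImplementedError. A quick illustration of the dispatch, with comp standing in for a hypothetical component instance, and assuming "text-embedding-3-small" appears in OPENAI_EMBEDDING_MODEL_NAMES:

    comp._get_embedding_provider("text-embedding-3-small")                  # "OpenAI" (assumed list membership)
    comp._get_embedding_provider("sentence-transformers/all-MiniLM-L6-v2")  # "HuggingFace"
    comp._get_embedding_provider("embed-english-v3.0")                      # "Cohere"
    comp._get_embedding_provider("some-unlisted-model")                     # "Custom"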
277
+     def _build_embedding_metadata(self, embedding_model, api_key) -> dict[str, Any]:
+         """Build embedding model metadata."""
+         # Get provider by matching model name to lists
+         embedding_provider = self._get_embedding_provider(embedding_model)
+
+         api_key_to_save = None
+         if api_key and hasattr(api_key, "get_secret_value"):
+             api_key_to_save = api_key.get_secret_value()
+         elif isinstance(api_key, str):
+             api_key_to_save = api_key
+
+         encrypted_api_key = None
+         if api_key_to_save:
+             settings_service = get_settings_service()
+             try:
+                 encrypted_api_key = encrypt_api_key(api_key_to_save, settings_service=settings_service)
+             except (TypeError, ValueError) as e:
+                 self.log(f"Could not encrypt API key: {e}")
+
+         return {
+             "embedding_provider": embedding_provider,
+             "embedding_model": embedding_model,
+             "api_key": encrypted_api_key,
+             "api_key_used": bool(api_key),
+             "chunk_size": self.chunk_size,
+             "created_at": datetime.now(timezone.utc).isoformat(),
+         }
+
+     def _save_embedding_metadata(self, kb_path: Path, embedding_model: str, api_key: str) -> None:
+         """Save embedding model metadata."""
+         embedding_metadata = self._build_embedding_metadata(embedding_model, api_key)
+         metadata_path = kb_path / "embedding_metadata.json"
+         metadata_path.write_text(json.dumps(embedding_metadata, indent=2))
+
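For reference, the embedding_metadata.json written here mirrors the dict built by _build_embedding_metadata; an illustrative example with hypothetical values (the api_key field stores the Fernet-encrypted key, or null if no key was given or encryption failed):

    {
      "embedding_provider": "OpenAI",
      "embedding_model": "text-embedding-3-small",
      "api_key": "gAAAAAB...",
      "api_key_used": true,
      "chunk_size": 1000,
      "created_at": "2025-01-01T00:00:00+00:00"
    }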
311
+     def _save_kb_files(
+         self,
+         kb_path: Path,
+         config_list: list[dict[str, Any]],
+     ) -> None:
+         """Save KB files using File Component storage patterns."""
+         try:
+             # Create directory (following File Component patterns)
+             kb_path.mkdir(parents=True, exist_ok=True)
+
+             # Save column configuration
+             # Only do this if the file doesn't exist already
+             cfg_path = kb_path / "schema.json"
+             if not cfg_path.exists():
+                 cfg_path.write_text(json.dumps(config_list, indent=2))
+
+         except (OSError, TypeError, ValueError) as e:
+             self.log(f"Error saving KB files: {e}")
+
+     def _build_column_metadata(self, config_list: list[dict[str, Any]], df_source: pd.DataFrame) -> dict[str, Any]:
+         """Build detailed column metadata."""
+         metadata: dict[str, Any] = {
+             "total_columns": len(df_source.columns),
+             "mapped_columns": len(config_list),
+             "unmapped_columns": len(df_source.columns) - len(config_list),
+             "columns": [],
+             "summary": {"vectorized_columns": [], "identifier_columns": []},
+         }
+
+         for config in config_list:
+             col_name = config.get("column_name")
+             vectorize = config.get("vectorize") == "True" or config.get("vectorize") is True
+             identifier = config.get("identifier") == "True" or config.get("identifier") is True
+
+             # Add to columns list
+             metadata["columns"].append(
+                 {
+                     "name": col_name,
+                     "vectorize": vectorize,
+                     "identifier": identifier,
+                 }
+             )
+
+             # Update summary
+             if vectorize:
+                 metadata["summary"]["vectorized_columns"].append(col_name)
+             if identifier:
+                 metadata["summary"]["identifier_columns"].append(col_name)
+
+         return metadata
+
362
+     async def _create_vector_store(
+         self,
+         df_source: pd.DataFrame,
+         config_list: list[dict[str, Any]],
+         embedding_model: str,
+         api_key: str,
+     ) -> None:
+         """Create vector store following Local DB component pattern."""
+         try:
+             # Set up vector store directory
+             vector_store_dir = await self._kb_path()
+             if not vector_store_dir:
+                 msg = "Knowledge base path is not set. Please create a new knowledge base first."
+                 raise ValueError(msg)
+             vector_store_dir.mkdir(parents=True, exist_ok=True)
+
+             # Create embeddings model
+             embedding_function = self._build_embeddings(embedding_model, api_key)
+
+             # Convert DataFrame to Data objects (following Local DB pattern)
+             data_objects = await self._convert_df_to_data_objects(df_source, config_list)
+
+             # Create vector store
+             chroma = Chroma(
+                 persist_directory=str(vector_store_dir),
+                 embedding_function=embedding_function,
+                 collection_name=self.knowledge_base,
+             )
+
+             # Convert Data objects to LangChain Documents
+             documents = []
+             for data_obj in data_objects:
+                 doc = data_obj.to_lc_document()
+                 documents.append(doc)
+
+             # Add documents to vector store
+             if documents:
+                 chroma.add_documents(documents)
+                 self.log(f"Added {len(documents)} documents to vector store '{self.knowledge_base}'")
+
+         except (OSError, ValueError, RuntimeError) as e:
+             self.log(f"Error creating vector store: {e}")
+
+     async def _convert_df_to_data_objects(
+         self, df_source: pd.DataFrame, config_list: list[dict[str, Any]]
+     ) -> list[Data]:
+         """Convert DataFrame to Data objects for vector store."""
+         data_objects: list[Data] = []
+
+         # Set up vector store directory
+         kb_path = await self._kb_path()
+
+         # If we don't allow duplicates, we need to get the existing hashes
+         chroma = Chroma(
+             persist_directory=str(kb_path),
+             collection_name=self.knowledge_base,
+         )
+
+         # Get all documents and their metadata
+         all_docs = chroma.get()
+
+         # Extract all _id values from metadata
+         id_list = [metadata.get("_id") for metadata in all_docs["metadatas"] if metadata.get("_id")]
+
+         # Get column roles
+         content_cols = []
+         identifier_cols = []
+
+         for config in config_list:
+             col_name = config.get("column_name")
+             vectorize = config.get("vectorize") == "True" or config.get("vectorize") is True
+             identifier = config.get("identifier") == "True" or config.get("identifier") is True
+
+             if vectorize:
+                 content_cols.append(col_name)
+             elif identifier:
+                 identifier_cols.append(col_name)
+
+         # Convert each row to a Data object
+         for _, row in df_source.iterrows():
+             # Build content text from identifier columns using list comprehension
+             identifier_parts = [str(row[col]) for col in content_cols if col in row and pd.notna(row[col])]
+
+             # Join all parts into a single string
+             page_content = " ".join(identifier_parts)
+
+             # Build metadata from NON-vectorized columns only (simple key-value pairs)
+             data_dict = {
+                 "text": page_content,  # Main content for vectorization
+             }
+
+             # Add identifier columns if they exist
+             if identifier_cols:
+                 identifier_parts = [str(row[col]) for col in identifier_cols if col in row and pd.notna(row[col])]
+                 page_content = " ".join(identifier_parts)
+
+             # Add metadata columns as simple key-value pairs
+             for col in df_source.columns:
+                 if col not in content_cols and col in row and pd.notna(row[col]):
+                     # Convert to simple types for Chroma metadata
+                     value = row[col]
+                     data_dict[col] = str(value)  # Convert complex types to string
+
+             # Hash the page_content for unique ID
+             page_content_hash = hashlib.sha256(page_content.encode()).hexdigest()
+             data_dict["_id"] = page_content_hash
+
+             # If duplicates are disallowed, and hash exists, prevent adding this row
+             if not self.allow_duplicates and page_content_hash in id_list:
+                 self.log(f"Skipping duplicate row with hash {page_content_hash}")
+                 continue
+
+             # Create Data object - everything except "text" becomes metadata
+             data_obj = Data(data=data_dict)
+             data_objects.append(data_obj)
+
+         return data_objects
+
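Note that deduplication is keyed on a SHA-256 digest of the joined content columns only, so two rows that differ solely in metadata columns still collide, while any change to the vectorized text yields a new hash. A minimal standalone sketch of the scheme:

    import hashlib

    def content_id(page_content: str) -> str:
        # Same scheme as above: hex digest of the joined content columns
        return hashlib.sha256(page_content.encode()).hexdigest()

    existing_ids = {content_id("alpha beta")}
    print(content_id("alpha beta") in existing_ids)   # True: row is skipped unless allow_duplicates
    print(content_id("alpha  beta") in existing_ids)  # False: even a whitespace change produces a new _id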
480
+     def is_valid_collection_name(self, name, min_length: int = 3, max_length: int = 63) -> bool:
+         """Validates collection name against conditions 1-3.
+
+         1. Contains 3-63 characters
+         2. Starts and ends with alphanumeric character
+         3. Contains only alphanumeric characters, underscores, or hyphens.
+
+         Args:
+             name (str): Collection name to validate
+             min_length (int): Minimum length of the name
+             max_length (int): Maximum length of the name
+
+         Returns:
+             bool: True if valid, False otherwise
+         """
+         # Check length (condition 1)
+         if not (min_length <= len(name) <= max_length):
+             return False
+
+         # Check start/end with alphanumeric (condition 2)
+         if not (name[0].isalnum() and name[-1].isalnum()):
+             return False
+
+         # Check allowed characters (condition 3)
+         return re.match(r"^[a-zA-Z0-9_-]+$", name) is not None
+
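Applying the three rules, with comp again standing in for a hypothetical component instance:

    comp.is_valid_collection_name("my_kb-01")  # True
    comp.is_valid_collection_name("ab")        # False: fewer than 3 characters
    comp.is_valid_collection_name("_private")  # False: must start with an alphanumeric
    comp.is_valid_collection_name("kb.name")   # False: "." is outside [a-zA-Z0-9_-]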
506
+     async def _kb_path(self) -> Path | None:
+         # Check if we already have the path cached
+         cached_path = getattr(self, "_cached_kb_path", None)
+         if cached_path is not None:
+             return cached_path
+
+         # If not cached, compute it
+         async with session_scope() as db:
+             if not self.user_id:
+                 msg = "User ID is required for fetching knowledge base path."
+                 raise ValueError(msg)
+             current_user = await get_user_by_id(db, self.user_id)
+             if not current_user:
+                 msg = f"User with ID {self.user_id} not found."
+                 raise ValueError(msg)
+             kb_user = current_user.username
+
+         kb_root = self._get_kb_root()
+
+         # Cache the result
+         self._cached_kb_path = kb_root / kb_user / self.knowledge_base
+
+         return self._cached_kb_path
+
+     # ---------------------------------------------------------------------
+     # OUTPUT METHODS
+     # ---------------------------------------------------------------------
+     async def build_kb_info(self) -> Data:
+         """Main ingestion routine → returns a dict with KB metadata."""
+         try:
+             input_value = self.input_df[0] if isinstance(self.input_df, list) else self.input_df
+             df_source: DataFrame = convert_to_dataframe(input_value, auto_parse=False)
+
+             # Validate column configuration (using Structured Output patterns)
+             config_list = self._validate_column_config(df_source)
+             column_metadata = self._build_column_metadata(config_list, df_source)
+
+             # Read the embedding info from the knowledge base folder
+             kb_path = await self._kb_path()
+             if not kb_path:
+                 msg = "Knowledge base path is not set. Please create a new knowledge base first."
+                 raise ValueError(msg)
+             metadata_path = kb_path / "embedding_metadata.json"
+
+             # If the API key is not provided, try to read it from the metadata file
+             if metadata_path.exists():
+                 settings_service = get_settings_service()
+                 metadata = json.loads(metadata_path.read_text())
+                 embedding_model = metadata.get("embedding_model")
+                 try:
+                     api_key = decrypt_api_key(metadata["api_key"], settings_service)
+                 except (InvalidToken, TypeError, ValueError) as e:
+                     self.log(f"Could not decrypt API key. Please provide it manually. Error: {e}")
+
+             # Check if a custom API key was provided, update metadata if so
+             if self.api_key:
+                 api_key = self.api_key
+                 self._save_embedding_metadata(
+                     kb_path=kb_path,
+                     embedding_model=embedding_model,
+                     api_key=api_key,
+                 )
+
+             # Create vector store following Local DB component pattern
+             await self._create_vector_store(df_source, config_list, embedding_model=embedding_model, api_key=api_key)
+
+             # Save KB files (using File Component storage patterns)
+             self._save_kb_files(kb_path, config_list)
+
+             # Build metadata response
+             meta: dict[str, Any] = {
+                 "kb_id": str(uuid.uuid4()),
+                 "kb_name": self.knowledge_base,
+                 "rows": len(df_source),
+                 "column_metadata": column_metadata,
+                 "path": str(kb_path),
+                 "config_columns": len(config_list),
+                 "timestamp": datetime.now(tz=timezone.utc).isoformat(),
+             }
+
+             # Set status message
+             self.status = f"✅ KB **{self.knowledge_base}** saved · {len(df_source)} chunks."
+
+             return Data(data=meta)
+
+         except (OSError, ValueError, RuntimeError, KeyError) as e:
+             msg = f"Error during KB ingestion: {e}"
+             raise RuntimeError(msg) from e
+
595
+     async def _get_api_key_variable(self, field_value: dict[str, Any]):
+         async with session_scope() as db:
+             if not self.user_id:
+                 msg = "User ID is required for fetching global variables."
+                 raise ValueError(msg)
+             current_user = await get_user_by_id(db, self.user_id)
+             if not current_user:
+                 msg = f"User with ID {self.user_id} not found."
+                 raise ValueError(msg)
+             variable_service = get_variable_service()
+
+             # Process the api_key field variable
+             return await variable_service.get_variable(
+                 user_id=current_user.id,
+                 name=field_value["03_api_key"],
+                 field="",
+                 session=db,
+             )
+
+     async def update_build_config(
+         self,
+         build_config,
+         field_value: Any,
+         field_name: str | None = None,
+     ):
+         """Update build configuration based on provider selection."""
+         # Create a new knowledge base
+         if field_name == "knowledge_base":
+             async with session_scope() as db:
+                 if not self.user_id:
+                     msg = "User ID is required for fetching knowledge base list."
+                     raise ValueError(msg)
+                 current_user = await get_user_by_id(db, self.user_id)
+                 if not current_user:
+                     msg = f"User with ID {self.user_id} not found."
+                     raise ValueError(msg)
+                 kb_user = current_user.username
+             if isinstance(field_value, dict) and "01_new_kb_name" in field_value:
+                 # Validate the knowledge base name - Make sure it follows these rules:
+                 if not self.is_valid_collection_name(field_value["01_new_kb_name"]):
+                     msg = f"Invalid knowledge base name: {field_value['01_new_kb_name']}"
+                     raise ValueError(msg)
+
+                 api_key = field_value.get("03_api_key", None)
+                 with contextlib.suppress(Exception):
+                     # If the API key is a variable, resolve it
+                     api_key = await self._get_api_key_variable(field_value)
+
+                 # Make sure api_key is a string
+                 if not isinstance(api_key, str):
+                     msg = "API key must be a string."
+                     raise ValueError(msg)
+
+                 # We need to test the API Key one time against the embedding model
+                 embed_model = self._build_embeddings(embedding_model=field_value["02_embedding_model"], api_key=api_key)
+
+                 # Try to generate a dummy embedding to validate the API key without blocking the event loop
+                 try:
+                     await asyncio.wait_for(
+                         asyncio.to_thread(embed_model.embed_query, "test"),
+                         timeout=10,
+                     )
+                 except TimeoutError as e:
+                     msg = "Embedding validation timed out. Please verify network connectivity and key."
+                     raise ValueError(msg) from e
+                 except Exception as e:
+                     msg = f"Embedding validation failed: {e!s}"
+                     raise ValueError(msg) from e
+
+                 # Create the new knowledge base directory
+                 kb_path = KNOWLEDGE_BASES_ROOT_PATH / kb_user / field_value["01_new_kb_name"]
+                 kb_path.mkdir(parents=True, exist_ok=True)
+
+                 # Save the embedding metadata
+                 build_config["knowledge_base"]["value"] = field_value["01_new_kb_name"]
+                 self._save_embedding_metadata(
+                     kb_path=kb_path,
+                     embedding_model=field_value["02_embedding_model"],
+                     api_key=api_key,
+                 )
+
+             # Update the knowledge base options dynamically
+             build_config["knowledge_base"]["options"] = await get_knowledge_bases(
+                 KNOWLEDGE_BASES_ROOT_PATH,
+                 user_id=self.user_id,
+             )
+
+             # If the selected knowledge base is not available, reset it
+             if build_config["knowledge_base"]["value"] not in build_config["knowledge_base"]["options"]:
+                 build_config["knowledge_base"]["value"] = None
+
+         return build_config
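The on-disk layout this component produces, <knowledge_bases_dir>/<username>/<kb_name>/ holding the persisted Chroma collection plus embedding_metadata.json and schema.json, is presumably what the companion retrieval component (lfx/components/knowledge_bases/retrieval.py, also added in this release) reads back. A minimal sketch of querying such a store directly, assuming langchain-chroma and langchain-openai are installed and the path, collection name, model, and key below are replaced with real values:

    from langchain_chroma import Chroma
    from langchain_openai import OpenAIEmbeddings

    # Hypothetical values: point these at an existing knowledge base.
    store = Chroma(
        persist_directory="/path/to/knowledge_bases/alice/my_kb",
        collection_name="my_kb",
        embedding_function=OpenAIEmbeddings(model="text-embedding-3-small", api_key="sk-..."),
    )
    for doc in store.similarity_search("what does the knowledge base say about pricing?", k=4):
        print(doc.metadata.get("_id"), doc.page_content[:80])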