langflow-base-nightly 0.6.5.dev0__py3-none-any.whl → 0.6.5.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langflow/__main__.py +9 -1
- langflow/api/v1/knowledge_bases.py +15 -7
- langflow/initial_setup/starter_projects/Hybrid Search RAG.json +7 -7
- langflow/initial_setup/starter_projects/Knowledge Ingestion.json +2 -2
- langflow/initial_setup/starter_projects/Knowledge Retrieval.json +2 -2
- langflow/initial_setup/starter_projects/Nvidia Remix.json +1 -1
- langflow/initial_setup/starter_projects/Simple Agent.json +1 -1
- langflow/initial_setup/starter_projects/Vector Store RAG.json +14 -14
- langflow/services/deps.py +11 -0
- {langflow_base_nightly-0.6.5.dev0.dist-info → langflow_base_nightly-0.6.5.dev1.dist-info}/METADATA +2 -2
- {langflow_base_nightly-0.6.5.dev0.dist-info → langflow_base_nightly-0.6.5.dev1.dist-info}/RECORD +13 -13
- {langflow_base_nightly-0.6.5.dev0.dist-info → langflow_base_nightly-0.6.5.dev1.dist-info}/WHEEL +0 -0
- {langflow_base_nightly-0.6.5.dev0.dist-info → langflow_base_nightly-0.6.5.dev1.dist-info}/entry_points.txt +0 -0
langflow/__main__.py
CHANGED
|
@@ -33,7 +33,7 @@ from langflow.cli.progress import create_langflow_progress
|
|
|
33
33
|
from langflow.initial_setup.setup import get_or_create_default_folder
|
|
34
34
|
from langflow.main import setup_app
|
|
35
35
|
from langflow.services.auth.utils import check_key, get_current_user_by_jwt
|
|
36
|
-
from langflow.services.deps import get_db_service, get_settings_service, session_scope
|
|
36
|
+
from langflow.services.deps import get_db_service, get_settings_service, is_settings_service_initialized, session_scope
|
|
37
37
|
from langflow.services.utils import initialize_services
|
|
38
38
|
from langflow.utils.version import fetch_latest_version, get_version_info
|
|
39
39
|
from langflow.utils.version import is_pre_release as langflow_is_pre_release
|
|
@@ -269,6 +269,14 @@ def run(
|
|
|
269
269
|
) -> None:
|
|
270
270
|
"""Run Langflow."""
|
|
271
271
|
if env_file:
|
|
272
|
+
if is_settings_service_initialized():
|
|
273
|
+
err = (
|
|
274
|
+
"Settings service is already initialized. This indicates potential race conditions "
|
|
275
|
+
"with settings initialization. Ensure the settings service is not created during "
|
|
276
|
+
"module loading."
|
|
277
|
+
)
|
|
278
|
+
# i.e. ensures the env file is loaded before the settings service is initialized
|
|
279
|
+
raise ValueError(err)
|
|
272
280
|
load_dotenv(env_file, override=True)
|
|
273
281
|
|
|
274
282
|
# Set and normalize log level, with precedence: cli > env > default
|
|
@@ -15,12 +15,20 @@ from langflow.services.deps import get_settings_service
|
|
|
15
15
|
router = APIRouter(tags=["Knowledge Bases"], prefix="/knowledge_bases")
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
18
|
+
_KNOWLEDGE_BASES_DIR: Path | None = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _get_knowledge_bases_dir() -> Path:
|
|
22
|
+
"""Lazy load the knowledge bases directory from settings."""
|
|
23
|
+
global _KNOWLEDGE_BASES_DIR # noqa: PLW0603
|
|
24
|
+
if _KNOWLEDGE_BASES_DIR is None:
|
|
25
|
+
settings = get_settings_service().settings
|
|
26
|
+
knowledge_directory = settings.knowledge_bases_dir
|
|
27
|
+
if not knowledge_directory:
|
|
28
|
+
msg = "Knowledge bases directory is not set in the settings."
|
|
29
|
+
raise ValueError(msg)
|
|
30
|
+
_KNOWLEDGE_BASES_DIR = Path(knowledge_directory).expanduser()
|
|
31
|
+
return _KNOWLEDGE_BASES_DIR
|
|
24
32
|
|
|
25
33
|
|
|
26
34
|
class KnowledgeBaseInfo(BaseModel):
|
|
@@ -41,7 +49,7 @@ class BulkDeleteRequest(BaseModel):
|
|
|
41
49
|
|
|
42
50
|
def get_kb_root_path() -> Path:
|
|
43
51
|
"""Get the knowledge bases root path."""
|
|
44
|
-
return
|
|
52
|
+
return _get_knowledge_bases_dir()
|
|
45
53
|
|
|
46
54
|
|
|
47
55
|
def get_directory_size(path: Path) -> int:
|
|
@@ -1888,17 +1888,13 @@
|
|
|
1888
1888
|
"legacy": false,
|
|
1889
1889
|
"lf_version": "1.6.0",
|
|
1890
1890
|
"metadata": {
|
|
1891
|
-
"code_hash": "
|
|
1891
|
+
"code_hash": "fb190928c0c2",
|
|
1892
1892
|
"dependencies": {
|
|
1893
1893
|
"dependencies": [
|
|
1894
1894
|
{
|
|
1895
1895
|
"name": "astrapy",
|
|
1896
1896
|
"version": "2.1.0"
|
|
1897
1897
|
},
|
|
1898
|
-
{
|
|
1899
|
-
"name": "langchain_astradb",
|
|
1900
|
-
"version": "0.6.1"
|
|
1901
|
-
},
|
|
1902
1898
|
{
|
|
1903
1899
|
"name": "langchain_core",
|
|
1904
1900
|
"version": "0.3.79"
|
|
@@ -1906,11 +1902,15 @@
|
|
|
1906
1902
|
{
|
|
1907
1903
|
"name": "lfx",
|
|
1908
1904
|
"version": null
|
|
1905
|
+
},
|
|
1906
|
+
{
|
|
1907
|
+
"name": "langchain_astradb",
|
|
1908
|
+
"version": "0.6.1"
|
|
1909
1909
|
}
|
|
1910
1910
|
],
|
|
1911
1911
|
"total_dependencies": 4
|
|
1912
1912
|
},
|
|
1913
|
-
"module": "lfx.components.
|
|
1913
|
+
"module": "lfx.components.datastax.astradb_vectorstore.AstraDBVectorStoreComponent"
|
|
1914
1914
|
},
|
|
1915
1915
|
"minimized": false,
|
|
1916
1916
|
"output_types": [],
|
|
@@ -2055,7 +2055,7 @@
|
|
|
2055
2055
|
"show": true,
|
|
2056
2056
|
"title_case": false,
|
|
2057
2057
|
"type": "code",
|
|
2058
|
-
"value": "import re\nfrom collections import defaultdict\nfrom dataclasses import asdict, dataclass, field\n\nfrom astrapy import DataAPIClient, Database\nfrom astrapy.data.info.reranking import RerankServiceOptions\nfrom astrapy.info import CollectionDescriptor, CollectionLexicalOptions, CollectionRerankOptions\nfrom langchain_astradb import AstraDBVectorStore, VectorServiceOptions\nfrom langchain_astradb.utils.astradb import HybridSearchMode, _AstraDBCollectionEnvironment\nfrom langchain_core.documents import Document\n\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.helpers.data import docs_to_data\nfrom lfx.inputs.inputs import FloatInput, NestedDictInput\nfrom lfx.io import (\n BoolInput,\n DropdownInput,\n HandleInput,\n IntInput,\n QueryInput,\n SecretStrInput,\n StrInput,\n)\nfrom lfx.schema.data import Data\nfrom lfx.serialization import serialize\nfrom lfx.utils.version import get_version_info\n\n\n@vector_store_connection\nclass AstraDBVectorStoreComponent(LCVectorStoreComponent):\n display_name: str = \"Astra DB\"\n description: str = \"Ingest and search documents in Astra DB\"\n documentation: str = \"https://docs.datastax.com/en/langflow/astra-components.html\"\n name = \"AstraDB\"\n icon: str = \"AstraDB\"\n\n _cached_vector_store: AstraDBVectorStore | None = None\n\n @dataclass\n class NewDatabaseInput:\n functionality: str = \"create\"\n fields: dict[str, dict] = field(\n default_factory=lambda: {\n \"data\": {\n \"node\": {\n \"name\": \"create_database\",\n \"description\": \"Please allow several minutes for creation to complete.\",\n \"display_name\": \"Create new database\",\n \"field_order\": [\"01_new_database_name\", \"02_cloud_provider\", \"03_region\"],\n \"template\": {\n \"01_new_database_name\": StrInput(\n name=\"new_database_name\",\n display_name=\"Name\",\n info=\"Name of the new database to create in Astra DB.\",\n required=True,\n ),\n \"02_cloud_provider\": DropdownInput(\n name=\"cloud_provider\",\n display_name=\"Cloud provider\",\n info=\"Cloud provider for the new database.\",\n options=[],\n required=True,\n real_time_refresh=True,\n ),\n \"03_region\": DropdownInput(\n name=\"region\",\n display_name=\"Region\",\n info=\"Region for the new database.\",\n options=[],\n required=True,\n ),\n },\n },\n }\n }\n )\n\n @dataclass\n class NewCollectionInput:\n functionality: str = \"create\"\n fields: dict[str, dict] = field(\n default_factory=lambda: {\n \"data\": {\n \"node\": {\n \"name\": \"create_collection\",\n \"description\": \"Please allow several seconds for creation to complete.\",\n \"display_name\": \"Create new collection\",\n \"field_order\": [\n \"01_new_collection_name\",\n \"02_embedding_generation_provider\",\n \"03_embedding_generation_model\",\n \"04_dimension\",\n ],\n \"template\": {\n \"01_new_collection_name\": StrInput(\n name=\"new_collection_name\",\n display_name=\"Name\",\n info=\"Name of the new collection to create in Astra DB.\",\n required=True,\n ),\n \"02_embedding_generation_provider\": DropdownInput(\n name=\"embedding_generation_provider\",\n display_name=\"Embedding generation method\",\n info=\"Provider to use for generating embeddings.\",\n helper_text=(\n \"To create collections with more embedding provider options, go to \"\n '<a class=\"underline\" href=\"https://astra.datastax.com/\" target=\" _blank\" '\n 'rel=\"noopener noreferrer\">your database in Astra DB</a>'\n ),\n real_time_refresh=True,\n required=True,\n options=[],\n ),\n \"03_embedding_generation_model\": DropdownInput(\n name=\"embedding_generation_model\",\n display_name=\"Embedding model\",\n info=\"Model to use for generating embeddings.\",\n real_time_refresh=True,\n options=[],\n ),\n \"04_dimension\": IntInput(\n name=\"dimension\",\n display_name=\"Dimensions\",\n info=\"Dimensions of the embeddings to generate.\",\n value=None,\n ),\n },\n },\n }\n }\n )\n\n inputs = [\n SecretStrInput(\n name=\"token\",\n display_name=\"Astra DB Application Token\",\n info=\"Authentication token for accessing Astra DB.\",\n value=\"ASTRA_DB_APPLICATION_TOKEN\",\n required=True,\n real_time_refresh=True,\n input_types=[],\n ),\n DropdownInput(\n name=\"environment\",\n display_name=\"Environment\",\n info=\"The environment for the Astra DB API Endpoint.\",\n options=[\"prod\", \"test\", \"dev\"],\n value=\"prod\",\n advanced=True,\n real_time_refresh=True,\n combobox=True,\n ),\n DropdownInput(\n name=\"database_name\",\n display_name=\"Database\",\n info=\"The Database name for the Astra DB instance.\",\n required=True,\n refresh_button=True,\n real_time_refresh=True,\n dialog_inputs=asdict(NewDatabaseInput()),\n combobox=True,\n ),\n DropdownInput(\n name=\"api_endpoint\",\n display_name=\"Astra DB API Endpoint\",\n info=\"The API Endpoint for the Astra DB instance. Supercedes database selection.\",\n advanced=True,\n ),\n DropdownInput(\n name=\"keyspace\",\n display_name=\"Keyspace\",\n info=\"Optional keyspace within Astra DB to use for the collection.\",\n advanced=True,\n options=[],\n real_time_refresh=True,\n ),\n DropdownInput(\n name=\"collection_name\",\n display_name=\"Collection\",\n info=\"The name of the collection within Astra DB where the vectors will be stored.\",\n required=True,\n refresh_button=True,\n real_time_refresh=True,\n dialog_inputs=asdict(NewCollectionInput()),\n combobox=True,\n show=False,\n ),\n HandleInput(\n name=\"embedding_model\",\n display_name=\"Embedding Model\",\n input_types=[\"Embeddings\"],\n info=\"Specify the Embedding Model. Not required for Astra Vectorize collections.\",\n required=False,\n show=False,\n ),\n *LCVectorStoreComponent.inputs,\n DropdownInput(\n name=\"search_method\",\n display_name=\"Search Method\",\n info=(\n \"Determine how your content is matched: Vector finds semantic similarity, \"\n \"and Hybrid Search (suggested) combines both approaches \"\n \"with a reranker.\"\n ),\n options=[\"Hybrid Search\", \"Vector Search\"], # TODO: Restore Lexical Search?\n options_metadata=[{\"icon\": \"SearchHybrid\"}, {\"icon\": \"SearchVector\"}],\n value=\"Vector Search\",\n advanced=True,\n real_time_refresh=True,\n ),\n DropdownInput(\n name=\"reranker\",\n display_name=\"Reranker\",\n info=\"Post-retrieval model that re-scores results for optimal relevance ranking.\",\n show=False,\n toggle=True,\n ),\n QueryInput(\n name=\"lexical_terms\",\n display_name=\"Lexical Terms\",\n info=\"Add additional terms/keywords to augment search precision.\",\n placeholder=\"Enter terms to search...\",\n separator=\" \",\n show=False,\n value=\"\",\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Number of Search Results\",\n info=\"Number of search results to return.\",\n advanced=True,\n value=4,\n ),\n DropdownInput(\n name=\"search_type\",\n display_name=\"Search Type\",\n info=\"Search type to use\",\n options=[\"Similarity\", \"Similarity with score threshold\", \"MMR (Max Marginal Relevance)\"],\n value=\"Similarity\",\n advanced=True,\n ),\n FloatInput(\n name=\"search_score_threshold\",\n display_name=\"Search Score Threshold\",\n info=\"Minimum similarity score threshold for search results. \"\n \"(when using 'Similarity with score threshold')\",\n value=0,\n advanced=True,\n ),\n NestedDictInput(\n name=\"advanced_search_filter\",\n display_name=\"Search Metadata Filter\",\n info=\"Optional dictionary of filters to apply to the search query.\",\n advanced=True,\n ),\n BoolInput(\n name=\"autodetect_collection\",\n display_name=\"Autodetect Collection\",\n info=\"Boolean flag to determine whether to autodetect the collection.\",\n advanced=True,\n value=True,\n ),\n StrInput(\n name=\"content_field\",\n display_name=\"Content Field\",\n info=\"Field to use as the text content field for the vector store.\",\n advanced=True,\n ),\n StrInput(\n name=\"deletion_field\",\n display_name=\"Deletion Based On Field\",\n info=\"When this parameter is provided, documents in the target collection with \"\n \"metadata field values matching the input metadata field value will be deleted \"\n \"before new data is loaded.\",\n advanced=True,\n ),\n BoolInput(\n name=\"ignore_invalid_documents\",\n display_name=\"Ignore Invalid Documents\",\n info=\"Boolean flag to determine whether to ignore invalid documents at runtime.\",\n advanced=True,\n ),\n NestedDictInput(\n name=\"astradb_vectorstore_kwargs\",\n display_name=\"AstraDBVectorStore Parameters\",\n info=\"Optional dictionary of additional parameters for the AstraDBVectorStore.\",\n advanced=True,\n ),\n ]\n\n @classmethod\n def map_cloud_providers(cls):\n # TODO: Programmatically fetch the regions for each cloud provider\n return {\n \"dev\": {\n \"Amazon Web Services\": {\n \"id\": \"aws\",\n \"regions\": [\"us-west-2\"],\n },\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-central1\", \"europe-west4\"],\n },\n },\n \"test\": {\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-central1\"],\n },\n },\n \"prod\": {\n \"Amazon Web Services\": {\n \"id\": \"aws\",\n \"regions\": [\"us-east-2\", \"ap-south-1\", \"eu-west-1\"],\n },\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-east1\"],\n },\n \"Microsoft Azure\": {\n \"id\": \"azure\",\n \"regions\": [\"westus3\"],\n },\n },\n }\n\n @classmethod\n def get_vectorize_providers(cls, token: str, environment: str | None = None, api_endpoint: str | None = None):\n try:\n # Get the admin object\n client = DataAPIClient(environment=environment)\n admin_client = client.get_admin()\n db_admin = admin_client.get_database_admin(api_endpoint, token=token)\n\n # Get the list of embedding providers\n embedding_providers = db_admin.find_embedding_providers()\n\n vectorize_providers_mapping = {}\n # Map the provider display name to the provider key and models\n for provider_key, provider_data in embedding_providers.embedding_providers.items():\n # Get the provider display name and models\n display_name = provider_data.display_name\n models = [model.name for model in provider_data.models]\n\n # Build our mapping\n vectorize_providers_mapping[display_name] = [provider_key, models]\n\n # Sort the resulting dictionary\n return defaultdict(list, dict(sorted(vectorize_providers_mapping.items())))\n except Exception as _: # noqa: BLE001\n return {}\n\n @classmethod\n async def create_database_api(\n cls,\n new_database_name: str,\n cloud_provider: str,\n region: str,\n token: str,\n environment: str | None = None,\n keyspace: str | None = None,\n ):\n client = DataAPIClient(environment=environment)\n\n # Get the admin object\n admin_client = client.get_admin(token=token)\n\n # Get the environment, set to prod if null like\n my_env = environment or \"prod\"\n\n # Raise a value error if name isn't provided\n if not new_database_name:\n msg = \"Database name is required to create a new database.\"\n raise ValueError(msg)\n\n # Call the create database function\n return await admin_client.async_create_database(\n name=new_database_name,\n cloud_provider=cls.map_cloud_providers()[my_env][cloud_provider][\"id\"],\n region=region,\n keyspace=keyspace,\n wait_until_active=False,\n )\n\n @classmethod\n async def create_collection_api(\n cls,\n new_collection_name: str,\n token: str,\n api_endpoint: str,\n environment: str | None = None,\n keyspace: str | None = None,\n dimension: int | None = None,\n embedding_generation_provider: str | None = None,\n embedding_generation_model: str | None = None,\n reranker: str | None = None,\n ):\n # Build vectorize options, if needed\n vectorize_options = None\n if not dimension:\n providers = cls.get_vectorize_providers(token=token, environment=environment, api_endpoint=api_endpoint)\n vectorize_options = VectorServiceOptions(\n provider=providers.get(embedding_generation_provider, [None, []])[0],\n model_name=embedding_generation_model,\n )\n\n # Raise a value error if name isn't provided\n if not new_collection_name:\n msg = \"Collection name is required to create a new collection.\"\n raise ValueError(msg)\n\n # Define the base arguments being passed to the create collection function\n base_args = {\n \"collection_name\": new_collection_name,\n \"token\": token,\n \"api_endpoint\": api_endpoint,\n \"keyspace\": keyspace,\n \"environment\": environment,\n \"embedding_dimension\": dimension,\n \"collection_vector_service_options\": vectorize_options,\n }\n\n # Add optional arguments if the reranker is set\n if reranker:\n # Split the reranker field into a provider a model name\n provider, _ = reranker.split(\"/\")\n base_args[\"collection_rerank\"] = CollectionRerankOptions(\n service=RerankServiceOptions(provider=provider, model_name=reranker),\n )\n base_args[\"collection_lexical\"] = CollectionLexicalOptions(analyzer=\"STANDARD\")\n\n _AstraDBCollectionEnvironment(**base_args)\n\n @classmethod\n def get_database_list_static(cls, token: str, environment: str | None = None):\n client = DataAPIClient(environment=environment)\n\n # Get the admin object\n admin_client = client.get_admin(token=token)\n\n # Get the list of databases\n db_list = admin_client.list_databases()\n\n # Generate the api endpoint for each database\n db_info_dict = {}\n for db in db_list:\n try:\n # Get the API endpoint for the database\n api_endpoints = [db_reg.api_endpoint for db_reg in db.regions]\n\n # Get the number of collections\n try:\n # Get the number of collections in the database\n num_collections = len(\n client.get_database(\n api_endpoints[0],\n token=token,\n ).list_collection_names()\n )\n except Exception: # noqa: BLE001\n if db.status != \"PENDING\":\n continue\n num_collections = 0\n\n # Add the database to the dictionary\n db_info_dict[db.name] = {\n \"api_endpoints\": api_endpoints,\n \"keyspaces\": db.keyspaces,\n \"collections\": num_collections,\n \"status\": db.status if db.status != \"ACTIVE\" else None,\n \"org_id\": db.org_id if db.org_id else None,\n }\n except Exception: # noqa: BLE001\n pass\n\n return db_info_dict\n\n def get_database_list(self):\n return self.get_database_list_static(\n token=self.token,\n environment=self.environment,\n )\n\n @classmethod\n def get_api_endpoint_static(\n cls,\n token: str,\n environment: str | None = None,\n api_endpoint: str | None = None,\n database_name: str | None = None,\n ):\n # If the api_endpoint is set, return it\n if api_endpoint:\n return api_endpoint\n\n # Check if the database_name is like a url\n if database_name and database_name.startswith(\"https://\"):\n return database_name\n\n # If the database is not set, nothing we can do.\n if not database_name:\n return None\n\n # Grab the database object\n db = cls.get_database_list_static(token=token, environment=environment).get(database_name)\n if not db:\n return None\n\n # Otherwise, get the URL from the database list\n endpoints = db.get(\"api_endpoints\") or []\n return endpoints[0] if endpoints else None\n\n def get_api_endpoint(self):\n return self.get_api_endpoint_static(\n token=self.token,\n environment=self.environment,\n api_endpoint=self.api_endpoint,\n database_name=self.database_name,\n )\n\n @classmethod\n def get_database_id_static(cls, api_endpoint: str) -> str | None:\n # Pattern matches standard UUID format: 8-4-4-4-12 hexadecimal characters\n uuid_pattern = r\"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\"\n match = re.search(uuid_pattern, api_endpoint)\n\n return match.group(0) if match else None\n\n def get_database_id(self):\n return self.get_database_id_static(api_endpoint=self.get_api_endpoint())\n\n def get_keyspace(self):\n keyspace = self.keyspace\n\n if keyspace:\n return keyspace.strip()\n\n return \"default_keyspace\"\n\n def get_database_object(self, api_endpoint: str | None = None):\n try:\n client = DataAPIClient(environment=self.environment)\n\n return client.get_database(\n api_endpoint or self.get_api_endpoint(),\n token=self.token,\n keyspace=self.get_keyspace(),\n )\n except Exception as e:\n msg = f\"Error fetching database object: {e}\"\n raise ValueError(msg) from e\n\n def collection_data(self, collection_name: str, database: Database | None = None):\n try:\n if not database:\n client = DataAPIClient(environment=self.environment)\n\n database = client.get_database(\n self.get_api_endpoint(),\n token=self.token,\n keyspace=self.get_keyspace(),\n )\n\n collection = database.get_collection(collection_name)\n\n return collection.estimated_document_count()\n except Exception as e: # noqa: BLE001\n self.log(f\"Error checking collection data: {e}\")\n\n return None\n\n def _initialize_database_options(self):\n try:\n return [\n {\n \"name\": name,\n \"status\": info[\"status\"],\n \"collections\": info[\"collections\"],\n \"api_endpoints\": info[\"api_endpoints\"],\n \"keyspaces\": info[\"keyspaces\"],\n \"org_id\": info[\"org_id\"],\n }\n for name, info in self.get_database_list().items()\n ]\n except Exception as e:\n msg = f\"Error fetching database options: {e}\"\n raise ValueError(msg) from e\n\n @classmethod\n def get_provider_icon(cls, collection: CollectionDescriptor | None = None, provider_name: str | None = None) -> str:\n # Get the provider name from the collection\n provider_name = provider_name or (\n collection.definition.vector.service.provider\n if (\n collection\n and collection.definition\n and collection.definition.vector\n and collection.definition.vector.service\n )\n else None\n )\n\n # If there is no provider, use the vector store icon\n if not provider_name or provider_name.lower() == \"bring your own\":\n return \"vectorstores\"\n\n # Map provider casings\n case_map = {\n \"nvidia\": \"NVIDIA\",\n \"openai\": \"OpenAI\",\n \"amazon bedrock\": \"AmazonBedrockEmbeddings\",\n \"azure openai\": \"AzureOpenAiEmbeddings\",\n \"cohere\": \"Cohere\",\n \"jina ai\": \"JinaAI\",\n \"mistral ai\": \"MistralAI\",\n \"upstage\": \"Upstage\",\n \"voyage ai\": \"VoyageAI\",\n }\n\n # Adjust the casing on some like nvidia\n return case_map[provider_name.lower()] if provider_name.lower() in case_map else provider_name.title()\n\n def _initialize_collection_options(self, api_endpoint: str | None = None):\n # Nothing to generate if we don't have an API endpoint yet\n api_endpoint = api_endpoint or self.get_api_endpoint()\n if not api_endpoint:\n return []\n\n # Retrieve the database object\n database = self.get_database_object(api_endpoint=api_endpoint)\n\n # Get the list of collections\n collection_list = database.list_collections(keyspace=self.get_keyspace())\n\n # Return the list of collections and metadata associated\n return [\n {\n \"name\": col.name,\n \"records\": self.collection_data(collection_name=col.name, database=database),\n \"provider\": (\n col.definition.vector.service.provider\n if col.definition.vector and col.definition.vector.service\n else None\n ),\n \"icon\": self.get_provider_icon(collection=col),\n \"model\": (\n col.definition.vector.service.model_name\n if col.definition.vector and col.definition.vector.service\n else None\n ),\n }\n for col in collection_list\n ]\n\n def reset_provider_options(self, build_config: dict) -> dict:\n \"\"\"Reset provider options and related configurations in the build_config dictionary.\"\"\"\n # Extract template path for cleaner access\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n\n # Get vectorize providers\n vectorize_providers_api = self.get_vectorize_providers(\n token=self.token,\n environment=self.environment,\n api_endpoint=build_config[\"api_endpoint\"][\"value\"],\n )\n\n # Create a new dictionary with \"Bring your own\" first\n vectorize_providers: dict[str, list[list[str]]] = {\"Bring your own\": [[], []]}\n\n # Add the remaining items (only Nvidia) from the original dictionary\n vectorize_providers.update(\n {\n k: v\n for k, v in vectorize_providers_api.items()\n if k.lower() in [\"nvidia\"] # TODO: Eventually support more\n }\n )\n\n # Set provider options\n provider_field = \"02_embedding_generation_provider\"\n template[provider_field][\"options\"] = list(vectorize_providers.keys())\n\n # Add metadata for each provider option\n template[provider_field][\"options_metadata\"] = [\n {\"icon\": self.get_provider_icon(provider_name=provider)} for provider in template[provider_field][\"options\"]\n ]\n\n # Get selected embedding provider\n embedding_provider = template[provider_field][\"value\"]\n is_bring_your_own = embedding_provider and embedding_provider == \"Bring your own\"\n\n # Configure embedding model field\n model_field = \"03_embedding_generation_model\"\n template[model_field].update(\n {\n \"options\": vectorize_providers.get(embedding_provider, [[], []])[1],\n \"placeholder\": \"Bring your own\" if is_bring_your_own else None,\n \"readonly\": is_bring_your_own,\n \"required\": not is_bring_your_own,\n \"value\": None,\n }\n )\n\n # If this is a bring your own, set dimensions to 0\n return self.reset_dimension_field(build_config)\n\n def reset_dimension_field(self, build_config: dict) -> dict:\n \"\"\"Reset dimension field options based on provided configuration.\"\"\"\n # Extract template path for cleaner access\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n\n # Get selected embedding model\n provider_field = \"02_embedding_generation_provider\"\n embedding_provider = template[provider_field][\"value\"]\n is_bring_your_own = embedding_provider and embedding_provider == \"Bring your own\"\n\n # Configure dimension field\n dimension_field = \"04_dimension\"\n dimension_value = 1024 if not is_bring_your_own else None # TODO: Dynamically figure this out\n template[dimension_field].update(\n {\n \"placeholder\": dimension_value,\n \"value\": dimension_value,\n \"readonly\": not is_bring_your_own,\n \"required\": is_bring_your_own,\n }\n )\n\n return build_config\n\n def reset_collection_list(self, build_config: dict) -> dict:\n \"\"\"Reset collection list options based on provided configuration.\"\"\"\n # Get collection options\n collection_options = self._initialize_collection_options(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n # Update collection configuration\n collection_config = build_config[\"collection_name\"]\n collection_config.update(\n {\n \"options\": [col[\"name\"] for col in collection_options],\n \"options_metadata\": [{k: v for k, v in col.items() if k != \"name\"} for col in collection_options],\n }\n )\n\n # Reset selected collection if not in options\n if collection_config[\"value\"] not in collection_config[\"options\"]:\n collection_config[\"value\"] = \"\"\n\n # Set advanced status based on database selection\n collection_config[\"show\"] = bool(build_config[\"database_name\"][\"value\"])\n\n return build_config\n\n def reset_database_list(self, build_config: dict) -> dict:\n \"\"\"Reset database list options and related configurations.\"\"\"\n # Get database options\n database_options = self._initialize_database_options()\n\n # Update cloud provider options\n env = self.environment\n template = build_config[\"database_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"02_cloud_provider\"][\"options\"] = list(self.map_cloud_providers()[env].keys())\n\n # Update database configuration\n database_config = build_config[\"database_name\"]\n database_config.update(\n {\n \"options\": [db[\"name\"] for db in database_options],\n \"options_metadata\": [{k: v for k, v in db.items() if k != \"name\"} for db in database_options],\n }\n )\n\n # Reset selections if value not in options\n if database_config[\"value\"] not in database_config[\"options\"]:\n database_config[\"value\"] = \"\"\n build_config[\"api_endpoint\"][\"options\"] = []\n build_config[\"api_endpoint\"][\"value\"] = \"\"\n build_config[\"collection_name\"][\"show\"] = False\n\n # Set advanced status based on token presence\n database_config[\"show\"] = bool(build_config[\"token\"][\"value\"])\n\n return build_config\n\n def reset_build_config(self, build_config: dict) -> dict:\n \"\"\"Reset all build configuration options to default empty state.\"\"\"\n # Reset database configuration\n database_config = build_config[\"database_name\"]\n database_config.update({\"options\": [], \"options_metadata\": [], \"value\": \"\", \"show\": False})\n build_config[\"api_endpoint\"][\"options\"] = []\n build_config[\"api_endpoint\"][\"value\"] = \"\"\n\n # Reset collection configuration\n collection_config = build_config[\"collection_name\"]\n collection_config.update({\"options\": [], \"options_metadata\": [], \"value\": \"\", \"show\": False})\n\n return build_config\n\n def _handle_hybrid_search_options(self, build_config: dict) -> dict:\n \"\"\"Set hybrid search options in the build configuration.\"\"\"\n # Detect what hybrid options are available\n # Get the admin object\n client = DataAPIClient(environment=self.environment)\n admin_client = client.get_admin()\n db_admin = admin_client.get_database_admin(self.get_api_endpoint(), token=self.token)\n\n # We will try to get the reranking providers to see if its hybrid emabled\n try:\n providers = db_admin.find_reranking_providers()\n build_config[\"reranker\"][\"options\"] = [\n model.name for provider_data in providers.reranking_providers.values() for model in provider_data.models\n ]\n build_config[\"reranker\"][\"options_metadata\"] = [\n {\"icon\": self.get_provider_icon(provider_name=model.name.split(\"/\")[0])}\n for provider in providers.reranking_providers.values()\n for model in provider.models\n ]\n build_config[\"reranker\"][\"value\"] = build_config[\"reranker\"][\"options\"][0]\n\n # Set the default search field to hybrid search\n build_config[\"search_method\"][\"show\"] = True\n build_config[\"search_method\"][\"options\"] = [\"Hybrid Search\", \"Vector Search\"]\n build_config[\"search_method\"][\"value\"] = \"Hybrid Search\"\n except Exception as _: # noqa: BLE001\n build_config[\"reranker\"][\"options\"] = []\n build_config[\"reranker\"][\"options_metadata\"] = []\n\n # Set the default search field to vector search\n build_config[\"search_method\"][\"show\"] = False\n build_config[\"search_method\"][\"options\"] = [\"Vector Search\"]\n build_config[\"search_method\"][\"value\"] = \"Vector Search\"\n\n return build_config\n\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Update build configuration based on field name and value.\"\"\"\n # Early return if no token provided\n if not self.token:\n return self.reset_build_config(build_config)\n\n # Database creation callback\n if field_name == \"database_name\" and isinstance(field_value, dict):\n if \"01_new_database_name\" in field_value:\n await self._create_new_database(build_config, field_value)\n return self.reset_collection_list(build_config)\n return self._update_cloud_regions(build_config, field_value)\n\n # Collection creation callback\n if field_name == \"collection_name\" and isinstance(field_value, dict):\n # Case 1: New collection creation\n if \"01_new_collection_name\" in field_value:\n await self._create_new_collection(build_config, field_value)\n return build_config\n\n # Case 2: Update embedding provider options\n if \"02_embedding_generation_provider\" in field_value:\n return self.reset_provider_options(build_config)\n\n # Case 3: Update dimension field\n if \"03_embedding_generation_model\" in field_value:\n return self.reset_dimension_field(build_config)\n\n # Initial execution or token/environment change\n first_run = field_name == \"collection_name\" and not field_value and not build_config[\"database_name\"][\"options\"]\n if first_run or field_name in {\"token\", \"environment\"}:\n return self.reset_database_list(build_config)\n\n # Database selection change\n if field_name == \"database_name\" and not isinstance(field_value, dict):\n return self._handle_database_selection(build_config, field_value)\n\n # Keyspace selection change\n if field_name == \"keyspace\":\n return self.reset_collection_list(build_config)\n\n # Collection selection change\n if field_name == \"collection_name\" and not isinstance(field_value, dict):\n return self._handle_collection_selection(build_config, field_value)\n\n # Search method selection change\n if field_name == \"search_method\":\n is_vector_search = field_value == \"Vector Search\"\n is_autodetect = build_config[\"autodetect_collection\"][\"value\"]\n\n # Configure lexical terms (same for both cases)\n build_config[\"lexical_terms\"][\"show\"] = not is_vector_search\n build_config[\"lexical_terms\"][\"value\"] = \"\" if is_vector_search else build_config[\"lexical_terms\"][\"value\"]\n\n # Disable reranker disabling if hybrid search is selected\n build_config[\"reranker\"][\"show\"] = not is_vector_search\n build_config[\"reranker\"][\"toggle_disable\"] = not is_vector_search\n build_config[\"reranker\"][\"toggle_value\"] = True\n build_config[\"reranker\"][\"value\"] = build_config[\"reranker\"][\"options\"][0]\n\n # Toggle search type and score threshold based on search method\n build_config[\"search_type\"][\"show\"] = is_vector_search\n build_config[\"search_score_threshold\"][\"show\"] = is_vector_search\n\n # Make sure the search_type is set to \"Similarity\"\n if not is_vector_search or is_autodetect:\n build_config[\"search_type\"][\"value\"] = \"Similarity\"\n\n return build_config\n\n async def _create_new_database(self, build_config: dict, field_value: dict) -> None:\n \"\"\"Create a new database and update build config options.\"\"\"\n try:\n await self.create_database_api(\n new_database_name=field_value[\"01_new_database_name\"],\n token=self.token,\n keyspace=self.get_keyspace(),\n environment=self.environment,\n cloud_provider=field_value[\"02_cloud_provider\"],\n region=field_value[\"03_region\"],\n )\n except Exception as e:\n msg = f\"Error creating database: {e}\"\n raise ValueError(msg) from e\n\n build_config[\"database_name\"][\"options\"].append(field_value[\"01_new_database_name\"])\n build_config[\"database_name\"][\"options_metadata\"].append(\n {\n \"status\": \"PENDING\",\n \"collections\": 0,\n \"api_endpoints\": [],\n \"keyspaces\": [self.get_keyspace()],\n \"org_id\": None,\n }\n )\n\n def _update_cloud_regions(self, build_config: dict, field_value: dict) -> dict:\n \"\"\"Update cloud provider regions in build config.\"\"\"\n env = self.environment\n cloud_provider = field_value[\"02_cloud_provider\"]\n\n # Update the region options based on the selected cloud provider\n template = build_config[\"database_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"03_region\"][\"options\"] = self.map_cloud_providers()[env][cloud_provider][\"regions\"]\n\n # Reset the the 03_region value if it's not in the new options\n if template[\"03_region\"][\"value\"] not in template[\"03_region\"][\"options\"]:\n template[\"03_region\"][\"value\"] = None\n\n return build_config\n\n async def _create_new_collection(self, build_config: dict, field_value: dict) -> None:\n \"\"\"Create a new collection and update build config options.\"\"\"\n embedding_provider = field_value.get(\"02_embedding_generation_provider\")\n try:\n await self.create_collection_api(\n new_collection_name=field_value[\"01_new_collection_name\"],\n token=self.token,\n api_endpoint=build_config[\"api_endpoint\"][\"value\"],\n environment=self.environment,\n keyspace=self.get_keyspace(),\n dimension=field_value.get(\"04_dimension\") if embedding_provider == \"Bring your own\" else None,\n embedding_generation_provider=embedding_provider,\n embedding_generation_model=field_value.get(\"03_embedding_generation_model\"),\n reranker=self.reranker,\n )\n except Exception as e:\n msg = f\"Error creating collection: {e}\"\n raise ValueError(msg) from e\n\n provider = embedding_provider.lower() if embedding_provider and embedding_provider != \"Bring your own\" else None\n build_config[\"collection_name\"].update(\n {\n \"value\": field_value[\"01_new_collection_name\"],\n \"options\": build_config[\"collection_name\"][\"options\"] + [field_value[\"01_new_collection_name\"]],\n }\n )\n build_config[\"embedding_model\"][\"show\"] = not bool(provider)\n build_config[\"embedding_model\"][\"required\"] = not bool(provider)\n build_config[\"collection_name\"][\"options_metadata\"].append(\n {\n \"records\": 0,\n \"provider\": provider,\n \"icon\": self.get_provider_icon(provider_name=provider),\n \"model\": field_value.get(\"03_embedding_generation_model\"),\n }\n )\n\n # Make sure we always show the reranker options if the collection is hybrid enabled\n # And right now they always are\n build_config[\"lexical_terms\"][\"show\"] = True\n\n def _handle_database_selection(self, build_config: dict, field_value: str) -> dict:\n \"\"\"Handle database selection and update related configurations.\"\"\"\n build_config = self.reset_database_list(build_config)\n\n # Reset collection list if database selection changes\n if field_value not in build_config[\"database_name\"][\"options\"]:\n build_config[\"database_name\"][\"value\"] = \"\"\n return build_config\n\n # Get the api endpoint for the selected database\n index = build_config[\"database_name\"][\"options\"].index(field_value)\n build_config[\"api_endpoint\"][\"options\"] = build_config[\"database_name\"][\"options_metadata\"][index][\n \"api_endpoints\"\n ]\n build_config[\"api_endpoint\"][\"value\"] = build_config[\"database_name\"][\"options_metadata\"][index][\n \"api_endpoints\"\n ][0]\n\n # Get the org_id for the selected database\n org_id = build_config[\"database_name\"][\"options_metadata\"][index][\"org_id\"]\n if not org_id:\n return build_config\n\n # Update the list of keyspaces based on the db info\n build_config[\"keyspace\"][\"options\"] = build_config[\"database_name\"][\"options_metadata\"][index][\"keyspaces\"]\n build_config[\"keyspace\"][\"value\"] = (\n build_config[\"keyspace\"][\"options\"] and build_config[\"keyspace\"][\"options\"][0]\n if build_config[\"keyspace\"][\"value\"] not in build_config[\"keyspace\"][\"options\"]\n else build_config[\"keyspace\"][\"value\"]\n )\n\n # Get the database id for the selected database\n db_id = self.get_database_id_static(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n keyspace = self.get_keyspace()\n\n # Update the helper text for the embedding provider field\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"02_embedding_generation_provider\"][\"helper_text\"] = (\n \"To create collections with more embedding provider options, go to \"\n f'<a class=\"underline\" target=\"_blank\" rel=\"noopener noreferrer\" '\n f'href=\"https://astra.datastax.com/org/{org_id}/database/{db_id}/data-explorer?createCollection=1&namespace={keyspace}\">'\n \"your database in Astra DB</a>.\"\n )\n\n # Reset provider options\n build_config = self.reset_provider_options(build_config)\n\n # Handle hybrid search options\n build_config = self._handle_hybrid_search_options(build_config)\n\n return self.reset_collection_list(build_config)\n\n def _handle_collection_selection(self, build_config: dict, field_value: str) -> dict:\n \"\"\"Handle collection selection and update embedding options.\"\"\"\n build_config[\"autodetect_collection\"][\"value\"] = True\n build_config = self.reset_collection_list(build_config)\n\n # Reset embedding model if collection selection changes\n if field_value and field_value not in build_config[\"collection_name\"][\"options\"]:\n build_config[\"collection_name\"][\"options\"].append(field_value)\n build_config[\"collection_name\"][\"options_metadata\"].append(\n {\n \"records\": 0,\n \"provider\": None,\n \"icon\": \"vectorstores\",\n \"model\": None,\n }\n )\n build_config[\"autodetect_collection\"][\"value\"] = False\n\n if not field_value:\n return build_config\n\n # Get the selected collection index\n index = build_config[\"collection_name\"][\"options\"].index(field_value)\n\n # Set the provider of the selected collection\n provider = build_config[\"collection_name\"][\"options_metadata\"][index][\"provider\"]\n build_config[\"embedding_model\"][\"show\"] = not bool(provider)\n build_config[\"embedding_model\"][\"required\"] = not bool(provider)\n\n # Grab the collection object\n database = self.get_database_object(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n collection = database.get_collection(\n name=field_value,\n keyspace=build_config[\"keyspace\"][\"value\"],\n )\n\n # Check if hybrid and lexical are enabled\n col_options = collection.options()\n hyb_enabled = col_options.rerank and col_options.rerank.enabled\n lex_enabled = col_options.lexical and col_options.lexical.enabled\n user_hyb_enabled = build_config[\"search_method\"][\"value\"] == \"Hybrid Search\"\n\n # Reranker visible when both the collection supports it and the user selected Hybrid\n hybrid_active = bool(hyb_enabled and user_hyb_enabled)\n build_config[\"reranker\"][\"show\"] = hybrid_active\n build_config[\"reranker\"][\"toggle_value\"] = hybrid_active\n build_config[\"reranker\"][\"toggle_disable\"] = False # allow user to toggle if visible\n\n # If hybrid is active, lock search_type to \"Similarity\"\n if hybrid_active:\n build_config[\"search_type\"][\"value\"] = \"Similarity\"\n\n # Show the lexical terms option only if the collection enables lexical search\n build_config[\"lexical_terms\"][\"show\"] = bool(lex_enabled)\n\n return build_config\n\n @check_cached_vector_store\n def build_vector_store(self):\n try:\n from langchain_astradb import AstraDBVectorStore\n except ImportError as e:\n msg = (\n \"Could not import langchain Astra DB integration package. \"\n \"Please install it with `pip install langchain-astradb`.\"\n )\n raise ImportError(msg) from e\n\n # Get the embedding model and additional params\n embedding_params = {\"embedding\": self.embedding_model} if self.embedding_model else {}\n\n # Get the additional parameters\n additional_params = self.astradb_vectorstore_kwargs or {}\n\n # Get Langflow version and platform information\n __version__ = get_version_info()[\"version\"]\n langflow_prefix = \"\"\n # if os.getenv(\"AWS_EXECUTION_ENV\") == \"AWS_ECS_FARGATE\": # TODO: More precise way of detecting\n # langflow_prefix = \"ds-\"\n\n # Get the database object\n database = self.get_database_object()\n autodetect = self.collection_name in database.list_collection_names() and self.autodetect_collection\n\n # Bundle up the auto-detect parameters\n autodetect_params = {\n \"autodetect_collection\": autodetect,\n \"content_field\": (\n self.content_field\n if self.content_field and embedding_params\n else (\n \"page_content\"\n if embedding_params\n and self.collection_data(collection_name=self.collection_name, database=database) == 0\n else None\n )\n ),\n \"ignore_invalid_documents\": self.ignore_invalid_documents,\n }\n\n # Choose HybridSearchMode based on the selected param\n hybrid_search_mode = HybridSearchMode.DEFAULT if self.search_method == \"Hybrid Search\" else HybridSearchMode.OFF\n\n # Attempt to build the Vector Store object\n try:\n vector_store = AstraDBVectorStore(\n # Astra DB Authentication Parameters\n token=self.token,\n api_endpoint=database.api_endpoint,\n namespace=database.keyspace,\n collection_name=self.collection_name,\n environment=self.environment,\n # Hybrid Search Parameters\n hybrid_search=hybrid_search_mode,\n # Astra DB Usage Tracking Parameters\n ext_callers=[(f\"{langflow_prefix}langflow\", __version__)],\n # Astra DB Vector Store Parameters\n **autodetect_params,\n **embedding_params,\n **additional_params,\n )\n except Exception as e:\n msg = f\"Error initializing AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n # Add documents to the vector store\n self._add_documents_to_vector_store(vector_store)\n\n return vector_store\n\n def _add_documents_to_vector_store(self, vector_store) -> None:\n self.ingest_data = self._prepare_ingest_data()\n\n documents = []\n for _input in self.ingest_data or []:\n if isinstance(_input, Data):\n documents.append(_input.to_lc_document())\n else:\n msg = \"Vector Store Inputs must be Data objects.\"\n raise TypeError(msg)\n\n documents = [\n Document(page_content=doc.page_content, metadata=serialize(doc.metadata, to_str=True)) for doc in documents\n ]\n\n if documents and self.deletion_field:\n self.log(f\"Deleting documents where {self.deletion_field}\")\n try:\n database = self.get_database_object()\n collection = database.get_collection(self.collection_name, keyspace=database.keyspace)\n delete_values = list({doc.metadata[self.deletion_field] for doc in documents})\n self.log(f\"Deleting documents where {self.deletion_field} matches {delete_values}.\")\n collection.delete_many({f\"metadata.{self.deletion_field}\": {\"$in\": delete_values}})\n except Exception as e:\n msg = f\"Error deleting documents from AstraDBVectorStore based on '{self.deletion_field}': {e}\"\n raise ValueError(msg) from e\n\n if documents:\n self.log(f\"Adding {len(documents)} documents to the Vector Store.\")\n try:\n vector_store.add_documents(documents)\n except Exception as e:\n msg = f\"Error adding documents to AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n else:\n self.log(\"No documents to add to the Vector Store.\")\n\n def _map_search_type(self) -> str:\n search_type_mapping = {\n \"Similarity with score threshold\": \"similarity_score_threshold\",\n \"MMR (Max Marginal Relevance)\": \"mmr\",\n }\n\n return search_type_mapping.get(self.search_type, \"similarity\")\n\n def _build_search_args(self):\n # Clean up the search query\n query = self.search_query if isinstance(self.search_query, str) and self.search_query.strip() else None\n lexical_terms = self.lexical_terms or None\n\n # Check if we have a search query, and if so set the args\n if query:\n args = {\n \"query\": query,\n \"search_type\": self._map_search_type(),\n \"k\": self.number_of_results,\n \"score_threshold\": self.search_score_threshold,\n \"lexical_query\": lexical_terms,\n }\n elif self.advanced_search_filter:\n args = {\n \"n\": self.number_of_results,\n }\n else:\n return {}\n\n filter_arg = self.advanced_search_filter or {}\n if filter_arg:\n args[\"filter\"] = filter_arg\n\n return args\n\n def search_documents(self, vector_store=None) -> list[Data]:\n vector_store = vector_store or self.build_vector_store()\n\n self.log(f\"Search input: {self.search_query}\")\n self.log(f\"Search type: {self.search_type}\")\n self.log(f\"Number of results: {self.number_of_results}\")\n self.log(f\"store.hybrid_search: {vector_store.hybrid_search}\")\n self.log(f\"Lexical terms: {self.lexical_terms}\")\n self.log(f\"Reranker: {self.reranker}\")\n\n try:\n search_args = self._build_search_args()\n except Exception as e:\n msg = f\"Error in AstraDBVectorStore._build_search_args: {e}\"\n raise ValueError(msg) from e\n\n if not search_args:\n self.log(\"No search input or filters provided. Skipping search.\")\n return []\n\n docs = []\n search_method = \"search\" if \"query\" in search_args else \"metadata_search\"\n\n try:\n self.log(f\"Calling vector_store.{search_method} with args: {search_args}\")\n docs = getattr(vector_store, search_method)(**search_args)\n except Exception as e:\n msg = f\"Error performing {search_method} in AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n self.log(f\"Retrieved documents: {len(docs)}\")\n\n data = docs_to_data(docs)\n self.log(f\"Converted documents to data: {len(data)}\")\n self.status = data\n\n return data\n\n def get_retriever_kwargs(self):\n search_args = self._build_search_args()\n\n return {\n \"search_type\": self._map_search_type(),\n \"search_kwargs\": search_args,\n }\n"
|
|
2058
|
+
"value": "from astrapy import DataAPIClient\nfrom langchain_core.documents import Document\n\nfrom lfx.base.datastax.astradb_base import AstraDBBaseComponent\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.helpers.data import docs_to_data\nfrom lfx.io import BoolInput, DropdownInput, FloatInput, HandleInput, IntInput, NestedDictInput, QueryInput, StrInput\nfrom lfx.schema.data import Data\nfrom lfx.serialization import serialize\nfrom lfx.utils.version import get_version_info\n\n\n@vector_store_connection\nclass AstraDBVectorStoreComponent(AstraDBBaseComponent, LCVectorStoreComponent):\n display_name: str = \"Astra DB\"\n description: str = \"Ingest and search documents in Astra DB\"\n documentation: str = \"https://docs.langflow.org/bundles-datastax#astra-db\"\n name = \"AstraDB\"\n icon: str = \"AstraDB\"\n\n inputs = [\n *AstraDBBaseComponent.inputs,\n *LCVectorStoreComponent.inputs,\n HandleInput(\n name=\"embedding_model\",\n display_name=\"Embedding Model\",\n input_types=[\"Embeddings\"],\n info=\"Specify the Embedding Model. Not required for Astra Vectorize collections.\",\n required=False,\n show=True,\n ),\n StrInput(\n name=\"content_field\",\n display_name=\"Content Field\",\n info=\"Field to use as the text content field for the vector store.\",\n advanced=True,\n ),\n StrInput(\n name=\"deletion_field\",\n display_name=\"Deletion Based On Field\",\n info=\"When this parameter is provided, documents in the target collection with \"\n \"metadata field values matching the input metadata field value will be deleted \"\n \"before new data is loaded.\",\n advanced=True,\n ),\n BoolInput(\n name=\"ignore_invalid_documents\",\n display_name=\"Ignore Invalid Documents\",\n info=\"Boolean flag to determine whether to ignore invalid documents at runtime.\",\n advanced=True,\n ),\n NestedDictInput(\n name=\"astradb_vectorstore_kwargs\",\n display_name=\"AstraDBVectorStore Parameters\",\n info=\"Optional dictionary of additional parameters for the AstraDBVectorStore.\",\n advanced=True,\n ),\n DropdownInput(\n name=\"search_method\",\n display_name=\"Search Method\",\n info=(\n \"Determine how your content is matched: Vector finds semantic similarity, \"\n \"and Hybrid Search (suggested) combines both approaches \"\n \"with a reranker.\"\n ),\n options=[\"Hybrid Search\", \"Vector Search\"], # TODO: Restore Lexical Search?\n options_metadata=[{\"icon\": \"SearchHybrid\"}, {\"icon\": \"SearchVector\"}],\n value=\"Vector Search\",\n advanced=True,\n real_time_refresh=True,\n ),\n DropdownInput(\n name=\"reranker\",\n display_name=\"Reranker\",\n info=\"Post-retrieval model that re-scores results for optimal relevance ranking.\",\n show=False,\n toggle=True,\n ),\n QueryInput(\n name=\"lexical_terms\",\n display_name=\"Lexical Terms\",\n info=\"Add additional terms/keywords to augment search precision.\",\n placeholder=\"Enter terms to search...\",\n separator=\" \",\n show=False,\n value=\"\",\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Number of Search Results\",\n info=\"Number of search results to return.\",\n advanced=True,\n value=4,\n ),\n DropdownInput(\n name=\"search_type\",\n display_name=\"Search Type\",\n info=\"Search type to use\",\n options=[\"Similarity\", \"Similarity with score threshold\", \"MMR (Max Marginal Relevance)\"],\n value=\"Similarity\",\n advanced=True,\n ),\n FloatInput(\n name=\"search_score_threshold\",\n display_name=\"Search Score Threshold\",\n info=\"Minimum similarity score threshold for search results. \"\n \"(when using 'Similarity with score threshold')\",\n value=0,\n advanced=True,\n ),\n NestedDictInput(\n name=\"advanced_search_filter\",\n display_name=\"Search Metadata Filter\",\n info=\"Optional dictionary of filters to apply to the search query.\",\n advanced=True,\n ),\n ]\n\n async def update_build_config(\n self,\n build_config: dict,\n field_value: str | dict,\n field_name: str | None = None,\n ) -> dict:\n \"\"\"Update build configuration with proper handling of embedding and search options.\"\"\"\n # Handle base astra db build config updates\n build_config = await super().update_build_config(\n build_config,\n field_value=field_value,\n field_name=field_name,\n )\n\n # Set embedding model display based on provider selection\n if isinstance(field_value, dict) and \"02_embedding_generation_provider\" in field_value:\n embedding_provider = field_value.get(\"02_embedding_generation_provider\")\n is_custom_provider = embedding_provider and embedding_provider != \"Bring your own\"\n provider = embedding_provider.lower() if is_custom_provider and embedding_provider is not None else None\n\n build_config[\"embedding_model\"][\"show\"] = not bool(provider)\n build_config[\"embedding_model\"][\"required\"] = not bool(provider)\n\n # Early return if no API endpoint is configured\n if not self.get_api_endpoint():\n return build_config\n\n # Configure search method and related options\n return self._configure_search_options(build_config)\n\n def _configure_search_options(self, build_config: dict) -> dict:\n \"\"\"Configure hybrid search, reranker, and vector search options.\"\"\"\n # Detect available hybrid search capabilities\n hybrid_capabilities = self._detect_hybrid_capabilities()\n\n # Return if we haven't selected a collection\n if not build_config[\"collection_name\"][\"options\"] or not build_config[\"collection_name\"][\"value\"]:\n return build_config\n\n # Get collection options\n collection_options = self._get_collection_options(build_config)\n\n # Get the selected collection index\n index = build_config[\"collection_name\"][\"options\"].index(build_config[\"collection_name\"][\"value\"])\n provider = build_config[\"collection_name\"][\"options_metadata\"][index][\"provider\"]\n build_config[\"embedding_model\"][\"show\"] = not bool(provider)\n build_config[\"embedding_model\"][\"required\"] = not bool(provider)\n\n # Determine search configuration\n is_vector_search = build_config[\"search_method\"][\"value\"] == \"Vector Search\"\n is_autodetect = build_config[\"autodetect_collection\"][\"value\"]\n\n # Apply hybrid search configuration\n if hybrid_capabilities[\"available\"]:\n build_config[\"search_method\"][\"show\"] = True\n build_config[\"search_method\"][\"options\"] = [\"Hybrid Search\", \"Vector Search\"]\n build_config[\"search_method\"][\"value\"] = build_config[\"search_method\"].get(\"value\", \"Hybrid Search\")\n\n build_config[\"reranker\"][\"options\"] = hybrid_capabilities[\"reranker_models\"]\n build_config[\"reranker\"][\"options_metadata\"] = hybrid_capabilities[\"reranker_metadata\"]\n if hybrid_capabilities[\"reranker_models\"]:\n build_config[\"reranker\"][\"value\"] = hybrid_capabilities[\"reranker_models\"][0]\n else:\n build_config[\"search_method\"][\"show\"] = False\n build_config[\"search_method\"][\"options\"] = [\"Vector Search\"]\n build_config[\"search_method\"][\"value\"] = \"Vector Search\"\n build_config[\"reranker\"][\"options\"] = []\n build_config[\"reranker\"][\"options_metadata\"] = []\n\n # Configure reranker visibility and state\n hybrid_enabled = (\n collection_options[\"rerank_enabled\"] and build_config[\"search_method\"][\"value\"] == \"Hybrid Search\"\n )\n\n build_config[\"reranker\"][\"show\"] = hybrid_enabled\n build_config[\"reranker\"][\"toggle_value\"] = hybrid_enabled\n build_config[\"reranker\"][\"toggle_disable\"] = is_vector_search\n\n # Configure lexical terms\n lexical_visible = collection_options[\"lexical_enabled\"] and not is_vector_search\n build_config[\"lexical_terms\"][\"show\"] = lexical_visible\n build_config[\"lexical_terms\"][\"value\"] = \"\" if is_vector_search else build_config[\"lexical_terms\"][\"value\"]\n\n # Configure search type and score threshold\n build_config[\"search_type\"][\"show\"] = is_vector_search\n build_config[\"search_score_threshold\"][\"show\"] = is_vector_search\n\n # Force similarity search for hybrid mode or autodetect\n if hybrid_enabled or is_autodetect:\n build_config[\"search_type\"][\"value\"] = \"Similarity\"\n\n return build_config\n\n def _detect_hybrid_capabilities(self) -> dict:\n \"\"\"Detect available hybrid search and reranking capabilities.\"\"\"\n environment = self.get_environment(self.environment)\n client = DataAPIClient(environment=environment)\n admin_client = client.get_admin()\n db_admin = admin_client.get_database_admin(self.get_api_endpoint(), token=self.token)\n\n try:\n providers = db_admin.find_reranking_providers()\n reranker_models = [\n model.name for provider_data in providers.reranking_providers.values() for model in provider_data.models\n ]\n reranker_metadata = [\n {\"icon\": self.get_provider_icon(provider_name=model.name.split(\"/\")[0])}\n for provider in providers.reranking_providers.values()\n for model in provider.models\n ]\n except (AttributeError, KeyError) as e:\n self.log(f\"Hybrid search not available: {e}\")\n return {\n \"available\": False,\n \"reranker_models\": [],\n \"reranker_metadata\": [],\n }\n else:\n return {\n \"available\": True,\n \"reranker_models\": reranker_models,\n \"reranker_metadata\": reranker_metadata,\n }\n\n def _get_collection_options(self, build_config: dict) -> dict:\n \"\"\"Retrieve collection-level search options.\"\"\"\n database = self.get_database_object(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n collection = database.get_collection(\n name=build_config[\"collection_name\"][\"value\"],\n keyspace=build_config[\"keyspace\"][\"value\"],\n )\n\n col_options = collection.options()\n\n return {\n \"rerank_enabled\": bool(col_options.rerank and col_options.rerank.enabled),\n \"lexical_enabled\": bool(col_options.lexical and col_options.lexical.enabled),\n }\n\n @check_cached_vector_store\n def build_vector_store(self):\n try:\n from langchain_astradb import AstraDBVectorStore\n from langchain_astradb.utils.astradb import HybridSearchMode\n except ImportError as e:\n msg = (\n \"Could not import langchain Astra DB integration package. \"\n \"Please install it with `pip install langchain-astradb`.\"\n )\n raise ImportError(msg) from e\n\n # Get the embedding model and additional params\n embedding_params = {\"embedding\": self.embedding_model} if self.embedding_model else {}\n\n # Get the additional parameters\n additional_params = self.astradb_vectorstore_kwargs or {}\n\n # Get Langflow version and platform information\n __version__ = get_version_info()[\"version\"]\n langflow_prefix = \"\"\n # if os.getenv(\"AWS_EXECUTION_ENV\") == \"AWS_ECS_FARGATE\": # TODO: More precise way of detecting\n # langflow_prefix = \"ds-\"\n\n # Get the database object\n database = self.get_database_object()\n autodetect = self.collection_name in database.list_collection_names() and self.autodetect_collection\n\n # Bundle up the auto-detect parameters\n autodetect_params = {\n \"autodetect_collection\": autodetect,\n \"content_field\": (\n self.content_field\n if self.content_field and embedding_params\n else (\n \"page_content\"\n if embedding_params\n and self.collection_data(collection_name=self.collection_name, database=database) == 0\n else None\n )\n ),\n \"ignore_invalid_documents\": self.ignore_invalid_documents,\n }\n\n # Choose HybridSearchMode based on the selected param\n hybrid_search_mode = HybridSearchMode.DEFAULT if self.search_method == \"Hybrid Search\" else HybridSearchMode.OFF\n\n # Attempt to build the Vector Store object\n try:\n vector_store = AstraDBVectorStore(\n # Astra DB Authentication Parameters\n token=self.token,\n api_endpoint=database.api_endpoint,\n namespace=database.keyspace,\n collection_name=self.collection_name,\n environment=self.environment,\n # Hybrid Search Parameters\n hybrid_search=hybrid_search_mode,\n # Astra DB Usage Tracking Parameters\n ext_callers=[(f\"{langflow_prefix}langflow\", __version__)],\n # Astra DB Vector Store Parameters\n **autodetect_params,\n **embedding_params,\n **additional_params,\n )\n except ValueError as e:\n msg = f\"Error initializing AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n # Add documents to the vector store\n self._add_documents_to_vector_store(vector_store)\n\n return vector_store\n\n def _add_documents_to_vector_store(self, vector_store) -> None:\n self.ingest_data = self._prepare_ingest_data()\n\n documents = []\n for _input in self.ingest_data or []:\n if isinstance(_input, Data):\n documents.append(_input.to_lc_document())\n else:\n msg = \"Vector Store Inputs must be Data objects.\"\n raise TypeError(msg)\n\n documents = [\n Document(page_content=doc.page_content, metadata=serialize(doc.metadata, to_str=True)) for doc in documents\n ]\n\n if documents and self.deletion_field:\n self.log(f\"Deleting documents where {self.deletion_field}\")\n try:\n database = self.get_database_object()\n collection = database.get_collection(self.collection_name, keyspace=database.keyspace)\n delete_values = list({doc.metadata[self.deletion_field] for doc in documents})\n self.log(f\"Deleting documents where {self.deletion_field} matches {delete_values}.\")\n collection.delete_many({f\"metadata.{self.deletion_field}\": {\"$in\": delete_values}})\n except ValueError as e:\n msg = f\"Error deleting documents from AstraDBVectorStore based on '{self.deletion_field}': {e}\"\n raise ValueError(msg) from e\n\n if documents:\n self.log(f\"Adding {len(documents)} documents to the Vector Store.\")\n try:\n vector_store.add_documents(documents)\n except ValueError as e:\n msg = f\"Error adding documents to AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n else:\n self.log(\"No documents to add to the Vector Store.\")\n\n def _map_search_type(self) -> str:\n search_type_mapping = {\n \"Similarity with score threshold\": \"similarity_score_threshold\",\n \"MMR (Max Marginal Relevance)\": \"mmr\",\n }\n\n return search_type_mapping.get(self.search_type, \"similarity\")\n\n def _build_search_args(self):\n # Clean up the search query\n query = self.search_query if isinstance(self.search_query, str) and self.search_query.strip() else None\n lexical_terms = self.lexical_terms or None\n\n # Check if we have a search query, and if so set the args\n if query:\n args = {\n \"query\": query,\n \"search_type\": self._map_search_type(),\n \"k\": self.number_of_results,\n \"score_threshold\": self.search_score_threshold,\n \"lexical_query\": lexical_terms,\n }\n elif self.advanced_search_filter:\n args = {\n \"n\": self.number_of_results,\n }\n else:\n return {}\n\n filter_arg = self.advanced_search_filter or {}\n if filter_arg:\n args[\"filter\"] = filter_arg\n\n return args\n\n def search_documents(self, vector_store=None) -> list[Data]:\n vector_store = vector_store or self.build_vector_store()\n\n self.log(f\"Search input: {self.search_query}\")\n self.log(f\"Search type: {self.search_type}\")\n self.log(f\"Number of results: {self.number_of_results}\")\n self.log(f\"store.hybrid_search: {vector_store.hybrid_search}\")\n self.log(f\"Lexical terms: {self.lexical_terms}\")\n self.log(f\"Reranker: {self.reranker}\")\n\n try:\n search_args = self._build_search_args()\n except ValueError as e:\n msg = f\"Error in AstraDBVectorStore._build_search_args: {e}\"\n raise ValueError(msg) from e\n\n if not search_args:\n self.log(\"No search input or filters provided. Skipping search.\")\n return []\n\n docs = []\n search_method = \"search\" if \"query\" in search_args else \"metadata_search\"\n\n try:\n self.log(f\"Calling vector_store.{search_method} with args: {search_args}\")\n docs = getattr(vector_store, search_method)(**search_args)\n except ValueError as e:\n msg = f\"Error performing {search_method} in AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n self.log(f\"Retrieved documents: {len(docs)}\")\n\n data = docs_to_data(docs)\n self.log(f\"Converted documents to data: {len(data)}\")\n self.status = data\n\n return data\n\n def get_retriever_kwargs(self):\n search_args = self._build_search_args()\n\n return {\n \"search_type\": self._map_search_type(),\n \"search_kwargs\": search_args,\n }\n"
|
|
2059
2059
|
},
|
|
2060
2060
|
"collection_name": {
|
|
2061
2061
|
"_input_type": "DropdownInput",
|
|
@@ -737,7 +737,7 @@
|
|
|
737
737
|
"last_updated": "2025-09-29T18:32:20.563Z",
|
|
738
738
|
"legacy": false,
|
|
739
739
|
"metadata": {
|
|
740
|
-
"code_hash": "
|
|
740
|
+
"code_hash": "c753e92261ae",
|
|
741
741
|
"dependencies": {
|
|
742
742
|
"dependencies": [
|
|
743
743
|
{
|
|
@@ -867,7 +867,7 @@
|
|
|
867
867
|
"show": true,
|
|
868
868
|
"title_case": false,
|
|
869
869
|
"type": "code",
|
|
870
|
-
"value": "from __future__ import annotations\n\nimport asyncio\nimport contextlib\nimport hashlib\nimport json\nimport re\nimport uuid\nfrom dataclasses import asdict, dataclass, field\nfrom datetime import datetime, timezone\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any\n\nimport pandas as pd\nfrom cryptography.fernet import InvalidToken\nfrom langchain_chroma import Chroma\nfrom langflow.services.auth.utils import decrypt_api_key, encrypt_api_key\nfrom langflow.services.database.models.user.crud import get_user_by_id\n\nfrom lfx.base.knowledge_bases.knowledge_base_utils import get_knowledge_bases\nfrom lfx.base.models.openai_constants import OPENAI_EMBEDDING_MODEL_NAMES\nfrom lfx.components.processing.converter import convert_to_dataframe\nfrom lfx.custom import Component\nfrom lfx.io import (\n BoolInput,\n DropdownInput,\n HandleInput,\n IntInput,\n Output,\n SecretStrInput,\n StrInput,\n TableInput,\n)\nfrom lfx.schema.data import Data\nfrom lfx.schema.table import EditMode\nfrom lfx.services.deps import (\n get_settings_service,\n get_variable_service,\n session_scope,\n)\n\nif TYPE_CHECKING:\n from lfx.schema.dataframe import DataFrame\n\nHUGGINGFACE_MODEL_NAMES = [\n \"sentence-transformers/all-MiniLM-L6-v2\",\n \"sentence-transformers/all-mpnet-base-v2\",\n]\nCOHERE_MODEL_NAMES = [\"embed-english-v3.0\", \"embed-multilingual-v3.0\"]\n\nsettings = get_settings_service().settings\nknowledge_directory = settings.knowledge_bases_dir\nif not knowledge_directory:\n msg = \"Knowledge bases directory is not set in the settings.\"\n raise ValueError(msg)\nKNOWLEDGE_BASES_ROOT_PATH = Path(knowledge_directory).expanduser()\n\n\nclass KnowledgeIngestionComponent(Component):\n \"\"\"Create or append to Langflow Knowledge from a DataFrame.\"\"\"\n\n # ------ UI metadata ---------------------------------------------------\n display_name = \"Knowledge Ingestion\"\n description = \"Create or update knowledge in Langflow.\"\n icon = \"upload\"\n name = \"KnowledgeIngestion\"\n\n def __init__(self, *args, **kwargs) -> None:\n super().__init__(*args, **kwargs)\n self._cached_kb_path: Path | None = None\n\n @dataclass\n class NewKnowledgeBaseInput:\n functionality: str = \"create\"\n fields: dict[str, dict] = field(\n default_factory=lambda: {\n \"data\": {\n \"node\": {\n \"name\": \"create_knowledge_base\",\n \"description\": \"Create new knowledge in Langflow.\",\n \"display_name\": \"Create new knowledge\",\n \"field_order\": [\n \"01_new_kb_name\",\n \"02_embedding_model\",\n \"03_api_key\",\n ],\n \"template\": {\n \"01_new_kb_name\": StrInput(\n name=\"new_kb_name\",\n display_name=\"Knowledge Name\",\n info=\"Name of the new knowledge to create.\",\n required=True,\n ),\n \"02_embedding_model\": DropdownInput(\n name=\"embedding_model\",\n display_name=\"Choose Embedding\",\n info=\"Select the embedding model to use for this knowledge base.\",\n required=True,\n options=OPENAI_EMBEDDING_MODEL_NAMES + HUGGINGFACE_MODEL_NAMES + COHERE_MODEL_NAMES,\n options_metadata=[{\"icon\": \"OpenAI\"} for _ in OPENAI_EMBEDDING_MODEL_NAMES]\n + [{\"icon\": \"HuggingFace\"} for _ in HUGGINGFACE_MODEL_NAMES]\n + [{\"icon\": \"Cohere\"} for _ in COHERE_MODEL_NAMES],\n ),\n \"03_api_key\": SecretStrInput(\n name=\"api_key\",\n display_name=\"API Key\",\n info=\"Provider API key for embedding model\",\n required=True,\n load_from_db=False,\n ),\n },\n },\n }\n }\n )\n\n # ------ Inputs --------------------------------------------------------\n inputs = [\n DropdownInput(\n name=\"knowledge_base\",\n display_name=\"Knowledge\",\n info=\"Select the knowledge to load data from.\",\n required=True,\n options=[],\n refresh_button=True,\n real_time_refresh=True,\n dialog_inputs=asdict(NewKnowledgeBaseInput()),\n ),\n HandleInput(\n name=\"input_df\",\n display_name=\"Input\",\n info=(\n \"Table with all original columns (already chunked / processed). \"\n \"Accepts Data or DataFrame. If Data is provided, it is converted to a DataFrame automatically.\"\n ),\n input_types=[\"Data\", \"DataFrame\"],\n required=True,\n ),\n TableInput(\n name=\"column_config\",\n display_name=\"Column Configuration\",\n info=\"Configure column behavior for the knowledge base.\",\n required=True,\n table_schema=[\n {\n \"name\": \"column_name\",\n \"display_name\": \"Column Name\",\n \"type\": \"str\",\n \"description\": \"Name of the column in the source DataFrame\",\n \"edit_mode\": EditMode.INLINE,\n },\n {\n \"name\": \"vectorize\",\n \"display_name\": \"Vectorize\",\n \"type\": \"boolean\",\n \"description\": \"Create embeddings for this column\",\n \"default\": False,\n \"edit_mode\": EditMode.INLINE,\n },\n {\n \"name\": \"identifier\",\n \"display_name\": \"Identifier\",\n \"type\": \"boolean\",\n \"description\": \"Use this column as unique identifier\",\n \"default\": False,\n \"edit_mode\": EditMode.INLINE,\n },\n ],\n value=[\n {\n \"column_name\": \"text\",\n \"vectorize\": True,\n \"identifier\": True,\n },\n ],\n ),\n IntInput(\n name=\"chunk_size\",\n display_name=\"Chunk Size\",\n info=\"Batch size for processing embeddings\",\n advanced=True,\n value=1000,\n ),\n SecretStrInput(\n name=\"api_key\",\n display_name=\"Embedding Provider API Key\",\n info=\"API key for the embedding provider to generate embeddings.\",\n advanced=True,\n required=False,\n ),\n BoolInput(\n name=\"allow_duplicates\",\n display_name=\"Allow Duplicates\",\n info=\"Allow duplicate rows in the knowledge base\",\n advanced=True,\n value=False,\n ),\n ]\n\n # ------ Outputs -------------------------------------------------------\n outputs = [Output(display_name=\"Results\", name=\"dataframe_output\", method=\"build_kb_info\")]\n\n # ------ Internal helpers ---------------------------------------------\n def _get_kb_root(self) -> Path:\n \"\"\"Return the root directory for knowledge bases.\"\"\"\n return KNOWLEDGE_BASES_ROOT_PATH\n\n def _validate_column_config(self, df_source: pd.DataFrame) -> list[dict[str, Any]]:\n \"\"\"Validate column configuration using Structured Output patterns.\"\"\"\n if not self.column_config:\n msg = \"Column configuration cannot be empty\"\n raise ValueError(msg)\n\n # Convert table input to list of dicts (similar to Structured Output)\n config_list = self.column_config if isinstance(self.column_config, list) else []\n\n # Validate column names exist in DataFrame\n df_columns = set(df_source.columns)\n for config in config_list:\n col_name = config.get(\"column_name\")\n if col_name not in df_columns:\n msg = f\"Column '{col_name}' not found in DataFrame. Available columns: {sorted(df_columns)}\"\n raise ValueError(msg)\n\n return config_list\n\n def _get_embedding_provider(self, embedding_model: str) -> str:\n \"\"\"Get embedding provider by matching model name to lists.\"\"\"\n if embedding_model in OPENAI_EMBEDDING_MODEL_NAMES:\n return \"OpenAI\"\n if embedding_model in HUGGINGFACE_MODEL_NAMES:\n return \"HuggingFace\"\n if embedding_model in COHERE_MODEL_NAMES:\n return \"Cohere\"\n return \"Custom\"\n\n def _build_embeddings(self, embedding_model: str, api_key: str):\n \"\"\"Build embedding model using provider patterns.\"\"\"\n # Get provider by matching model name to lists\n provider = self._get_embedding_provider(embedding_model)\n\n # Validate provider and model\n if provider == \"OpenAI\":\n from langchain_openai import OpenAIEmbeddings\n\n if not api_key:\n msg = \"OpenAI API key is required when using OpenAI provider\"\n raise ValueError(msg)\n return OpenAIEmbeddings(\n model=embedding_model,\n api_key=api_key,\n chunk_size=self.chunk_size,\n )\n if provider == \"HuggingFace\":\n from langchain_huggingface import HuggingFaceEmbeddings\n\n return HuggingFaceEmbeddings(\n model=embedding_model,\n )\n if provider == \"Cohere\":\n from langchain_cohere import CohereEmbeddings\n\n if not api_key:\n msg = \"Cohere API key is required when using Cohere provider\"\n raise ValueError(msg)\n return CohereEmbeddings(\n model=embedding_model,\n cohere_api_key=api_key,\n )\n if provider == \"Custom\":\n # For custom embedding models, we would need additional configuration\n msg = \"Custom embedding models not yet supported\"\n raise NotImplementedError(msg)\n msg = f\"Unknown provider: {provider}\"\n raise ValueError(msg)\n\n def _build_embedding_metadata(self, embedding_model, api_key) -> dict[str, Any]:\n \"\"\"Build embedding model metadata.\"\"\"\n # Get provider by matching model name to lists\n embedding_provider = self._get_embedding_provider(embedding_model)\n\n api_key_to_save = None\n if api_key and hasattr(api_key, \"get_secret_value\"):\n api_key_to_save = api_key.get_secret_value()\n elif isinstance(api_key, str):\n api_key_to_save = api_key\n\n encrypted_api_key = None\n if api_key_to_save:\n settings_service = get_settings_service()\n try:\n encrypted_api_key = encrypt_api_key(api_key_to_save, settings_service=settings_service)\n except (TypeError, ValueError) as e:\n self.log(f\"Could not encrypt API key: {e}\")\n\n return {\n \"embedding_provider\": embedding_provider,\n \"embedding_model\": embedding_model,\n \"api_key\": encrypted_api_key,\n \"api_key_used\": bool(api_key),\n \"chunk_size\": self.chunk_size,\n \"created_at\": datetime.now(timezone.utc).isoformat(),\n }\n\n def _save_embedding_metadata(self, kb_path: Path, embedding_model: str, api_key: str) -> None:\n \"\"\"Save embedding model metadata.\"\"\"\n embedding_metadata = self._build_embedding_metadata(embedding_model, api_key)\n metadata_path = kb_path / \"embedding_metadata.json\"\n metadata_path.write_text(json.dumps(embedding_metadata, indent=2))\n\n def _save_kb_files(\n self,\n kb_path: Path,\n config_list: list[dict[str, Any]],\n ) -> None:\n \"\"\"Save KB files using File Component storage patterns.\"\"\"\n try:\n # Create directory (following File Component patterns)\n kb_path.mkdir(parents=True, exist_ok=True)\n\n # Save column configuration\n # Only do this if the file doesn't exist already\n cfg_path = kb_path / \"schema.json\"\n if not cfg_path.exists():\n cfg_path.write_text(json.dumps(config_list, indent=2))\n\n except (OSError, TypeError, ValueError) as e:\n self.log(f\"Error saving KB files: {e}\")\n\n def _build_column_metadata(self, config_list: list[dict[str, Any]], df_source: pd.DataFrame) -> dict[str, Any]:\n \"\"\"Build detailed column metadata.\"\"\"\n metadata: dict[str, Any] = {\n \"total_columns\": len(df_source.columns),\n \"mapped_columns\": len(config_list),\n \"unmapped_columns\": len(df_source.columns) - len(config_list),\n \"columns\": [],\n \"summary\": {\"vectorized_columns\": [], \"identifier_columns\": []},\n }\n\n for config in config_list:\n col_name = config.get(\"column_name\")\n vectorize = config.get(\"vectorize\") == \"True\" or config.get(\"vectorize\") is True\n identifier = config.get(\"identifier\") == \"True\" or config.get(\"identifier\") is True\n\n # Add to columns list\n metadata[\"columns\"].append(\n {\n \"name\": col_name,\n \"vectorize\": vectorize,\n \"identifier\": identifier,\n }\n )\n\n # Update summary\n if vectorize:\n metadata[\"summary\"][\"vectorized_columns\"].append(col_name)\n if identifier:\n metadata[\"summary\"][\"identifier_columns\"].append(col_name)\n\n return metadata\n\n async def _create_vector_store(\n self,\n df_source: pd.DataFrame,\n config_list: list[dict[str, Any]],\n embedding_model: str,\n api_key: str,\n ) -> None:\n \"\"\"Create vector store following Local DB component pattern.\"\"\"\n try:\n # Set up vector store directory\n vector_store_dir = await self._kb_path()\n if not vector_store_dir:\n msg = \"Knowledge base path is not set. Please create a new knowledge base first.\"\n raise ValueError(msg)\n vector_store_dir.mkdir(parents=True, exist_ok=True)\n\n # Create embeddings model\n embedding_function = self._build_embeddings(embedding_model, api_key)\n\n # Convert DataFrame to Data objects (following Local DB pattern)\n data_objects = await self._convert_df_to_data_objects(df_source, config_list)\n\n # Create vector store\n chroma = Chroma(\n persist_directory=str(vector_store_dir),\n embedding_function=embedding_function,\n collection_name=self.knowledge_base,\n )\n\n # Convert Data objects to LangChain Documents\n documents = []\n for data_obj in data_objects:\n doc = data_obj.to_lc_document()\n documents.append(doc)\n\n # Add documents to vector store\n if documents:\n chroma.add_documents(documents)\n self.log(f\"Added {len(documents)} documents to vector store '{self.knowledge_base}'\")\n\n except (OSError, ValueError, RuntimeError) as e:\n self.log(f\"Error creating vector store: {e}\")\n\n async def _convert_df_to_data_objects(\n self, df_source: pd.DataFrame, config_list: list[dict[str, Any]]\n ) -> list[Data]:\n \"\"\"Convert DataFrame to Data objects for vector store.\"\"\"\n data_objects: list[Data] = []\n\n # Set up vector store directory\n kb_path = await self._kb_path()\n\n # If we don't allow duplicates, we need to get the existing hashes\n chroma = Chroma(\n persist_directory=str(kb_path),\n collection_name=self.knowledge_base,\n )\n\n # Get all documents and their metadata\n all_docs = chroma.get()\n\n # Extract all _id values from metadata\n id_list = [metadata.get(\"_id\") for metadata in all_docs[\"metadatas\"] if metadata.get(\"_id\")]\n\n # Get column roles\n content_cols = []\n identifier_cols = []\n\n for config in config_list:\n col_name = config.get(\"column_name\")\n vectorize = config.get(\"vectorize\") == \"True\" or config.get(\"vectorize\") is True\n identifier = config.get(\"identifier\") == \"True\" or config.get(\"identifier\") is True\n\n if vectorize:\n content_cols.append(col_name)\n elif identifier:\n identifier_cols.append(col_name)\n\n # Convert each row to a Data object\n for _, row in df_source.iterrows():\n # Build content text from identifier columns using list comprehension\n identifier_parts = [str(row[col]) for col in content_cols if col in row and pd.notna(row[col])]\n\n # Join all parts into a single string\n page_content = \" \".join(identifier_parts)\n\n # Build metadata from NON-vectorized columns only (simple key-value pairs)\n data_dict = {\n \"text\": page_content, # Main content for vectorization\n }\n\n # Add identifier columns if they exist\n if identifier_cols:\n identifier_parts = [str(row[col]) for col in identifier_cols if col in row and pd.notna(row[col])]\n page_content = \" \".join(identifier_parts)\n\n # Add metadata columns as simple key-value pairs\n for col in df_source.columns:\n if col not in content_cols and col in row and pd.notna(row[col]):\n # Convert to simple types for Chroma metadata\n value = row[col]\n data_dict[col] = str(value) # Convert complex types to string\n\n # Hash the page_content for unique ID\n page_content_hash = hashlib.sha256(page_content.encode()).hexdigest()\n data_dict[\"_id\"] = page_content_hash\n\n # If duplicates are disallowed, and hash exists, prevent adding this row\n if not self.allow_duplicates and page_content_hash in id_list:\n self.log(f\"Skipping duplicate row with hash {page_content_hash}\")\n continue\n\n # Create Data object - everything except \"text\" becomes metadata\n data_obj = Data(data=data_dict)\n data_objects.append(data_obj)\n\n return data_objects\n\n def is_valid_collection_name(self, name, min_length: int = 3, max_length: int = 63) -> bool:\n \"\"\"Validates collection name against conditions 1-3.\n\n 1. Contains 3-63 characters\n 2. Starts and ends with alphanumeric character\n 3. Contains only alphanumeric characters, underscores, or hyphens.\n\n Args:\n name (str): Collection name to validate\n min_length (int): Minimum length of the name\n max_length (int): Maximum length of the name\n\n Returns:\n bool: True if valid, False otherwise\n \"\"\"\n # Check length (condition 1)\n if not (min_length <= len(name) <= max_length):\n return False\n\n # Check start/end with alphanumeric (condition 2)\n if not (name[0].isalnum() and name[-1].isalnum()):\n return False\n\n # Check allowed characters (condition 3)\n return re.match(r\"^[a-zA-Z0-9_-]+$\", name) is not None\n\n async def _kb_path(self) -> Path | None:\n # Check if we already have the path cached\n cached_path = getattr(self, \"_cached_kb_path\", None)\n if cached_path is not None:\n return cached_path\n\n # If not cached, compute it\n async with session_scope() as db:\n if not self.user_id:\n msg = \"User ID is required for fetching knowledge base path.\"\n raise ValueError(msg)\n current_user = await get_user_by_id(db, self.user_id)\n if not current_user:\n msg = f\"User with ID {self.user_id} not found.\"\n raise ValueError(msg)\n kb_user = current_user.username\n\n kb_root = self._get_kb_root()\n\n # Cache the result\n self._cached_kb_path = kb_root / kb_user / self.knowledge_base\n\n return self._cached_kb_path\n\n # ---------------------------------------------------------------------\n # OUTPUT METHODS\n # ---------------------------------------------------------------------\n async def build_kb_info(self) -> Data:\n \"\"\"Main ingestion routine → returns a dict with KB metadata.\"\"\"\n try:\n input_value = self.input_df[0] if isinstance(self.input_df, list) else self.input_df\n df_source: DataFrame = convert_to_dataframe(input_value, auto_parse=False)\n\n # Validate column configuration (using Structured Output patterns)\n config_list = self._validate_column_config(df_source)\n column_metadata = self._build_column_metadata(config_list, df_source)\n\n # Read the embedding info from the knowledge base folder\n kb_path = await self._kb_path()\n if not kb_path:\n msg = \"Knowledge base path is not set. Please create a new knowledge base first.\"\n raise ValueError(msg)\n metadata_path = kb_path / \"embedding_metadata.json\"\n\n # If the API key is not provided, try to read it from the metadata file\n if metadata_path.exists():\n settings_service = get_settings_service()\n metadata = json.loads(metadata_path.read_text())\n embedding_model = metadata.get(\"embedding_model\")\n try:\n api_key = decrypt_api_key(metadata[\"api_key\"], settings_service)\n except (InvalidToken, TypeError, ValueError) as e:\n self.log(f\"Could not decrypt API key. Please provide it manually. Error: {e}\")\n\n # Check if a custom API key was provided, update metadata if so\n if self.api_key:\n api_key = self.api_key\n self._save_embedding_metadata(\n kb_path=kb_path,\n embedding_model=embedding_model,\n api_key=api_key,\n )\n\n # Create vector store following Local DB component pattern\n await self._create_vector_store(df_source, config_list, embedding_model=embedding_model, api_key=api_key)\n\n # Save KB files (using File Component storage patterns)\n self._save_kb_files(kb_path, config_list)\n\n # Build metadata response\n meta: dict[str, Any] = {\n \"kb_id\": str(uuid.uuid4()),\n \"kb_name\": self.knowledge_base,\n \"rows\": len(df_source),\n \"column_metadata\": column_metadata,\n \"path\": str(kb_path),\n \"config_columns\": len(config_list),\n \"timestamp\": datetime.now(tz=timezone.utc).isoformat(),\n }\n\n # Set status message\n self.status = f\"✅ KB **{self.knowledge_base}** saved · {len(df_source)} chunks.\"\n\n return Data(data=meta)\n\n except (OSError, ValueError, RuntimeError, KeyError) as e:\n msg = f\"Error during KB ingestion: {e}\"\n raise RuntimeError(msg) from e\n\n async def _get_api_key_variable(self, field_value: dict[str, Any]):\n async with session_scope() as db:\n if not self.user_id:\n msg = \"User ID is required for fetching global variables.\"\n raise ValueError(msg)\n current_user = await get_user_by_id(db, self.user_id)\n if not current_user:\n msg = f\"User with ID {self.user_id} not found.\"\n raise ValueError(msg)\n variable_service = get_variable_service()\n\n # Process the api_key field variable\n return await variable_service.get_variable(\n user_id=current_user.id,\n name=field_value[\"03_api_key\"],\n field=\"\",\n session=db,\n )\n\n async def update_build_config(\n self,\n build_config,\n field_value: Any,\n field_name: str | None = None,\n ):\n \"\"\"Update build configuration based on provider selection.\"\"\"\n # Create a new knowledge base\n if field_name == \"knowledge_base\":\n async with session_scope() as db:\n if not self.user_id:\n msg = \"User ID is required for fetching knowledge base list.\"\n raise ValueError(msg)\n current_user = await get_user_by_id(db, self.user_id)\n if not current_user:\n msg = f\"User with ID {self.user_id} not found.\"\n raise ValueError(msg)\n kb_user = current_user.username\n if isinstance(field_value, dict) and \"01_new_kb_name\" in field_value:\n # Validate the knowledge base name - Make sure it follows these rules:\n if not self.is_valid_collection_name(field_value[\"01_new_kb_name\"]):\n msg = f\"Invalid knowledge base name: {field_value['01_new_kb_name']}\"\n raise ValueError(msg)\n\n api_key = field_value.get(\"03_api_key\", None)\n with contextlib.suppress(Exception):\n # If the API key is a variable, resolve it\n api_key = await self._get_api_key_variable(field_value)\n\n # Make sure api_key is a string\n if not isinstance(api_key, str):\n msg = \"API key must be a string.\"\n raise ValueError(msg)\n\n # We need to test the API Key one time against the embedding model\n embed_model = self._build_embeddings(embedding_model=field_value[\"02_embedding_model\"], api_key=api_key)\n\n # Try to generate a dummy embedding to validate the API key without blocking the event loop\n try:\n await asyncio.wait_for(\n asyncio.to_thread(embed_model.embed_query, \"test\"),\n timeout=10,\n )\n except TimeoutError as e:\n msg = \"Embedding validation timed out. Please verify network connectivity and key.\"\n raise ValueError(msg) from e\n except Exception as e:\n msg = f\"Embedding validation failed: {e!s}\"\n raise ValueError(msg) from e\n\n # Create the new knowledge base directory\n kb_path = KNOWLEDGE_BASES_ROOT_PATH / kb_user / field_value[\"01_new_kb_name\"]\n kb_path.mkdir(parents=True, exist_ok=True)\n\n # Save the embedding metadata\n build_config[\"knowledge_base\"][\"value\"] = field_value[\"01_new_kb_name\"]\n self._save_embedding_metadata(\n kb_path=kb_path,\n embedding_model=field_value[\"02_embedding_model\"],\n api_key=api_key,\n )\n\n # Update the knowledge base options dynamically\n build_config[\"knowledge_base\"][\"options\"] = await get_knowledge_bases(\n KNOWLEDGE_BASES_ROOT_PATH,\n user_id=self.user_id,\n )\n\n # If the selected knowledge base is not available, reset it\n if build_config[\"knowledge_base\"][\"value\"] not in build_config[\"knowledge_base\"][\"options\"]:\n build_config[\"knowledge_base\"][\"value\"] = None\n\n return build_config\n"
|
|
870
|
+
"value": "from __future__ import annotations\n\nimport asyncio\nimport contextlib\nimport hashlib\nimport json\nimport re\nimport uuid\nfrom dataclasses import asdict, dataclass, field\nfrom datetime import datetime, timezone\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any\n\nimport pandas as pd\nfrom cryptography.fernet import InvalidToken\nfrom langchain_chroma import Chroma\nfrom langflow.services.auth.utils import decrypt_api_key, encrypt_api_key\nfrom langflow.services.database.models.user.crud import get_user_by_id\n\nfrom lfx.base.knowledge_bases.knowledge_base_utils import get_knowledge_bases\nfrom lfx.base.models.openai_constants import OPENAI_EMBEDDING_MODEL_NAMES\nfrom lfx.components.processing.converter import convert_to_dataframe\nfrom lfx.custom import Component\nfrom lfx.io import (\n BoolInput,\n DropdownInput,\n HandleInput,\n IntInput,\n Output,\n SecretStrInput,\n StrInput,\n TableInput,\n)\nfrom lfx.schema.data import Data\nfrom lfx.schema.table import EditMode\nfrom lfx.services.deps import (\n get_settings_service,\n get_variable_service,\n session_scope,\n)\n\nif TYPE_CHECKING:\n from lfx.schema.dataframe import DataFrame\n\nHUGGINGFACE_MODEL_NAMES = [\n \"sentence-transformers/all-MiniLM-L6-v2\",\n \"sentence-transformers/all-mpnet-base-v2\",\n]\nCOHERE_MODEL_NAMES = [\"embed-english-v3.0\", \"embed-multilingual-v3.0\"]\n\n_KNOWLEDGE_BASES_ROOT_PATH: Path | None = None\n\n\ndef _get_knowledge_bases_root_path() -> Path:\n \"\"\"Lazy load the knowledge bases root path from settings.\"\"\"\n global _KNOWLEDGE_BASES_ROOT_PATH # noqa: PLW0603\n if _KNOWLEDGE_BASES_ROOT_PATH is None:\n settings = get_settings_service().settings\n knowledge_directory = settings.knowledge_bases_dir\n if not knowledge_directory:\n msg = \"Knowledge bases directory is not set in the settings.\"\n raise ValueError(msg)\n _KNOWLEDGE_BASES_ROOT_PATH = Path(knowledge_directory).expanduser()\n return _KNOWLEDGE_BASES_ROOT_PATH\n\n\nclass KnowledgeIngestionComponent(Component):\n \"\"\"Create or append to Langflow Knowledge from a DataFrame.\"\"\"\n\n # ------ UI metadata ---------------------------------------------------\n display_name = \"Knowledge Ingestion\"\n description = \"Create or update knowledge in Langflow.\"\n icon = \"upload\"\n name = \"KnowledgeIngestion\"\n\n def __init__(self, *args, **kwargs) -> None:\n super().__init__(*args, **kwargs)\n self._cached_kb_path: Path | None = None\n\n @dataclass\n class NewKnowledgeBaseInput:\n functionality: str = \"create\"\n fields: dict[str, dict] = field(\n default_factory=lambda: {\n \"data\": {\n \"node\": {\n \"name\": \"create_knowledge_base\",\n \"description\": \"Create new knowledge in Langflow.\",\n \"display_name\": \"Create new knowledge\",\n \"field_order\": [\n \"01_new_kb_name\",\n \"02_embedding_model\",\n \"03_api_key\",\n ],\n \"template\": {\n \"01_new_kb_name\": StrInput(\n name=\"new_kb_name\",\n display_name=\"Knowledge Name\",\n info=\"Name of the new knowledge to create.\",\n required=True,\n ),\n \"02_embedding_model\": DropdownInput(\n name=\"embedding_model\",\n display_name=\"Choose Embedding\",\n info=\"Select the embedding model to use for this knowledge base.\",\n required=True,\n options=OPENAI_EMBEDDING_MODEL_NAMES + HUGGINGFACE_MODEL_NAMES + COHERE_MODEL_NAMES,\n options_metadata=[{\"icon\": \"OpenAI\"} for _ in OPENAI_EMBEDDING_MODEL_NAMES]\n + [{\"icon\": \"HuggingFace\"} for _ in HUGGINGFACE_MODEL_NAMES]\n + [{\"icon\": \"Cohere\"} for _ in COHERE_MODEL_NAMES],\n ),\n \"03_api_key\": SecretStrInput(\n name=\"api_key\",\n display_name=\"API Key\",\n info=\"Provider API key for embedding model\",\n required=True,\n load_from_db=False,\n ),\n },\n },\n }\n }\n )\n\n # ------ Inputs --------------------------------------------------------\n inputs = [\n DropdownInput(\n name=\"knowledge_base\",\n display_name=\"Knowledge\",\n info=\"Select the knowledge to load data from.\",\n required=True,\n options=[],\n refresh_button=True,\n real_time_refresh=True,\n dialog_inputs=asdict(NewKnowledgeBaseInput()),\n ),\n HandleInput(\n name=\"input_df\",\n display_name=\"Input\",\n info=(\n \"Table with all original columns (already chunked / processed). \"\n \"Accepts Data or DataFrame. If Data is provided, it is converted to a DataFrame automatically.\"\n ),\n input_types=[\"Data\", \"DataFrame\"],\n required=True,\n ),\n TableInput(\n name=\"column_config\",\n display_name=\"Column Configuration\",\n info=\"Configure column behavior for the knowledge base.\",\n required=True,\n table_schema=[\n {\n \"name\": \"column_name\",\n \"display_name\": \"Column Name\",\n \"type\": \"str\",\n \"description\": \"Name of the column in the source DataFrame\",\n \"edit_mode\": EditMode.INLINE,\n },\n {\n \"name\": \"vectorize\",\n \"display_name\": \"Vectorize\",\n \"type\": \"boolean\",\n \"description\": \"Create embeddings for this column\",\n \"default\": False,\n \"edit_mode\": EditMode.INLINE,\n },\n {\n \"name\": \"identifier\",\n \"display_name\": \"Identifier\",\n \"type\": \"boolean\",\n \"description\": \"Use this column as unique identifier\",\n \"default\": False,\n \"edit_mode\": EditMode.INLINE,\n },\n ],\n value=[\n {\n \"column_name\": \"text\",\n \"vectorize\": True,\n \"identifier\": True,\n },\n ],\n ),\n IntInput(\n name=\"chunk_size\",\n display_name=\"Chunk Size\",\n info=\"Batch size for processing embeddings\",\n advanced=True,\n value=1000,\n ),\n SecretStrInput(\n name=\"api_key\",\n display_name=\"Embedding Provider API Key\",\n info=\"API key for the embedding provider to generate embeddings.\",\n advanced=True,\n required=False,\n ),\n BoolInput(\n name=\"allow_duplicates\",\n display_name=\"Allow Duplicates\",\n info=\"Allow duplicate rows in the knowledge base\",\n advanced=True,\n value=False,\n ),\n ]\n\n # ------ Outputs -------------------------------------------------------\n outputs = [Output(display_name=\"Results\", name=\"dataframe_output\", method=\"build_kb_info\")]\n\n # ------ Internal helpers ---------------------------------------------\n def _get_kb_root(self) -> Path:\n \"\"\"Return the root directory for knowledge bases.\"\"\"\n return _get_knowledge_bases_root_path()\n\n def _validate_column_config(self, df_source: pd.DataFrame) -> list[dict[str, Any]]:\n \"\"\"Validate column configuration using Structured Output patterns.\"\"\"\n if not self.column_config:\n msg = \"Column configuration cannot be empty\"\n raise ValueError(msg)\n\n # Convert table input to list of dicts (similar to Structured Output)\n config_list = self.column_config if isinstance(self.column_config, list) else []\n\n # Validate column names exist in DataFrame\n df_columns = set(df_source.columns)\n for config in config_list:\n col_name = config.get(\"column_name\")\n if col_name not in df_columns:\n msg = f\"Column '{col_name}' not found in DataFrame. Available columns: {sorted(df_columns)}\"\n raise ValueError(msg)\n\n return config_list\n\n def _get_embedding_provider(self, embedding_model: str) -> str:\n \"\"\"Get embedding provider by matching model name to lists.\"\"\"\n if embedding_model in OPENAI_EMBEDDING_MODEL_NAMES:\n return \"OpenAI\"\n if embedding_model in HUGGINGFACE_MODEL_NAMES:\n return \"HuggingFace\"\n if embedding_model in COHERE_MODEL_NAMES:\n return \"Cohere\"\n return \"Custom\"\n\n def _build_embeddings(self, embedding_model: str, api_key: str):\n \"\"\"Build embedding model using provider patterns.\"\"\"\n # Get provider by matching model name to lists\n provider = self._get_embedding_provider(embedding_model)\n\n # Validate provider and model\n if provider == \"OpenAI\":\n from langchain_openai import OpenAIEmbeddings\n\n if not api_key:\n msg = \"OpenAI API key is required when using OpenAI provider\"\n raise ValueError(msg)\n return OpenAIEmbeddings(\n model=embedding_model,\n api_key=api_key,\n chunk_size=self.chunk_size,\n )\n if provider == \"HuggingFace\":\n from langchain_huggingface import HuggingFaceEmbeddings\n\n return HuggingFaceEmbeddings(\n model=embedding_model,\n )\n if provider == \"Cohere\":\n from langchain_cohere import CohereEmbeddings\n\n if not api_key:\n msg = \"Cohere API key is required when using Cohere provider\"\n raise ValueError(msg)\n return CohereEmbeddings(\n model=embedding_model,\n cohere_api_key=api_key,\n )\n if provider == \"Custom\":\n # For custom embedding models, we would need additional configuration\n msg = \"Custom embedding models not yet supported\"\n raise NotImplementedError(msg)\n msg = f\"Unknown provider: {provider}\"\n raise ValueError(msg)\n\n def _build_embedding_metadata(self, embedding_model, api_key) -> dict[str, Any]:\n \"\"\"Build embedding model metadata.\"\"\"\n # Get provider by matching model name to lists\n embedding_provider = self._get_embedding_provider(embedding_model)\n\n api_key_to_save = None\n if api_key and hasattr(api_key, \"get_secret_value\"):\n api_key_to_save = api_key.get_secret_value()\n elif isinstance(api_key, str):\n api_key_to_save = api_key\n\n encrypted_api_key = None\n if api_key_to_save:\n settings_service = get_settings_service()\n try:\n encrypted_api_key = encrypt_api_key(api_key_to_save, settings_service=settings_service)\n except (TypeError, ValueError) as e:\n self.log(f\"Could not encrypt API key: {e}\")\n\n return {\n \"embedding_provider\": embedding_provider,\n \"embedding_model\": embedding_model,\n \"api_key\": encrypted_api_key,\n \"api_key_used\": bool(api_key),\n \"chunk_size\": self.chunk_size,\n \"created_at\": datetime.now(timezone.utc).isoformat(),\n }\n\n def _save_embedding_metadata(self, kb_path: Path, embedding_model: str, api_key: str) -> None:\n \"\"\"Save embedding model metadata.\"\"\"\n embedding_metadata = self._build_embedding_metadata(embedding_model, api_key)\n metadata_path = kb_path / \"embedding_metadata.json\"\n metadata_path.write_text(json.dumps(embedding_metadata, indent=2))\n\n def _save_kb_files(\n self,\n kb_path: Path,\n config_list: list[dict[str, Any]],\n ) -> None:\n \"\"\"Save KB files using File Component storage patterns.\"\"\"\n try:\n # Create directory (following File Component patterns)\n kb_path.mkdir(parents=True, exist_ok=True)\n\n # Save column configuration\n # Only do this if the file doesn't exist already\n cfg_path = kb_path / \"schema.json\"\n if not cfg_path.exists():\n cfg_path.write_text(json.dumps(config_list, indent=2))\n\n except (OSError, TypeError, ValueError) as e:\n self.log(f\"Error saving KB files: {e}\")\n\n def _build_column_metadata(self, config_list: list[dict[str, Any]], df_source: pd.DataFrame) -> dict[str, Any]:\n \"\"\"Build detailed column metadata.\"\"\"\n metadata: dict[str, Any] = {\n \"total_columns\": len(df_source.columns),\n \"mapped_columns\": len(config_list),\n \"unmapped_columns\": len(df_source.columns) - len(config_list),\n \"columns\": [],\n \"summary\": {\"vectorized_columns\": [], \"identifier_columns\": []},\n }\n\n for config in config_list:\n col_name = config.get(\"column_name\")\n vectorize = config.get(\"vectorize\") == \"True\" or config.get(\"vectorize\") is True\n identifier = config.get(\"identifier\") == \"True\" or config.get(\"identifier\") is True\n\n # Add to columns list\n metadata[\"columns\"].append(\n {\n \"name\": col_name,\n \"vectorize\": vectorize,\n \"identifier\": identifier,\n }\n )\n\n # Update summary\n if vectorize:\n metadata[\"summary\"][\"vectorized_columns\"].append(col_name)\n if identifier:\n metadata[\"summary\"][\"identifier_columns\"].append(col_name)\n\n return metadata\n\n async def _create_vector_store(\n self,\n df_source: pd.DataFrame,\n config_list: list[dict[str, Any]],\n embedding_model: str,\n api_key: str,\n ) -> None:\n \"\"\"Create vector store following Local DB component pattern.\"\"\"\n try:\n # Set up vector store directory\n vector_store_dir = await self._kb_path()\n if not vector_store_dir:\n msg = \"Knowledge base path is not set. Please create a new knowledge base first.\"\n raise ValueError(msg)\n vector_store_dir.mkdir(parents=True, exist_ok=True)\n\n # Create embeddings model\n embedding_function = self._build_embeddings(embedding_model, api_key)\n\n # Convert DataFrame to Data objects (following Local DB pattern)\n data_objects = await self._convert_df_to_data_objects(df_source, config_list)\n\n # Create vector store\n chroma = Chroma(\n persist_directory=str(vector_store_dir),\n embedding_function=embedding_function,\n collection_name=self.knowledge_base,\n )\n\n # Convert Data objects to LangChain Documents\n documents = []\n for data_obj in data_objects:\n doc = data_obj.to_lc_document()\n documents.append(doc)\n\n # Add documents to vector store\n if documents:\n chroma.add_documents(documents)\n self.log(f\"Added {len(documents)} documents to vector store '{self.knowledge_base}'\")\n\n except (OSError, ValueError, RuntimeError) as e:\n self.log(f\"Error creating vector store: {e}\")\n\n async def _convert_df_to_data_objects(\n self, df_source: pd.DataFrame, config_list: list[dict[str, Any]]\n ) -> list[Data]:\n \"\"\"Convert DataFrame to Data objects for vector store.\"\"\"\n data_objects: list[Data] = []\n\n # Set up vector store directory\n kb_path = await self._kb_path()\n\n # If we don't allow duplicates, we need to get the existing hashes\n chroma = Chroma(\n persist_directory=str(kb_path),\n collection_name=self.knowledge_base,\n )\n\n # Get all documents and their metadata\n all_docs = chroma.get()\n\n # Extract all _id values from metadata\n id_list = [metadata.get(\"_id\") for metadata in all_docs[\"metadatas\"] if metadata.get(\"_id\")]\n\n # Get column roles\n content_cols = []\n identifier_cols = []\n\n for config in config_list:\n col_name = config.get(\"column_name\")\n vectorize = config.get(\"vectorize\") == \"True\" or config.get(\"vectorize\") is True\n identifier = config.get(\"identifier\") == \"True\" or config.get(\"identifier\") is True\n\n if vectorize:\n content_cols.append(col_name)\n elif identifier:\n identifier_cols.append(col_name)\n\n # Convert each row to a Data object\n for _, row in df_source.iterrows():\n # Build content text from identifier columns using list comprehension\n identifier_parts = [str(row[col]) for col in content_cols if col in row and pd.notna(row[col])]\n\n # Join all parts into a single string\n page_content = \" \".join(identifier_parts)\n\n # Build metadata from NON-vectorized columns only (simple key-value pairs)\n data_dict = {\n \"text\": page_content, # Main content for vectorization\n }\n\n # Add identifier columns if they exist\n if identifier_cols:\n identifier_parts = [str(row[col]) for col in identifier_cols if col in row and pd.notna(row[col])]\n page_content = \" \".join(identifier_parts)\n\n # Add metadata columns as simple key-value pairs\n for col in df_source.columns:\n if col not in content_cols and col in row and pd.notna(row[col]):\n # Convert to simple types for Chroma metadata\n value = row[col]\n data_dict[col] = str(value) # Convert complex types to string\n\n # Hash the page_content for unique ID\n page_content_hash = hashlib.sha256(page_content.encode()).hexdigest()\n data_dict[\"_id\"] = page_content_hash\n\n # If duplicates are disallowed, and hash exists, prevent adding this row\n if not self.allow_duplicates and page_content_hash in id_list:\n self.log(f\"Skipping duplicate row with hash {page_content_hash}\")\n continue\n\n # Create Data object - everything except \"text\" becomes metadata\n data_obj = Data(data=data_dict)\n data_objects.append(data_obj)\n\n return data_objects\n\n def is_valid_collection_name(self, name, min_length: int = 3, max_length: int = 63) -> bool:\n \"\"\"Validates collection name against conditions 1-3.\n\n 1. Contains 3-63 characters\n 2. Starts and ends with alphanumeric character\n 3. Contains only alphanumeric characters, underscores, or hyphens.\n\n Args:\n name (str): Collection name to validate\n min_length (int): Minimum length of the name\n max_length (int): Maximum length of the name\n\n Returns:\n bool: True if valid, False otherwise\n \"\"\"\n # Check length (condition 1)\n if not (min_length <= len(name) <= max_length):\n return False\n\n # Check start/end with alphanumeric (condition 2)\n if not (name[0].isalnum() and name[-1].isalnum()):\n return False\n\n # Check allowed characters (condition 3)\n return re.match(r\"^[a-zA-Z0-9_-]+$\", name) is not None\n\n async def _kb_path(self) -> Path | None:\n # Check if we already have the path cached\n cached_path = getattr(self, \"_cached_kb_path\", None)\n if cached_path is not None:\n return cached_path\n\n # If not cached, compute it\n async with session_scope() as db:\n if not self.user_id:\n msg = \"User ID is required for fetching knowledge base path.\"\n raise ValueError(msg)\n current_user = await get_user_by_id(db, self.user_id)\n if not current_user:\n msg = f\"User with ID {self.user_id} not found.\"\n raise ValueError(msg)\n kb_user = current_user.username\n\n kb_root = self._get_kb_root()\n\n # Cache the result\n self._cached_kb_path = kb_root / kb_user / self.knowledge_base\n\n return self._cached_kb_path\n\n # ---------------------------------------------------------------------\n # OUTPUT METHODS\n # ---------------------------------------------------------------------\n async def build_kb_info(self) -> Data:\n \"\"\"Main ingestion routine → returns a dict with KB metadata.\"\"\"\n try:\n input_value = self.input_df[0] if isinstance(self.input_df, list) else self.input_df\n df_source: DataFrame = convert_to_dataframe(input_value, auto_parse=False)\n\n # Validate column configuration (using Structured Output patterns)\n config_list = self._validate_column_config(df_source)\n column_metadata = self._build_column_metadata(config_list, df_source)\n\n # Read the embedding info from the knowledge base folder\n kb_path = await self._kb_path()\n if not kb_path:\n msg = \"Knowledge base path is not set. Please create a new knowledge base first.\"\n raise ValueError(msg)\n metadata_path = kb_path / \"embedding_metadata.json\"\n\n # If the API key is not provided, try to read it from the metadata file\n if metadata_path.exists():\n settings_service = get_settings_service()\n metadata = json.loads(metadata_path.read_text())\n embedding_model = metadata.get(\"embedding_model\")\n try:\n api_key = decrypt_api_key(metadata[\"api_key\"], settings_service)\n except (InvalidToken, TypeError, ValueError) as e:\n self.log(f\"Could not decrypt API key. Please provide it manually. Error: {e}\")\n\n # Check if a custom API key was provided, update metadata if so\n if self.api_key:\n api_key = self.api_key\n self._save_embedding_metadata(\n kb_path=kb_path,\n embedding_model=embedding_model,\n api_key=api_key,\n )\n\n # Create vector store following Local DB component pattern\n await self._create_vector_store(df_source, config_list, embedding_model=embedding_model, api_key=api_key)\n\n # Save KB files (using File Component storage patterns)\n self._save_kb_files(kb_path, config_list)\n\n # Build metadata response\n meta: dict[str, Any] = {\n \"kb_id\": str(uuid.uuid4()),\n \"kb_name\": self.knowledge_base,\n \"rows\": len(df_source),\n \"column_metadata\": column_metadata,\n \"path\": str(kb_path),\n \"config_columns\": len(config_list),\n \"timestamp\": datetime.now(tz=timezone.utc).isoformat(),\n }\n\n # Set status message\n self.status = f\"✅ KB **{self.knowledge_base}** saved · {len(df_source)} chunks.\"\n\n return Data(data=meta)\n\n except (OSError, ValueError, RuntimeError, KeyError) as e:\n msg = f\"Error during KB ingestion: {e}\"\n raise RuntimeError(msg) from e\n\n async def _get_api_key_variable(self, field_value: dict[str, Any]):\n async with session_scope() as db:\n if not self.user_id:\n msg = \"User ID is required for fetching global variables.\"\n raise ValueError(msg)\n current_user = await get_user_by_id(db, self.user_id)\n if not current_user:\n msg = f\"User with ID {self.user_id} not found.\"\n raise ValueError(msg)\n variable_service = get_variable_service()\n\n # Process the api_key field variable\n return await variable_service.get_variable(\n user_id=current_user.id,\n name=field_value[\"03_api_key\"],\n field=\"\",\n session=db,\n )\n\n async def update_build_config(\n self,\n build_config,\n field_value: Any,\n field_name: str | None = None,\n ):\n \"\"\"Update build configuration based on provider selection.\"\"\"\n # Create a new knowledge base\n if field_name == \"knowledge_base\":\n async with session_scope() as db:\n if not self.user_id:\n msg = \"User ID is required for fetching knowledge base list.\"\n raise ValueError(msg)\n current_user = await get_user_by_id(db, self.user_id)\n if not current_user:\n msg = f\"User with ID {self.user_id} not found.\"\n raise ValueError(msg)\n kb_user = current_user.username\n if isinstance(field_value, dict) and \"01_new_kb_name\" in field_value:\n # Validate the knowledge base name - Make sure it follows these rules:\n if not self.is_valid_collection_name(field_value[\"01_new_kb_name\"]):\n msg = f\"Invalid knowledge base name: {field_value['01_new_kb_name']}\"\n raise ValueError(msg)\n\n api_key = field_value.get(\"03_api_key\", None)\n with contextlib.suppress(Exception):\n # If the API key is a variable, resolve it\n api_key = await self._get_api_key_variable(field_value)\n\n # Make sure api_key is a string\n if not isinstance(api_key, str):\n msg = \"API key must be a string.\"\n raise ValueError(msg)\n\n # We need to test the API Key one time against the embedding model\n embed_model = self._build_embeddings(embedding_model=field_value[\"02_embedding_model\"], api_key=api_key)\n\n # Try to generate a dummy embedding to validate the API key without blocking the event loop\n try:\n await asyncio.wait_for(\n asyncio.to_thread(embed_model.embed_query, \"test\"),\n timeout=10,\n )\n except TimeoutError as e:\n msg = \"Embedding validation timed out. Please verify network connectivity and key.\"\n raise ValueError(msg) from e\n except Exception as e:\n msg = f\"Embedding validation failed: {e!s}\"\n raise ValueError(msg) from e\n\n # Create the new knowledge base directory\n kb_path = _get_knowledge_bases_root_path() / kb_user / field_value[\"01_new_kb_name\"]\n kb_path.mkdir(parents=True, exist_ok=True)\n\n # Save the embedding metadata\n build_config[\"knowledge_base\"][\"value\"] = field_value[\"01_new_kb_name\"]\n self._save_embedding_metadata(\n kb_path=kb_path,\n embedding_model=field_value[\"02_embedding_model\"],\n api_key=api_key,\n )\n\n # Update the knowledge base options dynamically\n build_config[\"knowledge_base\"][\"options\"] = await get_knowledge_bases(\n _get_knowledge_bases_root_path(),\n user_id=self.user_id,\n )\n\n # If the selected knowledge base is not available, reset it\n if build_config[\"knowledge_base\"][\"value\"] not in build_config[\"knowledge_base\"][\"options\"]:\n build_config[\"knowledge_base\"][\"value\"] = None\n\n return build_config\n"
|
|
871
871
|
},
|
|
872
872
|
"column_config": {
|
|
873
873
|
"_input_type": "TableInput",
|
|
@@ -511,7 +511,7 @@
|
|
|
511
511
|
"last_updated": "2025-08-26T16:19:16.681Z",
|
|
512
512
|
"legacy": false,
|
|
513
513
|
"metadata": {
|
|
514
|
-
"code_hash": "
|
|
514
|
+
"code_hash": "1898acd6df4b",
|
|
515
515
|
"dependencies": {
|
|
516
516
|
"dependencies": [
|
|
517
517
|
{
|
|
@@ -605,7 +605,7 @@
|
|
|
605
605
|
"show": true,
|
|
606
606
|
"title_case": false,
|
|
607
607
|
"type": "code",
|
|
608
|
-
"value": "import json\nfrom pathlib import Path\nfrom typing import Any\n\nfrom cryptography.fernet import InvalidToken\nfrom langchain_chroma import Chroma\nfrom langflow.services.auth.utils import decrypt_api_key\nfrom langflow.services.database.models.user.crud import get_user_by_id\nfrom pydantic import SecretStr\n\nfrom lfx.base.knowledge_bases.knowledge_base_utils import get_knowledge_bases\nfrom lfx.custom import Component\nfrom lfx.io import BoolInput, DropdownInput, IntInput, MessageTextInput, Output, SecretStrInput\nfrom lfx.log.logger import logger\nfrom lfx.schema.data import Data\nfrom lfx.schema.dataframe import DataFrame\nfrom lfx.services.deps import get_settings_service, session_scope\n\nsettings = get_settings_service().settings\nknowledge_directory = settings.knowledge_bases_dir\nif not knowledge_directory:\n msg = \"Knowledge bases directory is not set in the settings.\"\n raise ValueError(msg)\nKNOWLEDGE_BASES_ROOT_PATH = Path(knowledge_directory).expanduser()\n\n\nclass KnowledgeRetrievalComponent(Component):\n display_name = \"Knowledge Retrieval\"\n description = \"Search and retrieve data from knowledge.\"\n icon = \"download\"\n name = \"KnowledgeRetrieval\"\n\n inputs = [\n DropdownInput(\n name=\"knowledge_base\",\n display_name=\"Knowledge\",\n info=\"Select the knowledge to load data from.\",\n required=True,\n options=[],\n refresh_button=True,\n real_time_refresh=True,\n ),\n SecretStrInput(\n name=\"api_key\",\n display_name=\"Embedding Provider API Key\",\n info=\"API key for the embedding provider to generate embeddings.\",\n advanced=True,\n required=False,\n ),\n MessageTextInput(\n name=\"search_query\",\n display_name=\"Search Query\",\n info=\"Optional search query to filter knowledge base data.\",\n tool_mode=True,\n ),\n IntInput(\n name=\"top_k\",\n display_name=\"Top K Results\",\n info=\"Number of top results to return from the knowledge base.\",\n value=5,\n advanced=True,\n required=False,\n ),\n BoolInput(\n name=\"include_metadata\",\n display_name=\"Include Metadata\",\n info=\"Whether to include all metadata in the output. If false, only content is returned.\",\n value=True,\n advanced=False,\n ),\n BoolInput(\n name=\"include_embeddings\",\n display_name=\"Include Embeddings\",\n info=\"Whether to include embeddings in the output. Only applicable if 'Include Metadata' is enabled.\",\n value=False,\n advanced=True,\n ),\n ]\n\n outputs = [\n Output(\n name=\"retrieve_data\",\n display_name=\"Results\",\n method=\"retrieve_data\",\n info=\"Returns the data from the selected knowledge base.\",\n ),\n ]\n\n async def update_build_config(self, build_config, field_value, field_name=None): # noqa: ARG002\n if field_name == \"knowledge_base\":\n # Update the knowledge base options dynamically\n build_config[\"knowledge_base\"][\"options\"] = await get_knowledge_bases(\n KNOWLEDGE_BASES_ROOT_PATH,\n user_id=self.user_id, # Use the user_id from the component context\n )\n\n # If the selected knowledge base is not available, reset it\n if build_config[\"knowledge_base\"][\"value\"] not in build_config[\"knowledge_base\"][\"options\"]:\n build_config[\"knowledge_base\"][\"value\"] = None\n\n return build_config\n\n def _get_kb_metadata(self, kb_path: Path) -> dict:\n \"\"\"Load and process knowledge base metadata.\"\"\"\n metadata: dict[str, Any] = {}\n metadata_file = kb_path / \"embedding_metadata.json\"\n if not metadata_file.exists():\n logger.warning(f\"Embedding metadata file not found at {metadata_file}\")\n return metadata\n\n try:\n with metadata_file.open(\"r\", encoding=\"utf-8\") as f:\n metadata = json.load(f)\n except json.JSONDecodeError:\n logger.error(f\"Error decoding JSON from {metadata_file}\")\n return {}\n\n # Decrypt API key if it exists\n if \"api_key\" in metadata and metadata.get(\"api_key\"):\n settings_service = get_settings_service()\n try:\n decrypted_key = decrypt_api_key(metadata[\"api_key\"], settings_service)\n metadata[\"api_key\"] = decrypted_key\n except (InvalidToken, TypeError, ValueError) as e:\n logger.error(f\"Could not decrypt API key. Please provide it manually. Error: {e}\")\n metadata[\"api_key\"] = None\n return metadata\n\n def _build_embeddings(self, metadata: dict):\n \"\"\"Build embedding model from metadata.\"\"\"\n runtime_api_key = self.api_key.get_secret_value() if isinstance(self.api_key, SecretStr) else self.api_key\n provider = metadata.get(\"embedding_provider\")\n model = metadata.get(\"embedding_model\")\n api_key = runtime_api_key or metadata.get(\"api_key\")\n chunk_size = metadata.get(\"chunk_size\")\n\n # Handle various providers\n if provider == \"OpenAI\":\n from langchain_openai import OpenAIEmbeddings\n\n if not api_key:\n msg = \"OpenAI API key is required. Provide it in the component's advanced settings.\"\n raise ValueError(msg)\n return OpenAIEmbeddings(\n model=model,\n api_key=api_key,\n chunk_size=chunk_size,\n )\n if provider == \"HuggingFace\":\n from langchain_huggingface import HuggingFaceEmbeddings\n\n return HuggingFaceEmbeddings(\n model=model,\n )\n if provider == \"Cohere\":\n from langchain_cohere import CohereEmbeddings\n\n if not api_key:\n msg = \"Cohere API key is required when using Cohere provider\"\n raise ValueError(msg)\n return CohereEmbeddings(\n model=model,\n cohere_api_key=api_key,\n )\n if provider == \"Custom\":\n # For custom embedding models, we would need additional configuration\n msg = \"Custom embedding models not yet supported\"\n raise NotImplementedError(msg)\n # Add other providers here if they become supported in ingest\n msg = f\"Embedding provider '{provider}' is not supported for retrieval.\"\n raise NotImplementedError(msg)\n\n async def retrieve_data(self) -> DataFrame:\n \"\"\"Retrieve data from the selected knowledge base by reading the Chroma collection.\n\n Returns:\n A DataFrame containing the data rows from the knowledge base.\n \"\"\"\n # Get the current user\n async with session_scope() as db:\n if not self.user_id:\n msg = \"User ID is required for fetching Knowledge Base data.\"\n raise ValueError(msg)\n current_user = await get_user_by_id(db, self.user_id)\n if not current_user:\n msg = f\"User with ID {self.user_id} not found.\"\n raise ValueError(msg)\n kb_user = current_user.username\n kb_path = KNOWLEDGE_BASES_ROOT_PATH / kb_user / self.knowledge_base\n\n metadata = self._get_kb_metadata(kb_path)\n if not metadata:\n msg = f\"Metadata not found for knowledge base: {self.knowledge_base}. Ensure it has been indexed.\"\n raise ValueError(msg)\n\n # Build the embedder for the knowledge base\n embedding_function = self._build_embeddings(metadata)\n\n # Load vector store\n chroma = Chroma(\n persist_directory=str(kb_path),\n embedding_function=embedding_function,\n collection_name=self.knowledge_base,\n )\n\n # If a search query is provided, perform a similarity search\n if self.search_query:\n # Use the search query to perform a similarity search\n logger.info(f\"Performing similarity search with query: {self.search_query}\")\n results = chroma.similarity_search_with_score(\n query=self.search_query or \"\",\n k=self.top_k,\n )\n else:\n results = chroma.similarity_search(\n query=self.search_query or \"\",\n k=self.top_k,\n )\n\n # For each result, make it a tuple to match the expected output format\n results = [(doc, 0) for doc in results] # Assign a dummy score of 0\n\n # If include_embeddings is enabled, get embeddings for the results\n id_to_embedding = {}\n if self.include_embeddings and results:\n doc_ids = [doc[0].metadata.get(\"_id\") for doc in results if doc[0].metadata.get(\"_id\")]\n\n # Only proceed if we have valid document IDs\n if doc_ids:\n # Access underlying client to get embeddings\n collection = chroma._client.get_collection(name=self.knowledge_base)\n embeddings_result = collection.get(where={\"_id\": {\"$in\": doc_ids}}, include=[\"metadatas\", \"embeddings\"])\n\n # Create a mapping from document ID to embedding\n for i, metadata in enumerate(embeddings_result.get(\"metadatas\", [])):\n if metadata and \"_id\" in metadata:\n id_to_embedding[metadata[\"_id\"]] = embeddings_result[\"embeddings\"][i]\n\n # Build output data based on include_metadata setting\n data_list = []\n for doc in results:\n kwargs = {\n \"content\": doc[0].page_content,\n }\n if self.search_query:\n kwargs[\"_score\"] = -1 * doc[1]\n if self.include_metadata:\n # Include all metadata, embeddings, and content\n kwargs.update(doc[0].metadata)\n if self.include_embeddings:\n kwargs[\"_embeddings\"] = id_to_embedding.get(doc[0].metadata.get(\"_id\"))\n\n data_list.append(Data(**kwargs))\n\n # Return the DataFrame containing the data\n return DataFrame(data=data_list)\n"
|
|
608
|
+
"value": "import json\nfrom pathlib import Path\nfrom typing import Any\n\nfrom cryptography.fernet import InvalidToken\nfrom langchain_chroma import Chroma\nfrom langflow.services.auth.utils import decrypt_api_key\nfrom langflow.services.database.models.user.crud import get_user_by_id\nfrom pydantic import SecretStr\n\nfrom lfx.base.knowledge_bases.knowledge_base_utils import get_knowledge_bases\nfrom lfx.custom import Component\nfrom lfx.io import BoolInput, DropdownInput, IntInput, MessageTextInput, Output, SecretStrInput\nfrom lfx.log.logger import logger\nfrom lfx.schema.data import Data\nfrom lfx.schema.dataframe import DataFrame\nfrom lfx.services.deps import get_settings_service, session_scope\n\n_KNOWLEDGE_BASES_ROOT_PATH: Path | None = None\n\n\ndef _get_knowledge_bases_root_path() -> Path:\n \"\"\"Lazy load the knowledge bases root path from settings.\"\"\"\n global _KNOWLEDGE_BASES_ROOT_PATH # noqa: PLW0603\n if _KNOWLEDGE_BASES_ROOT_PATH is None:\n settings = get_settings_service().settings\n knowledge_directory = settings.knowledge_bases_dir\n if not knowledge_directory:\n msg = \"Knowledge bases directory is not set in the settings.\"\n raise ValueError(msg)\n _KNOWLEDGE_BASES_ROOT_PATH = Path(knowledge_directory).expanduser()\n return _KNOWLEDGE_BASES_ROOT_PATH\n\n\nclass KnowledgeRetrievalComponent(Component):\n display_name = \"Knowledge Retrieval\"\n description = \"Search and retrieve data from knowledge.\"\n icon = \"download\"\n name = \"KnowledgeRetrieval\"\n\n inputs = [\n DropdownInput(\n name=\"knowledge_base\",\n display_name=\"Knowledge\",\n info=\"Select the knowledge to load data from.\",\n required=True,\n options=[],\n refresh_button=True,\n real_time_refresh=True,\n ),\n SecretStrInput(\n name=\"api_key\",\n display_name=\"Embedding Provider API Key\",\n info=\"API key for the embedding provider to generate embeddings.\",\n advanced=True,\n required=False,\n ),\n MessageTextInput(\n name=\"search_query\",\n display_name=\"Search Query\",\n info=\"Optional search query to filter knowledge base data.\",\n tool_mode=True,\n ),\n IntInput(\n name=\"top_k\",\n display_name=\"Top K Results\",\n info=\"Number of top results to return from the knowledge base.\",\n value=5,\n advanced=True,\n required=False,\n ),\n BoolInput(\n name=\"include_metadata\",\n display_name=\"Include Metadata\",\n info=\"Whether to include all metadata in the output. If false, only content is returned.\",\n value=True,\n advanced=False,\n ),\n BoolInput(\n name=\"include_embeddings\",\n display_name=\"Include Embeddings\",\n info=\"Whether to include embeddings in the output. Only applicable if 'Include Metadata' is enabled.\",\n value=False,\n advanced=True,\n ),\n ]\n\n outputs = [\n Output(\n name=\"retrieve_data\",\n display_name=\"Results\",\n method=\"retrieve_data\",\n info=\"Returns the data from the selected knowledge base.\",\n ),\n ]\n\n async def update_build_config(self, build_config, field_value, field_name=None): # noqa: ARG002\n if field_name == \"knowledge_base\":\n # Update the knowledge base options dynamically\n build_config[\"knowledge_base\"][\"options\"] = await get_knowledge_bases(\n _get_knowledge_bases_root_path(),\n user_id=self.user_id, # Use the user_id from the component context\n )\n\n # If the selected knowledge base is not available, reset it\n if build_config[\"knowledge_base\"][\"value\"] not in build_config[\"knowledge_base\"][\"options\"]:\n build_config[\"knowledge_base\"][\"value\"] = None\n\n return build_config\n\n def _get_kb_metadata(self, kb_path: Path) -> dict:\n \"\"\"Load and process knowledge base metadata.\"\"\"\n metadata: dict[str, Any] = {}\n metadata_file = kb_path / \"embedding_metadata.json\"\n if not metadata_file.exists():\n logger.warning(f\"Embedding metadata file not found at {metadata_file}\")\n return metadata\n\n try:\n with metadata_file.open(\"r\", encoding=\"utf-8\") as f:\n metadata = json.load(f)\n except json.JSONDecodeError:\n logger.error(f\"Error decoding JSON from {metadata_file}\")\n return {}\n\n # Decrypt API key if it exists\n if \"api_key\" in metadata and metadata.get(\"api_key\"):\n settings_service = get_settings_service()\n try:\n decrypted_key = decrypt_api_key(metadata[\"api_key\"], settings_service)\n metadata[\"api_key\"] = decrypted_key\n except (InvalidToken, TypeError, ValueError) as e:\n logger.error(f\"Could not decrypt API key. Please provide it manually. Error: {e}\")\n metadata[\"api_key\"] = None\n return metadata\n\n def _build_embeddings(self, metadata: dict):\n \"\"\"Build embedding model from metadata.\"\"\"\n runtime_api_key = self.api_key.get_secret_value() if isinstance(self.api_key, SecretStr) else self.api_key\n provider = metadata.get(\"embedding_provider\")\n model = metadata.get(\"embedding_model\")\n api_key = runtime_api_key or metadata.get(\"api_key\")\n chunk_size = metadata.get(\"chunk_size\")\n\n # Handle various providers\n if provider == \"OpenAI\":\n from langchain_openai import OpenAIEmbeddings\n\n if not api_key:\n msg = \"OpenAI API key is required. Provide it in the component's advanced settings.\"\n raise ValueError(msg)\n return OpenAIEmbeddings(\n model=model,\n api_key=api_key,\n chunk_size=chunk_size,\n )\n if provider == \"HuggingFace\":\n from langchain_huggingface import HuggingFaceEmbeddings\n\n return HuggingFaceEmbeddings(\n model=model,\n )\n if provider == \"Cohere\":\n from langchain_cohere import CohereEmbeddings\n\n if not api_key:\n msg = \"Cohere API key is required when using Cohere provider\"\n raise ValueError(msg)\n return CohereEmbeddings(\n model=model,\n cohere_api_key=api_key,\n )\n if provider == \"Custom\":\n # For custom embedding models, we would need additional configuration\n msg = \"Custom embedding models not yet supported\"\n raise NotImplementedError(msg)\n # Add other providers here if they become supported in ingest\n msg = f\"Embedding provider '{provider}' is not supported for retrieval.\"\n raise NotImplementedError(msg)\n\n async def retrieve_data(self) -> DataFrame:\n \"\"\"Retrieve data from the selected knowledge base by reading the Chroma collection.\n\n Returns:\n A DataFrame containing the data rows from the knowledge base.\n \"\"\"\n # Get the current user\n async with session_scope() as db:\n if not self.user_id:\n msg = \"User ID is required for fetching Knowledge Base data.\"\n raise ValueError(msg)\n current_user = await get_user_by_id(db, self.user_id)\n if not current_user:\n msg = f\"User with ID {self.user_id} not found.\"\n raise ValueError(msg)\n kb_user = current_user.username\n kb_path = _get_knowledge_bases_root_path() / kb_user / self.knowledge_base\n\n metadata = self._get_kb_metadata(kb_path)\n if not metadata:\n msg = f\"Metadata not found for knowledge base: {self.knowledge_base}. Ensure it has been indexed.\"\n raise ValueError(msg)\n\n # Build the embedder for the knowledge base\n embedding_function = self._build_embeddings(metadata)\n\n # Load vector store\n chroma = Chroma(\n persist_directory=str(kb_path),\n embedding_function=embedding_function,\n collection_name=self.knowledge_base,\n )\n\n # If a search query is provided, perform a similarity search\n if self.search_query:\n # Use the search query to perform a similarity search\n logger.info(f\"Performing similarity search with query: {self.search_query}\")\n results = chroma.similarity_search_with_score(\n query=self.search_query or \"\",\n k=self.top_k,\n )\n else:\n results = chroma.similarity_search(\n query=self.search_query or \"\",\n k=self.top_k,\n )\n\n # For each result, make it a tuple to match the expected output format\n results = [(doc, 0) for doc in results] # Assign a dummy score of 0\n\n # If include_embeddings is enabled, get embeddings for the results\n id_to_embedding = {}\n if self.include_embeddings and results:\n doc_ids = [doc[0].metadata.get(\"_id\") for doc in results if doc[0].metadata.get(\"_id\")]\n\n # Only proceed if we have valid document IDs\n if doc_ids:\n # Access underlying client to get embeddings\n collection = chroma._client.get_collection(name=self.knowledge_base)\n embeddings_result = collection.get(where={\"_id\": {\"$in\": doc_ids}}, include=[\"metadatas\", \"embeddings\"])\n\n # Create a mapping from document ID to embedding\n for i, metadata in enumerate(embeddings_result.get(\"metadatas\", [])):\n if metadata and \"_id\" in metadata:\n id_to_embedding[metadata[\"_id\"]] = embeddings_result[\"embeddings\"][i]\n\n # Build output data based on include_metadata setting\n data_list = []\n for doc in results:\n kwargs = {\n \"content\": doc[0].page_content,\n }\n if self.search_query:\n kwargs[\"_score\"] = -1 * doc[1]\n if self.include_metadata:\n # Include all metadata, embeddings, and content\n kwargs.update(doc[0].metadata)\n if self.include_embeddings:\n kwargs[\"_embeddings\"] = id_to_embedding.get(doc[0].metadata.get(\"_id\"))\n\n data_list.append(Data(**kwargs))\n\n # Return the DataFrame containing the data\n return DataFrame(data=data_list)\n"
|
|
609
609
|
},
|
|
610
610
|
"include_embeddings": {
|
|
611
611
|
"_input_type": "BoolInput",
|
|
@@ -2111,7 +2111,7 @@
|
|
|
2111
2111
|
],
|
|
2112
2112
|
"total_dependencies": 2
|
|
2113
2113
|
},
|
|
2114
|
-
"module": "lfx.components.
|
|
2114
|
+
"module": "lfx.components.FAISS.faiss.FaissVectorStoreComponent"
|
|
2115
2115
|
},
|
|
2116
2116
|
"minimized": false,
|
|
2117
2117
|
"output_types": [],
|
|
@@ -3478,17 +3478,13 @@
|
|
|
3478
3478
|
"icon": "AstraDB",
|
|
3479
3479
|
"legacy": false,
|
|
3480
3480
|
"metadata": {
|
|
3481
|
-
"code_hash": "
|
|
3481
|
+
"code_hash": "fb190928c0c2",
|
|
3482
3482
|
"dependencies": {
|
|
3483
3483
|
"dependencies": [
|
|
3484
3484
|
{
|
|
3485
3485
|
"name": "astrapy",
|
|
3486
3486
|
"version": "2.1.0"
|
|
3487
3487
|
},
|
|
3488
|
-
{
|
|
3489
|
-
"name": "langchain_astradb",
|
|
3490
|
-
"version": "0.6.1"
|
|
3491
|
-
},
|
|
3492
3488
|
{
|
|
3493
3489
|
"name": "langchain_core",
|
|
3494
3490
|
"version": "0.3.79"
|
|
@@ -3496,11 +3492,15 @@
|
|
|
3496
3492
|
{
|
|
3497
3493
|
"name": "lfx",
|
|
3498
3494
|
"version": null
|
|
3495
|
+
},
|
|
3496
|
+
{
|
|
3497
|
+
"name": "langchain_astradb",
|
|
3498
|
+
"version": "0.6.1"
|
|
3499
3499
|
}
|
|
3500
3500
|
],
|
|
3501
3501
|
"total_dependencies": 4
|
|
3502
3502
|
},
|
|
3503
|
-
"module": "lfx.components.
|
|
3503
|
+
"module": "lfx.components.datastax.astradb_vectorstore.AstraDBVectorStoreComponent"
|
|
3504
3504
|
},
|
|
3505
3505
|
"minimized": false,
|
|
3506
3506
|
"output_types": [],
|
|
@@ -3646,7 +3646,7 @@
|
|
|
3646
3646
|
"show": true,
|
|
3647
3647
|
"title_case": false,
|
|
3648
3648
|
"type": "code",
|
|
3649
|
-
"value": "import re\nfrom collections import defaultdict\nfrom dataclasses import asdict, dataclass, field\n\nfrom astrapy import DataAPIClient, Database\nfrom astrapy.data.info.reranking import RerankServiceOptions\nfrom astrapy.info import CollectionDescriptor, CollectionLexicalOptions, CollectionRerankOptions\nfrom langchain_astradb import AstraDBVectorStore, VectorServiceOptions\nfrom langchain_astradb.utils.astradb import HybridSearchMode, _AstraDBCollectionEnvironment\nfrom langchain_core.documents import Document\n\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.helpers.data import docs_to_data\nfrom lfx.inputs.inputs import FloatInput, NestedDictInput\nfrom lfx.io import (\n BoolInput,\n DropdownInput,\n HandleInput,\n IntInput,\n QueryInput,\n SecretStrInput,\n StrInput,\n)\nfrom lfx.schema.data import Data\nfrom lfx.serialization import serialize\nfrom lfx.utils.version import get_version_info\n\n\n@vector_store_connection\nclass AstraDBVectorStoreComponent(LCVectorStoreComponent):\n display_name: str = \"Astra DB\"\n description: str = \"Ingest and search documents in Astra DB\"\n documentation: str = \"https://docs.datastax.com/en/langflow/astra-components.html\"\n name = \"AstraDB\"\n icon: str = \"AstraDB\"\n\n _cached_vector_store: AstraDBVectorStore | None = None\n\n @dataclass\n class NewDatabaseInput:\n functionality: str = \"create\"\n fields: dict[str, dict] = field(\n default_factory=lambda: {\n \"data\": {\n \"node\": {\n \"name\": \"create_database\",\n \"description\": \"Please allow several minutes for creation to complete.\",\n \"display_name\": \"Create new database\",\n \"field_order\": [\"01_new_database_name\", \"02_cloud_provider\", \"03_region\"],\n \"template\": {\n \"01_new_database_name\": StrInput(\n name=\"new_database_name\",\n display_name=\"Name\",\n info=\"Name of the new database to create in Astra DB.\",\n required=True,\n ),\n \"02_cloud_provider\": DropdownInput(\n name=\"cloud_provider\",\n display_name=\"Cloud provider\",\n info=\"Cloud provider for the new database.\",\n options=[],\n required=True,\n real_time_refresh=True,\n ),\n \"03_region\": DropdownInput(\n name=\"region\",\n display_name=\"Region\",\n info=\"Region for the new database.\",\n options=[],\n required=True,\n ),\n },\n },\n }\n }\n )\n\n @dataclass\n class NewCollectionInput:\n functionality: str = \"create\"\n fields: dict[str, dict] = field(\n default_factory=lambda: {\n \"data\": {\n \"node\": {\n \"name\": \"create_collection\",\n \"description\": \"Please allow several seconds for creation to complete.\",\n \"display_name\": \"Create new collection\",\n \"field_order\": [\n \"01_new_collection_name\",\n \"02_embedding_generation_provider\",\n \"03_embedding_generation_model\",\n \"04_dimension\",\n ],\n \"template\": {\n \"01_new_collection_name\": StrInput(\n name=\"new_collection_name\",\n display_name=\"Name\",\n info=\"Name of the new collection to create in Astra DB.\",\n required=True,\n ),\n \"02_embedding_generation_provider\": DropdownInput(\n name=\"embedding_generation_provider\",\n display_name=\"Embedding generation method\",\n info=\"Provider to use for generating embeddings.\",\n helper_text=(\n \"To create collections with more embedding provider options, go to \"\n '<a class=\"underline\" href=\"https://astra.datastax.com/\" target=\" _blank\" '\n 'rel=\"noopener noreferrer\">your database in Astra DB</a>'\n ),\n real_time_refresh=True,\n required=True,\n options=[],\n ),\n \"03_embedding_generation_model\": DropdownInput(\n name=\"embedding_generation_model\",\n display_name=\"Embedding model\",\n info=\"Model to use for generating embeddings.\",\n real_time_refresh=True,\n options=[],\n ),\n \"04_dimension\": IntInput(\n name=\"dimension\",\n display_name=\"Dimensions\",\n info=\"Dimensions of the embeddings to generate.\",\n value=None,\n ),\n },\n },\n }\n }\n )\n\n inputs = [\n SecretStrInput(\n name=\"token\",\n display_name=\"Astra DB Application Token\",\n info=\"Authentication token for accessing Astra DB.\",\n value=\"ASTRA_DB_APPLICATION_TOKEN\",\n required=True,\n real_time_refresh=True,\n input_types=[],\n ),\n DropdownInput(\n name=\"environment\",\n display_name=\"Environment\",\n info=\"The environment for the Astra DB API Endpoint.\",\n options=[\"prod\", \"test\", \"dev\"],\n value=\"prod\",\n advanced=True,\n real_time_refresh=True,\n combobox=True,\n ),\n DropdownInput(\n name=\"database_name\",\n display_name=\"Database\",\n info=\"The Database name for the Astra DB instance.\",\n required=True,\n refresh_button=True,\n real_time_refresh=True,\n dialog_inputs=asdict(NewDatabaseInput()),\n combobox=True,\n ),\n DropdownInput(\n name=\"api_endpoint\",\n display_name=\"Astra DB API Endpoint\",\n info=\"The API Endpoint for the Astra DB instance. Supercedes database selection.\",\n advanced=True,\n ),\n DropdownInput(\n name=\"keyspace\",\n display_name=\"Keyspace\",\n info=\"Optional keyspace within Astra DB to use for the collection.\",\n advanced=True,\n options=[],\n real_time_refresh=True,\n ),\n DropdownInput(\n name=\"collection_name\",\n display_name=\"Collection\",\n info=\"The name of the collection within Astra DB where the vectors will be stored.\",\n required=True,\n refresh_button=True,\n real_time_refresh=True,\n dialog_inputs=asdict(NewCollectionInput()),\n combobox=True,\n show=False,\n ),\n HandleInput(\n name=\"embedding_model\",\n display_name=\"Embedding Model\",\n input_types=[\"Embeddings\"],\n info=\"Specify the Embedding Model. Not required for Astra Vectorize collections.\",\n required=False,\n show=False,\n ),\n *LCVectorStoreComponent.inputs,\n DropdownInput(\n name=\"search_method\",\n display_name=\"Search Method\",\n info=(\n \"Determine how your content is matched: Vector finds semantic similarity, \"\n \"and Hybrid Search (suggested) combines both approaches \"\n \"with a reranker.\"\n ),\n options=[\"Hybrid Search\", \"Vector Search\"], # TODO: Restore Lexical Search?\n options_metadata=[{\"icon\": \"SearchHybrid\"}, {\"icon\": \"SearchVector\"}],\n value=\"Vector Search\",\n advanced=True,\n real_time_refresh=True,\n ),\n DropdownInput(\n name=\"reranker\",\n display_name=\"Reranker\",\n info=\"Post-retrieval model that re-scores results for optimal relevance ranking.\",\n show=False,\n toggle=True,\n ),\n QueryInput(\n name=\"lexical_terms\",\n display_name=\"Lexical Terms\",\n info=\"Add additional terms/keywords to augment search precision.\",\n placeholder=\"Enter terms to search...\",\n separator=\" \",\n show=False,\n value=\"\",\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Number of Search Results\",\n info=\"Number of search results to return.\",\n advanced=True,\n value=4,\n ),\n DropdownInput(\n name=\"search_type\",\n display_name=\"Search Type\",\n info=\"Search type to use\",\n options=[\"Similarity\", \"Similarity with score threshold\", \"MMR (Max Marginal Relevance)\"],\n value=\"Similarity\",\n advanced=True,\n ),\n FloatInput(\n name=\"search_score_threshold\",\n display_name=\"Search Score Threshold\",\n info=\"Minimum similarity score threshold for search results. \"\n \"(when using 'Similarity with score threshold')\",\n value=0,\n advanced=True,\n ),\n NestedDictInput(\n name=\"advanced_search_filter\",\n display_name=\"Search Metadata Filter\",\n info=\"Optional dictionary of filters to apply to the search query.\",\n advanced=True,\n ),\n BoolInput(\n name=\"autodetect_collection\",\n display_name=\"Autodetect Collection\",\n info=\"Boolean flag to determine whether to autodetect the collection.\",\n advanced=True,\n value=True,\n ),\n StrInput(\n name=\"content_field\",\n display_name=\"Content Field\",\n info=\"Field to use as the text content field for the vector store.\",\n advanced=True,\n ),\n StrInput(\n name=\"deletion_field\",\n display_name=\"Deletion Based On Field\",\n info=\"When this parameter is provided, documents in the target collection with \"\n \"metadata field values matching the input metadata field value will be deleted \"\n \"before new data is loaded.\",\n advanced=True,\n ),\n BoolInput(\n name=\"ignore_invalid_documents\",\n display_name=\"Ignore Invalid Documents\",\n info=\"Boolean flag to determine whether to ignore invalid documents at runtime.\",\n advanced=True,\n ),\n NestedDictInput(\n name=\"astradb_vectorstore_kwargs\",\n display_name=\"AstraDBVectorStore Parameters\",\n info=\"Optional dictionary of additional parameters for the AstraDBVectorStore.\",\n advanced=True,\n ),\n ]\n\n @classmethod\n def map_cloud_providers(cls):\n # TODO: Programmatically fetch the regions for each cloud provider\n return {\n \"dev\": {\n \"Amazon Web Services\": {\n \"id\": \"aws\",\n \"regions\": [\"us-west-2\"],\n },\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-central1\", \"europe-west4\"],\n },\n },\n \"test\": {\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-central1\"],\n },\n },\n \"prod\": {\n \"Amazon Web Services\": {\n \"id\": \"aws\",\n \"regions\": [\"us-east-2\", \"ap-south-1\", \"eu-west-1\"],\n },\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-east1\"],\n },\n \"Microsoft Azure\": {\n \"id\": \"azure\",\n \"regions\": [\"westus3\"],\n },\n },\n }\n\n @classmethod\n def get_vectorize_providers(cls, token: str, environment: str | None = None, api_endpoint: str | None = None):\n try:\n # Get the admin object\n client = DataAPIClient(environment=environment)\n admin_client = client.get_admin()\n db_admin = admin_client.get_database_admin(api_endpoint, token=token)\n\n # Get the list of embedding providers\n embedding_providers = db_admin.find_embedding_providers()\n\n vectorize_providers_mapping = {}\n # Map the provider display name to the provider key and models\n for provider_key, provider_data in embedding_providers.embedding_providers.items():\n # Get the provider display name and models\n display_name = provider_data.display_name\n models = [model.name for model in provider_data.models]\n\n # Build our mapping\n vectorize_providers_mapping[display_name] = [provider_key, models]\n\n # Sort the resulting dictionary\n return defaultdict(list, dict(sorted(vectorize_providers_mapping.items())))\n except Exception as _: # noqa: BLE001\n return {}\n\n @classmethod\n async def create_database_api(\n cls,\n new_database_name: str,\n cloud_provider: str,\n region: str,\n token: str,\n environment: str | None = None,\n keyspace: str | None = None,\n ):\n client = DataAPIClient(environment=environment)\n\n # Get the admin object\n admin_client = client.get_admin(token=token)\n\n # Get the environment, set to prod if null like\n my_env = environment or \"prod\"\n\n # Raise a value error if name isn't provided\n if not new_database_name:\n msg = \"Database name is required to create a new database.\"\n raise ValueError(msg)\n\n # Call the create database function\n return await admin_client.async_create_database(\n name=new_database_name,\n cloud_provider=cls.map_cloud_providers()[my_env][cloud_provider][\"id\"],\n region=region,\n keyspace=keyspace,\n wait_until_active=False,\n )\n\n @classmethod\n async def create_collection_api(\n cls,\n new_collection_name: str,\n token: str,\n api_endpoint: str,\n environment: str | None = None,\n keyspace: str | None = None,\n dimension: int | None = None,\n embedding_generation_provider: str | None = None,\n embedding_generation_model: str | None = None,\n reranker: str | None = None,\n ):\n # Build vectorize options, if needed\n vectorize_options = None\n if not dimension:\n providers = cls.get_vectorize_providers(token=token, environment=environment, api_endpoint=api_endpoint)\n vectorize_options = VectorServiceOptions(\n provider=providers.get(embedding_generation_provider, [None, []])[0],\n model_name=embedding_generation_model,\n )\n\n # Raise a value error if name isn't provided\n if not new_collection_name:\n msg = \"Collection name is required to create a new collection.\"\n raise ValueError(msg)\n\n # Define the base arguments being passed to the create collection function\n base_args = {\n \"collection_name\": new_collection_name,\n \"token\": token,\n \"api_endpoint\": api_endpoint,\n \"keyspace\": keyspace,\n \"environment\": environment,\n \"embedding_dimension\": dimension,\n \"collection_vector_service_options\": vectorize_options,\n }\n\n # Add optional arguments if the reranker is set\n if reranker:\n # Split the reranker field into a provider a model name\n provider, _ = reranker.split(\"/\")\n base_args[\"collection_rerank\"] = CollectionRerankOptions(\n service=RerankServiceOptions(provider=provider, model_name=reranker),\n )\n base_args[\"collection_lexical\"] = CollectionLexicalOptions(analyzer=\"STANDARD\")\n\n _AstraDBCollectionEnvironment(**base_args)\n\n @classmethod\n def get_database_list_static(cls, token: str, environment: str | None = None):\n client = DataAPIClient(environment=environment)\n\n # Get the admin object\n admin_client = client.get_admin(token=token)\n\n # Get the list of databases\n db_list = admin_client.list_databases()\n\n # Generate the api endpoint for each database\n db_info_dict = {}\n for db in db_list:\n try:\n # Get the API endpoint for the database\n api_endpoints = [db_reg.api_endpoint for db_reg in db.regions]\n\n # Get the number of collections\n try:\n # Get the number of collections in the database\n num_collections = len(\n client.get_database(\n api_endpoints[0],\n token=token,\n ).list_collection_names()\n )\n except Exception: # noqa: BLE001\n if db.status != \"PENDING\":\n continue\n num_collections = 0\n\n # Add the database to the dictionary\n db_info_dict[db.name] = {\n \"api_endpoints\": api_endpoints,\n \"keyspaces\": db.keyspaces,\n \"collections\": num_collections,\n \"status\": db.status if db.status != \"ACTIVE\" else None,\n \"org_id\": db.org_id if db.org_id else None,\n }\n except Exception: # noqa: BLE001\n pass\n\n return db_info_dict\n\n def get_database_list(self):\n return self.get_database_list_static(\n token=self.token,\n environment=self.environment,\n )\n\n @classmethod\n def get_api_endpoint_static(\n cls,\n token: str,\n environment: str | None = None,\n api_endpoint: str | None = None,\n database_name: str | None = None,\n ):\n # If the api_endpoint is set, return it\n if api_endpoint:\n return api_endpoint\n\n # Check if the database_name is like a url\n if database_name and database_name.startswith(\"https://\"):\n return database_name\n\n # If the database is not set, nothing we can do.\n if not database_name:\n return None\n\n # Grab the database object\n db = cls.get_database_list_static(token=token, environment=environment).get(database_name)\n if not db:\n return None\n\n # Otherwise, get the URL from the database list\n endpoints = db.get(\"api_endpoints\") or []\n return endpoints[0] if endpoints else None\n\n def get_api_endpoint(self):\n return self.get_api_endpoint_static(\n token=self.token,\n environment=self.environment,\n api_endpoint=self.api_endpoint,\n database_name=self.database_name,\n )\n\n @classmethod\n def get_database_id_static(cls, api_endpoint: str) -> str | None:\n # Pattern matches standard UUID format: 8-4-4-4-12 hexadecimal characters\n uuid_pattern = r\"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\"\n match = re.search(uuid_pattern, api_endpoint)\n\n return match.group(0) if match else None\n\n def get_database_id(self):\n return self.get_database_id_static(api_endpoint=self.get_api_endpoint())\n\n def get_keyspace(self):\n keyspace = self.keyspace\n\n if keyspace:\n return keyspace.strip()\n\n return \"default_keyspace\"\n\n def get_database_object(self, api_endpoint: str | None = None):\n try:\n client = DataAPIClient(environment=self.environment)\n\n return client.get_database(\n api_endpoint or self.get_api_endpoint(),\n token=self.token,\n keyspace=self.get_keyspace(),\n )\n except Exception as e:\n msg = f\"Error fetching database object: {e}\"\n raise ValueError(msg) from e\n\n def collection_data(self, collection_name: str, database: Database | None = None):\n try:\n if not database:\n client = DataAPIClient(environment=self.environment)\n\n database = client.get_database(\n self.get_api_endpoint(),\n token=self.token,\n keyspace=self.get_keyspace(),\n )\n\n collection = database.get_collection(collection_name)\n\n return collection.estimated_document_count()\n except Exception as e: # noqa: BLE001\n self.log(f\"Error checking collection data: {e}\")\n\n return None\n\n def _initialize_database_options(self):\n try:\n return [\n {\n \"name\": name,\n \"status\": info[\"status\"],\n \"collections\": info[\"collections\"],\n \"api_endpoints\": info[\"api_endpoints\"],\n \"keyspaces\": info[\"keyspaces\"],\n \"org_id\": info[\"org_id\"],\n }\n for name, info in self.get_database_list().items()\n ]\n except Exception as e:\n msg = f\"Error fetching database options: {e}\"\n raise ValueError(msg) from e\n\n @classmethod\n def get_provider_icon(cls, collection: CollectionDescriptor | None = None, provider_name: str | None = None) -> str:\n # Get the provider name from the collection\n provider_name = provider_name or (\n collection.definition.vector.service.provider\n if (\n collection\n and collection.definition\n and collection.definition.vector\n and collection.definition.vector.service\n )\n else None\n )\n\n # If there is no provider, use the vector store icon\n if not provider_name or provider_name.lower() == \"bring your own\":\n return \"vectorstores\"\n\n # Map provider casings\n case_map = {\n \"nvidia\": \"NVIDIA\",\n \"openai\": \"OpenAI\",\n \"amazon bedrock\": \"AmazonBedrockEmbeddings\",\n \"azure openai\": \"AzureOpenAiEmbeddings\",\n \"cohere\": \"Cohere\",\n \"jina ai\": \"JinaAI\",\n \"mistral ai\": \"MistralAI\",\n \"upstage\": \"Upstage\",\n \"voyage ai\": \"VoyageAI\",\n }\n\n # Adjust the casing on some like nvidia\n return case_map[provider_name.lower()] if provider_name.lower() in case_map else provider_name.title()\n\n def _initialize_collection_options(self, api_endpoint: str | None = None):\n # Nothing to generate if we don't have an API endpoint yet\n api_endpoint = api_endpoint or self.get_api_endpoint()\n if not api_endpoint:\n return []\n\n # Retrieve the database object\n database = self.get_database_object(api_endpoint=api_endpoint)\n\n # Get the list of collections\n collection_list = database.list_collections(keyspace=self.get_keyspace())\n\n # Return the list of collections and metadata associated\n return [\n {\n \"name\": col.name,\n \"records\": self.collection_data(collection_name=col.name, database=database),\n \"provider\": (\n col.definition.vector.service.provider\n if col.definition.vector and col.definition.vector.service\n else None\n ),\n \"icon\": self.get_provider_icon(collection=col),\n \"model\": (\n col.definition.vector.service.model_name\n if col.definition.vector and col.definition.vector.service\n else None\n ),\n }\n for col in collection_list\n ]\n\n def reset_provider_options(self, build_config: dict) -> dict:\n \"\"\"Reset provider options and related configurations in the build_config dictionary.\"\"\"\n # Extract template path for cleaner access\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n\n # Get vectorize providers\n vectorize_providers_api = self.get_vectorize_providers(\n token=self.token,\n environment=self.environment,\n api_endpoint=build_config[\"api_endpoint\"][\"value\"],\n )\n\n # Create a new dictionary with \"Bring your own\" first\n vectorize_providers: dict[str, list[list[str]]] = {\"Bring your own\": [[], []]}\n\n # Add the remaining items (only Nvidia) from the original dictionary\n vectorize_providers.update(\n {\n k: v\n for k, v in vectorize_providers_api.items()\n if k.lower() in [\"nvidia\"] # TODO: Eventually support more\n }\n )\n\n # Set provider options\n provider_field = \"02_embedding_generation_provider\"\n template[provider_field][\"options\"] = list(vectorize_providers.keys())\n\n # Add metadata for each provider option\n template[provider_field][\"options_metadata\"] = [\n {\"icon\": self.get_provider_icon(provider_name=provider)} for provider in template[provider_field][\"options\"]\n ]\n\n # Get selected embedding provider\n embedding_provider = template[provider_field][\"value\"]\n is_bring_your_own = embedding_provider and embedding_provider == \"Bring your own\"\n\n # Configure embedding model field\n model_field = \"03_embedding_generation_model\"\n template[model_field].update(\n {\n \"options\": vectorize_providers.get(embedding_provider, [[], []])[1],\n \"placeholder\": \"Bring your own\" if is_bring_your_own else None,\n \"readonly\": is_bring_your_own,\n \"required\": not is_bring_your_own,\n \"value\": None,\n }\n )\n\n # If this is a bring your own, set dimensions to 0\n return self.reset_dimension_field(build_config)\n\n def reset_dimension_field(self, build_config: dict) -> dict:\n \"\"\"Reset dimension field options based on provided configuration.\"\"\"\n # Extract template path for cleaner access\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n\n # Get selected embedding model\n provider_field = \"02_embedding_generation_provider\"\n embedding_provider = template[provider_field][\"value\"]\n is_bring_your_own = embedding_provider and embedding_provider == \"Bring your own\"\n\n # Configure dimension field\n dimension_field = \"04_dimension\"\n dimension_value = 1024 if not is_bring_your_own else None # TODO: Dynamically figure this out\n template[dimension_field].update(\n {\n \"placeholder\": dimension_value,\n \"value\": dimension_value,\n \"readonly\": not is_bring_your_own,\n \"required\": is_bring_your_own,\n }\n )\n\n return build_config\n\n def reset_collection_list(self, build_config: dict) -> dict:\n \"\"\"Reset collection list options based on provided configuration.\"\"\"\n # Get collection options\n collection_options = self._initialize_collection_options(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n # Update collection configuration\n collection_config = build_config[\"collection_name\"]\n collection_config.update(\n {\n \"options\": [col[\"name\"] for col in collection_options],\n \"options_metadata\": [{k: v for k, v in col.items() if k != \"name\"} for col in collection_options],\n }\n )\n\n # Reset selected collection if not in options\n if collection_config[\"value\"] not in collection_config[\"options\"]:\n collection_config[\"value\"] = \"\"\n\n # Set advanced status based on database selection\n collection_config[\"show\"] = bool(build_config[\"database_name\"][\"value\"])\n\n return build_config\n\n def reset_database_list(self, build_config: dict) -> dict:\n \"\"\"Reset database list options and related configurations.\"\"\"\n # Get database options\n database_options = self._initialize_database_options()\n\n # Update cloud provider options\n env = self.environment\n template = build_config[\"database_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"02_cloud_provider\"][\"options\"] = list(self.map_cloud_providers()[env].keys())\n\n # Update database configuration\n database_config = build_config[\"database_name\"]\n database_config.update(\n {\n \"options\": [db[\"name\"] for db in database_options],\n \"options_metadata\": [{k: v for k, v in db.items() if k != \"name\"} for db in database_options],\n }\n )\n\n # Reset selections if value not in options\n if database_config[\"value\"] not in database_config[\"options\"]:\n database_config[\"value\"] = \"\"\n build_config[\"api_endpoint\"][\"options\"] = []\n build_config[\"api_endpoint\"][\"value\"] = \"\"\n build_config[\"collection_name\"][\"show\"] = False\n\n # Set advanced status based on token presence\n database_config[\"show\"] = bool(build_config[\"token\"][\"value\"])\n\n return build_config\n\n def reset_build_config(self, build_config: dict) -> dict:\n \"\"\"Reset all build configuration options to default empty state.\"\"\"\n # Reset database configuration\n database_config = build_config[\"database_name\"]\n database_config.update({\"options\": [], \"options_metadata\": [], \"value\": \"\", \"show\": False})\n build_config[\"api_endpoint\"][\"options\"] = []\n build_config[\"api_endpoint\"][\"value\"] = \"\"\n\n # Reset collection configuration\n collection_config = build_config[\"collection_name\"]\n collection_config.update({\"options\": [], \"options_metadata\": [], \"value\": \"\", \"show\": False})\n\n return build_config\n\n def _handle_hybrid_search_options(self, build_config: dict) -> dict:\n \"\"\"Set hybrid search options in the build configuration.\"\"\"\n # Detect what hybrid options are available\n # Get the admin object\n client = DataAPIClient(environment=self.environment)\n admin_client = client.get_admin()\n db_admin = admin_client.get_database_admin(self.get_api_endpoint(), token=self.token)\n\n # We will try to get the reranking providers to see if its hybrid emabled\n try:\n providers = db_admin.find_reranking_providers()\n build_config[\"reranker\"][\"options\"] = [\n model.name for provider_data in providers.reranking_providers.values() for model in provider_data.models\n ]\n build_config[\"reranker\"][\"options_metadata\"] = [\n {\"icon\": self.get_provider_icon(provider_name=model.name.split(\"/\")[0])}\n for provider in providers.reranking_providers.values()\n for model in provider.models\n ]\n build_config[\"reranker\"][\"value\"] = build_config[\"reranker\"][\"options\"][0]\n\n # Set the default search field to hybrid search\n build_config[\"search_method\"][\"show\"] = True\n build_config[\"search_method\"][\"options\"] = [\"Hybrid Search\", \"Vector Search\"]\n build_config[\"search_method\"][\"value\"] = \"Hybrid Search\"\n except Exception as _: # noqa: BLE001\n build_config[\"reranker\"][\"options\"] = []\n build_config[\"reranker\"][\"options_metadata\"] = []\n\n # Set the default search field to vector search\n build_config[\"search_method\"][\"show\"] = False\n build_config[\"search_method\"][\"options\"] = [\"Vector Search\"]\n build_config[\"search_method\"][\"value\"] = \"Vector Search\"\n\n return build_config\n\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Update build configuration based on field name and value.\"\"\"\n # Early return if no token provided\n if not self.token:\n return self.reset_build_config(build_config)\n\n # Database creation callback\n if field_name == \"database_name\" and isinstance(field_value, dict):\n if \"01_new_database_name\" in field_value:\n await self._create_new_database(build_config, field_value)\n return self.reset_collection_list(build_config)\n return self._update_cloud_regions(build_config, field_value)\n\n # Collection creation callback\n if field_name == \"collection_name\" and isinstance(field_value, dict):\n # Case 1: New collection creation\n if \"01_new_collection_name\" in field_value:\n await self._create_new_collection(build_config, field_value)\n return build_config\n\n # Case 2: Update embedding provider options\n if \"02_embedding_generation_provider\" in field_value:\n return self.reset_provider_options(build_config)\n\n # Case 3: Update dimension field\n if \"03_embedding_generation_model\" in field_value:\n return self.reset_dimension_field(build_config)\n\n # Initial execution or token/environment change\n first_run = field_name == \"collection_name\" and not field_value and not build_config[\"database_name\"][\"options\"]\n if first_run or field_name in {\"token\", \"environment\"}:\n return self.reset_database_list(build_config)\n\n # Database selection change\n if field_name == \"database_name\" and not isinstance(field_value, dict):\n return self._handle_database_selection(build_config, field_value)\n\n # Keyspace selection change\n if field_name == \"keyspace\":\n return self.reset_collection_list(build_config)\n\n # Collection selection change\n if field_name == \"collection_name\" and not isinstance(field_value, dict):\n return self._handle_collection_selection(build_config, field_value)\n\n # Search method selection change\n if field_name == \"search_method\":\n is_vector_search = field_value == \"Vector Search\"\n is_autodetect = build_config[\"autodetect_collection\"][\"value\"]\n\n # Configure lexical terms (same for both cases)\n build_config[\"lexical_terms\"][\"show\"] = not is_vector_search\n build_config[\"lexical_terms\"][\"value\"] = \"\" if is_vector_search else build_config[\"lexical_terms\"][\"value\"]\n\n # Disable reranker disabling if hybrid search is selected\n build_config[\"reranker\"][\"show\"] = not is_vector_search\n build_config[\"reranker\"][\"toggle_disable\"] = not is_vector_search\n build_config[\"reranker\"][\"toggle_value\"] = True\n build_config[\"reranker\"][\"value\"] = build_config[\"reranker\"][\"options\"][0]\n\n # Toggle search type and score threshold based on search method\n build_config[\"search_type\"][\"show\"] = is_vector_search\n build_config[\"search_score_threshold\"][\"show\"] = is_vector_search\n\n # Make sure the search_type is set to \"Similarity\"\n if not is_vector_search or is_autodetect:\n build_config[\"search_type\"][\"value\"] = \"Similarity\"\n\n return build_config\n\n async def _create_new_database(self, build_config: dict, field_value: dict) -> None:\n \"\"\"Create a new database and update build config options.\"\"\"\n try:\n await self.create_database_api(\n new_database_name=field_value[\"01_new_database_name\"],\n token=self.token,\n keyspace=self.get_keyspace(),\n environment=self.environment,\n cloud_provider=field_value[\"02_cloud_provider\"],\n region=field_value[\"03_region\"],\n )\n except Exception as e:\n msg = f\"Error creating database: {e}\"\n raise ValueError(msg) from e\n\n build_config[\"database_name\"][\"options\"].append(field_value[\"01_new_database_name\"])\n build_config[\"database_name\"][\"options_metadata\"].append(\n {\n \"status\": \"PENDING\",\n \"collections\": 0,\n \"api_endpoints\": [],\n \"keyspaces\": [self.get_keyspace()],\n \"org_id\": None,\n }\n )\n\n def _update_cloud_regions(self, build_config: dict, field_value: dict) -> dict:\n \"\"\"Update cloud provider regions in build config.\"\"\"\n env = self.environment\n cloud_provider = field_value[\"02_cloud_provider\"]\n\n # Update the region options based on the selected cloud provider\n template = build_config[\"database_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"03_region\"][\"options\"] = self.map_cloud_providers()[env][cloud_provider][\"regions\"]\n\n # Reset the the 03_region value if it's not in the new options\n if template[\"03_region\"][\"value\"] not in template[\"03_region\"][\"options\"]:\n template[\"03_region\"][\"value\"] = None\n\n return build_config\n\n async def _create_new_collection(self, build_config: dict, field_value: dict) -> None:\n \"\"\"Create a new collection and update build config options.\"\"\"\n embedding_provider = field_value.get(\"02_embedding_generation_provider\")\n try:\n await self.create_collection_api(\n new_collection_name=field_value[\"01_new_collection_name\"],\n token=self.token,\n api_endpoint=build_config[\"api_endpoint\"][\"value\"],\n environment=self.environment,\n keyspace=self.get_keyspace(),\n dimension=field_value.get(\"04_dimension\") if embedding_provider == \"Bring your own\" else None,\n embedding_generation_provider=embedding_provider,\n embedding_generation_model=field_value.get(\"03_embedding_generation_model\"),\n reranker=self.reranker,\n )\n except Exception as e:\n msg = f\"Error creating collection: {e}\"\n raise ValueError(msg) from e\n\n provider = embedding_provider.lower() if embedding_provider and embedding_provider != \"Bring your own\" else None\n build_config[\"collection_name\"].update(\n {\n \"value\": field_value[\"01_new_collection_name\"],\n \"options\": build_config[\"collection_name\"][\"options\"] + [field_value[\"01_new_collection_name\"]],\n }\n )\n build_config[\"embedding_model\"][\"show\"] = not bool(provider)\n build_config[\"embedding_model\"][\"required\"] = not bool(provider)\n build_config[\"collection_name\"][\"options_metadata\"].append(\n {\n \"records\": 0,\n \"provider\": provider,\n \"icon\": self.get_provider_icon(provider_name=provider),\n \"model\": field_value.get(\"03_embedding_generation_model\"),\n }\n )\n\n # Make sure we always show the reranker options if the collection is hybrid enabled\n # And right now they always are\n build_config[\"lexical_terms\"][\"show\"] = True\n\n def _handle_database_selection(self, build_config: dict, field_value: str) -> dict:\n \"\"\"Handle database selection and update related configurations.\"\"\"\n build_config = self.reset_database_list(build_config)\n\n # Reset collection list if database selection changes\n if field_value not in build_config[\"database_name\"][\"options\"]:\n build_config[\"database_name\"][\"value\"] = \"\"\n return build_config\n\n # Get the api endpoint for the selected database\n index = build_config[\"database_name\"][\"options\"].index(field_value)\n build_config[\"api_endpoint\"][\"options\"] = build_config[\"database_name\"][\"options_metadata\"][index][\n \"api_endpoints\"\n ]\n build_config[\"api_endpoint\"][\"value\"] = build_config[\"database_name\"][\"options_metadata\"][index][\n \"api_endpoints\"\n ][0]\n\n # Get the org_id for the selected database\n org_id = build_config[\"database_name\"][\"options_metadata\"][index][\"org_id\"]\n if not org_id:\n return build_config\n\n # Update the list of keyspaces based on the db info\n build_config[\"keyspace\"][\"options\"] = build_config[\"database_name\"][\"options_metadata\"][index][\"keyspaces\"]\n build_config[\"keyspace\"][\"value\"] = (\n build_config[\"keyspace\"][\"options\"] and build_config[\"keyspace\"][\"options\"][0]\n if build_config[\"keyspace\"][\"value\"] not in build_config[\"keyspace\"][\"options\"]\n else build_config[\"keyspace\"][\"value\"]\n )\n\n # Get the database id for the selected database\n db_id = self.get_database_id_static(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n keyspace = self.get_keyspace()\n\n # Update the helper text for the embedding provider field\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"02_embedding_generation_provider\"][\"helper_text\"] = (\n \"To create collections with more embedding provider options, go to \"\n f'<a class=\"underline\" target=\"_blank\" rel=\"noopener noreferrer\" '\n f'href=\"https://astra.datastax.com/org/{org_id}/database/{db_id}/data-explorer?createCollection=1&namespace={keyspace}\">'\n \"your database in Astra DB</a>.\"\n )\n\n # Reset provider options\n build_config = self.reset_provider_options(build_config)\n\n # Handle hybrid search options\n build_config = self._handle_hybrid_search_options(build_config)\n\n return self.reset_collection_list(build_config)\n\n def _handle_collection_selection(self, build_config: dict, field_value: str) -> dict:\n \"\"\"Handle collection selection and update embedding options.\"\"\"\n build_config[\"autodetect_collection\"][\"value\"] = True\n build_config = self.reset_collection_list(build_config)\n\n # Reset embedding model if collection selection changes\n if field_value and field_value not in build_config[\"collection_name\"][\"options\"]:\n build_config[\"collection_name\"][\"options\"].append(field_value)\n build_config[\"collection_name\"][\"options_metadata\"].append(\n {\n \"records\": 0,\n \"provider\": None,\n \"icon\": \"vectorstores\",\n \"model\": None,\n }\n )\n build_config[\"autodetect_collection\"][\"value\"] = False\n\n if not field_value:\n return build_config\n\n # Get the selected collection index\n index = build_config[\"collection_name\"][\"options\"].index(field_value)\n\n # Set the provider of the selected collection\n provider = build_config[\"collection_name\"][\"options_metadata\"][index][\"provider\"]\n build_config[\"embedding_model\"][\"show\"] = not bool(provider)\n build_config[\"embedding_model\"][\"required\"] = not bool(provider)\n\n # Grab the collection object\n database = self.get_database_object(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n collection = database.get_collection(\n name=field_value,\n keyspace=build_config[\"keyspace\"][\"value\"],\n )\n\n # Check if hybrid and lexical are enabled\n col_options = collection.options()\n hyb_enabled = col_options.rerank and col_options.rerank.enabled\n lex_enabled = col_options.lexical and col_options.lexical.enabled\n user_hyb_enabled = build_config[\"search_method\"][\"value\"] == \"Hybrid Search\"\n\n # Reranker visible when both the collection supports it and the user selected Hybrid\n hybrid_active = bool(hyb_enabled and user_hyb_enabled)\n build_config[\"reranker\"][\"show\"] = hybrid_active\n build_config[\"reranker\"][\"toggle_value\"] = hybrid_active\n build_config[\"reranker\"][\"toggle_disable\"] = False # allow user to toggle if visible\n\n # If hybrid is active, lock search_type to \"Similarity\"\n if hybrid_active:\n build_config[\"search_type\"][\"value\"] = \"Similarity\"\n\n # Show the lexical terms option only if the collection enables lexical search\n build_config[\"lexical_terms\"][\"show\"] = bool(lex_enabled)\n\n return build_config\n\n @check_cached_vector_store\n def build_vector_store(self):\n try:\n from langchain_astradb import AstraDBVectorStore\n except ImportError as e:\n msg = (\n \"Could not import langchain Astra DB integration package. \"\n \"Please install it with `pip install langchain-astradb`.\"\n )\n raise ImportError(msg) from e\n\n # Get the embedding model and additional params\n embedding_params = {\"embedding\": self.embedding_model} if self.embedding_model else {}\n\n # Get the additional parameters\n additional_params = self.astradb_vectorstore_kwargs or {}\n\n # Get Langflow version and platform information\n __version__ = get_version_info()[\"version\"]\n langflow_prefix = \"\"\n # if os.getenv(\"AWS_EXECUTION_ENV\") == \"AWS_ECS_FARGATE\": # TODO: More precise way of detecting\n # langflow_prefix = \"ds-\"\n\n # Get the database object\n database = self.get_database_object()\n autodetect = self.collection_name in database.list_collection_names() and self.autodetect_collection\n\n # Bundle up the auto-detect parameters\n autodetect_params = {\n \"autodetect_collection\": autodetect,\n \"content_field\": (\n self.content_field\n if self.content_field and embedding_params\n else (\n \"page_content\"\n if embedding_params\n and self.collection_data(collection_name=self.collection_name, database=database) == 0\n else None\n )\n ),\n \"ignore_invalid_documents\": self.ignore_invalid_documents,\n }\n\n # Choose HybridSearchMode based on the selected param\n hybrid_search_mode = HybridSearchMode.DEFAULT if self.search_method == \"Hybrid Search\" else HybridSearchMode.OFF\n\n # Attempt to build the Vector Store object\n try:\n vector_store = AstraDBVectorStore(\n # Astra DB Authentication Parameters\n token=self.token,\n api_endpoint=database.api_endpoint,\n namespace=database.keyspace,\n collection_name=self.collection_name,\n environment=self.environment,\n # Hybrid Search Parameters\n hybrid_search=hybrid_search_mode,\n # Astra DB Usage Tracking Parameters\n ext_callers=[(f\"{langflow_prefix}langflow\", __version__)],\n # Astra DB Vector Store Parameters\n **autodetect_params,\n **embedding_params,\n **additional_params,\n )\n except Exception as e:\n msg = f\"Error initializing AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n # Add documents to the vector store\n self._add_documents_to_vector_store(vector_store)\n\n return vector_store\n\n def _add_documents_to_vector_store(self, vector_store) -> None:\n self.ingest_data = self._prepare_ingest_data()\n\n documents = []\n for _input in self.ingest_data or []:\n if isinstance(_input, Data):\n documents.append(_input.to_lc_document())\n else:\n msg = \"Vector Store Inputs must be Data objects.\"\n raise TypeError(msg)\n\n documents = [\n Document(page_content=doc.page_content, metadata=serialize(doc.metadata, to_str=True)) for doc in documents\n ]\n\n if documents and self.deletion_field:\n self.log(f\"Deleting documents where {self.deletion_field}\")\n try:\n database = self.get_database_object()\n collection = database.get_collection(self.collection_name, keyspace=database.keyspace)\n delete_values = list({doc.metadata[self.deletion_field] for doc in documents})\n self.log(f\"Deleting documents where {self.deletion_field} matches {delete_values}.\")\n collection.delete_many({f\"metadata.{self.deletion_field}\": {\"$in\": delete_values}})\n except Exception as e:\n msg = f\"Error deleting documents from AstraDBVectorStore based on '{self.deletion_field}': {e}\"\n raise ValueError(msg) from e\n\n if documents:\n self.log(f\"Adding {len(documents)} documents to the Vector Store.\")\n try:\n vector_store.add_documents(documents)\n except Exception as e:\n msg = f\"Error adding documents to AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n else:\n self.log(\"No documents to add to the Vector Store.\")\n\n def _map_search_type(self) -> str:\n search_type_mapping = {\n \"Similarity with score threshold\": \"similarity_score_threshold\",\n \"MMR (Max Marginal Relevance)\": \"mmr\",\n }\n\n return search_type_mapping.get(self.search_type, \"similarity\")\n\n def _build_search_args(self):\n # Clean up the search query\n query = self.search_query if isinstance(self.search_query, str) and self.search_query.strip() else None\n lexical_terms = self.lexical_terms or None\n\n # Check if we have a search query, and if so set the args\n if query:\n args = {\n \"query\": query,\n \"search_type\": self._map_search_type(),\n \"k\": self.number_of_results,\n \"score_threshold\": self.search_score_threshold,\n \"lexical_query\": lexical_terms,\n }\n elif self.advanced_search_filter:\n args = {\n \"n\": self.number_of_results,\n }\n else:\n return {}\n\n filter_arg = self.advanced_search_filter or {}\n if filter_arg:\n args[\"filter\"] = filter_arg\n\n return args\n\n def search_documents(self, vector_store=None) -> list[Data]:\n vector_store = vector_store or self.build_vector_store()\n\n self.log(f\"Search input: {self.search_query}\")\n self.log(f\"Search type: {self.search_type}\")\n self.log(f\"Number of results: {self.number_of_results}\")\n self.log(f\"store.hybrid_search: {vector_store.hybrid_search}\")\n self.log(f\"Lexical terms: {self.lexical_terms}\")\n self.log(f\"Reranker: {self.reranker}\")\n\n try:\n search_args = self._build_search_args()\n except Exception as e:\n msg = f\"Error in AstraDBVectorStore._build_search_args: {e}\"\n raise ValueError(msg) from e\n\n if not search_args:\n self.log(\"No search input or filters provided. Skipping search.\")\n return []\n\n docs = []\n search_method = \"search\" if \"query\" in search_args else \"metadata_search\"\n\n try:\n self.log(f\"Calling vector_store.{search_method} with args: {search_args}\")\n docs = getattr(vector_store, search_method)(**search_args)\n except Exception as e:\n msg = f\"Error performing {search_method} in AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n self.log(f\"Retrieved documents: {len(docs)}\")\n\n data = docs_to_data(docs)\n self.log(f\"Converted documents to data: {len(data)}\")\n self.status = data\n\n return data\n\n def get_retriever_kwargs(self):\n search_args = self._build_search_args()\n\n return {\n \"search_type\": self._map_search_type(),\n \"search_kwargs\": search_args,\n }\n"
|
|
3649
|
+
"value": "from astrapy import DataAPIClient\nfrom langchain_core.documents import Document\n\nfrom lfx.base.datastax.astradb_base import AstraDBBaseComponent\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.helpers.data import docs_to_data\nfrom lfx.io import BoolInput, DropdownInput, FloatInput, HandleInput, IntInput, NestedDictInput, QueryInput, StrInput\nfrom lfx.schema.data import Data\nfrom lfx.serialization import serialize\nfrom lfx.utils.version import get_version_info\n\n\n@vector_store_connection\nclass AstraDBVectorStoreComponent(AstraDBBaseComponent, LCVectorStoreComponent):\n display_name: str = \"Astra DB\"\n description: str = \"Ingest and search documents in Astra DB\"\n documentation: str = \"https://docs.langflow.org/bundles-datastax#astra-db\"\n name = \"AstraDB\"\n icon: str = \"AstraDB\"\n\n inputs = [\n *AstraDBBaseComponent.inputs,\n *LCVectorStoreComponent.inputs,\n HandleInput(\n name=\"embedding_model\",\n display_name=\"Embedding Model\",\n input_types=[\"Embeddings\"],\n info=\"Specify the Embedding Model. Not required for Astra Vectorize collections.\",\n required=False,\n show=True,\n ),\n StrInput(\n name=\"content_field\",\n display_name=\"Content Field\",\n info=\"Field to use as the text content field for the vector store.\",\n advanced=True,\n ),\n StrInput(\n name=\"deletion_field\",\n display_name=\"Deletion Based On Field\",\n info=\"When this parameter is provided, documents in the target collection with \"\n \"metadata field values matching the input metadata field value will be deleted \"\n \"before new data is loaded.\",\n advanced=True,\n ),\n BoolInput(\n name=\"ignore_invalid_documents\",\n display_name=\"Ignore Invalid Documents\",\n info=\"Boolean flag to determine whether to ignore invalid documents at runtime.\",\n advanced=True,\n ),\n NestedDictInput(\n name=\"astradb_vectorstore_kwargs\",\n display_name=\"AstraDBVectorStore Parameters\",\n info=\"Optional dictionary of additional parameters for the AstraDBVectorStore.\",\n advanced=True,\n ),\n DropdownInput(\n name=\"search_method\",\n display_name=\"Search Method\",\n info=(\n \"Determine how your content is matched: Vector finds semantic similarity, \"\n \"and Hybrid Search (suggested) combines both approaches \"\n \"with a reranker.\"\n ),\n options=[\"Hybrid Search\", \"Vector Search\"], # TODO: Restore Lexical Search?\n options_metadata=[{\"icon\": \"SearchHybrid\"}, {\"icon\": \"SearchVector\"}],\n value=\"Vector Search\",\n advanced=True,\n real_time_refresh=True,\n ),\n DropdownInput(\n name=\"reranker\",\n display_name=\"Reranker\",\n info=\"Post-retrieval model that re-scores results for optimal relevance ranking.\",\n show=False,\n toggle=True,\n ),\n QueryInput(\n name=\"lexical_terms\",\n display_name=\"Lexical Terms\",\n info=\"Add additional terms/keywords to augment search precision.\",\n placeholder=\"Enter terms to search...\",\n separator=\" \",\n show=False,\n value=\"\",\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Number of Search Results\",\n info=\"Number of search results to return.\",\n advanced=True,\n value=4,\n ),\n DropdownInput(\n name=\"search_type\",\n display_name=\"Search Type\",\n info=\"Search type to use\",\n options=[\"Similarity\", \"Similarity with score threshold\", \"MMR (Max Marginal Relevance)\"],\n value=\"Similarity\",\n advanced=True,\n ),\n FloatInput(\n name=\"search_score_threshold\",\n display_name=\"Search Score Threshold\",\n info=\"Minimum similarity score threshold for search results. \"\n \"(when using 'Similarity with score threshold')\",\n value=0,\n advanced=True,\n ),\n NestedDictInput(\n name=\"advanced_search_filter\",\n display_name=\"Search Metadata Filter\",\n info=\"Optional dictionary of filters to apply to the search query.\",\n advanced=True,\n ),\n ]\n\n async def update_build_config(\n self,\n build_config: dict,\n field_value: str | dict,\n field_name: str | None = None,\n ) -> dict:\n \"\"\"Update build configuration with proper handling of embedding and search options.\"\"\"\n # Handle base astra db build config updates\n build_config = await super().update_build_config(\n build_config,\n field_value=field_value,\n field_name=field_name,\n )\n\n # Set embedding model display based on provider selection\n if isinstance(field_value, dict) and \"02_embedding_generation_provider\" in field_value:\n embedding_provider = field_value.get(\"02_embedding_generation_provider\")\n is_custom_provider = embedding_provider and embedding_provider != \"Bring your own\"\n provider = embedding_provider.lower() if is_custom_provider and embedding_provider is not None else None\n\n build_config[\"embedding_model\"][\"show\"] = not bool(provider)\n build_config[\"embedding_model\"][\"required\"] = not bool(provider)\n\n # Early return if no API endpoint is configured\n if not self.get_api_endpoint():\n return build_config\n\n # Configure search method and related options\n return self._configure_search_options(build_config)\n\n def _configure_search_options(self, build_config: dict) -> dict:\n \"\"\"Configure hybrid search, reranker, and vector search options.\"\"\"\n # Detect available hybrid search capabilities\n hybrid_capabilities = self._detect_hybrid_capabilities()\n\n # Return if we haven't selected a collection\n if not build_config[\"collection_name\"][\"options\"] or not build_config[\"collection_name\"][\"value\"]:\n return build_config\n\n # Get collection options\n collection_options = self._get_collection_options(build_config)\n\n # Get the selected collection index\n index = build_config[\"collection_name\"][\"options\"].index(build_config[\"collection_name\"][\"value\"])\n provider = build_config[\"collection_name\"][\"options_metadata\"][index][\"provider\"]\n build_config[\"embedding_model\"][\"show\"] = not bool(provider)\n build_config[\"embedding_model\"][\"required\"] = not bool(provider)\n\n # Determine search configuration\n is_vector_search = build_config[\"search_method\"][\"value\"] == \"Vector Search\"\n is_autodetect = build_config[\"autodetect_collection\"][\"value\"]\n\n # Apply hybrid search configuration\n if hybrid_capabilities[\"available\"]:\n build_config[\"search_method\"][\"show\"] = True\n build_config[\"search_method\"][\"options\"] = [\"Hybrid Search\", \"Vector Search\"]\n build_config[\"search_method\"][\"value\"] = build_config[\"search_method\"].get(\"value\", \"Hybrid Search\")\n\n build_config[\"reranker\"][\"options\"] = hybrid_capabilities[\"reranker_models\"]\n build_config[\"reranker\"][\"options_metadata\"] = hybrid_capabilities[\"reranker_metadata\"]\n if hybrid_capabilities[\"reranker_models\"]:\n build_config[\"reranker\"][\"value\"] = hybrid_capabilities[\"reranker_models\"][0]\n else:\n build_config[\"search_method\"][\"show\"] = False\n build_config[\"search_method\"][\"options\"] = [\"Vector Search\"]\n build_config[\"search_method\"][\"value\"] = \"Vector Search\"\n build_config[\"reranker\"][\"options\"] = []\n build_config[\"reranker\"][\"options_metadata\"] = []\n\n # Configure reranker visibility and state\n hybrid_enabled = (\n collection_options[\"rerank_enabled\"] and build_config[\"search_method\"][\"value\"] == \"Hybrid Search\"\n )\n\n build_config[\"reranker\"][\"show\"] = hybrid_enabled\n build_config[\"reranker\"][\"toggle_value\"] = hybrid_enabled\n build_config[\"reranker\"][\"toggle_disable\"] = is_vector_search\n\n # Configure lexical terms\n lexical_visible = collection_options[\"lexical_enabled\"] and not is_vector_search\n build_config[\"lexical_terms\"][\"show\"] = lexical_visible\n build_config[\"lexical_terms\"][\"value\"] = \"\" if is_vector_search else build_config[\"lexical_terms\"][\"value\"]\n\n # Configure search type and score threshold\n build_config[\"search_type\"][\"show\"] = is_vector_search\n build_config[\"search_score_threshold\"][\"show\"] = is_vector_search\n\n # Force similarity search for hybrid mode or autodetect\n if hybrid_enabled or is_autodetect:\n build_config[\"search_type\"][\"value\"] = \"Similarity\"\n\n return build_config\n\n def _detect_hybrid_capabilities(self) -> dict:\n \"\"\"Detect available hybrid search and reranking capabilities.\"\"\"\n environment = self.get_environment(self.environment)\n client = DataAPIClient(environment=environment)\n admin_client = client.get_admin()\n db_admin = admin_client.get_database_admin(self.get_api_endpoint(), token=self.token)\n\n try:\n providers = db_admin.find_reranking_providers()\n reranker_models = [\n model.name for provider_data in providers.reranking_providers.values() for model in provider_data.models\n ]\n reranker_metadata = [\n {\"icon\": self.get_provider_icon(provider_name=model.name.split(\"/\")[0])}\n for provider in providers.reranking_providers.values()\n for model in provider.models\n ]\n except (AttributeError, KeyError) as e:\n self.log(f\"Hybrid search not available: {e}\")\n return {\n \"available\": False,\n \"reranker_models\": [],\n \"reranker_metadata\": [],\n }\n else:\n return {\n \"available\": True,\n \"reranker_models\": reranker_models,\n \"reranker_metadata\": reranker_metadata,\n }\n\n def _get_collection_options(self, build_config: dict) -> dict:\n \"\"\"Retrieve collection-level search options.\"\"\"\n database = self.get_database_object(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n collection = database.get_collection(\n name=build_config[\"collection_name\"][\"value\"],\n keyspace=build_config[\"keyspace\"][\"value\"],\n )\n\n col_options = collection.options()\n\n return {\n \"rerank_enabled\": bool(col_options.rerank and col_options.rerank.enabled),\n \"lexical_enabled\": bool(col_options.lexical and col_options.lexical.enabled),\n }\n\n @check_cached_vector_store\n def build_vector_store(self):\n try:\n from langchain_astradb import AstraDBVectorStore\n from langchain_astradb.utils.astradb import HybridSearchMode\n except ImportError as e:\n msg = (\n \"Could not import langchain Astra DB integration package. \"\n \"Please install it with `pip install langchain-astradb`.\"\n )\n raise ImportError(msg) from e\n\n # Get the embedding model and additional params\n embedding_params = {\"embedding\": self.embedding_model} if self.embedding_model else {}\n\n # Get the additional parameters\n additional_params = self.astradb_vectorstore_kwargs or {}\n\n # Get Langflow version and platform information\n __version__ = get_version_info()[\"version\"]\n langflow_prefix = \"\"\n # if os.getenv(\"AWS_EXECUTION_ENV\") == \"AWS_ECS_FARGATE\": # TODO: More precise way of detecting\n # langflow_prefix = \"ds-\"\n\n # Get the database object\n database = self.get_database_object()\n autodetect = self.collection_name in database.list_collection_names() and self.autodetect_collection\n\n # Bundle up the auto-detect parameters\n autodetect_params = {\n \"autodetect_collection\": autodetect,\n \"content_field\": (\n self.content_field\n if self.content_field and embedding_params\n else (\n \"page_content\"\n if embedding_params\n and self.collection_data(collection_name=self.collection_name, database=database) == 0\n else None\n )\n ),\n \"ignore_invalid_documents\": self.ignore_invalid_documents,\n }\n\n # Choose HybridSearchMode based on the selected param\n hybrid_search_mode = HybridSearchMode.DEFAULT if self.search_method == \"Hybrid Search\" else HybridSearchMode.OFF\n\n # Attempt to build the Vector Store object\n try:\n vector_store = AstraDBVectorStore(\n # Astra DB Authentication Parameters\n token=self.token,\n api_endpoint=database.api_endpoint,\n namespace=database.keyspace,\n collection_name=self.collection_name,\n environment=self.environment,\n # Hybrid Search Parameters\n hybrid_search=hybrid_search_mode,\n # Astra DB Usage Tracking Parameters\n ext_callers=[(f\"{langflow_prefix}langflow\", __version__)],\n # Astra DB Vector Store Parameters\n **autodetect_params,\n **embedding_params,\n **additional_params,\n )\n except ValueError as e:\n msg = f\"Error initializing AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n # Add documents to the vector store\n self._add_documents_to_vector_store(vector_store)\n\n return vector_store\n\n def _add_documents_to_vector_store(self, vector_store) -> None:\n self.ingest_data = self._prepare_ingest_data()\n\n documents = []\n for _input in self.ingest_data or []:\n if isinstance(_input, Data):\n documents.append(_input.to_lc_document())\n else:\n msg = \"Vector Store Inputs must be Data objects.\"\n raise TypeError(msg)\n\n documents = [\n Document(page_content=doc.page_content, metadata=serialize(doc.metadata, to_str=True)) for doc in documents\n ]\n\n if documents and self.deletion_field:\n self.log(f\"Deleting documents where {self.deletion_field}\")\n try:\n database = self.get_database_object()\n collection = database.get_collection(self.collection_name, keyspace=database.keyspace)\n delete_values = list({doc.metadata[self.deletion_field] for doc in documents})\n self.log(f\"Deleting documents where {self.deletion_field} matches {delete_values}.\")\n collection.delete_many({f\"metadata.{self.deletion_field}\": {\"$in\": delete_values}})\n except ValueError as e:\n msg = f\"Error deleting documents from AstraDBVectorStore based on '{self.deletion_field}': {e}\"\n raise ValueError(msg) from e\n\n if documents:\n self.log(f\"Adding {len(documents)} documents to the Vector Store.\")\n try:\n vector_store.add_documents(documents)\n except ValueError as e:\n msg = f\"Error adding documents to AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n else:\n self.log(\"No documents to add to the Vector Store.\")\n\n def _map_search_type(self) -> str:\n search_type_mapping = {\n \"Similarity with score threshold\": \"similarity_score_threshold\",\n \"MMR (Max Marginal Relevance)\": \"mmr\",\n }\n\n return search_type_mapping.get(self.search_type, \"similarity\")\n\n def _build_search_args(self):\n # Clean up the search query\n query = self.search_query if isinstance(self.search_query, str) and self.search_query.strip() else None\n lexical_terms = self.lexical_terms or None\n\n # Check if we have a search query, and if so set the args\n if query:\n args = {\n \"query\": query,\n \"search_type\": self._map_search_type(),\n \"k\": self.number_of_results,\n \"score_threshold\": self.search_score_threshold,\n \"lexical_query\": lexical_terms,\n }\n elif self.advanced_search_filter:\n args = {\n \"n\": self.number_of_results,\n }\n else:\n return {}\n\n filter_arg = self.advanced_search_filter or {}\n if filter_arg:\n args[\"filter\"] = filter_arg\n\n return args\n\n def search_documents(self, vector_store=None) -> list[Data]:\n vector_store = vector_store or self.build_vector_store()\n\n self.log(f\"Search input: {self.search_query}\")\n self.log(f\"Search type: {self.search_type}\")\n self.log(f\"Number of results: {self.number_of_results}\")\n self.log(f\"store.hybrid_search: {vector_store.hybrid_search}\")\n self.log(f\"Lexical terms: {self.lexical_terms}\")\n self.log(f\"Reranker: {self.reranker}\")\n\n try:\n search_args = self._build_search_args()\n except ValueError as e:\n msg = f\"Error in AstraDBVectorStore._build_search_args: {e}\"\n raise ValueError(msg) from e\n\n if not search_args:\n self.log(\"No search input or filters provided. Skipping search.\")\n return []\n\n docs = []\n search_method = \"search\" if \"query\" in search_args else \"metadata_search\"\n\n try:\n self.log(f\"Calling vector_store.{search_method} with args: {search_args}\")\n docs = getattr(vector_store, search_method)(**search_args)\n except ValueError as e:\n msg = f\"Error performing {search_method} in AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n self.log(f\"Retrieved documents: {len(docs)}\")\n\n data = docs_to_data(docs)\n self.log(f\"Converted documents to data: {len(data)}\")\n self.status = data\n\n return data\n\n def get_retriever_kwargs(self):\n search_args = self._build_search_args()\n\n return {\n \"search_type\": self._map_search_type(),\n \"search_kwargs\": search_args,\n }\n"
|
|
3650
3650
|
},
|
|
3651
3651
|
"collection_name": {
|
|
3652
3652
|
"_input_type": "DropdownInput",
|
|
@@ -4291,17 +4291,13 @@
|
|
|
4291
4291
|
"icon": "AstraDB",
|
|
4292
4292
|
"legacy": false,
|
|
4293
4293
|
"metadata": {
|
|
4294
|
-
"code_hash": "
|
|
4294
|
+
"code_hash": "fb190928c0c2",
|
|
4295
4295
|
"dependencies": {
|
|
4296
4296
|
"dependencies": [
|
|
4297
4297
|
{
|
|
4298
4298
|
"name": "astrapy",
|
|
4299
4299
|
"version": "2.1.0"
|
|
4300
4300
|
},
|
|
4301
|
-
{
|
|
4302
|
-
"name": "langchain_astradb",
|
|
4303
|
-
"version": "0.6.1"
|
|
4304
|
-
},
|
|
4305
4301
|
{
|
|
4306
4302
|
"name": "langchain_core",
|
|
4307
4303
|
"version": "0.3.79"
|
|
@@ -4309,11 +4305,15 @@
|
|
|
4309
4305
|
{
|
|
4310
4306
|
"name": "lfx",
|
|
4311
4307
|
"version": null
|
|
4308
|
+
},
|
|
4309
|
+
{
|
|
4310
|
+
"name": "langchain_astradb",
|
|
4311
|
+
"version": "0.6.1"
|
|
4312
4312
|
}
|
|
4313
4313
|
],
|
|
4314
4314
|
"total_dependencies": 4
|
|
4315
4315
|
},
|
|
4316
|
-
"module": "lfx.components.
|
|
4316
|
+
"module": "lfx.components.datastax.astradb_vectorstore.AstraDBVectorStoreComponent"
|
|
4317
4317
|
},
|
|
4318
4318
|
"minimized": false,
|
|
4319
4319
|
"output_types": [],
|
|
@@ -4459,7 +4459,7 @@
|
|
|
4459
4459
|
"show": true,
|
|
4460
4460
|
"title_case": false,
|
|
4461
4461
|
"type": "code",
|
|
4462
|
-
"value": "import re\nfrom collections import defaultdict\nfrom dataclasses import asdict, dataclass, field\n\nfrom astrapy import DataAPIClient, Database\nfrom astrapy.data.info.reranking import RerankServiceOptions\nfrom astrapy.info import CollectionDescriptor, CollectionLexicalOptions, CollectionRerankOptions\nfrom langchain_astradb import AstraDBVectorStore, VectorServiceOptions\nfrom langchain_astradb.utils.astradb import HybridSearchMode, _AstraDBCollectionEnvironment\nfrom langchain_core.documents import Document\n\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.helpers.data import docs_to_data\nfrom lfx.inputs.inputs import FloatInput, NestedDictInput\nfrom lfx.io import (\n BoolInput,\n DropdownInput,\n HandleInput,\n IntInput,\n QueryInput,\n SecretStrInput,\n StrInput,\n)\nfrom lfx.schema.data import Data\nfrom lfx.serialization import serialize\nfrom lfx.utils.version import get_version_info\n\n\n@vector_store_connection\nclass AstraDBVectorStoreComponent(LCVectorStoreComponent):\n display_name: str = \"Astra DB\"\n description: str = \"Ingest and search documents in Astra DB\"\n documentation: str = \"https://docs.datastax.com/en/langflow/astra-components.html\"\n name = \"AstraDB\"\n icon: str = \"AstraDB\"\n\n _cached_vector_store: AstraDBVectorStore | None = None\n\n @dataclass\n class NewDatabaseInput:\n functionality: str = \"create\"\n fields: dict[str, dict] = field(\n default_factory=lambda: {\n \"data\": {\n \"node\": {\n \"name\": \"create_database\",\n \"description\": \"Please allow several minutes for creation to complete.\",\n \"display_name\": \"Create new database\",\n \"field_order\": [\"01_new_database_name\", \"02_cloud_provider\", \"03_region\"],\n \"template\": {\n \"01_new_database_name\": StrInput(\n name=\"new_database_name\",\n display_name=\"Name\",\n info=\"Name of the new database to create in Astra DB.\",\n required=True,\n ),\n \"02_cloud_provider\": DropdownInput(\n name=\"cloud_provider\",\n display_name=\"Cloud provider\",\n info=\"Cloud provider for the new database.\",\n options=[],\n required=True,\n real_time_refresh=True,\n ),\n \"03_region\": DropdownInput(\n name=\"region\",\n display_name=\"Region\",\n info=\"Region for the new database.\",\n options=[],\n required=True,\n ),\n },\n },\n }\n }\n )\n\n @dataclass\n class NewCollectionInput:\n functionality: str = \"create\"\n fields: dict[str, dict] = field(\n default_factory=lambda: {\n \"data\": {\n \"node\": {\n \"name\": \"create_collection\",\n \"description\": \"Please allow several seconds for creation to complete.\",\n \"display_name\": \"Create new collection\",\n \"field_order\": [\n \"01_new_collection_name\",\n \"02_embedding_generation_provider\",\n \"03_embedding_generation_model\",\n \"04_dimension\",\n ],\n \"template\": {\n \"01_new_collection_name\": StrInput(\n name=\"new_collection_name\",\n display_name=\"Name\",\n info=\"Name of the new collection to create in Astra DB.\",\n required=True,\n ),\n \"02_embedding_generation_provider\": DropdownInput(\n name=\"embedding_generation_provider\",\n display_name=\"Embedding generation method\",\n info=\"Provider to use for generating embeddings.\",\n helper_text=(\n \"To create collections with more embedding provider options, go to \"\n '<a class=\"underline\" href=\"https://astra.datastax.com/\" target=\" _blank\" '\n 'rel=\"noopener noreferrer\">your database in Astra DB</a>'\n ),\n real_time_refresh=True,\n required=True,\n options=[],\n ),\n \"03_embedding_generation_model\": DropdownInput(\n name=\"embedding_generation_model\",\n display_name=\"Embedding model\",\n info=\"Model to use for generating embeddings.\",\n real_time_refresh=True,\n options=[],\n ),\n \"04_dimension\": IntInput(\n name=\"dimension\",\n display_name=\"Dimensions\",\n info=\"Dimensions of the embeddings to generate.\",\n value=None,\n ),\n },\n },\n }\n }\n )\n\n inputs = [\n SecretStrInput(\n name=\"token\",\n display_name=\"Astra DB Application Token\",\n info=\"Authentication token for accessing Astra DB.\",\n value=\"ASTRA_DB_APPLICATION_TOKEN\",\n required=True,\n real_time_refresh=True,\n input_types=[],\n ),\n DropdownInput(\n name=\"environment\",\n display_name=\"Environment\",\n info=\"The environment for the Astra DB API Endpoint.\",\n options=[\"prod\", \"test\", \"dev\"],\n value=\"prod\",\n advanced=True,\n real_time_refresh=True,\n combobox=True,\n ),\n DropdownInput(\n name=\"database_name\",\n display_name=\"Database\",\n info=\"The Database name for the Astra DB instance.\",\n required=True,\n refresh_button=True,\n real_time_refresh=True,\n dialog_inputs=asdict(NewDatabaseInput()),\n combobox=True,\n ),\n DropdownInput(\n name=\"api_endpoint\",\n display_name=\"Astra DB API Endpoint\",\n info=\"The API Endpoint for the Astra DB instance. Supercedes database selection.\",\n advanced=True,\n ),\n DropdownInput(\n name=\"keyspace\",\n display_name=\"Keyspace\",\n info=\"Optional keyspace within Astra DB to use for the collection.\",\n advanced=True,\n options=[],\n real_time_refresh=True,\n ),\n DropdownInput(\n name=\"collection_name\",\n display_name=\"Collection\",\n info=\"The name of the collection within Astra DB where the vectors will be stored.\",\n required=True,\n refresh_button=True,\n real_time_refresh=True,\n dialog_inputs=asdict(NewCollectionInput()),\n combobox=True,\n show=False,\n ),\n HandleInput(\n name=\"embedding_model\",\n display_name=\"Embedding Model\",\n input_types=[\"Embeddings\"],\n info=\"Specify the Embedding Model. Not required for Astra Vectorize collections.\",\n required=False,\n show=False,\n ),\n *LCVectorStoreComponent.inputs,\n DropdownInput(\n name=\"search_method\",\n display_name=\"Search Method\",\n info=(\n \"Determine how your content is matched: Vector finds semantic similarity, \"\n \"and Hybrid Search (suggested) combines both approaches \"\n \"with a reranker.\"\n ),\n options=[\"Hybrid Search\", \"Vector Search\"], # TODO: Restore Lexical Search?\n options_metadata=[{\"icon\": \"SearchHybrid\"}, {\"icon\": \"SearchVector\"}],\n value=\"Vector Search\",\n advanced=True,\n real_time_refresh=True,\n ),\n DropdownInput(\n name=\"reranker\",\n display_name=\"Reranker\",\n info=\"Post-retrieval model that re-scores results for optimal relevance ranking.\",\n show=False,\n toggle=True,\n ),\n QueryInput(\n name=\"lexical_terms\",\n display_name=\"Lexical Terms\",\n info=\"Add additional terms/keywords to augment search precision.\",\n placeholder=\"Enter terms to search...\",\n separator=\" \",\n show=False,\n value=\"\",\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Number of Search Results\",\n info=\"Number of search results to return.\",\n advanced=True,\n value=4,\n ),\n DropdownInput(\n name=\"search_type\",\n display_name=\"Search Type\",\n info=\"Search type to use\",\n options=[\"Similarity\", \"Similarity with score threshold\", \"MMR (Max Marginal Relevance)\"],\n value=\"Similarity\",\n advanced=True,\n ),\n FloatInput(\n name=\"search_score_threshold\",\n display_name=\"Search Score Threshold\",\n info=\"Minimum similarity score threshold for search results. \"\n \"(when using 'Similarity with score threshold')\",\n value=0,\n advanced=True,\n ),\n NestedDictInput(\n name=\"advanced_search_filter\",\n display_name=\"Search Metadata Filter\",\n info=\"Optional dictionary of filters to apply to the search query.\",\n advanced=True,\n ),\n BoolInput(\n name=\"autodetect_collection\",\n display_name=\"Autodetect Collection\",\n info=\"Boolean flag to determine whether to autodetect the collection.\",\n advanced=True,\n value=True,\n ),\n StrInput(\n name=\"content_field\",\n display_name=\"Content Field\",\n info=\"Field to use as the text content field for the vector store.\",\n advanced=True,\n ),\n StrInput(\n name=\"deletion_field\",\n display_name=\"Deletion Based On Field\",\n info=\"When this parameter is provided, documents in the target collection with \"\n \"metadata field values matching the input metadata field value will be deleted \"\n \"before new data is loaded.\",\n advanced=True,\n ),\n BoolInput(\n name=\"ignore_invalid_documents\",\n display_name=\"Ignore Invalid Documents\",\n info=\"Boolean flag to determine whether to ignore invalid documents at runtime.\",\n advanced=True,\n ),\n NestedDictInput(\n name=\"astradb_vectorstore_kwargs\",\n display_name=\"AstraDBVectorStore Parameters\",\n info=\"Optional dictionary of additional parameters for the AstraDBVectorStore.\",\n advanced=True,\n ),\n ]\n\n @classmethod\n def map_cloud_providers(cls):\n # TODO: Programmatically fetch the regions for each cloud provider\n return {\n \"dev\": {\n \"Amazon Web Services\": {\n \"id\": \"aws\",\n \"regions\": [\"us-west-2\"],\n },\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-central1\", \"europe-west4\"],\n },\n },\n \"test\": {\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-central1\"],\n },\n },\n \"prod\": {\n \"Amazon Web Services\": {\n \"id\": \"aws\",\n \"regions\": [\"us-east-2\", \"ap-south-1\", \"eu-west-1\"],\n },\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-east1\"],\n },\n \"Microsoft Azure\": {\n \"id\": \"azure\",\n \"regions\": [\"westus3\"],\n },\n },\n }\n\n @classmethod\n def get_vectorize_providers(cls, token: str, environment: str | None = None, api_endpoint: str | None = None):\n try:\n # Get the admin object\n client = DataAPIClient(environment=environment)\n admin_client = client.get_admin()\n db_admin = admin_client.get_database_admin(api_endpoint, token=token)\n\n # Get the list of embedding providers\n embedding_providers = db_admin.find_embedding_providers()\n\n vectorize_providers_mapping = {}\n # Map the provider display name to the provider key and models\n for provider_key, provider_data in embedding_providers.embedding_providers.items():\n # Get the provider display name and models\n display_name = provider_data.display_name\n models = [model.name for model in provider_data.models]\n\n # Build our mapping\n vectorize_providers_mapping[display_name] = [provider_key, models]\n\n # Sort the resulting dictionary\n return defaultdict(list, dict(sorted(vectorize_providers_mapping.items())))\n except Exception as _: # noqa: BLE001\n return {}\n\n @classmethod\n async def create_database_api(\n cls,\n new_database_name: str,\n cloud_provider: str,\n region: str,\n token: str,\n environment: str | None = None,\n keyspace: str | None = None,\n ):\n client = DataAPIClient(environment=environment)\n\n # Get the admin object\n admin_client = client.get_admin(token=token)\n\n # Get the environment, set to prod if null like\n my_env = environment or \"prod\"\n\n # Raise a value error if name isn't provided\n if not new_database_name:\n msg = \"Database name is required to create a new database.\"\n raise ValueError(msg)\n\n # Call the create database function\n return await admin_client.async_create_database(\n name=new_database_name,\n cloud_provider=cls.map_cloud_providers()[my_env][cloud_provider][\"id\"],\n region=region,\n keyspace=keyspace,\n wait_until_active=False,\n )\n\n @classmethod\n async def create_collection_api(\n cls,\n new_collection_name: str,\n token: str,\n api_endpoint: str,\n environment: str | None = None,\n keyspace: str | None = None,\n dimension: int | None = None,\n embedding_generation_provider: str | None = None,\n embedding_generation_model: str | None = None,\n reranker: str | None = None,\n ):\n # Build vectorize options, if needed\n vectorize_options = None\n if not dimension:\n providers = cls.get_vectorize_providers(token=token, environment=environment, api_endpoint=api_endpoint)\n vectorize_options = VectorServiceOptions(\n provider=providers.get(embedding_generation_provider, [None, []])[0],\n model_name=embedding_generation_model,\n )\n\n # Raise a value error if name isn't provided\n if not new_collection_name:\n msg = \"Collection name is required to create a new collection.\"\n raise ValueError(msg)\n\n # Define the base arguments being passed to the create collection function\n base_args = {\n \"collection_name\": new_collection_name,\n \"token\": token,\n \"api_endpoint\": api_endpoint,\n \"keyspace\": keyspace,\n \"environment\": environment,\n \"embedding_dimension\": dimension,\n \"collection_vector_service_options\": vectorize_options,\n }\n\n # Add optional arguments if the reranker is set\n if reranker:\n # Split the reranker field into a provider a model name\n provider, _ = reranker.split(\"/\")\n base_args[\"collection_rerank\"] = CollectionRerankOptions(\n service=RerankServiceOptions(provider=provider, model_name=reranker),\n )\n base_args[\"collection_lexical\"] = CollectionLexicalOptions(analyzer=\"STANDARD\")\n\n _AstraDBCollectionEnvironment(**base_args)\n\n @classmethod\n def get_database_list_static(cls, token: str, environment: str | None = None):\n client = DataAPIClient(environment=environment)\n\n # Get the admin object\n admin_client = client.get_admin(token=token)\n\n # Get the list of databases\n db_list = admin_client.list_databases()\n\n # Generate the api endpoint for each database\n db_info_dict = {}\n for db in db_list:\n try:\n # Get the API endpoint for the database\n api_endpoints = [db_reg.api_endpoint for db_reg in db.regions]\n\n # Get the number of collections\n try:\n # Get the number of collections in the database\n num_collections = len(\n client.get_database(\n api_endpoints[0],\n token=token,\n ).list_collection_names()\n )\n except Exception: # noqa: BLE001\n if db.status != \"PENDING\":\n continue\n num_collections = 0\n\n # Add the database to the dictionary\n db_info_dict[db.name] = {\n \"api_endpoints\": api_endpoints,\n \"keyspaces\": db.keyspaces,\n \"collections\": num_collections,\n \"status\": db.status if db.status != \"ACTIVE\" else None,\n \"org_id\": db.org_id if db.org_id else None,\n }\n except Exception: # noqa: BLE001\n pass\n\n return db_info_dict\n\n def get_database_list(self):\n return self.get_database_list_static(\n token=self.token,\n environment=self.environment,\n )\n\n @classmethod\n def get_api_endpoint_static(\n cls,\n token: str,\n environment: str | None = None,\n api_endpoint: str | None = None,\n database_name: str | None = None,\n ):\n # If the api_endpoint is set, return it\n if api_endpoint:\n return api_endpoint\n\n # Check if the database_name is like a url\n if database_name and database_name.startswith(\"https://\"):\n return database_name\n\n # If the database is not set, nothing we can do.\n if not database_name:\n return None\n\n # Grab the database object\n db = cls.get_database_list_static(token=token, environment=environment).get(database_name)\n if not db:\n return None\n\n # Otherwise, get the URL from the database list\n endpoints = db.get(\"api_endpoints\") or []\n return endpoints[0] if endpoints else None\n\n def get_api_endpoint(self):\n return self.get_api_endpoint_static(\n token=self.token,\n environment=self.environment,\n api_endpoint=self.api_endpoint,\n database_name=self.database_name,\n )\n\n @classmethod\n def get_database_id_static(cls, api_endpoint: str) -> str | None:\n # Pattern matches standard UUID format: 8-4-4-4-12 hexadecimal characters\n uuid_pattern = r\"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\"\n match = re.search(uuid_pattern, api_endpoint)\n\n return match.group(0) if match else None\n\n def get_database_id(self):\n return self.get_database_id_static(api_endpoint=self.get_api_endpoint())\n\n def get_keyspace(self):\n keyspace = self.keyspace\n\n if keyspace:\n return keyspace.strip()\n\n return \"default_keyspace\"\n\n def get_database_object(self, api_endpoint: str | None = None):\n try:\n client = DataAPIClient(environment=self.environment)\n\n return client.get_database(\n api_endpoint or self.get_api_endpoint(),\n token=self.token,\n keyspace=self.get_keyspace(),\n )\n except Exception as e:\n msg = f\"Error fetching database object: {e}\"\n raise ValueError(msg) from e\n\n def collection_data(self, collection_name: str, database: Database | None = None):\n try:\n if not database:\n client = DataAPIClient(environment=self.environment)\n\n database = client.get_database(\n self.get_api_endpoint(),\n token=self.token,\n keyspace=self.get_keyspace(),\n )\n\n collection = database.get_collection(collection_name)\n\n return collection.estimated_document_count()\n except Exception as e: # noqa: BLE001\n self.log(f\"Error checking collection data: {e}\")\n\n return None\n\n def _initialize_database_options(self):\n try:\n return [\n {\n \"name\": name,\n \"status\": info[\"status\"],\n \"collections\": info[\"collections\"],\n \"api_endpoints\": info[\"api_endpoints\"],\n \"keyspaces\": info[\"keyspaces\"],\n \"org_id\": info[\"org_id\"],\n }\n for name, info in self.get_database_list().items()\n ]\n except Exception as e:\n msg = f\"Error fetching database options: {e}\"\n raise ValueError(msg) from e\n\n @classmethod\n def get_provider_icon(cls, collection: CollectionDescriptor | None = None, provider_name: str | None = None) -> str:\n # Get the provider name from the collection\n provider_name = provider_name or (\n collection.definition.vector.service.provider\n if (\n collection\n and collection.definition\n and collection.definition.vector\n and collection.definition.vector.service\n )\n else None\n )\n\n # If there is no provider, use the vector store icon\n if not provider_name or provider_name.lower() == \"bring your own\":\n return \"vectorstores\"\n\n # Map provider casings\n case_map = {\n \"nvidia\": \"NVIDIA\",\n \"openai\": \"OpenAI\",\n \"amazon bedrock\": \"AmazonBedrockEmbeddings\",\n \"azure openai\": \"AzureOpenAiEmbeddings\",\n \"cohere\": \"Cohere\",\n \"jina ai\": \"JinaAI\",\n \"mistral ai\": \"MistralAI\",\n \"upstage\": \"Upstage\",\n \"voyage ai\": \"VoyageAI\",\n }\n\n # Adjust the casing on some like nvidia\n return case_map[provider_name.lower()] if provider_name.lower() in case_map else provider_name.title()\n\n def _initialize_collection_options(self, api_endpoint: str | None = None):\n # Nothing to generate if we don't have an API endpoint yet\n api_endpoint = api_endpoint or self.get_api_endpoint()\n if not api_endpoint:\n return []\n\n # Retrieve the database object\n database = self.get_database_object(api_endpoint=api_endpoint)\n\n # Get the list of collections\n collection_list = database.list_collections(keyspace=self.get_keyspace())\n\n # Return the list of collections and metadata associated\n return [\n {\n \"name\": col.name,\n \"records\": self.collection_data(collection_name=col.name, database=database),\n \"provider\": (\n col.definition.vector.service.provider\n if col.definition.vector and col.definition.vector.service\n else None\n ),\n \"icon\": self.get_provider_icon(collection=col),\n \"model\": (\n col.definition.vector.service.model_name\n if col.definition.vector and col.definition.vector.service\n else None\n ),\n }\n for col in collection_list\n ]\n\n def reset_provider_options(self, build_config: dict) -> dict:\n \"\"\"Reset provider options and related configurations in the build_config dictionary.\"\"\"\n # Extract template path for cleaner access\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n\n # Get vectorize providers\n vectorize_providers_api = self.get_vectorize_providers(\n token=self.token,\n environment=self.environment,\n api_endpoint=build_config[\"api_endpoint\"][\"value\"],\n )\n\n # Create a new dictionary with \"Bring your own\" first\n vectorize_providers: dict[str, list[list[str]]] = {\"Bring your own\": [[], []]}\n\n # Add the remaining items (only Nvidia) from the original dictionary\n vectorize_providers.update(\n {\n k: v\n for k, v in vectorize_providers_api.items()\n if k.lower() in [\"nvidia\"] # TODO: Eventually support more\n }\n )\n\n # Set provider options\n provider_field = \"02_embedding_generation_provider\"\n template[provider_field][\"options\"] = list(vectorize_providers.keys())\n\n # Add metadata for each provider option\n template[provider_field][\"options_metadata\"] = [\n {\"icon\": self.get_provider_icon(provider_name=provider)} for provider in template[provider_field][\"options\"]\n ]\n\n # Get selected embedding provider\n embedding_provider = template[provider_field][\"value\"]\n is_bring_your_own = embedding_provider and embedding_provider == \"Bring your own\"\n\n # Configure embedding model field\n model_field = \"03_embedding_generation_model\"\n template[model_field].update(\n {\n \"options\": vectorize_providers.get(embedding_provider, [[], []])[1],\n \"placeholder\": \"Bring your own\" if is_bring_your_own else None,\n \"readonly\": is_bring_your_own,\n \"required\": not is_bring_your_own,\n \"value\": None,\n }\n )\n\n # If this is a bring your own, set dimensions to 0\n return self.reset_dimension_field(build_config)\n\n def reset_dimension_field(self, build_config: dict) -> dict:\n \"\"\"Reset dimension field options based on provided configuration.\"\"\"\n # Extract template path for cleaner access\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n\n # Get selected embedding model\n provider_field = \"02_embedding_generation_provider\"\n embedding_provider = template[provider_field][\"value\"]\n is_bring_your_own = embedding_provider and embedding_provider == \"Bring your own\"\n\n # Configure dimension field\n dimension_field = \"04_dimension\"\n dimension_value = 1024 if not is_bring_your_own else None # TODO: Dynamically figure this out\n template[dimension_field].update(\n {\n \"placeholder\": dimension_value,\n \"value\": dimension_value,\n \"readonly\": not is_bring_your_own,\n \"required\": is_bring_your_own,\n }\n )\n\n return build_config\n\n def reset_collection_list(self, build_config: dict) -> dict:\n \"\"\"Reset collection list options based on provided configuration.\"\"\"\n # Get collection options\n collection_options = self._initialize_collection_options(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n # Update collection configuration\n collection_config = build_config[\"collection_name\"]\n collection_config.update(\n {\n \"options\": [col[\"name\"] for col in collection_options],\n \"options_metadata\": [{k: v for k, v in col.items() if k != \"name\"} for col in collection_options],\n }\n )\n\n # Reset selected collection if not in options\n if collection_config[\"value\"] not in collection_config[\"options\"]:\n collection_config[\"value\"] = \"\"\n\n # Set advanced status based on database selection\n collection_config[\"show\"] = bool(build_config[\"database_name\"][\"value\"])\n\n return build_config\n\n def reset_database_list(self, build_config: dict) -> dict:\n \"\"\"Reset database list options and related configurations.\"\"\"\n # Get database options\n database_options = self._initialize_database_options()\n\n # Update cloud provider options\n env = self.environment\n template = build_config[\"database_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"02_cloud_provider\"][\"options\"] = list(self.map_cloud_providers()[env].keys())\n\n # Update database configuration\n database_config = build_config[\"database_name\"]\n database_config.update(\n {\n \"options\": [db[\"name\"] for db in database_options],\n \"options_metadata\": [{k: v for k, v in db.items() if k != \"name\"} for db in database_options],\n }\n )\n\n # Reset selections if value not in options\n if database_config[\"value\"] not in database_config[\"options\"]:\n database_config[\"value\"] = \"\"\n build_config[\"api_endpoint\"][\"options\"] = []\n build_config[\"api_endpoint\"][\"value\"] = \"\"\n build_config[\"collection_name\"][\"show\"] = False\n\n # Set advanced status based on token presence\n database_config[\"show\"] = bool(build_config[\"token\"][\"value\"])\n\n return build_config\n\n def reset_build_config(self, build_config: dict) -> dict:\n \"\"\"Reset all build configuration options to default empty state.\"\"\"\n # Reset database configuration\n database_config = build_config[\"database_name\"]\n database_config.update({\"options\": [], \"options_metadata\": [], \"value\": \"\", \"show\": False})\n build_config[\"api_endpoint\"][\"options\"] = []\n build_config[\"api_endpoint\"][\"value\"] = \"\"\n\n # Reset collection configuration\n collection_config = build_config[\"collection_name\"]\n collection_config.update({\"options\": [], \"options_metadata\": [], \"value\": \"\", \"show\": False})\n\n return build_config\n\n def _handle_hybrid_search_options(self, build_config: dict) -> dict:\n \"\"\"Set hybrid search options in the build configuration.\"\"\"\n # Detect what hybrid options are available\n # Get the admin object\n client = DataAPIClient(environment=self.environment)\n admin_client = client.get_admin()\n db_admin = admin_client.get_database_admin(self.get_api_endpoint(), token=self.token)\n\n # We will try to get the reranking providers to see if its hybrid emabled\n try:\n providers = db_admin.find_reranking_providers()\n build_config[\"reranker\"][\"options\"] = [\n model.name for provider_data in providers.reranking_providers.values() for model in provider_data.models\n ]\n build_config[\"reranker\"][\"options_metadata\"] = [\n {\"icon\": self.get_provider_icon(provider_name=model.name.split(\"/\")[0])}\n for provider in providers.reranking_providers.values()\n for model in provider.models\n ]\n build_config[\"reranker\"][\"value\"] = build_config[\"reranker\"][\"options\"][0]\n\n # Set the default search field to hybrid search\n build_config[\"search_method\"][\"show\"] = True\n build_config[\"search_method\"][\"options\"] = [\"Hybrid Search\", \"Vector Search\"]\n build_config[\"search_method\"][\"value\"] = \"Hybrid Search\"\n except Exception as _: # noqa: BLE001\n build_config[\"reranker\"][\"options\"] = []\n build_config[\"reranker\"][\"options_metadata\"] = []\n\n # Set the default search field to vector search\n build_config[\"search_method\"][\"show\"] = False\n build_config[\"search_method\"][\"options\"] = [\"Vector Search\"]\n build_config[\"search_method\"][\"value\"] = \"Vector Search\"\n\n return build_config\n\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Update build configuration based on field name and value.\"\"\"\n # Early return if no token provided\n if not self.token:\n return self.reset_build_config(build_config)\n\n # Database creation callback\n if field_name == \"database_name\" and isinstance(field_value, dict):\n if \"01_new_database_name\" in field_value:\n await self._create_new_database(build_config, field_value)\n return self.reset_collection_list(build_config)\n return self._update_cloud_regions(build_config, field_value)\n\n # Collection creation callback\n if field_name == \"collection_name\" and isinstance(field_value, dict):\n # Case 1: New collection creation\n if \"01_new_collection_name\" in field_value:\n await self._create_new_collection(build_config, field_value)\n return build_config\n\n # Case 2: Update embedding provider options\n if \"02_embedding_generation_provider\" in field_value:\n return self.reset_provider_options(build_config)\n\n # Case 3: Update dimension field\n if \"03_embedding_generation_model\" in field_value:\n return self.reset_dimension_field(build_config)\n\n # Initial execution or token/environment change\n first_run = field_name == \"collection_name\" and not field_value and not build_config[\"database_name\"][\"options\"]\n if first_run or field_name in {\"token\", \"environment\"}:\n return self.reset_database_list(build_config)\n\n # Database selection change\n if field_name == \"database_name\" and not isinstance(field_value, dict):\n return self._handle_database_selection(build_config, field_value)\n\n # Keyspace selection change\n if field_name == \"keyspace\":\n return self.reset_collection_list(build_config)\n\n # Collection selection change\n if field_name == \"collection_name\" and not isinstance(field_value, dict):\n return self._handle_collection_selection(build_config, field_value)\n\n # Search method selection change\n if field_name == \"search_method\":\n is_vector_search = field_value == \"Vector Search\"\n is_autodetect = build_config[\"autodetect_collection\"][\"value\"]\n\n # Configure lexical terms (same for both cases)\n build_config[\"lexical_terms\"][\"show\"] = not is_vector_search\n build_config[\"lexical_terms\"][\"value\"] = \"\" if is_vector_search else build_config[\"lexical_terms\"][\"value\"]\n\n # Disable reranker disabling if hybrid search is selected\n build_config[\"reranker\"][\"show\"] = not is_vector_search\n build_config[\"reranker\"][\"toggle_disable\"] = not is_vector_search\n build_config[\"reranker\"][\"toggle_value\"] = True\n build_config[\"reranker\"][\"value\"] = build_config[\"reranker\"][\"options\"][0]\n\n # Toggle search type and score threshold based on search method\n build_config[\"search_type\"][\"show\"] = is_vector_search\n build_config[\"search_score_threshold\"][\"show\"] = is_vector_search\n\n # Make sure the search_type is set to \"Similarity\"\n if not is_vector_search or is_autodetect:\n build_config[\"search_type\"][\"value\"] = \"Similarity\"\n\n return build_config\n\n async def _create_new_database(self, build_config: dict, field_value: dict) -> None:\n \"\"\"Create a new database and update build config options.\"\"\"\n try:\n await self.create_database_api(\n new_database_name=field_value[\"01_new_database_name\"],\n token=self.token,\n keyspace=self.get_keyspace(),\n environment=self.environment,\n cloud_provider=field_value[\"02_cloud_provider\"],\n region=field_value[\"03_region\"],\n )\n except Exception as e:\n msg = f\"Error creating database: {e}\"\n raise ValueError(msg) from e\n\n build_config[\"database_name\"][\"options\"].append(field_value[\"01_new_database_name\"])\n build_config[\"database_name\"][\"options_metadata\"].append(\n {\n \"status\": \"PENDING\",\n \"collections\": 0,\n \"api_endpoints\": [],\n \"keyspaces\": [self.get_keyspace()],\n \"org_id\": None,\n }\n )\n\n def _update_cloud_regions(self, build_config: dict, field_value: dict) -> dict:\n \"\"\"Update cloud provider regions in build config.\"\"\"\n env = self.environment\n cloud_provider = field_value[\"02_cloud_provider\"]\n\n # Update the region options based on the selected cloud provider\n template = build_config[\"database_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"03_region\"][\"options\"] = self.map_cloud_providers()[env][cloud_provider][\"regions\"]\n\n # Reset the the 03_region value if it's not in the new options\n if template[\"03_region\"][\"value\"] not in template[\"03_region\"][\"options\"]:\n template[\"03_region\"][\"value\"] = None\n\n return build_config\n\n async def _create_new_collection(self, build_config: dict, field_value: dict) -> None:\n \"\"\"Create a new collection and update build config options.\"\"\"\n embedding_provider = field_value.get(\"02_embedding_generation_provider\")\n try:\n await self.create_collection_api(\n new_collection_name=field_value[\"01_new_collection_name\"],\n token=self.token,\n api_endpoint=build_config[\"api_endpoint\"][\"value\"],\n environment=self.environment,\n keyspace=self.get_keyspace(),\n dimension=field_value.get(\"04_dimension\") if embedding_provider == \"Bring your own\" else None,\n embedding_generation_provider=embedding_provider,\n embedding_generation_model=field_value.get(\"03_embedding_generation_model\"),\n reranker=self.reranker,\n )\n except Exception as e:\n msg = f\"Error creating collection: {e}\"\n raise ValueError(msg) from e\n\n provider = embedding_provider.lower() if embedding_provider and embedding_provider != \"Bring your own\" else None\n build_config[\"collection_name\"].update(\n {\n \"value\": field_value[\"01_new_collection_name\"],\n \"options\": build_config[\"collection_name\"][\"options\"] + [field_value[\"01_new_collection_name\"]],\n }\n )\n build_config[\"embedding_model\"][\"show\"] = not bool(provider)\n build_config[\"embedding_model\"][\"required\"] = not bool(provider)\n build_config[\"collection_name\"][\"options_metadata\"].append(\n {\n \"records\": 0,\n \"provider\": provider,\n \"icon\": self.get_provider_icon(provider_name=provider),\n \"model\": field_value.get(\"03_embedding_generation_model\"),\n }\n )\n\n # Make sure we always show the reranker options if the collection is hybrid enabled\n # And right now they always are\n build_config[\"lexical_terms\"][\"show\"] = True\n\n def _handle_database_selection(self, build_config: dict, field_value: str) -> dict:\n \"\"\"Handle database selection and update related configurations.\"\"\"\n build_config = self.reset_database_list(build_config)\n\n # Reset collection list if database selection changes\n if field_value not in build_config[\"database_name\"][\"options\"]:\n build_config[\"database_name\"][\"value\"] = \"\"\n return build_config\n\n # Get the api endpoint for the selected database\n index = build_config[\"database_name\"][\"options\"].index(field_value)\n build_config[\"api_endpoint\"][\"options\"] = build_config[\"database_name\"][\"options_metadata\"][index][\n \"api_endpoints\"\n ]\n build_config[\"api_endpoint\"][\"value\"] = build_config[\"database_name\"][\"options_metadata\"][index][\n \"api_endpoints\"\n ][0]\n\n # Get the org_id for the selected database\n org_id = build_config[\"database_name\"][\"options_metadata\"][index][\"org_id\"]\n if not org_id:\n return build_config\n\n # Update the list of keyspaces based on the db info\n build_config[\"keyspace\"][\"options\"] = build_config[\"database_name\"][\"options_metadata\"][index][\"keyspaces\"]\n build_config[\"keyspace\"][\"value\"] = (\n build_config[\"keyspace\"][\"options\"] and build_config[\"keyspace\"][\"options\"][0]\n if build_config[\"keyspace\"][\"value\"] not in build_config[\"keyspace\"][\"options\"]\n else build_config[\"keyspace\"][\"value\"]\n )\n\n # Get the database id for the selected database\n db_id = self.get_database_id_static(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n keyspace = self.get_keyspace()\n\n # Update the helper text for the embedding provider field\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"02_embedding_generation_provider\"][\"helper_text\"] = (\n \"To create collections with more embedding provider options, go to \"\n f'<a class=\"underline\" target=\"_blank\" rel=\"noopener noreferrer\" '\n f'href=\"https://astra.datastax.com/org/{org_id}/database/{db_id}/data-explorer?createCollection=1&namespace={keyspace}\">'\n \"your database in Astra DB</a>.\"\n )\n\n # Reset provider options\n build_config = self.reset_provider_options(build_config)\n\n # Handle hybrid search options\n build_config = self._handle_hybrid_search_options(build_config)\n\n return self.reset_collection_list(build_config)\n\n def _handle_collection_selection(self, build_config: dict, field_value: str) -> dict:\n \"\"\"Handle collection selection and update embedding options.\"\"\"\n build_config[\"autodetect_collection\"][\"value\"] = True\n build_config = self.reset_collection_list(build_config)\n\n # Reset embedding model if collection selection changes\n if field_value and field_value not in build_config[\"collection_name\"][\"options\"]:\n build_config[\"collection_name\"][\"options\"].append(field_value)\n build_config[\"collection_name\"][\"options_metadata\"].append(\n {\n \"records\": 0,\n \"provider\": None,\n \"icon\": \"vectorstores\",\n \"model\": None,\n }\n )\n build_config[\"autodetect_collection\"][\"value\"] = False\n\n if not field_value:\n return build_config\n\n # Get the selected collection index\n index = build_config[\"collection_name\"][\"options\"].index(field_value)\n\n # Set the provider of the selected collection\n provider = build_config[\"collection_name\"][\"options_metadata\"][index][\"provider\"]\n build_config[\"embedding_model\"][\"show\"] = not bool(provider)\n build_config[\"embedding_model\"][\"required\"] = not bool(provider)\n\n # Grab the collection object\n database = self.get_database_object(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n collection = database.get_collection(\n name=field_value,\n keyspace=build_config[\"keyspace\"][\"value\"],\n )\n\n # Check if hybrid and lexical are enabled\n col_options = collection.options()\n hyb_enabled = col_options.rerank and col_options.rerank.enabled\n lex_enabled = col_options.lexical and col_options.lexical.enabled\n user_hyb_enabled = build_config[\"search_method\"][\"value\"] == \"Hybrid Search\"\n\n # Reranker visible when both the collection supports it and the user selected Hybrid\n hybrid_active = bool(hyb_enabled and user_hyb_enabled)\n build_config[\"reranker\"][\"show\"] = hybrid_active\n build_config[\"reranker\"][\"toggle_value\"] = hybrid_active\n build_config[\"reranker\"][\"toggle_disable\"] = False # allow user to toggle if visible\n\n # If hybrid is active, lock search_type to \"Similarity\"\n if hybrid_active:\n build_config[\"search_type\"][\"value\"] = \"Similarity\"\n\n # Show the lexical terms option only if the collection enables lexical search\n build_config[\"lexical_terms\"][\"show\"] = bool(lex_enabled)\n\n return build_config\n\n @check_cached_vector_store\n def build_vector_store(self):\n try:\n from langchain_astradb import AstraDBVectorStore\n except ImportError as e:\n msg = (\n \"Could not import langchain Astra DB integration package. \"\n \"Please install it with `pip install langchain-astradb`.\"\n )\n raise ImportError(msg) from e\n\n # Get the embedding model and additional params\n embedding_params = {\"embedding\": self.embedding_model} if self.embedding_model else {}\n\n # Get the additional parameters\n additional_params = self.astradb_vectorstore_kwargs or {}\n\n # Get Langflow version and platform information\n __version__ = get_version_info()[\"version\"]\n langflow_prefix = \"\"\n # if os.getenv(\"AWS_EXECUTION_ENV\") == \"AWS_ECS_FARGATE\": # TODO: More precise way of detecting\n # langflow_prefix = \"ds-\"\n\n # Get the database object\n database = self.get_database_object()\n autodetect = self.collection_name in database.list_collection_names() and self.autodetect_collection\n\n # Bundle up the auto-detect parameters\n autodetect_params = {\n \"autodetect_collection\": autodetect,\n \"content_field\": (\n self.content_field\n if self.content_field and embedding_params\n else (\n \"page_content\"\n if embedding_params\n and self.collection_data(collection_name=self.collection_name, database=database) == 0\n else None\n )\n ),\n \"ignore_invalid_documents\": self.ignore_invalid_documents,\n }\n\n # Choose HybridSearchMode based on the selected param\n hybrid_search_mode = HybridSearchMode.DEFAULT if self.search_method == \"Hybrid Search\" else HybridSearchMode.OFF\n\n # Attempt to build the Vector Store object\n try:\n vector_store = AstraDBVectorStore(\n # Astra DB Authentication Parameters\n token=self.token,\n api_endpoint=database.api_endpoint,\n namespace=database.keyspace,\n collection_name=self.collection_name,\n environment=self.environment,\n # Hybrid Search Parameters\n hybrid_search=hybrid_search_mode,\n # Astra DB Usage Tracking Parameters\n ext_callers=[(f\"{langflow_prefix}langflow\", __version__)],\n # Astra DB Vector Store Parameters\n **autodetect_params,\n **embedding_params,\n **additional_params,\n )\n except Exception as e:\n msg = f\"Error initializing AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n # Add documents to the vector store\n self._add_documents_to_vector_store(vector_store)\n\n return vector_store\n\n def _add_documents_to_vector_store(self, vector_store) -> None:\n self.ingest_data = self._prepare_ingest_data()\n\n documents = []\n for _input in self.ingest_data or []:\n if isinstance(_input, Data):\n documents.append(_input.to_lc_document())\n else:\n msg = \"Vector Store Inputs must be Data objects.\"\n raise TypeError(msg)\n\n documents = [\n Document(page_content=doc.page_content, metadata=serialize(doc.metadata, to_str=True)) for doc in documents\n ]\n\n if documents and self.deletion_field:\n self.log(f\"Deleting documents where {self.deletion_field}\")\n try:\n database = self.get_database_object()\n collection = database.get_collection(self.collection_name, keyspace=database.keyspace)\n delete_values = list({doc.metadata[self.deletion_field] for doc in documents})\n self.log(f\"Deleting documents where {self.deletion_field} matches {delete_values}.\")\n collection.delete_many({f\"metadata.{self.deletion_field}\": {\"$in\": delete_values}})\n except Exception as e:\n msg = f\"Error deleting documents from AstraDBVectorStore based on '{self.deletion_field}': {e}\"\n raise ValueError(msg) from e\n\n if documents:\n self.log(f\"Adding {len(documents)} documents to the Vector Store.\")\n try:\n vector_store.add_documents(documents)\n except Exception as e:\n msg = f\"Error adding documents to AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n else:\n self.log(\"No documents to add to the Vector Store.\")\n\n def _map_search_type(self) -> str:\n search_type_mapping = {\n \"Similarity with score threshold\": \"similarity_score_threshold\",\n \"MMR (Max Marginal Relevance)\": \"mmr\",\n }\n\n return search_type_mapping.get(self.search_type, \"similarity\")\n\n def _build_search_args(self):\n # Clean up the search query\n query = self.search_query if isinstance(self.search_query, str) and self.search_query.strip() else None\n lexical_terms = self.lexical_terms or None\n\n # Check if we have a search query, and if so set the args\n if query:\n args = {\n \"query\": query,\n \"search_type\": self._map_search_type(),\n \"k\": self.number_of_results,\n \"score_threshold\": self.search_score_threshold,\n \"lexical_query\": lexical_terms,\n }\n elif self.advanced_search_filter:\n args = {\n \"n\": self.number_of_results,\n }\n else:\n return {}\n\n filter_arg = self.advanced_search_filter or {}\n if filter_arg:\n args[\"filter\"] = filter_arg\n\n return args\n\n def search_documents(self, vector_store=None) -> list[Data]:\n vector_store = vector_store or self.build_vector_store()\n\n self.log(f\"Search input: {self.search_query}\")\n self.log(f\"Search type: {self.search_type}\")\n self.log(f\"Number of results: {self.number_of_results}\")\n self.log(f\"store.hybrid_search: {vector_store.hybrid_search}\")\n self.log(f\"Lexical terms: {self.lexical_terms}\")\n self.log(f\"Reranker: {self.reranker}\")\n\n try:\n search_args = self._build_search_args()\n except Exception as e:\n msg = f\"Error in AstraDBVectorStore._build_search_args: {e}\"\n raise ValueError(msg) from e\n\n if not search_args:\n self.log(\"No search input or filters provided. Skipping search.\")\n return []\n\n docs = []\n search_method = \"search\" if \"query\" in search_args else \"metadata_search\"\n\n try:\n self.log(f\"Calling vector_store.{search_method} with args: {search_args}\")\n docs = getattr(vector_store, search_method)(**search_args)\n except Exception as e:\n msg = f\"Error performing {search_method} in AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n self.log(f\"Retrieved documents: {len(docs)}\")\n\n data = docs_to_data(docs)\n self.log(f\"Converted documents to data: {len(data)}\")\n self.status = data\n\n return data\n\n def get_retriever_kwargs(self):\n search_args = self._build_search_args()\n\n return {\n \"search_type\": self._map_search_type(),\n \"search_kwargs\": search_args,\n }\n"
|
|
4462
|
+
"value": "from astrapy import DataAPIClient\nfrom langchain_core.documents import Document\n\nfrom lfx.base.datastax.astradb_base import AstraDBBaseComponent\nfrom lfx.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom lfx.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom lfx.helpers.data import docs_to_data\nfrom lfx.io import BoolInput, DropdownInput, FloatInput, HandleInput, IntInput, NestedDictInput, QueryInput, StrInput\nfrom lfx.schema.data import Data\nfrom lfx.serialization import serialize\nfrom lfx.utils.version import get_version_info\n\n\n@vector_store_connection\nclass AstraDBVectorStoreComponent(AstraDBBaseComponent, LCVectorStoreComponent):\n display_name: str = \"Astra DB\"\n description: str = \"Ingest and search documents in Astra DB\"\n documentation: str = \"https://docs.langflow.org/bundles-datastax#astra-db\"\n name = \"AstraDB\"\n icon: str = \"AstraDB\"\n\n inputs = [\n *AstraDBBaseComponent.inputs,\n *LCVectorStoreComponent.inputs,\n HandleInput(\n name=\"embedding_model\",\n display_name=\"Embedding Model\",\n input_types=[\"Embeddings\"],\n info=\"Specify the Embedding Model. Not required for Astra Vectorize collections.\",\n required=False,\n show=True,\n ),\n StrInput(\n name=\"content_field\",\n display_name=\"Content Field\",\n info=\"Field to use as the text content field for the vector store.\",\n advanced=True,\n ),\n StrInput(\n name=\"deletion_field\",\n display_name=\"Deletion Based On Field\",\n info=\"When this parameter is provided, documents in the target collection with \"\n \"metadata field values matching the input metadata field value will be deleted \"\n \"before new data is loaded.\",\n advanced=True,\n ),\n BoolInput(\n name=\"ignore_invalid_documents\",\n display_name=\"Ignore Invalid Documents\",\n info=\"Boolean flag to determine whether to ignore invalid documents at runtime.\",\n advanced=True,\n ),\n NestedDictInput(\n name=\"astradb_vectorstore_kwargs\",\n display_name=\"AstraDBVectorStore Parameters\",\n info=\"Optional dictionary of additional parameters for the AstraDBVectorStore.\",\n advanced=True,\n ),\n DropdownInput(\n name=\"search_method\",\n display_name=\"Search Method\",\n info=(\n \"Determine how your content is matched: Vector finds semantic similarity, \"\n \"and Hybrid Search (suggested) combines both approaches \"\n \"with a reranker.\"\n ),\n options=[\"Hybrid Search\", \"Vector Search\"], # TODO: Restore Lexical Search?\n options_metadata=[{\"icon\": \"SearchHybrid\"}, {\"icon\": \"SearchVector\"}],\n value=\"Vector Search\",\n advanced=True,\n real_time_refresh=True,\n ),\n DropdownInput(\n name=\"reranker\",\n display_name=\"Reranker\",\n info=\"Post-retrieval model that re-scores results for optimal relevance ranking.\",\n show=False,\n toggle=True,\n ),\n QueryInput(\n name=\"lexical_terms\",\n display_name=\"Lexical Terms\",\n info=\"Add additional terms/keywords to augment search precision.\",\n placeholder=\"Enter terms to search...\",\n separator=\" \",\n show=False,\n value=\"\",\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Number of Search Results\",\n info=\"Number of search results to return.\",\n advanced=True,\n value=4,\n ),\n DropdownInput(\n name=\"search_type\",\n display_name=\"Search Type\",\n info=\"Search type to use\",\n options=[\"Similarity\", \"Similarity with score threshold\", \"MMR (Max Marginal Relevance)\"],\n value=\"Similarity\",\n advanced=True,\n ),\n FloatInput(\n name=\"search_score_threshold\",\n display_name=\"Search Score Threshold\",\n info=\"Minimum similarity score threshold for search results. \"\n \"(when using 'Similarity with score threshold')\",\n value=0,\n advanced=True,\n ),\n NestedDictInput(\n name=\"advanced_search_filter\",\n display_name=\"Search Metadata Filter\",\n info=\"Optional dictionary of filters to apply to the search query.\",\n advanced=True,\n ),\n ]\n\n async def update_build_config(\n self,\n build_config: dict,\n field_value: str | dict,\n field_name: str | None = None,\n ) -> dict:\n \"\"\"Update build configuration with proper handling of embedding and search options.\"\"\"\n # Handle base astra db build config updates\n build_config = await super().update_build_config(\n build_config,\n field_value=field_value,\n field_name=field_name,\n )\n\n # Set embedding model display based on provider selection\n if isinstance(field_value, dict) and \"02_embedding_generation_provider\" in field_value:\n embedding_provider = field_value.get(\"02_embedding_generation_provider\")\n is_custom_provider = embedding_provider and embedding_provider != \"Bring your own\"\n provider = embedding_provider.lower() if is_custom_provider and embedding_provider is not None else None\n\n build_config[\"embedding_model\"][\"show\"] = not bool(provider)\n build_config[\"embedding_model\"][\"required\"] = not bool(provider)\n\n # Early return if no API endpoint is configured\n if not self.get_api_endpoint():\n return build_config\n\n # Configure search method and related options\n return self._configure_search_options(build_config)\n\n def _configure_search_options(self, build_config: dict) -> dict:\n \"\"\"Configure hybrid search, reranker, and vector search options.\"\"\"\n # Detect available hybrid search capabilities\n hybrid_capabilities = self._detect_hybrid_capabilities()\n\n # Return if we haven't selected a collection\n if not build_config[\"collection_name\"][\"options\"] or not build_config[\"collection_name\"][\"value\"]:\n return build_config\n\n # Get collection options\n collection_options = self._get_collection_options(build_config)\n\n # Get the selected collection index\n index = build_config[\"collection_name\"][\"options\"].index(build_config[\"collection_name\"][\"value\"])\n provider = build_config[\"collection_name\"][\"options_metadata\"][index][\"provider\"]\n build_config[\"embedding_model\"][\"show\"] = not bool(provider)\n build_config[\"embedding_model\"][\"required\"] = not bool(provider)\n\n # Determine search configuration\n is_vector_search = build_config[\"search_method\"][\"value\"] == \"Vector Search\"\n is_autodetect = build_config[\"autodetect_collection\"][\"value\"]\n\n # Apply hybrid search configuration\n if hybrid_capabilities[\"available\"]:\n build_config[\"search_method\"][\"show\"] = True\n build_config[\"search_method\"][\"options\"] = [\"Hybrid Search\", \"Vector Search\"]\n build_config[\"search_method\"][\"value\"] = build_config[\"search_method\"].get(\"value\", \"Hybrid Search\")\n\n build_config[\"reranker\"][\"options\"] = hybrid_capabilities[\"reranker_models\"]\n build_config[\"reranker\"][\"options_metadata\"] = hybrid_capabilities[\"reranker_metadata\"]\n if hybrid_capabilities[\"reranker_models\"]:\n build_config[\"reranker\"][\"value\"] = hybrid_capabilities[\"reranker_models\"][0]\n else:\n build_config[\"search_method\"][\"show\"] = False\n build_config[\"search_method\"][\"options\"] = [\"Vector Search\"]\n build_config[\"search_method\"][\"value\"] = \"Vector Search\"\n build_config[\"reranker\"][\"options\"] = []\n build_config[\"reranker\"][\"options_metadata\"] = []\n\n # Configure reranker visibility and state\n hybrid_enabled = (\n collection_options[\"rerank_enabled\"] and build_config[\"search_method\"][\"value\"] == \"Hybrid Search\"\n )\n\n build_config[\"reranker\"][\"show\"] = hybrid_enabled\n build_config[\"reranker\"][\"toggle_value\"] = hybrid_enabled\n build_config[\"reranker\"][\"toggle_disable\"] = is_vector_search\n\n # Configure lexical terms\n lexical_visible = collection_options[\"lexical_enabled\"] and not is_vector_search\n build_config[\"lexical_terms\"][\"show\"] = lexical_visible\n build_config[\"lexical_terms\"][\"value\"] = \"\" if is_vector_search else build_config[\"lexical_terms\"][\"value\"]\n\n # Configure search type and score threshold\n build_config[\"search_type\"][\"show\"] = is_vector_search\n build_config[\"search_score_threshold\"][\"show\"] = is_vector_search\n\n # Force similarity search for hybrid mode or autodetect\n if hybrid_enabled or is_autodetect:\n build_config[\"search_type\"][\"value\"] = \"Similarity\"\n\n return build_config\n\n def _detect_hybrid_capabilities(self) -> dict:\n \"\"\"Detect available hybrid search and reranking capabilities.\"\"\"\n environment = self.get_environment(self.environment)\n client = DataAPIClient(environment=environment)\n admin_client = client.get_admin()\n db_admin = admin_client.get_database_admin(self.get_api_endpoint(), token=self.token)\n\n try:\n providers = db_admin.find_reranking_providers()\n reranker_models = [\n model.name for provider_data in providers.reranking_providers.values() for model in provider_data.models\n ]\n reranker_metadata = [\n {\"icon\": self.get_provider_icon(provider_name=model.name.split(\"/\")[0])}\n for provider in providers.reranking_providers.values()\n for model in provider.models\n ]\n except (AttributeError, KeyError) as e:\n self.log(f\"Hybrid search not available: {e}\")\n return {\n \"available\": False,\n \"reranker_models\": [],\n \"reranker_metadata\": [],\n }\n else:\n return {\n \"available\": True,\n \"reranker_models\": reranker_models,\n \"reranker_metadata\": reranker_metadata,\n }\n\n def _get_collection_options(self, build_config: dict) -> dict:\n \"\"\"Retrieve collection-level search options.\"\"\"\n database = self.get_database_object(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n collection = database.get_collection(\n name=build_config[\"collection_name\"][\"value\"],\n keyspace=build_config[\"keyspace\"][\"value\"],\n )\n\n col_options = collection.options()\n\n return {\n \"rerank_enabled\": bool(col_options.rerank and col_options.rerank.enabled),\n \"lexical_enabled\": bool(col_options.lexical and col_options.lexical.enabled),\n }\n\n @check_cached_vector_store\n def build_vector_store(self):\n try:\n from langchain_astradb import AstraDBVectorStore\n from langchain_astradb.utils.astradb import HybridSearchMode\n except ImportError as e:\n msg = (\n \"Could not import langchain Astra DB integration package. \"\n \"Please install it with `pip install langchain-astradb`.\"\n )\n raise ImportError(msg) from e\n\n # Get the embedding model and additional params\n embedding_params = {\"embedding\": self.embedding_model} if self.embedding_model else {}\n\n # Get the additional parameters\n additional_params = self.astradb_vectorstore_kwargs or {}\n\n # Get Langflow version and platform information\n __version__ = get_version_info()[\"version\"]\n langflow_prefix = \"\"\n # if os.getenv(\"AWS_EXECUTION_ENV\") == \"AWS_ECS_FARGATE\": # TODO: More precise way of detecting\n # langflow_prefix = \"ds-\"\n\n # Get the database object\n database = self.get_database_object()\n autodetect = self.collection_name in database.list_collection_names() and self.autodetect_collection\n\n # Bundle up the auto-detect parameters\n autodetect_params = {\n \"autodetect_collection\": autodetect,\n \"content_field\": (\n self.content_field\n if self.content_field and embedding_params\n else (\n \"page_content\"\n if embedding_params\n and self.collection_data(collection_name=self.collection_name, database=database) == 0\n else None\n )\n ),\n \"ignore_invalid_documents\": self.ignore_invalid_documents,\n }\n\n # Choose HybridSearchMode based on the selected param\n hybrid_search_mode = HybridSearchMode.DEFAULT if self.search_method == \"Hybrid Search\" else HybridSearchMode.OFF\n\n # Attempt to build the Vector Store object\n try:\n vector_store = AstraDBVectorStore(\n # Astra DB Authentication Parameters\n token=self.token,\n api_endpoint=database.api_endpoint,\n namespace=database.keyspace,\n collection_name=self.collection_name,\n environment=self.environment,\n # Hybrid Search Parameters\n hybrid_search=hybrid_search_mode,\n # Astra DB Usage Tracking Parameters\n ext_callers=[(f\"{langflow_prefix}langflow\", __version__)],\n # Astra DB Vector Store Parameters\n **autodetect_params,\n **embedding_params,\n **additional_params,\n )\n except ValueError as e:\n msg = f\"Error initializing AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n # Add documents to the vector store\n self._add_documents_to_vector_store(vector_store)\n\n return vector_store\n\n def _add_documents_to_vector_store(self, vector_store) -> None:\n self.ingest_data = self._prepare_ingest_data()\n\n documents = []\n for _input in self.ingest_data or []:\n if isinstance(_input, Data):\n documents.append(_input.to_lc_document())\n else:\n msg = \"Vector Store Inputs must be Data objects.\"\n raise TypeError(msg)\n\n documents = [\n Document(page_content=doc.page_content, metadata=serialize(doc.metadata, to_str=True)) for doc in documents\n ]\n\n if documents and self.deletion_field:\n self.log(f\"Deleting documents where {self.deletion_field}\")\n try:\n database = self.get_database_object()\n collection = database.get_collection(self.collection_name, keyspace=database.keyspace)\n delete_values = list({doc.metadata[self.deletion_field] for doc in documents})\n self.log(f\"Deleting documents where {self.deletion_field} matches {delete_values}.\")\n collection.delete_many({f\"metadata.{self.deletion_field}\": {\"$in\": delete_values}})\n except ValueError as e:\n msg = f\"Error deleting documents from AstraDBVectorStore based on '{self.deletion_field}': {e}\"\n raise ValueError(msg) from e\n\n if documents:\n self.log(f\"Adding {len(documents)} documents to the Vector Store.\")\n try:\n vector_store.add_documents(documents)\n except ValueError as e:\n msg = f\"Error adding documents to AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n else:\n self.log(\"No documents to add to the Vector Store.\")\n\n def _map_search_type(self) -> str:\n search_type_mapping = {\n \"Similarity with score threshold\": \"similarity_score_threshold\",\n \"MMR (Max Marginal Relevance)\": \"mmr\",\n }\n\n return search_type_mapping.get(self.search_type, \"similarity\")\n\n def _build_search_args(self):\n # Clean up the search query\n query = self.search_query if isinstance(self.search_query, str) and self.search_query.strip() else None\n lexical_terms = self.lexical_terms or None\n\n # Check if we have a search query, and if so set the args\n if query:\n args = {\n \"query\": query,\n \"search_type\": self._map_search_type(),\n \"k\": self.number_of_results,\n \"score_threshold\": self.search_score_threshold,\n \"lexical_query\": lexical_terms,\n }\n elif self.advanced_search_filter:\n args = {\n \"n\": self.number_of_results,\n }\n else:\n return {}\n\n filter_arg = self.advanced_search_filter or {}\n if filter_arg:\n args[\"filter\"] = filter_arg\n\n return args\n\n def search_documents(self, vector_store=None) -> list[Data]:\n vector_store = vector_store or self.build_vector_store()\n\n self.log(f\"Search input: {self.search_query}\")\n self.log(f\"Search type: {self.search_type}\")\n self.log(f\"Number of results: {self.number_of_results}\")\n self.log(f\"store.hybrid_search: {vector_store.hybrid_search}\")\n self.log(f\"Lexical terms: {self.lexical_terms}\")\n self.log(f\"Reranker: {self.reranker}\")\n\n try:\n search_args = self._build_search_args()\n except ValueError as e:\n msg = f\"Error in AstraDBVectorStore._build_search_args: {e}\"\n raise ValueError(msg) from e\n\n if not search_args:\n self.log(\"No search input or filters provided. Skipping search.\")\n return []\n\n docs = []\n search_method = \"search\" if \"query\" in search_args else \"metadata_search\"\n\n try:\n self.log(f\"Calling vector_store.{search_method} with args: {search_args}\")\n docs = getattr(vector_store, search_method)(**search_args)\n except ValueError as e:\n msg = f\"Error performing {search_method} in AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n self.log(f\"Retrieved documents: {len(docs)}\")\n\n data = docs_to_data(docs)\n self.log(f\"Converted documents to data: {len(data)}\")\n self.status = data\n\n return data\n\n def get_retriever_kwargs(self):\n search_args = self._build_search_args()\n\n return {\n \"search_type\": self._map_search_type(),\n \"search_kwargs\": search_args,\n }\n"
|
|
4463
4463
|
},
|
|
4464
4464
|
"collection_name": {
|
|
4465
4465
|
"_input_type": "DropdownInput",
|
langflow/services/deps.py
CHANGED
|
@@ -108,6 +108,17 @@ def get_variable_service() -> VariableService:
|
|
|
108
108
|
return get_service(ServiceType.VARIABLE_SERVICE, VariableServiceFactory())
|
|
109
109
|
|
|
110
110
|
|
|
111
|
+
def is_settings_service_initialized() -> bool:
|
|
112
|
+
"""Check if the SettingsService is already initialized without triggering initialization.
|
|
113
|
+
|
|
114
|
+
Returns:
|
|
115
|
+
bool: True if the SettingsService is already initialized, False otherwise.
|
|
116
|
+
"""
|
|
117
|
+
from lfx.services.manager import get_service_manager
|
|
118
|
+
|
|
119
|
+
return ServiceType.SETTINGS_SERVICE in get_service_manager().services
|
|
120
|
+
|
|
121
|
+
|
|
111
122
|
def get_settings_service() -> SettingsService:
|
|
112
123
|
"""Retrieves the SettingsService instance.
|
|
113
124
|
|
{langflow_base_nightly-0.6.5.dev0.dist-info → langflow_base_nightly-0.6.5.dev1.dist-info}/METADATA
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: langflow-base-nightly
|
|
3
|
-
Version: 0.6.5.
|
|
3
|
+
Version: 0.6.5.dev1
|
|
4
4
|
Summary: A Python package with a built-in web application
|
|
5
5
|
Project-URL: Repository, https://github.com/langflow-ai/langflow
|
|
6
6
|
Project-URL: Documentation, https://docs.langflow.org
|
|
@@ -45,7 +45,7 @@ Requires-Dist: langchain-experimental<1.0.0,>=0.3.4
|
|
|
45
45
|
Requires-Dist: langchain-ibm<1.0.0,>=0.3.8
|
|
46
46
|
Requires-Dist: langchainhub~=0.1.15
|
|
47
47
|
Requires-Dist: langchain~=0.3.21
|
|
48
|
-
Requires-Dist: lfx-nightly==0.1.13.
|
|
48
|
+
Requires-Dist: lfx-nightly==0.1.13.dev3
|
|
49
49
|
Requires-Dist: loguru<1.0.0,>=0.7.1
|
|
50
50
|
Requires-Dist: mcp~=1.10.1
|
|
51
51
|
Requires-Dist: multiprocess<1.0.0,>=0.70.14
|
{langflow_base_nightly-0.6.5.dev0.dist-info → langflow_base_nightly-0.6.5.dev1.dist-info}/RECORD
RENAMED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
langflow/__init__.py,sha256=i-U-ttyryv7AfAXKH9QwOuNmtxsc_iHSPbUbWmroiek,10839
|
|
2
|
-
langflow/__main__.py,sha256=
|
|
2
|
+
langflow/__main__.py,sha256=Qzhu7wNq0a0YqzlTnzJ4TCUqGljFdO4bCcHm-3Oq4sE,37939
|
|
3
3
|
langflow/alembic.ini,sha256=fbUkg3Y988q24z9FRya85qBQBvKvEI8fQbR4CsWHHsk,3503
|
|
4
4
|
langflow/langflow_launcher.py,sha256=T8ske5ZJ-vhETkbDpt2LvuOYZMbgomTkqi_fc-jlt4c,1984
|
|
5
5
|
langflow/main.py,sha256=4rC7X_BTx50k4mjq2Q-7MDsDvEutAxm9MSmH7kx71mg,23425
|
|
@@ -86,7 +86,7 @@ langflow/api/v1/endpoints.py,sha256=fXAq0e6LaS3jhlpzOXj791C8YQURse6atPhM3wjVleI,
|
|
|
86
86
|
langflow/api/v1/files.py,sha256=g74OEks8dZ594RA_TP7mNQtyVAeI6idZh_wh-D0iOFY,7647
|
|
87
87
|
langflow/api/v1/flows.py,sha256=Uj0URyAB9Z22ZKSgpGumIJmjsYrrTT1Uy599Ag41trk,21425
|
|
88
88
|
langflow/api/v1/folders.py,sha256=uk8O2w-8d0jGb8TIJEz3mO09Ib1yTzGhmyW54MKdpqA,3286
|
|
89
|
-
langflow/api/v1/knowledge_bases.py,sha256=
|
|
89
|
+
langflow/api/v1/knowledge_bases.py,sha256=pKNcIEcRh6I4hQnPP5HoatbAPMt8Kg2eK8r9Dv_bnkw,17229
|
|
90
90
|
langflow/api/v1/login.py,sha256=zxfGEXaYHebFk-pYP9xcp_aUbVt8qUOik3W4O_u8IRc,5826
|
|
91
91
|
langflow/api/v1/mcp.py,sha256=jbTllucL8okLDuugCOCyw7Ig5CVtmzps93T5hAKIQ5U,4767
|
|
92
92
|
langflow/api/v1/mcp_projects.py,sha256=p1q0cs1owFmxTB9-gTgckwfSxceeQGp_UryV8K4Axyc,52115
|
|
@@ -2001,17 +2001,17 @@ langflow/initial_setup/starter_projects/Blog Writer.json,sha256=XQ9fYMF2JREIIqO-
|
|
|
2001
2001
|
langflow/initial_setup/starter_projects/Custom Component Generator.json,sha256=Swgah8LXNZHDPY6p45nlcRTzK2Qetm5Ae4nLsIkHEXg,166647
|
|
2002
2002
|
langflow/initial_setup/starter_projects/Document Q&A.json,sha256=ChEOLSq--uvmcbgsIlL4qsDLcmHhEm8XFHK9-0eBWhM,103773
|
|
2003
2003
|
langflow/initial_setup/starter_projects/Financial Report Parser.json,sha256=GAOcJNRZVGiJOCdDu6uuRRqKba88OhY0nwU7H3Fh8pw,86818
|
|
2004
|
-
langflow/initial_setup/starter_projects/Hybrid Search RAG.json,sha256=
|
|
2004
|
+
langflow/initial_setup/starter_projects/Hybrid Search RAG.json,sha256=9U_y8-5GDrIEuZCWziFBhLqSmI2FBZ14qInvt6Xo8qA,172188
|
|
2005
2005
|
langflow/initial_setup/starter_projects/Image Sentiment Analysis.json,sha256=60bWwApq01QLt9_Pkrd2D9iDkpYkH8TS7etq4jG93Xs,112946
|
|
2006
2006
|
langflow/initial_setup/starter_projects/Instagram Copywriter.json,sha256=PT69LL3yCLqtMMhRwA_-F7-jZY2mplqTIY_ICLwyLUk,186580
|
|
2007
2007
|
langflow/initial_setup/starter_projects/Invoice Summarizer.json,sha256=4RAidnjga5It91C_YLfjPV4m4dMfgH55dqOfPSlD99A,110024
|
|
2008
|
-
langflow/initial_setup/starter_projects/Knowledge Ingestion.json,sha256=
|
|
2009
|
-
langflow/initial_setup/starter_projects/Knowledge Retrieval.json,sha256=
|
|
2008
|
+
langflow/initial_setup/starter_projects/Knowledge Ingestion.json,sha256=eaQNMrFws0AH4l1FE9Xy7I8TCsFQseQ0nxXlSbR6MI0,87510
|
|
2009
|
+
langflow/initial_setup/starter_projects/Knowledge Retrieval.json,sha256=ypBr2SIxpvYxyA4jsFDTXNzfyVY6AESupZBtyEiURyA,44772
|
|
2010
2010
|
langflow/initial_setup/starter_projects/Market Research.json,sha256=n88yv8Q5B24k1kYVjNUvWfctQ_r1mSBbdSV9z894SpY,166523
|
|
2011
2011
|
langflow/initial_setup/starter_projects/Meeting Summary.json,sha256=higOgjzBNmM3Zg5N4zpeVOWsu6hEyY-a_QdxBtB0pKE,194311
|
|
2012
2012
|
langflow/initial_setup/starter_projects/Memory Chatbot.json,sha256=KLa8rgXIkIquti9agKnoTLYbjy_Z42ZZwxHDgDsc-Fk,85022
|
|
2013
2013
|
langflow/initial_setup/starter_projects/News Aggregator.json,sha256=MtQBUsOVK0MlgnJ43im7YvXkZ-X42PCBJPeitIE3wZc,154933
|
|
2014
|
-
langflow/initial_setup/starter_projects/Nvidia Remix.json,sha256=
|
|
2014
|
+
langflow/initial_setup/starter_projects/Nvidia Remix.json,sha256=n-cuTxUwPxR3v_rpDX5XcJswKB2RYgIlcwhEQvquTEc,328553
|
|
2015
2015
|
langflow/initial_setup/starter_projects/Pokédex Agent.json,sha256=euqCnHrinp6btRLZ_sX7csz5KWT0UE6TutRTH6U4DzI,126966
|
|
2016
2016
|
langflow/initial_setup/starter_projects/Portfolio Website Code Generator.json,sha256=e8uNXegPzpfn7bXa5fc4HkavFe-DWcVaSZUbTmEt1LU,151839
|
|
2017
2017
|
langflow/initial_setup/starter_projects/Price Deal Finder.json,sha256=Cdkcq2jk6Xd9szzcJ5IcplZFpxSrYeGHq5Mfds_-qkI,137274
|
|
@@ -2021,12 +2021,12 @@ langflow/initial_setup/starter_projects/SEO Keyword Generator.json,sha256=nAt0pQ
|
|
|
2021
2021
|
langflow/initial_setup/starter_projects/SaaS Pricing.json,sha256=Kr1fS4YtYGc298VV4nGo8gKDMKdNeU8v_BBvqb2SsdY,98657
|
|
2022
2022
|
langflow/initial_setup/starter_projects/Search agent.json,sha256=gkXkVNFmETWBryF-tl3uZ7Dh_140wZNXC0K89dTGvSk,97448
|
|
2023
2023
|
langflow/initial_setup/starter_projects/Sequential Tasks Agents.json,sha256=KlzTfB39tcPv9W677JH-N_atlF_hkwc1Ywqm0j9KNGk,284929
|
|
2024
|
-
langflow/initial_setup/starter_projects/Simple Agent.json,sha256=
|
|
2024
|
+
langflow/initial_setup/starter_projects/Simple Agent.json,sha256=5N_BY5lkfwx83e-UCQwVUIeEcBUaUwED8IKesaE1plg,125813
|
|
2025
2025
|
langflow/initial_setup/starter_projects/Social Media Agent.json,sha256=yex0AoHpuZETRyEgA88zZJjOl1KHyADq_5lJWRvcJwU,135257
|
|
2026
2026
|
langflow/initial_setup/starter_projects/Text Sentiment Analysis.json,sha256=BgmTxsPdBaNoWITDcqOkrbsMV3SiBLcSSJVqM5DFqTk,170081
|
|
2027
2027
|
langflow/initial_setup/starter_projects/Travel Planning Agents.json,sha256=IKbcAWQJaG0FadTN0khOoFvgFKl5LW6s9eSpaoRRf-o,247328
|
|
2028
2028
|
langflow/initial_setup/starter_projects/Twitter Thread Generator.json,sha256=ZofhAiL-GfQkchv2fo745QGDhsRmkTHRUS5BgZaxLcc,102332
|
|
2029
|
-
langflow/initial_setup/starter_projects/Vector Store RAG.json,sha256=
|
|
2029
|
+
langflow/initial_setup/starter_projects/Vector Store RAG.json,sha256=eMOSFhJ6X1xh7wVOHAfpSfIc_moKnXYfJx5j2mg44t8,292260
|
|
2030
2030
|
langflow/initial_setup/starter_projects/Youtube Analysis.json,sha256=1ncYZLEg7eReH31p0RsdCQch4VywiPAY1BZsAXW4xOQ,184069
|
|
2031
2031
|
langflow/initial_setup/starter_projects/__init__.py,sha256=c5Z92jvCm680uhVKycyps-BX-lEHjjPKWcsI86METUA,673
|
|
2032
2032
|
langflow/initial_setup/starter_projects/basic_prompting.py,sha256=Lw7anZebWGcypu6S1QM4Zmi7ztk14TJvuPSEC2QjTyo,837
|
|
@@ -2084,7 +2084,7 @@ langflow/serialization/constants.py,sha256=TutYkWK-FSlyXDbUJv5_Fkq3k_a3ecpcQLh9T
|
|
|
2084
2084
|
langflow/serialization/serialization.py,sha256=ALG4g76kvSugJ49BO3GT_O2gVXGW-1fjjJ91nO2eO7Q,12798
|
|
2085
2085
|
langflow/services/__init__.py,sha256=BaOOiIYuzmOaL_pDtXQuCgLehSZFqR-wK5bzcVVTjX0,59
|
|
2086
2086
|
langflow/services/base.py,sha256=aIM0_p_FS4BE7TxNUTBfdUObie1QekeelM2HPM89KP4,813
|
|
2087
|
-
langflow/services/deps.py,sha256=
|
|
2087
|
+
langflow/services/deps.py,sha256=LWcqUR13DLPCPHSJNjvZLUBI8_tAALEX4YkUAf1mDZ8,8058
|
|
2088
2088
|
langflow/services/enhanced_manager.py,sha256=k5yI5sU52diiR24dISchY6uZVIx6VQRziA96ktuYsnQ,2942
|
|
2089
2089
|
langflow/services/factory.py,sha256=SCVXzcfRwCyYs3ryp09KcpIoK93YrAPokEexx9ekgkU,3506
|
|
2090
2090
|
langflow/services/manager.py,sha256=LkSZDXBgujBXDIBXxLQ4aLnhfbjHyRtg88MZxS1cF8c,1186
|
|
@@ -2230,7 +2230,7 @@ langflow/utils/util.py,sha256=bZqi9Fqj2mlp9tKUA-Q4ePpooxtbuVLjlAvdml4kcjs,1516
|
|
|
2230
2230
|
langflow/utils/validate.py,sha256=BPqoIMvjl4wbMJTTWo1zMHP0kQCa2TfmDT9f-nPT9Ng,112
|
|
2231
2231
|
langflow/utils/version.py,sha256=OjSj0smls9XnPd4-LpTH9AWyUO_NAn5mncqKkkXl_fw,2840
|
|
2232
2232
|
langflow/utils/voice_utils.py,sha256=Ypxg8s5jFd1o5wBbx1W8oKK7vh4kwo0-iuTcFqIwy5I,3350
|
|
2233
|
-
langflow_base_nightly-0.6.5.
|
|
2234
|
-
langflow_base_nightly-0.6.5.
|
|
2235
|
-
langflow_base_nightly-0.6.5.
|
|
2236
|
-
langflow_base_nightly-0.6.5.
|
|
2233
|
+
langflow_base_nightly-0.6.5.dev1.dist-info/METADATA,sha256=r0R1osAiZQcw9Oxhfv9tOudhkbks2FbmwB4iPAAIMXE,4376
|
|
2234
|
+
langflow_base_nightly-0.6.5.dev1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
2235
|
+
langflow_base_nightly-0.6.5.dev1.dist-info/entry_points.txt,sha256=JvuLdXSrkeDmDdpb8M-VvFIzb84n4HmqUcIP10_EIF8,57
|
|
2236
|
+
langflow_base_nightly-0.6.5.dev1.dist-info/RECORD,,
|
{langflow_base_nightly-0.6.5.dev0.dist-info → langflow_base_nightly-0.6.5.dev1.dist-info}/WHEEL
RENAMED
|
File without changes
|
|
File without changes
|