lfx-nightly 0.2.0.dev26__py3-none-any.whl → 0.2.1.dev7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lfx/_assets/component_index.json +1 -1
- lfx/base/agents/agent.py +9 -4
- lfx/base/agents/altk_base_agent.py +16 -3
- lfx/base/agents/altk_tool_wrappers.py +1 -1
- lfx/base/agents/utils.py +4 -0
- lfx/base/composio/composio_base.py +78 -41
- lfx/base/data/base_file.py +14 -4
- lfx/base/data/cloud_storage_utils.py +156 -0
- lfx/base/data/docling_utils.py +191 -65
- lfx/base/data/storage_utils.py +109 -0
- lfx/base/datastax/astradb_base.py +75 -64
- lfx/base/mcp/util.py +2 -2
- lfx/base/models/__init__.py +11 -1
- lfx/base/models/anthropic_constants.py +21 -12
- lfx/base/models/google_generative_ai_constants.py +33 -9
- lfx/base/models/model_metadata.py +6 -0
- lfx/base/models/ollama_constants.py +196 -30
- lfx/base/models/openai_constants.py +37 -10
- lfx/base/models/unified_models.py +1123 -0
- lfx/base/models/watsonx_constants.py +36 -0
- lfx/base/tools/component_tool.py +2 -9
- lfx/cli/commands.py +6 -1
- lfx/cli/run.py +65 -409
- lfx/cli/script_loader.py +13 -3
- lfx/components/__init__.py +0 -3
- lfx/components/composio/github_composio.py +1 -1
- lfx/components/cuga/cuga_agent.py +39 -27
- lfx/components/data_source/api_request.py +4 -2
- lfx/components/docling/__init__.py +45 -11
- lfx/components/docling/chunk_docling_document.py +3 -1
- lfx/components/docling/docling_inline.py +39 -49
- lfx/components/docling/export_docling_document.py +3 -1
- lfx/components/elastic/opensearch_multimodal.py +215 -57
- lfx/components/files_and_knowledge/file.py +439 -39
- lfx/components/files_and_knowledge/ingestion.py +8 -0
- lfx/components/files_and_knowledge/retrieval.py +10 -0
- lfx/components/files_and_knowledge/save_file.py +123 -53
- lfx/components/ibm/watsonx.py +7 -1
- lfx/components/input_output/chat_output.py +7 -1
- lfx/components/langchain_utilities/tool_calling.py +14 -6
- lfx/components/llm_operations/batch_run.py +80 -25
- lfx/components/llm_operations/lambda_filter.py +33 -6
- lfx/components/llm_operations/llm_conditional_router.py +39 -7
- lfx/components/llm_operations/structured_output.py +38 -12
- lfx/components/models/__init__.py +16 -74
- lfx/components/models_and_agents/agent.py +51 -201
- lfx/components/models_and_agents/embedding_model.py +185 -339
- lfx/components/models_and_agents/language_model.py +54 -318
- lfx/components/models_and_agents/mcp_component.py +58 -9
- lfx/components/ollama/ollama.py +9 -4
- lfx/components/ollama/ollama_embeddings.py +2 -1
- lfx/components/openai/openai_chat_model.py +1 -1
- lfx/components/processing/__init__.py +0 -3
- lfx/components/vllm/__init__.py +37 -0
- lfx/components/vllm/vllm.py +141 -0
- lfx/components/vllm/vllm_embeddings.py +110 -0
- lfx/custom/custom_component/custom_component.py +8 -6
- lfx/custom/directory_reader/directory_reader.py +5 -2
- lfx/graph/utils.py +64 -18
- lfx/inputs/__init__.py +2 -0
- lfx/inputs/input_mixin.py +54 -0
- lfx/inputs/inputs.py +115 -0
- lfx/interface/initialize/loading.py +42 -12
- lfx/io/__init__.py +2 -0
- lfx/run/__init__.py +5 -0
- lfx/run/base.py +494 -0
- lfx/schema/data.py +1 -1
- lfx/schema/image.py +28 -19
- lfx/schema/message.py +19 -3
- lfx/services/interfaces.py +5 -0
- lfx/services/manager.py +5 -4
- lfx/services/mcp_composer/service.py +45 -13
- lfx/services/settings/auth.py +18 -11
- lfx/services/settings/base.py +12 -24
- lfx/services/settings/constants.py +2 -0
- lfx/services/storage/local.py +37 -0
- lfx/services/storage/service.py +19 -0
- lfx/utils/constants.py +1 -0
- lfx/utils/image.py +29 -11
- lfx/utils/validate_cloud.py +14 -3
- {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/METADATA +5 -2
- {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/RECORD +84 -78
- lfx/components/processing/dataframe_to_toolset.py +0 -259
- {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/WHEEL +0 -0
- {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/entry_points.txt +0 -0
|
@@ -641,8 +641,14 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
641
641
|
@check_cached_vector_store
|
|
642
642
|
def build_vector_store(self) -> OpenSearch:
|
|
643
643
|
# Return raw OpenSearch client as our "vector store."
|
|
644
|
-
self.log(self.ingest_data)
|
|
645
644
|
client = self.build_client()
|
|
645
|
+
|
|
646
|
+
# Check if we're in ingestion-only mode (no search query)
|
|
647
|
+
has_search_query = bool((self.search_query or "").strip())
|
|
648
|
+
if not has_search_query:
|
|
649
|
+
logger.debug("Ingestion-only mode activated: search operations will be skipped")
|
|
650
|
+
logger.debug("Starting ingestion mode...")
|
|
651
|
+
|
|
646
652
|
logger.warning(f"Embedding: {self.embedding}")
|
|
647
653
|
self._add_documents_to_vector_store(client=client)
|
|
648
654
|
return client
|
|
@@ -660,25 +666,41 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
660
666
|
Args:
|
|
661
667
|
client: OpenSearch client for performing operations
|
|
662
668
|
"""
|
|
669
|
+
logger.debug("[INGESTION] _add_documents_to_vector_store called")
|
|
663
670
|
# Convert DataFrame to Data if needed using parent's method
|
|
664
671
|
self.ingest_data = self._prepare_ingest_data()
|
|
665
672
|
|
|
673
|
+
logger.debug(
|
|
674
|
+
f"[INGESTION] ingest_data type: "
|
|
675
|
+
f"{type(self.ingest_data)}, length: {len(self.ingest_data) if self.ingest_data else 0}"
|
|
676
|
+
)
|
|
677
|
+
logger.debug(
|
|
678
|
+
f"[INGESTION] ingest_data content: "
|
|
679
|
+
f"{self.ingest_data[:2] if self.ingest_data and len(self.ingest_data) > 0 else 'empty'}"
|
|
680
|
+
)
|
|
681
|
+
|
|
666
682
|
docs = self.ingest_data or []
|
|
667
683
|
if not docs:
|
|
668
|
-
|
|
684
|
+
logger.debug("Ingestion complete: No documents provided")
|
|
669
685
|
return
|
|
670
686
|
|
|
671
687
|
if not self.embedding:
|
|
672
688
|
msg = "Embedding handle is required to embed documents."
|
|
673
689
|
raise ValueError(msg)
|
|
674
690
|
|
|
675
|
-
# Normalize embedding to list
|
|
691
|
+
# Normalize embedding to list first
|
|
676
692
|
embeddings_list = self.embedding if isinstance(self.embedding, list) else [self.embedding]
|
|
677
693
|
|
|
694
|
+
# Filter out None values (fail-safe mode) - do this BEFORE checking if empty
|
|
695
|
+
embeddings_list = [e for e in embeddings_list if e is not None]
|
|
696
|
+
|
|
697
|
+
# NOW check if we have any valid embeddings left after filtering
|
|
678
698
|
if not embeddings_list:
|
|
679
|
-
|
|
680
|
-
|
|
699
|
+
logger.warning("All embeddings returned None (fail-safe mode enabled). Skipping document ingestion.")
|
|
700
|
+
self.log("Embedding returned None (fail-safe mode enabled). Skipping document ingestion.")
|
|
701
|
+
return
|
|
681
702
|
|
|
703
|
+
logger.debug(f"[INGESTION] Valid embeddings after filtering: {len(embeddings_list)}")
|
|
682
704
|
self.log(f"Available embedding models: {len(embeddings_list)}")
|
|
683
705
|
|
|
684
706
|
# Select the embedding to use for ingestion
|
|
@@ -790,6 +812,7 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
790
812
|
|
|
791
813
|
dynamic_field_name = get_embedding_field_name(embedding_model)
|
|
792
814
|
|
|
815
|
+
logger.info(f"Selected embedding model for ingestion: '{embedding_model}'")
|
|
793
816
|
self.log(f"Using embedding model for ingestion: {embedding_model}")
|
|
794
817
|
self.log(f"Dynamic vector field: {dynamic_field_name}")
|
|
795
818
|
|
|
@@ -814,6 +837,7 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
814
837
|
metadatas = []
|
|
815
838
|
# Process docs_metadata table input into a dict
|
|
816
839
|
additional_metadata = {}
|
|
840
|
+
logger.debug(f"[LF] Docs metadata {self.docs_metadata}")
|
|
817
841
|
if hasattr(self, "docs_metadata") and self.docs_metadata:
|
|
818
842
|
logger.info(f"[LF] Docs metadata {self.docs_metadata}")
|
|
819
843
|
if isinstance(self.docs_metadata[-1], Data):
|
|
@@ -841,51 +865,96 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
841
865
|
metadatas.append(data_copy)
|
|
842
866
|
self.log(metadatas)
|
|
843
867
|
|
|
844
|
-
# Generate embeddings
|
|
845
|
-
|
|
846
|
-
|
|
868
|
+
# Generate embeddings with rate-limit-aware retry logic using tenacity
|
|
869
|
+
from tenacity import (
|
|
870
|
+
retry,
|
|
871
|
+
retry_if_exception,
|
|
872
|
+
stop_after_attempt,
|
|
873
|
+
wait_exponential,
|
|
874
|
+
)
|
|
847
875
|
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
876
|
+
def is_rate_limit_error(exception: Exception) -> bool:
|
|
877
|
+
"""Check if exception is a rate limit error (429)."""
|
|
878
|
+
error_str = str(exception).lower()
|
|
879
|
+
return "429" in error_str or "rate_limit" in error_str or "rate limit" in error_str
|
|
880
|
+
|
|
881
|
+
def is_other_retryable_error(exception: Exception) -> bool:
|
|
882
|
+
"""Check if exception is retryable but not a rate limit error."""
|
|
883
|
+
# Retry on most exceptions except for specific non-retryable ones
|
|
884
|
+
# Add other non-retryable exceptions here if needed
|
|
885
|
+
return not is_rate_limit_error(exception)
|
|
886
|
+
|
|
887
|
+
# Create retry decorator for rate limit errors (longer backoff)
|
|
888
|
+
retry_on_rate_limit = retry(
|
|
889
|
+
retry=retry_if_exception(is_rate_limit_error),
|
|
890
|
+
stop=stop_after_attempt(5),
|
|
891
|
+
wait=wait_exponential(multiplier=2, min=2, max=30),
|
|
892
|
+
reraise=True,
|
|
893
|
+
before_sleep=lambda retry_state: logger.warning(
|
|
894
|
+
f"Rate limit hit for chunk (attempt {retry_state.attempt_number}/5), "
|
|
895
|
+
f"backing off for {retry_state.next_action.sleep:.1f}s"
|
|
896
|
+
),
|
|
897
|
+
)
|
|
898
|
+
|
|
899
|
+
# Create retry decorator for other errors (shorter backoff)
|
|
900
|
+
retry_on_other_errors = retry(
|
|
901
|
+
retry=retry_if_exception(is_other_retryable_error),
|
|
902
|
+
stop=stop_after_attempt(3),
|
|
903
|
+
wait=wait_exponential(multiplier=1, min=1, max=8),
|
|
904
|
+
reraise=True,
|
|
905
|
+
before_sleep=lambda retry_state: logger.warning(
|
|
906
|
+
f"Error embedding chunk (attempt {retry_state.attempt_number}/3), "
|
|
907
|
+
f"retrying in {retry_state.next_action.sleep:.1f}s: {retry_state.outcome.exception()}"
|
|
908
|
+
),
|
|
909
|
+
)
|
|
910
|
+
|
|
911
|
+
def embed_chunk_with_retry(chunk_text: str, chunk_idx: int) -> list[float]:
|
|
912
|
+
"""Embed a single chunk with rate-limit-aware retry logic."""
|
|
913
|
+
|
|
914
|
+
@retry_on_rate_limit
|
|
915
|
+
@retry_on_other_errors
|
|
916
|
+
def _embed(text: str) -> list[float]:
|
|
917
|
+
return selected_embedding.embed_documents([text])[0]
|
|
853
918
|
|
|
854
|
-
while attempts < max_attempts:
|
|
855
|
-
attempts += 1
|
|
856
919
|
try:
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
idx = futures[future]
|
|
863
|
-
vectors[idx] = future.result()
|
|
864
|
-
break
|
|
865
|
-
except Exception as exc:
|
|
866
|
-
last_exception = exc
|
|
867
|
-
if attempts >= max_attempts:
|
|
868
|
-
logger.error(
|
|
869
|
-
f"Embedding generation failed for model {embedding_model} after retries",
|
|
870
|
-
error=str(exc),
|
|
871
|
-
)
|
|
872
|
-
raise
|
|
873
|
-
logger.warning(
|
|
874
|
-
"Threaded embedding generation failed for model %s (attempt %s/%s), retrying in %.1fs",
|
|
875
|
-
embedding_model,
|
|
876
|
-
attempts,
|
|
877
|
-
max_attempts,
|
|
878
|
-
delay,
|
|
920
|
+
return _embed(chunk_text)
|
|
921
|
+
except Exception as e:
|
|
922
|
+
logger.error(
|
|
923
|
+
f"Failed to embed chunk {chunk_idx} after all retries: {e}",
|
|
924
|
+
error=str(e),
|
|
879
925
|
)
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
926
|
+
raise
|
|
927
|
+
|
|
928
|
+
# Restrict concurrency for IBM/Watsonx models to avoid rate limits
|
|
929
|
+
is_ibm = (embedding_model and "ibm" in str(embedding_model).lower()) or (
|
|
930
|
+
selected_embedding and "watsonx" in type(selected_embedding).__name__.lower()
|
|
931
|
+
)
|
|
932
|
+
logger.debug(f"Is IBM: {is_ibm}")
|
|
933
|
+
|
|
934
|
+
# For IBM models, use sequential processing with rate limiting
|
|
935
|
+
# For other models, use parallel processing
|
|
936
|
+
vectors: list[list[float]] = [None] * len(texts)
|
|
937
|
+
|
|
938
|
+
if is_ibm:
|
|
939
|
+
# Sequential processing with inter-request delay for IBM models
|
|
940
|
+
inter_request_delay = 0.6 # ~1.67 req/s, safely under 2 req/s limit
|
|
941
|
+
logger.info(f"Using sequential processing for IBM model with {inter_request_delay}s delay between requests")
|
|
942
|
+
|
|
943
|
+
for idx, chunk in enumerate(texts):
|
|
944
|
+
if idx > 0:
|
|
945
|
+
# Add delay between requests (but not before the first one)
|
|
946
|
+
time.sleep(inter_request_delay)
|
|
947
|
+
vectors[idx] = embed_chunk_with_retry(chunk, idx)
|
|
948
|
+
else:
|
|
949
|
+
# Parallel processing for non-IBM models
|
|
950
|
+
max_workers = min(max(len(texts), 1), 8)
|
|
951
|
+
logger.debug(f"Using parallel processing with {max_workers} workers")
|
|
952
|
+
|
|
953
|
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
954
|
+
futures = {executor.submit(embed_chunk_with_retry, chunk, idx): idx for idx, chunk in enumerate(texts)}
|
|
955
|
+
for future in as_completed(futures):
|
|
956
|
+
idx = futures[future]
|
|
957
|
+
vectors[idx] = future.result()
|
|
889
958
|
|
|
890
959
|
if not vectors:
|
|
891
960
|
self.log(f"No vectors generated from documents for model {embedding_model}.")
|
|
@@ -956,6 +1025,9 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
956
1025
|
)
|
|
957
1026
|
self.log(metadatas)
|
|
958
1027
|
|
|
1028
|
+
logger.info(
|
|
1029
|
+
f"Ingestion complete: Successfully indexed {len(return_ids)} documents with model '{embedding_model}'"
|
|
1030
|
+
)
|
|
959
1031
|
self.log(f"Successfully indexed {len(return_ids)} documents with model {embedding_model}.")
|
|
960
1032
|
|
|
961
1033
|
# ---------- helpers for filters ----------
|
|
@@ -1050,6 +1122,7 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
1050
1122
|
if filter_clauses:
|
|
1051
1123
|
agg_query["query"] = {"bool": {"filter": filter_clauses}}
|
|
1052
1124
|
|
|
1125
|
+
logger.debug(f"Model detection query: {agg_query}")
|
|
1053
1126
|
result = client.search(
|
|
1054
1127
|
index=self.index_name,
|
|
1055
1128
|
body=agg_query,
|
|
@@ -1058,14 +1131,25 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
1058
1131
|
buckets = result.get("aggregations", {}).get("embedding_models", {}).get("buckets", [])
|
|
1059
1132
|
models = [b["key"] for b in buckets if b["key"]]
|
|
1060
1133
|
|
|
1134
|
+
# Log detailed bucket info for debugging
|
|
1061
1135
|
logger.info(
|
|
1062
1136
|
f"Detected embedding models in corpus: {models}"
|
|
1063
1137
|
+ (f" (with {len(filter_clauses)} filters)" if filter_clauses else "")
|
|
1064
1138
|
)
|
|
1139
|
+
if not models:
|
|
1140
|
+
total_hits = result.get("hits", {}).get("total", {})
|
|
1141
|
+
total_count = total_hits.get("value", 0) if isinstance(total_hits, dict) else total_hits
|
|
1142
|
+
logger.warning(
|
|
1143
|
+
f"No embedding_model values found in index '{self.index_name}'. "
|
|
1144
|
+
f"Total docs in index: {total_count}. "
|
|
1145
|
+
f"This may indicate documents were indexed without the embedding_model field."
|
|
1146
|
+
)
|
|
1065
1147
|
except (OpenSearchException, KeyError, ValueError) as e:
|
|
1066
1148
|
logger.warning(f"Failed to detect embedding models: {e}")
|
|
1067
1149
|
# Fallback to current model
|
|
1068
|
-
|
|
1150
|
+
fallback_model = self._get_embedding_model_name()
|
|
1151
|
+
logger.info(f"Using fallback model: {fallback_model}")
|
|
1152
|
+
return [fallback_model]
|
|
1069
1153
|
else:
|
|
1070
1154
|
return models
|
|
1071
1155
|
|
|
@@ -1172,6 +1256,11 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
1172
1256
|
msg = "Embedding is required to run hybrid search (KNN + keyword)."
|
|
1173
1257
|
raise ValueError(msg)
|
|
1174
1258
|
|
|
1259
|
+
# Check if embedding is None (fail-safe mode)
|
|
1260
|
+
if self.embedding is None or (isinstance(self.embedding, list) and all(e is None for e in self.embedding)):
|
|
1261
|
+
logger.error("Embedding returned None (fail-safe mode enabled). Cannot perform search.")
|
|
1262
|
+
return []
|
|
1263
|
+
|
|
1175
1264
|
# Build filter clauses first so we can use them in model detection
|
|
1176
1265
|
filter_clauses = self._coerce_filter_clauses(filter_obj)
|
|
1177
1266
|
|
|
@@ -1187,6 +1276,14 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
1187
1276
|
|
|
1188
1277
|
# Normalize embedding to list
|
|
1189
1278
|
embeddings_list = self.embedding if isinstance(self.embedding, list) else [self.embedding]
|
|
1279
|
+
# Filter out None values (fail-safe mode)
|
|
1280
|
+
embeddings_list = [e for e in embeddings_list if e is not None]
|
|
1281
|
+
|
|
1282
|
+
if not embeddings_list:
|
|
1283
|
+
logger.error(
|
|
1284
|
+
"No valid embeddings available after filtering None values (fail-safe mode). Cannot perform search."
|
|
1285
|
+
)
|
|
1286
|
+
return []
|
|
1190
1287
|
|
|
1191
1288
|
# Create a comprehensive map of model names to embedding objects
|
|
1192
1289
|
# Check all possible identifiers (deployment, model, model_id, model_name)
|
|
@@ -1203,18 +1300,20 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
1203
1300
|
model_id = getattr(emb_obj, "model_id", None)
|
|
1204
1301
|
model_name = getattr(emb_obj, "model_name", None)
|
|
1205
1302
|
dimensions = getattr(emb_obj, "dimensions", None)
|
|
1206
|
-
|
|
1303
|
+
available_models_attr = getattr(emb_obj, "available_models", None)
|
|
1207
1304
|
|
|
1208
1305
|
logger.info(
|
|
1209
1306
|
f"Embedding object {idx}: deployment={deployment}, model={model}, "
|
|
1210
1307
|
f"model_id={model_id}, model_name={model_name}, dimensions={dimensions}, "
|
|
1211
|
-
f"available_models={
|
|
1308
|
+
f"available_models={available_models_attr}"
|
|
1212
1309
|
)
|
|
1213
1310
|
|
|
1214
1311
|
# If this embedding has available_models dict, map all models to their dedicated instances
|
|
1215
|
-
if
|
|
1216
|
-
logger.info(
|
|
1217
|
-
|
|
1312
|
+
if available_models_attr and isinstance(available_models_attr, dict):
|
|
1313
|
+
logger.info(
|
|
1314
|
+
f"Embedding object {idx} provides {len(available_models_attr)} models via available_models dict"
|
|
1315
|
+
)
|
|
1316
|
+
for model_name_key, dedicated_embedding in available_models_attr.items():
|
|
1218
1317
|
if model_name_key and str(model_name_key).strip():
|
|
1219
1318
|
model_str = str(model_name_key).strip()
|
|
1220
1319
|
if model_str not in embedding_by_model:
|
|
@@ -1269,6 +1368,12 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
1269
1368
|
|
|
1270
1369
|
logger.info(f"Generating embeddings for {len(available_models)} models in index")
|
|
1271
1370
|
logger.info(f"Available embedding identifiers: {list(embedding_by_model.keys())}")
|
|
1371
|
+
self.log(f"[SEARCH] Models detected in index: {available_models}")
|
|
1372
|
+
self.log(f"[SEARCH] Available embedding identifiers: {list(embedding_by_model.keys())}")
|
|
1373
|
+
|
|
1374
|
+
# Track matching status for debugging
|
|
1375
|
+
matched_models = []
|
|
1376
|
+
unmatched_models = []
|
|
1272
1377
|
|
|
1273
1378
|
for model_name in available_models:
|
|
1274
1379
|
try:
|
|
@@ -1298,18 +1403,34 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
1298
1403
|
# Use the embedding instance directly - no model switching needed!
|
|
1299
1404
|
vec = emb_obj.embed_query(q)
|
|
1300
1405
|
query_embeddings[model_name] = vec
|
|
1406
|
+
matched_models.append(model_name)
|
|
1301
1407
|
logger.info(f"Generated embedding for model: {model_name} (actual dimensions: {len(vec)})")
|
|
1408
|
+
self.log(f"[MATCH] Model '{model_name}' - generated {len(vec)}-dim embedding")
|
|
1302
1409
|
else:
|
|
1303
1410
|
# No matching embedding found for this model
|
|
1411
|
+
unmatched_models.append(model_name)
|
|
1304
1412
|
logger.warning(
|
|
1305
1413
|
f"No matching embedding found for model '{model_name}'. "
|
|
1306
|
-
f"This model will be skipped. Available
|
|
1414
|
+
f"This model will be skipped. Available identifiers: {list(embedding_by_model.keys())}"
|
|
1307
1415
|
)
|
|
1416
|
+
self.log(f"[NO MATCH] Model '{model_name}' - available: {list(embedding_by_model.keys())}")
|
|
1308
1417
|
except (RuntimeError, ValueError, ConnectionError, TimeoutError, AttributeError, KeyError) as e:
|
|
1309
1418
|
logger.warning(f"Failed to generate embedding for {model_name}: {e}")
|
|
1419
|
+
self.log(f"[ERROR] Embedding generation failed for '{model_name}': {e}")
|
|
1420
|
+
|
|
1421
|
+
# Log summary of model matching
|
|
1422
|
+
logger.info(f"Model matching summary: {len(matched_models)} matched, {len(unmatched_models)} unmatched")
|
|
1423
|
+
self.log(f"[SUMMARY] Model matching: {len(matched_models)} matched, {len(unmatched_models)} unmatched")
|
|
1424
|
+
if unmatched_models:
|
|
1425
|
+
self.log(f"[WARN] Unmatched models in index: {unmatched_models}")
|
|
1310
1426
|
|
|
1311
1427
|
if not query_embeddings:
|
|
1312
|
-
msg =
|
|
1428
|
+
msg = (
|
|
1429
|
+
f"Failed to generate embeddings for any model. "
|
|
1430
|
+
f"Index has models: {available_models}, but no matching embedding objects found. "
|
|
1431
|
+
f"Available embedding identifiers: {list(embedding_by_model.keys())}"
|
|
1432
|
+
)
|
|
1433
|
+
self.log(f"[FAIL] Search failed: {msg}")
|
|
1313
1434
|
raise ValueError(msg)
|
|
1314
1435
|
|
|
1315
1436
|
index_properties = self._get_index_properties(client)
|
|
@@ -1339,6 +1460,7 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
1339
1460
|
f"Skipping model {model_name}: field '{field_name}' is not mapped as knn_vector. "
|
|
1340
1461
|
f"Documents must be indexed with this embedding model before querying."
|
|
1341
1462
|
)
|
|
1463
|
+
self.log(f"[SKIP] Field '{selected_field}' not a knn_vector - skipping model '{model_name}'")
|
|
1342
1464
|
continue
|
|
1343
1465
|
|
|
1344
1466
|
# Validate vector dimensions match the field dimensions
|
|
@@ -1349,6 +1471,7 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
1349
1471
|
f"Query vector has {vector_dim} dimensions but field '{selected_field}' expects {field_dim}. "
|
|
1350
1472
|
f"Skipping this model to prevent search errors."
|
|
1351
1473
|
)
|
|
1474
|
+
self.log(f"[DIM MISMATCH] Model '{model_name}': query={vector_dim} vs field={field_dim} - skipping")
|
|
1352
1475
|
continue
|
|
1353
1476
|
|
|
1354
1477
|
logger.info(
|
|
@@ -1385,6 +1508,11 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
1385
1508
|
"This may indicate an empty index or missing field mappings. "
|
|
1386
1509
|
"Returning empty search results."
|
|
1387
1510
|
)
|
|
1511
|
+
self.log(
|
|
1512
|
+
f"[WARN] No valid KNN queries could be built. "
|
|
1513
|
+
f"Query embeddings generated: {list(query_embeddings.keys())}, "
|
|
1514
|
+
f"but no matching knn_vector fields found in index."
|
|
1515
|
+
)
|
|
1388
1516
|
return []
|
|
1389
1517
|
|
|
1390
1518
|
# Build exists filter - document must have at least one embedding field
|
|
@@ -1448,7 +1576,13 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
1448
1576
|
if isinstance(score_threshold, (int, float)) and score_threshold > 0:
|
|
1449
1577
|
body["min_score"] = score_threshold
|
|
1450
1578
|
|
|
1451
|
-
logger.info(
|
|
1579
|
+
logger.info(
|
|
1580
|
+
f"Executing multi-model hybrid search with {len(knn_queries_with_candidates)} embedding models: "
|
|
1581
|
+
f"{list(query_embeddings.keys())}"
|
|
1582
|
+
)
|
|
1583
|
+
self.log(f"[EXEC] Executing search with {len(knn_queries_with_candidates)} KNN queries, limit={limit}")
|
|
1584
|
+
self.log(f"[EXEC] Embedding models used: {list(query_embeddings.keys())}")
|
|
1585
|
+
self.log(f"[EXEC] KNN fields being queried: {embedding_fields}")
|
|
1452
1586
|
|
|
1453
1587
|
try:
|
|
1454
1588
|
resp = client.search(index=self.index_name, body=body, params={"terminate_after": 0})
|
|
@@ -1502,6 +1636,16 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
1502
1636
|
hits = resp.get("hits", {}).get("hits", [])
|
|
1503
1637
|
|
|
1504
1638
|
logger.info(f"Found {len(hits)} results")
|
|
1639
|
+
self.log(f"[RESULT] Search complete: {len(hits)} results found")
|
|
1640
|
+
|
|
1641
|
+
if len(hits) == 0:
|
|
1642
|
+
self.log(
|
|
1643
|
+
f"[EMPTY] Debug info: "
|
|
1644
|
+
f"models_in_index={available_models}, "
|
|
1645
|
+
f"matched_models={matched_models}, "
|
|
1646
|
+
f"knn_fields={embedding_fields}, "
|
|
1647
|
+
f"filters={len(filter_clauses)} clauses"
|
|
1648
|
+
)
|
|
1505
1649
|
|
|
1506
1650
|
return [
|
|
1507
1651
|
{
|
|
@@ -1518,6 +1662,9 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
1518
1662
|
This is the main interface method that performs the multi-model search using the
|
|
1519
1663
|
configured search_query and returns results in Langflow's Data format.
|
|
1520
1664
|
|
|
1665
|
+
Always builds the vector store (triggering ingestion if needed), then performs
|
|
1666
|
+
search only if a query is provided.
|
|
1667
|
+
|
|
1521
1668
|
Returns:
|
|
1522
1669
|
List of Data objects containing search results with text and metadata
|
|
1523
1670
|
|
|
@@ -1525,9 +1672,20 @@ class OpenSearchVectorStoreComponentMultimodalMultiEmbedding(LCVectorStoreCompon
|
|
|
1525
1672
|
Exception: If search operation fails
|
|
1526
1673
|
"""
|
|
1527
1674
|
try:
|
|
1528
|
-
|
|
1675
|
+
# Always build/cache the vector store to ensure ingestion happens
|
|
1676
|
+
logger.info(f"Search query: {self.search_query}")
|
|
1677
|
+
if self._cached_vector_store is None:
|
|
1678
|
+
self.build_vector_store()
|
|
1679
|
+
|
|
1680
|
+
# Only perform search if query is provided
|
|
1681
|
+
search_query = (self.search_query or "").strip()
|
|
1682
|
+
if not search_query:
|
|
1683
|
+
self.log("No search query provided - ingestion completed, returning empty results")
|
|
1684
|
+
return []
|
|
1685
|
+
|
|
1686
|
+
# Perform search with the provided query
|
|
1687
|
+
raw = self.search(search_query)
|
|
1529
1688
|
return [Data(text=hit["page_content"], **hit["metadata"]) for hit in raw]
|
|
1530
|
-
self.log(self.ingest_data)
|
|
1531
1689
|
except Exception as e:
|
|
1532
1690
|
self.log(f"search_documents error: {e}")
|
|
1533
1691
|
raise
|