lfx-nightly 0.2.0.dev26__py3-none-any.whl → 0.2.1.dev7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. lfx/_assets/component_index.json +1 -1
  2. lfx/base/agents/agent.py +9 -4
  3. lfx/base/agents/altk_base_agent.py +16 -3
  4. lfx/base/agents/altk_tool_wrappers.py +1 -1
  5. lfx/base/agents/utils.py +4 -0
  6. lfx/base/composio/composio_base.py +78 -41
  7. lfx/base/data/base_file.py +14 -4
  8. lfx/base/data/cloud_storage_utils.py +156 -0
  9. lfx/base/data/docling_utils.py +191 -65
  10. lfx/base/data/storage_utils.py +109 -0
  11. lfx/base/datastax/astradb_base.py +75 -64
  12. lfx/base/mcp/util.py +2 -2
  13. lfx/base/models/__init__.py +11 -1
  14. lfx/base/models/anthropic_constants.py +21 -12
  15. lfx/base/models/google_generative_ai_constants.py +33 -9
  16. lfx/base/models/model_metadata.py +6 -0
  17. lfx/base/models/ollama_constants.py +196 -30
  18. lfx/base/models/openai_constants.py +37 -10
  19. lfx/base/models/unified_models.py +1123 -0
  20. lfx/base/models/watsonx_constants.py +36 -0
  21. lfx/base/tools/component_tool.py +2 -9
  22. lfx/cli/commands.py +6 -1
  23. lfx/cli/run.py +65 -409
  24. lfx/cli/script_loader.py +13 -3
  25. lfx/components/__init__.py +0 -3
  26. lfx/components/composio/github_composio.py +1 -1
  27. lfx/components/cuga/cuga_agent.py +39 -27
  28. lfx/components/data_source/api_request.py +4 -2
  29. lfx/components/docling/__init__.py +45 -11
  30. lfx/components/docling/chunk_docling_document.py +3 -1
  31. lfx/components/docling/docling_inline.py +39 -49
  32. lfx/components/docling/export_docling_document.py +3 -1
  33. lfx/components/elastic/opensearch_multimodal.py +215 -57
  34. lfx/components/files_and_knowledge/file.py +439 -39
  35. lfx/components/files_and_knowledge/ingestion.py +8 -0
  36. lfx/components/files_and_knowledge/retrieval.py +10 -0
  37. lfx/components/files_and_knowledge/save_file.py +123 -53
  38. lfx/components/ibm/watsonx.py +7 -1
  39. lfx/components/input_output/chat_output.py +7 -1
  40. lfx/components/langchain_utilities/tool_calling.py +14 -6
  41. lfx/components/llm_operations/batch_run.py +80 -25
  42. lfx/components/llm_operations/lambda_filter.py +33 -6
  43. lfx/components/llm_operations/llm_conditional_router.py +39 -7
  44. lfx/components/llm_operations/structured_output.py +38 -12
  45. lfx/components/models/__init__.py +16 -74
  46. lfx/components/models_and_agents/agent.py +51 -201
  47. lfx/components/models_and_agents/embedding_model.py +185 -339
  48. lfx/components/models_and_agents/language_model.py +54 -318
  49. lfx/components/models_and_agents/mcp_component.py +58 -9
  50. lfx/components/ollama/ollama.py +9 -4
  51. lfx/components/ollama/ollama_embeddings.py +2 -1
  52. lfx/components/openai/openai_chat_model.py +1 -1
  53. lfx/components/processing/__init__.py +0 -3
  54. lfx/components/vllm/__init__.py +37 -0
  55. lfx/components/vllm/vllm.py +141 -0
  56. lfx/components/vllm/vllm_embeddings.py +110 -0
  57. lfx/custom/custom_component/custom_component.py +8 -6
  58. lfx/custom/directory_reader/directory_reader.py +5 -2
  59. lfx/graph/utils.py +64 -18
  60. lfx/inputs/__init__.py +2 -0
  61. lfx/inputs/input_mixin.py +54 -0
  62. lfx/inputs/inputs.py +115 -0
  63. lfx/interface/initialize/loading.py +42 -12
  64. lfx/io/__init__.py +2 -0
  65. lfx/run/__init__.py +5 -0
  66. lfx/run/base.py +494 -0
  67. lfx/schema/data.py +1 -1
  68. lfx/schema/image.py +28 -19
  69. lfx/schema/message.py +19 -3
  70. lfx/services/interfaces.py +5 -0
  71. lfx/services/manager.py +5 -4
  72. lfx/services/mcp_composer/service.py +45 -13
  73. lfx/services/settings/auth.py +18 -11
  74. lfx/services/settings/base.py +12 -24
  75. lfx/services/settings/constants.py +2 -0
  76. lfx/services/storage/local.py +37 -0
  77. lfx/services/storage/service.py +19 -0
  78. lfx/utils/constants.py +1 -0
  79. lfx/utils/image.py +29 -11
  80. lfx/utils/validate_cloud.py +14 -3
  81. {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/METADATA +5 -2
  82. {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/RECORD +84 -78
  83. lfx/components/processing/dataframe_to_toolset.py +0 -259
  84. {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/WHEEL +0 -0
  85. {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/entry_points.txt +0 -0
@@ -3,7 +3,7 @@ import signal
  import sys
  import traceback
  from contextlib import suppress
- from typing import TYPE_CHECKING
+ from functools import lru_cache

  from docling_core.types.doc import DoclingDocument
  from pydantic import BaseModel, SecretStr, TypeAdapter
@@ -12,9 +12,6 @@ from lfx.log.logger import logger
  from lfx.schema.data import Data
  from lfx.schema.dataframe import DataFrame

- if TYPE_CHECKING:
-     from langchain_core.language_models.chat_models import BaseChatModel
-

  class DoclingDependencyError(Exception):
      """Custom exception for missing Docling dependencies."""
@@ -25,21 +22,72 @@ class DoclingDependencyError(Exception):
          super().__init__(f"{dependency_name} is not correctly installed. {install_command}")


- def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_key: str) -> list[DoclingDocument]:
+ def extract_docling_documents(
+     data_inputs: Data | list[Data] | DataFrame, doc_key: str
+ ) -> tuple[list[DoclingDocument], str | None]:
+     """Extract DoclingDocument objects from data inputs.
+
+     Args:
+         data_inputs: The data inputs containing DoclingDocument objects
+         doc_key: The key/column name to look for DoclingDocument objects
+
+     Returns:
+         A tuple of (documents, warning_message) where warning_message is None if no warning
+
+     Raises:
+         TypeError: If the data cannot be extracted or is invalid
+     """
      documents: list[DoclingDocument] = []
+     warning_message: str | None = None
+
      if isinstance(data_inputs, DataFrame):
          if not len(data_inputs):
              msg = "DataFrame is empty"
              raise TypeError(msg)

-         if doc_key not in data_inputs.columns:
-             msg = f"Column '{doc_key}' not found in DataFrame"
-             raise TypeError(msg)
-         try:
-             documents = data_inputs[doc_key].tolist()
-         except Exception as e:
-             msg = f"Error extracting DoclingDocument from DataFrame: {e}"
-             raise TypeError(msg) from e
+         # Primary: Check for exact column name match
+         if doc_key in data_inputs.columns:
+             try:
+                 documents = data_inputs[doc_key].tolist()
+             except Exception as e:
+                 msg = f"Error extracting DoclingDocument from DataFrame column '{doc_key}': {e}"
+                 raise TypeError(msg) from e
+         else:
+             # Fallback: Search all columns for DoclingDocument objects
+             found_column = None
+             for col in data_inputs.columns:
+                 try:
+                     # Check if this column contains DoclingDocument objects
+                     sample = data_inputs[col].dropna().iloc[0] if len(data_inputs[col].dropna()) > 0 else None
+                     if sample is not None and isinstance(sample, DoclingDocument):
+                         found_column = col
+                         break
+                 except (IndexError, AttributeError):
+                     continue
+
+             if found_column:
+                 warning_message = (
+                     f"Column '{doc_key}' not found, but found DoclingDocument objects in column '{found_column}'. "
+                     f"Using '{found_column}' instead. Consider updating the 'Doc Key' parameter."
+                 )
+                 logger.warning(warning_message)
+                 try:
+                     documents = data_inputs[found_column].tolist()
+                 except Exception as e:
+                     msg = f"Error extracting DoclingDocument from DataFrame column '{found_column}': {e}"
+                     raise TypeError(msg) from e
+             else:
+                 # Provide helpful error message
+                 available_columns = list(data_inputs.columns)
+                 msg = (
+                     f"Column '{doc_key}' not found in DataFrame. "
+                     f"Available columns: {available_columns}. "
+                     f"\n\nPossible solutions:\n"
+                     f"1. Use the 'Data' output from Docling component instead of 'DataFrame' output\n"
+                     f"2. Update the 'Doc Key' parameter to match one of the available columns\n"
+                     f"3. If using VLM pipeline, try using the standard pipeline"
+                 )
+                 raise TypeError(msg)
      else:
          if not data_inputs:
              msg = "No data inputs provided"
@@ -69,7 +117,7 @@ def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_ke
          except AttributeError as e:
              msg = f"Invalid input type in collection: {e}"
              raise TypeError(msg) from e
-     return documents
+     return documents, warning_message


  def _unwrap_secrets(obj):
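
The extract_docling_documents change above turns the return value into a (documents, warning_message) tuple, so downstream components now have to unpack both values and surface the warning themselves. A minimal caller sketch, assuming the function is imported from lfx.base.data.docling_utils (per the file list above) and using hypothetical component attributes (self.data_inputs, self.doc_key, self.status):

    # Hypothetical caller: unpack the new (documents, warning) tuple and
    # surface the fallback-column warning on the component status.
    from lfx.base.data.docling_utils import extract_docling_documents

    def load_documents(self):
        docs, warning = extract_docling_documents(self.data_inputs, self.doc_key)
        if warning:
            self.status = warning  # e.g. the "Column ... not found, using ..." notice
        return docs
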
@@ -101,6 +149,81 @@ def _deserialize_pydantic_model(data: dict):
101
149
  return adapter.validate_python(data["config"])
102
150
 
103
151
 
152
+ # Global cache for DocumentConverter instances
153
+ # This cache persists across multiple runs and thread invocations
154
+ @lru_cache(maxsize=4)
155
+ def _get_cached_converter(
156
+ pipeline: str,
157
+ ocr_engine: str,
158
+ *,
159
+ do_picture_classification: bool,
160
+ pic_desc_config_hash: str | None,
161
+ ):
162
+ """Create and cache a DocumentConverter instance based on configuration.
163
+
164
+ This function uses LRU caching to maintain DocumentConverter instances in memory,
165
+ eliminating the 15-20 minute model loading time on subsequent runs.
166
+
167
+ Args:
168
+ pipeline: The pipeline type ("standard" or "vlm")
169
+ ocr_engine: The OCR engine to use
170
+ do_picture_classification: Whether to enable picture classification
171
+ pic_desc_config_hash: Hash of the picture description config (for cache key)
172
+
173
+ Returns:
174
+ A cached or newly created DocumentConverter instance
175
+ """
176
+ from docling.datamodel.base_models import InputFormat
177
+ from docling.datamodel.pipeline_options import OcrOptions, PdfPipelineOptions, VlmPipelineOptions
178
+ from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
179
+ from docling.models.factories import get_ocr_factory
180
+ from docling.pipeline.vlm_pipeline import VlmPipeline
181
+
182
+ logger.info(f"Creating DocumentConverter for pipeline={pipeline}, ocr_engine={ocr_engine}")
183
+
184
+ # Configure the standard PDF pipeline
185
+ def _get_standard_opts() -> PdfPipelineOptions:
186
+ pipeline_options = PdfPipelineOptions()
187
+ pipeline_options.do_ocr = ocr_engine not in {"", "None"}
188
+ if pipeline_options.do_ocr:
189
+ ocr_factory = get_ocr_factory(
190
+ allow_external_plugins=False,
191
+ )
192
+ ocr_options: OcrOptions = ocr_factory.create_options(
193
+ kind=ocr_engine,
194
+ )
195
+ pipeline_options.ocr_options = ocr_options
196
+
197
+ pipeline_options.do_picture_classification = do_picture_classification
198
+
199
+ # Note: pic_desc_config_hash is for cache key only
200
+ # Actual picture description is handled separately (non-cached path)
201
+ _ = pic_desc_config_hash # Mark as intentionally unused
202
+
203
+ return pipeline_options
204
+
205
+ # Configure the VLM pipeline
206
+ def _get_vlm_opts() -> VlmPipelineOptions:
207
+ return VlmPipelineOptions()
208
+
209
+ if pipeline == "standard":
210
+ pdf_format_option = PdfFormatOption(
211
+ pipeline_options=_get_standard_opts(),
212
+ )
213
+ elif pipeline == "vlm":
214
+ pdf_format_option = PdfFormatOption(pipeline_cls=VlmPipeline, pipeline_options=_get_vlm_opts())
215
+ else:
216
+ msg = f"Unknown pipeline: {pipeline!r}"
217
+ raise ValueError(msg)
218
+
219
+ format_options: dict[InputFormat, FormatOption] = {
220
+ InputFormat.PDF: pdf_format_option,
221
+ InputFormat.IMAGE: pdf_format_option,
222
+ }
223
+
224
+ return DocumentConverter(format_options=format_options)
225
+
226
+
104
227
  def docling_worker(
105
228
  *,
106
229
  file_paths: list[str],
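
The caching in _get_cached_converter above comes straight from functools.lru_cache hashing the call arguments: two calls with the same pipeline, OCR engine, and flags return the very same DocumentConverter object, while a different configuration builds and caches a new one, up to maxsize=4 entries. A self-contained toy sketch of that behavior (fake converter class, not the real docling types):

    # Toy illustration of the lru_cache pattern: identical arguments reuse
    # the cached instance, different arguments build a new one.
    from functools import lru_cache

    class FakeConverter:
        def __init__(self, pipeline: str, ocr_engine: str) -> None:
            self.pipeline = pipeline
            self.ocr_engine = ocr_engine

    @lru_cache(maxsize=4)
    def get_converter(pipeline: str, ocr_engine: str) -> FakeConverter:
        return FakeConverter(pipeline, ocr_engine)

    a = get_converter("standard", "easyocr")
    b = get_converter("standard", "easyocr")
    c = get_converter("vlm", "")
    assert a is b      # cache hit: the expensive object is reused across runs
    assert a is not c  # different config: a separate cached entry
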
@@ -111,7 +234,12 @@
      pic_desc_config: dict | None,
      pic_desc_prompt: str,
  ):
-     """Worker function for processing files with Docling in a separate process."""
+     """Worker function for processing files with Docling using threading.
+
+     This function now uses a globally cached DocumentConverter instance,
+     significantly reducing processing time on subsequent runs from 15-20 minutes
+     to just seconds.
+     """
      # Signal handling for graceful shutdown
      shutdown_requested = False

@@ -154,12 +282,12 @@
      check_shutdown()

      try:
-         from docling.datamodel.base_models import ConversionStatus, InputFormat
-         from docling.datamodel.pipeline_options import OcrOptions, PdfPipelineOptions, VlmPipelineOptions
-         from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
-         from docling.models.factories import get_ocr_factory
-         from docling.pipeline.vlm_pipeline import VlmPipeline
-         from langchain_docling.picture_description import PictureDescriptionLangChainOptions
+         from docling.datamodel.base_models import ConversionStatus, InputFormat  # noqa: F401
+         from docling.datamodel.pipeline_options import OcrOptions, PdfPipelineOptions, VlmPipelineOptions  # noqa: F401
+         from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption  # noqa: F401
+         from docling.models.factories import get_ocr_factory  # noqa: F401
+         from docling.pipeline.vlm_pipeline import VlmPipeline  # noqa: F401
+         from langchain_docling.picture_description import PictureDescriptionLangChainOptions  # noqa: F401

          # Check for shutdown after imports
          check_shutdown()
@@ -182,27 +310,34 @@
              queue.put({"error": "Worker interrupted during imports", "shutdown": True})
              return

-     # Configure the standard PDF pipeline
-     def _get_standard_opts() -> PdfPipelineOptions:
+     # Use cached converter instead of creating new one each time
+     # This is the key optimization that eliminates 15-20 minute model load times
+     def _get_converter() -> DocumentConverter:
          check_shutdown()  # Check before heavy operations

-         pipeline_options = PdfPipelineOptions()
-         pipeline_options.do_ocr = ocr_engine not in {"", "None"}
-         if pipeline_options.do_ocr:
-             ocr_factory = get_ocr_factory(
-                 allow_external_plugins=False,
-             )
-
-             ocr_options: OcrOptions = ocr_factory.create_options(
-                 kind=ocr_engine,
-             )
-             pipeline_options.ocr_options = ocr_options
-
-         pipeline_options.do_picture_classification = do_picture_classification
-
+         # For now, we don't support pic_desc_config caching due to serialization complexity
+         # This is a known limitation that can be addressed in a future enhancement
          if pic_desc_config:
-             pic_desc_llm: BaseChatModel = _deserialize_pydantic_model(pic_desc_config)
-
+             logger.warning(
+                 "Picture description with LLM is not yet supported with cached converters. "
+                 "Using non-cached converter for this request."
+             )
+             # Fall back to creating a new converter (old behavior)
+             from docling.datamodel.base_models import InputFormat
+             from docling.datamodel.pipeline_options import PdfPipelineOptions
+             from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+             from docling.models.factories import get_ocr_factory
+             from langchain_docling.picture_description import PictureDescriptionLangChainOptions
+
+             pipeline_options = PdfPipelineOptions()
+             pipeline_options.do_ocr = ocr_engine not in {"", "None"}
+             if pipeline_options.do_ocr:
+                 ocr_factory = get_ocr_factory(allow_external_plugins=False)
+                 ocr_options = ocr_factory.create_options(kind=ocr_engine)
+                 pipeline_options.ocr_options = ocr_options
+
+             pipeline_options.do_picture_classification = do_picture_classification
+             pic_desc_llm = _deserialize_pydantic_model(pic_desc_config)
              logger.info("Docling enabling the picture description stage.")
              pipeline_options.do_picture_description = True
              pipeline_options.allow_external_plugins = True
@@ -210,33 +345,24 @@
                  llm=pic_desc_llm,
                  prompt=pic_desc_prompt,
              )
-         return pipeline_options

-     # Configure the VLM pipeline
-     def _get_vlm_opts() -> VlmPipelineOptions:
-         check_shutdown()  # Check before heavy operations
-         return VlmPipelineOptions()
-
-     # Configure the main format options and create the DocumentConverter()
-     def _get_converter() -> DocumentConverter:
-         check_shutdown()  # Check before heavy operations
-
-         if pipeline == "standard":
-             pdf_format_option = PdfFormatOption(
-                 pipeline_options=_get_standard_opts(),
-             )
-         elif pipeline == "vlm":
-             pdf_format_option = PdfFormatOption(pipeline_cls=VlmPipeline, pipeline_options=_get_vlm_opts())
-         else:
-             msg = f"Unknown pipeline: {pipeline!r}"
-             raise ValueError(msg)
-
-         format_options: dict[InputFormat, FormatOption] = {
-             InputFormat.PDF: pdf_format_option,
-             InputFormat.IMAGE: pdf_format_option,
-         }
-
-         return DocumentConverter(format_options=format_options)
+             pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
+             format_options: dict[InputFormat, FormatOption] = {
+                 InputFormat.PDF: pdf_format_option,
+                 InputFormat.IMAGE: pdf_format_option,
+             }
+             return DocumentConverter(format_options=format_options)
+
+         # Use cached converter - this is where the magic happens!
+         # First run: creates and caches converter (15-20 min)
+         # Subsequent runs: reuses cached converter (seconds)
+         pic_desc_config_hash = None  # Will be None since we checked above
+         return _get_cached_converter(
+             pipeline=pipeline,
+             ocr_engine=ocr_engine,
+             do_picture_classification=do_picture_classification,
+             pic_desc_config_hash=pic_desc_config_hash,
+         )

      try:
          # Check for shutdown before creating converter (can be slow)
@@ -190,3 +190,112 @@ def file_exists(file_path: str, storage_service: StorageService | None = None) -
          return False
      else:
          return True
+
+
+ # Magic bytes signatures for common image formats
+ MIN_IMAGE_HEADER_SIZE = 12  # Minimum bytes needed to detect image type
+
+ IMAGE_SIGNATURES: dict[str, list[tuple[bytes, int]]] = {
+     "jpeg": [(b"\xff\xd8\xff", 0)],
+     "jpg": [(b"\xff\xd8\xff", 0)],
+     "png": [(b"\x89PNG\r\n\x1a\n", 0)],
+     "gif": [(b"GIF87a", 0), (b"GIF89a", 0)],
+     "webp": [(b"RIFF", 0)],  # WebP starts with RIFF, then has WEBP at offset 8
+     "bmp": [(b"BM", 0)],
+     "tiff": [(b"II*\x00", 0), (b"MM\x00*", 0)],  # Little-endian and big-endian TIFF
+ }
+
+
+ def detect_image_type_from_bytes(content: bytes) -> str | None:
+     """Detect the actual image type from file content using magic bytes.
+
+     Args:
+         content: The file content bytes (at least first 12 bytes needed)
+
+     Returns:
+         str | None: The detected image type (e.g., "jpeg", "png") or None if not recognized
+     """
+     if len(content) < MIN_IMAGE_HEADER_SIZE:
+         return None
+
+     # Check WebP specifically (needs to check both RIFF and WEBP)
+     if content[:4] == b"RIFF" and content[8:12] == b"WEBP":
+         return "webp"
+
+     # Check other image signatures
+     for image_type, signatures in IMAGE_SIGNATURES.items():
+         if image_type == "webp":
+             continue  # Already handled above
+         for signature, offset in signatures:
+             if content[offset : offset + len(signature)] == signature:
+                 return image_type
+
+     return None
+
+
+ def validate_image_content_type(
+     file_path: str,
+     content: bytes | None = None,
+     storage_service: StorageService | None = None,
+     resolve_path: Callable[[str], str] | None = None,
+ ) -> tuple[bool, str | None]:
+     """Validate that an image file's content matches its declared extension.
+
+     This prevents errors like "Image does not match the provided media type image/png"
+     when a JPEG file is saved with a .png extension.
+
+     Only rejects files when we can definitively detect a mismatch. Files with
+     unrecognized content are allowed through (they may fail later, but that's
+     better than false positives blocking valid files).
+
+     Args:
+         file_path: Path to the image file
+         content: Optional pre-read file content bytes. If not provided, will read from file.
+         storage_service: Optional storage service instance for S3 files
+         resolve_path: Optional function to resolve relative paths
+
+     Returns:
+         tuple[bool, str | None]: (is_valid, error_message)
+         - (True, None) if the content matches the extension, is unrecognized, or file is not an image
+         - (False, error_message) if there's a definite mismatch
+     """
+     # Get the file extension
+     path_obj = Path(file_path)
+     extension = path_obj.suffix[1:].lower() if path_obj.suffix else ""
+
+     # Only validate image files
+     image_extensions = {"jpeg", "jpg", "png", "gif", "webp", "bmp", "tiff"}
+     if extension not in image_extensions:
+         return True, None
+
+     # Read content if not provided
+     if content is None:
+         try:
+             content = run_until_complete(read_file_bytes(file_path, storage_service, resolve_path))
+         except (FileNotFoundError, ValueError):
+             # Can't read file - let it pass, will fail later with better error
+             return True, None
+
+     # Detect actual image type
+     detected_type = detect_image_type_from_bytes(content)
+
+     # If we can't detect the type, the file is not a valid image
+     if detected_type is None:
+         return False, (
+             f"File '{path_obj.name}' has extension '.{extension}' but its content "
+             f"is not a valid image format. The file may be corrupted, empty, or not a real image."
+         )
+
+     # Normalize extensions for comparison (jpg == jpeg, tif == tiff)
+     extension_normalized = "jpeg" if extension == "jpg" else extension
+     detected_normalized = "jpeg" if detected_type == "jpg" else detected_type
+
+     if extension_normalized != detected_normalized:
+         return False, (
+             f"File '{path_obj.name}' has extension '.{extension}' but contains "
+             f"'{detected_type.upper()}' image data. This mismatch will cause API errors. "
+             f"Please rename the file with the correct extension '.{detected_type}' or "
+             f"re-save it in the correct format."
+         )
+
+     return True, None
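
The new helpers above only need the first few bytes of a file, so a wrong extension can be caught before the bytes ever reach an upload or a model API. A standalone sketch of the same magic-bytes idea (re-implemented here for illustration, not imported from the new module):

    # Standalone sketch: a file named photo.png that actually contains JPEG
    # data is flagged before it causes an "image/png does not match" API error.
    def sniff_image_type(content: bytes) -> str | None:
        if content.startswith(b"\x89PNG\r\n\x1a\n"):
            return "png"
        if content.startswith(b"\xff\xd8\xff"):
            return "jpeg"
        if content[:4] == b"RIFF" and content[8:12] == b"WEBP":
            return "webp"
        return None

    jpeg_bytes = b"\xff\xd8\xff\xe0" + b"\x00" * 16  # JPEG/JFIF-style header
    declared = "png"                                 # what the filename claims
    detected = sniff_image_type(jpeg_bytes)
    if detected and detected != declared:
        print(f"mismatch: extension .{declared} but content is {detected}")
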
@@ -187,34 +187,38 @@ class AstraDBBaseComponent(Component):
      @classmethod
      def map_cloud_providers(cls, token: str, environment: str | None = None) -> dict[str, dict[str, Any]]:
          """Fetch all available cloud providers and regions."""
-         # Get the admin object
-         client = DataAPIClient(environment=cls.get_environment(environment))
-         admin_client = client.get_admin(token=token)
-
-         # Get the list of available regions
-         available_regions = admin_client.find_available_regions(only_org_enabled_regions=True)
+         try:
+             # Get the admin object
+             client = DataAPIClient(environment=cls.get_environment(environment))
+             admin_client = client.get_admin(token=token)

-         provider_mapping: dict[str, dict[str, str]] = {
-             "AWS": {"name": "Amazon Web Services", "id": "aws"},
-             "GCP": {"name": "Google Cloud Platform", "id": "gcp"},
-             "Azure": {"name": "Microsoft Azure", "id": "azure"},
-         }
+             # Get the list of available regions
+             available_regions = admin_client.find_available_regions(only_org_enabled_regions=True)

-         result: dict[str, dict[str, Any]] = {}
-         for region_info in available_regions:
-             cloud_provider = region_info.cloud_provider
-             region = region_info.name
+             provider_mapping: dict[str, dict[str, str]] = {
+                 "AWS": {"name": "Amazon Web Services", "id": "aws"},
+                 "GCP": {"name": "Google Cloud Platform", "id": "gcp"},
+                 "Azure": {"name": "Microsoft Azure", "id": "azure"},
+             }

-             if cloud_provider in provider_mapping:
-                 provider_name = provider_mapping[cloud_provider]["name"]
-                 provider_id = provider_mapping[cloud_provider]["id"]
+             result: dict[str, dict[str, Any]] = {}
+             for region_info in available_regions:
+                 cloud_provider = region_info.cloud_provider
+                 region = region_info.name

-                 if provider_name not in result:
-                     result[provider_name] = {"id": provider_id, "regions": []}
+                 if cloud_provider in provider_mapping:
+                     provider_name = provider_mapping[cloud_provider]["name"]
+                     provider_id = provider_mapping[cloud_provider]["id"]

-                 result[provider_name]["regions"].append(region)
+                     if provider_name not in result:
+                         result[provider_name] = {"id": provider_id, "regions": []}

-         return result
+                     result[provider_name]["regions"].append(region)
+         except Exception as e:  # noqa: BLE001
+             logger.debug("Error fetching cloud providers: %s", e)
+             return {}
+         else:
+             return result

      @classmethod
      def get_vectorize_providers(cls, token: str, environment: str | None = None, api_endpoint: str | None = None):
@@ -327,48 +331,52 @@

      @classmethod
      def get_database_list_static(cls, token: str, environment: str | None = None):
-         environment = cls.get_environment(environment)
-         client = DataAPIClient(environment=environment)
-
-         # Get the admin object
-         admin_client = client.get_admin(token=token)
+         try:
+             environment = cls.get_environment(environment)
+             client = DataAPIClient(environment=environment)

-         # Get the list of databases
-         db_list = admin_client.list_databases()
+             # Get the admin object
+             admin_client = client.get_admin(token=token)

-         # Generate the api endpoint for each database
-         db_info_dict = {}
-         for db in db_list:
-             try:
-                 # Get the API endpoint for the database
-                 api_endpoints = [db_reg.api_endpoint for db_reg in db.regions]
+             # Get the list of databases
+             db_list = admin_client.list_databases()

-                 # Get the number of collections
+             # Generate the api endpoint for each database
+             db_info_dict = {}
+             for db in db_list:
                  try:
-                     # Get the number of collections in the database
-                     num_collections = len(
-                         client.get_database(
-                             api_endpoints[0],
-                             token=token,
-                         ).list_collection_names()
-                     )
-                 except Exception:  # noqa: BLE001
-                     if db.status != "PENDING":
-                         continue
-                     num_collections = 0
-
-                 # Add the database to the dictionary
-                 db_info_dict[db.name] = {
-                     "api_endpoints": api_endpoints,
-                     "keyspaces": db.keyspaces,
-                     "collections": num_collections,
-                     "status": db.status if db.status != "ACTIVE" else None,
-                     "org_id": db.org_id if db.org_id else None,
-                 }
-             except Exception as e:  # noqa: BLE001
-                 logger.debug("Failed to get metadata for database %s: %s", db.name, e)
-
-         return db_info_dict
+                     # Get the API endpoint for the database
+                     api_endpoints = [db_reg.api_endpoint for db_reg in db.regions]
+
+                     # Get the number of collections
+                     try:
+                         # Get the number of collections in the database
+                         num_collections = len(
+                             client.get_database(
+                                 api_endpoints[0],
+                                 token=token,
+                             ).list_collection_names()
+                         )
+                     except Exception:  # noqa: BLE001
+                         if db.status != "PENDING":
+                             continue
+                         num_collections = 0
+
+                     # Add the database to the dictionary
+                     db_info_dict[db.name] = {
+                         "api_endpoints": api_endpoints,
+                         "keyspaces": db.keyspaces,
+                         "collections": num_collections,
+                         "status": db.status if db.status != "ACTIVE" else None,
+                         "org_id": db.org_id if db.org_id else None,
+                     }
+                 except Exception as e:  # noqa: BLE001
+                     logger.debug("Failed to get metadata for database %s: %s", db.name, e)
+         except Exception as e:  # noqa: BLE001
+             logger.debug("Error fetching database list: %s", e)
+             return {}
+         else:
+             return db_info_dict

      def get_database_list(self):
          return self.get_database_list_static(
@@ -467,6 +475,9 @@

      def _initialize_database_options(self):
          try:
+             db_list = self.get_database_list()
+             if not db_list:
+                 return []
              return [
                  {
                      "name": name,
@@ -476,11 +487,11 @@
                      "keyspaces": info["keyspaces"],
                      "org_id": info["org_id"],
                  }
-                 for name, info in self.get_database_list().items()
+                 for name, info in db_list.items()
              ]
-         except Exception as e:
-             msg = f"Error fetching database options: {e}"
-             raise ValueError(msg) from e
+         except Exception as e:  # noqa: BLE001
+             logger.debug("Error fetching database options: %s", e)
+             return []

      @classmethod
      def get_provider_icon(cls, collection=None, provider_name: str | None = None) -> str:
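
The Astra DB changes above follow one pattern: helpers that hit the Data API now log and return an empty container on failure, and the option builder checks for the empty result instead of letting an exception break the component build. A toy sketch of that degrade-gracefully pattern (names are illustrative, not the real component API):

    # Toy sketch of the "return empty on failure" pattern used above.
    def remote_lookup(token: str) -> dict[str, dict]:
        raise ConnectionError("simulated network failure")  # stand-in for the Data API call

    def fetch_databases(token: str) -> dict[str, dict]:
        try:
            return remote_lookup(token)
        except Exception:  # broad on purpose: UI option lists must not crash
            return {}

    def build_options(token: str) -> list[dict]:
        db_list = fetch_databases(token)
        if not db_list:
            return []  # empty dropdown instead of a hard error
        return [{"name": name, **info} for name, info in db_list.items()]

    print(build_options("AstraCS:..."))  # -> []
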
lfx/base/mcp/util.py CHANGED
@@ -23,6 +23,7 @@ from pydantic import BaseModel
  from lfx.log.logger import logger
  from lfx.schema.json_schema import create_input_schema_from_json_schema
  from lfx.services.deps import get_settings_service
+ from lfx.utils.async_helpers import run_until_complete

  HTTP_ERROR_STATUS_CODE = httpx_codes.BAD_REQUEST  # HTTP status code for client errors

@@ -351,8 +352,7 @@ def create_tool_func(tool_name: str, arg_schema: type[BaseModel], client) -> Cal
              _handle_tool_validation_error(e, tool_name, provided_args, arg_schema)

          try:
-             loop = asyncio.get_event_loop()
-             return loop.run_until_complete(client.run_tool(tool_name, arguments=validated.model_dump()))
+             return run_until_complete(client.run_tool(tool_name, arguments=validated.model_dump()))
          except Exception as e:
              logger.error(f"Tool '{tool_name}' execution failed: {e}")
              # Re-raise with more context
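
The mcp/util.py change above replaces asyncio.get_event_loop().run_until_complete, which fails when the current thread has no event loop and cannot be used while a loop is already running, with the shared lfx.utils.async_helpers.run_until_complete. The diff does not show that helper's body; a common shape for such a helper, offered as an assumption rather than the actual implementation, looks like this:

    # Hedged sketch of a run-coroutine-from-sync helper; the real
    # lfx.utils.async_helpers.run_until_complete may differ.
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    def run_until_complete(coro):
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop in this thread: start one just for this coroutine.
            return asyncio.run(coro)
        # A loop is already running (e.g. inside the server): execute the
        # coroutine on a worker thread with its own loop to avoid
        # "this event loop is already running" errors.
        with ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, coro).result()
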
@@ -1,3 +1,13 @@
  from .model import LCModelComponent
+ from .unified_models import (
+     get_model_provider_variable_mapping,
+     get_model_providers,
+     get_unified_models_detailed,
+ )

- __all__ = ["LCModelComponent"]
+ __all__ = [
+     "LCModelComponent",
+     "get_model_provider_variable_mapping",
+     "get_model_providers",
+     "get_unified_models_detailed",
+ ]