lfx-nightly 0.2.0.dev26__py3-none-any.whl → 0.2.1.dev7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lfx/_assets/component_index.json +1 -1
- lfx/base/agents/agent.py +9 -4
- lfx/base/agents/altk_base_agent.py +16 -3
- lfx/base/agents/altk_tool_wrappers.py +1 -1
- lfx/base/agents/utils.py +4 -0
- lfx/base/composio/composio_base.py +78 -41
- lfx/base/data/base_file.py +14 -4
- lfx/base/data/cloud_storage_utils.py +156 -0
- lfx/base/data/docling_utils.py +191 -65
- lfx/base/data/storage_utils.py +109 -0
- lfx/base/datastax/astradb_base.py +75 -64
- lfx/base/mcp/util.py +2 -2
- lfx/base/models/__init__.py +11 -1
- lfx/base/models/anthropic_constants.py +21 -12
- lfx/base/models/google_generative_ai_constants.py +33 -9
- lfx/base/models/model_metadata.py +6 -0
- lfx/base/models/ollama_constants.py +196 -30
- lfx/base/models/openai_constants.py +37 -10
- lfx/base/models/unified_models.py +1123 -0
- lfx/base/models/watsonx_constants.py +36 -0
- lfx/base/tools/component_tool.py +2 -9
- lfx/cli/commands.py +6 -1
- lfx/cli/run.py +65 -409
- lfx/cli/script_loader.py +13 -3
- lfx/components/__init__.py +0 -3
- lfx/components/composio/github_composio.py +1 -1
- lfx/components/cuga/cuga_agent.py +39 -27
- lfx/components/data_source/api_request.py +4 -2
- lfx/components/docling/__init__.py +45 -11
- lfx/components/docling/chunk_docling_document.py +3 -1
- lfx/components/docling/docling_inline.py +39 -49
- lfx/components/docling/export_docling_document.py +3 -1
- lfx/components/elastic/opensearch_multimodal.py +215 -57
- lfx/components/files_and_knowledge/file.py +439 -39
- lfx/components/files_and_knowledge/ingestion.py +8 -0
- lfx/components/files_and_knowledge/retrieval.py +10 -0
- lfx/components/files_and_knowledge/save_file.py +123 -53
- lfx/components/ibm/watsonx.py +7 -1
- lfx/components/input_output/chat_output.py +7 -1
- lfx/components/langchain_utilities/tool_calling.py +14 -6
- lfx/components/llm_operations/batch_run.py +80 -25
- lfx/components/llm_operations/lambda_filter.py +33 -6
- lfx/components/llm_operations/llm_conditional_router.py +39 -7
- lfx/components/llm_operations/structured_output.py +38 -12
- lfx/components/models/__init__.py +16 -74
- lfx/components/models_and_agents/agent.py +51 -201
- lfx/components/models_and_agents/embedding_model.py +185 -339
- lfx/components/models_and_agents/language_model.py +54 -318
- lfx/components/models_and_agents/mcp_component.py +58 -9
- lfx/components/ollama/ollama.py +9 -4
- lfx/components/ollama/ollama_embeddings.py +2 -1
- lfx/components/openai/openai_chat_model.py +1 -1
- lfx/components/processing/__init__.py +0 -3
- lfx/components/vllm/__init__.py +37 -0
- lfx/components/vllm/vllm.py +141 -0
- lfx/components/vllm/vllm_embeddings.py +110 -0
- lfx/custom/custom_component/custom_component.py +8 -6
- lfx/custom/directory_reader/directory_reader.py +5 -2
- lfx/graph/utils.py +64 -18
- lfx/inputs/__init__.py +2 -0
- lfx/inputs/input_mixin.py +54 -0
- lfx/inputs/inputs.py +115 -0
- lfx/interface/initialize/loading.py +42 -12
- lfx/io/__init__.py +2 -0
- lfx/run/__init__.py +5 -0
- lfx/run/base.py +494 -0
- lfx/schema/data.py +1 -1
- lfx/schema/image.py +28 -19
- lfx/schema/message.py +19 -3
- lfx/services/interfaces.py +5 -0
- lfx/services/manager.py +5 -4
- lfx/services/mcp_composer/service.py +45 -13
- lfx/services/settings/auth.py +18 -11
- lfx/services/settings/base.py +12 -24
- lfx/services/settings/constants.py +2 -0
- lfx/services/storage/local.py +37 -0
- lfx/services/storage/service.py +19 -0
- lfx/utils/constants.py +1 -0
- lfx/utils/image.py +29 -11
- lfx/utils/validate_cloud.py +14 -3
- {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/METADATA +5 -2
- {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/RECORD +84 -78
- lfx/components/processing/dataframe_to_toolset.py +0 -259
- {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/WHEEL +0 -0
- {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/entry_points.txt +0 -0
lfx/base/data/docling_utils.py
CHANGED
@@ -3,7 +3,7 @@ import signal
 import sys
 import traceback
 from contextlib import suppress
-from …
+from functools import lru_cache
 
 from docling_core.types.doc import DoclingDocument
 from pydantic import BaseModel, SecretStr, TypeAdapter
@@ -12,9 +12,6 @@ from lfx.log.logger import logger
 from lfx.schema.data import Data
 from lfx.schema.dataframe import DataFrame
 
-if TYPE_CHECKING:
-    from langchain_core.language_models.chat_models import BaseChatModel
-
 
 class DoclingDependencyError(Exception):
     """Custom exception for missing Docling dependencies."""
@@ -25,21 +22,72 @@ class DoclingDependencyError(Exception):
         super().__init__(f"{dependency_name} is not correctly installed. {install_command}")
 
 
-def extract_docling_documents(…
+def extract_docling_documents(
+    data_inputs: Data | list[Data] | DataFrame, doc_key: str
+) -> tuple[list[DoclingDocument], str | None]:
+    """Extract DoclingDocument objects from data inputs.
+
+    Args:
+        data_inputs: The data inputs containing DoclingDocument objects
+        doc_key: The key/column name to look for DoclingDocument objects
+
+    Returns:
+        A tuple of (documents, warning_message) where warning_message is None if no warning
+
+    Raises:
+        TypeError: If the data cannot be extracted or is invalid
+    """
     documents: list[DoclingDocument] = []
+    warning_message: str | None = None
+
     if isinstance(data_inputs, DataFrame):
         if not len(data_inputs):
             msg = "DataFrame is empty"
             raise TypeError(msg)
 
-… (8 removed lines not captured in this view)
+        # Primary: Check for exact column name match
+        if doc_key in data_inputs.columns:
+            try:
+                documents = data_inputs[doc_key].tolist()
+            except Exception as e:
+                msg = f"Error extracting DoclingDocument from DataFrame column '{doc_key}': {e}"
+                raise TypeError(msg) from e
+        else:
+            # Fallback: Search all columns for DoclingDocument objects
+            found_column = None
+            for col in data_inputs.columns:
+                try:
+                    # Check if this column contains DoclingDocument objects
+                    sample = data_inputs[col].dropna().iloc[0] if len(data_inputs[col].dropna()) > 0 else None
+                    if sample is not None and isinstance(sample, DoclingDocument):
+                        found_column = col
+                        break
+                except (IndexError, AttributeError):
+                    continue
+
+            if found_column:
+                warning_message = (
+                    f"Column '{doc_key}' not found, but found DoclingDocument objects in column '{found_column}'. "
+                    f"Using '{found_column}' instead. Consider updating the 'Doc Key' parameter."
+                )
+                logger.warning(warning_message)
+                try:
+                    documents = data_inputs[found_column].tolist()
+                except Exception as e:
+                    msg = f"Error extracting DoclingDocument from DataFrame column '{found_column}': {e}"
+                    raise TypeError(msg) from e
+            else:
+                # Provide helpful error message
+                available_columns = list(data_inputs.columns)
+                msg = (
+                    f"Column '{doc_key}' not found in DataFrame. "
+                    f"Available columns: {available_columns}. "
+                    f"\n\nPossible solutions:\n"
+                    f"1. Use the 'Data' output from Docling component instead of 'DataFrame' output\n"
+                    f"2. Update the 'Doc Key' parameter to match one of the available columns\n"
+                    f"3. If using VLM pipeline, try using the standard pipeline"
+                )
+                raise TypeError(msg)
     else:
         if not data_inputs:
             msg = "No data inputs provided"
@@ -69,7 +117,7 @@ def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_ke
         except AttributeError as e:
             msg = f"Invalid input type in collection: {e}"
             raise TypeError(msg) from e
-    return documents
+    return documents, warning_message
 
 
 def _unwrap_secrets(obj):
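The rewritten `extract_docling_documents` now returns a `(documents, warning_message)` tuple and, when `doc_key` is missing, falls back to scanning every DataFrame column for `DoclingDocument` objects. A minimal caller sketch, assuming the lfx and docling packages are installed; the empty `DoclingDocument` is built directly only to exercise the API:

```python
from docling_core.types.doc import DoclingDocument

from lfx.base.data.docling_utils import extract_docling_documents
from lfx.schema.dataframe import DataFrame

doc = DoclingDocument(name="example")   # empty document, illustration only
df = DataFrame({"document": [doc]})     # column name differs from the requested doc_key

# Callers must now unpack two values; the fallback scan finds the "document" column.
documents, warning = extract_docling_documents(df, doc_key="doc")
if warning:
    print(warning)                      # explains which column was used instead
print(len(documents))                   # -> 1
```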
@@ -101,6 +149,81 @@ def _deserialize_pydantic_model(data: dict):
     return adapter.validate_python(data["config"])
 
 
+# Global cache for DocumentConverter instances
+# This cache persists across multiple runs and thread invocations
+@lru_cache(maxsize=4)
+def _get_cached_converter(
+    pipeline: str,
+    ocr_engine: str,
+    *,
+    do_picture_classification: bool,
+    pic_desc_config_hash: str | None,
+):
+    """Create and cache a DocumentConverter instance based on configuration.
+
+    This function uses LRU caching to maintain DocumentConverter instances in memory,
+    eliminating the 15-20 minute model loading time on subsequent runs.
+
+    Args:
+        pipeline: The pipeline type ("standard" or "vlm")
+        ocr_engine: The OCR engine to use
+        do_picture_classification: Whether to enable picture classification
+        pic_desc_config_hash: Hash of the picture description config (for cache key)
+
+    Returns:
+        A cached or newly created DocumentConverter instance
+    """
+    from docling.datamodel.base_models import InputFormat
+    from docling.datamodel.pipeline_options import OcrOptions, PdfPipelineOptions, VlmPipelineOptions
+    from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+    from docling.models.factories import get_ocr_factory
+    from docling.pipeline.vlm_pipeline import VlmPipeline
+
+    logger.info(f"Creating DocumentConverter for pipeline={pipeline}, ocr_engine={ocr_engine}")
+
+    # Configure the standard PDF pipeline
+    def _get_standard_opts() -> PdfPipelineOptions:
+        pipeline_options = PdfPipelineOptions()
+        pipeline_options.do_ocr = ocr_engine not in {"", "None"}
+        if pipeline_options.do_ocr:
+            ocr_factory = get_ocr_factory(
+                allow_external_plugins=False,
+            )
+            ocr_options: OcrOptions = ocr_factory.create_options(
+                kind=ocr_engine,
+            )
+            pipeline_options.ocr_options = ocr_options
+
+        pipeline_options.do_picture_classification = do_picture_classification
+
+        # Note: pic_desc_config_hash is for cache key only
+        # Actual picture description is handled separately (non-cached path)
+        _ = pic_desc_config_hash  # Mark as intentionally unused
+
+        return pipeline_options
+
+    # Configure the VLM pipeline
+    def _get_vlm_opts() -> VlmPipelineOptions:
+        return VlmPipelineOptions()
+
+    if pipeline == "standard":
+        pdf_format_option = PdfFormatOption(
+            pipeline_options=_get_standard_opts(),
+        )
+    elif pipeline == "vlm":
+        pdf_format_option = PdfFormatOption(pipeline_cls=VlmPipeline, pipeline_options=_get_vlm_opts())
+    else:
+        msg = f"Unknown pipeline: {pipeline!r}"
+        raise ValueError(msg)
+
+    format_options: dict[InputFormat, FormatOption] = {
+        InputFormat.PDF: pdf_format_option,
+        InputFormat.IMAGE: pdf_format_option,
+    }
+
+    return DocumentConverter(format_options=format_options)
+
+
 def docling_worker(
     *,
     file_paths: list[str],
@@ -111,7 +234,12 @@ def docling_worker(
     pic_desc_config: dict | None,
     pic_desc_prompt: str,
 ):
-    """Worker function for processing files with Docling…
+    """Worker function for processing files with Docling using threading.
+
+    This function now uses a globally cached DocumentConverter instance,
+    significantly reducing processing time on subsequent runs from 15-20 minutes
+    to just seconds.
+    """
     # Signal handling for graceful shutdown
     shutdown_requested = False
 
@@ -154,12 +282,12 @@
         check_shutdown()
 
         try:
-            from docling.datamodel.base_models import ConversionStatus, InputFormat
-            from docling.datamodel.pipeline_options import OcrOptions, PdfPipelineOptions, VlmPipelineOptions
-            from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
-            from docling.models.factories import get_ocr_factory
-            from docling.pipeline.vlm_pipeline import VlmPipeline
-            from langchain_docling.picture_description import PictureDescriptionLangChainOptions
+            from docling.datamodel.base_models import ConversionStatus, InputFormat  # noqa: F401
+            from docling.datamodel.pipeline_options import OcrOptions, PdfPipelineOptions, VlmPipelineOptions  # noqa: F401
+            from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption  # noqa: F401
+            from docling.models.factories import get_ocr_factory  # noqa: F401
+            from docling.pipeline.vlm_pipeline import VlmPipeline  # noqa: F401
+            from langchain_docling.picture_description import PictureDescriptionLangChainOptions  # noqa: F401
 
             # Check for shutdown after imports
             check_shutdown()
@@ -182,27 +310,34 @@
             queue.put({"error": "Worker interrupted during imports", "shutdown": True})
             return
 
-… (removed lines not fully captured in this view)
+    # Use cached converter instead of creating new one each time
+    # This is the key optimization that eliminates 15-20 minute model load times
+    def _get_converter() -> DocumentConverter:
         check_shutdown()  # Check before heavy operations
 
-        if pipeline_options.do_ocr:
-            ocr_factory = get_ocr_factory(
-                allow_external_plugins=False,
-            )
-            ocr_options: OcrOptions = ocr_factory.create_options(
-                kind=ocr_engine,
-            )
-            pipeline_options.ocr_options = ocr_options
-
-        pipeline_options.do_picture_classification = do_picture_classification
-
+        # For now, we don't support pic_desc_config caching due to serialization complexity
+        # This is a known limitation that can be addressed in a future enhancement
         if pic_desc_config:
+            logger.warning(
+                "Picture description with LLM is not yet supported with cached converters. "
+                "Using non-cached converter for this request."
+            )
+            # Fall back to creating a new converter (old behavior)
+            from docling.datamodel.base_models import InputFormat
+            from docling.datamodel.pipeline_options import PdfPipelineOptions
+            from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+            from docling.models.factories import get_ocr_factory
+            from langchain_docling.picture_description import PictureDescriptionLangChainOptions
+
+            pipeline_options = PdfPipelineOptions()
+            pipeline_options.do_ocr = ocr_engine not in {"", "None"}
+            if pipeline_options.do_ocr:
+                ocr_factory = get_ocr_factory(allow_external_plugins=False)
+                ocr_options = ocr_factory.create_options(kind=ocr_engine)
+                pipeline_options.ocr_options = ocr_options
+
+            pipeline_options.do_picture_classification = do_picture_classification
             pic_desc_llm = _deserialize_pydantic_model(pic_desc_config)
             logger.info("Docling enabling the picture description stage.")
             pipeline_options.do_picture_description = True
             pipeline_options.allow_external_plugins = True
@@ -210,33 +345,24 @@
                 llm=pic_desc_llm,
                 prompt=pic_desc_prompt,
             )
-        return pipeline_options
 
-… (18 removed lines not captured in this view)
-            raise ValueError(msg)
-
-        format_options: dict[InputFormat, FormatOption] = {
-            InputFormat.PDF: pdf_format_option,
-            InputFormat.IMAGE: pdf_format_option,
-        }
-
-        return DocumentConverter(format_options=format_options)
+            pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
+            format_options: dict[InputFormat, FormatOption] = {
+                InputFormat.PDF: pdf_format_option,
+                InputFormat.IMAGE: pdf_format_option,
+            }
+            return DocumentConverter(format_options=format_options)
+
+        # Use cached converter - this is where the magic happens!
+        # First run: creates and caches converter (15-20 min)
+        # Subsequent runs: reuses cached converter (seconds)
+        pic_desc_config_hash = None  # Will be None since we checked above
+        return _get_cached_converter(
+            pipeline=pipeline,
+            ocr_engine=ocr_engine,
+            do_picture_classification=do_picture_classification,
+            pic_desc_config_hash=pic_desc_config_hash,
+        )
 
     try:
         # Check for shutdown before creating converter (can be slow)
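The hunk above keys `_get_cached_converter` on four hashable arguments via `functools.lru_cache(maxsize=4)`, so a second run with the same pipeline and OCR settings reuses the already-built `DocumentConverter` instead of reloading models. A self-contained sketch of that caching pattern with a stand-in for the slow constructor (the engine name and the sleep are illustrative only, not part of the diff):

```python
from functools import lru_cache
import time

@lru_cache(maxsize=4)  # same decorator the diff applies to _get_cached_converter
def get_converter(pipeline: str, ocr_engine: str, *, do_picture_classification: bool):
    time.sleep(2)       # stand-in for the slow model loading a real DocumentConverter does
    return object()     # stand-in for the converter instance

first = get_converter("standard", "easyocr", do_picture_classification=False)   # slow first call
second = get_converter("standard", "easyocr", do_picture_classification=False)  # cache hit
assert first is second
print(get_converter.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=4, currsize=1)
```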
lfx/base/data/storage_utils.py
CHANGED
@@ -190,3 +190,112 @@ def file_exists(file_path: str, storage_service: StorageService | None = None) -
         return False
     else:
         return True
+
+
+# Magic bytes signatures for common image formats
+MIN_IMAGE_HEADER_SIZE = 12  # Minimum bytes needed to detect image type
+
+IMAGE_SIGNATURES: dict[str, list[tuple[bytes, int]]] = {
+    "jpeg": [(b"\xff\xd8\xff", 0)],
+    "jpg": [(b"\xff\xd8\xff", 0)],
+    "png": [(b"\x89PNG\r\n\x1a\n", 0)],
+    "gif": [(b"GIF87a", 0), (b"GIF89a", 0)],
+    "webp": [(b"RIFF", 0)],  # WebP starts with RIFF, then has WEBP at offset 8
+    "bmp": [(b"BM", 0)],
+    "tiff": [(b"II*\x00", 0), (b"MM\x00*", 0)],  # Little-endian and big-endian TIFF
+}
+
+
+def detect_image_type_from_bytes(content: bytes) -> str | None:
+    """Detect the actual image type from file content using magic bytes.
+
+    Args:
+        content: The file content bytes (at least first 12 bytes needed)
+
+    Returns:
+        str | None: The detected image type (e.g., "jpeg", "png") or None if not recognized
+    """
+    if len(content) < MIN_IMAGE_HEADER_SIZE:
+        return None
+
+    # Check WebP specifically (needs to check both RIFF and WEBP)
+    if content[:4] == b"RIFF" and content[8:12] == b"WEBP":
+        return "webp"
+
+    # Check other image signatures
+    for image_type, signatures in IMAGE_SIGNATURES.items():
+        if image_type == "webp":
+            continue  # Already handled above
+        for signature, offset in signatures:
+            if content[offset : offset + len(signature)] == signature:
+                return image_type
+
+    return None
+
+
+def validate_image_content_type(
+    file_path: str,
+    content: bytes | None = None,
+    storage_service: StorageService | None = None,
+    resolve_path: Callable[[str], str] | None = None,
+) -> tuple[bool, str | None]:
+    """Validate that an image file's content matches its declared extension.
+
+    This prevents errors like "Image does not match the provided media type image/png"
+    when a JPEG file is saved with a .png extension.
+
+    Only rejects files when we can definitively detect a mismatch. Files with
+    unrecognized content are allowed through (they may fail later, but that's
+    better than false positives blocking valid files).
+
+    Args:
+        file_path: Path to the image file
+        content: Optional pre-read file content bytes. If not provided, will read from file.
+        storage_service: Optional storage service instance for S3 files
+        resolve_path: Optional function to resolve relative paths
+
+    Returns:
+        tuple[bool, str | None]: (is_valid, error_message)
+        - (True, None) if the content matches the extension, is unrecognized, or file is not an image
+        - (False, error_message) if there's a definite mismatch
+    """
+    # Get the file extension
+    path_obj = Path(file_path)
+    extension = path_obj.suffix[1:].lower() if path_obj.suffix else ""
+
+    # Only validate image files
+    image_extensions = {"jpeg", "jpg", "png", "gif", "webp", "bmp", "tiff"}
+    if extension not in image_extensions:
+        return True, None
+
+    # Read content if not provided
+    if content is None:
+        try:
+            content = run_until_complete(read_file_bytes(file_path, storage_service, resolve_path))
+        except (FileNotFoundError, ValueError):
+            # Can't read file - let it pass, will fail later with better error
+            return True, None
+
+    # Detect actual image type
+    detected_type = detect_image_type_from_bytes(content)
+
+    # If we can't detect the type, the file is not a valid image
+    if detected_type is None:
+        return False, (
+            f"File '{path_obj.name}' has extension '.{extension}' but its content "
+            f"is not a valid image format. The file may be corrupted, empty, or not a real image."
+        )
+
+    # Normalize extensions for comparison (jpg == jpeg, tif == tiff)
+    extension_normalized = "jpeg" if extension == "jpg" else extension
+    detected_normalized = "jpeg" if detected_type == "jpg" else detected_type
+
+    if extension_normalized != detected_normalized:
+        return False, (
+            f"File '{path_obj.name}' has extension '.{extension}' but contains "
+            f"'{detected_type.upper()}' image data. This mismatch will cause API errors. "
+            f"Please rename the file with the correct extension '.{detected_type}' or "
+            f"re-save it in the correct format."
+        )
+
+    return True, None
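A short usage sketch of the new magic-byte helpers added above (assumes the lfx package is importable; the byte strings are hand-built headers, not real files):

```python
from lfx.base.data.storage_utils import detect_image_type_from_bytes, validate_image_content_type

png_header = b"\x89PNG\r\n\x1a\n" + b"\x00" * 8    # 16 bytes, enough for detection
jpeg_header = b"\xff\xd8\xff\xe0" + b"\x00" * 12

print(detect_image_type_from_bytes(png_header))    # -> "png"
print(detect_image_type_from_bytes(jpeg_header))   # -> "jpeg"

# JPEG bytes saved under a .png name: validation reports the mismatch up front
# instead of letting a downstream API reject the upload.
ok, error = validate_image_content_type("photo.png", content=jpeg_header)
print(ok)      # -> False
print(error)   # -> "File 'photo.png' has extension '.png' but contains 'JPEG' image data. ..."
```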
lfx/base/datastax/astradb_base.py
CHANGED
@@ -187,34 +187,38 @@ class AstraDBBaseComponent(Component):
     @classmethod
     def map_cloud_providers(cls, token: str, environment: str | None = None) -> dict[str, dict[str, Any]]:
         """Fetch all available cloud providers and regions."""
-… (old unguarded implementation; most removed lines not captured in this view)
-        # Get the list of available regions
-        available_regions = admin_client.find_available_regions(only_org_enabled_regions=True)
-            "GCP": {"name": "Google Cloud Platform", "id": "gcp"},
-            "Azure": {"name": "Microsoft Azure", "id": "azure"},
-        }
+        try:
+            # Get the admin object
+            client = DataAPIClient(environment=cls.get_environment(environment))
+            admin_client = client.get_admin(token=token)
+
+            # Get the list of available regions
+            available_regions = admin_client.find_available_regions(only_org_enabled_regions=True)
+
+            provider_mapping: dict[str, dict[str, str]] = {
+                "AWS": {"name": "Amazon Web Services", "id": "aws"},
+                "GCP": {"name": "Google Cloud Platform", "id": "gcp"},
+                "Azure": {"name": "Microsoft Azure", "id": "azure"},
+            }
+
+            result: dict[str, dict[str, Any]] = {}
+            for region_info in available_regions:
+                cloud_provider = region_info.cloud_provider
+                region = region_info.name
+
+                if cloud_provider in provider_mapping:
+                    provider_name = provider_mapping[cloud_provider]["name"]
+                    provider_id = provider_mapping[cloud_provider]["id"]
+
+                    if provider_name not in result:
+                        result[provider_name] = {"id": provider_id, "regions": []}
+
+                    result[provider_name]["regions"].append(region)
+        except Exception as e:  # noqa: BLE001
+            logger.debug("Error fetching cloud providers: %s", e)
+            return {}
+        else:
+            return result
 
     @classmethod
     def get_vectorize_providers(cls, token: str, environment: str | None = None, api_endpoint: str | None = None):
@@ -327,48 +331,52 @@ class AstraDBBaseComponent(Component):
 
     @classmethod
     def get_database_list_static(cls, token: str, environment: str | None = None):
-… (old unguarded implementation; most removed lines not captured in this view)
-        # Get the admin object
-        admin_client = client.get_admin(token=token)
-        for db in db_list:
-            try:
-                # Get the API endpoint for the database
-                api_endpoints = [db_reg.api_endpoint for db_reg in db.regions]
+        try:
+            environment = cls.get_environment(environment)
+            client = DataAPIClient(environment=environment)
+
+            # Get the admin object
+            admin_client = client.get_admin(token=token)
+
+            # Get the list of databases
+            db_list = admin_client.list_databases()
+
+            # Generate the api endpoint for each database
+            db_info_dict = {}
+            for db in db_list:
+                try:
+                    # Get the API endpoint for the database
+                    api_endpoints = [db_reg.api_endpoint for db_reg in db.regions]
+
+                    # Get the number of collections
+                    try:
+                        # Get the number of collections in the database
+                        num_collections = len(
+                            client.get_database(
+                                api_endpoints[0],
+                                token=token,
+                            ).list_collection_names()
+                        )
+                    except Exception:  # noqa: BLE001
+                        if db.status != "PENDING":
+                            continue
+                        num_collections = 0
+
+                    # Add the database to the dictionary
+                    db_info_dict[db.name] = {
+                        "api_endpoints": api_endpoints,
+                        "keyspaces": db.keyspaces,
+                        "collections": num_collections,
+                        "status": db.status if db.status != "ACTIVE" else None,
+                        "org_id": db.org_id if db.org_id else None,
+                    }
+                except Exception as e:  # noqa: BLE001
+                    logger.debug("Failed to get metadata for database %s: %s", db.name, e)
+        except Exception as e:  # noqa: BLE001
+            logger.debug("Error fetching database list: %s", e)
+            return {}
+        else:
+            return db_info_dict
 
     def get_database_list(self):
         return self.get_database_list_static(
@@ -467,6 +475,9 @@ class AstraDBBaseComponent(Component):
 
     def _initialize_database_options(self):
         try:
+            db_list = self.get_database_list()
+            if not db_list:
+                return []
             return [
                 {
                     "name": name,
@@ -476,11 +487,11 @@ class AstraDBBaseComponent(Component):
                     "keyspaces": info["keyspaces"],
                     "org_id": info["org_id"],
                 }
-                for name, info in …
+                for name, info in db_list.items()
             ]
-        except Exception as e:
-… (removed lines not captured in this view)
+        except Exception as e:  # noqa: BLE001
+            logger.debug("Error fetching database options: %s", e)
+            return []
 
     @classmethod
     def get_provider_icon(cls, collection=None, provider_name: str | None = None) -> str:
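With the new try/except wrappers, both classmethods degrade to an empty dict instead of raising when the Astra DB Data API call fails, so callers can treat a bad token or network error the same as "no databases". A hedged sketch of what a caller sees (the token is a placeholder, not a real credential):

```python
from lfx.base.datastax.astradb_base import AstraDBBaseComponent

providers = AstraDBBaseComponent.map_cloud_providers(token="AstraCS:...")  # placeholder token
if not providers:
    print("Astra DB unreachable or token invalid; got an empty mapping instead of an exception.")
else:
    for name, info in providers.items():
        print(name, info["id"], info["regions"])
```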
lfx/base/mcp/util.py
CHANGED
@@ -23,6 +23,7 @@ from pydantic import BaseModel
 from lfx.log.logger import logger
 from lfx.schema.json_schema import create_input_schema_from_json_schema
 from lfx.services.deps import get_settings_service
+from lfx.utils.async_helpers import run_until_complete
 
 HTTP_ERROR_STATUS_CODE = httpx_codes.BAD_REQUEST  # HTTP status code for client errors
 
@@ -351,8 +352,7 @@ def create_tool_func(tool_name: str, arg_schema: type[BaseModel], client) -> Cal
             _handle_tool_validation_error(e, tool_name, provided_args, arg_schema)
 
         try:
-… (removed line not captured in this view)
-            return loop.run_until_complete(client.run_tool(tool_name, arguments=validated.model_dump()))
+            return run_until_complete(client.run_tool(tool_name, arguments=validated.model_dump()))
         except Exception as e:
             logger.error(f"Tool '{tool_name}' execution failed: {e}")
             # Re-raise with more context
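The tool wrapper now hands its coroutine to the shared `run_until_complete` helper from `lfx.utils.async_helpers` instead of managing its own event loop. A small sketch of the call shape (the coroutine below is a stand-in for `client.run_tool`; the helper's internals are not part of this diff):

```python
import asyncio

from lfx.utils.async_helpers import run_until_complete  # helper the diff switches to

async def fake_tool(name: str, arguments: dict) -> dict:
    await asyncio.sleep(0)  # stand-in for an MCP tool round-trip
    return {"tool": name, "args": arguments}

# Same shape as the call in create_tool_func: pass the helper a coroutine, get its result back.
result = run_until_complete(fake_tool("search", {"query": "docs"}))
print(result)
```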
lfx/base/models/__init__.py
CHANGED
@@ -1,3 +1,13 @@
 from .model import LCModelComponent
+from .unified_models import (
+    get_model_provider_variable_mapping,
+    get_model_providers,
+    get_unified_models_detailed,
+)
 
-__all__ = […
+__all__ = [
+    "LCModelComponent",
+    "get_model_provider_variable_mapping",
+    "get_model_providers",
+    "get_unified_models_detailed",
+]