lfx-nightly 0.2.0.dev26__py3-none-any.whl → 0.2.0.dev41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lfx/_assets/component_index.json +1 -1
- lfx/base/agents/agent.py +8 -3
- lfx/base/agents/altk_base_agent.py +16 -3
- lfx/base/data/base_file.py +14 -4
- lfx/base/data/docling_utils.py +61 -10
- lfx/base/data/storage_utils.py +109 -0
- lfx/base/mcp/util.py +2 -2
- lfx/base/models/anthropic_constants.py +21 -12
- lfx/cli/commands.py +3 -1
- lfx/components/docling/chunk_docling_document.py +3 -1
- lfx/components/docling/export_docling_document.py +3 -1
- lfx/components/files_and_knowledge/file.py +59 -7
- lfx/components/files_and_knowledge/save_file.py +79 -12
- lfx/components/ibm/watsonx.py +7 -1
- lfx/components/input_output/chat_output.py +7 -1
- lfx/components/llm_operations/batch_run.py +16 -7
- lfx/components/models_and_agents/agent.py +4 -2
- lfx/components/models_and_agents/embedding_model.py +6 -76
- lfx/components/ollama/ollama.py +9 -4
- lfx/components/processing/__init__.py +0 -3
- lfx/custom/directory_reader/directory_reader.py +5 -2
- lfx/graph/graph/base.py +1 -4
- lfx/graph/vertex/base.py +1 -4
- lfx/schema/image.py +2 -12
- lfx/services/interfaces.py +5 -0
- lfx/services/manager.py +5 -4
- lfx/services/mcp_composer/service.py +38 -12
- lfx/services/settings/auth.py +18 -11
- lfx/services/settings/base.py +5 -23
- lfx/services/storage/local.py +32 -0
- lfx/services/storage/service.py +19 -0
- lfx/utils/image.py +29 -11
- {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/METADATA +1 -1
- {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/RECORD +36 -39
- lfx/base/embeddings/embeddings_class.py +0 -113
- lfx/components/elastic/opensearch_multimodal.py +0 -1575
- lfx/components/processing/dataframe_to_toolset.py +0 -259
- {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/WHEEL +0 -0
- {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/entry_points.txt +0 -0
lfx/base/agents/agent.py
CHANGED
@@ -181,7 +181,11 @@ class LCAgentComponent(Component):
         else:
             input_dict = {"input": self.input_value}

-
+        # Ensure input_dict is initialized
+        if "input" not in input_dict:
+            input_dict = {"input": self.input_value}
+
+        if hasattr(self, "system_prompt") and self.system_prompt and self.system_prompt.strip():
             input_dict["system_prompt"] = self.system_prompt

         if hasattr(self, "chat_history") and self.chat_history:
@@ -196,8 +200,9 @@ class LCAgentComponent(Component):
         # Note: Agent input must be a string, so we extract text and move images to chat_history
         if lc_message is not None and hasattr(lc_message, "content") and isinstance(lc_message.content, list):
             # Extract images and text from the text content items
-
-
+            # Support both "image" (legacy) and "image_url" (standard) types
+            image_dicts = [item for item in lc_message.content if item.get("type") in ("image", "image_url")]
+            text_content = [item for item in lc_message.content if item.get("type") not in ("image", "image_url")]

             text_strings = [
                 item.get("text", "")
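
A minimal standalone sketch (not part of the package) of the filtering logic this hunk introduces: items typed "image" (legacy) or "image_url" (standard) are pulled out of the message content, and everything else is kept as text input. The dict shapes below are assumptions for the sketch.

```python
# Illustrative multimodal content list, shaped like LangChain-style message content.
message_content = [
    {"type": "text", "text": "Describe this picture."},
    {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBORw0KGgo="}},
    {"type": "image", "source_path": "some-flow-id/cat.png"},
]

image_dicts = [item for item in message_content if item.get("type") in ("image", "image_url")]
text_content = [item for item in message_content if item.get("type") not in ("image", "image_url")]

assert len(image_dicts) == 2  # both the legacy and the standard image types are captured
assert text_content == [{"type": "text", "text": "Describe this picture."}]
```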
lfx/base/agents/altk_base_agent.py
CHANGED
@@ -319,9 +319,9 @@ class ALTKBaseAgentComponent(AgentComponent):
         input_dict["chat_history"] = data_to_messages([m.to_data() for m in self.chat_history])
         if hasattr(lc_message, "content") and isinstance(lc_message.content, list):
             # ! Because the input has to be a string, we must pass the images in the chat_history
-
-            image_dicts = [item for item in lc_message.content if item.get("type")
-            lc_message.content = [item for item in lc_message.content if item.get("type")
+            # Support both "image" (legacy) and "image_url" (standard) types
+            image_dicts = [item for item in lc_message.content if item.get("type") in ("image", "image_url")]
+            lc_message.content = [item for item in lc_message.content if item.get("type") not in ("image", "image_url")]

         if "chat_history" not in input_dict:
             input_dict["chat_history"] = []
@@ -330,6 +330,19 @@ class ALTKBaseAgentComponent(AgentComponent):
         else:
             input_dict["chat_history"] = [HumanMessage(content=[image_dict]) for image_dict in image_dicts]
         input_dict["input"] = input_text
+
+        # Copied from agent.py
+        # Final safety check: ensure input is never empty (prevents Anthropic API errors)
+        current_input = input_dict.get("input", "")
+        if isinstance(current_input, list):
+            current_input = " ".join(map(str, current_input))
+        elif not isinstance(current_input, str):
+            current_input = str(current_input)
+        if not current_input.strip():
+            input_dict["input"] = "Continue the conversation."
+        else:
+            input_dict["input"] = current_input
+
         if hasattr(self, "graph"):
             session_id = self.graph.session_id
         elif hasattr(self, "_session_id"):
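
The safety block appended above can be exercised on its own. A rough sketch with a hypothetical helper name (`normalize_agent_input` is not part of the package), assuming an `input_dict` shaped like the one built in the hunk:

```python
def normalize_agent_input(input_dict: dict) -> dict:
    """Sketch of the fallback above: coerce the agent input to a non-empty string."""
    current_input = input_dict.get("input", "")
    if isinstance(current_input, list):
        current_input = " ".join(map(str, current_input))
    elif not isinstance(current_input, str):
        current_input = str(current_input)
    # An empty input string would be rejected by the Anthropic API, so substitute a neutral prompt.
    input_dict["input"] = current_input if current_input.strip() else "Continue the conversation."
    return input_dict

assert normalize_agent_input({"input": ["a", 1]})["input"] == "a 1"
assert normalize_agent_input({"input": "   "})["input"] == "Continue the conversation."
```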
lfx/base/data/base_file.py
CHANGED
@@ -260,8 +260,6 @@ class BaseFileComponent(Component, ABC):
         filename = file_path_obj.name

         settings = get_settings_service().settings
-
-        # Get file size - use storage service for S3, filesystem for local
         if settings.storage_type == "s3":
             try:
                 file_size = get_file_size(file_path)
@@ -618,9 +616,21 @@ class BaseFileComponent(Component, ABC):
                 BaseFileComponent.BaseFile(data, Path(path_str), delete_after_processing=delete_after_processing)
             )
         else:
-
+            # Check if path looks like a storage path (flow_id/filename format)
+            # If so, use get_full_path to resolve it to the actual storage location
+            if "/" in path_str and not Path(path_str).is_absolute():
+                try:
+                    resolved_path = Path(self.get_full_path(path_str))
+                    self.log(f"Resolved storage path '{path_str}' to '{resolved_path}'")
+                except (ValueError, AttributeError) as e:
+                    # Fallback to resolve_path if get_full_path fails
+                    self.log(f"get_full_path failed for '{path_str}': {e}, falling back to resolve_path")
+                    resolved_path = Path(self.resolve_path(path_str))
+            else:
+                resolved_path = Path(self.resolve_path(path_str))
+
             if not resolved_path.exists():
-                msg = f"File
+                msg = f"File not found: '{path}' (resolved to: '{resolved_path}'). Please upload the file again."
                 self.log(msg)
                 if not self.silent_errors:
                     raise ValueError(msg)
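
The resolution branch added above keys off a simple heuristic: a relative path that contains a separator (e.g. `flow_id/filename`) is treated as a storage path and routed through `get_full_path`. A hypothetical standalone check (the function name is illustrative, not the component API):

```python
from pathlib import Path


def looks_like_storage_path(path_str: str) -> bool:
    # Heuristic from the hunk above: relative path + contains "/" => storage path (flow_id/filename).
    return "/" in path_str and not Path(path_str).is_absolute()


assert looks_like_storage_path("1234-abcd/upload.png") is True
assert looks_like_storage_path("/tmp/upload.png") is False
assert looks_like_storage_path("upload.png") is False
```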
lfx/base/data/docling_utils.py
CHANGED
@@ -25,21 +25,72 @@ class DoclingDependencyError(Exception):
         super().__init__(f"{dependency_name} is not correctly installed. {install_command}")


-def extract_docling_documents(
+def extract_docling_documents(
+    data_inputs: Data | list[Data] | DataFrame, doc_key: str
+) -> tuple[list[DoclingDocument], str | None]:
+    """Extract DoclingDocument objects from data inputs.
+
+    Args:
+        data_inputs: The data inputs containing DoclingDocument objects
+        doc_key: The key/column name to look for DoclingDocument objects
+
+    Returns:
+        A tuple of (documents, warning_message) where warning_message is None if no warning
+
+    Raises:
+        TypeError: If the data cannot be extracted or is invalid
+    """
     documents: list[DoclingDocument] = []
+    warning_message: str | None = None
+
     if isinstance(data_inputs, DataFrame):
         if not len(data_inputs):
             msg = "DataFrame is empty"
             raise TypeError(msg)

-
-
-
-
-
-
-
-
+        # Primary: Check for exact column name match
+        if doc_key in data_inputs.columns:
+            try:
+                documents = data_inputs[doc_key].tolist()
+            except Exception as e:
+                msg = f"Error extracting DoclingDocument from DataFrame column '{doc_key}': {e}"
+                raise TypeError(msg) from e
+        else:
+            # Fallback: Search all columns for DoclingDocument objects
+            found_column = None
+            for col in data_inputs.columns:
+                try:
+                    # Check if this column contains DoclingDocument objects
+                    sample = data_inputs[col].dropna().iloc[0] if len(data_inputs[col].dropna()) > 0 else None
+                    if sample is not None and isinstance(sample, DoclingDocument):
+                        found_column = col
+                        break
+                except (IndexError, AttributeError):
+                    continue
+
+            if found_column:
+                warning_message = (
+                    f"Column '{doc_key}' not found, but found DoclingDocument objects in column '{found_column}'. "
+                    f"Using '{found_column}' instead. Consider updating the 'Doc Key' parameter."
+                )
+                logger.warning(warning_message)
+                try:
+                    documents = data_inputs[found_column].tolist()
+                except Exception as e:
+                    msg = f"Error extracting DoclingDocument from DataFrame column '{found_column}': {e}"
+                    raise TypeError(msg) from e
+            else:
+                # Provide helpful error message
+                available_columns = list(data_inputs.columns)
+                msg = (
+                    f"Column '{doc_key}' not found in DataFrame. "
+                    f"Available columns: {available_columns}. "
+                    f"\n\nPossible solutions:\n"
+                    f"1. Use the 'Data' output from Docling component instead of 'DataFrame' output\n"
+                    f"2. Update the 'Doc Key' parameter to match one of the available columns\n"
+                    f"3. If using VLM pipeline, try using the standard pipeline"
+                )
+                raise TypeError(msg)
     else:
         if not data_inputs:
             msg = "No data inputs provided"
@@ -69,7 +120,7 @@ def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_ke
         except AttributeError as e:
             msg = f"Invalid input type in collection: {e}"
             raise TypeError(msg) from e
-    return documents
+    return documents, warning_message


 def _unwrap_secrets(obj):
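
The new fallback path scans the DataFrame's columns for DoclingDocument instances when `doc_key` is missing, then reports the substitution through the second element of the returned tuple. A rough sketch of that scan using plain pandas and a stand-in document class (assumption: the real code checks `isinstance(..., DoclingDocument)` exactly as in the hunk):

```python
import pandas as pd


class FakeDoclingDocument:
    """Stand-in for docling's DoclingDocument, so the sketch runs without docling installed."""


df = pd.DataFrame({"text": ["hello"], "document": [FakeDoclingDocument()]})
doc_key = "doc"  # the requested column is absent

found_column = None
if doc_key not in df.columns:
    for col in df.columns:
        non_null = df[col].dropna()
        sample = non_null.iloc[0] if len(non_null) > 0 else None
        if isinstance(sample, FakeDoclingDocument):
            found_column = col
            break

assert found_column == "document"  # the caller would emit a warning and use this column instead
```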
lfx/base/data/storage_utils.py
CHANGED
@@ -190,3 +190,112 @@ def file_exists(file_path: str, storage_service: StorageService | None = None) -
         return False
     else:
         return True
+
+
+# Magic bytes signatures for common image formats
+MIN_IMAGE_HEADER_SIZE = 12  # Minimum bytes needed to detect image type
+
+IMAGE_SIGNATURES: dict[str, list[tuple[bytes, int]]] = {
+    "jpeg": [(b"\xff\xd8\xff", 0)],
+    "jpg": [(b"\xff\xd8\xff", 0)],
+    "png": [(b"\x89PNG\r\n\x1a\n", 0)],
+    "gif": [(b"GIF87a", 0), (b"GIF89a", 0)],
+    "webp": [(b"RIFF", 0)],  # WebP starts with RIFF, then has WEBP at offset 8
+    "bmp": [(b"BM", 0)],
+    "tiff": [(b"II*\x00", 0), (b"MM\x00*", 0)],  # Little-endian and big-endian TIFF
+}
+
+
+def detect_image_type_from_bytes(content: bytes) -> str | None:
+    """Detect the actual image type from file content using magic bytes.
+
+    Args:
+        content: The file content bytes (at least first 12 bytes needed)
+
+    Returns:
+        str | None: The detected image type (e.g., "jpeg", "png") or None if not recognized
+    """
+    if len(content) < MIN_IMAGE_HEADER_SIZE:
+        return None
+
+    # Check WebP specifically (needs to check both RIFF and WEBP)
+    if content[:4] == b"RIFF" and content[8:12] == b"WEBP":
+        return "webp"
+
+    # Check other image signatures
+    for image_type, signatures in IMAGE_SIGNATURES.items():
+        if image_type == "webp":
+            continue  # Already handled above
+        for signature, offset in signatures:
+            if content[offset : offset + len(signature)] == signature:
+                return image_type
+
+    return None
+
+
+def validate_image_content_type(
+    file_path: str,
+    content: bytes | None = None,
+    storage_service: StorageService | None = None,
+    resolve_path: Callable[[str], str] | None = None,
+) -> tuple[bool, str | None]:
+    """Validate that an image file's content matches its declared extension.
+
+    This prevents errors like "Image does not match the provided media type image/png"
+    when a JPEG file is saved with a .png extension.
+
+    Only rejects files when we can definitively detect a mismatch. Files with
+    unrecognized content are allowed through (they may fail later, but that's
+    better than false positives blocking valid files).
+
+    Args:
+        file_path: Path to the image file
+        content: Optional pre-read file content bytes. If not provided, will read from file.
+        storage_service: Optional storage service instance for S3 files
+        resolve_path: Optional function to resolve relative paths
+
+    Returns:
+        tuple[bool, str | None]: (is_valid, error_message)
+        - (True, None) if the content matches the extension, is unrecognized, or file is not an image
+        - (False, error_message) if there's a definite mismatch
+    """
+    # Get the file extension
+    path_obj = Path(file_path)
+    extension = path_obj.suffix[1:].lower() if path_obj.suffix else ""
+
+    # Only validate image files
+    image_extensions = {"jpeg", "jpg", "png", "gif", "webp", "bmp", "tiff"}
+    if extension not in image_extensions:
+        return True, None
+
+    # Read content if not provided
+    if content is None:
+        try:
+            content = run_until_complete(read_file_bytes(file_path, storage_service, resolve_path))
+        except (FileNotFoundError, ValueError):
+            # Can't read file - let it pass, will fail later with better error
+            return True, None
+
+    # Detect actual image type
+    detected_type = detect_image_type_from_bytes(content)
+
+    # If we can't detect the type, the file is not a valid image
+    if detected_type is None:
+        return False, (
+            f"File '{path_obj.name}' has extension '.{extension}' but its content "
+            f"is not a valid image format. The file may be corrupted, empty, or not a real image."
+        )
+
+    # Normalize extensions for comparison (jpg == jpeg, tif == tiff)
+    extension_normalized = "jpeg" if extension == "jpg" else extension
+    detected_normalized = "jpeg" if detected_type == "jpg" else detected_type
+
+    if extension_normalized != detected_normalized:
+        return False, (
+            f"File '{path_obj.name}' has extension '.{extension}' but contains "
+            f"'{detected_type.upper()}' image data. This mismatch will cause API errors. "
+            f"Please rename the file with the correct extension '.{detected_type}' or "
+            f"re-save it in the correct format."
+        )
+
+    return True, None
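
The detection table above can be exercised directly. A small sketch showing how a JPEG payload behind a `.png` name would be flagged (the byte literals are the standard signatures listed in the hunk; the sketch mirrors only two of the formats):

```python
JPEG_HEADER = b"\xff\xd8\xff\xe0" + b"\x00" * 8  # first bytes of a typical JPEG
PNG_HEADER = b"\x89PNG\r\n\x1a\n" + b"\x00" * 4  # canonical PNG signature


def sniff(content: bytes) -> str | None:
    # Mirrors detect_image_type_from_bytes for just two formats.
    if content.startswith(b"\xff\xd8\xff"):
        return "jpeg"
    if content.startswith(b"\x89PNG\r\n\x1a\n"):
        return "png"
    return None


# A JPEG saved with a .png extension is exactly the mismatch the new validator rejects.
assert sniff(JPEG_HEADER) == "jpeg"
assert sniff(PNG_HEADER) == "png"
```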
lfx/base/mcp/util.py
CHANGED
@@ -23,6 +23,7 @@ from pydantic import BaseModel
 from lfx.log.logger import logger
 from lfx.schema.json_schema import create_input_schema_from_json_schema
 from lfx.services.deps import get_settings_service
+from lfx.utils.async_helpers import run_until_complete

 HTTP_ERROR_STATUS_CODE = httpx_codes.BAD_REQUEST  # HTTP status code for client errors

@@ -351,8 +352,7 @@ def create_tool_func(tool_name: str, arg_schema: type[BaseModel], client) -> Cal
         _handle_tool_validation_error(e, tool_name, provided_args, arg_schema)

     try:
-
-        return loop.run_until_complete(client.run_tool(tool_name, arguments=validated.model_dump()))
+        return run_until_complete(client.run_tool(tool_name, arguments=validated.model_dump()))
     except Exception as e:
         logger.error(f"Tool '{tool_name}' execution failed: {e}")
         # Re-raise with more context
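
The change swaps a raw `loop.run_until_complete(...)` call for the shared `run_until_complete` helper from `lfx.utils.async_helpers`. One common shape for such a helper is sketched below; this is an assumption for illustration only, not the package's actual implementation:

```python
import asyncio
import concurrent.futures


def run_until_complete(coro):
    """Illustrative only: run a coroutine to completion from synchronous code."""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop in this thread: just run it.
        return asyncio.run(coro)
    # A loop is already running here; run the coroutine on a separate thread's loop.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()


async def _demo():
    return 42


assert run_until_complete(_demo()) == 42
```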
lfx/base/models/anthropic_constants.py
CHANGED
@@ -2,32 +2,41 @@ from .model_metadata import create_model_metadata

 ANTHROPIC_MODELS_DETAILED = [
     # Tool calling supported models
+    create_model_metadata(provider="Anthropic", name="claude-opus-4-5-20251101", icon="Anthropic", tool_calling=True),
+    create_model_metadata(provider="Anthropic", name="claude-haiku-4-5-20251001", icon="Anthropic", tool_calling=True),
     create_model_metadata(provider="Anthropic", name="claude-sonnet-4-5-20250929", icon="Anthropic", tool_calling=True),
     create_model_metadata(provider="Anthropic", name="claude-opus-4-1-20250805", icon="Anthropic", tool_calling=True),
     create_model_metadata(provider="Anthropic", name="claude-opus-4-20250514", icon="Anthropic", tool_calling=True),
     create_model_metadata(provider="Anthropic", name="claude-sonnet-4-20250514", icon="Anthropic", tool_calling=True),
-    create_model_metadata(provider="Anthropic", name="claude-3-
-    create_model_metadata(provider="Anthropic", name="claude-3-
-
-    create_model_metadata(
+    create_model_metadata(provider="Anthropic", name="claude-3-5-haiku-20241022", icon="Anthropic", tool_calling=True),
+    create_model_metadata(provider="Anthropic", name="claude-3-haiku-20240307", icon="Anthropic", tool_calling=True),
+    # Deprecated models
+    create_model_metadata(
+        provider="Anthropic", name="claude-3-7-sonnet-latest", icon="Anthropic", tool_calling=True, deprecated=True
+    ),
+    create_model_metadata(
+        provider="Anthropic", name="claude-3-5-sonnet-latest", icon="Anthropic", tool_calling=True, deprecated=True
+    ),
+    create_model_metadata(
+        provider="Anthropic", name="claude-3-5-haiku-latest", icon="Anthropic", tool_calling=True, deprecated=True
+    ),
+    create_model_metadata(
+        provider="Anthropic", name="claude-3-opus-latest", icon="Anthropic", tool_calling=True, deprecated=True
+    ),
     create_model_metadata(
         provider="Anthropic", name="claude-3-sonnet-20240229", icon="Anthropic", tool_calling=True, deprecated=True
     ),
-    # Tool calling unsupported models
-    create_model_metadata(provider="Anthropic", name="claude-2.1", icon="Anthropic", tool_calling=False),
-    create_model_metadata(provider="Anthropic", name="claude-2.0", icon="Anthropic", tool_calling=False),
-    # Deprecated models
     create_model_metadata(
-        provider="Anthropic", name="claude-
+        provider="Anthropic", name="claude-2.1", icon="Anthropic", tool_calling=False, deprecated=True
     ),
     create_model_metadata(
-        provider="Anthropic", name="claude-
+        provider="Anthropic", name="claude-2.0", icon="Anthropic", tool_calling=False, deprecated=True
     ),
     create_model_metadata(
-        provider="Anthropic", name="claude-3-5-
+        provider="Anthropic", name="claude-3-5-sonnet-20240620", icon="Anthropic", tool_calling=True, deprecated=True
     ),
     create_model_metadata(
-        provider="Anthropic", name="claude-3-
+        provider="Anthropic", name="claude-3-5-sonnet-20241022", icon="Anthropic", tool_calling=True, deprecated=True
     ),
 ]

lfx/cli/commands.py
CHANGED
@@ -304,12 +304,14 @@ async def serve_command(

     # Start the server
     try:
-        uvicorn.
+        config = uvicorn.Config(
             serve_app,
             host=host,
             port=port,
             log_level=log_level,
         )
+        server = uvicorn.Server(config)
+        await server.serve()
     except KeyboardInterrupt:
         verbose_print("\n👋 Server stopped")
         raise typer.Exit(0) from None
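
A likely motivation for the switch: `serve_command` is declared `async`, and `uvicorn.Server(config).serve()` is awaitable, whereas the module-level `uvicorn.run()` helper starts its own event loop. A minimal sketch of the awaitable pattern, using a placeholder ASGI app rather than the CLI's real `serve_app`:

```python
import uvicorn


async def app(scope, receive, send):
    # Minimal ASGI app used only to make the sketch self-contained.
    if scope["type"] == "http":
        await send({"type": "http.response.start", "status": 200, "headers": []})
        await send({"type": "http.response.body", "body": b"ok"})


async def serve() -> None:
    config = uvicorn.Config(app, host="127.0.0.1", port=8000, log_level="info")
    server = uvicorn.Server(config)
    await server.serve()  # awaitable, unlike uvicorn.run()


# asyncio.run(serve())  # uncomment to actually start the server
```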
lfx/components/docling/chunk_docling_document.py
CHANGED
@@ -115,7 +115,9 @@ class ChunkDoclingDocumentComponent(Component):
         return [Data(text=doc.page_content, data=doc.metadata) for doc in docs]

     def chunk_documents(self) -> DataFrame:
-        documents = extract_docling_documents(self.data_inputs, self.doc_key)
+        documents, warning = extract_docling_documents(self.data_inputs, self.doc_key)
+        if warning:
+            self.status = warning

         chunker: BaseChunker
         if self.chunker == "HybridChunker":
lfx/components/docling/export_docling_document.py
CHANGED
@@ -86,7 +86,9 @@ class ExportDoclingDocumentComponent(Component):
         return build_config

     def export_document(self) -> list[Data]:
-        documents = extract_docling_documents(self.data_inputs, self.doc_key)
+        documents, warning = extract_docling_documents(self.data_inputs, self.doc_key)
+        if warning:
+            self.status = warning

         results: list[Data] = []
         try:
lfx/components/files_and_knowledge/file.py
CHANGED
@@ -21,7 +21,7 @@ from tempfile import NamedTemporaryFile
 from typing import Any

 from lfx.base.data.base_file import BaseFileComponent
-from lfx.base.data.storage_utils import parse_storage_path
+from lfx.base.data.storage_utils import parse_storage_path, validate_image_content_type
 from lfx.base.data.utils import TEXT_FILE_TYPES, parallel_load_data, parse_text_file_to_data
 from lfx.inputs.inputs import DropdownInput, MessageTextInput, StrInput
 from lfx.io import BoolInput, FileInput, IntInput, Output
@@ -748,6 +748,27 @@ class FileComponent(BaseFileComponent):
             msg = "No files to process."
             raise ValueError(msg)

+        # Validate image files to detect content/extension mismatches
+        # This prevents API errors like "Image does not match the provided media type"
+        image_extensions = {"jpeg", "jpg", "png", "gif", "webp", "bmp", "tiff"}
+        for file in file_list:
+            extension = file.path.suffix[1:].lower()
+            if extension in image_extensions:
+                # file.path is already resolved, read bytes directly
+                try:
+                    content = file.path.read_bytes()
+                    is_valid, error_msg = validate_image_content_type(
+                        str(file.path),
+                        content=content,
+                    )
+                    if not is_valid:
+                        self.log(error_msg)
+                        if not self.silent_errors:
+                            raise ValueError(error_msg)
+                except OSError as e:
+                    self.log(f"Could not read file for validation: {e}")
+                    # Continue - let it fail later with better error
+
         # Validate that files requiring Docling are only processed when advanced mode is enabled
         if not self.advanced_mode:
             for file in file_list:
@@ -786,7 +807,8 @@ class FileComponent(BaseFileComponent):
             # --- UNNEST: expand each element in `doc` to its own Data row
             payload = getattr(advanced_data, "data", {}) or {}
             doc_rows = payload.get("doc")
-            if isinstance(doc_rows, list):
+            if isinstance(doc_rows, list) and doc_rows:
+                # Non-empty list of structured rows
                 rows: list[Data | None] = [
                     Data(
                         data={
@@ -797,6 +819,19 @@ class FileComponent(BaseFileComponent):
                     for item in doc_rows
                 ]
                 final_return.extend(self.rollup_data(file_list, rows))
+            elif isinstance(doc_rows, list) and not doc_rows:
+                # Empty list - file was processed but no text content found
+                # Create a Data object indicating no content was extracted
+                self.log(f"No text extracted from '{file_path}', creating placeholder data")
+                empty_data = Data(
+                    data={
+                        "file_path": file_path,
+                        "text": "(No text content extracted from image)",
+                        "info": "Image processed successfully but contained no extractable text",
+                        **{k: v for k, v in payload.items() if k != "doc"},
+                    },
+                )
+                final_return.extend(self.rollup_data([file], [empty_data]))
             else:
                 # If not structured, keep as-is (e.g., markdown export or error dict)
                 final_return.extend(self.rollup_data(file_list, [advanced_data]))
@@ -820,13 +855,17 @@ class FileComponent(BaseFileComponent):
     def load_files_helper(self) -> DataFrame:
         result = self.load_files()

-        #
-        if
-        if hasattr(result, "error"):
-            raise ValueError(result.error[0])
+        # Result is a DataFrame - check if it has any rows
+        if result.empty:
             msg = "Could not extract content from the provided file(s)."
             raise ValueError(msg)

+        # Check for error column with error messages
+        if "error" in result.columns:
+            errors = result["error"].dropna().tolist()
+            if errors and not any(col in result.columns for col in ["text", "doc", "exported_content"]):
+                raise ValueError(errors[0])
+
         return result

     def load_files_dataframe(self) -> DataFrame:
@@ -838,4 +877,17 @@ class FileComponent(BaseFileComponent):
         """Load files using advanced Docling processing and export to Markdown format."""
         self.markdown = True
         result = self.load_files_helper()
-
+
+        # Result is a DataFrame - check for text or exported_content columns
+        if "text" in result.columns and not result["text"].isna().all():
+            text_values = result["text"].dropna().tolist()
+            if text_values:
+                return Message(text=str(text_values[0]))
+
+        if "exported_content" in result.columns and not result["exported_content"].isna().all():
+            content_values = result["exported_content"].dropna().tolist()
+            if content_values:
+                return Message(text=str(content_values[0]))
+
+        # Return empty message with info that no text was found
+        return Message(text="(No text content extracted from file)")
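
The reworked `load_files_helper` treats the result as a DataFrame instead of probing for an `error` attribute. The guard can be reproduced with plain pandas (column names follow the hunk above; the free-standing function is for illustration only):

```python
import pandas as pd


def check_result(result: pd.DataFrame) -> pd.DataFrame:
    # Mirrors the guard added above, outside the component for illustration.
    if result.empty:
        raise ValueError("Could not extract content from the provided file(s).")
    if "error" in result.columns:
        errors = result["error"].dropna().tolist()
        if errors and not any(col in result.columns for col in ["text", "doc", "exported_content"]):
            raise ValueError(errors[0])
    return result


ok = pd.DataFrame({"text": ["parsed content"], "error": [None]})
assert check_result(ok) is ok  # an error column alongside real text does not raise

try:
    check_result(pd.DataFrame({"error": ["unsupported file type"]}))
except ValueError as e:
    assert "unsupported file type" in str(e)
```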
|