lfx-nightly 0.2.0.dev26__py3-none-any.whl → 0.2.0.dev41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. lfx/_assets/component_index.json +1 -1
  2. lfx/base/agents/agent.py +8 -3
  3. lfx/base/agents/altk_base_agent.py +16 -3
  4. lfx/base/data/base_file.py +14 -4
  5. lfx/base/data/docling_utils.py +61 -10
  6. lfx/base/data/storage_utils.py +109 -0
  7. lfx/base/mcp/util.py +2 -2
  8. lfx/base/models/anthropic_constants.py +21 -12
  9. lfx/cli/commands.py +3 -1
  10. lfx/components/docling/chunk_docling_document.py +3 -1
  11. lfx/components/docling/export_docling_document.py +3 -1
  12. lfx/components/files_and_knowledge/file.py +59 -7
  13. lfx/components/files_and_knowledge/save_file.py +79 -12
  14. lfx/components/ibm/watsonx.py +7 -1
  15. lfx/components/input_output/chat_output.py +7 -1
  16. lfx/components/llm_operations/batch_run.py +16 -7
  17. lfx/components/models_and_agents/agent.py +4 -2
  18. lfx/components/models_and_agents/embedding_model.py +6 -76
  19. lfx/components/ollama/ollama.py +9 -4
  20. lfx/components/processing/__init__.py +0 -3
  21. lfx/custom/directory_reader/directory_reader.py +5 -2
  22. lfx/graph/graph/base.py +1 -4
  23. lfx/graph/vertex/base.py +1 -4
  24. lfx/schema/image.py +2 -12
  25. lfx/services/interfaces.py +5 -0
  26. lfx/services/manager.py +5 -4
  27. lfx/services/mcp_composer/service.py +38 -12
  28. lfx/services/settings/auth.py +18 -11
  29. lfx/services/settings/base.py +5 -23
  30. lfx/services/storage/local.py +32 -0
  31. lfx/services/storage/service.py +19 -0
  32. lfx/utils/image.py +29 -11
  33. {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/METADATA +1 -1
  34. {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/RECORD +36 -39
  35. lfx/base/embeddings/embeddings_class.py +0 -113
  36. lfx/components/elastic/opensearch_multimodal.py +0 -1575
  37. lfx/components/processing/dataframe_to_toolset.py +0 -259
  38. {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/WHEEL +0 -0
  39. {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/entry_points.txt +0 -0
lfx/base/agents/agent.py CHANGED
@@ -181,7 +181,11 @@ class LCAgentComponent(Component):
          else:
              input_dict = {"input": self.input_value}

-         if hasattr(self, "system_prompt"):
+         # Ensure input_dict is initialized
+         if "input" not in input_dict:
+             input_dict = {"input": self.input_value}
+
+         if hasattr(self, "system_prompt") and self.system_prompt and self.system_prompt.strip():
              input_dict["system_prompt"] = self.system_prompt

          if hasattr(self, "chat_history") and self.chat_history:
@@ -196,8 +200,9 @@ class LCAgentComponent(Component):
          # Note: Agent input must be a string, so we extract text and move images to chat_history
          if lc_message is not None and hasattr(lc_message, "content") and isinstance(lc_message.content, list):
              # Extract images and text from the text content items
-             image_dicts = [item for item in lc_message.content if item.get("type") == "image"]
-             text_content = [item for item in lc_message.content if item.get("type") != "image"]
+             # Support both "image" (legacy) and "image_url" (standard) types
+             image_dicts = [item for item in lc_message.content if item.get("type") in ("image", "image_url")]
+             text_content = [item for item in lc_message.content if item.get("type") not in ("image", "image_url")]

              text_strings = [
                  item.get("text", "")
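The same "image"/"image_url" split now appears in both agent components (see the altk_base_agent.py hunk below). A minimal standalone sketch of the behavior, using a hypothetical multimodal content list:

```python
# Hypothetical LangChain-style multimodal content; not taken from the package.
content = [
    {"type": "text", "text": "What is in this picture?"},
    {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBORw0KGgo="}},
]

# Both the legacy "image" type and the standard "image_url" type are routed
# into chat history; only non-image items stay in the string agent input.
image_dicts = [item for item in content if item.get("type") in ("image", "image_url")]
text_content = [item for item in content if item.get("type") not in ("image", "image_url")]

assert len(image_dicts) == 1
assert text_content == [{"type": "text", "text": "What is in this picture?"}]
```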
lfx/base/agents/altk_base_agent.py CHANGED
@@ -319,9 +319,9 @@ class ALTKBaseAgentComponent(AgentComponent):
              input_dict["chat_history"] = data_to_messages([m.to_data() for m in self.chat_history])
          if hasattr(lc_message, "content") and isinstance(lc_message.content, list):
              # ! Because the input has to be a string, we must pass the images in the chat_history
-
-             image_dicts = [item for item in lc_message.content if item.get("type") == "image"]
-             lc_message.content = [item for item in lc_message.content if item.get("type") != "image"]
+             # Support both "image" (legacy) and "image_url" (standard) types
+             image_dicts = [item for item in lc_message.content if item.get("type") in ("image", "image_url")]
+             lc_message.content = [item for item in lc_message.content if item.get("type") not in ("image", "image_url")]

              if "chat_history" not in input_dict:
                  input_dict["chat_history"] = []
@@ -330,6 +330,19 @@ class ALTKBaseAgentComponent(AgentComponent):
              else:
                  input_dict["chat_history"] = [HumanMessage(content=[image_dict]) for image_dict in image_dicts]
          input_dict["input"] = input_text
+
+         # Copied from agent.py
+         # Final safety check: ensure input is never empty (prevents Anthropic API errors)
+         current_input = input_dict.get("input", "")
+         if isinstance(current_input, list):
+             current_input = " ".join(map(str, current_input))
+         elif not isinstance(current_input, str):
+             current_input = str(current_input)
+         if not current_input.strip():
+             input_dict["input"] = "Continue the conversation."
+         else:
+             input_dict["input"] = current_input
+
          if hasattr(self, "graph"):
              session_id = self.graph.session_id
          elif hasattr(self, "_session_id"):
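The safety check duplicated into the ALTK agent above reduces to a small normalization routine; a standalone sketch (the function name is illustrative, not from the package):

```python
def normalize_agent_input(value: object) -> str:
    """Coerce any agent input to a non-empty string, mirroring the check above."""
    if isinstance(value, list):
        value = " ".join(map(str, value))
    elif not isinstance(value, str):
        value = str(value)
    # Anthropic rejects empty input messages, so substitute a neutral prompt.
    return value if value.strip() else "Continue the conversation."


assert normalize_agent_input("") == "Continue the conversation."
assert normalize_agent_input(["look at", 2, "images"]) == "look at 2 images"
```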
lfx/base/data/base_file.py CHANGED
@@ -260,8 +260,6 @@ class BaseFileComponent(Component, ABC):
          filename = file_path_obj.name

          settings = get_settings_service().settings
-
-         # Get file size - use storage service for S3, filesystem for local
          if settings.storage_type == "s3":
              try:
                  file_size = get_file_size(file_path)
@@ -618,9 +616,21 @@ class BaseFileComponent(Component, ABC):
                  BaseFileComponent.BaseFile(data, Path(path_str), delete_after_processing=delete_after_processing)
              )
          else:
-             resolved_path = Path(self.resolve_path(path_str))
+             # Check if path looks like a storage path (flow_id/filename format)
+             # If so, use get_full_path to resolve it to the actual storage location
+             if "/" in path_str and not Path(path_str).is_absolute():
+                 try:
+                     resolved_path = Path(self.get_full_path(path_str))
+                     self.log(f"Resolved storage path '{path_str}' to '{resolved_path}'")
+                 except (ValueError, AttributeError) as e:
+                     # Fallback to resolve_path if get_full_path fails
+                     self.log(f"get_full_path failed for '{path_str}': {e}, falling back to resolve_path")
+                     resolved_path = Path(self.resolve_path(path_str))
+             else:
+                 resolved_path = Path(self.resolve_path(path_str))
+
              if not resolved_path.exists():
-                 msg = f"File or directory not found: {path}"
+                 msg = f"File not found: '{path}' (resolved to: '{resolved_path}'). Please upload the file again."
                  self.log(msg)
                  if not self.silent_errors:
                      raise ValueError(msg)
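The new resolution order hinges on one heuristic: a relative path containing a separator is treated as a flow_id/filename storage key and resolved via get_full_path first. Extracted as a sketch:

```python
from pathlib import Path


def looks_like_storage_path(path_str: str) -> bool:
    """Mirror of the check above: relative paths with a separator are
    treated as flow-scoped storage keys rather than filesystem paths."""
    return "/" in path_str and not Path(path_str).is_absolute()


assert looks_like_storage_path("123e4567-e89b/upload.pdf")  # hypothetical flow id
assert not looks_like_storage_path("/tmp/upload.pdf")       # absolute filesystem path
assert not looks_like_storage_path("upload.pdf")            # bare filename
```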
lfx/base/data/docling_utils.py CHANGED
@@ -25,21 +25,72 @@ class DoclingDependencyError(Exception):
          super().__init__(f"{dependency_name} is not correctly installed. {install_command}")


- def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_key: str) -> list[DoclingDocument]:
+ def extract_docling_documents(
+     data_inputs: Data | list[Data] | DataFrame, doc_key: str
+ ) -> tuple[list[DoclingDocument], str | None]:
+     """Extract DoclingDocument objects from data inputs.
+
+     Args:
+         data_inputs: The data inputs containing DoclingDocument objects
+         doc_key: The key/column name to look for DoclingDocument objects
+
+     Returns:
+         A tuple of (documents, warning_message) where warning_message is None if no warning
+
+     Raises:
+         TypeError: If the data cannot be extracted or is invalid
+     """
      documents: list[DoclingDocument] = []
+     warning_message: str | None = None
+
      if isinstance(data_inputs, DataFrame):
          if not len(data_inputs):
              msg = "DataFrame is empty"
              raise TypeError(msg)

-         if doc_key not in data_inputs.columns:
-             msg = f"Column '{doc_key}' not found in DataFrame"
-             raise TypeError(msg)
-         try:
-             documents = data_inputs[doc_key].tolist()
-         except Exception as e:
-             msg = f"Error extracting DoclingDocument from DataFrame: {e}"
-             raise TypeError(msg) from e
+         # Primary: Check for exact column name match
+         if doc_key in data_inputs.columns:
+             try:
+                 documents = data_inputs[doc_key].tolist()
+             except Exception as e:
+                 msg = f"Error extracting DoclingDocument from DataFrame column '{doc_key}': {e}"
+                 raise TypeError(msg) from e
+         else:
+             # Fallback: Search all columns for DoclingDocument objects
+             found_column = None
+             for col in data_inputs.columns:
+                 try:
+                     # Check if this column contains DoclingDocument objects
+                     sample = data_inputs[col].dropna().iloc[0] if len(data_inputs[col].dropna()) > 0 else None
+                     if sample is not None and isinstance(sample, DoclingDocument):
+                         found_column = col
+                         break
+                 except (IndexError, AttributeError):
+                     continue
+
+             if found_column:
+                 warning_message = (
+                     f"Column '{doc_key}' not found, but found DoclingDocument objects in column '{found_column}'. "
+                     f"Using '{found_column}' instead. Consider updating the 'Doc Key' parameter."
+                 )
+                 logger.warning(warning_message)
+                 try:
+                     documents = data_inputs[found_column].tolist()
+                 except Exception as e:
+                     msg = f"Error extracting DoclingDocument from DataFrame column '{found_column}': {e}"
+                     raise TypeError(msg) from e
+             else:
+                 # Provide helpful error message
+                 available_columns = list(data_inputs.columns)
+                 msg = (
+                     f"Column '{doc_key}' not found in DataFrame. "
+                     f"Available columns: {available_columns}. "
+                     f"\n\nPossible solutions:\n"
+                     f"1. Use the 'Data' output from Docling component instead of 'DataFrame' output\n"
+                     f"2. Update the 'Doc Key' parameter to match one of the available columns\n"
+                     f"3. If using VLM pipeline, try using the standard pipeline"
+                 )
+                 raise TypeError(msg)
      else:
          if not data_inputs:
              msg = "No data inputs provided"
@@ -69,7 +120,7 @@ def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_ke
          except AttributeError as e:
              msg = f"Invalid input type in collection: {e}"
              raise TypeError(msg) from e
-     return documents
+     return documents, warning_message


  def _unwrap_secrets(obj):
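Every caller of extract_docling_documents must now unpack a (documents, warning) tuple; the two Docling components changed below surface the warning through self.status. A minimal usage sketch, where `data_inputs` stands in for whatever Data, list[Data], or DataFrame arrives from upstream:

```python
from lfx.base.data.docling_utils import extract_docling_documents

# `data_inputs` is assumed to come from an upstream Docling component.
documents, warning = extract_docling_documents(data_inputs, doc_key="doc")
if warning is not None:
    # The doc_key column was missing but DoclingDocuments were found in
    # another column; callers report this instead of failing outright.
    print(warning)
```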
lfx/base/data/storage_utils.py CHANGED
@@ -190,3 +190,112 @@ def file_exists(file_path: str, storage_service: StorageService | None = None) -
          return False
      else:
          return True
+
+
+ # Magic bytes signatures for common image formats
+ MIN_IMAGE_HEADER_SIZE = 12  # Minimum bytes needed to detect image type
+
+ IMAGE_SIGNATURES: dict[str, list[tuple[bytes, int]]] = {
+     "jpeg": [(b"\xff\xd8\xff", 0)],
+     "jpg": [(b"\xff\xd8\xff", 0)],
+     "png": [(b"\x89PNG\r\n\x1a\n", 0)],
+     "gif": [(b"GIF87a", 0), (b"GIF89a", 0)],
+     "webp": [(b"RIFF", 0)],  # WebP starts with RIFF, then has WEBP at offset 8
+     "bmp": [(b"BM", 0)],
+     "tiff": [(b"II*\x00", 0), (b"MM\x00*", 0)],  # Little-endian and big-endian TIFF
+ }
+
+
+ def detect_image_type_from_bytes(content: bytes) -> str | None:
+     """Detect the actual image type from file content using magic bytes.
+
+     Args:
+         content: The file content bytes (at least first 12 bytes needed)
+
+     Returns:
+         str | None: The detected image type (e.g., "jpeg", "png") or None if not recognized
+     """
+     if len(content) < MIN_IMAGE_HEADER_SIZE:
+         return None
+
+     # Check WebP specifically (needs to check both RIFF and WEBP)
+     if content[:4] == b"RIFF" and content[8:12] == b"WEBP":
+         return "webp"
+
+     # Check other image signatures
+     for image_type, signatures in IMAGE_SIGNATURES.items():
+         if image_type == "webp":
+             continue  # Already handled above
+         for signature, offset in signatures:
+             if content[offset : offset + len(signature)] == signature:
+                 return image_type
+
+     return None
+
+
+ def validate_image_content_type(
+     file_path: str,
+     content: bytes | None = None,
+     storage_service: StorageService | None = None,
+     resolve_path: Callable[[str], str] | None = None,
+ ) -> tuple[bool, str | None]:
+     """Validate that an image file's content matches its declared extension.
+
+     This prevents errors like "Image does not match the provided media type image/png"
+     when a JPEG file is saved with a .png extension.
+
+     Only rejects files when we can definitively detect a mismatch. Files with
+     unrecognized content are allowed through (they may fail later, but that's
+     better than false positives blocking valid files).
+
+     Args:
+         file_path: Path to the image file
+         content: Optional pre-read file content bytes. If not provided, will read from file.
+         storage_service: Optional storage service instance for S3 files
+         resolve_path: Optional function to resolve relative paths
+
+     Returns:
+         tuple[bool, str | None]: (is_valid, error_message)
+         - (True, None) if the content matches the extension, is unrecognized, or file is not an image
+         - (False, error_message) if there's a definite mismatch
+     """
+     # Get the file extension
+     path_obj = Path(file_path)
+     extension = path_obj.suffix[1:].lower() if path_obj.suffix else ""
+
+     # Only validate image files
+     image_extensions = {"jpeg", "jpg", "png", "gif", "webp", "bmp", "tiff"}
+     if extension not in image_extensions:
+         return True, None
+
+     # Read content if not provided
+     if content is None:
+         try:
+             content = run_until_complete(read_file_bytes(file_path, storage_service, resolve_path))
+         except (FileNotFoundError, ValueError):
+             # Can't read file - let it pass, will fail later with better error
+             return True, None
+
+     # Detect actual image type
+     detected_type = detect_image_type_from_bytes(content)
+
+     # If we can't detect the type, the file is not a valid image
+     if detected_type is None:
+         return False, (
+             f"File '{path_obj.name}' has extension '.{extension}' but its content "
+             f"is not a valid image format. The file may be corrupted, empty, or not a real image."
+         )
+
+     # Normalize extensions for comparison (jpg == jpeg, tif == tiff)
+     extension_normalized = "jpeg" if extension == "jpg" else extension
+     detected_normalized = "jpeg" if detected_type == "jpg" else detected_type
+
+     if extension_normalized != detected_normalized:
+         return False, (
+             f"File '{path_obj.name}' has extension '.{extension}' but contains "
+             f"'{detected_type.upper()}' image data. This mismatch will cause API errors. "
+             f"Please rename the file with the correct extension '.{detected_type}' or "
+             f"re-save it in the correct format."
+         )
+
+     return True, None
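A quick usage sketch of the two new helpers, with header bytes built by hand; detection needs at least 12 bytes, so the headers are padded:

```python
from lfx.base.data.storage_utils import (
    detect_image_type_from_bytes,
    validate_image_content_type,
)

# A PNG signature padded past MIN_IMAGE_HEADER_SIZE is detected directly.
png_header = b"\x89PNG\r\n\x1a\n" + b"\x00" * 8
assert detect_image_type_from_bytes(png_header) == "png"

# A JPEG payload saved under a .png name is exactly the mismatch this guards against.
jpeg_header = b"\xff\xd8\xff\xe0" + b"\x00" * 12
is_valid, error = validate_image_content_type("photo.png", content=jpeg_header)
assert not is_valid
assert "JPEG" in error
```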
lfx/base/mcp/util.py CHANGED
@@ -23,6 +23,7 @@ from pydantic import BaseModel
  from lfx.log.logger import logger
  from lfx.schema.json_schema import create_input_schema_from_json_schema
  from lfx.services.deps import get_settings_service
+ from lfx.utils.async_helpers import run_until_complete

  HTTP_ERROR_STATUS_CODE = httpx_codes.BAD_REQUEST  # HTTP status code for client errors

@@ -351,8 +352,7 @@ def create_tool_func(tool_name: str, arg_schema: type[BaseModel], client) -> Cal
              _handle_tool_validation_error(e, tool_name, provided_args, arg_schema)

          try:
-             loop = asyncio.get_event_loop()
-             return loop.run_until_complete(client.run_tool(tool_name, arguments=validated.model_dump()))
+             return run_until_complete(client.run_tool(tool_name, arguments=validated.model_dump()))
          except Exception as e:
              logger.error(f"Tool '{tool_name}' execution failed: {e}")
              # Re-raise with more context
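The diff imports run_until_complete from lfx.utils.async_helpers without showing its body. As an assumption about its likely shape (not the package's actual implementation): asyncio.get_event_loop() is deprecated for this use and fails in threads without a loop, so such helpers typically branch on whether a loop is already running:

```python
import asyncio
from collections.abc import Coroutine
from typing import Any, TypeVar

T = TypeVar("T")


def run_until_complete(coro: Coroutine[Any, Any, T]) -> T:
    """Sketch of a loop-safe runner; the real helper in lfx may differ."""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running in this thread: asyncio.run creates and closes one.
        return asyncio.run(coro)
    # A loop is already running; the real helper presumably hands the coroutine
    # to a worker thread. This sketch only makes the failure mode explicit.
    msg = "cannot block on a coroutine inside a running event loop"
    raise RuntimeError(msg)
```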
lfx/base/models/anthropic_constants.py CHANGED
@@ -2,32 +2,41 @@ from .model_metadata import create_model_metadata

  ANTHROPIC_MODELS_DETAILED = [
      # Tool calling supported models
+     create_model_metadata(provider="Anthropic", name="claude-opus-4-5-20251101", icon="Anthropic", tool_calling=True),
+     create_model_metadata(provider="Anthropic", name="claude-haiku-4-5-20251001", icon="Anthropic", tool_calling=True),
      create_model_metadata(provider="Anthropic", name="claude-sonnet-4-5-20250929", icon="Anthropic", tool_calling=True),
      create_model_metadata(provider="Anthropic", name="claude-opus-4-1-20250805", icon="Anthropic", tool_calling=True),
      create_model_metadata(provider="Anthropic", name="claude-opus-4-20250514", icon="Anthropic", tool_calling=True),
      create_model_metadata(provider="Anthropic", name="claude-sonnet-4-20250514", icon="Anthropic", tool_calling=True),
-     create_model_metadata(provider="Anthropic", name="claude-3-7-sonnet-latest", icon="Anthropic", tool_calling=True),
-     create_model_metadata(provider="Anthropic", name="claude-3-5-sonnet-latest", icon="Anthropic", tool_calling=True),
-     create_model_metadata(provider="Anthropic", name="claude-3-5-haiku-latest", icon="Anthropic", tool_calling=True),
-     create_model_metadata(provider="Anthropic", name="claude-3-opus-latest", icon="Anthropic", tool_calling=True),
+     create_model_metadata(provider="Anthropic", name="claude-3-5-haiku-20241022", icon="Anthropic", tool_calling=True),
+     create_model_metadata(provider="Anthropic", name="claude-3-haiku-20240307", icon="Anthropic", tool_calling=True),
+     # Deprecated models
+     create_model_metadata(
+         provider="Anthropic", name="claude-3-7-sonnet-latest", icon="Anthropic", tool_calling=True, deprecated=True
+     ),
+     create_model_metadata(
+         provider="Anthropic", name="claude-3-5-sonnet-latest", icon="Anthropic", tool_calling=True, deprecated=True
+     ),
+     create_model_metadata(
+         provider="Anthropic", name="claude-3-5-haiku-latest", icon="Anthropic", tool_calling=True, deprecated=True
+     ),
+     create_model_metadata(
+         provider="Anthropic", name="claude-3-opus-latest", icon="Anthropic", tool_calling=True, deprecated=True
+     ),
      create_model_metadata(
          provider="Anthropic", name="claude-3-sonnet-20240229", icon="Anthropic", tool_calling=True, deprecated=True
      ),
-     # Tool calling unsupported models
-     create_model_metadata(provider="Anthropic", name="claude-2.1", icon="Anthropic", tool_calling=False),
-     create_model_metadata(provider="Anthropic", name="claude-2.0", icon="Anthropic", tool_calling=False),
-     # Deprecated models
      create_model_metadata(
-         provider="Anthropic", name="claude-3-5-sonnet-20240620", icon="Anthropic", tool_calling=True, deprecated=True
+         provider="Anthropic", name="claude-2.1", icon="Anthropic", tool_calling=False, deprecated=True
      ),
      create_model_metadata(
-         provider="Anthropic", name="claude-3-5-sonnet-20241022", icon="Anthropic", tool_calling=True, deprecated=True
+         provider="Anthropic", name="claude-2.0", icon="Anthropic", tool_calling=False, deprecated=True
      ),
      create_model_metadata(
-         provider="Anthropic", name="claude-3-5-haiku-20241022", icon="Anthropic", tool_calling=True, deprecated=True
+         provider="Anthropic", name="claude-3-5-sonnet-20240620", icon="Anthropic", tool_calling=True, deprecated=True
      ),
      create_model_metadata(
-         provider="Anthropic", name="claude-3-haiku-20240307", icon="Anthropic", tool_calling=True, deprecated=True
+         provider="Anthropic", name="claude-3-5-sonnet-20241022", icon="Anthropic", tool_calling=True, deprecated=True
      ),
  ]

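With the older model aliases now flagged, consumers can split the list on the deprecated flag. A sketch, assuming the entries returned by create_model_metadata are dict-like (their exact type is not shown in this diff):

```python
from lfx.base.models.anthropic_constants import ANTHROPIC_MODELS_DETAILED

# Assumption: entries support .get(), as plain dicts would.
active = [m for m in ANTHROPIC_MODELS_DETAILED if not m.get("deprecated")]
tool_capable = [m["name"] for m in active if m.get("tool_calling")]
# Expected to include the new claude-opus-4-5-20251101 and claude-haiku-4-5-20251001.
```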
lfx/cli/commands.py CHANGED
@@ -304,12 +304,14 @@ async def serve_command(

      # Start the server
      try:
-         uvicorn.run(
+         config = uvicorn.Config(
              serve_app,
              host=host,
              port=port,
              log_level=log_level,
          )
+         server = uvicorn.Server(config)
+         await server.serve()
      except KeyboardInterrupt:
          verbose_print("\n👋 Server stopped")
          raise typer.Exit(0) from None
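The motivation: serve_command is async, and uvicorn.run() internally calls asyncio.run(), which raises RuntimeError when a loop is already running; uvicorn.Config plus Server.serve() awaits the server on the current loop instead. A self-contained sketch with a trivial ASGI app standing in for serve_app:

```python
import asyncio

import uvicorn


async def app(scope, receive, send):
    """Minimal ASGI app used as a stand-in for serve_app."""
    if scope["type"] == "http":
        await send({"type": "http.response.start", "status": 200, "headers": []})
        await send({"type": "http.response.body", "body": b"ok"})


async def main() -> None:
    config = uvicorn.Config(app, host="127.0.0.1", port=8000, log_level="info")
    server = uvicorn.Server(config)
    await server.serve()  # runs on the already-active event loop


if __name__ == "__main__":
    asyncio.run(main())
```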
lfx/components/docling/chunk_docling_document.py CHANGED
@@ -115,7 +115,9 @@ class ChunkDoclingDocumentComponent(Component):
          return [Data(text=doc.page_content, data=doc.metadata) for doc in docs]

      def chunk_documents(self) -> DataFrame:
-         documents = extract_docling_documents(self.data_inputs, self.doc_key)
+         documents, warning = extract_docling_documents(self.data_inputs, self.doc_key)
+         if warning:
+             self.status = warning

          chunker: BaseChunker
          if self.chunker == "HybridChunker":
lfx/components/docling/export_docling_document.py CHANGED
@@ -86,7 +86,9 @@ class ExportDoclingDocumentComponent(Component):
          return build_config

      def export_document(self) -> list[Data]:
-         documents = extract_docling_documents(self.data_inputs, self.doc_key)
+         documents, warning = extract_docling_documents(self.data_inputs, self.doc_key)
+         if warning:
+             self.status = warning

          results: list[Data] = []
          try:
lfx/components/files_and_knowledge/file.py CHANGED
@@ -21,7 +21,7 @@ from tempfile import NamedTemporaryFile
  from typing import Any

  from lfx.base.data.base_file import BaseFileComponent
- from lfx.base.data.storage_utils import parse_storage_path
+ from lfx.base.data.storage_utils import parse_storage_path, validate_image_content_type
  from lfx.base.data.utils import TEXT_FILE_TYPES, parallel_load_data, parse_text_file_to_data
  from lfx.inputs.inputs import DropdownInput, MessageTextInput, StrInput
  from lfx.io import BoolInput, FileInput, IntInput, Output
@@ -748,6 +748,27 @@ class FileComponent(BaseFileComponent):
              msg = "No files to process."
              raise ValueError(msg)

+         # Validate image files to detect content/extension mismatches
+         # This prevents API errors like "Image does not match the provided media type"
+         image_extensions = {"jpeg", "jpg", "png", "gif", "webp", "bmp", "tiff"}
+         for file in file_list:
+             extension = file.path.suffix[1:].lower()
+             if extension in image_extensions:
+                 # file.path is already resolved, read bytes directly
+                 try:
+                     content = file.path.read_bytes()
+                     is_valid, error_msg = validate_image_content_type(
+                         str(file.path),
+                         content=content,
+                     )
+                     if not is_valid:
+                         self.log(error_msg)
+                         if not self.silent_errors:
+                             raise ValueError(error_msg)
+                 except OSError as e:
+                     self.log(f"Could not read file for validation: {e}")
+                     # Continue - let it fail later with better error
+
          # Validate that files requiring Docling are only processed when advanced mode is enabled
          if not self.advanced_mode:
              for file in file_list:
@@ -786,7 +807,8 @@ class FileComponent(BaseFileComponent):
                  # --- UNNEST: expand each element in `doc` to its own Data row
                  payload = getattr(advanced_data, "data", {}) or {}
                  doc_rows = payload.get("doc")
-                 if isinstance(doc_rows, list):
+                 if isinstance(doc_rows, list) and doc_rows:
+                     # Non-empty list of structured rows
                      rows: list[Data | None] = [
                          Data(
                              data={
@@ -797,6 +819,19 @@ class FileComponent(BaseFileComponent):
                          for item in doc_rows
                      ]
                      final_return.extend(self.rollup_data(file_list, rows))
+                 elif isinstance(doc_rows, list) and not doc_rows:
+                     # Empty list - file was processed but no text content found
+                     # Create a Data object indicating no content was extracted
+                     self.log(f"No text extracted from '{file_path}', creating placeholder data")
+                     empty_data = Data(
+                         data={
+                             "file_path": file_path,
+                             "text": "(No text content extracted from image)",
+                             "info": "Image processed successfully but contained no extractable text",
+                             **{k: v for k, v in payload.items() if k != "doc"},
+                         },
+                     )
+                     final_return.extend(self.rollup_data([file], [empty_data]))
                  else:
                      # If not structured, keep as-is (e.g., markdown export or error dict)
                      final_return.extend(self.rollup_data(file_list, [advanced_data]))
@@ -820,13 +855,17 @@ class FileComponent(BaseFileComponent):
      def load_files_helper(self) -> DataFrame:
          result = self.load_files()

-         # Error condition - raise error if no text and an error is present
-         if not hasattr(result, "text"):
-             if hasattr(result, "error"):
-                 raise ValueError(result.error[0])
+         # Result is a DataFrame - check if it has any rows
+         if result.empty:
              msg = "Could not extract content from the provided file(s)."
              raise ValueError(msg)

+         # Check for error column with error messages
+         if "error" in result.columns:
+             errors = result["error"].dropna().tolist()
+             if errors and not any(col in result.columns for col in ["text", "doc", "exported_content"]):
+                 raise ValueError(errors[0])
+
          return result

      def load_files_dataframe(self) -> DataFrame:
@@ -838,4 +877,17 @@ class FileComponent(BaseFileComponent):
          """Load files using advanced Docling processing and export to Markdown format."""
          self.markdown = True
          result = self.load_files_helper()
-         return Message(text=str(result.text[0]))
+
+         # Result is a DataFrame - check for text or exported_content columns
+         if "text" in result.columns and not result["text"].isna().all():
+             text_values = result["text"].dropna().tolist()
+             if text_values:
+                 return Message(text=str(text_values[0]))
+
+         if "exported_content" in result.columns and not result["exported_content"].isna().all():
+             content_values = result["exported_content"].dropna().tolist()
+             if content_values:
+                 return Message(text=str(content_values[0]))
+
+         # Return empty message with info that no text was found
+         return Message(text="(No text content extracted from file)")
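The Message fallback above works purely on DataFrame columns. A sketch of the same extraction order on a hand-built frame (plain pandas here; the component operates on lfx's own DataFrame class):

```python
import pandas as pd


def first_text(result: pd.DataFrame) -> str:
    """Mirror the fallback order above: 'text' first, then 'exported_content'."""
    for col in ("text", "exported_content"):
        if col in result.columns and not result[col].isna().all():
            values = result[col].dropna().tolist()
            if values:
                return str(values[0])
    return "(No text content extracted from file)"


df = pd.DataFrame({"text": [None], "exported_content": ["# Title\n\nBody"]})
assert first_text(df) == "# Title\n\nBody"
```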