lfx-nightly 0.2.0.dev26__py3-none-any.whl → 0.2.1.dev7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. lfx/_assets/component_index.json +1 -1
  2. lfx/base/agents/agent.py +9 -4
  3. lfx/base/agents/altk_base_agent.py +16 -3
  4. lfx/base/agents/altk_tool_wrappers.py +1 -1
  5. lfx/base/agents/utils.py +4 -0
  6. lfx/base/composio/composio_base.py +78 -41
  7. lfx/base/data/base_file.py +14 -4
  8. lfx/base/data/cloud_storage_utils.py +156 -0
  9. lfx/base/data/docling_utils.py +191 -65
  10. lfx/base/data/storage_utils.py +109 -0
  11. lfx/base/datastax/astradb_base.py +75 -64
  12. lfx/base/mcp/util.py +2 -2
  13. lfx/base/models/__init__.py +11 -1
  14. lfx/base/models/anthropic_constants.py +21 -12
  15. lfx/base/models/google_generative_ai_constants.py +33 -9
  16. lfx/base/models/model_metadata.py +6 -0
  17. lfx/base/models/ollama_constants.py +196 -30
  18. lfx/base/models/openai_constants.py +37 -10
  19. lfx/base/models/unified_models.py +1123 -0
  20. lfx/base/models/watsonx_constants.py +36 -0
  21. lfx/base/tools/component_tool.py +2 -9
  22. lfx/cli/commands.py +6 -1
  23. lfx/cli/run.py +65 -409
  24. lfx/cli/script_loader.py +13 -3
  25. lfx/components/__init__.py +0 -3
  26. lfx/components/composio/github_composio.py +1 -1
  27. lfx/components/cuga/cuga_agent.py +39 -27
  28. lfx/components/data_source/api_request.py +4 -2
  29. lfx/components/docling/__init__.py +45 -11
  30. lfx/components/docling/chunk_docling_document.py +3 -1
  31. lfx/components/docling/docling_inline.py +39 -49
  32. lfx/components/docling/export_docling_document.py +3 -1
  33. lfx/components/elastic/opensearch_multimodal.py +215 -57
  34. lfx/components/files_and_knowledge/file.py +439 -39
  35. lfx/components/files_and_knowledge/ingestion.py +8 -0
  36. lfx/components/files_and_knowledge/retrieval.py +10 -0
  37. lfx/components/files_and_knowledge/save_file.py +123 -53
  38. lfx/components/ibm/watsonx.py +7 -1
  39. lfx/components/input_output/chat_output.py +7 -1
  40. lfx/components/langchain_utilities/tool_calling.py +14 -6
  41. lfx/components/llm_operations/batch_run.py +80 -25
  42. lfx/components/llm_operations/lambda_filter.py +33 -6
  43. lfx/components/llm_operations/llm_conditional_router.py +39 -7
  44. lfx/components/llm_operations/structured_output.py +38 -12
  45. lfx/components/models/__init__.py +16 -74
  46. lfx/components/models_and_agents/agent.py +51 -201
  47. lfx/components/models_and_agents/embedding_model.py +185 -339
  48. lfx/components/models_and_agents/language_model.py +54 -318
  49. lfx/components/models_and_agents/mcp_component.py +58 -9
  50. lfx/components/ollama/ollama.py +9 -4
  51. lfx/components/ollama/ollama_embeddings.py +2 -1
  52. lfx/components/openai/openai_chat_model.py +1 -1
  53. lfx/components/processing/__init__.py +0 -3
  54. lfx/components/vllm/__init__.py +37 -0
  55. lfx/components/vllm/vllm.py +141 -0
  56. lfx/components/vllm/vllm_embeddings.py +110 -0
  57. lfx/custom/custom_component/custom_component.py +8 -6
  58. lfx/custom/directory_reader/directory_reader.py +5 -2
  59. lfx/graph/utils.py +64 -18
  60. lfx/inputs/__init__.py +2 -0
  61. lfx/inputs/input_mixin.py +54 -0
  62. lfx/inputs/inputs.py +115 -0
  63. lfx/interface/initialize/loading.py +42 -12
  64. lfx/io/__init__.py +2 -0
  65. lfx/run/__init__.py +5 -0
  66. lfx/run/base.py +494 -0
  67. lfx/schema/data.py +1 -1
  68. lfx/schema/image.py +28 -19
  69. lfx/schema/message.py +19 -3
  70. lfx/services/interfaces.py +5 -0
  71. lfx/services/manager.py +5 -4
  72. lfx/services/mcp_composer/service.py +45 -13
  73. lfx/services/settings/auth.py +18 -11
  74. lfx/services/settings/base.py +12 -24
  75. lfx/services/settings/constants.py +2 -0
  76. lfx/services/storage/local.py +37 -0
  77. lfx/services/storage/service.py +19 -0
  78. lfx/utils/constants.py +1 -0
  79. lfx/utils/image.py +29 -11
  80. lfx/utils/validate_cloud.py +14 -3
  81. {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/METADATA +5 -2
  82. {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/RECORD +84 -78
  83. lfx/components/processing/dataframe_to_toolset.py +0 -259
  84. {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/WHEEL +0 -0
  85. {lfx_nightly-0.2.0.dev26.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/entry_points.txt +0 -0
lfx/components/files_and_knowledge/file.py
@@ -21,15 +21,25 @@ from tempfile import NamedTemporaryFile
 from typing import Any
 
 from lfx.base.data.base_file import BaseFileComponent
-from lfx.base.data.storage_utils import parse_storage_path
+from lfx.base.data.storage_utils import parse_storage_path, read_file_bytes, validate_image_content_type
 from lfx.base.data.utils import TEXT_FILE_TYPES, parallel_load_data, parse_text_file_to_data
+from lfx.inputs import SortableListInput
 from lfx.inputs.inputs import DropdownInput, MessageTextInput, StrInput
-from lfx.io import BoolInput, FileInput, IntInput, Output
+from lfx.io import BoolInput, FileInput, IntInput, Output, SecretStrInput
 from lfx.schema.data import Data
 from lfx.schema.dataframe import DataFrame  # noqa: TC001
 from lfx.schema.message import Message
 from lfx.services.deps import get_settings_service, get_storage_service
 from lfx.utils.async_helpers import run_until_complete
+from lfx.utils.validate_cloud import is_astra_cloud_environment
+
+
+def _get_storage_location_options():
+    """Get storage location options, filtering out Local if in Astra cloud environment."""
+    all_options = [{"name": "AWS", "icon": "Amazon"}, {"name": "Google Drive", "icon": "google"}]
+    if is_astra_cloud_environment():
+        return all_options
+    return [{"name": "Local", "icon": "hard-drive"}, *all_options]
 
 
 class FileComponent(BaseFileComponent):
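Note: the options returned by this helper feed the new Storage Location selector added below. A minimal standalone sketch of its two output shapes; the environment check is stubbed here, since the real one lives in lfx/utils/validate_cloud.py and is not part of this diff:

```python
# Stub standing in for lfx.utils.validate_cloud.is_astra_cloud_environment;
# the real detection logic is not shown in this diff.
IN_ASTRA_CLOUD = False  # flip to True to simulate the Astra cloud environment


def is_astra_cloud_environment() -> bool:
    return IN_ASTRA_CLOUD


def _get_storage_location_options():
    """Mirror of the helper added above: Local is omitted in Astra cloud."""
    all_options = [{"name": "AWS", "icon": "Amazon"}, {"name": "Google Drive", "icon": "google"}]
    if is_astra_cloud_environment():
        return all_options
    return [{"name": "Local", "icon": "hard-drive"}, *all_options]


print([opt["name"] for opt in _get_storage_location_options()])
# IN_ASTRA_CLOUD = False -> ['Local', 'AWS', 'Google Drive']
# IN_ASTRA_CLOUD = True  -> ['AWS', 'Google Drive']
```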
@@ -91,6 +101,15 @@ class FileComponent(BaseFileComponent):
                 break
 
     inputs = [
+        SortableListInput(
+            name="storage_location",
+            display_name="Storage Location",
+            placeholder="Select Location",
+            info="Choose where to read the file from.",
+            options=_get_storage_location_options(),
+            real_time_refresh=True,
+            limit=1,
+        ),
         *_base_inputs,
         StrInput(
             name="file_path_str",
@@ -104,6 +123,63 @@
             tool_mode=True,  # Required for Toolset toggle, but _get_tools() ignores this parameter
             required=False,
         ),
+        # AWS S3 specific inputs
+        SecretStrInput(
+            name="aws_access_key_id",
+            display_name="AWS Access Key ID",
+            info="AWS Access key ID.",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        SecretStrInput(
+            name="aws_secret_access_key",
+            display_name="AWS Secret Key",
+            info="AWS Secret Key.",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        StrInput(
+            name="bucket_name",
+            display_name="S3 Bucket Name",
+            info="Enter the name of the S3 bucket.",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        StrInput(
+            name="aws_region",
+            display_name="AWS Region",
+            info="AWS region (e.g., us-east-1, eu-west-1).",
+            show=False,
+            advanced=False,
+        ),
+        StrInput(
+            name="s3_file_key",
+            display_name="S3 File Key",
+            info="The key (path) of the file in S3 bucket.",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        # Google Drive specific inputs
+        SecretStrInput(
+            name="service_account_key",
+            display_name="GCP Credentials Secret Key",
+            info="Your Google Cloud Platform service account JSON key as a secret string (complete JSON content).",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        StrInput(
+            name="file_id",
+            display_name="Google Drive File ID",
+            info=("The Google Drive file ID to read. The file must be shared with the service account email."),
+            show=False,
+            advanced=False,
+            required=True,
+        ),
         BoolInput(
             name="advanced_mode",
             display_name="Advanced Parser",
@@ -113,7 +189,8 @@
                 "Enable advanced document processing and export with Docling for PDFs, images, and office documents. "
                 "Note that advanced document processing can consume significant resources."
             ),
-            show=True,
+            # Disabled in cloud
+            show=not is_astra_cloud_environment(),
         ),
         DropdownInput(
             name="pipeline",
@@ -269,6 +346,20 @@
         """Return the list of currently selected file paths from the template."""
         return template.get("path", {}).get("file_path", [])
 
+    def _disable_docling_fields_in_cloud(self, build_config: dict[str, Any]) -> None:
+        """Disable all Docling-related fields in cloud environments."""
+        if "advanced_mode" in build_config:
+            build_config["advanced_mode"]["show"] = False
+            build_config["advanced_mode"]["value"] = False
+        # Hide all Docling-related fields
+        docling_fields = ("pipeline", "ocr_engine", "doc_key", "md_image_placeholder", "md_page_break_placeholder")
+        for field in docling_fields:
+            if field in build_config:
+                build_config[field]["show"] = False
+        # Also disable OCR engine specifically
+        if "ocr_engine" in build_config:
+            build_config["ocr_engine"]["value"] = "None"
+
     def update_build_config(
         self,
         build_config: dict[str, Any],
@@ -276,28 +367,120 @@
         field_name: str | None = None,
     ) -> dict[str, Any]:
         """Show/hide Advanced Parser and related fields based on selection context."""
+        # Update storage location options dynamically based on cloud environment
+        if "storage_location" in build_config:
+            updated_options = _get_storage_location_options()
+            build_config["storage_location"]["options"] = updated_options
+
+        # Handle storage location selection
+        if field_name == "storage_location":
+            # Extract selected storage location
+            selected = [location["name"] for location in field_value] if isinstance(field_value, list) else []
+
+            # Hide all storage-specific fields first
+            storage_fields = [
+                "aws_access_key_id",
+                "aws_secret_access_key",
+                "bucket_name",
+                "aws_region",
+                "s3_file_key",
+                "service_account_key",
+                "file_id",
+            ]
+
+            for f_name in storage_fields:
+                if f_name in build_config:
+                    build_config[f_name]["show"] = False
+
+            # Show fields based on selected storage location
+            if len(selected) == 1:
+                location = selected[0]
+
+                if location == "Local":
+                    # Show file upload input for local storage
+                    if "path" in build_config:
+                        build_config["path"]["show"] = True
+
+                elif location == "AWS":
+                    # Hide file upload input, show AWS fields
+                    if "path" in build_config:
+                        build_config["path"]["show"] = False
+
+                    aws_fields = [
+                        "aws_access_key_id",
+                        "aws_secret_access_key",
+                        "bucket_name",
+                        "aws_region",
+                        "s3_file_key",
+                    ]
+                    for f_name in aws_fields:
+                        if f_name in build_config:
+                            build_config[f_name]["show"] = True
+                            build_config[f_name]["advanced"] = False
+
+                elif location == "Google Drive":
+                    # Hide file upload input, show Google Drive fields
+                    if "path" in build_config:
+                        build_config["path"]["show"] = False
+
+                    gdrive_fields = ["service_account_key", "file_id"]
+                    for f_name in gdrive_fields:
+                        if f_name in build_config:
+                            build_config[f_name]["show"] = True
+                            build_config[f_name]["advanced"] = False
+            # No storage location selected - show file upload by default
+            elif "path" in build_config:
+                build_config["path"]["show"] = True
+
+            return build_config
+
         if field_name == "path":
             paths = self._path_value(build_config)
 
-            # If all files can be processed by docling, do so
-            allow_advanced = all(not file_path.endswith((".csv", ".xlsx", ".parquet")) for file_path in paths)
-            build_config["advanced_mode"]["show"] = allow_advanced
-            if not allow_advanced:
-                build_config["advanced_mode"]["value"] = False
-                for f in ("pipeline", "ocr_engine", "doc_key", "md_image_placeholder", "md_page_break_placeholder"):
-                    if f in build_config:
-                        build_config[f]["show"] = False
+            # Disable in cloud environments
+            if is_astra_cloud_environment():
+                self._disable_docling_fields_in_cloud(build_config)
+            else:
+                # If all files can be processed by docling, do so
+                allow_advanced = all(not file_path.endswith((".csv", ".xlsx", ".parquet")) for file_path in paths)
+                build_config["advanced_mode"]["show"] = allow_advanced
+                if not allow_advanced:
+                    build_config["advanced_mode"]["value"] = False
+                    docling_fields = (
+                        "pipeline",
+                        "ocr_engine",
+                        "doc_key",
+                        "md_image_placeholder",
+                        "md_page_break_placeholder",
+                    )
+                    for field in docling_fields:
+                        if field in build_config:
+                            build_config[field]["show"] = False
 
         # Docling Processing
         elif field_name == "advanced_mode":
-            for f in ("pipeline", "ocr_engine", "doc_key", "md_image_placeholder", "md_page_break_placeholder"):
-                if f in build_config:
-                    build_config[f]["show"] = bool(field_value)
-                    if f == "pipeline":
-                        build_config[f]["advanced"] = not bool(field_value)
+            # Disable in cloud environments - don't show Docling fields even if advanced_mode is toggled
+            if is_astra_cloud_environment():
+                self._disable_docling_fields_in_cloud(build_config)
+            else:
+                docling_fields = (
+                    "pipeline",
+                    "ocr_engine",
+                    "doc_key",
+                    "md_image_placeholder",
+                    "md_page_break_placeholder",
+                )
+                for field in docling_fields:
+                    if field in build_config:
+                        build_config[field]["show"] = bool(field_value)
+                        if field == "pipeline":
+                            build_config[field]["advanced"] = not bool(field_value)
 
         elif field_name == "pipeline":
-            if field_value == "standard":
+            # Disable in cloud environments - don't show OCR engine even if pipeline is changed
+            if is_astra_cloud_environment():
+                self._disable_docling_fields_in_cloud(build_config)
+            elif field_value == "standard":
                 build_config["ocr_engine"]["show"] = True
                 build_config["ocr_engine"]["value"] = "easyocr"
             else:
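The storage_location branch above is plain dict manipulation, so its show/hide behavior can be checked in isolation. A simplified sketch follows; field names come from the diff, and the build_config entries are assumed to be dicts carrying a "show" flag, which is how the code indexes them:

```python
# Simplified reproduction of the storage_location branch of update_build_config.
STORAGE_FIELDS = [
    "aws_access_key_id", "aws_secret_access_key", "bucket_name",
    "aws_region", "s3_file_key", "service_account_key", "file_id",
]
AWS_FIELDS = STORAGE_FIELDS[:5]
GDRIVE_FIELDS = ["service_account_key", "file_id"]


def apply_storage_selection(build_config: dict, field_value: list) -> dict:
    selected = [loc["name"] for loc in field_value] if isinstance(field_value, list) else []
    for name in STORAGE_FIELDS:  # hide every storage-specific field first
        if name in build_config:
            build_config[name]["show"] = False
    if selected == ["AWS"]:
        build_config["path"]["show"] = False  # hide the file upload input
        for name in AWS_FIELDS:
            build_config[name]["show"] = True
    elif selected == ["Google Drive"]:
        build_config["path"]["show"] = False
        for name in GDRIVE_FIELDS:
            build_config[name]["show"] = True
    else:  # "Local" or nothing selected: fall back to the file upload input
        build_config["path"]["show"] = True
    return build_config


config = {name: {"show": False} for name in [*STORAGE_FIELDS, "path"]}
apply_storage_selection(config, [{"name": "AWS"}])
print([k for k, v in config.items() if v["show"]])
# ['aws_access_key_id', 'aws_secret_access_key', 'bucket_name', 'aws_region', 's3_file_key']
```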
@@ -368,15 +551,34 @@
 
     # ------------------------------ Core processing ----------------------------------
 
+    def _get_selected_storage_location(self) -> str:
+        """Get the selected storage location from the SortableListInput."""
+        if hasattr(self, "storage_location") and self.storage_location:
+            if isinstance(self.storage_location, list) and len(self.storage_location) > 0:
+                return self.storage_location[0].get("name", "")
+            if isinstance(self.storage_location, dict):
+                return self.storage_location.get("name", "")
+        return "Local"  # Default to Local if not specified
+
     def _validate_and_resolve_paths(self) -> list[BaseFileComponent.BaseFile]:
-        """Override to handle file_path_str input from tool mode.
+        """Override to handle file_path_str input from tool mode and cloud storage.
 
-        When called as a tool, the file_path_str parameter can be set.
-        If not provided, it will fall back to using the path FileInput (uploaded file).
         Priority:
-        1. file_path_str (if provided by the tool call)
-        2. path (uploaded file from UI)
+        1. Cloud storage (AWS/Google Drive) if selected
+        2. file_path_str (if provided by the tool call)
+        3. path (uploaded file from UI)
         """
+        storage_location = self._get_selected_storage_location()
+
+        # Handle AWS S3
+        if storage_location == "AWS":
+            return self._read_from_aws_s3()
+
+        # Handle Google Drive
+        if storage_location == "Google Drive":
+            return self._read_from_google_drive()
+
+        # Handle Local storage
         # Check if file_path_str is provided (from tool mode)
         file_path_str = getattr(self, "file_path_str", None)
         if file_path_str:
@@ -399,6 +601,101 @@
         # Otherwise use the default implementation (uses path FileInput)
         return super()._validate_and_resolve_paths()
 
+    def _read_from_aws_s3(self) -> list[BaseFileComponent.BaseFile]:
+        """Read file from AWS S3."""
+        from lfx.base.data.cloud_storage_utils import create_s3_client, validate_aws_credentials
+
+        # Validate AWS credentials
+        validate_aws_credentials(self)
+        if not getattr(self, "s3_file_key", None):
+            msg = "S3 File Key is required"
+            raise ValueError(msg)
+
+        # Create S3 client
+        s3_client = create_s3_client(self)
+
+        # Download file to temp location
+        import tempfile
+
+        # Get file extension from S3 key
+        file_extension = Path(self.s3_file_key).suffix or ""
+
+        with tempfile.NamedTemporaryFile(mode="wb", suffix=file_extension, delete=False) as temp_file:
+            temp_file_path = temp_file.name
+            try:
+                s3_client.download_fileobj(self.bucket_name, self.s3_file_key, temp_file)
+            except Exception as e:
+                # Clean up temp file on failure
+                with contextlib.suppress(OSError):
+                    Path(temp_file_path).unlink()
+                msg = f"Failed to download file from S3: {e}"
+                raise RuntimeError(msg) from e
+
+        # Create BaseFile object
+        from lfx.schema.data import Data
+
+        temp_path = Path(temp_file_path)
+        data_obj = Data(data={self.SERVER_FILE_PATH_FIELDNAME: str(temp_path)})
+        return [BaseFileComponent.BaseFile(data_obj, temp_path, delete_after_processing=True)]
+
+    def _read_from_google_drive(self) -> list[BaseFileComponent.BaseFile]:
+        """Read file from Google Drive."""
+        import tempfile
+
+        from googleapiclient.http import MediaIoBaseDownload
+
+        from lfx.base.data.cloud_storage_utils import create_google_drive_service
+
+        # Validate Google Drive credentials
+        if not getattr(self, "service_account_key", None):
+            msg = "GCP Credentials Secret Key is required for Google Drive storage"
+            raise ValueError(msg)
+        if not getattr(self, "file_id", None):
+            msg = "Google Drive File ID is required"
+            raise ValueError(msg)
+
+        # Create Google Drive service with read-only scope
+        drive_service = create_google_drive_service(
+            self.service_account_key, scopes=["https://www.googleapis.com/auth/drive.readonly"]
+        )
+
+        # Get file metadata to determine file name and extension
+        try:
+            file_metadata = drive_service.files().get(fileId=self.file_id, fields="name,mimeType").execute()
+            file_name = file_metadata.get("name", "download")
+        except Exception as e:
+            msg = (
+                f"Unable to access file with ID '{self.file_id}'. "
+                f"Error: {e!s}. "
+                "Please ensure: 1) The file ID is correct, 2) The file exists, "
+                "3) The service account has been granted access to this file."
+            )
+            raise ValueError(msg) from e
+
+        # Download file to temp location
+        file_extension = Path(file_name).suffix or ""
+        with tempfile.NamedTemporaryFile(mode="wb", suffix=file_extension, delete=False) as temp_file:
+            temp_file_path = temp_file.name
+            try:
+                request = drive_service.files().get_media(fileId=self.file_id)
+                downloader = MediaIoBaseDownload(temp_file, request)
+                done = False
+                while not done:
+                    _status, done = downloader.next_chunk()
+            except Exception as e:
+                # Clean up temp file on failure
+                with contextlib.suppress(OSError):
+                    Path(temp_file_path).unlink()
+                msg = f"Failed to download file from Google Drive: {e}"
+                raise RuntimeError(msg) from e
+
+        # Create BaseFile object
+        from lfx.schema.data import Data
+
+        temp_path = Path(temp_file_path)
+        data_obj = Data(data={self.SERVER_FILE_PATH_FIELDNAME: str(temp_path)})
+        return [BaseFileComponent.BaseFile(data_obj, temp_path, delete_after_processing=True)]
+
     def _is_docling_compatible(self, file_path: str) -> bool:
         """Lightweight extension gate for Docling-compatible types."""
         docling_exts = (
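create_s3_client and validate_aws_credentials come from the new lfx/base/data/cloud_storage_utils.py (+156), which this diff does not show. Assuming the client is a standard boto3 S3 client, the download-to-temp-file pattern in _read_from_aws_s3 reduces to the following sketch; the bucket, key, and credentials are placeholders:

```python
import tempfile
from pathlib import Path

import boto3  # assumed dependency behind create_s3_client; not confirmed by this diff

# Placeholder values - substitute real credentials, bucket, and key.
s3_client = boto3.client(
    "s3",
    aws_access_key_id="AKIA...",
    aws_secret_access_key="...",
    region_name="us-east-1",
)

s3_file_key = "reports/2024/summary.pdf"  # hypothetical key
file_extension = Path(s3_file_key).suffix or ""

# Same pattern as _read_from_aws_s3: keep the S3 key's extension on the temp
# file so downstream type detection (e.g., the Docling gating) still works.
with tempfile.NamedTemporaryFile(mode="wb", suffix=file_extension, delete=False) as temp_file:
    s3_client.download_fileobj("my-bucket", s3_file_key, temp_file)
    temp_file_path = temp_file.name

print(f"Downloaded to {temp_file_path}")
```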
@@ -515,9 +812,6 @@
             ),
         }
 
-        self.log(f"Starting Docling subprocess for file: {local_file_path}")
-        self.log(args)
-
         # Child script for isolating the docling processing
         child_script = textwrap.dedent(
             r"""
@@ -707,7 +1001,7 @@
         )
 
         if not proc.stdout:
-            err_msg = proc.stderr.decode("utf-8", errors="replace") or "no output from child process"
+            err_msg = proc.stderr.decode("utf-8", errors="replace") if proc.stderr else "no output from child process"
             return Data(data={"error": f"Docling subprocess error: {err_msg}", "file_path": original_file_path})
 
         try:
@@ -722,9 +1016,16 @@
         )
 
         if not result.get("ok"):
-            return Data(data={"error": result.get("error", "Unknown Docling error"), **result.get("meta", {})})
+            error_msg = result.get("error", "Unknown Docling error")
+            # Override meta file_path with original_file_path to ensure correct path matching
+            meta = result.get("meta", {})
+            meta["file_path"] = original_file_path
+            return Data(data={"error": error_msg, **meta})
 
         meta = result.get("meta", {})
+        # Override meta file_path with original_file_path to ensure correct path matching
+        # The subprocess returns the temp file path, but we need the original S3/local path for rollup_data
+        meta["file_path"] = original_file_path
         if result.get("mode") == "markdown":
             exported_content = str(result.get("text", ""))
             return Data(
@@ -748,15 +1049,50 @@
             msg = "No files to process."
             raise ValueError(msg)
 
+        # Validate image files to detect content/extension mismatches
+        # This prevents API errors like "Image does not match the provided media type"
+        image_extensions = {"jpeg", "jpg", "png", "gif", "webp", "bmp", "tiff"}
+        settings = get_settings_service().settings
+        for file in file_list:
+            extension = file.path.suffix[1:].lower()
+            if extension in image_extensions:
+                # Read bytes based on storage type
+                try:
+                    if settings.storage_type == "s3":
+                        # For S3 storage, use storage service to read file bytes
+                        file_path_str = str(file.path)
+                        content = run_until_complete(read_file_bytes(file_path_str))
+                    else:
+                        # For local storage, read bytes directly from filesystem
+                        content = file.path.read_bytes()
+
+                    is_valid, error_msg = validate_image_content_type(
+                        str(file.path),
+                        content=content,
+                    )
+                    if not is_valid:
+                        self.log(error_msg)
+                        if not self.silent_errors:
+                            raise ValueError(error_msg)
+                except (OSError, FileNotFoundError) as e:
+                    self.log(f"Could not read file for validation: {e}")
+                    # Continue - let it fail later with better error
+
         # Validate that files requiring Docling are only processed when advanced mode is enabled
         if not self.advanced_mode:
             for file in file_list:
                 extension = file.path.suffix[1:].lower()
                 if extension in self.DOCLING_ONLY_EXTENSIONS:
-                    msg = (
-                        f"File '{file.path.name}' has extension '.{extension}' which requires "
-                        f"Advanced Parser mode. Please enable 'Advanced Parser' to process this file."
-                    )
+                    if is_astra_cloud_environment():
+                        msg = (
+                            f"File '{file.path.name}' has extension '.{extension}' which requires "
+                            f"Advanced Parser mode. Advanced Parser is not available in cloud environments."
+                        )
+                    else:
+                        msg = (
+                            f"File '{file.path.name}' has extension '.{extension}' which requires "
+                            f"Advanced Parser mode. Please enable 'Advanced Parser' to process this file."
+                        )
                     self.log(msg)
                     raise ValueError(msg)
 
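validate_image_content_type is defined in the new lfx/base/data/storage_utils.py (+109), which is not part of this diff; only its call shape is visible above (path plus raw bytes in, an (is_valid, error_msg) pair out). A hypothetical magic-byte check with that shape, for illustration only:

```python
# Sketch of a content/extension mismatch check in the spirit of
# validate_image_content_type; the real implementation is not shown in this diff.
_MAGIC_BYTES = {
    "png": b"\x89PNG\r\n\x1a\n",
    "gif": b"GIF8",
    "jpg": b"\xff\xd8\xff",
    "jpeg": b"\xff\xd8\xff",
    "bmp": b"BM",
}


def validate_image_content_type(file_path: str, content: bytes) -> tuple[bool, str | None]:
    extension = file_path.rsplit(".", 1)[-1].lower()
    magic = _MAGIC_BYTES.get(extension)
    if magic is None:
        return True, None  # extension not covered by this sketch
    if content.startswith(magic):
        return True, None
    return False, f"Content of '{file_path}' does not match its '.{extension}' extension"


ok, err = validate_image_content_type("photo.png", b"\xff\xd8\xff\xe0fake-jpeg-bytes")
print(ok, err)  # False plus a mismatch message: the bytes are JPEG, the name says PNG
```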
@@ -783,10 +1119,36 @@
             file_path = str(file.path)
             advanced_data: Data | None = self._process_docling_in_subprocess(file_path)
 
+            # Handle None case - Docling processing failed or returned None
+            if advanced_data is None:
+                error_data = Data(
+                    data={
+                        "file_path": file_path,
+                        "error": "Docling processing returned no result. Check logs for details.",
+                    },
+                )
+                final_return.extend(self.rollup_data([file], [error_data]))
+                continue
+
             # --- UNNEST: expand each element in `doc` to its own Data row
             payload = getattr(advanced_data, "data", {}) or {}
+
+            # Check for errors first
+            if "error" in payload:
+                error_msg = payload.get("error", "Unknown error")
+                error_data = Data(
+                    data={
+                        "file_path": file_path,
+                        "error": error_msg,
+                        **{k: v for k, v in payload.items() if k not in ("error", "file_path")},
+                    },
+                )
+                final_return.extend(self.rollup_data([file], [error_data]))
+                continue
+
             doc_rows = payload.get("doc")
-            if isinstance(doc_rows, list):
+            if isinstance(doc_rows, list) and doc_rows:
+                # Non-empty list of structured rows
                 rows: list[Data | None] = [
                     Data(
                         data={
@@ -796,10 +1158,31 @@
                     )
                     for item in doc_rows
                 ]
-                final_return.extend(self.rollup_data(file_list, rows))
+                final_return.extend(self.rollup_data([file], rows))
+            elif isinstance(doc_rows, list) and not doc_rows:
+                # Empty list - file was processed but no text content found
+                # Create a Data object indicating no content was extracted
+                self.log(f"No text extracted from '{file_path}', creating placeholder data")
+                empty_data = Data(
+                    data={
+                        "file_path": file_path,
+                        "text": "(No text content extracted from image)",
+                        "info": "Image processed successfully but contained no extractable text",
+                        **{k: v for k, v in payload.items() if k != "doc"},
+                    },
+                )
+                final_return.extend(self.rollup_data([file], [empty_data]))
             else:
                 # If not structured, keep as-is (e.g., markdown export or error dict)
-                final_return.extend(self.rollup_data(file_list, [advanced_data]))
+                # Ensure file_path is set for proper rollup matching
+                if not payload.get("file_path"):
+                    payload["file_path"] = file_path
+                # Create new Data with file_path
+                advanced_data = Data(
+                    data=payload,
+                    text=getattr(advanced_data, "text", None),
+                )
+                final_return.extend(self.rollup_data([file], [advanced_data]))
         return final_return
 
     # Standard multi-file (or single non-advanced) path
@@ -820,13 +1203,17 @@
     def load_files_helper(self) -> DataFrame:
         result = self.load_files()
 
-        # Error condition - raise error if no text and an error is present
-        if not hasattr(result, "text"):
-            if hasattr(result, "error"):
-                raise ValueError(result.error[0])
+        # Result is a DataFrame - check if it has any rows
+        if result.empty:
             msg = "Could not extract content from the provided file(s)."
             raise ValueError(msg)
 
+        # Check for error column with error messages
+        if "error" in result.columns:
+            errors = result["error"].dropna().tolist()
+            if errors and not any(col in result.columns for col in ["text", "doc", "exported_content"]):
+                raise ValueError(errors[0])
+
         return result
 
     def load_files_dataframe(self) -> DataFrame:
@@ -838,4 +1225,17 @@
         """Load files using advanced Docling processing and export to Markdown format."""
         self.markdown = True
         result = self.load_files_helper()
-        return Message(text=str(result.text[0]))
+
+        # Result is a DataFrame - check for text or exported_content columns
+        if "text" in result.columns and not result["text"].isna().all():
+            text_values = result["text"].dropna().tolist()
+            if text_values:
+                return Message(text=str(text_values[0]))
+
+        if "exported_content" in result.columns and not result["exported_content"].isna().all():
+            content_values = result["exported_content"].dropna().tolist()
+            if content_values:
+                return Message(text=str(content_values[0]))
+
+        # Return empty message with info that no text was found
+        return Message(text="(No text content extracted from file)")
lfx/components/files_and_knowledge/ingestion.py
@@ -38,6 +38,7 @@ from lfx.services.deps import (
     get_variable_service,
     session_scope,
 )
+from lfx.utils.validate_cloud import raise_error_if_astra_cloud_disable_component
 
 if TYPE_CHECKING:
     from lfx.schema.dataframe import DataFrame
@@ -50,6 +51,9 @@ COHERE_MODEL_NAMES = ["embed-english-v3.0", "embed-multilingual-v3.0"]
 
 _KNOWLEDGE_BASES_ROOT_PATH: Path | None = None
 
+# Error message to raise if we're in Astra cloud environment and the component is not supported.
+astra_error_msg = "Knowledge ingestion is not supported in Astra cloud environment."
+
 
 def _get_knowledge_bases_root_path() -> Path:
     """Lazy load the knowledge bases root path from settings."""
@@ -540,6 +544,8 @@ class KnowledgeIngestionComponent(Component):
     # ---------------------------------------------------------------------
     async def build_kb_info(self) -> Data:
        """Main ingestion routine → returns a dict with KB metadata."""
+        # Check if we're in Astra cloud environment and raise an error if we are.
+        raise_error_if_astra_cloud_disable_component(astra_error_msg)
        try:
            input_value = self.input_df[0] if isinstance(self.input_df, list) else self.input_df
            df_source: DataFrame = convert_to_dataframe(input_value, auto_parse=False)
@@ -626,6 +632,8 @@
         field_name: str | None = None,
     ):
         """Update build configuration based on provider selection."""
+        # Check if we're in Astra cloud environment and raise an error if we are.
+        raise_error_if_astra_cloud_disable_component(astra_error_msg)
         # Create a new knowledge base
         if field_name == "knowledge_base":
             async with session_scope() as db:
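raise_error_if_astra_cloud_disable_component is added in lfx/utils/validate_cloud.py (+14 -3), which this diff does not include. From its two call sites above, a plausible shape of the guard is the following; both the environment-variable name and the detection logic are assumptions:

```python
import os


def is_astra_cloud_environment() -> bool:
    # Stand-in only: the real detection lives in lfx/utils/validate_cloud.py,
    # and the environment variable name here is an assumption.
    return os.getenv("ASTRA_CLOUD", "").lower() == "true"


def raise_error_if_astra_cloud_disable_component(message: str) -> None:
    """Plausible shape of the guard, inferred from its call sites in ingestion.py."""
    if is_astra_cloud_environment():
        raise ValueError(message)


raise_error_if_astra_cloud_disable_component(
    "Knowledge ingestion is not supported in Astra cloud environment."
)  # no-op locally; raises ValueError when the cloud check is true
```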