lfx-nightly 0.2.0.dev41__py3-none-any.whl → 0.2.1.dev7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. lfx/_assets/component_index.json +1 -1
  2. lfx/base/agents/agent.py +1 -1
  3. lfx/base/agents/altk_tool_wrappers.py +1 -1
  4. lfx/base/agents/utils.py +4 -0
  5. lfx/base/composio/composio_base.py +78 -41
  6. lfx/base/data/cloud_storage_utils.py +156 -0
  7. lfx/base/data/docling_utils.py +130 -55
  8. lfx/base/datastax/astradb_base.py +75 -64
  9. lfx/base/embeddings/embeddings_class.py +113 -0
  10. lfx/base/models/__init__.py +11 -1
  11. lfx/base/models/google_generative_ai_constants.py +33 -9
  12. lfx/base/models/model_metadata.py +6 -0
  13. lfx/base/models/ollama_constants.py +196 -30
  14. lfx/base/models/openai_constants.py +37 -10
  15. lfx/base/models/unified_models.py +1123 -0
  16. lfx/base/models/watsonx_constants.py +36 -0
  17. lfx/base/tools/component_tool.py +2 -9
  18. lfx/cli/commands.py +3 -0
  19. lfx/cli/run.py +65 -409
  20. lfx/cli/script_loader.py +13 -3
  21. lfx/components/__init__.py +0 -3
  22. lfx/components/composio/github_composio.py +1 -1
  23. lfx/components/cuga/cuga_agent.py +39 -27
  24. lfx/components/data_source/api_request.py +4 -2
  25. lfx/components/docling/__init__.py +45 -11
  26. lfx/components/docling/docling_inline.py +39 -49
  27. lfx/components/elastic/opensearch_multimodal.py +1733 -0
  28. lfx/components/files_and_knowledge/file.py +384 -36
  29. lfx/components/files_and_knowledge/ingestion.py +8 -0
  30. lfx/components/files_and_knowledge/retrieval.py +10 -0
  31. lfx/components/files_and_knowledge/save_file.py +91 -88
  32. lfx/components/langchain_utilities/tool_calling.py +14 -6
  33. lfx/components/llm_operations/batch_run.py +64 -18
  34. lfx/components/llm_operations/lambda_filter.py +33 -6
  35. lfx/components/llm_operations/llm_conditional_router.py +39 -7
  36. lfx/components/llm_operations/structured_output.py +38 -12
  37. lfx/components/models/__init__.py +16 -74
  38. lfx/components/models_and_agents/agent.py +51 -203
  39. lfx/components/models_and_agents/embedding_model.py +171 -255
  40. lfx/components/models_and_agents/language_model.py +54 -318
  41. lfx/components/models_and_agents/mcp_component.py +58 -9
  42. lfx/components/ollama/ollama_embeddings.py +2 -1
  43. lfx/components/openai/openai_chat_model.py +1 -1
  44. lfx/components/vllm/__init__.py +37 -0
  45. lfx/components/vllm/vllm.py +141 -0
  46. lfx/components/vllm/vllm_embeddings.py +110 -0
  47. lfx/custom/custom_component/custom_component.py +8 -6
  48. lfx/graph/graph/base.py +4 -1
  49. lfx/graph/utils.py +64 -18
  50. lfx/graph/vertex/base.py +4 -1
  51. lfx/inputs/__init__.py +2 -0
  52. lfx/inputs/input_mixin.py +54 -0
  53. lfx/inputs/inputs.py +115 -0
  54. lfx/interface/initialize/loading.py +42 -12
  55. lfx/io/__init__.py +2 -0
  56. lfx/run/__init__.py +5 -0
  57. lfx/run/base.py +494 -0
  58. lfx/schema/data.py +1 -1
  59. lfx/schema/image.py +26 -7
  60. lfx/schema/message.py +19 -3
  61. lfx/services/mcp_composer/service.py +7 -1
  62. lfx/services/settings/base.py +7 -1
  63. lfx/services/settings/constants.py +2 -0
  64. lfx/services/storage/local.py +13 -8
  65. lfx/utils/constants.py +1 -0
  66. lfx/utils/validate_cloud.py +14 -3
  67. {lfx_nightly-0.2.0.dev41.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/METADATA +5 -2
  68. {lfx_nightly-0.2.0.dev41.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/RECORD +70 -61
  69. {lfx_nightly-0.2.0.dev41.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/WHEEL +0 -0
  70. {lfx_nightly-0.2.0.dev41.dist-info → lfx_nightly-0.2.1.dev7.dist-info}/entry_points.txt +0 -0

lfx/components/files_and_knowledge/file.py
@@ -21,15 +21,25 @@ from tempfile import NamedTemporaryFile
 from typing import Any

 from lfx.base.data.base_file import BaseFileComponent
-from lfx.base.data.storage_utils import parse_storage_path, validate_image_content_type
+from lfx.base.data.storage_utils import parse_storage_path, read_file_bytes, validate_image_content_type
 from lfx.base.data.utils import TEXT_FILE_TYPES, parallel_load_data, parse_text_file_to_data
+from lfx.inputs import SortableListInput
 from lfx.inputs.inputs import DropdownInput, MessageTextInput, StrInput
-from lfx.io import BoolInput, FileInput, IntInput, Output
+from lfx.io import BoolInput, FileInput, IntInput, Output, SecretStrInput
 from lfx.schema.data import Data
 from lfx.schema.dataframe import DataFrame  # noqa: TC001
 from lfx.schema.message import Message
 from lfx.services.deps import get_settings_service, get_storage_service
 from lfx.utils.async_helpers import run_until_complete
+from lfx.utils.validate_cloud import is_astra_cloud_environment
+
+
+def _get_storage_location_options():
+    """Get storage location options, filtering out Local if in Astra cloud environment."""
+    all_options = [{"name": "AWS", "icon": "Amazon"}, {"name": "Google Drive", "icon": "google"}]
+    if is_astra_cloud_environment():
+        return all_options
+    return [{"name": "Local", "icon": "hard-drive"}, *all_options]


 class FileComponent(BaseFileComponent):
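
A minimal sketch of how the new _get_storage_location_options() helper behaves on either side of the Astra-cloud check. The module path follows the file list above; patching the flag is just a way to force each branch here:

    from unittest.mock import patch

    import lfx.components.files_and_knowledge.file as file_mod

    # Outside Astra cloud: the Local option is offered first.
    with patch.object(file_mod, "is_astra_cloud_environment", return_value=False):
        assert file_mod._get_storage_location_options()[0]["name"] == "Local"

    # Inside Astra cloud: only the remote backends remain.
    with patch.object(file_mod, "is_astra_cloud_environment", return_value=True):
        names = [opt["name"] for opt in file_mod._get_storage_location_options()]
        assert names == ["AWS", "Google Drive"]
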
@@ -91,6 +101,15 @@ class FileComponent(BaseFileComponent):
             break

     inputs = [
+        SortableListInput(
+            name="storage_location",
+            display_name="Storage Location",
+            placeholder="Select Location",
+            info="Choose where to read the file from.",
+            options=_get_storage_location_options(),
+            real_time_refresh=True,
+            limit=1,
+        ),
         *_base_inputs,
         StrInput(
             name="file_path_str",
@@ -104,6 +123,63 @@
             tool_mode=True,  # Required for Toolset toggle, but _get_tools() ignores this parameter
             required=False,
         ),
+        # AWS S3 specific inputs
+        SecretStrInput(
+            name="aws_access_key_id",
+            display_name="AWS Access Key ID",
+            info="AWS Access key ID.",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        SecretStrInput(
+            name="aws_secret_access_key",
+            display_name="AWS Secret Key",
+            info="AWS Secret Key.",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        StrInput(
+            name="bucket_name",
+            display_name="S3 Bucket Name",
+            info="Enter the name of the S3 bucket.",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        StrInput(
+            name="aws_region",
+            display_name="AWS Region",
+            info="AWS region (e.g., us-east-1, eu-west-1).",
+            show=False,
+            advanced=False,
+        ),
+        StrInput(
+            name="s3_file_key",
+            display_name="S3 File Key",
+            info="The key (path) of the file in S3 bucket.",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        # Google Drive specific inputs
+        SecretStrInput(
+            name="service_account_key",
+            display_name="GCP Credentials Secret Key",
+            info="Your Google Cloud Platform service account JSON key as a secret string (complete JSON content).",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        StrInput(
+            name="file_id",
+            display_name="Google Drive File ID",
+            info=("The Google Drive file ID to read. The file must be shared with the service account email."),
+            show=False,
+            advanced=False,
+            required=True,
+        ),
         BoolInput(
             name="advanced_mode",
             display_name="Advanced Parser",
@@ -113,7 +189,8 @@
                 "Enable advanced document processing and export with Docling for PDFs, images, and office documents. "
                 "Note that advanced document processing can consume significant resources."
             ),
-            show=True,
+            # Disabled in cloud
+            show=not is_astra_cloud_environment(),
         ),
         DropdownInput(
             name="pipeline",
@@ -269,6 +346,20 @@
         """Return the list of currently selected file paths from the template."""
         return template.get("path", {}).get("file_path", [])

+    def _disable_docling_fields_in_cloud(self, build_config: dict[str, Any]) -> None:
+        """Disable all Docling-related fields in cloud environments."""
+        if "advanced_mode" in build_config:
+            build_config["advanced_mode"]["show"] = False
+            build_config["advanced_mode"]["value"] = False
+        # Hide all Docling-related fields
+        docling_fields = ("pipeline", "ocr_engine", "doc_key", "md_image_placeholder", "md_page_break_placeholder")
+        for field in docling_fields:
+            if field in build_config:
+                build_config[field]["show"] = False
+        # Also disable OCR engine specifically
+        if "ocr_engine" in build_config:
+            build_config["ocr_engine"]["value"] = "None"
+
     def update_build_config(
         self,
         build_config: dict[str, Any],
@@ -276,28 +367,120 @@
         field_name: str | None = None,
     ) -> dict[str, Any]:
         """Show/hide Advanced Parser and related fields based on selection context."""
+        # Update storage location options dynamically based on cloud environment
+        if "storage_location" in build_config:
+            updated_options = _get_storage_location_options()
+            build_config["storage_location"]["options"] = updated_options
+
+        # Handle storage location selection
+        if field_name == "storage_location":
+            # Extract selected storage location
+            selected = [location["name"] for location in field_value] if isinstance(field_value, list) else []
+
+            # Hide all storage-specific fields first
+            storage_fields = [
+                "aws_access_key_id",
+                "aws_secret_access_key",
+                "bucket_name",
+                "aws_region",
+                "s3_file_key",
+                "service_account_key",
+                "file_id",
+            ]
+
+            for f_name in storage_fields:
+                if f_name in build_config:
+                    build_config[f_name]["show"] = False
+
+            # Show fields based on selected storage location
+            if len(selected) == 1:
+                location = selected[0]
+
+                if location == "Local":
+                    # Show file upload input for local storage
+                    if "path" in build_config:
+                        build_config["path"]["show"] = True
+
+                elif location == "AWS":
+                    # Hide file upload input, show AWS fields
+                    if "path" in build_config:
+                        build_config["path"]["show"] = False
+
+                    aws_fields = [
+                        "aws_access_key_id",
+                        "aws_secret_access_key",
+                        "bucket_name",
+                        "aws_region",
+                        "s3_file_key",
+                    ]
+                    for f_name in aws_fields:
+                        if f_name in build_config:
+                            build_config[f_name]["show"] = True
+                            build_config[f_name]["advanced"] = False
+
+                elif location == "Google Drive":
+                    # Hide file upload input, show Google Drive fields
+                    if "path" in build_config:
+                        build_config["path"]["show"] = False
+
+                    gdrive_fields = ["service_account_key", "file_id"]
+                    for f_name in gdrive_fields:
+                        if f_name in build_config:
+                            build_config[f_name]["show"] = True
+                            build_config[f_name]["advanced"] = False
+            # No storage location selected - show file upload by default
+            elif "path" in build_config:
+                build_config["path"]["show"] = True
+
+            return build_config
+
         if field_name == "path":
             paths = self._path_value(build_config)

-            # If all files can be processed by docling, do so
-            allow_advanced = all(not file_path.endswith((".csv", ".xlsx", ".parquet")) for file_path in paths)
-            build_config["advanced_mode"]["show"] = allow_advanced
-            if not allow_advanced:
-                build_config["advanced_mode"]["value"] = False
-                for f in ("pipeline", "ocr_engine", "doc_key", "md_image_placeholder", "md_page_break_placeholder"):
-                    if f in build_config:
-                        build_config[f]["show"] = False
+            # Disable in cloud environments
+            if is_astra_cloud_environment():
+                self._disable_docling_fields_in_cloud(build_config)
+            else:
+                # If all files can be processed by docling, do so
+                allow_advanced = all(not file_path.endswith((".csv", ".xlsx", ".parquet")) for file_path in paths)
+                build_config["advanced_mode"]["show"] = allow_advanced
+                if not allow_advanced:
+                    build_config["advanced_mode"]["value"] = False
+                    docling_fields = (
+                        "pipeline",
+                        "ocr_engine",
+                        "doc_key",
+                        "md_image_placeholder",
+                        "md_page_break_placeholder",
+                    )
+                    for field in docling_fields:
+                        if field in build_config:
+                            build_config[field]["show"] = False

         # Docling Processing
         elif field_name == "advanced_mode":
-            for f in ("pipeline", "ocr_engine", "doc_key", "md_image_placeholder", "md_page_break_placeholder"):
-                if f in build_config:
-                    build_config[f]["show"] = bool(field_value)
-                    if f == "pipeline":
-                        build_config[f]["advanced"] = not bool(field_value)
+            # Disable in cloud environments - don't show Docling fields even if advanced_mode is toggled
+            if is_astra_cloud_environment():
+                self._disable_docling_fields_in_cloud(build_config)
+            else:
+                docling_fields = (
+                    "pipeline",
+                    "ocr_engine",
+                    "doc_key",
+                    "md_image_placeholder",
+                    "md_page_break_placeholder",
+                )
+                for field in docling_fields:
+                    if field in build_config:
+                        build_config[field]["show"] = bool(field_value)
+                        if field == "pipeline":
+                            build_config[field]["advanced"] = not bool(field_value)

         elif field_name == "pipeline":
-            if field_value == "standard":
+            # Disable in cloud environments - don't show OCR engine even if pipeline is changed
+            if is_astra_cloud_environment():
+                self._disable_docling_fields_in_cloud(build_config)
+            elif field_value == "standard":
                 build_config["ocr_engine"]["show"] = True
                 build_config["ocr_engine"]["value"] = "easyocr"
             else:
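
The storage_location branch above boils down to a show/hide table over the new credential fields. A condensed restatement (not the shipped code; the build_config dict shape is assumed from the diff):

    AWS_FIELDS = ("aws_access_key_id", "aws_secret_access_key", "bucket_name", "aws_region", "s3_file_key")
    GDRIVE_FIELDS = ("service_account_key", "file_id")

    def toggle_storage_fields(build_config: dict, selected: list[str]) -> None:
        # Hide every storage-specific field first.
        for name in AWS_FIELDS + GDRIVE_FIELDS:
            if name in build_config:
                build_config[name]["show"] = False
        location = selected[0] if len(selected) == 1 else None
        # The file-upload input stays visible for Local or when nothing is selected.
        if "path" in build_config:
            build_config["path"]["show"] = location in (None, "Local")
        # Reveal only the fields that belong to the chosen backend.
        for name in {"AWS": AWS_FIELDS, "Google Drive": GDRIVE_FIELDS}.get(location, ()):
            if name in build_config:
                build_config[name]["show"] = True
                build_config[name]["advanced"] = False
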
@@ -368,15 +551,34 @@

     # ------------------------------ Core processing ----------------------------------

+    def _get_selected_storage_location(self) -> str:
+        """Get the selected storage location from the SortableListInput."""
+        if hasattr(self, "storage_location") and self.storage_location:
+            if isinstance(self.storage_location, list) and len(self.storage_location) > 0:
+                return self.storage_location[0].get("name", "")
+            if isinstance(self.storage_location, dict):
+                return self.storage_location.get("name", "")
+        return "Local"  # Default to Local if not specified
+
     def _validate_and_resolve_paths(self) -> list[BaseFileComponent.BaseFile]:
-        """Override to handle file_path_str input from tool mode.
+        """Override to handle file_path_str input from tool mode and cloud storage.

-        When called as a tool, the file_path_str parameter can be set.
-        If not provided, it will fall back to using the path FileInput (uploaded file).
         Priority:
-        1. file_path_str (if provided by the tool call)
-        2. path (uploaded file from UI)
+        1. Cloud storage (AWS/Google Drive) if selected
+        2. file_path_str (if provided by the tool call)
+        3. path (uploaded file from UI)
         """
+        storage_location = self._get_selected_storage_location()
+
+        # Handle AWS S3
+        if storage_location == "AWS":
+            return self._read_from_aws_s3()
+
+        # Handle Google Drive
+        if storage_location == "Google Drive":
+            return self._read_from_google_drive()
+
+        # Handle Local storage
         # Check if file_path_str is provided (from tool mode)
         file_path_str = getattr(self, "file_path_str", None)
         if file_path_str:
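
Spelled out, the new resolution order is a short chain; this schematic (helper and attribute names as in the diff, return labels invented for illustration) shows which branch _validate_and_resolve_paths takes:

    def resolve_source(component) -> str:
        location = component._get_selected_storage_location()
        if location == "AWS":
            return "aws_s3"            # handled by _read_from_aws_s3()
        if location == "Google Drive":
            return "google_drive"      # handled by _read_from_google_drive()
        if getattr(component, "file_path_str", None):
            return "tool_path"         # path string supplied by a tool call
        return "uploaded_file"         # default FileInput handling
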
@@ -399,6 +601,101 @@
         # Otherwise use the default implementation (uses path FileInput)
         return super()._validate_and_resolve_paths()

+    def _read_from_aws_s3(self) -> list[BaseFileComponent.BaseFile]:
+        """Read file from AWS S3."""
+        from lfx.base.data.cloud_storage_utils import create_s3_client, validate_aws_credentials
+
+        # Validate AWS credentials
+        validate_aws_credentials(self)
+        if not getattr(self, "s3_file_key", None):
+            msg = "S3 File Key is required"
+            raise ValueError(msg)
+
+        # Create S3 client
+        s3_client = create_s3_client(self)
+
+        # Download file to temp location
+        import tempfile
+
+        # Get file extension from S3 key
+        file_extension = Path(self.s3_file_key).suffix or ""
+
+        with tempfile.NamedTemporaryFile(mode="wb", suffix=file_extension, delete=False) as temp_file:
+            temp_file_path = temp_file.name
+            try:
+                s3_client.download_fileobj(self.bucket_name, self.s3_file_key, temp_file)
+            except Exception as e:
+                # Clean up temp file on failure
+                with contextlib.suppress(OSError):
+                    Path(temp_file_path).unlink()
+                msg = f"Failed to download file from S3: {e}"
+                raise RuntimeError(msg) from e
+
+        # Create BaseFile object
+        from lfx.schema.data import Data
+
+        temp_path = Path(temp_file_path)
+        data_obj = Data(data={self.SERVER_FILE_PATH_FIELDNAME: str(temp_path)})
+        return [BaseFileComponent.BaseFile(data_obj, temp_path, delete_after_processing=True)]
+
+    def _read_from_google_drive(self) -> list[BaseFileComponent.BaseFile]:
+        """Read file from Google Drive."""
+        import tempfile
+
+        from googleapiclient.http import MediaIoBaseDownload
+
+        from lfx.base.data.cloud_storage_utils import create_google_drive_service
+
+        # Validate Google Drive credentials
+        if not getattr(self, "service_account_key", None):
+            msg = "GCP Credentials Secret Key is required for Google Drive storage"
+            raise ValueError(msg)
+        if not getattr(self, "file_id", None):
+            msg = "Google Drive File ID is required"
+            raise ValueError(msg)
+
+        # Create Google Drive service with read-only scope
+        drive_service = create_google_drive_service(
+            self.service_account_key, scopes=["https://www.googleapis.com/auth/drive.readonly"]
+        )
+
+        # Get file metadata to determine file name and extension
+        try:
+            file_metadata = drive_service.files().get(fileId=self.file_id, fields="name,mimeType").execute()
+            file_name = file_metadata.get("name", "download")
+        except Exception as e:
+            msg = (
+                f"Unable to access file with ID '{self.file_id}'. "
+                f"Error: {e!s}. "
+                "Please ensure: 1) The file ID is correct, 2) The file exists, "
+                "3) The service account has been granted access to this file."
+            )
+            raise ValueError(msg) from e
+
+        # Download file to temp location
+        file_extension = Path(file_name).suffix or ""
+        with tempfile.NamedTemporaryFile(mode="wb", suffix=file_extension, delete=False) as temp_file:
+            temp_file_path = temp_file.name
+            try:
+                request = drive_service.files().get_media(fileId=self.file_id)
+                downloader = MediaIoBaseDownload(temp_file, request)
+                done = False
+                while not done:
+                    _status, done = downloader.next_chunk()
+            except Exception as e:
+                # Clean up temp file on failure
+                with contextlib.suppress(OSError):
+                    Path(temp_file_path).unlink()
+                msg = f"Failed to download file from Google Drive: {e}"
+                raise RuntimeError(msg) from e
+
+        # Create BaseFile object
+        from lfx.schema.data import Data
+
+        temp_path = Path(temp_file_path)
+        data_obj = Data(data={self.SERVER_FILE_PATH_FIELDNAME: str(temp_path)})
+        return [BaseFileComponent.BaseFile(data_obj, temp_path, delete_after_processing=True)]
+
     def _is_docling_compatible(self, file_path: str) -> bool:
         """Lightweight extension gate for Docling-compatible types."""
         docling_exts = (
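
_read_from_aws_s3 leans on create_s3_client from the new lfx/base/data/cloud_storage_utils.py module (+156 lines in this release), which is not shown here. As a standalone illustration of the same download-to-temp-file pattern with boto3 called directly (bucket and key are placeholders):

    import tempfile
    from pathlib import Path

    import boto3

    def download_s3_to_temp(bucket: str, key: str, region: str | None = None) -> Path:
        """Download an S3 object into a NamedTemporaryFile and return its path."""
        s3 = boto3.client("s3", region_name=region)
        suffix = Path(key).suffix or ""
        # delete=False so the file outlives the context manager, as in the component.
        with tempfile.NamedTemporaryFile(mode="wb", suffix=suffix, delete=False) as tmp:
            try:
                s3.download_fileobj(bucket, key, tmp)
            except Exception:
                Path(tmp.name).unlink(missing_ok=True)  # drop the partial download
                raise
        return Path(tmp.name)
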
@@ -515,9 +812,6 @@
             ),
         }

-        self.log(f"Starting Docling subprocess for file: {local_file_path}")
-        self.log(args)
-
         # Child script for isolating the docling processing
         child_script = textwrap.dedent(
             r"""
@@ -707,7 +1001,7 @@
         )

         if not proc.stdout:
-            err_msg = proc.stderr.decode("utf-8", errors="replace") or "no output from child process"
+            err_msg = proc.stderr.decode("utf-8", errors="replace") if proc.stderr else "no output from child process"
             return Data(data={"error": f"Docling subprocess error: {err_msg}", "file_path": original_file_path})

         try:
@@ -722,9 +1016,16 @@
         )

         if not result.get("ok"):
-            return Data(data={"error": result.get("error", "Unknown Docling error"), **result.get("meta", {})})
+            error_msg = result.get("error", "Unknown Docling error")
+            # Override meta file_path with original_file_path to ensure correct path matching
+            meta = result.get("meta", {})
+            meta["file_path"] = original_file_path
+            return Data(data={"error": error_msg, **meta})

         meta = result.get("meta", {})
+        # Override meta file_path with original_file_path to ensure correct path matching
+        # The subprocess returns the temp file path, but we need the original S3/local path for rollup_data
+        meta["file_path"] = original_file_path
         if result.get("mode") == "markdown":
             exported_content = str(result.get("text", ""))
             return Data(
@@ -751,12 +1052,20 @@
         # Validate image files to detect content/extension mismatches
         # This prevents API errors like "Image does not match the provided media type"
         image_extensions = {"jpeg", "jpg", "png", "gif", "webp", "bmp", "tiff"}
+        settings = get_settings_service().settings
         for file in file_list:
             extension = file.path.suffix[1:].lower()
             if extension in image_extensions:
-                # file.path is already resolved, read bytes directly
+                # Read bytes based on storage type
                 try:
-                    content = file.path.read_bytes()
+                    if settings.storage_type == "s3":
+                        # For S3 storage, use storage service to read file bytes
+                        file_path_str = str(file.path)
+                        content = run_until_complete(read_file_bytes(file_path_str))
+                    else:
+                        # For local storage, read bytes directly from filesystem
+                        content = file.path.read_bytes()
+
                     is_valid, error_msg = validate_image_content_type(
                         str(file.path),
                         content=content,
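
validate_image_content_type itself lives in lfx.base.data.storage_utils and is outside this diff; the kind of content/extension check such a helper typically performs can be sketched with leading magic bytes (illustrative only, not the library's implementation):

    _MAGIC = {
        b"\x89PNG\r\n\x1a\n": "png",
        b"\xff\xd8\xff": "jpeg",
        b"GIF87a": "gif",
        b"GIF89a": "gif",
        b"RIFF": "webp",  # a full WebP check also requires b"WEBP" at byte offset 8
    }

    def sniff_image_type(content: bytes) -> str | None:
        """Return the image type implied by the file's first bytes, if recognized."""
        for magic, kind in _MAGIC.items():
            if content.startswith(magic):
                return kind
        return None

Comparing sniff_image_type(content) against the filename extension flags exactly the "image does not match the provided media type" mismatches this validation guards against.
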
@@ -765,7 +1074,7 @@
                         self.log(error_msg)
                         if not self.silent_errors:
                             raise ValueError(error_msg)
-                except OSError as e:
+                except (OSError, FileNotFoundError) as e:
                     self.log(f"Could not read file for validation: {e}")
                     # Continue - let it fail later with better error

@@ -774,10 +1083,16 @@
         for file in file_list:
             extension = file.path.suffix[1:].lower()
             if extension in self.DOCLING_ONLY_EXTENSIONS:
-                msg = (
-                    f"File '{file.path.name}' has extension '.{extension}' which requires "
-                    f"Advanced Parser mode. Please enable 'Advanced Parser' to process this file."
-                )
+                if is_astra_cloud_environment():
+                    msg = (
+                        f"File '{file.path.name}' has extension '.{extension}' which requires "
+                        f"Advanced Parser mode. Advanced Parser is not available in cloud environments."
+                    )
+                else:
+                    msg = (
+                        f"File '{file.path.name}' has extension '.{extension}' which requires "
+                        f"Advanced Parser mode. Please enable 'Advanced Parser' to process this file."
+                    )
                 self.log(msg)
                 raise ValueError(msg)

@@ -804,8 +1119,33 @@
             file_path = str(file.path)
             advanced_data: Data | None = self._process_docling_in_subprocess(file_path)

+            # Handle None case - Docling processing failed or returned None
+            if advanced_data is None:
+                error_data = Data(
+                    data={
+                        "file_path": file_path,
+                        "error": "Docling processing returned no result. Check logs for details.",
+                    },
+                )
+                final_return.extend(self.rollup_data([file], [error_data]))
+                continue
+
             # --- UNNEST: expand each element in `doc` to its own Data row
             payload = getattr(advanced_data, "data", {}) or {}
+
+            # Check for errors first
+            if "error" in payload:
+                error_msg = payload.get("error", "Unknown error")
+                error_data = Data(
+                    data={
+                        "file_path": file_path,
+                        "error": error_msg,
+                        **{k: v for k, v in payload.items() if k not in ("error", "file_path")},
+                    },
+                )
+                final_return.extend(self.rollup_data([file], [error_data]))
+                continue
+
             doc_rows = payload.get("doc")
             if isinstance(doc_rows, list) and doc_rows:
                 # Non-empty list of structured rows
@@ -818,7 +1158,7 @@
                     )
                     for item in doc_rows
                 ]
-                final_return.extend(self.rollup_data(file_list, rows))
+                final_return.extend(self.rollup_data([file], rows))
             elif isinstance(doc_rows, list) and not doc_rows:
                 # Empty list - file was processed but no text content found
                 # Create a Data object indicating no content was extracted
@@ -834,7 +1174,15 @@
                 final_return.extend(self.rollup_data([file], [empty_data]))
             else:
                 # If not structured, keep as-is (e.g., markdown export or error dict)
-                final_return.extend(self.rollup_data(file_list, [advanced_data]))
+                # Ensure file_path is set for proper rollup matching
+                if not payload.get("file_path"):
+                    payload["file_path"] = file_path
+                # Create new Data with file_path
+                advanced_data = Data(
+                    data=payload,
+                    text=getattr(advanced_data, "text", None),
+                )
+                final_return.extend(self.rollup_data([file], [advanced_data]))
         return final_return

         # Standard multi-file (or single non-advanced) path
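
Several call sites above change rollup_data(file_list, ...) to rollup_data([file], ...) inside the per-file loop. A runnable toy model of why that matters (the cross-pairing stand-in is an assumption about rollup semantics, not lfx's actual implementation):

    files = ["a.pdf", "b.pdf"]

    def rollup(batch: list[str], rows: list[str]) -> list[tuple[str, str]]:
        # Hypothetical stand-in for rollup_data: pairs every file in batch with rows.
        return [(f, r) for f in batch for r in rows]

    per_file = [pair for f in files for pair in rollup([f], [f.upper()])]
    assert per_file == [("a.pdf", "A.PDF"), ("b.pdf", "B.PDF")]

    cross = [pair for f in files for pair in rollup(files, [f.upper()])]
    assert ("b.pdf", "A.PDF") in cross  # with the whole file_list, rows bleed across files
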

lfx/components/files_and_knowledge/ingestion.py
@@ -38,6 +38,7 @@ from lfx.services.deps import (
     get_variable_service,
     session_scope,
 )
+from lfx.utils.validate_cloud import raise_error_if_astra_cloud_disable_component

 if TYPE_CHECKING:
     from lfx.schema.dataframe import DataFrame
@@ -50,6 +51,9 @@ COHERE_MODEL_NAMES = ["embed-english-v3.0", "embed-multilingual-v3.0"]

 _KNOWLEDGE_BASES_ROOT_PATH: Path | None = None

+# Error message to raise if we're in Astra cloud environment and the component is not supported.
+astra_error_msg = "Knowledge ingestion is not supported in Astra cloud environment."
+

 def _get_knowledge_bases_root_path() -> Path:
     """Lazy load the knowledge bases root path from settings."""
@@ -540,6 +544,8 @@
     # ---------------------------------------------------------------------
     async def build_kb_info(self) -> Data:
         """Main ingestion routine → returns a dict with KB metadata."""
+        # Check if we're in Astra cloud environment and raise an error if we are.
+        raise_error_if_astra_cloud_disable_component(astra_error_msg)
         try:
             input_value = self.input_df[0] if isinstance(self.input_df, list) else self.input_df
             df_source: DataFrame = convert_to_dataframe(input_value, auto_parse=False)
@@ -626,6 +632,8 @@
         field_name: str | None = None,
     ):
         """Update build configuration based on provider selection."""
+        # Check if we're in Astra cloud environment and raise an error if we are.
+        raise_error_if_astra_cloud_disable_component(astra_error_msg)
         # Create a new knowledge base
         if field_name == "knowledge_base":
             async with session_scope() as db:

lfx/components/files_and_knowledge/retrieval.py
@@ -15,9 +15,13 @@ from lfx.log.logger import logger
 from lfx.schema.data import Data
 from lfx.schema.dataframe import DataFrame
 from lfx.services.deps import get_settings_service, session_scope
+from lfx.utils.validate_cloud import raise_error_if_astra_cloud_disable_component

 _KNOWLEDGE_BASES_ROOT_PATH: Path | None = None

+# Error message to raise if we're in Astra cloud environment and the component is not supported.
+astra_error_msg = "Knowledge retrieval is not supported in Astra cloud environment."
+

 def _get_knowledge_bases_root_path() -> Path:
     """Lazy load the knowledge bases root path from settings."""
@@ -95,6 +99,8 @@
     ]

     async def update_build_config(self, build_config, field_value, field_name=None):  # noqa: ARG002
+        # Check if we're in Astra cloud environment and raise an error if we are.
+        raise_error_if_astra_cloud_disable_component(astra_error_msg)
         if field_name == "knowledge_base":
             # Update the knowledge base options dynamically
             build_config["knowledge_base"]["options"] = await get_knowledge_bases(
@@ -110,6 +116,8 @@

     def _get_kb_metadata(self, kb_path: Path) -> dict:
         """Load and process knowledge base metadata."""
+        # Check if we're in Astra cloud environment and raise an error if we are.
+        raise_error_if_astra_cloud_disable_component(astra_error_msg)
         metadata: dict[str, Any] = {}
         metadata_file = kb_path / "embedding_metadata.json"
         if not metadata_file.exists():
@@ -184,6 +192,8 @@
         Returns:
             A DataFrame containing the data rows from the knowledge base.
         """
+        # Check if we're in Astra cloud environment and raise an error if we are.
+        raise_error_if_astra_cloud_disable_component(astra_error_msg)
         # Get the current user
         async with session_scope() as db:
             if not self.user_id:
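
raise_error_if_astra_cloud_disable_component comes from lfx/utils/validate_cloud.py (+14 -3 in this release); its body is not part of this diff, but the call sites imply a fail-fast guard along these lines (a sketch stated as an assumption, including the exception type):

    from lfx.utils.validate_cloud import is_astra_cloud_environment

    def raise_error_if_astra_cloud_disable_component(error_msg: str) -> None:
        # Hypothetical: refuse to run a component that is disabled in Astra cloud.
        if is_astra_cloud_environment():
            raise ValueError(error_msg)
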