lfx-nightly 0.2.0.dev41__py3-none-any.whl → 0.3.0.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lfx/__main__.py +137 -6
- lfx/_assets/component_index.json +1 -1
- lfx/base/agents/agent.py +10 -6
- lfx/base/agents/altk_base_agent.py +5 -3
- lfx/base/agents/altk_tool_wrappers.py +1 -1
- lfx/base/agents/events.py +1 -1
- lfx/base/agents/utils.py +4 -0
- lfx/base/composio/composio_base.py +78 -41
- lfx/base/data/cloud_storage_utils.py +156 -0
- lfx/base/data/docling_utils.py +130 -55
- lfx/base/datastax/astradb_base.py +75 -64
- lfx/base/embeddings/embeddings_class.py +113 -0
- lfx/base/models/__init__.py +11 -1
- lfx/base/models/google_generative_ai_constants.py +33 -9
- lfx/base/models/model_metadata.py +6 -0
- lfx/base/models/ollama_constants.py +196 -30
- lfx/base/models/openai_constants.py +37 -10
- lfx/base/models/unified_models.py +1123 -0
- lfx/base/models/watsonx_constants.py +43 -4
- lfx/base/prompts/api_utils.py +40 -5
- lfx/base/tools/component_tool.py +2 -9
- lfx/cli/__init__.py +10 -2
- lfx/cli/commands.py +3 -0
- lfx/cli/run.py +65 -409
- lfx/cli/script_loader.py +18 -7
- lfx/cli/validation.py +6 -3
- lfx/components/__init__.py +0 -3
- lfx/components/composio/github_composio.py +1 -1
- lfx/components/cuga/cuga_agent.py +39 -27
- lfx/components/data_source/api_request.py +4 -2
- lfx/components/datastax/astradb_assistant_manager.py +4 -2
- lfx/components/docling/__init__.py +45 -11
- lfx/components/docling/docling_inline.py +39 -49
- lfx/components/docling/docling_remote.py +1 -0
- lfx/components/elastic/opensearch_multimodal.py +1733 -0
- lfx/components/files_and_knowledge/file.py +384 -36
- lfx/components/files_and_knowledge/ingestion.py +8 -0
- lfx/components/files_and_knowledge/retrieval.py +10 -0
- lfx/components/files_and_knowledge/save_file.py +91 -88
- lfx/components/langchain_utilities/ibm_granite_handler.py +211 -0
- lfx/components/langchain_utilities/tool_calling.py +37 -6
- lfx/components/llm_operations/batch_run.py +64 -18
- lfx/components/llm_operations/lambda_filter.py +213 -101
- lfx/components/llm_operations/llm_conditional_router.py +39 -7
- lfx/components/llm_operations/structured_output.py +38 -12
- lfx/components/models/__init__.py +16 -74
- lfx/components/models_and_agents/agent.py +51 -203
- lfx/components/models_and_agents/embedding_model.py +171 -255
- lfx/components/models_and_agents/language_model.py +54 -318
- lfx/components/models_and_agents/mcp_component.py +96 -10
- lfx/components/models_and_agents/prompt.py +105 -18
- lfx/components/ollama/ollama_embeddings.py +111 -29
- lfx/components/openai/openai_chat_model.py +1 -1
- lfx/components/processing/text_operations.py +580 -0
- lfx/components/vllm/__init__.py +37 -0
- lfx/components/vllm/vllm.py +141 -0
- lfx/components/vllm/vllm_embeddings.py +110 -0
- lfx/custom/custom_component/component.py +65 -10
- lfx/custom/custom_component/custom_component.py +8 -6
- lfx/events/observability/__init__.py +0 -0
- lfx/events/observability/lifecycle_events.py +111 -0
- lfx/field_typing/__init__.py +57 -58
- lfx/graph/graph/base.py +40 -1
- lfx/graph/utils.py +109 -30
- lfx/graph/vertex/base.py +75 -23
- lfx/graph/vertex/vertex_types.py +0 -5
- lfx/inputs/__init__.py +2 -0
- lfx/inputs/input_mixin.py +55 -0
- lfx/inputs/inputs.py +120 -0
- lfx/interface/components.py +24 -7
- lfx/interface/initialize/loading.py +42 -12
- lfx/io/__init__.py +2 -0
- lfx/run/__init__.py +5 -0
- lfx/run/base.py +464 -0
- lfx/schema/__init__.py +50 -0
- lfx/schema/data.py +1 -1
- lfx/schema/image.py +26 -7
- lfx/schema/message.py +104 -11
- lfx/schema/workflow.py +171 -0
- lfx/services/deps.py +12 -0
- lfx/services/interfaces.py +43 -1
- lfx/services/mcp_composer/service.py +7 -1
- lfx/services/schema.py +1 -0
- lfx/services/settings/auth.py +95 -4
- lfx/services/settings/base.py +11 -1
- lfx/services/settings/constants.py +2 -0
- lfx/services/settings/utils.py +82 -0
- lfx/services/storage/local.py +13 -8
- lfx/services/transaction/__init__.py +5 -0
- lfx/services/transaction/service.py +35 -0
- lfx/tests/unit/components/__init__.py +0 -0
- lfx/utils/constants.py +2 -0
- lfx/utils/mustache_security.py +79 -0
- lfx/utils/validate_cloud.py +81 -3
- {lfx_nightly-0.2.0.dev41.dist-info → lfx_nightly-0.3.0.dev3.dist-info}/METADATA +7 -2
- {lfx_nightly-0.2.0.dev41.dist-info → lfx_nightly-0.3.0.dev3.dist-info}/RECORD +98 -80
- {lfx_nightly-0.2.0.dev41.dist-info → lfx_nightly-0.3.0.dev3.dist-info}/WHEEL +0 -0
- {lfx_nightly-0.2.0.dev41.dist-info → lfx_nightly-0.3.0.dev3.dist-info}/entry_points.txt +0 -0
lfx/components/files_and_knowledge/file.py

@@ -21,15 +21,25 @@ from tempfile import NamedTemporaryFile
 from typing import Any

 from lfx.base.data.base_file import BaseFileComponent
-from lfx.base.data.storage_utils import parse_storage_path, validate_image_content_type
+from lfx.base.data.storage_utils import parse_storage_path, read_file_bytes, validate_image_content_type
 from lfx.base.data.utils import TEXT_FILE_TYPES, parallel_load_data, parse_text_file_to_data
+from lfx.inputs import SortableListInput
 from lfx.inputs.inputs import DropdownInput, MessageTextInput, StrInput
-from lfx.io import BoolInput, FileInput, IntInput, Output
+from lfx.io import BoolInput, FileInput, IntInput, Output, SecretStrInput
 from lfx.schema.data import Data
 from lfx.schema.dataframe import DataFrame  # noqa: TC001
 from lfx.schema.message import Message
 from lfx.services.deps import get_settings_service, get_storage_service
 from lfx.utils.async_helpers import run_until_complete
+from lfx.utils.validate_cloud import is_astra_cloud_environment
+
+
+def _get_storage_location_options():
+    """Get storage location options, filtering out Local if in Astra cloud environment."""
+    all_options = [{"name": "AWS", "icon": "Amazon"}, {"name": "Google Drive", "icon": "google"}]
+    if is_astra_cloud_environment():
+        return all_options
+    return [{"name": "Local", "icon": "hard-drive"}, *all_options]


 class FileComponent(BaseFileComponent):
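Note: _get_storage_location_options() is what populates the storage picker added below, and it returns plain option dicts, so the cloud filtering can be checked in isolation. A minimal sketch with the environment check stubbed out (the stub is not the real lfx.utils.validate_cloud implementation):

    # Stub standing in for lfx.utils.validate_cloud.is_astra_cloud_environment.
    def is_astra_cloud_environment() -> bool:
        return True  # pretend we are running inside Astra cloud

    def _get_storage_location_options():
        all_options = [{"name": "AWS", "icon": "Amazon"}, {"name": "Google Drive", "icon": "google"}]
        if is_astra_cloud_environment():
            return all_options  # "Local" is deliberately omitted in cloud
        return [{"name": "Local", "icon": "hard-drive"}, *all_options]

    print([o["name"] for o in _get_storage_location_options()])  # ['AWS', 'Google Drive']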
@@ -91,6 +101,15 @@ class FileComponent(BaseFileComponent):
             break

     inputs = [
+        SortableListInput(
+            name="storage_location",
+            display_name="Storage Location",
+            placeholder="Select Location",
+            info="Choose where to read the file from.",
+            options=_get_storage_location_options(),
+            real_time_refresh=True,
+            limit=1,
+        ),
         *_base_inputs,
         StrInput(
             name="file_path_str",
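From the way update_build_config and _get_selected_storage_location consume it in later hunks, the storage_location value arrives as a list of option dicts; a sketch of that assumed shape (inferred from the consuming code, not from SortableListInput documentation):

    # Shape inferred from the consuming code below; not an authoritative API description.
    field_value = [{"name": "AWS", "icon": "Amazon"}]
    selected = [location["name"] for location in field_value] if isinstance(field_value, list) else []
    assert selected == ["AWS"]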
@@ -104,6 +123,63 @@
             tool_mode=True,  # Required for Toolset toggle, but _get_tools() ignores this parameter
             required=False,
         ),
+        # AWS S3 specific inputs
+        SecretStrInput(
+            name="aws_access_key_id",
+            display_name="AWS Access Key ID",
+            info="AWS Access key ID.",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        SecretStrInput(
+            name="aws_secret_access_key",
+            display_name="AWS Secret Key",
+            info="AWS Secret Key.",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        StrInput(
+            name="bucket_name",
+            display_name="S3 Bucket Name",
+            info="Enter the name of the S3 bucket.",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        StrInput(
+            name="aws_region",
+            display_name="AWS Region",
+            info="AWS region (e.g., us-east-1, eu-west-1).",
+            show=False,
+            advanced=False,
+        ),
+        StrInput(
+            name="s3_file_key",
+            display_name="S3 File Key",
+            info="The key (path) of the file in S3 bucket.",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        # Google Drive specific inputs
+        SecretStrInput(
+            name="service_account_key",
+            display_name="GCP Credentials Secret Key",
+            info="Your Google Cloud Platform service account JSON key as a secret string (complete JSON content).",
+            show=False,
+            advanced=False,
+            required=True,
+        ),
+        StrInput(
+            name="file_id",
+            display_name="Google Drive File ID",
+            info=("The Google Drive file ID to read. The file must be shared with the service account email."),
+            show=False,
+            advanced=False,
+            required=True,
+        ),
         BoolInput(
             name="advanced_mode",
             display_name="Advanced Parser",
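All of the cloud-credential inputs above are declared hidden (show=False) even though they are required; they only surface once update_build_config (in a later hunk) reacts to a storage-location selection, so a freshly added component shows just the picker and the base file inputs.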
@@ -113,7 +189,8 @@ class FileComponent(BaseFileComponent):
                 "Enable advanced document processing and export with Docling for PDFs, images, and office documents. "
                 "Note that advanced document processing can consume significant resources."
             ),
-
+            # Disabled in cloud
+            show=not is_astra_cloud_environment(),
         ),
         DropdownInput(
             name="pipeline",
@@ -269,6 +346,20 @@
         """Return the list of currently selected file paths from the template."""
         return template.get("path", {}).get("file_path", [])

+    def _disable_docling_fields_in_cloud(self, build_config: dict[str, Any]) -> None:
+        """Disable all Docling-related fields in cloud environments."""
+        if "advanced_mode" in build_config:
+            build_config["advanced_mode"]["show"] = False
+            build_config["advanced_mode"]["value"] = False
+        # Hide all Docling-related fields
+        docling_fields = ("pipeline", "ocr_engine", "doc_key", "md_image_placeholder", "md_page_break_placeholder")
+        for field in docling_fields:
+            if field in build_config:
+                build_config[field]["show"] = False
+        # Also disable OCR engine specifically
+        if "ocr_engine" in build_config:
+            build_config["ocr_engine"]["value"] = "None"
+
     def update_build_config(
         self,
         build_config: dict[str, Any],
@@ -276,28 +367,120 @@
         field_name: str | None = None,
     ) -> dict[str, Any]:
         """Show/hide Advanced Parser and related fields based on selection context."""
+        # Update storage location options dynamically based on cloud environment
+        if "storage_location" in build_config:
+            updated_options = _get_storage_location_options()
+            build_config["storage_location"]["options"] = updated_options
+
+        # Handle storage location selection
+        if field_name == "storage_location":
+            # Extract selected storage location
+            selected = [location["name"] for location in field_value] if isinstance(field_value, list) else []
+
+            # Hide all storage-specific fields first
+            storage_fields = [
+                "aws_access_key_id",
+                "aws_secret_access_key",
+                "bucket_name",
+                "aws_region",
+                "s3_file_key",
+                "service_account_key",
+                "file_id",
+            ]
+
+            for f_name in storage_fields:
+                if f_name in build_config:
+                    build_config[f_name]["show"] = False
+
+            # Show fields based on selected storage location
+            if len(selected) == 1:
+                location = selected[0]
+
+                if location == "Local":
+                    # Show file upload input for local storage
+                    if "path" in build_config:
+                        build_config["path"]["show"] = True
+
+                elif location == "AWS":
+                    # Hide file upload input, show AWS fields
+                    if "path" in build_config:
+                        build_config["path"]["show"] = False
+
+                    aws_fields = [
+                        "aws_access_key_id",
+                        "aws_secret_access_key",
+                        "bucket_name",
+                        "aws_region",
+                        "s3_file_key",
+                    ]
+                    for f_name in aws_fields:
+                        if f_name in build_config:
+                            build_config[f_name]["show"] = True
+                            build_config[f_name]["advanced"] = False
+
+                elif location == "Google Drive":
+                    # Hide file upload input, show Google Drive fields
+                    if "path" in build_config:
+                        build_config["path"]["show"] = False
+
+                    gdrive_fields = ["service_account_key", "file_id"]
+                    for f_name in gdrive_fields:
+                        if f_name in build_config:
+                            build_config[f_name]["show"] = True
+                            build_config[f_name]["advanced"] = False
+            # No storage location selected - show file upload by default
+            elif "path" in build_config:
+                build_config["path"]["show"] = True
+
+            return build_config
+
         if field_name == "path":
             paths = self._path_value(build_config)

-            #
-
-
-
-
-
-
-
+            # Disable in cloud environments
+            if is_astra_cloud_environment():
+                self._disable_docling_fields_in_cloud(build_config)
+            else:
+                # If all files can be processed by docling, do so
+                allow_advanced = all(not file_path.endswith((".csv", ".xlsx", ".parquet")) for file_path in paths)
+                build_config["advanced_mode"]["show"] = allow_advanced
+                if not allow_advanced:
+                    build_config["advanced_mode"]["value"] = False
+                    docling_fields = (
+                        "pipeline",
+                        "ocr_engine",
+                        "doc_key",
+                        "md_image_placeholder",
+                        "md_page_break_placeholder",
+                    )
+                    for field in docling_fields:
+                        if field in build_config:
+                            build_config[field]["show"] = False

         # Docling Processing
         elif field_name == "advanced_mode":
-
-
-
-
-
+            # Disable in cloud environments - don't show Docling fields even if advanced_mode is toggled
+            if is_astra_cloud_environment():
+                self._disable_docling_fields_in_cloud(build_config)
+            else:
+                docling_fields = (
+                    "pipeline",
+                    "ocr_engine",
+                    "doc_key",
+                    "md_image_placeholder",
+                    "md_page_break_placeholder",
+                )
+                for field in docling_fields:
+                    if field in build_config:
+                        build_config[field]["show"] = bool(field_value)
+                        if field == "pipeline":
+                            build_config[field]["advanced"] = not bool(field_value)

         elif field_name == "pipeline":
-            if
+            # Disable in cloud environments - don't show OCR engine even if pipeline is changed
+            if is_astra_cloud_environment():
+                self._disable_docling_fields_in_cloud(build_config)
+            elif field_value == "standard":
                 build_config["ocr_engine"]["show"] = True
                 build_config["ocr_engine"]["value"] = "easyocr"
             else:
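The storage_location branch above follows a hide-everything-then-reveal pattern. The same logic, condensed and decoupled from the component so it can be exercised on its own (a sketch; build_config entries are treated as plain dicts with show/advanced flags, which is how the hunk itself uses them):

    STORAGE_FIELDS = {
        "AWS": ["aws_access_key_id", "aws_secret_access_key", "bucket_name", "aws_region", "s3_file_key"],
        "Google Drive": ["service_account_key", "file_id"],
    }

    def apply_selection(build_config: dict, selected: list[str]) -> dict:
        # Hide every location-specific field first.
        for fields in STORAGE_FIELDS.values():
            for name in fields:
                if name in build_config:
                    build_config[name]["show"] = False
        # Reveal only the group for the single selected location.
        if len(selected) == 1 and selected[0] in STORAGE_FIELDS:
            for name in STORAGE_FIELDS[selected[0]]:
                if name in build_config:
                    build_config[name]["show"] = True
                    build_config[name]["advanced"] = False
        return build_config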
@@ -368,15 +551,34 @@

     # ------------------------------ Core processing ----------------------------------

+    def _get_selected_storage_location(self) -> str:
+        """Get the selected storage location from the SortableListInput."""
+        if hasattr(self, "storage_location") and self.storage_location:
+            if isinstance(self.storage_location, list) and len(self.storage_location) > 0:
+                return self.storage_location[0].get("name", "")
+            if isinstance(self.storage_location, dict):
+                return self.storage_location.get("name", "")
+        return "Local"  # Default to Local if not specified
+
     def _validate_and_resolve_paths(self) -> list[BaseFileComponent.BaseFile]:
-        """Override to handle file_path_str input from tool mode.
+        """Override to handle file_path_str input from tool mode and cloud storage.

-        When called as a tool, the file_path_str parameter can be set.
-        If not provided, it will fall back to using the path FileInput (uploaded file).
         Priority:
-        1.
-        2.
+        1. Cloud storage (AWS/Google Drive) if selected
+        2. file_path_str (if provided by the tool call)
+        3. path (uploaded file from UI)
         """
+        storage_location = self._get_selected_storage_location()
+
+        # Handle AWS S3
+        if storage_location == "AWS":
+            return self._read_from_aws_s3()
+
+        # Handle Google Drive
+        if storage_location == "Google Drive":
+            return self._read_from_google_drive()
+
+        # Handle Local storage
         # Check if file_path_str is provided (from tool mode)
         file_path_str = getattr(self, "file_path_str", None)
         if file_path_str:
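_get_selected_storage_location tolerates both shapes the picker value may be persisted in and falls back to Local, which presumably keeps flows saved before this release working unchanged. The normalization in isolation:

    def selected_name(value) -> str:
        if isinstance(value, list) and value:
            return value[0].get("name", "")
        if isinstance(value, dict):
            return value.get("name", "")
        return "Local"  # default mirrors the pre-0.3 behavior

    assert selected_name([{"name": "AWS"}]) == "AWS"
    assert selected_name({"name": "Google Drive"}) == "Google Drive"
    assert selected_name(None) == "Local"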
@@ -399,6 +601,101 @@
         # Otherwise use the default implementation (uses path FileInput)
         return super()._validate_and_resolve_paths()

+    def _read_from_aws_s3(self) -> list[BaseFileComponent.BaseFile]:
+        """Read file from AWS S3."""
+        from lfx.base.data.cloud_storage_utils import create_s3_client, validate_aws_credentials
+
+        # Validate AWS credentials
+        validate_aws_credentials(self)
+        if not getattr(self, "s3_file_key", None):
+            msg = "S3 File Key is required"
+            raise ValueError(msg)
+
+        # Create S3 client
+        s3_client = create_s3_client(self)
+
+        # Download file to temp location
+        import tempfile
+
+        # Get file extension from S3 key
+        file_extension = Path(self.s3_file_key).suffix or ""
+
+        with tempfile.NamedTemporaryFile(mode="wb", suffix=file_extension, delete=False) as temp_file:
+            temp_file_path = temp_file.name
+            try:
+                s3_client.download_fileobj(self.bucket_name, self.s3_file_key, temp_file)
+            except Exception as e:
+                # Clean up temp file on failure
+                with contextlib.suppress(OSError):
+                    Path(temp_file_path).unlink()
+                msg = f"Failed to download file from S3: {e}"
+                raise RuntimeError(msg) from e
+
+        # Create BaseFile object
+        from lfx.schema.data import Data
+
+        temp_path = Path(temp_file_path)
+        data_obj = Data(data={self.SERVER_FILE_PATH_FIELDNAME: str(temp_path)})
+        return [BaseFileComponent.BaseFile(data_obj, temp_path, delete_after_processing=True)]
+
+    def _read_from_google_drive(self) -> list[BaseFileComponent.BaseFile]:
+        """Read file from Google Drive."""
+        import tempfile
+
+        from googleapiclient.http import MediaIoBaseDownload
+
+        from lfx.base.data.cloud_storage_utils import create_google_drive_service
+
+        # Validate Google Drive credentials
+        if not getattr(self, "service_account_key", None):
+            msg = "GCP Credentials Secret Key is required for Google Drive storage"
+            raise ValueError(msg)
+        if not getattr(self, "file_id", None):
+            msg = "Google Drive File ID is required"
+            raise ValueError(msg)
+
+        # Create Google Drive service with read-only scope
+        drive_service = create_google_drive_service(
+            self.service_account_key, scopes=["https://www.googleapis.com/auth/drive.readonly"]
+        )
+
+        # Get file metadata to determine file name and extension
+        try:
+            file_metadata = drive_service.files().get(fileId=self.file_id, fields="name,mimeType").execute()
+            file_name = file_metadata.get("name", "download")
+        except Exception as e:
+            msg = (
+                f"Unable to access file with ID '{self.file_id}'. "
+                f"Error: {e!s}. "
+                "Please ensure: 1) The file ID is correct, 2) The file exists, "
+                "3) The service account has been granted access to this file."
+            )
+            raise ValueError(msg) from e
+
+        # Download file to temp location
+        file_extension = Path(file_name).suffix or ""
+        with tempfile.NamedTemporaryFile(mode="wb", suffix=file_extension, delete=False) as temp_file:
+            temp_file_path = temp_file.name
+            try:
+                request = drive_service.files().get_media(fileId=self.file_id)
+                downloader = MediaIoBaseDownload(temp_file, request)
+                done = False
+                while not done:
+                    _status, done = downloader.next_chunk()
+            except Exception as e:
+                # Clean up temp file on failure
+                with contextlib.suppress(OSError):
+                    Path(temp_file_path).unlink()
+                msg = f"Failed to download file from Google Drive: {e}"
+                raise RuntimeError(msg) from e
+
+        # Create BaseFile object
+        from lfx.schema.data import Data
+
+        temp_path = Path(temp_file_path)
+        data_obj = Data(data={self.SERVER_FILE_PATH_FIELDNAME: str(temp_path)})
+        return [BaseFileComponent.BaseFile(data_obj, temp_path, delete_after_processing=True)]
+
     def _is_docling_compatible(self, file_path: str) -> bool:
         """Lightweight extension gate for Docling-compatible types."""
         docling_exts = (
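create_s3_client, validate_aws_credentials, and create_google_drive_service live in the new lfx/base/data/cloud_storage_utils.py (+156 lines), which this diff does not expand. A plausible sketch of the two S3 helpers, assuming boto3 underneath; the names appear in the diff, but the bodies below are inferred, not confirmed:

    import boto3

    def validate_aws_credentials(component) -> None:
        # Hypothetical check mirroring the required SecretStrInputs declared above.
        if not getattr(component, "aws_access_key_id", None) or not getattr(component, "aws_secret_access_key", None):
            msg = "AWS Access Key ID and AWS Secret Key are required"
            raise ValueError(msg)

    def create_s3_client(component):
        # boto3.client("s3", ...) accepts explicit credentials and an optional region.
        return boto3.client(
            "s3",
            aws_access_key_id=component.aws_access_key_id,
            aws_secret_access_key=component.aws_secret_access_key,
            region_name=getattr(component, "aws_region", None) or None,
        )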
@@ -515,9 +812,6 @@
             ),
         }

-        self.log(f"Starting Docling subprocess for file: {local_file_path}")
-        self.log(args)
-
         # Child script for isolating the docling processing
         child_script = textwrap.dedent(
             r"""
@@ -707,7 +1001,7 @@
         )

         if not proc.stdout:
-            err_msg = proc.stderr.decode("utf-8", errors="replace")
+            err_msg = proc.stderr.decode("utf-8", errors="replace") if proc.stderr else "no output from child process"
             return Data(data={"error": f"Docling subprocess error: {err_msg}", "file_path": original_file_path})

         try:
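The one-line change above matters because subprocess only populates stderr when it is piped; otherwise stderr is None, and the old .decode() call would raise AttributeError instead of reporting the real failure. A minimal demonstration:

    import subprocess
    import sys

    proc = subprocess.run([sys.executable, "-c", "pass"], check=False)  # stderr not piped
    assert proc.stderr is None

    proc = subprocess.run([sys.executable, "-c", "pass"], capture_output=True, check=False)
    assert proc.stderr == b""  # captured, but empty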
@@ -722,9 +1016,16 @@
         )

         if not result.get("ok"):
-
+            error_msg = result.get("error", "Unknown Docling error")
+            # Override meta file_path with original_file_path to ensure correct path matching
+            meta = result.get("meta", {})
+            meta["file_path"] = original_file_path
+            return Data(data={"error": error_msg, **meta})

         meta = result.get("meta", {})
+        # Override meta file_path with original_file_path to ensure correct path matching
+        # The subprocess returns the temp file path, but we need the original S3/local path for rollup_data
+        meta["file_path"] = original_file_path
         if result.get("mode") == "markdown":
             exported_content = str(result.get("text", ""))
             return Data(
@@ -751,12 +1052,20 @@
         # Validate image files to detect content/extension mismatches
         # This prevents API errors like "Image does not match the provided media type"
         image_extensions = {"jpeg", "jpg", "png", "gif", "webp", "bmp", "tiff"}
+        settings = get_settings_service().settings
         for file in file_list:
             extension = file.path.suffix[1:].lower()
             if extension in image_extensions:
-                #
+                # Read bytes based on storage type
                 try:
-
+                    if settings.storage_type == "s3":
+                        # For S3 storage, use storage service to read file bytes
+                        file_path_str = str(file.path)
+                        content = run_until_complete(read_file_bytes(file_path_str))
+                    else:
+                        # For local storage, read bytes directly from filesystem
+                        content = file.path.read_bytes()
+
                     is_valid, error_msg = validate_image_content_type(
                         str(file.path),
                         content=content,
|
|
|
765
1074
|
self.log(error_msg)
|
|
766
1075
|
if not self.silent_errors:
|
|
767
1076
|
raise ValueError(error_msg)
|
|
768
|
-
except OSError as e:
|
|
1077
|
+
except (OSError, FileNotFoundError) as e:
|
|
769
1078
|
self.log(f"Could not read file for validation: {e}")
|
|
770
1079
|
# Continue - let it fail later with better error
|
|
771
1080
|
|
|
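Note: FileNotFoundError is already a subclass of OSError, so widening the except clause does not change which exceptions are caught; the practical value of the new tuple is documentation, making explicit that a missing file (for example, a temp file already cleaned up) is an expected, non-fatal case here.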
@@ -774,10 +1083,16 @@
         for file in file_list:
             extension = file.path.suffix[1:].lower()
             if extension in self.DOCLING_ONLY_EXTENSIONS:
-
-
-
-
+                if is_astra_cloud_environment():
+                    msg = (
+                        f"File '{file.path.name}' has extension '.{extension}' which requires "
+                        f"Advanced Parser mode. Advanced Parser is not available in cloud environments."
+                    )
+                else:
+                    msg = (
+                        f"File '{file.path.name}' has extension '.{extension}' which requires "
+                        f"Advanced Parser mode. Please enable 'Advanced Parser' to process this file."
+                    )
                 self.log(msg)
                 raise ValueError(msg)

@@ -804,8 +1119,33 @@
             file_path = str(file.path)
             advanced_data: Data | None = self._process_docling_in_subprocess(file_path)

+            # Handle None case - Docling processing failed or returned None
+            if advanced_data is None:
+                error_data = Data(
+                    data={
+                        "file_path": file_path,
+                        "error": "Docling processing returned no result. Check logs for details.",
+                    },
+                )
+                final_return.extend(self.rollup_data([file], [error_data]))
+                continue
+
             # --- UNNEST: expand each element in `doc` to its own Data row
             payload = getattr(advanced_data, "data", {}) or {}
+
+            # Check for errors first
+            if "error" in payload:
+                error_msg = payload.get("error", "Unknown error")
+                error_data = Data(
+                    data={
+                        "file_path": file_path,
+                        "error": error_msg,
+                        **{k: v for k, v in payload.items() if k not in ("error", "file_path")},
+                    },
+                )
+                final_return.extend(self.rollup_data([file], [error_data]))
+                continue
+
             doc_rows = payload.get("doc")
             if isinstance(doc_rows, list) and doc_rows:
                 # Non-empty list of structured rows
@@ -818,7 +1158,7 @@
                     )
                     for item in doc_rows
                 ]
-                final_return.extend(self.rollup_data(
+                final_return.extend(self.rollup_data([file], rows))
             elif isinstance(doc_rows, list) and not doc_rows:
                 # Empty list - file was processed but no text content found
                 # Create a Data object indicating no content was extracted
|
|
|
834
1174
|
final_return.extend(self.rollup_data([file], [empty_data]))
|
|
835
1175
|
else:
|
|
836
1176
|
# If not structured, keep as-is (e.g., markdown export or error dict)
|
|
837
|
-
|
|
1177
|
+
# Ensure file_path is set for proper rollup matching
|
|
1178
|
+
if not payload.get("file_path"):
|
|
1179
|
+
payload["file_path"] = file_path
|
|
1180
|
+
# Create new Data with file_path
|
|
1181
|
+
advanced_data = Data(
|
|
1182
|
+
data=payload,
|
|
1183
|
+
text=getattr(advanced_data, "text", None),
|
|
1184
|
+
)
|
|
1185
|
+
final_return.extend(self.rollup_data([file], [advanced_data]))
|
|
838
1186
|
return final_return
|
|
839
1187
|
|
|
840
1188
|
# Standard multi-file (or single non-advanced) path
|
|
lfx/components/files_and_knowledge/ingestion.py

@@ -38,6 +38,7 @@ from lfx.services.deps import (
     get_variable_service,
     session_scope,
 )
+from lfx.utils.validate_cloud import raise_error_if_astra_cloud_disable_component

 if TYPE_CHECKING:
     from lfx.schema.dataframe import DataFrame
@@ -50,6 +51,9 @@ COHERE_MODEL_NAMES = ["embed-english-v3.0", "embed-multilingual-v3.0"]

 _KNOWLEDGE_BASES_ROOT_PATH: Path | None = None

+# Error message to raise if we're in Astra cloud environment and the component is not supported.
+astra_error_msg = "Knowledge ingestion is not supported in Astra cloud environment."
+

 def _get_knowledge_bases_root_path() -> Path:
     """Lazy load the knowledge bases root path from settings."""
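raise_error_if_astra_cloud_disable_component is imported from lfx/utils/validate_cloud.py, which grows by 81 lines in this release but is not expanded in this diff. A sketch of the guard it presumably implements, reusing the is_astra_cloud_environment check that file.py imports above (the body below is a guess; only the import is confirmed by this diff):

    from lfx.utils.validate_cloud import is_astra_cloud_environment

    def raise_error_if_astra_cloud_disable_component(message: str) -> None:
        # Hypothetical reconstruction: fail fast when the component cannot run in Astra cloud.
        if is_astra_cloud_environment():
            raise ValueError(message)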
@@ -540,6 +544,8 @@
     # ---------------------------------------------------------------------
     async def build_kb_info(self) -> Data:
         """Main ingestion routine → returns a dict with KB metadata."""
+        # Check if we're in Astra cloud environment and raise an error if we are.
+        raise_error_if_astra_cloud_disable_component(astra_error_msg)
         try:
             input_value = self.input_df[0] if isinstance(self.input_df, list) else self.input_df
             df_source: DataFrame = convert_to_dataframe(input_value, auto_parse=False)
@@ -626,6 +632,8 @@
         field_name: str | None = None,
     ):
         """Update build configuration based on provider selection."""
+        # Check if we're in Astra cloud environment and raise an error if we are.
+        raise_error_if_astra_cloud_disable_component(astra_error_msg)
         # Create a new knowledge base
         if field_name == "knowledge_base":
             async with session_scope() as db:
lfx/components/files_and_knowledge/retrieval.py

@@ -15,9 +15,13 @@ from lfx.log.logger import logger
 from lfx.schema.data import Data
 from lfx.schema.dataframe import DataFrame
 from lfx.services.deps import get_settings_service, session_scope
+from lfx.utils.validate_cloud import raise_error_if_astra_cloud_disable_component

 _KNOWLEDGE_BASES_ROOT_PATH: Path | None = None

+# Error message to raise if we're in Astra cloud environment and the component is not supported.
+astra_error_msg = "Knowledge retrieval is not supported in Astra cloud environment."
+

 def _get_knowledge_bases_root_path() -> Path:
     """Lazy load the knowledge bases root path from settings."""
@@ -95,6 +99,8 @@
     ]

     async def update_build_config(self, build_config, field_value, field_name=None):  # noqa: ARG002
+        # Check if we're in Astra cloud environment and raise an error if we are.
+        raise_error_if_astra_cloud_disable_component(astra_error_msg)
         if field_name == "knowledge_base":
             # Update the knowledge base options dynamically
             build_config["knowledge_base"]["options"] = await get_knowledge_bases(
@@ -110,6 +116,8 @@

     def _get_kb_metadata(self, kb_path: Path) -> dict:
         """Load and process knowledge base metadata."""
+        # Check if we're in Astra cloud environment and raise an error if we are.
+        raise_error_if_astra_cloud_disable_component(astra_error_msg)
         metadata: dict[str, Any] = {}
         metadata_file = kb_path / "embedding_metadata.json"
         if not metadata_file.exists():
@@ -184,6 +192,8 @@
         Returns:
             A DataFrame containing the data rows from the knowledge base.
         """
+        # Check if we're in Astra cloud environment and raise an error if we are.
+        raise_error_if_astra_cloud_disable_component(astra_error_msg)
         # Get the current user
         async with session_scope() as db:
             if not self.user_id: