lfx-nightly 0.2.0.dev0__py3-none-any.whl → 0.2.0.dev41__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes exactly as the versions appear in their public registry.
- lfx/_assets/component_index.json +1 -1
- lfx/base/agents/agent.py +21 -4
- lfx/base/agents/altk_base_agent.py +393 -0
- lfx/base/agents/altk_tool_wrappers.py +565 -0
- lfx/base/agents/events.py +2 -1
- lfx/base/composio/composio_base.py +159 -224
- lfx/base/data/base_file.py +97 -20
- lfx/base/data/docling_utils.py +61 -10
- lfx/base/data/storage_utils.py +301 -0
- lfx/base/data/utils.py +178 -14
- lfx/base/mcp/util.py +2 -2
- lfx/base/models/anthropic_constants.py +21 -12
- lfx/base/models/groq_constants.py +74 -58
- lfx/base/models/groq_model_discovery.py +265 -0
- lfx/base/models/model.py +1 -1
- lfx/base/models/model_utils.py +100 -0
- lfx/base/models/openai_constants.py +7 -0
- lfx/base/models/watsonx_constants.py +32 -8
- lfx/base/tools/run_flow.py +601 -129
- lfx/cli/commands.py +9 -4
- lfx/cli/common.py +2 -2
- lfx/cli/run.py +1 -1
- lfx/cli/script_loader.py +53 -11
- lfx/components/Notion/create_page.py +1 -1
- lfx/components/Notion/list_database_properties.py +1 -1
- lfx/components/Notion/list_pages.py +1 -1
- lfx/components/Notion/list_users.py +1 -1
- lfx/components/Notion/page_content_viewer.py +1 -1
- lfx/components/Notion/search.py +1 -1
- lfx/components/Notion/update_page_property.py +1 -1
- lfx/components/__init__.py +19 -5
- lfx/components/{agents → altk}/__init__.py +5 -9
- lfx/components/altk/altk_agent.py +193 -0
- lfx/components/apify/apify_actor.py +1 -1
- lfx/components/composio/__init__.py +70 -18
- lfx/components/composio/apollo_composio.py +11 -0
- lfx/components/composio/bitbucket_composio.py +11 -0
- lfx/components/composio/canva_composio.py +11 -0
- lfx/components/composio/coda_composio.py +11 -0
- lfx/components/composio/composio_api.py +10 -0
- lfx/components/composio/discord_composio.py +1 -1
- lfx/components/composio/elevenlabs_composio.py +11 -0
- lfx/components/composio/exa_composio.py +11 -0
- lfx/components/composio/firecrawl_composio.py +11 -0
- lfx/components/composio/fireflies_composio.py +11 -0
- lfx/components/composio/gmail_composio.py +1 -1
- lfx/components/composio/googlebigquery_composio.py +11 -0
- lfx/components/composio/googlecalendar_composio.py +1 -1
- lfx/components/composio/googledocs_composio.py +1 -1
- lfx/components/composio/googlemeet_composio.py +1 -1
- lfx/components/composio/googlesheets_composio.py +1 -1
- lfx/components/composio/googletasks_composio.py +1 -1
- lfx/components/composio/heygen_composio.py +11 -0
- lfx/components/composio/mem0_composio.py +11 -0
- lfx/components/composio/peopledatalabs_composio.py +11 -0
- lfx/components/composio/perplexityai_composio.py +11 -0
- lfx/components/composio/serpapi_composio.py +11 -0
- lfx/components/composio/slack_composio.py +3 -574
- lfx/components/composio/slackbot_composio.py +1 -1
- lfx/components/composio/snowflake_composio.py +11 -0
- lfx/components/composio/tavily_composio.py +11 -0
- lfx/components/composio/youtube_composio.py +2 -2
- lfx/components/cuga/__init__.py +34 -0
- lfx/components/cuga/cuga_agent.py +730 -0
- lfx/components/data/__init__.py +78 -28
- lfx/components/data_source/__init__.py +58 -0
- lfx/components/{data → data_source}/api_request.py +26 -3
- lfx/components/{data → data_source}/csv_to_data.py +15 -10
- lfx/components/{data → data_source}/json_to_data.py +15 -8
- lfx/components/{data → data_source}/news_search.py +1 -1
- lfx/components/{data → data_source}/rss.py +1 -1
- lfx/components/{data → data_source}/sql_executor.py +1 -1
- lfx/components/{data → data_source}/url.py +1 -1
- lfx/components/{data → data_source}/web_search.py +1 -1
- lfx/components/datastax/astradb_cql.py +1 -1
- lfx/components/datastax/astradb_graph.py +1 -1
- lfx/components/datastax/astradb_tool.py +1 -1
- lfx/components/datastax/astradb_vectorstore.py +1 -1
- lfx/components/datastax/hcd.py +1 -1
- lfx/components/deactivated/json_document_builder.py +1 -1
- lfx/components/docling/__init__.py +0 -3
- lfx/components/docling/chunk_docling_document.py +3 -1
- lfx/components/docling/export_docling_document.py +3 -1
- lfx/components/elastic/elasticsearch.py +1 -1
- lfx/components/files_and_knowledge/__init__.py +47 -0
- lfx/components/{data → files_and_knowledge}/directory.py +1 -1
- lfx/components/{data → files_and_knowledge}/file.py +304 -24
- lfx/components/{knowledge_bases → files_and_knowledge}/retrieval.py +2 -2
- lfx/components/{data → files_and_knowledge}/save_file.py +218 -31
- lfx/components/flow_controls/__init__.py +58 -0
- lfx/components/{logic → flow_controls}/conditional_router.py +1 -1
- lfx/components/{logic → flow_controls}/loop.py +43 -9
- lfx/components/flow_controls/run_flow.py +108 -0
- lfx/components/glean/glean_search_api.py +1 -1
- lfx/components/groq/groq.py +35 -28
- lfx/components/helpers/__init__.py +102 -0
- lfx/components/ibm/watsonx.py +7 -1
- lfx/components/input_output/__init__.py +3 -1
- lfx/components/input_output/chat.py +4 -3
- lfx/components/input_output/chat_output.py +10 -4
- lfx/components/input_output/text.py +1 -1
- lfx/components/input_output/text_output.py +1 -1
- lfx/components/{data → input_output}/webhook.py +1 -1
- lfx/components/knowledge_bases/__init__.py +59 -4
- lfx/components/langchain_utilities/character.py +1 -1
- lfx/components/langchain_utilities/csv_agent.py +84 -16
- lfx/components/langchain_utilities/json_agent.py +67 -12
- lfx/components/langchain_utilities/language_recursive.py +1 -1
- lfx/components/llm_operations/__init__.py +46 -0
- lfx/components/{processing → llm_operations}/batch_run.py +17 -8
- lfx/components/{processing → llm_operations}/lambda_filter.py +1 -1
- lfx/components/{logic → llm_operations}/llm_conditional_router.py +1 -1
- lfx/components/{processing/llm_router.py → llm_operations/llm_selector.py} +3 -3
- lfx/components/{processing → llm_operations}/structured_output.py +1 -1
- lfx/components/logic/__init__.py +126 -0
- lfx/components/mem0/mem0_chat_memory.py +11 -0
- lfx/components/models/__init__.py +64 -9
- lfx/components/models_and_agents/__init__.py +49 -0
- lfx/components/{agents → models_and_agents}/agent.py +6 -4
- lfx/components/models_and_agents/embedding_model.py +353 -0
- lfx/components/models_and_agents/language_model.py +398 -0
- lfx/components/{agents → models_and_agents}/mcp_component.py +53 -44
- lfx/components/{helpers → models_and_agents}/memory.py +1 -1
- lfx/components/nvidia/system_assist.py +1 -1
- lfx/components/olivya/olivya.py +1 -1
- lfx/components/ollama/ollama.py +24 -5
- lfx/components/processing/__init__.py +9 -60
- lfx/components/processing/converter.py +1 -1
- lfx/components/processing/dataframe_operations.py +1 -1
- lfx/components/processing/parse_json_data.py +2 -2
- lfx/components/processing/parser.py +1 -1
- lfx/components/processing/split_text.py +1 -1
- lfx/components/qdrant/qdrant.py +1 -1
- lfx/components/redis/redis.py +1 -1
- lfx/components/twelvelabs/split_video.py +10 -0
- lfx/components/twelvelabs/video_file.py +12 -0
- lfx/components/utilities/__init__.py +43 -0
- lfx/components/{helpers → utilities}/calculator_core.py +1 -1
- lfx/components/{helpers → utilities}/current_date.py +1 -1
- lfx/components/{processing → utilities}/python_repl_core.py +1 -1
- lfx/components/vectorstores/local_db.py +9 -0
- lfx/components/youtube/youtube_transcripts.py +118 -30
- lfx/custom/custom_component/component.py +57 -1
- lfx/custom/custom_component/custom_component.py +68 -6
- lfx/custom/directory_reader/directory_reader.py +5 -2
- lfx/graph/edge/base.py +43 -20
- lfx/graph/state/model.py +15 -2
- lfx/graph/utils.py +6 -0
- lfx/graph/vertex/param_handler.py +10 -7
- lfx/helpers/__init__.py +12 -0
- lfx/helpers/flow.py +117 -0
- lfx/inputs/input_mixin.py +24 -1
- lfx/inputs/inputs.py +13 -1
- lfx/interface/components.py +161 -83
- lfx/log/logger.py +5 -3
- lfx/schema/image.py +2 -12
- lfx/services/database/__init__.py +5 -0
- lfx/services/database/service.py +25 -0
- lfx/services/deps.py +87 -22
- lfx/services/interfaces.py +5 -0
- lfx/services/manager.py +24 -10
- lfx/services/mcp_composer/service.py +1029 -162
- lfx/services/session.py +5 -0
- lfx/services/settings/auth.py +18 -11
- lfx/services/settings/base.py +56 -30
- lfx/services/settings/constants.py +8 -0
- lfx/services/storage/local.py +108 -46
- lfx/services/storage/service.py +171 -29
- lfx/template/field/base.py +3 -0
- lfx/utils/image.py +29 -11
- lfx/utils/ssrf_protection.py +384 -0
- lfx/utils/validate_cloud.py +26 -0
- {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/METADATA +38 -22
- {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/RECORD +189 -160
- {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/WHEEL +1 -1
- lfx/components/agents/altk_agent.py +0 -366
- lfx/components/agents/cuga_agent.py +0 -1013
- lfx/components/docling/docling_remote_vlm.py +0 -284
- lfx/components/logic/run_flow.py +0 -71
- lfx/components/models/embedding_model.py +0 -195
- lfx/components/models/language_model.py +0 -144
- lfx/components/processing/dataframe_to_toolset.py +0 -259
- /lfx/components/{data → data_source}/mock_data.py +0 -0
- /lfx/components/{knowledge_bases → files_and_knowledge}/ingestion.py +0 -0
- /lfx/components/{logic → flow_controls}/data_conditional_router.py +0 -0
- /lfx/components/{logic → flow_controls}/flow_tool.py +0 -0
- /lfx/components/{logic → flow_controls}/listen.py +0 -0
- /lfx/components/{logic → flow_controls}/notify.py +0 -0
- /lfx/components/{logic → flow_controls}/pass_message.py +0 -0
- /lfx/components/{logic → flow_controls}/sub_flow.py +0 -0
- /lfx/components/{processing → models_and_agents}/prompt.py +0 -0
- /lfx/components/{helpers → processing}/create_list.py +0 -0
- /lfx/components/{helpers → processing}/output_parser.py +0 -0
- /lfx/components/{helpers → processing}/store_message.py +0 -0
- /lfx/components/{helpers → utilities}/id_generator.py +0 -0
- {lfx_nightly-0.2.0.dev0.dist-info → lfx_nightly-0.2.0.dev41.dist-info}/entry_points.txt +0 -0
lfx/base/data/base_file.py
CHANGED
@@ -2,6 +2,7 @@ import ast
 import shutil
 import tarfile
 from abc import ABC, abstractmethod
+from io import BytesIO
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import TYPE_CHECKING, Any
@@ -10,11 +11,14 @@ from zipfile import ZipFile, is_zipfile
 import orjson
 import pandas as pd
 
+from lfx.base.data.storage_utils import get_file_size, read_file_bytes
 from lfx.custom.custom_component.component import Component
 from lfx.io import BoolInput, FileInput, HandleInput, Output, StrInput
 from lfx.schema.data import Data
 from lfx.schema.dataframe import DataFrame
 from lfx.schema.message import Message
+from lfx.services.deps import get_settings_service
+from lfx.utils.async_helpers import run_until_complete
 from lfx.utils.helpers import build_content_type_from_extension
 
 if TYPE_CHECKING:
@@ -27,6 +31,8 @@ class BaseFileComponent(Component, ABC):
     This class provides common functionality for resolving, validating, and
     processing file paths. Child classes must define valid file extensions
     and implement the `process_files` method.
+
+    # TODO: May want to subclass for local and remote files
     """
 
     class BaseFile:
@@ -251,12 +257,25 @@
 
         file_path = data_item.file_path
        file_path_obj = Path(file_path)
-        file_size_stat = file_path_obj.stat()
        filename = file_path_obj.name
 
+        settings = get_settings_service().settings
+        if settings.storage_type == "s3":
+            try:
+                file_size = get_file_size(file_path)
+            except (FileNotFoundError, ValueError):
+                # If we can't get file size, set to 0 or omit
+                file_size = 0
+        else:
+            try:
+                file_size_stat = file_path_obj.stat()
+                file_size = file_size_stat.st_size
+            except OSError:
+                file_size = 0
+
         # Basic file metadata
         metadata["filename"] = filename
-        metadata["file_size"] =
+        metadata["file_size"] = file_size
 
         # Add MIME type from extension
         extension = filename.split(".")[-1]
@@ -321,7 +340,16 @@
             Message: Message containing file paths
         """
         files = self._validate_and_resolve_paths()
-
+        settings = get_settings_service().settings
+
+        # For S3 storage, paths are virtual storage keys that don't exist on the local filesystem.
+        # Skip the exists() check for S3 files to preserve them in the output.
+        # Validation of S3 file existence is deferred until file processing (see _validate_and_resolve_paths).
+        # If a file was removed from S3, it will fail when attempting to read/process it later.
+        if settings.storage_type == "s3":
+            paths = [file.path.as_posix() for file in files]
+        else:
+            paths = [file.path.as_posix() for file in files if file.path.exists()]
 
         return Message(text="\n".join(paths) if paths else "")
 
@@ -329,7 +357,29 @@
         if not file_path:
             return None
 
-        #
+        # Get file extension in lowercase
+        ext = Path(file_path).suffix.lower()
+
+        settings = get_settings_service().settings
+
+        # For S3 storage, download file bytes first
+        if settings.storage_type == "s3":
+            # Download file content from S3
+            content = run_until_complete(read_file_bytes(file_path))
+
+            # Map file extensions to pandas read functions that support BytesIO
+            if ext == ".csv":
+                result = pd.read_csv(BytesIO(content))
+            elif ext == ".xlsx":
+                result = pd.read_excel(BytesIO(content))
+            elif ext == ".parquet":
+                result = pd.read_parquet(BytesIO(content))
+            else:
+                return None
+
+            return result.to_dict("records")
+
+        # Local storage - read directly from filesystem
         file_readers: dict[str, Callable[[str], pd.DataFrame]] = {
             ".csv": pd.read_csv,
             ".xlsx": pd.read_excel,
@@ -337,9 +387,6 @@
             # TODO: sqlite and json support?
         }
 
-        # Get file extension in lowercase
-        ext = Path(file_path).suffix.lower()
-
         # Get the appropriate reader function or None
         reader = file_readers.get(ext)
 
@@ -558,16 +605,38 @@
         resolved_files = []
 
         def add_file(data: Data, path: str | Path, *, delete_after_processing: bool):
-
+            path_str = str(path)
+            settings = get_settings_service().settings
+
+            # When using object storage (S3), file paths are storage keys (e.g., "<flow_id>/<filename>")
+            # that don't exist on the local filesystem. We defer validation until file processing.
+            # For local storage, validate the file exists immediately to fail fast.
+            if settings.storage_type == "s3":
+                resolved_files.append(
+                    BaseFileComponent.BaseFile(data, Path(path_str), delete_after_processing=delete_after_processing)
+                )
+            else:
+                # Check if path looks like a storage path (flow_id/filename format)
+                # If so, use get_full_path to resolve it to the actual storage location
+                if "/" in path_str and not Path(path_str).is_absolute():
+                    try:
+                        resolved_path = Path(self.get_full_path(path_str))
+                        self.log(f"Resolved storage path '{path_str}' to '{resolved_path}'")
+                    except (ValueError, AttributeError) as e:
+                        # Fallback to resolve_path if get_full_path fails
+                        self.log(f"get_full_path failed for '{path_str}': {e}, falling back to resolve_path")
+                        resolved_path = Path(self.resolve_path(path_str))
+                else:
+                    resolved_path = Path(self.resolve_path(path_str))
 
-
-
-
-
-
-
-
+                if not resolved_path.exists():
+                    msg = f"File not found: '{path}' (resolved to: '{resolved_path}'). Please upload the file again."
+                    self.log(msg)
+                    if not self.silent_errors:
+                        raise ValueError(msg)
+                resolved_files.append(
+                    BaseFileComponent.BaseFile(data, resolved_path, delete_after_processing=delete_after_processing)
+                )
 
         file_path = self._file_path_as_list()
 
@@ -707,7 +776,7 @@
             raise ValueError(msg)
 
     def _filter_and_mark_files(self, files: list[BaseFile]) -> list[BaseFile]:
-        """Validate file types and
+        """Validate file types and filter out invalid files.
 
         Args:
             files (list[BaseFile]): List of BaseFile instances.
@@ -718,18 +787,26 @@
         Raises:
             ValueError: If unsupported files are encountered and `ignore_unsupported_extensions` is False.
         """
+        settings = get_settings_service().settings
+        is_s3_storage = settings.storage_type == "s3"
         final_files = []
         ignored_files = []
 
         for file in files:
-
+            # For local storage, verify the path is actually a file
+            # For S3 storage, paths are virtual keys that don't exist locally
+            if not is_s3_storage and not file.path.is_file():
                 self.log(f"Not a file: {file.path.name}")
                 continue
 
-
-
+            # Validate file extension
+            extension = file.path.suffix[1:].lower() if file.path.suffix else ""
+            if extension not in self.valid_extensions:
+                # For local storage, optionally ignore unsupported extensions
+                if not is_s3_storage and self.ignore_unsupported_extensions:
                    ignored_files.append(file.path.name)
                    continue
+
                msg = f"Unsupported file extension: {file.path.suffix}"
                self.log(msg)
                if not self.silent_errors:
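
Note on the hunks above: they all follow one pattern — branch on settings.storage_type, treat S3 paths as virtual "flow_id/filename" keys that never exist on the local filesystem, and download bytes before parsing. A minimal sketch of that read path, assuming the lfx.base.data.storage_utils module added in this release is importable; load_records and its key format are illustrative, not part of the diff:

from io import BytesIO
from pathlib import Path

import pandas as pd

from lfx.base.data.storage_utils import read_file_bytes
from lfx.services.deps import get_settings_service
from lfx.utils.async_helpers import run_until_complete


def load_records(file_path: str) -> list[dict] | None:
    """Read a tabular file into records from S3 or the local filesystem."""
    ext = Path(file_path).suffix.lower()
    readers = {".csv": pd.read_csv, ".xlsx": pd.read_excel, ".parquet": pd.read_parquet}
    reader = readers.get(ext)
    if reader is None:
        return None  # unsupported extension
    settings = get_settings_service().settings
    if settings.storage_type == "s3":
        # S3 keys are never local paths; fetch the bytes and parse in memory.
        content = run_until_complete(read_file_bytes(file_path))
        return reader(BytesIO(content)).to_dict("records")
    return reader(file_path).to_dict("records")
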
lfx/base/data/docling_utils.py
CHANGED
@@ -25,21 +25,72 @@ class DoclingDependencyError(Exception):
         super().__init__(f"{dependency_name} is not correctly installed. {install_command}")
 
 
-def extract_docling_documents(
+def extract_docling_documents(
+    data_inputs: Data | list[Data] | DataFrame, doc_key: str
+) -> tuple[list[DoclingDocument], str | None]:
+    """Extract DoclingDocument objects from data inputs.
+
+    Args:
+        data_inputs: The data inputs containing DoclingDocument objects
+        doc_key: The key/column name to look for DoclingDocument objects
+
+    Returns:
+        A tuple of (documents, warning_message) where warning_message is None if no warning
+
+    Raises:
+        TypeError: If the data cannot be extracted or is invalid
+    """
     documents: list[DoclingDocument] = []
+    warning_message: str | None = None
+
     if isinstance(data_inputs, DataFrame):
         if not len(data_inputs):
             msg = "DataFrame is empty"
             raise TypeError(msg)
 
-
-
-
-
-
-
-
-
+        # Primary: Check for exact column name match
+        if doc_key in data_inputs.columns:
+            try:
+                documents = data_inputs[doc_key].tolist()
+            except Exception as e:
+                msg = f"Error extracting DoclingDocument from DataFrame column '{doc_key}': {e}"
+                raise TypeError(msg) from e
+        else:
+            # Fallback: Search all columns for DoclingDocument objects
+            found_column = None
+            for col in data_inputs.columns:
+                try:
+                    # Check if this column contains DoclingDocument objects
+                    sample = data_inputs[col].dropna().iloc[0] if len(data_inputs[col].dropna()) > 0 else None
+                    if sample is not None and isinstance(sample, DoclingDocument):
+                        found_column = col
+                        break
+                except (IndexError, AttributeError):
+                    continue
+
+            if found_column:
+                warning_message = (
+                    f"Column '{doc_key}' not found, but found DoclingDocument objects in column '{found_column}'. "
+                    f"Using '{found_column}' instead. Consider updating the 'Doc Key' parameter."
+                )
+                logger.warning(warning_message)
+                try:
+                    documents = data_inputs[found_column].tolist()
+                except Exception as e:
+                    msg = f"Error extracting DoclingDocument from DataFrame column '{found_column}': {e}"
+                    raise TypeError(msg) from e
+            else:
+                # Provide helpful error message
+                available_columns = list(data_inputs.columns)
+                msg = (
+                    f"Column '{doc_key}' not found in DataFrame. "
+                    f"Available columns: {available_columns}. "
+                    f"\n\nPossible solutions:\n"
+                    f"1. Use the 'Data' output from Docling component instead of 'DataFrame' output\n"
+                    f"2. Update the 'Doc Key' parameter to match one of the available columns\n"
+                    f"3. If using VLM pipeline, try using the standard pipeline"
+                )
+                raise TypeError(msg)
     else:
         if not data_inputs:
             msg = "No data inputs provided"
@@ -69,7 +120,7 @@ def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_ke
         except AttributeError as e:
             msg = f"Invalid input type in collection: {e}"
             raise TypeError(msg) from e
-    return documents
+    return documents, warning_message
 
 
 def _unwrap_secrets(obj):
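
Callers of extract_docling_documents must now unpack a (documents, warning) tuple instead of receiving a bare list. A hedged sketch of an updated call site; collect_documents is a hypothetical caller, not part of this diff:

from lfx.base.data.docling_utils import extract_docling_documents


def collect_documents(data_inputs, doc_key: str = "doc"):
    # The second element is None unless the column-fallback path fired.
    documents, warning = extract_docling_documents(data_inputs, doc_key)
    if warning:
        print(warning)  # e.g., surface it on the component's status field
    return documents
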
lfx/base/data/storage_utils.py
ADDED
@@ -0,0 +1,301 @@
+"""Storage-aware file utilities for components.
+
+This module provides utilities that work with both local files and remote files
+stored in the storage service.
+
+TODO: Can abstract these into the storage service interface and update
+implementations.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from lfx.services.deps import get_settings_service, get_storage_service
+from lfx.utils.async_helpers import run_until_complete
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from lfx.services.storage.service import StorageService
+
+# Constants for path parsing
+EXPECTED_PATH_PARTS = 2  # Path format: "flow_id/filename"
+
+
+def parse_storage_path(path: str) -> tuple[str, str] | None:
+    """Parse a storage service path into flow_id and filename.
+
+    Storage service paths follow the format: flow_id/filename
+    This should only be called when storage_type == "s3".
+
+    Args:
+        path: The storage service path in format "flow_id/filename"
+
+    Returns:
+        tuple[str, str] | None: (flow_id, filename) or None if invalid format
+    """
+    if not path or "/" not in path:
+        return None
+
+    parts = path.split("/", 1)
+    if len(parts) != EXPECTED_PATH_PARTS or not parts[0] or not parts[1]:
+        return None
+
+    return parts[0], parts[1]
+
+
+async def read_file_bytes(
+    file_path: str,
+    storage_service: StorageService | None = None,
+    resolve_path: Callable[[str], str] | None = None,
+) -> bytes:
+    """Read file bytes from either storage service or local filesystem.
+
+    Args:
+        file_path: Path to the file (S3 key format "flow_id/filename" or local path)
+        storage_service: Optional storage service instance (will get from deps if not provided)
+        resolve_path: Optional function to resolve relative paths to absolute paths
+            (typically Component.resolve_path). Only used for local storage.
+
+    Returns:
+        bytes: The file content
+
+    Raises:
+        FileNotFoundError: If the file doesn't exist
+    """
+    settings = get_settings_service().settings
+
+    if settings.storage_type == "s3":
+        parsed = parse_storage_path(file_path)
+        if not parsed:
+            msg = f"Invalid S3 path format: {file_path}. Expected 'flow_id/filename'"
+            raise ValueError(msg)
+
+        if storage_service is None:
+            storage_service = get_storage_service()
+
+        flow_id, filename = parsed
+        return await storage_service.get_file(flow_id, filename)
+
+    # For local storage, resolve path if resolver provided
+    if resolve_path:
+        file_path = resolve_path(file_path)
+
+    path_obj = Path(file_path)
+    if not path_obj.exists():
+        msg = f"File not found: {file_path}"
+        raise FileNotFoundError(msg)
+
+    return path_obj.read_bytes()
+
+
+async def read_file_text(
+    file_path: str,
+    encoding: str = "utf-8",
+    storage_service: StorageService | None = None,
+    resolve_path: Callable[[str], str] | None = None,
+    newline: str | None = None,
+) -> str:
+    r"""Read file text from either storage service or local filesystem.
+
+    Args:
+        file_path: Path to the file (storage service path or local path)
+        encoding: Text encoding to use
+        storage_service: Optional storage service instance
+        resolve_path: Optional function to resolve relative paths to absolute paths
+            (typically Component.resolve_path). Only used for local storage.
+        newline: Newline mode (None for default, "" for universal newlines like CSV).
+            When set to "", normalizes all line endings to \\n for consistency.
+
+    Returns:
+        str: The file content as text
+
+    Raises:
+        FileNotFoundError: If the file doesn't exist
+    """
+    settings = get_settings_service().settings
+
+    if settings.storage_type == "s3":
+        content = await read_file_bytes(file_path, storage_service, resolve_path)
+        text = content.decode(encoding)
+        # Normalize newlines for S3 when newline="" is specified (universal newline mode)
+        if newline == "":
+            # Convert all line endings to \n (matches Python's universal newline mode)
+            text = text.replace("\r\n", "\n").replace("\r", "\n")
+        return text
+    # For local storage, resolve path if resolver provided
+    if resolve_path:
+        file_path = resolve_path(file_path)
+
+    path_obj = Path(file_path)
+    if newline is not None:
+        with path_obj.open(newline=newline, encoding=encoding) as f:  # noqa: ASYNC230
+            return f.read()
+    return path_obj.read_text(encoding=encoding)
+
+
+def get_file_size(file_path: str, storage_service: StorageService | None = None) -> int:
+    """Get file size from either storage service or local filesystem.
+
+    Note: This is a sync wrapper - for async code, use the storage service directly.
+
+    Args:
+        file_path: Path to the file (S3 key format "flow_id/filename" or absolute local path)
+        storage_service: Optional storage service instance
+
+    Returns:
+        int: File size in bytes
+
+    Raises:
+        FileNotFoundError: If the file doesn't exist
+    """
+    settings = get_settings_service().settings
+
+    if settings.storage_type == "s3":
+        parsed = parse_storage_path(file_path)
+        if not parsed:
+            msg = f"Invalid S3 path format: {file_path}. Expected 'flow_id/filename'"
+            raise ValueError(msg)
+
+        if storage_service is None:
+            storage_service = get_storage_service()
+
+        flow_id, filename = parsed
+        return run_until_complete(storage_service.get_file_size(flow_id, filename))
+
+    # Local file system
+    path_obj = Path(file_path)
+    if not path_obj.exists():
+        msg = f"File not found: {file_path}"
+        raise FileNotFoundError(msg)
+
+    return path_obj.stat().st_size
+
+
+def file_exists(file_path: str, storage_service: StorageService | None = None) -> bool:
+    """Check if a file exists in either storage service or local filesystem.
+
+    Args:
+        file_path: Path to the file (S3 key format "flow_id/filename" or absolute local path)
+        storage_service: Optional storage service instance
+
+    Returns:
+        bool: True if the file exists
+    """
+    try:
+        get_file_size(file_path, storage_service)
+    except (FileNotFoundError, ValueError):
+        return False
+    else:
+        return True
+
+
+# Magic bytes signatures for common image formats
+MIN_IMAGE_HEADER_SIZE = 12  # Minimum bytes needed to detect image type
+
+IMAGE_SIGNATURES: dict[str, list[tuple[bytes, int]]] = {
+    "jpeg": [(b"\xff\xd8\xff", 0)],
+    "jpg": [(b"\xff\xd8\xff", 0)],
+    "png": [(b"\x89PNG\r\n\x1a\n", 0)],
+    "gif": [(b"GIF87a", 0), (b"GIF89a", 0)],
+    "webp": [(b"RIFF", 0)],  # WebP starts with RIFF, then has WEBP at offset 8
+    "bmp": [(b"BM", 0)],
+    "tiff": [(b"II*\x00", 0), (b"MM\x00*", 0)],  # Little-endian and big-endian TIFF
+}
+
+
+def detect_image_type_from_bytes(content: bytes) -> str | None:
+    """Detect the actual image type from file content using magic bytes.
+
+    Args:
+        content: The file content bytes (at least first 12 bytes needed)
+
+    Returns:
+        str | None: The detected image type (e.g., "jpeg", "png") or None if not recognized
+    """
+    if len(content) < MIN_IMAGE_HEADER_SIZE:
+        return None
+
+    # Check WebP specifically (needs to check both RIFF and WEBP)
+    if content[:4] == b"RIFF" and content[8:12] == b"WEBP":
+        return "webp"
+
+    # Check other image signatures
+    for image_type, signatures in IMAGE_SIGNATURES.items():
+        if image_type == "webp":
+            continue  # Already handled above
+        for signature, offset in signatures:
+            if content[offset : offset + len(signature)] == signature:
+                return image_type
+
+    return None
+
+
+def validate_image_content_type(
+    file_path: str,
+    content: bytes | None = None,
+    storage_service: StorageService | None = None,
+    resolve_path: Callable[[str], str] | None = None,
+) -> tuple[bool, str | None]:
+    """Validate that an image file's content matches its declared extension.
+
+    This prevents errors like "Image does not match the provided media type image/png"
+    when a JPEG file is saved with a .png extension.
+
+    Only rejects files when we can definitively detect a mismatch. Files with
+    unrecognized content are allowed through (they may fail later, but that's
+    better than false positives blocking valid files).
+
+    Args:
+        file_path: Path to the image file
+        content: Optional pre-read file content bytes. If not provided, will read from file.
+        storage_service: Optional storage service instance for S3 files
+        resolve_path: Optional function to resolve relative paths
+
+    Returns:
+        tuple[bool, str | None]: (is_valid, error_message)
+        - (True, None) if the content matches the extension, is unrecognized, or file is not an image
+        - (False, error_message) if there's a definite mismatch
+    """
+    # Get the file extension
+    path_obj = Path(file_path)
+    extension = path_obj.suffix[1:].lower() if path_obj.suffix else ""
+
+    # Only validate image files
+    image_extensions = {"jpeg", "jpg", "png", "gif", "webp", "bmp", "tiff"}
+    if extension not in image_extensions:
+        return True, None
+
+    # Read content if not provided
+    if content is None:
+        try:
+            content = run_until_complete(read_file_bytes(file_path, storage_service, resolve_path))
+        except (FileNotFoundError, ValueError):
+            # Can't read file - let it pass, will fail later with better error
+            return True, None
+
+    # Detect actual image type
+    detected_type = detect_image_type_from_bytes(content)
+
+    # If we can't detect the type, the file is not a valid image
+    if detected_type is None:
+        return False, (
+            f"File '{path_obj.name}' has extension '.{extension}' but its content "
+            f"is not a valid image format. The file may be corrupted, empty, or not a real image."
+        )
+
+    # Normalize extensions for comparison (jpg == jpeg, tif == tiff)
+    extension_normalized = "jpeg" if extension == "jpg" else extension
+    detected_normalized = "jpeg" if detected_type == "jpg" else detected_type
+
+    if extension_normalized != detected_normalized:
+        return False, (
+            f"File '{path_obj.name}' has extension '.{extension}' but contains "
+            f"'{detected_type.upper()}' image data. This mismatch will cause API errors. "
+            f"Please rename the file with the correct extension '.{detected_type}' or "
+            f"re-save it in the correct format."
+        )
+
+    return True, None
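
A short usage sketch of the helpers this new module exposes; the key "my-flow-id/report.png" is invented for illustration, and the file_exists/validate calls assume S3 storage is configured:

from lfx.base.data.storage_utils import (
    detect_image_type_from_bytes,
    file_exists,
    parse_storage_path,
    validate_image_content_type,
)

key = "my-flow-id/report.png"
print(parse_storage_path(key))         # ('my-flow-id', 'report.png')
print(parse_storage_path("no-slash"))  # None: keys must be "flow_id/filename"

if file_exists(key):
    ok, err = validate_image_content_type(key)
    if not ok:
        print(err)  # e.g., JPEG bytes saved under a .png extension

# Magic-byte sniffing needs no storage access at all:
print(detect_image_type_from_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 8))  # "png"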