lfx-nightly 0.1.13.dev0__py3-none-any.whl → 0.2.0.dev26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lfx/_assets/component_index.json +1 -1
- lfx/base/agents/agent.py +121 -29
- lfx/base/agents/altk_base_agent.py +380 -0
- lfx/base/agents/altk_tool_wrappers.py +565 -0
- lfx/base/agents/events.py +103 -35
- lfx/base/agents/utils.py +15 -2
- lfx/base/composio/composio_base.py +183 -233
- lfx/base/data/base_file.py +88 -21
- lfx/base/data/storage_utils.py +192 -0
- lfx/base/data/utils.py +178 -14
- lfx/base/datastax/__init__.py +5 -0
- lfx/{components/vectorstores/astradb.py → base/datastax/astradb_base.py} +84 -473
- lfx/base/embeddings/embeddings_class.py +113 -0
- lfx/base/io/chat.py +5 -4
- lfx/base/mcp/util.py +101 -15
- lfx/base/models/groq_constants.py +74 -58
- lfx/base/models/groq_model_discovery.py +265 -0
- lfx/base/models/model.py +1 -1
- lfx/base/models/model_input_constants.py +74 -7
- lfx/base/models/model_utils.py +100 -0
- lfx/base/models/ollama_constants.py +3 -0
- lfx/base/models/openai_constants.py +7 -0
- lfx/base/models/watsonx_constants.py +36 -0
- lfx/base/tools/run_flow.py +601 -129
- lfx/cli/commands.py +7 -4
- lfx/cli/common.py +2 -2
- lfx/cli/run.py +1 -1
- lfx/cli/script_loader.py +53 -11
- lfx/components/Notion/create_page.py +1 -1
- lfx/components/Notion/list_database_properties.py +1 -1
- lfx/components/Notion/list_pages.py +1 -1
- lfx/components/Notion/list_users.py +1 -1
- lfx/components/Notion/page_content_viewer.py +1 -1
- lfx/components/Notion/search.py +1 -1
- lfx/components/Notion/update_page_property.py +1 -1
- lfx/components/__init__.py +19 -5
- lfx/components/altk/__init__.py +34 -0
- lfx/components/altk/altk_agent.py +193 -0
- lfx/components/amazon/amazon_bedrock_converse.py +1 -1
- lfx/components/apify/apify_actor.py +4 -4
- lfx/components/composio/__init__.py +70 -18
- lfx/components/composio/apollo_composio.py +11 -0
- lfx/components/composio/bitbucket_composio.py +11 -0
- lfx/components/composio/canva_composio.py +11 -0
- lfx/components/composio/coda_composio.py +11 -0
- lfx/components/composio/composio_api.py +10 -0
- lfx/components/composio/discord_composio.py +1 -1
- lfx/components/composio/elevenlabs_composio.py +11 -0
- lfx/components/composio/exa_composio.py +11 -0
- lfx/components/composio/firecrawl_composio.py +11 -0
- lfx/components/composio/fireflies_composio.py +11 -0
- lfx/components/composio/gmail_composio.py +1 -1
- lfx/components/composio/googlebigquery_composio.py +11 -0
- lfx/components/composio/googlecalendar_composio.py +1 -1
- lfx/components/composio/googledocs_composio.py +1 -1
- lfx/components/composio/googlemeet_composio.py +1 -1
- lfx/components/composio/googlesheets_composio.py +1 -1
- lfx/components/composio/googletasks_composio.py +1 -1
- lfx/components/composio/heygen_composio.py +11 -0
- lfx/components/composio/mem0_composio.py +11 -0
- lfx/components/composio/peopledatalabs_composio.py +11 -0
- lfx/components/composio/perplexityai_composio.py +11 -0
- lfx/components/composio/serpapi_composio.py +11 -0
- lfx/components/composio/slack_composio.py +3 -574
- lfx/components/composio/slackbot_composio.py +1 -1
- lfx/components/composio/snowflake_composio.py +11 -0
- lfx/components/composio/tavily_composio.py +11 -0
- lfx/components/composio/youtube_composio.py +2 -2
- lfx/components/{agents → cuga}/__init__.py +5 -7
- lfx/components/cuga/cuga_agent.py +730 -0
- lfx/components/data/__init__.py +78 -28
- lfx/components/data_source/__init__.py +58 -0
- lfx/components/{data → data_source}/api_request.py +26 -3
- lfx/components/{data → data_source}/csv_to_data.py +15 -10
- lfx/components/{data → data_source}/json_to_data.py +15 -8
- lfx/components/{data → data_source}/news_search.py +1 -1
- lfx/components/{data → data_source}/rss.py +1 -1
- lfx/components/{data → data_source}/sql_executor.py +1 -1
- lfx/components/{data → data_source}/url.py +1 -1
- lfx/components/{data → data_source}/web_search.py +1 -1
- lfx/components/datastax/__init__.py +12 -6
- lfx/components/datastax/{astra_assistant_manager.py → astradb_assistant_manager.py} +1 -0
- lfx/components/datastax/astradb_chatmemory.py +40 -0
- lfx/components/datastax/astradb_cql.py +6 -32
- lfx/components/datastax/astradb_graph.py +10 -124
- lfx/components/datastax/astradb_tool.py +13 -53
- lfx/components/datastax/astradb_vectorstore.py +134 -977
- lfx/components/datastax/create_assistant.py +1 -0
- lfx/components/datastax/create_thread.py +1 -0
- lfx/components/datastax/dotenv.py +1 -0
- lfx/components/datastax/get_assistant.py +1 -0
- lfx/components/datastax/getenvvar.py +1 -0
- lfx/components/datastax/graph_rag.py +1 -1
- lfx/components/datastax/hcd.py +1 -1
- lfx/components/datastax/list_assistants.py +1 -0
- lfx/components/datastax/run.py +1 -0
- lfx/components/deactivated/json_document_builder.py +1 -1
- lfx/components/elastic/elasticsearch.py +1 -1
- lfx/components/elastic/opensearch_multimodal.py +1575 -0
- lfx/components/files_and_knowledge/__init__.py +47 -0
- lfx/components/{data → files_and_knowledge}/directory.py +1 -1
- lfx/components/{data → files_and_knowledge}/file.py +246 -18
- lfx/components/{knowledge_bases → files_and_knowledge}/ingestion.py +17 -9
- lfx/components/{knowledge_bases → files_and_knowledge}/retrieval.py +18 -10
- lfx/components/{data → files_and_knowledge}/save_file.py +142 -22
- lfx/components/flow_controls/__init__.py +58 -0
- lfx/components/{logic → flow_controls}/conditional_router.py +1 -1
- lfx/components/{logic → flow_controls}/loop.py +47 -9
- lfx/components/flow_controls/run_flow.py +108 -0
- lfx/components/glean/glean_search_api.py +1 -1
- lfx/components/groq/groq.py +35 -28
- lfx/components/helpers/__init__.py +102 -0
- lfx/components/ibm/watsonx.py +25 -21
- lfx/components/input_output/__init__.py +3 -1
- lfx/components/input_output/chat.py +12 -3
- lfx/components/input_output/chat_output.py +12 -4
- lfx/components/input_output/text.py +1 -1
- lfx/components/input_output/text_output.py +1 -1
- lfx/components/{data → input_output}/webhook.py +1 -1
- lfx/components/knowledge_bases/__init__.py +59 -4
- lfx/components/langchain_utilities/character.py +1 -1
- lfx/components/langchain_utilities/csv_agent.py +84 -16
- lfx/components/langchain_utilities/json_agent.py +67 -12
- lfx/components/langchain_utilities/language_recursive.py +1 -1
- lfx/components/llm_operations/__init__.py +46 -0
- lfx/components/{processing → llm_operations}/batch_run.py +1 -1
- lfx/components/{processing → llm_operations}/lambda_filter.py +1 -1
- lfx/components/{logic → llm_operations}/llm_conditional_router.py +1 -1
- lfx/components/{processing/llm_router.py → llm_operations/llm_selector.py} +3 -3
- lfx/components/{processing → llm_operations}/structured_output.py +56 -18
- lfx/components/logic/__init__.py +126 -0
- lfx/components/mem0/mem0_chat_memory.py +11 -0
- lfx/components/mistral/mistral_embeddings.py +1 -1
- lfx/components/models/__init__.py +64 -9
- lfx/components/models_and_agents/__init__.py +49 -0
- lfx/components/{agents → models_and_agents}/agent.py +49 -6
- lfx/components/models_and_agents/embedding_model.py +423 -0
- lfx/components/models_and_agents/language_model.py +398 -0
- lfx/components/{agents → models_and_agents}/mcp_component.py +84 -45
- lfx/components/{helpers → models_and_agents}/memory.py +1 -1
- lfx/components/nvidia/system_assist.py +1 -1
- lfx/components/olivya/olivya.py +1 -1
- lfx/components/ollama/ollama.py +235 -14
- lfx/components/openrouter/openrouter.py +49 -147
- lfx/components/processing/__init__.py +9 -57
- lfx/components/processing/converter.py +1 -1
- lfx/components/processing/dataframe_operations.py +1 -1
- lfx/components/processing/parse_json_data.py +2 -2
- lfx/components/processing/parser.py +7 -2
- lfx/components/processing/split_text.py +1 -1
- lfx/components/qdrant/qdrant.py +1 -1
- lfx/components/redis/redis.py +1 -1
- lfx/components/twelvelabs/split_video.py +10 -0
- lfx/components/twelvelabs/video_file.py +12 -0
- lfx/components/utilities/__init__.py +43 -0
- lfx/components/{helpers → utilities}/calculator_core.py +1 -1
- lfx/components/{helpers → utilities}/current_date.py +1 -1
- lfx/components/{processing → utilities}/python_repl_core.py +1 -1
- lfx/components/vectorstores/__init__.py +0 -6
- lfx/components/vectorstores/local_db.py +9 -0
- lfx/components/youtube/youtube_transcripts.py +118 -30
- lfx/custom/custom_component/component.py +60 -3
- lfx/custom/custom_component/custom_component.py +68 -6
- lfx/field_typing/constants.py +1 -0
- lfx/graph/edge/base.py +45 -22
- lfx/graph/graph/base.py +5 -2
- lfx/graph/graph/schema.py +3 -2
- lfx/graph/state/model.py +15 -2
- lfx/graph/utils.py +6 -0
- lfx/graph/vertex/base.py +4 -1
- lfx/graph/vertex/param_handler.py +10 -7
- lfx/graph/vertex/vertex_types.py +1 -1
- lfx/helpers/__init__.py +12 -0
- lfx/helpers/flow.py +117 -0
- lfx/inputs/input_mixin.py +24 -1
- lfx/inputs/inputs.py +13 -1
- lfx/interface/components.py +161 -83
- lfx/io/schema.py +6 -0
- lfx/log/logger.py +5 -3
- lfx/schema/schema.py +5 -0
- lfx/services/database/__init__.py +5 -0
- lfx/services/database/service.py +25 -0
- lfx/services/deps.py +87 -22
- lfx/services/manager.py +19 -6
- lfx/services/mcp_composer/service.py +998 -157
- lfx/services/session.py +5 -0
- lfx/services/settings/base.py +51 -7
- lfx/services/settings/constants.py +8 -0
- lfx/services/storage/local.py +76 -46
- lfx/services/storage/service.py +152 -29
- lfx/template/field/base.py +3 -0
- lfx/utils/ssrf_protection.py +384 -0
- lfx/utils/validate_cloud.py +26 -0
- {lfx_nightly-0.1.13.dev0.dist-info → lfx_nightly-0.2.0.dev26.dist-info}/METADATA +38 -22
- {lfx_nightly-0.1.13.dev0.dist-info → lfx_nightly-0.2.0.dev26.dist-info}/RECORD +210 -196
- {lfx_nightly-0.1.13.dev0.dist-info → lfx_nightly-0.2.0.dev26.dist-info}/WHEEL +1 -1
- lfx/components/agents/cuga_agent.py +0 -1013
- lfx/components/datastax/astra_db.py +0 -77
- lfx/components/datastax/cassandra.py +0 -92
- lfx/components/logic/run_flow.py +0 -71
- lfx/components/models/embedding_model.py +0 -114
- lfx/components/models/language_model.py +0 -144
- lfx/components/vectorstores/astradb_graph.py +0 -326
- lfx/components/vectorstores/cassandra.py +0 -264
- lfx/components/vectorstores/cassandra_graph.py +0 -238
- lfx/components/vectorstores/chroma.py +0 -167
- lfx/components/vectorstores/clickhouse.py +0 -135
- lfx/components/vectorstores/couchbase.py +0 -102
- lfx/components/vectorstores/elasticsearch.py +0 -267
- lfx/components/vectorstores/faiss.py +0 -111
- lfx/components/vectorstores/graph_rag.py +0 -141
- lfx/components/vectorstores/hcd.py +0 -314
- lfx/components/vectorstores/milvus.py +0 -115
- lfx/components/vectorstores/mongodb_atlas.py +0 -213
- lfx/components/vectorstores/opensearch.py +0 -243
- lfx/components/vectorstores/pgvector.py +0 -72
- lfx/components/vectorstores/pinecone.py +0 -134
- lfx/components/vectorstores/qdrant.py +0 -109
- lfx/components/vectorstores/supabase.py +0 -76
- lfx/components/vectorstores/upstash.py +0 -124
- lfx/components/vectorstores/vectara.py +0 -97
- lfx/components/vectorstores/vectara_rag.py +0 -164
- lfx/components/vectorstores/weaviate.py +0 -89
- /lfx/components/{data → data_source}/mock_data.py +0 -0
- /lfx/components/datastax/{astra_vectorize.py → astradb_vectorize.py} +0 -0
- /lfx/components/{logic → flow_controls}/data_conditional_router.py +0 -0
- /lfx/components/{logic → flow_controls}/flow_tool.py +0 -0
- /lfx/components/{logic → flow_controls}/listen.py +0 -0
- /lfx/components/{logic → flow_controls}/notify.py +0 -0
- /lfx/components/{logic → flow_controls}/pass_message.py +0 -0
- /lfx/components/{logic → flow_controls}/sub_flow.py +0 -0
- /lfx/components/{processing → models_and_agents}/prompt.py +0 -0
- /lfx/components/{helpers → processing}/create_list.py +0 -0
- /lfx/components/{helpers → processing}/output_parser.py +0 -0
- /lfx/components/{helpers → processing}/store_message.py +0 -0
- /lfx/components/{helpers → utilities}/id_generator.py +0 -0
- {lfx_nightly-0.1.13.dev0.dist-info → lfx_nightly-0.2.0.dev26.dist-info}/entry_points.txt +0 -0
lfx/base/data/base_file.py
CHANGED
|
@@ -2,6 +2,7 @@ import ast
|
|
|
2
2
|
import shutil
|
|
3
3
|
import tarfile
|
|
4
4
|
from abc import ABC, abstractmethod
|
|
5
|
+
from io import BytesIO
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
from tempfile import TemporaryDirectory
|
|
7
8
|
from typing import TYPE_CHECKING, Any
|
|
@@ -10,11 +11,14 @@ from zipfile import ZipFile, is_zipfile
|
|
|
10
11
|
import orjson
|
|
11
12
|
import pandas as pd
|
|
12
13
|
|
|
14
|
+
from lfx.base.data.storage_utils import get_file_size, read_file_bytes
|
|
13
15
|
from lfx.custom.custom_component.component import Component
|
|
14
16
|
from lfx.io import BoolInput, FileInput, HandleInput, Output, StrInput
|
|
15
17
|
from lfx.schema.data import Data
|
|
16
18
|
from lfx.schema.dataframe import DataFrame
|
|
17
19
|
from lfx.schema.message import Message
|
|
20
|
+
from lfx.services.deps import get_settings_service
|
|
21
|
+
from lfx.utils.async_helpers import run_until_complete
|
|
18
22
|
from lfx.utils.helpers import build_content_type_from_extension
|
|
19
23
|
|
|
20
24
|
if TYPE_CHECKING:
|
|
@@ -27,6 +31,8 @@ class BaseFileComponent(Component, ABC):
|
|
|
27
31
|
This class provides common functionality for resolving, validating, and
|
|
28
32
|
processing file paths. Child classes must define valid file extensions
|
|
29
33
|
and implement the `process_files` method.
|
|
34
|
+
|
|
35
|
+
# TODO: May want to subclass for local and remote files
|
|
30
36
|
"""
|
|
31
37
|
|
|
32
38
|
class BaseFile:
|
|
@@ -251,12 +257,27 @@ class BaseFileComponent(Component, ABC):
|
|
|
251
257
|
|
|
252
258
|
file_path = data_item.file_path
|
|
253
259
|
file_path_obj = Path(file_path)
|
|
254
|
-
file_size_stat = file_path_obj.stat()
|
|
255
260
|
filename = file_path_obj.name
|
|
256
261
|
|
|
262
|
+
settings = get_settings_service().settings
|
|
263
|
+
|
|
264
|
+
# Get file size - use storage service for S3, filesystem for local
|
|
265
|
+
if settings.storage_type == "s3":
|
|
266
|
+
try:
|
|
267
|
+
file_size = get_file_size(file_path)
|
|
268
|
+
except (FileNotFoundError, ValueError):
|
|
269
|
+
# If we can't get file size, set to 0 or omit
|
|
270
|
+
file_size = 0
|
|
271
|
+
else:
|
|
272
|
+
try:
|
|
273
|
+
file_size_stat = file_path_obj.stat()
|
|
274
|
+
file_size = file_size_stat.st_size
|
|
275
|
+
except OSError:
|
|
276
|
+
file_size = 0
|
|
277
|
+
|
|
257
278
|
# Basic file metadata
|
|
258
279
|
metadata["filename"] = filename
|
|
259
|
-
metadata["file_size"] =
|
|
280
|
+
metadata["file_size"] = file_size
|
|
260
281
|
|
|
261
282
|
# Add MIME type from extension
|
|
262
283
|
extension = filename.split(".")[-1]
|
|
@@ -321,7 +342,16 @@ class BaseFileComponent(Component, ABC):
|
|
|
321
342
|
Message: Message containing file paths
|
|
322
343
|
"""
|
|
323
344
|
files = self._validate_and_resolve_paths()
|
|
324
|
-
|
|
345
|
+
settings = get_settings_service().settings
|
|
346
|
+
|
|
347
|
+
# For S3 storage, paths are virtual storage keys that don't exist on the local filesystem.
|
|
348
|
+
# Skip the exists() check for S3 files to preserve them in the output.
|
|
349
|
+
# Validation of S3 file existence is deferred until file processing (see _validate_and_resolve_paths).
|
|
350
|
+
# If a file was removed from S3, it will fail when attempting to read/process it later.
|
|
351
|
+
if settings.storage_type == "s3":
|
|
352
|
+
paths = [file.path.as_posix() for file in files]
|
|
353
|
+
else:
|
|
354
|
+
paths = [file.path.as_posix() for file in files if file.path.exists()]
|
|
325
355
|
|
|
326
356
|
return Message(text="\n".join(paths) if paths else "")
|
|
327
357
|
|
|
@@ -329,7 +359,29 @@ class BaseFileComponent(Component, ABC):
|
|
|
329
359
|
if not file_path:
|
|
330
360
|
return None
|
|
331
361
|
|
|
332
|
-
#
|
|
362
|
+
# Get file extension in lowercase
|
|
363
|
+
ext = Path(file_path).suffix.lower()
|
|
364
|
+
|
|
365
|
+
settings = get_settings_service().settings
|
|
366
|
+
|
|
367
|
+
# For S3 storage, download file bytes first
|
|
368
|
+
if settings.storage_type == "s3":
|
|
369
|
+
# Download file content from S3
|
|
370
|
+
content = run_until_complete(read_file_bytes(file_path))
|
|
371
|
+
|
|
372
|
+
# Map file extensions to pandas read functions that support BytesIO
|
|
373
|
+
if ext == ".csv":
|
|
374
|
+
result = pd.read_csv(BytesIO(content))
|
|
375
|
+
elif ext == ".xlsx":
|
|
376
|
+
result = pd.read_excel(BytesIO(content))
|
|
377
|
+
elif ext == ".parquet":
|
|
378
|
+
result = pd.read_parquet(BytesIO(content))
|
|
379
|
+
else:
|
|
380
|
+
return None
|
|
381
|
+
|
|
382
|
+
return result.to_dict("records")
|
|
383
|
+
|
|
384
|
+
# Local storage - read directly from filesystem
|
|
333
385
|
file_readers: dict[str, Callable[[str], pd.DataFrame]] = {
|
|
334
386
|
".csv": pd.read_csv,
|
|
335
387
|
".xlsx": pd.read_excel,
|
|
@@ -337,9 +389,6 @@ class BaseFileComponent(Component, ABC):
|
|
|
337
389
|
# TODO: sqlite and json support?
|
|
338
390
|
}
|
|
339
391
|
|
|
340
|
-
# Get file extension in lowercase
|
|
341
|
-
ext = Path(file_path).suffix.lower()
|
|
342
|
-
|
|
343
392
|
# Get the appropriate reader function or None
|
|
344
393
|
reader = file_readers.get(ext)
|
|
345
394
|
|
|
@@ -558,16 +607,26 @@ class BaseFileComponent(Component, ABC):
|
|
|
558
607
|
resolved_files = []
|
|
559
608
|
|
|
560
609
|
def add_file(data: Data, path: str | Path, *, delete_after_processing: bool):
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
610
|
+
path_str = str(path)
|
|
611
|
+
settings = get_settings_service().settings
|
|
612
|
+
|
|
613
|
+
# When using object storage (S3), file paths are storage keys (e.g., "<flow_id>/<filename>")
|
|
614
|
+
# that don't exist on the local filesystem. We defer validation until file processing.
|
|
615
|
+
# For local storage, validate the file exists immediately to fail fast.
|
|
616
|
+
if settings.storage_type == "s3":
|
|
617
|
+
resolved_files.append(
|
|
618
|
+
BaseFileComponent.BaseFile(data, Path(path_str), delete_after_processing=delete_after_processing)
|
|
619
|
+
)
|
|
620
|
+
else:
|
|
621
|
+
resolved_path = Path(self.resolve_path(path_str))
|
|
622
|
+
if not resolved_path.exists():
|
|
623
|
+
msg = f"File or directory not found: {path}"
|
|
624
|
+
self.log(msg)
|
|
625
|
+
if not self.silent_errors:
|
|
626
|
+
raise ValueError(msg)
|
|
627
|
+
resolved_files.append(
|
|
628
|
+
BaseFileComponent.BaseFile(data, resolved_path, delete_after_processing=delete_after_processing)
|
|
629
|
+
)
|
|
571
630
|
|
|
572
631
|
file_path = self._file_path_as_list()
|
|
573
632
|
|
|
@@ -707,7 +766,7 @@ class BaseFileComponent(Component, ABC):
|
|
|
707
766
|
raise ValueError(msg)
|
|
708
767
|
|
|
709
768
|
def _filter_and_mark_files(self, files: list[BaseFile]) -> list[BaseFile]:
|
|
710
|
-
"""Validate file types and
|
|
769
|
+
"""Validate file types and filter out invalid files.
|
|
711
770
|
|
|
712
771
|
Args:
|
|
713
772
|
files (list[BaseFile]): List of BaseFile instances.
|
|
@@ -718,18 +777,26 @@ class BaseFileComponent(Component, ABC):
|
|
|
718
777
|
Raises:
|
|
719
778
|
ValueError: If unsupported files are encountered and `ignore_unsupported_extensions` is False.
|
|
720
779
|
"""
|
|
780
|
+
settings = get_settings_service().settings
|
|
781
|
+
is_s3_storage = settings.storage_type == "s3"
|
|
721
782
|
final_files = []
|
|
722
783
|
ignored_files = []
|
|
723
784
|
|
|
724
785
|
for file in files:
|
|
725
|
-
|
|
786
|
+
# For local storage, verify the path is actually a file
|
|
787
|
+
# For S3 storage, paths are virtual keys that don't exist locally
|
|
788
|
+
if not is_s3_storage and not file.path.is_file():
|
|
726
789
|
self.log(f"Not a file: {file.path.name}")
|
|
727
790
|
continue
|
|
728
791
|
|
|
729
|
-
|
|
730
|
-
|
|
792
|
+
# Validate file extension
|
|
793
|
+
extension = file.path.suffix[1:].lower() if file.path.suffix else ""
|
|
794
|
+
if extension not in self.valid_extensions:
|
|
795
|
+
# For local storage, optionally ignore unsupported extensions
|
|
796
|
+
if not is_s3_storage and self.ignore_unsupported_extensions:
|
|
731
797
|
ignored_files.append(file.path.name)
|
|
732
798
|
continue
|
|
799
|
+
|
|
733
800
|
msg = f"Unsupported file extension: {file.path.suffix}"
|
|
734
801
|
self.log(msg)
|
|
735
802
|
if not self.silent_errors:
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""Storage-aware file utilities for components.
|
|
2
|
+
|
|
3
|
+
This module provides utilities that work with both local files and remote files
|
|
4
|
+
stored in the storage service.
|
|
5
|
+
|
|
6
|
+
TODO: Can abstract these into the storage service interface and update
|
|
7
|
+
implementations.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import TYPE_CHECKING
|
|
14
|
+
|
|
15
|
+
from lfx.services.deps import get_settings_service, get_storage_service
|
|
16
|
+
from lfx.utils.async_helpers import run_until_complete
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from collections.abc import Callable
|
|
20
|
+
|
|
21
|
+
from lfx.services.storage.service import StorageService
|
|
22
|
+
|
|
23
|
+
# Constants for path parsing
|
|
24
|
+
EXPECTED_PATH_PARTS = 2 # Path format: "flow_id/filename"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def parse_storage_path(path: str) -> tuple[str, str] | None:
|
|
28
|
+
"""Parse a storage service path into flow_id and filename.
|
|
29
|
+
|
|
30
|
+
Storage service paths follow the format: flow_id/filename
|
|
31
|
+
This should only be called when storage_type == "s3".
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
path: The storage service path in format "flow_id/filename"
|
|
35
|
+
|
|
36
|
+
Returns:
|
|
37
|
+
tuple[str, str] | None: (flow_id, filename) or None if invalid format
|
|
38
|
+
"""
|
|
39
|
+
if not path or "/" not in path:
|
|
40
|
+
return None
|
|
41
|
+
|
|
42
|
+
parts = path.split("/", 1)
|
|
43
|
+
if len(parts) != EXPECTED_PATH_PARTS or not parts[0] or not parts[1]:
|
|
44
|
+
return None
|
|
45
|
+
|
|
46
|
+
return parts[0], parts[1]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
async def read_file_bytes(
|
|
50
|
+
file_path: str,
|
|
51
|
+
storage_service: StorageService | None = None,
|
|
52
|
+
resolve_path: Callable[[str], str] | None = None,
|
|
53
|
+
) -> bytes:
|
|
54
|
+
"""Read file bytes from either storage service or local filesystem.
|
|
55
|
+
|
|
56
|
+
Args:
|
|
57
|
+
file_path: Path to the file (S3 key format "flow_id/filename" or local path)
|
|
58
|
+
storage_service: Optional storage service instance (will get from deps if not provided)
|
|
59
|
+
resolve_path: Optional function to resolve relative paths to absolute paths
|
|
60
|
+
(typically Component.resolve_path). Only used for local storage.
|
|
61
|
+
|
|
62
|
+
Returns:
|
|
63
|
+
bytes: The file content
|
|
64
|
+
|
|
65
|
+
Raises:
|
|
66
|
+
FileNotFoundError: If the file doesn't exist
|
|
67
|
+
"""
|
|
68
|
+
settings = get_settings_service().settings
|
|
69
|
+
|
|
70
|
+
if settings.storage_type == "s3":
|
|
71
|
+
parsed = parse_storage_path(file_path)
|
|
72
|
+
if not parsed:
|
|
73
|
+
msg = f"Invalid S3 path format: {file_path}. Expected 'flow_id/filename'"
|
|
74
|
+
raise ValueError(msg)
|
|
75
|
+
|
|
76
|
+
if storage_service is None:
|
|
77
|
+
storage_service = get_storage_service()
|
|
78
|
+
|
|
79
|
+
flow_id, filename = parsed
|
|
80
|
+
return await storage_service.get_file(flow_id, filename)
|
|
81
|
+
|
|
82
|
+
# For local storage, resolve path if resolver provided
|
|
83
|
+
if resolve_path:
|
|
84
|
+
file_path = resolve_path(file_path)
|
|
85
|
+
|
|
86
|
+
path_obj = Path(file_path)
|
|
87
|
+
if not path_obj.exists():
|
|
88
|
+
msg = f"File not found: {file_path}"
|
|
89
|
+
raise FileNotFoundError(msg)
|
|
90
|
+
|
|
91
|
+
return path_obj.read_bytes()
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
async def read_file_text(
|
|
95
|
+
file_path: str,
|
|
96
|
+
encoding: str = "utf-8",
|
|
97
|
+
storage_service: StorageService | None = None,
|
|
98
|
+
resolve_path: Callable[[str], str] | None = None,
|
|
99
|
+
newline: str | None = None,
|
|
100
|
+
) -> str:
|
|
101
|
+
r"""Read file text from either storage service or local filesystem.
|
|
102
|
+
|
|
103
|
+
Args:
|
|
104
|
+
file_path: Path to the file (storage service path or local path)
|
|
105
|
+
encoding: Text encoding to use
|
|
106
|
+
storage_service: Optional storage service instance
|
|
107
|
+
resolve_path: Optional function to resolve relative paths to absolute paths
|
|
108
|
+
(typically Component.resolve_path). Only used for local storage.
|
|
109
|
+
newline: Newline mode (None for default, "" for universal newlines like CSV).
|
|
110
|
+
When set to "", normalizes all line endings to \\n for consistency.
|
|
111
|
+
|
|
112
|
+
Returns:
|
|
113
|
+
str: The file content as text
|
|
114
|
+
|
|
115
|
+
Raises:
|
|
116
|
+
FileNotFoundError: If the file doesn't exist
|
|
117
|
+
"""
|
|
118
|
+
settings = get_settings_service().settings
|
|
119
|
+
|
|
120
|
+
if settings.storage_type == "s3":
|
|
121
|
+
content = await read_file_bytes(file_path, storage_service, resolve_path)
|
|
122
|
+
text = content.decode(encoding)
|
|
123
|
+
# Normalize newlines for S3 when newline="" is specified (universal newline mode)
|
|
124
|
+
if newline == "":
|
|
125
|
+
# Convert all line endings to \n (matches Python's universal newline mode)
|
|
126
|
+
text = text.replace("\r\n", "\n").replace("\r", "\n")
|
|
127
|
+
return text
|
|
128
|
+
# For local storage, resolve path if resolver provided
|
|
129
|
+
if resolve_path:
|
|
130
|
+
file_path = resolve_path(file_path)
|
|
131
|
+
|
|
132
|
+
path_obj = Path(file_path)
|
|
133
|
+
if newline is not None:
|
|
134
|
+
with path_obj.open(newline=newline, encoding=encoding) as f: # noqa: ASYNC230
|
|
135
|
+
return f.read()
|
|
136
|
+
return path_obj.read_text(encoding=encoding)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def get_file_size(file_path: str, storage_service: StorageService | None = None) -> int:
|
|
140
|
+
"""Get file size from either storage service or local filesystem.
|
|
141
|
+
|
|
142
|
+
Note: This is a sync wrapper - for async code, use the storage service directly.
|
|
143
|
+
|
|
144
|
+
Args:
|
|
145
|
+
file_path: Path to the file (S3 key format "flow_id/filename" or absolute local path)
|
|
146
|
+
storage_service: Optional storage service instance
|
|
147
|
+
|
|
148
|
+
Returns:
|
|
149
|
+
int: File size in bytes
|
|
150
|
+
|
|
151
|
+
Raises:
|
|
152
|
+
FileNotFoundError: If the file doesn't exist
|
|
153
|
+
"""
|
|
154
|
+
settings = get_settings_service().settings
|
|
155
|
+
|
|
156
|
+
if settings.storage_type == "s3":
|
|
157
|
+
parsed = parse_storage_path(file_path)
|
|
158
|
+
if not parsed:
|
|
159
|
+
msg = f"Invalid S3 path format: {file_path}. Expected 'flow_id/filename'"
|
|
160
|
+
raise ValueError(msg)
|
|
161
|
+
|
|
162
|
+
if storage_service is None:
|
|
163
|
+
storage_service = get_storage_service()
|
|
164
|
+
|
|
165
|
+
flow_id, filename = parsed
|
|
166
|
+
return run_until_complete(storage_service.get_file_size(flow_id, filename))
|
|
167
|
+
|
|
168
|
+
# Local file system
|
|
169
|
+
path_obj = Path(file_path)
|
|
170
|
+
if not path_obj.exists():
|
|
171
|
+
msg = f"File not found: {file_path}"
|
|
172
|
+
raise FileNotFoundError(msg)
|
|
173
|
+
|
|
174
|
+
return path_obj.stat().st_size
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def file_exists(file_path: str, storage_service: StorageService | None = None) -> bool:
|
|
178
|
+
"""Check if a file exists in either storage service or local filesystem.
|
|
179
|
+
|
|
180
|
+
Args:
|
|
181
|
+
file_path: Path to the file (S3 key format "flow_id/filename" or absolute local path)
|
|
182
|
+
storage_service: Optional storage service instance
|
|
183
|
+
|
|
184
|
+
Returns:
|
|
185
|
+
bool: True if the file exists
|
|
186
|
+
"""
|
|
187
|
+
try:
|
|
188
|
+
get_file_size(file_path, storage_service)
|
|
189
|
+
except (FileNotFoundError, ValueError):
|
|
190
|
+
return False
|
|
191
|
+
else:
|
|
192
|
+
return True
|
lfx/base/data/utils.py
CHANGED
|
@@ -1,14 +1,21 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
import tempfile
|
|
1
3
|
import unicodedata
|
|
2
4
|
from collections.abc import Callable
|
|
3
5
|
from concurrent import futures
|
|
6
|
+
from io import BytesIO
|
|
4
7
|
from pathlib import Path
|
|
5
8
|
|
|
6
9
|
import chardet
|
|
7
10
|
import orjson
|
|
8
11
|
import yaml
|
|
9
12
|
from defusedxml import ElementTree
|
|
13
|
+
from pypdf import PdfReader
|
|
10
14
|
|
|
15
|
+
from lfx.base.data.storage_utils import read_file_bytes
|
|
11
16
|
from lfx.schema.data import Data
|
|
17
|
+
from lfx.services.deps import get_settings_service
|
|
18
|
+
from lfx.utils.async_helpers import run_until_complete
|
|
12
19
|
|
|
13
20
|
# Types of files that can be read simply by file.read()
|
|
14
21
|
# and have 100% to be completely readable
|
|
@@ -36,6 +43,34 @@ TEXT_FILE_TYPES = [
|
|
|
36
43
|
IMG_FILE_TYPES = ["jpg", "jpeg", "png", "bmp", "image"]
|
|
37
44
|
|
|
38
45
|
|
|
46
|
+
def parse_structured_text(text: str, file_path: str) -> str | dict | list:
|
|
47
|
+
"""Parse structured text formats (JSON, YAML, XML) and normalize text.
|
|
48
|
+
|
|
49
|
+
Args:
|
|
50
|
+
text: The text content to parse
|
|
51
|
+
file_path: The file path (used to determine format)
|
|
52
|
+
|
|
53
|
+
Returns:
|
|
54
|
+
Parsed content (dict/list for JSON, dict for YAML, str for XML)
|
|
55
|
+
"""
|
|
56
|
+
if file_path.endswith(".json"):
|
|
57
|
+
loaded_json = orjson.loads(text)
|
|
58
|
+
if isinstance(loaded_json, dict):
|
|
59
|
+
loaded_json = {k: normalize_text(v) if isinstance(v, str) else v for k, v in loaded_json.items()}
|
|
60
|
+
elif isinstance(loaded_json, list):
|
|
61
|
+
loaded_json = [normalize_text(item) if isinstance(item, str) else item for item in loaded_json]
|
|
62
|
+
return orjson.dumps(loaded_json).decode("utf-8")
|
|
63
|
+
|
|
64
|
+
if file_path.endswith((".yaml", ".yml")):
|
|
65
|
+
return yaml.safe_load(text)
|
|
66
|
+
|
|
67
|
+
if file_path.endswith(".xml"):
|
|
68
|
+
xml_element = ElementTree.fromstring(text)
|
|
69
|
+
return ElementTree.tostring(xml_element, encoding="unicode")
|
|
70
|
+
|
|
71
|
+
return text
|
|
72
|
+
|
|
73
|
+
|
|
39
74
|
def normalize_text(text):
|
|
40
75
|
return unicodedata.normalize("NFKD", text)
|
|
41
76
|
|
|
@@ -109,6 +144,14 @@ def partition_file_to_data(file_path: str, *, silent_errors: bool) -> Data | Non
|
|
|
109
144
|
|
|
110
145
|
|
|
111
146
|
def read_text_file(file_path: str) -> str:
|
|
147
|
+
"""Read a text file with automatic encoding detection.
|
|
148
|
+
|
|
149
|
+
Args:
|
|
150
|
+
file_path: Path to the file (local path only, not storage service path)
|
|
151
|
+
|
|
152
|
+
Returns:
|
|
153
|
+
str: The file content as text
|
|
154
|
+
"""
|
|
112
155
|
file_path_ = Path(file_path)
|
|
113
156
|
raw_data = file_path_.read_bytes()
|
|
114
157
|
result = chardet.detect(raw_data)
|
|
@@ -120,13 +163,90 @@ def read_text_file(file_path: str) -> str:
|
|
|
120
163
|
return file_path_.read_text(encoding=encoding)
|
|
121
164
|
|
|
122
165
|
|
|
166
|
+
async def read_text_file_async(file_path: str) -> str:
    """Read a text file with automatic encoding detection (async, storage-aware).

    Args:
        file_path: Path to the file (S3 key format "flow_id/filename" or local path)

    Returns:
        str: The file content as text
    """
    from .storage_utils import read_file_bytes

    # Fetch the raw bytes through the storage-aware helper (works for
    # both local paths and storage-service keys).
    raw_data = await read_file_bytes(file_path)

    # Guess the encoding from the byte content.
    detected = chardet.detect(raw_data).get("encoding")

    # Fall back to utf-8 when detection fails (e.g. binary content) or
    # yields one of the encodings chardet commonly misreports for text.
    if not detected or detected in {"Windows-1252", "Windows-1254", "MacRoman"}:
        detected = "utf-8"

    return raw_data.decode(detected, errors="replace")
|
|
189
|
+
|
|
190
|
+
|
|
123
191
|
def read_docx_file(file_path: str) -> str:
    """Read a DOCX file and extract text.

    Note: python-docx requires a file path, so this only works with local files.
    For storage service files, use read_docx_file_async which downloads to temp.

    Args:
        file_path: Path to the DOCX file (local path only)

    Returns:
        str: Extracted text from the document
    """
    from docx import Document

    doc = Document(file_path)
    # Join paragraph texts with a blank line between paragraphs.
    return "\n\n".join([p.text for p in doc.paragraphs])
|
|
128
207
|
|
|
129
208
|
|
|
209
|
+
async def read_docx_file_async(file_path: str) -> str:
    """Read a DOCX file and extract text (async, storage-aware).

    For S3 storage, downloads to temp file (python-docx requires file path).
    For local storage, reads directly.

    Args:
        file_path: Path to the DOCX file (S3 key format "flow_id/filename" or local path)

    Returns:
        str: Extracted text from the document
    """
    from docx import Document

    from .storage_utils import read_file_bytes

    def _paragraphs_text(path) -> str:
        # Paragraph texts joined with a blank line between paragraphs.
        return "\n\n".join([p.text for p in Document(path).paragraphs])

    settings = get_settings_service().settings

    if settings.storage_type == "local":
        # Local storage: python-docx can open the path directly.
        return _paragraphs_text(file_path)

    # S3 storage: python-docx needs a real file path (no BytesIO support),
    # so spill the downloaded bytes to a temporary file and parse that.
    content = await read_file_bytes(file_path)

    # Preserve the original extension so downstream tooling sees ".docx".
    suffix = Path(file_path.split("/")[-1]).suffix
    with tempfile.NamedTemporaryFile(mode="wb", suffix=suffix, delete=False) as tmp_file:
        tmp_file.write(content)
        temp_path = tmp_file.name

    try:
        return _paragraphs_text(temp_path)
    finally:
        # Best-effort cleanup of the temp file; ignore removal failures.
        with contextlib.suppress(Exception):
            Path(temp_path).unlink()
|
|
248
|
+
|
|
249
|
+
|
|
130
250
|
def parse_pdf_to_text(file_path: str) -> str:
|
|
131
251
|
from pypdf import PdfReader
|
|
132
252
|
|
|
@@ -134,7 +254,35 @@ def parse_pdf_to_text(file_path: str) -> str:
|
|
|
134
254
|
return "\n\n".join([page.extract_text() for page in reader.pages])
|
|
135
255
|
|
|
136
256
|
|
|
257
|
+
async def parse_pdf_to_text_async(file_path: str) -> str:
    """Parse a PDF file to extract text (async, storage-aware).

    Uses storage-aware file reading to support both local and S3 storage.

    Args:
        file_path: Path to the PDF file (S3 key format "flow_id/filename" or local path)

    Returns:
        str: Extracted text from all pages
    """
    # Import locally, mirroring the sync parse_pdf_to_text and the other
    # async readers in this module: neither PdfReader nor read_file_bytes
    # is guaranteed to be a module-level name here, so referencing them
    # without these imports would raise NameError at call time.
    from io import BytesIO

    from pypdf import PdfReader

    from .storage_utils import read_file_bytes

    content = await read_file_bytes(file_path)
    with BytesIO(content) as buffer:
        # NOTE(review): PdfReader is not a context manager in all pypdf
        # releases, so only the BytesIO buffer is managed by `with` here.
        reader = PdfReader(buffer)
        return "\n\n".join([page.extract_text() for page in reader.pages])
|
|
271
|
+
|
|
272
|
+
|
|
137
273
|
def parse_text_file_to_data(file_path: str, *, silent_errors: bool) -> Data | None:
|
|
274
|
+
"""Parse a text file to Data (sync version).
|
|
275
|
+
|
|
276
|
+
For S3 storage, this will use async operations to fetch the file.
|
|
277
|
+
For local storage, reads directly from filesystem.
|
|
278
|
+
"""
|
|
279
|
+
settings = get_settings_service().settings
|
|
280
|
+
|
|
281
|
+
# If using S3 storage, we need to use async operations
|
|
282
|
+
if settings.storage_type == "s3":
|
|
283
|
+
# Run the async version safely (handles existing event loops)
|
|
284
|
+
return run_until_complete(parse_text_file_to_data_async(file_path, silent_errors=silent_errors))
|
|
285
|
+
|
|
138
286
|
try:
|
|
139
287
|
if file_path.endswith(".pdf"):
|
|
140
288
|
text = parse_pdf_to_text(file_path)
|
|
@@ -143,20 +291,7 @@ def parse_text_file_to_data(file_path: str, *, silent_errors: bool) -> Data | No
|
|
|
143
291
|
else:
|
|
144
292
|
text = read_text_file(file_path)
|
|
145
293
|
|
|
146
|
-
|
|
147
|
-
if file_path.endswith(".json"):
|
|
148
|
-
loaded_json = orjson.loads(text)
|
|
149
|
-
if isinstance(loaded_json, dict):
|
|
150
|
-
loaded_json = {k: normalize_text(v) if isinstance(v, str) else v for k, v in loaded_json.items()}
|
|
151
|
-
elif isinstance(loaded_json, list):
|
|
152
|
-
loaded_json = [normalize_text(item) if isinstance(item, str) else item for item in loaded_json]
|
|
153
|
-
text = orjson.dumps(loaded_json).decode("utf-8")
|
|
154
|
-
|
|
155
|
-
elif file_path.endswith((".yaml", ".yml")):
|
|
156
|
-
text = yaml.safe_load(text)
|
|
157
|
-
elif file_path.endswith(".xml"):
|
|
158
|
-
xml_element = ElementTree.fromstring(text)
|
|
159
|
-
text = ElementTree.tostring(xml_element, encoding="unicode")
|
|
294
|
+
text = parse_structured_text(text, file_path)
|
|
160
295
|
except Exception as e:
|
|
161
296
|
if not silent_errors:
|
|
162
297
|
msg = f"Error loading file {file_path}: {e}"
|
|
@@ -166,6 +301,35 @@ def parse_text_file_to_data(file_path: str, *, silent_errors: bool) -> Data | No
|
|
|
166
301
|
return Data(data={"file_path": file_path, "text": text})
|
|
167
302
|
|
|
168
303
|
|
|
304
|
+
async def parse_text_file_to_data_async(file_path: str, *, silent_errors: bool) -> Data | None:
    """Parse a text file to Data (async version, supports storage service).

    This version properly handles storage service files:
    - For text/JSON/YAML/XML: reads bytes directly (no temp file)
    - For PDF: reads bytes directly via BytesIO (no temp file)
    - For DOCX: downloads to temp file (python-docx requires file path)
    """
    try:
        # Pick the extraction strategy from the file extension.
        if file_path.endswith(".pdf"):
            extractor = parse_pdf_to_text_async
        elif file_path.endswith(".docx"):
            extractor = read_docx_file_async
        else:
            # Plain text and structured text formats: read bytes directly.
            extractor = read_text_file_async

        raw_text = await extractor(file_path)

        # Normalize structured formats (JSON, YAML, XML) into text.
        parsed_text = parse_structured_text(raw_text, file_path)

        return Data(data={"file_path": file_path, "text": parsed_text})

    except Exception as e:
        if not silent_errors:
            msg = f"Error loading file {file_path}: {e}"
            raise ValueError(msg) from e
    return None
|
|
331
|
+
|
|
332
|
+
|
|
169
333
|
# ! Removing unstructured dependency until
|
|
170
334
|
# ! 3.12 is supported
|
|
171
335
|
# def get_elements(
|