lfx-nightly 0.2.0.dev25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lfx-nightly might be problematic. Click here for more details.
- lfx/__init__.py +0 -0
- lfx/__main__.py +25 -0
- lfx/_assets/component_index.json +1 -0
- lfx/base/__init__.py +0 -0
- lfx/base/agents/__init__.py +0 -0
- lfx/base/agents/agent.py +375 -0
- lfx/base/agents/altk_base_agent.py +380 -0
- lfx/base/agents/altk_tool_wrappers.py +565 -0
- lfx/base/agents/callback.py +130 -0
- lfx/base/agents/context.py +109 -0
- lfx/base/agents/crewai/__init__.py +0 -0
- lfx/base/agents/crewai/crew.py +231 -0
- lfx/base/agents/crewai/tasks.py +12 -0
- lfx/base/agents/default_prompts.py +23 -0
- lfx/base/agents/errors.py +15 -0
- lfx/base/agents/events.py +430 -0
- lfx/base/agents/utils.py +237 -0
- lfx/base/astra_assistants/__init__.py +0 -0
- lfx/base/astra_assistants/util.py +171 -0
- lfx/base/chains/__init__.py +0 -0
- lfx/base/chains/model.py +19 -0
- lfx/base/composio/__init__.py +0 -0
- lfx/base/composio/composio_base.py +2584 -0
- lfx/base/compressors/__init__.py +0 -0
- lfx/base/compressors/model.py +60 -0
- lfx/base/constants.py +46 -0
- lfx/base/curl/__init__.py +0 -0
- lfx/base/curl/parse.py +188 -0
- lfx/base/data/__init__.py +5 -0
- lfx/base/data/base_file.py +810 -0
- lfx/base/data/docling_utils.py +338 -0
- lfx/base/data/storage_utils.py +192 -0
- lfx/base/data/utils.py +362 -0
- lfx/base/datastax/__init__.py +5 -0
- lfx/base/datastax/astradb_base.py +896 -0
- lfx/base/document_transformers/__init__.py +0 -0
- lfx/base/document_transformers/model.py +43 -0
- lfx/base/embeddings/__init__.py +0 -0
- lfx/base/embeddings/aiml_embeddings.py +62 -0
- lfx/base/embeddings/embeddings_class.py +113 -0
- lfx/base/embeddings/model.py +26 -0
- lfx/base/flow_processing/__init__.py +0 -0
- lfx/base/flow_processing/utils.py +86 -0
- lfx/base/huggingface/__init__.py +0 -0
- lfx/base/huggingface/model_bridge.py +133 -0
- lfx/base/io/__init__.py +0 -0
- lfx/base/io/chat.py +21 -0
- lfx/base/io/text.py +22 -0
- lfx/base/knowledge_bases/__init__.py +3 -0
- lfx/base/knowledge_bases/knowledge_base_utils.py +137 -0
- lfx/base/langchain_utilities/__init__.py +0 -0
- lfx/base/langchain_utilities/model.py +35 -0
- lfx/base/langchain_utilities/spider_constants.py +1 -0
- lfx/base/langwatch/__init__.py +0 -0
- lfx/base/langwatch/utils.py +18 -0
- lfx/base/mcp/__init__.py +0 -0
- lfx/base/mcp/constants.py +2 -0
- lfx/base/mcp/util.py +1659 -0
- lfx/base/memory/__init__.py +0 -0
- lfx/base/memory/memory.py +49 -0
- lfx/base/memory/model.py +38 -0
- lfx/base/models/__init__.py +3 -0
- lfx/base/models/aiml_constants.py +51 -0
- lfx/base/models/anthropic_constants.py +51 -0
- lfx/base/models/aws_constants.py +151 -0
- lfx/base/models/chat_result.py +76 -0
- lfx/base/models/cometapi_constants.py +54 -0
- lfx/base/models/google_generative_ai_constants.py +70 -0
- lfx/base/models/google_generative_ai_model.py +38 -0
- lfx/base/models/groq_constants.py +150 -0
- lfx/base/models/groq_model_discovery.py +265 -0
- lfx/base/models/model.py +375 -0
- lfx/base/models/model_input_constants.py +378 -0
- lfx/base/models/model_metadata.py +41 -0
- lfx/base/models/model_utils.py +108 -0
- lfx/base/models/novita_constants.py +35 -0
- lfx/base/models/ollama_constants.py +52 -0
- lfx/base/models/openai_constants.py +129 -0
- lfx/base/models/sambanova_constants.py +18 -0
- lfx/base/models/watsonx_constants.py +36 -0
- lfx/base/processing/__init__.py +0 -0
- lfx/base/prompts/__init__.py +0 -0
- lfx/base/prompts/api_utils.py +224 -0
- lfx/base/prompts/utils.py +61 -0
- lfx/base/textsplitters/__init__.py +0 -0
- lfx/base/textsplitters/model.py +28 -0
- lfx/base/tools/__init__.py +0 -0
- lfx/base/tools/base.py +26 -0
- lfx/base/tools/component_tool.py +325 -0
- lfx/base/tools/constants.py +49 -0
- lfx/base/tools/flow_tool.py +132 -0
- lfx/base/tools/run_flow.py +698 -0
- lfx/base/vectorstores/__init__.py +0 -0
- lfx/base/vectorstores/model.py +193 -0
- lfx/base/vectorstores/utils.py +22 -0
- lfx/base/vectorstores/vector_store_connection_decorator.py +52 -0
- lfx/cli/__init__.py +5 -0
- lfx/cli/commands.py +327 -0
- lfx/cli/common.py +650 -0
- lfx/cli/run.py +506 -0
- lfx/cli/script_loader.py +289 -0
- lfx/cli/serve_app.py +546 -0
- lfx/cli/validation.py +69 -0
- lfx/components/FAISS/__init__.py +34 -0
- lfx/components/FAISS/faiss.py +111 -0
- lfx/components/Notion/__init__.py +19 -0
- lfx/components/Notion/add_content_to_page.py +269 -0
- lfx/components/Notion/create_page.py +94 -0
- lfx/components/Notion/list_database_properties.py +68 -0
- lfx/components/Notion/list_pages.py +122 -0
- lfx/components/Notion/list_users.py +77 -0
- lfx/components/Notion/page_content_viewer.py +93 -0
- lfx/components/Notion/search.py +111 -0
- lfx/components/Notion/update_page_property.py +114 -0
- lfx/components/__init__.py +428 -0
- lfx/components/_importing.py +42 -0
- lfx/components/agentql/__init__.py +3 -0
- lfx/components/agentql/agentql_api.py +151 -0
- lfx/components/aiml/__init__.py +37 -0
- lfx/components/aiml/aiml.py +115 -0
- lfx/components/aiml/aiml_embeddings.py +37 -0
- lfx/components/altk/__init__.py +34 -0
- lfx/components/altk/altk_agent.py +193 -0
- lfx/components/amazon/__init__.py +36 -0
- lfx/components/amazon/amazon_bedrock_converse.py +195 -0
- lfx/components/amazon/amazon_bedrock_embedding.py +109 -0
- lfx/components/amazon/amazon_bedrock_model.py +130 -0
- lfx/components/amazon/s3_bucket_uploader.py +211 -0
- lfx/components/anthropic/__init__.py +34 -0
- lfx/components/anthropic/anthropic.py +187 -0
- lfx/components/apify/__init__.py +5 -0
- lfx/components/apify/apify_actor.py +325 -0
- lfx/components/arxiv/__init__.py +3 -0
- lfx/components/arxiv/arxiv.py +169 -0
- lfx/components/assemblyai/__init__.py +46 -0
- lfx/components/assemblyai/assemblyai_get_subtitles.py +83 -0
- lfx/components/assemblyai/assemblyai_lemur.py +183 -0
- lfx/components/assemblyai/assemblyai_list_transcripts.py +95 -0
- lfx/components/assemblyai/assemblyai_poll_transcript.py +72 -0
- lfx/components/assemblyai/assemblyai_start_transcript.py +188 -0
- lfx/components/azure/__init__.py +37 -0
- lfx/components/azure/azure_openai.py +95 -0
- lfx/components/azure/azure_openai_embeddings.py +83 -0
- lfx/components/baidu/__init__.py +32 -0
- lfx/components/baidu/baidu_qianfan_chat.py +113 -0
- lfx/components/bing/__init__.py +3 -0
- lfx/components/bing/bing_search_api.py +61 -0
- lfx/components/cassandra/__init__.py +40 -0
- lfx/components/cassandra/cassandra.py +264 -0
- lfx/components/cassandra/cassandra_chat.py +92 -0
- lfx/components/cassandra/cassandra_graph.py +238 -0
- lfx/components/chains/__init__.py +3 -0
- lfx/components/chroma/__init__.py +34 -0
- lfx/components/chroma/chroma.py +169 -0
- lfx/components/cleanlab/__init__.py +40 -0
- lfx/components/cleanlab/cleanlab_evaluator.py +155 -0
- lfx/components/cleanlab/cleanlab_rag_evaluator.py +254 -0
- lfx/components/cleanlab/cleanlab_remediator.py +131 -0
- lfx/components/clickhouse/__init__.py +34 -0
- lfx/components/clickhouse/clickhouse.py +135 -0
- lfx/components/cloudflare/__init__.py +32 -0
- lfx/components/cloudflare/cloudflare.py +81 -0
- lfx/components/cohere/__init__.py +40 -0
- lfx/components/cohere/cohere_embeddings.py +81 -0
- lfx/components/cohere/cohere_models.py +46 -0
- lfx/components/cohere/cohere_rerank.py +51 -0
- lfx/components/cometapi/__init__.py +32 -0
- lfx/components/cometapi/cometapi.py +166 -0
- lfx/components/composio/__init__.py +222 -0
- lfx/components/composio/agentql_composio.py +11 -0
- lfx/components/composio/agiled_composio.py +11 -0
- lfx/components/composio/airtable_composio.py +11 -0
- lfx/components/composio/apollo_composio.py +11 -0
- lfx/components/composio/asana_composio.py +11 -0
- lfx/components/composio/attio_composio.py +11 -0
- lfx/components/composio/bitbucket_composio.py +11 -0
- lfx/components/composio/bolna_composio.py +11 -0
- lfx/components/composio/brightdata_composio.py +11 -0
- lfx/components/composio/calendly_composio.py +11 -0
- lfx/components/composio/canva_composio.py +11 -0
- lfx/components/composio/canvas_composio.py +11 -0
- lfx/components/composio/coda_composio.py +11 -0
- lfx/components/composio/composio_api.py +278 -0
- lfx/components/composio/contentful_composio.py +11 -0
- lfx/components/composio/digicert_composio.py +11 -0
- lfx/components/composio/discord_composio.py +11 -0
- lfx/components/composio/dropbox_compnent.py +11 -0
- lfx/components/composio/elevenlabs_composio.py +11 -0
- lfx/components/composio/exa_composio.py +11 -0
- lfx/components/composio/figma_composio.py +11 -0
- lfx/components/composio/finage_composio.py +11 -0
- lfx/components/composio/firecrawl_composio.py +11 -0
- lfx/components/composio/fireflies_composio.py +11 -0
- lfx/components/composio/fixer_composio.py +11 -0
- lfx/components/composio/flexisign_composio.py +11 -0
- lfx/components/composio/freshdesk_composio.py +11 -0
- lfx/components/composio/github_composio.py +11 -0
- lfx/components/composio/gmail_composio.py +38 -0
- lfx/components/composio/googlebigquery_composio.py +11 -0
- lfx/components/composio/googlecalendar_composio.py +11 -0
- lfx/components/composio/googleclassroom_composio.py +11 -0
- lfx/components/composio/googledocs_composio.py +11 -0
- lfx/components/composio/googlemeet_composio.py +11 -0
- lfx/components/composio/googlesheets_composio.py +11 -0
- lfx/components/composio/googletasks_composio.py +8 -0
- lfx/components/composio/heygen_composio.py +11 -0
- lfx/components/composio/instagram_composio.py +11 -0
- lfx/components/composio/jira_composio.py +11 -0
- lfx/components/composio/jotform_composio.py +11 -0
- lfx/components/composio/klaviyo_composio.py +11 -0
- lfx/components/composio/linear_composio.py +11 -0
- lfx/components/composio/listennotes_composio.py +11 -0
- lfx/components/composio/mem0_composio.py +11 -0
- lfx/components/composio/miro_composio.py +11 -0
- lfx/components/composio/missive_composio.py +11 -0
- lfx/components/composio/notion_composio.py +11 -0
- lfx/components/composio/onedrive_composio.py +11 -0
- lfx/components/composio/outlook_composio.py +11 -0
- lfx/components/composio/pandadoc_composio.py +11 -0
- lfx/components/composio/peopledatalabs_composio.py +11 -0
- lfx/components/composio/perplexityai_composio.py +11 -0
- lfx/components/composio/reddit_composio.py +11 -0
- lfx/components/composio/serpapi_composio.py +11 -0
- lfx/components/composio/slack_composio.py +11 -0
- lfx/components/composio/slackbot_composio.py +11 -0
- lfx/components/composio/snowflake_composio.py +11 -0
- lfx/components/composio/supabase_composio.py +11 -0
- lfx/components/composio/tavily_composio.py +11 -0
- lfx/components/composio/timelinesai_composio.py +11 -0
- lfx/components/composio/todoist_composio.py +11 -0
- lfx/components/composio/wrike_composio.py +11 -0
- lfx/components/composio/youtube_composio.py +11 -0
- lfx/components/confluence/__init__.py +3 -0
- lfx/components/confluence/confluence.py +84 -0
- lfx/components/couchbase/__init__.py +34 -0
- lfx/components/couchbase/couchbase.py +102 -0
- lfx/components/crewai/__init__.py +49 -0
- lfx/components/crewai/crewai.py +108 -0
- lfx/components/crewai/hierarchical_crew.py +47 -0
- lfx/components/crewai/hierarchical_task.py +45 -0
- lfx/components/crewai/sequential_crew.py +53 -0
- lfx/components/crewai/sequential_task.py +74 -0
- lfx/components/crewai/sequential_task_agent.py +144 -0
- lfx/components/cuga/__init__.py +34 -0
- lfx/components/cuga/cuga_agent.py +730 -0
- lfx/components/custom_component/__init__.py +34 -0
- lfx/components/custom_component/custom_component.py +31 -0
- lfx/components/data/__init__.py +114 -0
- lfx/components/data_source/__init__.py +58 -0
- lfx/components/data_source/api_request.py +577 -0
- lfx/components/data_source/csv_to_data.py +101 -0
- lfx/components/data_source/json_to_data.py +106 -0
- lfx/components/data_source/mock_data.py +398 -0
- lfx/components/data_source/news_search.py +166 -0
- lfx/components/data_source/rss.py +71 -0
- lfx/components/data_source/sql_executor.py +101 -0
- lfx/components/data_source/url.py +311 -0
- lfx/components/data_source/web_search.py +326 -0
- lfx/components/datastax/__init__.py +76 -0
- lfx/components/datastax/astradb_assistant_manager.py +307 -0
- lfx/components/datastax/astradb_chatmemory.py +40 -0
- lfx/components/datastax/astradb_cql.py +288 -0
- lfx/components/datastax/astradb_graph.py +217 -0
- lfx/components/datastax/astradb_tool.py +378 -0
- lfx/components/datastax/astradb_vectorize.py +122 -0
- lfx/components/datastax/astradb_vectorstore.py +449 -0
- lfx/components/datastax/create_assistant.py +59 -0
- lfx/components/datastax/create_thread.py +33 -0
- lfx/components/datastax/dotenv.py +36 -0
- lfx/components/datastax/get_assistant.py +38 -0
- lfx/components/datastax/getenvvar.py +31 -0
- lfx/components/datastax/graph_rag.py +141 -0
- lfx/components/datastax/hcd.py +315 -0
- lfx/components/datastax/list_assistants.py +26 -0
- lfx/components/datastax/run.py +90 -0
- lfx/components/deactivated/__init__.py +15 -0
- lfx/components/deactivated/amazon_kendra.py +66 -0
- lfx/components/deactivated/chat_litellm_model.py +158 -0
- lfx/components/deactivated/code_block_extractor.py +26 -0
- lfx/components/deactivated/documents_to_data.py +22 -0
- lfx/components/deactivated/embed.py +16 -0
- lfx/components/deactivated/extract_key_from_data.py +46 -0
- lfx/components/deactivated/json_document_builder.py +57 -0
- lfx/components/deactivated/list_flows.py +20 -0
- lfx/components/deactivated/mcp_sse.py +61 -0
- lfx/components/deactivated/mcp_stdio.py +62 -0
- lfx/components/deactivated/merge_data.py +93 -0
- lfx/components/deactivated/message.py +37 -0
- lfx/components/deactivated/metal.py +54 -0
- lfx/components/deactivated/multi_query.py +59 -0
- lfx/components/deactivated/retriever.py +43 -0
- lfx/components/deactivated/selective_passthrough.py +77 -0
- lfx/components/deactivated/should_run_next.py +40 -0
- lfx/components/deactivated/split_text.py +63 -0
- lfx/components/deactivated/store_message.py +24 -0
- lfx/components/deactivated/sub_flow.py +124 -0
- lfx/components/deactivated/vectara_self_query.py +76 -0
- lfx/components/deactivated/vector_store.py +24 -0
- lfx/components/deepseek/__init__.py +34 -0
- lfx/components/deepseek/deepseek.py +136 -0
- lfx/components/docling/__init__.py +43 -0
- lfx/components/docling/chunk_docling_document.py +186 -0
- lfx/components/docling/docling_inline.py +238 -0
- lfx/components/docling/docling_remote.py +195 -0
- lfx/components/docling/export_docling_document.py +117 -0
- lfx/components/documentloaders/__init__.py +3 -0
- lfx/components/duckduckgo/__init__.py +3 -0
- lfx/components/duckduckgo/duck_duck_go_search_run.py +92 -0
- lfx/components/elastic/__init__.py +37 -0
- lfx/components/elastic/elasticsearch.py +267 -0
- lfx/components/elastic/opensearch.py +789 -0
- lfx/components/elastic/opensearch_multimodal.py +1575 -0
- lfx/components/embeddings/__init__.py +37 -0
- lfx/components/embeddings/similarity.py +77 -0
- lfx/components/embeddings/text_embedder.py +65 -0
- lfx/components/exa/__init__.py +3 -0
- lfx/components/exa/exa_search.py +68 -0
- lfx/components/files_and_knowledge/__init__.py +47 -0
- lfx/components/files_and_knowledge/directory.py +113 -0
- lfx/components/files_and_knowledge/file.py +841 -0
- lfx/components/files_and_knowledge/ingestion.py +694 -0
- lfx/components/files_and_knowledge/retrieval.py +264 -0
- lfx/components/files_and_knowledge/save_file.py +746 -0
- lfx/components/firecrawl/__init__.py +43 -0
- lfx/components/firecrawl/firecrawl_crawl_api.py +88 -0
- lfx/components/firecrawl/firecrawl_extract_api.py +136 -0
- lfx/components/firecrawl/firecrawl_map_api.py +89 -0
- lfx/components/firecrawl/firecrawl_scrape_api.py +73 -0
- lfx/components/flow_controls/__init__.py +58 -0
- lfx/components/flow_controls/conditional_router.py +208 -0
- lfx/components/flow_controls/data_conditional_router.py +126 -0
- lfx/components/flow_controls/flow_tool.py +111 -0
- lfx/components/flow_controls/listen.py +29 -0
- lfx/components/flow_controls/loop.py +163 -0
- lfx/components/flow_controls/notify.py +88 -0
- lfx/components/flow_controls/pass_message.py +36 -0
- lfx/components/flow_controls/run_flow.py +108 -0
- lfx/components/flow_controls/sub_flow.py +115 -0
- lfx/components/git/__init__.py +4 -0
- lfx/components/git/git.py +262 -0
- lfx/components/git/gitextractor.py +196 -0
- lfx/components/glean/__init__.py +3 -0
- lfx/components/glean/glean_search_api.py +173 -0
- lfx/components/google/__init__.py +17 -0
- lfx/components/google/gmail.py +193 -0
- lfx/components/google/google_bq_sql_executor.py +157 -0
- lfx/components/google/google_drive.py +92 -0
- lfx/components/google/google_drive_search.py +152 -0
- lfx/components/google/google_generative_ai.py +144 -0
- lfx/components/google/google_generative_ai_embeddings.py +141 -0
- lfx/components/google/google_oauth_token.py +89 -0
- lfx/components/google/google_search_api_core.py +68 -0
- lfx/components/google/google_serper_api_core.py +74 -0
- lfx/components/groq/__init__.py +34 -0
- lfx/components/groq/groq.py +143 -0
- lfx/components/helpers/__init__.py +154 -0
- lfx/components/homeassistant/__init__.py +7 -0
- lfx/components/homeassistant/home_assistant_control.py +152 -0
- lfx/components/homeassistant/list_home_assistant_states.py +137 -0
- lfx/components/huggingface/__init__.py +37 -0
- lfx/components/huggingface/huggingface.py +199 -0
- lfx/components/huggingface/huggingface_inference_api.py +106 -0
- lfx/components/ibm/__init__.py +34 -0
- lfx/components/ibm/watsonx.py +207 -0
- lfx/components/ibm/watsonx_embeddings.py +135 -0
- lfx/components/icosacomputing/__init__.py +5 -0
- lfx/components/icosacomputing/combinatorial_reasoner.py +84 -0
- lfx/components/input_output/__init__.py +40 -0
- lfx/components/input_output/chat.py +109 -0
- lfx/components/input_output/chat_output.py +184 -0
- lfx/components/input_output/text.py +27 -0
- lfx/components/input_output/text_output.py +29 -0
- lfx/components/input_output/webhook.py +56 -0
- lfx/components/jigsawstack/__init__.py +23 -0
- lfx/components/jigsawstack/ai_scrape.py +126 -0
- lfx/components/jigsawstack/ai_web_search.py +136 -0
- lfx/components/jigsawstack/file_read.py +115 -0
- lfx/components/jigsawstack/file_upload.py +94 -0
- lfx/components/jigsawstack/image_generation.py +205 -0
- lfx/components/jigsawstack/nsfw.py +60 -0
- lfx/components/jigsawstack/object_detection.py +124 -0
- lfx/components/jigsawstack/sentiment.py +112 -0
- lfx/components/jigsawstack/text_to_sql.py +90 -0
- lfx/components/jigsawstack/text_translate.py +77 -0
- lfx/components/jigsawstack/vocr.py +107 -0
- lfx/components/knowledge_bases/__init__.py +89 -0
- lfx/components/langchain_utilities/__init__.py +109 -0
- lfx/components/langchain_utilities/character.py +53 -0
- lfx/components/langchain_utilities/conversation.py +59 -0
- lfx/components/langchain_utilities/csv_agent.py +175 -0
- lfx/components/langchain_utilities/fake_embeddings.py +26 -0
- lfx/components/langchain_utilities/html_link_extractor.py +35 -0
- lfx/components/langchain_utilities/json_agent.py +100 -0
- lfx/components/langchain_utilities/langchain_hub.py +126 -0
- lfx/components/langchain_utilities/language_recursive.py +49 -0
- lfx/components/langchain_utilities/language_semantic.py +138 -0
- lfx/components/langchain_utilities/llm_checker.py +39 -0
- lfx/components/langchain_utilities/llm_math.py +42 -0
- lfx/components/langchain_utilities/natural_language.py +61 -0
- lfx/components/langchain_utilities/openai_tools.py +53 -0
- lfx/components/langchain_utilities/openapi.py +48 -0
- lfx/components/langchain_utilities/recursive_character.py +60 -0
- lfx/components/langchain_utilities/retrieval_qa.py +83 -0
- lfx/components/langchain_utilities/runnable_executor.py +137 -0
- lfx/components/langchain_utilities/self_query.py +80 -0
- lfx/components/langchain_utilities/spider.py +142 -0
- lfx/components/langchain_utilities/sql.py +40 -0
- lfx/components/langchain_utilities/sql_database.py +35 -0
- lfx/components/langchain_utilities/sql_generator.py +78 -0
- lfx/components/langchain_utilities/tool_calling.py +59 -0
- lfx/components/langchain_utilities/vector_store_info.py +49 -0
- lfx/components/langchain_utilities/vector_store_router.py +33 -0
- lfx/components/langchain_utilities/xml_agent.py +71 -0
- lfx/components/langwatch/__init__.py +3 -0
- lfx/components/langwatch/langwatch.py +278 -0
- lfx/components/link_extractors/__init__.py +3 -0
- lfx/components/llm_operations/__init__.py +46 -0
- lfx/components/llm_operations/batch_run.py +205 -0
- lfx/components/llm_operations/lambda_filter.py +218 -0
- lfx/components/llm_operations/llm_conditional_router.py +421 -0
- lfx/components/llm_operations/llm_selector.py +499 -0
- lfx/components/llm_operations/structured_output.py +244 -0
- lfx/components/lmstudio/__init__.py +34 -0
- lfx/components/lmstudio/lmstudioembeddings.py +89 -0
- lfx/components/lmstudio/lmstudiomodel.py +133 -0
- lfx/components/logic/__init__.py +181 -0
- lfx/components/maritalk/__init__.py +32 -0
- lfx/components/maritalk/maritalk.py +52 -0
- lfx/components/mem0/__init__.py +3 -0
- lfx/components/mem0/mem0_chat_memory.py +147 -0
- lfx/components/milvus/__init__.py +34 -0
- lfx/components/milvus/milvus.py +115 -0
- lfx/components/mistral/__init__.py +37 -0
- lfx/components/mistral/mistral.py +114 -0
- lfx/components/mistral/mistral_embeddings.py +58 -0
- lfx/components/models/__init__.py +89 -0
- lfx/components/models_and_agents/__init__.py +49 -0
- lfx/components/models_and_agents/agent.py +644 -0
- lfx/components/models_and_agents/embedding_model.py +423 -0
- lfx/components/models_and_agents/language_model.py +398 -0
- lfx/components/models_and_agents/mcp_component.py +594 -0
- lfx/components/models_and_agents/memory.py +268 -0
- lfx/components/models_and_agents/prompt.py +67 -0
- lfx/components/mongodb/__init__.py +34 -0
- lfx/components/mongodb/mongodb_atlas.py +213 -0
- lfx/components/needle/__init__.py +3 -0
- lfx/components/needle/needle.py +104 -0
- lfx/components/notdiamond/__init__.py +34 -0
- lfx/components/notdiamond/notdiamond.py +228 -0
- lfx/components/novita/__init__.py +32 -0
- lfx/components/novita/novita.py +130 -0
- lfx/components/nvidia/__init__.py +57 -0
- lfx/components/nvidia/nvidia.py +151 -0
- lfx/components/nvidia/nvidia_embedding.py +77 -0
- lfx/components/nvidia/nvidia_ingest.py +317 -0
- lfx/components/nvidia/nvidia_rerank.py +63 -0
- lfx/components/nvidia/system_assist.py +65 -0
- lfx/components/olivya/__init__.py +3 -0
- lfx/components/olivya/olivya.py +116 -0
- lfx/components/ollama/__init__.py +37 -0
- lfx/components/ollama/ollama.py +548 -0
- lfx/components/ollama/ollama_embeddings.py +103 -0
- lfx/components/openai/__init__.py +37 -0
- lfx/components/openai/openai.py +100 -0
- lfx/components/openai/openai_chat_model.py +176 -0
- lfx/components/openrouter/__init__.py +32 -0
- lfx/components/openrouter/openrouter.py +104 -0
- lfx/components/output_parsers/__init__.py +3 -0
- lfx/components/perplexity/__init__.py +34 -0
- lfx/components/perplexity/perplexity.py +75 -0
- lfx/components/pgvector/__init__.py +34 -0
- lfx/components/pgvector/pgvector.py +72 -0
- lfx/components/pinecone/__init__.py +34 -0
- lfx/components/pinecone/pinecone.py +134 -0
- lfx/components/processing/__init__.py +72 -0
- lfx/components/processing/alter_metadata.py +109 -0
- lfx/components/processing/combine_text.py +40 -0
- lfx/components/processing/converter.py +248 -0
- lfx/components/processing/create_data.py +111 -0
- lfx/components/processing/create_list.py +40 -0
- lfx/components/processing/data_operations.py +528 -0
- lfx/components/processing/data_to_dataframe.py +71 -0
- lfx/components/processing/dataframe_operations.py +313 -0
- lfx/components/processing/dataframe_to_toolset.py +259 -0
- lfx/components/processing/dynamic_create_data.py +357 -0
- lfx/components/processing/extract_key.py +54 -0
- lfx/components/processing/filter_data.py +43 -0
- lfx/components/processing/filter_data_values.py +89 -0
- lfx/components/processing/json_cleaner.py +104 -0
- lfx/components/processing/merge_data.py +91 -0
- lfx/components/processing/message_to_data.py +37 -0
- lfx/components/processing/output_parser.py +46 -0
- lfx/components/processing/parse_data.py +71 -0
- lfx/components/processing/parse_dataframe.py +69 -0
- lfx/components/processing/parse_json_data.py +91 -0
- lfx/components/processing/parser.py +148 -0
- lfx/components/processing/regex.py +83 -0
- lfx/components/processing/select_data.py +49 -0
- lfx/components/processing/split_text.py +141 -0
- lfx/components/processing/store_message.py +91 -0
- lfx/components/processing/update_data.py +161 -0
- lfx/components/prototypes/__init__.py +35 -0
- lfx/components/prototypes/python_function.py +73 -0
- lfx/components/qdrant/__init__.py +34 -0
- lfx/components/qdrant/qdrant.py +109 -0
- lfx/components/redis/__init__.py +37 -0
- lfx/components/redis/redis.py +89 -0
- lfx/components/redis/redis_chat.py +43 -0
- lfx/components/sambanova/__init__.py +32 -0
- lfx/components/sambanova/sambanova.py +84 -0
- lfx/components/scrapegraph/__init__.py +40 -0
- lfx/components/scrapegraph/scrapegraph_markdownify_api.py +64 -0
- lfx/components/scrapegraph/scrapegraph_search_api.py +64 -0
- lfx/components/scrapegraph/scrapegraph_smart_scraper_api.py +71 -0
- lfx/components/searchapi/__init__.py +34 -0
- lfx/components/searchapi/search.py +79 -0
- lfx/components/serpapi/__init__.py +3 -0
- lfx/components/serpapi/serp.py +115 -0
- lfx/components/supabase/__init__.py +34 -0
- lfx/components/supabase/supabase.py +76 -0
- lfx/components/tavily/__init__.py +4 -0
- lfx/components/tavily/tavily_extract.py +117 -0
- lfx/components/tavily/tavily_search.py +212 -0
- lfx/components/textsplitters/__init__.py +3 -0
- lfx/components/toolkits/__init__.py +3 -0
- lfx/components/tools/__init__.py +66 -0
- lfx/components/tools/calculator.py +109 -0
- lfx/components/tools/google_search_api.py +45 -0
- lfx/components/tools/google_serper_api.py +115 -0
- lfx/components/tools/python_code_structured_tool.py +328 -0
- lfx/components/tools/python_repl.py +98 -0
- lfx/components/tools/search_api.py +88 -0
- lfx/components/tools/searxng.py +145 -0
- lfx/components/tools/serp_api.py +120 -0
- lfx/components/tools/tavily_search_tool.py +345 -0
- lfx/components/tools/wikidata_api.py +103 -0
- lfx/components/tools/wikipedia_api.py +50 -0
- lfx/components/tools/yahoo_finance.py +130 -0
- lfx/components/twelvelabs/__init__.py +52 -0
- lfx/components/twelvelabs/convert_astra_results.py +84 -0
- lfx/components/twelvelabs/pegasus_index.py +311 -0
- lfx/components/twelvelabs/split_video.py +301 -0
- lfx/components/twelvelabs/text_embeddings.py +57 -0
- lfx/components/twelvelabs/twelvelabs_pegasus.py +408 -0
- lfx/components/twelvelabs/video_embeddings.py +100 -0
- lfx/components/twelvelabs/video_file.py +191 -0
- lfx/components/unstructured/__init__.py +3 -0
- lfx/components/unstructured/unstructured.py +121 -0
- lfx/components/upstash/__init__.py +34 -0
- lfx/components/upstash/upstash.py +124 -0
- lfx/components/utilities/__init__.py +43 -0
- lfx/components/utilities/calculator_core.py +89 -0
- lfx/components/utilities/current_date.py +42 -0
- lfx/components/utilities/id_generator.py +42 -0
- lfx/components/utilities/python_repl_core.py +98 -0
- lfx/components/vectara/__init__.py +37 -0
- lfx/components/vectara/vectara.py +97 -0
- lfx/components/vectara/vectara_rag.py +164 -0
- lfx/components/vectorstores/__init__.py +34 -0
- lfx/components/vectorstores/local_db.py +270 -0
- lfx/components/vertexai/__init__.py +37 -0
- lfx/components/vertexai/vertexai.py +71 -0
- lfx/components/vertexai/vertexai_embeddings.py +67 -0
- lfx/components/vlmrun/__init__.py +34 -0
- lfx/components/vlmrun/vlmrun_transcription.py +224 -0
- lfx/components/weaviate/__init__.py +34 -0
- lfx/components/weaviate/weaviate.py +89 -0
- lfx/components/wikipedia/__init__.py +4 -0
- lfx/components/wikipedia/wikidata.py +86 -0
- lfx/components/wikipedia/wikipedia.py +53 -0
- lfx/components/wolframalpha/__init__.py +3 -0
- lfx/components/wolframalpha/wolfram_alpha_api.py +54 -0
- lfx/components/xai/__init__.py +32 -0
- lfx/components/xai/xai.py +167 -0
- lfx/components/yahoosearch/__init__.py +3 -0
- lfx/components/yahoosearch/yahoo.py +137 -0
- lfx/components/youtube/__init__.py +52 -0
- lfx/components/youtube/channel.py +227 -0
- lfx/components/youtube/comments.py +231 -0
- lfx/components/youtube/playlist.py +33 -0
- lfx/components/youtube/search.py +120 -0
- lfx/components/youtube/trending.py +285 -0
- lfx/components/youtube/video_details.py +263 -0
- lfx/components/youtube/youtube_transcripts.py +206 -0
- lfx/components/zep/__init__.py +3 -0
- lfx/components/zep/zep.py +45 -0
- lfx/constants.py +6 -0
- lfx/custom/__init__.py +7 -0
- lfx/custom/attributes.py +87 -0
- lfx/custom/code_parser/__init__.py +3 -0
- lfx/custom/code_parser/code_parser.py +361 -0
- lfx/custom/custom_component/__init__.py +0 -0
- lfx/custom/custom_component/base_component.py +128 -0
- lfx/custom/custom_component/component.py +1890 -0
- lfx/custom/custom_component/component_with_cache.py +8 -0
- lfx/custom/custom_component/custom_component.py +650 -0
- lfx/custom/dependency_analyzer.py +165 -0
- lfx/custom/directory_reader/__init__.py +3 -0
- lfx/custom/directory_reader/directory_reader.py +359 -0
- lfx/custom/directory_reader/utils.py +171 -0
- lfx/custom/eval.py +12 -0
- lfx/custom/schema.py +32 -0
- lfx/custom/tree_visitor.py +21 -0
- lfx/custom/utils.py +877 -0
- lfx/custom/validate.py +523 -0
- lfx/events/__init__.py +1 -0
- lfx/events/event_manager.py +110 -0
- lfx/exceptions/__init__.py +0 -0
- lfx/exceptions/component.py +15 -0
- lfx/field_typing/__init__.py +91 -0
- lfx/field_typing/constants.py +216 -0
- lfx/field_typing/range_spec.py +35 -0
- lfx/graph/__init__.py +6 -0
- lfx/graph/edge/__init__.py +0 -0
- lfx/graph/edge/base.py +300 -0
- lfx/graph/edge/schema.py +119 -0
- lfx/graph/edge/utils.py +0 -0
- lfx/graph/graph/__init__.py +0 -0
- lfx/graph/graph/ascii.py +202 -0
- lfx/graph/graph/base.py +2298 -0
- lfx/graph/graph/constants.py +63 -0
- lfx/graph/graph/runnable_vertices_manager.py +133 -0
- lfx/graph/graph/schema.py +53 -0
- lfx/graph/graph/state_model.py +66 -0
- lfx/graph/graph/utils.py +1024 -0
- lfx/graph/schema.py +75 -0
- lfx/graph/state/__init__.py +0 -0
- lfx/graph/state/model.py +250 -0
- lfx/graph/utils.py +206 -0
- lfx/graph/vertex/__init__.py +0 -0
- lfx/graph/vertex/base.py +826 -0
- lfx/graph/vertex/constants.py +0 -0
- lfx/graph/vertex/exceptions.py +4 -0
- lfx/graph/vertex/param_handler.py +316 -0
- lfx/graph/vertex/schema.py +26 -0
- lfx/graph/vertex/utils.py +19 -0
- lfx/graph/vertex/vertex_types.py +489 -0
- lfx/helpers/__init__.py +141 -0
- lfx/helpers/base_model.py +71 -0
- lfx/helpers/custom.py +13 -0
- lfx/helpers/data.py +167 -0
- lfx/helpers/flow.py +308 -0
- lfx/inputs/__init__.py +68 -0
- lfx/inputs/constants.py +2 -0
- lfx/inputs/input_mixin.py +352 -0
- lfx/inputs/inputs.py +718 -0
- lfx/inputs/validators.py +19 -0
- lfx/interface/__init__.py +6 -0
- lfx/interface/components.py +897 -0
- lfx/interface/importing/__init__.py +5 -0
- lfx/interface/importing/utils.py +39 -0
- lfx/interface/initialize/__init__.py +3 -0
- lfx/interface/initialize/loading.py +317 -0
- lfx/interface/listing.py +26 -0
- lfx/interface/run.py +16 -0
- lfx/interface/utils.py +111 -0
- lfx/io/__init__.py +63 -0
- lfx/io/schema.py +295 -0
- lfx/load/__init__.py +8 -0
- lfx/load/load.py +256 -0
- lfx/load/utils.py +99 -0
- lfx/log/__init__.py +5 -0
- lfx/log/logger.py +411 -0
- lfx/logging/__init__.py +11 -0
- lfx/logging/logger.py +24 -0
- lfx/memory/__init__.py +70 -0
- lfx/memory/stubs.py +302 -0
- lfx/processing/__init__.py +1 -0
- lfx/processing/process.py +238 -0
- lfx/processing/utils.py +25 -0
- lfx/py.typed +0 -0
- lfx/schema/__init__.py +66 -0
- lfx/schema/artifact.py +83 -0
- lfx/schema/content_block.py +62 -0
- lfx/schema/content_types.py +91 -0
- lfx/schema/cross_module.py +80 -0
- lfx/schema/data.py +309 -0
- lfx/schema/dataframe.py +210 -0
- lfx/schema/dotdict.py +74 -0
- lfx/schema/encoders.py +13 -0
- lfx/schema/graph.py +47 -0
- lfx/schema/image.py +184 -0
- lfx/schema/json_schema.py +186 -0
- lfx/schema/log.py +62 -0
- lfx/schema/message.py +493 -0
- lfx/schema/openai_responses_schemas.py +74 -0
- lfx/schema/properties.py +41 -0
- lfx/schema/schema.py +180 -0
- lfx/schema/serialize.py +13 -0
- lfx/schema/table.py +142 -0
- lfx/schema/validators.py +114 -0
- lfx/serialization/__init__.py +5 -0
- lfx/serialization/constants.py +2 -0
- lfx/serialization/serialization.py +314 -0
- lfx/services/__init__.py +26 -0
- lfx/services/base.py +28 -0
- lfx/services/cache/__init__.py +6 -0
- lfx/services/cache/base.py +183 -0
- lfx/services/cache/service.py +166 -0
- lfx/services/cache/utils.py +169 -0
- lfx/services/chat/__init__.py +1 -0
- lfx/services/chat/config.py +2 -0
- lfx/services/chat/schema.py +10 -0
- lfx/services/database/__init__.py +5 -0
- lfx/services/database/service.py +25 -0
- lfx/services/deps.py +194 -0
- lfx/services/factory.py +19 -0
- lfx/services/initialize.py +19 -0
- lfx/services/interfaces.py +103 -0
- lfx/services/manager.py +185 -0
- lfx/services/mcp_composer/__init__.py +6 -0
- lfx/services/mcp_composer/factory.py +16 -0
- lfx/services/mcp_composer/service.py +1441 -0
- lfx/services/schema.py +21 -0
- lfx/services/session.py +87 -0
- lfx/services/settings/__init__.py +3 -0
- lfx/services/settings/auth.py +133 -0
- lfx/services/settings/base.py +668 -0
- lfx/services/settings/constants.py +43 -0
- lfx/services/settings/factory.py +23 -0
- lfx/services/settings/feature_flags.py +11 -0
- lfx/services/settings/service.py +35 -0
- lfx/services/settings/utils.py +40 -0
- lfx/services/shared_component_cache/__init__.py +1 -0
- lfx/services/shared_component_cache/factory.py +30 -0
- lfx/services/shared_component_cache/service.py +9 -0
- lfx/services/storage/__init__.py +5 -0
- lfx/services/storage/local.py +185 -0
- lfx/services/storage/service.py +177 -0
- lfx/services/tracing/__init__.py +1 -0
- lfx/services/tracing/service.py +21 -0
- lfx/settings.py +6 -0
- lfx/template/__init__.py +6 -0
- lfx/template/field/__init__.py +0 -0
- lfx/template/field/base.py +260 -0
- lfx/template/field/prompt.py +15 -0
- lfx/template/frontend_node/__init__.py +6 -0
- lfx/template/frontend_node/base.py +214 -0
- lfx/template/frontend_node/constants.py +65 -0
- lfx/template/frontend_node/custom_components.py +79 -0
- lfx/template/template/__init__.py +0 -0
- lfx/template/template/base.py +100 -0
- lfx/template/utils.py +217 -0
- lfx/type_extraction/__init__.py +19 -0
- lfx/type_extraction/type_extraction.py +75 -0
- lfx/type_extraction.py +80 -0
- lfx/utils/__init__.py +1 -0
- lfx/utils/async_helpers.py +42 -0
- lfx/utils/component_utils.py +154 -0
- lfx/utils/concurrency.py +60 -0
- lfx/utils/connection_string_parser.py +11 -0
- lfx/utils/constants.py +233 -0
- lfx/utils/data_structure.py +212 -0
- lfx/utils/exceptions.py +22 -0
- lfx/utils/helpers.py +34 -0
- lfx/utils/image.py +79 -0
- lfx/utils/langflow_utils.py +52 -0
- lfx/utils/lazy_load.py +15 -0
- lfx/utils/request_utils.py +18 -0
- lfx/utils/schemas.py +139 -0
- lfx/utils/ssrf_protection.py +384 -0
- lfx/utils/util.py +626 -0
- lfx/utils/util_strings.py +56 -0
- lfx/utils/validate_cloud.py +26 -0
- lfx/utils/version.py +24 -0
- lfx_nightly-0.2.0.dev25.dist-info/METADATA +312 -0
- lfx_nightly-0.2.0.dev25.dist-info/RECORD +769 -0
- lfx_nightly-0.2.0.dev25.dist-info/WHEEL +4 -0
- lfx_nightly-0.2.0.dev25.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,841 @@
|
|
|
1
|
+
"""Enhanced file component with Docling support and process isolation.
|
|
2
|
+
|
|
3
|
+
Notes:
|
|
4
|
+
-----
|
|
5
|
+
- ALL Docling parsing/export runs in a separate OS process to prevent memory
|
|
6
|
+
growth and native library state from impacting the main Langflow process.
|
|
7
|
+
- Standard text/structured parsing continues to use existing BaseFileComponent
|
|
8
|
+
utilities (and optional threading via `parallel_load_data`).
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import contextlib
|
|
14
|
+
import json
|
|
15
|
+
import subprocess
|
|
16
|
+
import sys
|
|
17
|
+
import textwrap
|
|
18
|
+
from copy import deepcopy
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from tempfile import NamedTemporaryFile
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
from lfx.base.data.base_file import BaseFileComponent
|
|
24
|
+
from lfx.base.data.storage_utils import parse_storage_path
|
|
25
|
+
from lfx.base.data.utils import TEXT_FILE_TYPES, parallel_load_data, parse_text_file_to_data
|
|
26
|
+
from lfx.inputs.inputs import DropdownInput, MessageTextInput, StrInput
|
|
27
|
+
from lfx.io import BoolInput, FileInput, IntInput, Output
|
|
28
|
+
from lfx.schema.data import Data
|
|
29
|
+
from lfx.schema.dataframe import DataFrame # noqa: TC001
|
|
30
|
+
from lfx.schema.message import Message
|
|
31
|
+
from lfx.services.deps import get_settings_service, get_storage_service
|
|
32
|
+
from lfx.utils.async_helpers import run_until_complete
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class FileComponent(BaseFileComponent):
|
|
36
|
+
"""File component with optional Docling processing (isolated in a subprocess)."""
|
|
37
|
+
|
|
38
|
+
display_name = "Read File"
|
|
39
|
+
# description is now a dynamic property - see get_tool_description()
|
|
40
|
+
_base_description = "Loads content from one or more files."
|
|
41
|
+
documentation: str = "https://docs.langflow.org/read-file"
|
|
42
|
+
icon = "file-text"
|
|
43
|
+
name = "File"
|
|
44
|
+
add_tool_output = True # Enable tool mode toggle without requiring tool_mode inputs
|
|
45
|
+
|
|
46
|
+
# Extensions that can be processed without Docling (using standard text parsing)
|
|
47
|
+
TEXT_EXTENSIONS = TEXT_FILE_TYPES
|
|
48
|
+
|
|
49
|
+
# Extensions that require Docling for processing (images, advanced office formats, etc.)
|
|
50
|
+
DOCLING_ONLY_EXTENSIONS = [
|
|
51
|
+
"adoc",
|
|
52
|
+
"asciidoc",
|
|
53
|
+
"asc",
|
|
54
|
+
"bmp",
|
|
55
|
+
"dotx",
|
|
56
|
+
"dotm",
|
|
57
|
+
"docm",
|
|
58
|
+
"jpg",
|
|
59
|
+
"jpeg",
|
|
60
|
+
"png",
|
|
61
|
+
"potx",
|
|
62
|
+
"ppsx",
|
|
63
|
+
"pptm",
|
|
64
|
+
"potm",
|
|
65
|
+
"ppsm",
|
|
66
|
+
"pptx",
|
|
67
|
+
"tiff",
|
|
68
|
+
"xls",
|
|
69
|
+
"xlsx",
|
|
70
|
+
"xhtml",
|
|
71
|
+
"webp",
|
|
72
|
+
]
|
|
73
|
+
|
|
74
|
+
# Docling-supported/compatible extensions; TEXT_FILE_TYPES are supported by the base loader.
|
|
75
|
+
VALID_EXTENSIONS = [
|
|
76
|
+
*TEXT_EXTENSIONS,
|
|
77
|
+
*DOCLING_ONLY_EXTENSIONS,
|
|
78
|
+
]
|
|
79
|
+
|
|
80
|
+
# Fixed export settings used when markdown export is requested.
|
|
81
|
+
EXPORT_FORMAT = "Markdown"
|
|
82
|
+
IMAGE_MODE = "placeholder"
|
|
83
|
+
|
|
84
|
+
_base_inputs = deepcopy(BaseFileComponent.get_base_inputs())
|
|
85
|
+
|
|
86
|
+
for input_item in _base_inputs:
|
|
87
|
+
if isinstance(input_item, FileInput) and input_item.name == "path":
|
|
88
|
+
input_item.real_time_refresh = True
|
|
89
|
+
input_item.tool_mode = False # Disable tool mode for file upload input
|
|
90
|
+
input_item.required = False # Make it optional so it doesn't error in tool mode
|
|
91
|
+
break
|
|
92
|
+
|
|
93
|
+
inputs = [
|
|
94
|
+
*_base_inputs,
|
|
95
|
+
StrInput(
|
|
96
|
+
name="file_path_str",
|
|
97
|
+
display_name="File Path",
|
|
98
|
+
info=(
|
|
99
|
+
"Path to the file to read. Used when component is called as a tool. "
|
|
100
|
+
"If not provided, will use the uploaded file from 'path' input."
|
|
101
|
+
),
|
|
102
|
+
show=False,
|
|
103
|
+
advanced=True,
|
|
104
|
+
tool_mode=True, # Required for Toolset toggle, but _get_tools() ignores this parameter
|
|
105
|
+
required=False,
|
|
106
|
+
),
|
|
107
|
+
BoolInput(
|
|
108
|
+
name="advanced_mode",
|
|
109
|
+
display_name="Advanced Parser",
|
|
110
|
+
value=False,
|
|
111
|
+
real_time_refresh=True,
|
|
112
|
+
info=(
|
|
113
|
+
"Enable advanced document processing and export with Docling for PDFs, images, and office documents. "
|
|
114
|
+
"Note that advanced document processing can consume significant resources."
|
|
115
|
+
),
|
|
116
|
+
show=True,
|
|
117
|
+
),
|
|
118
|
+
DropdownInput(
|
|
119
|
+
name="pipeline",
|
|
120
|
+
display_name="Pipeline",
|
|
121
|
+
info="Docling pipeline to use",
|
|
122
|
+
options=["standard", "vlm"],
|
|
123
|
+
value="standard",
|
|
124
|
+
advanced=True,
|
|
125
|
+
real_time_refresh=True,
|
|
126
|
+
),
|
|
127
|
+
DropdownInput(
|
|
128
|
+
name="ocr_engine",
|
|
129
|
+
display_name="OCR Engine",
|
|
130
|
+
info="OCR engine to use. Only available when pipeline is set to 'standard'.",
|
|
131
|
+
options=["None", "easyocr"],
|
|
132
|
+
value="easyocr",
|
|
133
|
+
show=False,
|
|
134
|
+
advanced=True,
|
|
135
|
+
),
|
|
136
|
+
StrInput(
|
|
137
|
+
name="md_image_placeholder",
|
|
138
|
+
display_name="Image placeholder",
|
|
139
|
+
info="Specify the image placeholder for markdown exports.",
|
|
140
|
+
value="<!-- image -->",
|
|
141
|
+
advanced=True,
|
|
142
|
+
show=False,
|
|
143
|
+
),
|
|
144
|
+
StrInput(
|
|
145
|
+
name="md_page_break_placeholder",
|
|
146
|
+
display_name="Page break placeholder",
|
|
147
|
+
info="Add this placeholder between pages in the markdown output.",
|
|
148
|
+
value="",
|
|
149
|
+
advanced=True,
|
|
150
|
+
show=False,
|
|
151
|
+
),
|
|
152
|
+
MessageTextInput(
|
|
153
|
+
name="doc_key",
|
|
154
|
+
display_name="Doc Key",
|
|
155
|
+
info="The key to use for the DoclingDocument column.",
|
|
156
|
+
value="doc",
|
|
157
|
+
advanced=True,
|
|
158
|
+
show=False,
|
|
159
|
+
),
|
|
160
|
+
# Deprecated input retained for backward-compatibility.
|
|
161
|
+
BoolInput(
|
|
162
|
+
name="use_multithreading",
|
|
163
|
+
display_name="[Deprecated] Use Multithreading",
|
|
164
|
+
advanced=True,
|
|
165
|
+
value=True,
|
|
166
|
+
info="Set 'Processing Concurrency' greater than 1 to enable multithreading.",
|
|
167
|
+
),
|
|
168
|
+
IntInput(
|
|
169
|
+
name="concurrency_multithreading",
|
|
170
|
+
display_name="Processing Concurrency",
|
|
171
|
+
advanced=True,
|
|
172
|
+
info="When multiple files are being processed, the number of files to process concurrently.",
|
|
173
|
+
value=1,
|
|
174
|
+
),
|
|
175
|
+
BoolInput(
|
|
176
|
+
name="markdown",
|
|
177
|
+
display_name="Markdown Export",
|
|
178
|
+
info="Export processed documents to Markdown format. Only available when advanced mode is enabled.",
|
|
179
|
+
value=False,
|
|
180
|
+
show=False,
|
|
181
|
+
),
|
|
182
|
+
]
|
|
183
|
+
|
|
184
|
+
outputs = [
|
|
185
|
+
Output(display_name="Raw Content", name="message", method="load_files_message", tool_mode=True),
|
|
186
|
+
]
|
|
187
|
+
|
|
188
|
+
# ------------------------------ Tool description with file names --------------
|
|
189
|
+
|
|
190
|
+
def get_tool_description(self) -> str:
    """Return the tool description, listing uploaded file names when available.

    The base sentence is always part of the result. When the ``path`` attribute
    holds one or more non-empty entries, their file names (final path component
    only) are appended so an Agent can see which files this tool will read.

    Returns:
        str: The base description, optionally followed by the available file names.
    """
    base_description = "Loads and returns the content from uploaded files."

    file_paths = getattr(self, "path", None)
    if not file_paths:
        return base_description

    # A single path (str / Path) is wrapped; a list or tuple is used as-is so that
    # a tuple of paths is not mistaken for one path.
    if not isinstance(file_paths, (list, tuple)):
        file_paths = [file_paths]

    # Keep only the file names; empty entries are skipped.
    file_names = [Path(fp).name for fp in file_paths if fp]
    if not file_names:
        return base_description

    files_str = ", ".join(file_names)
    return f"{base_description} Available files: {files_str}. Call this tool to read these files."
|
|
218
|
+
|
|
219
|
+
@property
def description(self) -> str:
    """Expose the tool description as a read-only attribute.

    Delegates to ``get_tool_description()`` on every access, so the value is
    recomputed each time it is read.
    """
    tool_description = self.get_tool_description()
    return tool_description
|
|
223
|
+
|
|
224
|
+
async def _get_tools(self) -> list:
    """Build the single, parameter-less tool exposed by this component.

    The tool takes no arguments: it calls ``load_files_message()`` on this
    component and returns the result as text, so an Agent never has to supply
    a file path.

    Returns:
        list: A one-element list holding the ``StructuredTool``.
    """
    from langchain_core.tools import StructuredTool
    from pydantic import BaseModel

    # Empty schema - the tool accepts no arguments.
    class EmptySchema(BaseModel):
        """No parameters required - uses pre-uploaded files."""

    def _as_text(result) -> str:
        # Prefer get_text(), then a ``text`` attribute, then the plain string form.
        if hasattr(result, "get_text"):
            return result.get_text()
        return result.text if hasattr(result, "text") else str(result)

    async def read_files_tool() -> str:
        """Read the content of uploaded files."""
        try:
            return _as_text(self.load_files_message())
        except (FileNotFoundError, ValueError, OSError, RuntimeError) as e:
            # Report the failure as the tool's text result instead of raising.
            return f"Error reading files: {e}"

    tool_description = self.get_tool_description()

    return [
        StructuredTool(
            name="load_files_message",
            description=tool_description,
            coroutine=read_files_tool,
            args_schema=EmptySchema,
            handle_tool_error=True,
            tags=["load_files_message"],
            metadata={
                "display_name": "Read File",
                "display_description": tool_description,
            },
        )
    ]
|
|
265
|
+
|
|
266
|
+
# ------------------------------ UI helpers --------------------------------------
|
|
267
|
+
|
|
268
|
+
def _path_value(self, template: dict) -> list[str]:
|
|
269
|
+
"""Return the list of currently selected file paths from the template."""
|
|
270
|
+
return template.get("path", {}).get("file_path", [])
|
|
271
|
+
|
|
272
|
+
def update_build_config(
    self,
    build_config: dict[str, Any],
    field_value: Any,
    field_name: str | None = None,
) -> dict[str, Any]:
    """Show/hide Advanced Parser and related fields based on selection context.

    Behavior per changed field:

    - ``path``: the Advanced Parser toggle is shown only when none of the selected
      files ends in ``.csv``, ``.xlsx`` or ``.parquet`` (compared case-insensitively).
      When it is hidden, it is also switched off and the Docling fields are hidden.
    - ``advanced_mode``: the Docling fields follow the toggle; ``pipeline`` is
      additionally moved out of the advanced section while the toggle is on.
    - ``pipeline``: ``ocr_engine`` is shown and set to ``"easyocr"`` for the
      ``standard`` pipeline, otherwise hidden and set to ``"None"``.

    Args:
        build_config: The build configuration to update in place.
        field_value: New value of the changed field.
        field_name: Name of the changed field.

    Returns:
        dict[str, Any]: The same ``build_config`` object, after modification.
    """
    docling_fields = ("pipeline", "ocr_engine", "doc_key", "md_image_placeholder", "md_page_break_placeholder")
    tabular_suffixes = (".csv", ".xlsx", ".parquet")

    if field_name == "path":
        paths = self._path_value(build_config)

        # Lower-case before matching so "DATA.CSV" is treated like "data.csv".
        allow_advanced = not any(file_path.lower().endswith(tabular_suffixes) for file_path in paths)
        build_config["advanced_mode"]["show"] = allow_advanced
        if not allow_advanced:
            build_config["advanced_mode"]["value"] = False
            for name in docling_fields:
                if name in build_config:
                    build_config[name]["show"] = False

    # Docling Processing
    elif field_name == "advanced_mode":
        enabled = bool(field_value)
        for name in docling_fields:
            if name in build_config:
                build_config[name]["show"] = enabled
                if name == "pipeline":
                    build_config[name]["advanced"] = not enabled

    elif field_name == "pipeline":
        uses_ocr = field_value == "standard"
        build_config["ocr_engine"]["show"] = uses_ocr
        build_config["ocr_engine"]["value"] = "easyocr" if uses_ocr else "None"

    return build_config
|
|
308
|
+
|
|
309
|
+
def update_outputs(self, frontend_node: dict[str, Any], field_name: str, field_value: Any) -> dict[str, Any]:  # noqa: ARG002
    """Dynamically show outputs based on file count/type and advanced mode.

    Only changes to ``path``, ``advanced_mode`` or ``pipeline`` are handled, and
    only when at least one file is selected; otherwise the node is returned untouched.

    - Several files: a single "Files" output.
    - One file: an optional "Structured Content" output chosen by extension
      (``.csv``/``.xlsx``/``.parquet`` or ``.json``, compared case-insensitively),
      then either the advanced outputs (structured + markdown) or "Raw Content",
      and finally "File Path".

    Args:
        frontend_node: The frontend node dictionary; its ``outputs`` entry is replaced.
        field_name: Name of the field that changed.
        field_value: New value of the field (unused).

    Returns:
        dict[str, Any]: The same ``frontend_node`` object.
    """
    if field_name not in ("path", "advanced_mode", "pipeline"):
        return frontend_node

    template = frontend_node.get("template", {})
    paths = self._path_value(template)
    if not paths:
        return frontend_node

    outputs: list = []
    frontend_node["outputs"] = outputs

    if len(paths) != 1:
        # Multiple files => DataFrame output; advanced parser disabled
        outputs.append(Output(display_name="Files", name="dataframe", method="load_files", tool_mode=True))
        return frontend_node

    # `paths` already comes from this node's template, so no second lookup is needed.
    file_path = paths[0].lower()
    if file_path.endswith((".csv", ".xlsx", ".parquet")):
        outputs.append(
            Output(
                display_name="Structured Content",
                name="dataframe",
                method="load_files_structured",
                tool_mode=True,
            ),
        )
    elif file_path.endswith(".json"):
        outputs.append(
            Output(display_name="Structured Content", name="json", method="load_files_json", tool_mode=True),
        )

    if template.get("advanced_mode", {}).get("value", False):
        outputs.append(
            Output(
                display_name="Structured Output",
                name="advanced_dataframe",
                method="load_files_dataframe",
                tool_mode=True,
            ),
        )
        outputs.append(
            Output(display_name="Markdown", name="advanced_markdown", method="load_files_markdown", tool_mode=True),
        )
    else:
        outputs.append(
            Output(display_name="Raw Content", name="message", method="load_files_message", tool_mode=True),
        )

    # The file path output is offered in both single-file modes.
    outputs.append(
        Output(display_name="File Path", name="path", method="load_files_path", tool_mode=True),
    )
    return frontend_node
|
|
368
|
+
|
|
369
|
+
# ------------------------------ Core processing ----------------------------------
|
|
370
|
+
|
|
371
|
+
def _validate_and_resolve_paths(self) -> list[BaseFileComponent.BaseFile]:
|
|
372
|
+
"""Override to handle file_path_str input from tool mode.
|
|
373
|
+
|
|
374
|
+
When called as a tool, the file_path_str parameter can be set.
|
|
375
|
+
If not provided, it will fall back to using the path FileInput (uploaded file).
|
|
376
|
+
Priority:
|
|
377
|
+
1. file_path_str (if provided by the tool call)
|
|
378
|
+
2. path (uploaded file from UI)
|
|
379
|
+
"""
|
|
380
|
+
# Check if file_path_str is provided (from tool mode)
|
|
381
|
+
file_path_str = getattr(self, "file_path_str", None)
|
|
382
|
+
if file_path_str:
|
|
383
|
+
# Use the string path from tool mode
|
|
384
|
+
from pathlib import Path
|
|
385
|
+
|
|
386
|
+
from lfx.schema.data import Data
|
|
387
|
+
|
|
388
|
+
resolved_path = Path(self.resolve_path(file_path_str))
|
|
389
|
+
if not resolved_path.exists():
|
|
390
|
+
msg = f"File or directory not found: {file_path_str}"
|
|
391
|
+
self.log(msg)
|
|
392
|
+
if not self.silent_errors:
|
|
393
|
+
raise ValueError(msg)
|
|
394
|
+
return []
|
|
395
|
+
|
|
396
|
+
data_obj = Data(data={self.SERVER_FILE_PATH_FIELDNAME: str(resolved_path)})
|
|
397
|
+
return [BaseFileComponent.BaseFile(data_obj, resolved_path, delete_after_processing=False)]
|
|
398
|
+
|
|
399
|
+
# Otherwise use the default implementation (uses path FileInput)
|
|
400
|
+
return super()._validate_and_resolve_paths()
|
|
401
|
+
|
|
402
|
+
def _is_docling_compatible(self, file_path: str) -> bool:
|
|
403
|
+
"""Lightweight extension gate for Docling-compatible types."""
|
|
404
|
+
docling_exts = (
|
|
405
|
+
".adoc",
|
|
406
|
+
".asciidoc",
|
|
407
|
+
".asc",
|
|
408
|
+
".bmp",
|
|
409
|
+
".csv",
|
|
410
|
+
".dotx",
|
|
411
|
+
".dotm",
|
|
412
|
+
".docm",
|
|
413
|
+
".docx",
|
|
414
|
+
".htm",
|
|
415
|
+
".html",
|
|
416
|
+
".jpg",
|
|
417
|
+
".jpeg",
|
|
418
|
+
".json",
|
|
419
|
+
".md",
|
|
420
|
+
".pdf",
|
|
421
|
+
".png",
|
|
422
|
+
".potx",
|
|
423
|
+
".ppsx",
|
|
424
|
+
".pptm",
|
|
425
|
+
".potm",
|
|
426
|
+
".ppsm",
|
|
427
|
+
".pptx",
|
|
428
|
+
".tiff",
|
|
429
|
+
".txt",
|
|
430
|
+
".xls",
|
|
431
|
+
".xlsx",
|
|
432
|
+
".xhtml",
|
|
433
|
+
".xml",
|
|
434
|
+
".webp",
|
|
435
|
+
)
|
|
436
|
+
return file_path.lower().endswith(docling_exts)
|
|
437
|
+
|
|
438
|
+
async def _get_local_file_for_docling(self, file_path: str) -> tuple[str, bool]:
    """Return a local path Docling can read, fetching the file from storage if needed.

    With ``storage_type == "local"`` the given path is returned unchanged.
    Otherwise the path is parsed as a storage key ("flow_id/filename"), the
    content is fetched through the storage service and written to a temporary
    file that is NOT deleted automatically.

    Args:
        file_path: Either a local path or a storage key (format "flow_id/filename").

    Returns:
        tuple[str, bool]: ``(local_path, should_delete)``; ``should_delete`` is
        True when ``local_path`` is a temporary file the caller must remove.

    Raises:
        ValueError: If the storage key cannot be parsed.
    """
    if get_settings_service().settings.storage_type == "local":
        return file_path, False

    # Remote storage - the key must split into (flow_id, filename).
    parsed = parse_storage_path(file_path)
    if not parsed:
        msg = f"Invalid S3 path format: {file_path}. Expected 'flow_id/filename'"
        raise ValueError(msg)

    storage_service = get_storage_service()
    flow_id, filename = parsed
    content = await storage_service.get_file(flow_id, filename)

    # Keep the original extension on the temp file name.
    extension = Path(filename).suffix
    with NamedTemporaryFile(mode="wb", suffix=extension, delete=False) as handle:
        handle.write(content)
        downloaded_to = handle.name

    return downloaded_to, True
|
|
470
|
+
|
|
471
|
+
def _process_docling_in_subprocess(self, file_path: str) -> Data | None:
|
|
472
|
+
"""Run Docling in a separate OS process and map the result to a Data object.
|
|
473
|
+
|
|
474
|
+
We avoid multiprocessing pickling by launching `python -c "<script>"` and
|
|
475
|
+
passing JSON config via stdin. The child prints a JSON result to stdout.
|
|
476
|
+
|
|
477
|
+
For S3 storage, the file is downloaded to a temp file first.
|
|
478
|
+
"""
|
|
479
|
+
if not file_path:
|
|
480
|
+
return None
|
|
481
|
+
|
|
482
|
+
settings = get_settings_service().settings
|
|
483
|
+
if settings.storage_type == "s3":
|
|
484
|
+
local_path, should_delete = run_until_complete(self._get_local_file_for_docling(file_path))
|
|
485
|
+
else:
|
|
486
|
+
local_path = file_path
|
|
487
|
+
should_delete = False
|
|
488
|
+
|
|
489
|
+
try:
|
|
490
|
+
return self._process_docling_subprocess_impl(local_path, file_path)
|
|
491
|
+
finally:
|
|
492
|
+
# Clean up temp file if we created one
|
|
493
|
+
if should_delete:
|
|
494
|
+
with contextlib.suppress(Exception):
|
|
495
|
+
Path(local_path).unlink() # Ignore cleanup errors
|
|
496
|
+
|
|
497
|
+
def _process_docling_subprocess_impl(self, local_file_path: str, original_file_path: str) -> Data | None:
    """Run Docling on a local file in a child Python process.

    The child is started as ``python -u -c <script>``; its configuration is sent
    as JSON on stdin and it prints one JSON result on stdout. Failures are not
    raised: they are returned as a ``Data`` object carrying an ``"error"`` key.

    Args:
        local_file_path: Path to the local file the child process reads.
        original_file_path: Original file path; this is the ``file_path``
            reported in the result metadata, so a temporary download path
            does not leak into the output.

    Returns:
        Data: For markdown mode, ``text`` plus ``exported_content``; for
        structured mode, a ``doc`` list of rows; on failure, an ``error`` entry.
    """
    args: dict[str, Any] = {
        "file_path": local_file_path,
        "markdown": bool(self.markdown),
        "image_mode": str(self.IMAGE_MODE),
        "md_image_placeholder": str(self.md_image_placeholder),
        "md_page_break_placeholder": str(self.md_page_break_placeholder),
        "pipeline": str(self.pipeline),
        # OCR is only forwarded for non-VLM pipelines and when an engine is actually selected.
        "ocr_engine": (
            self.ocr_engine if self.ocr_engine and self.ocr_engine != "None" and self.pipeline != "vlm" else None
        ),
    }

    self.log(f"Starting Docling subprocess for file: {local_file_path}")
    self.log(args)

    # Child script for isolating the docling processing
    child_script = textwrap.dedent(
        r"""
        import json, sys

        def try_imports():
            try:
                from docling.datamodel.base_models import ConversionStatus, InputFormat  # type: ignore
                from docling.document_converter import DocumentConverter  # type: ignore
                from docling_core.types.doc import ImageRefMode  # type: ignore
                return ConversionStatus, InputFormat, DocumentConverter, ImageRefMode, "latest"
            except Exception as e:
                raise e

        def create_converter(strategy, input_format, DocumentConverter, pipeline, ocr_engine):
            # --- Standard PDF/IMAGE pipeline (your existing behavior), with optional OCR ---
            if pipeline == "standard":
                try:
                    from docling.datamodel.pipeline_options import PdfPipelineOptions  # type: ignore
                    from docling.document_converter import PdfFormatOption  # type: ignore

                    pipe = PdfPipelineOptions()
                    pipe.do_ocr = False

                    if ocr_engine:
                        try:
                            from docling.models.factories import get_ocr_factory  # type: ignore
                            pipe.do_ocr = True
                            fac = get_ocr_factory(allow_external_plugins=False)
                            pipe.ocr_options = fac.create_options(kind=ocr_engine)
                        except Exception:
                            # If OCR setup fails, disable it
                            pipe.do_ocr = False

                    fmt = {}
                    if hasattr(input_format, "PDF"):
                        fmt[getattr(input_format, "PDF")] = PdfFormatOption(pipeline_options=pipe)
                    if hasattr(input_format, "IMAGE"):
                        fmt[getattr(input_format, "IMAGE")] = PdfFormatOption(pipeline_options=pipe)

                    return DocumentConverter(format_options=fmt)
                except Exception:
                    return DocumentConverter()

            # --- Vision-Language Model (VLM) pipeline ---
            if pipeline == "vlm":
                try:
                    from docling.datamodel.pipeline_options import VlmPipelineOptions
                    from docling.datamodel.vlm_model_specs import GRANITEDOCLING_MLX, GRANITEDOCLING_TRANSFORMERS
                    from docling.document_converter import PdfFormatOption
                    from docling.pipeline.vlm_pipeline import VlmPipeline

                    vl_pipe = VlmPipelineOptions(
                        vlm_options=GRANITEDOCLING_TRANSFORMERS,
                    )

                    if sys.platform == "darwin":
                        try:
                            import mlx_vlm
                            vl_pipe.vlm_options = GRANITEDOCLING_MLX
                        except ImportError as e:
                            raise e

                    # VLM paths generally don't need OCR; keep OCR off by default here.
                    fmt = {}
                    if hasattr(input_format, "PDF"):
                        fmt[getattr(input_format, "PDF")] = PdfFormatOption(
                            pipeline_cls=VlmPipeline,
                            pipeline_options=vl_pipe
                        )
                    if hasattr(input_format, "IMAGE"):
                        fmt[getattr(input_format, "IMAGE")] = PdfFormatOption(
                            pipeline_cls=VlmPipeline,
                            pipeline_options=vl_pipe
                        )

                    return DocumentConverter(format_options=fmt)
                except Exception as e:
                    raise e

            # --- Fallback: default converter with no special options ---
            return DocumentConverter()

        def export_markdown(document, ImageRefMode, image_mode, img_ph, pg_ph):
            try:
                mode = getattr(ImageRefMode, image_mode.upper(), image_mode)
                return document.export_to_markdown(
                    image_mode=mode,
                    image_placeholder=img_ph,
                    page_break_placeholder=pg_ph,
                )
            except Exception:
                try:
                    return document.export_to_text()
                except Exception:
                    return str(document)

        def to_rows(doc_dict):
            rows = []
            for t in doc_dict.get("texts", []):
                prov = t.get("prov") or []
                page_no = None
                if prov and isinstance(prov, list) and isinstance(prov[0], dict):
                    page_no = prov[0].get("page_no")
                rows.append({
                    "page_no": page_no,
                    "label": t.get("label"),
                    "text": t.get("text"),
                    "level": t.get("level"),
                })
            return rows

        def main():
            cfg = json.loads(sys.stdin.read())
            file_path = cfg["file_path"]
            markdown = cfg["markdown"]
            image_mode = cfg["image_mode"]
            img_ph = cfg["md_image_placeholder"]
            pg_ph = cfg["md_page_break_placeholder"]
            pipeline = cfg["pipeline"]
            ocr_engine = cfg.get("ocr_engine")
            meta = {"file_path": file_path}

            try:
                ConversionStatus, InputFormat, DocumentConverter, ImageRefMode, strategy = try_imports()
                converter = create_converter(strategy, InputFormat, DocumentConverter, pipeline, ocr_engine)
                try:
                    res = converter.convert(file_path)
                except Exception as e:
                    print(json.dumps({"ok": False, "error": f"Docling conversion error: {e}", "meta": meta}))
                    return

                ok = False
                if hasattr(res, "status"):
                    try:
                        ok = (res.status == ConversionStatus.SUCCESS) or (str(res.status).lower() == "success")
                    except Exception:
                        ok = (str(res.status).lower() == "success")
                if not ok and hasattr(res, "document"):
                    ok = getattr(res, "document", None) is not None
                if not ok:
                    print(json.dumps({"ok": False, "error": "Docling conversion failed", "meta": meta}))
                    return

                doc = getattr(res, "document", None)
                if doc is None:
                    print(json.dumps({"ok": False, "error": "Docling produced no document", "meta": meta}))
                    return

                if markdown:
                    text = export_markdown(doc, ImageRefMode, image_mode, img_ph, pg_ph)
                    print(json.dumps({"ok": True, "mode": "markdown", "text": text, "meta": meta}))
                    return

                # structured
                try:
                    doc_dict = doc.export_to_dict()
                except Exception as e:
                    print(json.dumps({"ok": False, "error": f"Docling export_to_dict failed: {e}", "meta": meta}))
                    return

                rows = to_rows(doc_dict)
                print(json.dumps({"ok": True, "mode": "structured", "doc": rows, "meta": meta}))
            except Exception as e:
                print(
                    json.dumps({
                        "ok": False,
                        "error": f"Docling processing error: {e}",
                        "meta": {"file_path": file_path},
                    })
                )

        if __name__ == "__main__":
            main()
        """
    )

    # Validate file_path to avoid command injection or unsafe input
    if not isinstance(args["file_path"], str) or any(c in args["file_path"] for c in [";", "|", "&", "$", "`"]):
        return Data(data={"error": "Unsafe file path detected.", "file_path": args["file_path"]})

    # Argument list (no shell); the config travels on stdin, never on the command line.
    proc = subprocess.run(  # noqa: S603
        [sys.executable, "-u", "-c", child_script],
        input=json.dumps(args).encode("utf-8"),
        capture_output=True,
        check=False,
    )

    if not proc.stdout:
        err_msg = proc.stderr.decode("utf-8", errors="replace") or "no output from child process"
        return Data(data={"error": f"Docling subprocess error: {err_msg}", "file_path": original_file_path})

    try:
        result = json.loads(proc.stdout.decode("utf-8"))
    except ValueError as e:
        # Both JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        err_msg = proc.stderr.decode("utf-8", errors="replace")
        return Data(
            data={
                "error": f"Invalid JSON from Docling subprocess: {e}. stderr={err_msg}",
                "file_path": original_file_path,
            },
        )

    # The child only knows the local (possibly temporary) path; report the
    # caller's original path in the metadata instead.
    meta = {**result.get("meta", {}), "file_path": original_file_path}

    if not result.get("ok"):
        return Data(data={"error": result.get("error", "Unknown Docling error"), **meta})

    if result.get("mode") == "markdown":
        exported_content = str(result.get("text", ""))
        return Data(
            text=exported_content,
            data={"exported_content": exported_content, "export_format": self.EXPORT_FORMAT, **meta},
        )

    rows = list(result.get("doc", []))
    return Data(data={"doc": rows, "export_format": self.EXPORT_FORMAT, **meta})
|
|
737
|
+
|
|
738
|
+
def process_files(
    self,
    file_list: list[BaseFileComponent.BaseFile],
) -> list[BaseFileComponent.BaseFile]:
    """Turn the given input files into rolled-up file records.

    Two routes exist:

    - Advanced mode, when every file passes ``_is_docling_compatible``: each
      file is handed to ``_process_docling_in_subprocess`` one at a time.
    - Anything else: files go through ``parse_text_file_to_data`` via
      ``parallel_load_data`` (single worker unless multithreading is enabled).

    Args:
        file_list: Files to process; must not be empty.

    Returns:
        The result(s) of ``self.rollup_data`` for the processed files.

    Raises:
        ValueError: If ``file_list`` is empty, or if advanced mode is off and
            a file's extension is listed in ``DOCLING_ONLY_EXTENSIONS``.
    """
    if not file_list:
        msg = "No files to process."
        raise ValueError(msg)

    # Without advanced mode, Docling-only extensions cannot be handled: fail
    # on the first such file instead of attempting a plain-text parse.
    if not self.advanced_mode:
        for candidate in file_list:
            extension = candidate.path.suffix[1:].lower()
            if extension not in self.DOCLING_ONLY_EXTENSIONS:
                continue
            msg = (
                f"File '{candidate.path.name}' has extension '.{extension}' which requires "
                "Advanced Parser mode. Please enable 'Advanced Parser' to process this file."
            )
            self.log(msg)
            raise ValueError(msg)

    def process_file_standard(file_path: str, *, silent_errors: bool = False) -> Data | None:
        # Loader handed to parallel_load_data: log every failure, then either
        # swallow it (silent mode -> None) or let it propagate unchanged.
        try:
            return parse_text_file_to_data(file_path, silent_errors=silent_errors)
        except Exception as exc:
            if isinstance(exc, FileNotFoundError):
                self.log(f"File not found: {file_path}. Error: {exc}")
            else:
                self.log(f"Unexpected error processing {file_path}: {exc}")
            if silent_errors:
                return None
            raise

    def docling_row(source_path: str, item: object) -> Data:
        # One element of Docling's `doc` list -> one Data row tagged with its file.
        fields = item if isinstance(item, dict) else {"value": item}
        return Data(data={"file_path": source_path, **fields})

    # Evaluated regardless of mode, exactly like the compatibility probe it replaces.
    every_file_docling_ok = all(self._is_docling_compatible(str(f.path)) for f in file_list)

    # Advanced route: only taken when ALL files are Docling-compatible.
    if self.advanced_mode and every_file_docling_ok:
        collected: list[BaseFileComponent.BaseFile] = []
        for entry in file_list:
            source_path = str(entry.path)
            docling_data: Data | None = self._process_docling_in_subprocess(source_path)

            payload = getattr(docling_data, "data", {}) or {}
            doc_rows = payload.get("doc")

            # NOTE(review): rollup_data is given the whole file_list on every
            # iteration while the rows belong to a single file - confirm
            # rollup_data is meant to be called this way.
            if not isinstance(doc_rows, list):
                # Unstructured result (e.g. markdown export or error dict): keep as-is.
                collected.extend(self.rollup_data(file_list, [docling_data]))
                continue

            # Structured result: unnest each `doc` element into its own row.
            unnested: list[Data | None] = [docling_row(source_path, item) for item in doc_rows]
            collected.extend(self.rollup_data(file_list, unnested))
        return collected

    # Standard route (multi-file, or any non-advanced run).
    concurrency = max(1, self.concurrency_multithreading) if self.use_multithreading else 1
    file_paths = [str(f.path) for f in file_list]
    self.log(f"Starting parallel processing of {len(file_paths)} files with concurrency: {concurrency}.")
    parsed = parallel_load_data(
        file_paths,
        silent_errors=self.silent_errors,
        load_function=process_file_standard,
        max_concurrency=concurrency,
    )
    return self.rollup_data(file_list, parsed)
|
|
817
|
+
|
|
818
|
+
# ------------------------------ Output helpers -----------------------------------
|
|
819
|
+
|
|
820
|
+
def load_files_helper(self) -> DataFrame:
    """Run ``self.load_files()`` and return its result if it exposes ``text``.

    Returns:
        The object returned by ``self.load_files()`` when it has a ``text``
        attribute.

    Raises:
        ValueError: When the result has no ``text`` attribute. The message is
            the first entry of ``result.error`` if that attribute exists,
            otherwise a generic "could not extract content" message.
    """
    loaded = self.load_files()

    # Success: the result carries text, hand it back untouched.
    if hasattr(loaded, "text"):
        return loaded

    # Failure with a reported error: surface the first error entry.
    if hasattr(loaded, "error"):
        raise ValueError(loaded.error[0])

    msg = "Could not extract content from the provided file(s)."
    raise ValueError(msg)
|
|
831
|
+
|
|
832
|
+
def load_files_dataframe(self) -> DataFrame:
    """Load the files with markdown export switched off.

    Sets ``self.markdown`` to ``False`` and returns whatever
    ``self.load_files_helper()`` produces.
    """
    # The flag must be cleared before loading starts.
    self.markdown = False
    frame = self.load_files_helper()
    return frame
|
|
836
|
+
|
|
837
|
+
def load_files_markdown(self) -> Message:
    """Load the files with markdown export switched on and wrap the text.

    Sets ``self.markdown`` to ``True``, calls ``self.load_files_helper()``,
    and returns a ``Message`` whose text is the string form of the entry
    at key ``0`` of the result's ``text``.
    """
    # The flag must be set before loading starts.
    self.markdown = True
    loaded = self.load_files_helper()
    first_text = str(loaded.text[0])
    return Message(text=first_text)
|