langflow-base-nightly 0.5.0.dev35__py3-none-any.whl → 0.5.0.dev36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,13 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import signal
4
+ import sys
5
+ import traceback
6
+ from contextlib import suppress
3
7
  from typing import TYPE_CHECKING, Any
4
8
 
9
+ from loguru import logger
10
+
5
11
  from langflow.components._importing import import_mod
6
12
 
7
13
  if TYPE_CHECKING:
@@ -41,3 +47,195 @@ def __getattr__(attr_name: str) -> Any:
41
47
 
42
48
  def __dir__() -> list[str]:
43
49
  return list(__all__)
50
+
51
+
52
def docling_worker(file_paths: list[str], queue, pipeline: str, ocr_engine: str):
    """Convert files with Docling; intended to run as the target of a separate process.

    The outcome is reported by putting a message on ``queue``:

    * success: a list with one entry per conversion result -- a dict with the keys
      ``document``, ``file_path`` and ``status`` when the conversion status is
      ``SUCCESS``, otherwise ``None``;
    * failure: a dict with an ``error`` key, plus ``traceback`` for processing errors
      or ``shutdown: True`` when the worker was interrupted.

    Args:
        file_paths: Paths of the files to convert; each one is converted on its own.
        queue: Object with a ``put`` method used to send the outcome to the parent.
        pipeline: ``"standard"`` or ``"vlm"``; any other value is reported as an error.
        ocr_engine: OCR engine kind for the standard pipeline; ``""`` disables OCR.
    """
    # Set by the signal handler; read by check_shutdown() and by the except/finally
    # clauses at the end of the function.
    shutdown_requested = False

    def signal_handler(signum: int, frame) -> None:  # noqa: ARG001
        """Notify the parent about the interruption and exit the worker."""
        nonlocal shutdown_requested
        signal_names: dict[int, str] = {signal.SIGTERM: "SIGTERM", signal.SIGINT: "SIGINT"}
        signal_name = signal_names.get(signum, f"signal {signum}")

        logger.debug(f"Docling worker received {signal_name}, initiating graceful shutdown...")
        shutdown_requested = True

        # Best effort only: failing to notify the parent must not prevent the exit.
        with suppress(Exception):
            queue.put({"error": f"Worker interrupted by {signal_name}", "shutdown": True})

        # Raises SystemExit, which the `except Exception` clauses below do not catch.
        sys.exit(0)

    def check_shutdown() -> None:
        """Exit the worker if a shutdown has been requested."""
        if shutdown_requested:
            logger.info("Shutdown requested, exiting worker...")

            with suppress(Exception):
                queue.put({"error": "Worker shutdown requested", "shutdown": True})

            sys.exit(0)

    # Register signal handlers early
    try:
        signal.signal(signal.SIGTERM, signal_handler)
        signal.signal(signal.SIGINT, signal_handler)
        logger.debug("Signal handlers registered for graceful shutdown")
    except (OSError, ValueError) as e:
        # Some signals might not be available on all platforms
        logger.warning(f"Warning: Could not register signal handlers: {e}")

    # Check for shutdown before heavy imports
    check_shutdown()

    try:
        from docling.datamodel.base_models import ConversionStatus, InputFormat
        from docling.datamodel.pipeline_options import (
            OcrOptions,
            PdfPipelineOptions,
            VlmPipelineOptions,
        )
        from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
        from docling.models.factories import get_ocr_factory
        from docling.pipeline.vlm_pipeline import VlmPipeline

        # Check for shutdown after imports
        check_shutdown()
        logger.debug("Docling dependencies loaded successfully")

    except ModuleNotFoundError as e:
        # ModuleNotFoundError is also raised when a module that Docling itself imports
        # is missing. Only report "Docling is not installed" when the missing module is
        # docling (or unknown); otherwise keep the details, like the ImportError branch.
        missing_root = (e.name or "").partition(".")[0]
        if missing_root and missing_root != "docling":
            queue.put({"error": f"Failed to import a Docling dependency: {e}"})
            return
        msg = (
            "Docling is an optional dependency of Langflow. "
            "Install with `uv pip install 'langflow[docling]'` "
            "or refer to the documentation"
        )
        queue.put({"error": msg})
        return
    except ImportError as e:
        # A different import failed (e.g., a transitive dependency); preserve details.
        queue.put({"error": f"Failed to import a Docling dependency: {e}"})
        return
    except KeyboardInterrupt:
        logger.warning("KeyboardInterrupt during imports, exiting...")
        queue.put({"error": "Worker interrupted during imports", "shutdown": True})
        return

    # Configure the standard PDF pipeline
    def _get_standard_opts() -> PdfPipelineOptions:
        check_shutdown()  # Check before heavy operations

        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = ocr_engine != ""
        if pipeline_options.do_ocr:
            ocr_factory = get_ocr_factory(
                allow_external_plugins=False,
            )

            ocr_options: OcrOptions = ocr_factory.create_options(
                kind=ocr_engine,
            )
            pipeline_options.ocr_options = ocr_options
        return pipeline_options

    # Configure the VLM pipeline
    def _get_vlm_opts() -> VlmPipelineOptions:
        check_shutdown()  # Check before heavy operations
        return VlmPipelineOptions()

    # Configure the main format options and create the DocumentConverter()
    def _get_converter() -> DocumentConverter:
        check_shutdown()  # Check before heavy operations

        if pipeline == "standard":
            pdf_format_option = PdfFormatOption(
                pipeline_options=_get_standard_opts(),
            )
        elif pipeline == "vlm":
            pdf_format_option = PdfFormatOption(pipeline_cls=VlmPipeline, pipeline_options=_get_vlm_opts())
        else:
            msg = f"Unknown pipeline: {pipeline!r}"
            raise ValueError(msg)

        # The same option object is used for both PDF and image inputs.
        format_options: dict[InputFormat, FormatOption] = {
            InputFormat.PDF: pdf_format_option,
            InputFormat.IMAGE: pdf_format_option,
        }

        return DocumentConverter(format_options=format_options)

    try:
        # Check for shutdown before creating converter (can be slow)
        check_shutdown()
        logger.info(f"Initializing {pipeline} pipeline with OCR: {ocr_engine or 'disabled'}")

        converter = _get_converter()

        # Check for shutdown before processing files
        check_shutdown()
        logger.info(f"Starting to process {len(file_paths)} files...")

        # Process files with periodic shutdown checks
        results = []
        for i, file_path in enumerate(file_paths):
            # Check for shutdown before processing each file
            check_shutdown()

            logger.debug(f"Processing file {i + 1}/{len(file_paths)}: {file_path}")

            try:
                # Process single file (we can't easily interrupt convert_all)
                single_result = converter.convert_all([file_path])
                results.extend(single_result)

                # Check for shutdown after each file
                check_shutdown()

            except (OSError, ValueError, RuntimeError, ImportError) as file_error:
                # Handle specific file processing errors
                logger.error(f"Error processing file {file_path}: {file_error}")
                # Continue with other files, but check for shutdown
                check_shutdown()
            except Exception as file_error:  # noqa: BLE001
                # Catch any other unexpected errors to prevent worker crash
                logger.error(f"Unexpected error processing file {file_path}: {file_error}")
                # Continue with other files, but check for shutdown
                check_shutdown()

        # Final shutdown check before sending results
        check_shutdown()

        # Process the results while maintaining the original structure
        processed_data = [
            {"document": res.document, "file_path": str(res.input.file), "status": res.status.name}
            if res.status == ConversionStatus.SUCCESS
            else None
            for res in results
        ]

        logger.info(f"Successfully processed {len([d for d in processed_data if d])} files")
        queue.put(processed_data)

    except KeyboardInterrupt:
        logger.warning("KeyboardInterrupt during processing, exiting gracefully...")
        queue.put({"error": "Worker interrupted during processing", "shutdown": True})
        return
    except Exception as e:  # noqa: BLE001
        if shutdown_requested:
            logger.exception("Exception occurred during shutdown, exiting...")
            return

        # Send any processing error to the main process with traceback
        error_info = {"error": str(e), "traceback": traceback.format_exc()}
        logger.error(f"Error in worker: {error_info}")
        queue.put(error_info)
    finally:
        logger.info("Docling worker finishing...")
        # Ensure we don't leave any hanging processes
        if shutdown_requested:
            logger.debug("Worker shutdown completed")
        else:
            logger.debug("Worker completed normally")
@@ -1,4 +1,9 @@
1
+ import time
2
+ from multiprocessing import Queue, get_context
3
+ from queue import Empty
4
+
1
5
  from langflow.base.data import BaseFileComponent
6
+ from langflow.components.docling import docling_worker
2
7
  from langflow.inputs import DropdownInput
3
8
  from langflow.schema import Data
4
9
 
@@ -69,73 +74,110 @@ class DoclingInlineComponent(BaseFileComponent):
69
74
  *BaseFileComponent._base_outputs,
70
75
  ]
71
76
 
72
- def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:
73
- try:
74
- from docling.datamodel.base_models import ConversionStatus, InputFormat
75
- from docling.datamodel.pipeline_options import (
76
- OcrOptions,
77
- PdfPipelineOptions,
78
- VlmPipelineOptions,
79
- )
80
- from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
81
- from docling.models.factories import get_ocr_factory
82
- from docling.pipeline.vlm_pipeline import VlmPipeline
83
- except ImportError as e:
84
- msg = (
85
- "Docling is not installed. Please install it with `uv pip install docling` or"
86
- " `uv pip install langflow[docling]`."
87
- )
88
- raise ImportError(msg) from e
89
-
90
- # Configure the standard PDF pipeline
91
- def _get_standard_opts() -> PdfPipelineOptions:
92
- pipeline_options = PdfPipelineOptions()
93
- pipeline_options.do_ocr = self.ocr_engine != ""
94
- if pipeline_options.do_ocr:
95
- ocr_factory = get_ocr_factory(
96
- allow_external_plugins=False,
97
- )
98
-
99
- ocr_options: OcrOptions = ocr_factory.create_options(
100
- kind=self.ocr_engine,
101
- )
102
- pipeline_options.ocr_options = ocr_options
103
- return pipeline_options
104
-
105
- # Configure the VLM pipeline
106
- def _get_vlm_opts() -> VlmPipelineOptions:
107
- return VlmPipelineOptions()
108
-
109
- # Configure the main format options and create the DocumentConverter()
110
- def _get_converter() -> DocumentConverter:
111
- if self.pipeline == "standard":
112
- pdf_format_option = PdfFormatOption(
113
- pipeline_options=_get_standard_opts(),
114
- )
115
- elif self.pipeline == "vlm":
116
- pdf_format_option = PdfFormatOption(pipeline_cls=VlmPipeline, pipeline_options=_get_vlm_opts())
117
-
118
- format_options: dict[InputFormat, FormatOption] = {
119
- InputFormat.PDF: pdf_format_option,
120
- InputFormat.IMAGE: pdf_format_option,
121
- }
122
-
123
- return DocumentConverter(format_options=format_options)
77
def _wait_for_result_with_process_monitoring(self, queue: Queue, proc, timeout: int = 300):
    """Wait for the worker's result while watching that the worker is still alive.

    Args:
        queue: Queue the worker puts its result on.
        proc: Worker process handle; only ``is_alive()`` and ``exitcode`` are used.
        timeout: Maximum number of seconds to wait overall.

    Returns:
        The first object received from ``queue``.

    Raises:
        RuntimeError: The process is no longer alive and the queue holds no result.
        TimeoutError: No result arrived within ``timeout`` seconds.
    """
    # A monotonic clock keeps the deadline immune to system clock adjustments.
    deadline = time.monotonic() + timeout

    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break

        # Check if process is still alive
        if not proc.is_alive():
            # Process died, try to get any result it might have sent
            try:
                result = queue.get_nowait()
            except Empty:
                # Process died without sending result
                msg = f"Worker process crashed unexpectedly without producing result. Exit code: {proc.exitcode}"
                raise RuntimeError(msg) from None
            self.log("Process completed and result retrieved")
            return result

        # Poll the queue instead of blocking, never waiting past the overall deadline
        try:
            result = queue.get(timeout=min(1.0, remaining))
        except Empty:
            # No result yet, continue monitoring
            continue
        self.log("Result received from worker process")
        return result

    # Overall timeout reached
    msg = f"Process timed out after {timeout} seconds"
    raise TimeoutError(msg)
111
+
112
def _terminate_process_gracefully(self, proc, timeout_terminate: int = 10, timeout_kill: int = 5):
    """Stop ``proc``, escalating from ``terminate()`` to ``kill()`` when needed.

    Does nothing when the process is not alive. Otherwise calls ``terminate()``
    and joins for up to ``timeout_terminate`` seconds; if the process is still
    alive it calls ``kill()`` and joins for up to ``timeout_kill`` seconds. A
    process that is still alive after that is only reported through ``self.log``.
    """
    if not proc.is_alive():
        return

    # Step 1: polite request.
    self.log("Attempting graceful process termination with SIGTERM")
    proc.terminate()  # Send SIGTERM
    proc.join(timeout=timeout_terminate)
    if not proc.is_alive():
        return

    # Step 2: forceful stop.
    self.log("Process didn't respond to SIGTERM, using SIGKILL")
    proc.kill()  # Send SIGKILL
    proc.join(timeout=timeout_kill)

    survived_kill = proc.is_alive()
    if survived_kill:
        self.log("Warning: Process still alive after SIGKILL")
124
131
 
132
def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:
    """Convert the given files with Docling in a spawned worker process.

    The paths of ``file_list`` are handed to ``docling_worker`` together with
    ``self.pipeline`` and ``self.ocr_engine``; the worker's answer is read from a
    queue, turned into ``Data`` objects and merged via ``self.rollup_data``.

    Returns:
        ``file_list`` unchanged when no file has a path, otherwise the result of
        ``self.rollup_data``. A cancelled run is rolled up with an empty data list.

    Raises:
        ImportError: The worker reported that Docling is not installed.
        RuntimeError: The worker reported any other error, or died without a result.
        TimeoutError: The worker did not answer within the wait timeout.
    """
    file_paths = [file.path for file in file_list if file.path]

    if not file_paths:
        self.log("No files to process.")
        return file_list

    # "spawn" starts the worker in a fresh interpreter instead of forking this one.
    ctx = get_context("spawn")
    queue: Queue = ctx.Queue()
    proc = ctx.Process(
        target=docling_worker,
        args=(file_paths, queue, self.pipeline, self.ocr_engine),
    )

    result = None
    proc.start()

    try:
        result = self._wait_for_result_with_process_monitoring(queue, proc, timeout=300)
    except KeyboardInterrupt:
        self.log("Docling process cancelled by user")
        result = []
    except Exception as e:
        self.log(f"Error during processing: {e}")
        raise
    finally:
        # Improved cleanup with graceful termination
        try:
            self._terminate_process_gracefully(proc)
        finally:
            # Always close and cleanup queue resources
            try:
                queue.close()
                queue.join_thread()
            except Exception as e:  # noqa: BLE001
                # Ignore cleanup errors, but log them
                self.log(f"Warning: Error during queue cleanup - {e}")

    # Check if there was an error in the worker
    if isinstance(result, dict) and "error" in result:
        msg = result["error"]
        # The worker words its missing-dependency message as "Docling is an optional
        # dependency ..."; the older "Docling is not installed" prefix is kept as well
        # so either wording is surfaced as ImportError.
        missing_dependency_prefixes = ("Docling is not installed", "Docling is an optional dependency")
        if msg.startswith(missing_dependency_prefixes):
            raise ImportError(msg)
        # Handle interrupt gracefully - return empty result instead of raising error
        if "Worker interrupted by SIGINT" in msg or "shutdown" in result:
            self.log("Docling process cancelled by user")
            result = []
        else:
            raise RuntimeError(msg)

    # Entries are None for files the worker did not convert successfully.
    processed_data = [Data(data={"doc": r["document"], "file_path": r["file_path"]}) if r else None for r in result]
    return self.rollup_data(file_list, processed_data)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langflow-base-nightly
3
- Version: 0.5.0.dev35
3
+ Version: 0.5.0.dev36
4
4
  Summary: A Python package with a built-in web application
5
5
  Project-URL: Repository, https://github.com/langflow-ai/langflow
6
6
  Project-URL: Documentation, https://docs.langflow.org
@@ -313,9 +313,9 @@ langflow/components/deactivated/vectara_self_query.py,sha256=4O1jCCBLxTwzr1HUOwJ
313
313
  langflow/components/deactivated/vector_store.py,sha256=vLBqJ99SwXfWO3BrHnL-DEh0mnJ9IXWl43hiM6I-d7Y,743
314
314
  langflow/components/deepseek/__init__.py,sha256=KJHElyBgRcJIGoejJV-MSwSQ-fz3ZbVCVNbi-vs63nc,940
315
315
  langflow/components/deepseek/deepseek.py,sha256=VJo3tfF8lOoAmhzzv3CeRUFb2zDZG18-BokpAR-GMYU,4717
316
- langflow/components/docling/__init__.py,sha256=MxEEA_bDfWEQ630yj-K0cwkRNd9UVFDKeEJ9HXhRghg,1429
316
+ langflow/components/docling/__init__.py,sha256=fYTURN1YnJtmcVC3eX25-sA9pn-_qOtU5O6XhNOCRBA,9174
317
317
  langflow/components/docling/chunk_docling_document.py,sha256=Y8JVHza3uUiANkpoGkmvULKG47So7R_0h5ogtc3KA4E,7620
318
- langflow/components/docling/docling_inline.py,sha256=SShARkFo6Emf9T5fjBV3iUI3BWOIYWERdWtGRGDPnR4,4564
318
+ langflow/components/docling/docling_inline.py,sha256=5e8Er9Mq_Hr-mORU3M67Isr_WOaqHYFYZxX0qBSSUoE,6080
319
319
  langflow/components/docling/docling_remote.py,sha256=iAU4hgQxklYr_3OECuoXI08iQ_MvJ22JC2LrGVU0vwQ,6810
320
320
  langflow/components/docling/export_docling_document.py,sha256=RRyWc71MpzlI7Tx1mW4XMn9lKkQCqlIh_fPCkeAAbpE,4701
321
321
  langflow/components/documentloaders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -1129,7 +1129,7 @@ langflow/utils/util_strings.py,sha256=Blz5lwvE7lml7nKCG9vVJ6me5VNmVtYzFXDVPHPK7v
1129
1129
  langflow/utils/validate.py,sha256=8RnY61LZFCBU1HIlPDCMI3vsXOmK_IFAYBGZIfZJcsU,16362
1130
1130
  langflow/utils/version.py,sha256=OjSj0smls9XnPd4-LpTH9AWyUO_NAn5mncqKkkXl_fw,2840
1131
1131
  langflow/utils/voice_utils.py,sha256=pzU6uuseI2_5mi-yXzFIjMavVRFyuVrpLmR6LqbF7mE,3346
1132
- langflow_base_nightly-0.5.0.dev35.dist-info/METADATA,sha256=eOTE0uH1t1a7TDnxA9KuOxZO5bIO9dptiAqdIG2dDNo,4212
1133
- langflow_base_nightly-0.5.0.dev35.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
1134
- langflow_base_nightly-0.5.0.dev35.dist-info/entry_points.txt,sha256=JvuLdXSrkeDmDdpb8M-VvFIzb84n4HmqUcIP10_EIF8,57
1135
- langflow_base_nightly-0.5.0.dev35.dist-info/RECORD,,
1132
+ langflow_base_nightly-0.5.0.dev36.dist-info/METADATA,sha256=FST1eF7horVqutiEmlhKW2RUm0QRgQve8nL3gMj9E0o,4212
1133
+ langflow_base_nightly-0.5.0.dev36.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
1134
+ langflow_base_nightly-0.5.0.dev36.dist-info/entry_points.txt,sha256=JvuLdXSrkeDmDdpb8M-VvFIzb84n4HmqUcIP10_EIF8,57
1135
+ langflow_base_nightly-0.5.0.dev36.dist-info/RECORD,,