lfx-nightly 0.1.12.dev0__py3-none-any.whl → 0.1.12.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lfx/base/data/docling_utils.py +59 -1
- lfx/components/docling/docling_inline.py +36 -43
- {lfx_nightly-0.1.12.dev0.dist-info → lfx_nightly-0.1.12.dev1.dist-info}/METADATA +1 -1
- {lfx_nightly-0.1.12.dev0.dist-info → lfx_nightly-0.1.12.dev1.dist-info}/RECORD +6 -6
- {lfx_nightly-0.1.12.dev0.dist-info → lfx_nightly-0.1.12.dev1.dist-info}/WHEEL +0 -0
- {lfx_nightly-0.1.12.dev0.dist-info → lfx_nightly-0.1.12.dev1.dist-info}/entry_points.txt +0 -0
lfx/base/data/docling_utils.py
CHANGED
@@ -1,14 +1,20 @@
|
|
1
|
+
import importlib
|
1
2
|
import signal
|
2
3
|
import sys
|
3
4
|
import traceback
|
4
5
|
from contextlib import suppress
|
6
|
+
from typing import TYPE_CHECKING
|
5
7
|
|
6
8
|
from docling_core.types.doc import DoclingDocument
|
9
|
+
from pydantic import BaseModel, SecretStr, TypeAdapter
|
7
10
|
|
8
11
|
from lfx.log.logger import logger
|
9
12
|
from lfx.schema.data import Data
|
10
13
|
from lfx.schema.dataframe import DataFrame
|
11
14
|
|
15
|
+
if TYPE_CHECKING:
|
16
|
+
from langchain_core.language_models.chat_models import BaseChatModel
|
17
|
+
|
12
18
|
|
13
19
|
def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_key: str) -> list[DoclingDocument]:
|
14
20
|
documents: list[DoclingDocument] = []
|
@@ -57,7 +63,45 @@ def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_ke
|
|
57
63
|
return documents
|
58
64
|
|
59
65
|
|
60
|
-
def docling_worker(file_paths: list[str], queue, pipeline: str, ocr_engine: str):
|
66
|
+
def _unwrap_secrets(obj):
|
67
|
+
if isinstance(obj, SecretStr):
|
68
|
+
return obj.get_secret_value()
|
69
|
+
if isinstance(obj, dict):
|
70
|
+
return {k: _unwrap_secrets(v) for k, v in obj.items()}
|
71
|
+
if isinstance(obj, list):
|
72
|
+
return [_unwrap_secrets(v) for v in obj]
|
73
|
+
return obj
|
74
|
+
|
75
|
+
|
76
|
+
def _dump_with_secrets(model: BaseModel):
|
77
|
+
return _unwrap_secrets(model.model_dump(mode="python", round_trip=True))
|
78
|
+
|
79
|
+
|
80
|
+
def _serialize_pydantic_model(model: BaseModel):
|
81
|
+
return {
|
82
|
+
"__class_path__": f"{model.__class__.__module__}.{model.__class__.__name__}",
|
83
|
+
"config": _dump_with_secrets(model),
|
84
|
+
}
|
85
|
+
|
86
|
+
|
87
|
+
def _deserialize_pydantic_model(data: dict):
|
88
|
+
module_name, class_name = data["__class_path__"].rsplit(".", 1)
|
89
|
+
module = importlib.import_module(module_name)
|
90
|
+
cls = getattr(module, class_name)
|
91
|
+
adapter = TypeAdapter(cls)
|
92
|
+
return adapter.validate_python(data["config"])
|
93
|
+
|
94
|
+
|
95
|
+
def docling_worker(
|
96
|
+
*,
|
97
|
+
file_paths: list[str],
|
98
|
+
queue,
|
99
|
+
pipeline: str,
|
100
|
+
ocr_engine: str,
|
101
|
+
do_picture_classification: bool,
|
102
|
+
pic_desc_config: dict | None,
|
103
|
+
pic_desc_prompt: str,
|
104
|
+
):
|
61
105
|
"""Worker function for processing files with Docling in a separate process."""
|
62
106
|
# Signal handling for graceful shutdown
|
63
107
|
shutdown_requested = False
|
@@ -106,6 +150,7 @@ def docling_worker(file_paths: list[str], queue, pipeline: str, ocr_engine: str)
|
|
106
150
|
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
107
151
|
from docling.models.factories import get_ocr_factory
|
108
152
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
153
|
+
from langchain_docling.picture_description import PictureDescriptionLangChainOptions
|
109
154
|
|
110
155
|
# Check for shutdown after imports
|
111
156
|
check_shutdown()
|
@@ -143,6 +188,19 @@ def docling_worker(file_paths: list[str], queue, pipeline: str, ocr_engine: str)
|
|
143
188
|
kind=ocr_engine,
|
144
189
|
)
|
145
190
|
pipeline_options.ocr_options = ocr_options
|
191
|
+
|
192
|
+
pipeline_options.do_picture_classification = do_picture_classification
|
193
|
+
|
194
|
+
if pic_desc_config:
|
195
|
+
pic_desc_llm: BaseChatModel = _deserialize_pydantic_model(pic_desc_config)
|
196
|
+
|
197
|
+
logger.info("Docling enabling the picture description stage.")
|
198
|
+
pipeline_options.do_picture_description = True
|
199
|
+
pipeline_options.allow_external_plugins = True
|
200
|
+
pipeline_options.picture_description_options = PictureDescriptionLangChainOptions(
|
201
|
+
llm=pic_desc_llm,
|
202
|
+
prompt=pic_desc_prompt,
|
203
|
+
)
|
146
204
|
return pipeline_options
|
147
205
|
|
148
206
|
# Configure the VLM pipeline
|
@@ -3,8 +3,8 @@ from multiprocessing import Queue, get_context
|
|
3
3
|
from queue import Empty
|
4
4
|
|
5
5
|
from lfx.base.data import BaseFileComponent
|
6
|
-
from lfx.base.data.docling_utils import docling_worker
|
7
|
-
from lfx.inputs import DropdownInput
|
6
|
+
from lfx.base.data.docling_utils import _serialize_pydantic_model, docling_worker
|
7
|
+
from lfx.inputs import BoolInput, DropdownInput, HandleInput, StrInput
|
8
8
|
from lfx.schema import Data
|
9
9
|
|
10
10
|
|
@@ -67,6 +67,26 @@ class DoclingInlineComponent(BaseFileComponent):
|
|
67
67
|
real_time_refresh=False,
|
68
68
|
value="None",
|
69
69
|
),
|
70
|
+
BoolInput(
|
71
|
+
name="do_picture_classification",
|
72
|
+
display_name="Picture classification",
|
73
|
+
info="If enabled, the Docling pipeline will classify the pictures type.",
|
74
|
+
value=False,
|
75
|
+
),
|
76
|
+
HandleInput(
|
77
|
+
name="pic_desc_llm",
|
78
|
+
display_name="Picture description LLM",
|
79
|
+
info="If connected, the model to use for running the picture description task.",
|
80
|
+
input_types=["LanguageModel"],
|
81
|
+
required=False,
|
82
|
+
),
|
83
|
+
StrInput(
|
84
|
+
name="pic_desc_prompt",
|
85
|
+
display_name="Picture description prompt",
|
86
|
+
value="Describe the image in three sentences. Be concise and accurate.",
|
87
|
+
info="The user prompt to use when invoking the model.",
|
88
|
+
advanced=True,
|
89
|
+
),
|
70
90
|
# TODO: expose more Docling options
|
71
91
|
]
|
72
92
|
|
@@ -131,11 +151,7 @@ class DoclingInlineComponent(BaseFileComponent):
|
|
131
151
|
|
132
152
|
def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:
|
133
153
|
try:
|
134
|
-
from docling.datamodel.base_models import InputFormat
|
135
|
-
from docling.datamodel.pipeline_options import OcrOptions, PdfPipelineOptions, VlmPipelineOptions
|
136
|
-
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
137
|
-
from docling.models.factories import get_ocr_factory
|
138
|
-
from docling.pipeline.vlm_pipeline import VlmPipeline
|
154
|
+
from docling.document_converter import DocumentConverter # noqa: F401
|
139
155
|
except ImportError as e:
|
140
156
|
msg = (
|
141
157
|
"Docling is an optional dependency. Install with `uv pip install 'langflow[docling]'` or refer to the "
|
@@ -143,52 +159,29 @@ class DoclingInlineComponent(BaseFileComponent):
|
|
143
159
|
)
|
144
160
|
raise ImportError(msg) from e
|
145
161
|
|
146
|
-
# Configure the standard PDF pipeline
|
147
|
-
def _get_standard_opts() -> PdfPipelineOptions:
|
148
|
-
pipeline_options = PdfPipelineOptions()
|
149
|
-
pipeline_options.do_ocr = self.ocr_engine != "None"
|
150
|
-
if pipeline_options.do_ocr:
|
151
|
-
ocr_factory = get_ocr_factory(
|
152
|
-
allow_external_plugins=False,
|
153
|
-
)
|
154
|
-
|
155
|
-
ocr_options: OcrOptions = ocr_factory.create_options(
|
156
|
-
kind=self.ocr_engine,
|
157
|
-
)
|
158
|
-
pipeline_options.ocr_options = ocr_options
|
159
|
-
return pipeline_options
|
160
|
-
|
161
|
-
# Configure the VLM pipeline
|
162
|
-
def _get_vlm_opts() -> VlmPipelineOptions:
|
163
|
-
return VlmPipelineOptions()
|
164
|
-
|
165
|
-
# Configure the main format options and create the DocumentConverter()
|
166
|
-
def _get_converter() -> DocumentConverter:
|
167
|
-
if self.pipeline == "standard":
|
168
|
-
pdf_format_option = PdfFormatOption(
|
169
|
-
pipeline_options=_get_standard_opts(),
|
170
|
-
)
|
171
|
-
elif self.pipeline == "vlm":
|
172
|
-
pdf_format_option = PdfFormatOption(pipeline_cls=VlmPipeline, pipeline_options=_get_vlm_opts())
|
173
|
-
|
174
|
-
format_options: dict[InputFormat, FormatOption] = {
|
175
|
-
InputFormat.PDF: pdf_format_option,
|
176
|
-
InputFormat.IMAGE: pdf_format_option,
|
177
|
-
}
|
178
|
-
|
179
|
-
return DocumentConverter(format_options=format_options)
|
180
|
-
|
181
162
|
file_paths = [file.path for file in file_list if file.path]
|
182
163
|
|
183
164
|
if not file_paths:
|
184
165
|
self.log("No files to process.")
|
185
166
|
return file_list
|
186
167
|
|
168
|
+
pic_desc_config: dict | None = None
|
169
|
+
if self.pic_desc_llm is not None:
|
170
|
+
pic_desc_config = _serialize_pydantic_model(self.pic_desc_llm)
|
171
|
+
|
187
172
|
ctx = get_context("spawn")
|
188
173
|
queue: Queue = ctx.Queue()
|
189
174
|
proc = ctx.Process(
|
190
175
|
target=docling_worker,
|
191
|
-
|
176
|
+
kwargs={
|
177
|
+
"file_paths": file_paths,
|
178
|
+
"queue": queue,
|
179
|
+
"pipeline": self.pipeline,
|
180
|
+
"ocr_engine": self.ocr_engine,
|
181
|
+
"do_picture_classification": self.do_picture_classification,
|
182
|
+
"pic_desc_config": pic_desc_config,
|
183
|
+
"pic_desc_prompt": self.pic_desc_prompt,
|
184
|
+
},
|
192
185
|
)
|
193
186
|
|
194
187
|
result = None
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: lfx-nightly
|
3
|
-
Version: 0.1.12.dev0
|
3
|
+
Version: 0.1.12.dev1
|
4
4
|
Summary: Langflow Executor - A lightweight CLI tool for executing and serving Langflow AI flows
|
5
5
|
Author-email: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
|
6
6
|
Requires-Python: <3.14,>=3.10
|
@@ -29,7 +29,7 @@ lfx/base/curl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
29
29
|
lfx/base/curl/parse.py,sha256=Yw6mMbGg7e-ffrBItEUJeTiljneCXlNyt5afzEP9eUI,6094
|
30
30
|
lfx/base/data/__init__.py,sha256=lQsYYMyAg_jA9ZF7oc-LNZsRE2uMGT6g16WzsUByHqs,81
|
31
31
|
lfx/base/data/base_file.py,sha256=XFj3u9OGHcRbWfzslzvvxn-qpaCeX0uUQ0fStUCo65I,25495
|
32
|
-
lfx/base/data/docling_utils.py,sha256=
|
32
|
+
lfx/base/data/docling_utils.py,sha256=i0KpNNLgPJ0D226Tm5j_oaCv09w9IspBU2OwTDCfnBc,11625
|
33
33
|
lfx/base/data/utils.py,sha256=eZJgkOvQ3MaURDfgkH2MiZZOBF5_D0nSlmDY6LgLRik,5960
|
34
34
|
lfx/base/document_transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
35
35
|
lfx/base/document_transformers/model.py,sha256=etVEmyakiEgflB-fayClPnFRhaEdXfdUu4cqpgtk8ek,1317
|
@@ -242,7 +242,7 @@ lfx/components/deepseek/__init__.py,sha256=gmyOcLeNEcnwSeowow0N0UhBDlSuZ_8x-DMUj
|
|
242
242
|
lfx/components/deepseek/deepseek.py,sha256=yNrHoljXOMScKng-oSB-ceWhVZeuh11lmrAY7WiB2H0,4702
|
243
243
|
lfx/components/docling/__init__.py,sha256=O4utz9GHFpTVe_Wy0PR80yA1irJQRnAFQWkoLCVj888,1424
|
244
244
|
lfx/components/docling/chunk_docling_document.py,sha256=OX-jj4nX3UZgopViMAGAnFgtLql0sgs6cVmU8p9QbqA,7600
|
245
|
-
lfx/components/docling/docling_inline.py,sha256
|
245
|
+
lfx/components/docling/docling_inline.py,sha256=-m8hTANtdUDUjsJtJTB1sl6MJMhXG8zMeBMwbn0w9Ig,7871
|
246
246
|
lfx/components/docling/docling_remote.py,sha256=kwMS_-QMiM_JmPqvtHf4gDS73d2hZrIbtAPsN8bZxGE,6769
|
247
247
|
lfx/components/docling/export_docling_document.py,sha256=TeFt3TesCxSqW57nv-30gf2dX8qMDUHLRhwU-1ciq08,4681
|
248
248
|
lfx/components/documentloaders/__init__.py,sha256=LNl2hG2InevQCUREFKhF9ylaTf_kwPsdjiDbx2ElX3M,69
|
@@ -693,7 +693,7 @@ lfx/utils/schemas.py,sha256=NbOtVQBrn4d0BAu-0H_eCTZI2CXkKZlRY37XCSmuJwc,3865
|
|
693
693
|
lfx/utils/util.py,sha256=xGR32XDRr_TtruhjnXfI7lEWmk-vgywHAy3kz5SBowc,15725
|
694
694
|
lfx/utils/util_strings.py,sha256=nU_IcdphNaj6bAPbjeL-c1cInQPfTBit8mp5Y57lwQk,1686
|
695
695
|
lfx/utils/version.py,sha256=cHpbO0OJD2JQAvVaTH_6ibYeFbHJV0QDHs_YXXZ-bT8,671
|
696
|
-
lfx_nightly-0.1.12.
|
697
|
-
lfx_nightly-0.1.12.
|
698
|
-
lfx_nightly-0.1.12.
|
699
|
-
lfx_nightly-0.1.12.
|
696
|
+
lfx_nightly-0.1.12.dev1.dist-info/METADATA,sha256=eMZwEM_BySUNrUL6AE3XpjsO-k1I1zchBEvtSHaZF4M,8000
|
697
|
+
lfx_nightly-0.1.12.dev1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
698
|
+
lfx_nightly-0.1.12.dev1.dist-info/entry_points.txt,sha256=1724p3RHDQRT2CKx_QRzEIa7sFuSVO0Ux70YfXfoMT4,42
|
699
|
+
lfx_nightly-0.1.12.dev1.dist-info/RECORD,,
|
File without changes
|
File without changes
|