lfx-docling 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lfx_docling/__init__.py +13 -0
- lfx_docling/components/docling/__init__.py +11 -0
- lfx_docling/components/docling/chunk_docling_document.py +224 -0
- lfx_docling/components/docling/docling_inline.py +350 -0
- lfx_docling/components/docling/docling_remote.py +353 -0
- lfx_docling/components/docling/export_docling_document.py +137 -0
- lfx_docling/extension.json +16 -0
- lfx_docling-0.1.0.dist-info/METADATA +77 -0
- lfx_docling-0.1.0.dist-info/RECORD +11 -0
- lfx_docling-0.1.0.dist-info/WHEEL +4 -0
- lfx_docling-0.1.0.dist-info/entry_points.txt +2 -0
lfx_docling/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""lfx-docling: Docling document processing components."""
|
|
2
|
+
|
|
3
|
+
from lfx_docling.components.docling.chunk_docling_document import ChunkDoclingDocumentComponent
|
|
4
|
+
from lfx_docling.components.docling.docling_inline import DoclingInlineComponent
|
|
5
|
+
from lfx_docling.components.docling.docling_remote import DoclingRemoteComponent
|
|
6
|
+
from lfx_docling.components.docling.export_docling_document import ExportDoclingDocumentComponent
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"ChunkDoclingDocumentComponent",
|
|
10
|
+
"DoclingInlineComponent",
|
|
11
|
+
"DoclingRemoteComponent",
|
|
12
|
+
"ExportDoclingDocumentComponent",
|
|
13
|
+
]
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from .chunk_docling_document import ChunkDoclingDocumentComponent
|
|
2
|
+
from .docling_inline import DoclingInlineComponent
|
|
3
|
+
from .docling_remote import DoclingRemoteComponent
|
|
4
|
+
from .export_docling_document import ExportDoclingDocumentComponent
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"ChunkDoclingDocumentComponent",
|
|
8
|
+
"DoclingInlineComponent",
|
|
9
|
+
"DoclingRemoteComponent",
|
|
10
|
+
"ExportDoclingDocumentComponent",
|
|
11
|
+
]
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from lfx.base.data.docling_utils import extract_docling_documents
|
|
5
|
+
from lfx.custom import Component
|
|
6
|
+
from lfx.io import BoolInput, DropdownInput, HandleInput, IntInput, MessageTextInput, Output, StrInput
|
|
7
|
+
from lfx.schema import Data, DataFrame
|
|
8
|
+
|
|
9
|
+
_CHUNKING_INSTALL_HINT = (
|
|
10
|
+
"Install them with `uv pip install 'lfx-docling[chunking]'`, "
|
|
11
|
+
"`uv pip install 'langflow[docling-chunking]'`, or "
|
|
12
|
+
"`uv pip install 'docling-core[chunking]' tiktoken`."
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _load_docling_chunker_dependencies() -> tuple[type[Any], type[Any]]:
|
|
17
|
+
try:
|
|
18
|
+
from docling_core.transforms.chunker.doc_chunk import DocMeta as DocMetaCls
|
|
19
|
+
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker as HierarchicalChunkerCls
|
|
20
|
+
except (ImportError, RuntimeError) as e:
|
|
21
|
+
msg = f"Docling chunking dependencies are not installed. {_CHUNKING_INSTALL_HINT}"
|
|
22
|
+
raise ImportError(msg) from e
|
|
23
|
+
return DocMetaCls, HierarchicalChunkerCls
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ChunkDoclingDocumentComponent(Component):
    """Component that splits DoclingDocuments into chunks.

    The documents are extracted from ``data_inputs`` (looked up under
    ``doc_key``) and run through either the ``HybridChunker`` or the
    ``HierarchicalChunker``. The single output is a ``DataFrame`` with one
    row per chunk.
    """

    display_name: str = "Chunk DoclingDocument"
    description: str = "Use DoclingDocument chunkers to split the document into chunks."
    documentation = "https://docling-project.github.io/docling/concepts/chunking/"
    icon = "Docling"
    name = "ChunkDoclingDocument"

    inputs = [
        HandleInput(
            name="data_inputs",
            display_name="JSON or Table",
            info="The data with documents to split in chunks.",
            input_types=["Data", "JSON", "DataFrame", "Table"],
            required=True,
        ),
        DropdownInput(
            name="chunker",
            display_name="Chunker",
            options=["HybridChunker", "HierarchicalChunker"],
            info=("Which chunker to use."),
            value="HybridChunker",
            real_time_refresh=True,
            input_types=["Message"],
        ),
        DropdownInput(
            name="provider",
            display_name="Provider",
            options=["Hugging Face", "OpenAI"],
            info=("Which tokenizer provider."),
            value="Hugging Face",
            show=True,
            real_time_refresh=True,
            advanced=True,
            dynamic=True,
        ),
        StrInput(
            name="hf_model_name",
            display_name="HF model name",
            info=(
                "Model name of the tokenizer to use with the HybridChunker when Hugging Face is chosen as a tokenizer."
            ),
            value="sentence-transformers/all-MiniLM-L6-v2",
            show=True,
            advanced=True,
            dynamic=True,
        ),
        StrInput(
            name="openai_model_name",
            display_name="OpenAI model name",
            info=("Model name of the tokenizer to use with the HybridChunker when OpenAI is chosen as a tokenizer."),
            value="gpt-4o",
            show=False,
            advanced=True,
            dynamic=True,
        ),
        IntInput(
            name="max_tokens",
            display_name="Maximum tokens",
            info=("Maximum number of tokens for the HybridChunker."),
            show=True,
            required=False,
            advanced=True,
            dynamic=True,
            input_types=["Message"],
        ),
        BoolInput(
            name="merge_peers",
            display_name="Merge peers",
            info="Merge undersized chunks sharing the same relevant metadata.",
            value=True,
            show=True,
            advanced=True,
            dynamic=True,
        ),
        BoolInput(
            name="always_emit_headings",
            display_name="Always emit headings",
            info="Emit headings even for empty sections.",
            value=False,
            show=True,
            advanced=True,
            dynamic=True,
        ),
        MessageTextInput(
            name="doc_key",
            display_name="Doc Key",
            info="The key to use for the DoclingDocument column.",
            value="doc",
            advanced=True,
        ),
    ]

    outputs = [
        Output(display_name="Table", name="dataframe", method="chunk_documents"),
    ]

    def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:
        """Show or hide the tokenizer fields to match the chunker/provider selection.

        Args:
            build_config: Mapping of field name to field config; the ``show``
                entries of the tokenizer-related fields are updated in place.
            field_value: The new value of the field that changed.
            field_name: Name of the field that changed. Only ``"chunker"`` and
                ``"provider"`` have an effect.

        Returns:
            The same ``build_config`` object.
        """
        if field_name == "chunker":
            # Every tokenizer option only applies to the HybridChunker.
            is_hybrid = field_value == "HybridChunker"
            provider_type = build_config["provider"]["value"]
            build_config["provider"]["show"] = is_hybrid
            build_config["hf_model_name"]["show"] = is_hybrid and provider_type == "Hugging Face"
            build_config["openai_model_name"]["show"] = is_hybrid and provider_type == "OpenAI"
            for option in ("max_tokens", "merge_peers", "always_emit_headings"):
                build_config[option]["show"] = is_hybrid
        elif field_name == "provider" and build_config["chunker"]["value"] == "HybridChunker":
            # Only a known provider flips the model-name fields; anything else
            # leaves the current visibility untouched.
            if field_value in ("Hugging Face", "OpenAI"):
                build_config["hf_model_name"]["show"] = field_value == "Hugging Face"
                build_config["openai_model_name"]["show"] = field_value == "OpenAI"

        return build_config

    def _docs_to_data(self, docs) -> list[Data]:
        """Wrap objects exposing ``page_content`` and ``metadata`` into ``Data`` items."""
        return [Data(text=doc.page_content, data=doc.metadata) for doc in docs]

    def _build_hybrid_chunker(self) -> Any:
        """Create a ``HybridChunker`` configured with the selected tokenizer.

        Returns:
            A ``HybridChunker`` instance.

        Raises:
            ImportError: If the chunker or the tokenizer backend for the
                selected provider cannot be imported.
            ValueError: If ``self.provider`` is not a supported provider.
        """
        try:
            from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
        except (ImportError, RuntimeError) as e:
            msg = f"HybridChunker is not installed. {_CHUNKING_INSTALL_HINT}"
            raise ImportError(msg) from e

        # A falsy value (unset, 0) means "no explicit limit".
        max_tokens: int | None = self.max_tokens or None
        tokenizer: Any
        if self.provider == "Hugging Face":
            try:
                from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
            except (ImportError, RuntimeError) as e:
                msg = f"HuggingFaceTokenizer is not installed. {_CHUNKING_INSTALL_HINT}"
                raise ImportError(msg) from e
            tokenizer = HuggingFaceTokenizer.from_pretrained(
                model_name=self.hf_model_name,
                max_tokens=max_tokens,
            )
        elif self.provider == "OpenAI":
            try:
                import tiktoken
                from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
            except (ImportError, RuntimeError) as e:
                msg = f"OpenAITokenizer is not installed. {_CHUNKING_INSTALL_HINT}"
                raise ImportError(msg) from e
            if max_tokens is None:
                max_tokens = 128 * 1024  # context window length required for OpenAI tokenizers
            tokenizer = OpenAITokenizer(
                tokenizer=tiktoken.encoding_for_model(self.openai_model_name), max_tokens=max_tokens
            )
        else:
            # Without this branch ``tokenizer`` would be unbound below and the
            # user would get an opaque UnboundLocalError instead.
            msg = f"Unknown tokenizer provider: {self.provider}"
            raise ValueError(msg)

        return HybridChunker(
            tokenizer=tokenizer,
            merge_peers=bool(self.merge_peers),
            always_emit_headings=bool(self.always_emit_headings),
        )

    def chunk_documents(self) -> DataFrame:
        """Chunk every input document and return the chunks as a table.

        Returns:
            A ``DataFrame`` with one row per chunk holding ``text`` (the
            contextualized chunk text), ``document_id`` (the binary hash of
            the document origin) and ``doc_items`` (a JSON list of the
            ``self_ref`` of each item in the chunk).

        Raises:
            ImportError: If the chunking dependencies are not installed.
            ValueError: If the selected chunker or tokenizer provider is unknown.
            TypeError: If chunking any document fails; the original exception
                is chained as the cause.
        """
        documents, warning = extract_docling_documents(self.data_inputs, self.doc_key)
        if warning:
            self.status = warning

        doc_meta_cls, hierarchical_chunker_cls = _load_docling_chunker_dependencies()
        chunker: Any
        if self.chunker == "HybridChunker":
            chunker = self._build_hybrid_chunker()
        elif self.chunker == "HierarchicalChunker":
            chunker = hierarchical_chunker_cls()
        else:
            msg = f"Unknown chunker: {self.chunker}"
            raise ValueError(msg)

        results: list[Data] = []
        try:
            for doc in documents:
                for chunk in chunker.chunk(dl_doc=doc):
                    enriched_text = chunker.contextualize(chunk=chunk)
                    meta = doc_meta_cls.model_validate(chunk.meta)

                    results.append(
                        Data(
                            data={
                                "text": enriched_text,
                                "document_id": f"{doc.origin.binary_hash}",
                                "doc_items": json.dumps([item.self_ref for item in meta.doc_items]),
                            }
                        )
                    )

        except Exception as e:
            # Callers see a single exception type for any chunking failure.
            msg = f"Error splitting text: {e}"
            raise TypeError(msg) from e

        return DataFrame(results)
|
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import subprocess
|
|
3
|
+
import sys
|
|
4
|
+
import textwrap
|
|
5
|
+
import time
|
|
6
|
+
|
|
7
|
+
from lfx.base.data import BaseFileComponent
|
|
8
|
+
from lfx.base.data.docling_utils import _serialize_pydantic_model
|
|
9
|
+
from lfx.inputs import BoolInput, DropdownInput, HandleInput, StrInput
|
|
10
|
+
from lfx.schema import Data, DataFrame
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DoclingInlineComponent(BaseFileComponent):
    """File component that converts documents with Docling in a child process.

    The Docling conversion itself runs in a separate Python interpreter that
    executes ``_CHILD_SCRIPT``; this class only prepares the configuration,
    supervises the child and turns its JSON output back into ``Data`` items.
    """

    display_name = "Docling"
    description = "Uses Docling to process input documents running the Docling models locally."
    documentation = "https://docling-project.github.io/docling/"
    trace_type = "tool"
    icon = "Docling"
    name = "DoclingInline"

    # https://docling-project.github.io/docling/usage/supported_formats/
    # Kept in sync with the DoclingRemoteComponent list (which includes "jpg").
    VALID_EXTENSIONS = [
        "adoc",
        "asciidoc",
        "asc",
        "bmp",
        "csv",
        "dotx",
        "dotm",
        "docm",
        "docx",
        "htm",
        "html",
        "jpeg",
        "jpg",
        "json",
        "md",
        "pdf",
        "png",
        "potx",
        "ppsx",
        "pptm",
        "potm",
        "ppsm",
        "pptx",
        "tiff",
        "txt",
        "xls",
        "xlsx",
        "xhtml",
        "xml",
        "webp",
    ]

    inputs = [
        *BaseFileComponent.get_base_inputs(),
        DropdownInput(
            name="pipeline",
            display_name="Pipeline",
            info="Docling pipeline to use",
            options=["standard", "vlm"],
            value="standard",
        ),
        DropdownInput(
            name="ocr_engine",
            display_name="OCR Engine",
            info="OCR engine to use. None will disable OCR.",
            options=["None", "easyocr", "tesserocr", "rapidocr", "ocrmac"],
            value="None",
        ),
        BoolInput(
            name="do_picture_classification",
            display_name="Picture classification",
            info="If enabled, the Docling pipeline will classify the pictures type.",
            value=False,
        ),
        HandleInput(
            name="pic_desc_llm",
            display_name="Picture description LLM",
            info="If connected, the model to use for running the picture description task.",
            input_types=["LanguageModel"],
            required=False,
        ),
        StrInput(
            name="pic_desc_prompt",
            display_name="Picture description prompt",
            value="Describe the image in three sentences. Be concise and accurate.",
            info="The user prompt to use when invoking the model.",
            advanced=True,
        ),
        # TODO: expose more Docling options
    ]

    outputs = [
        *BaseFileComponent.get_base_outputs(),
    ]

    def build(self) -> DataFrame:
        """Run the component by delegating to ``load_files``."""
        # Static bundle validation cannot see BaseFileComponent's inherited output method.
        return self.load_files()

    # ------------------------------------------------------------------ #
    # Child script that runs Docling in a separate OS process.
    # Uses subprocess.Popen (same pattern as Read File advanced mode)
    # instead of multiprocessing/threading so that:
    #   1. It works reliably under Gunicorn's fork-based workers
    #   2. The parent's event loop stays free for SSE heartbeats
    #   3. No pickling / signal-handler conflicts
    #
    # Protocol: the child reads one JSON config object from stdin and
    # prints exactly one JSON object on stdout, either
    # {"ok": false, "error": ...} or {"ok": true, "results": [...]},
    # where each result is a dict or null for a file that failed.
    # ------------------------------------------------------------------ #
    _CHILD_SCRIPT: str = textwrap.dedent(r"""
        import json, sys

        def main():
            cfg = json.loads(sys.stdin.read())
            file_paths = cfg["file_paths"]
            pipeline = cfg["pipeline"]
            ocr_engine = cfg["ocr_engine"]
            do_picture_cls = cfg["do_picture_classification"]
            pic_desc_config = cfg.get("pic_desc_config")
            pic_desc_prompt = cfg.get("pic_desc_prompt", "")

            try:
                from docling.datamodel.base_models import ConversionStatus, InputFormat
                from docling.datamodel.pipeline_options import PdfPipelineOptions
                from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
            except ImportError as e:
                print(json.dumps({"ok": False, "error": f"Docling is not installed: {e}"}))
                return

            # --- build converter ------------------------------------------------
            try:
                pipe = PdfPipelineOptions()
                pipe.do_ocr = ocr_engine not in ("", "None")
                if pipe.do_ocr:
                    try:
                        from docling.models.factories import get_ocr_factory
                        fac = get_ocr_factory(allow_external_plugins=False)
                        pipe.ocr_options = fac.create_options(kind=ocr_engine)
                    except Exception:
                        pipe.do_ocr = False

                pipe.do_picture_classification = do_picture_cls

                if pic_desc_config:
                    try:
                        import importlib
                        from pydantic import TypeAdapter
                        try:
                            from langchain_docling.picture_description import PictureDescriptionLangChainOptions
                        except ImportError:
                            print(json.dumps({
                                "ok": False,
                                "error": (
                                    "langchain-docling is not installed. Install it with "
                                    "`uv pip install 'langflow[docling-image-description]'` or "
                                    "`uv pip install 'lfx-docling[image-description]'`."
                                )
                            }))
                            return
                        mod_name, cls_name = pic_desc_config["__class_path__"].rsplit(".", 1)
                        mod = importlib.import_module(mod_name)
                        cls = getattr(mod, cls_name)
                        adapter = TypeAdapter(cls)
                        llm = adapter.validate_python(pic_desc_config["config"])
                        pipe.do_picture_description = True
                        pipe.allow_external_plugins = True
                        pipe.picture_description_options = PictureDescriptionLangChainOptions(
                            llm=llm, prompt=pic_desc_prompt,
                        )
                    except Exception as e:
                        print(json.dumps({"ok": False, "error": f"Picture description setup failed: {e}"}))
                        return

                if pipeline == "vlm":
                    try:
                        from docling.datamodel.pipeline_options import VlmPipelineOptions
                        from docling.pipeline.vlm_pipeline import VlmPipeline
                        vlm_opts = VlmPipelineOptions()
                        if sys.platform == "darwin":
                            try:
                                from docling.datamodel.vlm_model_specs import GRANITEDOCLING_MLX
                                vlm_opts.vlm_options = GRANITEDOCLING_MLX
                            except ImportError:
                                from docling.datamodel.vlm_model_specs import GRANITEDOCLING_TRANSFORMERS
                                vlm_opts.vlm_options = GRANITEDOCLING_TRANSFORMERS
                        fmt = {}
                        if hasattr(InputFormat, "PDF"):
                            fmt[InputFormat.PDF] = PdfFormatOption(
                                pipeline_cls=VlmPipeline, pipeline_options=vlm_opts,
                            )
                        if hasattr(InputFormat, "IMAGE"):
                            fmt[InputFormat.IMAGE] = PdfFormatOption(
                                pipeline_cls=VlmPipeline, pipeline_options=vlm_opts,
                            )
                        converter = DocumentConverter(format_options=fmt)
                    except Exception as e:
                        print(json.dumps({"ok": False, "error": f"VLM pipeline setup failed: {e}"}))
                        return
                else:
                    pdf_opt = PdfFormatOption(pipeline_options=pipe)
                    fmt = {}
                    if hasattr(InputFormat, "PDF"):
                        fmt[InputFormat.PDF] = pdf_opt
                    if hasattr(InputFormat, "IMAGE"):
                        fmt[InputFormat.IMAGE] = pdf_opt
                    converter = DocumentConverter(format_options=fmt)
            except Exception as e:
                print(json.dumps({"ok": False, "error": f"Converter creation failed: {e}"}))
                return

            # --- process files --------------------------------------------------
            results = []
            for fp in file_paths:
                try:
                    res = converter.convert(fp)
                    ok = False
                    if hasattr(res, "status"):
                        try:
                            ok = res.status == ConversionStatus.SUCCESS
                        except Exception:
                            ok = str(res.status).lower() == "success"
                    if not ok and getattr(res, "document", None) is not None:
                        ok = True
                    if ok and res.document is not None:
                        doc_json = res.document.export_to_dict()
                        results.append({
                            "document": doc_json,
                            "file_path": str(fp),
                            "status": "SUCCESS",
                        })
                    else:
                        results.append(None)
                except Exception as e:
                    sys.stderr.write(f"Error processing {fp}: {e}\n")
                    results.append(None)

            print(json.dumps({"ok": True, "results": results}))

        if __name__ == "__main__":
            main()
    """)

    def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:
        """Convert the given files with Docling in a child Python process.

        Args:
            file_list: Files to convert. Only entries with a truthy ``path``
                are sent to the child.

        Returns:
            ``file_list`` unchanged when there is nothing to process,
            otherwise the result of ``rollup_data`` over the converted data.

        Raises:
            ImportError: If ``docling`` is not installed, or the child reports
                a missing optional dependency.
            TimeoutError: If the child does not finish within the timeout.
            RuntimeError: If the child produces no output, unparsable output,
                or reports a failure.
        """
        # Check that docling is installed without actually importing it.
        # The real import (PyTorch, transformers, etc.) happens in the child
        # subprocess. Importing it here would spike memory and get the
        # Gunicorn worker SIGKILL'd by the OOM killer.
        import importlib.util

        if importlib.util.find_spec("docling") is None:
            msg = (
                "Docling is an optional dependency. Install with `uv pip install 'langflow[docling]'` or "
                "`uv pip install 'lfx-docling[local]'`."
            )
            raise ImportError(msg)

        file_paths = [str(file.path) for file in file_list if file.path]

        if not file_paths:
            self.log("No files to process.")
            return file_list

        pic_desc_config: dict | None = None
        if self.pic_desc_llm is not None:
            pic_desc_config = _serialize_pydantic_model(self.pic_desc_llm)

        args = {
            "file_paths": file_paths,
            "pipeline": self.pipeline,
            "ocr_engine": self.ocr_engine,
            "do_picture_classification": self.do_picture_classification,
            "pic_desc_config": pic_desc_config,
            "pic_desc_prompt": self.pic_desc_prompt,
        }

        # Use Popen with a polling loop (same pattern as Read File advanced mode).
        # This avoids multiprocessing/threading issues under Gunicorn and keeps the
        # SSE event stream alive via periodic heartbeat logs.
        docling_timeout = 600  # 10 minutes
        poll_interval = 5

        # Use a temporary file for stdout to avoid pipe buffer deadlocks.
        # Docling (and its transitive imports: PyTorch, transformers, etc.) can
        # write large amounts of output. With subprocess.PIPE the OS pipe
        # buffer (~16 KB on macOS) fills up, the child blocks on write, and the
        # parent - which only reads *after* the child exits - waits forever.
        import tempfile

        with tempfile.TemporaryFile() as stdout_file, tempfile.TemporaryFile() as stderr_file:
            proc = subprocess.Popen(  # noqa: S603
                [sys.executable, "-u", "-c", self._CHILD_SCRIPT],
                stdin=subprocess.PIPE,
                stdout=stdout_file,
                stderr=stderr_file,
            )
            try:
                proc.stdin.write(json.dumps(args).encode("utf-8"))
                proc.stdin.close()

                start = time.monotonic()
                while True:
                    try:
                        # Returns as soon as the child exits instead of always
                        # sleeping for a full poll interval.
                        proc.wait(timeout=poll_interval)
                    except subprocess.TimeoutExpired:
                        elapsed = time.monotonic() - start
                        if elapsed >= docling_timeout:
                            msg = (
                                f"Docling processing timed out after {docling_timeout}s. "
                                "Try processing fewer or smaller files."
                            )
                            raise TimeoutError(msg) from None
                        self.log(f"Docling processing in progress ({int(elapsed)}s elapsed)...")
                    else:
                        break
            finally:
                # Covers the timeout path and any unexpected error (failed
                # stdin write, interrupted wait): never leave the child running.
                if proc.poll() is None:
                    proc.kill()
                    proc.wait()

            stdout_file.seek(0)
            stderr_file.seek(0)
            stdout_bytes = stdout_file.read()
            stderr_bytes = stderr_file.read()

        if not stdout_bytes:
            err_msg = stderr_bytes.decode("utf-8", errors="replace") if stderr_bytes else "no output"
            msg = f"Docling subprocess error: {err_msg}"
            raise RuntimeError(msg)

        try:
            stdout_text = stdout_bytes.decode("utf-8")
            try:
                payload = json.loads(stdout_text)
            except json.JSONDecodeError:
                # Libraries imported by the child may print to stdout before the
                # result. The child emits its payload with a single print() of
                # json.dumps output, so it is always the last non-empty line.
                lines = stdout_text.strip().splitlines()
                payload = json.loads(lines[-1] if lines else "")
        except Exception as e:
            err_msg = stderr_bytes.decode("utf-8", errors="replace")
            msg = f"Invalid JSON from Docling subprocess: {e}. stderr={err_msg}"
            raise RuntimeError(msg) from e

        if not isinstance(payload, dict):
            err_msg = stderr_bytes.decode("utf-8", errors="replace")
            msg = f"Invalid JSON from Docling subprocess: expected an object. stderr={err_msg}"
            raise RuntimeError(msg)

        if not payload.get("ok"):
            error_msg = payload.get("error", "Unknown Docling error")
            if "not installed" in error_msg.lower():
                raise ImportError(error_msg)
            raise RuntimeError(error_msg)

        # Reconstruct DoclingDocument objects from JSON dicts returned by the child
        from docling_core.types.doc import DoclingDocument

        raw_results = payload.get("results", [])
        processed_data: list[Data | None] = []
        for r in raw_results:
            if r is None:
                processed_data.append(None)
                continue
            try:
                doc = DoclingDocument.model_validate(r["document"])
            except Exception:  # noqa: BLE001
                # Fall back to keeping the raw dict if validation fails
                doc = r["document"]
            processed_data.append(Data(data={"doc": doc, "file_path": r["file_path"]}))

        # NOTE(review): processed_data has one entry per element of file_paths,
        # which omits files without a path; presumably every BaseFile has a
        # path so the two lists line up - confirm against rollup_data.
        return self.rollup_data(file_list, processed_data)
|
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
import json
|
|
5
|
+
import time
|
|
6
|
+
from concurrent.futures import Future, ThreadPoolExecutor
|
|
7
|
+
from pathlib import Path # noqa: TC003
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
from lfx.base.data import BaseFileComponent
|
|
12
|
+
from lfx.base.data.docling_utils import coerce_docling_document
|
|
13
|
+
from lfx.inputs import IntInput, NestedDictInput, StrInput, TableInput
|
|
14
|
+
from lfx.inputs.inputs import FloatInput
|
|
15
|
+
from lfx.schema import Data, DataFrame, dotdict
|
|
16
|
+
from lfx.utils.util import transform_localhost_url
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DoclingRemoteComponent(BaseFileComponent):
|
|
20
|
+
display_name = "Docling Serve"
|
|
21
|
+
description = "Uses Docling to process input documents connecting to your instance of Docling Serve."
|
|
22
|
+
documentation = "https://docling-project.github.io/docling/"
|
|
23
|
+
trace_type = "tool"
|
|
24
|
+
icon = "Docling"
|
|
25
|
+
name = "DoclingRemote"
|
|
26
|
+
|
|
27
|
+
MAX_500_RETRIES = 5
|
|
28
|
+
|
|
29
|
+
# https://docling-project.github.io/docling/usage/supported_formats/
|
|
30
|
+
VALID_EXTENSIONS = [
|
|
31
|
+
"adoc",
|
|
32
|
+
"asciidoc",
|
|
33
|
+
"asc",
|
|
34
|
+
"bmp",
|
|
35
|
+
"csv",
|
|
36
|
+
"dotx",
|
|
37
|
+
"dotm",
|
|
38
|
+
"docm",
|
|
39
|
+
"docx",
|
|
40
|
+
"htm",
|
|
41
|
+
"html",
|
|
42
|
+
"jpeg",
|
|
43
|
+
"jpg",
|
|
44
|
+
"json",
|
|
45
|
+
"md",
|
|
46
|
+
"pdf",
|
|
47
|
+
"png",
|
|
48
|
+
"potx",
|
|
49
|
+
"ppsx",
|
|
50
|
+
"pptm",
|
|
51
|
+
"potm",
|
|
52
|
+
"ppsm",
|
|
53
|
+
"pptx",
|
|
54
|
+
"tiff",
|
|
55
|
+
"txt",
|
|
56
|
+
"xls",
|
|
57
|
+
"xlsx",
|
|
58
|
+
"xhtml",
|
|
59
|
+
"xml",
|
|
60
|
+
"webp",
|
|
61
|
+
]
|
|
62
|
+
|
|
63
|
+
inputs = [
|
|
64
|
+
*BaseFileComponent.get_base_inputs(),
|
|
65
|
+
StrInput(
|
|
66
|
+
name="api_url",
|
|
67
|
+
display_name="Server address",
|
|
68
|
+
info="URL of the Docling Serve instance.",
|
|
69
|
+
required=True,
|
|
70
|
+
),
|
|
71
|
+
StrInput(
|
|
72
|
+
name="task_id",
|
|
73
|
+
display_name="Task ID",
|
|
74
|
+
info=(
|
|
75
|
+
"Optional task ID from a previous Docling Serve upload. "
|
|
76
|
+
"If provided, file input is ignored and the component polls for this task's results."
|
|
77
|
+
),
|
|
78
|
+
required=False,
|
|
79
|
+
),
|
|
80
|
+
IntInput(
|
|
81
|
+
name="max_concurrency",
|
|
82
|
+
display_name="Concurrency",
|
|
83
|
+
info="Maximum number of concurrent requests for the server.",
|
|
84
|
+
advanced=True,
|
|
85
|
+
value=2,
|
|
86
|
+
input_types=["Message"],
|
|
87
|
+
),
|
|
88
|
+
FloatInput(
|
|
89
|
+
name="max_poll_timeout",
|
|
90
|
+
display_name="Maximum poll time",
|
|
91
|
+
info="Maximum waiting time for the document conversion to complete.",
|
|
92
|
+
advanced=True,
|
|
93
|
+
value=3600,
|
|
94
|
+
input_types=["Message"],
|
|
95
|
+
),
|
|
96
|
+
TableInput(
|
|
97
|
+
name="api_headers",
|
|
98
|
+
display_name="HTTP headers",
|
|
99
|
+
advanced=True,
|
|
100
|
+
required=False,
|
|
101
|
+
info=("Optional headers required for connecting to Docling Serve."),
|
|
102
|
+
table_schema=[
|
|
103
|
+
{
|
|
104
|
+
"name": "key",
|
|
105
|
+
"display_name": "Key",
|
|
106
|
+
"type": "string",
|
|
107
|
+
"description": "Key name",
|
|
108
|
+
},
|
|
109
|
+
{
|
|
110
|
+
"name": "value",
|
|
111
|
+
"display_name": "Value",
|
|
112
|
+
"load_from_db": True,
|
|
113
|
+
"type": "string",
|
|
114
|
+
"description": "Value of the header",
|
|
115
|
+
},
|
|
116
|
+
],
|
|
117
|
+
value=[],
|
|
118
|
+
real_time_refresh=True,
|
|
119
|
+
input_types=["Data", "JSON"],
|
|
120
|
+
),
|
|
121
|
+
NestedDictInput(
|
|
122
|
+
name="docling_serve_opts",
|
|
123
|
+
display_name="Docling options",
|
|
124
|
+
advanced=True,
|
|
125
|
+
required=False,
|
|
126
|
+
info=(
|
|
127
|
+
"Optional dictionary of additional options. "
|
|
128
|
+
"See https://github.com/docling-project/docling-serve/blob/main/docs/usage.md for more information."
|
|
129
|
+
),
|
|
130
|
+
input_types=["Message"],
|
|
131
|
+
),
|
|
132
|
+
]
|
|
133
|
+
|
|
134
|
+
outputs = [
|
|
135
|
+
*BaseFileComponent.get_base_outputs(),
|
|
136
|
+
]
|
|
137
|
+
|
|
138
|
+
def build(self) -> DataFrame:
    """Run the component and return whatever ``load_files`` produces.

    Declared explicitly because static bundle validation cannot see the
    output method inherited from ``BaseFileComponent``.
    """
    loaded = self.load_files()
    return loaded
|
|
141
|
+
|
|
142
|
+
@staticmethod
|
|
143
|
+
def _add_header(headers: dict[str, str], key: Any, value: Any) -> None:
|
|
144
|
+
key_str = str(key).strip()
|
|
145
|
+
if not key_str or key_str == "None":
|
|
146
|
+
return
|
|
147
|
+
headers[key_str] = str(value)
|
|
148
|
+
|
|
149
|
+
def _process_headers_input(self, headers_input: Any, component_headers_dict: dict[str, str]) -> None:
|
|
150
|
+
if not headers_input:
|
|
151
|
+
return
|
|
152
|
+
|
|
153
|
+
items = headers_input if isinstance(headers_input, list) else [headers_input]
|
|
154
|
+
|
|
155
|
+
for item in items:
|
|
156
|
+
if not item:
|
|
157
|
+
continue
|
|
158
|
+
|
|
159
|
+
# Case 1: Data object
|
|
160
|
+
if hasattr(item, "data") and isinstance(item.data, dict):
|
|
161
|
+
data = item.data
|
|
162
|
+
if "key" in data and "value" in data:
|
|
163
|
+
self._add_header(component_headers_dict, data["key"], data["value"])
|
|
164
|
+
else:
|
|
165
|
+
# Fallback: merge all keys from Data object
|
|
166
|
+
for k, v in data.items():
|
|
167
|
+
if k not in ("text_key", "default_value"):
|
|
168
|
+
self._add_header(component_headers_dict, k, v)
|
|
169
|
+
|
|
170
|
+
# Case 2: Dictionary (Table row)
|
|
171
|
+
elif isinstance(item, dict):
|
|
172
|
+
if "key" in item and "value" in item:
|
|
173
|
+
self._add_header(component_headers_dict, item["key"], item["value"])
|
|
174
|
+
else:
|
|
175
|
+
# Fallback: merge all keys
|
|
176
|
+
for k, v in item.items():
|
|
177
|
+
self._add_header(component_headers_dict, k, v)
|
|
178
|
+
|
|
179
|
+
# Case 3: Message object
|
|
180
|
+
elif hasattr(item, "text") and isinstance(item.text, str):
|
|
181
|
+
try:
|
|
182
|
+
parsed = json.loads(item.text)
|
|
183
|
+
if isinstance(parsed, dict):
|
|
184
|
+
for k, v in parsed.items():
|
|
185
|
+
self._add_header(component_headers_dict, k, v)
|
|
186
|
+
except json.JSONDecodeError:
|
|
187
|
+
pass
|
|
188
|
+
|
|
189
|
+
def _process_headers(self) -> dict[str, str]:
|
|
190
|
+
"""Process the headers input into a valid dictionary."""
|
|
191
|
+
component_headers_dict: dict[str, str] = {}
|
|
192
|
+
self._process_headers_input(self.api_headers, component_headers_dict)
|
|
193
|
+
return component_headers_dict
|
|
194
|
+
|
|
195
|
+
def update_build_config(self, build_config: dotdict, field_value: Any, field_name: str | None = None) -> dotdict:
    """Normalize the ``api_headers`` field value, deferring everything else to the parent.

    Args:
        build_config: The build configuration to update.
        field_value: The new value of the changed field.
        field_name: The name of the changed field.

    Returns:
        ``build_config`` with ``api_headers`` rewritten when the changed field
        is ``api_headers`` and the value is a dict (converted to a list of
        ``{"key": ..., "value": ...}`` rows) or ``None`` (converted to an
        empty list); otherwise whatever the parent implementation returns.
    """
    header_rows = None
    if field_name == "api_headers":
        if isinstance(field_value, dict):
            header_rows = [{"key": k, "value": v} for k, v in field_value.items()]
        elif field_value is None:
            header_rows = []

    if header_rows is None:
        # Default behavior
        return super().update_build_config(build_config, field_value, field_name)

    build_config["api_headers"]["value"] = header_rows
    return build_config
|
|
206
|
+
|
|
207
|
+
def _poll_and_fetch_result(
    self, client: httpx.Client, base_url: str, task_id: str, file_path: str | None = None
) -> Data | None:
    """Poll for task completion and fetch the result.

    Args:
        client: The HTTP client to use for requests.
        base_url: The base URL of the Docling Serve API.
        task_id: The task ID to poll for.
        file_path: Optional file path to include in the result data.

    Returns:
        Data object with the DoclingDocument, or None if processing failed
        (too many 5xx status responses, no JSON document in the result, or
        the document could not be coerced).

    Raises:
        RuntimeError: If polling takes longer than ``self.max_poll_timeout``.
        ValueError: If the result reports a failure status or contains errors.
        httpx.HTTPStatusError: If the status endpoint answers with a
            non-retried error status, or the result endpoint answers with
            any error status.
    """
    http_failures = 0
    retry_status_start = 500
    retry_status_end = 600
    poll_interval_seconds = 2
    terminal_statuses = ("success", "failure")
    start_wait_time = time.monotonic()

    task_status = None
    while task_status not in terminal_statuses:
        processing_time = time.monotonic() - start_wait_time
        if processing_time >= self.max_poll_timeout:
            msg = (
                f"Processing time {processing_time=} exceeds the maximum poll timeout {self.max_poll_timeout=}. "
                "Please increase the max_poll_timeout parameter or review why the processing "
                "takes long on the server."
            )
            self.log(msg)
            raise RuntimeError(msg)

        response = client.get(f"{base_url}/status/poll/{task_id}")

        # 5xx answers are retried a bounded number of times before giving up.
        if retry_status_start <= response.status_code < retry_status_end:
            http_failures += 1
            if http_failures > self.MAX_500_RETRIES:
                self.log(f"The status requests got a http response {response.status_code} too many times.")
                return None
            time.sleep(poll_interval_seconds)
            continue

        response.raise_for_status()
        task = response.json()
        task_status = task["task_status"]
        if task_status not in terminal_statuses:
            time.sleep(poll_interval_seconds)

    result_resp = client.get(f"{base_url}/result/{task_id}")
    result_resp.raise_for_status()
    result = result_resp.json()

    if result.get("status") == "failure" or result.get("errors"):
        # "errors" may be present but null when only the status signals failure.
        errors = result.get("errors") or []
        err_msg_list: list[str] = []
        for err in errors:
            if isinstance(err, dict) and "error_message" in err:
                # Stringify so a non-string message cannot break the join below.
                err_msg_list.append(str(err["error_message"]))
            elif isinstance(err, str):
                err_msg_list.append(err)
            elif err:
                # Keep unrecognized error shapes visible instead of dropping them.
                err_msg_list.append(str(err))

        err_details = "; ".join(err_msg_list) if err_msg_list else "Unknown Docling processing error"

        msg = f"Docling processing failed: {err_details}"
        raise ValueError(msg)

    # A missing or malformed "document" entry is handled like a missing JSON payload.
    document = result.get("document")
    json_content = document.get("json_content") if isinstance(document, dict) else None
    if json_content is None:
        self.log("No JSON DoclingDocument found in the result.")
        return None

    try:
        doc = coerce_docling_document(json_content)
        data_dict: dict[str, Any] = {"doc": doc}
        if file_path:
            data_dict["file_path"] = file_path
        return Data(data=data_dict)
    except Exception as e:  # noqa: BLE001
        self.log(f"Error validating the document. {e}")
        return None
|
|
285
|
+
|
|
286
|
+
def _process_task_id(self) -> list[Data]:
    """Fetch the outcome of the already-submitted task ``self.task_id``.

    The API root is ``self.api_url`` passed through ``transform_localhost_url``
    with ``/v1`` appended; requests carry the headers from
    ``self._process_headers()``.

    Returns:
        A one-element list with the result Data object, or an empty list when
        polling yields no (truthy) result.
    """
    api_root = f"{transform_localhost_url(self.api_url)}/v1"

    with httpx.Client(headers=self._process_headers()) as http_client:
        outcome = self._poll_and_fetch_result(http_client, api_root, self.task_id)

    if outcome:
        return [outcome]
    return []
|
|
298
|
+
|
|
299
|
+
def load_files_base(self) -> list[Data]:
    """Return parsed data either from an existing task or from the input files.

    Returns:
        list[Data]: The result of ``self._process_task_id()`` when
        ``self.task_id`` is set, otherwise the parent implementation's result.
    """
    if not self.task_id:
        return super().load_files_base()
    return self._process_task_id()
|
|
308
|
+
|
|
309
|
+
def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:
    """Convert each file through the remote API and roll the results up.

    Every file with a path is base64-encoded, submitted to the
    ``/convert/source/async`` endpoint and then polled via
    ``self._poll_and_fetch_result``. Conversions run concurrently on a thread
    pool bounded by ``self.max_concurrency``.

    Args:
        file_list: The files to convert. Entries whose ``path`` is ``None``
            are not submitted and keep a ``None`` result.

    Returns:
        The value of ``self.rollup_data(file_list, processed_data)``, where
        ``processed_data`` holds one entry per input file, in input order.

    Raises:
        httpx.HTTPStatusError, httpx.RequestError, KeyError, ValueError:
            Logged and re-raised when a conversion fails with one of these.
    """
    transformed_url = transform_localhost_url(self.api_url)
    base_url = f"{transformed_url}/v1"

    def _convert_document(client: httpx.Client, file_path: Path, options: dict[str, Any]) -> Data | None:
        # Submit one file as a base64 source and wait for its conversion result.
        encoded_doc = base64.b64encode(file_path.read_bytes()).decode()
        payload = {
            "options": options,
            "sources": [{"kind": "file", "base64_string": encoded_doc, "filename": file_path.name}],
        }

        response = client.post(f"{base_url}/convert/source/async", json=payload)
        response.raise_for_status()
        task = response.json()

        return self._poll_and_fetch_result(client, base_url, task["task_id"], str(file_path))

    # Defaults first so user-supplied options can override them.
    docling_options = {
        "to_formats": ["json"],
        "image_export_mode": "placeholder",
        **(self.docling_serve_opts or {}),
    }

    # One slot per input file: results are written at the file's own index so
    # the list stays positionally aligned with file_list even when files
    # without a path are skipped.
    processed_data: list[Data | None] = [None] * len(file_list)
    with (
        httpx.Client(headers=self._process_headers()) as client,
        ThreadPoolExecutor(max_workers=self.max_concurrency) as executor,
    ):
        futures: list[tuple[int, Future]] = [
            (index, executor.submit(_convert_document, client, file.path, docling_options))
            for index, file in enumerate(file_list)
            if file.path is not None
        ]

        for index, future in futures:
            try:
                processed_data[index] = future.result()
            except (httpx.HTTPStatusError, httpx.RequestError, KeyError, ValueError) as exc:
                self.log(f"Docling remote processing failed: {exc}")
                raise

    return self.rollup_data(file_list, processed_data)
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from lfx.base.data.docling_utils import coerce_docling_document, extract_docling_documents, get_docling_image_ref_mode
|
|
4
|
+
from lfx.custom import Component
|
|
5
|
+
from lfx.io import DropdownInput, HandleInput, MessageTextInput, Output, StrInput
|
|
6
|
+
from lfx.schema import Data, DataFrame
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ExportDoclingDocumentComponent(Component):
|
|
10
|
+
display_name: str = "Export DoclingDocument"
|
|
11
|
+
description: str = "Export DoclingDocument to markdown, html or other formats."
|
|
12
|
+
documentation = "https://docling-project.github.io/docling/"
|
|
13
|
+
icon = "Docling"
|
|
14
|
+
name = "ExportDoclingDocument"
|
|
15
|
+
|
|
16
|
+
inputs = [
|
|
17
|
+
HandleInput(
|
|
18
|
+
name="data_inputs",
|
|
19
|
+
display_name="JSON or Table",
|
|
20
|
+
info="The data with documents to export.",
|
|
21
|
+
input_types=["Data", "JSON", "DataFrame", "Table"],
|
|
22
|
+
required=True,
|
|
23
|
+
),
|
|
24
|
+
DropdownInput(
|
|
25
|
+
name="export_format",
|
|
26
|
+
display_name="Export format",
|
|
27
|
+
options=["Markdown", "HTML", "Plaintext", "DocTags"],
|
|
28
|
+
info="Select the export format to convert the input.",
|
|
29
|
+
value="Markdown",
|
|
30
|
+
real_time_refresh=True,
|
|
31
|
+
),
|
|
32
|
+
DropdownInput(
|
|
33
|
+
name="image_mode",
|
|
34
|
+
display_name="Image export mode",
|
|
35
|
+
options=["placeholder", "embedded"],
|
|
36
|
+
info=(
|
|
37
|
+
"Specify how images are exported in the output. Placeholder will replace the images with a string, "
|
|
38
|
+
"whereas Embedded will include them as base64 encoded images."
|
|
39
|
+
),
|
|
40
|
+
value="placeholder",
|
|
41
|
+
),
|
|
42
|
+
StrInput(
|
|
43
|
+
name="md_image_placeholder",
|
|
44
|
+
display_name="Image placeholder",
|
|
45
|
+
info="Specify the image placeholder for markdown exports.",
|
|
46
|
+
value="<!-- image -->",
|
|
47
|
+
advanced=True,
|
|
48
|
+
),
|
|
49
|
+
StrInput(
|
|
50
|
+
name="md_page_break_placeholder",
|
|
51
|
+
display_name="Page break placeholder",
|
|
52
|
+
info="Add this placeholder between pages in the markdown output.",
|
|
53
|
+
value="",
|
|
54
|
+
advanced=True,
|
|
55
|
+
),
|
|
56
|
+
MessageTextInput(
|
|
57
|
+
name="doc_key",
|
|
58
|
+
display_name="Doc Key",
|
|
59
|
+
info="The key to use for the DoclingDocument column.",
|
|
60
|
+
value="doc",
|
|
61
|
+
advanced=True,
|
|
62
|
+
),
|
|
63
|
+
]
|
|
64
|
+
|
|
65
|
+
outputs = [
|
|
66
|
+
Output(display_name="Exported data", name="data", method="export_document"),
|
|
67
|
+
Output(display_name="Table", name="dataframe", method="as_dataframe"),
|
|
68
|
+
]
|
|
69
|
+
|
|
70
|
+
def update_build_config(self, build_config: dict, field_value: Any, field_name: str | None = None) -> dict:
|
|
71
|
+
if field_name == "export_format" and field_value == "Markdown":
|
|
72
|
+
build_config["md_image_placeholder"]["show"] = True
|
|
73
|
+
build_config["md_page_break_placeholder"]["show"] = True
|
|
74
|
+
build_config["image_mode"]["show"] = True
|
|
75
|
+
elif field_name == "export_format" and field_value == "HTML":
|
|
76
|
+
build_config["md_image_placeholder"]["show"] = False
|
|
77
|
+
build_config["md_page_break_placeholder"]["show"] = False
|
|
78
|
+
build_config["image_mode"]["show"] = True
|
|
79
|
+
elif field_name == "export_format" and field_value in {"Plaintext", "DocTags"}:
|
|
80
|
+
build_config["md_image_placeholder"]["show"] = False
|
|
81
|
+
build_config["md_page_break_placeholder"]["show"] = False
|
|
82
|
+
build_config["image_mode"]["show"] = False
|
|
83
|
+
|
|
84
|
+
return build_config
|
|
85
|
+
|
|
86
|
+
def _get_image_mode(self) -> Any:
|
|
87
|
+
return get_docling_image_ref_mode(self.image_mode)
|
|
88
|
+
|
|
89
|
+
@staticmethod
|
|
90
|
+
def _coerce_exportable_document(doc: Any) -> Any:
|
|
91
|
+
return coerce_docling_document(doc)
|
|
92
|
+
|
|
93
|
+
def export_document(self) -> list[Data]:
|
|
94
|
+
documents, warning = extract_docling_documents(self.data_inputs, self.doc_key)
|
|
95
|
+
if warning:
|
|
96
|
+
self.status = warning
|
|
97
|
+
|
|
98
|
+
results: list[Data] = []
|
|
99
|
+
try:
|
|
100
|
+
image_mode = self._get_image_mode()
|
|
101
|
+
for raw_doc in documents:
|
|
102
|
+
doc = self._coerce_exportable_document(raw_doc)
|
|
103
|
+
content = ""
|
|
104
|
+
if self.export_format == "Markdown":
|
|
105
|
+
content = doc.export_to_markdown(
|
|
106
|
+
image_mode=image_mode,
|
|
107
|
+
image_placeholder=self.md_image_placeholder,
|
|
108
|
+
page_break_placeholder=self.md_page_break_placeholder,
|
|
109
|
+
)
|
|
110
|
+
elif self.export_format == "HTML":
|
|
111
|
+
content = doc.export_to_html(image_mode=image_mode)
|
|
112
|
+
elif self.export_format == "Plaintext":
|
|
113
|
+
content = doc.export_to_text()
|
|
114
|
+
elif self.export_format == "DocTags":
|
|
115
|
+
content = doc.export_to_doctags()
|
|
116
|
+
|
|
117
|
+
# Preserve metadata from the DoclingDocument
|
|
118
|
+
metadata: dict = {"export_format": self.export_format}
|
|
119
|
+
if hasattr(doc, "name") and doc.name:
|
|
120
|
+
metadata["name"] = doc.name
|
|
121
|
+
if hasattr(doc, "origin") and doc.origin is not None:
|
|
122
|
+
if hasattr(doc.origin, "filename") and doc.origin.filename:
|
|
123
|
+
metadata["filename"] = doc.origin.filename
|
|
124
|
+
if hasattr(doc.origin, "binary_hash") and doc.origin.binary_hash:
|
|
125
|
+
metadata["document_id"] = str(doc.origin.binary_hash)
|
|
126
|
+
if hasattr(doc.origin, "mimetype") and doc.origin.mimetype:
|
|
127
|
+
metadata["mimetype"] = doc.origin.mimetype
|
|
128
|
+
|
|
129
|
+
results.append(Data(text=content, data={"text": content, **metadata}))
|
|
130
|
+
except Exception as e:
|
|
131
|
+
msg = f"Error exporting document: {e}"
|
|
132
|
+
raise TypeError(msg) from e
|
|
133
|
+
|
|
134
|
+
return results
|
|
135
|
+
|
|
136
|
+
def as_dataframe(self) -> DataFrame:
|
|
137
|
+
return DataFrame(self.export_document())
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://schemas.langflow.org/extension/v1.json",
|
|
3
|
+
"id": "lfx-docling",
|
|
4
|
+
"version": "0.1.0",
|
|
5
|
+
"name": "Docling",
|
|
6
|
+
"description": "Docling document processing components as a standalone Langflow Extension Bundle.",
|
|
7
|
+
"lfx": {
|
|
8
|
+
"compat": ["1"]
|
|
9
|
+
},
|
|
10
|
+
"bundles": [
|
|
11
|
+
{
|
|
12
|
+
"name": "docling",
|
|
13
|
+
"path": "components/docling"
|
|
14
|
+
}
|
|
15
|
+
]
|
|
16
|
+
}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lfx-docling
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Docling document processing components as a standalone Langflow Extension Bundle.
|
|
5
|
+
Project-URL: Homepage, https://github.com/langflow-ai/langflow
|
|
6
|
+
Project-URL: Documentation, https://docs.langflow.org/bundles-docling
|
|
7
|
+
Project-URL: Repository, https://github.com/langflow-ai/langflow
|
|
8
|
+
Author-email: Langflow <contact@langflow.org>
|
|
9
|
+
License: MIT
|
|
10
|
+
Keywords: bundle,docling,documents,extension,langflow,lfx
|
|
11
|
+
Requires-Python: <3.15,>=3.10
|
|
12
|
+
Requires-Dist: docling-core<3.0.0,>=2.36.1
|
|
13
|
+
Requires-Dist: httpx<1.0.0,>=0.28.1
|
|
14
|
+
Requires-Dist: lfx>=0.5.0
|
|
15
|
+
Provides-Extra: all
|
|
16
|
+
Requires-Dist: docling-core[chunking]<3.0.0,>=2.36.1; extra == 'all'
|
|
17
|
+
Requires-Dist: docling<3.0.0,>=2.36.1; (sys_platform != 'darwin' or platform_machine != 'x86_64') and extra == 'all'
|
|
18
|
+
Requires-Dist: langchain-docling>=1.1.0; extra == 'all'
|
|
19
|
+
Requires-Dist: ocrmac>=1.0.0; (sys_platform == 'darwin') and extra == 'all'
|
|
20
|
+
Requires-Dist: rapidocr-onnxruntime>=1.4.4; extra == 'all'
|
|
21
|
+
Requires-Dist: tesserocr>=2.8.0; extra == 'all'
|
|
22
|
+
Requires-Dist: tiktoken>=0.7.0; extra == 'all'
|
|
23
|
+
Requires-Dist: torch>=2.6.0; extra == 'all'
|
|
24
|
+
Requires-Dist: torchvision>=0.21.0; extra == 'all'
|
|
25
|
+
Provides-Extra: chunking
|
|
26
|
+
Requires-Dist: docling-core[chunking]<3.0.0,>=2.36.1; extra == 'chunking'
|
|
27
|
+
Requires-Dist: tiktoken>=0.7.0; extra == 'chunking'
|
|
28
|
+
Provides-Extra: image-description
|
|
29
|
+
Requires-Dist: docling<3.0.0,>=2.36.1; (sys_platform != 'darwin' or platform_machine != 'x86_64') and extra == 'image-description'
|
|
30
|
+
Requires-Dist: langchain-docling>=1.1.0; extra == 'image-description'
|
|
31
|
+
Requires-Dist: ocrmac>=1.0.0; (sys_platform == 'darwin') and extra == 'image-description'
|
|
32
|
+
Requires-Dist: rapidocr-onnxruntime>=1.4.4; extra == 'image-description'
|
|
33
|
+
Requires-Dist: tesserocr>=2.8.0; extra == 'image-description'
|
|
34
|
+
Requires-Dist: torch>=2.6.0; extra == 'image-description'
|
|
35
|
+
Requires-Dist: torchvision>=0.21.0; extra == 'image-description'
|
|
36
|
+
Provides-Extra: local
|
|
37
|
+
Requires-Dist: docling<3.0.0,>=2.36.1; (sys_platform != 'darwin' or platform_machine != 'x86_64') and extra == 'local'
|
|
38
|
+
Requires-Dist: ocrmac>=1.0.0; (sys_platform == 'darwin') and extra == 'local'
|
|
39
|
+
Requires-Dist: rapidocr-onnxruntime>=1.4.4; extra == 'local'
|
|
40
|
+
Requires-Dist: tesserocr>=2.8.0; extra == 'local'
|
|
41
|
+
Requires-Dist: torch>=2.6.0; extra == 'local'
|
|
42
|
+
Requires-Dist: torchvision>=0.21.0; extra == 'local'
|
|
43
|
+
Description-Content-Type: text/markdown
|
|
44
|
+
|
|
45
|
+
# Docling Bundle
|
|
46
|
+
|
|
47
|
+
Docling components for Langflow packaged as a standalone Extension Bundle.
|
|
48
|
+
|
|
49
|
+
## Components
|
|
50
|
+
|
|
51
|
+
- Docling
|
|
52
|
+
- Docling Serve
|
|
53
|
+
- Export DoclingDocument
|
|
54
|
+
- Chunk DoclingDocument
|
|
55
|
+
|
|
56
|
+
## Install
|
|
57
|
+
|
|
58
|
+
The bundle is installed with Langflow in the 1.10 workspace. The base package includes `docling-core` for the `DoclingDocument` schema. For standalone local conversion:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
uv pip install "lfx-docling[local]"
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Chunking and picture-description support use separate optional extras. Chunking
|
|
65
|
+
does not install the full local converter/OCR stack:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
uv pip install "lfx-docling[chunking]"
|
|
69
|
+
uv pip install "lfx-docling[image-description]"
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Develop
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
uv run lfx extension validate src/bundles/docling/src/lfx_docling
|
|
76
|
+
uv run pytest src/bundles/docling/tests
|
|
77
|
+
```
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
lfx_docling/__init__.py,sha256=cCRwW1jLZX9BJ5oV9TXYYcYRuhlVDPVDqyPjZBlaBxk,566
|
|
2
|
+
lfx_docling/extension.json,sha256=omS8BGiwJYDI-2zkjTeT1TLTH35ORLBas1ugmA_bdgI,362
|
|
3
|
+
lfx_docling/components/docling/__init__.py,sha256=6xxVeK1DutDKS7vdiuF9XFGbd5yJ9M_ep2e_YYqa32I,386
|
|
4
|
+
lfx_docling/components/docling/chunk_docling_document.py,sha256=9e8yUfXO4YHxjgBfb0Vm7cQ2XSFW4KPkgifrLu489Tc,9220
|
|
5
|
+
lfx_docling/components/docling/docling_inline.py,sha256=ZdOLflUAEm_bbqnucA86jLzDci2ExCmvH9f9rK7ogQw,14833
|
|
6
|
+
lfx_docling/components/docling/docling_remote.py,sha256=pE4ULahalE96hGkcedvEWRehWZmKJmZJUdbgkfMrUEk,13050
|
|
7
|
+
lfx_docling/components/docling/export_docling_document.py,sha256=9gGsRKBQMNRQx8cSkWyhBSlQlC7vHt_z04t7fbWSIoM,5836
|
|
8
|
+
lfx_docling-0.1.0.dist-info/METADATA,sha256=SlOPqGiirb0PkVqdOvnv0oPXspiFJO55tyKIo6Gy8Y4,3167
|
|
9
|
+
lfx_docling-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
10
|
+
lfx_docling-0.1.0.dist-info/entry_points.txt,sha256=ZMFLxk5y0VG5SrHLjZeXSZSA1ceNypO23fDHeHLmDnc,48
|
|
11
|
+
lfx_docling-0.1.0.dist-info/RECORD,,
|