docling-agent 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling_agent/__init__.py +0 -0
- docling_agent/agent/__init__.py +0 -0
- docling_agent/agent/base.py +80 -0
- docling_agent/agent/base_functions.py +590 -0
- docling_agent/agent/editor.py +383 -0
- docling_agent/agent/enricher.py +631 -0
- docling_agent/agent/extractor.py +148 -0
- docling_agent/agent/library.py +231 -0
- docling_agent/agent/orchestrator.py +531 -0
- docling_agent/agent/rag.py +449 -0
- docling_agent/agent/rag_models.py +36 -0
- docling_agent/agent/writer.py +615 -0
- docling_agent/agent_models.py +137 -0
- docling_agent/agents.py +36 -0
- docling_agent/cli/__init__.py +148 -0
- docling_agent/logging.py +45 -0
- docling_agent/py.typed +0 -0
- docling_agent/resources/__init__.py +1 -0
- docling_agent/resources/prompts.py +128 -0
- docling_agent/task_model.py +156 -0
- docling_agent-0.1.0.dist-info/METADATA +208 -0
- docling_agent-0.1.0.dist-info/RECORD +25 -0
- docling_agent-0.1.0.dist-info/WHEEL +4 -0
- docling_agent-0.1.0.dist-info/entry_points.txt +2 -0
- docling_agent-0.1.0.dist-info/licenses/LICENSE +21 -0
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import abstractmethod
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
# from smolagents import MCPClient, Tool, ToolCollection
|
|
9
|
+
# from smolagents.models import ChatMessage, MessageRole, Model
|
|
10
|
+
from mellea.backends.model_ids import ModelIdentifier
|
|
11
|
+
from pydantic import BaseModel, ConfigDict
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from docling_core.types.doc.document import DoclingDocument
|
|
15
|
+
|
|
16
|
+
# Use shared logger from docling_agent.agents
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DoclingAgentType(Enum):
    """Closed set of agent kinds, each identified by a short string value."""

    # Core agent types
    DOCLING_DOCUMENT_WRITER = "writer"
    DOCLING_DOCUMENT_EDITOR = "editor"
    DOCLING_DOCUMENT_EXTRACTOR = "extractor"
    DOCLING_DOCUMENT_ENRICHER = "enricher"
    DOCLING_DOCUMENT_RAG = "rag"
    DOCLING_DOCUMENT_ORCHESTRATOR = "orchestrator"

    def __str__(self) -> str:
        """Render the member as its raw string value (e.g. ``"writer"``)."""
        return self.value

    @classmethod
    def from_string(cls, value: str) -> DoclingAgentType:
        """Look up the member whose value equals ``value``.

        Args:
            value: The string value to look up (compared with ``==``).

        Returns:
            The matching enum member.

        Raises:
            ValueError: If no member has that value; the message lists all
                valid values.
        """
        found = next((member for member in cls if member.value == value), None)
        if found is None:
            raise ValueError(f"Invalid agent type: {value}. Valid types: {[t.value for t in cls]}")
        return found

    @classmethod
    def get_all_types(cls) -> list[str]:
        """Return the string values of all members, in definition order."""
        return [entry.value for entry in cls.__members__.values()]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class BaseDoclingAgent(BaseModel):
    """Pydantic base model holding the configuration shared by Docling agents.

    Subclasses are expected to override :meth:`run`; the implementation here
    only raises ``NotImplementedError``.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Which kind of agent this instance is.
    agent_type: DoclingAgentType
    # Primary model; also the fallback when a role-specific model id is unset.
    model_id: ModelIdentifier
    # Tools given to the agent (the element type is not constrained here).
    tools: list

    # model needed for reasoning/instruction following
    reasoning_model_id: ModelIdentifier | None = None

    # model needed for writing, summarizing, etc
    writing_model_id: ModelIdentifier | None = None

    # NOTE(review): presumably an upper bound on agent loop iterations -- confirm against subclasses.
    max_iteration: int = 16

    def get_reasoning_model_id(self) -> ModelIdentifier:
        """Return ``reasoning_model_id`` when it is set (truthy), else ``model_id``."""
        if self.reasoning_model_id:
            return self.reasoning_model_id
        return self.model_id

    def get_writing_model_id(self) -> ModelIdentifier:
        """Return ``writing_model_id`` when it is set (truthy), else ``model_id``."""
        if self.writing_model_id:
            return self.writing_model_id
        return self.model_id

    @abstractmethod
    def run(
        self,
        task: str,
        document: DoclingDocument | None = None,
        sources: list[DoclingDocument | Path] = [],
        **kwargs,
    ) -> DoclingDocument:
        """Run the agent on ``task`` and return the resulting document.

        Args:
            task: Textual description of the work to perform.
            document: Optional document passed to the agent.
            sources: Optional source documents or paths. The default list is
                shared across calls, so implementations must not mutate it.
            **kwargs: Implementation-specific options.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError
|
|
@@ -0,0 +1,590 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import re
|
|
3
|
+
from io import BytesIO
|
|
4
|
+
|
|
5
|
+
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
|
6
|
+
from docling.datamodel.document import ConversionResult
|
|
7
|
+
from docling.document_converter import DocumentConverter
|
|
8
|
+
from docling_core.experimental.serializer.outline import (
|
|
9
|
+
OutlineDocSerializer,
|
|
10
|
+
OutlineFormat,
|
|
11
|
+
OutlineMode,
|
|
12
|
+
OutlineParams,
|
|
13
|
+
)
|
|
14
|
+
from docling_core.transforms.serializer.markdown import (
|
|
15
|
+
MarkdownDocSerializer,
|
|
16
|
+
MarkdownParams,
|
|
17
|
+
)
|
|
18
|
+
from docling_core.types.doc.document import (
|
|
19
|
+
DocItemLabel,
|
|
20
|
+
DoclingDocument,
|
|
21
|
+
GroupItem,
|
|
22
|
+
GroupLabel,
|
|
23
|
+
ListGroup,
|
|
24
|
+
ListItem,
|
|
25
|
+
NodeItem,
|
|
26
|
+
PictureItem,
|
|
27
|
+
RefItem,
|
|
28
|
+
SectionHeaderItem,
|
|
29
|
+
TableItem,
|
|
30
|
+
TextItem,
|
|
31
|
+
TitleItem,
|
|
32
|
+
)
|
|
33
|
+
from docling_core.types.io import DocumentStream
|
|
34
|
+
|
|
35
|
+
from docling_agent.logging import logger
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def find_crefs(text: str) -> list[RefItem]:
    """Find every ``#/<label>/<index>`` reference embedded in ``text``.

    A reference is the literal ``#/``, followed by one of the ``DocItemLabel``
    values, a slash, and one or more digits. Matches are returned in order of
    appearance; duplicates are kept.

    Args:
        text: Arbitrary text to scan.

    Returns:
        One ``RefItem`` per match, with ``cref`` set to the matched substring.
        Empty when nothing matches.
    """
    logger.info("find_crefs")
    # Escape each label so that a value containing regex metacharacters is
    # matched literally instead of being interpreted as pattern syntax.
    labels: str = "|".join(re.escape(e.value) for e in DocItemLabel)
    # NOTE(review): other refs in this module look like "#/body"; confirm that
    # label values (rather than collection names) are what follows "#/" in the
    # references this function is expected to find.
    pattern = rf"#/({labels})/\d+"

    return [RefItem(cref=m.group(0)) for m in re.finditer(pattern, text)]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def has_crefs(text: str) -> bool:
    """Return True when ``find_crefs`` finds at least one reference in ``text``."""
    logger.info("has_crefs")
    found = find_crefs(text)
    return bool(found)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def has_json_dicts(text: str) -> bool:
    """Report whether ``text`` holds only well-formed ```json fenced blocks.

    Every ```json ... ``` block is parsed with ``json.loads``.

    Returns:
        True when at least one block exists and all blocks parse; False when
        there is no block or when any block fails to parse (the failure is
        logged as an error).
    """
    logger.info("has_json_dicts")
    blocks = re.findall(r"```json\s*(.*?)\s*```", text, re.DOTALL)

    parsed_count = 0
    for raw in blocks:
        try:
            json.loads(raw)
        except Exception as e:
            logger.error("Failed to parse JSON call block: %s", e)
            return False
        parsed_count += 1

    return parsed_count > 0
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def find_json_dicts(text: str) -> list[dict]:
    """Collect the JSON payloads of all ```json fenced blocks in ``text``.

    Each block is parsed with ``json.loads``. A block that parses to a list
    contributes its elements individually; any other parsed value is added as
    one entry. Blocks that fail to parse are logged as warnings and skipped.

    Returns:
        The collected values in order of appearance (empty when none parse).
    """
    logger.info("find_json_dicts")
    fenced = re.compile(r"```json\s*(.*?)\s*```", re.DOTALL)

    collected = []
    for i, payload in enumerate(fenced.findall(text)):
        try:
            decoded = json.loads(payload)
        except json.JSONDecodeError as e:
            logger.warning(f"Failed to parse JSON in match {i}: {e}")
            continue
        # Top-level lists are flattened into the result; everything else is one entry.
        collected.extend(decoded if isinstance(decoded, list) else [decoded])

    return collected
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def create_document_outline(
    doc: DoclingDocument, mode: OutlineMode = OutlineMode.TABLE_OF_CONTENTS, format: OutlineFormat = OutlineFormat.JSON
) -> str:
    """Serialize ``doc`` into an outline with the experimental OutlineDocSerializer.

    Args:
        doc: The DoclingDocument to outline.
        mode: Outline mode passed to ``OutlineParams`` (defaults to table of contents).
        format: Outline format passed to ``OutlineParams`` (defaults to JSON).

    Returns:
        The ``text`` of the serialization result.
    """
    logger.debug("create_document_outline")

    # include_non_meta is always enabled; mode and format come from the caller.
    outline_params = OutlineParams(include_non_meta=True, mode=mode, format=format)
    outline = OutlineDocSerializer(doc=doc, params=outline_params).serialize()
    return outline.text
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def serialize_item_to_markdown(item: TextItem, doc: DoclingDocument) -> str:
    """Render a single item of ``doc`` as markdown.

    Uses a ``MarkdownDocSerializer`` with default ``MarkdownParams`` and
    returns the ``text`` of the serialization result.
    """
    logger.info("serialize_item_to_markdown")

    md_serializer = MarkdownDocSerializer(doc=doc, params=MarkdownParams())
    return md_serializer.serialize(item=item).text
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def serialize_table_to_html(table: TableItem, doc: DoclingDocument) -> str:
    """Render ``table`` as HTML and return the serialized text."""
    logger.info("serialize_table_to_html")
    from docling_core.transforms.serializer.html import (
        HTMLDocSerializer,
        HTMLTableSerializer,
    )

    # The table serializer needs a document-level serializer as a collaborator.
    html_doc_serializer = HTMLDocSerializer(doc=doc)
    rendered = HTMLTableSerializer().serialize(item=table, doc_serializer=html_doc_serializer, doc=doc)
    return rendered.text
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def find_html_code_block(text: str) -> str | None:
|
|
158
|
+
"""
|
|
159
|
+
Check if a string matches the pattern ```html(.*)?```
|
|
160
|
+
"""
|
|
161
|
+
logger.info("find_html_code_block")
|
|
162
|
+
pattern = r"```html(.*?)```"
|
|
163
|
+
match = re.search(pattern, text, re.DOTALL)
|
|
164
|
+
return match.group(1) if match else None
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def has_html_code_block(text: str) -> bool:
    """Return True when ``find_html_code_block`` finds a ```html block in ``text``."""
    logger.info("has_html_code_block")
    block = find_html_code_block(text)
    return block is not None
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def find_markdown_code_block(text: str) -> str | None:
|
|
176
|
+
"""
|
|
177
|
+
Check if a string matches the pattern ```(md|markdown)(.*)?```
|
|
178
|
+
"""
|
|
179
|
+
logger.info("find_markdown_code_block")
|
|
180
|
+
pattern = r"```(md|markdown)(.*?)```"
|
|
181
|
+
match = re.search(pattern, text, re.DOTALL)
|
|
182
|
+
return match.group(2) if match else None
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def has_markdown_code_block(text: str) -> bool:
    """Return True when ``find_markdown_code_block`` finds a markdown block in ``text``."""
    logger.info("has_markdown_code_block")
    block = find_markdown_code_block(text)
    return block is not None
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def convert_html_to_docling_table(text: str) -> list[TableItem] | None:
    """Convert HTML (optionally wrapped in a ```html fence) into table items.

    If ``text`` contains a ```html fenced block, only that block's content is
    converted; otherwise the whole of ``text`` is treated as HTML.

    Args:
        text: Raw HTML, or text containing a ```html fenced block.

    Returns:
        The ``tables`` of the converted document when conversion succeeds,
        otherwise None (conversion failed or raised; exceptions are logged).
    """
    logger.info("convert_html_to_docling_table")
    text_ = find_html_code_block(text)
    if text_ is None:
        text_ = text  # assume the entire text is html

    try:
        converter = DocumentConverter(allowed_formats=[InputFormat.HTML])

        # Convert the extracted block (text_), not the raw input: feeding the
        # original text would pass the ``` fence markers to the HTML parser.
        buff = BytesIO(text_.encode("utf-8"))
        doc_stream = DocumentStream(name="tmp.html", stream=buff)

        conv: ConversionResult = converter.convert(doc_stream)

        if conv.status == ConversionStatus.SUCCESS:
            return conv.document.tables

    except Exception as exc:
        logger.error(exc)
        return None

    return None
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def validate_html_to_docling_table(text: str) -> bool:
    """Return True when ``convert_html_to_docling_table`` yields a result for ``text``."""
    logger.info("validate_html_to_docling_table")
    tables = convert_html_to_docling_table(text)
    return tables is not None
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def convert_markdown_to_docling_document(text: str) -> DoclingDocument | None:
    """Convert markdown (optionally wrapped in a code fence) into a DoclingDocument.

    If ``text`` contains a ```md / ```markdown fenced block, only that block's
    content is converted; otherwise the whole of ``text`` is treated as
    markdown.

    Args:
        text: Raw markdown, or text containing a markdown fenced block.

    Returns:
        The converted document when conversion succeeds, otherwise None
        (conversion failed or raised; exceptions are logged).
    """
    logger.info("convert_markdown_to_docling_document")
    text_ = find_markdown_code_block(text)
    if text_ is None:
        text_ = text  # assume the entire text is markdown

    try:
        converter = DocumentConverter(allowed_formats=[InputFormat.MD])

        buff = BytesIO(text_.encode("utf-8"))
        doc_stream = DocumentStream(name="tmp.md", stream=buff)

        conv: ConversionResult = converter.convert(doc_stream)

        if conv.status == ConversionStatus.SUCCESS:
            return conv.document
    except Exception as exc:
        # Log the failure like the HTML converters do, instead of dropping it silently.
        logger.error("Failed to convert markdown to DoclingDocument: %s", exc)
        return None

    return None
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def validate_markdown_to_docling_document(text: str) -> bool:
    """Return True when ``convert_markdown_to_docling_document`` yields a document for ``text``."""
    logger.info("validate_markdown_to_docling_document")
    converted = convert_markdown_to_docling_document(text)
    return converted is not None
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def convert_html_to_docling_document(text: str) -> DoclingDocument | None:
    """Convert HTML (optionally wrapped in a ```html fence) into a DoclingDocument.

    If ``text`` contains a ```html fenced block, only that block's content is
    converted; otherwise the whole of ``text`` is treated as HTML.

    Returns:
        The converted document when conversion succeeds, otherwise None
        (conversion failed or raised; exceptions are logged).
    """
    logger.info("convert_html_to_docling_document")
    fenced = find_html_code_block(text)
    # Without a fenced block, assume the entire text is html.
    html = text if fenced is None else fenced

    try:
        html_converter = DocumentConverter(allowed_formats=[InputFormat.HTML])
        stream = DocumentStream(name="tmp.html", stream=BytesIO(html.encode("utf-8")))
        outcome: ConversionResult = html_converter.convert(stream)
        if outcome.status == ConversionStatus.SUCCESS:
            return outcome.document
    except Exception as exc:
        logger.error(f"error: {exc}")
        return None

    return None
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def validate_html_to_docling_document(text: str) -> bool:
    """Return True when ``convert_html_to_docling_document`` yields a document for ``text``."""
    logger.info("validate_html_to_docling_document")
    converted = convert_html_to_docling_document(text)
    return converted is not None
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def insert_document(*, item: NodeItem, doc: DoclingDocument, updated_doc: DoclingDocument) -> DoclingDocument:
    """Replace ``item`` in ``doc`` with a group holding a copy of ``updated_doc``.

    ``item`` is swapped for a new "inserted-group" group. ``updated_doc`` is
    then walked with ``iterate_items(with_groups=True)`` and its groups, list
    items, text items and tables are re-created in ``doc`` under the
    counterpart of their original parent, with ``updated_doc``'s body mapped
    to the inserted group. Items whose parent is missing or was not copied
    are logged and skipped; other item types are logged and skipped.

    Args:
        item: The node of ``doc`` to replace. ``ListItem`` is not supported.
        doc: The document to modify in place.
        updated_doc: The document whose content is inserted.

    Returns:
        ``doc`` (the same object that was passed in).

    Raises:
        ValueError: If ``item`` is a ``ListItem``.
    """
    logger.info(f"insert_document: item={item.self_ref}")

    if isinstance(item, ListItem):
        # we should delete all the children of the list-item and put the text to ""
        raise ValueError("ListItem insertion is not yet supported!")

    group_item = GroupItem(
        label=GroupLabel.UNSPECIFIED,
        name="inserted-group",
        self_ref="#",  # temporary placeholder
    )

    doc.replace_item(old_item=item, new_item=group_item)  # group_item is being updated here ...

    # Maps a self_ref in updated_doc to the node created for it in doc.
    to_item: dict[str, NodeItem] = {}
    for _item, _level in updated_doc.iterate_items(with_groups=True):
        if isinstance(_item, GroupItem) and _item.self_ref == "#/body":
            # The body of updated_doc corresponds to the inserted group.
            to_item[_item.self_ref] = group_item

        elif _item.parent is None:
            logger.error(f"Item with null parent: {_item}")

        elif _item.parent.cref not in to_item:
            logger.error(f"Item with unknown parent: {_item}")

        elif isinstance(_item, GroupItem):
            gr = doc.add_group(
                name=_item.name,
                label=_item.label,
                parent=to_item[_item.parent.cref],
            )
            to_item[_item.self_ref] = gr

        elif isinstance(_item, ListItem):
            # ListItem must be tested before TextItem: the TextItem branch
            # below would otherwise handle list items as plain text.
            li = doc.add_list_item(
                text=_item.text,
                formatting=_item.formatting,
                parent=to_item[_item.parent.cref],
            )
            to_item[_item.self_ref] = li

        elif isinstance(_item, TextItem):
            te = doc.add_text(
                text=_item.text,
                label=_item.label,
                formatting=_item.formatting,
                parent=to_item[_item.parent.cref],
            )
            to_item[_item.self_ref] = te

        elif isinstance(_item, TableItem):
            # Attach the table (and its caption) to the mapped parent like
            # every other branch; without ``parent`` they would be appended
            # to the document body instead of the inserted group.
            target_parent = to_item[_item.parent.cref]
            if len(_item.captions) > 0:
                # Caption entries may be references; create an empty caption text item
                caption = doc.add_text(label=DocItemLabel.CAPTION, text="", parent=target_parent)
                te = doc.add_table(
                    data=_item.data,
                    caption=caption,
                    parent=target_parent,
                )
            else:
                te = doc.add_table(
                    data=_item.data,
                    parent=target_parent,
                )
            to_item[_item.self_ref] = te

        else:
            # Report the type of the item being copied, not of the replaced item.
            logger.warning(f"No support to insert items of type: {type(_item).__name__}")

    return doc
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
# ---------------------------------------------------------------------------
|
|
350
|
+
# Document tree utilities
|
|
351
|
+
# ---------------------------------------------------------------------------
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def get_item_by_ref(doc: DoclingDocument, ref: str) -> NodeItem | None:
    """Look up the node of ``doc`` addressed by the reference string ``ref``.

    Returns None when building or resolving the reference raises.
    """
    logger.info(f"get_item_by_ref: ref={ref!r}")
    try:
        resolved = RefItem(cref=ref).resolve(doc)
    except Exception:
        return None
    return resolved
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def collect_subtree_text(node: NodeItem, doc: DoclingDocument) -> str:
|
|
364
|
+
"""Recursively collect all text from a node and its descendants.
|
|
365
|
+
|
|
366
|
+
Resolves each child RefItem and concatenates text from TextItem instances
|
|
367
|
+
(which includes TitleItem, SectionHeaderItem, ListItem, TextItem proper).
|
|
368
|
+
Non-text nodes (TableItem, PictureItem, GroupItem) are traversed for their
|
|
369
|
+
children but do not contribute text directly.
|
|
370
|
+
"""
|
|
371
|
+
logger.info(f"collect_subtree_text: node={node.self_ref!r}")
|
|
372
|
+
parts: list[str] = []
|
|
373
|
+
if hasattr(node, "text") and node.text:
|
|
374
|
+
parts.append(node.text)
|
|
375
|
+
for child_ref in node.children or []:
|
|
376
|
+
try:
|
|
377
|
+
child = child_ref.resolve(doc)
|
|
378
|
+
subtree = collect_subtree_text(child, doc)
|
|
379
|
+
if subtree:
|
|
380
|
+
parts.append(subtree)
|
|
381
|
+
except Exception:
|
|
382
|
+
pass
|
|
383
|
+
return "\n".join(parts)
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def _copy_list_group(
    source: ListGroup,
    source_doc: DoclingDocument,
    target_doc: DoclingDocument,
    parent: NodeItem,
) -> ListGroup:
    """Copy a list group, with its items and nested list groups, into ``target_doc``.

    A new list group is added under ``parent``. Each ``ListItem`` child of
    ``source`` is re-created with its text and ``enumerated`` flag, and any
    ``ListGroup`` children of that item are copied recursively beneath it.
    Children of other types are ignored. ``meta`` is assigned from the source
    objects (not deep-copied).

    Args:
        source: The list group to copy.
        source_doc: Document used to resolve references of ``source``.
        target_doc: Document receiving the copy.
        parent: Node of ``target_doc`` under which the new group is added.

    Returns:
        The newly created list group. Children that fail to resolve or copy
        are logged as warnings and skipped.
    """
    logger.info(f"_copy_list_group: source={source.self_ref!r}")
    new_group = target_doc.add_list_group(parent=parent)
    new_group.meta = source.meta
    for child_ref in source.children or []:
        try:
            child = child_ref.resolve(source_doc)
            if not isinstance(child, ListItem):
                continue
            new_item = target_doc.add_list_item(
                text=child.text,
                enumerated=child.enumerated,
                parent=new_group,
            )
            new_item.meta = child.meta
            # Recursively copy nested list groups
            for nested_ref in child.children or []:
                try:
                    nested = nested_ref.resolve(source_doc)
                    if isinstance(nested, ListGroup):
                        _copy_list_group(nested, source_doc, target_doc, new_item)
                except Exception as exc:
                    # Keep going with the remaining children, but log the
                    # failure the same way the outer handler does.
                    logger.warning(f"Could not copy nested list group: {exc}")
        except Exception as exc:
            logger.warning(f"Could not copy list child: {exc}")
    return new_group
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
def _copy_table(
    source: TableItem,
    source_doc: DoclingDocument,
    target_doc: DoclingDocument,
    parent: NodeItem,
) -> TableItem:
    """Copy a table and its captions into ``target_doc`` under ``parent``.

    The table data and ``meta`` are taken from ``source``. Each caption
    reference is resolved against ``source_doc``; captions exposing ``text``
    are re-created as children of the new table and registered in its
    ``captions``. Captions that fail to copy are logged and skipped.

    Returns:
        The newly created table item.
    """
    logger.info(f"_copy_table: source={source.self_ref!r}")
    table_copy = target_doc.add_table(data=source.data, parent=parent)
    table_copy.meta = source.meta
    for ref in source.captions:
        try:
            original_caption = ref.resolve(source_doc)
            if not hasattr(original_caption, "text"):
                continue
            caption_copy = target_doc.add_text(
                label=original_caption.label,
                text=original_caption.text,
                parent=table_copy,
            )
            caption_copy.meta = original_caption.meta
            table_copy.captions.append(caption_copy.get_ref())
        except Exception as exc:
            logger.warning(f"Could not copy table caption: {exc}")
    return table_copy
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
def _copy_picture(
    source: PictureItem,
    source_doc: DoclingDocument,
    target_doc: DoclingDocument,
    parent: NodeItem,
) -> PictureItem:
    """Copy a picture and its captions into ``target_doc`` under ``parent``.

    The image and ``meta`` are taken from ``source``. Each caption reference
    is resolved against ``source_doc``; captions exposing ``text`` are
    re-created as children of the new picture and registered in its
    ``captions``. Captions that fail to copy are logged and skipped.

    Returns:
        The newly created picture item.
    """
    logger.info(f"_copy_picture: source={source.self_ref!r}")
    picture_copy = target_doc.add_picture(image=source.image, parent=parent)
    picture_copy.meta = source.meta
    for ref in source.captions:
        try:
            original_caption = ref.resolve(source_doc)
            if not hasattr(original_caption, "text"):
                continue
            caption_copy = target_doc.add_text(
                label=original_caption.label,
                text=original_caption.text,
                parent=picture_copy,
            )
            caption_copy.meta = original_caption.meta
            picture_copy.captions.append(caption_copy.get_ref())
        except Exception as exc:
            logger.warning(f"Could not copy picture caption: {exc}")
    return picture_copy
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def _flatten_into(
    node: NodeItem,
    source_doc: DoclingDocument,
    target_doc: DoclingDocument,
    target_parent: NodeItem,
) -> None:
    """Copy the descendants of ``node`` into ``target_doc`` directly under ``target_parent``.

    Children are visited in order. List groups, tables and pictures are
    copied as whole units by their dedicated helpers. Titles and section
    headers are re-created and their own children are flattened next to them.
    Other groups are dissolved (only their children are copied). Remaining
    items exposing ``text`` are re-created as text items. A ``ListItem`` found
    outside a list group is logged and skipped, and children that cannot be
    resolved are logged and skipped.
    """
    logger.info(f"_flatten_into: node={node.self_ref!r}")
    for ref in node.children or []:
        try:
            entry = ref.resolve(source_doc)
        except Exception as exc:
            logger.warning(f"Could not resolve child {ref}: {exc}")
            continue

        # The order of the checks matters: specific types are tested before
        # the more general ones (ListGroup before GroupItem, title / header /
        # list item before the generic ``text`` fallback).
        if isinstance(entry, ListGroup):
            _copy_list_group(entry, source_doc, target_doc, target_parent)
            continue

        if isinstance(entry, TableItem):
            _copy_table(entry, source_doc, target_doc, target_parent)
            continue

        if isinstance(entry, PictureItem):
            _copy_picture(entry, source_doc, target_doc, target_parent)
            continue

        if isinstance(entry, TitleItem):
            copied = target_doc.add_title(text=entry.text, parent=target_parent)
            copied.meta = entry.meta
            _flatten_into(entry, source_doc, target_doc, target_parent)
            continue

        if isinstance(entry, SectionHeaderItem):
            copied = target_doc.add_heading(text=entry.text, level=entry.level, parent=target_parent)
            copied.meta = entry.meta
            _flatten_into(entry, source_doc, target_doc, target_parent)
            continue

        if isinstance(entry, ListItem):
            logger.warning(f"ListItem {entry.self_ref} found outside a ListGroup; skipping")
            continue

        if isinstance(entry, GroupItem):
            # Dissolve other groups (recurse into children without adding the group)
            _flatten_into(entry, source_doc, target_doc, target_parent)
            continue

        if hasattr(entry, "text"):
            copied = target_doc.add_text(label=entry.label, text=entry.text, parent=target_parent)
            copied.meta = entry.meta
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
def make_flat_document(doc: DoclingDocument) -> DoclingDocument:
    """Build a new document in which section nesting is dissolved.

    The content of ``doc.body`` is copied by ``_flatten_into`` into the body
    of a fresh document carrying the same name. What is preserved:
    - SectionHeaderItem.level (needed for make_hierarchical_document to invert)
    - List internal structure (ListGroup → ListItem nesting is kept)
    - Table / picture caption children
    All other parent-child links (section → text) are dissolved.

    Returns:
        The new, flattened document; ``doc`` is only read.
    """
    logger.info(f"make_flat_document: doc={doc.name!r}")
    flattened = DoclingDocument(name=doc.name)
    _flatten_into(
        node=doc.body,
        source_doc=doc,
        target_doc=flattened,
        target_parent=flattened.body,
    )
    return flattened
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
def make_hierarchical_document(doc: DoclingDocument) -> DoclingDocument:
    """Return a new document with maximal section nesting.

    Iterates ``doc`` in document order (after flattening first). Maintains a
    stack of open section headers keyed by their level. Each non-header item
    (text, table, picture, list) is appended as a child of the most recently
    opened section header (or, if no header is open, of the most recent title,
    or of body when no title has been seen either).
    A section header at level N is appended as a child of the nearest ancestor
    whose level is strictly less than N.

    Lists, table-caption pairs and picture-caption pairs are treated as atomic
    units and are not split across parent boundaries.

    Args:
        doc: The document to restructure; it is only read.

    Returns:
        A new document with the same name as ``doc``.
    """
    logger.info(f"make_hierarchical_document: doc={doc.name!r}")
    # Work from the flattened form so only direct body children need visiting.
    flat = make_flat_document(doc)
    new_doc = DoclingDocument(name=doc.name)

    # open_sections maps level -> SectionHeaderItem (only section headers, not title).
    open_sections: dict[int, NodeItem] = {}
    # title_node is the most recently seen TitleItem; text before any section header
    # becomes a child of the title rather than of body.
    title_node: NodeItem | None = None

    def _current_parent() -> NodeItem:
        # Parent for non-header content: the open section with the highest
        # level number, else the latest title, else the document body.
        if open_sections:
            return open_sections[max(open_sections)]
        if title_node is not None:
            return title_node
        return new_doc.body

    def _parent_for_level(level: int) -> NodeItem:
        # Section headers nest only under other section headers, never under the title.
        candidates = [lv for lv in open_sections if lv < level]
        if not candidates:
            return new_doc.body
        return open_sections[max(candidates)]

    for child_ref in flat.body.children or []:
        try:
            child = child_ref.resolve(flat)
        except Exception as exc:
            logger.warning(f"Could not resolve body child {child_ref}: {exc}")
            continue

        if isinstance(child, TitleItem):
            # A title always hangs off body and resets the open sections.
            new_item = new_doc.add_title(text=child.text, parent=new_doc.body)
            new_item.meta = child.meta
            title_node = new_item
            open_sections = {}

        elif isinstance(child, SectionHeaderItem):
            level = child.level
            # Pick the parent before closing sections: it depends only on
            # sections with a lower level, which the pruning below keeps.
            parent = _parent_for_level(level)
            new_item = new_doc.add_heading(text=child.text, level=level, parent=parent)
            new_item.meta = child.meta
            # Close all open sections at >= this level
            open_sections = {lv: n for lv, n in open_sections.items() if lv < level}
            open_sections[level] = new_item

        elif isinstance(child, ListGroup):
            _copy_list_group(child, flat, new_doc, _current_parent())

        elif isinstance(child, TableItem):
            _copy_table(child, flat, new_doc, _current_parent())

        elif isinstance(child, PictureItem):
            _copy_picture(child, flat, new_doc, _current_parent())

        elif hasattr(child, "text"):
            # Generic fallback for any remaining item that exposes text.
            new_item = new_doc.add_text(label=child.label, text=child.text, parent=_current_parent())
            new_item.meta = child.meta

        else:
            logger.warning(f"Unhandled item type {type(child).__name__} in make_hierarchical_document")

    return new_doc
|