docling-agent 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
File without changes
@@ -0,0 +1,80 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import abstractmethod
4
+ from enum import Enum
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING
7
+
8
+ # from smolagents import MCPClient, Tool, ToolCollection
9
+ # from smolagents.models import ChatMessage, MessageRole, Model
10
+ from mellea.backends.model_ids import ModelIdentifier
11
+ from pydantic import BaseModel, ConfigDict
12
+
13
+ if TYPE_CHECKING:
14
+ from docling_core.types.doc.document import DoclingDocument
15
+
16
+ # Use shared logger from docling_agent.agents
17
+
18
+
19
class DoclingAgentType(Enum):
    """Closed set of agent roles, each identified by a short string value."""

    # Core agent types
    DOCLING_DOCUMENT_WRITER = "writer"
    DOCLING_DOCUMENT_EDITOR = "editor"
    DOCLING_DOCUMENT_EXTRACTOR = "extractor"
    DOCLING_DOCUMENT_ENRICHER = "enricher"
    DOCLING_DOCUMENT_RAG = "rag"
    DOCLING_DOCUMENT_ORCHESTRATOR = "orchestrator"

    def __str__(self) -> str:
        """Render the member as its raw string value (e.g. ``"writer"``)."""
        return self.value

    @classmethod
    def from_string(cls, value: str) -> "DoclingAgentType":
        """Return the member whose value equals ``value``.

        Raises:
            ValueError: If no member carries that value. The message lists
                every valid value.
        """
        found = next((member for member in cls if member.value == value), None)
        if found is None:
            raise ValueError(f"Invalid agent type: {value}. Valid types: {[t.value for t in cls]}")
        return found

    @classmethod
    def get_all_types(cls) -> list[str]:
        """Return the string values of all members, in declaration order."""
        return [member.value for member in cls]
46
+
47
+
48
class BaseDoclingAgent(BaseModel):
    """Pydantic base model holding the configuration shared by Docling agents.

    Concrete agents provide ``run``; this class only stores the agent type,
    the model identifiers and the tool list, and resolves which model id to
    use for reasoning and for writing.
    """

    # Non-pydantic types are accepted as field values without validation schemas.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    agent_type: DoclingAgentType
    # Primary model; also the fallback for the two optional ids below.
    model_id: ModelIdentifier
    tools: list

    # model needed for reasoning/instruction following
    reasoning_model_id: ModelIdentifier | None = None

    # model needed for writing, summarizing, etc
    writing_model_id: ModelIdentifier | None = None

    # Not read anywhere in this class; presumably an iteration cap for
    # subclasses -- confirm against the concrete agents.
    max_iteration: int = 16

    def get_reasoning_model_id(self) -> ModelIdentifier:
        """Return ``reasoning_model_id`` when it is set (truthy), else ``model_id``."""
        if self.reasoning_model_id:
            return self.reasoning_model_id
        return self.model_id

    def get_writing_model_id(self) -> ModelIdentifier:
        """Return ``writing_model_id`` when it is set (truthy), else ``model_id``."""
        if self.writing_model_id:
            return self.writing_model_id
        return self.model_id

    # NOTE(review): ``sources`` uses a mutable default list. It is never
    # touched here (the body only raises), but overriding implementations
    # that copy this signature must not mutate it in place.
    @abstractmethod
    def run(
        self,
        task: str,
        document: DoclingDocument | None = None,
        sources: list[DoclingDocument | Path] = [],
        **kwargs,
    ) -> DoclingDocument:
        """Execute the agent for ``task`` and return a document.

        Args:
            task: Natural-language description of the work to perform.
            document: Optional document to operate on.
            sources: Optional documents or paths supplied as additional input.
            **kwargs: Implementation-specific options.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError
@@ -0,0 +1,590 @@
1
+ import json
2
+ import re
3
+ from io import BytesIO
4
+
5
+ from docling.datamodel.base_models import ConversionStatus, InputFormat
6
+ from docling.datamodel.document import ConversionResult
7
+ from docling.document_converter import DocumentConverter
8
+ from docling_core.experimental.serializer.outline import (
9
+ OutlineDocSerializer,
10
+ OutlineFormat,
11
+ OutlineMode,
12
+ OutlineParams,
13
+ )
14
+ from docling_core.transforms.serializer.markdown import (
15
+ MarkdownDocSerializer,
16
+ MarkdownParams,
17
+ )
18
+ from docling_core.types.doc.document import (
19
+ DocItemLabel,
20
+ DoclingDocument,
21
+ GroupItem,
22
+ GroupLabel,
23
+ ListGroup,
24
+ ListItem,
25
+ NodeItem,
26
+ PictureItem,
27
+ RefItem,
28
+ SectionHeaderItem,
29
+ TableItem,
30
+ TextItem,
31
+ TitleItem,
32
+ )
33
+ from docling_core.types.io import DocumentStream
34
+
35
+ from docling_agent.logging import logger
36
+
37
+
38
def find_crefs(text: str) -> list[RefItem]:
    """Collect every ``#/<label>/<digits>`` reference occurring in ``text``.

    ``<label>`` is any ``DocItemLabel`` value. One ``RefItem`` is returned per
    match, in order of appearance; repeated references are kept.

    NOTE(review): the alternation is built from ``DocItemLabel`` values;
    confirm these are the path segments actually used by the crefs this is
    meant to detect.
    """
    logger.info("find_crefs")
    label_alternation: str = "|".join(label.value for label in DocItemLabel)
    cref_pattern = rf"#/({label_alternation})/\d+"
    return [RefItem(cref=hit.group(0)) for hit in re.finditer(cref_pattern, text, re.DOTALL)]
51
+
52
+
53
def has_crefs(text: str) -> bool:
    """Return True when ``find_crefs`` finds at least one reference in ``text``."""
    logger.info("has_crefs")
    return bool(find_crefs(text))
56
+
57
+
58
def has_json_dicts(text: str) -> bool:
    """Report whether ``text`` holds ```json fenced blocks that all parse.

    Returns True only if at least one ```json block is present and every
    block parses as JSON. The first block that fails to parse is logged as an
    error and makes the result False.
    """
    logger.info("has_json_dicts")
    fenced_blocks = re.findall(r"```json\s*(.*?)\s*```", text, re.DOTALL)

    parsed_count = 0
    for payload in fenced_blocks:
        try:
            json.loads(payload)
        except Exception as e:
            logger.error("Failed to parse JSON call block: %s", e)
            return False
        parsed_count += 1

    return parsed_count > 0
75
+
76
+
77
def find_json_dicts(text: str) -> list[dict]:
    """Parse every ```json fenced block in ``text`` and gather the results.

    A block holding a JSON array contributes each of its elements; any other
    parsed value is appended as-is. Blocks that fail to parse are logged as a
    warning and skipped.
    """
    logger.info("find_json_dicts")

    collected = []
    for i, found in enumerate(re.finditer(r"```json\s*(.*?)\s*```", text, re.DOTALL)):
        try:
            payload = json.loads(found.group(1))
        except json.JSONDecodeError as e:
            logger.warning(f"Failed to parse JSON in match {i}: {e}")
            continue
        if isinstance(payload, list):
            collected.extend(payload)
        else:
            collected.append(payload)

    return collected
98
+
99
+
100
def create_document_outline(
    doc: DoclingDocument, mode: OutlineMode = OutlineMode.TABLE_OF_CONTENTS, format: OutlineFormat = OutlineFormat.JSON
) -> str:
    """Serialize ``doc`` into an outline with the experimental OutlineDocSerializer.

    Args:
        doc: The DoclingDocument to outline.
        mode: Outline mode passed to the serializer (table of contents by default).
        format: Output format passed to the serializer (JSON by default).

    Returns:
        The text of the serialization result.
    """
    logger.debug("create_document_outline")

    # Non-meta items are always included; mode and format come from the caller.
    outline_params = OutlineParams(include_non_meta=True, mode=mode, format=format)
    return OutlineDocSerializer(doc=doc, params=outline_params).serialize().text
126
+
127
+
128
def serialize_item_to_markdown(item: TextItem, doc: DoclingDocument) -> str:
    """Return the markdown text of ``item``, serialized in the context of ``doc``.

    Uses a ``MarkdownDocSerializer`` with default ``MarkdownParams``.
    """
    logger.info("serialize_item_to_markdown")
    return MarkdownDocSerializer(doc=doc, params=MarkdownParams()).serialize(item=item).text
136
+
137
+
138
def serialize_table_to_html(table: TableItem, doc: DoclingDocument) -> str:
    """Return the HTML text of ``table``, serialized in the context of ``doc``."""
    logger.info("serialize_table_to_html")
    # Imported here rather than at module level, as in the original code.
    from docling_core.transforms.serializer.html import (
        HTMLDocSerializer,
        HTMLTableSerializer,
    )

    # The table serializer needs a document serializer as a collaborator.
    serialized = HTMLTableSerializer().serialize(
        item=table,
        doc_serializer=HTMLDocSerializer(doc=doc),
        doc=doc,
    )
    return serialized.text
155
+
156
+
157
+ def find_html_code_block(text: str) -> str | None:
158
+ """
159
+ Check if a string matches the pattern ```html(.*)?```
160
+ """
161
+ logger.info("find_html_code_block")
162
+ pattern = r"```html(.*?)```"
163
+ match = re.search(pattern, text, re.DOTALL)
164
+ return match.group(1) if match else None
165
+
166
+
167
def has_html_code_block(text: str) -> bool:
    """Return True when ``text`` contains a ```html fenced block anywhere."""
    logger.info("has_html_code_block")
    extracted = find_html_code_block(text)
    return extracted is not None
173
+
174
+
175
+ def find_markdown_code_block(text: str) -> str | None:
176
+ """
177
+ Check if a string matches the pattern ```(md|markdown)(.*)?```
178
+ """
179
+ logger.info("find_markdown_code_block")
180
+ pattern = r"```(md|markdown)(.*?)```"
181
+ match = re.search(pattern, text, re.DOTALL)
182
+ return match.group(2) if match else None
183
+
184
+
185
def has_markdown_code_block(text: str) -> bool:
    """Return True when ``text`` contains a ```md / ```markdown fenced block anywhere."""
    logger.info("has_markdown_code_block")
    extracted = find_markdown_code_block(text)
    return extracted is not None
191
+
192
+
193
def convert_html_to_docling_table(text: str) -> list[TableItem] | None:
    """Convert HTML found in ``text`` and return the tables of the result.

    If ``text`` contains a ```html fenced block, only the block's content is
    converted; otherwise the whole text is treated as HTML.

    Returns:
        The ``tables`` of the converted document when conversion succeeds,
        or None when the conversion status is not SUCCESS or an exception is
        raised (the exception is logged).
    """
    logger.info("convert_html_to_docling_table")
    html = find_html_code_block(text)
    if html is None:
        html = text  # no fenced block: assume the entire text is html

    try:
        converter = DocumentConverter(allowed_formats=[InputFormat.HTML])

        # Convert the extracted HTML, not the raw input: the raw input may
        # still carry the fence markers and surrounding prose.
        buff = BytesIO(html.encode("utf-8"))
        doc_stream = DocumentStream(name="tmp.html", stream=buff)

        conv: ConversionResult = converter.convert(doc_stream)

        if conv.status == ConversionStatus.SUCCESS:
            return conv.document.tables

    except Exception as exc:
        logger.error(exc)
        return None

    return None
215
+
216
+
217
def validate_html_to_docling_table(text: str) -> bool:
    """Return True when ``convert_html_to_docling_table`` yields a result for ``text``.

    Only a None result counts as failure, so a successful conversion that
    produced no tables still validates.
    """
    logger.info("validate_html_to_docling_table")
    tables = convert_html_to_docling_table(text)
    return tables is not None
220
+
221
+
222
def convert_markdown_to_docling_document(text: str) -> DoclingDocument | None:
    """Convert markdown found in ``text`` into a DoclingDocument.

    If ``text`` contains a ```md / ```markdown fenced block, only the block's
    content is converted; otherwise the whole text is treated as markdown.

    Returns:
        The converted document when conversion succeeds, or None when the
        conversion status is not SUCCESS or an exception is raised (the
        exception is logged).
    """
    logger.info("convert_markdown_to_docling_document")
    markdown = find_markdown_code_block(text)
    if markdown is None:
        markdown = text  # no fenced block: assume the entire text is markdown

    try:
        converter = DocumentConverter(allowed_formats=[InputFormat.MD])

        buff = BytesIO(markdown.encode("utf-8"))
        doc_stream = DocumentStream(name="tmp.md", stream=buff)

        conv: ConversionResult = converter.convert(doc_stream)

        if conv.status == ConversionStatus.SUCCESS:
            return conv.document
    except Exception as exc:
        # Log the failure like the HTML converters do instead of hiding it.
        logger.error(f"error: {exc}")
        return None

    return None
242
+
243
+
244
def validate_markdown_to_docling_document(text: str) -> bool:
    """Return True when ``convert_markdown_to_docling_document`` yields a document for ``text``."""
    logger.info("validate_markdown_to_docling_document")
    converted = convert_markdown_to_docling_document(text)
    return converted is not None
247
+
248
+
249
def convert_html_to_docling_document(text: str) -> DoclingDocument | None:
    """Convert HTML found in ``text`` into a DoclingDocument.

    If ``text`` contains a ```html fenced block, only the block's content is
    converted; otherwise the whole text is treated as HTML.

    Returns:
        The converted document when conversion succeeds, or None when the
        conversion status is not SUCCESS or an exception is raised (the
        exception is logged).
    """
    logger.info("convert_html_to_docling_document")
    fenced = find_html_code_block(text)
    html = text if fenced is None else fenced  # no fence: whole text is html

    try:
        html_converter = DocumentConverter(allowed_formats=[InputFormat.HTML])
        stream = DocumentStream(name="tmp.html", stream=BytesIO(html.encode("utf-8")))
        outcome: ConversionResult = html_converter.convert(stream)

        if outcome.status == ConversionStatus.SUCCESS:
            return outcome.document
    except Exception as exc:
        logger.error(f"error: {exc}")
        return None

    return None
270
+
271
+
272
def validate_html_to_docling_document(text: str) -> bool:
    """Return True when ``convert_html_to_docling_document`` yields a document for ``text``."""
    logger.info("validate_html_to_docling_document")
    converted = convert_html_to_docling_document(text)
    return converted is not None
275
+
276
+
277
def insert_document(*, item: NodeItem, doc: DoclingDocument, updated_doc: DoclingDocument) -> DoclingDocument:
    """Replace ``item`` in ``doc`` with a group holding the content of ``updated_doc``.

    ``item`` is swapped for a new group named ``"inserted-group"``. The items
    of ``updated_doc`` are then re-created in ``doc`` beneath that group: the
    body of ``updated_doc`` maps onto the new group, and every other item is
    attached to the copy of its own parent.

    Args:
        item: The node in ``doc`` to replace.
        doc: The document to modify in place.
        updated_doc: The document whose content is inserted.

    Returns:
        ``doc``, after modification.

    Raises:
        ValueError: If ``item`` is a ListItem (not supported yet).
    """
    logger.info(f"insert_document: item={item.self_ref}")

    if isinstance(item, ListItem):
        # we should delete all the children of the list-item and put the text to ""
        raise ValueError("ListItem insertion is not yet supported!")

    group_item = GroupItem(
        label=GroupLabel.UNSPECIFIED,
        name="inserted-group",
        self_ref="#",  # temporary placeholder
    )

    doc.replace_item(old_item=item, new_item=group_item)  # group_item is being updated here ...

    # Maps a self_ref in updated_doc to the node created for it in doc.
    to_item: dict[str, NodeItem] = {}
    for _item, _level in updated_doc.iterate_items(with_groups=True):
        if isinstance(_item, GroupItem) and _item.self_ref == "#/body":
            to_item[_item.self_ref] = group_item

        elif _item.parent is None:
            logger.error(f"Item with null parent: {_item}")

        elif _item.parent.cref not in to_item:
            logger.error(f"Item with unknown parent: {_item}")

        elif isinstance(_item, GroupItem):
            gr = doc.add_group(
                name=_item.name,
                label=_item.label,
                parent=to_item[_item.parent.cref],
            )
            to_item[_item.self_ref] = gr

        # ListItem is tested before TextItem so the generic text branch
        # does not take it.
        elif isinstance(_item, ListItem):
            li = doc.add_list_item(
                text=_item.text,
                formatting=_item.formatting,
                parent=to_item[_item.parent.cref],
            )
            to_item[_item.self_ref] = li

        elif isinstance(_item, TextItem):
            te = doc.add_text(
                text=_item.text,
                label=_item.label,
                formatting=_item.formatting,
                parent=to_item[_item.parent.cref],
            )
            to_item[_item.self_ref] = te

        elif isinstance(_item, TableItem):
            # Attach the table to the copy of its parent, like every other
            # item type, so it lands inside the inserted group.
            table_parent = to_item[_item.parent.cref]
            if len(_item.captions) > 0:
                # Caption entries may be references; create an empty caption text item
                # NOTE(review): the caption text is still created without a
                # parent -- confirm where it should be attached.
                caption = doc.add_text(label=DocItemLabel.CAPTION, text="")
                tbl = doc.add_table(
                    data=_item.data,
                    caption=caption,
                    parent=table_parent,
                )
            else:
                tbl = doc.add_table(
                    data=_item.data,
                    parent=table_parent,
                )
            to_item[_item.self_ref] = tbl

        else:
            # Report the type of the unsupported item from updated_doc, not
            # the type of the item that was replaced.
            logger.warning(f"No support to insert items of type: {type(_item).__name__}")

    return doc
347
+
348
+
349
+ # ---------------------------------------------------------------------------
350
+ # Document tree utilities
351
+ # ---------------------------------------------------------------------------
352
+
353
+
354
def get_item_by_ref(doc: DoclingDocument, ref: str) -> NodeItem | None:
    """Look up the node that ``ref`` points to in ``doc``.

    Any exception raised while building or resolving the reference results
    in None.
    """
    logger.info(f"get_item_by_ref: ref={ref!r}")
    try:
        resolved = RefItem(cref=ref).resolve(doc)
    except Exception:
        return None
    return resolved
361
+
362
+
363
def collect_subtree_text(node: NodeItem, doc: DoclingDocument) -> str:
    """Recursively collect the text of ``node`` and all of its descendants.

    A node contributes its ``text`` attribute when it has one and it is
    non-empty; nodes without text are still traversed for their children.
    Children are visited in order and the collected pieces are joined with
    newlines.

    Collection is best-effort: a child that cannot be resolved or processed
    is logged as a warning and skipped.

    Args:
        node: Root of the subtree to collect text from.
        doc: Document used to resolve child references.

    Returns:
        The newline-joined text, or an empty string if nothing was collected.
    """
    logger.info(f"collect_subtree_text: node={node.self_ref!r}")
    parts: list[str] = []
    if hasattr(node, "text") and node.text:
        parts.append(node.text)
    for child_ref in node.children or []:
        try:
            child = child_ref.resolve(doc)
            subtree = collect_subtree_text(child, doc)
        except Exception as exc:
            # Skip the broken child but leave a trace instead of hiding it.
            logger.warning(f"Could not collect text from child {child_ref}: {exc}")
            continue
        if subtree:
            parts.append(subtree)
    return "\n".join(parts)
384
+
385
+
386
def _copy_list_group(
    source: ListGroup,
    source_doc: DoclingDocument,
    target_doc: DoclingDocument,
    parent: NodeItem,
) -> ListGroup:
    """Copy a list group and its items from ``source_doc`` into ``target_doc``.

    A new list group is created under ``parent``. Each ListItem child of
    ``source`` is re-created with its text, ``enumerated`` flag and meta.
    ListGroup children of a list item are copied recursively beneath the new
    item. Children of other types are not copied.

    Copying is best-effort: a child that fails to resolve or copy is logged
    as a warning and skipped.

    Returns:
        The newly created list group in ``target_doc``.
    """
    logger.info(f"_copy_list_group: source={source.self_ref!r}")
    new_group = target_doc.add_list_group(parent=parent)
    new_group.meta = source.meta
    for child_ref in source.children or []:
        try:
            child = child_ref.resolve(source_doc)
            if not isinstance(child, ListItem):
                continue
            new_item = target_doc.add_list_item(
                text=child.text,
                enumerated=child.enumerated,
                parent=new_group,
            )
            new_item.meta = child.meta
            # Recursively copy nested list groups
            for nested_ref in child.children or []:
                try:
                    nested = nested_ref.resolve(source_doc)
                    if isinstance(nested, ListGroup):
                        _copy_list_group(nested, source_doc, target_doc, new_item)
                except Exception as exc:
                    # Same reporting as for direct children; the failure
                    # used to be dropped silently.
                    logger.warning(f"Could not copy nested list {nested_ref}: {exc}")
        except Exception as exc:
            logger.warning(f"Could not copy list child: {exc}")
    return new_group
416
+
417
+
418
def _copy_table(
    source: TableItem,
    source_doc: DoclingDocument,
    target_doc: DoclingDocument,
    parent: NodeItem,
) -> TableItem:
    """Copy a table and its captions from ``source_doc`` into ``target_doc``.

    The table is re-created under ``parent`` with the same data and meta.
    Each caption that resolves to a node with a ``text`` attribute is
    re-created as a text child of the new table and registered in its
    ``captions``. A caption that fails is logged as a warning and skipped.

    Returns:
        The newly created table in ``target_doc``.
    """
    logger.info(f"_copy_table: source={source.self_ref!r}")
    copied = target_doc.add_table(data=source.data, parent=parent)
    copied.meta = source.meta
    for caption_ref in source.captions:
        try:
            caption = caption_ref.resolve(source_doc)
            if not hasattr(caption, "text"):
                continue
            copied_caption = target_doc.add_text(label=caption.label, text=caption.text, parent=copied)
            copied_caption.meta = caption.meta
            copied.captions.append(copied_caption.get_ref())
        except Exception as exc:
            logger.warning(f"Could not copy table caption: {exc}")
    return copied
437
+
438
+
439
def _copy_picture(
    source: PictureItem,
    source_doc: DoclingDocument,
    target_doc: DoclingDocument,
    parent: NodeItem,
) -> PictureItem:
    """Copy a picture and its captions from ``source_doc`` into ``target_doc``.

    The picture is re-created under ``parent`` with the same image and meta.
    Each caption that resolves to a node with a ``text`` attribute is
    re-created as a text child of the new picture and registered in its
    ``captions``. A caption that fails is logged as a warning and skipped.

    Returns:
        The newly created picture in ``target_doc``.
    """
    logger.info(f"_copy_picture: source={source.self_ref!r}")
    copied = target_doc.add_picture(image=source.image, parent=parent)
    copied.meta = source.meta
    for caption_ref in source.captions:
        try:
            caption = caption_ref.resolve(source_doc)
            if not hasattr(caption, "text"):
                continue
            copied_caption = target_doc.add_text(label=caption.label, text=caption.text, parent=copied)
            copied_caption.meta = caption.meta
            copied.captions.append(copied_caption.get_ref())
        except Exception as exc:
            logger.warning(f"Could not copy picture caption: {exc}")
    return copied
458
+
459
+
460
def _flatten_into(
    node: NodeItem,
    source_doc: DoclingDocument,
    target_doc: DoclingDocument,
    target_parent: NodeItem,
) -> None:
    """Recursively add the descendants of ``node`` directly under ``target_parent``.

    Each child of ``node`` is handled by type:

    - list groups, tables and pictures are copied as whole units;
    - titles and section headers are re-created (headers keep their level),
      then their own children are flattened into ``target_parent``;
    - other groups are dissolved: only their children are carried over;
    - a ListItem found outside a ListGroup is skipped with a warning;
    - any other child with a ``text`` attribute is re-created as text;
    - anything else is skipped with a warning.

    Children that cannot be resolved are logged as a warning and skipped.
    """
    logger.info(f"_flatten_into: node={node.self_ref!r}")
    for child_ref in node.children or []:
        try:
            child = child_ref.resolve(source_doc)
        except Exception as exc:
            logger.warning(f"Could not resolve child {child_ref}: {exc}")
            continue

        # Branch order matters: the specific item types are tested before
        # the generic GroupItem and "has text" fallbacks.
        if isinstance(child, ListGroup):
            _copy_list_group(child, source_doc, target_doc, target_parent)
        elif isinstance(child, TableItem):
            _copy_table(child, source_doc, target_doc, target_parent)
        elif isinstance(child, PictureItem):
            _copy_picture(child, source_doc, target_doc, target_parent)
        elif isinstance(child, TitleItem):
            new_item = target_doc.add_title(text=child.text, parent=target_parent)
            new_item.meta = child.meta
            _flatten_into(child, source_doc, target_doc, target_parent)
        elif isinstance(child, SectionHeaderItem):
            new_item = target_doc.add_heading(text=child.text, level=child.level, parent=target_parent)
            new_item.meta = child.meta
            _flatten_into(child, source_doc, target_doc, target_parent)
        elif isinstance(child, ListItem):
            logger.warning(f"ListItem {child.self_ref} found outside a ListGroup; skipping")
        elif isinstance(child, GroupItem):
            # Dissolve other groups (recurse into children without adding the group)
            _flatten_into(child, source_doc, target_doc, target_parent)
        elif hasattr(child, "text"):
            new_item = target_doc.add_text(label=child.label, text=child.text, parent=target_parent)
            new_item.meta = child.meta
        else:
            # Report dropped items instead of losing them silently, as
            # make_hierarchical_document does for its unhandled items.
            logger.warning(f"Unhandled item type {type(child).__name__} in _flatten_into")
497
+
498
+
499
def make_flat_document(doc: DoclingDocument) -> DoclingDocument:
    """Build a new document whose items hang directly off the body.

    The body of ``doc`` is flattened via ``_flatten_into`` into a fresh
    document carrying the same name. That flattening keeps:
    - SectionHeaderItem.level (needed for make_hierarchical_document to invert)
    - list internal structure (ListGroup -> ListItem nesting)
    - table / picture caption children
    All other parent-child links (section -> text) are dissolved.
    """
    logger.info(f"make_flat_document: doc={doc.name!r}")
    flat_doc = DoclingDocument(name=doc.name)
    _flatten_into(doc.body, doc, flat_doc, flat_doc.body)
    return flat_doc
513
+
514
+
515
def make_hierarchical_document(doc: DoclingDocument) -> DoclingDocument:
    """Build a new document in which content is nested under its section headers.

    ``doc`` is flattened first, then its body children are replayed in order
    while tracking the currently open section header for each level:

    - A title is added under the body and closes every open section.
    - A section header at level N is added under the open section with the
      highest level below N (or under the body if there is none), and closes
      every open section at level N or deeper.
    - Lists, tables and pictures are copied as whole units, and other items
      with text are re-created, under the deepest open section; if no section
      is open they go under the latest title, and otherwise under the body.
    - Items matching none of these are skipped with a warning.
    """
    logger.info(f"make_hierarchical_document: doc={doc.name!r}")
    flat = make_flat_document(doc)
    new_doc = DoclingDocument(name=doc.name)

    # level -> currently open section header (titles are tracked separately).
    open_sections: dict[int, NodeItem] = {}
    # Latest title seen; it adopts content that appears before any section header.
    title_node: NodeItem | None = None

    def _current_parent() -> NodeItem:
        # Deepest open section wins, then the title, then the body.
        if open_sections:
            return open_sections[max(open_sections)]
        return new_doc.body if title_node is None else title_node

    def _parent_for_level(level: int) -> NodeItem:
        # Section headers nest only under other section headers, never under the title.
        shallower = [lv for lv in open_sections if lv < level]
        return open_sections[max(shallower)] if shallower else new_doc.body

    for child_ref in flat.body.children or []:
        try:
            child = child_ref.resolve(flat)
        except Exception as exc:
            logger.warning(f"Could not resolve body child {child_ref}: {exc}")
            continue

        if isinstance(child, TitleItem):
            title_node = new_doc.add_title(text=child.text, parent=new_doc.body)
            title_node.meta = child.meta
            open_sections.clear()

        elif isinstance(child, SectionHeaderItem):
            level = child.level
            heading = new_doc.add_heading(text=child.text, level=level, parent=_parent_for_level(level))
            heading.meta = child.meta
            # Close all open sections at >= this level, then open this one.
            for closed_level in [lv for lv in open_sections if lv >= level]:
                del open_sections[closed_level]
            open_sections[level] = heading

        elif isinstance(child, ListGroup):
            _copy_list_group(child, flat, new_doc, _current_parent())

        elif isinstance(child, TableItem):
            _copy_table(child, flat, new_doc, _current_parent())

        elif isinstance(child, PictureItem):
            _copy_picture(child, flat, new_doc, _current_parent())

        elif hasattr(child, "text"):
            copied_text = new_doc.add_text(label=child.label, text=child.text, parent=_current_parent())
            copied_text.meta = child.meta

        else:
            logger.warning(f"Unhandled item type {type(child).__name__} in make_hierarchical_document")

    return new_doc