docling 2.45.0__tar.gz → 2.47.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.45.0 → docling-2.47.0}/PKG-INFO +3 -2
- {docling-2.45.0 → docling-2.47.0}/docling/backend/docling_parse_v4_backend.py +61 -27
- {docling-2.45.0 → docling-2.47.0}/docling/backend/html_backend.py +119 -17
- {docling-2.45.0 → docling-2.47.0}/docling/backend/msword_backend.py +126 -16
- {docling-2.45.0 → docling-2.47.0}/docling/cli/main.py +14 -0
- {docling-2.45.0 → docling-2.47.0}/docling/cli/models.py +56 -0
- {docling-2.45.0 → docling-2.47.0}/docling/datamodel/base_models.py +1 -1
- {docling-2.45.0 → docling-2.47.0}/docling/datamodel/pipeline_options.py +4 -3
- {docling-2.45.0 → docling-2.47.0}/docling/datamodel/pipeline_options_vlm_model.py +5 -0
- {docling-2.45.0 → docling-2.47.0}/docling/datamodel/vlm_model_specs.py +114 -1
- docling-2.47.0/docling/models/base_model.py +186 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/code_formula_model.py +87 -76
- {docling-2.45.0 → docling-2.47.0}/docling/models/page_preprocessing_model.py +5 -1
- {docling-2.45.0 → docling-2.47.0}/docling/models/picture_description_vlm_model.py +4 -2
- {docling-2.45.0 → docling-2.47.0}/docling/models/tesseract_ocr_cli_model.py +4 -2
- docling-2.47.0/docling/models/vlm_models_inline/hf_transformers_model.py +314 -0
- docling-2.47.0/docling/models/vlm_models_inline/mlx_model.py +260 -0
- docling-2.47.0/docling/models/vlm_models_inline/vllm_model.py +235 -0
- {docling-2.45.0 → docling-2.47.0}/docling/pipeline/base_pipeline.py +7 -1
- {docling-2.45.0 → docling-2.47.0}/docling/pipeline/threaded_standard_pdf_pipeline.py +7 -5
- {docling-2.45.0 → docling-2.47.0}/docling/pipeline/vlm_pipeline.py +14 -1
- docling-2.47.0/docling/py.typed +1 -0
- {docling-2.45.0 → docling-2.47.0}/docling/utils/layout_postprocessor.py +51 -43
- {docling-2.45.0 → docling-2.47.0}/docling.egg-info/PKG-INFO +3 -2
- {docling-2.45.0 → docling-2.47.0}/docling.egg-info/SOURCES.txt +1 -0
- {docling-2.45.0 → docling-2.47.0}/docling.egg-info/requires.txt +4 -1
- {docling-2.45.0 → docling-2.47.0}/pyproject.toml +4 -2
- {docling-2.45.0 → docling-2.47.0}/tests/test_backend_html.py +32 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_e2e_conversion.py +1 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_interfaces.py +3 -0
- docling-2.45.0/docling/models/base_model.py +0 -93
- docling-2.45.0/docling/models/vlm_models_inline/hf_transformers_model.py +0 -214
- docling-2.45.0/docling/models/vlm_models_inline/mlx_model.py +0 -149
- docling-2.45.0/docling/utils/__init__.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/LICENSE +0 -0
- {docling-2.45.0 → docling-2.47.0}/README.md +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/__init__.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/__init__.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/docx/latex/latex_dict.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/docx/latex/omml.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/md_backend.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/mets_gbs_backend.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/noop_backend.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/chunking/__init__.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/cli/__init__.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/cli/tools.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/datamodel/accelerator_options.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/datamodel/asr_model_specs.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/datamodel/document.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/datamodel/layout_model_specs.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/datamodel/pipeline_options_asr_model.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/datamodel/settings.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/document_converter.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/exceptions.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/__init__.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/api_vlm_model.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/layout_model.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/picture_description_api_model.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/picture_description_base_model.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/utils/__init__.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/models/utils/hf_model_download.py +0 -0
- /docling-2.45.0/docling/py.typed → /docling-2.47.0/docling/models/vlm_models_inline/__init__.py +0 -0
- {docling-2.45.0/docling/models/vlm_models_inline → docling-2.47.0/docling/pipeline}/__init__.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/pipeline/asr_pipeline.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.45.0/docling/pipeline → docling-2.47.0/docling/utils}/__init__.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/utils/api_image_request.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/utils/export.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/utils/locks.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/utils/orientation.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/utils/profiling.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/utils/utils.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling/utils/visualization.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling.egg-info/dependency_links.txt +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling.egg-info/entry_points.txt +0 -0
- {docling-2.45.0 → docling-2.47.0}/docling.egg-info/top_level.txt +0 -0
- {docling-2.45.0 → docling-2.47.0}/setup.cfg +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_asr_pipeline.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_backend_asciidoc.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_backend_csv.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_backend_docling_json.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_backend_docling_parse.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_backend_docling_parse_v2.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_backend_docling_parse_v4.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_backend_jats.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_backend_markdown.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_backend_mets_gbs.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_backend_msexcel.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_backend_msword.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_backend_patent_uspto.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_backend_pdfium.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_backend_pptx.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_backend_webp.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_cli.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_code_formula.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_data_gen_flag.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_document_picture_classifier.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_e2e_ocr_conversion.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_input_doc.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_invalid_input.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_legacy_format_transform.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_ocr_utils.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_options.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_settings_load.py +0 -0
- {docling-2.45.0 → docling-2.47.0}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.45.0
|
3
|
+
Version: 2.47.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -27,7 +27,7 @@ Description-Content-Type: text/markdown
|
|
27
27
|
License-File: LICENSE
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
29
29
|
Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
|
30
|
-
Requires-Dist: docling-parse<5.0.0,>=4.
|
30
|
+
Requires-Dist: docling-parse<5.0.0,>=4.2.2
|
31
31
|
Requires-Dist: docling-ibm-models<4,>=3.9.0
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
33
33
|
Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
|
@@ -59,6 +59,7 @@ Provides-Extra: vlm
|
|
59
59
|
Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
|
60
60
|
Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
|
61
61
|
Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
|
62
|
+
Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux") and extra == "vlm"
|
62
63
|
Provides-Extra: rapidocr
|
63
64
|
Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
|
64
65
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
@@ -22,15 +22,52 @@ _log = logging.getLogger(__name__)
|
|
22
22
|
|
23
23
|
|
24
24
|
class DoclingParseV4PageBackend(PdfPageBackend):
|
25
|
-
def __init__(
|
25
|
+
def __init__(
|
26
|
+
self,
|
27
|
+
*,
|
28
|
+
dp_doc: PdfDocument,
|
29
|
+
page_obj: PdfPage,
|
30
|
+
page_no: int,
|
31
|
+
create_words: bool = True,
|
32
|
+
create_textlines: bool = True,
|
33
|
+
):
|
26
34
|
self._ppage = page_obj
|
27
|
-
self.
|
28
|
-
self.
|
35
|
+
self._dp_doc = dp_doc
|
36
|
+
self._page_no = page_no
|
37
|
+
self._create_words = create_words
|
38
|
+
self._create_textlines = create_textlines
|
39
|
+
|
40
|
+
self._dpage: Optional[SegmentedPdfPage] = None
|
41
|
+
self._unloaded = False
|
42
|
+
self.valid = (self._ppage is not None) and (self._dp_doc is not None)
|
43
|
+
|
44
|
+
def _ensure_parsed(self) -> None:
|
45
|
+
if self._dpage is not None:
|
46
|
+
return
|
47
|
+
|
48
|
+
seg_page = self._dp_doc.get_page(
|
49
|
+
self._page_no + 1,
|
50
|
+
create_words=self._create_words,
|
51
|
+
create_textlines=self._create_textlines,
|
52
|
+
)
|
53
|
+
|
54
|
+
# In Docling, all TextCell instances are expected with top-left origin.
|
55
|
+
[
|
56
|
+
tc.to_top_left_origin(seg_page.dimension.height)
|
57
|
+
for tc in seg_page.textline_cells
|
58
|
+
]
|
59
|
+
[tc.to_top_left_origin(seg_page.dimension.height) for tc in seg_page.char_cells]
|
60
|
+
[tc.to_top_left_origin(seg_page.dimension.height) for tc in seg_page.word_cells]
|
61
|
+
|
62
|
+
self._dpage = seg_page
|
29
63
|
|
30
64
|
def is_valid(self) -> bool:
|
31
65
|
return self.valid
|
32
66
|
|
33
67
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
68
|
+
self._ensure_parsed()
|
69
|
+
assert self._dpage is not None
|
70
|
+
|
34
71
|
# Find intersecting cells on the page
|
35
72
|
text_piece = ""
|
36
73
|
page_size = self.get_size()
|
@@ -56,12 +93,19 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
56
93
|
return text_piece
|
57
94
|
|
58
95
|
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
96
|
+
self._ensure_parsed()
|
59
97
|
return self._dpage
|
60
98
|
|
61
99
|
def get_text_cells(self) -> Iterable[TextCell]:
|
100
|
+
self._ensure_parsed()
|
101
|
+
assert self._dpage is not None
|
102
|
+
|
62
103
|
return self._dpage.textline_cells
|
63
104
|
|
64
105
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
106
|
+
self._ensure_parsed()
|
107
|
+
assert self._dpage is not None
|
108
|
+
|
65
109
|
AREA_THRESHOLD = 0 # 32 * 32
|
66
110
|
|
67
111
|
images = self._dpage.bitmap_resources
|
@@ -123,8 +167,13 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|
123
167
|
# )
|
124
168
|
|
125
169
|
def unload(self):
|
170
|
+
if not self._unloaded and self._dp_doc is not None:
|
171
|
+
self._dp_doc.unload_pages((self._page_no + 1, self._page_no + 2))
|
172
|
+
self._unloaded = True
|
173
|
+
|
126
174
|
self._ppage = None
|
127
175
|
self._dpage = None
|
176
|
+
self._dp_doc = None
|
128
177
|
|
129
178
|
|
130
179
|
class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
@@ -157,30 +206,15 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
|
157
206
|
self, page_no: int, create_words: bool = True, create_textlines: bool = True
|
158
207
|
) -> DoclingParseV4PageBackend:
|
159
208
|
with pypdfium2_lock:
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
for tc in seg_page.textline_cells
|
170
|
-
]
|
171
|
-
[
|
172
|
-
tc.to_top_left_origin(seg_page.dimension.height)
|
173
|
-
for tc in seg_page.char_cells
|
174
|
-
]
|
175
|
-
[
|
176
|
-
tc.to_top_left_origin(seg_page.dimension.height)
|
177
|
-
for tc in seg_page.word_cells
|
178
|
-
]
|
179
|
-
|
180
|
-
return DoclingParseV4PageBackend(
|
181
|
-
seg_page,
|
182
|
-
self._pdoc[page_no],
|
183
|
-
)
|
209
|
+
ppage = self._pdoc[page_no]
|
210
|
+
|
211
|
+
return DoclingParseV4PageBackend(
|
212
|
+
dp_doc=self.dp_doc,
|
213
|
+
page_obj=ppage,
|
214
|
+
page_no=page_no,
|
215
|
+
create_words=create_words,
|
216
|
+
create_textlines=create_textlines,
|
217
|
+
)
|
184
218
|
|
185
219
|
def is_valid(self) -> bool:
|
186
220
|
return self.page_count() > 0
|
@@ -20,7 +20,7 @@ from docling_core.types.doc import (
|
|
20
20
|
TableData,
|
21
21
|
TextItem,
|
22
22
|
)
|
23
|
-
from docling_core.types.doc.document import ContentLayer
|
23
|
+
from docling_core.types.doc.document import ContentLayer, Formatting, Script
|
24
24
|
from pydantic import AnyUrl, BaseModel, ValidationError
|
25
25
|
from typing_extensions import override
|
26
26
|
|
@@ -38,6 +38,7 @@ _BLOCK_TAGS: Final = {
|
|
38
38
|
"address",
|
39
39
|
"details",
|
40
40
|
"figure",
|
41
|
+
"footer",
|
41
42
|
"h1",
|
42
43
|
"h2",
|
43
44
|
"h3",
|
@@ -53,6 +54,21 @@ _BLOCK_TAGS: Final = {
|
|
53
54
|
"table",
|
54
55
|
}
|
55
56
|
|
57
|
+
_FORMAT_TAG_MAP: Final = {
|
58
|
+
"b": {"bold": True},
|
59
|
+
"strong": {"bold": True},
|
60
|
+
"i": {"italic": True},
|
61
|
+
"em": {"italic": True},
|
62
|
+
# "mark",
|
63
|
+
# "small",
|
64
|
+
"s": {"strikethrough": True},
|
65
|
+
"del": {"strikethrough": True},
|
66
|
+
"u": {"underline": True},
|
67
|
+
"ins": {"underline": True},
|
68
|
+
"sub": {"script": Script.SUB},
|
69
|
+
"sup": {"script": Script.SUPER},
|
70
|
+
}
|
71
|
+
|
56
72
|
|
57
73
|
class _Context(BaseModel):
|
58
74
|
list_ordered_flag_by_ref: dict[str, bool] = {}
|
@@ -62,23 +78,34 @@ class _Context(BaseModel):
|
|
62
78
|
class AnnotatedText(BaseModel):
|
63
79
|
text: str
|
64
80
|
hyperlink: Union[AnyUrl, Path, None] = None
|
81
|
+
formatting: Union[Formatting, None] = None
|
65
82
|
|
66
83
|
|
67
84
|
class AnnotatedTextList(list):
|
68
85
|
def to_single_text_element(self) -> AnnotatedText:
|
69
86
|
current_h = None
|
70
87
|
current_text = ""
|
88
|
+
current_f = None
|
71
89
|
for at in self:
|
72
90
|
t = at.text
|
73
91
|
h = at.hyperlink
|
92
|
+
f = at.formatting
|
74
93
|
current_text += t.strip() + " "
|
94
|
+
if f is not None and current_f is None:
|
95
|
+
current_f = f
|
96
|
+
elif f is not None and current_f is not None and f != current_f:
|
97
|
+
_log.warning(
|
98
|
+
f"Clashing formatting: '{f}' and '{current_f}'! Chose '{current_f}'"
|
99
|
+
)
|
75
100
|
if h is not None and current_h is None:
|
76
101
|
current_h = h
|
77
102
|
elif h is not None and current_h is not None and h != current_h:
|
78
103
|
_log.warning(
|
79
104
|
f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
|
80
105
|
)
|
81
|
-
return AnnotatedText(
|
106
|
+
return AnnotatedText(
|
107
|
+
text=current_text.strip(), hyperlink=current_h, formatting=current_f
|
108
|
+
)
|
82
109
|
|
83
110
|
def simplify_text_elements(self) -> "AnnotatedTextList":
|
84
111
|
simplified = AnnotatedTextList()
|
@@ -86,21 +113,27 @@ class AnnotatedTextList(list):
|
|
86
113
|
return self
|
87
114
|
text = self[0].text
|
88
115
|
hyperlink = self[0].hyperlink
|
116
|
+
formatting = self[0].formatting
|
89
117
|
last_elm = text
|
90
118
|
for i in range(1, len(self)):
|
91
|
-
if hyperlink == self[i].hyperlink:
|
119
|
+
if hyperlink == self[i].hyperlink and formatting == self[i].formatting:
|
92
120
|
sep = " "
|
93
121
|
if not self[i].text.strip() or not last_elm.strip():
|
94
122
|
sep = ""
|
95
123
|
text += sep + self[i].text
|
96
124
|
last_elm = self[i].text
|
97
125
|
else:
|
98
|
-
simplified.append(
|
126
|
+
simplified.append(
|
127
|
+
AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
|
128
|
+
)
|
99
129
|
text = self[i].text
|
100
130
|
last_elm = text
|
101
131
|
hyperlink = self[i].hyperlink
|
132
|
+
formatting = self[i].formatting
|
102
133
|
if text:
|
103
|
-
simplified.append(
|
134
|
+
simplified.append(
|
135
|
+
AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
|
136
|
+
)
|
104
137
|
return simplified
|
105
138
|
|
106
139
|
def split_by_newline(self):
|
@@ -143,6 +176,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
143
176
|
self.parents[i] = None
|
144
177
|
self.hyperlink = None
|
145
178
|
self.original_url = original_url
|
179
|
+
self.format_tags: list[str] = []
|
146
180
|
|
147
181
|
try:
|
148
182
|
raw = (
|
@@ -253,6 +287,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
253
287
|
label=DocItemLabel.TEXT,
|
254
288
|
text=seg_clean,
|
255
289
|
content_layer=self.content_layer,
|
290
|
+
formatting=annotated_text.formatting,
|
256
291
|
hyperlink=annotated_text.hyperlink,
|
257
292
|
)
|
258
293
|
|
@@ -262,6 +297,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
262
297
|
if name == "img":
|
263
298
|
flush_buffer()
|
264
299
|
self._emit_image(node, doc)
|
300
|
+
elif name in _FORMAT_TAG_MAP:
|
301
|
+
with self.use_format([name]):
|
302
|
+
self._walk(node, doc)
|
265
303
|
elif name == "a":
|
266
304
|
with self.use_hyperlink(node):
|
267
305
|
self._walk(node, doc)
|
@@ -291,6 +329,27 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
291
329
|
|
292
330
|
flush_buffer()
|
293
331
|
|
332
|
+
@staticmethod
|
333
|
+
def _collect_parent_format_tags(item: PageElement) -> list[str]:
|
334
|
+
tags = []
|
335
|
+
for format_tag in _FORMAT_TAG_MAP:
|
336
|
+
this_parent = item.parent
|
337
|
+
while this_parent is not None:
|
338
|
+
if this_parent.name == format_tag:
|
339
|
+
tags.append(format_tag)
|
340
|
+
break
|
341
|
+
this_parent = this_parent.parent
|
342
|
+
return tags
|
343
|
+
|
344
|
+
@property
|
345
|
+
def _formatting(self):
|
346
|
+
kwargs = {}
|
347
|
+
for t in self.format_tags:
|
348
|
+
kwargs.update(_FORMAT_TAG_MAP[t])
|
349
|
+
if not kwargs:
|
350
|
+
return None
|
351
|
+
return Formatting(**kwargs)
|
352
|
+
|
294
353
|
def _extract_text_and_hyperlink_recursively(
|
295
354
|
self,
|
296
355
|
item: PageElement,
|
@@ -301,15 +360,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
301
360
|
result: AnnotatedTextList = AnnotatedTextList()
|
302
361
|
|
303
362
|
# If find_parent_annotation, make sure that we keep track of
|
304
|
-
# any a-tag that has been present in the
|
363
|
+
# any a- or formatting-tag that has been present in the
|
364
|
+
# DOM-parents already.
|
305
365
|
if find_parent_annotation:
|
366
|
+
format_tags = self._collect_parent_format_tags(item)
|
306
367
|
this_parent = item.parent
|
307
368
|
while this_parent is not None:
|
308
369
|
if this_parent.name == "a" and this_parent.get("href"):
|
309
|
-
with self.
|
310
|
-
|
311
|
-
|
312
|
-
|
370
|
+
with self.use_format(format_tags):
|
371
|
+
with self.use_hyperlink(this_parent):
|
372
|
+
return self._extract_text_and_hyperlink_recursively(
|
373
|
+
item, ignore_list
|
374
|
+
)
|
313
375
|
this_parent = this_parent.parent
|
314
376
|
|
315
377
|
if isinstance(item, PreformattedString):
|
@@ -319,18 +381,37 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
319
381
|
text = item.strip()
|
320
382
|
if text:
|
321
383
|
return AnnotatedTextList(
|
322
|
-
[
|
384
|
+
[
|
385
|
+
AnnotatedText(
|
386
|
+
text=text,
|
387
|
+
hyperlink=self.hyperlink,
|
388
|
+
formatting=self._formatting,
|
389
|
+
)
|
390
|
+
]
|
323
391
|
)
|
324
392
|
if keep_newlines and item.strip("\n\r") == "":
|
325
393
|
return AnnotatedTextList(
|
326
|
-
[
|
394
|
+
[
|
395
|
+
AnnotatedText(
|
396
|
+
text="\n",
|
397
|
+
hyperlink=self.hyperlink,
|
398
|
+
formatting=self._formatting,
|
399
|
+
)
|
400
|
+
]
|
327
401
|
)
|
328
402
|
return AnnotatedTextList()
|
329
403
|
|
330
404
|
tag = cast(Tag, item)
|
331
405
|
if not ignore_list or (tag.name not in ["ul", "ol"]):
|
332
406
|
for child in tag:
|
333
|
-
if isinstance(child, Tag) and child.name
|
407
|
+
if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
|
408
|
+
with self.use_format([child.name]):
|
409
|
+
result.extend(
|
410
|
+
self._extract_text_and_hyperlink_recursively(
|
411
|
+
child, ignore_list, keep_newlines=keep_newlines
|
412
|
+
)
|
413
|
+
)
|
414
|
+
elif isinstance(child, Tag) and child.name == "a":
|
334
415
|
with self.use_hyperlink(child):
|
335
416
|
result.extend(
|
336
417
|
self._extract_text_and_hyperlink_recursively(
|
@@ -368,6 +449,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
368
449
|
if this_href:
|
369
450
|
self.hyperlink = old_hyperlink
|
370
451
|
|
452
|
+
@contextmanager
|
453
|
+
def use_format(self, tags: list[str]):
|
454
|
+
if not tags:
|
455
|
+
yield None
|
456
|
+
else:
|
457
|
+
self.format_tags.extend(tags)
|
458
|
+
try:
|
459
|
+
yield None
|
460
|
+
finally:
|
461
|
+
self.format_tags = self.format_tags[: -len(tags)]
|
462
|
+
|
371
463
|
@contextmanager
|
372
464
|
def use_inline_group(
|
373
465
|
self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
|
@@ -419,6 +511,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
419
511
|
self.parents[self.level + 1] = doc.add_title(
|
420
512
|
text_clean,
|
421
513
|
content_layer=self.content_layer,
|
514
|
+
formatting=annotated_text.formatting,
|
422
515
|
hyperlink=annotated_text.hyperlink,
|
423
516
|
)
|
424
517
|
# the other levels need to be lowered by 1 if a title was set
|
@@ -448,6 +541,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
448
541
|
orig=annotated_text.text,
|
449
542
|
level=self.level,
|
450
543
|
content_layer=self.content_layer,
|
544
|
+
formatting=annotated_text.formatting,
|
451
545
|
hyperlink=annotated_text.hyperlink,
|
452
546
|
)
|
453
547
|
self.level += 1
|
@@ -528,6 +622,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
528
622
|
label=DocItemLabel.TEXT,
|
529
623
|
text=li_clean,
|
530
624
|
content_layer=self.content_layer,
|
625
|
+
formatting=annotated_text.formatting,
|
531
626
|
hyperlink=annotated_text.hyperlink,
|
532
627
|
)
|
533
628
|
|
@@ -550,6 +645,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
550
645
|
orig=li_text,
|
551
646
|
parent=list_group,
|
552
647
|
content_layer=self.content_layer,
|
648
|
+
formatting=annotated_text.formatting,
|
553
649
|
hyperlink=annotated_text.hyperlink,
|
554
650
|
)
|
555
651
|
|
@@ -602,6 +698,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
602
698
|
label=DocItemLabel.TEXT,
|
603
699
|
text=seg_clean,
|
604
700
|
content_layer=self.content_layer,
|
701
|
+
formatting=annotated_text.formatting,
|
605
702
|
hyperlink=annotated_text.hyperlink,
|
606
703
|
)
|
607
704
|
|
@@ -636,13 +733,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
636
733
|
parent=self.parents[self.level],
|
637
734
|
text=text_clean,
|
638
735
|
content_layer=self.content_layer,
|
736
|
+
formatting=annotated_text.formatting,
|
639
737
|
hyperlink=annotated_text.hyperlink,
|
640
738
|
)
|
641
739
|
|
642
|
-
elif tag_name
|
643
|
-
|
740
|
+
elif tag_name in {"details", "footer"}:
|
741
|
+
if tag_name == "footer":
|
742
|
+
current_layer = self.content_layer
|
743
|
+
self.content_layer = ContentLayer.FURNITURE
|
644
744
|
self.parents[self.level + 1] = doc.add_group(
|
645
|
-
name=
|
745
|
+
name=tag_name,
|
646
746
|
label=GroupLabel.SECTION,
|
647
747
|
parent=self.parents[self.level],
|
648
748
|
content_layer=self.content_layer,
|
@@ -651,6 +751,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
651
751
|
self._walk(tag, doc)
|
652
752
|
self.parents[self.level + 1] = None
|
653
753
|
self.level -= 1
|
754
|
+
if tag_name == "footer":
|
755
|
+
self.content_layer = current_layer
|
654
756
|
|
655
757
|
def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
|
656
758
|
figure = img_tag.find_parent("figure")
|
@@ -686,12 +788,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
686
788
|
text_clean = HTMLDocumentBackend._clean_unicode(
|
687
789
|
caption_anno_text.text.strip()
|
688
790
|
)
|
689
|
-
print(caption_anno_text)
|
690
791
|
caption_item = doc.add_text(
|
691
792
|
label=DocItemLabel.CAPTION,
|
692
793
|
text=text_clean,
|
693
794
|
orig=caption_anno_text.text,
|
694
795
|
content_layer=self.content_layer,
|
796
|
+
formatting=caption_anno_text.formatting,
|
695
797
|
hyperlink=caption_anno_text.hyperlink,
|
696
798
|
)
|
697
799
|
|
@@ -67,6 +67,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
67
67
|
|
68
68
|
self.level = 0
|
69
69
|
self.listIter = 0
|
70
|
+
# Track list counters per numId and ilvl
|
71
|
+
self.list_counters: dict[tuple[int, int], int] = {}
|
70
72
|
|
71
73
|
self.history: dict[str, Any] = {
|
72
74
|
"names": [None],
|
@@ -315,6 +317,108 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
315
317
|
|
316
318
|
return None, None # If the paragraph is not part of a list
|
317
319
|
|
320
|
+
def _get_list_counter(self, numid: int, ilvl: int) -> int:
|
321
|
+
"""Get and increment the counter for a specific numId and ilvl combination."""
|
322
|
+
key = (numid, ilvl)
|
323
|
+
if key not in self.list_counters:
|
324
|
+
self.list_counters[key] = 0
|
325
|
+
self.list_counters[key] += 1
|
326
|
+
return self.list_counters[key]
|
327
|
+
|
328
|
+
def _reset_list_counters_for_new_sequence(self, numid: int):
|
329
|
+
"""Reset counters when starting a new numbering sequence."""
|
330
|
+
# Reset all counters for this numid
|
331
|
+
keys_to_reset = [key for key in self.list_counters.keys() if key[0] == numid]
|
332
|
+
for key in keys_to_reset:
|
333
|
+
self.list_counters[key] = 0
|
334
|
+
|
335
|
+
def _is_numbered_list(self, docx_obj: DocxDocument, numId: int, ilvl: int) -> bool:
|
336
|
+
"""Check if a list is numbered based on its numFmt value."""
|
337
|
+
try:
|
338
|
+
# Access the numbering part of the document
|
339
|
+
if not hasattr(docx_obj, "part") or not hasattr(docx_obj.part, "package"):
|
340
|
+
return False
|
341
|
+
|
342
|
+
numbering_part = None
|
343
|
+
# Find the numbering part
|
344
|
+
for part in docx_obj.part.package.parts:
|
345
|
+
if "numbering" in part.partname:
|
346
|
+
numbering_part = part
|
347
|
+
break
|
348
|
+
|
349
|
+
if numbering_part is None:
|
350
|
+
return False
|
351
|
+
|
352
|
+
# Parse the numbering XML
|
353
|
+
numbering_root = numbering_part.element
|
354
|
+
namespaces = {
|
355
|
+
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
356
|
+
}
|
357
|
+
|
358
|
+
# Find the numbering definition with the given numId
|
359
|
+
num_xpath = f".//w:num[@w:numId='{numId}']"
|
360
|
+
num_element = numbering_root.find(num_xpath, namespaces=namespaces)
|
361
|
+
|
362
|
+
if num_element is None:
|
363
|
+
return False
|
364
|
+
|
365
|
+
# Get the abstractNumId from the num element
|
366
|
+
abstract_num_id_elem = num_element.find(
|
367
|
+
".//w:abstractNumId", namespaces=namespaces
|
368
|
+
)
|
369
|
+
if abstract_num_id_elem is None:
|
370
|
+
return False
|
371
|
+
|
372
|
+
abstract_num_id = abstract_num_id_elem.get(
|
373
|
+
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
374
|
+
)
|
375
|
+
if abstract_num_id is None:
|
376
|
+
return False
|
377
|
+
|
378
|
+
# Find the abstract numbering definition
|
379
|
+
abstract_num_xpath = (
|
380
|
+
f".//w:abstractNum[@w:abstractNumId='{abstract_num_id}']"
|
381
|
+
)
|
382
|
+
abstract_num_element = numbering_root.find(
|
383
|
+
abstract_num_xpath, namespaces=namespaces
|
384
|
+
)
|
385
|
+
|
386
|
+
if abstract_num_element is None:
|
387
|
+
return False
|
388
|
+
|
389
|
+
# Find the level definition for the given ilvl
|
390
|
+
lvl_xpath = f".//w:lvl[@w:ilvl='{ilvl}']"
|
391
|
+
lvl_element = abstract_num_element.find(lvl_xpath, namespaces=namespaces)
|
392
|
+
|
393
|
+
if lvl_element is None:
|
394
|
+
return False
|
395
|
+
|
396
|
+
# Get the numFmt element
|
397
|
+
num_fmt_element = lvl_element.find(".//w:numFmt", namespaces=namespaces)
|
398
|
+
if num_fmt_element is None:
|
399
|
+
return False
|
400
|
+
|
401
|
+
num_fmt = num_fmt_element.get(
|
402
|
+
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
403
|
+
)
|
404
|
+
|
405
|
+
# Numbered formats include: decimal, lowerRoman, upperRoman, lowerLetter, upperLetter
|
406
|
+
# Bullet formats include: bullet
|
407
|
+
numbered_formats = {
|
408
|
+
"decimal",
|
409
|
+
"lowerRoman",
|
410
|
+
"upperRoman",
|
411
|
+
"lowerLetter",
|
412
|
+
"upperLetter",
|
413
|
+
"decimalZero",
|
414
|
+
}
|
415
|
+
|
416
|
+
return num_fmt in numbered_formats
|
417
|
+
|
418
|
+
except Exception as e:
|
419
|
+
_log.debug(f"Error determining if list is numbered: {e}")
|
420
|
+
return False
|
421
|
+
|
318
422
|
def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
|
319
423
|
parts = self._split_text_and_number(style_label)
|
320
424
|
|
@@ -713,8 +817,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
713
817
|
# Common styles for bullet and numbered lists.
|
714
818
|
# "List Bullet", "List Number", "List Paragraph"
|
715
819
|
# Identify whether list is a numbered list or not
|
716
|
-
# is_numbered = "List Bullet" not in paragraph.style.name
|
717
|
-
is_numbered = False
|
718
820
|
p_style_id, p_level = self._get_label_and_level(paragraph)
|
719
821
|
numid, ilevel = self._get_numId_and_ilvl(paragraph)
|
720
822
|
|
@@ -727,6 +829,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
727
829
|
and ilevel is not None
|
728
830
|
and p_style_id not in ["Title", "Heading"]
|
729
831
|
):
|
832
|
+
# Check if this is actually a numbered list by examining the numFmt
|
833
|
+
is_numbered = self._is_numbered_list(docx_obj, numid, ilevel)
|
834
|
+
|
730
835
|
self._add_list_item(
|
731
836
|
doc=doc,
|
732
837
|
numid=numid,
|
@@ -983,15 +1088,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
983
1088
|
if self._prev_numid() is None: # Open new list
|
984
1089
|
self.level_at_new_list = level
|
985
1090
|
|
1091
|
+
# Reset counters for the new numbering sequence
|
1092
|
+
self._reset_list_counters_for_new_sequence(numid)
|
1093
|
+
|
986
1094
|
self.parents[level] = doc.add_list_group(
|
987
1095
|
name="list", parent=self.parents[level - 1]
|
988
1096
|
)
|
989
1097
|
|
990
1098
|
# Set marker and enumerated arguments if this is an enumeration element.
|
991
|
-
self.listIter += 1
|
992
1099
|
if is_numbered:
|
993
|
-
|
994
|
-
|
1100
|
+
counter = self._get_list_counter(numid, ilevel)
|
1101
|
+
enum_marker = str(counter) + "."
|
1102
|
+
else:
|
1103
|
+
enum_marker = ""
|
995
1104
|
self._add_formatted_list_item(
|
996
1105
|
doc, elements, enum_marker, is_numbered, level
|
997
1106
|
)
|
@@ -1005,16 +1114,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
1005
1114
|
self.level_at_new_list + prev_indent + 1,
|
1006
1115
|
self.level_at_new_list + ilevel + 1,
|
1007
1116
|
):
|
1008
|
-
self.listIter = 0
|
1009
1117
|
self.parents[i] = doc.add_list_group(
|
1010
1118
|
name="list", parent=self.parents[i - 1]
|
1011
1119
|
)
|
1012
1120
|
|
1013
1121
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
1014
|
-
self.listIter += 1
|
1015
1122
|
if is_numbered:
|
1016
|
-
|
1017
|
-
|
1123
|
+
counter = self._get_list_counter(numid, ilevel)
|
1124
|
+
enum_marker = str(counter) + "."
|
1125
|
+
else:
|
1126
|
+
enum_marker = ""
|
1018
1127
|
self._add_formatted_list_item(
|
1019
1128
|
doc,
|
1020
1129
|
elements,
|
@@ -1033,10 +1142,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
1033
1142
|
self.parents[k] = None
|
1034
1143
|
|
1035
1144
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
1036
|
-
self.listIter += 1
|
1037
1145
|
if is_numbered:
|
1038
|
-
|
1039
|
-
|
1146
|
+
counter = self._get_list_counter(numid, ilevel)
|
1147
|
+
enum_marker = str(counter) + "."
|
1148
|
+
else:
|
1149
|
+
enum_marker = ""
|
1040
1150
|
self._add_formatted_list_item(
|
1041
1151
|
doc,
|
1042
1152
|
elements,
|
@@ -1044,14 +1154,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
1044
1154
|
is_numbered,
|
1045
1155
|
self.level_at_new_list + ilevel,
|
1046
1156
|
)
|
1047
|
-
self.listIter = 0
|
1048
1157
|
|
1049
1158
|
elif self._prev_numid() == numid or prev_indent == ilevel:
|
1050
1159
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
1051
|
-
self.listIter += 1
|
1052
1160
|
if is_numbered:
|
1053
|
-
|
1054
|
-
|
1161
|
+
counter = self._get_list_counter(numid, ilevel)
|
1162
|
+
enum_marker = str(counter) + "."
|
1163
|
+
else:
|
1164
|
+
enum_marker = ""
|
1055
1165
|
self._add_formatted_list_item(
|
1056
1166
|
doc, elements, enum_marker, is_numbered, level - 1
|
1057
1167
|
)
|