docling-ibm-models 3.5.0__py3-none-any.whl → 3.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling_ibm_models/list_item_normalizer/__init__.py +0 -0
- docling_ibm_models/list_item_normalizer/list_marker_processor.py +302 -0
- {docling_ibm_models-3.5.0.dist-info → docling_ibm_models-3.6.0.dist-info}/METADATA +1 -1
- {docling_ibm_models-3.5.0.dist-info → docling_ibm_models-3.6.0.dist-info}/RECORD +7 -5
- {docling_ibm_models-3.5.0.dist-info → docling_ibm_models-3.6.0.dist-info}/WHEEL +0 -0
- {docling_ibm_models-3.5.0.dist-info → docling_ibm_models-3.6.0.dist-info}/licenses/LICENSE +0 -0
- {docling_ibm_models-3.5.0.dist-info → docling_ibm_models-3.6.0.dist-info}/top_level.txt +0 -0
File without changes
|
@@ -0,0 +1,302 @@
|
|
1
|
+
"""
|
2
|
+
List Item Marker Processor for Docling Documents
|
3
|
+
|
4
|
+
This module provides a rule-based model to identify list item markers and
|
5
|
+
merge marker-only TextItems with their content to create proper ListItems.
|
6
|
+
"""
|
7
|
+
|
8
|
+
import logging
|
9
|
+
import re
|
10
|
+
from typing import Union
|
11
|
+
|
12
|
+
from docling_core.types.doc.document import (
|
13
|
+
DocItemLabel,
|
14
|
+
DoclingDocument,
|
15
|
+
ListItem,
|
16
|
+
ProvenanceItem,
|
17
|
+
RefItem,
|
18
|
+
TextItem,
|
19
|
+
)
|
20
|
+
from docling_core.types.doc.labels import DocItemLabel
|
21
|
+
|
22
|
+
_log = logging.getLogger(__name__)
|
23
|
+
|
24
|
+
|
25
|
+
class ListItemMarkerProcessor:
    """
    A rule-based processor for identifying and processing list item markers.

    This class can:
    1. Identify various list item markers (bullets, numbers, letters)
    2. Detect marker-only TextItems followed by content TextItems
    3. Merge them into proper ListItems
    4. Group consecutive ListItems into appropriate list containers
       (currently a no-op placeholder, see ``_group_consecutive_list_items``)
    """

    def __init__(self):
        """Initialize the processor and compile all marker patterns."""
        # Bullet markers (unordered lists)
        self._bullet_patterns = [
            r"[\u2022\u2023\u25E6\u2043\u204C\u204D\u2219\u25AA\u25AB\u25CF\u25CB]",  # Various bullet symbols
            r"[-*+•·‣⁃]",  # Common ASCII and Unicode bullets
            r"[►▶▸‣➤➢]",  # Arrow-like bullets
            r"[✓✔✗✘]",  # Checkmark bullets
        ]

        # Numbered markers (ordered lists)
        self._numbered_patterns = [
            r"\d+\.",  # 1. 2. 3.
            r"\d+\)",  # 1) 2) 3)
            r"\(\d+\)",  # (1) (2) (3)
            r"\[\d+\]",  # [1] [2] [3]
            r"[ivxlcdm]+\.",  # i. ii. iii. (Roman numerals lowercase)
            r"[IVXLCDM]+\.",  # I. II. III. (Roman numerals uppercase)
            r"[a-z]\.",  # a. b. c.
            r"[A-Z]\.",  # A. B. C.
            r"[a-z]\)",  # a) b) c)
            r"[A-Z]\)",  # A) B) C)
        ]

        # "Marker only" patterns: the whole (stripped) text must be the marker.
        self._compiled_bullet_patterns = [
            re.compile(f"^{pattern}$") for pattern in self._bullet_patterns
        ]
        self._compiled_numbered_patterns = [
            re.compile(f"^{pattern}$") for pattern in self._numbered_patterns
        ]

        # "Marker + content" patterns: group 1 is the marker, group 2 is the
        # text following a single whitespace character.
        self._compiled_bullet_item_patterns = [
            re.compile(f"^({pattern})" + r"\s(.+)") for pattern in self._bullet_patterns
        ]
        self._compiled_numbered_item_patterns = [
            re.compile(f"^({pattern})" + r"\s(.+)")
            for pattern in self._numbered_patterns
        ]

        self._compiled_item_patterns = (
            self._compiled_bullet_item_patterns + self._compiled_numbered_item_patterns
        )

    def _is_bullet_marker(self, text: str) -> bool:
        """Check if text (ignoring surrounding whitespace) is a bullet marker."""
        text = text.strip()
        return any(pattern.match(text) for pattern in self._compiled_bullet_patterns)

    def _is_numbered_marker(self, text: str) -> bool:
        """Check if text (ignoring surrounding whitespace) is a numbered marker."""
        text = text.strip()
        return any(pattern.match(text) for pattern in self._compiled_numbered_patterns)

    def _match_item_text(self, text: str) -> Union[re.Match, None]:
        """Return the first "marker + content" match for text, or None.

        None of the marker patterns can match whitespace, so whenever several
        patterns match the same text they all capture the same marker
        (group 1) and the same content (group 2). Stopping at the first match
        is therefore equivalent to trying every pattern.
        """
        for pattern in self._compiled_item_patterns:
            mtch = pattern.match(text)
            if mtch:
                return mtch
        return None

    def _find_marker_content_pairs(self, doc: DoclingDocument):
        """
        Classify the TextItems of the document by list-marker content.

        Nothing is returned; the result is stored on the instance:

        - ``self._matched_items`` maps the iteration index of a TextItem to
          ``(ref, is_pure_marker)``. ``is_pure_marker`` is True when the
          original text is only a marker, and False when the marker is fused
          with content in the same text.
        - ``self._other`` maps the iteration index of every remaining
          TextItem to its ref.

        Non-TextItems are skipped but still consume an index, so consecutive
        indices mean consecutive items in iteration order.
        """
        matched: dict[int, tuple[RefItem, bool]] = {}
        other: dict[int, RefItem] = {}

        for i, (item, _level) in enumerate(doc.iterate_items(with_groups=False)):
            if not isinstance(item, TextItem):
                continue

            if self._is_bullet_marker(item.orig) or self._is_numbered_marker(
                item.orig
            ):
                matched[i] = (item.get_ref(), True)
            elif self._match_item_text(item.orig) is not None:
                matched[i] = (item.get_ref(), False)
            else:
                other[i] = item.get_ref()

        self._matched_items: dict[int, tuple[RefItem, bool]] = (
            matched  # index to (self_ref, is_pure_marker)
        )
        self._other: dict[int, RefItem] = other  # index to self_ref

    def _group_consecutive_list_items(self, doc: DoclingDocument) -> DoclingDocument:
        """
        Placeholder for grouping consecutive list items.

        Currently returns the document unchanged.
        """
        return doc

    def process_list_item(self, item: ListItem) -> ListItem:
        """Split a fused "marker + content" text of a ListItem into its parts.

        Args:
            item (ListItem): The list item whose original text may start with
                a bullet or numbered marker.

        Returns:
            ListItem: The same instance. If its original text matches a
                marker pattern, ``marker`` and ``text`` are updated in place;
                otherwise it is returned unchanged.

        Note:
            If a pattern matches but the object is not actually a ListItem,
            a warning is logged and the object is left untouched.
        """
        mtch = self._match_item_text(item.orig)
        if mtch is None:
            return item

        if isinstance(item, ListItem):  # update item in place
            item.marker = mtch[1]
            item.text = mtch[2]
        else:
            _log.warning(
                f"matching text for bullet_item_patterns that is not ListItem: {item.label}"
            )
        return item

    def process_text_item(self, item: TextItem) -> Union[TextItem, ListItem]:
        """Detect list formatting in a TextItem and convert it when appropriate.

        Args:
            item (TextItem): The text item whose original text may start with
                a bullet or numbered marker.

        Returns:
            Union[TextItem, ListItem]:
                - If no marker pattern matches: the original item, unchanged.
                - If item is already a ListItem: the same instance with
                  ``marker`` and ``text`` updated in place.
                - If item is a TextItem that is neither a section header nor
                  a footnote: a NEW ListItem carrying the extracted marker
                  and text (the input item is not modified).
                - Otherwise (section header / footnote): the original item,
                  after logging a warning.
        """
        mtch = self._match_item_text(item.orig)
        if mtch is None:
            return item

        if isinstance(item, ListItem):  # update item in place
            item.marker = mtch[1]
            item.text = mtch[2]
            return item

        if isinstance(item, TextItem) and (
            item.label not in [DocItemLabel.SECTION_HEADER, DocItemLabel.FOOTNOTE]
        ):
            # Create new ListItem
            return ListItem(
                self_ref=item.get_ref().cref,
                marker=mtch[1],
                text=mtch[2],
                orig=item.orig,
                prov=item.prov,
            )

        # Section headers and footnotes keep their type; warn once.
        _log.warning(
            f"matching text for bullet_item_patterns that is not ListItem: {item.label}"
        )
        return item

    def update_list_items_in_place(
        self, doc: DoclingDocument, allow_textitem: bool = False
    ) -> DoclingDocument:
        """Normalize marker/text of the items of the document.

        Args:
            doc: The document whose items are visited.
            allow_textitem: When True, plain TextItems are also passed
                through ``process_text_item``.

        Returns:
            The same document instance.
        """
        for item, _level in doc.iterate_items():
            if isinstance(item, ListItem):
                self.process_list_item(item)
            elif allow_textitem and isinstance(item, TextItem):
                # NOTE(review): process_text_item returns a *new* ListItem for
                # plain TextItems; that object is not written back into the
                # document here, so this branch does not alter the document.
                # Kept as-is to preserve behaviour; confirm intended handling.
                self.process_text_item(item)

        return doc

    def merge_markers_and_text_items_into_list_items(
        self, doc: DoclingDocument
    ) -> DoclingDocument:
        """Merge marker-only TextItems with the TextItem that follows them.

        A marker-only item immediately followed (in iteration order) by a
        non-matching TextItem labelled TEXT or LIST_ITEM is replaced, together
        with that follower, by a single new ListItem inserted at the marker's
        position.

        Args:
            doc: The document to modify.

        Returns:
            The same document instance.
        """
        # Find all marker-content pairs: this function will identify text-items
        # with a marker fused into the text
        self._find_marker_content_pairs(doc)

        # Resolve every pair up front. The refs were captured before any
        # mutation; resolving them lazily while inserting/deleting items
        # would rely on refs staying valid across those mutations.
        pairs = []
        for ind, (self_ref, is_marker) in self._matched_items.items():
            if not is_marker:
                continue

            next_ref = self._other.get(ind + 1)
            if next_ref is None:
                continue

            marker_item = self_ref.resolve(doc=doc)
            next_item = next_ref.resolve(doc=doc)

            if isinstance(next_item, TextItem) and (
                next_item.label in [DocItemLabel.TEXT, DocItemLabel.LIST_ITEM]
            ):
                pairs.append((marker_item, next_item))

        # If you find a sole marker-item followed by a text, there are
        # good chances we need to merge them into a list-item. This
        # step is only necessary as long as the layout-model does not
        # recognize list-items properly
        for marker_item, next_item in pairs:
            marker_text: str = marker_item.text
            content_text: str = next_item.text

            list_item = ListItem(
                self_ref="#",
                marker=marker_text,
                text=content_text,
                orig=f"{marker_text} {content_text}",
                # Fresh list: do not extend the marker item's own prov list.
                prov=[*marker_item.prov, *next_item.prov],
            )

            # Insert the new ListItem
            doc.insert_item_before_sibling(new_item=list_item, sibling=marker_item)

            # Delete original items
            doc.delete_items(node_items=[marker_item, next_item])

        return doc

    def process_document(
        self,
        doc: DoclingDocument,
        allow_textitem: bool = False,
        merge_items: bool = False,
    ) -> DoclingDocument:
        """
        Process the entire document to identify and convert list markers.

        Args:
            doc: The DoclingDocument to process
            allow_textitem: Forwarded to ``update_list_items_in_place``.
            merge_items: When True, marker-only TextItems are merged with the
                following TextItem into ListItems.

        Returns:
            The processed document (modified in-place)
        """
        doc = self.update_list_items_in_place(doc, allow_textitem=allow_textitem)

        if merge_items:
            doc = self.merge_markers_and_text_items_into_list_items(doc)

        # Group consecutive list items
        doc = self._group_consecutive_list_items(doc)

        return doc
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling-ibm-models
|
3
|
-
Version: 3.5.0
|
3
|
+
Version: 3.6.0
|
4
4
|
Summary: This package contains the AI models used by the Docling PDF conversion package
|
5
5
|
Author-email: Nikos Livathinos <nli@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -10,6 +10,8 @@ docling_ibm_models/document_figure_classifier_model/__init__.py,sha256=47DEQpj8H
|
|
10
10
|
docling_ibm_models/document_figure_classifier_model/document_figure_classifier_predictor.py,sha256=vRIp02rs9Xa4n1K-M7AYO_tFj4S7WQCQmL9i006T9Qk,5795
|
11
11
|
docling_ibm_models/layoutmodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
12
|
docling_ibm_models/layoutmodel/layout_predictor.py,sha256=ArVgs7FBOiu23TC-JoybcaTp7F7a4BgYC8uRVxTgx4E,5681
|
13
|
+
docling_ibm_models/list_item_normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
+
docling_ibm_models/list_item_normalizer/list_marker_processor.py,sha256=IC_U-FrwPjCoYEPyMT7TTIcshSDmZAkx1tmYbXDV0x4,11469
|
13
15
|
docling_ibm_models/reading_order/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
16
|
docling_ibm_models/reading_order/reading_order_rb.py,sha256=RpcR0Q1oeF3JK-j6O0KyNZtGgBeqKUHsIOj7hmPumUo,21670
|
15
17
|
docling_ibm_models/tableformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -34,8 +36,8 @@ docling_ibm_models/tableformer/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeu
|
|
34
36
|
docling_ibm_models/tableformer/utils/app_profiler.py,sha256=Pb7o1zcikKXh7ninaNt4_nVa1xuUrogZxbTr6U6jkEE,8392
|
35
37
|
docling_ibm_models/tableformer/utils/mem_monitor.py,sha256=NFZUnrfLThXNZQrm3ESRmPSJmPF2J1z3E2v_72O4dRw,6408
|
36
38
|
docling_ibm_models/tableformer/utils/utils.py,sha256=8Bxf1rEn977lFbY9NX0r5xh9PvxIRipQZX_EZW92XfA,10980
|
37
|
-
docling_ibm_models-3.
|
38
|
-
docling_ibm_models-3.
|
39
|
-
docling_ibm_models-3.
|
40
|
-
docling_ibm_models-3.
|
41
|
-
docling_ibm_models-3.
|
39
|
+
docling_ibm_models-3.6.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
40
|
+
docling_ibm_models-3.6.0.dist-info/METADATA,sha256=ya3n3Aj7G97OjtBX7kLPWdoFPcaHHVpZwSRInO8oL9k,6705
|
41
|
+
docling_ibm_models-3.6.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
42
|
+
docling_ibm_models-3.6.0.dist-info/top_level.txt,sha256=tIB9D3naeP7s92RAs1d9SPaHc4S4iQIepjtbkf5Q5g0,19
|
43
|
+
docling_ibm_models-3.6.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|