docling-ibm-models 3.5.0__py3-none-any.whl → 3.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,302 @@
1
+ """
2
+ List Item Marker Processor for Docling Documents
3
+
4
+ This module provides a rule-based model to identify list item markers and
5
+ merge marker-only TextItems with their content to create proper ListItems.
6
+ """
7
+
8
+ import logging
9
+ import re
10
+ from typing import Union
11
+
12
+ from docling_core.types.doc.document import (
13
+ DocItemLabel,
14
+ DoclingDocument,
15
+ ListItem,
16
+ ProvenanceItem,
17
+ RefItem,
18
+ TextItem,
19
+ )
20
+ from docling_core.types.doc.labels import DocItemLabel
21
+
22
# Module-level logger named after this module; the processor below uses it to
# emit warnings when a marker pattern matches an item of an unexpected type.
+ _log = logging.getLogger(__name__)
23
+
24
+
25
class ListItemMarkerProcessor:
    """
    A rule-based processor for identifying and processing list item markers.

    This class can:
    1. Identify various list item markers (bullets, numbers, letters)
    2. Detect marker-only TextItems followed by content TextItems
    3. Merge them into proper ListItems

    Grouping consecutive ListItems into list containers is not implemented
    yet: ``_group_consecutive_list_items`` currently returns the document
    unchanged.
    """

    def __init__(self):
        """Initialize the processor with marker patterns."""
        # Bullet markers (unordered lists)
        self._bullet_patterns = [
            r"[\u2022\u2023\u25E6\u2043\u204C\u204D\u2219\u25AA\u25AB\u25CF\u25CB]",  # Various bullet symbols
            r"[-*+•·‣⁃]",  # Common ASCII and Unicode bullets
            r"[►▶▸‣➤➢]",  # Arrow-like bullets
            r"[✓✔✗✘]",  # Checkmark bullets
        ]

        # Numbered markers (ordered lists)
        self._numbered_patterns = [
            r"\d+\.",  # 1. 2. 3.
            r"\d+\)",  # 1) 2) 3)
            r"\(\d+\)",  # (1) (2) (3)
            r"\[\d+\]",  # [1] [2] [3]
            r"[ivxlcdm]+\.",  # i. ii. iii. (Roman numerals lowercase)
            r"[IVXLCDM]+\.",  # I. II. III. (Roman numerals uppercase)
            r"[a-z]\.",  # a. b. c.
            r"[A-Z]\.",  # A. B. C.
            r"[a-z]\)",  # a) b) c)
            r"[A-Z]\)",  # A) B) C)
        ]

        # Whole-string patterns: the text is *only* a marker.
        self._compiled_bullet_patterns = [
            re.compile(f"^{pattern}$") for pattern in self._bullet_patterns
        ]
        self._compiled_numbered_patterns = [
            re.compile(f"^{pattern}$") for pattern in self._numbered_patterns
        ]

        # Prefix patterns: marker (group 1), one whitespace character, then
        # the content (group 2).
        self._compiled_bullet_item_patterns = [
            re.compile(f"^({pattern})" + r"\s(.+)") for pattern in self._bullet_patterns
        ]
        self._compiled_numbered_item_patterns = [
            re.compile(f"^({pattern})" + r"\s(.+)")
            for pattern in self._numbered_patterns
        ]

        self._compiled_item_patterns = (
            self._compiled_bullet_item_patterns + self._compiled_numbered_item_patterns
        )

        # Filled by ``_find_marker_content_pairs``; defined here so the
        # attributes exist even before the first scan.
        self._matched_items: dict[int, tuple[RefItem, bool]] = (
            {}
        )  # index to (self_ref, is_pure_marker)
        self._other: dict[int, RefItem] = {}  # index to self_ref

    def _is_bullet_marker(self, text: str) -> bool:
        """Check if text (ignoring surrounding whitespace) is only a bullet marker."""
        text = text.strip()
        return any(pattern.match(text) for pattern in self._compiled_bullet_patterns)

    def _is_numbered_marker(self, text: str) -> bool:
        """Check if text (ignoring surrounding whitespace) is only a numbered marker."""
        text = text.strip()
        return any(pattern.match(text) for pattern in self._compiled_numbered_patterns)

    def _split_marker_and_text(self, text: str) -> Union[tuple[str, str], None]:
        """Split text of the form "<marker><whitespace><content>".

        Args:
            text: The raw text to inspect. It is matched as-is (not stripped).

        Returns:
            A ``(marker, content)`` tuple taken from the first item pattern
            that matches, or ``None`` when no pattern matches.

        Note:
            Stopping at the first match is sufficient: no marker pattern can
            contain whitespace, so every matching pattern would split the
            text at the same position. The content group is ``(.+)`` compiled
            without ``re.DOTALL``, so it captures only up to the first
            newline.
        """
        for pattern in self._compiled_item_patterns:
            mtch = pattern.match(text)
            if mtch:
                return mtch[1], mtch[2]
        return None

    def _find_marker_content_pairs(self, doc: DoclingDocument):
        """
        Classify every TextItem of the document by its marker content.

        Nothing is returned; the result is stored on the instance, keyed by
        the item's position in ``doc.iterate_items(with_groups=False)``:

        - ``self._matched_items``: index -> ``(ref, True)`` when the item's
          ``orig`` is only a marker, or ``(ref, False)`` when ``orig`` starts
          with a marker followed by content.
        - ``self._other``: index -> ref for every remaining TextItem.

        Items that are not TextItems are skipped but still consume an index,
        so ``index + 1`` always denotes the directly following item.
        """
        self._matched_items = {}
        self._other = {}

        for i, (item, _level) in enumerate(doc.iterate_items(with_groups=False)):
            if not isinstance(item, TextItem):
                continue

            text = item.orig
            if self._is_bullet_marker(text) or self._is_numbered_marker(text):
                self._matched_items[i] = (item.get_ref(), True)
            elif self._split_marker_and_text(text) is not None:
                self._matched_items[i] = (item.get_ref(), False)
            else:
                self._other[i] = item.get_ref()

    def _group_consecutive_list_items(self, doc: DoclingDocument) -> DoclingDocument:
        """
        Placeholder for grouping list-items; currently returns ``doc`` unchanged.
        """
        return doc

    def process_list_item(self, item: ListItem) -> ListItem:
        """Process a ListItem to extract and update marker and text from bullet/numbered patterns.

        The item's ``orig`` text is split into a marker and the remaining
        content; on success the ListItem's ``marker`` and ``text`` fields are
        updated accordingly.

        Args:
            item (ListItem): The list item to process, containing original text that may
                have bullet or numbered list formatting.

        Returns:
            ListItem: The same instance that was passed in. Its marker and text
                fields are updated if a pattern matched, otherwise it is unchanged.

        Note:
            The method modifies the input item in place when a pattern matches.
            If a pattern matches but the item is not actually a ListItem, the
            item is left untouched and a single warning is logged.
        """
        split = self._split_marker_and_text(item.orig)
        if split is None:
            return item

        if isinstance(item, ListItem):  # update item in place
            item.marker, item.text = split
        else:
            _log.warning(
                f"matching text for bullet_item_patterns that is not ListItem: {item.label}"
            )
        return item

    def process_text_item(self, item: TextItem) -> Union[TextItem, ListItem]:
        """Process a TextItem to detect and convert bullet/numbered list formatting.

        The item's ``orig`` text is split into a marker and the remaining
        content. Depending on the item's type, an existing ListItem is updated
        or a new ListItem is built from the TextItem.

        Args:
            item (TextItem): The text item to process, which may contain bullet or
                numbered list formatting in its original text.

        Returns:
            Union[TextItem, ListItem]:
                - If no pattern matches: the original item, unchanged
                - If item is already a ListItem: the same item, updated in place
                - If item is a TextItem whose label is neither section header
                  nor footnote: a new ListItem carrying the extracted marker
                  and text, with the original ``self_ref``, ``orig`` and ``prov``
                - Otherwise: the original item, unchanged (a warning is logged)

        Note:
            Section headings and footnotes are excluded from conversion to preserve
            their semantic meaning. The new ListItem is only returned; this
            method does not insert it into any document.
        """
        split = self._split_marker_and_text(item.orig)
        if split is None:
            return item
        marker, content = split

        if isinstance(item, ListItem):  # update item in place
            item.marker = marker
            item.text = content
            return item

        if isinstance(item, TextItem) and (
            item.label not in (DocItemLabel.SECTION_HEADER, DocItemLabel.FOOTNOTE)
        ):
            # Create new ListItem
            return ListItem(
                self_ref=item.get_ref().cref,
                marker=marker,
                text=content,
                orig=item.orig,
                prov=item.prov,
            )

        _log.warning(
            f"matching text for bullet_item_patterns that is not ListItem: {item.label}"
        )
        return item

    def update_list_items_in_place(
        self, doc: DoclingDocument, allow_textitem: bool = False
    ) -> DoclingDocument:
        """Split marker and text for the items of ``doc``.

        Args:
            doc: The document whose items are visited via ``doc.iterate_items()``.
            allow_textitem: When True, TextItems that are not ListItems are
                also passed through ``process_text_item``.

        Returns:
            The same document object that was passed in.

        Note:
            NOTE(review): ListItems are updated in place. For a plain TextItem,
            ``process_text_item`` returns a *new* ListItem which is not written
            back to the document here, so ``allow_textitem=True`` currently
            does not change plain TextItems in ``doc`` - confirm whether a
            replacement in the document is intended.
        """
        for item, _level in doc.iterate_items():
            if isinstance(item, ListItem):
                self.process_list_item(item)
            elif allow_textitem and isinstance(item, TextItem):
                self.process_text_item(item)

        return doc

    def merge_markers_and_text_items_into_list_items(
        self, doc: DoclingDocument
    ) -> DoclingDocument:
        """Merge marker-only TextItems with the text item that follows them.

        A marker-only item directly followed by an unmatched TextItem labelled
        ``TEXT`` or ``LIST_ITEM`` is replaced, together with that follower, by
        one new ListItem inserted at the marker's position. This is only
        necessary as long as the layout-model does not recognize list-items
        properly.

        Args:
            doc: The document to modify.

        Returns:
            The same document object, modified in place.
        """
        # Classify all text items (results land in self._matched_items and
        # self._other).
        self._find_marker_content_pairs(doc)

        # Resolve every candidate pair against the still unmodified document.
        # The stored references are positional strings; deleting items below
        # may renumber the remaining items (assumption about docling-core's
        # delete_items - TODO confirm), which would make a reference resolved
        # later point at the wrong item.
        pairs = []
        for ind, (self_ref, is_marker) in self._matched_items.items():
            if not is_marker:
                continue

            follower_ref = self._other.get(ind + 1)
            if follower_ref is None:
                continue

            marker_item = self_ref.resolve(doc=doc)
            next_item = follower_ref.resolve(doc=doc)

            if isinstance(next_item, TextItem) and (
                next_item.label in (DocItemLabel.TEXT, DocItemLabel.LIST_ITEM)
            ):
                pairs.append((marker_item, next_item))

        for marker_item, next_item in pairs:
            marker_text: str = marker_item.text
            content_text: str = next_item.text

            # Build a fresh provenance list instead of extending
            # marker_item.prov, so the source item is not mutated.
            prov: list[ProvenanceItem] = [*marker_item.prov, *next_item.prov]

            list_item = ListItem(
                self_ref="#",
                marker=marker_text,
                text=content_text,
                orig=f"{marker_text} {content_text}",
                prov=prov,
            )

            # Insert the new ListItem where the marker was, then drop the two
            # original items.
            doc.insert_item_before_sibling(new_item=list_item, sibling=marker_item)
            doc.delete_items(node_items=[marker_item, next_item])

        return doc

    def process_document(
        self,
        doc: DoclingDocument,
        allow_textitem: bool = False,
        merge_items: bool = False,
    ) -> DoclingDocument:
        """
        Process the entire document to identify and convert list markers.

        Args:
            doc: The DoclingDocument to process
            allow_textitem: Forwarded to ``update_list_items_in_place``; when
                True, plain TextItems are inspected as well as ListItems
            merge_items: When True, marker-only TextItems are merged with the
                text item that follows them into new ListItems

        Returns:
            The processed document (modified in-place)
        """
        doc = self.update_list_items_in_place(doc, allow_textitem=allow_textitem)

        if merge_items:
            doc = self.merge_markers_and_text_items_into_list_items(doc)

        # Group consecutive list items (currently a pass-through)
        doc = self._group_consecutive_list_items(doc)

        return doc
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-ibm-models
3
- Version: 3.5.0
3
+ Version: 3.6.0
4
4
  Summary: This package contains the AI models used by the Docling PDF conversion package
5
5
  Author-email: Nikos Livathinos <nli@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -10,6 +10,8 @@ docling_ibm_models/document_figure_classifier_model/__init__.py,sha256=47DEQpj8H
10
10
  docling_ibm_models/document_figure_classifier_model/document_figure_classifier_predictor.py,sha256=vRIp02rs9Xa4n1K-M7AYO_tFj4S7WQCQmL9i006T9Qk,5795
11
11
  docling_ibm_models/layoutmodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  docling_ibm_models/layoutmodel/layout_predictor.py,sha256=ArVgs7FBOiu23TC-JoybcaTp7F7a4BgYC8uRVxTgx4E,5681
13
+ docling_ibm_models/list_item_normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ docling_ibm_models/list_item_normalizer/list_marker_processor.py,sha256=IC_U-FrwPjCoYEPyMT7TTIcshSDmZAkx1tmYbXDV0x4,11469
13
15
  docling_ibm_models/reading_order/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
16
  docling_ibm_models/reading_order/reading_order_rb.py,sha256=RpcR0Q1oeF3JK-j6O0KyNZtGgBeqKUHsIOj7hmPumUo,21670
15
17
  docling_ibm_models/tableformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -34,8 +36,8 @@ docling_ibm_models/tableformer/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeu
34
36
  docling_ibm_models/tableformer/utils/app_profiler.py,sha256=Pb7o1zcikKXh7ninaNt4_nVa1xuUrogZxbTr6U6jkEE,8392
35
37
  docling_ibm_models/tableformer/utils/mem_monitor.py,sha256=NFZUnrfLThXNZQrm3ESRmPSJmPF2J1z3E2v_72O4dRw,6408
36
38
  docling_ibm_models/tableformer/utils/utils.py,sha256=8Bxf1rEn977lFbY9NX0r5xh9PvxIRipQZX_EZW92XfA,10980
37
- docling_ibm_models-3.5.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
38
- docling_ibm_models-3.5.0.dist-info/METADATA,sha256=fyDgSAuTt6vIhYa4HdFZIDrEMPMzmMJp5QhqWCAhO6E,6705
39
- docling_ibm_models-3.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
40
- docling_ibm_models-3.5.0.dist-info/top_level.txt,sha256=tIB9D3naeP7s92RAs1d9SPaHc4S4iQIepjtbkf5Q5g0,19
41
- docling_ibm_models-3.5.0.dist-info/RECORD,,
39
+ docling_ibm_models-3.6.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
40
+ docling_ibm_models-3.6.0.dist-info/METADATA,sha256=ya3n3Aj7G97OjtBX7kLPWdoFPcaHHVpZwSRInO8oL9k,6705
41
+ docling_ibm_models-3.6.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
42
+ docling_ibm_models-3.6.0.dist-info/top_level.txt,sha256=tIB9D3naeP7s92RAs1d9SPaHc4S4iQIepjtbkf5Q5g0,19
43
+ docling_ibm_models-3.6.0.dist-info/RECORD,,