docling-core 2.25.0__tar.gz → 2.26.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (72) hide show
  1. {docling_core-2.25.0 → docling_core-2.26.1}/PKG-INFO +1 -1
  2. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/experimental/serializer/base.py +29 -3
  3. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/experimental/serializer/common.py +157 -71
  4. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/experimental/serializer/doctags.py +88 -54
  5. docling_core-2.26.1/docling_core/experimental/serializer/html.py +941 -0
  6. docling_core-2.26.1/docling_core/experimental/serializer/html_styles.py +212 -0
  7. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/experimental/serializer/markdown.py +105 -63
  8. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/transforms/chunker/base.py +8 -2
  9. docling_core-2.26.1/docling_core/transforms/chunker/hierarchical_chunker.py +262 -0
  10. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/transforms/chunker/hybrid_chunker.py +54 -12
  11. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/doc/document.py +702 -482
  12. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/doc/labels.py +2 -0
  13. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/doc/page.py +12 -17
  14. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/doc/tokens.py +3 -0
  15. {docling_core-2.25.0 → docling_core-2.26.1}/pyproject.toml +1 -1
  16. docling_core-2.25.0/docling_core/transforms/chunker/hierarchical_chunker.py +0 -241
  17. {docling_core-2.25.0 → docling_core-2.26.1}/LICENSE +0 -0
  18. {docling_core-2.25.0 → docling_core-2.26.1}/README.md +0 -0
  19. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/__init__.py +0 -0
  20. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/cli/__init__.py +0 -0
  21. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/cli/view.py +0 -0
  22. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/experimental/__init__.py +0 -0
  23. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/experimental/serializer/__init__.py +0 -0
  24. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/py.typed +0 -0
  25. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
  26. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
  27. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  28. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
  29. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  30. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  31. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  32. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  33. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/search/__init__.py +0 -0
  34. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  35. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/search/mapping.py +0 -0
  36. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/search/meta.py +0 -0
  37. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/search/package.py +0 -0
  38. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/transforms/__init__.py +0 -0
  39. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/transforms/chunker/__init__.py +0 -0
  40. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/__init__.py +0 -0
  41. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/base.py +0 -0
  42. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/doc/__init__.py +0 -0
  43. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/doc/base.py +0 -0
  44. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/doc/utils.py +0 -0
  45. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/gen/__init__.py +0 -0
  46. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/gen/generic.py +0 -0
  47. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/io/__init__.py +0 -0
  48. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/legacy_doc/__init__.py +0 -0
  49. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/legacy_doc/base.py +0 -0
  50. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  51. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  52. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  53. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/legacy_doc/document.py +0 -0
  54. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/legacy_doc/tokens.py +0 -0
  55. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/nlp/__init__.py +0 -0
  56. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/nlp/qa.py +0 -0
  57. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/nlp/qa_labels.py +0 -0
  58. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/rec/__init__.py +0 -0
  59. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/rec/attribute.py +0 -0
  60. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/rec/base.py +0 -0
  61. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/rec/predicate.py +0 -0
  62. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/rec/record.py +0 -0
  63. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/rec/statement.py +0 -0
  64. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/types/rec/subject.py +0 -0
  65. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/utils/__init__.py +0 -0
  66. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/utils/alias.py +0 -0
  67. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/utils/file.py +0 -0
  68. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/utils/generate_docs.py +0 -0
  69. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/utils/generate_jsonschema.py +0 -0
  70. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/utils/legacy.py +0 -0
  71. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/utils/validate.py +0 -0
  72. {docling_core-2.25.0 → docling_core-2.26.1}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.25.0
3
+ Version: 2.26.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://github.com/docling-project
6
6
  License: MIT
@@ -11,6 +11,7 @@ from typing import Optional, Union
11
11
  from pydantic import AnyUrl, BaseModel
12
12
 
13
13
  from docling_core.types.doc.document import (
14
+ DocItem,
14
15
  DoclingDocument,
15
16
  FloatingItem,
16
17
  FormItem,
@@ -25,10 +26,19 @@ from docling_core.types.doc.document import (
25
26
  )
26
27
 
27
28
 
29
+ class Span(BaseModel):
30
+ """Class encapsulating fine-granular document span information."""
31
+
32
+ item: DocItem
33
+ # prov_idx: Optional[PositiveInt] = None # None to be interpreted as whole DocItem
34
+
35
+
28
36
  class SerializationResult(BaseModel):
29
37
  """SerializationResult."""
30
38
 
31
- text: str
39
+ text: str = ""
40
+ spans: list[Span] = []
41
+ # group: Optional[GroupItem] = None # set when result reflects specific group item
32
42
 
33
43
 
34
44
  class BaseTextSerializer(ABC):
@@ -163,7 +173,9 @@ class BaseDocSerializer(ABC):
163
173
  """Base class for document serializers."""
164
174
 
165
175
  @abstractmethod
166
- def serialize(self, **kwargs) -> SerializationResult:
176
+ def serialize(
177
+ self, *, item: Optional[NodeItem] = None, **kwargs
178
+ ) -> SerializationResult:
167
179
  """Run the serialization."""
168
180
  ...
169
181
 
@@ -222,6 +234,20 @@ class BaseDocSerializer(ABC):
222
234
  ...
223
235
 
224
236
  @abstractmethod
225
- def get_excluded_refs(self, **kwargs) -> list[str]:
237
+ def get_excluded_refs(self, **kwargs) -> set[str]:
226
238
  """Get references to excluded items."""
227
239
  ...
240
+
241
+ @abstractmethod
242
+ def requires_page_break(self) -> bool:
243
+ """Whether to add page breaks."""
244
+ ...
245
+
246
+
247
+ class BaseSerializerProvider(ABC):
248
+ """Base class for document serializer providers."""
249
+
250
+ @abstractmethod
251
+ def get_serializer(self, doc: DoclingDocument) -> BaseDocSerializer:
252
+ """Get a the associated serializer."""
253
+ ...
@@ -4,12 +4,12 @@
4
4
  #
5
5
 
6
6
  """Define base classes for serialization."""
7
+ import re
7
8
  import sys
8
9
  from abc import abstractmethod
9
- from copy import deepcopy
10
10
  from functools import cached_property
11
11
  from pathlib import Path
12
- from typing import Any, Optional, Union
12
+ from typing import Any, Iterable, Optional, Tuple, Union
13
13
 
14
14
  from pydantic import AnyUrl, BaseModel, NonNegativeInt, computed_field
15
15
  from typing_extensions import Self, override
@@ -25,6 +25,7 @@ from docling_core.experimental.serializer.base import (
25
25
  BaseTableSerializer,
26
26
  BaseTextSerializer,
27
27
  SerializationResult,
28
+ Span,
28
29
  )
29
30
  from docling_core.types.doc.document import (
30
31
  DOCUMENT_TOKENS_EXPORT_LABELS,
@@ -49,6 +50,81 @@ _DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS
49
50
  _DEFAULT_LAYERS = {cl for cl in ContentLayer}
50
51
 
51
52
 
53
+ class _PageBreakNode(NodeItem):
54
+ """Page break node."""
55
+
56
+ prev_page: int
57
+ next_page: int
58
+
59
+
60
+ class _PageBreakSerResult(SerializationResult):
61
+ """Page break serialization result."""
62
+
63
+ node: _PageBreakNode
64
+
65
+
66
+ def _iterate_items(
67
+ doc: DoclingDocument,
68
+ layers: Optional[set[ContentLayer]],
69
+ node: Optional[NodeItem] = None,
70
+ traverse_pictures: bool = False,
71
+ add_page_breaks: bool = False,
72
+ ):
73
+ prev_page_nr: Optional[int] = None
74
+ page_break_i = 0
75
+ for item, _ in doc.iterate_items(
76
+ root=node,
77
+ with_groups=True,
78
+ included_content_layers=layers,
79
+ traverse_pictures=traverse_pictures,
80
+ ):
81
+ if isinstance(item, DocItem):
82
+ if item.prov:
83
+ page_no = item.prov[0].page_no
84
+ if add_page_breaks and (prev_page_nr is None or page_no > prev_page_nr):
85
+ if prev_page_nr is not None: # close previous range
86
+ yield _PageBreakNode(
87
+ self_ref=f"#/pb/{page_break_i}",
88
+ prev_page=prev_page_nr,
89
+ next_page=page_no,
90
+ )
91
+ page_break_i += 1
92
+ prev_page_nr = page_no
93
+ yield item
94
+
95
+
96
+ def create_ser_result(
97
+ *,
98
+ text: str = "",
99
+ span_source: Union[DocItem, list[SerializationResult]] = [],
100
+ ) -> SerializationResult:
101
+ """Function for creating `SerializationResult` instances.
102
+
103
+ Args:
104
+ text: the text to use. Defaults to "".
105
+ span_source: the item or list of results to use as span source. Defaults to [].
106
+
107
+ Returns:
108
+ The created `SerializationResult`.
109
+ """
110
+ spans: list[Span]
111
+ if isinstance(span_source, DocItem):
112
+ spans = [Span(item=span_source)]
113
+ else:
114
+ results: list[SerializationResult] = span_source
115
+ spans = []
116
+ span_ids: set[str] = set()
117
+ for ser_res in results:
118
+ for span in ser_res.spans:
119
+ if (span_id := span.item.self_ref) not in span_ids:
120
+ span_ids.add(span_id)
121
+ spans.append(span)
122
+ return SerializationResult(
123
+ text=text,
124
+ spans=spans,
125
+ )
126
+
127
+
52
128
  class CommonParams(BaseModel):
53
129
  """Common serialization parameters."""
54
130
 
@@ -95,7 +171,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
95
171
 
96
172
  params: CommonParams = CommonParams()
97
173
 
98
- _excluded_refs_cache: dict[str, list[str]] = {}
174
+ _excluded_refs_cache: dict[str, set[str]] = {}
99
175
 
100
176
  @computed_field # type: ignore[misc]
101
177
  @cached_property
@@ -113,19 +189,19 @@ class DocSerializer(BaseModel, BaseDocSerializer):
113
189
  return refs
114
190
 
115
191
  @override
116
- def get_excluded_refs(self, **kwargs) -> list[str]:
192
+ def get_excluded_refs(self, **kwargs) -> set[str]:
117
193
  """References to excluded items."""
118
194
  params = self.params.merge_with_patch(patch=kwargs)
119
195
  params_json = params.model_dump_json()
120
196
  refs = self._excluded_refs_cache.get(params_json)
121
197
  if refs is None:
122
- refs = [
198
+ refs = {
123
199
  item.self_ref
124
- for ix, (item, _) in enumerate(
125
- self.doc.iterate_items(
126
- with_groups=True,
200
+ for ix, item in enumerate(
201
+ _iterate_items(
202
+ doc=self.doc,
127
203
  traverse_pictures=True,
128
- included_content_layers=params.layers,
204
+ layers=params.layers,
129
205
  )
130
206
  )
131
207
  if (
@@ -145,56 +221,21 @@ class DocSerializer(BaseModel, BaseDocSerializer):
145
221
  )
146
222
  )
147
223
  )
148
- ]
224
+ }
149
225
  self._excluded_refs_cache[params_json] = refs
150
226
  return refs
151
227
 
152
228
  @abstractmethod
153
- def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
154
- """Serialize a page out of its parts."""
155
- ...
156
-
157
- @abstractmethod
158
- def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
229
+ def serialize_doc(
230
+ self, *, parts: list[SerializationResult], **kwargs
231
+ ) -> SerializationResult:
159
232
  """Serialize a document out of its pages."""
160
233
  ...
161
234
 
162
235
  def _serialize_body(self) -> SerializationResult:
163
236
  """Serialize the document body."""
164
- # find page ranges if available; otherwise regard whole doc as a single page
165
- last_page: Optional[int] = None
166
- starts: list[int] = []
167
- for ix, (item, _) in enumerate(
168
- self.doc.iterate_items(
169
- with_groups=True,
170
- traverse_pictures=True,
171
- included_content_layers=self.params.layers,
172
- )
173
- ):
174
- if isinstance(item, DocItem):
175
- if item.prov:
176
- if last_page is None or item.prov[0].page_no > last_page:
177
- starts.append(ix)
178
- last_page = item.prov[0].page_no
179
- page_ranges = [
180
- (
181
- (starts[i] if i > 0 else 0),
182
- (starts[i + 1] if i < len(starts) - 1 else sys.maxsize),
183
- )
184
- for i, _ in enumerate(starts)
185
- ] or [
186
- (0, sys.maxsize)
187
- ] # use whole range if no pages detected
188
-
189
- page_results: list[SerializationResult] = []
190
- for page_range in page_ranges:
191
- params_to_pass = deepcopy(self.params)
192
- params_to_pass.start_idx = page_range[0]
193
- params_to_pass.stop_idx = page_range[1]
194
- subparts = self.get_parts(**params_to_pass.model_dump())
195
- page_res = self.serialize_page(subparts)
196
- page_results.append(page_res)
197
- res = self.serialize_doc(page_results)
237
+ subparts = self.get_parts()
238
+ res = self.serialize_doc(parts=subparts)
198
239
  return res
199
240
 
200
241
  @override
@@ -209,7 +250,8 @@ class DocSerializer(BaseModel, BaseDocSerializer):
209
250
  ) -> SerializationResult:
210
251
  """Serialize a given node."""
211
252
  my_visited: set[str] = visited if visited is not None else set()
212
- empty_res = SerializationResult(text="")
253
+ my_kwargs = self.params.merge_with_patch(patch=kwargs).model_dump()
254
+ empty_res = create_ser_result()
213
255
  if item is None or item == self.doc.body:
214
256
  if self.doc.body.self_ref not in my_visited:
215
257
  my_visited.add(self.doc.body.self_ref)
@@ -217,6 +259,8 @@ class DocSerializer(BaseModel, BaseDocSerializer):
217
259
  else:
218
260
  return empty_res
219
261
 
262
+ my_visited.add(item.self_ref)
263
+
220
264
  ########
221
265
  # groups
222
266
  ########
@@ -228,7 +272,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
228
272
  list_level=list_level,
229
273
  is_inline_scope=is_inline_scope,
230
274
  visited=my_visited,
231
- **kwargs,
275
+ **my_kwargs,
232
276
  )
233
277
  elif isinstance(item, InlineGroup):
234
278
  part = self.inline_serializer.serialize(
@@ -237,7 +281,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
237
281
  doc=self.doc,
238
282
  list_level=list_level,
239
283
  visited=my_visited,
240
- **kwargs,
284
+ **my_kwargs,
241
285
  )
242
286
  ###########
243
287
  # doc items
@@ -253,7 +297,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
253
297
  doc_serializer=self,
254
298
  doc=self.doc,
255
299
  is_inline_scope=is_inline_scope,
256
- **kwargs,
300
+ **my_kwargs,
257
301
  )
258
302
  if item.self_ref not in self.get_excluded_refs(**kwargs)
259
303
  else empty_res
@@ -263,7 +307,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
263
307
  item=item,
264
308
  doc_serializer=self,
265
309
  doc=self.doc,
266
- **kwargs,
310
+ **my_kwargs,
267
311
  )
268
312
  elif isinstance(item, PictureItem):
269
313
  part = self.picture_serializer.serialize(
@@ -271,28 +315,33 @@ class DocSerializer(BaseModel, BaseDocSerializer):
271
315
  doc_serializer=self,
272
316
  doc=self.doc,
273
317
  visited=my_visited,
274
- **kwargs,
318
+ **my_kwargs,
275
319
  )
276
320
  elif isinstance(item, KeyValueItem):
277
321
  part = self.key_value_serializer.serialize(
278
322
  item=item,
279
323
  doc_serializer=self,
280
324
  doc=self.doc,
281
- **kwargs,
325
+ **my_kwargs,
282
326
  )
283
327
  elif isinstance(item, FormItem):
284
328
  part = self.form_serializer.serialize(
285
329
  item=item,
286
330
  doc_serializer=self,
287
331
  doc=self.doc,
288
- **kwargs,
332
+ **my_kwargs,
333
+ )
334
+ elif isinstance(item, _PageBreakNode):
335
+ part = _PageBreakSerResult(
336
+ text=self._create_page_break(node=item),
337
+ node=item,
289
338
  )
290
339
  else:
291
340
  part = self.fallback_serializer.serialize(
292
341
  item=item,
293
342
  doc_serializer=self,
294
343
  doc=self.doc,
295
- **kwargs,
344
+ **my_kwargs,
296
345
  )
297
346
  return part
298
347
 
@@ -312,18 +361,19 @@ class DocSerializer(BaseModel, BaseDocSerializer):
312
361
  parts: list[SerializationResult] = []
313
362
  my_visited: set[str] = visited if visited is not None else set()
314
363
  params = self.params.merge_with_patch(patch=kwargs)
315
- for item, _ in self.doc.iterate_items(
316
- root=item,
317
- with_groups=True,
318
- traverse_pictures=traverse_pictures,
319
- included_content_layers=params.layers,
364
+
365
+ for node in _iterate_items(
366
+ node=item,
367
+ doc=self.doc,
368
+ layers=params.layers,
369
+ add_page_breaks=self.requires_page_break(),
320
370
  ):
321
- if item.self_ref in my_visited:
371
+ if node.self_ref in my_visited:
322
372
  continue
323
373
  else:
324
- my_visited.add(item.self_ref)
374
+ my_visited.add(node.self_ref)
325
375
  part = self.serialize(
326
- item=item,
376
+ item=node,
327
377
  list_level=list_level,
328
378
  is_inline_scope=is_inline_scope,
329
379
  visited=my_visited,
@@ -393,15 +443,51 @@ class DocSerializer(BaseModel, BaseDocSerializer):
393
443
  ) -> SerializationResult:
394
444
  """Serialize the item's captions."""
395
445
  params = self.params.merge_with_patch(patch=kwargs)
446
+ results: list[SerializationResult] = []
396
447
  if DocItemLabel.CAPTION in params.labels:
397
- text_parts: list[str] = [
398
- it.text
448
+ results = [
449
+ create_ser_result(text=it.text, span_source=it)
399
450
  for cap in item.captions
400
451
  if isinstance(it := cap.resolve(self.doc), TextItem)
401
452
  and it.self_ref not in self.get_excluded_refs(**kwargs)
402
453
  ]
403
- text_res = params.caption_delim.join(text_parts)
454
+ text_res = params.caption_delim.join([r.text for r in results])
404
455
  text_res = self.post_process(text=text_res)
405
456
  else:
406
457
  text_res = ""
407
- return SerializationResult(text=text_res)
458
+ return create_ser_result(text=text_res, span_source=results)
459
+
460
+ def _get_applicable_pages(self) -> Optional[list[int]]:
461
+ pages = {
462
+ item.prov[0].page_no: ...
463
+ for ix, (item, _) in enumerate(
464
+ self.doc.iterate_items(
465
+ with_groups=True,
466
+ included_content_layers=self.params.layers,
467
+ traverse_pictures=True,
468
+ )
469
+ )
470
+ if (
471
+ isinstance(item, DocItem)
472
+ and item.prov
473
+ and (
474
+ self.params.pages is None
475
+ or item.prov[0].page_no in self.params.pages
476
+ )
477
+ and ix >= self.params.start_idx
478
+ and ix < self.params.stop_idx
479
+ )
480
+ }
481
+ return [p for p in pages] or None
482
+
483
+ def _create_page_break(self, node: _PageBreakNode) -> str:
484
+ return f"#_#_DOCLING_DOC_PAGE_BREAK_{node.prev_page}_{node.next_page}_#_#"
485
+
486
+ def _get_page_breaks(self, text: str) -> Iterable[Tuple[str, int, int]]:
487
+ pattern = r"#_#_DOCLING_DOC_PAGE_BREAK_(\d+)_(\d+)_#_#"
488
+ matches = re.finditer(pattern, text)
489
+ for match in matches:
490
+ full_match = match.group(0)
491
+ prev_page_nr = int(match.group(1))
492
+ next_page_nr = int(match.group(2))
493
+ yield (full_match, prev_page_nr, next_page_nr)