docling-core 2.46.0__py3-none-any.whl → 2.48.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -145,7 +145,7 @@ class TripletTableSerializer(BaseTableSerializer):
145
145
  parts.append(cap_res)
146
146
 
147
147
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
148
- table_df = item.export_to_dataframe()
148
+ table_df = item.export_to_dataframe(doc)
149
149
  if table_df.shape[0] >= 1 and table_df.shape[1] >= 2:
150
150
 
151
151
  # copy header as first row and shift all rows by one
@@ -394,6 +394,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
394
394
  item=item,
395
395
  doc_serializer=self,
396
396
  doc=self.doc,
397
+ visited=my_visited,
397
398
  **my_kwargs,
398
399
  )
399
400
  return part
@@ -32,6 +32,7 @@ from docling_core.types.doc.document import (
32
32
  DoclingDocument,
33
33
  FloatingItem,
34
34
  FormItem,
35
+ GroupItem,
35
36
  InlineGroup,
36
37
  KeyValueItem,
37
38
  ListGroup,
@@ -42,6 +43,7 @@ from docling_core.types.doc.document import (
42
43
  PictureMoleculeData,
43
44
  PictureTabularChartData,
44
45
  ProvenanceItem,
46
+ SectionHeaderItem,
45
47
  TableItem,
46
48
  TextItem,
47
49
  )
@@ -94,11 +96,11 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
94
96
  item: TextItem,
95
97
  doc_serializer: BaseDocSerializer,
96
98
  doc: DoclingDocument,
99
+ visited: Optional[set[str]] = None,
97
100
  **kwargs: Any,
98
101
  ) -> SerializationResult:
99
102
  """Serializes the passed item."""
100
- from docling_core.types.doc.document import SectionHeaderItem
101
-
103
+ my_visited = visited if visited is not None else set()
102
104
  params = DocTagsParams(**kwargs)
103
105
  wrap_tag: Optional[str] = DocumentToken.create_token_name_from_doc_item_label(
104
106
  label=item.label,
@@ -116,12 +118,21 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
116
118
  parts.append(location)
117
119
 
118
120
  if params.add_content:
119
- text_part = item.text
120
- text_part = doc_serializer.post_process(
121
- text=text_part,
122
- formatting=item.formatting,
123
- hyperlink=item.hyperlink,
124
- )
121
+ if (
122
+ item.text == ""
123
+ and len(item.children) == 1
124
+ and isinstance(
125
+ (child_group := item.children[0].resolve(doc)), InlineGroup
126
+ )
127
+ ):
128
+ ser_res = doc_serializer.serialize(item=child_group, visited=my_visited)
129
+ text_part = ser_res.text
130
+ else:
131
+ text_part = doc_serializer.post_process(
132
+ text=item.text,
133
+ formatting=item.formatting,
134
+ hyperlink=item.hyperlink,
135
+ )
125
136
 
126
137
  if isinstance(item, CodeItem):
127
138
  language_token = DocumentToken.get_code_language_token(
@@ -506,7 +517,12 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
506
517
  **kwargs: Any,
507
518
  ) -> SerializationResult:
508
519
  """Serializes the passed item."""
509
- return create_ser_result()
520
+ if isinstance(item, GroupItem):
521
+ parts = doc_serializer.get_parts(item=item, **kwargs)
522
+ text_res = "\n".join([p.text for p in parts if p.text])
523
+ return create_ser_result(text=text_res, span_source=parts)
524
+ else:
525
+ return create_ser_result()
510
526
 
511
527
 
512
528
  class DocTagsAnnotationSerializer(BaseAnnotationSerializer):
@@ -55,6 +55,7 @@ from docling_core.types.doc.document import (
55
55
  FormItem,
56
56
  FormulaItem,
57
57
  GraphData,
58
+ GroupItem,
58
59
  ImageRef,
59
60
  InlineGroup,
60
61
  KeyValueItem,
@@ -139,21 +140,34 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
139
140
  res_parts: list[SerializationResult] = []
140
141
  post_processed = False
141
142
 
142
- # Prepare the HTML based on item type
143
- if isinstance(item, TitleItem):
144
- text_inner = self._prepare_content(item.text)
145
- text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
143
+ has_inline_repr = (
144
+ item.text == ""
145
+ and len(item.children) == 1
146
+ and isinstance((child_group := item.children[0].resolve(doc)), InlineGroup)
147
+ )
148
+ if has_inline_repr:
149
+ text = doc_serializer.serialize(item=child_group, visited=my_visited).text
150
+ post_processed = True
151
+ else:
152
+ text = item.text
153
+ if not isinstance(item, (CodeItem, FormulaItem)):
154
+ text = html.escape(text, quote=False)
155
+ text = text.replace("\n", "<br>")
146
156
 
147
- elif isinstance(item, SectionHeaderItem):
148
- section_level = min(item.level + 1, 6)
149
- text_inner = self._prepare_content(item.text)
157
+ # Prepare the HTML based on item type
158
+ if isinstance(item, (TitleItem, SectionHeaderItem)):
159
+ section_level = (
160
+ min(item.level + 1, 6) if isinstance(item, SectionHeaderItem) else 1
161
+ )
150
162
  text = get_html_tag_with_text_direction(
151
- html_tag=f"h{section_level}", text=text_inner
163
+ html_tag=f"h{section_level}", text=text
152
164
  )
153
165
 
154
166
  elif isinstance(item, FormulaItem):
155
167
  text = self._process_formula(
156
168
  item=item,
169
+ text=text,
170
+ orig=item.orig,
157
171
  doc=doc,
158
172
  image_mode=params.image_mode,
159
173
  formula_to_mathml=params.formula_to_mathml,
@@ -161,19 +175,26 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
161
175
  )
162
176
 
163
177
  elif isinstance(item, CodeItem):
164
- text = self._process_code(item=item, is_inline_scope=is_inline_scope)
178
+ text = (
179
+ f"<code>{text}</code>"
180
+ if is_inline_scope
181
+ else f"<pre><code>{text}</code></pre>"
182
+ )
165
183
 
166
184
  elif isinstance(item, ListItem):
167
185
  # List items are handled by list serializer
168
186
  text_parts: list[str] = []
169
- if item_text := self._prepare_content(item.text):
170
- item_text = doc_serializer.post_process(
171
- text=item_text,
172
- formatting=item.formatting,
173
- hyperlink=item.hyperlink,
174
- )
175
- post_processed = True
176
- text_parts.append(item_text)
187
+ if text:
188
+ if has_inline_repr:
189
+ text = f"\n{text}\n"
190
+ else:
191
+ text = doc_serializer.post_process(
192
+ text=text,
193
+ formatting=item.formatting,
194
+ hyperlink=item.hyperlink,
195
+ )
196
+ post_processed = True
197
+ text_parts.append(text)
177
198
  nested_parts = [
178
199
  r.text
179
200
  for r in doc_serializer.get_parts(
@@ -184,29 +205,26 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
184
205
  )
185
206
  ]
186
207
  text_parts.extend(nested_parts)
187
- text_inner = "\n".join(text_parts)
208
+ text = "\n".join(text_parts)
188
209
  if nested_parts:
189
- text_inner = f"\n{text_inner}\n"
210
+ text = f"\n{text}\n"
190
211
  text = (
191
212
  get_html_tag_with_text_direction(
192
213
  html_tag="li",
193
- text=text_inner,
214
+ text=text,
194
215
  attrs=(
195
216
  {"style": f"list-style-type: '{item.marker} ';"}
196
217
  if params.show_original_list_item_marker and item.marker
197
218
  else {}
198
219
  ),
199
220
  )
200
- if text_inner
221
+ if text
201
222
  else ""
202
223
  )
203
224
 
204
- elif is_inline_scope:
205
- text = self._prepare_content(item.text)
206
- else:
225
+ elif not is_inline_scope:
207
226
  # Regular text item
208
- text_inner = self._prepare_content(item.text)
209
- text = get_html_tag_with_text_direction(html_tag="p", text=text_inner)
227
+ text = get_html_tag_with_text_direction(html_tag="p", text=text)
210
228
 
211
229
  # Apply formatting and hyperlinks
212
230
  if not post_processed:
@@ -227,66 +245,44 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
227
245
 
228
246
  return create_ser_result(text=text, span_source=res_parts)
229
247
 
230
- def _prepare_content(
231
- self, text: str, do_escape_html=True, do_replace_newline=True
232
- ) -> str:
233
- """Prepare text content for HTML inclusion."""
234
- if do_escape_html:
235
- text = html.escape(text, quote=False)
236
- if do_replace_newline:
237
- text = text.replace("\n", "<br>")
238
- return text
239
-
240
- def _process_code(
241
- self,
242
- item: CodeItem,
243
- is_inline_scope: bool,
244
- ) -> str:
245
- code_text = self._prepare_content(
246
- item.text, do_escape_html=False, do_replace_newline=False
247
- )
248
- if is_inline_scope:
249
- text = f"<code>{code_text}</code>"
250
- else:
251
- text = f"<pre><code>{code_text}</code></pre>"
252
-
253
- return text
254
-
255
248
  def _process_formula(
256
249
  self,
257
- item: FormulaItem,
250
+ *,
251
+ item: DocItem,
252
+ text: str,
253
+ orig: str,
258
254
  doc: DoclingDocument,
259
255
  image_mode: ImageRefMode,
260
256
  formula_to_mathml: bool,
261
257
  is_inline_scope: bool,
262
258
  ) -> str:
263
259
  """Process a formula item to HTML/MathML."""
264
- math_formula = self._prepare_content(
265
- item.text, do_escape_html=False, do_replace_newline=False
266
- )
267
-
268
260
  # If formula is empty, try to use an image fallback
269
- if item.text == "" and item.orig != "":
270
- img_fallback = self._get_formula_image_fallback(item, doc)
271
- if (
272
- image_mode == ImageRefMode.EMBEDDED
273
- and len(item.prov) > 0
274
- and img_fallback
275
- ):
276
- return img_fallback
261
+ if (
262
+ text == ""
263
+ and orig != ""
264
+ and len(item.prov) > 0
265
+ and image_mode == ImageRefMode.EMBEDDED
266
+ and (
267
+ img_fallback := self._get_formula_image_fallback(
268
+ item=item, orig=orig, doc=doc
269
+ )
270
+ )
271
+ ):
272
+ return img_fallback
277
273
 
278
274
  # Try to generate MathML
279
- if formula_to_mathml and math_formula:
275
+ elif formula_to_mathml and text:
280
276
  try:
281
277
  # Set display mode based on context
282
278
  display_mode = "inline" if is_inline_scope else "block"
283
279
  mathml_element = latex2mathml.converter.convert_to_element(
284
- math_formula, display=display_mode
280
+ text, display=display_mode
285
281
  )
286
282
  annotation = SubElement(
287
283
  mathml_element, "annotation", dict(encoding="TeX")
288
284
  )
289
- annotation.text = math_formula
285
+ annotation.text = text
290
286
  mathml = unescape(tostring(mathml_element, encoding="unicode"))
291
287
 
292
288
  # Don't wrap in div for inline formulas
@@ -296,40 +292,40 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
296
292
  return f"<div>{mathml}</div>"
297
293
 
298
294
  except Exception:
299
- img_fallback = self._get_formula_image_fallback(item, doc)
295
+ img_fallback = self._get_formula_image_fallback(
296
+ item=item, orig=orig, doc=doc
297
+ )
300
298
  if (
301
299
  image_mode == ImageRefMode.EMBEDDED
302
300
  and len(item.prov) > 0
303
301
  and img_fallback
304
302
  ):
305
303
  return img_fallback
306
- elif math_formula:
307
- return f"<pre>{math_formula}</pre>"
304
+ elif text:
305
+ return f"<pre>{text}</pre>"
308
306
  else:
309
307
  return "<pre>Formula not decoded</pre>"
310
308
 
311
309
  _logger.warning("Could not parse formula with MathML")
312
310
 
313
311
  # Fallback options if we got here
314
- if math_formula and is_inline_scope:
315
- return f"<code>{math_formula}</code>"
316
- elif math_formula and (not is_inline_scope):
317
- f"<pre>{math_formula}</pre>"
312
+ if text and is_inline_scope:
313
+ return f"<code>{text}</code>"
314
+ elif text and (not is_inline_scope):
315
+ f"<pre>{text}</pre>"
318
316
  elif is_inline_scope:
319
317
  return '<span class="formula-not-decoded">Formula not decoded</span>'
320
318
 
321
319
  return '<div class="formula-not-decoded">Formula not decoded</div>'
322
320
 
323
321
  def _get_formula_image_fallback(
324
- self, item: TextItem, doc: DoclingDocument
322
+ self, *, item: DocItem, orig: str, doc: DoclingDocument
325
323
  ) -> Optional[str]:
326
324
  """Try to get an image fallback for a formula."""
327
325
  item_image = item.get_image(doc=doc)
328
326
  if item_image is not None:
329
327
  img_ref = ImageRef.from_pil(item_image, dpi=72)
330
- return (
331
- "<figure>" f'<img src="{img_ref.uri}" alt="{item.orig}" />' "</figure>"
332
- )
328
+ return "<figure>" f'<img src="{img_ref.uri}" alt="{orig}" />' "</figure>"
333
329
  return None
334
330
 
335
331
 
@@ -792,21 +788,30 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
792
788
  """HTML-specific fallback serializer."""
793
789
 
794
790
  @override
795
- def serialize(self, *, item: NodeItem, **kwargs: Any) -> SerializationResult:
791
+ def serialize(
792
+ self,
793
+ *,
794
+ item: NodeItem,
795
+ doc_serializer: "BaseDocSerializer",
796
+ doc: DoclingDocument,
797
+ **kwargs: Any,
798
+ ) -> SerializationResult:
796
799
  """Fallback serializer for items not handled by other serializers."""
797
- if isinstance(item, DocItem):
800
+ if isinstance(item, GroupItem):
801
+ parts = doc_serializer.get_parts(item=item, **kwargs)
802
+ text_res = "\n".join([p.text for p in parts if p.text])
803
+ return create_ser_result(text=text_res, span_source=parts)
804
+ else:
798
805
  return create_ser_result(
799
806
  text=f"<!-- Unhandled item type: {item.__class__.__name__} -->",
800
- span_source=item,
807
+ span_source=item if isinstance(item, DocItem) else [],
801
808
  )
802
- else:
803
- # For group items, we don't generate any markup
804
- return create_ser_result()
805
809
 
806
810
 
807
811
  class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
808
812
  """HTML-specific annotation serializer."""
809
813
 
814
+ @override
810
815
  def serialize(
811
816
  self,
812
817
  *,
@@ -45,6 +45,7 @@ from docling_core.types.doc.document import (
45
45
  Formatting,
46
46
  FormItem,
47
47
  FormulaItem,
48
+ GroupItem,
48
49
  ImageRef,
49
50
  InlineGroup,
50
51
  KeyValueItem,
@@ -124,26 +125,24 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
124
125
  my_visited = visited if visited is not None else set()
125
126
  params = MarkdownParams(**kwargs)
126
127
  res_parts: list[SerializationResult] = []
127
- text = item.text
128
128
  escape_html = True
129
129
  escape_underscores = True
130
- processing_pending = True
131
- if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
132
- # case where processing/formatting should be applied first (in inner scope)
130
+
131
+ has_inline_repr = (
132
+ item.text == ""
133
+ and len(item.children) == 1
134
+ and isinstance((child_group := item.children[0].resolve(doc)), InlineGroup)
135
+ )
136
+ if has_inline_repr:
137
+ text = doc_serializer.serialize(item=child_group, visited=my_visited).text
133
138
  processing_pending = False
134
- if (
135
- text == ""
136
- and len(item.children) == 1
137
- and isinstance(
138
- (child_group := item.children[0].resolve(doc)), InlineGroup
139
- )
140
- ):
141
- # case of inline within heading / list item
142
- ser_res = doc_serializer.serialize(item=child_group)
143
- text = ser_res.text
144
- for span in ser_res.spans:
145
- my_visited.add(span.item.self_ref)
146
- else:
139
+ else:
140
+ text = item.text
141
+ processing_pending = True
142
+
143
+ if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
144
+ if not has_inline_repr:
145
+ # case where processing/formatting should be applied first (in inner scope)
147
146
  text = doc_serializer.post_process(
148
147
  text=text,
149
148
  escape_html=escape_html,
@@ -151,6 +150,7 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
151
150
  formatting=item.formatting,
152
151
  hyperlink=item.hyperlink,
153
152
  )
153
+ processing_pending = False
154
154
 
155
155
  if isinstance(item, ListItem):
156
156
  pieces: list[str] = []
@@ -600,13 +600,15 @@ class MarkdownFallbackSerializer(BaseFallbackSerializer):
600
600
  **kwargs: Any,
601
601
  ) -> SerializationResult:
602
602
  """Serializes the passed item."""
603
- if isinstance(item, DocItem):
603
+ if isinstance(item, GroupItem):
604
+ parts = doc_serializer.get_parts(item=item, **kwargs)
605
+ text_res = "\n\n".join([p.text for p in parts if p.text])
606
+ return create_ser_result(text=text_res, span_source=parts)
607
+ else:
604
608
  return create_ser_result(
605
609
  text="<!-- missing-text -->",
606
- span_source=item,
610
+ span_source=item if isinstance(item, DocItem) else [],
607
611
  )
608
- else:
609
- return create_ser_result()
610
612
 
611
613
 
612
614
  class MarkdownDocSerializer(DocSerializer):
@@ -60,7 +60,7 @@ _logger = logging.getLogger(__name__)
60
60
 
61
61
  Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
62
62
  LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
63
- CURRENT_VERSION: Final = "1.6.0"
63
+ CURRENT_VERSION: Final = "1.7.0"
64
64
 
65
65
  DEFAULT_EXPORT_LABELS = {
66
66
  DocItemLabel.TITLE,
@@ -310,6 +310,7 @@ class TableCell(BaseModel):
310
310
  column_header: bool = False
311
311
  row_header: bool = False
312
312
  row_section: bool = False
313
+ fillable: bool = False
313
314
 
314
315
  @model_validator(mode="before")
315
316
  @classmethod
@@ -4045,7 +4046,7 @@ class DoclingDocument(BaseModel):
4045
4046
  root=root,
4046
4047
  with_groups=with_groups,
4047
4048
  traverse_pictures=traverse_pictures,
4048
- page_no=page_no,
4049
+ page_nrs={page_no} if page_no is not None else None,
4049
4050
  included_content_layers=included_content_layers,
4050
4051
  ):
4051
4052
  yield item, len(stack)
@@ -4055,7 +4056,7 @@ class DoclingDocument(BaseModel):
4055
4056
  root: Optional[NodeItem] = None,
4056
4057
  with_groups: bool = False,
4057
4058
  traverse_pictures: bool = False,
4058
- page_no: Optional[int] = None,
4059
+ page_nrs: Optional[set[int]] = None,
4059
4060
  included_content_layers: Optional[set[ContentLayer]] = None,
4060
4061
  _stack: Optional[list[int]] = None,
4061
4062
  ) -> typing.Iterable[Tuple[NodeItem, list[int]]]: # tuple of node and level
@@ -4078,8 +4079,8 @@ class DoclingDocument(BaseModel):
4078
4079
  and (
4079
4080
  not isinstance(root, DocItem)
4080
4081
  or (
4081
- page_no is None
4082
- or any(prov.page_no == page_no for prov in root.prov)
4082
+ page_nrs is None
4083
+ or any(prov.page_no in page_nrs for prov in root.prov)
4083
4084
  )
4084
4085
  )
4085
4086
  and root.content_layer in my_layers
@@ -4113,7 +4114,7 @@ class DoclingDocument(BaseModel):
4113
4114
  child,
4114
4115
  with_groups=with_groups,
4115
4116
  traverse_pictures=traverse_pictures,
4116
- page_no=page_no,
4117
+ page_nrs=page_nrs,
4117
4118
  _stack=my_stack,
4118
4119
  included_content_layers=my_layers,
4119
4120
  )
@@ -5603,7 +5604,9 @@ class DoclingDocument(BaseModel):
5603
5604
  def get_item_list(self, key: str) -> list[NodeItem]:
5604
5605
  return getattr(self, key)
5605
5606
 
5606
- def index(self, doc: "DoclingDocument") -> None:
5607
+ def index(
5608
+ self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None
5609
+ ) -> None:
5607
5610
 
5608
5611
  orig_ref_to_new_ref: dict[str, str] = {}
5609
5612
  page_delta = self._max_page - min(doc.pages.keys()) + 1 if doc.pages else 0
@@ -5614,10 +5617,11 @@ class DoclingDocument(BaseModel):
5614
5617
  self._names.append(doc.name)
5615
5618
 
5616
5619
  # collect items in traversal order
5617
- for item, _ in doc.iterate_items(
5620
+ for item, _ in doc._iterate_items_with_stack(
5618
5621
  with_groups=True,
5619
5622
  traverse_pictures=True,
5620
5623
  included_content_layers={c for c in ContentLayer},
5624
+ page_nrs=page_nrs,
5621
5625
  ):
5622
5626
  key = item.self_ref.split("/")[1]
5623
5627
  is_body = key == "body"
@@ -5686,12 +5690,13 @@ class DoclingDocument(BaseModel):
5686
5690
  # update pages
5687
5691
  new_max_page = None
5688
5692
  for page_nr in doc.pages:
5689
- new_page = copy.deepcopy(doc.pages[page_nr])
5690
- new_page_nr = page_nr + page_delta
5691
- new_page.page_no = new_page_nr
5692
- self.pages[new_page_nr] = new_page
5693
- if new_max_page is None or new_page_nr > new_max_page:
5694
- new_max_page = new_page_nr
5693
+ if page_nrs is None or page_nr in page_nrs:
5694
+ new_page = copy.deepcopy(doc.pages[page_nr])
5695
+ new_page_nr = page_nr + page_delta
5696
+ new_page.page_no = new_page_nr
5697
+ self.pages[new_page_nr] = new_page
5698
+ if new_max_page is None or new_page_nr > new_max_page:
5699
+ new_max_page = new_page_nr
5695
5700
  if new_max_page is not None:
5696
5701
  self._max_page = new_max_page
5697
5702
 
@@ -5715,6 +5720,14 @@ class DoclingDocument(BaseModel):
5715
5720
  doc_index.index(doc=self)
5716
5721
  self._update_from_index(doc_index)
5717
5722
 
5723
+ def filter(self, page_nrs: Optional[set[int]] = None) -> "DoclingDocument":
5724
+ """Create a new document based on the provided filter parameters."""
5725
+ doc_index = DoclingDocument._DocIndex()
5726
+ doc_index.index(doc=self, page_nrs=page_nrs)
5727
+ res_doc = DoclingDocument(name=self.name)
5728
+ res_doc._update_from_index(doc_index)
5729
+ return res_doc
5730
+
5718
5731
  @classmethod
5719
5732
  def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument":
5720
5733
  """Concatenate multiple documents into a single document."""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.46.0
3
+ Version: 2.48.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -19,7 +19,7 @@ docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75
19
19
  docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9ACDd57ds,106
20
20
  docling_core/transforms/chunker/__init__.py,sha256=Qg5RhC-2QqdXKEfjzNGJaVi0NqBCL3xAhKWJGOlrE3M,375
21
21
  docling_core/transforms/chunker/base.py,sha256=kJaRrGQynglG9wpy0IaAYTf4MKheWH5BAPzx4LE9yIg,2824
22
- docling_core/transforms/chunker/hierarchical_chunker.py,sha256=uDf-qGiIT_4JUEg9NOdzvDqAPOTqycKJ-jEpDkV3jJU,8243
22
+ docling_core/transforms/chunker/hierarchical_chunker.py,sha256=qc-gnuxji-2lrlZCRr34VubBciBTE4ClZ3QplgNpUx8,8246
23
23
  docling_core/transforms/chunker/hybrid_chunker.py,sha256=xjkz8hy3tXXzkJzf7QMFOEq_v8V7Jcs9tCY0Mxjge74,12548
24
24
  docling_core/transforms/chunker/page_chunker.py,sha256=gLUlqA_klK-rkuPVYuJKi3ZuTIGdd2HD7ces72AiZ2U,2018
25
25
  docling_core/transforms/chunker/tokenizer/__init__.py,sha256=-bhXOTpoI7SYk7vn47z8Ek-RZFjJk4TfZawxsFuNHnE,34
@@ -28,11 +28,11 @@ docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=aZ_RNQIzcNkAHGHZ
28
28
  docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
29
29
  docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
30
30
  docling_core/transforms/serializer/base.py,sha256=TI8Epj7gyxdTet9j-Rs4o5U09gfACfAIVoirlschviM,7266
31
- docling_core/transforms/serializer/common.py,sha256=RwfdzZ9FRSHQjKM0vskg1CVqar0Z_ms38arSlLAgITc,19150
32
- docling_core/transforms/serializer/doctags.py,sha256=VXPjAZPhBur7LaEeuqH9k31TgZWSN32lK8z8rJXzFwY,19935
33
- docling_core/transforms/serializer/html.py,sha256=GRfRaqFIb4FXRMplB4Agl4fSNa5jsHV7P4tBtFMro9I,38453
31
+ docling_core/transforms/serializer/common.py,sha256=vfJhu0b4vAcIres85PX774RQSTKu9RueBOWMO95FQyc,19186
32
+ docling_core/transforms/serializer/doctags.py,sha256=9_aV_ffTOTtQKZQTKz_I3kRTQ_GXHCePKwXnR-rnggA,20644
33
+ docling_core/transforms/serializer/html.py,sha256=h0yiDgTNIeOS-rJaMRfinUFgrZygd3MjheM7pjLw5F0,38380
34
34
  docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
35
- docling_core/transforms/serializer/markdown.py,sha256=hilGM1yWpbbRTjuEjfBRrhavspD5vFF_6SDvlKx8BrM,24230
35
+ docling_core/transforms/serializer/markdown.py,sha256=9Sy7xWSegX0zdQb9vPzEUFucyGQUA4TcQxMfE70SJsk,24354
36
36
  docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
37
37
  docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
38
38
  docling_core/transforms/visualizer/key_value_visualizer.py,sha256=fp7nFLy4flOSiavdRgg5y1Mu7WVLIDGh1zEHsq8kgVM,8979
@@ -43,7 +43,7 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
43
43
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
44
44
  docling_core/types/doc/__init__.py,sha256=Vsl3oJV3_BLpS7rIwvahhcWOwmEBvj7ZbQzQCCl-IQk,1678
45
45
  docling_core/types/doc/base.py,sha256=i98y4IF250adR-8BSS374K90fwfwG-vBfWh14tLC5Cs,15906
46
- docling_core/types/doc/document.py,sha256=Ab-JOc6fkzocXP3PcxPRXJPjLOhOTYo_0571vSr6VXo,202093
46
+ docling_core/types/doc/document.py,sha256=sZsLV6GfFF8TzTgD6C47a9YrurLZFhwqt8I9PZmYkJY,202734
47
47
  docling_core/types/doc/labels.py,sha256=-W1-LW6z0J9F9ExJqR0Wd1WeqWTaY3Unm-j1UkQGlC4,7330
48
48
  docling_core/types/doc/page.py,sha256=35h1xdtCM3-AaN8Dim9jDseZIiw-3GxpB-ofF-H2rQQ,41878
49
49
  docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
@@ -76,9 +76,9 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
76
76
  docling_core/utils/legacy.py,sha256=G7ed8fkBpIO8hG3DKEY83cHsrKJHyvDst_1jSdgBXMI,24406
77
77
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
78
78
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
79
- docling_core-2.46.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
80
- docling_core-2.46.0.dist-info/METADATA,sha256=txMHh-7y8N3RiJ_M_HbrsvzRyGPJVXv8UcA6_DpAfok,6453
81
- docling_core-2.46.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
82
- docling_core-2.46.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
83
- docling_core-2.46.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
84
- docling_core-2.46.0.dist-info/RECORD,,
79
+ docling_core-2.48.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
80
+ docling_core-2.48.0.dist-info/METADATA,sha256=WybgSJP5TG0mMu5sA2bN0pVKCoZxKCf4KR70MGK3904,6453
81
+ docling_core-2.48.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
82
+ docling_core-2.48.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
83
+ docling_core-2.48.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
84
+ docling_core-2.48.0.dist-info/RECORD,,