docling-core 2.44.2__py3-none-any.whl → 2.46.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. See the release advisory on the package registry for more details.

@@ -359,6 +359,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
359
359
  item=item,
360
360
  doc_serializer=self,
361
361
  doc=self.doc,
362
+ visited=my_visited,
362
363
  **my_kwargs,
363
364
  )
364
365
  elif isinstance(item, PictureItem):
@@ -157,6 +157,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
157
157
  item: TableItem,
158
158
  doc_serializer: BaseDocSerializer,
159
159
  doc: DoclingDocument,
160
+ visited: Optional[set[str]] = None,
160
161
  **kwargs: Any,
161
162
  ) -> SerializationResult:
162
163
  """Serializes the passed item."""
@@ -179,6 +180,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
179
180
  add_cell_text=params.add_table_cell_text,
180
181
  xsize=params.xsize,
181
182
  ysize=params.ysize,
183
+ visited=visited,
182
184
  )
183
185
  res_parts.append(create_ser_result(text=otsl_text, span_source=item))
184
186
 
@@ -65,8 +65,8 @@ from docling_core.types.doc.document import (
65
65
  PictureItem,
66
66
  PictureMoleculeData,
67
67
  PictureTabularChartData,
68
+ RichTableCell,
68
69
  SectionHeaderItem,
69
- TableCell,
70
70
  TableItem,
71
71
  TextItem,
72
72
  TitleItem,
@@ -346,9 +346,6 @@ class HTMLTableSerializer(BaseTableSerializer):
346
346
  **kwargs: Any,
347
347
  ) -> SerializationResult:
348
348
  """Serializes the passed table item to HTML."""
349
- nrows = item.data.num_rows
350
- ncols = item.data.num_cols
351
-
352
349
  res_parts: list[SerializationResult] = []
353
350
  cap_res = doc_serializer.serialize_captions(item=item, tag="caption", **kwargs)
354
351
  if cap_res.text:
@@ -356,11 +353,11 @@ class HTMLTableSerializer(BaseTableSerializer):
356
353
 
357
354
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
358
355
  body = ""
356
+ span_source: Union[DocItem, list[SerializationResult]] = []
359
357
 
360
- for i in range(nrows):
358
+ for i, row in enumerate(item.data.grid):
361
359
  body += "<tr>"
362
- for j in range(ncols):
363
- cell: TableCell = item.data.grid[i][j]
360
+ for j, cell in enumerate(row):
364
361
 
365
362
  rowspan, rowstart = (
366
363
  cell.row_span,
@@ -376,7 +373,16 @@ class HTMLTableSerializer(BaseTableSerializer):
376
373
  if colstart != j:
377
374
  continue
378
375
 
379
- content = html.escape(cell.text.strip())
376
+ if isinstance(cell, RichTableCell):
377
+ ser_res = doc_serializer.serialize(
378
+ item=cell.ref.resolve(doc=doc), **kwargs
379
+ )
380
+ content = ser_res.text
381
+ span_source = [ser_res]
382
+ else:
383
+ content = html.escape(cell.text.strip())
384
+ span_source = item
385
+
380
386
  celltag = "td"
381
387
  if cell.column_header or cell.row_header or cell.row_section:
382
388
  celltag = "th"
@@ -389,14 +395,14 @@ class HTMLTableSerializer(BaseTableSerializer):
389
395
 
390
396
  text_dir = get_text_direction(content)
391
397
  if text_dir == "rtl":
392
- opening_tag += f' dir="{dir}"'
398
+ opening_tag += f' dir="{text_dir}"'
393
399
 
394
400
  body += f"<{opening_tag}>{content}</{celltag}>"
395
401
  body += "</tr>"
396
402
 
397
403
  if body:
398
404
  body = f"<tbody>{body}</tbody>"
399
- res_parts.append(create_ser_result(text=body, span_source=item))
405
+ res_parts.append(create_ser_result(text=body, span_source=span_source))
400
406
 
401
407
  text_res = "".join([r.text for r in res_parts])
402
408
  text_res = f"<table>{text_res}</table>" if text_res else ""
@@ -1057,7 +1063,7 @@ class HTMLDocSerializer(DocSerializer):
1057
1063
  if self.params.html_head is not None:
1058
1064
  return self.params.html_head
1059
1065
 
1060
- head_parts = ["<head>", '<meta charset="UTF-8">']
1066
+ head_parts = ["<head>", '<meta charset="UTF-8"/>']
1061
1067
 
1062
1068
  # Add metadata if requested
1063
1069
  if params.add_document_metadata:
@@ -1067,7 +1073,7 @@ class HTMLDocSerializer(DocSerializer):
1067
1073
  head_parts.append("<title>Docling Document</title>")
1068
1074
 
1069
1075
  head_parts.append(
1070
- '<meta name="generator" content="Docling HTML Serializer">'
1076
+ '<meta name="generator" content="Docling HTML Serializer"/>'
1071
1077
  )
1072
1078
 
1073
1079
  # Add default styles or custom CSS
@@ -55,6 +55,7 @@ from docling_core.types.doc.document import (
55
55
  PictureItem,
56
56
  PictureMoleculeData,
57
57
  PictureTabularChartData,
58
+ RichTableCell,
58
59
  SectionHeaderItem,
59
60
  TableItem,
60
61
  TextItem,
@@ -320,7 +321,13 @@ class MarkdownTableSerializer(BaseTableSerializer):
320
321
  [
321
322
  # make sure that md tables are not broken
322
323
  # due to newline chars in the text
323
- col.text.replace("\n", " ")
324
+ (
325
+ doc_serializer.serialize(
326
+ item=col.ref.resolve(doc=doc), **kwargs
327
+ ).text
328
+ if isinstance(col, RichTableCell)
329
+ else col.text
330
+ ).replace("\n", " ")
324
331
  for col in row
325
332
  ]
326
333
  for row in item.data.grid
@@ -7,6 +7,7 @@
7
7
 
8
8
  from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
9
9
  from .document import (
10
+ AnyTableCell,
10
11
  BaseAnnotation,
11
12
  ChartBar,
12
13
  ChartLine,
@@ -52,6 +53,7 @@ from .document import (
52
53
  PictureTabularChartData,
53
54
  ProvenanceItem,
54
55
  RefItem,
56
+ RichTableCell,
55
57
  Script,
56
58
  SectionHeaderItem,
57
59
  TableCell,
@@ -3,7 +3,6 @@
3
3
  import base64
4
4
  import copy
5
5
  import hashlib
6
- import itertools
7
6
  import json
8
7
  import logging
9
8
  import mimetypes
@@ -35,7 +34,7 @@ from pydantic import (
35
34
  validate_call,
36
35
  )
37
36
  from tabulate import tabulate
38
- from typing_extensions import Annotated, Self, deprecated
37
+ from typing_extensions import Annotated, Self, deprecated, override
39
38
 
40
39
  from docling_core.search.package import VERSION_PATTERN
41
40
  from docling_core.types.base import _JSON_POINTER_REGEX
@@ -54,14 +53,14 @@ from docling_core.types.doc.labels import (
54
53
  GroupLabel,
55
54
  PictureClassificationLabel,
56
55
  )
57
- from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken
58
- from docling_core.types.doc.utils import relative_path
56
+ from docling_core.types.doc.tokens import DocumentToken, TableToken
57
+ from docling_core.types.doc.utils import parse_otsl_table_content, relative_path
59
58
 
60
59
  _logger = logging.getLogger(__name__)
61
60
 
62
61
  Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
63
62
  LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
64
- CURRENT_VERSION: Final = "1.5.0"
63
+ CURRENT_VERSION: Final = "1.6.0"
65
64
 
66
65
  DEFAULT_EXPORT_LABELS = {
67
66
  DocItemLabel.TITLE,
@@ -326,7 +325,7 @@ class TableCell(BaseModel):
326
325
  in data
327
326
  ):
328
327
  return data
329
- text = data["bbox"].get("token", "")
328
+ text = data.get("bbox", {}).get("token", "")
330
329
  if not len(text):
331
330
  text_cells = data.pop("text_cell_bboxes", None)
332
331
  if text_cells:
@@ -338,11 +337,39 @@ class TableCell(BaseModel):
338
337
 
339
338
  return data
340
339
 
340
+ def _get_text(self, doc: Optional["DoclingDocument"] = None, **kwargs: Any) -> str:
341
+ return self.text
342
+
343
+
344
+ class RichTableCell(TableCell):
345
+ """RichTableCell."""
346
+
347
+ ref: "RefItem"
348
+
349
+ @override
350
+ def _get_text(self, doc: Optional["DoclingDocument"] = None, **kwargs: Any) -> str:
351
+ from docling_core.transforms.serializer.markdown import MarkdownDocSerializer
352
+
353
+ if doc is not None:
354
+ doc_serializer = kwargs.pop(
355
+ "doc_serializer", MarkdownDocSerializer(doc=doc)
356
+ )
357
+ ser_res = doc_serializer.serialize(item=self.ref.resolve(doc=doc), **kwargs)
358
+ return ser_res.text
359
+ else:
360
+ return "<!-- rich cell -->"
361
+
362
+
363
+ AnyTableCell = Annotated[
364
+ Union[RichTableCell, TableCell],
365
+ Field(union_mode="left_to_right"),
366
+ ]
367
+
341
368
 
342
369
  class TableData(BaseModel): # TBD
343
370
  """BaseTableData."""
344
371
 
345
- table_cells: List[TableCell] = []
372
+ table_cells: List[AnyTableCell] = []
346
373
  num_rows: int = 0
347
374
  num_cols: int = 0
348
375
 
@@ -381,7 +408,9 @@ class TableData(BaseModel): # TBD
381
408
 
382
409
  return table_data
383
410
 
384
- def remove_rows(self, indices: List[int]) -> List[List[TableCell]]:
411
+ def remove_rows(
412
+ self, indices: List[int], doc: Optional["DoclingDocument"] = None
413
+ ) -> List[List[TableCell]]:
385
414
  """Remove rows from the table by their indices.
386
415
 
387
416
  :param indices: List[int]: A list of indices of the rows to remove. (Starting from 0)
@@ -393,6 +422,7 @@ class TableData(BaseModel): # TBD
393
422
 
394
423
  indices = sorted(indices, reverse=True)
395
424
 
425
+ refs_to_remove = []
396
426
  all_removed_cells = []
397
427
  for row_index in indices:
398
428
  if row_index < 0 or row_index >= self.num_rows:
@@ -404,6 +434,10 @@ class TableData(BaseModel): # TBD
404
434
  end_idx = start_idx + self.num_cols
405
435
  removed_cells = self.table_cells[start_idx:end_idx]
406
436
 
437
+ for cell in removed_cells:
438
+ if isinstance(cell, RichTableCell):
439
+ refs_to_remove.append(cell.ref)
440
+
407
441
  # Remove the cells from the table
408
442
  self.table_cells = self.table_cells[:start_idx] + self.table_cells[end_idx:]
409
443
 
@@ -418,9 +452,18 @@ class TableData(BaseModel): # TBD
418
452
 
419
453
  all_removed_cells.append(removed_cells)
420
454
 
455
+ if refs_to_remove:
456
+ if doc is None:
457
+ _logger.warning(
458
+ "When table contains rich cells, `doc` argument must be provided, "
459
+ "otherwise rich cell content will be left dangling."
460
+ )
461
+ else:
462
+ doc._delete_items(refs_to_remove)
463
+
421
464
  return all_removed_cells
422
465
 
423
- def pop_row(self) -> List[TableCell]:
466
+ def pop_row(self, doc: Optional["DoclingDocument"] = None) -> List[TableCell]:
424
467
  """Remove and return the last row from the table.
425
468
 
426
469
  :returns: List[TableCell]: A list of TableCell objects representing the popped row.
@@ -428,16 +471,18 @@ class TableData(BaseModel): # TBD
428
471
  if self.num_rows == 0:
429
472
  raise IndexError("Cannot pop from an empty table.")
430
473
 
431
- return self.remove_row(self.num_rows - 1)
474
+ return self.remove_row(self.num_rows - 1, doc=doc)
432
475
 
433
- def remove_row(self, row_index: int) -> List[TableCell]:
476
+ def remove_row(
477
+ self, row_index: int, doc: Optional["DoclingDocument"] = None
478
+ ) -> List[TableCell]:
434
479
  """Remove a row from the table by its index.
435
480
 
436
481
  :param row_index: int: The index of the row to remove. (Starting from 0)
437
482
 
438
483
  :returns: List[TableCell]: A list of TableCell objects representing the removed row.
439
484
  """
440
- return self.remove_rows([row_index])[0]
485
+ return self.remove_rows([row_index], doc=doc)[0]
441
486
 
442
487
  def insert_rows(
443
488
  self, row_index: int, rows: List[List[str]], after: bool = False
@@ -1510,8 +1555,15 @@ class TableItem(FloatingItem):
1510
1555
 
1511
1556
  annotations: List[TableAnnotationType] = []
1512
1557
 
1513
- def export_to_dataframe(self) -> pd.DataFrame:
1558
+ def export_to_dataframe(
1559
+ self, doc: Optional["DoclingDocument"] = None
1560
+ ) -> pd.DataFrame:
1514
1561
  """Export the table as a Pandas DataFrame."""
1562
+ if doc is None:
1563
+ _logger.warning(
1564
+ "Usage of TableItem.export_to_dataframe() without `doc` argument is deprecated."
1565
+ )
1566
+
1515
1567
  if self.data.num_rows == 0 or self.data.num_cols == 0:
1516
1568
  return pd.DataFrame()
1517
1569
 
@@ -1540,14 +1592,15 @@ class TableItem(FloatingItem):
1540
1592
  columns = ["" for _ in range(self.data.num_cols)]
1541
1593
  for i in range(num_headers):
1542
1594
  for j, cell in enumerate(self.data.grid[i]):
1543
- col_name = cell.text
1595
+ col_name = cell._get_text(doc=doc)
1544
1596
  if columns[j] != "":
1545
1597
  col_name = f".{col_name}"
1546
1598
  columns[j] += col_name
1547
1599
 
1548
1600
  # Create table data
1549
1601
  table_data = [
1550
- [cell.text for cell in row] for row in self.data.grid[num_headers:]
1602
+ [cell._get_text(doc=doc) for cell in row]
1603
+ for row in self.data.grid[num_headers:]
1551
1604
  ]
1552
1605
 
1553
1606
  # Create DataFrame
@@ -1578,7 +1631,7 @@ class TableItem(FloatingItem):
1578
1631
 
1579
1632
  # make sure that md tables are not broken
1580
1633
  # due to newline chars in the text
1581
- text = col.text
1634
+ text = col._get_text(doc=doc)
1582
1635
  text = text.replace("\n", " ")
1583
1636
  tmp.append(text)
1584
1637
 
@@ -1624,6 +1677,7 @@ class TableItem(FloatingItem):
1624
1677
  add_cell_text: bool = True,
1625
1678
  xsize: int = 500,
1626
1679
  ysize: int = 500,
1680
+ **kwargs: Any,
1627
1681
  ) -> str:
1628
1682
  """Export the table as OTSL."""
1629
1683
  # Possible OTSL tokens...
@@ -1640,6 +1694,9 @@ class TableItem(FloatingItem):
1640
1694
  # Headers (column, row, section row):
1641
1695
  # "ched", "rhed", "srow"
1642
1696
 
1697
+ from docling_core.transforms.serializer.doctags import DocTagsDocSerializer
1698
+
1699
+ doc_serializer = DocTagsDocSerializer(doc=doc)
1643
1700
  body = []
1644
1701
  nrows = self.data.num_rows
1645
1702
  ncols = self.data.num_cols
@@ -1653,7 +1710,9 @@ class TableItem(FloatingItem):
1653
1710
  for i in range(nrows):
1654
1711
  for j in range(ncols):
1655
1712
  cell: TableCell = self.data.grid[i][j]
1656
- content = cell.text.strip()
1713
+ content = cell._get_text(
1714
+ doc=doc, doc_serializer=doc_serializer, **kwargs
1715
+ ).strip()
1657
1716
  rowspan, rowstart = (
1658
1717
  cell.row_span,
1659
1718
  cell.start_row_offset_idx,
@@ -2305,6 +2364,15 @@ class DoclingDocument(BaseModel):
2305
2364
  refs_to_be_deleted=refs_to_be_deleted,
2306
2365
  lookup=lookup,
2307
2366
  )
2367
+ if isinstance(node, TableItem):
2368
+ for cell in node.data.table_cells:
2369
+ if isinstance(cell, RichTableCell):
2370
+ path = cell.ref._split_ref_to_path()
2371
+ cell.ref = self._update_ref_with_lookup(
2372
+ item_label=path[1],
2373
+ item_index=int(path[2]),
2374
+ lookup=lookup,
2375
+ )
2308
2376
 
2309
2377
  # Update the self_ref reference
2310
2378
  if node.parent is not None:
@@ -3946,16 +4014,22 @@ class DoclingDocument(BaseModel):
3946
4014
  """num_pages."""
3947
4015
  return len(self.pages.values())
3948
4016
 
3949
- def validate_tree(self, root) -> bool:
4017
+ def validate_tree(self, root: NodeItem) -> bool:
3950
4018
  """validate_tree."""
3951
- res = []
3952
4019
  for child_ref in root.children:
3953
4020
  child = child_ref.resolve(self)
3954
- if child.parent.resolve(self) != root:
4021
+ if child.parent.resolve(self) != root or not self.validate_tree(child):
3955
4022
  return False
3956
- res.append(self.validate_tree(child))
3957
4023
 
3958
- return all(res) or len(res) == 0
4024
+ if isinstance(root, TableItem):
4025
+ for cell in root.data.table_cells:
4026
+ if isinstance(cell, RichTableCell) and (
4027
+ (par_ref := cell.ref.resolve(self).parent) is None
4028
+ or par_ref.resolve(self) != root
4029
+ ):
4030
+ return False
4031
+
4032
+ return True
3959
4033
 
3960
4034
  def iterate_items(
3961
4035
  self,
@@ -3964,7 +4038,7 @@ class DoclingDocument(BaseModel):
3964
4038
  traverse_pictures: bool = False,
3965
4039
  page_no: Optional[int] = None,
3966
4040
  included_content_layers: Optional[set[ContentLayer]] = None,
3967
- _level: int = 0, # fixed parameter, carries through the node nesting level
4041
+ _level: int = 0, # deprecated
3968
4042
  ) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
3969
4043
  """Iterate elements with level."""
3970
4044
  for item, stack in self._iterate_items_with_stack(
@@ -4688,181 +4762,6 @@ class DoclingDocument(BaseModel):
4688
4762
  bbox = None
4689
4763
  return caption_item, bbox
4690
4764
 
4691
- def otsl_parse_texts(texts, tokens):
4692
- split_word = TableToken.OTSL_NL.value
4693
- # CLEAN tokens from extra tags, only structural OTSL allowed
4694
- clean_tokens = []
4695
- for t in tokens:
4696
- if t in [
4697
- TableToken.OTSL_ECEL.value,
4698
- TableToken.OTSL_FCEL.value,
4699
- TableToken.OTSL_LCEL.value,
4700
- TableToken.OTSL_UCEL.value,
4701
- TableToken.OTSL_XCEL.value,
4702
- TableToken.OTSL_NL.value,
4703
- TableToken.OTSL_CHED.value,
4704
- TableToken.OTSL_RHED.value,
4705
- TableToken.OTSL_SROW.value,
4706
- ]:
4707
- clean_tokens.append(t)
4708
- tokens = clean_tokens
4709
- split_row_tokens = [
4710
- list(y)
4711
- for x, y in itertools.groupby(tokens, lambda z: z == split_word)
4712
- if not x
4713
- ]
4714
-
4715
- table_cells = []
4716
- r_idx = 0
4717
- c_idx = 0
4718
-
4719
- def count_right(tokens, c_idx, r_idx, which_tokens):
4720
- span = 0
4721
- c_idx_iter = c_idx
4722
- while tokens[r_idx][c_idx_iter] in which_tokens:
4723
- c_idx_iter += 1
4724
- span += 1
4725
- if c_idx_iter >= len(tokens[r_idx]):
4726
- return span
4727
- return span
4728
-
4729
- def count_down(tokens, c_idx, r_idx, which_tokens):
4730
- span = 0
4731
- r_idx_iter = r_idx
4732
- while tokens[r_idx_iter][c_idx] in which_tokens:
4733
- r_idx_iter += 1
4734
- span += 1
4735
- if r_idx_iter >= len(tokens):
4736
- return span
4737
- return span
4738
-
4739
- for i, text in enumerate(texts):
4740
- cell_text = ""
4741
- if text in [
4742
- TableToken.OTSL_FCEL.value,
4743
- TableToken.OTSL_ECEL.value,
4744
- TableToken.OTSL_CHED.value,
4745
- TableToken.OTSL_RHED.value,
4746
- TableToken.OTSL_SROW.value,
4747
- ]:
4748
- row_span = 1
4749
- col_span = 1
4750
- right_offset = 1
4751
- if text != TableToken.OTSL_ECEL.value:
4752
- cell_text = texts[i + 1]
4753
- right_offset = 2
4754
-
4755
- # Check next element(s) for lcel / ucel / xcel,
4756
- # set properly row_span, col_span
4757
- next_right_cell = ""
4758
- if i + right_offset < len(texts):
4759
- next_right_cell = texts[i + right_offset]
4760
-
4761
- next_bottom_cell = ""
4762
- if r_idx + 1 < len(split_row_tokens):
4763
- if c_idx < len(split_row_tokens[r_idx + 1]):
4764
- next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
4765
-
4766
- if next_right_cell in [
4767
- TableToken.OTSL_LCEL.value,
4768
- TableToken.OTSL_XCEL.value,
4769
- ]:
4770
- # we have horisontal spanning cell or 2d spanning cell
4771
- col_span += count_right(
4772
- split_row_tokens,
4773
- c_idx + 1,
4774
- r_idx,
4775
- [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
4776
- )
4777
- if next_bottom_cell in [
4778
- TableToken.OTSL_UCEL.value,
4779
- TableToken.OTSL_XCEL.value,
4780
- ]:
4781
- # we have a vertical spanning cell or 2d spanning cell
4782
- row_span += count_down(
4783
- split_row_tokens,
4784
- c_idx,
4785
- r_idx + 1,
4786
- [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
4787
- )
4788
-
4789
- table_cells.append(
4790
- TableCell(
4791
- text=cell_text.strip(),
4792
- row_span=row_span,
4793
- col_span=col_span,
4794
- start_row_offset_idx=r_idx,
4795
- end_row_offset_idx=r_idx + row_span,
4796
- start_col_offset_idx=c_idx,
4797
- end_col_offset_idx=c_idx + col_span,
4798
- )
4799
- )
4800
- if text in [
4801
- TableToken.OTSL_FCEL.value,
4802
- TableToken.OTSL_ECEL.value,
4803
- TableToken.OTSL_CHED.value,
4804
- TableToken.OTSL_RHED.value,
4805
- TableToken.OTSL_SROW.value,
4806
- TableToken.OTSL_LCEL.value,
4807
- TableToken.OTSL_UCEL.value,
4808
- TableToken.OTSL_XCEL.value,
4809
- ]:
4810
- c_idx += 1
4811
- if text == TableToken.OTSL_NL.value:
4812
- r_idx += 1
4813
- c_idx = 0
4814
- return table_cells, split_row_tokens
4815
-
4816
- def otsl_extract_tokens_and_text(s: str):
4817
- # Pattern to match anything enclosed by < >
4818
- # (including the angle brackets themselves)
4819
- pattern = r"(<[^>]+>)"
4820
- # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
4821
- tokens = re.findall(pattern, s)
4822
- # Remove any tokens that start with "<loc_"
4823
- tokens = [
4824
- token
4825
- for token in tokens
4826
- if not (
4827
- token.startswith(rf"<{_LOC_PREFIX}")
4828
- or token
4829
- in [
4830
- rf"<{DocumentToken.OTSL.value}>",
4831
- rf"</{DocumentToken.OTSL.value}>",
4832
- ]
4833
- )
4834
- ]
4835
- # Split the string by those tokens to get the in-between text
4836
- text_parts = re.split(pattern, s)
4837
- text_parts = [
4838
- token
4839
- for token in text_parts
4840
- if not (
4841
- token.startswith(rf"<{_LOC_PREFIX}")
4842
- or token
4843
- in [
4844
- rf"<{DocumentToken.OTSL.value}>",
4845
- rf"</{DocumentToken.OTSL.value}>",
4846
- ]
4847
- )
4848
- ]
4849
- # Remove any empty or purely whitespace strings from text_parts
4850
- text_parts = [part for part in text_parts if part.strip()]
4851
-
4852
- return tokens, text_parts
4853
-
4854
- def parse_table_content(otsl_content: str) -> TableData:
4855
- tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
4856
- table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
4857
-
4858
- return TableData(
4859
- num_rows=len(split_row_tokens),
4860
- num_cols=(
4861
- max(len(row) for row in split_row_tokens) if split_row_tokens else 0
4862
- ),
4863
- table_cells=table_cells,
4864
- )
4865
-
4866
4765
  def extract_chart_type(text_chunk: str):
4867
4766
  label = None
4868
4767
  chart_labels = [
@@ -5094,7 +4993,7 @@ class DoclingDocument(BaseModel):
5094
4993
  doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.TEXT)
5095
4994
 
5096
4995
  if tag_name == DocumentToken.OTSL.value:
5097
- table_data = parse_table_content(full_chunk)
4996
+ table_data = parse_otsl_table_content(full_chunk)
5098
4997
  caption, caption_bbox = extract_caption(full_chunk)
5099
4998
  if caption is not None and caption_bbox is not None:
5100
4999
  caption.prov.append(
@@ -5137,7 +5036,7 @@ class DoclingDocument(BaseModel):
5137
5036
  table_data = None
5138
5037
  chart_type = None
5139
5038
  if tag_name == DocumentToken.CHART.value:
5140
- table_data = parse_table_content(full_chunk)
5039
+ table_data = parse_otsl_table_content(full_chunk)
5141
5040
  chart_type = extract_chart_type(full_chunk)
5142
5041
  if image:
5143
5042
  if bbox:
@@ -5500,7 +5399,9 @@ class DoclingDocument(BaseModel):
5500
5399
  grid.append([])
5501
5400
  for j, cell in enumerate(row):
5502
5401
  if j < 10:
5503
- text = get_text(text=cell.text, max_text_len=16)
5402
+ text = get_text(
5403
+ cell._get_text(doc=self), max_text_len=16
5404
+ )
5504
5405
  grid[-1].append(text)
5505
5406
 
5506
5407
  result.append("\n" + tabulate(grid) + "\n")
@@ -5683,69 +5584,196 @@ class DoclingDocument(BaseModel):
5683
5584
  )
5684
5585
  return self
5685
5586
 
5587
+ class _DocIndex(BaseModel):
5588
+ """A document merge buffer."""
5589
+
5590
+ groups: list[GroupItem] = []
5591
+ texts: list[TextItem] = []
5592
+ pictures: list[PictureItem] = []
5593
+ tables: list[TableItem] = []
5594
+ key_value_items: list[KeyValueItem] = []
5595
+ form_items: list[FormItem] = []
5596
+
5597
+ pages: dict[int, PageItem] = {}
5598
+
5599
+ _body: Optional[GroupItem] = None
5600
+ _max_page: int = 0
5601
+ _names: list[str] = []
5602
+
5603
+ def get_item_list(self, key: str) -> list[NodeItem]:
5604
+ return getattr(self, key)
5605
+
5606
+ def index(self, doc: "DoclingDocument") -> None:
5607
+
5608
+ orig_ref_to_new_ref: dict[str, str] = {}
5609
+ page_delta = self._max_page - min(doc.pages.keys()) + 1 if doc.pages else 0
5610
+
5611
+ if self._body is None:
5612
+ self._body = GroupItem(**doc.body.model_dump(exclude={"children"}))
5613
+
5614
+ self._names.append(doc.name)
5615
+
5616
+ # collect items in traversal order
5617
+ for item, _ in doc.iterate_items(
5618
+ with_groups=True,
5619
+ traverse_pictures=True,
5620
+ included_content_layers={c for c in ContentLayer},
5621
+ ):
5622
+ key = item.self_ref.split("/")[1]
5623
+ is_body = key == "body"
5624
+ new_cref = (
5625
+ "#/body" if is_body else f"#/{key}/{len(self.get_item_list(key))}"
5626
+ )
5627
+ # register cref mapping:
5628
+ orig_ref_to_new_ref[item.self_ref] = new_cref
5629
+
5630
+ if not is_body:
5631
+ new_item = copy.deepcopy(item)
5632
+ new_item.children = []
5633
+
5634
+ # put item in the right list
5635
+ self.get_item_list(key).append(new_item)
5636
+
5637
+ # update item's self reference
5638
+ new_item.self_ref = new_cref
5639
+
5640
+ if isinstance(new_item, DocItem):
5641
+ # update page numbers
5642
+ # NOTE other prov sources (e.g. GraphCell) currently not covered
5643
+ for prov in new_item.prov:
5644
+ prov.page_no += page_delta
5645
+
5646
+ if item.parent:
5647
+ # set item's parent
5648
+ new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
5649
+ new_item.parent = RefItem(cref=new_parent_cref)
5650
+
5651
+ # add item to parent's children
5652
+ path_components = new_parent_cref.split("/")
5653
+ num_components = len(path_components)
5654
+ if num_components == 3:
5655
+ _, parent_key, parent_index_str = path_components
5656
+ parent_index = int(parent_index_str)
5657
+ parent_item = self.get_item_list(parent_key)[parent_index]
5658
+
5659
+ # update captions field (not possible in iterate_items order):
5660
+ if isinstance(parent_item, FloatingItem):
5661
+ for cap_it, cap in enumerate(parent_item.captions):
5662
+ if cap.cref == item.self_ref:
5663
+ parent_item.captions[cap_it] = RefItem(
5664
+ cref=new_cref
5665
+ )
5666
+ break
5667
+
5668
+ # update rich table cells references:
5669
+ if isinstance(parent_item, TableItem):
5670
+ for cell in parent_item.data.table_cells:
5671
+ if (
5672
+ isinstance(cell, RichTableCell)
5673
+ and cell.ref.cref == item.self_ref
5674
+ ):
5675
+ cell.ref.cref = new_cref
5676
+ break
5677
+
5678
+ elif num_components == 2 and path_components[1] == "body":
5679
+ parent_item = self._body
5680
+ else:
5681
+ raise RuntimeError(
5682
+ f"Unsupported ref format: {new_parent_cref}"
5683
+ )
5684
+ parent_item.children.append(RefItem(cref=new_cref))
5685
+
5686
+ # update pages
5687
+ new_max_page = None
5688
+ for page_nr in doc.pages:
5689
+ new_page = copy.deepcopy(doc.pages[page_nr])
5690
+ new_page_nr = page_nr + page_delta
5691
+ new_page.page_no = new_page_nr
5692
+ self.pages[new_page_nr] = new_page
5693
+ if new_max_page is None or new_page_nr > new_max_page:
5694
+ new_max_page = new_page_nr
5695
+ if new_max_page is not None:
5696
+ self._max_page = new_max_page
5697
+
5698
+ def get_name(self) -> str:
5699
+ return " + ".join(self._names)
5700
+
5701
+ def _update_from_index(self, doc_index: "_DocIndex") -> None:
5702
+ if doc_index._body is not None:
5703
+ self.body = doc_index._body
5704
+ self.groups = doc_index.groups
5705
+ self.texts = doc_index.texts
5706
+ self.pictures = doc_index.pictures
5707
+ self.tables = doc_index.tables
5708
+ self.key_value_items = doc_index.key_value_items
5709
+ self.form_items = doc_index.form_items
5710
+ self.pages = doc_index.pages
5711
+ self.name = doc_index.get_name()
5712
+
5686
5713
  def _normalize_references(self) -> None:
5687
- """Normalize ref numbering by ordering node items as per iterate_items()."""
5688
- new_body = GroupItem(**self.body.model_dump(exclude={"children"}))
5689
-
5690
- item_lists: dict[str, list[NodeItem]] = {
5691
- "groups": [],
5692
- "texts": [],
5693
- "pictures": [],
5694
- "tables": [],
5695
- "key_value_items": [],
5696
- "form_items": [],
5697
- }
5698
- orig_ref_to_new_ref: dict[str, str] = {}
5714
+ doc_index = DoclingDocument._DocIndex()
5715
+ doc_index.index(doc=self)
5716
+ self._update_from_index(doc_index)
5717
+
5718
+ @classmethod
5719
+ def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument":
5720
+ """Concatenate multiple documents into a single document."""
5721
+ doc_index = DoclingDocument._DocIndex()
5722
+ for doc in docs:
5723
+ doc_index.index(doc=doc)
5724
+
5725
+ res_doc = DoclingDocument(name=" + ".join([doc.name for doc in docs]))
5726
+ res_doc._update_from_index(doc_index)
5727
+ return res_doc
5728
+
5729
+ def _validate_rules(self):
5730
+ def validate_list_group(doc: DoclingDocument, item: ListGroup):
5731
+ for ref in item.children:
5732
+ child = ref.resolve(doc)
5733
+ if not isinstance(child, ListItem):
5734
+ raise ValueError(
5735
+ f"ListGroup {item.self_ref} contains non-ListItem {child.self_ref} ({child.label=})"
5736
+ )
5737
+
5738
+ def validate_list_item(doc: DoclingDocument, item: ListItem):
5739
+ if item.parent is None:
5740
+ raise ValueError(f"ListItem {item.self_ref} has no parent")
5741
+ if not isinstance(item.parent.resolve(doc), ListGroup):
5742
+ raise ValueError(
5743
+ f"ListItem {item.self_ref} has non-ListGroup parent: {item.parent.cref}"
5744
+ )
5745
+
5746
+ def validate_group(doc: DoclingDocument, item: GroupItem):
5747
+ if (
5748
+ item.parent and not item.children
5749
+ ): # tolerate empty body, but not other groups
5750
+ raise ValueError(f"Group {item.self_ref} has no children")
5699
5751
 
5700
- # collect items in traversal order
5701
5752
  for item, _ in self.iterate_items(
5702
5753
  with_groups=True,
5703
5754
  traverse_pictures=True,
5704
5755
  included_content_layers={c for c in ContentLayer},
5705
5756
  ):
5706
- key = item.self_ref.split("/")[1]
5707
- is_body = key == "body"
5708
- new_cref = "#/body" if is_body else f"#/{key}/{len(item_lists[key])}"
5709
- # register cref mapping:
5710
- orig_ref_to_new_ref[item.self_ref] = new_cref
5711
-
5712
- if not is_body:
5713
- new_item = copy.deepcopy(item)
5714
- new_item.children = []
5715
-
5716
- # put item in the right list
5717
- item_lists[key].append(new_item)
5718
-
5719
- # update item's self reference
5720
- new_item.self_ref = new_cref
5721
-
5722
- if item.parent:
5723
- # set item's parent
5724
- new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
5725
- new_item.parent = RefItem(cref=new_parent_cref)
5726
-
5727
- # add item to parent's children
5728
- path_components = new_parent_cref.split("/")
5729
- num_components = len(path_components)
5730
- parent_node: NodeItem
5731
- if num_components == 3:
5732
- _, parent_key, parent_index_str = path_components
5733
- parent_index = int(parent_index_str)
5734
- parent_node = item_lists[parent_key][parent_index]
5735
- elif num_components == 2 and path_components[1] == "body":
5736
- parent_node = new_body
5737
- else:
5738
- raise RuntimeError(f"Unsupported ref format: {new_parent_cref}")
5739
- parent_node.children.append(RefItem(cref=new_cref))
5740
-
5741
- # update document
5742
- self.groups = item_lists["groups"] # type: ignore
5743
- self.texts = item_lists["texts"] # type: ignore
5744
- self.pictures = item_lists["pictures"] # type: ignore
5745
- self.tables = item_lists["tables"] # type: ignore
5746
- self.key_value_items = item_lists["key_value_items"] # type: ignore
5747
- self.form_items = item_lists["form_items"] # type: ignore
5748
- self.body = new_body
5757
+ if isinstance(item, ListGroup):
5758
+ validate_list_group(self, item)
5759
+
5760
+ elif isinstance(item, GroupItem):
5761
+ validate_group(self, item)
5762
+
5763
+ elif isinstance(item, ListItem):
5764
+ validate_list_item(self, item)
5765
+
5766
+ def add_table_cell(self, table_item: TableItem, cell: TableCell) -> None:
5767
+ """Add a table cell to the table."""
5768
+ if isinstance(cell, RichTableCell):
5769
+ item = cell.ref.resolve(doc=self)
5770
+ if isinstance(item, NodeItem) and (
5771
+ (not item.parent) or item.parent.cref != table_item.self_ref
5772
+ ):
5773
+ raise ValueError(
5774
+ f"Trying to add cell with another parent {item.parent} to {table_item.self_ref}"
5775
+ )
5776
+ table_item.data.table_cells.append(cell)
5749
5777
 
5750
5778
 
5751
5779
  # deprecated aliases (kept for backwards compatibility):
@@ -6,9 +6,16 @@
6
6
  """Utils for document types."""
7
7
 
8
8
  import html
9
+ import itertools
10
+ import re
9
11
  import unicodedata
10
12
  from pathlib import Path
11
- from typing import Optional
13
+ from typing import TYPE_CHECKING, List, Optional, Tuple
14
+
15
+ from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken
16
+
17
+ if TYPE_CHECKING:
18
+ from docling_core.types.doc.document import TableCell, TableData
12
19
 
13
20
 
14
21
  def relative_path(src: Path, target: Path) -> Path:
@@ -84,3 +91,192 @@ def get_text_direction(text: str) -> str:
84
91
  or rtl_chars > len(text) / 2
85
92
  else "ltr"
86
93
  )
94
+
95
+
96
+ def otsl_extract_tokens_and_text(s: str) -> Tuple[List[str], List[str]]:
97
+ """Extract OTSL tokens and text from an OTSL string."""
98
+ # Pattern to match anything enclosed by < >
99
+ # (including the angle brackets themselves)
100
+ pattern = r"(<[^>]+>)"
101
+ # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
102
+ tokens = re.findall(pattern, s)
103
+ # Remove any tokens that start with "<loc_"
104
+ tokens = [
105
+ token
106
+ for token in tokens
107
+ if not (
108
+ token.startswith(rf"<{_LOC_PREFIX}")
109
+ or token
110
+ in [
111
+ rf"<{DocumentToken.OTSL.value}>",
112
+ rf"</{DocumentToken.OTSL.value}>",
113
+ ]
114
+ )
115
+ ]
116
+ # Split the string by those tokens to get the in-between text
117
+ text_parts = re.split(pattern, s)
118
+ text_parts = [
119
+ token
120
+ for token in text_parts
121
+ if not (
122
+ token.startswith(rf"<{_LOC_PREFIX}")
123
+ or token
124
+ in [
125
+ rf"<{DocumentToken.OTSL.value}>",
126
+ rf"</{DocumentToken.OTSL.value}>",
127
+ ]
128
+ )
129
+ ]
130
+ # Remove any empty or purely whitespace strings from text_parts
131
+ text_parts = [part for part in text_parts if part.strip()]
132
+
133
+ return tokens, text_parts
134
+
135
+
136
+ def otsl_parse_texts(
137
+ texts: List[str], tokens: List[str]
138
+ ) -> Tuple[List["TableCell"], List[List[str]]]:
139
+ """Parse OTSL texts and tokens into table cells."""
140
+ from docling_core.types.doc.document import TableCell
141
+
142
+ split_word = TableToken.OTSL_NL.value
143
+ # CLEAN tokens from extra tags, only structural OTSL allowed
144
+ clean_tokens = []
145
+ for t in tokens:
146
+ if t in [
147
+ TableToken.OTSL_ECEL.value,
148
+ TableToken.OTSL_FCEL.value,
149
+ TableToken.OTSL_LCEL.value,
150
+ TableToken.OTSL_UCEL.value,
151
+ TableToken.OTSL_XCEL.value,
152
+ TableToken.OTSL_NL.value,
153
+ TableToken.OTSL_CHED.value,
154
+ TableToken.OTSL_RHED.value,
155
+ TableToken.OTSL_SROW.value,
156
+ ]:
157
+ clean_tokens.append(t)
158
+ tokens = clean_tokens
159
+ split_row_tokens = [
160
+ list(y)
161
+ for x, y in itertools.groupby(tokens, lambda z: z == split_word)
162
+ if not x
163
+ ]
164
+
165
+ table_cells = []
166
+ r_idx = 0
167
+ c_idx = 0
168
+
169
+ def count_right(
170
+ tokens: List[List[str]], c_idx: int, r_idx: int, which_tokens: List[str]
171
+ ) -> int:
172
+ span = 0
173
+ c_idx_iter = c_idx
174
+ while tokens[r_idx][c_idx_iter] in which_tokens:
175
+ c_idx_iter += 1
176
+ span += 1
177
+ if c_idx_iter >= len(tokens[r_idx]):
178
+ return span
179
+ return span
180
+
181
+ def count_down(
182
+ tokens: List[List[str]], c_idx: int, r_idx: int, which_tokens: List[str]
183
+ ) -> int:
184
+ span = 0
185
+ r_idx_iter = r_idx
186
+ while tokens[r_idx_iter][c_idx] in which_tokens:
187
+ r_idx_iter += 1
188
+ span += 1
189
+ if r_idx_iter >= len(tokens):
190
+ return span
191
+ return span
192
+
193
+ for i, text in enumerate(texts):
194
+ cell_text = ""
195
+ if text in [
196
+ TableToken.OTSL_FCEL.value,
197
+ TableToken.OTSL_ECEL.value,
198
+ TableToken.OTSL_CHED.value,
199
+ TableToken.OTSL_RHED.value,
200
+ TableToken.OTSL_SROW.value,
201
+ ]:
202
+ row_span = 1
203
+ col_span = 1
204
+ right_offset = 1
205
+ if text != TableToken.OTSL_ECEL.value:
206
+ cell_text = texts[i + 1]
207
+ right_offset = 2
208
+
209
+ # Check next element(s) for lcel / ucel / xcel,
210
+ # set properly row_span, col_span
211
+ next_right_cell = ""
212
+ if i + right_offset < len(texts):
213
+ next_right_cell = texts[i + right_offset]
214
+
215
+ next_bottom_cell = ""
216
+ if r_idx + 1 < len(split_row_tokens):
217
+ if c_idx < len(split_row_tokens[r_idx + 1]):
218
+ next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
219
+
220
+ if next_right_cell in [
221
+ TableToken.OTSL_LCEL.value,
222
+ TableToken.OTSL_XCEL.value,
223
+ ]:
224
+ # we have horizontal spanning cell or 2d spanning cell
225
+ col_span += count_right(
226
+ split_row_tokens,
227
+ c_idx + 1,
228
+ r_idx,
229
+ [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
230
+ )
231
+ if next_bottom_cell in [
232
+ TableToken.OTSL_UCEL.value,
233
+ TableToken.OTSL_XCEL.value,
234
+ ]:
235
+ # we have a vertical spanning cell or 2d spanning cell
236
+ row_span += count_down(
237
+ split_row_tokens,
238
+ c_idx,
239
+ r_idx + 1,
240
+ [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
241
+ )
242
+
243
+ table_cells.append(
244
+ TableCell(
245
+ text=cell_text.strip(),
246
+ row_span=row_span,
247
+ col_span=col_span,
248
+ start_row_offset_idx=r_idx,
249
+ end_row_offset_idx=r_idx + row_span,
250
+ start_col_offset_idx=c_idx,
251
+ end_col_offset_idx=c_idx + col_span,
252
+ )
253
+ )
254
+ if text in [
255
+ TableToken.OTSL_FCEL.value,
256
+ TableToken.OTSL_ECEL.value,
257
+ TableToken.OTSL_CHED.value,
258
+ TableToken.OTSL_RHED.value,
259
+ TableToken.OTSL_SROW.value,
260
+ TableToken.OTSL_LCEL.value,
261
+ TableToken.OTSL_UCEL.value,
262
+ TableToken.OTSL_XCEL.value,
263
+ ]:
264
+ c_idx += 1
265
+ if text == TableToken.OTSL_NL.value:
266
+ r_idx += 1
267
+ c_idx = 0
268
+ return table_cells, split_row_tokens
269
+
270
+
271
+ def parse_otsl_table_content(otsl_content: str) -> "TableData":
272
+ """Parse OTSL content into TableData."""
273
+ from docling_core.types.doc.document import TableData
274
+
275
+ tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
276
+ table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
277
+
278
+ return TableData(
279
+ num_rows=len(split_row_tokens),
280
+ num_cols=(max(len(row) for row in split_row_tokens) if split_row_tokens else 0),
281
+ table_cells=table_cells,
282
+ )
@@ -252,7 +252,7 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f
252
252
 
253
253
  spans = list(_make_spans(cell, item))
254
254
  table_data[i][j] = GlmTableCell(
255
- text=cell.text,
255
+ text=cell._get_text(doc=doc),
256
256
  bbox=(
257
257
  cell.bbox.as_tuple()
258
258
  if cell.bbox is not None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.44.2
3
+ Version: 2.46.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -28,11 +28,11 @@ docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=aZ_RNQIzcNkAHGHZ
28
28
  docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
29
29
  docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
30
30
  docling_core/transforms/serializer/base.py,sha256=TI8Epj7gyxdTet9j-Rs4o5U09gfACfAIVoirlschviM,7266
31
- docling_core/transforms/serializer/common.py,sha256=0TNEGoA_rJ-qkVYp-X8SMUr3jTrbf6TRzPzwufYh5JM,19114
32
- docling_core/transforms/serializer/doctags.py,sha256=TD0yAm1qSVy-GsE6svpUAI-Yqjcf2rrTZ3ac9YU3gbE,19858
33
- docling_core/transforms/serializer/html.py,sha256=KnSMjtNZlBMfkuhtgB8T70iQSTfG_E8FFDfVRRo9WNs,38087
31
+ docling_core/transforms/serializer/common.py,sha256=RwfdzZ9FRSHQjKM0vskg1CVqar0Z_ms38arSlLAgITc,19150
32
+ docling_core/transforms/serializer/doctags.py,sha256=VXPjAZPhBur7LaEeuqH9k31TgZWSN32lK8z8rJXzFwY,19935
33
+ docling_core/transforms/serializer/html.py,sha256=GRfRaqFIb4FXRMplB4Agl4fSNa5jsHV7P4tBtFMro9I,38453
34
34
  docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
35
- docling_core/transforms/serializer/markdown.py,sha256=VwonuAkuOPmQM7ibDIGvQBHOqhTcTJ_t187fLQQiNPo,23951
35
+ docling_core/transforms/serializer/markdown.py,sha256=hilGM1yWpbbRTjuEjfBRrhavspD5vFF_6SDvlKx8BrM,24230
36
36
  docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
37
37
  docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
38
38
  docling_core/transforms/visualizer/key_value_visualizer.py,sha256=fp7nFLy4flOSiavdRgg5y1Mu7WVLIDGh1zEHsq8kgVM,8979
@@ -41,13 +41,13 @@ docling_core/transforms/visualizer/reading_order_visualizer.py,sha256=muqmaxOBao
41
41
  docling_core/transforms/visualizer/table_visualizer.py,sha256=iJPjk-XQSSCH3oujcjPMz-redAwNNHseZ41lFyd-u3k,8097
42
42
  docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
43
43
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
44
- docling_core/types/doc/__init__.py,sha256=8hOhm5W9mArf3zwgfoMxDs1pHizhLFSAZlLu1tPBBRk,1641
44
+ docling_core/types/doc/__init__.py,sha256=Vsl3oJV3_BLpS7rIwvahhcWOwmEBvj7ZbQzQCCl-IQk,1678
45
45
  docling_core/types/doc/base.py,sha256=i98y4IF250adR-8BSS374K90fwfwG-vBfWh14tLC5Cs,15906
46
- docling_core/types/doc/document.py,sha256=-cL4eGFRbQHgXAsCG8zALxAx-IoanvkqG5E1zvKOMxI,201012
46
+ docling_core/types/doc/document.py,sha256=Ab-JOc6fkzocXP3PcxPRXJPjLOhOTYo_0571vSr6VXo,202093
47
47
  docling_core/types/doc/labels.py,sha256=-W1-LW6z0J9F9ExJqR0Wd1WeqWTaY3Unm-j1UkQGlC4,7330
48
48
  docling_core/types/doc/page.py,sha256=35h1xdtCM3-AaN8Dim9jDseZIiw-3GxpB-ofF-H2rQQ,41878
49
49
  docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
50
- docling_core/types/doc/utils.py,sha256=JpAi7x9DHksFlIj_gRJPcSZOHa8AHvVPEO_K9aSnw4c,2608
50
+ docling_core/types/doc/utils.py,sha256=wKC9SJgS4ZKdoYPAlNuRyncv9RIEewzVCBmwbUmbA6E,9106
51
51
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
52
52
  docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
53
53
  docling_core/types/io/__init__.py,sha256=7QYvFRaDE0AzBg8e7tvsVNlLBbCbAbQ9rP2TU8aXR1k,350
@@ -73,12 +73,12 @@ docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,8
73
73
  docling_core/utils/file.py,sha256=CSNclJGL2OwLIc8DQFdoLxr22FUc4_UC7zS6pNrFfkQ,6858
74
74
  docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
75
75
  docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
76
- docling_core/utils/legacy.py,sha256=5lghO48OEcV9V51tRnH3YSKgLtdqhr-Q5C_OcJZ8TOs,24392
76
+ docling_core/utils/legacy.py,sha256=G7ed8fkBpIO8hG3DKEY83cHsrKJHyvDst_1jSdgBXMI,24406
77
77
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
78
78
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
79
- docling_core-2.44.2.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
80
- docling_core-2.44.2.dist-info/METADATA,sha256=IZWVMKuPPpzd3ksiFXTPUu3FSw13zuwa5qyaLWlBEyY,6453
81
- docling_core-2.44.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
82
- docling_core-2.44.2.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
83
- docling_core-2.44.2.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
84
- docling_core-2.44.2.dist-info/RECORD,,
79
+ docling_core-2.46.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
80
+ docling_core-2.46.0.dist-info/METADATA,sha256=txMHh-7y8N3RiJ_M_HbrsvzRyGPJVXv8UcA6_DpAfok,6453
81
+ docling_core-2.46.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
82
+ docling_core-2.46.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
83
+ docling_core-2.46.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
84
+ docling_core-2.46.0.dist-info/RECORD,,