docling-core 2.45.0__py3-none-any.whl → 2.47.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -359,6 +359,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
359
359
  item=item,
360
360
  doc_serializer=self,
361
361
  doc=self.doc,
362
+ visited=my_visited,
362
363
  **my_kwargs,
363
364
  )
364
365
  elif isinstance(item, PictureItem):
@@ -157,6 +157,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
157
157
  item: TableItem,
158
158
  doc_serializer: BaseDocSerializer,
159
159
  doc: DoclingDocument,
160
+ visited: Optional[set[str]] = None,
160
161
  **kwargs: Any,
161
162
  ) -> SerializationResult:
162
163
  """Serializes the passed item."""
@@ -179,6 +180,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
179
180
  add_cell_text=params.add_table_cell_text,
180
181
  xsize=params.xsize,
181
182
  ysize=params.ysize,
183
+ visited=visited,
182
184
  )
183
185
  res_parts.append(create_ser_result(text=otsl_text, span_source=item))
184
186
 
@@ -65,8 +65,8 @@ from docling_core.types.doc.document import (
65
65
  PictureItem,
66
66
  PictureMoleculeData,
67
67
  PictureTabularChartData,
68
+ RichTableCell,
68
69
  SectionHeaderItem,
69
- TableCell,
70
70
  TableItem,
71
71
  TextItem,
72
72
  TitleItem,
@@ -346,9 +346,6 @@ class HTMLTableSerializer(BaseTableSerializer):
346
346
  **kwargs: Any,
347
347
  ) -> SerializationResult:
348
348
  """Serializes the passed table item to HTML."""
349
- nrows = item.data.num_rows
350
- ncols = item.data.num_cols
351
-
352
349
  res_parts: list[SerializationResult] = []
353
350
  cap_res = doc_serializer.serialize_captions(item=item, tag="caption", **kwargs)
354
351
  if cap_res.text:
@@ -356,11 +353,11 @@ class HTMLTableSerializer(BaseTableSerializer):
356
353
 
357
354
  if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
358
355
  body = ""
356
+ span_source: Union[DocItem, list[SerializationResult]] = []
359
357
 
360
- for i in range(nrows):
358
+ for i, row in enumerate(item.data.grid):
361
359
  body += "<tr>"
362
- for j in range(ncols):
363
- cell: TableCell = item.data.grid[i][j]
360
+ for j, cell in enumerate(row):
364
361
 
365
362
  rowspan, rowstart = (
366
363
  cell.row_span,
@@ -376,7 +373,16 @@ class HTMLTableSerializer(BaseTableSerializer):
376
373
  if colstart != j:
377
374
  continue
378
375
 
379
- content = html.escape(cell.text.strip())
376
+ if isinstance(cell, RichTableCell):
377
+ ser_res = doc_serializer.serialize(
378
+ item=cell.ref.resolve(doc=doc), **kwargs
379
+ )
380
+ content = ser_res.text
381
+ span_source = [ser_res]
382
+ else:
383
+ content = html.escape(cell.text.strip())
384
+ span_source = item
385
+
380
386
  celltag = "td"
381
387
  if cell.column_header or cell.row_header or cell.row_section:
382
388
  celltag = "th"
@@ -389,14 +395,14 @@ class HTMLTableSerializer(BaseTableSerializer):
389
395
 
390
396
  text_dir = get_text_direction(content)
391
397
  if text_dir == "rtl":
392
- opening_tag += f' dir="{dir}"'
398
+ opening_tag += f' dir="{text_dir}"'
393
399
 
394
400
  body += f"<{opening_tag}>{content}</{celltag}>"
395
401
  body += "</tr>"
396
402
 
397
403
  if body:
398
404
  body = f"<tbody>{body}</tbody>"
399
- res_parts.append(create_ser_result(text=body, span_source=item))
405
+ res_parts.append(create_ser_result(text=body, span_source=span_source))
400
406
 
401
407
  text_res = "".join([r.text for r in res_parts])
402
408
  text_res = f"<table>{text_res}</table>" if text_res else ""
@@ -55,6 +55,7 @@ from docling_core.types.doc.document import (
55
55
  PictureItem,
56
56
  PictureMoleculeData,
57
57
  PictureTabularChartData,
58
+ RichTableCell,
58
59
  SectionHeaderItem,
59
60
  TableItem,
60
61
  TextItem,
@@ -320,7 +321,13 @@ class MarkdownTableSerializer(BaseTableSerializer):
320
321
  [
321
322
  # make sure that md tables are not broken
322
323
  # due to newline chars in the text
323
- col.text.replace("\n", " ")
324
+ (
325
+ doc_serializer.serialize(
326
+ item=col.ref.resolve(doc=doc), **kwargs
327
+ ).text
328
+ if isinstance(col, RichTableCell)
329
+ else col.text
330
+ ).replace("\n", " ")
324
331
  for col in row
325
332
  ]
326
333
  for row in item.data.grid
@@ -7,6 +7,7 @@
7
7
 
8
8
  from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
9
9
  from .document import (
10
+ AnyTableCell,
10
11
  BaseAnnotation,
11
12
  ChartBar,
12
13
  ChartLine,
@@ -52,6 +53,7 @@ from .document import (
52
53
  PictureTabularChartData,
53
54
  ProvenanceItem,
54
55
  RefItem,
56
+ RichTableCell,
55
57
  Script,
56
58
  SectionHeaderItem,
57
59
  TableCell,
@@ -34,7 +34,7 @@ from pydantic import (
34
34
  validate_call,
35
35
  )
36
36
  from tabulate import tabulate
37
- from typing_extensions import Annotated, Self, deprecated
37
+ from typing_extensions import Annotated, Self, deprecated, override
38
38
 
39
39
  from docling_core.search.package import VERSION_PATTERN
40
40
  from docling_core.types.base import _JSON_POINTER_REGEX
@@ -60,7 +60,7 @@ _logger = logging.getLogger(__name__)
60
60
 
61
61
  Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
62
62
  LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
63
- CURRENT_VERSION: Final = "1.5.0"
63
+ CURRENT_VERSION: Final = "1.6.0"
64
64
 
65
65
  DEFAULT_EXPORT_LABELS = {
66
66
  DocItemLabel.TITLE,
@@ -325,7 +325,7 @@ class TableCell(BaseModel):
325
325
  in data
326
326
  ):
327
327
  return data
328
- text = data["bbox"].get("token", "")
328
+ text = data.get("bbox", {}).get("token", "")
329
329
  if not len(text):
330
330
  text_cells = data.pop("text_cell_bboxes", None)
331
331
  if text_cells:
@@ -337,11 +337,39 @@ class TableCell(BaseModel):
337
337
 
338
338
  return data
339
339
 
340
+ def _get_text(self, doc: Optional["DoclingDocument"] = None, **kwargs: Any) -> str:
341
+ return self.text
342
+
343
+
344
+ class RichTableCell(TableCell):
345
+ """RichTableCell."""
346
+
347
+ ref: "RefItem"
348
+
349
+ @override
350
+ def _get_text(self, doc: Optional["DoclingDocument"] = None, **kwargs: Any) -> str:
351
+ from docling_core.transforms.serializer.markdown import MarkdownDocSerializer
352
+
353
+ if doc is not None:
354
+ doc_serializer = kwargs.pop(
355
+ "doc_serializer", MarkdownDocSerializer(doc=doc)
356
+ )
357
+ ser_res = doc_serializer.serialize(item=self.ref.resolve(doc=doc), **kwargs)
358
+ return ser_res.text
359
+ else:
360
+ return "<!-- rich cell -->"
361
+
362
+
363
+ AnyTableCell = Annotated[
364
+ Union[RichTableCell, TableCell],
365
+ Field(union_mode="left_to_right"),
366
+ ]
367
+
340
368
 
341
369
  class TableData(BaseModel): # TBD
342
370
  """BaseTableData."""
343
371
 
344
- table_cells: List[TableCell] = []
372
+ table_cells: List[AnyTableCell] = []
345
373
  num_rows: int = 0
346
374
  num_cols: int = 0
347
375
 
@@ -380,7 +408,9 @@ class TableData(BaseModel): # TBD
380
408
 
381
409
  return table_data
382
410
 
383
- def remove_rows(self, indices: List[int]) -> List[List[TableCell]]:
411
+ def remove_rows(
412
+ self, indices: List[int], doc: Optional["DoclingDocument"] = None
413
+ ) -> List[List[TableCell]]:
384
414
  """Remove rows from the table by their indices.
385
415
 
386
416
  :param indices: List[int]: A list of indices of the rows to remove. (Starting from 0)
@@ -392,6 +422,7 @@ class TableData(BaseModel): # TBD
392
422
 
393
423
  indices = sorted(indices, reverse=True)
394
424
 
425
+ refs_to_remove = []
395
426
  all_removed_cells = []
396
427
  for row_index in indices:
397
428
  if row_index < 0 or row_index >= self.num_rows:
@@ -403,6 +434,10 @@ class TableData(BaseModel): # TBD
403
434
  end_idx = start_idx + self.num_cols
404
435
  removed_cells = self.table_cells[start_idx:end_idx]
405
436
 
437
+ for cell in removed_cells:
438
+ if isinstance(cell, RichTableCell):
439
+ refs_to_remove.append(cell.ref)
440
+
406
441
  # Remove the cells from the table
407
442
  self.table_cells = self.table_cells[:start_idx] + self.table_cells[end_idx:]
408
443
 
@@ -417,9 +452,18 @@ class TableData(BaseModel): # TBD
417
452
 
418
453
  all_removed_cells.append(removed_cells)
419
454
 
455
+ if refs_to_remove:
456
+ if doc is None:
457
+ _logger.warning(
458
+ "When table contains rich cells, `doc` argument must be provided, "
459
+ "otherwise rich cell content will be left dangling."
460
+ )
461
+ else:
462
+ doc._delete_items(refs_to_remove)
463
+
420
464
  return all_removed_cells
421
465
 
422
- def pop_row(self) -> List[TableCell]:
466
+ def pop_row(self, doc: Optional["DoclingDocument"] = None) -> List[TableCell]:
423
467
  """Remove and return the last row from the table.
424
468
 
425
469
  :returns: List[TableCell]: A list of TableCell objects representing the popped row.
@@ -427,16 +471,18 @@ class TableData(BaseModel): # TBD
427
471
  if self.num_rows == 0:
428
472
  raise IndexError("Cannot pop from an empty table.")
429
473
 
430
- return self.remove_row(self.num_rows - 1)
474
+ return self.remove_row(self.num_rows - 1, doc=doc)
431
475
 
432
- def remove_row(self, row_index: int) -> List[TableCell]:
476
+ def remove_row(
477
+ self, row_index: int, doc: Optional["DoclingDocument"] = None
478
+ ) -> List[TableCell]:
433
479
  """Remove a row from the table by its index.
434
480
 
435
481
  :param row_index: int: The index of the row to remove. (Starting from 0)
436
482
 
437
483
  :returns: List[TableCell]: A list of TableCell objects representing the removed row.
438
484
  """
439
- return self.remove_rows([row_index])[0]
485
+ return self.remove_rows([row_index], doc=doc)[0]
440
486
 
441
487
  def insert_rows(
442
488
  self, row_index: int, rows: List[List[str]], after: bool = False
@@ -1509,8 +1555,15 @@ class TableItem(FloatingItem):
1509
1555
 
1510
1556
  annotations: List[TableAnnotationType] = []
1511
1557
 
1512
- def export_to_dataframe(self) -> pd.DataFrame:
1558
+ def export_to_dataframe(
1559
+ self, doc: Optional["DoclingDocument"] = None
1560
+ ) -> pd.DataFrame:
1513
1561
  """Export the table as a Pandas DataFrame."""
1562
+ if doc is None:
1563
+ _logger.warning(
1564
+ "Usage of TableItem.export_to_dataframe() without `doc` argument is deprecated."
1565
+ )
1566
+
1514
1567
  if self.data.num_rows == 0 or self.data.num_cols == 0:
1515
1568
  return pd.DataFrame()
1516
1569
 
@@ -1539,14 +1592,15 @@ class TableItem(FloatingItem):
1539
1592
  columns = ["" for _ in range(self.data.num_cols)]
1540
1593
  for i in range(num_headers):
1541
1594
  for j, cell in enumerate(self.data.grid[i]):
1542
- col_name = cell.text
1595
+ col_name = cell._get_text(doc=doc)
1543
1596
  if columns[j] != "":
1544
1597
  col_name = f".{col_name}"
1545
1598
  columns[j] += col_name
1546
1599
 
1547
1600
  # Create table data
1548
1601
  table_data = [
1549
- [cell.text for cell in row] for row in self.data.grid[num_headers:]
1602
+ [cell._get_text(doc=doc) for cell in row]
1603
+ for row in self.data.grid[num_headers:]
1550
1604
  ]
1551
1605
 
1552
1606
  # Create DataFrame
@@ -1577,7 +1631,7 @@ class TableItem(FloatingItem):
1577
1631
 
1578
1632
  # make sure that md tables are not broken
1579
1633
  # due to newline chars in the text
1580
- text = col.text
1634
+ text = col._get_text(doc=doc)
1581
1635
  text = text.replace("\n", " ")
1582
1636
  tmp.append(text)
1583
1637
 
@@ -1623,6 +1677,7 @@ class TableItem(FloatingItem):
1623
1677
  add_cell_text: bool = True,
1624
1678
  xsize: int = 500,
1625
1679
  ysize: int = 500,
1680
+ **kwargs: Any,
1626
1681
  ) -> str:
1627
1682
  """Export the table as OTSL."""
1628
1683
  # Possible OTSL tokens...
@@ -1639,6 +1694,9 @@ class TableItem(FloatingItem):
1639
1694
  # Headers (column, row, section row):
1640
1695
  # "ched", "rhed", "srow"
1641
1696
 
1697
+ from docling_core.transforms.serializer.doctags import DocTagsDocSerializer
1698
+
1699
+ doc_serializer = DocTagsDocSerializer(doc=doc)
1642
1700
  body = []
1643
1701
  nrows = self.data.num_rows
1644
1702
  ncols = self.data.num_cols
@@ -1652,7 +1710,9 @@ class TableItem(FloatingItem):
1652
1710
  for i in range(nrows):
1653
1711
  for j in range(ncols):
1654
1712
  cell: TableCell = self.data.grid[i][j]
1655
- content = cell.text.strip()
1713
+ content = cell._get_text(
1714
+ doc=doc, doc_serializer=doc_serializer, **kwargs
1715
+ ).strip()
1656
1716
  rowspan, rowstart = (
1657
1717
  cell.row_span,
1658
1718
  cell.start_row_offset_idx,
@@ -2304,6 +2364,15 @@ class DoclingDocument(BaseModel):
2304
2364
  refs_to_be_deleted=refs_to_be_deleted,
2305
2365
  lookup=lookup,
2306
2366
  )
2367
+ if isinstance(node, TableItem):
2368
+ for cell in node.data.table_cells:
2369
+ if isinstance(cell, RichTableCell):
2370
+ path = cell.ref._split_ref_to_path()
2371
+ cell.ref = self._update_ref_with_lookup(
2372
+ item_label=path[1],
2373
+ item_index=int(path[2]),
2374
+ lookup=lookup,
2375
+ )
2307
2376
 
2308
2377
  # Update the self_ref reference
2309
2378
  if node.parent is not None:
@@ -3945,16 +4014,22 @@ class DoclingDocument(BaseModel):
3945
4014
  """num_pages."""
3946
4015
  return len(self.pages.values())
3947
4016
 
3948
- def validate_tree(self, root) -> bool:
4017
+ def validate_tree(self, root: NodeItem) -> bool:
3949
4018
  """validate_tree."""
3950
- res = []
3951
4019
  for child_ref in root.children:
3952
4020
  child = child_ref.resolve(self)
3953
- if child.parent.resolve(self) != root:
4021
+ if child.parent.resolve(self) != root or not self.validate_tree(child):
3954
4022
  return False
3955
- res.append(self.validate_tree(child))
3956
4023
 
3957
- return all(res) or len(res) == 0
4024
+ if isinstance(root, TableItem):
4025
+ for cell in root.data.table_cells:
4026
+ if isinstance(cell, RichTableCell) and (
4027
+ (par_ref := cell.ref.resolve(self).parent) is None
4028
+ or par_ref.resolve(self) != root
4029
+ ):
4030
+ return False
4031
+
4032
+ return True
3958
4033
 
3959
4034
  def iterate_items(
3960
4035
  self,
@@ -3963,14 +4038,14 @@ class DoclingDocument(BaseModel):
3963
4038
  traverse_pictures: bool = False,
3964
4039
  page_no: Optional[int] = None,
3965
4040
  included_content_layers: Optional[set[ContentLayer]] = None,
3966
- _level: int = 0, # fixed parameter, carries through the node nesting level
4041
+ _level: int = 0, # deprecated
3967
4042
  ) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
3968
4043
  """Iterate elements with level."""
3969
4044
  for item, stack in self._iterate_items_with_stack(
3970
4045
  root=root,
3971
4046
  with_groups=with_groups,
3972
4047
  traverse_pictures=traverse_pictures,
3973
- page_no=page_no,
4048
+ page_nrs={page_no} if page_no is not None else None,
3974
4049
  included_content_layers=included_content_layers,
3975
4050
  ):
3976
4051
  yield item, len(stack)
@@ -3980,7 +4055,7 @@ class DoclingDocument(BaseModel):
3980
4055
  root: Optional[NodeItem] = None,
3981
4056
  with_groups: bool = False,
3982
4057
  traverse_pictures: bool = False,
3983
- page_no: Optional[int] = None,
4058
+ page_nrs: Optional[set[int]] = None,
3984
4059
  included_content_layers: Optional[set[ContentLayer]] = None,
3985
4060
  _stack: Optional[list[int]] = None,
3986
4061
  ) -> typing.Iterable[Tuple[NodeItem, list[int]]]: # tuple of node and level
@@ -4003,8 +4078,8 @@ class DoclingDocument(BaseModel):
4003
4078
  and (
4004
4079
  not isinstance(root, DocItem)
4005
4080
  or (
4006
- page_no is None
4007
- or any(prov.page_no == page_no for prov in root.prov)
4081
+ page_nrs is None
4082
+ or any(prov.page_no in page_nrs for prov in root.prov)
4008
4083
  )
4009
4084
  )
4010
4085
  and root.content_layer in my_layers
@@ -4038,7 +4113,7 @@ class DoclingDocument(BaseModel):
4038
4113
  child,
4039
4114
  with_groups=with_groups,
4040
4115
  traverse_pictures=traverse_pictures,
4041
- page_no=page_no,
4116
+ page_nrs=page_nrs,
4042
4117
  _stack=my_stack,
4043
4118
  included_content_layers=my_layers,
4044
4119
  )
@@ -5324,7 +5399,9 @@ class DoclingDocument(BaseModel):
5324
5399
  grid.append([])
5325
5400
  for j, cell in enumerate(row):
5326
5401
  if j < 10:
5327
- text = get_text(text=cell.text, max_text_len=16)
5402
+ text = get_text(
5403
+ cell._get_text(doc=self), max_text_len=16
5404
+ )
5328
5405
  grid[-1].append(text)
5329
5406
 
5330
5407
  result.append("\n" + tabulate(grid) + "\n")
@@ -5526,7 +5603,9 @@ class DoclingDocument(BaseModel):
5526
5603
  def get_item_list(self, key: str) -> list[NodeItem]:
5527
5604
  return getattr(self, key)
5528
5605
 
5529
- def index(self, doc: "DoclingDocument") -> None:
5606
+ def index(
5607
+ self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None
5608
+ ) -> None:
5530
5609
 
5531
5610
  orig_ref_to_new_ref: dict[str, str] = {}
5532
5611
  page_delta = self._max_page - min(doc.pages.keys()) + 1 if doc.pages else 0
@@ -5537,10 +5616,11 @@ class DoclingDocument(BaseModel):
5537
5616
  self._names.append(doc.name)
5538
5617
 
5539
5618
  # collect items in traversal order
5540
- for item, _ in doc.iterate_items(
5619
+ for item, _ in doc._iterate_items_with_stack(
5541
5620
  with_groups=True,
5542
5621
  traverse_pictures=True,
5543
5622
  included_content_layers={c for c in ContentLayer},
5623
+ page_nrs=page_nrs,
5544
5624
  ):
5545
5625
  key = item.self_ref.split("/")[1]
5546
5626
  is_body = key == "body"
@@ -5588,6 +5668,16 @@ class DoclingDocument(BaseModel):
5588
5668
  )
5589
5669
  break
5590
5670
 
5671
+ # update rich table cells references:
5672
+ if isinstance(parent_item, TableItem):
5673
+ for cell in parent_item.data.table_cells:
5674
+ if (
5675
+ isinstance(cell, RichTableCell)
5676
+ and cell.ref.cref == item.self_ref
5677
+ ):
5678
+ cell.ref.cref = new_cref
5679
+ break
5680
+
5591
5681
  elif num_components == 2 and path_components[1] == "body":
5592
5682
  parent_item = self._body
5593
5683
  else:
@@ -5599,12 +5689,13 @@ class DoclingDocument(BaseModel):
5599
5689
  # update pages
5600
5690
  new_max_page = None
5601
5691
  for page_nr in doc.pages:
5602
- new_page = copy.deepcopy(doc.pages[page_nr])
5603
- new_page_nr = page_nr + page_delta
5604
- new_page.page_no = new_page_nr
5605
- self.pages[new_page_nr] = new_page
5606
- if new_max_page is None or new_page_nr > new_max_page:
5607
- new_max_page = new_page_nr
5692
+ if page_nrs is None or page_nr in page_nrs:
5693
+ new_page = copy.deepcopy(doc.pages[page_nr])
5694
+ new_page_nr = page_nr + page_delta
5695
+ new_page.page_no = new_page_nr
5696
+ self.pages[new_page_nr] = new_page
5697
+ if new_max_page is None or new_page_nr > new_max_page:
5698
+ new_max_page = new_page_nr
5608
5699
  if new_max_page is not None:
5609
5700
  self._max_page = new_max_page
5610
5701
 
@@ -5628,6 +5719,14 @@ class DoclingDocument(BaseModel):
5628
5719
  doc_index.index(doc=self)
5629
5720
  self._update_from_index(doc_index)
5630
5721
 
5722
+ def filter(self, page_nrs: Optional[set[int]] = None) -> "DoclingDocument":
5723
+ """Create a new document based on the provided filter parameters."""
5724
+ doc_index = DoclingDocument._DocIndex()
5725
+ doc_index.index(doc=self, page_nrs=page_nrs)
5726
+ res_doc = DoclingDocument(name=self.name)
5727
+ res_doc._update_from_index(doc_index)
5728
+ return res_doc
5729
+
5631
5730
  @classmethod
5632
5731
  def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument":
5633
5732
  """Concatenate multiple documents into a single document."""
@@ -5676,6 +5775,18 @@ class DoclingDocument(BaseModel):
5676
5775
  elif isinstance(item, ListItem):
5677
5776
  validate_list_item(self, item)
5678
5777
 
5778
+ def add_table_cell(self, table_item: TableItem, cell: TableCell) -> None:
5779
+ """Add a table cell to the table."""
5780
+ if isinstance(cell, RichTableCell):
5781
+ item = cell.ref.resolve(doc=self)
5782
+ if isinstance(item, NodeItem) and (
5783
+ (not item.parent) or item.parent.cref != table_item.self_ref
5784
+ ):
5785
+ raise ValueError(
5786
+ f"Trying to add cell with another parent {item.parent} to {table_item.self_ref}"
5787
+ )
5788
+ table_item.data.table_cells.append(cell)
5789
+
5679
5790
 
5680
5791
  # deprecated aliases (kept for backwards compatibility):
5681
5792
  BasePictureData = BaseAnnotation
@@ -252,7 +252,7 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f
252
252
 
253
253
  spans = list(_make_spans(cell, item))
254
254
  table_data[i][j] = GlmTableCell(
255
- text=cell.text,
255
+ text=cell._get_text(doc=doc),
256
256
  bbox=(
257
257
  cell.bbox.as_tuple()
258
258
  if cell.bbox is not None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.45.0
3
+ Version: 2.47.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -28,11 +28,11 @@ docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=aZ_RNQIzcNkAHGHZ
28
28
  docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
29
29
  docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
30
30
  docling_core/transforms/serializer/base.py,sha256=TI8Epj7gyxdTet9j-Rs4o5U09gfACfAIVoirlschviM,7266
31
- docling_core/transforms/serializer/common.py,sha256=0TNEGoA_rJ-qkVYp-X8SMUr3jTrbf6TRzPzwufYh5JM,19114
32
- docling_core/transforms/serializer/doctags.py,sha256=TD0yAm1qSVy-GsE6svpUAI-Yqjcf2rrTZ3ac9YU3gbE,19858
33
- docling_core/transforms/serializer/html.py,sha256=PPlHVu3_wnc0cD-n6n8v9clCmeY_LPqII7euVYqi6Kk,38089
31
+ docling_core/transforms/serializer/common.py,sha256=RwfdzZ9FRSHQjKM0vskg1CVqar0Z_ms38arSlLAgITc,19150
32
+ docling_core/transforms/serializer/doctags.py,sha256=VXPjAZPhBur7LaEeuqH9k31TgZWSN32lK8z8rJXzFwY,19935
33
+ docling_core/transforms/serializer/html.py,sha256=GRfRaqFIb4FXRMplB4Agl4fSNa5jsHV7P4tBtFMro9I,38453
34
34
  docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
35
- docling_core/transforms/serializer/markdown.py,sha256=VwonuAkuOPmQM7ibDIGvQBHOqhTcTJ_t187fLQQiNPo,23951
35
+ docling_core/transforms/serializer/markdown.py,sha256=hilGM1yWpbbRTjuEjfBRrhavspD5vFF_6SDvlKx8BrM,24230
36
36
  docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
37
37
  docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
38
38
  docling_core/transforms/visualizer/key_value_visualizer.py,sha256=fp7nFLy4flOSiavdRgg5y1Mu7WVLIDGh1zEHsq8kgVM,8979
@@ -41,9 +41,9 @@ docling_core/transforms/visualizer/reading_order_visualizer.py,sha256=muqmaxOBao
41
41
  docling_core/transforms/visualizer/table_visualizer.py,sha256=iJPjk-XQSSCH3oujcjPMz-redAwNNHseZ41lFyd-u3k,8097
42
42
  docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
43
43
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
44
- docling_core/types/doc/__init__.py,sha256=8hOhm5W9mArf3zwgfoMxDs1pHizhLFSAZlLu1tPBBRk,1641
44
+ docling_core/types/doc/__init__.py,sha256=Vsl3oJV3_BLpS7rIwvahhcWOwmEBvj7ZbQzQCCl-IQk,1678
45
45
  docling_core/types/doc/base.py,sha256=i98y4IF250adR-8BSS374K90fwfwG-vBfWh14tLC5Cs,15906
46
- docling_core/types/doc/document.py,sha256=gMruWRH1ELYepSMaGA5b8_l9bIjIz7JU4Yh78W_sb00,198193
46
+ docling_core/types/doc/document.py,sha256=jyMcK1oiu8X8juNa9DuI3S1imn4hXwjOS7iTLQ1HykU,202707
47
47
  docling_core/types/doc/labels.py,sha256=-W1-LW6z0J9F9ExJqR0Wd1WeqWTaY3Unm-j1UkQGlC4,7330
48
48
  docling_core/types/doc/page.py,sha256=35h1xdtCM3-AaN8Dim9jDseZIiw-3GxpB-ofF-H2rQQ,41878
49
49
  docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
@@ -73,12 +73,12 @@ docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,8
73
73
  docling_core/utils/file.py,sha256=CSNclJGL2OwLIc8DQFdoLxr22FUc4_UC7zS6pNrFfkQ,6858
74
74
  docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
75
75
  docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
76
- docling_core/utils/legacy.py,sha256=5lghO48OEcV9V51tRnH3YSKgLtdqhr-Q5C_OcJZ8TOs,24392
76
+ docling_core/utils/legacy.py,sha256=G7ed8fkBpIO8hG3DKEY83cHsrKJHyvDst_1jSdgBXMI,24406
77
77
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
78
78
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
79
- docling_core-2.45.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
80
- docling_core-2.45.0.dist-info/METADATA,sha256=VX5jfhqswstEumhmLi6VrGD9crC8RKy52z835nCvORw,6453
81
- docling_core-2.45.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
82
- docling_core-2.45.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
83
- docling_core-2.45.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
84
- docling_core-2.45.0.dist-info/RECORD,,
79
+ docling_core-2.47.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
80
+ docling_core-2.47.0.dist-info/METADATA,sha256=jW4Zdx0WwStnLDifSsvYyGLw-5C2IYiEeK4IQRGQi-I,6453
81
+ docling_core-2.47.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
82
+ docling_core-2.47.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
83
+ docling_core-2.47.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
84
+ docling_core-2.47.0.dist-info/RECORD,,