docling-core 2.46.0__py3-none-any.whl → 2.47.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -4045,7 +4045,7 @@ class DoclingDocument(BaseModel):
4045
4045
  root=root,
4046
4046
  with_groups=with_groups,
4047
4047
  traverse_pictures=traverse_pictures,
4048
- page_no=page_no,
4048
+ page_nrs={page_no} if page_no is not None else None,
4049
4049
  included_content_layers=included_content_layers,
4050
4050
  ):
4051
4051
  yield item, len(stack)
@@ -4055,7 +4055,7 @@ class DoclingDocument(BaseModel):
4055
4055
  root: Optional[NodeItem] = None,
4056
4056
  with_groups: bool = False,
4057
4057
  traverse_pictures: bool = False,
4058
- page_no: Optional[int] = None,
4058
+ page_nrs: Optional[set[int]] = None,
4059
4059
  included_content_layers: Optional[set[ContentLayer]] = None,
4060
4060
  _stack: Optional[list[int]] = None,
4061
4061
  ) -> typing.Iterable[Tuple[NodeItem, list[int]]]: # tuple of node and level
@@ -4078,8 +4078,8 @@ class DoclingDocument(BaseModel):
4078
4078
  and (
4079
4079
  not isinstance(root, DocItem)
4080
4080
  or (
4081
- page_no is None
4082
- or any(prov.page_no == page_no for prov in root.prov)
4081
+ page_nrs is None
4082
+ or any(prov.page_no in page_nrs for prov in root.prov)
4083
4083
  )
4084
4084
  )
4085
4085
  and root.content_layer in my_layers
@@ -4113,7 +4113,7 @@ class DoclingDocument(BaseModel):
4113
4113
  child,
4114
4114
  with_groups=with_groups,
4115
4115
  traverse_pictures=traverse_pictures,
4116
- page_no=page_no,
4116
+ page_nrs=page_nrs,
4117
4117
  _stack=my_stack,
4118
4118
  included_content_layers=my_layers,
4119
4119
  )
@@ -5603,7 +5603,9 @@ class DoclingDocument(BaseModel):
5603
5603
  def get_item_list(self, key: str) -> list[NodeItem]:
5604
5604
  return getattr(self, key)
5605
5605
 
5606
- def index(self, doc: "DoclingDocument") -> None:
5606
+ def index(
5607
+ self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None
5608
+ ) -> None:
5607
5609
 
5608
5610
  orig_ref_to_new_ref: dict[str, str] = {}
5609
5611
  page_delta = self._max_page - min(doc.pages.keys()) + 1 if doc.pages else 0
@@ -5614,10 +5616,11 @@ class DoclingDocument(BaseModel):
5614
5616
  self._names.append(doc.name)
5615
5617
 
5616
5618
  # collect items in traversal order
5617
- for item, _ in doc.iterate_items(
5619
+ for item, _ in doc._iterate_items_with_stack(
5618
5620
  with_groups=True,
5619
5621
  traverse_pictures=True,
5620
5622
  included_content_layers={c for c in ContentLayer},
5623
+ page_nrs=page_nrs,
5621
5624
  ):
5622
5625
  key = item.self_ref.split("/")[1]
5623
5626
  is_body = key == "body"
@@ -5686,12 +5689,13 @@ class DoclingDocument(BaseModel):
5686
5689
  # update pages
5687
5690
  new_max_page = None
5688
5691
  for page_nr in doc.pages:
5689
- new_page = copy.deepcopy(doc.pages[page_nr])
5690
- new_page_nr = page_nr + page_delta
5691
- new_page.page_no = new_page_nr
5692
- self.pages[new_page_nr] = new_page
5693
- if new_max_page is None or new_page_nr > new_max_page:
5694
- new_max_page = new_page_nr
5692
+ if page_nrs is None or page_nr in page_nrs:
5693
+ new_page = copy.deepcopy(doc.pages[page_nr])
5694
+ new_page_nr = page_nr + page_delta
5695
+ new_page.page_no = new_page_nr
5696
+ self.pages[new_page_nr] = new_page
5697
+ if new_max_page is None or new_page_nr > new_max_page:
5698
+ new_max_page = new_page_nr
5695
5699
  if new_max_page is not None:
5696
5700
  self._max_page = new_max_page
5697
5701
 
@@ -5715,6 +5719,14 @@ class DoclingDocument(BaseModel):
5715
5719
  doc_index.index(doc=self)
5716
5720
  self._update_from_index(doc_index)
5717
5721
 
5722
+ def filter(self, page_nrs: Optional[set[int]] = None) -> "DoclingDocument":
5723
+ """Create a new document based on the provided filter parameters."""
5724
+ doc_index = DoclingDocument._DocIndex()
5725
+ doc_index.index(doc=self, page_nrs=page_nrs)
5726
+ res_doc = DoclingDocument(name=self.name)
5727
+ res_doc._update_from_index(doc_index)
5728
+ return res_doc
5729
+
5718
5730
  @classmethod
5719
5731
  def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument":
5720
5732
  """Concatenate multiple documents into a single document."""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.46.0
3
+ Version: 2.47.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -43,7 +43,7 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
43
43
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
44
44
  docling_core/types/doc/__init__.py,sha256=Vsl3oJV3_BLpS7rIwvahhcWOwmEBvj7ZbQzQCCl-IQk,1678
45
45
  docling_core/types/doc/base.py,sha256=i98y4IF250adR-8BSS374K90fwfwG-vBfWh14tLC5Cs,15906
46
- docling_core/types/doc/document.py,sha256=Ab-JOc6fkzocXP3PcxPRXJPjLOhOTYo_0571vSr6VXo,202093
46
+ docling_core/types/doc/document.py,sha256=jyMcK1oiu8X8juNa9DuI3S1imn4hXwjOS7iTLQ1HykU,202707
47
47
  docling_core/types/doc/labels.py,sha256=-W1-LW6z0J9F9ExJqR0Wd1WeqWTaY3Unm-j1UkQGlC4,7330
48
48
  docling_core/types/doc/page.py,sha256=35h1xdtCM3-AaN8Dim9jDseZIiw-3GxpB-ofF-H2rQQ,41878
49
49
  docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
@@ -76,9 +76,9 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
76
76
  docling_core/utils/legacy.py,sha256=G7ed8fkBpIO8hG3DKEY83cHsrKJHyvDst_1jSdgBXMI,24406
77
77
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
78
78
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
79
- docling_core-2.46.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
80
- docling_core-2.46.0.dist-info/METADATA,sha256=txMHh-7y8N3RiJ_M_HbrsvzRyGPJVXv8UcA6_DpAfok,6453
81
- docling_core-2.46.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
82
- docling_core-2.46.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
83
- docling_core-2.46.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
84
- docling_core-2.46.0.dist-info/RECORD,,
79
+ docling_core-2.47.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
80
+ docling_core-2.47.0.dist-info/METADATA,sha256=jW4Zdx0WwStnLDifSsvYyGLw-5C2IYiEeK4IQRGQi-I,6453
81
+ docling_core-2.47.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
82
+ docling_core-2.47.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
83
+ docling_core-2.47.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
84
+ docling_core-2.47.0.dist-info/RECORD,,