docling-core: docling_core-2.24.1-py3-none-any.whl → docling_core-2.25.0-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -182,7 +182,10 @@ class BoundingBox(BaseModel):
182
182
  ) -> float:
183
183
  """intersection_over_self."""
184
184
  intersection_area = self.intersection_area_with(other=other)
185
- return intersection_area / self.area()
185
+ if self.area() > 0:
186
+ return intersection_area / self.area()
187
+ else:
188
+ return 0.0
186
189
 
187
190
  def to_bottom_left_origin(self, page_height: float) -> "BoundingBox":
188
191
  """to_bottom_left_origin.
@@ -542,25 +542,32 @@ class DocTagsDocument(BaseModel):
542
542
 
543
543
  @classmethod
544
544
  def from_doctags_and_image_pairs(
545
- cls, doctags: List[Union[Path, str]], images: List[Union[Path, PILImage.Image]]
545
+ cls,
546
+ doctags: typing.Sequence[Union[Path, str]],
547
+ images: Optional[List[Union[Path, PILImage.Image]]],
546
548
  ):
547
549
  """from_doctags_and_image_pairs."""
548
- if len(doctags) != len(images):
550
+ if images is not None and len(doctags) != len(images):
549
551
  raise ValueError("Number of page doctags must be equal to page images!")
550
552
  doctags_doc = cls()
551
553
 
552
554
  pages = []
553
- for dt, img in zip(doctags, images):
555
+
556
+ for ix, dt in enumerate(doctags):
554
557
  if isinstance(dt, Path):
555
558
  with dt.open("r") as fp:
556
559
  dt = fp.read()
557
560
  elif isinstance(dt, str):
558
561
  pass
559
562
 
560
- if isinstance(img, Path):
561
- img = PILImage.open(img)
562
- elif isinstance(dt, PILImage.Image):
563
- pass
563
+ img = None
564
+ if images is not None:
565
+ img = images[ix]
566
+
567
+ if isinstance(img, Path):
568
+ img = PILImage.open(img)
569
+ elif isinstance(img, PILImage.Image):
570
+ pass
564
571
 
565
572
  page = DocTagsPage(tokens=dt, image=img)
566
573
  pages.append(page)
@@ -568,6 +575,25 @@ class DocTagsDocument(BaseModel):
568
575
  doctags_doc.pages = pages
569
576
  return doctags_doc
570
577
 
578
+ @classmethod
579
+ def from_multipage_doctags_and_images(
580
+ cls,
581
+ doctags: Union[Path, str],
582
+ images: Optional[List[Union[Path, PILImage.Image]]],
583
+ ):
584
+ """From doctags with `<page_break>` and corresponding list of page images."""
585
+ if isinstance(doctags, Path):
586
+ with doctags.open("r") as fp:
587
+ doctags = fp.read()
588
+ dt_list = (
589
+ doctags.removeprefix(f"<{DocumentToken.DOCUMENT.value}>")
590
+ .removesuffix(f"</{DocumentToken.DOCUMENT.value}>")
591
+ .split(f"<{DocumentToken.PAGE_BREAK.value}>")
592
+ )
593
+ dt_list = [el.strip() for el in dt_list]
594
+
595
+ return cls.from_doctags_and_image_pairs(dt_list, images)
596
+
571
597
 
572
598
  class ProvenanceItem(BaseModel):
573
599
  """ProvenanceItem."""
@@ -722,7 +748,9 @@ class TextItem(DocItem):
722
748
  text: str # sanitized representation
723
749
 
724
750
  formatting: Optional[Formatting] = None
725
- hyperlink: Optional[Union[AnyUrl, Path]] = None
751
+ hyperlink: Optional[Union[AnyUrl, Path]] = Field(
752
+ union_mode="left_to_right", default=None
753
+ )
726
754
 
727
755
  @deprecated("Use export_to_doctags() instead.")
728
756
  def export_to_document_tokens(self, *args, **kwargs):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.24.1
3
+ Version: 2.25.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://github.com/docling-project
6
6
  License: MIT
@@ -29,8 +29,8 @@ docling_core/transforms/chunker/hybrid_chunker.py,sha256=v-HpFg-HvQLi0gQtHm-6KlM
29
29
  docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
30
30
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
31
31
  docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
32
- docling_core/types/doc/base.py,sha256=22U1qDlD-2ICmgzbdZrjNayoPHnq4S1ks1GRoqB7y1Q,12542
33
- docling_core/types/doc/document.py,sha256=_FJtmp0yh6F_3AVLVN4Xpo7E1hz50gvS_-HrJmp8FOA,128806
32
+ docling_core/types/doc/base.py,sha256=sM3IyFXzVh2WT8IGh5nejXYh8sf39yBh8TBSlHeJ9CI,12611
33
+ docling_core/types/doc/document.py,sha256=bFor-GQmt0pn0eZ4HpcA2RUFJ7GEX5neAR5gnDVY3Hw,129747
34
34
  docling_core/types/doc/labels.py,sha256=0J9Gsqz-jQ4FP2yxs9wOxoTr3qg97BniFX7MJVziUmk,5684
35
35
  docling_core/types/doc/page.py,sha256=s5DxxoS-6RS0gv3C3ZHWqo2RND2j_iksGJStdby6dBw,40466
36
36
  docling_core/types/doc/tokens.py,sha256=fpPtVHfO5RXk8mkqZ7YrW5LyHipg697kbFBNqn6jXQU,9159
@@ -63,8 +63,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
63
63
  docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
64
64
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
65
65
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
66
- docling_core-2.24.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
67
- docling_core-2.24.1.dist-info/METADATA,sha256=GYkFcQg28UpfzdBadMKZ6AL6V9ezUVTlL50B__Mje5g,5843
68
- docling_core-2.24.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
69
- docling_core-2.24.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
70
- docling_core-2.24.1.dist-info/RECORD,,
66
+ docling_core-2.25.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
67
+ docling_core-2.25.0.dist-info/METADATA,sha256=CFFeRrWUJUjUK9x4LEaiXhWYPLspeRVbf5I5DnLdnrQ,5843
68
+ docling_core-2.25.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
69
+ docling_core-2.25.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
70
+ docling_core-2.25.0.dist-info/RECORD,,