docling-core 2.24.0__py3-none-any.whl → 2.25.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/types/doc/base.py +4 -1
- docling_core/types/doc/document.py +36 -8
- docling_core/types/doc/page.py +10 -3
- {docling_core-2.24.0.dist-info → docling_core-2.25.0.dist-info}/METADATA +1 -1
- {docling_core-2.24.0.dist-info → docling_core-2.25.0.dist-info}/RECORD +8 -8
- {docling_core-2.24.0.dist-info → docling_core-2.25.0.dist-info}/LICENSE +0 -0
- {docling_core-2.24.0.dist-info → docling_core-2.25.0.dist-info}/WHEEL +0 -0
- {docling_core-2.24.0.dist-info → docling_core-2.25.0.dist-info}/entry_points.txt +0 -0
docling_core/types/doc/base.py
CHANGED
|
@@ -182,7 +182,10 @@ class BoundingBox(BaseModel):
|
|
|
182
182
|
) -> float:
|
|
183
183
|
"""intersection_over_self."""
|
|
184
184
|
intersection_area = self.intersection_area_with(other=other)
|
|
185
|
-
|
|
185
|
+
if self.area() > 0:
|
|
186
|
+
return intersection_area / self.area()
|
|
187
|
+
else:
|
|
188
|
+
return 0.0
|
|
186
189
|
|
|
187
190
|
def to_bottom_left_origin(self, page_height: float) -> "BoundingBox":
|
|
188
191
|
"""to_bottom_left_origin.
|
|
@@ -542,25 +542,32 @@ class DocTagsDocument(BaseModel):
|
|
|
542
542
|
|
|
543
543
|
@classmethod
|
|
544
544
|
def from_doctags_and_image_pairs(
|
|
545
|
-
cls,
|
|
545
|
+
cls,
|
|
546
|
+
doctags: typing.Sequence[Union[Path, str]],
|
|
547
|
+
images: Optional[List[Union[Path, PILImage.Image]]],
|
|
546
548
|
):
|
|
547
549
|
"""from_doctags_and_image_pairs."""
|
|
548
|
-
if len(doctags) != len(images):
|
|
550
|
+
if images is not None and len(doctags) != len(images):
|
|
549
551
|
raise ValueError("Number of page doctags must be equal to page images!")
|
|
550
552
|
doctags_doc = cls()
|
|
551
553
|
|
|
552
554
|
pages = []
|
|
553
|
-
|
|
555
|
+
|
|
556
|
+
for ix, dt in enumerate(doctags):
|
|
554
557
|
if isinstance(dt, Path):
|
|
555
558
|
with dt.open("r") as fp:
|
|
556
559
|
dt = fp.read()
|
|
557
560
|
elif isinstance(dt, str):
|
|
558
561
|
pass
|
|
559
562
|
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
563
|
+
img = None
|
|
564
|
+
if images is not None:
|
|
565
|
+
img = images[ix]
|
|
566
|
+
|
|
567
|
+
if isinstance(img, Path):
|
|
568
|
+
img = PILImage.open(img)
|
|
569
|
+
elif isinstance(img, PILImage.Image):
|
|
570
|
+
pass
|
|
564
571
|
|
|
565
572
|
page = DocTagsPage(tokens=dt, image=img)
|
|
566
573
|
pages.append(page)
|
|
@@ -568,6 +575,25 @@ class DocTagsDocument(BaseModel):
|
|
|
568
575
|
doctags_doc.pages = pages
|
|
569
576
|
return doctags_doc
|
|
570
577
|
|
|
578
|
+
@classmethod
|
|
579
|
+
def from_multipage_doctags_and_images(
|
|
580
|
+
cls,
|
|
581
|
+
doctags: Union[Path, str],
|
|
582
|
+
images: Optional[List[Union[Path, PILImage.Image]]],
|
|
583
|
+
):
|
|
584
|
+
"""From doctags with `<page_break>` and corresponding list of page images."""
|
|
585
|
+
if isinstance(doctags, Path):
|
|
586
|
+
with doctags.open("r") as fp:
|
|
587
|
+
doctags = fp.read()
|
|
588
|
+
dt_list = (
|
|
589
|
+
doctags.removeprefix(f"<{DocumentToken.DOCUMENT.value}>")
|
|
590
|
+
.removesuffix(f"</{DocumentToken.DOCUMENT.value}>")
|
|
591
|
+
.split(f"<{DocumentToken.PAGE_BREAK.value}>")
|
|
592
|
+
)
|
|
593
|
+
dt_list = [el.strip() for el in dt_list]
|
|
594
|
+
|
|
595
|
+
return cls.from_doctags_and_image_pairs(dt_list, images)
|
|
596
|
+
|
|
571
597
|
|
|
572
598
|
class ProvenanceItem(BaseModel):
|
|
573
599
|
"""ProvenanceItem."""
|
|
@@ -722,7 +748,9 @@ class TextItem(DocItem):
|
|
|
722
748
|
text: str # sanitized representation
|
|
723
749
|
|
|
724
750
|
formatting: Optional[Formatting] = None
|
|
725
|
-
hyperlink: Optional[Union[AnyUrl, Path]] = None
|
|
751
|
+
hyperlink: Optional[Union[AnyUrl, Path]] = Field(
|
|
752
|
+
union_mode="left_to_right", default=None
|
|
753
|
+
)
|
|
726
754
|
|
|
727
755
|
@deprecated("Use export_to_doctags() instead.")
|
|
728
756
|
def export_to_document_tokens(self, *args, **kwargs):
|
docling_core/types/doc/page.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Datastructures for PaginatedDocument."""
|
|
2
2
|
|
|
3
|
+
import copy
|
|
3
4
|
import json
|
|
4
5
|
import logging
|
|
5
6
|
import math
|
|
@@ -530,10 +531,16 @@ class SegmentedPdfPage(SegmentedPage):
|
|
|
530
531
|
"""
|
|
531
532
|
cells = []
|
|
532
533
|
for page_cell in self.iterate_cells(cell_unit):
|
|
533
|
-
|
|
534
|
+
pc = copy.deepcopy(page_cell)
|
|
535
|
+
# Bring cell_bbox coord origin to the same as input bbox.coord_origin:
|
|
536
|
+
if page_cell.rect.coord_origin != bbox.coord_origin:
|
|
537
|
+
if bbox.coord_origin == CoordOrigin.TOPLEFT:
|
|
538
|
+
pc.rect = pc.rect.to_top_left_origin(self.dimension.height)
|
|
539
|
+
elif bbox.coord_origin == CoordOrigin.BOTTOMLEFT:
|
|
540
|
+
pc.rect = pc.rect.to_bottom_left_origin(self.dimension.height)
|
|
541
|
+
cell_bbox = pc.to_bounding_box()
|
|
534
542
|
if cell_bbox.intersection_over_self(bbox) > ios:
|
|
535
|
-
cells.append(page_cell)
|
|
536
|
-
|
|
543
|
+
cells.append(pc)
|
|
537
544
|
return cells
|
|
538
545
|
|
|
539
546
|
def export_to_dict(self) -> Dict:
|
|
@@ -29,10 +29,10 @@ docling_core/transforms/chunker/hybrid_chunker.py,sha256=v-HpFg-HvQLi0gQtHm-6KlM
|
|
|
29
29
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
30
30
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
31
31
|
docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
|
|
32
|
-
docling_core/types/doc/base.py,sha256=
|
|
33
|
-
docling_core/types/doc/document.py,sha256=
|
|
32
|
+
docling_core/types/doc/base.py,sha256=sM3IyFXzVh2WT8IGh5nejXYh8sf39yBh8TBSlHeJ9CI,12611
|
|
33
|
+
docling_core/types/doc/document.py,sha256=bFor-GQmt0pn0eZ4HpcA2RUFJ7GEX5neAR5gnDVY3Hw,129747
|
|
34
34
|
docling_core/types/doc/labels.py,sha256=0J9Gsqz-jQ4FP2yxs9wOxoTr3qg97BniFX7MJVziUmk,5684
|
|
35
|
-
docling_core/types/doc/page.py,sha256=
|
|
35
|
+
docling_core/types/doc/page.py,sha256=s5DxxoS-6RS0gv3C3ZHWqo2RND2j_iksGJStdby6dBw,40466
|
|
36
36
|
docling_core/types/doc/tokens.py,sha256=fpPtVHfO5RXk8mkqZ7YrW5LyHipg697kbFBNqn6jXQU,9159
|
|
37
37
|
docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
|
|
38
38
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
@@ -63,8 +63,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
63
63
|
docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
|
|
64
64
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
65
65
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
66
|
-
docling_core-2.
|
|
67
|
-
docling_core-2.
|
|
68
|
-
docling_core-2.
|
|
69
|
-
docling_core-2.
|
|
70
|
-
docling_core-2.
|
|
66
|
+
docling_core-2.25.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
67
|
+
docling_core-2.25.0.dist-info/METADATA,sha256=CFFeRrWUJUjUK9x4LEaiXhWYPLspeRVbf5I5DnLdnrQ,5843
|
|
68
|
+
docling_core-2.25.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
69
|
+
docling_core-2.25.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
70
|
+
docling_core-2.25.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|