docling-core 2.3.2__tar.gz → 2.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.3.2 → docling_core-2.4.0}/PKG-INFO +1 -1
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/doc/document.py +42 -2
- {docling_core-2.3.2 → docling_core-2.4.0}/pyproject.toml +1 -1
- {docling_core-2.3.2 → docling_core-2.4.0}/LICENSE +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/README.md +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/__init__.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/py.typed +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/search/package.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/base.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.3.2 → docling_core-2.4.0}/docling_core/utils/validators.py +0 -0
|
@@ -551,6 +551,28 @@ class DocItem(
|
|
|
551
551
|
|
|
552
552
|
return location
|
|
553
553
|
|
|
554
|
+
def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
|
|
555
|
+
"""Returns the image of this DocItem.
|
|
556
|
+
|
|
557
|
+
The function returns None if this DocItem has no valid provenance or
|
|
558
|
+
if a valid image of the page containing this DocItem is not available
|
|
559
|
+
in doc.
|
|
560
|
+
"""
|
|
561
|
+
if not len(self.prov):
|
|
562
|
+
return None
|
|
563
|
+
|
|
564
|
+
page = doc.pages.get(self.prov[0].page_no)
|
|
565
|
+
if page is None or page.size is None or page.image is None:
|
|
566
|
+
return None
|
|
567
|
+
|
|
568
|
+
page_image = page.image.pil_image
|
|
569
|
+
crop_bbox = (
|
|
570
|
+
self.prov[0]
|
|
571
|
+
.bbox.to_top_left_origin(page_height=page.size.height)
|
|
572
|
+
.scaled(scale=page_image.height / page.size.height)
|
|
573
|
+
)
|
|
574
|
+
return page_image.crop(crop_bbox.as_tuple())
|
|
575
|
+
|
|
554
576
|
|
|
555
577
|
class TextItem(DocItem):
|
|
556
578
|
"""TextItem."""
|
|
@@ -633,6 +655,20 @@ class FloatingItem(DocItem):
|
|
|
633
655
|
text += cap.resolve(doc).text
|
|
634
656
|
return text
|
|
635
657
|
|
|
658
|
+
def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
|
|
659
|
+
"""Returns the image corresponding to this FloatingItem.
|
|
660
|
+
|
|
661
|
+
This function returns the PIL image from self.image if one is available.
|
|
662
|
+
Otherwise, it uses DocItem.get_image to get an image of this FloatingItem.
|
|
663
|
+
|
|
664
|
+
In particular, when self.image is None, the function returns None if this
|
|
665
|
+
FloatingItem has no valid provenance or the doc does not contain a valid image
|
|
666
|
+
for the required page.
|
|
667
|
+
"""
|
|
668
|
+
if self.image is not None:
|
|
669
|
+
return self.image.pil_image
|
|
670
|
+
return super().get_image(doc=doc)
|
|
671
|
+
|
|
636
672
|
|
|
637
673
|
class PictureItem(FloatingItem):
|
|
638
674
|
"""PictureItem."""
|
|
@@ -1255,7 +1291,10 @@ class DoclingDocument(BaseModel):
|
|
|
1255
1291
|
# If the child is a NodeItem, recursively traverse it
|
|
1256
1292
|
if not isinstance(child, PictureItem) or traverse_pictures:
|
|
1257
1293
|
yield from self.iterate_items(
|
|
1258
|
-
child,
|
|
1294
|
+
child,
|
|
1295
|
+
_level=_level + 1,
|
|
1296
|
+
with_groups=with_groups,
|
|
1297
|
+
page_no=page_no,
|
|
1259
1298
|
)
|
|
1260
1299
|
|
|
1261
1300
|
def print_element_tree(self):
|
|
@@ -1281,6 +1320,7 @@ class DoclingDocument(BaseModel):
|
|
|
1281
1320
|
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
1282
1321
|
indent: int = 4,
|
|
1283
1322
|
text_width: int = -1,
|
|
1323
|
+
page_no: Optional[int] = None,
|
|
1284
1324
|
) -> str:
|
|
1285
1325
|
r"""Serialize to Markdown.
|
|
1286
1326
|
|
|
@@ -1317,7 +1357,7 @@ class DoclingDocument(BaseModel):
|
|
|
1317
1357
|
in_list = False # Track if we're currently processing list items
|
|
1318
1358
|
|
|
1319
1359
|
for ix, (item, level) in enumerate(
|
|
1320
|
-
self.iterate_items(self.body, with_groups=True)
|
|
1360
|
+
self.iterate_items(self.body, with_groups=True, page_no=page_no)
|
|
1321
1361
|
):
|
|
1322
1362
|
# If we've moved to a lower level, we're exiting one or more groups
|
|
1323
1363
|
if level < previous_level:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.3.2 → docling_core-2.4.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.3.2 → docling_core-2.4.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.3.2 → docling_core-2.4.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|