docling-core 2.3.1__tar.gz → 2.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.3.1 → docling_core-2.4.0}/PKG-INFO +1 -1
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/doc/document.py +47 -10
- {docling_core-2.3.1 → docling_core-2.4.0}/pyproject.toml +1 -1
- {docling_core-2.3.1 → docling_core-2.4.0}/LICENSE +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/README.md +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/__init__.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/py.typed +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/search/package.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/base.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.3.1 → docling_core-2.4.0}/docling_core/utils/validators.py +0 -0
|
@@ -551,6 +551,28 @@ class DocItem(
|
|
|
551
551
|
|
|
552
552
|
return location
|
|
553
553
|
|
|
554
|
+
def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
|
|
555
|
+
"""Returns the image of this DocItem.
|
|
556
|
+
|
|
557
|
+
The function returns None if this DocItem has no valid provenance or
|
|
558
|
+
if a valid image of the page containing this DocItem is not available
|
|
559
|
+
in doc.
|
|
560
|
+
"""
|
|
561
|
+
if not len(self.prov):
|
|
562
|
+
return None
|
|
563
|
+
|
|
564
|
+
page = doc.pages.get(self.prov[0].page_no)
|
|
565
|
+
if page is None or page.size is None or page.image is None:
|
|
566
|
+
return None
|
|
567
|
+
|
|
568
|
+
page_image = page.image.pil_image
|
|
569
|
+
crop_bbox = (
|
|
570
|
+
self.prov[0]
|
|
571
|
+
.bbox.to_top_left_origin(page_height=page.size.height)
|
|
572
|
+
.scaled(scale=page_image.height / page.size.height)
|
|
573
|
+
)
|
|
574
|
+
return page_image.crop(crop_bbox.as_tuple())
|
|
575
|
+
|
|
554
576
|
|
|
555
577
|
class TextItem(DocItem):
|
|
556
578
|
"""TextItem."""
|
|
@@ -633,6 +655,20 @@ class FloatingItem(DocItem):
|
|
|
633
655
|
text += cap.resolve(doc).text
|
|
634
656
|
return text
|
|
635
657
|
|
|
658
|
+
def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
|
|
659
|
+
"""Returns the image corresponding to this FloatingItem.
|
|
660
|
+
|
|
661
|
+
This function returns the PIL image from self.image if one is available.
|
|
662
|
+
Otherwise, it uses DocItem.get_image to get an image of this FloatingItem.
|
|
663
|
+
|
|
664
|
+
In particular, when self.image is None, the function returns None if this
|
|
665
|
+
FloatingItem has no valid provenance or the doc does not contain a valid image
|
|
666
|
+
for the required page.
|
|
667
|
+
"""
|
|
668
|
+
if self.image is not None:
|
|
669
|
+
return self.image.pil_image
|
|
670
|
+
return super().get_image(doc=doc)
|
|
671
|
+
|
|
636
672
|
|
|
637
673
|
class PictureItem(FloatingItem):
|
|
638
674
|
"""PictureItem."""
|
|
@@ -1255,7 +1291,10 @@ class DoclingDocument(BaseModel):
|
|
|
1255
1291
|
# If the child is a NodeItem, recursively traverse it
|
|
1256
1292
|
if not isinstance(child, PictureItem) or traverse_pictures:
|
|
1257
1293
|
yield from self.iterate_items(
|
|
1258
|
-
child,
|
|
1294
|
+
child,
|
|
1295
|
+
_level=_level + 1,
|
|
1296
|
+
with_groups=with_groups,
|
|
1297
|
+
page_no=page_no,
|
|
1259
1298
|
)
|
|
1260
1299
|
|
|
1261
1300
|
def print_element_tree(self):
|
|
@@ -1281,11 +1320,12 @@ class DoclingDocument(BaseModel):
|
|
|
1281
1320
|
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
1282
1321
|
indent: int = 4,
|
|
1283
1322
|
text_width: int = -1,
|
|
1323
|
+
page_no: Optional[int] = None,
|
|
1284
1324
|
) -> str:
|
|
1285
1325
|
r"""Serialize to Markdown.
|
|
1286
1326
|
|
|
1287
|
-
Operates on a slice of the document's
|
|
1288
|
-
|
|
1327
|
+
Operates on a slice of the document's body as defined through arguments
|
|
1328
|
+
from_element and to_element; defaulting to the whole document.
|
|
1289
1329
|
|
|
1290
1330
|
:param delim: Delimiter to use when concatenating the various
|
|
1291
1331
|
Markdown parts. Defaults to "\n\n".
|
|
@@ -1294,11 +1334,9 @@ class DoclingDocument(BaseModel):
|
|
|
1294
1334
|
Defaults to 0.
|
|
1295
1335
|
:type from_element: int
|
|
1296
1336
|
:param to_element: Body slicing stop index
|
|
1297
|
-
(exclusive). Defaults to
|
|
1298
|
-
:type to_element:
|
|
1337
|
+
(exclusive). Defaults to maxint.
|
|
1338
|
+
:type to_element: int
|
|
1299
1339
|
:param delim: str: (Default value = "\n\n")
|
|
1300
|
-
:param from_element: int: (Default value = 0)
|
|
1301
|
-
:param to_element: Optional[int]: (Default value = None)
|
|
1302
1340
|
:param labels: set[DocItemLabel]
|
|
1303
1341
|
:param "subtitle-level-1":
|
|
1304
1342
|
:param "paragraph":
|
|
@@ -1306,7 +1344,6 @@ class DoclingDocument(BaseModel):
|
|
|
1306
1344
|
:param "table":
|
|
1307
1345
|
:param "Text":
|
|
1308
1346
|
:param "text":
|
|
1309
|
-
:param ]:
|
|
1310
1347
|
:param strict_text: bool: (Default value = False)
|
|
1311
1348
|
:param image_placeholder str: (Default value = "<!-- image -->")
|
|
1312
1349
|
the placeholder to include to position images in the markdown.
|
|
@@ -1320,7 +1357,7 @@ class DoclingDocument(BaseModel):
|
|
|
1320
1357
|
in_list = False # Track if we're currently processing list items
|
|
1321
1358
|
|
|
1322
1359
|
for ix, (item, level) in enumerate(
|
|
1323
|
-
self.iterate_items(self.body, with_groups=True)
|
|
1360
|
+
self.iterate_items(self.body, with_groups=True, page_no=page_no)
|
|
1324
1361
|
):
|
|
1325
1362
|
# If we've moved to a lower level, we're exiting one or more groups
|
|
1326
1363
|
if level < previous_level:
|
|
@@ -1331,7 +1368,7 @@ class DoclingDocument(BaseModel):
|
|
|
1331
1368
|
|
|
1332
1369
|
previous_level = level # Update previous_level for next iteration
|
|
1333
1370
|
|
|
1334
|
-
if ix < from_element
|
|
1371
|
+
if ix < from_element or to_element <= ix:
|
|
1335
1372
|
continue # skip as many items as you want
|
|
1336
1373
|
|
|
1337
1374
|
# Handle newlines between different types of content
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.3.1 → docling_core-2.4.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.3.1 → docling_core-2.4.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.3.1 → docling_core-2.4.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|