docling-core 2.3.2__tar.gz → 2.4.1__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (55)
  1. {docling_core-2.3.2 → docling_core-2.4.1}/PKG-INFO +2 -2
  2. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/doc/document.py +42 -2
  3. {docling_core-2.3.2 → docling_core-2.4.1}/pyproject.toml +2 -2
  4. {docling_core-2.3.2 → docling_core-2.4.1}/LICENSE +0 -0
  5. {docling_core-2.3.2 → docling_core-2.4.1}/README.md +0 -0
  6. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/__init__.py +0 -0
  7. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/py.typed +0 -0
  8. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
  9. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
  10. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  11. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
  12. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  13. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  14. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  15. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  16. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/search/__init__.py +0 -0
  17. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  18. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/search/mapping.py +0 -0
  19. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/search/meta.py +0 -0
  20. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/search/package.py +0 -0
  21. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/transforms/__init__.py +0 -0
  22. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/transforms/chunker/__init__.py +0 -0
  23. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/transforms/chunker/base.py +0 -0
  24. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  25. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/__init__.py +0 -0
  26. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/base.py +0 -0
  27. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/doc/__init__.py +0 -0
  28. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/doc/base.py +0 -0
  29. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/doc/labels.py +0 -0
  30. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/gen/__init__.py +0 -0
  31. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/gen/generic.py +0 -0
  32. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/legacy_doc/__init__.py +0 -0
  33. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/legacy_doc/base.py +0 -0
  34. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  35. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  36. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  37. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/legacy_doc/document.py +0 -0
  38. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/legacy_doc/tokens.py +0 -0
  39. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/nlp/__init__.py +0 -0
  40. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/nlp/qa.py +0 -0
  41. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/nlp/qa_labels.py +0 -0
  42. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/rec/__init__.py +0 -0
  43. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/rec/attribute.py +0 -0
  44. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/rec/base.py +0 -0
  45. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/rec/predicate.py +0 -0
  46. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/rec/record.py +0 -0
  47. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/rec/statement.py +0 -0
  48. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/types/rec/subject.py +0 -0
  49. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/utils/__init__.py +0 -0
  50. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/utils/alias.py +0 -0
  51. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/utils/file.py +0 -0
  52. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/utils/generate_docs.py +0 -0
  53. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/utils/generate_jsonschema.py +0 -0
  54. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/utils/validate.py +0 -0
  55. {docling_core-2.3.2 → docling_core-2.4.1}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.3.2
3
+ Version: 2.4.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -29,7 +29,7 @@ Requires-Dist: jsonref (>=1.1.0,<2.0.0)
29
29
  Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
30
30
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
31
31
  Requires-Dist: pillow (>=10.3.0,<11.0.0)
32
- Requires-Dist: pydantic (>=2.6.0,<3.0.0)
32
+ Requires-Dist: pydantic (>=2.6.0,<2.10)
33
33
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
34
34
  Project-URL: Repository, https://github.com/DS4SD/docling-core
35
35
  Description-Content-Type: text/markdown
@@ -551,6 +551,28 @@ class DocItem(
551
551
 
552
552
  return location
553
553
 
554
+ def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
555
+ """Returns the image of this DocItem.
556
+
557
+ The function returns None if this DocItem has no valid provenance or
558
+ if a valid image of the page containing this DocItem is not available
559
+ in doc.
560
+ """
561
+ if not len(self.prov):
562
+ return None
563
+
564
+ page = doc.pages.get(self.prov[0].page_no)
565
+ if page is None or page.size is None or page.image is None:
566
+ return None
567
+
568
+ page_image = page.image.pil_image
569
+ crop_bbox = (
570
+ self.prov[0]
571
+ .bbox.to_top_left_origin(page_height=page.size.height)
572
+ .scaled(scale=page_image.height / page.size.height)
573
+ )
574
+ return page_image.crop(crop_bbox.as_tuple())
575
+
554
576
 
555
577
  class TextItem(DocItem):
556
578
  """TextItem."""
@@ -633,6 +655,20 @@ class FloatingItem(DocItem):
633
655
  text += cap.resolve(doc).text
634
656
  return text
635
657
 
658
+ def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
659
+ """Returns the image corresponding to this FloatingItem.
660
+
661
+ This function returns the PIL image from self.image if one is available.
662
+ Otherwise, it uses DocItem.get_image to get an image of this FloatingItem.
663
+
664
+ In particular, when self.image is None, the function returns None if this
665
+ FloatingItem has no valid provenance or the doc does not contain a valid image
666
+ for the required page.
667
+ """
668
+ if self.image is not None:
669
+ return self.image.pil_image
670
+ return super().get_image(doc=doc)
671
+
636
672
 
637
673
  class PictureItem(FloatingItem):
638
674
  """PictureItem."""
@@ -1255,7 +1291,10 @@ class DoclingDocument(BaseModel):
1255
1291
  # If the child is a NodeItem, recursively traverse it
1256
1292
  if not isinstance(child, PictureItem) or traverse_pictures:
1257
1293
  yield from self.iterate_items(
1258
- child, _level=_level + 1, with_groups=with_groups
1294
+ child,
1295
+ _level=_level + 1,
1296
+ with_groups=with_groups,
1297
+ page_no=page_no,
1259
1298
  )
1260
1299
 
1261
1300
  def print_element_tree(self):
@@ -1281,6 +1320,7 @@ class DoclingDocument(BaseModel):
1281
1320
  image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
1282
1321
  indent: int = 4,
1283
1322
  text_width: int = -1,
1323
+ page_no: Optional[int] = None,
1284
1324
  ) -> str:
1285
1325
  r"""Serialize to Markdown.
1286
1326
 
@@ -1317,7 +1357,7 @@ class DoclingDocument(BaseModel):
1317
1357
  in_list = False # Track if we're currently processing list items
1318
1358
 
1319
1359
  for ix, (item, level) in enumerate(
1320
- self.iterate_items(self.body, with_groups=True)
1360
+ self.iterate_items(self.body, with_groups=True, page_no=page_no)
1321
1361
  ):
1322
1362
  # If we've moved to a lower level, we're exiting one or more groups
1323
1363
  if level < previous_level:
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.3.2"
3
+ version = "2.4.1"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
@@ -48,7 +48,7 @@ generate_docs = "docling_core.utils.generate_docs:main"
48
48
  [tool.poetry.dependencies]
49
49
  python = "^3.9"
50
50
  jsonschema = "^4.16.0"
51
- pydantic = "^2.6.0"
51
+ pydantic = ">=2.6.0,<2.10"
52
52
  jsonref = "^1.1.0"
53
53
  tabulate = "^0.9.0"
54
54
  pandas = "^2.1.4"
File without changes
File without changes