docling-core 2.24.0__tar.gz → 2.25.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (69)
  1. {docling_core-2.24.0 → docling_core-2.25.0}/PKG-INFO +1 -1
  2. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/doc/base.py +4 -1
  3. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/doc/document.py +36 -8
  4. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/doc/page.py +10 -3
  5. {docling_core-2.24.0 → docling_core-2.25.0}/pyproject.toml +1 -1
  6. {docling_core-2.24.0 → docling_core-2.25.0}/LICENSE +0 -0
  7. {docling_core-2.24.0 → docling_core-2.25.0}/README.md +0 -0
  8. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/__init__.py +0 -0
  9. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/cli/__init__.py +0 -0
  10. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/cli/view.py +0 -0
  11. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/experimental/__init__.py +0 -0
  12. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/experimental/serializer/__init__.py +0 -0
  13. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/experimental/serializer/base.py +0 -0
  14. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/experimental/serializer/common.py +0 -0
  15. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/experimental/serializer/doctags.py +0 -0
  16. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/experimental/serializer/markdown.py +0 -0
  17. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/py.typed +0 -0
  18. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  19. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  20. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  21. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  22. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  23. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  24. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  25. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  26. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/search/__init__.py +0 -0
  27. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  28. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/search/mapping.py +0 -0
  29. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/search/meta.py +0 -0
  30. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/search/package.py +0 -0
  31. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/transforms/__init__.py +0 -0
  32. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/transforms/chunker/__init__.py +0 -0
  33. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/transforms/chunker/base.py +0 -0
  34. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  35. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  36. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/__init__.py +0 -0
  37. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/base.py +0 -0
  38. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/doc/__init__.py +0 -0
  39. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/doc/labels.py +0 -0
  40. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/doc/tokens.py +0 -0
  41. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/doc/utils.py +0 -0
  42. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/gen/__init__.py +0 -0
  43. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/gen/generic.py +0 -0
  44. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/io/__init__.py +0 -0
  45. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  46. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/legacy_doc/base.py +0 -0
  47. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  48. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  49. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  50. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/legacy_doc/document.py +0 -0
  51. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  52. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/nlp/__init__.py +0 -0
  53. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/nlp/qa.py +0 -0
  54. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/nlp/qa_labels.py +0 -0
  55. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/rec/__init__.py +0 -0
  56. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/rec/attribute.py +0 -0
  57. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/rec/base.py +0 -0
  58. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/rec/predicate.py +0 -0
  59. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/rec/record.py +0 -0
  60. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/rec/statement.py +0 -0
  61. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/types/rec/subject.py +0 -0
  62. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/utils/__init__.py +0 -0
  63. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/utils/alias.py +0 -0
  64. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/utils/file.py +0 -0
  65. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/utils/generate_docs.py +0 -0
  66. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/utils/generate_jsonschema.py +0 -0
  67. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/utils/legacy.py +0 -0
  68. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/utils/validate.py +0 -0
  69. {docling_core-2.24.0 → docling_core-2.25.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.24.0
3
+ Version: 2.25.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://github.com/docling-project
6
6
  License: MIT
@@ -182,7 +182,10 @@ class BoundingBox(BaseModel):
182
182
  ) -> float:
183
183
  """intersection_over_self."""
184
184
  intersection_area = self.intersection_area_with(other=other)
185
- return intersection_area / self.area()
185
+ if self.area() > 0:
186
+ return intersection_area / self.area()
187
+ else:
188
+ return 0.0
186
189
 
187
190
  def to_bottom_left_origin(self, page_height: float) -> "BoundingBox":
188
191
  """to_bottom_left_origin.
@@ -542,25 +542,32 @@ class DocTagsDocument(BaseModel):
542
542
 
543
543
  @classmethod
544
544
  def from_doctags_and_image_pairs(
545
- cls, doctags: List[Union[Path, str]], images: List[Union[Path, PILImage.Image]]
545
+ cls,
546
+ doctags: typing.Sequence[Union[Path, str]],
547
+ images: Optional[List[Union[Path, PILImage.Image]]],
546
548
  ):
547
549
  """from_doctags_and_image_pairs."""
548
- if len(doctags) != len(images):
550
+ if images is not None and len(doctags) != len(images):
549
551
  raise ValueError("Number of page doctags must be equal to page images!")
550
552
  doctags_doc = cls()
551
553
 
552
554
  pages = []
553
- for dt, img in zip(doctags, images):
555
+
556
+ for ix, dt in enumerate(doctags):
554
557
  if isinstance(dt, Path):
555
558
  with dt.open("r") as fp:
556
559
  dt = fp.read()
557
560
  elif isinstance(dt, str):
558
561
  pass
559
562
 
560
- if isinstance(img, Path):
561
- img = PILImage.open(img)
562
- elif isinstance(dt, PILImage.Image):
563
- pass
563
+ img = None
564
+ if images is not None:
565
+ img = images[ix]
566
+
567
+ if isinstance(img, Path):
568
+ img = PILImage.open(img)
569
+ elif isinstance(img, PILImage.Image):
570
+ pass
564
571
 
565
572
  page = DocTagsPage(tokens=dt, image=img)
566
573
  pages.append(page)
@@ -568,6 +575,25 @@ class DocTagsDocument(BaseModel):
568
575
  doctags_doc.pages = pages
569
576
  return doctags_doc
570
577
 
578
+ @classmethod
579
+ def from_multipage_doctags_and_images(
580
+ cls,
581
+ doctags: Union[Path, str],
582
+ images: Optional[List[Union[Path, PILImage.Image]]],
583
+ ):
584
+ """From doctags with `<page_break>` and corresponding list of page images."""
585
+ if isinstance(doctags, Path):
586
+ with doctags.open("r") as fp:
587
+ doctags = fp.read()
588
+ dt_list = (
589
+ doctags.removeprefix(f"<{DocumentToken.DOCUMENT.value}>")
590
+ .removesuffix(f"</{DocumentToken.DOCUMENT.value}>")
591
+ .split(f"<{DocumentToken.PAGE_BREAK.value}>")
592
+ )
593
+ dt_list = [el.strip() for el in dt_list]
594
+
595
+ return cls.from_doctags_and_image_pairs(dt_list, images)
596
+
571
597
 
572
598
  class ProvenanceItem(BaseModel):
573
599
  """ProvenanceItem."""
@@ -722,7 +748,9 @@ class TextItem(DocItem):
722
748
  text: str # sanitized representation
723
749
 
724
750
  formatting: Optional[Formatting] = None
725
- hyperlink: Optional[Union[AnyUrl, Path]] = None
751
+ hyperlink: Optional[Union[AnyUrl, Path]] = Field(
752
+ union_mode="left_to_right", default=None
753
+ )
726
754
 
727
755
  @deprecated("Use export_to_doctags() instead.")
728
756
  def export_to_document_tokens(self, *args, **kwargs):
@@ -1,5 +1,6 @@
1
1
  """Datastructures for PaginatedDocument."""
2
2
 
3
+ import copy
3
4
  import json
4
5
  import logging
5
6
  import math
@@ -530,10 +531,16 @@ class SegmentedPdfPage(SegmentedPage):
530
531
  """
531
532
  cells = []
532
533
  for page_cell in self.iterate_cells(cell_unit):
533
- cell_bbox = page_cell.to_bounding_box()
534
+ pc = copy.deepcopy(page_cell)
535
+ # Bring cell_bbox coord origin to the same as input bbox.coord_origin:
536
+ if page_cell.rect.coord_origin != bbox.coord_origin:
537
+ if bbox.coord_origin == CoordOrigin.TOPLEFT:
538
+ pc.rect = pc.rect.to_top_left_origin(self.dimension.height)
539
+ elif bbox.coord_origin == CoordOrigin.BOTTOMLEFT:
540
+ pc.rect = pc.rect.to_bottom_left_origin(self.dimension.height)
541
+ cell_bbox = pc.to_bounding_box()
534
542
  if cell_bbox.intersection_over_self(bbox) > ios:
535
- cells.append(page_cell)
536
-
543
+ cells.append(pc)
537
544
  return cells
538
545
 
539
546
  def export_to_dict(self) -> Dict:
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.24.0"
3
+ version = "2.25.0"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes
File without changes