docling-core 2.42.0__tar.gz → 2.43.1__tar.gz

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (109)
  1. {docling_core-2.42.0 → docling_core-2.43.1}/PKG-INFO +1 -1
  2. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/__init__.py +1 -0
  3. docling_core-2.43.1/docling_core/transforms/chunker/page_chunker.py +59 -0
  4. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/serializer/base.py +10 -0
  5. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/serializer/common.py +1 -1
  6. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/serializer/html.py +17 -0
  7. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/visualizer/layout_visualizer.py +3 -1
  8. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/doc/document.py +11 -7
  9. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core.egg-info/PKG-INFO +1 -1
  10. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core.egg-info/SOURCES.txt +2 -0
  11. {docling_core-2.42.0 → docling_core-2.43.1}/pyproject.toml +1 -1
  12. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_base.py +2 -2
  13. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_docling_doc.py +53 -11
  14. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_doctags_load.py +8 -4
  15. docling_core-2.43.1/test/test_page_chunker.py +36 -0
  16. {docling_core-2.42.0 → docling_core-2.43.1}/LICENSE +0 -0
  17. {docling_core-2.42.0 → docling_core-2.43.1}/README.md +0 -0
  18. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/__init__.py +0 -0
  19. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/cli/__init__.py +0 -0
  20. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/cli/view.py +0 -0
  21. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/experimental/__init__.py +0 -0
  22. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/py.typed +0 -0
  23. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
  24. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
  25. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  26. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
  27. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  28. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  29. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  30. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  31. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/search/__init__.py +0 -0
  32. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  33. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/search/mapping.py +0 -0
  34. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/search/meta.py +0 -0
  35. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/search/package.py +0 -0
  36. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/__init__.py +0 -0
  37. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/base.py +0 -0
  38. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  39. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  40. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
  41. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
  42. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
  43. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
  44. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/serializer/__init__.py +0 -0
  45. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/serializer/doctags.py +0 -0
  46. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/serializer/html_styles.py +0 -0
  47. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/serializer/markdown.py +0 -0
  48. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/visualizer/__init__.py +0 -0
  49. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/visualizer/base.py +0 -0
  50. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
  51. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
  52. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/__init__.py +0 -0
  53. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/base.py +0 -0
  54. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/doc/__init__.py +0 -0
  55. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/doc/base.py +0 -0
  56. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/doc/labels.py +0 -0
  57. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/doc/page.py +0 -0
  58. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/doc/tokens.py +0 -0
  59. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/doc/utils.py +0 -0
  60. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/gen/__init__.py +0 -0
  61. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/gen/generic.py +0 -0
  62. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/io/__init__.py +0 -0
  63. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/legacy_doc/__init__.py +0 -0
  64. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/legacy_doc/base.py +0 -0
  65. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  66. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  67. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  68. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/legacy_doc/document.py +0 -0
  69. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/legacy_doc/tokens.py +0 -0
  70. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/nlp/__init__.py +0 -0
  71. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/nlp/qa.py +0 -0
  72. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/nlp/qa_labels.py +0 -0
  73. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/rec/__init__.py +0 -0
  74. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/rec/attribute.py +0 -0
  75. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/rec/base.py +0 -0
  76. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/rec/predicate.py +0 -0
  77. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/rec/record.py +0 -0
  78. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/rec/statement.py +0 -0
  79. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/rec/subject.py +0 -0
  80. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/utils/__init__.py +0 -0
  81. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/utils/alias.py +0 -0
  82. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/utils/file.py +0 -0
  83. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/utils/generate_docs.py +0 -0
  84. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/utils/generate_jsonschema.py +0 -0
  85. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/utils/legacy.py +0 -0
  86. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/utils/validate.py +0 -0
  87. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/utils/validators.py +0 -0
  88. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core.egg-info/dependency_links.txt +0 -0
  89. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core.egg-info/entry_points.txt +0 -0
  90. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core.egg-info/requires.txt +0 -0
  91. {docling_core-2.42.0 → docling_core-2.43.1}/docling_core.egg-info/top_level.txt +0 -0
  92. {docling_core-2.42.0 → docling_core-2.43.1}/setup.cfg +0 -0
  93. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_collection.py +0 -0
  94. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_data_gen_flag.py +0 -0
  95. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_doc_base.py +0 -0
  96. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_doc_legacy_convert.py +0 -0
  97. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_doc_schema.py +0 -0
  98. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_doc_schema_extractor.py +0 -0
  99. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_hierarchical_chunker.py +0 -0
  100. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_hybrid_chunker.py +0 -0
  101. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_json_schema_to_search_mapper.py +0 -0
  102. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_nlp_qa.py +0 -0
  103. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_otsl_table_export.py +0 -0
  104. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_page.py +0 -0
  105. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_rec_schema.py +0 -0
  106. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_search_meta.py +0 -0
  107. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_serialization.py +0 -0
  108. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_utils.py +0 -0
  109. {docling_core-2.42.0 → docling_core-2.43.1}/test/test_visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.42.0
3
+ Version: 2.43.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -11,3 +11,4 @@ from docling_core.transforms.chunker.hierarchical_chunker import (
11
11
  DocMeta,
12
12
  HierarchicalChunker,
13
13
  )
14
+ from docling_core.transforms.chunker.page_chunker import PageChunker
@@ -0,0 +1,59 @@
1
+ """Page-based chunker implementation: each chunk corresponds to a single page."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Iterator
6
+
7
+ from pydantic import ConfigDict
8
+ from typing_extensions import override
9
+
10
+ from docling_core.transforms.chunker import BaseChunker, DocChunk, DocMeta
11
+ from docling_core.transforms.chunker.hierarchical_chunker import (
12
+ ChunkingSerializerProvider,
13
+ )
14
+ from docling_core.types import DoclingDocument as DLDocument
15
+
16
+
17
+ class PageChunker(BaseChunker):
18
+ r"""Chunker implementation that yields one chunk per page."""
19
+
20
+ model_config = ConfigDict(arbitrary_types_allowed=True)
21
+
22
+ serializer_provider: ChunkingSerializerProvider = ChunkingSerializerProvider()
23
+
24
+ @override
25
+ def chunk(
26
+ self,
27
+ dl_doc: DLDocument,
28
+ **kwargs: Any,
29
+ ) -> Iterator[DocChunk]:
30
+ """Chunk the provided document by page."""
31
+ my_doc_ser = self.serializer_provider.get_serializer(doc=dl_doc)
32
+ if dl_doc.pages:
33
+ # chunk by page
34
+ for page_no in sorted(dl_doc.pages.keys()):
35
+ ser_res = my_doc_ser.serialize(pages={page_no})
36
+ if not ser_res.text:
37
+ continue
38
+ yield DocChunk(
39
+ text=ser_res.text,
40
+ meta=DocMeta(
41
+ doc_items=ser_res.get_unique_doc_items(),
42
+ headings=None,
43
+ captions=None,
44
+ origin=dl_doc.origin,
45
+ ),
46
+ )
47
+ else:
48
+ # if no pages, treat whole document as single chunk
49
+ ser_res = my_doc_ser.serialize()
50
+ if ser_res.text:
51
+ yield DocChunk(
52
+ text=ser_res.text,
53
+ meta=DocMeta(
54
+ doc_items=ser_res.get_unique_doc_items(),
55
+ headings=None,
56
+ captions=None,
57
+ origin=dl_doc.origin,
58
+ ),
59
+ )
@@ -39,6 +39,16 @@ class SerializationResult(BaseModel):
39
39
  spans: list[Span] = []
40
40
  # group: Optional[GroupItem] = None # set when result reflects specific group item
41
41
 
42
+ def get_unique_doc_items(self) -> list[DocItem]:
43
+ """Get the doc items corresponding to this result."""
44
+ seen_doc_item_refs: set[str] = set()
45
+ doc_items: list[DocItem] = []
46
+ for span in self.spans:
47
+ if span.item.self_ref not in seen_doc_item_refs:
48
+ seen_doc_item_refs.add(span.item.self_ref)
49
+ doc_items.append(span.item)
50
+ return doc_items
51
+
42
52
 
43
53
  class BaseTextSerializer(ABC):
44
54
  """Base class for text item serializers."""
@@ -285,7 +285,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
285
285
 
286
286
  def _serialize_body(self, **kwargs) -> SerializationResult:
287
287
  """Serialize the document body."""
288
- subparts = self.get_parts()
288
+ subparts = self.get_parts(**kwargs)
289
289
  res = self.serialize_doc(parts=subparts, **kwargs)
290
290
  return res
291
291
 
@@ -713,6 +713,23 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
713
713
  **kwargs,
714
714
  )
715
715
 
716
+ # Append nested list to parent list item:
717
+ i = 0
718
+ while i < len(parts):
719
+ prt = parts[i]
720
+ if prt.text.startswith(("<ul>", "<ol>")):
721
+ for j in range(i - 1, -1, -1):
722
+ if parts[j].text.startswith(("<li>", "<li ")) and parts[
723
+ j
724
+ ].text.endswith("</li>"):
725
+ before, _, _ = parts[j].text.rpartition("</li>")
726
+ parts[j].text = f"{before}\n{prt.text}\n</li>"
727
+ break
728
+ if j > -1:
729
+ parts.pop(i)
730
+ else:
731
+ i += 1
732
+
716
733
  # Add all child parts
717
734
  text_res = "\n".join(
718
735
  [
@@ -148,7 +148,9 @@ class LayoutVisualizer(BaseVisualizer):
148
148
  prev_image = None
149
149
  prev_page_nr = None
150
150
  for idx, (elem, _) in enumerate(
151
- doc.iterate_items(included_content_layers=included_content_layers)
151
+ doc.iterate_items(
152
+ included_content_layers=included_content_layers, traverse_pictures=True
153
+ )
152
154
  ):
153
155
  if not isinstance(elem, DocItem):
154
156
  continue
@@ -4098,7 +4098,10 @@ class DoclingDocument(BaseModel):
4098
4098
  return result
4099
4099
 
4100
4100
  def _with_pictures_refs(
4101
- self, image_dir: Path, reference_path: Optional[Path] = None
4101
+ self,
4102
+ image_dir: Path,
4103
+ page_no: Optional[int],
4104
+ reference_path: Optional[Path] = None,
4102
4105
  ) -> "DoclingDocument":
4103
4106
  """Document with images as refs.
4104
4107
 
@@ -4111,7 +4114,7 @@ class DoclingDocument(BaseModel):
4111
4114
  image_dir.mkdir(parents=True, exist_ok=True)
4112
4115
 
4113
4116
  if image_dir.is_dir():
4114
- for item, level in result.iterate_items(with_groups=False):
4117
+ for item, level in result.iterate_items(page_no=page_no, with_groups=False):
4115
4118
  if isinstance(item, PictureItem):
4116
4119
 
4117
4120
  if (
@@ -4211,7 +4214,7 @@ class DoclingDocument(BaseModel):
4211
4214
  os.makedirs(artifacts_dir, exist_ok=True)
4212
4215
 
4213
4216
  new_doc = self._make_copy_with_refmode(
4214
- artifacts_dir, image_mode, reference_path=reference_path
4217
+ artifacts_dir, image_mode, page_no=None, reference_path=reference_path
4215
4218
  )
4216
4219
 
4217
4220
  out = new_doc.export_to_dict(
@@ -4254,7 +4257,7 @@ class DoclingDocument(BaseModel):
4254
4257
  os.makedirs(artifacts_dir, exist_ok=True)
4255
4258
 
4256
4259
  new_doc = self._make_copy_with_refmode(
4257
- artifacts_dir, image_mode, reference_path=reference_path
4260
+ artifacts_dir, image_mode, page_no=None, reference_path=reference_path
4258
4261
  )
4259
4262
 
4260
4263
  out = new_doc.export_to_dict(
@@ -4327,7 +4330,7 @@ class DoclingDocument(BaseModel):
4327
4330
  os.makedirs(artifacts_dir, exist_ok=True)
4328
4331
 
4329
4332
  new_doc = self._make_copy_with_refmode(
4330
- artifacts_dir, image_mode, reference_path=reference_path
4333
+ artifacts_dir, image_mode, page_no, reference_path=reference_path
4331
4334
  )
4332
4335
 
4333
4336
  md_out = new_doc.export_to_markdown(
@@ -4503,7 +4506,7 @@ class DoclingDocument(BaseModel):
4503
4506
  os.makedirs(artifacts_dir, exist_ok=True)
4504
4507
 
4505
4508
  new_doc = self._make_copy_with_refmode(
4506
- artifacts_dir, image_mode, reference_path=reference_path
4509
+ artifacts_dir, image_mode, page_no, reference_path=reference_path
4507
4510
  )
4508
4511
 
4509
4512
  html_out = new_doc.export_to_html(
@@ -4542,6 +4545,7 @@ class DoclingDocument(BaseModel):
4542
4545
  self,
4543
4546
  artifacts_dir: Path,
4544
4547
  image_mode: ImageRefMode,
4548
+ page_no: Optional[int],
4545
4549
  reference_path: Optional[Path] = None,
4546
4550
  ):
4547
4551
  new_doc = None
@@ -4549,7 +4553,7 @@ class DoclingDocument(BaseModel):
4549
4553
  new_doc = self
4550
4554
  elif image_mode == ImageRefMode.REFERENCED:
4551
4555
  new_doc = self._with_pictures_refs(
4552
- image_dir=artifacts_dir, reference_path=reference_path
4556
+ image_dir=artifacts_dir, page_no=page_no, reference_path=reference_path
4553
4557
  )
4554
4558
  elif image_mode == ImageRefMode.EMBEDDED:
4555
4559
  new_doc = self._with_embedded_pictures()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.42.0
3
+ Version: 2.43.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -30,6 +30,7 @@ docling_core/transforms/chunker/__init__.py
30
30
  docling_core/transforms/chunker/base.py
31
31
  docling_core/transforms/chunker/hierarchical_chunker.py
32
32
  docling_core/transforms/chunker/hybrid_chunker.py
33
+ docling_core/transforms/chunker/page_chunker.py
33
34
  docling_core/transforms/chunker/tokenizer/__init__.py
34
35
  docling_core/transforms/chunker/tokenizer/base.py
35
36
  docling_core/transforms/chunker/tokenizer/huggingface.py
@@ -98,6 +99,7 @@ test/test_json_schema_to_search_mapper.py
98
99
  test/test_nlp_qa.py
99
100
  test/test_otsl_table_export.py
100
101
  test/test_page.py
102
+ test/test_page_chunker.py
101
103
  test/test_rec_schema.py
102
104
  test/test_search_meta.py
103
105
  test/test_serialization.py
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docling-core"
3
- version = "2.42.0" # DO NOT EDIT, updated automatically
3
+ version = "2.43.1" # DO NOT EDIT, updated automatically
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  license-files = ["LICENSE"]
@@ -36,8 +36,8 @@ def test_identifier():
36
36
  )
37
37
 
38
38
  # schema_json(): no need to set by_alias since it is True by the default
39
- tf = open("test/data/json_schemas/base_identifier.json", encoding="utf-8")
40
- gold_json = json.load(tf)
39
+ with open("test/data/json_schemas/base_identifier.json", encoding="utf-8") as tf:
40
+ gold_json = json.load(tf)
41
41
 
42
42
  assert Identifier.model_json_schema() == gold_json
43
43
 
@@ -1349,12 +1349,51 @@ def test_save_pictures():
1349
1349
 
1350
1350
  doc: DoclingDocument = _construct_doc()
1351
1351
 
1352
- new_doc = doc._with_pictures_refs(image_dir=Path("./test/data/constructed_images/"))
1352
+ new_doc = doc._with_pictures_refs(
1353
+ image_dir=Path("./test/data/constructed_images/"), page_no=None
1354
+ )
1353
1355
 
1354
1356
  img_paths = new_doc._list_images_on_disk()
1355
1357
  assert len(img_paths) == 1, "len(img_paths)!=1"
1356
1358
 
1357
1359
 
1360
+ def test_save_pictures_with_page():
1361
+ # Given
1362
+ doc = DoclingDocument(name="Dummy")
1363
+
1364
+ doc.add_page(page_no=1, size=Size(width=2000, height=4000), image=None)
1365
+ doc.add_page(
1366
+ page_no=2,
1367
+ size=Size(width=2000, height=4000),
1368
+ )
1369
+ image = PILImage.new(mode="RGB", size=(200, 400), color=(0, 0, 0))
1370
+ doc.add_picture(
1371
+ image=ImageRef.from_pil(image=image, dpi=72),
1372
+ prov=ProvenanceItem(
1373
+ page_no=2,
1374
+ bbox=BoundingBox(
1375
+ b=0, l=0, r=200, t=400, coord_origin=CoordOrigin.BOTTOMLEFT
1376
+ ),
1377
+ charspan=(1, 2),
1378
+ ),
1379
+ )
1380
+
1381
+ # When
1382
+ with_ref = doc._with_pictures_refs(
1383
+ image_dir=Path("./test/data/constructed_images/"), page_no=1
1384
+ )
1385
+ # Then
1386
+ n_images = len(with_ref._list_images_on_disk())
1387
+ assert n_images == 0
1388
+ # When
1389
+ with_ref = with_ref._with_pictures_refs(
1390
+ image_dir=Path("./test/data/constructed_images/"), page_no=2
1391
+ )
1392
+ n_images = len(with_ref._list_images_on_disk())
1393
+ # Then
1394
+ assert n_images == 1
1395
+
1396
+
1358
1397
  def _normalise_string_wrt_filepaths(instr: str, paths: List[Path]):
1359
1398
 
1360
1399
  for p in paths:
@@ -1406,7 +1445,8 @@ def test_save_to_disk():
1406
1445
  image_dir = Path("./test/data/doc/constructed_images/")
1407
1446
 
1408
1447
  doc_with_references = doc._with_pictures_refs(
1409
- image_dir=image_dir # Path("./test/data/constructed_images/")
1448
+ image_dir=image_dir, # Path("./test/data/constructed_images/")
1449
+ page_no=None,
1410
1450
  )
1411
1451
 
1412
1452
  # paths will be different on different machines, so needs to be kept!
@@ -1779,9 +1819,10 @@ def test_document_manipulation():
1779
1819
 
1780
1820
  # Test the handling of list items in insert_* methods, both with and without parent groups
1781
1821
 
1782
- li_sibling = doc.insert_list_item(
1783
- sibling=node, text="Inserted List Item, Incorrect Parent", after=False
1784
- )
1822
+ with pytest.warns(DeprecationWarning, match="ListItem parent must be a ListGroup"):
1823
+ li_sibling = doc.insert_list_item(
1824
+ sibling=node, text="Inserted List Item, Incorrect Parent", after=False
1825
+ )
1785
1826
  doc.insert_list_item(
1786
1827
  sibling=li_sibling, text="Inserted List Item, Correct Parent", after=True
1787
1828
  )
@@ -1791,12 +1832,13 @@ def test_document_manipulation():
1791
1832
  text="Inserted Text with LIST_ITEM Label, Correct Parent",
1792
1833
  after=False,
1793
1834
  )
1794
- doc.insert_text(
1795
- sibling=node,
1796
- label=DocItemLabel.LIST_ITEM,
1797
- text="Inserted Text with LIST_ITEM Label, Incorrect Parent",
1798
- after=True,
1799
- )
1835
+ with pytest.warns(DeprecationWarning, match="ListItem parent must be a ListGroup"):
1836
+ doc.insert_text(
1837
+ sibling=node,
1838
+ label=DocItemLabel.LIST_ITEM,
1839
+ text="Inserted Text with LIST_ITEM Label, Incorrect Parent",
1840
+ after=True,
1841
+ )
1800
1842
 
1801
1843
  filename = Path(
1802
1844
  "test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json"
@@ -60,7 +60,8 @@ def test_doctags_load_from_files():
60
60
 
61
61
  def test_doctags_load_from_memory():
62
62
 
63
- doctags = Path("test/data/doc/page_with_pic.dt").open("r").read()
63
+ with Path("test/data/doc/page_with_pic.dt").open() as file:
64
+ doctags = file.read()
64
65
  image = PILImage.open(Path("test/data/doc/page_with_pic.png"))
65
66
 
66
67
  doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
@@ -75,7 +76,8 @@ def test_doctags_load_from_memory():
75
76
 
76
77
 
77
78
  def test_doctags_load_without_image():
78
- doctags = Path("test/data/doc/page_with_pic.dt").open("r").read()
79
+ with Path("test/data/doc/page_with_pic.dt").open() as file:
80
+ doctags = file.read()
79
81
  doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], None)
80
82
  doc = DoclingDocument.load_from_doctags(doctags_doc)
81
83
  exp = "test/data/doc/page_without_pic.dt.json"
@@ -86,7 +88,8 @@ def test_doctags_load_without_image():
86
88
 
87
89
 
88
90
  def test_doctags_load_for_kv_region():
89
- doctags = Path("test/data/doc/doc_with_kv.dt").open("r").read()
91
+ with Path("test/data/doc/doc_with_kv.dt").open() as file:
92
+ doctags = file.read()
90
93
  image = PILImage.open(Path("test/data/doc/doc_with_kv.png"))
91
94
  doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
92
95
  doc = DoclingDocument.load_from_doctags(doctags_doc)
@@ -98,7 +101,8 @@ def test_doctags_load_for_kv_region():
98
101
 
99
102
 
100
103
  def test_multipage_doctags_load():
101
- doctags = Path("test/data/doc/2206.01062.yaml.dt").open("r").read()
104
+ with Path("test/data/doc/2206.01062.yaml.dt").open() as file:
105
+ doctags = file.read()
102
106
  doctags_doc = DocTagsDocument.from_multipage_doctags_and_images(doctags, None)
103
107
  doc = DoclingDocument.load_from_doctags(doctags_doc)
104
108
  exp = "test/data/doc/2206.01062.yaml.dt.json"
@@ -0,0 +1,36 @@
1
+ import json
2
+ from pathlib import Path
3
+
4
+ from docling_core.transforms.chunker.hierarchical_chunker import DocChunk
5
+ from docling_core.transforms.chunker.page_chunker import PageChunker
6
+ from docling_core.types.doc.document import DoclingDocument
7
+
8
+ from .test_data_gen_flag import GEN_TEST_DATA
9
+
10
+
11
+ def _process(act_data, exp_path_str):
12
+ if GEN_TEST_DATA:
13
+ with open(exp_path_str, mode="w", encoding="utf-8") as f:
14
+ json.dump(act_data, fp=f, indent=4)
15
+ f.write("\n")
16
+ else:
17
+ with open(exp_path_str, encoding="utf-8") as f:
18
+ exp_data = json.load(fp=f)
19
+ assert exp_data == act_data
20
+
21
+
22
+ def test_page_chunks():
23
+ src = Path("./test/data/doc/cross_page_lists.json")
24
+ doc = DoclingDocument.load_from_json(src)
25
+
26
+ chunker = PageChunker()
27
+
28
+ chunk_iter = chunker.chunk(dl_doc=doc)
29
+ chunks = list(chunk_iter)
30
+ act_data = dict(
31
+ root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]
32
+ )
33
+ _process(
34
+ act_data=act_data,
35
+ exp_path_str=src.parent / f"{src.stem}_chunks.json",
36
+ )
File without changes
File without changes
File without changes