docling-core 1.7.0__py3-none-any.whl → 1.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -4,19 +4,41 @@
4
4
  #
5
5
 
6
6
  """Define base classes for chunking."""
7
+ import re
7
8
  from abc import ABC, abstractmethod
8
- from typing import Iterator, Optional
9
+ from typing import Final, Iterator, Optional
9
10
 
10
- from pydantic import BaseModel
11
+ from pydantic import BaseModel, Field, field_validator
11
12
 
12
13
  from docling_core.types import BoundingBox, Document
14
+ from docling_core.types.base import _JSON_POINTER_REGEX
15
+
16
+ # (subset of) JSONPath format, e.g. "$.main-text[84]" (for migration purposes)
17
+ _DEPRECATED_JSON_PATH_PATTERN: Final = re.compile(r"^\$\.([\w-]+)\[(\d+)\]$")
18
+
19
+
20
+ def _create_path(pos: int, path_prefix: str = "main-text") -> str:
21
+ return f"#/{path_prefix}/{pos}"
13
22
 
14
23
 
15
24
  class Chunk(BaseModel):
16
25
  """Data model for Chunk."""
17
26
 
18
- path: str
27
+ path: str = Field(pattern=_JSON_POINTER_REGEX)
19
28
  text: str
29
+ heading: Optional[str] = None
30
+
31
+ @field_validator("path", mode="before")
32
+ @classmethod
33
+ def _json_pointer_from_json_path(cls, path: str):
34
+ if (match := _DEPRECATED_JSON_PATH_PATTERN.match(path)) is not None:
35
+ groups = match.groups()
36
+ if len(groups) == 2 and groups[0] is not None and groups[1] is not None:
37
+ return _create_path(
38
+ pos=int(groups[1]),
39
+ path_prefix=groups[0],
40
+ )
41
+ return path
20
42
 
21
43
 
22
44
  class ChunkWithMetadata(Chunk):
@@ -24,7 +46,6 @@ class ChunkWithMetadata(Chunk):
24
46
 
25
47
  page: Optional[int] = None
26
48
  bbox: Optional[BoundingBox] = None
27
- heading: Optional[str] = None
28
49
 
29
50
 
30
51
  class BaseChunker(BaseModel, ABC):
@@ -44,3 +65,10 @@ class BaseChunker(BaseModel, ABC):
44
65
  Iterator[Chunk]: iterator over extracted chunks
45
66
  """
46
67
  raise NotImplementedError()
68
+
69
+ @classmethod
70
+ def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
71
+ return _create_path(
72
+ pos=pos,
73
+ path_prefix=path_prefix,
74
+ )
@@ -12,7 +12,7 @@ from enum import Enum
12
12
  from typing import Any, Iterator, Optional, Union
13
13
 
14
14
  import pandas as pd
15
- from pydantic import BaseModel, PositiveInt
15
+ from pydantic import BaseModel, Field, PositiveInt
16
16
 
17
17
  from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata
18
18
  from docling_core.types import BaseText
@@ -25,9 +25,17 @@ _logger = logging.getLogger(__name__)
25
25
  class HierarchicalChunker(BaseChunker):
26
26
  """Chunker implementation leveraging the document layout."""
27
27
 
28
- include_metadata: bool = True
29
- heading_as_metadata: bool = False
30
- min_chunk_len: PositiveInt = 64
28
+ heading_as_metadata: bool = Field(
29
+ default=False,
30
+ description="Whether heading should be in metadata (instead of text)",
31
+ )
32
+ include_metadata: bool = Field(
33
+ default=True,
34
+ description="Whether to include extras in the metadata",
35
+ )
36
+ min_chunk_len: PositiveInt = Field(
37
+ default=64, description="Minimum chunk text length to consider (in chars)"
38
+ )
31
39
 
32
40
  class _NodeType(str, Enum):
33
41
  PARAGRAPH = "paragraph"
@@ -75,7 +83,7 @@ class HierarchicalChunker(BaseChunker):
75
83
  nrows = table_df.shape[0]
76
84
  ncols = table_df.shape[1]
77
85
  texts = [
78
- f"{rows[i]}, {cols[j]} = {table_df.iloc[i, j].strip()}"
86
+ f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}"
79
87
  for i in range(1, nrows)
80
88
  for j in range(1, ncols)
81
89
  ]
@@ -83,10 +91,6 @@ class HierarchicalChunker(BaseChunker):
83
91
 
84
92
  return output_text
85
93
 
86
- @classmethod
87
- def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
88
- return f"$.{path_prefix}[{pos}]"
89
-
90
94
  class _MainTextItemNode(BaseModel):
91
95
  parent: Optional[int] = None
92
96
  children: list[int] = []
@@ -304,14 +308,15 @@ class HierarchicalChunker(BaseChunker):
304
308
  return ChunkWithMetadata(
305
309
  text=concat,
306
310
  path=path,
311
+ heading=heading,
307
312
  page=item.prov[0].page if item.prov else None,
308
313
  bbox=item.prov[0].bbox if item.prov else None,
309
- heading=heading,
310
314
  )
311
315
  else:
312
316
  return Chunk(
313
317
  text=concat,
314
318
  path=path,
319
+ heading=heading,
315
320
  )
316
321
  else:
317
322
  return None
@@ -327,11 +332,6 @@ class HierarchicalChunker(BaseChunker):
327
332
  Yields:
328
333
  Iterator[Chunk]: iterator over extracted chunks
329
334
  """
330
- if (not self.include_metadata) and self.heading_as_metadata:
331
- raise RuntimeError(
332
- "To enable `heading_as_metadata`, also `include_metadata` must be True."
333
- )
334
-
335
335
  if dl_doc.main_text:
336
336
  # extract doc structure incl. metadata for
337
337
  # each item (e.g. parent, children)
@@ -6,20 +6,18 @@
6
6
  """Simple metadata extractor module."""
7
7
 
8
8
 
9
- from enum import Enum
10
- from typing import Any
9
+ from typing import Any, Final
11
10
 
12
11
  from docling_core.transforms.metadata_extractor import BaseMetadataExtractor
13
12
  from docling_core.types import Document as DLDocument
14
13
 
14
+ _DL_DOC_HASH: Final[str] = "dl_doc_hash"
15
+ _ORIGIN: Final[str] = "origin"
16
+
15
17
 
16
18
  class SimpleMetadataExtractor(BaseMetadataExtractor):
17
19
  """Simple metadata extractor class."""
18
20
 
19
- class _Keys(str, Enum):
20
- DL_DOC_HASH = "dl_doc_hash"
21
- ORIGIN = "origin"
22
-
23
21
  include_origin: bool = False
24
22
 
25
23
  def get_metadata(
@@ -35,10 +33,10 @@ class SimpleMetadataExtractor(BaseMetadataExtractor):
35
33
  dict[str, Any]: the extracted metadata
36
34
  """
37
35
  meta: dict[str, Any] = {
38
- self._Keys.DL_DOC_HASH: doc.file_info.document_hash,
36
+ _DL_DOC_HASH: doc.file_info.document_hash,
39
37
  }
40
38
  if self.include_origin:
41
- meta[self._Keys.ORIGIN] = origin
39
+ meta[_ORIGIN] = origin
42
40
  return meta
43
41
 
44
42
  def get_excluded_embed_metadata_keys(self) -> list[str]:
@@ -47,9 +45,9 @@ class SimpleMetadataExtractor(BaseMetadataExtractor):
47
45
  Returns:
48
46
  list[str]: the metadata to exclude
49
47
  """
50
- excl_keys: list[str] = [self._Keys.DL_DOC_HASH]
48
+ excl_keys: list[str] = [_DL_DOC_HASH]
51
49
  if self.include_origin:
52
- excl_keys.append(self._Keys.ORIGIN)
50
+ excl_keys.append(_ORIGIN)
53
51
  return excl_keys
54
52
 
55
53
  def get_excluded_llm_metadata_keys(self) -> list[str]:
@@ -6,7 +6,7 @@
6
6
  """Define common models across types."""
7
7
  from datetime import datetime, timezone
8
8
  from enum import Enum
9
- from typing import Generic, Hashable, List, Literal, Optional, TypeVar
9
+ from typing import Final, Generic, Hashable, List, Literal, Optional, TypeVar
10
10
 
11
11
  from pydantic import (
12
12
  AfterValidator,
@@ -28,6 +28,9 @@ from docling_core.search.package import VERSION_PATTERN
28
28
  from docling_core.utils.alias import AliasModel
29
29
  from docling_core.utils.validators import validate_datetime, validate_unique_list
30
30
 
31
+ # (subset of) JSON Pointer URI fragment id format, e.g. "#/main-text/84":
32
+ _JSON_POINTER_REGEX: Final[str] = r"^#(?:/([\w-]+)(?:/(\d+))?)?$"
33
+
31
34
  LanguageT = TypeVar("LanguageT", bound=str)
32
35
  IdentifierTypeT = TypeVar("IdentifierTypeT", bound=str)
33
36
  DescriptionAdvancedT = TypeVar("DescriptionAdvancedT", bound=BaseModel)
@@ -20,6 +20,7 @@ from tabulate import tabulate
20
20
  from typing_extensions import Annotated
21
21
 
22
22
  from docling_core.search.package import VERSION_PATTERN
23
+ from docling_core.types.base import _JSON_POINTER_REGEX
23
24
  from docling_core.types.doc.tokens import DocumentToken
24
25
  from docling_core.types.experimental import BoundingBox, Size
25
26
  from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
@@ -28,9 +29,6 @@ Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
28
29
  LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
29
30
  CURRENT_VERSION: Final = "1.0.0"
30
31
 
31
- # (subset of) JSON Pointer URI fragment identifier format:
32
- _JSON_POINTER_REGEX = r"^#(/[\w\-]+(/\d+)?)?$"
33
-
34
32
 
35
33
  class BasePictureData(BaseModel): # TBD
36
34
  """BasePictureData."""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 1.7.0
3
+ Version: 1.7.2
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -28,7 +28,7 @@ Classifier: Typing :: Typed
28
28
  Requires-Dist: json-schema-for-humans (>=1.0.0,<2.0.0)
29
29
  Requires-Dist: jsonref (>=1.1.0,<2.0.0)
30
30
  Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
31
- Requires-Dist: pandas (>=2.2.2,<3.0.0)
31
+ Requires-Dist: pandas (>=2.1.4,<3.0.0)
32
32
  Requires-Dist: pydantic (>=2.6.0,<3.0.0)
33
33
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
34
34
  Project-URL: Repository, https://github.com/DS4SD/docling-core
@@ -15,17 +15,17 @@ docling_core/search/meta.py,sha256=wSurrsqdP1N3gQKx027fVdzVmc33a7Y6rPl-FClQvtA,3
15
15
  docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75A,1841
16
16
  docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9ACDd57ds,106
17
17
  docling_core/transforms/chunker/__init__.py,sha256=xZ5ELOB8tbCoJY1dKUvOrFqxYyoHmmCNUSHxrrRi8a4,317
18
- docling_core/transforms/chunker/base.py,sha256=8tueNosEHt5J49olPOaw8KgZ_5WXBTBhtHBkNMR0MjA,1018
19
- docling_core/transforms/chunker/hierarchical_chunker.py,sha256=GkUn17E4CG8MNqIdyVmjQz6_Tdy22utBpUs-_aIMrrA,12676
18
+ docling_core/transforms/chunker/base.py,sha256=5EW89CZf4SMB6Eh4yNjzYoNjn8S7oHH8NEpMck3Lcio,2078
19
+ docling_core/transforms/chunker/hierarchical_chunker.py,sha256=DexBEPMR5rqnwrCMi-g98AtLDG7PKyZuf7u3NuXo-tA,12682
20
20
  docling_core/transforms/id_generator/__init__.py,sha256=7UoSyAcLsvw-RRrNjYXRVS4rIOUXjwqVpaQA-SSeINU,379
21
21
  docling_core/transforms/id_generator/base.py,sha256=SufPsaZUfMpuITq7pMv5YtlLmtGTDgA4LWmjmhQuSM0,704
22
22
  docling_core/transforms/id_generator/doc_hash_id_generator.py,sha256=SUw4FBhMZtbWCfc7oMucSwYvJTXqIPMn3yCXPRxtPCI,656
23
23
  docling_core/transforms/id_generator/uuid_generator.py,sha256=t8Bky_1JQB9myX-PJGWvW_c4-NvtHPHab6b1NdS-bpU,929
24
24
  docling_core/transforms/metadata_extractor/__init__.py,sha256=q_eAUcbaToEuYUPco4uiBO8vgTGSmZUC-r0mS7KbWh8,335
25
25
  docling_core/transforms/metadata_extractor/base.py,sha256=7h_S6-buCVtvAvKQKLISjDqFV8D3brewiQ-geqlUriI,1467
26
- docling_core/transforms/metadata_extractor/simple_metadata_extractor.py,sha256=SPRL6H_IvK18MiTLR9APsXh_2SRaC9SzfizDluzwFWY,1702
26
+ docling_core/transforms/metadata_extractor/simple_metadata_extractor.py,sha256=ZRjDdXgFe8jPBNC_0ruJjQanabpkxceVsCJVVWVWlIg,1629
27
27
  docling_core/types/__init__.py,sha256=6mrAEKRW85uHJwNQBufwjPcMWCjm3oocA6MaO4_NLgg,805
28
- docling_core/types/base.py,sha256=fNtfQ20NKa_RBNBWbq0DfO8o0zC1Cec8UAMu0Znsltk,8170
28
+ docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
29
29
  docling_core/types/doc/__init__.py,sha256=Pzj_8rft6SJTVTCHgXRwHtuZjL6LK_6dcBWjikL9biY,125
30
30
  docling_core/types/doc/base.py,sha256=ujko-oQKoXw6wjBn0Il2Khu3PyljHqYnUNh3mPDVJF8,14676
31
31
  docling_core/types/doc/doc_ann.py,sha256=8pV2efUglw19jxl4_oqB__mSxjWvtGIcllyCdqA-b2s,1196
@@ -35,7 +35,7 @@ docling_core/types/doc/document.py,sha256=AKp1kOo0tncf9FX3q7qRWQ2Jz_hZE44smZpyrt
35
35
  docling_core/types/doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
36
36
  docling_core/types/experimental/__init__.py,sha256=mpqa2soTcHHEKqkcSeYBbAHepg0OgVZNReKvPmGz2r4,587
37
37
  docling_core/types/experimental/base.py,sha256=k04zvzNI7qo4HfKxLPCePKxCnerzXd582gvrVjF25SI,4225
38
- docling_core/types/experimental/document.py,sha256=xy-Y_Gf3L4iXigEuQCrEFNsZfBKhAuDaOayq9ZyvmAU,37925
38
+ docling_core/types/experimental/document.py,sha256=X3z4sjRmWytRbEmSCnKat4D9sYxSV7Olm1YNmG3c5Kg,37874
39
39
  docling_core/types/experimental/labels.py,sha256=tpmvpmJuQyYMLhxAvJSVuFhDRh_zQNiP1WrQmNXKQzo,1224
40
40
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
41
41
  docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
@@ -56,8 +56,8 @@ docling_core/utils/ds_generate_jsonschema.py,sha256=EhNQutqWJFWuN-yl9UUPFZ7DJTvG
56
56
  docling_core/utils/file.py,sha256=VQgzjyvmJnAIHB6ex7ikcmbDAR4GA1ALreuO7Ubrp50,1895
57
57
  docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
58
58
  docling_core/utils/validators.py,sha256=fBdyWX4PvFh7o_d25ZTs4iwmeo75QTbrxsvXv2kXkTg,2777
59
- docling_core-1.7.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
60
- docling_core-1.7.0.dist-info/METADATA,sha256=rpzl-k7WEt4anqtZcE-FFjrfTBHIXXP_fhiHrgI7xz4,5383
61
- docling_core-1.7.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
62
- docling_core-1.7.0.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
63
- docling_core-1.7.0.dist-info/RECORD,,
59
+ docling_core-1.7.2.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
60
+ docling_core-1.7.2.dist-info/METADATA,sha256=OfpdHwn-55a4Z-61sx7SZ1yD7jMDaiZ_LEmEIoYKa9I,5383
61
+ docling_core-1.7.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
62
+ docling_core-1.7.2.dist-info/entry_points.txt,sha256=XHhtJEkdUuLxXSNxLdFIzx_siQ3z2UFQEKp-P8VYAE4,189
63
+ docling_core-1.7.2.dist-info/RECORD,,