docling-core 1.7.0__tar.gz → 1.7.1__tar.gz

This diff shows the contents of publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.

Potentially problematic release: this version of docling-core has been flagged as potentially problematic.
Files changed (63)
  1. {docling_core-1.7.0 → docling_core-1.7.1}/PKG-INFO +1 -1
  2. docling_core-1.7.1/docling_core/transforms/chunker/base.py +74 -0
  3. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/chunker/hierarchical_chunker.py +14 -14
  4. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +8 -10
  5. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/base.py +4 -1
  6. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/experimental/document.py +1 -3
  7. {docling_core-1.7.0 → docling_core-1.7.1}/pyproject.toml +1 -1
  8. docling_core-1.7.0/docling_core/transforms/chunker/base.py +0 -46
  9. {docling_core-1.7.0 → docling_core-1.7.1}/LICENSE +0 -0
  10. {docling_core-1.7.0 → docling_core-1.7.1}/README.md +0 -0
  11. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/__init__.py +0 -0
  12. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/py.typed +0 -0
  13. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
  14. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
  15. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  16. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
  17. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  18. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  19. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  20. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  21. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/search/__init__.py +0 -0
  22. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  23. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/search/mapping.py +0 -0
  24. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/search/meta.py +0 -0
  25. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/search/package.py +0 -0
  26. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/__init__.py +0 -0
  27. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/chunker/__init__.py +0 -0
  28. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/id_generator/__init__.py +0 -0
  29. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/id_generator/base.py +0 -0
  30. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -0
  31. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/id_generator/uuid_generator.py +0 -0
  32. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/metadata_extractor/__init__.py +0 -0
  33. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/transforms/metadata_extractor/base.py +0 -0
  34. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/__init__.py +0 -0
  35. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/doc/__init__.py +0 -0
  36. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/doc/base.py +0 -0
  37. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/doc/doc_ann.py +0 -0
  38. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/doc/doc_ocr.py +0 -0
  39. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/doc/doc_raw.py +0 -0
  40. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/doc/document.py +0 -0
  41. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/doc/tokens.py +0 -0
  42. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/experimental/__init__.py +0 -0
  43. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/experimental/base.py +0 -0
  44. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/experimental/labels.py +0 -0
  45. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/gen/__init__.py +0 -0
  46. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/gen/generic.py +0 -0
  47. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/nlp/__init__.py +0 -0
  48. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/nlp/qa.py +0 -0
  49. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/nlp/qa_labels.py +0 -0
  50. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/rec/__init__.py +0 -0
  51. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/rec/attribute.py +0 -0
  52. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/rec/base.py +0 -0
  53. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/rec/predicate.py +0 -0
  54. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/rec/record.py +0 -0
  55. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/rec/statement.py +0 -0
  56. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/types/rec/subject.py +0 -0
  57. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/utils/__init__.py +0 -0
  58. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/utils/alias.py +0 -0
  59. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/utils/ds_generate_docs.py +0 -0
  60. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/utils/ds_generate_jsonschema.py +0 -0
  61. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/utils/file.py +0 -0
  62. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/utils/validate.py +0 -0
  63. {docling_core-1.7.0 → docling_core-1.7.1}/docling_core/utils/validators.py +0 -0

--- docling_core-1.7.0/PKG-INFO
+++ docling_core-1.7.1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-core
-Version: 1.7.0
+Version: 1.7.1
 Summary: A python library to define and validate data types in Docling.
 Home-page: https://ds4sd.github.io/
 License: MIT

--- /dev/null
+++ docling_core-1.7.1/docling_core/transforms/chunker/base.py
@@ -0,0 +1,74 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Define base classes for chunking."""
+import re
+from abc import ABC, abstractmethod
+from typing import Final, Iterator, Optional
+
+from pydantic import BaseModel, Field, field_validator
+
+from docling_core.types import BoundingBox, Document
+from docling_core.types.base import _JSON_POINTER_REGEX
+
+# (subset of) JSONPath format, e.g. "$.main-text[84]" (for migration purposes)
+_DEPRECATED_JSON_PATH_PATTERN: Final = re.compile(r"^\$\.([\w-]+)\[(\d+)\]$")
+
+
+def _create_path(pos: int, path_prefix: str = "main-text") -> str:
+    return f"#/{path_prefix}/{pos}"
+
+
+class Chunk(BaseModel):
+    """Data model for Chunk."""
+
+    path: str = Field(pattern=_JSON_POINTER_REGEX)
+    text: str
+    heading: Optional[str] = None
+
+    @field_validator("path", mode="before")
+    @classmethod
+    def _json_pointer_from_json_path(cls, path: str):
+        if (match := _DEPRECATED_JSON_PATH_PATTERN.match(path)) is not None:
+            groups = match.groups()
+            if len(groups) == 2 and groups[0] is not None and groups[1] is not None:
+                return _create_path(
+                    pos=int(groups[1]),
+                    path_prefix=groups[0],
+                )
+        return path
+
+
+class ChunkWithMetadata(Chunk):
+    """Data model for Chunk including metadata."""
+
+    page: Optional[int] = None
+    bbox: Optional[BoundingBox] = None
+
+
+class BaseChunker(BaseModel, ABC):
+    """Base class for Chunker."""
+
+    @abstractmethod
+    def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
+        """Chunk the provided document.
+
+        Args:
+            dl_doc (Document): document to chunk
+
+        Raises:
+            NotImplementedError: in this abstract implementation
+
+        Yields:
+            Iterator[Chunk]: iterator over extracted chunks
+        """
+        raise NotImplementedError()
+
+    @classmethod
+    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
+        return _create_path(
+            pos=pos,
+            path_prefix=path_prefix,
+        )
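
For illustration, a minimal sketch (assuming docling-core 1.7.1 is installed; the sample path and text values are hypothetical) of how the new `path` validator migrates the deprecated JSONPath form into the JSON Pointer fragment required by the field's pattern:

    from docling_core.transforms.chunker import Chunk

    # A deprecated JSONPath value is normalized before pattern validation runs
    legacy = Chunk(path="$.main-text[84]", text="Some paragraph", heading="Intro")
    print(legacy.path)  # -> "#/main-text/84"

    # An already-valid JSON Pointer fragment passes through unchanged
    current = Chunk(path="#/main-text/84", text="Some paragraph")
    print(current.path)  # -> "#/main-text/84"
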

--- docling_core-1.7.0/docling_core/transforms/chunker/hierarchical_chunker.py
+++ docling_core-1.7.1/docling_core/transforms/chunker/hierarchical_chunker.py
@@ -12,7 +12,7 @@ from enum import Enum
 from typing import Any, Iterator, Optional, Union
 
 import pandas as pd
-from pydantic import BaseModel, PositiveInt
+from pydantic import BaseModel, Field, PositiveInt
 
 from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata
 from docling_core.types import BaseText
@@ -25,9 +25,17 @@ _logger = logging.getLogger(__name__)
 class HierarchicalChunker(BaseChunker):
     """Chunker implementation leveraging the document layout."""
 
-    include_metadata: bool = True
-    heading_as_metadata: bool = False
-    min_chunk_len: PositiveInt = 64
+    heading_as_metadata: bool = Field(
+        default=False,
+        description="Whether heading should be in metadata (instead of text)",
+    )
+    include_metadata: bool = Field(
+        default=True,
+        description="Whether to include extras in the metadata",
+    )
+    min_chunk_len: PositiveInt = Field(
+        default=64, description="Minimum chunk text length to consider (in chars)"
+    )
 
     class _NodeType(str, Enum):
         PARAGRAPH = "paragraph"
@@ -83,10 +91,6 @@ class HierarchicalChunker(BaseChunker):
 
         return output_text
 
-    @classmethod
-    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
-        return f"$.{path_prefix}[{pos}]"
-
     class _MainTextItemNode(BaseModel):
         parent: Optional[int] = None
         children: list[int] = []
@@ -304,14 +308,15 @@
                 return ChunkWithMetadata(
                     text=concat,
                     path=path,
+                    heading=heading,
                     page=item.prov[0].page if item.prov else None,
                     bbox=item.prov[0].bbox if item.prov else None,
-                    heading=heading,
                 )
             else:
                 return Chunk(
                     text=concat,
                     path=path,
+                    heading=heading,
                 )
         else:
             return None
@@ -327,11 +332,6 @@
         Yields:
             Iterator[Chunk]: iterator over extracted chunks
         """
-        if (not self.include_metadata) and self.heading_as_metadata:
-            raise RuntimeError(
-                "To enable `heading_as_metadata`, also `include_metadata` must be True."
-            )
-
        if dl_doc.main_text:
            # extract doc structure incl. metadata for
            # each item (e.g. parent, children)
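
For illustration, a brief usage sketch (the input file name is hypothetical, and the import uses the module path shown in this diff) of configuring the options that are now declared as pydantic fields with defaults and descriptions:

    from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
    from docling_core.types import Document

    # Hypothetical input: any Docling-format JSON document
    with open("doc.json") as f:
        doc = Document.model_validate_json(f.read())

    chunker = HierarchicalChunker(
        heading_as_metadata=True,  # keep the heading out of the chunk text
        include_metadata=True,     # attach page/bbox extras when available
        min_chunk_len=32,          # skip chunks shorter than 32 characters
    )
    for chunk in chunker.chunk(dl_doc=doc):
        print(chunk.path, chunk.heading, chunk.text[:40])

Note that the 1.7.0 guard that raised a RuntimeError when `heading_as_metadata` was set without `include_metadata` is removed in 1.7.1, as the last hunk above shows.
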

--- docling_core-1.7.0/docling_core/transforms/metadata_extractor/simple_metadata_extractor.py
+++ docling_core-1.7.1/docling_core/transforms/metadata_extractor/simple_metadata_extractor.py
@@ -6,20 +6,18 @@
 """Simple metadata extractor module."""
 
 
-from enum import Enum
-from typing import Any
+from typing import Any, Final
 
 from docling_core.transforms.metadata_extractor import BaseMetadataExtractor
 from docling_core.types import Document as DLDocument
 
+_DL_DOC_HASH: Final[str] = "dl_doc_hash"
+_ORIGIN: Final[str] = "origin"
+
 
 class SimpleMetadataExtractor(BaseMetadataExtractor):
     """Simple metadata extractor class."""
 
-    class _Keys(str, Enum):
-        DL_DOC_HASH = "dl_doc_hash"
-        ORIGIN = "origin"
-
     include_origin: bool = False
 
     def get_metadata(
@@ -35,10 +33,10 @@ class SimpleMetadataExtractor(BaseMetadataExtractor):
             dict[str, Any]: the extracted metadata
         """
         meta: dict[str, Any] = {
-            self._Keys.DL_DOC_HASH: doc.file_info.document_hash,
+            _DL_DOC_HASH: doc.file_info.document_hash,
         }
         if self.include_origin:
-            meta[self._Keys.ORIGIN] = origin
+            meta[_ORIGIN] = origin
         return meta
 
     def get_excluded_embed_metadata_keys(self) -> list[str]:
@@ -47,9 +45,9 @@ class SimpleMetadataExtractor(BaseMetadataExtractor):
         Returns:
             list[str]: the metadata to exclude
         """
-        excl_keys: list[str] = [self._Keys.DL_DOC_HASH]
+        excl_keys: list[str] = [_DL_DOC_HASH]
         if self.include_origin:
-            excl_keys.append(self._Keys.ORIGIN)
+            excl_keys.append(_ORIGIN)
         return excl_keys
 
     def get_excluded_llm_metadata_keys(self) -> list[str]:
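
For illustration, a small sketch (it assumes `include_origin` can be passed as a constructor keyword, pydantic-style, mirroring how the chunker options are configured) showing that the excluded-key lists are now built from plain string constants instead of the removed `_Keys` enum:

    from docling_core.transforms.metadata_extractor.simple_metadata_extractor import (
        SimpleMetadataExtractor,
    )

    # Assumption: include_origin is settable at construction time
    extractor = SimpleMetadataExtractor(include_origin=True)

    # Plain string keys, no _Keys enum members
    print(extractor.get_excluded_embed_metadata_keys())  # -> ['dl_doc_hash', 'origin']
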

--- docling_core-1.7.0/docling_core/types/base.py
+++ docling_core-1.7.1/docling_core/types/base.py
@@ -6,7 +6,7 @@
 """Define common models across types."""
 from datetime import datetime, timezone
 from enum import Enum
-from typing import Generic, Hashable, List, Literal, Optional, TypeVar
+from typing import Final, Generic, Hashable, List, Literal, Optional, TypeVar
 
 from pydantic import (
     AfterValidator,
@@ -28,6 +28,9 @@ from docling_core.search.package import VERSION_PATTERN
 from docling_core.utils.alias import AliasModel
 from docling_core.utils.validators import validate_datetime, validate_unique_list
 
+# (subset of) JSON Pointer URI fragment id format, e.g. "#/main-text/84":
+_JSON_POINTER_REGEX: Final[str] = r"^#(?:/([\w-]+)(?:/(\d+))?)?$"
+
 LanguageT = TypeVar("LanguageT", bound=str)
 IdentifierTypeT = TypeVar("IdentifierTypeT", bound=str)
 DescriptionAdvancedT = TypeVar("DescriptionAdvancedT", bound=BaseModel)
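
For illustration, a standalone check (the regex string is copied from the hunk above) of what the relocated `_JSON_POINTER_REGEX` accepts compared with the deprecated JSONPath form:

    import re

    # Copied from docling_core/types/base.py in 1.7.1
    _JSON_POINTER_REGEX = r"^#(?:/([\w-]+)(?:/(\d+))?)?$"

    for candidate in ("#", "#/main-text", "#/main-text/84", "$.main-text[84]"):
        print(candidate, bool(re.match(_JSON_POINTER_REGEX, candidate)))
    # The first three match; the JSONPath form does not, which is why the
    # Chunk validator rewrites it before validation.
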

--- docling_core-1.7.0/docling_core/types/experimental/document.py
+++ docling_core-1.7.1/docling_core/types/experimental/document.py
@@ -20,6 +20,7 @@ from tabulate import tabulate
 from typing_extensions import Annotated
 
 from docling_core.search.package import VERSION_PATTERN
+from docling_core.types.base import _JSON_POINTER_REGEX
 from docling_core.types.doc.tokens import DocumentToken
 from docling_core.types.experimental import BoundingBox, Size
 from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
@@ -28,9 +29,6 @@ Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
 LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
 CURRENT_VERSION: Final = "1.0.0"
 
-# (subset of) JSON Pointer URI fragment identifier format:
-_JSON_POINTER_REGEX = r"^#(/[\w\-]+(/\d+)?)?$"
-
 
 class BasePictureData(BaseModel):  # TBD
     """BasePictureData."""

--- docling_core-1.7.0/pyproject.toml
+++ docling_core-1.7.1/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling-core"
-version = "1.7.0"
+version = "1.7.1"
 description = "A python library to define and validate data types in Docling."
 license = "MIT"
 authors = [

--- docling_core-1.7.0/docling_core/transforms/chunker/base.py
+++ /dev/null
@@ -1,46 +0,0 @@
-#
-# Copyright IBM Corp. 2024 - 2024
-# SPDX-License-Identifier: MIT
-#
-
-"""Define base classes for chunking."""
-from abc import ABC, abstractmethod
-from typing import Iterator, Optional
-
-from pydantic import BaseModel
-
-from docling_core.types import BoundingBox, Document
-
-
-class Chunk(BaseModel):
-    """Data model for Chunk."""
-
-    path: str
-    text: str
-
-
-class ChunkWithMetadata(Chunk):
-    """Data model for Chunk including metadata."""
-
-    page: Optional[int] = None
-    bbox: Optional[BoundingBox] = None
-    heading: Optional[str] = None
-
-
-class BaseChunker(BaseModel, ABC):
-    """Base class for Chunker."""
-
-    @abstractmethod
-    def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
-        """Chunk the provided document.
-
-        Args:
-            dl_doc (Document): document to chunk
-
-        Raises:
-            NotImplementedError: in this abstract implementation
-
-        Yields:
-            Iterator[Chunk]: iterator over extracted chunks
-        """
-        raise NotImplementedError()