docling-core 0.2.0__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (45)
  1. {docling_core-0.2.0 → docling_core-1.1.0}/PKG-INFO +2 -1
  2. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/search/package.py +2 -1
  3. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/base.py +10 -6
  4. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/doc/document.py +111 -2
  5. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/rec/attribute.py +3 -6
  6. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/rec/predicate.py +8 -10
  7. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/rec/record.py +0 -2
  8. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/rec/subject.py +5 -1
  9. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/utils/ds_generate_docs.py +4 -4
  10. {docling_core-0.2.0 → docling_core-1.1.0}/pyproject.toml +17 -2
  11. {docling_core-0.2.0 → docling_core-1.1.0}/LICENSE +0 -0
  12. {docling_core-0.2.0 → docling_core-1.1.0}/README.md +0 -0
  13. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/__init__.py +0 -0
  14. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/py.typed +0 -0
  15. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  16. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  17. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  18. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  19. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  20. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  21. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  22. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  23. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/search/__init__.py +0 -0
  24. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  25. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/search/mapping.py +0 -0
  26. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/search/meta.py +0 -0
  27. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/__init__.py +0 -0
  28. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/doc/__init__.py +0 -0
  29. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/doc/base.py +0 -0
  30. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/doc/doc_ann.py +0 -0
  31. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/doc/doc_ocr.py +0 -0
  32. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/doc/doc_raw.py +0 -0
  33. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/gen/__init__.py +0 -0
  34. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/gen/generic.py +0 -0
  35. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/nlp/__init__.py +0 -0
  36. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/nlp/qa.py +0 -0
  37. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/nlp/qa_labels.py +0 -0
  38. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/rec/__init__.py +0 -0
  39. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/rec/base.py +0 -0
  40. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/types/rec/statement.py +0 -0
  41. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/utils/__init__.py +0 -0
  42. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/utils/alias.py +0 -0
  43. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/utils/ds_generate_jsonschema.py +0 -0
  44. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/utils/validate.py +0 -0
  45. {docling_core-0.2.0 → docling_core-1.1.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 0.2.0
3
+ Version: 1.1.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -31,6 +31,7 @@ Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
31
31
  Requires-Dist: poetry (>=1.8.3,<2.0.0)
32
32
  Requires-Dist: pydantic (>=2.6.0,<3.0.0)
33
33
  Requires-Dist: pyproject-toml (>=0.0.10,<0.0.11)
34
+ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
34
35
  Project-URL: Repository, https://github.com/DS4SD/docling-core
35
36
  Description-Content-Type: text/markdown
36
37
 
@@ -5,6 +5,7 @@
5
5
 
6
6
  """Models and methods to define a package model."""
7
7
 
8
+ import importlib.metadata
8
9
  import re
9
10
  from typing import Final
10
11
 
@@ -27,7 +28,7 @@ class Package(BaseModel, extra="forbid"):
27
28
 
28
29
  name: StrictStr
29
30
  version: Annotated[str, StringConstraints(strict=True, pattern=VERSION_PATTERN)] = (
30
- "0.1.0"
31
+ importlib.metadata.version("docling-core")
31
32
  )
32
33
 
33
34
  def __hash__(self):
@@ -39,6 +39,10 @@ PredicateKeyNameT = TypeVar("PredicateKeyNameT", bound=str)
39
39
  PredicateKeyTypeT = TypeVar("PredicateKeyTypeT", bound=str)
40
40
  ProvenanceTypeT = TypeVar("ProvenanceTypeT", bound=str)
41
41
  CollectionNameTypeT = TypeVar("CollectionNameTypeT", bound=str)
42
+ Coordinates = Annotated[
43
+ list[float],
44
+ Field(min_length=2, max_length=2, json_schema_extra=es_field(type="geo_point")),
45
+ ]
42
46
  T = TypeVar("T", bound=Hashable)
43
47
 
44
48
  UniqueList = Annotated[
@@ -61,7 +65,7 @@ ACQUISITION_TYPE = Literal[
61
65
 
62
66
 
63
67
  class Identifier(AliasModel, Generic[IdentifierTypeT], extra="forbid"):
64
- """Unique identifier of a Deep Search data object."""
68
+ """Unique identifier of a Docling data object."""
65
69
 
66
70
  type_: IdentifierTypeT = Field(
67
71
  alias="type",
@@ -81,7 +85,7 @@ class Identifier(AliasModel, Generic[IdentifierTypeT], extra="forbid"):
81
85
  alias="_name",
82
86
  title="_Name",
83
87
  description=(
84
- "A unique identifier of the data object across Deep Search, consisting of "
88
+ "A unique identifier of the data object across Docling, consisting of "
85
89
  "the concatenation of type and value in lower case, separated by hash "
86
90
  "(#)."
87
91
  ),
@@ -118,7 +122,7 @@ class Log(AliasModel, extra="forbid"):
118
122
  json_schema_extra=es_field(type="keyword", ignore_above=8191),
119
123
  )
120
124
  agent: StrictStr = Field(
121
- description="The Deep Search agent that performed the task, e.g., CCS or CXS.",
125
+ description="The Docling agent that performed the task, e.g., CCS or CXS.",
122
126
  json_schema_extra=es_field(type="keyword", ignore_above=8191),
123
127
  )
124
128
  type_: StrictStr = Field(
@@ -138,7 +142,7 @@ class Log(AliasModel, extra="forbid"):
138
142
 
139
143
 
140
144
  class FileInfoObject(AliasModel):
141
- """Filing information for any data object to be stored in a Deep Search database."""
145
+ """Filing information for any data object to be stored in a Docling database."""
142
146
 
143
147
  filename: StrictStr = Field(
144
148
  description="The name of a persistent object that created this data object",
@@ -156,7 +160,7 @@ class FileInfoObject(AliasModel):
156
160
  document_hash: StrictStr = Field(
157
161
  description=(
158
162
  "A unique identifier of this data object within a collection of a "
159
- "Deep Search database"
163
+ "Docling database"
160
164
  ),
161
165
  alias="document-hash",
162
166
  json_schema_extra=es_field(type="keyword", ignore_above=8191),
@@ -164,7 +168,7 @@ class FileInfoObject(AliasModel):
164
168
 
165
169
 
166
170
  class CollectionTypeEnum(str, Enum):
167
- """Enumeration of valid Deep Search collection types."""
171
+ """Enumeration of valid Docling collection types."""
168
172
 
169
173
  generic = "Generic"
170
174
  document = "Document"
@@ -3,7 +3,7 @@
3
3
  # SPDX-License-Identifier: MIT
4
4
  #
5
5
 
6
- """Models for the Deep Search Document data type."""
6
+ """Models for the Docling Document data type."""
7
7
 
8
8
  from datetime import datetime
9
9
  from typing import Generic, Optional, Union
@@ -16,6 +16,7 @@ from pydantic import (
16
16
  StrictStr,
17
17
  model_validator,
18
18
  )
19
+ from tabulate import tabulate
19
20
 
20
21
  from docling_core.search.mapping import es_field
21
22
  from docling_core.types.base import (
@@ -352,7 +353,7 @@ class ExportedCCSDocument(
352
353
  CollectionNameTypeT,
353
354
  ],
354
355
  ):
355
- """Document model for Deep Search."""
356
+ """Document model for Docling."""
356
357
 
357
358
  obj_type: StrictStr = Field(
358
359
  "pdf-document",
@@ -391,3 +392,111 @@ class ExportedCCSDocument(
391
392
  item["$ref"] = ref
392
393
 
393
394
  return data
395
+
396
+ def _resolve_ref(self, item: Ref) -> Optional[Table]:
397
+ """Return the resolved reference in case of table reference, otherwise None."""
398
+ result: Optional[Table] = None
399
+
400
+ # NOTE: currently only resolves table refs & makes assumptions on ref parts
401
+ if item.obj_type == "table" and self.tables:
402
+ parts = item.ref.split("/")
403
+ result = self.tables[int(parts[2])]
404
+
405
+ return result
406
+
407
+ def export_to_markdown(
408
+ self,
409
+ delim: str = "\n\n",
410
+ main_text_start: int = 0,
411
+ main_text_stop: Optional[int] = None,
412
+ ) -> str:
413
+ r"""Serialize to Markdown.
414
+
415
+ Operates on a slice of the document's main_text as defined through arguments
416
+ main_text_start and main_text_stop; defaulting to the whole main_text.
417
+
418
+ Args:
419
+ delim (str, optional): Delimiter to use when concatenating the various
420
+ Markdown parts. Defaults to "\n\n".
421
+ main_text_start (int, optional): Main-text slicing start index (inclusive).
422
+ Defaults to 0.
423
+ main_text_end (Optional[int], optional): Main-text slicing stop index
424
+ (exclusive). Defaults to None.
425
+
426
+ Returns:
427
+ str: The exported Markdown representation.
428
+ """
429
+ has_title = False
430
+ prev_text = ""
431
+ md_texts: list[str] = []
432
+
433
+ if self.main_text is not None:
434
+ for orig_item in self.main_text[main_text_start:main_text_stop]:
435
+ markdown_text = ""
436
+
437
+ item = (
438
+ self._resolve_ref(orig_item)
439
+ if isinstance(orig_item, Ref)
440
+ else orig_item
441
+ )
442
+ if item is None:
443
+ continue
444
+
445
+ item_type = item.obj_type
446
+ if isinstance(item, BaseText) and item_type in {
447
+ "title",
448
+ "subtitle-level-1",
449
+ "paragraph",
450
+ "caption",
451
+ }:
452
+ text = item.text
453
+
454
+ # ignore repeated text
455
+ if prev_text == text:
456
+ continue
457
+ else:
458
+ prev_text = text
459
+
460
+ # first title match
461
+ if item_type == "title" and not has_title:
462
+ markdown_text = f"# {text}"
463
+ has_title = True
464
+
465
+ # secondary titles
466
+ elif item_type in {"title", "subtitle-level-1"} or (
467
+ has_title and item_type == "title"
468
+ ):
469
+ markdown_text = f"## {text}"
470
+
471
+ # normal text
472
+ else:
473
+ markdown_text = text
474
+
475
+ elif isinstance(item, Table) and item.data:
476
+ table = []
477
+ for row in item.data:
478
+ tmp = []
479
+ for col in row:
480
+ tmp.append(col.text)
481
+ table.append(tmp)
482
+
483
+ if len(table) > 1 and len(table[0]) > 0:
484
+ try:
485
+ md_table = tabulate(
486
+ table[1:], headers=table[0], tablefmt="github"
487
+ )
488
+ except ValueError:
489
+ md_table = tabulate(
490
+ table[1:],
491
+ headers=table[0],
492
+ tablefmt="github",
493
+ disable_numparse=True,
494
+ )
495
+
496
+ markdown_text = md_table
497
+
498
+ if markdown_text:
499
+ md_texts.append(markdown_text)
500
+
501
+ result = delim.join(md_texts)
502
+ return result
@@ -6,7 +6,7 @@
6
6
  """Define the model Attribute."""
7
7
  from typing import Generic, Optional
8
8
 
9
- from pydantic import BaseModel, Field
9
+ from pydantic import Field
10
10
  from typing_extensions import Annotated
11
11
 
12
12
  from docling_core.search.mapping import es_field
@@ -16,23 +16,20 @@ from docling_core.types.base import (
16
16
  PredicateKeyTypeT,
17
17
  PredicateValueTypeT,
18
18
  ProvenanceTypeT,
19
- SubjectNameTypeT,
20
- SubjectTypeT,
21
19
  )
22
20
  from docling_core.types.rec.base import ProvenanceItem
23
21
  from docling_core.types.rec.predicate import Predicate
22
+ from docling_core.utils.alias import AliasModel
24
23
 
25
24
 
26
25
  class Attribute(
27
- BaseModel,
26
+ AliasModel,
28
27
  Generic[
29
28
  IdentifierTypeT,
30
29
  PredicateValueTypeT,
31
30
  PredicateKeyNameT,
32
31
  PredicateKeyTypeT,
33
32
  ProvenanceTypeT,
34
- SubjectTypeT,
35
- SubjectNameTypeT,
36
33
  ],
37
34
  extra="forbid",
38
35
  ):
@@ -5,7 +5,7 @@
5
5
 
6
6
  """Define the model Predicate."""
7
7
  from datetime import datetime
8
- from typing import Annotated, Generic, Optional, TypeVar
8
+ from typing import Annotated, Generic, Optional
9
9
 
10
10
  from pydantic import (
11
11
  BaseModel,
@@ -17,16 +17,14 @@ from pydantic import (
17
17
  )
18
18
 
19
19
  from docling_core.search.mapping import es_field
20
+ from docling_core.types.base import (
21
+ Coordinates,
22
+ PredicateKeyNameT,
23
+ PredicateKeyTypeT,
24
+ PredicateValueTypeT,
25
+ )
20
26
  from docling_core.utils.alias import AliasModel
21
27
 
22
- PredicateValueTypeT = TypeVar("PredicateValueTypeT", bound=str)
23
- PredicateKeyNameT = TypeVar("PredicateKeyNameT", bound=str)
24
- PredicateKeyTypeT = TypeVar("PredicateKeyTypeT", bound=str)
25
- Coordinates = Annotated[
26
- list[float],
27
- Field(min_length=2, max_length=2, json_schema_extra=es_field(type="geo_point")),
28
- ]
29
-
30
28
 
31
29
  class NumericalValue(BaseModel, extra="forbid"):
32
30
  """Model for numerical values."""
@@ -117,7 +115,7 @@ class PredicateValue(AliasModel, Generic[PredicateValueTypeT], extra="forbid"):
117
115
 
118
116
 
119
117
  class Predicate(
120
- BaseModel,
118
+ AliasModel,
121
119
  Generic[PredicateValueTypeT, PredicateKeyNameT, PredicateKeyTypeT],
122
120
  extra="forbid",
123
121
  ):
@@ -80,8 +80,6 @@ class Record(
80
80
  PredicateKeyNameT,
81
81
  PredicateKeyTypeT,
82
82
  ProvenanceTypeT,
83
- SubjectTypeT,
84
- SubjectNameTypeT,
85
83
  ]
86
84
  ]
87
85
  ] = None
@@ -19,6 +19,10 @@ from docling_core.types.doc.base import S3Reference
19
19
  from docling_core.utils.alias import AliasModel
20
20
 
21
21
 
22
+ class SubjectNameIdentifier(Identifier[SubjectNameTypeT], Generic[SubjectNameTypeT]):
23
+ """Identifier of subject names.""" ""
24
+
25
+
22
26
  class Subject(
23
27
  AliasModel,
24
28
  Generic[IdentifierTypeT, SubjectTypeT, SubjectNameTypeT],
@@ -53,7 +57,7 @@ class Subject(
53
57
  ),
54
58
  json_schema_extra=es_field(type="keyword", ignore_above=8191),
55
59
  )
56
- names: list[Identifier[SubjectNameTypeT]] = Field(
60
+ names: list[SubjectNameIdentifier[SubjectNameTypeT]] = Field(
57
61
  description=(
58
62
  "List of given names for this subject. They may not be unique across "
59
63
  "different subjects."
@@ -44,7 +44,7 @@ def _prepare_directory(folder: str, clean: bool = False) -> None:
44
44
 
45
45
 
46
46
  def generate_collection_jsonschema(folder: str):
47
- """Generate the JSON schema of Deep Search collections and export them to a folder.
47
+ """Generate the JSON schema of Docling collections and export them to a folder.
48
48
 
49
49
  Args:
50
50
  folder: The name of the directory.
@@ -58,7 +58,7 @@ def generate_collection_jsonschema(folder: str):
58
58
 
59
59
 
60
60
  def generate_collection_html(folder: str):
61
- """Generate HTML pages documenting the data model of Deep Search collections.
61
+ """Generate HTML pages documenting the data model of Docling collections.
62
62
 
63
63
  The JSON schemas files need to be in a folder and the generated HTML pages will be
64
64
  written in the same folder.
@@ -79,7 +79,7 @@ def generate_collection_html(folder: str):
79
79
 
80
80
 
81
81
  def generate_collection_markdown(folder: str):
82
- """Generate Markdown pages documenting the data model of Deep Search collections.
82
+ """Generate Markdown pages documenting the data model of Docling collections.
83
83
 
84
84
  The JSON schemas files need to be in a folder and the generated markdown pages will
85
85
  be written in the same folder.
@@ -101,7 +101,7 @@ def generate_collection_markdown(folder: str):
101
101
 
102
102
 
103
103
  def main() -> None:
104
- """Generate the JSON Schema of Deep Search collections and export documentation."""
104
+ """Generate the JSON Schema of Docling collections and export documentation."""
105
105
  argparser = argparse.ArgumentParser()
106
106
  argparser.add_argument(
107
107
  "directory",
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "0.2.0"
3
+ version = "1.1.0"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
@@ -53,6 +53,7 @@ jsonref = "^1.1.0"
53
53
  json-schema-for-humans = "^1.0.0"
54
54
  poetry = "^1.8.3"
55
55
  pyproject-toml = "^0.0.10"
56
+ tabulate = "^0.9.0"
56
57
 
57
58
  [tool.poetry.group.dev.dependencies]
58
59
  black = "^24.4.2"
@@ -67,6 +68,7 @@ flake8-docstrings = "^1.6.0"
67
68
  pep8-naming = "^0.13.2"
68
69
  jsondiff = "^2.0.0"
69
70
  types-setuptools = "^70.3.0"
71
+ python-semantic-release = "^7.32.2"
70
72
 
71
73
  [tool.setuptools.packages.find]
72
74
  where = ["docling_core/resources/schemas"]
@@ -110,5 +112,18 @@ python_version = "3.9"
110
112
  plugins = ["pydantic.mypy"]
111
113
 
112
114
  [[tool.mypy.overrides]]
113
- module = ["jsonref.*", "jsonschema.*", "json_schema_for_humans.*"]
115
+ module = ["jsondiff.*", "jsonref.*", "jsonschema.*", "json_schema_for_humans.*", "tabulate.*"]
114
116
  ignore_missing_imports = true
117
+
118
+ [tool.semantic_release]
119
+ # for default values check:
120
+ # https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg
121
+
122
+ version_source = "tag_only"
123
+ branch = "main"
124
+
125
+ # configure types which should trigger minor and patch version bumps respectively
126
+ # (note that they must be a subset of the configured allowed types):
127
+ parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test"
128
+ parser_angular_minor_types = "feat"
129
+ parser_angular_patch_types = "fix,perf"
File without changes
File without changes