docling-core 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (46)
  1. docling_core/__init__.py +6 -0
  2. docling_core/py.typed +0 -0
  3. docling_core/resources/schemas/doc/ANN.json +171 -0
  4. docling_core/resources/schemas/doc/DOC.json +300 -0
  5. docling_core/resources/schemas/doc/OCR-output.json +166 -0
  6. docling_core/resources/schemas/doc/RAW.json +158 -0
  7. docling_core/resources/schemas/generated/ccs_document_schema.json +1071 -0
  8. docling_core/resources/schemas/generated/minimal_document_schema_flat.json +1129 -0
  9. docling_core/resources/schemas/search/search_doc_mapping.json +104 -0
  10. docling_core/resources/schemas/search/search_doc_mapping_v2.json +256 -0
  11. docling_core/search/__init__.py +6 -0
  12. docling_core/search/json_schema_to_search_mapper.py +406 -0
  13. docling_core/search/mapping.py +29 -0
  14. docling_core/search/meta.py +93 -0
  15. docling_core/search/package.py +56 -0
  16. docling_core/types/__init__.py +25 -0
  17. docling_core/types/base.py +248 -0
  18. docling_core/types/doc/__init__.py +6 -0
  19. docling_core/types/doc/base.py +199 -0
  20. docling_core/types/doc/doc_ann.py +76 -0
  21. docling_core/types/doc/doc_ocr.py +83 -0
  22. docling_core/types/doc/doc_raw.py +187 -0
  23. docling_core/types/doc/document.py +393 -0
  24. docling_core/types/gen/__init__.py +6 -0
  25. docling_core/types/gen/generic.py +33 -0
  26. docling_core/types/nlp/__init__.py +6 -0
  27. docling_core/types/nlp/qa.py +74 -0
  28. docling_core/types/nlp/qa_labels.py +118 -0
  29. docling_core/types/rec/__init__.py +6 -0
  30. docling_core/types/rec/attribute.py +55 -0
  31. docling_core/types/rec/base.py +90 -0
  32. docling_core/types/rec/predicate.py +133 -0
  33. docling_core/types/rec/record.py +95 -0
  34. docling_core/types/rec/statement.py +41 -0
  35. docling_core/types/rec/subject.py +77 -0
  36. docling_core/utils/__init__.py +6 -0
  37. docling_core/utils/alias.py +27 -0
  38. docling_core/utils/ds_generate_docs.py +144 -0
  39. docling_core/utils/ds_generate_jsonschema.py +62 -0
  40. docling_core/utils/validate.py +86 -0
  41. docling_core/utils/validators.py +100 -0
  42. docling_core-0.0.1.dist-info/LICENSE +21 -0
  43. docling_core-0.0.1.dist-info/METADATA +133 -0
  44. docling_core-0.0.1.dist-info/RECORD +46 -0
  45. docling_core-0.0.1.dist-info/WHEEL +4 -0
  46. docling_core-0.0.1.dist-info/entry_points.txt +5 -0
@@ -0,0 +1,248 @@ docling_core/types/base.py
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define common models across types."""
7
+ from datetime import datetime, timezone
8
+ from enum import Enum
9
+ from typing import Generic, Hashable, List, Literal, Optional, TypeVar
10
+
11
+ from pydantic import (
12
+ AfterValidator,
13
+ AnyUrl,
14
+ BaseModel,
15
+ Field,
16
+ PlainSerializer,
17
+ StrictStr,
18
+ StringConstraints,
19
+ ValidationInfo,
20
+ WrapValidator,
21
+ field_validator,
22
+ )
23
+ from pydantic.types import NonNegativeInt
24
+ from typing_extensions import Annotated
25
+
26
+ from docling_core.search.mapping import es_field
27
+ from docling_core.search.package import VERSION_PATTERN
28
+ from docling_core.utils.alias import AliasModel
29
+ from docling_core.utils.validators import validate_datetime, validate_unique_list
30
+
31
# Generic type parameters used by the models of this package. All of them are
# bound to ``str`` except the two description parameters, which are bound to
# pydantic models.
LanguageT = TypeVar("LanguageT", bound=str)
IdentifierTypeT = TypeVar("IdentifierTypeT", bound=str)
DescriptionAdvancedT = TypeVar("DescriptionAdvancedT", bound=BaseModel)
DescriptionAnalyticsT = TypeVar("DescriptionAnalyticsT", bound=BaseModel)
SubjectTypeT = TypeVar("SubjectTypeT", bound=str)
SubjectNameTypeT = TypeVar("SubjectNameTypeT", bound=str)
PredicateValueTypeT = TypeVar("PredicateValueTypeT", bound=str)
PredicateKeyNameT = TypeVar("PredicateKeyNameT", bound=str)
PredicateKeyTypeT = TypeVar("PredicateKeyTypeT", bound=str)
ProvenanceTypeT = TypeVar("ProvenanceTypeT", bound=str)
CollectionNameTypeT = TypeVar("CollectionNameTypeT", bound=str)

# A list of exactly two floats, tagged with a ``geo_point`` search-field type.
# The order of the two values is not defined here.
Coordinates = Annotated[
    list[float],
    Field(min_length=2, max_length=2, json_schema_extra=es_field(type="geo_point")),
]

# Element type for ``UniqueList``; must be hashable.
T = TypeVar("T", bound=Hashable)

# A list that is passed through ``validate_unique_list`` after regular
# validation and is advertised with ``uniqueItems`` in the JSON schema.
UniqueList = Annotated[
    List[T],
    AfterValidator(validate_unique_list),
    Field(json_schema_extra={"uniqueItems": True}),
]

# A datetime validated through ``validate_datetime`` (wrap mode) and serialized
# as an ISO 8601 string converted to UTC.
# NOTE(review): ``astimezone`` interprets a naive datetime as local time;
# presumably ``validate_datetime`` guarantees an aware value -- confirm.
StrictDateTime = Annotated[
    datetime,
    WrapValidator(validate_datetime),
    PlainSerializer(
        lambda moment: moment.astimezone(tz=timezone.utc).isoformat(),
        return_type=str,
    ),
]

# Closed set of accepted acquisition methods.
ACQUISITION_TYPE = Literal[
    "API",
    "FTP",
    "Download",
    "Link",
    "Web scraping/Crawling",
    "Other",
]
65
+
66
+
67
class Identifier(AliasModel, Generic[IdentifierTypeT], extra="forbid"):
    """Unique identifier of a Docling data object."""

    # Unknown keys are rejected (extra="forbid"). All three fields are required.
    type_: IdentifierTypeT = Field(
        alias="type",
        description=(
            "A string representing a collection or database that contains this "
            "data object."
        ),
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    value: StrictStr = Field(
        description=(
            "The identifier value of the data object within a collection or database."
        ),
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # ``name`` is declared last so that ``type_`` and ``value`` have already
    # been validated when the validator below runs.
    name: str = Field(
        alias="_name",
        title="_Name",
        description=(
            "A unique identifier of the data object across Docling, consisting of "
            "the concatenation of type and value in lower case, separated by hash "
            "(#)."
        ),
        pattern=r"^.+#.+$",
        strict=True,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )

    @field_validator("name")
    @classmethod
    def name_from_type_value(cls, v: str, info: ValidationInfo) -> str:
        """Check that ``_name`` equals ``<type>#<value>`` in lower case.

        Args:
            v: The candidate ``_name`` value.
            info: Validation context; ``info.data`` holds the fields validated
                before ``name``.

        Returns:
            The unchanged value ``v``.

        Raises:
            ValueError: If both ``type_`` and ``value`` are available and ``v``
                differs from their lower-cased concatenation joined by ``#``.
        """
        validated = info.data
        # If either field is absent from the validated data there is nothing
        # to compare against, so the value is accepted as is.
        if "type_" not in validated or "value" not in validated:
            return v

        expected = f"{validated['type_'].lower()}#{validated['value'].lower()}"
        if v != expected:
            raise ValueError(
                "the _name field must be the concatenation of type and value in lower "
                "case, separated by hash (#)"
            )
        return v
111
+
112
+
113
class Log(AliasModel, extra="forbid"):
    """Log entry to describe an ETL task on a document."""

    # Unknown keys are rejected (extra="forbid"). ``task`` and ``comment`` are
    # optional; ``agent``, ``type`` and ``date`` must be supplied.
    task: Optional[StrictStr] = Field(
        default=None,
        description=(
            "An identifier of this task. It may be used to identify this task "
            "from other tasks of the same agent and type."
        ),
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    agent: StrictStr = Field(
        description="The Docling agent that performed the task, e.g., CCS or CXS.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # Exposed under the key "type"; the attribute carries a trailing underscore.
    type_: StrictStr = Field(
        alias="type",
        description="A task category.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    comment: Optional[StrictStr] = Field(
        default=None,
        description="A description of the task or any comments in natural language.",
    )
    date: StrictDateTime = Field(
        description=(
            "A string representation of the task execution datetime in "
            "ISO 8601 format."
        )
    )
142
+
143
+
144
class FileInfoObject(AliasModel):
    """Filing information for any data object to be stored in a Docling database."""

    # ``filename`` and ``document-hash`` are required; ``filename-prov`` is
    # optional. The hyphenated names are aliases of the Python attributes.
    filename: StrictStr = Field(
        description="The name of a persistent object that created this data object",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    fileprov: Optional[StrictStr] = Field(
        default=None,
        description=(
            "The provenance of this data object, e.g. an archive file, a URL, "
            "or any other repository."
        ),
        alias="filename-prov",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    document_hash: StrictStr = Field(
        description=(
            "A unique identifier of this data object within a collection of "
            "a Docling database"
        ),
        alias="document-hash",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
168
+
169
+
170
class CollectionTypeEnum(str, Enum):
    """Enumeration of valid Docling collection types."""

    # Members are ``str`` instances; the values are the capitalized names.
    generic = "Generic"
    document = "Document"
    record = "Record"


# Type parameter restricted to members of ``CollectionTypeEnum``.
CollectionTypeT = TypeVar("CollectionTypeT", bound=CollectionTypeEnum)
179
+
180
+
181
class CollectionInfo(
    BaseModel, Generic[CollectionNameTypeT, CollectionTypeT], extra="forbid"
):
    """Information of a collection."""

    # Generic over the collection name type and the collection type. Only
    # ``type`` is required; unknown keys are rejected (extra="forbid").
    name: Optional[CollectionNameTypeT] = Field(
        default=None,
        description="Name of the collection.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    type: CollectionTypeT = Field(
        ...,
        description="The collection type.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # When given, the version must be a ``str`` (strict) matching
    # ``VERSION_PATTERN`` from ``docling_core.search.package``.
    version: Optional[
        Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)]
    ] = Field(
        default=None,
        description="The version of this collection model.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    alias: Optional[list[StrictStr]] = Field(
        default=None,
        description="A list of tags (aliases) for the collection.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
208
+
209
+
210
class CollectionDocumentInfo(
    CollectionInfo[CollectionNameTypeT, Literal[CollectionTypeEnum.document]],
    Generic[CollectionNameTypeT],
    extra="forbid",
):
    """Information of a collection of type Document."""

    # Adds no fields: it only pins the collection type parameter to
    # ``CollectionTypeEnum.document`` and stays generic over the name type.
216
+
217
+
218
class CollectionRecordInfo(
    CollectionInfo[CollectionNameTypeT, Literal[CollectionTypeEnum.record]],
    Generic[CollectionNameTypeT],
    extra="forbid",
):
    """Information of a collection of type Record."""

    # Adds no fields: it only pins the collection type parameter to
    # ``CollectionTypeEnum.record`` and stays generic over the name type.
224
+
225
+
226
class Acquisition(BaseModel, extra="forbid"):
    """Information on how the data was obtained."""

    # Only ``type`` is required and it must be one of the ``ACQUISITION_TYPE``
    # literals. Unknown keys are rejected (extra="forbid").
    type: ACQUISITION_TYPE = Field(
        description="The method to obtain the data.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    date: Optional[StrictDateTime] = Field(
        default=None,
        description=(
            "A string representation of the acquisition datetime in "
            "ISO 8601 format."
        ),
    )
    link: Optional[AnyUrl] = Field(
        default=None,
        description="Link to the data source of this document.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # Negative sizes are rejected by ``NonNegativeInt``.
    size: Optional[NonNegativeInt] = Field(
        default=None,
        description="Size in bytes of the raw document from the data source.",
        json_schema_extra=es_field(type="long"),
    )
@@ -0,0 +1,6 @@ docling_core/types/doc/__init__.py
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Package for models defined by the Document type."""
@@ -0,0 +1,199 @@ docling_core/types/doc/base.py
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define common models across CCS objects."""
7
+ from typing import Annotated, Literal, Optional, Union
8
+
9
+ from pydantic import BaseModel, Field, StrictStr
10
+
11
+ from docling_core.search.mapping import es_field
12
+ from docling_core.utils.alias import AliasModel
13
+
14
# One cell row: four floats followed by two strings.
# Presumably the positions correspond to the ``CellHeader`` labels
# (x0, y0, x1, y1, font, text) -- TODO confirm.
CellData = tuple[float, float, float, float, str, str]

# Fixed six-label header tuple; each position accepts exactly one literal.
CellHeader = tuple[
    Literal["x0"],
    Literal["y0"],
    Literal["x1"],
    Literal["y1"],
    Literal["font"],
    Literal["text"],
]

# A list of exactly four floats; the coordinate order is not defined here.
BoundingBox = Annotated[list[float], Field(min_length=4, max_length=4)]

# A list of exactly two integers.
Span = Annotated[list[int], Field(min_length=2, max_length=2)]
28
+
29
+
30
class CellsContainer(BaseModel):
    """Cell container."""

    # Optional list of cell rows; absent by default.
    data: Optional[list[CellData]] = None
    # Defaults to the only value the ``CellHeader`` type admits.
    header: CellHeader = ("x0", "y0", "x1", "y1", "font", "text")
35
+
36
+
37
class S3Resource(BaseModel):
    """Resource in a cloud object storage."""

    # ``mime`` and ``path`` are required strings; ``page`` is optional.
    mime: str
    path: str
    page: Optional[int] = None
43
+
44
+
45
class S3Data(AliasModel):
    """Data object in a cloud object storage."""

    # Every entry is optional and defaults to None. The hyphenated names are
    # aliases of the Python attributes.

    # Entries holding a list of resources.
    pdf_document: Optional[list[S3Resource]] = Field(
        default=None, alias="pdf-document"
    )
    pdf_pages: Optional[list[S3Resource]] = Field(default=None, alias="pdf-pages")
    pdf_images: Optional[list[S3Resource]] = Field(default=None, alias="pdf-images")

    # Entries holding a single resource.
    json_document: Optional[S3Resource] = Field(default=None, alias="json-document")
    json_meta: Optional[S3Resource] = Field(default=None, alias="json-meta")
    glm_json_document: Optional[S3Resource] = Field(
        default=None, alias="glm-json-document"
    )

    # The only entry without an alias.
    figures: Optional[list[S3Resource]] = None
57
+
58
+
59
class S3Reference(AliasModel):
    """References an s3 resource."""

    # Required string exposed under the key "__ref_s3_data"; the documented
    # example has the form of a pointer into an "_s3_data" section.
    ref_s3_data: StrictStr = Field(
        alias="__ref_s3_data",
        examples=["#/_s3_data/figures/0"],
    )
65
+
66
+
67
class Prov(AliasModel):
    """Provenance."""

    # Required: a four-float bounding box, a page number and a two-int span.
    bbox: BoundingBox
    page: int
    span: Span
    # Optional string exposed under the key "__ref_s3_data".
    # ``es_field(suppress=True)`` presumably keeps it out of the search
    # mapping -- confirm in docling_core.search.mapping.
    ref_s3_data: Optional[StrictStr] = Field(
        default=None,
        alias="__ref_s3_data",
        json_schema_extra=es_field(suppress=True),
    )
76
+
77
+
78
class BoundingBoxContainer(BaseModel):
    """Bounding box container."""

    # Two required bounding boxes, each a list of exactly four floats.
    min: BoundingBox
    max: BoundingBox
83
+
84
+
85
class BitmapObject(AliasModel):
    """Bitmap object."""

    # All three fields are required.
    # Exposed under the key "type".
    obj_type: str = Field(alias="type")
    # NOTE(review): unlike ``Table`` and ``BaseCell`` in this module, this field
    # declares no "bounding-box" alias -- confirm that this is intended.
    bounding_box: BoundingBoxContainer = Field(
        json_schema_extra=es_field(suppress=True)
    )
    prov: Prov
93
+
94
+
95
class PageDimensions(BaseModel):
    """Page dimensions."""

    # All fields are required; the unit of height and width is not stated here.
    height: float
    page: int
    width: float
101
+
102
+
103
class TableCell(AliasModel):
    """Table cell."""

    # Optional geometry of the cell.
    bbox: Optional[BoundingBox] = None
    spans: Optional[list[Span]] = None
    # Required cell text and cell type; the latter is exposed as "type".
    text: str = Field(json_schema_extra=es_field(term_vector="with_positions_offsets"))
    obj_type: str = Field(alias="type")
110
+
111
+
112
class GlmTableCell(TableCell):
    """Glm Table cell."""

    # Extends ``TableCell`` with optional column/row details. Every added field
    # has a default and is marked ``es_field(suppress=True)``.

    # Column details.
    col: Optional[int] = Field(default=None, json_schema_extra=es_field(suppress=True))
    col_header: bool = Field(
        default=False,
        alias="col-header",
        json_schema_extra=es_field(suppress=True),
    )
    col_span: Optional[Span] = Field(
        default=None,
        alias="col-span",
        json_schema_extra=es_field(suppress=True),
    )

    # Row details, mirroring the column fields.
    row: Optional[int] = Field(default=None, json_schema_extra=es_field(suppress=True))
    row_header: bool = Field(
        default=False,
        alias="row-header",
        json_schema_extra=es_field(suppress=True),
    )
    row_span: Optional[Span] = Field(
        default=None,
        alias="row-span",
        json_schema_extra=es_field(suppress=True),
    )
129
+
130
+
131
class Table(AliasModel):
    """Table."""

    # Required column and row counts, exposed as "#-cols" and "#-rows".
    num_cols: int = Field(alias="#-cols")
    num_rows: int = Field(alias="#-rows")
    bounding_box: Optional[BoundingBoxContainer] = Field(
        default=None,
        alias="bounding-box",
        json_schema_extra=es_field(suppress=True),
    )
    # A list of lists of cells; each cell is a ``GlmTableCell`` or a plain
    # ``TableCell``.
    data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
    model: Optional[str] = None
    prov: Optional[list[Prov]] = None
    text: Optional[str] = Field(
        default=None,
        json_schema_extra=es_field(term_vector="with_positions_offsets"),
    )
    # Required object type, exposed under the key "type".
    obj_type: str = Field(
        alias="type",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
149
+
150
+
151
class BaseCell(AliasModel):
    """Base cell."""

    # Everything except the object type is optional.
    bounding_box: Optional[BoundingBoxContainer] = Field(
        default=None,
        alias="bounding-box",
        json_schema_extra=es_field(suppress=True),
    )
    prov: Optional[list[Prov]] = None
    text: Optional[str] = None
    # Required object type, exposed under the key "type".
    obj_type: str = Field(
        alias="type",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
162
+
163
+
164
class BaseText(AliasModel):
    """Base model for text objects."""

    # Required text content and object type; both are strict strings and the
    # latter is exposed under the key "type".
    text: StrictStr = Field(
        json_schema_extra=es_field(term_vector="with_positions_offsets")
    )
    obj_type: StrictStr = Field(
        alias="type",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # Optional attributes.
    name: Optional[StrictStr] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    font: Optional[str] = None
    prov: Optional[list[Prov]] = None
178
+
179
+
180
class ListItem(BaseText):
    """List item."""

    # Adds one required string to the ``BaseText`` fields.
    # Presumably the marker or label of the list entry -- TODO confirm.
    identifier: str
184
+
185
+
186
class Ref(AliasModel):
    """Reference."""

    # Three required strings; ``obj_type`` and ``ref`` are exposed under the
    # keys "type" and "$ref".
    name: str
    obj_type: str = Field(alias="type")
    ref: str = Field(alias="$ref")
192
+
193
+
194
class PageReference(BaseModel):
    """Page reference."""

    # All fields are required; each carries its own search-field hint.
    hash: str = Field(json_schema_extra=es_field(type="keyword", ignore_above=8191))
    model: str = Field(json_schema_extra=es_field(suppress=True))
    page: int = Field(json_schema_extra=es_field(type="short"))
@@ -0,0 +1,76 @@ docling_core/types/doc/doc_ann.py
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Models for annotations and predictions in CCS."""
7
+ from typing import Any
8
+
9
+ from pydantic import BaseModel
10
+
11
+ from docling_core.types.doc.base import BoundingBox
12
+
13
# Placeholder alias: annotation reports are not modelled yet, so any value is
# accepted.
AnnotationReport = Any  # TODO
14
+
15
+
16
class Cell(BaseModel):
    """Cell."""

    # All fields are required.
    id: int
    # Presumably the id of the corresponding raw cell -- TODO confirm.
    rawcell_id: int
    label: str
22
+
23
+
24
class Cluster(BaseModel):
    """Cluster."""

    # All fields are required.
    model: str
    type: str
    bbox: BoundingBox
    # Presumably the ids of the cells grouped in this cluster -- TODO confirm.
    cell_ids: list[int]
    merged: bool
    id: int
33
+
34
+
35
class Table(BaseModel):
    """Table."""

    # All fields are required. ``rows`` and ``cols`` are lists of integers
    # whose meaning (indices or spans) is not defined here.
    cell_id: int
    label: str
    rows: list[int]
    cols: list[int]
42
+
43
+
44
class Info(BaseModel, protected_namespaces=()):
    """Info."""

    # Four of the field names below start with ``model_``, a prefix that
    # pydantic v2 reserves for its own attributes; releases that protect the
    # whole prefix warn about each such field when the class is created.
    # Clearing ``protected_namespaces`` for this model avoids those warnings.
    # Field names, validation and the generated schema are unchanged.

    # All fields are required strings.
    display_name: str
    model_name: str
    model_class: str
    model_version: str
    model_id: str
52
+
53
+
54
class Source(BaseModel):
    """Source."""

    # All fields are required.
    type: str
    # Stored as a float; presumably seconds since the epoch -- TODO confirm.
    timestamp: float
    info: Info
60
+
61
+
62
class AnnotPredItem(BaseModel):
    """Annotation or prediction item."""

    # All fields are required: three lists of items plus the source they
    # come from.
    cells: list[Cell]
    clusters: list[Cluster]
    tables: list[Table]
    source: Source
69
+
70
+
71
class Annotation(BaseModel):
    """Annotations."""

    # All three lists are required. Annotations and predictions share the
    # ``AnnotPredItem`` shape; report entries are not constrained, because
    # ``AnnotationReport`` is an alias of ``Any``.
    annotations: list[AnnotPredItem]
    predictions: list[AnnotPredItem]
    reports: list[AnnotationReport]
@@ -0,0 +1,83 @@ docling_core/types/doc/doc_ocr.py
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Models for CCS objects with OCR."""
7
+ from typing import Any, Dict, List, Literal
8
+
9
+ from pydantic import BaseModel, Field
10
+
11
+ from docling_core.types.doc.base import BoundingBox
12
+ from docling_core.utils.alias import AliasModel
13
+
14
# Accepted labels for a single coordinate.
CoordsOrder = Literal["x1", "y1", "x2", "y2"]

# Only one origin is accepted for now.
CoordsOrigin = Literal["top-left"]  # TODO

# Free-form mapping with string keys; not modelled yet.
Info = Dict[str, Any]  # TODO
19
+
20
+
21
class Page(BaseModel):
    """Page."""

    # Both values are required; their unit is not stated here.
    width: float
    height: float
26
+
27
+
28
class Meta(AliasModel):
    """Meta."""

    # All fields are required. The two coordinate settings are exposed under
    # the hyphenated keys "coords-order" and "coords-origin".
    page: Page
    coords_order: List[CoordsOrder] = Field(..., alias="coords-order")
    coords_origin: CoordsOrigin = Field(..., alias="coords-origin")
34
+
35
+
36
class Dimension(BaseModel):
    """Dimension."""

    # Same two required fields as ``Page``; the unit is not stated here.
    width: float
    height: float
41
+
42
+
43
class Word(BaseModel):
    """Word."""

    # All fields are required. The range of ``confidence`` is not defined here.
    confidence: float
    bbox: BoundingBox
    content: str
49
+
50
+
51
class Cell(BaseModel):
    """Cell."""

    # All fields are required. The range of ``confidence`` is not defined here.
    confidence: float
    bbox: BoundingBox
    content: str
57
+
58
+
59
class Box(BaseModel):
    """Box."""

    # All fields are required. The range of ``confidence`` is not defined here.
    confidence: float
    bbox: BoundingBox
    content: str
65
+
66
+
67
class Path(BaseModel):
    """Path."""

    # Two required lists of floats.
    # Presumably parallel x/y coordinates of the path points -- TODO confirm;
    # equal lengths are not enforced here.
    x: List[float]
    y: List[float]
72
+
73
+
74
class OcrOutput(AliasModel):
    """OCR output."""

    # All fields are required.
    # Exposed under the key "_meta".
    meta: Meta = Field(..., alias="_meta")
    # Free-form dictionary (``Info`` is ``Dict[str, Any]`` in this module).
    info: Info
    dimension: Dimension
    # Recognized items, one list per item kind.
    words: List[Word]
    cells: List[Cell]
    boxes: List[Box]
    paths: List[Path]