docling-core 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/__init__.py +6 -0
- docling_core/py.typed +0 -0
- docling_core/resources/schemas/doc/ANN.json +171 -0
- docling_core/resources/schemas/doc/DOC.json +300 -0
- docling_core/resources/schemas/doc/OCR-output.json +166 -0
- docling_core/resources/schemas/doc/RAW.json +158 -0
- docling_core/resources/schemas/generated/ccs_document_schema.json +1071 -0
- docling_core/resources/schemas/generated/minimal_document_schema_flat.json +1129 -0
- docling_core/resources/schemas/search/search_doc_mapping.json +104 -0
- docling_core/resources/schemas/search/search_doc_mapping_v2.json +256 -0
- docling_core/search/__init__.py +6 -0
- docling_core/search/json_schema_to_search_mapper.py +406 -0
- docling_core/search/mapping.py +29 -0
- docling_core/search/meta.py +93 -0
- docling_core/search/package.py +56 -0
- docling_core/types/__init__.py +25 -0
- docling_core/types/base.py +248 -0
- docling_core/types/doc/__init__.py +6 -0
- docling_core/types/doc/base.py +199 -0
- docling_core/types/doc/doc_ann.py +76 -0
- docling_core/types/doc/doc_ocr.py +83 -0
- docling_core/types/doc/doc_raw.py +187 -0
- docling_core/types/doc/document.py +393 -0
- docling_core/types/gen/__init__.py +6 -0
- docling_core/types/gen/generic.py +33 -0
- docling_core/types/nlp/__init__.py +6 -0
- docling_core/types/nlp/qa.py +74 -0
- docling_core/types/nlp/qa_labels.py +118 -0
- docling_core/types/rec/__init__.py +6 -0
- docling_core/types/rec/attribute.py +55 -0
- docling_core/types/rec/base.py +90 -0
- docling_core/types/rec/predicate.py +133 -0
- docling_core/types/rec/record.py +95 -0
- docling_core/types/rec/statement.py +41 -0
- docling_core/types/rec/subject.py +77 -0
- docling_core/utils/__init__.py +6 -0
- docling_core/utils/alias.py +27 -0
- docling_core/utils/ds_generate_docs.py +144 -0
- docling_core/utils/ds_generate_jsonschema.py +62 -0
- docling_core/utils/validate.py +86 -0
- docling_core/utils/validators.py +100 -0
- docling_core-0.0.1.dist-info/LICENSE +21 -0
- docling_core-0.0.1.dist-info/METADATA +133 -0
- docling_core-0.0.1.dist-info/RECORD +46 -0
- docling_core-0.0.1.dist-info/WHEEL +4 -0
- docling_core-0.0.1.dist-info/entry_points.txt +5 -0
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Models for CCS objects in raw format."""
|
|
7
|
+
from typing import Any, List, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, Field
|
|
10
|
+
from typing_extensions import Annotated
|
|
11
|
+
|
|
12
|
+
from docling_core.types.doc.base import BoundingBox
|
|
13
|
+
from docling_core.utils.alias import AliasModel
|
|
14
|
+
|
|
15
|
+
# Loosely-typed aliases: the first two are string-keyed mappings whose values are
# not validated further, the third accepts any value at all.
FontDifferences = dict[str, Any]
NamedWidths = dict[str, Any]
IgnoredCell = Any
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Box(BaseModel):
    """Box."""

    # Two required bounding boxes.
    # NOTE(review): presumably the same box in two coordinate systems - confirm.
    baseline: BoundingBox
    device: BoundingBox
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class Content(BaseModel):
    """Content."""

    # Single required string payload.
    rnormalized: str
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class Enumeration(BaseModel):
    """Enumeration."""

    # Two required integers; their meaning is not defined in this module.
    match: int
    type: int
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class Font(BaseModel):
    """Font."""

    # A list of 3 or 4 floats (presumably RGB or RGBA components - TODO confirm).
    color: Annotated[List[float], Field(max_length=4, min_length=3)]
    name: str
    size: float
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class Cell(AliasModel):
    """Cell."""

    # The upper-case "SEE_*" input keys are exposed under snake_case attributes.
    see_cell: bool = Field(alias="SEE_cell")
    see_confidence: float = Field(alias="SEE_confidence")
    angle: float
    # Nested models declared earlier in this module; all of them are required.
    box: Box
    content: Content
    enumeration: Enumeration
    font: Font
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class PageDimensions(BaseModel):
    """PageDimensions."""

    # A bounding box plus explicit height and width; all three are required.
    bbox: BoundingBox
    height: float
    width: float
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class Path(AliasModel):
    """Path."""

    bbox: BoundingBox
    # Hyphenated input keys cannot be attribute names, hence the aliases.
    sub_paths: list[float] = Field(alias="sub-paths")
    type: str
    x_values: list[float] = Field(alias="x-values")
    y_values: list[float] = Field(alias="y-values")
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class VerticalLine(BaseModel):
    """Vertical line."""

    # Integer coordinates: two y values and a single x value, all required.
    y0: int
    y1: int
    x: int
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class HorizontalLine(BaseModel):
    """Horizontal line."""

    # Integer coordinates: two x values and a single y value, all required.
    x0: int
    x1: int
    y: int
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class Image(BaseModel):
    """Image."""

    # A bounding box plus explicit height and width; all three are required.
    box: BoundingBox
    height: float
    width: float
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class FontRange(BaseModel):
    """Font range."""

    # A pair of required integers.
    # NOTE(review): presumably the bounds of a character-code range - confirm.
    first: int
    second: int
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class FontCmap(BaseModel):
    """Font cmap."""

    # String-to-string mapping; all four attributes are required.
    cmap: dict[str, str]
    name: str
    range: FontRange
    type: int
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class FontMetrics(AliasModel):
    """Font metrics."""

    # Every attribute is required. Keys that are capitalized or hyphenated in the
    # input are exposed under snake_case attribute names through aliases.
    stem_h: float = Field(alias="StemH")
    stem_v: float = Field(alias="StemV")
    ascent: float
    average_width: float = Field(alias="average-width")
    bbox: BoundingBox
    cap_height: float
    default_width: float = Field(alias="default-width")
    descent: float
    file: str
    italic_angle: float = Field(alias="italic-angle")
    max_width: float = Field(alias="max-width")
    missing_width: float = Field(alias="missing-width")
    name: str
    named_widths: NamedWidths = Field(alias="named-widths")
    weight: str
    widths: dict[str, float]
    x_height: float
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class FontInfo(AliasModel):
    """Font info."""

    # Hyphenated input keys are mapped to attribute names through aliases.
    font_cmap: FontCmap = Field(alias="font-cmap")
    font_differences: FontDifferences = Field(alias="font-differences")
    font_metrics: FontMetrics = Field(alias="font-metrics")
    name: str
    # The input key literally contains a space and parentheses.
    internal_name: str = Field(alias="name (internal)")
    subtype: str
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class Page(AliasModel):
    """Page."""

    height: float
    width: float
    dimensions: PageDimensions
    cells: list[Cell]
    paths: list[Path]
    # Both line collections have no default, so the key must be present, but the
    # Optional annotation lets its value be null.
    vertical_lines: Optional[list[VerticalLine]] = Field(alias="vertical-lines")
    horizontal_lines: Optional[list[HorizontalLine]] = Field(
        alias="horizontal-lines"
    )
    ignored_cells: list[IgnoredCell] = Field(alias="ignored-cells")
    images: list[Image]
    # String-keyed mapping of font descriptions.
    fonts: dict[str, FontInfo]
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
class Histograms(AliasModel):
    """Histogram."""

    # Three required string-keyed mappings, read from hyphenated input keys.
    mean_char_height: dict[str, float] = Field(alias="mean-char-height")
    mean_char_width: dict[str, float] = Field(alias="mean-char-width")
    number_of_chars: dict[str, int] = Field(alias="number-of-chars")
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
class PdfInfo(BaseModel):
    """PDF info."""

    # Histogram data and a list of style names; both are required.
    histograms: Histograms
    styles: list[str]
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
class RawPdf(BaseModel):
    """Raw PDF."""

    # Top-level container: document-wide info followed by the list of pages.
    info: PdfInfo
    pages: list[Page]
|
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Models for the Docling Document data type."""
|
|
7
|
+
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
from typing import Generic, Optional, Union
|
|
10
|
+
|
|
11
|
+
from pydantic import (
|
|
12
|
+
AnyHttpUrl,
|
|
13
|
+
BaseModel,
|
|
14
|
+
Field,
|
|
15
|
+
NonNegativeInt,
|
|
16
|
+
StrictStr,
|
|
17
|
+
model_validator,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
from docling_core.search.mapping import es_field
|
|
21
|
+
from docling_core.types.base import (
|
|
22
|
+
Acquisition,
|
|
23
|
+
CollectionDocumentInfo,
|
|
24
|
+
CollectionNameTypeT,
|
|
25
|
+
DescriptionAdvancedT,
|
|
26
|
+
DescriptionAnalyticsT,
|
|
27
|
+
FileInfoObject,
|
|
28
|
+
Identifier,
|
|
29
|
+
IdentifierTypeT,
|
|
30
|
+
LanguageT,
|
|
31
|
+
Log,
|
|
32
|
+
)
|
|
33
|
+
from docling_core.types.doc.base import (
|
|
34
|
+
BaseCell,
|
|
35
|
+
BaseText,
|
|
36
|
+
BitmapObject,
|
|
37
|
+
PageDimensions,
|
|
38
|
+
PageReference,
|
|
39
|
+
Ref,
|
|
40
|
+
S3Data,
|
|
41
|
+
Table,
|
|
42
|
+
)
|
|
43
|
+
from docling_core.utils.alias import AliasModel
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class CCSFileInfoDescription(BaseModel, extra="forbid"):
    """File info description."""

    # Every attribute is optional and defaults to None; unknown keys are rejected
    # because the model is declared with extra="forbid".
    author: Optional[list[StrictStr]] = None
    keywords: Optional[str] = None
    subject: Optional[str] = None
    title: Optional[StrictStr] = None
    creation_date: Optional[str] = None  # datetime
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class CCSFileInfoObject(FileInfoObject, extra="forbid"):
    """File info object."""

    # Adds optional attributes on top of FileInfoObject. Keys that are not valid
    # identifiers ("#-pages", "collection-name", "page-hashes") are aliased.
    num_pages: Optional[int] = Field(default=None, alias="#-pages")

    collection_name: Optional[str] = Field(
        default=None,
        alias="collection-name",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    description: Optional[CCSFileInfoDescription] = Field(
        default=None,
        json_schema_extra=es_field(suppress=True),
    )
    page_hashes: Optional[list[PageReference]] = Field(
        default=None,
        alias="page-hashes",
    )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class Affiliation(BaseModel, extra="forbid"):
    """Affiliation."""

    # Required name; the extra schema metadata declares two keyword sub-fields
    # ("lower" with a lowercase_asciifolding normalizer, and "keyword").
    name: str = Field(
        json_schema_extra=es_field(
            fields={
                "lower": {
                    "normalizer": "lowercase_asciifolding",
                    "type": "keyword",
                    "ignore_above": 8191,
                },
                "keyword": {"type": "keyword", "ignore_above": 8191},
            },
        ),
    )
    # Optional identifier and source, both annotated as keyword fields.
    id: Optional[str] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    source: Optional[str] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class Author(BaseModel, extra="forbid"):
    """Author."""

    # Required name, annotated as a text field with two keyword sub-fields
    # ("lower" with a lowercase_asciifolding normalizer, and "keyword").
    name: str = Field(
        json_schema_extra=es_field(
            type="text",
            fields={
                "lower": {
                    "normalizer": "lowercase_asciifolding",
                    "type": "keyword",
                    "ignore_above": 8191,
                },
                "keyword": {"type": "keyword", "ignore_above": 8191},
            },
        ),
    )
    # Optional identifier and source, both annotated as keyword fields.
    id: Optional[str] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    source: Optional[str] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # Optional list of affiliations attached to this author.
    affiliations: Optional[list[Affiliation]] = None
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class Publication(BaseModel, Generic[IdentifierTypeT], extra="forbid"):
    """Publication details of a journal or venue."""

    # `name` is the only required attribute; everything else defaults to None.
    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field(
        default=None,
        description="Unique identifiers of a publication venue.",
    )
    name: StrictStr = Field(
        description="Name of the publication.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    alternate_names: Optional[list[StrictStr]] = Field(
        default=None,
        title="Alternate Names",
        description="Other names or abbreviations of this publication.",
        json_schema_extra=es_field(type="text"),
    )
    type: Optional[list[StrictStr]] = Field(
        default=None,
        description="Type of publication (journal article, conference, review,...).",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    pages: Optional[StrictStr] = Field(
        default=None,
        description="Page range in the publication.",
        json_schema_extra=es_field(type="text"),
    )
    issue: Optional[StrictStr] = Field(
        default=None,
        description="Publication issue (issue number).",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    volume: Optional[StrictStr] = Field(
        default=None,
        description="Publication volume.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    url: Optional[AnyHttpUrl] = Field(
        default=None,
        description="URL on the publication site.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
class DescriptionLicense(BaseModel, extra="forbid"):
    """Licence in document description."""

    # Both attributes are optional strict strings; only `code` carries extra
    # schema metadata (keyword field).
    code: Optional[StrictStr] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    text: Optional[StrictStr] = None
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
class CCSDocumentDescription(
    AliasModel,
    Generic[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ],
):
    """Description in document."""

    # `logs` is the only attribute without a default; all others default to None.
    title: Optional[StrictStr] = None
    abstract: Optional[list[StrictStr]] = None
    authors: Optional[list[Author]] = None
    affiliations: Optional[list[Affiliation]] = None
    subjects: Optional[list[str]] = Field(
        default=None,
        json_schema_extra=es_field(
            fields={"keyword": {"ignore_above": 8191, "type": "keyword"}}
        ),
    )
    keywords: Optional[list[str]] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    publication_date: Optional[datetime] = None
    languages: Optional[list[LanguageT]] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # Trailing underscore on the attribute; the input key is plain "license".
    license_: Optional[DescriptionLicense] = Field(default=None, alias="license")
    publishers: Optional[list[StrictStr]] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    url_refs: Optional[list[str]] = Field(
        default=None,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    references: Optional[list[Identifier[IdentifierTypeT]]] = None
    # NOTE(review): Publication is generic over IdentifierTypeT but is used
    # unparametrized here, unlike Identifier above - confirm this is intended.
    publication: Optional[list[Publication]] = Field(
        default=None,
        description="List of publication journals or venues.",
    )
    reference_count: Optional[NonNegativeInt] = Field(
        default=None,
        title="Reference Count",
        description="Total number of documents referenced by this document.",
        json_schema_extra=es_field(type="integer"),
    )
    citation_count: Optional[NonNegativeInt] = Field(
        default=None,
        title="Citation Count",
        description=(
            "Total number of citations that this document has received (number "
            "of documents in whose bibliography this document appears)."
        ),
        json_schema_extra=es_field(type="integer"),
    )
    citation_date: Optional[datetime] = Field(
        default=None,
        title="Citation Count Date",
        description="Last update date of the citation count.",
    )
    # Payloads whose types are supplied through the generic parameters.
    advanced: Optional[DescriptionAdvancedT] = None
    analytics: Optional[DescriptionAnalyticsT] = None
    logs: list[Log]
    collection: Optional[CollectionDocumentInfo[CollectionNameTypeT]] = Field(
        default=None,
        description="The collection information of this document.",
    )
    acquisition: Optional[Acquisition] = Field(
        default=None,
        description=(
            "Information on how the document was obtained, for data governance"
            " purposes."
        ),
    )
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
class MinimalDocument(
    AliasModel,
    Generic[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ],
):
    """Minimal model for a document."""

    # Required name, read from the "_name" input key.
    name: StrictStr = Field(alias="_name")
    # Read from the "type" key; defaults to "document" when absent.
    obj_type: StrictStr = Field("document", alias="type")
    # The description is parametrized with the same type variables as the model.
    description: CCSDocumentDescription[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ]
    file_info: FileInfoObject = Field(alias="file-info")
    # Each main-text entry is either a reference or an inline text item.
    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
        default=None,
        alias="main-text",
    )
    figures: Optional[list[BaseCell]] = None
    tables: Optional[list[Table]] = None
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
class CCSDocument(
    MinimalDocument,
    Generic[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ],
):
    """Model for a CCS-generated document."""

    # Overrides the inherited default type with "pdf-document".
    obj_type: StrictStr = Field("pdf-document", alias="type")
    bitmaps: Optional[list[BitmapObject]] = None
    equations: Optional[list[BaseCell]] = None
    footnotes: Optional[list[BaseText]] = None
    # Narrows the inherited field to the CCS-specific file info model.
    file_info: CCSFileInfoObject = Field(alias="file-info")
    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
        default=None,
        alias="main-text",
    )
    page_dimensions: Optional[list[PageDimensions]] = Field(
        default=None, alias="page-dimensions"
    )
    page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
    page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
    s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")

    @model_validator(mode="before")
    @classmethod
    def from_dict(cls, data):
        """Validates and fixes the input data.

        Runs before field validation and normalizes dictionary input in place:

        - ``description.collection`` is created when missing or empty and gets
          a default ``type`` of ``"Document"``.
        - ``description.logs`` becomes an empty list when missing or empty.
        - ``description.abstract`` given as a single string is wrapped in a
          list; any other non-list value is dropped.
        - ``description.affiliations`` and ``description.authors`` given as a
          single dictionary are wrapped in a list; any other non-list value is
          dropped.
        - a ``__ref`` key in a ``main-text`` item is renamed to ``$ref``.

        Input that does not have the expected shape (``data`` or its
        ``description`` is not a dictionary) is left untouched, so that pydantic
        reports it as a regular validation error instead of this hook failing
        with ``KeyError`` or ``AttributeError``.

        Args:
            data: The raw input handed to the model, usually a dictionary.

        Returns:
            The input object, normalized in place when it is a dictionary.
        """
        if not isinstance(data, dict):
            return data

        description = data.get("description")
        if isinstance(description, dict):
            collection = description.get("collection")
            if not collection and not isinstance(collection, dict):
                # Assign rather than `setdefault`: the key may be present with
                # an explicit None, which `setdefault` would leave in place.
                collection = description["collection"] = {}
            if isinstance(collection, dict):
                collection.setdefault("type", "Document")

            if not description.get("logs"):
                # Same reasoning: an explicit None must be replaced as well.
                description["logs"] = []

            abstract = description.get("abstract")
            if abstract is not None and not isinstance(abstract, list):
                if isinstance(abstract, str):
                    description["abstract"] = [abstract]
                else:
                    description.pop("abstract")

            for key in ["affiliations", "authors"]:
                descr = description.get(key)
                if descr is not None and not isinstance(descr, list):
                    if isinstance(descr, dict):
                        description[key] = [descr]
                    else:
                        description.pop(key)

        main_text = data.get("main-text")
        if isinstance(main_text, (list, tuple)):
            for item in main_text:
                # Only raw dictionaries can carry the "__ref" key; other items
                # are passed on unchanged.
                if isinstance(item, dict) and (ref := item.pop("__ref", None)):
                    item["$ref"] = ref

        return data
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
class ExportedCCSDocument(
    MinimalDocument,
    Generic[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ],
):
    """Document model for Docling."""

    # Overrides the inherited default type with "pdf-document" and annotates the
    # field as a keyword in the extra schema metadata.
    obj_type: StrictStr = Field(
        "pdf-document",
        alias="type",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    bitmaps: Optional[list[BitmapObject]] = None
    equations: Optional[list[BaseCell]] = None
    footnotes: Optional[list[BaseText]] = None
    description: CCSDocumentDescription[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ]
    # Narrows the inherited field to the CCS-specific file info model.
    file_info: CCSFileInfoObject = Field(alias="file-info")
    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
        default=None, alias="main-text"
    )
    page_dimensions: Optional[list[PageDimensions]] = Field(
        default=None, alias="page-dimensions"
    )
    page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
    page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
    s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = None

    @model_validator(mode="before")
    @classmethod
    def from_dict(cls, data):
        """Fix ref in main-text.

        Runs before field validation. For dictionary input, every dictionary
        item under ``main-text`` that carries a ``__ref`` key has it renamed to
        ``$ref`` in place. Input that is not a dictionary, and ``main-text``
        items that are not dictionaries, are left untouched so that pydantic
        validates them as usual instead of this hook raising ``AttributeError``.

        Args:
            data: The raw input handed to the model, usually a dictionary.

        Returns:
            The input object, normalized in place when it is a dictionary.
        """
        if not isinstance(data, dict):
            return data

        main_text = data.get("main-text")
        if isinstance(main_text, (list, tuple)):
            for item in main_text:
                if isinstance(item, dict) and (ref := item.pop("__ref", None)):
                    item["$ref"] = ref

        return data
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define a generic Docling type."""
|
|
7
|
+
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
from pydantic import Field, StrictStr
|
|
11
|
+
|
|
12
|
+
from docling_core.search.mapping import es_field
|
|
13
|
+
from docling_core.types.base import FileInfoObject
|
|
14
|
+
from docling_core.utils.alias import AliasModel
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# NOTE(review): this class name coincides with typing.Generic; a module that
# needs both has to import one of them under an alias.
class Generic(AliasModel):
    """A representation of a generic document."""

    # Optional name, read from the "_name" input key.
    name: Optional[StrictStr] = Field(
        default=None,
        alias="_name",
        description="A short description or summary of the document.",
        json_schema_extra=es_field(type="text"),
    )

    # Required file information, read from the "file-info" input key.
    file_info: FileInfoObject = Field(
        alias="file-info",
        title="Document information",
        description=(
            "Minimal identification information of the document within a collection."
        ),
    )
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define the model for Q&A pairs."""
|
|
7
|
+
from typing import Generic, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, Field, StrictBool, StrictStr
|
|
10
|
+
|
|
11
|
+
from docling_core.search.mapping import es_field
|
|
12
|
+
from docling_core.types.base import DescriptionAdvancedT, StrictDateTime, UniqueList
|
|
13
|
+
from docling_core.types.nlp.qa_labels import QALabelling
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class QAPair(BaseModel, Generic[DescriptionAdvancedT]):
    """A representation of a question-answering (QA) pair."""

    # Required: context, question, answer, created and paths.
    # Everything else has a default (None or False).
    context: StrictStr = Field(
        description=(
            "A single string containing the context of the question enabling the"
            " presentation of the answer."
        )
    )
    question: StrictStr = Field(description="A question on the given context.")
    answer: StrictStr = Field(
        description="The answer to the question from the context."
    )
    short_answer: Optional[StrictStr] = Field(
        default=None,
        description="Alternative and concise answer.",
    )
    # The three provenance flags default to False but also accept None.
    retrieved_context: Optional[StrictBool] = Field(
        default=False,
        description="Whether the context was retrieved from the question.",
    )
    generated_question: Optional[StrictBool] = Field(
        default=False,
        description="Whether the question was generated by an AI model.",
    )
    generated_answer: Optional[StrictBool] = Field(
        default=False,
        description="Whether the answer was generated by an AI model.",
    )
    # NOTE(review): the description below has a stray space before the period; it
    # is kept verbatim because the text is emitted in the generated schema.
    created: StrictDateTime = Field(
        description="Datetime when the QA pair was created ."
    )
    user: Optional[StrictStr] = Field(
        default=None,
        description=(
            "Unique identifier of the user that created or curated this QA pair."
        ),
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    model: Optional[StrictStr] = Field(
        default=None,
        description="Unique identifier of the model used to generate this QA pair.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    paths: UniqueList[StrictStr] = Field(
        description=(
            "One or more references to a document that identify the provenance of the"
            " QA pair context."
        ),
        examples=[
            "badce7c84d0ba7ba0fb5e94492b0d91e2506a7cb48e4524ad572c546a35f768e#/"
            "main-text/4"
        ],
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    advanced: Optional[DescriptionAdvancedT] = Field(
        default=None,
        description="Document metadata to provide more details on the context.",
    )
    labels: Optional[QALabelling] = Field(
        default=None,
        description="QApair labelling axes.",
    )
|