docling-core 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (46) hide show
  1. docling_core/__init__.py +6 -0
  2. docling_core/py.typed +0 -0
  3. docling_core/resources/schemas/doc/ANN.json +171 -0
  4. docling_core/resources/schemas/doc/DOC.json +300 -0
  5. docling_core/resources/schemas/doc/OCR-output.json +166 -0
  6. docling_core/resources/schemas/doc/RAW.json +158 -0
  7. docling_core/resources/schemas/generated/ccs_document_schema.json +1071 -0
  8. docling_core/resources/schemas/generated/minimal_document_schema_flat.json +1129 -0
  9. docling_core/resources/schemas/search/search_doc_mapping.json +104 -0
  10. docling_core/resources/schemas/search/search_doc_mapping_v2.json +256 -0
  11. docling_core/search/__init__.py +6 -0
  12. docling_core/search/json_schema_to_search_mapper.py +406 -0
  13. docling_core/search/mapping.py +29 -0
  14. docling_core/search/meta.py +93 -0
  15. docling_core/search/package.py +56 -0
  16. docling_core/types/__init__.py +25 -0
  17. docling_core/types/base.py +248 -0
  18. docling_core/types/doc/__init__.py +6 -0
  19. docling_core/types/doc/base.py +199 -0
  20. docling_core/types/doc/doc_ann.py +76 -0
  21. docling_core/types/doc/doc_ocr.py +83 -0
  22. docling_core/types/doc/doc_raw.py +187 -0
  23. docling_core/types/doc/document.py +393 -0
  24. docling_core/types/gen/__init__.py +6 -0
  25. docling_core/types/gen/generic.py +33 -0
  26. docling_core/types/nlp/__init__.py +6 -0
  27. docling_core/types/nlp/qa.py +74 -0
  28. docling_core/types/nlp/qa_labels.py +118 -0
  29. docling_core/types/rec/__init__.py +6 -0
  30. docling_core/types/rec/attribute.py +55 -0
  31. docling_core/types/rec/base.py +90 -0
  32. docling_core/types/rec/predicate.py +133 -0
  33. docling_core/types/rec/record.py +95 -0
  34. docling_core/types/rec/statement.py +41 -0
  35. docling_core/types/rec/subject.py +77 -0
  36. docling_core/utils/__init__.py +6 -0
  37. docling_core/utils/alias.py +27 -0
  38. docling_core/utils/ds_generate_docs.py +144 -0
  39. docling_core/utils/ds_generate_jsonschema.py +62 -0
  40. docling_core/utils/validate.py +86 -0
  41. docling_core/utils/validators.py +100 -0
  42. docling_core-0.0.1.dist-info/LICENSE +21 -0
  43. docling_core-0.0.1.dist-info/METADATA +133 -0
  44. docling_core-0.0.1.dist-info/RECORD +46 -0
  45. docling_core-0.0.1.dist-info/WHEEL +4 -0
  46. docling_core-0.0.1.dist-info/entry_points.txt +5 -0
@@ -0,0 +1,187 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Models for CCS objects in raw format."""
7
+ from typing import Any, List, Optional
8
+
9
+ from pydantic import BaseModel, Field
10
+ from typing_extensions import Annotated
11
+
12
+ from docling_core.types.doc.base import BoundingBox
13
+ from docling_core.utils.alias import AliasModel
14
+
15
# Loosely typed aliases used as field types by the models below:
# FontDifferences and NamedWidths are string-keyed dicts with arbitrary
# values, and IgnoredCell accepts any value at all.
+ FontDifferences = dict[str, Any]
16
+ NamedWidths = dict[str, Any]
17
+ IgnoredCell = Any
18
+
19
+
20
# Pydantic model with two required BoundingBox fields, `baseline` and `device`.
# NOTE(review): what distinguishes the two boxes is not visible in this
# module -- confirm against the producer of the raw format.
+ class Box(BaseModel):
21
+ """Box."""
22
+
23
+ baseline: BoundingBox
24
+ device: BoundingBox
25
+
26
+
27
# Pydantic model with a single required string field, `rnormalized`.
+ class Content(BaseModel):
27
+ """Content."""
28
+
29
+
30
+ rnormalized: str
31
+
32
+
33
# Pydantic model with two required integer fields, `match` and `type`.
# NOTE(review): the meaning of the integer codes is not defined here.
+ class Enumeration(BaseModel):
34
+ """Enumeration."""
35
+
36
+ match: int
37
+ type: int
38
+
39
+
40
# Pydantic model describing the font attached to a Cell: colour, name, size.
+ class Font(BaseModel):
41
+ """Font."""
42
+
43
# `color` must contain 3 or 4 floats (enforced by min_length/max_length);
# presumably RGB or RGBA components -- TODO confirm.
+ color: Annotated[List[float], Field(min_length=3, max_length=4)]
44
+ name: str
45
+ size: float
46
+
47
+
48
# Model for one cell of a raw page; every field is required.
# It derives from AliasModel (docling_core.utils.alias), whose alias handling
# is not visible in this module.
+ class Cell(AliasModel):
49
+ """Cell."""
50
+
51
# The two SEE_* fields are read from the upper-case keys given as aliases.
+ see_cell: bool = Field(..., alias="SEE_cell")
52
+ see_confidence: float = Field(..., alias="SEE_confidence")
53
+ angle: float
54
# The remaining fields nest the Box, Content, Enumeration and Font models
# defined earlier in this module.
+ box: Box
55
+ content: Content
56
+ enumeration: Enumeration
57
+ font: Font
58
+
59
+
60
# Raw-format page dimensions: a bounding box plus height and width.
# NOTE(review): docling_core.types.doc.base also exports a class named
# PageDimensions (imported by the document models); this one is a separate,
# local definition -- take care not to mix them up.
+ class PageDimensions(BaseModel):
61
+ """PageDimensions."""
62
+
63
+ bbox: BoundingBox
64
+ height: float
65
+ width: float
66
+
67
+
68
# Model for a path on a raw page: a bounding box, a type string and three
# lists of floats read from the hyphenated keys given as aliases.
+ class Path(AliasModel):
69
+ """Path."""
70
+
71
+ bbox: BoundingBox
72
# NOTE(review): `sub-paths` is typed as floats like the coordinate lists;
# whether the values are coordinates or indices is not visible here.
+ sub_paths: list[float] = Field(..., alias="sub-paths")
73
+ type: str
74
+ x_values: list[float] = Field(..., alias="x-values")
75
+ y_values: list[float] = Field(..., alias="y-values")
76
+
77
+
78
# Vertical line given by one `x` value and a `y0`..`y1` pair, all integers.
+ class VerticalLine(BaseModel):
79
+ """Vertical line."""
80
+
81
+ y0: int
82
+ y1: int
83
+ x: int
84
+
85
+
86
# Horizontal line given by one `y` value and an `x0`..`x1` pair, all integers.
+ class HorizontalLine(BaseModel):
87
+ """Horizontal line."""
88
+
89
+ x0: int
90
+ x1: int
91
+ y: int
92
+
93
+
94
# Image entry of a raw page: a bounding box (`box`) plus height and width.
# Unlike PageDimensions and Path, the bounding-box field is named `box`,
# not `bbox`.
+ class Image(BaseModel):
95
+ """Image."""
96
+
97
+ box: BoundingBox
98
+ height: float
99
+ width: float
100
+
101
+
102
# Pair of integers (`first`, `second`) used as the `range` of a FontCmap.
+ class FontRange(BaseModel):
103
+ """Font range."""
104
+
105
+ first: int
106
+ second: int
107
+
108
+
109
# Character-map information of a font: a string-to-string mapping, the font
# name, a FontRange and an integer type code (meaning not defined here).
+ class FontCmap(BaseModel):
110
+ """Font cmap."""
111
+
112
+ cmap: dict[str, str]
113
+ name: str
114
+ range: FontRange
115
+ type: int
116
+
117
+
118
# Metrics of one font; every field is required.
# Fields with an explicit alias are read from that key ("StemH", "StemV" and
# the hyphenated names). The others use the attribute name unless AliasModel
# (docling_core.utils.alias, not visible here) generates one.
+ class FontMetrics(AliasModel):
119
+ """Font metrics."""
120
+
121
+ stem_h: float = Field(..., alias="StemH")
122
+ stem_v: float = Field(..., alias="StemV")
123
+ ascent: float
124
+ average_width: float = Field(..., alias="average-width")
125
+ bbox: BoundingBox
126
# NOTE(review): `cap_height` and `x_height` carry no explicit alias while the
# other multi-word fields do -- confirm the expected input keys.
+ cap_height: float
127
+ default_width: float = Field(..., alias="default-width")
128
+ descent: float
129
+ file: str
130
+ italic_angle: float = Field(..., alias="italic-angle")
131
+ max_width: float = Field(..., alias="max-width")
132
+ missing_width: float = Field(..., alias="missing-width")
133
+ name: str
134
# NamedWidths is the module-level alias dict[str, Any].
+ named_widths: NamedWidths = Field(..., alias="named-widths")
135
+ weight: str
136
+ widths: dict[str, float]
137
+ x_height: float
138
+
139
+
140
# Complete description of one font: cmap, differences, metrics and names.
# Instances are the values of Page.fonts.
+ class FontInfo(AliasModel):
141
+ """Font info."""
142
+
143
+ font_cmap: FontCmap = Field(..., alias="font-cmap")
144
# FontDifferences is the module-level alias dict[str, Any].
+ font_differences: FontDifferences = Field(..., alias="font-differences")
145
+ font_metrics: FontMetrics = Field(..., alias="font-metrics")
146
+ name: str
147
# The input key for this field contains a space and parentheses.
+ internal_name: str = Field(..., alias="name (internal)")
148
+ subtype: str
149
+
150
+
151
# One page of a raw PDF: its size, cells, paths, lines, images and fonts.
# No field has a default, so every key must be present in the input.
+ class Page(AliasModel):
152
+ """Page."""
153
+
154
+ height: float
155
+ width: float
156
+ dimensions: PageDimensions
157
+ cells: list[Cell]
158
+ paths: list[Path]
159
# Optional[...] combined with `...`: the key is required but may be null.
+ vertical_lines: Optional[list[VerticalLine]] = Field(..., alias="vertical-lines")
160
+ horizontal_lines: Optional[list[HorizontalLine]] = Field(
161
+ ..., alias="horizontal-lines"
162
+ )
163
# IgnoredCell is `Any`, so the list items are not validated further.
+ ignored_cells: list[IgnoredCell] = Field(..., alias="ignored-cells")
164
+ images: list[Image]
165
# Mapping from a string key to the FontInfo of that font.
+ fonts: dict[str, FontInfo]
166
+
167
+
168
# Three string-keyed histograms read from hyphenated keys: two with float
# values (mean character height/width) and one with integer counts.
# NOTE(review): what the string keys represent is not visible here.
+ class Histograms(AliasModel):
169
+ """Histogram."""
170
+
171
+ mean_char_height: dict[str, float] = Field(..., alias="mean-char-height")
172
+ mean_char_width: dict[str, float] = Field(..., alias="mean-char-width")
173
+ number_of_chars: dict[str, int] = Field(..., alias="number-of-chars")
174
+
175
+
176
# Document-level information of a raw PDF: the Histograms model and a list
# of style strings. Both fields are required.
+ class PdfInfo(BaseModel):
177
+ """PDF info."""
178
+
179
+ histograms: Histograms
180
+ styles: list[str]
181
+
182
+
183
# Root model of the raw format: document-level `info` plus the list of pages.
+ class RawPdf(BaseModel):
184
+ """Raw PDF."""
185
+
186
+ info: PdfInfo
187
+ pages: list[Page]
@@ -0,0 +1,393 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Models for the Docling Document data type."""
7
+
8
+ from datetime import datetime
9
+ from typing import Generic, Optional, Union
10
+
11
+ from pydantic import (
12
+ AnyHttpUrl,
13
+ BaseModel,
14
+ Field,
15
+ NonNegativeInt,
16
+ StrictStr,
17
+ model_validator,
18
+ )
19
+
20
+ from docling_core.search.mapping import es_field
21
+ from docling_core.types.base import (
22
+ Acquisition,
23
+ CollectionDocumentInfo,
24
+ CollectionNameTypeT,
25
+ DescriptionAdvancedT,
26
+ DescriptionAnalyticsT,
27
+ FileInfoObject,
28
+ Identifier,
29
+ IdentifierTypeT,
30
+ LanguageT,
31
+ Log,
32
+ )
33
+ from docling_core.types.doc.base import (
34
+ BaseCell,
35
+ BaseText,
36
+ BitmapObject,
37
+ PageDimensions,
38
+ PageReference,
39
+ Ref,
40
+ S3Data,
41
+ Table,
42
+ )
43
+ from docling_core.utils.alias import AliasModel
44
+
45
+
46
# Optional descriptive metadata of a file; every field defaults to None.
# `extra="forbid"` makes pydantic reject input keys that are not declared.
+ class CCSFileInfoDescription(BaseModel, extra="forbid"):
47
+ """File info description."""
48
+
49
+ author: Optional[list[StrictStr]] = None
50
+ keywords: Optional[str] = None
51
+ subject: Optional[str] = None
52
+ title: Optional[StrictStr] = None
53
# Stored as a plain string; it is not parsed into a datetime by this model.
+ creation_date: Optional[str] = None # datetime
54
+
55
+
56
# Extends FileInfoObject (docling_core.types.base) with four optional fields
# and rejects undeclared keys (`extra="forbid"`).
# `es_field(...)` comes from docling_core.search.mapping; its result is passed
# to pydantic as `json_schema_extra`, so it affects the generated schema only.
+ class CCSFileInfoObject(FileInfoObject, extra="forbid"):
57
+ """File info object."""
58
+
59
# Read from the input key "#-pages".
+ num_pages: Optional[int] = Field(default=None, alias="#-pages")
60
+
61
+ collection_name: Optional[str] = Field(
62
+ default=None,
63
+ alias="collection-name",
64
+ json_schema_extra=es_field(type="keyword", ignore_above=8191),
65
+ )
66
+ description: Optional[CCSFileInfoDescription] = Field(
67
+ default=None, json_schema_extra=es_field(suppress=True)
68
+ )
69
+ page_hashes: Optional[list[PageReference]] = Field(
70
+ default=None, alias="page-hashes"
71
+ )
72
+
73
+
74
# Affiliation with a required `name` and optional `id` and `source`.
# Undeclared keys are rejected (`extra="forbid"`).
+ class Affiliation(BaseModel, extra="forbid"):
75
+ """Affiliation."""
76
+
77
# The es_field call declares two sub-fields, "lower" and "keyword".
# Unlike Author.name, no `type` argument is passed here.
+ name: str = Field(
78
+ ...,
79
+ json_schema_extra=es_field(
80
+ fields={
81
+ "lower": {
82
+ "normalizer": "lowercase_asciifolding",
83
+ "type": "keyword",
84
+ "ignore_above": 8191,
85
+ },
86
+ "keyword": {"type": "keyword", "ignore_above": 8191},
87
+ },
88
+ ),
89
+ )
90
+ id: Optional[str] = Field(
91
+ default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
92
+ )
93
+ source: Optional[str] = Field(
94
+ default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
95
+ )
96
+
97
+
98
# Author with a required `name`, optional `id` and `source`, and an optional
# list of Affiliation objects. Undeclared keys are rejected.
+ class Author(BaseModel, extra="forbid"):
99
+ """Author."""
100
+
101
# Same "lower"/"keyword" sub-fields as Affiliation.name, plus type="text".
+ name: str = Field(
102
+ ...,
103
+ json_schema_extra=es_field(
104
+ type="text",
105
+ fields={
106
+ "lower": {
107
+ "normalizer": "lowercase_asciifolding",
108
+ "type": "keyword",
109
+ "ignore_above": 8191,
110
+ },
111
+ "keyword": {"type": "keyword", "ignore_above": 8191},
112
+ },
113
+ ),
114
+ )
115
+ id: Optional[str] = Field(
116
+ default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
117
+ )
118
+ source: Optional[str] = Field(
119
+ default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
120
+ )
121
+ affiliations: Optional[list[Affiliation]] = None
122
+
123
+
124
# Generic model parametrised by IdentifierTypeT, which is forwarded to the
# Identifier items of `identifiers`. Undeclared keys are rejected.
# `name` is the only field without a default and is therefore required.
+ class Publication(BaseModel, Generic[IdentifierTypeT], extra="forbid"):
125
+ """Publication details of a journal or venue."""
126
+
127
+ identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field(
128
+ default=None,
129
+ description="Unique identifiers of a publication venue.",
130
+ )
131
+ name: StrictStr = Field(
132
+ json_schema_extra=es_field(type="keyword", ignore_above=8191),
133
+ description="Name of the publication.",
134
+ )
135
+ alternate_names: Optional[list[StrictStr]] = Field(
136
+ default=None,
137
+ json_schema_extra=es_field(type="text"),
138
+ title="Alternate Names",
139
+ description="Other names or abbreviations of this publication.",
140
+ )
141
# `type` is a list of strings, not a single string.
+ type: Optional[list[StrictStr]] = Field(
142
+ default=None,
143
+ json_schema_extra=es_field(type="keyword", ignore_above=8191),
144
+ description="Type of publication (journal article, conference, review,...).",
145
+ )
146
+ pages: Optional[StrictStr] = Field(
147
+ default=None,
148
+ json_schema_extra=es_field(type="text"),
149
+ description="Page range in the publication.",
150
+ )
151
+ issue: Optional[StrictStr] = Field(
152
+ default=None,
153
+ json_schema_extra=es_field(type="keyword", ignore_above=8191),
154
+ description="Publication issue (issue number).",
155
+ )
156
+ volume: Optional[StrictStr] = Field(
157
+ default=None,
158
+ json_schema_extra=es_field(type="keyword", ignore_above=8191),
159
+ description="Publication volume.",
160
+ )
161
# Validated by pydantic as an HTTP(S) URL (AnyHttpUrl).
+ url: Optional[AnyHttpUrl] = Field(
162
+ default=None,
163
+ json_schema_extra=es_field(type="keyword", ignore_above=8191),
164
+ description="URL on the publication site.",
165
+ )
166
+
167
+
168
# License information with two optional strict-string fields, `code` and
# `text`. Undeclared keys are rejected (`extra="forbid"`).
+ class DescriptionLicense(BaseModel, extra="forbid"):
169
+ """Licence in document description."""
170
+
171
+ code: Optional[StrictStr] = Field(
172
+ default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
173
+ )
174
+ text: Optional[StrictStr] = None
175
+
176
+
177
# Description (metadata) section of a document, generic over five type
# parameters imported from docling_core.types.base. They are forwarded to the
# `advanced`, `analytics`, `references`, `languages` and `collection` fields.
# Every field is optional except `logs`, which has no default.
+ class CCSDocumentDescription(
178
+ AliasModel,
179
+ Generic[
180
+ DescriptionAdvancedT,
181
+ DescriptionAnalyticsT,
182
+ IdentifierTypeT,
183
+ LanguageT,
184
+ CollectionNameTypeT,
185
+ ],
186
+ ):
187
+ """Description in document."""
188
+
189
+ title: Optional[StrictStr] = None
190
+ abstract: Optional[list[StrictStr]] = None
191
+ authors: Optional[list[Author]] = None
192
+ affiliations: Optional[list[Affiliation]] = None
193
+ subjects: Optional[list[str]] = Field(
194
+ default=None,
195
+ json_schema_extra=es_field(
196
+ fields={"keyword": {"ignore_above": 8191, "type": "keyword"}}
197
+ ),
198
+ )
199
+ keywords: Optional[list[str]] = Field(
200
+ default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
201
+ )
202
+ publication_date: Optional[datetime] = None
203
+ languages: Optional[list[LanguageT]] = Field(
204
+ default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
205
+ )
206
# Attribute is `license_`; the input/output key is "license".
+ license_: Optional[DescriptionLicense] = Field(default=None, alias="license")
207
+ publishers: Optional[list[StrictStr]] = Field(
208
+ default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
209
+ )
210
+ url_refs: Optional[list[str]] = Field(
211
+ default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
212
+ )
213
+ references: Optional[list[Identifier[IdentifierTypeT]]] = None
214
# NOTE(review): Publication is used without a type argument here, whereas
# `references` parametrises Identifier with IdentifierTypeT -- confirm that
# leaving Publication unparametrised is intended.
+ publication: Optional[list[Publication]] = Field(
215
+ default=None, description="List of publication journals or venues."
216
+ )
217
+ reference_count: Optional[NonNegativeInt] = Field(
218
+ default=None,
219
+ title="Reference Count",
220
+ description="Total number of documents referenced by this document.",
221
+ json_schema_extra=es_field(type="integer"),
222
+ )
223
+ citation_count: Optional[NonNegativeInt] = Field(
224
+ default=None,
225
+ title="Citation Count",
226
+ description=(
227
+ "Total number of citations that this document has received (number "
228
+ "of documents in whose bibliography this document appears)."
229
+ ),
230
+ json_schema_extra=es_field(type="integer"),
231
+ )
232
+ citation_date: Optional[datetime] = Field(
233
+ default=None,
234
+ title="Citation Count Date",
235
+ description="Last update date of the citation count.",
236
+ )
237
+ advanced: Optional[DescriptionAdvancedT] = None
238
+ analytics: Optional[DescriptionAnalyticsT] = None
239
# Required: this is the only field of the model without a default value.
+ logs: list[Log]
240
+ collection: Optional[CollectionDocumentInfo[CollectionNameTypeT]] = Field(
241
+ default=None, description="The collection information of this document."
242
+ )
243
+ acquisition: Optional[Acquisition] = Field(
244
+ default=None,
245
+ description=(
246
+ "Information on how the document was obtained, for data governance"
247
+ " purposes."
248
+ ),
249
+ )
250
+
251
+
252
# Base document model, generic over the same five type parameters as
# CCSDocumentDescription, to which they are forwarded unchanged.
# Required input: "_name", "description" and "file-info"; the rest is optional.
+ class MinimalDocument(
253
+ AliasModel,
254
+ Generic[
255
+ DescriptionAdvancedT,
256
+ DescriptionAnalyticsT,
257
+ IdentifierTypeT,
258
+ LanguageT,
259
+ CollectionNameTypeT,
260
+ ],
261
+ ):
262
+ """Minimal model for a document."""
263
+
264
+ name: StrictStr = Field(alias="_name")
265
# Defaults to "document" and is exposed under the key "type".
+ obj_type: StrictStr = Field("document", alias="type")
266
+ description: CCSDocumentDescription[
267
+ DescriptionAdvancedT,
268
+ DescriptionAnalyticsT,
269
+ IdentifierTypeT,
270
+ LanguageT,
271
+ CollectionNameTypeT,
272
+ ]
273
+ file_info: FileInfoObject = Field(alias="file-info")
274
# Each main-text item is validated as either a Ref or a BaseText.
+ main_text: Optional[list[Union[Ref, BaseText]]] = Field(
275
+ default=None, alias="main-text"
276
+ )
277
+ figures: Optional[list[BaseCell]] = None
278
+ tables: Optional[list[Table]] = None
279
+
280
+
281
class CCSDocument(
    MinimalDocument,
    Generic[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ],
):
    """Model for a CCS-generated document."""

    # Overrides the parent's default value ("document") for the "type" key.
    obj_type: StrictStr = Field("pdf-document", alias="type")
    bitmaps: Optional[list[BitmapObject]] = None
    equations: Optional[list[BaseCell]] = None
    footnotes: Optional[list[BaseText]] = None
    # Narrows the parent's FileInfoObject to the CCS-specific subclass.
    file_info: CCSFileInfoObject = Field(alias="file-info")
    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
        default=None,
        alias="main-text",
    )
    page_dimensions: Optional[list[PageDimensions]] = Field(
        default=None, alias="page-dimensions"
    )
    page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
    page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
    s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")

    @model_validator(mode="before")
    @classmethod
    def from_dict(cls, data):
        """Validates and fixes the input data.

        Runs before field validation and normalises raw dictionaries in place:

        - ``description.collection`` is created when missing or null and gets
          ``type="Document"`` unless a type is already set;
        - ``description.logs`` defaults to an empty list when missing or null;
        - a string ``abstract`` and dict ``affiliations``/``authors`` are
          wrapped into one-element lists, other non-list values are dropped;
        - ``__ref`` keys of ``main-text`` items are renamed to ``$ref``.

        Input that is not a dictionary, or whose ``description`` is missing or
        not a dictionary, is returned untouched so that the regular field
        validation reports the problem instead of this hook raising
        ``KeyError``/``AttributeError``.

        Args:
            data: The raw input handed to the model.

        Returns:
            The same object, fixed in place where applicable.
        """
        # "before" validators receive arbitrary input, not only dictionaries.
        if not isinstance(data, dict):
            return data

        description = data.get("description")
        if not isinstance(description, dict):
            return data

        # An explicit null must be replaced: setdefault() would keep the None
        # and the following access would fail.
        collection = description.get("collection")
        if collection is None:
            collection = description["collection"] = {}
        if isinstance(collection, dict):
            collection.setdefault("type", "Document")

        # `logs` is a required list on the description model.
        if description.get("logs") is None:
            description["logs"] = []

        abstract = description.get("abstract")
        if abstract is not None and not isinstance(abstract, list):
            if isinstance(abstract, str):
                description["abstract"] = [abstract]
            else:
                description.pop("abstract")

        for key in ("affiliations", "authors"):
            value = description.get(key)
            if value is not None and not isinstance(value, list):
                if isinstance(value, dict):
                    description[key] = [value]
                else:
                    description.pop(key)

        main_text = data.get("main-text")
        if isinstance(main_text, list):
            for item in main_text:
                # Items may already be model instances; only dicts are fixed.
                if isinstance(item, dict) and (ref := item.pop("__ref", None)):
                    item["$ref"] = ref

        return data
343
+
344
+
345
class ExportedCCSDocument(
    MinimalDocument,
    Generic[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ],
):
    """Document model for Docling."""

    # Overrides the parent's default value ("document") for the "type" key.
    obj_type: StrictStr = Field(
        "pdf-document",
        alias="type",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    bitmaps: Optional[list[BitmapObject]] = None
    equations: Optional[list[BaseCell]] = None
    footnotes: Optional[list[BaseText]] = None
    description: CCSDocumentDescription[
        DescriptionAdvancedT,
        DescriptionAnalyticsT,
        IdentifierTypeT,
        LanguageT,
        CollectionNameTypeT,
    ]
    # Narrows the parent's FileInfoObject to the CCS-specific subclass.
    file_info: CCSFileInfoObject = Field(alias="file-info")
    main_text: Optional[list[Union[Ref, BaseText]]] = Field(
        default=None, alias="main-text"
    )
    page_dimensions: Optional[list[PageDimensions]] = Field(
        default=None, alias="page-dimensions"
    )
    page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
    page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
    s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = None

    @model_validator(mode="before")
    @classmethod
    def from_dict(cls, data):
        """Fix ref in main-text.

        Runs before field validation and renames the ``__ref`` key of every
        dictionary item of ``main-text`` to ``$ref``, in place. Input that is
        not a dictionary, a ``main-text`` value that is not a list, and items
        that are not dictionaries are left untouched for the regular field
        validation to handle.

        Args:
            data: The raw input handed to the model.

        Returns:
            The same object, fixed in place where applicable.
        """
        # "before" validators receive arbitrary input, not only dictionaries.
        if not isinstance(data, dict):
            return data

        main_text = data.get("main-text")
        if isinstance(main_text, list):
            for item in main_text:
                # Items may already be model instances; only dicts are fixed.
                if isinstance(item, dict) and (ref := item.pop("__ref", None)):
                    item["$ref"] = ref

        return data
@@ -0,0 +1,6 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Package for models defined by the Generic type."""
@@ -0,0 +1,33 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define a generic Docling type."""
7
+
8
+ from typing import Optional
9
+
10
+ from pydantic import Field, StrictStr
11
+
12
+ from docling_core.search.mapping import es_field
13
+ from docling_core.types.base import FileInfoObject
14
+ from docling_core.utils.alias import AliasModel
15
+
16
+
17
# Model with an optional `name` (key "_name") and a required `file_info`
# (key "file-info") of type FileInfoObject.
# NOTE(review): the class name equals `typing.Generic`; modules that need both
# must import one of them under a different name.
+ class Generic(AliasModel):
18
+ """A representation of a generic document."""
19
+
20
+ name: Optional[StrictStr] = Field(
21
+ default=None,
22
+ description="A short description or summary of the document.",
23
+ alias="_name",
24
+ json_schema_extra=es_field(type="text"),
25
+ )
26
+
27
# Required: no default is given for this field.
+ file_info: FileInfoObject = Field(
28
+ title="Document information",
29
+ description=(
30
+ "Minimal identification information of the document within a collection."
31
+ ),
32
+ alias="file-info",
33
+ )
@@ -0,0 +1,6 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Package for models defining NLP artifacts."""
@@ -0,0 +1,74 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define the model for Q&A pairs."""
7
+ from typing import Generic, Optional
8
+
9
+ from pydantic import BaseModel, Field, StrictBool, StrictStr
10
+
11
+ from docling_core.search.mapping import es_field
12
+ from docling_core.types.base import DescriptionAdvancedT, StrictDateTime, UniqueList
13
+ from docling_core.types.nlp.qa_labels import QALabelling
14
+
15
+
16
# Question-answering pair, generic over DescriptionAdvancedT, which types the
# optional `advanced` field.
# Required fields (no default): context, question, answer, created, paths.
+ class QAPair(BaseModel, Generic[DescriptionAdvancedT]):
17
+ """A representation of a question-answering (QA) pair."""
18
+
19
+ context: StrictStr = Field(
20
+ description=(
21
+ "A single string containing the context of the question enabling the"
22
+ " presentation of the answer."
23
+ )
24
+ )
25
+ question: StrictStr = Field(description="A question on the given context.")
26
+ answer: StrictStr = Field(
27
+ description="The answer to the question from the context."
28
+ )
29
+ short_answer: Optional[StrictStr] = Field(
30
+ default=None, description="Alternative and concise answer."
31
+ )
32
# The three flags below default to False but are typed Optional, so an
# explicit null is accepted as well.
+ retrieved_context: Optional[StrictBool] = Field(
33
+ default=False,
34
+ description="Whether the context was retrieved from the question.",
35
+ )
36
+ generated_question: Optional[StrictBool] = Field(
37
+ default=False, description="Whether the question was generated by an AI model."
38
+ )
39
+ generated_answer: Optional[StrictBool] = Field(
40
+ default=False, description="Whether the answer was generated by an AI model."
41
+ )
42
# StrictDateTime is defined in docling_core.types.base (not visible here).
+ created: StrictDateTime = Field(
43
+ description="Datetime when the QA pair was created ."
44
+ )
45
+ user: Optional[StrictStr] = Field(
46
+ default=None,
47
+ description=(
48
+ "Unique identifier of the user that created or curated this QA pair."
49
+ ),
50
+ json_schema_extra=es_field(type="keyword", ignore_above=8191),
51
+ )
52
+ model: Optional[StrictStr] = Field(
53
+ default=None,
54
+ description="Unique identifier of the model used to generate this QA pair.",
55
+ json_schema_extra=es_field(type="keyword", ignore_above=8191),
56
+ )
57
# UniqueList is defined in docling_core.types.base; presumably it rejects
# duplicate entries -- TODO confirm.
+ paths: UniqueList[StrictStr] = Field(
58
+ description=(
59
+ "One or more references to a document that identify the provenance of the"
60
+ " QA pair context."
61
+ ),
62
+ examples=[
63
+ "badce7c84d0ba7ba0fb5e94492b0d91e2506a7cb48e4524ad572c546a35f768e#/"
64
+ "main-text/4"
65
+ ],
66
+ json_schema_extra=es_field(type="keyword", ignore_above=8191),
67
+ )
68
+ advanced: Optional[DescriptionAdvancedT] = Field(
69
+ default=None,
70
+ description="Document metadata to provide more details on the context.",
71
+ )
72
+ labels: Optional[QALabelling] = Field(
73
+ default=None, description="QApair labelling axes."
74
+ )