docling-core 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (46) hide show
  1. docling_core/__init__.py +6 -0
  2. docling_core/py.typed +0 -0
  3. docling_core/resources/schemas/doc/ANN.json +171 -0
  4. docling_core/resources/schemas/doc/DOC.json +300 -0
  5. docling_core/resources/schemas/doc/OCR-output.json +166 -0
  6. docling_core/resources/schemas/doc/RAW.json +158 -0
  7. docling_core/resources/schemas/generated/ccs_document_schema.json +1071 -0
  8. docling_core/resources/schemas/generated/minimal_document_schema_flat.json +1129 -0
  9. docling_core/resources/schemas/search/search_doc_mapping.json +104 -0
  10. docling_core/resources/schemas/search/search_doc_mapping_v2.json +256 -0
  11. docling_core/search/__init__.py +6 -0
  12. docling_core/search/json_schema_to_search_mapper.py +406 -0
  13. docling_core/search/mapping.py +29 -0
  14. docling_core/search/meta.py +93 -0
  15. docling_core/search/package.py +56 -0
  16. docling_core/types/__init__.py +25 -0
  17. docling_core/types/base.py +248 -0
  18. docling_core/types/doc/__init__.py +6 -0
  19. docling_core/types/doc/base.py +199 -0
  20. docling_core/types/doc/doc_ann.py +76 -0
  21. docling_core/types/doc/doc_ocr.py +83 -0
  22. docling_core/types/doc/doc_raw.py +187 -0
  23. docling_core/types/doc/document.py +393 -0
  24. docling_core/types/gen/__init__.py +6 -0
  25. docling_core/types/gen/generic.py +33 -0
  26. docling_core/types/nlp/__init__.py +6 -0
  27. docling_core/types/nlp/qa.py +74 -0
  28. docling_core/types/nlp/qa_labels.py +118 -0
  29. docling_core/types/rec/__init__.py +6 -0
  30. docling_core/types/rec/attribute.py +55 -0
  31. docling_core/types/rec/base.py +90 -0
  32. docling_core/types/rec/predicate.py +133 -0
  33. docling_core/types/rec/record.py +95 -0
  34. docling_core/types/rec/statement.py +41 -0
  35. docling_core/types/rec/subject.py +77 -0
  36. docling_core/utils/__init__.py +6 -0
  37. docling_core/utils/alias.py +27 -0
  38. docling_core/utils/ds_generate_docs.py +144 -0
  39. docling_core/utils/ds_generate_jsonschema.py +62 -0
  40. docling_core/utils/validate.py +86 -0
  41. docling_core/utils/validators.py +100 -0
  42. docling_core-0.0.1.dist-info/LICENSE +21 -0
  43. docling_core-0.0.1.dist-info/METADATA +133 -0
  44. docling_core-0.0.1.dist-info/RECORD +46 -0
  45. docling_core-0.0.1.dist-info/WHEEL +4 -0
  46. docling_core-0.0.1.dist-info/entry_points.txt +5 -0
@@ -0,0 +1,118 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define models for labeling Q&A pairs."""
7
+ from typing import Literal, Optional
8
+
9
+ from pydantic import BaseModel, Field
10
+
11
+ from docling_core.search.mapping import es_field
12
+
13
# Scope of a question: the whole corpus, one specific document, or out of scope.
QAScopeLabel = Literal["corpus", "document", "out_of_scope"]
# How well a question matches the context it is paired with.
QAAlignmentLabel = Literal["aligned", "tangential", "misaligned"]
# Whether an answer is entailed by the question and the context.
QACorrectnessLabel = Literal["entailed", "not_entailed"]
# Whether an answer carries all the relevant information found in the context.
QACompletenessLabel = Literal["complete", "incomplete"]
# Nature of the information a question asks for.
QAInformationLabel = Literal[
    "fact_single",
    "fact_multi",
    "summary",
    "reasoning",
    "choice",
    "procedure",
    "opinion",
    "feedback",
]
27
+
28
+
29
class QALabelling(BaseModel, extra="forbid"):
    """Subclass to classify QA pair."""

    # Every label is optional and defaults to None; unknown fields are rejected
    # because the model is declared with extra="forbid".
    #
    # NOTE(review): the continuation lines of the multi-line `description`
    # literals are reproduced flush-left, exactly as visible in the reviewed
    # view; confirm their leading whitespace against the packaged file.

    # Scope label, judged from the question alone.
    scope: Optional[QAScopeLabel] = Field(
        default=None,
        description="""Enumeration of QA scope types based on question only.
- Corpus: question is asked on the entire corpus
> Example: "What is the operating temperature of device X?"
- Document: need to know the precise document before answering the question
> Example: "What is its operating temperature?"
- Out of scope: question is out of scope for the system
> Example: "What is the volume of moon?" """,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # Alignment label, judged from the question-context pair.
    alignment: Optional[QAAlignmentLabel] = Field(
        default=None,
        description="""Enumeration of QA alignment types based on question-context pair.
Given the following context: "Device X works between 2 and 20 degrees C"
A question can be:
- Aligned: the context has information that the question seeks
> Example: "Can device X work at 10 degrees?"
- Tangential: the context does not have the information directly
but the question is related to the context
> Example: "Is device X safe?"
- Misaligned: the question has nothing to do with the context
> Example: "Why is device Y not working?" """,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # Correctness label, judged from the question-answer-context triplet.
    correctness: Optional[QACorrectnessLabel] = Field(
        default=None,
        description="""Enumeration of QA correctness types based on
question-answer-context triplet.
Given the following context: "Device X works between 2 and 20 degrees C"
and the following question: "Can device X work at 10 degrees?"
An answer can be:
- Entailed: answer is entailed to both question and context
> Example: "Yes, as it works between 2 and 20 degrees."
- Not entailed: answer is not entailed to either question or context
> Example: "Yes, device X can work at any temperature." """,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # Completeness label, judged from the question-answer-context triplet.
    completeness: Optional[QACompletenessLabel] = Field(
        default=None,
        description="""Enumeration of QA completeness types based on
question-answer-context triplet.
Given the following context: "A, B, C, and D met on Friday."
and the following question: "Who was in the meeting?"
An answer can be:
- Complete: Answer contains all relevant information requested by a
question that can be extracted from the associated ground-truth context
> Example: "A, B, C, and D."
- Incomplete: Answer does not contain the entire relevant information in
the context
> Example: "B and D" """,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # Nature-of-information label, judged from the question alone.
    information: Optional[QAInformationLabel] = Field(
        default=None,
        description="""Enumeration of QA nature of information types based on question
only.
- Single fact: Answer should be a short phrase containing a numerical or
textual fact
> Example: "What is the boiling point of water?"
- Multiple fact: Answer is a list of two or more facts (not necessarily in
list format)
> Example: "What is the minimum and maximum age of people working at
IBM?"
- Summary: Answer summarises a part of the context without any modification.
> Example: "Briefly describe the temperature requirements for this
device in a table"
- Reasoning: Answer requires inferring information from the context that
can be inferred but is not explicitly stated (e.g., operating
temperature is given and the question asks if the device can operate at
a particular temperature)
> Example: "Why can I not operate this device under water?"
- Multiple choice: Question provides a few choices implicitly or explicitly
and the answer must be one of these choices. Includes yes/no questions
> Example: "If I operate this device at 10 degrees, will it be in the
green range or red?"
- Procedure: Answer outlines the steps to do something. As opposed to a
summary, the order of information matters here
> Example: "How can I access part X of device Y?"
- Opinion: The context provides several viewpoints and the question
requests the opinion of the chatbot
> Example: "Is device X better than Y?"
- Feedback: The question is actually a feedback on the preceding generation
within a session
> Example: "Your summary was inadequate" """,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
@@ -0,0 +1,6 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Package for models defined by the Record type."""
@@ -0,0 +1,55 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define the model Attribute."""
7
+ from typing import Generic, Optional
8
+
9
+ from pydantic import Field
10
+ from typing_extensions import Annotated
11
+
12
+ from docling_core.search.mapping import es_field
13
+ from docling_core.types.base import (
14
+ IdentifierTypeT,
15
+ PredicateKeyNameT,
16
+ PredicateKeyTypeT,
17
+ PredicateValueTypeT,
18
+ ProvenanceTypeT,
19
+ )
20
+ from docling_core.types.rec.base import ProvenanceItem
21
+ from docling_core.types.rec.predicate import Predicate
22
+ from docling_core.utils.alias import AliasModel
23
+
24
+
25
class Attribute(
    AliasModel,
    Generic[
        IdentifierTypeT,
        PredicateValueTypeT,
        PredicateKeyNameT,
        PredicateKeyTypeT,
        ProvenanceTypeT,
    ],
    extra="forbid",
):
    """Attribute model that describes a list of characteristics."""

    # Required confidence: a strict float within [0.0, 1.0]; inf and NaN are
    # refused through allow_inf_nan=False.
    conf: Annotated[
        float,
        Field(strict=True, ge=0.0, le=1.0, allow_inf_nan=False),
    ] = Field(
        default=...,
        title="Confidence",
        description="The confidence level of this attribute characteristics.",
        json_schema_extra=es_field(type="float"),
    )

    # Optional list of provenance items backing this attribute.
    prov: Optional[
        list[ProvenanceItem[IdentifierTypeT, ProvenanceTypeT]]
    ] = Field(
        default=None,
        title="Provenance",
        description="The sources of this attribute characteristics.",
    )

    # Required list of predicates carried by this attribute.
    predicates: list[
        Predicate[PredicateValueTypeT, PredicateKeyNameT, PredicateKeyTypeT]
    ] = Field(
        default=...,
        description="A list of characteristics (type, value, and name).",
    )
@@ -0,0 +1,90 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define the base models for the Record type."""
7
+ from typing import Generic, List, Optional
8
+
9
+ from pydantic import Field, StrictInt, StrictStr
10
+ from typing_extensions import Annotated
11
+
12
+ from docling_core.search.mapping import es_field
13
+ from docling_core.types.base import Identifier, IdentifierTypeT, ProvenanceTypeT
14
+ from docling_core.utils.alias import AliasModel
15
+
16
+
17
class ProvenanceItem(
    AliasModel,
    Generic[IdentifierTypeT, ProvenanceTypeT],
    extra="forbid",
):
    """A representation of an object provenance."""

    # All fields are optional and default to None.

    # Kind of provenance; exposed under the alias "type".
    type_: Optional[ProvenanceTypeT] = Field(
        default=None,
        alias="type",
        title="The provenance type",
        description=(
            "Any string representing the type of provenance, e.g. `sentence`, "
            "`table`, or `doi`."
        ),
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )

    # Textual evidence for the provenance.
    text: Optional[StrictStr] = Field(
        default=None,
        title="Evidence of the provenance",
        description=(
            "A text representing the evidence of the provenance, e.g. the sentence "
            "text or the content of a table cell"
        ),
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )

    # Identifier of the object the provenance points to.
    reference: Optional[Identifier[IdentifierTypeT]] = Field(
        default=None,
        title="Reference to the provenance object",
        description=(
            "Reference to another object, e.g. record, statement, URL, or any other "
            "object that identifies the provenance"
        ),
    )

    # Location of the evidence inside the referenced object.
    path: Optional[StrictStr] = Field(
        default=None,
        title="The location of the provenance within the referenced object",
        description=(
            "A path that locates the evidence within the provenance object identified "
            "by the `reference` field using a JSON pointer notation, e.g., "
            "`#/main-text/5` to locate the `main-text` paragraph at index 5"
        ),
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )

    # Exactly two strict integers when present (min_length == max_length == 2).
    span: Optional[
        Annotated[list[StrictInt], Field(min_length=2, max_length=2)]
    ] = Field(
        default=None,
        title="The location of the item in the text/table",
        description=(
            "location of the item in the text/table referenced by the `path`,"
            " e.g., `[34, 67]`"
        ),
    )
73
+
74
+
75
class Provenance(AliasModel, Generic[IdentifierTypeT, ProvenanceTypeT]):
    """A representation of an evidence, as a list of provenance objects."""

    # Required confidence: a strict float within [0.0, 1.0].
    # allow_inf_nan=False mirrors `Attribute.conf`, so that NaN, which does
    # not compare as out of range, cannot pass as a valid confidence.
    conf: Annotated[
        float, Field(strict=True, ge=0.0, le=1.0, allow_inf_nan=False)
    ] = Field(
        ...,
        title="The confidence of the evidence",
        description=(
            "This value represents a score to the data item. Items originating from "
            " databases will typically have a score 1.0, while items resulting from "
            " an NLP model may have a value between 0.0 and 1.0."
        ),
        json_schema_extra=es_field(type="float"),
    )
    # Required list of provenance items (no default is supplied).
    prov: list[ProvenanceItem[IdentifierTypeT, ProvenanceTypeT]] = Field(
        title="Provenance", description="A list of provenance items."
    )
@@ -0,0 +1,133 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define the model Predicate."""
7
+ from datetime import datetime
8
+ from typing import Annotated, Generic, Optional
9
+
10
+ from pydantic import (
11
+ BaseModel,
12
+ Field,
13
+ StrictBool,
14
+ StrictFloat,
15
+ StrictStr,
16
+ field_validator,
17
+ )
18
+
19
+ from docling_core.search.mapping import es_field
20
+ from docling_core.types.base import (
21
+ Coordinates,
22
+ PredicateKeyNameT,
23
+ PredicateKeyTypeT,
24
+ PredicateValueTypeT,
25
+ )
26
+ from docling_core.utils.alias import AliasModel
27
+
28
+
29
class NumericalValue(BaseModel, extra="forbid"):
    """Model for numerical values."""

    # All five fields are required. The numeric ones are strict floats.
    min: StrictFloat = Field(
        default=...,
        json_schema_extra=es_field(type="float"),
    )
    max: StrictFloat = Field(
        default=...,
        json_schema_extra=es_field(type="float"),
    )
    val: StrictFloat = Field(
        default=...,
        json_schema_extra=es_field(type="float"),
    )
    err: StrictFloat = Field(
        default=...,
        json_schema_extra=es_field(type="float"),
    )
    unit: StrictStr = Field(
        default=...,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
39
+
40
+
41
class NominalValue(BaseModel, extra="forbid"):
    """Model for nominal (categorical) values."""

    # Required strict string, mapped as a search keyword.
    value: StrictStr = Field(
        default=...,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
47
+
48
+
49
class TextValue(BaseModel, extra="forbid"):
    """Model for textual values."""

    # Required strict string, mapped as search text.
    value: StrictStr = Field(
        default=...,
        json_schema_extra=es_field(type="text"),
    )
53
+
54
+
55
class BooleanValue(BaseModel, extra="forbid"):
    """Model for boolean values."""

    # Required strict boolean.
    value: StrictBool = Field(
        default=...,
        json_schema_extra=es_field(type="boolean"),
    )
59
+
60
+
61
class DatetimeValue(BaseModel, extra="forbid"):
    """Model for datetime values."""

    # Required datetime; no search-mapping hints are attached.
    value: datetime = Field(default=...)
65
+
66
+
67
class GeopointValue(BaseModel, extra="forbid"):
    """A representation of a geopoint (longitude and latitude coordinates)."""

    # Required coordinates, read by the validator as (longitude, latitude).
    value: Coordinates
    # Optional confidence: a strict float within [0.0, 1.0].
    # allow_inf_nan=False matches `Attribute.conf` and keeps NaN out.
    conf: Optional[
        Annotated[float, Field(strict=True, ge=0.0, le=1.0, allow_inf_nan=False)]
    ] = Field(default=None, json_schema_extra=es_field(type="float"))

    @field_validator("value")
    @classmethod
    def validate_coordinates(cls, v):
        """Validate that the coordinates lie within geographic bounds.

        Args:
            v: The coordinates, indexed as ``v[0]`` (longitude) and ``v[1]``
                (latitude).

        Returns:
            The coordinates, unchanged.

        Raises:
            ValueError: If the longitude is not within [-180, 180] or the
                latitude is not within [-90, 90].
        """
        longitude, latitude = v[0], v[1]
        # Written as `not (x <= limit)` rather than `x > limit` so that NaN,
        # for which every comparison is False, is rejected as well.
        if not abs(longitude) <= 180:
            raise ValueError("invalid longitude")
        if not abs(latitude) <= 90:
            raise ValueError("invalid latitude")
        return v
84
+
85
+
86
class PredicateKey(
    AliasModel,
    Generic[PredicateKeyNameT, PredicateKeyTypeT],
    extra="forbid",
):
    """Model for the key (unique identifier) of a predicate."""

    # Both fields are required (no default is supplied).
    name: PredicateKeyNameT = Field(
        description="Name of the predicate key.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # Exposed under the alias "type".
    type_: PredicateKeyTypeT = Field(
        alias="type",
        title="Type",
        description="Type of predicate key.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
101
+
102
+
103
class PredicateValue(
    AliasModel,
    Generic[PredicateValueTypeT],
    extra="forbid",
):
    """Model for the value of a predicate."""

    # Both fields are required (no default is supplied).
    name: StrictStr = Field(
        description="Name of the predicate value (actual value).",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # Exposed under the alias "type".
    type_: PredicateValueTypeT = Field(
        alias="type",
        description="Type of predicate value.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
115
+
116
+
117
class Predicate(
    AliasModel,
    Generic[PredicateValueTypeT, PredicateKeyNameT, PredicateKeyTypeT],
    extra="forbid",
):
    """Model for a predicate."""

    # Required key/value pair identifying the predicate.
    key: PredicateKey[PredicateKeyNameT, PredicateKeyTypeT]
    value: PredicateValue[PredicateValueTypeT]

    # Optional typed payloads; each one defaults to None.
    numerical_value: Optional[NumericalValue] = Field(default=None)
    numerical_value_si: Optional[NumericalValue] = Field(default=None)
    nominal_value: Optional[NominalValue] = Field(default=None)
    text_value: Optional[TextValue] = Field(default=None)
    boolean_value: Optional[BooleanValue] = Field(default=None)
    datetime_value: Optional[DatetimeValue] = Field(default=None)
    geopoint_value: Optional[GeopointValue] = Field(default=None)
@@ -0,0 +1,95 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define the model Record."""
7
+ from typing import Generic, Optional
8
+
9
+ from pydantic import BaseModel, Field, StrictStr
10
+
11
+ from docling_core.search.mapping import es_field
12
+ from docling_core.types.base import (
13
+ Acquisition,
14
+ CollectionNameTypeT,
15
+ CollectionRecordInfo,
16
+ FileInfoObject,
17
+ Identifier,
18
+ IdentifierTypeT,
19
+ Log,
20
+ PredicateKeyNameT,
21
+ PredicateKeyTypeT,
22
+ PredicateValueTypeT,
23
+ StrictDateTime,
24
+ SubjectNameTypeT,
25
+ SubjectTypeT,
26
+ )
27
+ from docling_core.types.rec.attribute import Attribute
28
+ from docling_core.types.rec.base import Provenance, ProvenanceTypeT
29
+ from docling_core.types.rec.subject import Subject
30
+
31
+
32
class RecordDescription(BaseModel, Generic[CollectionNameTypeT]):
    """Additional record metadata, including optional collection-specific fields."""

    # The only required field: the list of log entries.
    logs: list[Log] = Field(
        description="Logs that describe the ETL tasks applied to this record.",
    )

    # The remaining fields are optional and default to None.
    publication_date: Optional[StrictDateTime] = Field(
        default=None,
        title="Publication date",
        description=(
            "The date that best represents the last publication time of a record."
        ),
    )
    collection: Optional[CollectionRecordInfo[CollectionNameTypeT]] = Field(
        default=None,
        description="The collection information of this record.",
    )
    acquisition: Optional[Acquisition] = Field(
        default=None,
        description=(
            "Information on how the document was obtained, for data governance"
            " purposes."
        ),
    )
55
+
56
+
57
# NOTE(review): `Provenance` is itself generic over IdentifierTypeT and
# ProvenanceTypeT but is inherited here without type arguments; confirm that
# the inherited `prov` field is parameterised as intended.
class Record(
    Provenance,
    Generic[
        IdentifierTypeT,
        PredicateValueTypeT,
        PredicateKeyNameT,
        PredicateKeyTypeT,
        ProvenanceTypeT,
        SubjectTypeT,
        SubjectNameTypeT,
        CollectionNameTypeT,
    ],
):
    """A representation of a structured record in a database."""

    # Required file information; exposed under the alias "file-info".
    file_info: FileInfoObject = Field(alias="file-info")
    # NOTE(review): `RecordDescription` is generic over CollectionNameTypeT but
    # is used here without a type argument; confirm this is intended.
    description: RecordDescription
    # Required subject of the record.
    subject: Subject[IdentifierTypeT, SubjectTypeT, SubjectNameTypeT]
    # Optional list of attributes; defaults to None.
    attributes: Optional[
        list[
            Attribute[
                IdentifierTypeT,
                PredicateValueTypeT,
                PredicateKeyNameT,
                PredicateKeyTypeT,
                ProvenanceTypeT,
            ]
        ]
    ] = None
    # Optional free-text name; exposed under the alias "_name".
    name: Optional[StrictStr] = Field(
        default=None,
        description="A short description or summary of the record.",
        alias="_name",
        json_schema_extra=es_field(type="text"),
    )
    # Optional list of identifiers; defaults to None.
    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field(
        default=None,
        description="A list of unique identifiers of this record in a database.",
    )
@@ -0,0 +1,41 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define the model Statement."""
7
+ from typing import Generic
8
+
9
+ from pydantic import Field
10
+
11
+ from docling_core.types.base import (
12
+ IdentifierTypeT,
13
+ PredicateKeyNameT,
14
+ PredicateKeyTypeT,
15
+ PredicateValueTypeT,
16
+ ProvenanceTypeT,
17
+ SubjectNameTypeT,
18
+ SubjectTypeT,
19
+ )
20
+ from docling_core.types.rec.attribute import Attribute
21
+ from docling_core.types.rec.subject import Subject
22
+
23
+
24
# NOTE(review): the base `Attribute` is declared with extra="forbid" while
# this subclass passes extra="allow"; confirm the relaxation is intended.
class Statement(
    Attribute,
    Generic[
        IdentifierTypeT,
        PredicateValueTypeT,
        PredicateKeyNameT,
        PredicateKeyTypeT,
        ProvenanceTypeT,
        SubjectTypeT,
        SubjectNameTypeT,
    ],
    extra="allow",
):
    """A representation of a statement on a subject."""

    # Required subject, added on top of the fields inherited from Attribute.
    subject: Subject[
        IdentifierTypeT, SubjectTypeT, SubjectNameTypeT
    ] = Field(description="The subject (entity) of this statement.")
@@ -0,0 +1,77 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define the model Subject."""
7
+ from typing import Generic, Optional
8
+
9
+ from pydantic import Field, StrictStr
10
+
11
+ from docling_core.search.mapping import es_field
12
+ from docling_core.types.base import (
13
+ Identifier,
14
+ IdentifierTypeT,
15
+ SubjectNameTypeT,
16
+ SubjectTypeT,
17
+ )
18
+ from docling_core.types.doc.base import S3Reference
19
+ from docling_core.utils.alias import AliasModel
20
+
21
+
22
class SubjectNameIdentifier(Identifier[SubjectNameTypeT], Generic[SubjectNameTypeT]):
    """Identifier of subject names."""
24
+
25
+
26
class Subject(
    AliasModel,
    Generic[IdentifierTypeT, SubjectTypeT, SubjectNameTypeT],
    extra="forbid",
):
    """A representation of a subject."""

    # Required human-readable name.
    display_name: StrictStr = Field(
        title="Display Name",
        description=(
            "Name of the subject in natural language. It can be used for end-user "
            "applications to display a human-readable name. For instance, `B(2) Mg(1)` "
            "for `MgB2` or `International Business Machines` for `IBM`"
        ),
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # Optional image reference. The first description fragment now ends with
    # a space so the two sentences no longer run together.
    display_image: Optional[S3Reference] = Field(
        default=None,
        title="Display Image",
        description=(
            "Image representing the subject. It can be used for end-user applications. "
            "For example, the chemical structure drawing of a compound "
            "or the eight bar IBM logo for IBM."
        ),
        json_schema_extra=es_field(suppress=True),
    )
    # Required subject type; exposed under the alias "type".
    type_: SubjectTypeT = Field(
        alias="type",
        description=(
            "Main subject type. For instance, `material`, `material-class`, "
            "`material-device`, `company`, or `person`."
        ),
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # Required list of names.
    names: list[SubjectNameIdentifier[SubjectNameTypeT]] = Field(
        description=(
            "List of given names for this subject. They may not be unique across "
            "different subjects."
        )
    )
    # Optional list of identifiers; defaults to None.
    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field(
        default=None,
        description=(
            "List of unique identifiers in database. For instance, the `PubChem ID` "
            "of a record in the PubChem database."
        ),
    )
    # Optional list of labels; defaults to None.
    labels: Optional[list[StrictStr]] = Field(
        default=None,
        description="List of labels or categories for this subject.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
@@ -0,0 +1,6 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Package for modules to support data models."""
@@ -0,0 +1,27 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define utility models and types related to field aliases."""
7
+ from pydantic import BaseModel, ConfigDict
8
+
9
+
10
class AliasModel(BaseModel):
    """Model for alias fields to ensure instantiation and serialization by alias."""

    model_config = ConfigDict(populate_by_name=True)

    def model_dump(self, **kwargs) -> dict:
        """Return the model as a dictionary, keyed by alias unless told otherwise.

        Args:
            **kwargs: Forwarded to the parent ``model_dump``. ``by_alias`` is
                set to True only when the caller did not pass it.

        Returns:
            The dictionary produced by the parent ``model_dump``.
        """
        kwargs.setdefault("by_alias", True)
        return super().model_dump(**kwargs)

    def model_dump_json(self, **kwargs) -> str:
        """Return the model as a JSON string, keyed by alias unless told otherwise.

        Args:
            **kwargs: Forwarded to the parent ``model_dump_json``. ``by_alias``
                is set to True only when the caller did not pass it.

        Returns:
            The JSON string produced by the parent ``model_dump_json``.
        """
        kwargs.setdefault("by_alias", True)
        return super().model_dump_json(**kwargs)