docling-core 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/__init__.py +6 -0
- docling_core/py.typed +0 -0
- docling_core/resources/schemas/doc/ANN.json +171 -0
- docling_core/resources/schemas/doc/DOC.json +300 -0
- docling_core/resources/schemas/doc/OCR-output.json +166 -0
- docling_core/resources/schemas/doc/RAW.json +158 -0
- docling_core/resources/schemas/generated/ccs_document_schema.json +1071 -0
- docling_core/resources/schemas/generated/minimal_document_schema_flat.json +1129 -0
- docling_core/resources/schemas/search/search_doc_mapping.json +104 -0
- docling_core/resources/schemas/search/search_doc_mapping_v2.json +256 -0
- docling_core/search/__init__.py +6 -0
- docling_core/search/json_schema_to_search_mapper.py +406 -0
- docling_core/search/mapping.py +29 -0
- docling_core/search/meta.py +93 -0
- docling_core/search/package.py +56 -0
- docling_core/types/__init__.py +25 -0
- docling_core/types/base.py +248 -0
- docling_core/types/doc/__init__.py +6 -0
- docling_core/types/doc/base.py +199 -0
- docling_core/types/doc/doc_ann.py +76 -0
- docling_core/types/doc/doc_ocr.py +83 -0
- docling_core/types/doc/doc_raw.py +187 -0
- docling_core/types/doc/document.py +393 -0
- docling_core/types/gen/__init__.py +6 -0
- docling_core/types/gen/generic.py +33 -0
- docling_core/types/nlp/__init__.py +6 -0
- docling_core/types/nlp/qa.py +74 -0
- docling_core/types/nlp/qa_labels.py +118 -0
- docling_core/types/rec/__init__.py +6 -0
- docling_core/types/rec/attribute.py +55 -0
- docling_core/types/rec/base.py +90 -0
- docling_core/types/rec/predicate.py +133 -0
- docling_core/types/rec/record.py +95 -0
- docling_core/types/rec/statement.py +41 -0
- docling_core/types/rec/subject.py +77 -0
- docling_core/utils/__init__.py +6 -0
- docling_core/utils/alias.py +27 -0
- docling_core/utils/ds_generate_docs.py +144 -0
- docling_core/utils/ds_generate_jsonschema.py +62 -0
- docling_core/utils/validate.py +86 -0
- docling_core/utils/validators.py +100 -0
- docling_core-0.0.1.dist-info/LICENSE +21 -0
- docling_core-0.0.1.dist-info/METADATA +133 -0
- docling_core-0.0.1.dist-info/RECORD +46 -0
- docling_core-0.0.1.dist-info/WHEEL +4 -0
- docling_core-0.0.1.dist-info/entry_points.txt +5 -0
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define models for labeling Q&A pairs."""
|
|
7
|
+
from typing import Literal, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, Field
|
|
10
|
+
|
|
11
|
+
from docling_core.search.mapping import es_field
|
|
12
|
+
|
|
13
|
+
# Closed vocabularies used to label question-answer (QA) pairs.

# Scope of a question: whole corpus, one specific document, or out of scope.
QAScopeLabel = Literal[
    "corpus",
    "document",
    "out_of_scope",
]

# How well a question matches the context it is paired with.
QAAlignmentLabel = Literal[
    "aligned",
    "tangential",
    "misaligned",
]

# Whether an answer is entailed by the question and its context.
QACorrectnessLabel = Literal[
    "entailed",
    "not_entailed",
]

# Whether an answer contains all the relevant information from the context.
QACompletenessLabel = Literal[
    "complete",
    "incomplete",
]

# Nature of the information a question asks for.
QAInformationLabel = Literal[
    "fact_single", "fact_multi",
    "summary", "reasoning",
    "choice", "procedure",
    "opinion", "feedback",
]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class QALabelling(BaseModel, extra="forbid"):
|
|
30
|
+
"""Subclass to classify QA pair."""
|
|
31
|
+
|
|
32
|
+
scope: Optional[QAScopeLabel] = Field(
|
|
33
|
+
default=None,
|
|
34
|
+
description="""Enumeration of QA scope types based on question only.
|
|
35
|
+
- Corpus: question is asked on the entire corpus
|
|
36
|
+
> Example: "What is the operating temperature of device X?"
|
|
37
|
+
- Document: need to know the precise document before answering the question
|
|
38
|
+
> Example: "What is its operating temperature?"
|
|
39
|
+
- Out of scope: question is out of scope for the system
|
|
40
|
+
> Example: "What is the volume of moon?" """,
|
|
41
|
+
json_schema_extra=es_field(type="keyword", ignore_above=8191),
|
|
42
|
+
)
|
|
43
|
+
alignment: Optional[QAAlignmentLabel] = Field(
|
|
44
|
+
default=None,
|
|
45
|
+
description="""Enumeration of QA alignment types based on question-context pair.
|
|
46
|
+
Given the following context: "Device X works between 2 and 20 degrees C"
|
|
47
|
+
A question can be:
|
|
48
|
+
- Aligned: the context has information that the question seeks
|
|
49
|
+
> Example: "Can device X work at 10 degrees?"
|
|
50
|
+
- Tangential: the context does not have the information directly
|
|
51
|
+
but the question is related to the context
|
|
52
|
+
> Example: "Is device X safe?"
|
|
53
|
+
- Misaligned: the question has nothing to do with the context
|
|
54
|
+
> Example: "Why is device Y not working?" """,
|
|
55
|
+
json_schema_extra=es_field(type="keyword", ignore_above=8191),
|
|
56
|
+
)
|
|
57
|
+
correctness: Optional[QACorrectnessLabel] = Field(
|
|
58
|
+
default=None,
|
|
59
|
+
description="""Enumeration of QA correctness types based on
|
|
60
|
+
question-answer-context triplet.
|
|
61
|
+
Given the following context: "Device X works between 2 and 20 degrees C"
|
|
62
|
+
and the following question: "Can device X work at 10 degrees?"
|
|
63
|
+
An answer can be:
|
|
64
|
+
- Entailed: answer is entailed to both question and context
|
|
65
|
+
> Example: "Yes, as it works between 2 and 20 degrees."
|
|
66
|
+
- Not entailed: answer is not entailed to either question or context
|
|
67
|
+
> Example: "Yes, device X can work at any temperature." """,
|
|
68
|
+
json_schema_extra=es_field(type="keyword", ignore_above=8191),
|
|
69
|
+
)
|
|
70
|
+
completeness: Optional[QACompletenessLabel] = Field(
|
|
71
|
+
default=None,
|
|
72
|
+
description="""Enumeration of QA completeness types based on
|
|
73
|
+
question-answer-context triplet.
|
|
74
|
+
Given the following context: "A, B, C, and D met on Friday."
|
|
75
|
+
and the following question: "Who was in the meeting?"
|
|
76
|
+
An answer can be:
|
|
77
|
+
- Complete: Answer contains all relevant information requested by a
|
|
78
|
+
question that can be extracted from the associated ground-truth context
|
|
79
|
+
> Example: "A, B, C, and D."
|
|
80
|
+
- Incomplete: Answer does not contain the entire relevant information in
|
|
81
|
+
the context
|
|
82
|
+
> Example: "B and D" """,
|
|
83
|
+
json_schema_extra=es_field(type="keyword", ignore_above=8191),
|
|
84
|
+
)
|
|
85
|
+
information: Optional[QAInformationLabel] = Field(
|
|
86
|
+
default=None,
|
|
87
|
+
description="""Enumeration of QA nature of information types based on question
|
|
88
|
+
only.
|
|
89
|
+
- Single fact: Answer should be a short phrase containing a numerical or
|
|
90
|
+
textual fact
|
|
91
|
+
> Example: "What is the boiling point of water?"
|
|
92
|
+
- Multiple fact: Answer is a list of two or more facts (not necessarily in
|
|
93
|
+
list format)
|
|
94
|
+
> Example: "What is the minimum and maximum age of people working at
|
|
95
|
+
IBM?"
|
|
96
|
+
- Summary: Answer summarises a part of the context without any modification.
|
|
97
|
+
> Example: "Briefly describe the temperature requirements for this
|
|
98
|
+
device in a table"
|
|
99
|
+
- Reasoning: Answer requires inferring information from the context that
|
|
100
|
+
can be inferred but is not explicitly stated (e.g., operating
|
|
101
|
+
temperature is given and the question asks if the device can operate at
|
|
102
|
+
a particular temperature)
|
|
103
|
+
> Example: "Why can I not operate this device under water?"
|
|
104
|
+
- Multiple choice: Question provides a few choices implicitly or explicitly
|
|
105
|
+
and the answer must be one of these choices. Includes yes/no questions
|
|
106
|
+
> Example: "If I operate this device at 10 degrees, will it be in the
|
|
107
|
+
green range or red?"
|
|
108
|
+
- Procedure: Answer outlines the steps to do something. As opposed to a
|
|
109
|
+
summary, the order of information matters here
|
|
110
|
+
> Example: "How can I access part X of device Y?"
|
|
111
|
+
- Opinion: The context provides several viewpoints and the question
|
|
112
|
+
requests the opinion of the chatbot
|
|
113
|
+
> Example: "Is device X better than Y?"
|
|
114
|
+
- Feedback: The question is actually a feedback on the preceding generation
|
|
115
|
+
within a session
|
|
116
|
+
> Example: "Your summary was inadequate" """,
|
|
117
|
+
json_schema_extra=es_field(type="keyword", ignore_above=8191),
|
|
118
|
+
)
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define the model Attribute."""
|
|
7
|
+
from typing import Generic, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import Field
|
|
10
|
+
from typing_extensions import Annotated
|
|
11
|
+
|
|
12
|
+
from docling_core.search.mapping import es_field
|
|
13
|
+
from docling_core.types.base import (
|
|
14
|
+
IdentifierTypeT,
|
|
15
|
+
PredicateKeyNameT,
|
|
16
|
+
PredicateKeyTypeT,
|
|
17
|
+
PredicateValueTypeT,
|
|
18
|
+
ProvenanceTypeT,
|
|
19
|
+
)
|
|
20
|
+
from docling_core.types.rec.base import ProvenanceItem
|
|
21
|
+
from docling_core.types.rec.predicate import Predicate
|
|
22
|
+
from docling_core.utils.alias import AliasModel
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class Attribute(
    AliasModel,
    Generic[
        IdentifierTypeT,
        PredicateValueTypeT,
        PredicateKeyNameT,
        PredicateKeyTypeT,
        ProvenanceTypeT,
    ],
    extra="forbid",
):
    """A set of characteristics (predicates) with a confidence and sources.

    Unknown fields are rejected (`extra="forbid"`).
    """

    # Required strict float in [0.0, 1.0]; infinities and NaN are not allowed.
    conf: Annotated[
        float, Field(strict=True, ge=0.0, le=1.0, allow_inf_nan=False)
    ] = Field(
        ...,
        title="Confidence",
        description="The confidence level of this attribute characteristics.",
        json_schema_extra=es_field(type="float"),
    )

    # Optional list of provenance items; defaults to None when omitted.
    prov: Optional[
        list[ProvenanceItem[IdentifierTypeT, ProvenanceTypeT]]
    ] = Field(
        default=None,
        title="Provenance",
        description="The sources of this attribute characteristics.",
    )

    # Required list of predicates.
    predicates: list[
        Predicate[PredicateValueTypeT, PredicateKeyNameT, PredicateKeyTypeT]
    ] = Field(
        ...,
        description="A list of characteristics (type, value, and name).",
    )
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define the base models for the Record type."""
|
|
7
|
+
from typing import Generic, List, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import Field, StrictInt, StrictStr
|
|
10
|
+
from typing_extensions import Annotated
|
|
11
|
+
|
|
12
|
+
from docling_core.search.mapping import es_field
|
|
13
|
+
from docling_core.types.base import Identifier, IdentifierTypeT, ProvenanceTypeT
|
|
14
|
+
from docling_core.utils.alias import AliasModel
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ProvenanceItem(
    AliasModel,
    Generic[IdentifierTypeT, ProvenanceTypeT],
    extra="forbid",
):
    """One provenance entry describing where a piece of data comes from.

    Every field is optional and defaults to None; unknown fields are rejected.
    """

    # Stored as `type_` in Python, exposed as `type` through the alias.
    type_: Optional[ProvenanceTypeT] = Field(
        default=None,
        alias="type",
        title="The provenance type",
        description=(
            "Any string representing the type of provenance, "
            "e.g. `sentence`, `table`, or `doi`."
        ),
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )

    text: Optional[StrictStr] = Field(
        default=None,
        title="Evidence of the provenance",
        description=(
            "A text representing the evidence of the provenance, "
            "e.g. the sentence text or the content of a table cell"
        ),
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )

    reference: Optional[Identifier[IdentifierTypeT]] = Field(
        default=None,
        title="Reference to the provenance object",
        description=(
            "Reference to another object, e.g. record, statement, URL, "
            "or any other object that identifies the provenance"
        ),
    )

    path: Optional[StrictStr] = Field(
        default=None,
        title="The location of the provenance within the referenced object",
        description=(
            "A path that locates the evidence within the provenance object "
            "identified by the `reference` field using a JSON pointer notation, "
            "e.g., `#/main-text/5` to locate the `main-text` paragraph at index 5"
        ),
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )

    # When present, must be a list of exactly two strict integers.
    span: Optional[
        Annotated[List[StrictInt], Field(min_length=2, max_length=2)]
    ] = Field(
        default=None,
        title="The location of the item in the text/table",
        description=(
            "location of the item in the text/table referenced by the `path`, "
            "e.g., `[34, 67]`"
        ),
    )
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class Provenance(AliasModel, Generic[IdentifierTypeT, ProvenanceTypeT]):
    """A representation of an evidence, as a list of provenance objects."""

    # Required strict float constrained to the closed interval [0.0, 1.0].
    conf: Annotated[float, Field(strict=True, ge=0.0, le=1.0)] = Field(
        ...,
        title="The confidence of the evidence",
        # The adjacent literals are concatenated: each fragment ends with a single
        # space and the next one starts without one, so no double spaces appear.
        description=(
            "This value represents a score to the data item. Items originating from "
            "databases will typically have a score 1.0, while items resulting from "
            "an NLP model may have a value between 0.0 and 1.0."
        ),
        json_schema_extra=es_field(type="float"),
    )
    # Required list of provenance items (no default is supplied).
    prov: list[ProvenanceItem[IdentifierTypeT, ProvenanceTypeT]] = Field(
        title="Provenance", description="A list of provenance items."
    )
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define the model Predicate."""
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from typing import Annotated, Generic, Optional
|
|
9
|
+
|
|
10
|
+
from pydantic import (
|
|
11
|
+
BaseModel,
|
|
12
|
+
Field,
|
|
13
|
+
StrictBool,
|
|
14
|
+
StrictFloat,
|
|
15
|
+
StrictStr,
|
|
16
|
+
field_validator,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
from docling_core.search.mapping import es_field
|
|
20
|
+
from docling_core.types.base import (
|
|
21
|
+
Coordinates,
|
|
22
|
+
PredicateKeyNameT,
|
|
23
|
+
PredicateKeyTypeT,
|
|
24
|
+
PredicateValueTypeT,
|
|
25
|
+
)
|
|
26
|
+
from docling_core.utils.alias import AliasModel
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class NumericalValue(BaseModel, extra="forbid"):
    """Numerical value made of four strict floats and a unit string.

    All fields are required and unknown fields are rejected.
    """

    min: StrictFloat = Field(
        ...,
        json_schema_extra=es_field(type="float"),
    )
    max: StrictFloat = Field(
        ...,
        json_schema_extra=es_field(type="float"),
    )
    val: StrictFloat = Field(
        ...,
        json_schema_extra=es_field(type="float"),
    )
    err: StrictFloat = Field(
        ...,
        json_schema_extra=es_field(type="float"),
    )
    unit: StrictStr = Field(
        ...,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class NominalValue(BaseModel, extra="forbid"):
    """Nominal (categorical) value held as a single required strict string."""

    value: StrictStr = Field(
        ...,
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class TextValue(BaseModel, extra="forbid"):
    """Textual value held as a single required strict string."""

    value: StrictStr = Field(
        ...,
        json_schema_extra=es_field(type="text"),
    )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class BooleanValue(BaseModel, extra="forbid"):
    """Boolean value held as a single required strict bool."""

    value: StrictBool = Field(
        ...,
        json_schema_extra=es_field(type="boolean"),
    )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class DatetimeValue(BaseModel, extra="forbid"):
    """Datetime value; unknown fields are rejected."""

    # Required field with no extra schema metadata attached.
    value: datetime
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class GeopointValue(BaseModel, extra="forbid"):
    """A representation of a geopoint (longitude and latitude coordinates)."""

    # The validator below reads index 0 as longitude and index 1 as latitude.
    value: Coordinates
    # Optional strict float in [0.0, 1.0]; defaults to None when omitted.
    conf: Optional[Annotated[float, Field(strict=True, ge=0.0, le=1.0)]] = Field(
        default=None, json_schema_extra=es_field(type="float")
    )

    @field_validator("value")
    @classmethod
    def validate_coordinates(cls, v):
        """Check that the longitude and latitude lie within their valid ranges.

        Args:
            v: The coordinates, with the longitude at index 0 and the latitude
                at index 1.

        Returns:
            The coordinates, unchanged.

        Raises:
            ValueError: If the absolute longitude exceeds 180 or the absolute
                latitude exceeds 90.
        """
        if abs(v[0]) > 180:
            raise ValueError("invalid longitude")
        if abs(v[1]) > 90:
            raise ValueError("invalid latitude")
        return v
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class PredicateKey(
    AliasModel,
    Generic[PredicateKeyNameT, PredicateKeyTypeT],
    extra="forbid",
):
    """Key (unique identifier) of a predicate: a name plus a type."""

    name: PredicateKeyNameT = Field(
        description="Name of the predicate key.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )

    # Stored as `type_` in Python, exposed as `type` through the alias.
    type_: PredicateKeyTypeT = Field(
        alias="type",
        title="Type",
        description="Type of predicate key.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class PredicateValue(
    AliasModel,
    Generic[PredicateValueTypeT],
    extra="forbid",
):
    """Value of a predicate: a name (the actual value) plus a type."""

    name: StrictStr = Field(
        description="Name of the predicate value (actual value).",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )

    # Stored as `type_` in Python, exposed as `type` through the alias.
    type_: PredicateValueTypeT = Field(
        alias="type",
        description="Type of predicate value.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class Predicate(
    AliasModel,
    Generic[
        PredicateValueTypeT,
        PredicateKeyNameT,
        PredicateKeyTypeT,
    ],
    extra="forbid",
):
    """A predicate: a required key/value pair plus optional typed values."""

    # Required parts of every predicate.
    key: PredicateKey[PredicateKeyNameT, PredicateKeyTypeT]
    value: PredicateValue[PredicateValueTypeT]

    # Optional typed representations; each one defaults to None.
    numerical_value: Optional[NumericalValue] = None
    numerical_value_si: Optional[NumericalValue] = None
    nominal_value: Optional[NominalValue] = None
    text_value: Optional[TextValue] = None
    boolean_value: Optional[BooleanValue] = None
    datetime_value: Optional[DatetimeValue] = None
    geopoint_value: Optional[GeopointValue] = None
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define the model Record."""
|
|
7
|
+
from typing import Generic, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, Field, StrictStr
|
|
10
|
+
|
|
11
|
+
from docling_core.search.mapping import es_field
|
|
12
|
+
from docling_core.types.base import (
|
|
13
|
+
Acquisition,
|
|
14
|
+
CollectionNameTypeT,
|
|
15
|
+
CollectionRecordInfo,
|
|
16
|
+
FileInfoObject,
|
|
17
|
+
Identifier,
|
|
18
|
+
IdentifierTypeT,
|
|
19
|
+
Log,
|
|
20
|
+
PredicateKeyNameT,
|
|
21
|
+
PredicateKeyTypeT,
|
|
22
|
+
PredicateValueTypeT,
|
|
23
|
+
StrictDateTime,
|
|
24
|
+
SubjectNameTypeT,
|
|
25
|
+
SubjectTypeT,
|
|
26
|
+
)
|
|
27
|
+
from docling_core.types.rec.attribute import Attribute
|
|
28
|
+
from docling_core.types.rec.base import Provenance, ProvenanceTypeT
|
|
29
|
+
from docling_core.types.rec.subject import Subject
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class RecordDescription(BaseModel, Generic[CollectionNameTypeT]):
    """Record metadata: required logs plus optional date, collection and acquisition."""

    # Required; no default is supplied.
    logs: list[Log] = Field(
        description="Logs that describe the ETL tasks applied to this record.",
    )

    # The remaining fields are optional and default to None.
    publication_date: Optional[StrictDateTime] = Field(
        default=None,
        title="Publication date",
        description=(
            "The date that best represents the last publication time of a record."
        ),
    )
    collection: Optional[CollectionRecordInfo[CollectionNameTypeT]] = Field(
        default=None,
        description="The collection information of this record.",
    )
    acquisition: Optional[Acquisition] = Field(
        default=None,
        description=(
            "Information on how the document was obtained, "
            "for data governance purposes."
        ),
    )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class Record(
    Provenance,
    Generic[
        IdentifierTypeT,
        PredicateValueTypeT,
        PredicateKeyNameT,
        PredicateKeyTypeT,
        ProvenanceTypeT,
        SubjectTypeT,
        SubjectNameTypeT,
        CollectionNameTypeT,
    ],
):
    """A representation of a structured record in a database."""

    # Exposed as `file-info` through the alias.
    file_info: FileInfoObject = Field(alias="file-info")
    description: RecordDescription
    subject: Subject[IdentifierTypeT, SubjectTypeT, SubjectNameTypeT]
    # Optional list of attributes; defaults to None when omitted.
    attributes: Optional[
        list[
            Attribute[
                IdentifierTypeT,
                PredicateValueTypeT,
                PredicateKeyNameT,
                PredicateKeyTypeT,
                ProvenanceTypeT,
            ]
        ]
    ] = None
    # Exposed as `_name` through the alias.
    name: Optional[StrictStr] = Field(
        default=None,
        description="A short description or summary of the record.",
        alias="_name",
        json_schema_extra=es_field(type="text"),
    )
    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field(
        default=None,
        description="A list of unique identifiers of this record in a database.",
    )
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define the model Statement."""
|
|
7
|
+
from typing import Generic
|
|
8
|
+
|
|
9
|
+
from pydantic import Field
|
|
10
|
+
|
|
11
|
+
from docling_core.types.base import (
|
|
12
|
+
IdentifierTypeT,
|
|
13
|
+
PredicateKeyNameT,
|
|
14
|
+
PredicateKeyTypeT,
|
|
15
|
+
PredicateValueTypeT,
|
|
16
|
+
ProvenanceTypeT,
|
|
17
|
+
SubjectNameTypeT,
|
|
18
|
+
SubjectTypeT,
|
|
19
|
+
)
|
|
20
|
+
from docling_core.types.rec.attribute import Attribute
|
|
21
|
+
from docling_core.types.rec.subject import Subject
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Statement(
    Attribute,
    Generic[
        IdentifierTypeT,
        PredicateValueTypeT,
        PredicateKeyNameT,
        PredicateKeyTypeT,
        ProvenanceTypeT,
        SubjectTypeT,
        SubjectNameTypeT,
    ],
    extra="allow",
):
    """An attribute together with the subject it is stated about.

    This class is declared with `extra="allow"`.
    """

    # Required; no default is supplied.
    subject: Subject[
        IdentifierTypeT, SubjectTypeT, SubjectNameTypeT
    ] = Field(
        description="The subject (entity) of this statement.",
    )
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define the model Subject."""
|
|
7
|
+
from typing import Generic, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import Field, StrictStr
|
|
10
|
+
|
|
11
|
+
from docling_core.search.mapping import es_field
|
|
12
|
+
from docling_core.types.base import (
|
|
13
|
+
Identifier,
|
|
14
|
+
IdentifierTypeT,
|
|
15
|
+
SubjectNameTypeT,
|
|
16
|
+
SubjectTypeT,
|
|
17
|
+
)
|
|
18
|
+
from docling_core.types.doc.base import S3Reference
|
|
19
|
+
from docling_core.utils.alias import AliasModel
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class SubjectNameIdentifier(Identifier[SubjectNameTypeT], Generic[SubjectNameTypeT]):
    """Identifier of subject names."""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class Subject(
    AliasModel,
    Generic[IdentifierTypeT, SubjectTypeT, SubjectNameTypeT],
    extra="forbid",
):
    """A representation of a subject."""

    # Required human-readable name.
    display_name: StrictStr = Field(
        title="Display Name",
        description=(
            "Name of the subject in natural language. It can be used for end-user "
            "applications to display a human-readable name. For instance, `B(2) Mg(1)` "
            "for `MgB2` or `International Business Machines` for `IBM`"
        ),
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # Optional image reference; defaults to None.
    display_image: Optional[S3Reference] = Field(
        default=None,
        title="Display Image",
        # Each fragment ends with a space so the concatenated sentences stay
        # separated ("...applications. For example, ...").
        description=(
            "Image representing the subject. "
            "It can be used for end-user applications. "
            "For example, the chemical structure drawing of a compound "
            "or the eight bar IBM logo for IBM."
        ),
        json_schema_extra=es_field(suppress=True),
    )
    # Stored as `type_` in Python, exposed as `type` through the alias.
    type_: SubjectTypeT = Field(
        alias="type",
        description=(
            "Main subject type. For instance, `material`, `material-class`, "
            "`material-device`, `company`, or `person`."
        ),
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
    # Required list of name identifiers.
    names: list[SubjectNameIdentifier[SubjectNameTypeT]] = Field(
        description=(
            "List of given names for this subject. They may not be unique across "
            "different subjects."
        )
    )
    identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field(
        default=None,
        description=(
            "List of unique identifiers in database. For instance, the `PubChem ID` "
            "of a record in the PubChem database."
        ),
    )
    labels: Optional[list[StrictStr]] = Field(
        default=None,
        description="List of labels or categories for this subject.",
        json_schema_extra=es_field(type="keyword", ignore_above=8191),
    )
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Define utility models and types related to field aliases."""
|
|
7
|
+
from pydantic import BaseModel, ConfigDict
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AliasModel(BaseModel):
    """Base model whose dump helpers use field aliases unless told otherwise."""

    # Lets fields be populated by their Python name as well as by their alias.
    model_config = ConfigDict(populate_by_name=True)

    def model_dump(self, **kwargs) -> dict:
        """Return the model as a dictionary, keyed by alias by default.

        An explicit `by_alias` keyword passed by the caller is respected.
        """
        kwargs.setdefault("by_alias", True)
        return super().model_dump(**kwargs)

    def model_dump_json(self, **kwargs) -> str:
        """Return the model as a JSON string, keyed by alias by default.

        An explicit `by_alias` keyword passed by the caller is respected.
        """
        kwargs.setdefault("by_alias", True)
        return super().model_dump_json(**kwargs)
|