heurist-api 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of heurist-api might be problematic. Click here for more details.
- heurist/__init__.py +1 -0
- heurist/api/__init__.py +0 -0
- heurist/api/client.py +122 -0
- heurist/api/connection.py +71 -0
- heurist/api/constants.py +19 -0
- heurist/api/credentials.py +71 -0
- heurist/api/exceptions.py +45 -0
- heurist/api/url_builder.py +148 -0
- heurist/api/utils.py +24 -0
- heurist/cli/__init__.py +0 -0
- heurist/cli/__main__.py +227 -0
- heurist/cli/load.py +55 -0
- heurist/cli/records.py +49 -0
- heurist/cli/schema.py +94 -0
- heurist/database/__init__.py +3 -0
- heurist/database/basedb.py +125 -0
- heurist/database/database.py +96 -0
- heurist/models/__init__.py +0 -0
- heurist/models/dynamic/__init__.py +3 -0
- heurist/models/dynamic/annotation.py +143 -0
- heurist/models/dynamic/create_model.py +82 -0
- heurist/models/dynamic/date.py +61 -0
- heurist/models/dynamic/type.py +96 -0
- heurist/models/structural/DetailTypes.py +34 -0
- heurist/models/structural/RecStructure.py +27 -0
- heurist/models/structural/RecTypeGroups.py +27 -0
- heurist/models/structural/RecTypes.py +27 -0
- heurist/models/structural/Terms.py +27 -0
- heurist/models/structural/__init__.py +19 -0
- heurist/models/structural/dty.py +121 -0
- heurist/models/structural/hml_structure.py +36 -0
- heurist/models/structural/rst.py +141 -0
- heurist/models/structural/rtg.py +25 -0
- heurist/models/structural/rty.py +81 -0
- heurist/models/structural/trm.py +34 -0
- heurist/models/structural/utils.py +53 -0
- heurist/schema/__init__.py +27 -0
- heurist/schema/models.py +70 -0
- heurist/schema/rel_to_dict.py +39 -0
- heurist/sql/__init__.py +21 -0
- heurist/sql/joinRecordTypeIDNameByGroupType.sql +10 -0
- heurist/sql/joinRecordTypeMetadata.sql +17 -0
- heurist/sql/selectRecordTypeSchema.sql +51 -0
- heurist/sql/sql_safety.py +101 -0
- heurist/utils/constants.py +1 -0
- heurist/utils/rel_to_dict_array.py +8 -0
- heurist/validators/__init__.py +3 -0
- heurist/validators/detail_validator.py +142 -0
- heurist/validators/exceptions.py +34 -0
- heurist/validators/parse_heurist_date.py +71 -0
- heurist/validators/record_validator.py +156 -0
- heurist/workflows/__init__.py +3 -0
- heurist/workflows/etl.py +66 -0
- heurist_api-0.1.2.dist-info/METADATA +453 -0
- heurist_api-0.1.2.dist-info/RECORD +80 -0
- heurist_api-0.1.2.dist-info/WHEEL +4 -0
- heurist_api-0.1.2.dist-info/entry_points.txt +2 -0
- heurist_api-0.1.2.dist-info/licenses/LICENSE +427 -0
- mock_data/__init__.py +22 -0
- mock_data/blocktext/__init__.py +0 -0
- mock_data/blocktext/single.py +7 -0
- mock_data/date/__init__.py +0 -0
- mock_data/date/compound_repeated.py +44 -0
- mock_data/date/compound_single.py +30 -0
- mock_data/date/simple_single.py +16 -0
- mock_data/date/timestamp_repeated.py +30 -0
- mock_data/enum/__init__.py +0 -0
- mock_data/enum/repeated.py +29 -0
- mock_data/enum/single.py +18 -0
- mock_data/file/__init__.py +0 -0
- mock_data/file/single.py +28 -0
- mock_data/float/__init__.py +0 -0
- mock_data/float/single.py +8 -0
- mock_data/freetext/__init__.py +0 -0
- mock_data/freetext/single.py +16 -0
- mock_data/geo/__init__.py +0 -0
- mock_data/geo/single.py +22 -0
- mock_data/resource/__init__.py +0 -0
- mock_data/resource/repeated.py +35 -0
- mock_data/resource/single.py +16 -0
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
import duckdb
|
|
4
|
+
|
|
5
|
+
# Rows returned by DuckDB's duckdb_keywords() table function, fetched once when
# this module is imported. SafeSQLName reads index 0 of each row as the keyword
# and index 1 as its category (e.g. "reserved").
KEYWORDS = duckdb.sql("select * from duckdb_keywords()").fetchall()
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class SafeSQLName:
    """Build SQL-safe table and column names from Heurist display names.

    On construction the module-level ``KEYWORDS`` rows are split into one list
    per keyword category, so that a name clashing with a DuckDB keyword can be
    given a distinguishing suffix.
    """

    def __init__(self) -> None:
        # Each KEYWORDS row is read as (keyword, category).
        self.reserved = [t[0] for t in KEYWORDS if t[1] == "reserved"]
        self.unreserved = [t[0] for t in KEYWORDS if t[1] == "unreserved"]
        self.column_name = [t[0] for t in KEYWORDS if t[1] == "column_name"]
        self.type_function = [t[0] for t in KEYWORDS if t[1] == "type_function"]
        self.all_keywords = [t[0] for t in KEYWORDS]

    @classmethod
    def remove_characters(cls, s: str) -> str:
        """Simplify and remove undesirable characters from a string.

        Examples:
            >>> s = "Author or Creator (Person, Organization)"
            >>> SafeSQLName.remove_characters(s)
            'Author or Creator'

            >>> s = "Status_trad_freetext"
            >>> SafeSQLName.remove_characters(s)
            'Status_trad_freetext'

            >>> s = "Place (city) or Region (state)"
            >>> SafeSQLName.remove_characters(s)
            'Place or Region'

        Args:
            s (str): Input string.

        Returns:
            str: Cleaned string.
        """

        # Remove parenthetical groups one at a time, innermost first. A greedy
        # pattern such as r"\(.+\)" would also delete the text that sits
        # between two separate groups.
        previous = None
        while previous != s:
            previous = s
            s = re.sub(r"\([^()]*\)", "", s)
        # Replace every character that is not a letter, digit or underscore
        # (slashes and stray parentheses included) with a space.
        s = re.sub(r"\W", " ", s)
        # Collapse runs of whitespace into a single space.
        s = re.sub(r"\s+", " ", s)
        # Collapse runs of underscores into a single underscore.
        s = re.sub(r"_+", "_", s)
        # Trim leading and trailing whitespace.
        return s.strip()

    @classmethod
    def to_pascal_case(cls, text: str) -> str:
        """Join the words of a string, upper-casing the first letter of each.

        Hyphens, underscores and whitespace are treated as word separators.
        The remaining letters of each word are left unchanged.

        Examples:
            >>> SafeSQLName.to_pascal_case("text unit")
            'TextUnit'

        Args:
            text (str): Input string.

        Returns:
            str: Words of the input joined without separators. An empty or
            separator-only input yields an empty string.
        """

        words = text.replace("-", " ").replace("_", " ").split()
        return "".join(w[0].capitalize() + w[1:] for w in words)

    def create_column_name(self, field_name: str, field_type: str) -> str:
        """
        Create an SQL-safe column name for the Pydantic data field.

        Args:
            field_name (str): Displayed name of the field (detail) in Heurist.
            field_type (str): Heurist type of the field (detail).

        Returns:
            str: SQL-safe column name.
        """

        simplified_name = self.remove_characters(field_name)
        # Resource (foreign key) fields are always marked with an "H-ID" suffix.
        if field_type == "resource":
            return f"{simplified_name} H-ID"
        # A name equal to a DuckDB keyword gets a suffix so it no longer clashes.
        if simplified_name.lower() in self.all_keywords:
            return f"{simplified_name}_COLUMN"
        return simplified_name

    def create_table_name(self, record_name: str) -> str:
        """
        Create SQL-safe table name for the record's data model.

        Examples:
            >>> heurist_name = "Sequence"
            >>> SafeSQLName().create_table_name(heurist_name)
            'SequenceTable'

        Args:
            record_name (str): Name of the Heurist record type.

        Returns:
            str: SQL-safe name for the record type's table.
        """

        pascal_case_name = self.to_pascal_case(record_name)
        # A name equal to a DuckDB keyword gets a suffix so it no longer clashes.
        if pascal_case_name.lower() in self.all_keywords:
            return f"{pascal_case_name}Table"
        return pascal_case_name
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Default record group names; the trailing comma makes this a one-element tuple.
DEFAULT_RECORD_GROUPS = ("My record types",)
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""Class for converting a record's detail before the Pydantic model validation."""
|
|
2
|
+
|
|
3
|
+
from heurist.models.dynamic.date import TemporalObject
|
|
4
|
+
from heurist.models.dynamic.type import FieldType
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DetailValidator:
|
|
8
|
+
"""
|
|
9
|
+
In Heurist, a record's "detail" is what is more commonly known as an attribute, \
|
|
10
|
+
dimension, or a data field.
|
|
11
|
+
|
|
12
|
+
This class features methods to extract the key value from Heurist's JSON \
|
|
13
|
+
formatting for all data types in Heurist's system.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
direct_values = ["freetext", "blocktext", "integer", "boolean", "float"]
|
|
17
|
+
|
|
18
|
+
@classmethod
|
|
19
|
+
def validate_file(cls, detail: dict) -> str:
|
|
20
|
+
"""
|
|
21
|
+
Extract the value of a file field.
|
|
22
|
+
|
|
23
|
+
Args:
|
|
24
|
+
detail (dict): Record's detail.
|
|
25
|
+
|
|
26
|
+
Returns:
|
|
27
|
+
str: Value of record's detail.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
return detail.get("value", {}).get("file", {}).get("ulf_ExternalFileReference")
|
|
31
|
+
|
|
32
|
+
@classmethod
|
|
33
|
+
def validate_enum(cls, detail: dict) -> str:
|
|
34
|
+
"""
|
|
35
|
+
Extract the value of an enum field.
|
|
36
|
+
|
|
37
|
+
Args:
|
|
38
|
+
detail (dict): Record's detail.
|
|
39
|
+
|
|
40
|
+
Returns:
|
|
41
|
+
str: Value of record's detail.
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
return detail["termLabel"]
|
|
45
|
+
|
|
46
|
+
@classmethod
|
|
47
|
+
def validate_geo(cls, detail: dict) -> str:
|
|
48
|
+
"""
|
|
49
|
+
Extract the value of a geo field.
|
|
50
|
+
|
|
51
|
+
Examples:
|
|
52
|
+
>>> from mock_data.geo.single import DETAIL_POINT
|
|
53
|
+
>>> DetailValidator.convert(DETAIL_POINT)
|
|
54
|
+
'POINT(2.19726563 48.57478991)'
|
|
55
|
+
|
|
56
|
+
Args:
|
|
57
|
+
detail (dict): Record's detail.
|
|
58
|
+
|
|
59
|
+
Returns:
|
|
60
|
+
str: Value of record's detail.
|
|
61
|
+
"""
|
|
62
|
+
|
|
63
|
+
geo = detail["value"]["geo"]
|
|
64
|
+
if geo["type"] == "p" or geo["type"] == "pl":
|
|
65
|
+
return geo["wkt"]
|
|
66
|
+
|
|
67
|
+
@classmethod
|
|
68
|
+
def validate_date(cls, detail: dict) -> dict:
|
|
69
|
+
"""
|
|
70
|
+
Build the variable date value into a structured dictionary.
|
|
71
|
+
|
|
72
|
+
Examples:
|
|
73
|
+
>>> # Test temporal object
|
|
74
|
+
>>> from mock_data.date.compound_single import DETAIL
|
|
75
|
+
>>> value = DetailValidator.convert(DETAIL)
|
|
76
|
+
>>> value['start']['earliest']
|
|
77
|
+
datetime.datetime(1180, 1, 1, 0, 0)
|
|
78
|
+
|
|
79
|
+
>>> # Test direct date value
|
|
80
|
+
>>> from mock_data.date.simple_single import DETAIL
|
|
81
|
+
>>> value = DetailValidator.convert(DETAIL)
|
|
82
|
+
>>> value['value']
|
|
83
|
+
datetime.datetime(2024, 3, 19, 0, 0)
|
|
84
|
+
|
|
85
|
+
Args:
|
|
86
|
+
detail (dict): Record's detail.
|
|
87
|
+
|
|
88
|
+
Returns:
|
|
89
|
+
dict: Structured metadata for a Heurist date object.
|
|
90
|
+
"""
|
|
91
|
+
|
|
92
|
+
if isinstance(detail.get("value"), dict):
|
|
93
|
+
model = TemporalObject.model_validate(detail["value"])
|
|
94
|
+
else:
|
|
95
|
+
model = TemporalObject.model_validate(detail)
|
|
96
|
+
return model.model_dump(by_alias=True)
|
|
97
|
+
|
|
98
|
+
@classmethod
|
|
99
|
+
def validate_resource(cls, detail: dict) -> int:
|
|
100
|
+
"""
|
|
101
|
+
Extract the value of a resource (foreign key) field.
|
|
102
|
+
|
|
103
|
+
Args:
|
|
104
|
+
detail (dict): Record's detail.
|
|
105
|
+
|
|
106
|
+
Returns:
|
|
107
|
+
int: Heurist ID of the referenced record.
|
|
108
|
+
"""
|
|
109
|
+
|
|
110
|
+
return int(detail["value"]["id"])
|
|
111
|
+
|
|
112
|
+
@classmethod
|
|
113
|
+
def convert(cls, detail: dict) -> str | int | list | dict | None:
|
|
114
|
+
"""
|
|
115
|
+
Based on the data type, convert the record's nested detail to a flat value.
|
|
116
|
+
|
|
117
|
+
Args:
|
|
118
|
+
detail (dict): One of the record's details (data fields).
|
|
119
|
+
|
|
120
|
+
Returns:
|
|
121
|
+
str | int | list | dict | None: Flattened value of the data field.
|
|
122
|
+
"""
|
|
123
|
+
|
|
124
|
+
fieldtype = FieldType.from_detail(detail)
|
|
125
|
+
|
|
126
|
+
if any(ft in fieldtype for ft in cls.direct_values):
|
|
127
|
+
return detail["value"]
|
|
128
|
+
|
|
129
|
+
elif fieldtype == "date":
|
|
130
|
+
return cls.validate_date(detail)
|
|
131
|
+
|
|
132
|
+
elif fieldtype == "enum":
|
|
133
|
+
return cls.validate_enum(detail)
|
|
134
|
+
|
|
135
|
+
elif fieldtype == "file":
|
|
136
|
+
return cls.validate_file(detail)
|
|
137
|
+
|
|
138
|
+
elif fieldtype == "geo":
|
|
139
|
+
return cls.validate_geo(detail)
|
|
140
|
+
|
|
141
|
+
elif fieldtype == "resource":
|
|
142
|
+
return cls.validate_resource(detail)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Exceptions for classes that convert / transform Heurist data.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class RepeatedValueInSingularDetailType(Exception):
    """Raised or logged when a single-value detail type holds several values.

    The detail type is limited to a maximum of 1 value, but the record has
    more than 1 value for this detail.
    """

    # Message template filled in by __init__.
    # NOTE(review): the template lines are kept flush-left exactly as they
    # appear in the reviewed source; any indentation added here would become
    # part of the message.
    description = """
\t[rec_Type {typeID}]
\t[rec_ID {recID}]
\tThe detail '{fieldName}' is limited to a maximum of 1 values.
\tCount of values = {valueCount}."""

    def __init__(self, type_id: int, record_id: int, field_name: str, value_count: int):
        """Format the message from the record and detail that broke the limit.

        Args:
            type_id (int): ID of the record's type.
            record_id (int): ID of the record.
            field_name (str): Name of the detail (field).
            value_count (int): Number of values found for the detail.
        """
        placeholders = {
            "typeID": type_id,
            "recID": record_id,
            "fieldName": field_name,
            "valueCount": value_count,
        }
        message = self.description.format(**placeholders)
        super().__init__(message)
        self.message = message
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class DateNotEnteredAsDateObject(Exception):
|
|
27
|
+
"""The date field was not entered as a constructed Heurist date object."""
|
|
28
|
+
|
|
29
|
+
description = """The date field was not entered as a compound Heurist date \
|
|
30
|
+
object.\n\tEntered value = {}"""
|
|
31
|
+
|
|
32
|
+
def __init__(self, value: int | str | float):
|
|
33
|
+
self.message = self.description.format(value)
|
|
34
|
+
super().__init__(self.message)
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
|
|
3
|
+
import dateutil.parser
|
|
4
|
+
import dateutil.relativedelta
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def parse_heurist_date(repr: str | int | float | None) -> datetime | None:
|
|
8
|
+
"""
|
|
9
|
+
Convert Heurist's partial date representations to an ISO string format.
|
|
10
|
+
|
|
11
|
+
Examples:
|
|
12
|
+
>>> # Test a string representation of a date
|
|
13
|
+
>>> v = "2024-03-19"
|
|
14
|
+
>>> parse_heurist_date(v)
|
|
15
|
+
datetime.datetime(2024, 3, 19, 0, 0)
|
|
16
|
+
|
|
17
|
+
>>> # Test an integer representation of a year, i.e. circa 1188
|
|
18
|
+
>>> v = 1188
|
|
19
|
+
>>> parse_heurist_date(v)
|
|
20
|
+
datetime.datetime(1188, 1, 1, 0, 0)
|
|
21
|
+
|
|
22
|
+
>>> # Test a float representation of a date
|
|
23
|
+
>>> v = 1250.1231
|
|
24
|
+
>>> parse_heurist_date(v)
|
|
25
|
+
datetime.datetime(1250, 12, 31, 0, 0)
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
repr (str | int | float): Heurist representation \
|
|
29
|
+
of a date.
|
|
30
|
+
|
|
31
|
+
Returns:
|
|
32
|
+
datetime | None: Parsed date.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
if not repr:
|
|
36
|
+
return
|
|
37
|
+
|
|
38
|
+
# Affirm Heurist's representation of the date is a Python string
|
|
39
|
+
repr = str(repr)
|
|
40
|
+
|
|
41
|
+
# If the Heurist representation is a year, change it to the start of
|
|
42
|
+
# the year.
|
|
43
|
+
if len(repr) == 4:
|
|
44
|
+
iso_str = f"{repr}-01-01"
|
|
45
|
+
return dateutil.parser.parse(iso_str)
|
|
46
|
+
|
|
47
|
+
# If the Heurist representation is a float, parse the month and day
|
|
48
|
+
# shown after the decimal.
|
|
49
|
+
elif "." in repr:
|
|
50
|
+
splits = repr.split(".")
|
|
51
|
+
year, smaller_than_year = splits[0], splits[1]
|
|
52
|
+
if len(smaller_than_year) == 2:
|
|
53
|
+
iso_str = f"{year}-{smaller_than_year}-01"
|
|
54
|
+
elif len(smaller_than_year) == 4:
|
|
55
|
+
iso_str = f"{year}-{smaller_than_year[:2]}-{smaller_than_year[2:]}"
|
|
56
|
+
else:
|
|
57
|
+
raise ValueError(repr)
|
|
58
|
+
return dateutil.parser.parse(iso_str)
|
|
59
|
+
|
|
60
|
+
# If the Heurist representation is a year and month, add the day
|
|
61
|
+
# (first of the month)
|
|
62
|
+
parts = repr.split("-")
|
|
63
|
+
if len(parts) == 2:
|
|
64
|
+
iso_str = f"{repr}-01"
|
|
65
|
+
return dateutil.parser.parser(iso_str)
|
|
66
|
+
|
|
67
|
+
# If no other conditions have been met, the representation is already in
|
|
68
|
+
# ISO format YYYY-MM-DD.
|
|
69
|
+
else:
|
|
70
|
+
iso_str = repr
|
|
71
|
+
return dateutil.parser.parse(iso_str)
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from heurist.models.dynamic.annotation import PydanticField
|
|
6
|
+
from heurist.models.dynamic.type import FieldType
|
|
7
|
+
from heurist.validators.detail_validator import DetailValidator
|
|
8
|
+
from heurist.validators.exceptions import RepeatedValueInSingularDetailType
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
|
|
11
|
+
# Validation messages go to "validation.log" in the working directory that is
# current when this module is imported.
VALIDATION_LOG = Path.cwd().joinpath("validation.log")

# mode="w" starts a fresh log each run; delay=True postpones opening the file
# until the first message is emitted.
handlers = [logging.FileHandler(filename=VALIDATION_LOG, mode="w", delay=True)]
# Also log to a stream, but only when HEURIST_STREAM_LOG is exactly "True".
if os.getenv("HEURIST_STREAM_LOG") == "True":
    handlers.append(logging.StreamHandler())

# Configure the root logger as a side effect of importing this module.
logging.basicConfig(
    encoding="utf-8",
    format="{asctime} - {levelname} - {message}",
    style="{",
    datefmt="%Y-%m-%d %H:%M",
    handlers=handlers,
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def list_plural_fields(pydantic_model: BaseModel) -> list:
    """Collect the descriptions of the model's list-annotated fields.

    Args:
        pydantic_model (BaseModel): Model whose `model_fields` are inspected.

    Returns:
        list: The `description` of every field whose annotation's repr starts
        with "list", in the order the fields are defined.
    """
    plural_descriptions = []
    for field in pydantic_model.model_fields.values():
        # A parameterised list annotation, e.g. list[int], has a repr that
        # begins with "list".
        if repr(field.annotation).startswith("list"):
            plural_descriptions.append(field.description)
    return plural_descriptions
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class RecordValidator:
    """Iterate over Heurist records and return a validated model for each.

    Each record of the targeted record type is flattened into key-value pairs
    and validated with the given Pydantic model. Records of any other record
    type are skipped.
    """

    def __init__(
        self, pydantic_model: BaseModel, records: list[dict], rty_ID: int
    ) -> None:
        """
        Args:
            pydantic_model (BaseModel): Model used to validate each record.
            records (list[dict]): Heurist records to validate.
            rty_ID (int): ID of the record type this model represents.
        """
        self.pydantic_model = pydantic_model
        self._rty_ID = rty_ID
        self._records = records
        self._index = 0
        self._plural_fields = list_plural_fields(pydantic_model=self.pydantic_model)

    def is_plural(self, dty_ID: int) -> bool:
        """Tell whether the detail type is listed among the plural fields.

        Args:
            dty_ID (int): ID of the detail type.

        Returns:
            bool: True if the detail type may hold several values.
        """
        return dty_ID in self._plural_fields

    def __iter__(self):
        return self

    def __next__(self) -> BaseModel:
        """Return the validated model of the next record of the targeted type.

        Raises:
            StopIteration: When no records are left.
        """
        while self._index < len(self._records):
            record = self._records[self._index]
            self._index += 1
            # If the record isn't of the record type for this model, skip it.
            # Both IDs are compared as text so that an ID delivered as a
            # string still matches an integer record type ID.
            if str(record["rec_RecTypeID"]) != str(self._rty_ID):
                continue
            # Otherwise, process the record's details into key-value pairs that
            # will be loaded into the Pydantic model.
            kwargs = self.flatten_details_to_dynamic_pydantic_fields(record)
            # Return a validated Pydantic model.
            return self.pydantic_model.model_validate(kwargs)
        raise StopIteration

    @classmethod
    def aggregate_details_by_type(cls, details: list[dict]) -> dict:
        """Group a record's details by their detail type ID.

        Args:
            details (list[dict]): The record's details.

        Returns:
            dict: Lists of details keyed by "dty_ID", in order of first
            appearance.
        """
        index = {}
        for detail in details:
            index.setdefault(detail["dty_ID"], []).append(detail)
        return index

    def flatten_details_to_dynamic_pydantic_fields(self, record: dict) -> dict:
        """Turn a record into the key-value pairs expected by the model.

        Args:
            record (dict): Heurist record with "rec_ID", "rec_RecTypeID" and
                "details".

        Returns:
            dict: The record's IDs plus one entry per usable detail type,
            keyed by the field's validation alias.
        """
        detail_type_index = self.aggregate_details_by_type(record["details"])
        # To the set of key-value pairs, add the record's H-ID and its type ID.
        record_id = record["rec_ID"]
        kwargs = {
            "rec_ID": record_id,
            "rec_RecTypeID": record["rec_RecTypeID"],
        }
        for dty_ID, details in detail_type_index.items():
            # Determine if this detail type is allowed to have multiple values.
            repeats = self.is_plural(dty_ID=dty_ID)

            # If this detail is not supposed to be repeatable but Heurist
            # allowed more than 1 value to be saved in the field, log a
            # warning and leave the detail out of the model's input.
            if not repeats and len(details) > 1:
                warning = RepeatedValueInSingularDetailType(
                    type_id=record["rec_RecTypeID"],
                    record_id=record_id,
                    field_name=details[0]["fieldName"],
                    value_count=len(details),
                )
                logging.warning(warning)
                continue

            # Get the validation alias for this kwarg's key.
            key = PydanticField._get_validation_alias(dty_ID=dty_ID)

            # Convert each detail's metadata to a flat value.
            values = [DetailValidator.convert(detail=detail) for detail in details]

            # Check the number of validated metadata against what is
            # permissible for this detail type according to the Heurist schema.
            value = self.validate_for_repeatable_values(repeats=repeats, values=values)

            # If the result is empty, do not add this detail type to the set
            # of kwargs. Let the model's default value be used for this
            # missing / invalid metadata.
            if not value:
                continue

            # Add this detail type's alias and validated value(s) to the kwargs.
            kwargs.update({key: value})

            # If the detail is a Term, add an additional field for the foreign
            # key.
            if FieldType.from_detail(details[0]) == "enum":
                # To this detail type's validation alias, which is associated
                # with the term's label, append a suffix to distinguish it as
                # a supplemental field to hold the foreign key.
                key += PydanticField.trm_validation_alias_suffix
                # Into a list, extract each detail's foreign key, which is in
                # "value".
                values = [detail["value"] for detail in details]

                value = self.validate_for_repeatable_values(
                    repeats=repeats, values=values
                )

                # The checks above should have already confirmed that this
                # group of details is valid. Therefore, the value can be added
                # directly to the kwargs.
                kwargs.update({key: value})

        # Return the flat key-value pairs for the Pydantic model's fields.
        return kwargs

    @classmethod
    def validate_for_repeatable_values(
        cls, repeats: bool, values: list
    ) -> list | dict | None:
        """Shape the values according to whether the detail type repeats.

        Args:
            repeats (bool): Whether the detail type may hold several values.
            values (list): Flattened values of the detail type.

        Returns:
            list | dict | None: The whole list for a repeatable type (it may
            be empty), the first value for a non-repeatable type, or None when
            a non-repeatable type has no values.
        """
        # If the detail type is repeatable, send the list of values, which can
        # be an empty list, as this should be the default value for this field
        # annotation.
        if repeats:
            return values
        # If the detail type is not repeatable, extract the first value.
        if len(values) > 0:
            return values[0]
        return None
|
heurist/workflows/etl.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import duckdb
|
|
2
|
+
from heurist.api.connection import HeuristAPIConnection
|
|
3
|
+
from heurist.database import TransformedDatabase
|
|
4
|
+
from heurist.utils.constants import DEFAULT_RECORD_GROUPS
|
|
5
|
+
from rich.progress import (
|
|
6
|
+
BarColumn,
|
|
7
|
+
MofNCompleteColumn,
|
|
8
|
+
Progress,
|
|
9
|
+
SpinnerColumn,
|
|
10
|
+
TextColumn,
|
|
11
|
+
TimeElapsedColumn,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def extract_transform_load(
    client: HeuristAPIConnection,
    duckdb_connection: duckdb.DuckDBPyConnection,
    user: tuple = (),
    record_group_names: tuple = DEFAULT_RECORD_GROUPS,
) -> None:
    """
    Run the two-step workflow that copies a Heurist database into DuckDB.

    First the Heurist database structure is fetched and handed, together with
    the DuckDB connection, to a `TransformedDatabase`. Then the records of
    each of that database's record types are fetched and inserted. A progress
    display is shown during both steps.

    Args:
        client (HeuristAPIConnection): Context of a Heurist API connection.
        duckdb_connection (duckdb.DuckDBPyConnection): Connection to a DuckDB database.
        user (tuple): IDs (integers) of targeted users.
        record_group_names (tuple): Names of the record group types. Must include at \
            least 1. Defaults to ("My record types",).

    Returns:
        None: Nothing is returned; records are inserted through the \
            `TransformedDatabase` built on `duckdb_connection`.
    """

    # Step 1: export the Heurist database's structure.
    structure_columns = (
        TextColumn("{task.description}"),
        SpinnerColumn(),
        TimeElapsedColumn(),
    )
    with Progress(*structure_columns) as structure_progress:
        structure_progress.add_task("Get DB Structure")
        xml = client.get_structure()

    # Step 2: export each record type's records and insert them into DuckDB.
    record_columns = (
        TextColumn("{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
    )
    with Progress(*record_columns) as record_progress:
        database = TransformedDatabase(
            conn=duckdb_connection,
            hml_xml=xml,
            record_type_groups=record_group_names,
        )
        record_task = record_progress.add_task(
            "Get Records",
            total=len(database.pydantic_models.keys()),
        )
        for record_type in database.pydantic_models.values():
            rty_ID = record_type.rty_ID
            records = client.get_records(rty_ID, users=user)
            # The bar advances once the records are fetched, before insertion.
            record_progress.advance(record_task)
            database.insert_records(record_type_id=rty_ID, records=records)
|