heurist-api 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of heurist-api might be problematic. Click here for more details.
- heurist/__init__.py +1 -0
- heurist/api/__init__.py +0 -0
- heurist/api/client.py +122 -0
- heurist/api/connection.py +71 -0
- heurist/api/constants.py +19 -0
- heurist/api/credentials.py +71 -0
- heurist/api/exceptions.py +45 -0
- heurist/api/url_builder.py +148 -0
- heurist/api/utils.py +24 -0
- heurist/cli/__init__.py +0 -0
- heurist/cli/__main__.py +227 -0
- heurist/cli/load.py +55 -0
- heurist/cli/records.py +49 -0
- heurist/cli/schema.py +94 -0
- heurist/database/__init__.py +3 -0
- heurist/database/basedb.py +125 -0
- heurist/database/database.py +96 -0
- heurist/models/__init__.py +0 -0
- heurist/models/dynamic/__init__.py +3 -0
- heurist/models/dynamic/annotation.py +143 -0
- heurist/models/dynamic/create_model.py +82 -0
- heurist/models/dynamic/date.py +61 -0
- heurist/models/dynamic/type.py +96 -0
- heurist/models/structural/DetailTypes.py +34 -0
- heurist/models/structural/RecStructure.py +27 -0
- heurist/models/structural/RecTypeGroups.py +27 -0
- heurist/models/structural/RecTypes.py +27 -0
- heurist/models/structural/Terms.py +27 -0
- heurist/models/structural/__init__.py +19 -0
- heurist/models/structural/dty.py +121 -0
- heurist/models/structural/hml_structure.py +36 -0
- heurist/models/structural/rst.py +141 -0
- heurist/models/structural/rtg.py +25 -0
- heurist/models/structural/rty.py +81 -0
- heurist/models/structural/trm.py +34 -0
- heurist/models/structural/utils.py +53 -0
- heurist/schema/__init__.py +27 -0
- heurist/schema/models.py +70 -0
- heurist/schema/rel_to_dict.py +39 -0
- heurist/sql/__init__.py +21 -0
- heurist/sql/joinRecordTypeIDNameByGroupType.sql +10 -0
- heurist/sql/joinRecordTypeMetadata.sql +17 -0
- heurist/sql/selectRecordTypeSchema.sql +51 -0
- heurist/sql/sql_safety.py +101 -0
- heurist/utils/constants.py +1 -0
- heurist/utils/rel_to_dict_array.py +8 -0
- heurist/validators/__init__.py +3 -0
- heurist/validators/detail_validator.py +142 -0
- heurist/validators/exceptions.py +34 -0
- heurist/validators/parse_heurist_date.py +71 -0
- heurist/validators/record_validator.py +156 -0
- heurist/workflows/__init__.py +3 -0
- heurist/workflows/etl.py +66 -0
- heurist_api-0.1.2.dist-info/METADATA +453 -0
- heurist_api-0.1.2.dist-info/RECORD +80 -0
- heurist_api-0.1.2.dist-info/WHEEL +4 -0
- heurist_api-0.1.2.dist-info/entry_points.txt +2 -0
- heurist_api-0.1.2.dist-info/licenses/LICENSE +427 -0
- mock_data/__init__.py +22 -0
- mock_data/blocktext/__init__.py +0 -0
- mock_data/blocktext/single.py +7 -0
- mock_data/date/__init__.py +0 -0
- mock_data/date/compound_repeated.py +44 -0
- mock_data/date/compound_single.py +30 -0
- mock_data/date/simple_single.py +16 -0
- mock_data/date/timestamp_repeated.py +30 -0
- mock_data/enum/__init__.py +0 -0
- mock_data/enum/repeated.py +29 -0
- mock_data/enum/single.py +18 -0
- mock_data/file/__init__.py +0 -0
- mock_data/file/single.py +28 -0
- mock_data/float/__init__.py +0 -0
- mock_data/float/single.py +8 -0
- mock_data/freetext/__init__.py +0 -0
- mock_data/freetext/single.py +16 -0
- mock_data/geo/__init__.py +0 -0
- mock_data/geo/single.py +22 -0
- mock_data/resource/__init__.py +0 -0
- mock_data/resource/repeated.py +35 -0
- mock_data/resource/single.py +16 -0
heurist/cli/__main__.py
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI commands for extracting, transforming, and loading remote Heurist data.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import importlib.metadata
|
|
6
|
+
|
|
7
|
+
import click
|
|
8
|
+
from heurist import PACKAGE_NAME
|
|
9
|
+
from heurist.api.credentials import CredentialHandler
|
|
10
|
+
from heurist.api.exceptions import MissingParameterException
|
|
11
|
+
from heurist.cli.load import load_command
|
|
12
|
+
from heurist.cli.records import rty_command
|
|
13
|
+
from heurist.cli.schema import schema_command
|
|
14
|
+
from heurist.utils.constants import DEFAULT_RECORD_GROUPS
|
|
15
|
+
from rich.console import Console
|
|
16
|
+
|
|
17
|
+
# PACKAGE_NAME must match the package name ('name' kwarg) in the TOML file,
# because it is used below to look up the installed package's version.
|
|
18
|
+
__identifier__ = importlib.metadata.version(PACKAGE_NAME)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# =========================== #
|
|
22
|
+
# Main cli group
|
|
23
|
+
# =========================== #
|
|
24
|
+
@click.group(help="Group CLI command for connecting to the Heurist DB")
@click.version_option(__identifier__)
@click.option(
    "-d",
    "--database",
    type=click.STRING,
    help="Name of the Heurist database",
)
@click.option(
    "-l",
    "--login",
    type=click.STRING,
    help="Login name for the database user",
)
@click.option(
    "-p",
    "--password",
    type=click.STRING,
    help="Password for the database user",
)
@click.option(
    "--debugging",
    required=False,
    default=False,
    is_flag=True,
    help="Whether to run in debug mode, default false.",
)
@click.pass_context
def cli(ctx, database, login, password, debugging):
    """Root command group shared by every sub-command.

    Stores the debugging flag under ``ctx.obj["DEBUGGING"]`` and a
    ``CredentialHandler`` under ``ctx.obj["CREDENTIALS"]``. When building the
    handler raises ``MissingParameterException``, the user is prompted for the
    three values interactively and the handler is built once more.

    Args:
        ctx: Click context; ``ctx.obj`` is initialised as a dict.
        database: Value of ``--database`` (may be None).
        login: Value of ``--login`` (may be None).
        password: Value of ``--password`` (may be None).
        debugging: Value of the ``--debugging`` flag.
    """
    ctx.ensure_object(dict)
    ctx.obj["DEBUGGING"] = debugging
    try:
        ctx.obj["CREDENTIALS"] = CredentialHandler(
            database_name=database,
            login=login,
            password=password,
        )
    except MissingParameterException:
        console = Console()
        # Adjacent literals are concatenated verbatim, so the separating space
        # after the first sentence has to be written out explicitly.
        console.print(
            "Login information is missing. "
            "Please provide your credentials when prompted."
            "\nTo quit, press Ctrl+C then Enter."
        )
        _database = click.prompt("Heurist database name")
        _login = click.prompt("Heurist user login")
        # Do not echo the password back to the terminal.
        _password = click.prompt("Heurist login password", hide_input=True)
        console.print("Retrying the connection...")
        ctx.obj["CREDENTIALS"] = CredentialHandler(
            database_name=_database,
            login=_login,
            password=_password,
        )
        console.print("Success!", style="green")
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# =========================== #
|
|
81
|
+
# 'record' command
|
|
82
|
+
# =========================== #
|
|
83
|
+
@cli.command("record", help="Get a JSON export of a certain record type.")
@click.option(
    "-t",
    "--record-type",
    help="The ID of the record type",
    type=click.INT,
    required=True,
)
@click.option(
    "-o",
    "--outfile",
    help="JSON file path.",
    # The target is a file; reject a directory here instead of failing later
    # when the file is opened for writing.
    type=click.Path(file_okay=True, dir_okay=False, writable=True),
    required=False,
)
@click.pass_obj
def records(ctx, record_type, outfile):
    """Export the records of one record type by delegating to ``rty_command``.

    Args:
        ctx: The context object dict populated by the ``cli`` group.
        record_type: Integer ID of the record type (``--record-type``).
        outfile: Optional output path (``--outfile``); passed through as-is.
    """
    credentials = ctx["CREDENTIALS"]
    rty_command(credentials, record_type, outfile)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# =========================== #
|
|
105
|
+
# 'schema' command
|
|
106
|
+
# =========================== #
|
|
107
|
+
@cli.command(
    "schema",
    help="Generate documentation about the database schema.",
)
@click.option(
    "-t",
    "--output-type",
    required=True,
    type=click.Choice(["csv", "json"], case_sensitive=False),
    # Implicit literal concatenation keeps source indentation out of the
    # help text (a backslash continuation inside a literal embeds it).
    help=(
        "Data format in which the schema will be described. "
        "csv = 1 CSV file for each record type. json = 1 file that "
        "lists all records together"
    ),
)
@click.option(
    "-r",
    "--record-group",
    required=False,
    type=click.STRING,
    multiple=True,
    default=["My record types"],
    show_default=True,
    help=(
        "Group name of the record types to be described. "
        "Can be declared multiple times for multiple groups."
    ),
)
@click.option(
    "-o",
    "--outdir",
    required=False,
    type=click.Path(file_okay=False, dir_okay=True),
    help=(
        "Path to the directory in which the files will be written. "
        "Defaults to name of the database + '_schema'."
    ),
)
@click.pass_obj
def doc(ctx, record_group, outdir, output_type):
    """Describe the database schema by delegating to ``schema_command``.

    Args:
        ctx: The context object dict populated by the ``cli`` group.
        record_group: Record type group names (``--record-group``, repeatable).
        outdir: Optional output directory (``--outdir``).
        output_type: ``"csv"`` or ``"json"`` (``--output-type``).
    """
    # Get context variables
    credentials = ctx["CREDENTIALS"]
    debugging = ctx["DEBUGGING"]

    # Run the doc command
    schema_command(
        credentials=credentials,
        record_group=record_group,
        outdir=outdir,
        output_type=output_type,
        debugging=debugging,
    )
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# =========================== #
|
|
156
|
+
# 'download' command
|
|
157
|
+
# =========================== #
|
|
158
|
+
@cli.command(
    "download",
    help="Export data of records of 1 or more record group types.",
)
@click.option(
    "-r",
    "--record-group",
    required=False,
    type=click.STRING,
    multiple=True,
    default=DEFAULT_RECORD_GROUPS,
    # Implicit literal concatenation keeps source indentation out of the
    # help text (a backslash continuation inside a literal embeds it).
    help=(
        "Record group of the entities whose data is exported. "
        "Default: 'My record types'."
    ),
)
@click.option(
    "-u",
    "--user",
    required=False,
    type=click.INT,
    multiple=True,
    help=(
        "User or users who created the records to be exported. "
        "Default: all users' records."
    ),
)
@click.option(
    "-f",
    "--filepath",
    required=True,
    type=click.Path(
        file_okay=True,
        dir_okay=False,
    ),
    help="Path to the DuckDB database file in which the data will be written.",
)
@click.option(
    "-o",
    "--outdir",
    required=False,
    type=click.Path(
        file_okay=False,
        dir_okay=True,
    ),
    help=(
        "Directory in which CSV files of the dumped tabular data "
        "will be written."
    ),
)
@click.pass_obj
def load(ctx, filepath, record_group, user, outdir):
    """Download records into a DuckDB file by delegating to ``load_command``.

    In debugging mode nothing is downloaded: a notice is printed and the
    process exits.

    Args:
        ctx: The context object dict populated by the ``cli`` group.
        filepath: Path of the DuckDB database file (``--filepath``).
        record_group: Record type group names (``--record-group``, repeatable).
        user: User IDs used to filter records (``--user``, repeatable).
        outdir: Optional directory for CSV dumps (``--outdir``).

    Raises:
        SystemExit: When the ``--debugging`` flag is set.
    """
    # Get context variables
    credentials = ctx["CREDENTIALS"]
    testing = ctx["DEBUGGING"]

    if testing:
        # The message names the command as it is registered ("download").
        print(
            "\nCannot run 'download' command in debugging mode."
            "\nClient must connect to a remote Heurist database.\n"
        )
        print("Exiting.")
        # Equivalent to exit(), without depending on the `site` module
        # having injected that helper into builtins.
        raise SystemExit

    # Run the download command
    load_command(
        credentials=credentials,
        duckdb_database_connection_path=filepath,
        record_group=record_group,
        user=user,
        outdir=outdir,
    )
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
# Invoke the click group when this module is executed as a script.
if __name__ == "__main__":
    cli()
|
heurist/cli/load.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI command for downloading all requested record types' data.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import duckdb
|
|
8
|
+
from heurist.api.connection import HeuristAPIConnection
|
|
9
|
+
from heurist.api.credentials import CredentialHandler
|
|
10
|
+
from heurist.utils.constants import DEFAULT_RECORD_GROUPS
|
|
11
|
+
from heurist.workflows import extract_transform_load
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def load_command(
    credentials: CredentialHandler,
    duckdb_database_connection_path: Path | str,
    record_group: tuple = DEFAULT_RECORD_GROUPS,
    user: tuple = (),
    outdir: Path | None = None,
):
    """Run the ETL workflow into a DuckDB file and optionally dump CSV files.

    The function connects to DuckDB and to the Heurist API, runs
    ``extract_transform_load``, then reopens the DuckDB file to print the
    list of tables. If ``outdir`` is given, every table except the five
    schema tables is written to ``<outdir>/<table>.csv`` sorted by "H-ID".

    Args:
        credentials: Provides the database name, login and password.
        duckdb_database_connection_path: Location of the DuckDB database.
        record_group: Record type group names passed to the ETL workflow.
        user: User filter passed to the ETL workflow.
        outdir: Directory for the CSV dumps; no CSV is written when falsy.
    """
    # str() leaves a string unchanged, so no isinstance check is needed.
    duckdb_database_connection_path = str(duckdb_database_connection_path)

    # Run the ETL process
    with (
        duckdb.connect(duckdb_database_connection_path) as conn,
        HeuristAPIConnection(
            db=credentials.get_database(),
            login=credentials.get_login(),
            password=credentials.get_password(),
        ) as client,
    ):
        extract_transform_load(
            client=client,
            duckdb_connection=conn,
            record_group_names=record_group,
            user=user,
        )

    # Show the results of the created DuckDB database
    with duckdb.connect(duckdb_database_connection_path) as new_conn:
        tables = new_conn.sql("show tables;")
        print("\nCreated the following tables")
        print(tables)

        # If writing to CSV files, write only tables of record types
        if outdir:
            outdir = Path(outdir)
            # parents=True so that a nested output path can be created.
            outdir.mkdir(parents=True, exist_ok=True)
            schema_tables = {"rtg", "rst", "rty", "dty", "trm"}
            for row in tables.fetchall():
                table_name = row[0]
                # Skip the schema tables
                if table_name in schema_tables:
                    continue
                fp = outdir.joinpath(f"{table_name}.csv")
                new_conn.table(table_name).sort("H-ID").write_csv(str(fp))
|
heurist/cli/records.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI command for downloading one requested record type's data.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from heurist.api.connection import HeuristAPIConnection
|
|
9
|
+
from heurist.api.credentials import CredentialHandler
|
|
10
|
+
from rich.progress import (
|
|
11
|
+
Progress,
|
|
12
|
+
SpinnerColumn,
|
|
13
|
+
TextColumn,
|
|
14
|
+
TimeElapsedColumn,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def rty_command(
    credentials: CredentialHandler,
    rty: int,
    outfile: Path | str | None,
):
    """Download the records of one record type and write them to a JSON file.

    Args:
        credentials: Provides the database name, login and password.
        rty: ID of the record type to fetch.
        outfile: Output path; defaults to ``RTY_<rty>.json`` when falsy.

    Raises:
        SystemExit: When the API returns no records for this type.
    """
    with (
        Progress(
            TextColumn("{task.description}"),
            SpinnerColumn(),
            TimeElapsedColumn(),
        ) as p,
        HeuristAPIConnection(
            db=credentials.get_database(),
            login=credentials.get_login(),
            password=credentials.get_password(),
        ) as client,
    ):
        _ = p.add_task(f"Get Records of type {rty}", total=1)
        records = client.get_records(rty)

    # If no records of this type have been entered, stop.
    if not records:
        print("No records of this type have been entered.\nExiting program...")
        # Equivalent to exit(), without depending on the `site` module
        # having injected that helper into builtins.
        raise SystemExit

    # Else, write the records to a JSON file.
    if not outfile:
        outfile = f"RTY_{rty}.json"
    outfile = Path(outfile)
    print(f"Writing results to: {outfile}")
    # Explicit encoding so the file does not depend on the platform default.
    with open(outfile, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4)
|
heurist/cli/schema.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI command for downloading details about a Heurist database schema.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from heurist.api.connection import HeuristAPIConnection
|
|
8
|
+
from heurist.api.credentials import CredentialHandler
|
|
9
|
+
from heurist.database import TransformedDatabase
|
|
10
|
+
from heurist.schema import output_csv, output_json
|
|
11
|
+
from rich.progress import (
|
|
12
|
+
BarColumn,
|
|
13
|
+
MofNCompleteColumn,
|
|
14
|
+
Progress,
|
|
15
|
+
SpinnerColumn,
|
|
16
|
+
TextColumn,
|
|
17
|
+
TimeElapsedColumn,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_database_schema(
    record_groups: list,
    credentials: CredentialHandler,
    debugging: bool,
) -> TransformedDatabase:
    """Build a ``TransformedDatabase`` from a Heurist structure export.

    Args:
        record_groups: Record type group names handed to the database.
        credentials: Provides the database name, login and password.
        debugging: When true, the bundled mock XML is used and no request
            is made; otherwise the structure is requested through the API.

    Returns:
        TransformedDatabase: Database built from the structure XML.
    """
    if not debugging:
        # Request the database XML schema from the server
        with (
            Progress(
                TextColumn("{task.description}"),
                SpinnerColumn(),
                TimeElapsedColumn(),
            ) as progress,
            HeuristAPIConnection(
                db=credentials.get_database(),
                login=credentials.get_login(),
                password=credentials.get_password(),
            ) as api,
        ):
            progress.add_task("Downloading schemas")
            structure_xml = api.get_structure()
            database = TransformedDatabase(
                hml_xml=structure_xml,
                record_type_groups=record_groups,
            )
        return database

    # Debugging: load the mock database XML schema instead
    from mock_data import DB_STRUCTURE_XML

    return TransformedDatabase(
        hml_xml=DB_STRUCTURE_XML, record_type_groups=record_groups
    )
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def schema_command(
    credentials: CredentialHandler,
    record_group: list,
    outdir: str,
    output_type: str,
    debugging: bool = False,
):
    """Describe every targeted record type and write the result to disk.

    Args:
        credentials: Provides the database name (also used for the default
            output directory) plus login and password.
        record_group: Record type group names to describe.
        outdir: Output directory; defaults to ``<database>_schema`` when falsy.
        output_type: ``"csv"`` writes via ``output_csv``; ``"json"`` writes a
            single ``recordTypes.json`` via ``output_json``.
        debugging: Passed to ``get_database_schema``.
    """
    # Set up the output directory
    if not outdir:
        outdir = f"{credentials.get_database()}_schema"
    DIR = Path(outdir)
    # parents=True so that a nested output path can be created.
    DIR.mkdir(parents=True, exist_ok=True)

    # Get the database schema
    db = get_database_schema(
        record_groups=record_group,
        credentials=credentials,
        debugging=debugging,
    )

    # Describe each targeted record type
    record_type_ids = list(db.pydantic_models.keys())
    with Progress(
        TextColumn("{task.description}"), BarColumn(), MofNCompleteColumn()
    ) as p:
        descriptions = []
        t = p.add_task("Describing record types", total=len(record_type_ids))
        # `rty_id` rather than `id`, to avoid shadowing the builtin.
        for rty_id in record_type_ids:
            rel = db.describe_record_schema(rty_ID=rty_id)
            descriptions.append(rel)
            p.advance(t)

    # Output the descriptions according to the desired data format
    if output_type == "csv":
        output_csv(dir=DIR, descriptions=descriptions)

    elif output_type == "json":
        outfile = DIR.joinpath("recordTypes.json")
        output_json(descriptions=descriptions, fp=outfile)
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import duckdb
|
|
2
|
+
import polars as pl
|
|
3
|
+
from duckdb import DuckDBPyConnection, DuckDBPyRelation
|
|
4
|
+
from heurist.models.structural.hml_structure import HMLStructure
|
|
5
|
+
from heurist.sql import RECORD_TYPE_SCHEMA
|
|
6
|
+
from pydantic_xml import BaseXmlModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class HeuristDatabase:
    """Base class for loading the original Heurist database structure."""

    # Pairs of (table name, attribute on the parsed HML structure). The table
    # name is also the attribute read from that sub-structure in __init__.
    BASE_TABLES = [
        ("rtg", "RecTypeGroups"),
        ("rst", "RecStructure"),
        ("rty", "RecTypes"),
        ("dty", "DetailTypes"),
        ("trm", "Terms"),
    ]

    def __init__(
        self,
        hml_xml: bytes,
        conn: DuckDBPyConnection | None = None,
        db: str = ":memory:",
    ) -> None:
        """
        Create a DuckDB database connection and populate the DuckDB database with the
        5 base tables that comprise the Heurist database structure.

        Args:
            hml_xml (bytes): Heurist database structure exported in XML format.
            conn (DuckDBPyConnection | None, optional): A DuckDB database connection.
                Defaults to None, in which case a connection to `db` is opened.
            db (str, optional): Path to the DuckDB database. Defaults to ":memory:".
        """

        hml_xml = self.trim_xml_bytes(xml=hml_xml)
        if not conn:
            conn = duckdb.connect(db)
        self.conn = conn
        # Load the Heurist database structure XML into a nested Pydantic data model
        self.hml = HMLStructure.from_xml(hml_xml)

        # Create generic tables
        for name, group_attr in self.BASE_TABLES:
            pydantic_model = getattr(getattr(self.hml, group_attr), name)
            self.create(name, pydantic_model)

    @classmethod
    def trim_xml_bytes(cls, xml: bytes) -> bytes:
        """
        Remove leading and trailing whitespace from a potentially malformatted XML.

        Args:
            xml (bytes): Heurist database structure exported XML format.

        Returns:
            bytes: The same XML, UTF-8 encoded, without surrounding whitespace.

        Raises:
            UnicodeDecodeError: If `xml` is not valid UTF-8.
        """

        return xml.decode("utf-8").strip().encode("utf-8")

    def delete_existing_table(self, table_name: str) -> None:
        """
        If the table already exists in the DuckDB database, drop it.

        Args:
            table_name (str): Name of the table to potentially drop.
        """

        # IF EXISTS replaces a prior LIKE lookup in duckdb_tables(): in a LIKE
        # pattern `_` and `%` are wildcards, so a name such as "a_b" could match
        # a different table and trigger a DROP of a table that does not exist.
        self.conn.sql("DROP TABLE IF EXISTS {}".format(table_name))

    def create(self, name: str, model: BaseXmlModel) -> None:
        """Create a table in the DuckDB database connection from the data
        held by a Pydantic model.

        Examples:
            >>> # Set up the database class and parse a table model.
            >>> from mock_data import DB_STRUCTURE_XML
            >>> db = HeuristDatabase(hml_xml=DB_STRUCTURE_XML)
            >>> model = db.hml.RecTypeGroups.rtg
            >>>
            >>> # Create a table for the Record Type Group (rtg) table model.
            >>> db.create(name="rtg", model=model)
            >>> shape = db.conn.table("rtg").fetchall()
            >>> # fetchall() yields one tuple per row of the rtg table.
            >>> len(shape)
            11

        Args:
            name (str): Name of the table to (re)create.
            model (BaseXmlModel): A Pydantic XML model.

        Raises:
            AssertionError: If the dataframe built from `model` has fewer
                than two rows.
        """

        self.delete_existing_table(name)

        # Convert the model to a dataframe and register it for duckdb
        df = pl.DataFrame(model)
        # NOTE(review): this rejects a single-row table as well as an empty
        # one -- confirm whether `> 0` was intended.
        assert df.shape[0] > 1

        # Create table from model dataframe. `df` is not unused: the SQL
        # statement refers to the local variable by name.
        sql = "CREATE TABLE {} AS FROM df".format(name)
        self.conn.sql(sql)

    def describe_record_schema(self, rty_ID: int) -> DuckDBPyRelation:
        """Join the tables 'dty' (detail), 'rst' (record structure), 'rty' (record type)
        to get all the relevant information for a specific record type, plus add the
        label and description of the section / separator associated with each detail
        (if any).

        Args:
            rty_ID (int): ID of the targeted record type.

        Returns:
            DuckDBPyRelation: A DuckDB Python relation that can be queried or converted.
        """

        return self.conn.from_query(query=RECORD_TYPE_SCHEMA, params=[rty_ID])
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from duckdb import DuckDBPyConnection, DuckDBPyRelation
|
|
3
|
+
from heurist.database.basedb import HeuristDatabase
|
|
4
|
+
from heurist.models.dynamic import HeuristRecord
|
|
5
|
+
from heurist.sql import RECORD_BY_GROUP_TYPE, RECORD_TYPE_METADATA
|
|
6
|
+
from heurist.validators.record_validator import RecordValidator
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TransformedDatabase(HeuristDatabase):
    """Class for building and populating SQL tables with data collected and
    transformed from remote Heurist DB.
    """

    def __init__(
        self,
        hml_xml: bytes,
        conn: DuckDBPyConnection | None = None,
        db: str | None = ":memory:",
        record_type_groups: list[str] = ["My record types"],
    ) -> None:
        """Load the base tables, then build one dynamic model per record type
        that belongs to one of the targeted record type groups.

        Args:
            hml_xml (bytes): Heurist database structure exported in XML format.
            conn (DuckDBPyConnection | None, optional): A DuckDB connection.
            db (str | None, optional): Path to the DuckDB database.
            record_type_groups (list[str], optional): Names of the targeted
                record type groups. The argument is only read, never mutated.

        Raises:
            IndexError: If `record_type_groups` is empty.
        """
        super().__init__(hml_xml, conn, db)

        # Create an empty index of targeted record types' Pydantic models
        self.pydantic_models = {}

        group_names = list(record_type_groups)
        if not group_names:
            raise IndexError("record_type_groups must name at least one group")

        # Joining together the Heurist database's structural tables, construct an SQL
        # statement that selects the ID and name of the record types that belong to one
        # of the targeted record type groups. The group names are bound as
        # parameters so that a quote inside a name cannot break the statement.
        condition = "\nWHERE " + " OR ".join(
            "rtg.rtg_Name like ?" for _ in group_names
        )
        query = RECORD_BY_GROUP_TYPE + condition

        # Iterate through each targeted record type's ID and name
        for rty_ID, rty_Name in self.conn.sql(query, params=group_names).fetchall():
            # Using the ID, select the metadata of a record type's data fields (details)
            rel = self.conn.sql(query=RECORD_TYPE_METADATA, params=[rty_ID])
            # Using this metadata, create a dynamic Pydantic model for the record type
            data_field_metadata = rel.pl().to_dicts()
            model = HeuristRecord(
                rty_ID=rty_ID,
                rty_Name=rty_Name,
                detail_metadata=data_field_metadata,
            )
            # Add the dynamic Pydantic model to the index of models
            self.pydantic_models[rty_ID] = model

    def insert_records(
        self, record_type_id: int, records: list[dict]
    ) -> DuckDBPyRelation | None:
        """Validate the records of one record type and (re)create its table.

        Args:
            record_type_id (int): ID of the record type; must be a key of
                `self.pydantic_models`.
            records (list[dict]): Raw records handed to `RecordValidator`.

        Returns:
            DuckDBPyRelation | None: The newly created table, or None when
            the validator yields no records.
        """
        # From the index of Pydantic models, get this record type's
        # dynamically-created Pydantic model.
        record_model = self.pydantic_models[record_type_id]
        dynamic_model = record_model.model
        table_name = record_model.table_name

        # Using the dynamically-created Pydantic model, validate the metadata of
        # all the records of this type.
        validator = RecordValidator(
            pydantic_model=dynamic_model,
            records=records,
            rty_ID=record_type_id,
        )
        # Dump each validated record's data model to a dictionary.
        model_dict_sequence = [
            model.model_dump(by_alias=True) for model in validator
        ]

        # If no records of this type have been created yet, skip it.
        if not model_dict_sequence:
            return None

        # Transform the sequence of dictionaries into a Pandas dataframe
        df = None
        try:
            df = pd.DataFrame(model_dict_sequence)
            assert df.shape[1] > 0
        except Exception:
            from pprint import pprint

            # Dump the inputs to help diagnose the failure, then re-raise.
            pprint(model_dict_sequence)
            # `df` stays None when the DataFrame constructor itself failed;
            # printing it unconditionally would mask the original error.
            if df is not None:
                print(df)
            print(records)
            print(table_name)
            raise

        # Delete any existing table for this record type.
        self.delete_existing_table(table_name=table_name)

        # From the dataframe, build a new table for the record type. `df` is
        # not unused: the SQL statement refers to the local variable by name.
        sql = f"""CREATE TABLE {table_name} AS FROM df"""
        self.conn.sql(sql)
        return self.conn.table(table_name=table_name)
|
|
File without changes
|