bedrock-ge 0.2.3__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- bedrock_ge/__init__.py +1 -1
- bedrock_ge/gi/ags.py +103 -0
- bedrock_ge/gi/ags3.py +275 -0
- bedrock_ge/gi/ags4.py +29 -0
- bedrock_ge/gi/{ags/schemas.py → ags_schemas.py} +29 -8
- bedrock_ge/gi/db_operations.py +128 -0
- bedrock_ge/gi/geospatial.py +349 -0
- bedrock_ge/gi/io_utils.py +271 -0
- bedrock_ge/gi/mapper.py +221 -0
- bedrock_ge/gi/mapping_models.py +69 -0
- bedrock_ge/gi/schemas.py +136 -36
- bedrock_ge/gi/validate.py +46 -109
- bedrock_ge/gi/write.py +58 -38
- bedrock_ge/plot.py +3 -1
- bedrock_ge-0.3.0.dist-info/METADATA +208 -0
- bedrock_ge-0.3.0.dist-info/RECORD +22 -0
- bedrock_ge/gi/ags/__init__.py +0 -0
- bedrock_ge/gi/ags/read.py +0 -190
- bedrock_ge/gi/ags/transform.py +0 -264
- bedrock_ge/gi/ags/validate.py +0 -25
- bedrock_ge/gi/brgi-schema.json +0 -36
- bedrock_ge/gi/concatenate.py +0 -38
- bedrock_ge/gi/gis_geometry.py +0 -280
- bedrock_ge-0.2.3.dist-info/METADATA +0 -227
- bedrock_ge-0.2.3.dist-info/RECORD +0 -21
- /bedrock_ge/gi/{ags/ags3_data_dictionary.json → ags3_data_dictionary.json} +0 -0
- /bedrock_ge/gi/{ags/ags4_data_dictionary.json → ags4_data_dictionary.json} +0 -0
- {bedrock_ge-0.2.3.dist-info → bedrock_ge-0.3.0.dist-info}/WHEEL +0 -0
- {bedrock_ge-0.2.3.dist-info → bedrock_ge-0.3.0.dist-info}/licenses/LICENSE +0 -0
bedrock_ge/__init__.py
CHANGED
bedrock_ge/gi/ags.py
ADDED
@@ -0,0 +1,103 @@
from pathlib import Path
from typing import IO

from pyproj import CRS

from bedrock_ge.gi.ags3 import ags3_to_brgi_db_mapping
from bedrock_ge.gi.io_utils import detect_encoding, open_text_data_source
from bedrock_ge.gi.mapping_models import BedrockGIMapping


def ags_to_brgi_db_mapping(
    source: str | Path | IO[str] | IO[bytes] | bytes,
    projected_crs: CRS,
    vertical_crs: CRS = CRS(3855),
    encoding: str | None = None,
) -> BedrockGIMapping:
    """Map AGS 3 or AGS 4 data to the Bedrock GI data model.

    Args:
        source (str | Path | IO[str] | IO[bytes] | bytes): The AGS file (str or Path)
            or a file-like object that represents the AGS file.
        projected_crs (CRS): Projected Coordinate Reference System (CRS). For example:
            - OSGB36 / British National Grid: `pyproj.CRS("EPSG:27700")`
            - Hong Kong 1980 Grid System: `pyproj.CRS("EPSG:2326")`
        vertical_crs (CRS, optional): Vertical CRS. Defaults to EGM2008 height (EPSG:3855),
            which measures the orthometric height w.r.t. the Earth Gravitational Model 2008.
            - Ordnance Datum Newlyn (ODN) Height: `pyproj.CRS("EPSG:5701")`
            - Hong Kong Principal Datum (HKPD) Height: `pyproj.CRS("EPSG:5738")`
        encoding (str | None, optional): Encoding of the text file or bytes stream.
            Defaults to None. An attempt at detecting the encoding will be made if None.

    Raises:
        ValueError: If the data does not match the AGS 3 or AGS 4 format.

    Returns:
        BedrockGIMapping: Object that maps AGS 3 or AGS 4 data to the Bedrock GI data model.
    """
    if not encoding:
        encoding = detect_encoding(source)

    # Get first non-blank line, None if all lines are blank
    with open_text_data_source(source, encoding=encoding) as f:
        first_line = next((line.strip() for line in f if line.strip()), None)

    if first_line:
        if first_line.startswith('"**'):
            ags_version = 3
            brgi_db_mapping = ags3_to_brgi_db_mapping(
                source, projected_crs, vertical_crs, encoding
            )
        elif first_line.startswith('"GROUP"'):
            ags_version = 4
            # brgi_db_mapping = ags4_to_brgi_db_mapping(
            #     source, projected_crs, vertical_crs, encoding
            # )
        else:
            # If first non-empty line doesn't match AGS 3 or AGS 4 format
            raise ValueError("The data provided is not valid AGS 3 or AGS 4 data.")
    else:
        raise ValueError("The file provided has only blank lines.")

    # Put CPT data into the brgi_db.Other tables, because CPT data has too many
    # rows to generate geospatial geometry for.
    # "STCN" and "SCPT" are the group names for CPT data in AGS 3 and AGS 4 respectively.
    # TODO: implement a warning when interpolating GI geospatial geometry when
    # TODO: a single GI location has waaay too many rows in a certain In-Situ test,
    # TODO: rather than removing specific groups here.
    insitu_test_names = {
        insitu_test.table_name: i
        for i, insitu_test in enumerate(brgi_db_mapping.InSitu)
    }
    cpt_key = (
        "STCN"
        if "STCN" in insitu_test_names
        else "SCPT"
        if "SCPT" in insitu_test_names
        else None
    )
    if cpt_key is not None:
        cpt_data_mapping = brgi_db_mapping.InSitu.pop(insitu_test_names[cpt_key])
        del insitu_test_names[cpt_key]
        brgi_db_mapping.Other.append(cpt_data_mapping)

    # Log information about the mapped AGS 3 or AGS 4 data
    project_id = brgi_db_mapping.Project.project_id
    n_gi_locations = len(brgi_db_mapping.Location.data)
    n_samples = len(brgi_db_mapping.Sample.data) if brgi_db_mapping.Sample else 0
    print_args = [
        f"AGS {ags_version} data was read for Project {project_id}",
        f"This GI data contains {n_gi_locations} GI locations, {n_samples} samples and:",
        f" - In-Situ Tests: {list(insitu_test_names.keys())}",
    ]
    if brgi_db_mapping.Lab:
        print_args.append(
            f" - Lab Tests: {[lab_test.table_name for lab_test in brgi_db_mapping.Lab]}"
        )
    if brgi_db_mapping.Other:
        print_args.append(
            f" - Other Tables: {[other_table.table_name for other_table in brgi_db_mapping.Other]}"
        )
    print(*print_args, sep="\n", end="\n\n")

    return brgi_db_mapping
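For context, a minimal usage sketch of the new `ags_to_brgi_db_mapping` entry point (not part of the diff); the file path and CRS choices below are placeholders:

from pathlib import Path

from pyproj import CRS

from bedrock_ge.gi.ags import ags_to_brgi_db_mapping

# Hypothetical AGS file and CRS choices, for illustration only.
brgi_db_mapping = ags_to_brgi_db_mapping(
    source=Path("example_project.ags"),
    projected_crs=CRS("EPSG:2326"),  # Hong Kong 1980 Grid System
    vertical_crs=CRS("EPSG:5738"),   # Hong Kong Principal Datum Height
)
print(brgi_db_mapping.Project.project_id)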
bedrock_ge/gi/ags3.py
ADDED
@@ -0,0 +1,275 @@
from pathlib import Path
from typing import IO, Any

import pandas as pd
from pyproj import CRS

from bedrock_ge.gi.ags_schemas import Ags3HOLE, Ags3SAMP, check_ags_proj_group
from bedrock_ge.gi.io_utils import coerce_string, open_text_data_source
from bedrock_ge.gi.mapping_models import (
    BedrockGIMapping,
    InSituTestTableMapping,
    LabTestTableMapping,
    LocationTableMapping,
    OtherTable,
    ProjectTableMapping,
    SampleTableMapping,
)


def ags3_to_dfs(
    source: str | Path | IO[str] | IO[bytes] | bytes, encoding: str
) -> dict[str, pd.DataFrame]:
    """Converts AGS 3 data to a dictionary of pandas DataFrames.

    Also strips '?' from non-standard AGS 3 group and header names, in order to
    make the rest of the code more generic.

    Args:
        source (str | Path | IO[str] | IO[bytes] | bytes): The AGS 3 file (str or Path)
            or a file-like object that represents the AGS 3 file.
        encoding (str): Encoding of the file or object.

    Returns:
        dict[str, pd.DataFrame]: A dictionary of pandas DataFrames, i.e. a database,
            where each key is an AGS 3 group, and the corresponding value is
            a pandas DataFrame containing the data for that group.
    """
    # Initialize dictionary and variables used in the AGS 3 read loop
    ags3_dfs = {}
    line_type = "line_0"
    group = ""
    headers: list[str] = ["", "", ""]
    group_data: list[list[Any]] = [[], [], []]

    with open_text_data_source(source, encoding=encoding) as file:
        for i, line in enumerate(file):
            line = line.strip()
            last_line_type = line_type

            # In AGS 3.1 group names are prefixed with **
            if line.startswith('"**'):
                line_type = "group_name"
                if group:
                    ags3_dfs[group] = pd.DataFrame(group_data, columns=headers)

                group = line.strip(' ,"*?')
                group_data = []

            # In AGS 3 header names are prefixed with "*
            elif line.startswith('"*'):
                line_type = "headers"
                new_headers = line.split('","')
                new_headers = [h.strip(' ,"*?') for h in new_headers]

                # Some groups have so many headers that they span multiple lines.
                # Therefore we need to check whether the new headers are
                # a continuation of the previous headers from the last line.
                if line_type == last_line_type:
                    headers = headers + new_headers
                else:
                    headers = new_headers

            # Skip lines where group units are defined; these are defined in the AGS 3 data dictionary.
            elif line.startswith('"<UNITS>"'):
                line_type = "units"
                continue

            # The remaining lines contain:
            # 1. GI data
            # 2. a continuation of the previous line, with "<CONT>" in the first column
            # 3. nothing, or data that can be discarded
            else:
                line_type = "data_row"
                data_row = line.split('","')
                if len("".join(data_row)) == 0:
                    # print(f"Line {i} is empty. Last Group: {group}")
                    continue
                elif len(data_row) != len(headers):
                    print(
                        f"\n🚨 CAUTION: The number of columns ({len(data_row)}) on line {i + 1} doesn't match the number of columns ({len(headers)}) of group {group}!",
                        f"{group} headers: {headers}",
                        f"Line {i + 1}: {data_row}",
                        sep="\n",
                        end="\n\n",
                    )
                    continue
                # Append continued lines (<CONT>) to the last data_row
                elif data_row[0] == '"<CONT>':
                    last_data_row = group_data[-1]
                    for j, data in enumerate(data_row):
                        data = data.strip(' "')
                        if data and data != "<CONT>":
                            if last_data_row[j] is None:
                                # Last data row didn't contain data for this column
                                last_data_row[j] = coerce_string(data)
                            else:
                                # Last data row already contains data for this column
                                last_data_row[j] = str(last_data_row[j]) + data
                # Lines that are assumed to contain valid data are added to the group data
                else:
                    cleaned_data_row = []
                    for data in data_row:
                        cleaned_data_row.append(coerce_string(data.strip(' "')))
                    group_data.append(cleaned_data_row)

    # Also add the last group's df to the dictionary of AGS dfs
    ags3_dfs[group] = pd.DataFrame(group_data, columns=headers)

    if not group:
        print(
            '🚨 ERROR: The provided AGS 3 data does not contain any groups, i.e. lines starting with "**'
        )

    return ags3_dfs


# TODO: AGS 3 table validation based on the AGS 3 data dictionary.
def ags3_to_brgi_db_mapping(
    source: str | Path | IO[str] | IO[bytes] | bytes,
    projected_crs: CRS,
    vertical_crs: CRS,
    encoding: str,
) -> BedrockGIMapping:
    """Map AGS 3 data to the Bedrock GI data model.

    Args:
        source (str | Path | IO[str] | IO[bytes] | bytes): The AGS 3 file (str or Path)
            or a file-like object that represents the AGS 3 file.
        projected_crs (CRS): Projected coordinate reference system (CRS).
        vertical_crs (CRS): Vertical CRS, e.g. EGM2008 height (EPSG:3855), which
            measures the orthometric height w.r.t. the Earth Gravitational Model 2008.
        encoding (str): Encoding of the text file or bytes stream.

    Returns:
        BedrockGIMapping: Object that maps AGS 3 data to the Bedrock GI data model.
    """
    ags3_dfs = ags3_to_dfs(source, encoding)

    check_ags_proj_group(ags3_dfs["PROJ"])
    ags3_project = ProjectTableMapping(
        data=ags3_dfs["PROJ"].to_dict(orient="records")[0],
        project_id=ags3_dfs["PROJ"].at[0, "PROJ_ID"],
        horizontal_crs=projected_crs,
        vertical_crs=vertical_crs,
    )
    del ags3_dfs["PROJ"]

    Ags3HOLE.validate(ags3_dfs["HOLE"])
    ags3_location = LocationTableMapping(
        data=ags3_dfs["HOLE"],
        location_id_column="HOLE_ID",
        easting_column="HOLE_NATE",
        northing_column="HOLE_NATN",
        ground_level_elevation_column="HOLE_GL",
        depth_to_base_column="HOLE_FDEP",
    )
    del ags3_dfs["HOLE"]

    if "SAMP" in ags3_dfs.keys():
        Ags3SAMP.validate(ags3_dfs["SAMP"])
        samp_df = ags3_dfs["SAMP"]
        samp_df = _add_sample_source_id(samp_df)
        ags3_sample = SampleTableMapping(
            data=samp_df,
            location_id_column="HOLE_ID",
            sample_id_column="sample_source_id",
            depth_to_top_column="SAMP_TOP",
        )
        del ags3_dfs["SAMP"]
    else:
        ags3_sample = None

    ags3_lab_tests = []
    ags3_insitu_tests = []
    ags3_other_tables = []

    for group, df in ags3_dfs.items():
        # Non-standard group names contain the "?" prefix.
        # => checking that "SAMP_TOP" / "HOLE_ID" is in the columns is too restrictive.
        if "SAMP_TOP" in df.columns:
            df = _add_sample_source_id(df)
            ags3_lab_tests.append(
                LabTestTableMapping(
                    table_name=group,
                    data=df,
                    location_id_column="HOLE_ID",
                    sample_id_column="sample_source_id",
                )
            )
        elif "HOLE_ID" in df.columns:
            top_depth, base_depth = _get_depth_columns(group, list(df.columns))
            ags3_insitu_tests.append(
                InSituTestTableMapping(
                    table_name=group,
                    data=df,
                    location_id_column="HOLE_ID",
                    depth_to_top_column=top_depth,
                    depth_to_base_column=base_depth,
                )
            )
        else:
            ags3_other_tables.append(OtherTable(table_name=group, data=df))

    brgi_db_mapping = BedrockGIMapping(
        Project=ags3_project,
        Location=ags3_location,
        InSitu=ags3_insitu_tests,
        Sample=ags3_sample,
        Lab=ags3_lab_tests,
        Other=ags3_other_tables,
    )
    return brgi_db_mapping


def _add_sample_source_id(df: pd.DataFrame) -> pd.DataFrame:
    df["sample_source_id"] = (
        df["SAMP_REF"].astype(str)
        + "-"
        + df["SAMP_TYPE"].astype(str)
        + "-"
        + df["SAMP_TOP"].astype(str)
        + "-"
        + df["HOLE_ID"].astype(str)
    )
    return df


def _get_depth_columns(group: str, headers: list[str]) -> tuple[str | None, str | None]:
    top_depth: str | None = f"{group}_TOP"
    base_depth: str | None = f"{group}_BASE"

    match group:
        case "CDIA":
            top_depth = "CDIA_CDEP"
        case "FLSH":
            top_depth = "FLSH_FROM"
            base_depth = "FLSH_TO"
        case "CORE":
            base_depth = "CORE_BOT"
        case "HDIA":
            top_depth = "HDIA_HDEP"
        case "PTIM":
            top_depth = "PTIM_DEP"
        case "IVAN":
            top_depth = "IVAN_DPTH"
        case "STCN":
            top_depth = "STCN_DPTH"
        case "POBS" | "PREF":
            top_depth = "PREF_TDEP"
        case "DREM":
            top_depth = "DREM_DPTH"
        case "PRTD" | "PRTG" | "PRTL":
            top_depth = "PRTD_DPTH"

    if top_depth not in headers:
        top_depth = None
    if base_depth not in headers:
        base_depth = None
    if not top_depth and not base_depth:
        raise ValueError(
            f'The in-situ test group "{group}" in this AGS 3 file does not contain a top or base depth heading!'
        )

    return top_depth, base_depth
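To illustrate the parsing loop in `ags3_to_dfs`, a small sketch with a made-up AGS 3 fragment (assuming `open_text_data_source` accepts an in-memory text stream, as its signature suggests):

import io

from bedrock_ge.gi.ags3 import ags3_to_dfs

# Made-up AGS 3 fragment: one PROJ group with headers, a units row and one data row.
ags3_text = (
    '"**PROJ"\n'
    '"*PROJ_ID","*PROJ_NAME"\n'
    '"<UNITS>",""\n'
    '"P-001","Example Project"\n'
)
dfs = ags3_to_dfs(io.StringIO(ags3_text), encoding="utf-8")
print(dfs["PROJ"])  # one-row DataFrame with PROJ_ID and PROJ_NAME columns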
bedrock_ge/gi/ags4.py
ADDED
@@ -0,0 +1,29 @@
from pathlib import Path
from typing import IO

import pandas as pd
from python_ags4 import AGS4


def ags4_to_dfs(
    source: str | Path | IO[str] | IO[bytes] | bytes,
) -> dict[str, pd.DataFrame]:
    """Converts AGS 4 data to a dictionary of pandas DataFrames.

    Args:
        source (str | Path | IO[str] | IO[bytes] | bytes): The AGS 4 file (str or Path)
            or a file-like object that represents an AGS 4 file.

    Returns:
        dict[str, pd.DataFrame]: A dictionary of pandas DataFrames, where each key
            represents a group name from AGS 4 data, and the corresponding value is a
            pandas DataFrame containing the data for that group.
    """
    ags4_tups = AGS4.AGS4_to_dataframe(source)

    ags4_dfs = {}
    for group, df in ags4_tups[0].items():
        # Drop the UNIT and TYPE rows and the HEADING column, keeping only data rows
        df = df.loc[2:].drop(columns=["HEADING"]).reset_index(drop=True)
        ags4_dfs[group] = df

    return ags4_dfs
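A usage sketch for `ags4_to_dfs`; the file name is a placeholder, and `python_ags4` does the actual parsing:

from bedrock_ge.gi.ags4 import ags4_to_dfs

# Hypothetical AGS 4 file path, for illustration only.
ags4_dfs = ags4_to_dfs("example_project.ags")
for group, df in ags4_dfs.items():
    print(group, df.shape)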
bedrock_ge/gi/{ags/schemas.py → ags_schemas.py}
RENAMED
@@ -1,7 +1,35 @@
-import
+import pandas as pd
+import pandera.pandas as pa
 from pandera.typing import Series
 
 
+def check_ags_proj_group(ags_proj: pd.DataFrame) -> bool:
+    """Checks if the AGS 3 or AGS 4 PROJ group is correct.
+
+    Args:
+        ags_proj (pd.DataFrame): The DataFrame with the PROJ group.
+
+    Raises:
+        ValueError: If the AGS 3 or AGS 4 PROJ group is not correct.
+
+    Returns:
+        bool: Returns True if the AGS 3 or AGS 4 PROJ group is correct.
+    """
+    if len(ags_proj) != 1:
+        raise ValueError("The PROJ group must contain exactly one row.")
+
+    msg = 'The project ID ("PROJ_ID" in the "PROJ" group) is missing from the AGS data.'
+    try:
+        project_id = ags_proj.at[ags_proj.index[0], "PROJ_ID"]
+    except KeyError:
+        raise ValueError(msg)
+
+    if pd.isna(project_id) or str(project_id).strip() == "":
+        raise ValueError(msg)
+
+    return True
+
+
 class Ags3HOLE(pa.DataFrameModel):
     HOLE_ID: Series[str] = pa.Field(
         # primary_key=True,
@@ -56,13 +84,6 @@ class BaseSAMP(pa.DataFrameModel):
 
 
 class Ags3SAMP(BaseSAMP):
-    sample_id: Series[str] = pa.Field(
-        # primary_key=True,
-        unique=True,
-        coerce=True,
-        description="Sample unique identifier",
-        # example="REF_TYPE_TOP_HOLE_ID",
-    )
     HOLE_ID: Series[str] = pa.Field(
         # foreign_key="Ags3HOLE.HOLE_ID",
         description="Exploratory hole or location equivalent",
bedrock_ge/gi/db_operations.py
ADDED
@@ -0,0 +1,128 @@
from collections.abc import Iterable

import pandas as pd

from bedrock_ge.gi.io_utils import convert_object_col_content_to_string
from bedrock_ge.gi.schemas import (
    BedrockGIDatabase,
    InSituTestSchema,
    LabTestSchema,
    LocationSchema,
    ProjectSchema,
    SampleSchema,
)
from bedrock_ge.gi.validate import check_foreign_key


def merge_dbs(
    brgi_dbs: Iterable[BedrockGIDatabase],
) -> BedrockGIDatabase:
    """Merges multiple Bedrock GI databases into a single Bedrock GI database.

    Tables that the databases have in common are concatenated into a single
    pandas DataFrame. Tables that are unique to one of the databases are
    included in the merged database as-is.

    Args:
        brgi_dbs: The Bedrock GI databases containing the data to be merged.

    Returns:
        BedrockGIDatabase: Merged Bedrock GI database.
    """
    dbs = list(brgi_dbs)

    if not dbs:
        raise ValueError("Cannot merge an empty list of Bedrock GI databases.")
    elif len(dbs) == 1 and isinstance(dbs[0], BedrockGIDatabase):
        return dbs[0]

    project_dataframes = _filter_dataframes([db.Project for db in dbs])
    merged_project = pd.concat(project_dataframes, ignore_index=True)
    merged_project = merged_project.drop_duplicates().reset_index(drop=True)
    merged_project = convert_object_col_content_to_string(merged_project)
    ProjectSchema.validate(merged_project)

    location_dataframes = _filter_dataframes([db.Location for db in dbs])
    merged_location = pd.concat(location_dataframes, ignore_index=True)
    merged_location = merged_location.drop_duplicates().reset_index(drop=True)
    merged_location = convert_object_col_content_to_string(merged_location)
    LocationSchema.validate(merged_location)
    check_foreign_key("project_uid", merged_project, merged_location)

    insitu_tables: set[str] = set()
    lab_tables: set[str] = set()
    other_tables: set[str] = set()
    for db in dbs:
        insitu_tables.update(db.InSituTests.keys())
        if db.LabTests:
            lab_tables.update(db.LabTests.keys())
        if db.Other:
            other_tables.update(db.Other.keys())

    merged_insitu: dict[str, pd.DataFrame] = {}
    for table_name in insitu_tables:
        insitu_dataframes = _filter_dataframes(
            [db.InSituTests.get(table_name) for db in dbs]
        )
        insitu_df = pd.concat(insitu_dataframes, ignore_index=True)
        insitu_df = insitu_df.drop_duplicates().reset_index(drop=True)
        insitu_df = convert_object_col_content_to_string(insitu_df)
        InSituTestSchema.validate(insitu_df)
        check_foreign_key("project_uid", merged_project, insitu_df)
        check_foreign_key("location_uid", merged_location, insitu_df)
        merged_insitu[table_name] = insitu_df

    sample_dfs = _filter_dataframes([db.Sample for db in dbs])
    merged_sample = None
    if sample_dfs:
        merged_sample = pd.concat(sample_dfs, ignore_index=True)
        merged_sample = merged_sample.drop_duplicates().reset_index(drop=True)
        merged_sample = convert_object_col_content_to_string(merged_sample)
        SampleSchema.validate(merged_sample)
        check_foreign_key("project_uid", merged_project, merged_sample)

    merged_lab: dict[str, pd.DataFrame] = {}
    for table_name in lab_tables:
        lab_dataframes = _filter_dataframes([db.LabTests.get(table_name) for db in dbs])
        lab_df = pd.concat(lab_dataframes, ignore_index=True)
        lab_df = lab_df.drop_duplicates().reset_index(drop=True)
        lab_df = convert_object_col_content_to_string(lab_df)
        LabTestSchema.validate(lab_df)
        check_foreign_key("project_uid", merged_project, lab_df)
        check_foreign_key("sample_uid", merged_sample, lab_df)
        merged_lab[table_name] = lab_df

    merged_other: dict[str, pd.DataFrame] = {}
    for table_name in other_tables:
        other_dataframes = _filter_dataframes([db.Other.get(table_name) for db in dbs])
        other_df = pd.concat(other_dataframes, ignore_index=True)
        other_df = other_df.drop_duplicates().reset_index(drop=True)
        other_df = convert_object_col_content_to_string(other_df)
        check_foreign_key("project_uid", merged_project, other_df)
        merged_other[table_name] = other_df

    return BedrockGIDatabase(
        Project=merged_project,
        Location=merged_location,
        InSituTests=merged_insitu,
        Sample=merged_sample,
        LabTests=merged_lab,
        Other=merged_other,
    )


def _filter_dataframes(dataframes: list[pd.DataFrame | None]) -> list[pd.DataFrame]:
    """Filter out empty or all-NA DataFrames to avoid FutureWarnings."""
    valid_dfs = []
    for df in dataframes:
        if df is not None and not df.empty and not df.isna().all().all():
            if df.columns.duplicated().any():
                raise ValueError(
                    f"Duplicate column names found in dataframe:\n{list(df.columns)}"
                )

            # Drop all-NA columns in place before keeping the DataFrame
            df.dropna(axis=1, how="all", inplace=True)

            valid_dfs.append(df)
    return valid_dfs
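A sketch of how `merge_dbs` might be wrapped in practice; `merge_and_report` is a hypothetical helper, not part of the package:

from collections.abc import Iterable

from bedrock_ge.gi.db_operations import merge_dbs
from bedrock_ge.gi.schemas import BedrockGIDatabase


def merge_and_report(dbs: Iterable[BedrockGIDatabase]) -> BedrockGIDatabase:
    """Merge several Bedrock GI databases and print a short summary."""
    merged = merge_dbs(dbs)
    print(f"{len(merged.Project)} project(s), {len(merged.Location)} GI location(s)")
    print(f"In-situ test tables: {list(merged.InSituTests.keys())}")
    return merged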