bedrock-ge 0.2.4__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bedrock_ge/__init__.py +1 -1
- bedrock_ge/gi/ags.py +103 -0
- bedrock_ge/gi/ags3.py +275 -0
- bedrock_ge/gi/ags4.py +29 -0
- bedrock_ge/gi/{ags/schemas.py → ags_schemas.py} +29 -8
- bedrock_ge/gi/db_operations.py +128 -0
- bedrock_ge/gi/geospatial.py +349 -0
- bedrock_ge/gi/io_utils.py +271 -0
- bedrock_ge/gi/mapper.py +221 -0
- bedrock_ge/gi/mapping_models.py +69 -0
- bedrock_ge/gi/schemas.py +136 -36
- bedrock_ge/gi/validate.py +45 -108
- bedrock_ge/gi/write.py +54 -37
- {bedrock_ge-0.2.4.dist-info → bedrock_ge-0.3.1.dist-info}/METADATA +3 -3
- bedrock_ge-0.3.1.dist-info/RECORD +22 -0
- bedrock_ge/gi/ags/__init__.py +0 -0
- bedrock_ge/gi/ags/read.py +0 -192
- bedrock_ge/gi/ags/transform.py +0 -264
- bedrock_ge/gi/ags/validate.py +0 -25
- bedrock_ge/gi/brgi-schema.json +0 -36
- bedrock_ge/gi/concatenate.py +0 -38
- bedrock_ge/gi/gis_geometry.py +0 -282
- bedrock_ge-0.2.4.dist-info/RECORD +0 -21
- /bedrock_ge/gi/{ags/ags3_data_dictionary.json → ags3_data_dictionary.json} +0 -0
- /bedrock_ge/gi/{ags/ags4_data_dictionary.json → ags4_data_dictionary.json} +0 -0
- {bedrock_ge-0.2.4.dist-info → bedrock_ge-0.3.1.dist-info}/WHEEL +0 -0
- {bedrock_ge-0.2.4.dist-info → bedrock_ge-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,349 @@
|
|
1
|
+
import geopandas as gpd
|
2
|
+
import numpy as np
|
3
|
+
import pandas as pd
|
4
|
+
from pandera.typing import DataFrame
|
5
|
+
from pyproj import CRS, Transformer
|
6
|
+
from pyproj.crs.crs import CompoundCRS
|
7
|
+
from shapely.geometry import LineString, Point
|
8
|
+
|
9
|
+
from bedrock_ge.gi.schemas import (
|
10
|
+
BedrockGIDatabase,
|
11
|
+
BedrockGIGeospatialDatabase,
|
12
|
+
InSituTestSchema,
|
13
|
+
LocationSchema,
|
14
|
+
SampleSchema,
|
15
|
+
)
|
16
|
+
|
17
|
+
|
18
|
+
def create_brgi_geodb(
    brgi_db: BedrockGIDatabase,
) -> BedrockGIGeospatialDatabase:
    """Build a Bedrock GI geospatial database from a Bedrock GI database.

    The conversion consists of these steps:
        1. The Location table becomes a GeoDataFrame via `create_location_geodf`.
        2. The LonLatHeight GeoDataFrame is derived via `create_lon_lat_height_geodf`.
        3. Every In-Situ test table gets a geometry column via
           `interpolate_gi_geometry`, using the Location GeoDataFrame.
        4. The Sample table, if present, is treated the same way as an
           In-Situ test table; otherwise it stays `None`.
        5. The Project, LabTests and Other tables are passed through unchanged.

    Args:
        brgi_db (BedrockGIDatabase): The Bedrock GI database to be converted.

    Returns:
        BedrockGIGeospatialDatabase: The resulting Bedrock GI geospatial database.
    """
    locations = create_location_geodf(brgi_db)
    lon_lat_height = create_lon_lat_height_geodf(brgi_db)

    # One geospatial table per In-Situ test table, keyed by the same name.
    insitu_tests = {
        test_name: interpolate_gi_geometry(test_df, locations)  # type: ignore
        for test_name, test_df in brgi_db.InSituTests.items()
    }

    # The Sample table is optional.
    samples = None
    if brgi_db.Sample is not None:
        samples = interpolate_gi_geometry(brgi_db.Sample, locations)  # type: ignore

    return BedrockGIGeospatialDatabase(
        Project=brgi_db.Project,
        Location=locations,
        LonLatHeight=lon_lat_height,
        InSituTests=insitu_tests,
        Sample=samples,
        LabTests=brgi_db.LabTests,
        Other=brgi_db.Other,
    )
|
63
|
+
|
64
|
+
|
65
|
+
def create_location_geodf(brgi_db: BedrockGIDatabase) -> gpd.GeoDataFrame:
    """Turn the Location table of a Bedrock GI database into a GeoDataFrame.

    Every location is represented by a two-vertex 3D LineString that runs from
    (easting, northing, ground_level_elevation) straight down to
    (easting, northing, ground_level_elevation - depth_to_base), i.e. boreholes
    are assumed to be vertical (for now). The elevation at the base is only
    used to build the geometry; it is not added as a column to the result.

    Args:
        brgi_db (BedrockGIDatabase): The Bedrock GI database containing location
            data and project CRS information.

    Returns:
        gpd.GeoDataFrame: A copy of the Location table with LineString geometries
            representing vertical boreholes, using the compound CRS built from
            the project's horizontal and vertical CRS.

    Raises:
        ValueError: If the Project table contains more than one distinct
            horizontal CRS or more than one distinct vertical CRS.
    """
    # TODO: Implement logic to handle multiple CRS'es in the input GI data:
    #   1. Create WKT geometry for each location in original CRS
    #   2. Convert to WGS84 + EGM2008 orthometric height EPSG:9518
    #   3. Interpolate InSituTest and Sample geospatial vector geometry from active geometry column
    projects = brgi_db.Project
    horizontal_wkts = projects["horizontal_crs_wkt"]
    vertical_wkts = projects["vertical_crs_wkt"]

    single_crs = horizontal_wkts.nunique() <= 1 and vertical_wkts.nunique() <= 1
    if not single_crs:
        raise ValueError(
            "All projects must have the same horizontal and vertical CRS (Coordinate Reference System).\n"
            "Raise an issue on GitHub in case you need to be able to combine GI data that was acquired in multiple different CRSes."
        )

    # All projects share one CRS pair, so the first row is representative.
    horizontal_crs = CRS.from_wkt(horizontal_wkts.iat[0])
    vertical_crs = CRS.from_wkt(vertical_wkts.iat[0])
    compound_crs = CompoundCRS(
        name=f"{horizontal_crs.name} + {vertical_crs.name}",
        components=[horizontal_crs, vertical_crs],
    )

    def _vertical_borehole(row: pd.Series) -> LineString:
        """Return the top-to-base LineString of a single location row."""
        easting, northing = row["easting"], row["northing"]
        top = (easting, northing, row["ground_level_elevation"])
        base = (easting, northing, row["elevation_at_base"])
        return LineString([top, base])

    # TODO: Implement logic such that inclined borholes are handled correctly.
    # All boreholes are now assumed to be vertical.
    working_df = brgi_db.Location.copy()
    working_df["elevation_at_base"] = (
        working_df["ground_level_elevation"] - working_df["depth_to_base"]
    )
    borehole_lines = working_df.apply(_vertical_borehole, axis=1)

    # A fresh copy of the Location table is used so that the helper column
    # "elevation_at_base" does not end up in the returned GeoDataFrame.
    return gpd.GeoDataFrame(
        brgi_db.Location.copy(),
        geometry=borehole_lines,
        crs=compound_crs,
    )
|
121
|
+
|
122
|
+
|
123
|
+
def create_lon_lat_height_geodf(brgi_db: BedrockGIDatabase) -> gpd.GeoDataFrame:
    """Create a GeoDataFrame with (lon, lat, height) points for every GI location.

    The locations are processed per project. For each project, the
    (easting, northing, ground level elevation) coordinates are transformed from
    the project's compound CRS (horizontal + vertical) to EPSG:9518, which is
    WGS84 (lon, lat) + EGM2008 orthometric height. The per-project results are
    concatenated with a fresh index, so the row order follows the project
    grouping rather than the original Location table.

    Args:
        brgi_db (BedrockGIDatabase): The source Bedrock Ground Investigation database
            containing location and project information.

    Returns:
        gpd.GeoDataFrame: A GeoDataFrame with the columns "project_uid",
            "location_uid", "longitude", "latitude" and
            "egm2008_ground_level_height", plus 3D point geometries in EPSG:9518.
    """
    target_crs = CRS("EPSG:9518")
    project_crs_table = brgi_db.Project.set_index("project_uid")

    def _transform_project_locations(project_uid, project_locations) -> pd.DataFrame:
        """Transform the locations of one project to lon/lat/EGM2008 height."""
        hor_crs = CRS.from_wkt(project_crs_table.at[project_uid, "horizontal_crs_wkt"])
        vert_crs = CRS.from_wkt(project_crs_table.at[project_uid, "vertical_crs_wkt"])
        source_crs = CompoundCRS(
            name=f"{hor_crs.name} + {vert_crs.name}",
            components=[hor_crs, vert_crs],
        )
        to_wgs84_egm2008 = Transformer.from_crs(source_crs, target_crs, always_xy=True)
        longitude, latitude, height = to_wgs84_egm2008.transform(
            project_locations["easting"],
            project_locations["northing"],
            project_locations["ground_level_elevation"],
        )
        return pd.DataFrame(
            {
                "project_uid": project_uid,
                "location_uid": project_locations["location_uid"],
                "longitude": longitude,
                "latitude": latitude,
                "egm2008_ground_level_height": height,
            }
        )

    per_project_frames = [
        _transform_project_locations(uid, group)
        for uid, group in brgi_db.Location.groupby("project_uid")
    ]
    combined = pd.concat(per_project_frames, ignore_index=True)

    points = gpd.points_from_xy(
        combined["longitude"],
        combined["latitude"],
        combined["egm2008_ground_level_height"],
    )
    return gpd.GeoDataFrame(combined, geometry=points, crs=target_crs)
|
181
|
+
|
182
|
+
|
183
|
+
def interpolate_gi_geometry(
    insitu_test_df: DataFrame[InSituTestSchema] | DataFrame[SampleSchema],
    location_geodf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """Interpolates the geospatial geometry for a given In-Situ test DataFrame using the corresponding GI Location GeoDataFrame.

    Each row of the In-Situ test (or Sample) DataFrame is matched to its location
    through "location_uid". The row's geometry is then derived from the
    location's LineString: a LineString when both "depth_to_top" and
    "depth_to_base" are given, or a Point when only one of them is given
    (see `_interpolate_gi_geometry_row`).

    Args:
        insitu_test_df: The In-Situ test or Sample DataFrame containing the depth values to be interpolated.
        location_geodf: The location GeoDataFrame containing the location LineStrings to be used for interpolation.

    Returns:
        gpd.GeoDataFrame: A copy of `insitu_test_df`, with its original index,
            plus the interpolated geometry in the CRS of `location_geodf`.

    Raises:
        ValueError: If the merge does not yield exactly one row per input row,
            which happens when "location_uid" is not unique in `location_geodf`.
    """
    # TODO: implement a warning when interpolating GI geospatial geometry when
    # TODO: a single GI location has waaay too many rows in a certain In-Situ test.
    geodf = location_geodf[["location_uid", "geometry"]].merge(
        insitu_test_df,
        how="right",
        on="location_uid",
    )
    geometry = geodf.apply(_interpolate_gi_geometry_row, axis=1)

    # The merge result carries a fresh RangeIndex, whereas the copy of
    # `insitu_test_df` below keeps its own index. A geometry Series is assigned
    # by index label, so without re-labelling, any input with a non-default
    # index (e.g. after filtering) would receive missing or misplaced
    # geometries. The right-merge keeps the row order of `insitu_test_df`,
    # which makes a positional re-label correct.
    geometry.index = insitu_test_df.index

    return gpd.GeoDataFrame(
        insitu_test_df.copy(),
        geometry=geometry,
        crs=str(geodf.crs),
    )
|
219
|
+
|
220
|
+
|
221
|
+
def _interpolate_gi_geometry_row(row: pd.Series) -> LineString | Point:
    """Derive the geometry of one merged row from its available depth values.

    Args:
        row: A row holding the location LineString under "geometry" and
            optionally "depth_to_top" and/or "depth_to_base".

    Returns:
        LineString | Point: The part of the location geometry between both
            depths when both are present, otherwise the point at the single
            depth that is present.

    Raises:
        KeyError: If neither "depth_to_top" nor "depth_to_base" holds a value.
    """
    top = row.get("depth_to_top")
    base = row.get("depth_to_base")
    top_known = pd.notna(top)
    base_known = pd.notna(base)

    if not (top_known or base_known):
        raise KeyError(
            "An In-Situ test must either have a 'depth_to_top' or a 'depth_to_base', or both."
        )

    location_line = row["geometry"]
    if top_known and base_known:
        # Depth interval -> sub-line of the location geometry.
        return substring_3d(location_line, start_dist=top, end_dist=base)

    # Exactly one depth is known -> single point at that depth.
    depth = top if top_known else base
    return interpolate_3d(location_line, distance=depth)
|
246
|
+
|
247
|
+
|
248
|
+
def calc_distances_along_3d_linestring(linestring: LineString) -> np.ndarray:
    """Return the cumulative 3D distance at every vertex of a LineString.

    Args:
        linestring: The LineString to measure; its coordinates must be 3D.

    Returns:
        np.ndarray: One value per vertex, starting at 0 for the first vertex
            and ending at the total 3D length of the line.

    Raises:
        ValueError: If the coordinates have fewer than three dimensions.
    """
    vertices = np.array(linestring.coords)
    if vertices.shape[1] < 3:
        raise ValueError("Coordinates must be 3D (x, y, z)")

    # Euclidean length of every segment between consecutive vertices.
    steps = np.diff(vertices, axis=0)
    squared_lengths = np.sum(steps**2, axis=1)
    segment_lengths = np.sqrt(squared_lengths)

    # Running total, prefixed with 0 for the first vertex.
    running_total = np.cumsum(segment_lengths)
    return np.concatenate([[0], running_total])
|
260
|
+
|
261
|
+
|
262
|
+
def interpolate_3d(linestring: LineString, distance: float) -> Point:
    """Return the point at a given true 3D distance along a 3D LineString.

    Distances are clamped to the line: a distance of 0 or less yields the first
    vertex and a distance equal to or beyond the total length yields the last
    vertex. This behavior is different than the shapely.LineString.interpolate
    method.

    Args:
        linestring: A 3D LineString geometry
        distance: Distance along the line in 3D space

    Returns:
        Point: The interpolated 3D point
    """
    if distance <= 0:
        return Point(linestring.coords[0])

    along = calc_distances_along_3d_linestring(linestring)
    if distance >= along[-1]:
        return Point(linestring.coords[-1])

    # Locate the segment [start_idx, end_idx] that contains the distance.
    end_idx = int(np.searchsorted(along, distance))
    start_idx = max(0, end_idx - 1)  # Ensure non-negative
    dist_at_start = along[start_idx]
    span = along[end_idx] - dist_at_start

    segment_start = np.array(linestring.coords[start_idx])
    if span == 0:
        # Zero-length segment: nothing to interpolate.
        return Point(segment_start)
    segment_end = np.array(linestring.coords[end_idx])

    # Fraction of the segment that lies before the requested distance.
    fraction = (distance - dist_at_start) / span
    return Point(segment_start + fraction * (segment_end - segment_start))
|
302
|
+
|
303
|
+
|
304
|
+
def substring_3d(
    linestring: LineString, start_dist: float, end_dist: float
) -> LineString | Point:
    """Extract a substring of a 3D LineString using true 3D distances.

    The two distances may be given in either order; they are swapped if needed
    and clamped to the range [0, total 3D length of the line].

    Args:
        linestring: A 3D LineString geometry
        start_dist: Start distance along the line in 3D space
        end_dist: End distance along the line in 3D space

    Returns:
        LineString | Point: The extracted 3D LineString segment, or a Point when
            the (clamped) interval collapses to a single location.
    """
    # Ensure start_dist <= end_dist
    if start_dist > end_dist:
        start_dist, end_dist = end_dist, start_dist

    # Calculate cumulative 3D distances
    cumulative_distances = calc_distances_along_3d_linestring(linestring)
    total_length = cumulative_distances[-1]

    # Clamp both distances onto the line
    start_dist = max(0, min(start_dist, total_length))
    end_dist = max(0, min(end_dist, total_length))

    if start_dist == end_dist:
        return interpolate_3d(linestring, start_dist)

    # The substring always starts at the interpolated start point ...
    start_point = interpolate_3d(linestring, start_dist)
    result_coords = [start_point.coords[0]]

    # ... continues through every vertex strictly inside the interval ...
    for i, dist in enumerate(cumulative_distances):
        if start_dist < dist < end_dist:
            result_coords.append(linestring.coords[i])

    # ... and ends at the interpolated end point, unless that would duplicate
    # the last coordinate that was already added.
    end_point = interpolate_3d(linestring, end_dist)
    if end_point.coords[0] != result_coords[-1]:  # Avoid duplicate points
        result_coords.append(end_point.coords[0])

    # Two distinct distances can still interpolate to the very same coordinate
    # (floating-point rounding on a near-zero interval). A LineString cannot be
    # built from a single coordinate, so return a Point in that case, just as
    # for start_dist == end_dist.
    if len(result_coords) < 2:
        return Point(result_coords[0])

    return LineString(result_coords)
|
@@ -0,0 +1,271 @@
|
|
1
|
+
"""Utility functions for reading, parsing and writing data."""
|
2
|
+
|
3
|
+
import codecs
|
4
|
+
import io
|
5
|
+
from contextlib import contextmanager, nullcontext
|
6
|
+
from pathlib import Path
|
7
|
+
from typing import IO, ContextManager
|
8
|
+
|
9
|
+
import chardet
|
10
|
+
import geopandas as gpd
|
11
|
+
import pandas as pd
|
12
|
+
|
13
|
+
from bedrock_ge.gi.schemas import BedrockGIDatabase, BedrockGIGeospatialDatabase
|
14
|
+
|
15
|
+
DEFAULT_ENCODING = "utf-8"
|
16
|
+
|
17
|
+
|
18
|
+
def detect_encoding(source: str | Path | IO[str] | IO[bytes] | bytes) -> str:
|
19
|
+
"""Detect the character encoding of various input types.
|
20
|
+
|
21
|
+
Args:
|
22
|
+
source (str | Path | IO[str] | IO[bytes] | bytes): The source to detect encoding from.
|
23
|
+
- str or Path: File path.
|
24
|
+
- IO[str]: Already decoded text stream (returns `DEFAULT_ENCODING`)
|
25
|
+
- IO[bytes]: Binary stream to detect encoding from
|
26
|
+
- bytes: Binary data to detect encoding from
|
27
|
+
|
28
|
+
Returns:
|
29
|
+
str: The detected encoding name (e.g., 'utf-8', 'iso-8859-1', 'ascii', etc.)
|
30
|
+
|
31
|
+
Raises:
|
32
|
+
TypeError: If the source type is unsupported
|
33
|
+
FileNotFoundError: If a file path doesn't exist
|
34
|
+
"""
|
35
|
+
# Set number of bytes to read for detection and required confidence
|
36
|
+
SAMPLE_SIZE = 1_000_000
|
37
|
+
REQUIRED_CONFIDENCE = 0.7
|
38
|
+
|
39
|
+
def _detect_from_bytes(data: bytes) -> str:
|
40
|
+
"""Detect encoding from bytes data."""
|
41
|
+
sample = data[: min(len(data), SAMPLE_SIZE)]
|
42
|
+
result = chardet.detect(sample)
|
43
|
+
encoding = result.get("encoding", DEFAULT_ENCODING)
|
44
|
+
confidence = result.get("confidence", 0.0)
|
45
|
+
|
46
|
+
if not encoding or confidence < REQUIRED_CONFIDENCE:
|
47
|
+
return DEFAULT_ENCODING
|
48
|
+
|
49
|
+
if encoding.lower() == "ascii":
|
50
|
+
return "utf-8"
|
51
|
+
|
52
|
+
return encoding
|
53
|
+
|
54
|
+
def _read_from_path(path: Path):
|
55
|
+
"""Read contents from path."""
|
56
|
+
if path.exists() and path.is_file():
|
57
|
+
with open(path, "rb") as file:
|
58
|
+
sample = file.read(SAMPLE_SIZE)
|
59
|
+
return _detect_from_bytes(sample)
|
60
|
+
else:
|
61
|
+
raise FileNotFoundError(
|
62
|
+
f"Path does not exist or is not a file: {path.__str__()[0:40]}"
|
63
|
+
)
|
64
|
+
|
65
|
+
# bytes
|
66
|
+
if isinstance(source, bytes):
|
67
|
+
return _detect_from_bytes(source)
|
68
|
+
|
69
|
+
# String, if not a path, still returns DEFAULT_ENCODING
|
70
|
+
if isinstance(source, str):
|
71
|
+
path = Path(source)
|
72
|
+
try:
|
73
|
+
return _read_from_path(path)
|
74
|
+
except FileNotFoundError:
|
75
|
+
return DEFAULT_ENCODING
|
76
|
+
|
77
|
+
# Path object
|
78
|
+
if isinstance(source, Path):
|
79
|
+
return _read_from_path(source)
|
80
|
+
|
81
|
+
# IO[str] object
|
82
|
+
if hasattr(source, "encoding"):
|
83
|
+
if source.encoding:
|
84
|
+
# Could be `None`, e.g. io.StringIO has an encoding attribute which is None.
|
85
|
+
return source.encoding
|
86
|
+
else:
|
87
|
+
return DEFAULT_ENCODING
|
88
|
+
|
89
|
+
# IO[bytes]
|
90
|
+
if isinstance(source, io.BufferedIOBase):
|
91
|
+
try:
|
92
|
+
if not source.seekable():
|
93
|
+
# For non-seekable streams, read what we can without seeking
|
94
|
+
sample = source.read(SAMPLE_SIZE)
|
95
|
+
if isinstance(sample, bytes):
|
96
|
+
return _detect_from_bytes(sample)
|
97
|
+
else:
|
98
|
+
return DEFAULT_ENCODING
|
99
|
+
|
100
|
+
# For seekable streams, preserve position
|
101
|
+
original_position = source.tell()
|
102
|
+
try:
|
103
|
+
source.seek(0)
|
104
|
+
sample = source.read(SAMPLE_SIZE)
|
105
|
+
if isinstance(sample, bytes):
|
106
|
+
encoding = _detect_from_bytes(sample)
|
107
|
+
else:
|
108
|
+
# if not bytes, then its a custom string-like type that was not caught
|
109
|
+
encoding = DEFAULT_ENCODING
|
110
|
+
return encoding
|
111
|
+
finally:
|
112
|
+
source.seek(original_position)
|
113
|
+
except (AttributeError, IOError, OSError):
|
114
|
+
return DEFAULT_ENCODING
|
115
|
+
|
116
|
+
raise TypeError(f"Unsupported input type for encoding detection: {type(source)}")
|
117
|
+
|
118
|
+
|
119
|
+
def open_text_data_source(
|
120
|
+
source: str | Path | IO[str] | IO[bytes] | bytes, encoding: str | None = None
|
121
|
+
) -> ContextManager[io.TextIOBase]:
|
122
|
+
"""Opens or wraps a given source for reading AGS (text-based) data.
|
123
|
+
|
124
|
+
Args:
|
125
|
+
source (str | Path | IO[str] | IO[bytes] | bytes): The source to read from.
|
126
|
+
- str or Path: File path or direct string content.
|
127
|
+
- IO[str]: A file-like text stream.
|
128
|
+
- IO[bytes]: Byte stream
|
129
|
+
- bytes: Binary content or stream (will be decoded).
|
130
|
+
encoding (str | None): Encoding to use for decoding bytes. Default is None.
|
131
|
+
|
132
|
+
Returns:
|
133
|
+
ContextManager[TextIOBase]: A context manager yielding a text stream.
|
134
|
+
|
135
|
+
Raises:
|
136
|
+
TypeError: If the source type is unsupported or binary streams are not decoded.
|
137
|
+
"""
|
138
|
+
try:
|
139
|
+
codecs.lookup(encoding)
|
140
|
+
except LookupError:
|
141
|
+
raise ValueError(f"Unsupported encoding: {encoding}")
|
142
|
+
|
143
|
+
@contextmanager
|
144
|
+
def _bytes_source(bytes_content: bytes):
|
145
|
+
string_io = io.StringIO(bytes_content.decode(encoding))
|
146
|
+
try:
|
147
|
+
yield string_io
|
148
|
+
finally:
|
149
|
+
string_io.close()
|
150
|
+
|
151
|
+
if isinstance(source, (str, Path)):
|
152
|
+
path = Path(source)
|
153
|
+
if path.exists() and path.is_file():
|
154
|
+
return open(path, "r", encoding=encoding)
|
155
|
+
raise FileNotFoundError(f"Path does not exist or is not a file: {source}")
|
156
|
+
|
157
|
+
elif isinstance(source, io.TextIOBase):
|
158
|
+
source.seek(0)
|
159
|
+
return nullcontext(source)
|
160
|
+
|
161
|
+
elif isinstance(source, io.BufferedIOBase):
|
162
|
+
text_stream = io.TextIOWrapper(source, encoding=encoding)
|
163
|
+
text_stream.seek(0)
|
164
|
+
return nullcontext(text_stream)
|
165
|
+
|
166
|
+
elif isinstance(source, bytes):
|
167
|
+
return _bytes_source(source)
|
168
|
+
|
169
|
+
else:
|
170
|
+
raise TypeError(
|
171
|
+
f"Unsupported source type: {type(source)}. "
|
172
|
+
"Expected str, Path, IO[str], IO[bytes], or bytes."
|
173
|
+
)
|
174
|
+
|
175
|
+
|
176
|
+
def coerce_string(string: str) -> None | bool | float | str:
|
177
|
+
"""Converts a string to an appropriate Python data type.
|
178
|
+
|
179
|
+
Args:
|
180
|
+
string (str): The input string to be converted.
|
181
|
+
|
182
|
+
Returns:
|
183
|
+
None: If the string is 'none', 'null', or empty.
|
184
|
+
bool: If the string is 'true' or 'false' (case insensitive).
|
185
|
+
int: If the string can be converted to a float and has no decimal part.
|
186
|
+
float: If the string can be converted to a float with a decimal part.
|
187
|
+
str: If the string cannot be converted to any of the above types.
|
188
|
+
|
189
|
+
"""
|
190
|
+
if string.lower() in {"none", "null", ""}:
|
191
|
+
return None
|
192
|
+
elif string.lower() == "true":
|
193
|
+
return True
|
194
|
+
elif string.lower() == "false":
|
195
|
+
return False
|
196
|
+
else:
|
197
|
+
try:
|
198
|
+
value = float(string)
|
199
|
+
if value.is_integer():
|
200
|
+
return int(value)
|
201
|
+
else:
|
202
|
+
return value
|
203
|
+
except ValueError:
|
204
|
+
return string
|
205
|
+
|
206
|
+
|
207
|
+
def brgi_db_to_dfs(
    brgi_db: BedrockGIDatabase | BedrockGIGeospatialDatabase,
) -> dict[str, pd.DataFrame | gpd.GeoDataFrame]:
    """Flatten a Bedrock GI (geospatial) database into a dictionary of DataFrames.

    The result starts with the "Project" and "Location" tables, followed by
    "LonLatHeight" (only if the database has that attribute) and "Sample"
    (only if it is not None). The In-Situ test, lab test and other tables are
    added after that, each under its own table name. When a table name occurs
    more than once, the table that is added last wins.

    Args:
        brgi_db (BedrockGIDatabase | BedrockGIGeospatialDatabase): The Bedrock GI (geospatial) database.

    Returns:
        dict[str, pd.DataFrame | gpd.GeoDataFrame]: A dictionary where the keys are
            the Bedrock GI table names and the values are the DataFrames that contain
            the data for each table.
    """
    tables = {"Project": brgi_db.Project, "Location": brgi_db.Location}

    # Only the geospatial database has a LonLatHeight table.
    if hasattr(brgi_db, "LonLatHeight"):
        tables["LonLatHeight"] = brgi_db.LonLatHeight

    sample_table = brgi_db.Sample
    if sample_table is not None:
        tables["Sample"] = sample_table

    for table_group in (brgi_db.InSituTests, brgi_db.LabTests, brgi_db.Other):
        tables.update(table_group.items())

    return tables
|
236
|
+
|
237
|
+
|
238
|
+
def convert_object_col_content_to_string(
    df: pd.DataFrame, in_place: bool = True
) -> pd.DataFrame:
    """Turn the content of every object-dtype column into strings.

    This is needed because pandas and marimo are a little finicky about strings:
        1. The built-in pd.Dataframe.convert_dtypes() method doesn't convert the
           dtype of columns that contain multiple types in that same column to string.
        2. marimo cannot handle pd.DataFrames with nullable strings (and other
           nullable pandas dtypes) very well,
           see https://github.com/marimo-team/marimo/issues/5445.

    Therefore the values in object-dtype columns are first cast to the pandas
    "string" dtype and then cast back to the object dtype, so the columns keep
    the object dtype but hold string values.

    Args:
        df: The DataFrame to modify.
        in_place: Whether to modify the DataFrame in-place (default) or return a new DataFrame.

    Returns:
        pd.DataFrame: `df` itself when `in_place` is True, otherwise a modified
            copy; in both cases with the content of object columns converted to strings.

    """
    target = df if in_place else df.copy()
    object_cols = target.select_dtypes(include=["object"]).columns

    as_strings = target[object_cols].astype("string")
    target[object_cols] = as_strings
    back_to_object = target[object_cols].astype("object")
    target[object_cols] = back_to_object
    return target
|
266
|
+
|
267
|
+
|
268
|
+
def geodf_to_df(geodf: gpd.GeoDataFrame) -> pd.DataFrame:
    """Convert a GeoDataFrame to a plain DataFrame with the geometry as text.

    Convenience function for nicer display in notebook environments like marimo.
    The input is copied, so it is left untouched. The geometry is read from the
    column named "geometry" and replaced by its string representation.

    Args:
        geodf: The GeoDataFrame to convert.

    Returns:
        pd.DataFrame: A DataFrame whose "geometry" column holds strings.
    """
    plain_df = pd.DataFrame(geodf.copy())
    geometry_as_text = plain_df.geometry.astype(str)
    return plain_df.assign(geometry=geometry_as_text)
|