water-column-sonar-processing 25.3.1__py3-none-any.whl → 25.8.0__py3-none-any.whl
- water_column_sonar_processing/aws/dynamodb_manager.py +6 -6
- water_column_sonar_processing/aws/s3_manager.py +95 -90
- water_column_sonar_processing/aws/s3fs_manager.py +5 -3
- water_column_sonar_processing/aws/sqs_manager.py +1 -1
- water_column_sonar_processing/cruise/__init__.py +2 -1
- water_column_sonar_processing/cruise/create_empty_zarr_store.py +49 -43
- water_column_sonar_processing/cruise/create_empty_zarr_store_level_3.py +161 -0
- water_column_sonar_processing/cruise/datatree_manager.py +21 -21
- water_column_sonar_processing/cruise/resample_regrid.py +57 -47
- water_column_sonar_processing/dataset/__init__.py +3 -0
- water_column_sonar_processing/dataset/dataset_manager.py +205 -0
- water_column_sonar_processing/dataset/feature_manager.py +32 -0
- water_column_sonar_processing/geometry/geometry_manager.py +11 -12
- water_column_sonar_processing/geometry/line_simplification.py +26 -1
- water_column_sonar_processing/geometry/pmtile_generation.py +211 -247
- water_column_sonar_processing/index/index_manager.py +18 -17
- water_column_sonar_processing/model/zarr_manager.py +504 -256
- water_column_sonar_processing/processing/__init__.py +3 -2
- water_column_sonar_processing/processing/batch_downloader.py +11 -11
- water_column_sonar_processing/processing/raw_to_netcdf.py +319 -0
- water_column_sonar_processing/processing/raw_to_zarr.py +41 -31
- water_column_sonar_processing/utility/__init__.py +9 -2
- water_column_sonar_processing/utility/cleaner.py +1 -2
- water_column_sonar_processing/utility/constants.py +26 -7
- water_column_sonar_processing/utility/timestamp.py +1 -0
- water_column_sonar_processing-25.8.0.dist-info/METADATA +162 -0
- water_column_sonar_processing-25.8.0.dist-info/RECORD +39 -0
- {water_column_sonar_processing-25.3.1.dist-info → water_column_sonar_processing-25.8.0.dist-info}/WHEEL +1 -1
- water_column_sonar_processing-25.3.1.dist-info/licenses/LICENSE → water_column_sonar_processing-25.8.0.dist-info/licenses/LICENSE-MIT +1 -1
- water_column_sonar_processing-25.3.1.dist-info/METADATA +0 -170
- water_column_sonar_processing-25.3.1.dist-info/RECORD +0 -34
- {water_column_sonar_processing-25.3.1.dist-info → water_column_sonar_processing-25.8.0.dist-info}/top_level.txt +0 -0
water_column_sonar_processing/cruise/create_empty_zarr_store_level_3.py
@@ -0,0 +1,161 @@
+import os
+import tempfile
+
+import numpy as np
+
+from water_column_sonar_processing.aws import DynamoDBManager, S3Manager
+from water_column_sonar_processing.model import ZarrManager
+from water_column_sonar_processing.utility import Cleaner
+
+
+class CreateEmptyZarrStoreLevel3:
+    #######################################################
+    def __init__(
+        self,
+    ):
+        self.__overwrite = True
+
+    #######################################################
+    # TODO: move this to the s3_manager
+    def upload_zarr_store_to_s3(
+        self,
+        output_bucket_name: str,
+        local_directory: str,
+        object_prefix: str,  # TODO: add level
+        cruise_name: str,
+    ) -> None:
+        print("uploading model store to s3")
+        s3_manager = S3Manager()
+        #
+        print("Starting upload with thread pool executor.")
+        # # 'all_files' is passed a list of lists: [[local_path, s3_key], [...], ...]
+        all_files = []
+        for subdir, dirs, files in os.walk(f"{local_directory}/{cruise_name}.zarr"):
+            for file in files:
+                local_path = os.path.join(subdir, file)
+                # TODO: find a better method for splitting strings here:
+                # 'level_2/Henry_B._Bigelow/HB0806/EK60/HB0806.zarr/.zattrs'
+                s3_key = f"{object_prefix}/{cruise_name}.zarr{local_path.split(f'{cruise_name}.zarr')[-1]}"
+                all_files.append([local_path, s3_key])
+        #
+        # print(all_files)
+        s3_manager.upload_files_with_thread_pool_executor(
+            output_bucket_name=output_bucket_name,
+            all_files=all_files,
+        )
+        print("Done uploading with thread pool executor.")
+        # TODO: move to common place
+
+    #######################################################
+    def create_cruise_level_zarr_store_level_3(
+        self,
+        output_bucket_name: str,
+        ship_name: str,
+        cruise_name: str,
+        sensor_name: str,
+        table_name: str,
+    ) -> None:
+        tempdir = tempfile.TemporaryDirectory()
+        try:
+            dynamo_db_manager = DynamoDBManager()
+            s3_manager = S3Manager()
+            df = dynamo_db_manager.get_table_as_df(
+                table_name=table_name,
+                cruise_name=cruise_name,
+            )
+
+            # TODO: filter the dataframe just for enums >= LEVEL_1_PROCESSING
+
+            print(f"DataFrame shape: {df.shape}")
+            cruise_channels = list(
+                set([i for sublist in df["CHANNELS"].dropna() for i in sublist])
+            )
+            cruise_channels.sort()
+
+            consolidated_zarr_width = np.sum(
+                df["NUM_PING_TIME_DROPNA"].dropna().astype(int)
+            )
+
+            # [3] calculate the max/min measurement resolutions for the whole cruise
+            cruise_min_echo_range = np.min(
+                (df["MIN_ECHO_RANGE"] + df["WATER_LEVEL"]).dropna().astype(float)
+            )
+
+            # [4] calculate the maximum of the max depth values
+            cruise_max_echo_range = np.max(
+                (df["MAX_ECHO_RANGE"] + df["WATER_LEVEL"]).dropna().astype(float)
+            )
+            cruise_max_echo_range = np.ceil(cruise_max_echo_range)
+            cruise_min_epsilon = 1.0  # np.min(df["MIN_ECHO_RANGE"].dropna().astype(float))  # TODO: set to 1m
+
+            print(
+                f"cruise_min_echo_range: {cruise_min_echo_range}, cruise_max_echo_range: {cruise_max_echo_range}"
+            )
+
+            # [5] get number of channels
+            cruise_frequencies = [
+                float(i) for i in df["FREQUENCIES"].dropna().values.flatten()[0]
+            ]
+            print(cruise_frequencies)
+
+            new_width = int(consolidated_zarr_width)
+            print(f"new_width: {new_width}")
+            #################################################################
+            store_name = f"{cruise_name}.zarr"
+            print(store_name)
+            ################################################################
+            # Delete existing model store if it exists
+            zarr_prefix = os.path.join("level_3", ship_name, cruise_name, sensor_name)
+            child_objects = s3_manager.get_child_objects(
+                bucket_name=output_bucket_name,
+                sub_prefix=zarr_prefix,
+            )
+            if len(child_objects) > 0:
+                s3_manager.delete_nodd_objects(
+                    bucket_name=output_bucket_name,
+                    objects=child_objects,
+                )
+            ################################################################
+            # Create new model store
+            zarr_manager = ZarrManager()
+            new_height = len(
+                zarr_manager.get_depth_values(
+                    # min_echo_range=cruise_min_echo_range,
+                    max_echo_range=cruise_max_echo_range,
+                    cruise_min_epsilon=cruise_min_epsilon,
+                )
+            )
+            print(f"new_height: {new_height}")
+
+            zarr_manager.create_zarr_store_level_3(
+                path=tempdir.name,  # TODO: need to use .name or problem
+                ship_name=ship_name,
+                cruise_name=cruise_name,
+                sensor_name=sensor_name,
+                frequencies=cruise_frequencies,
+                width=new_width,
+                min_echo_range=cruise_min_echo_range,
+                max_echo_range=cruise_max_echo_range,
+                cruise_min_epsilon=cruise_min_epsilon,
+                calibration_status=True,
+            )
+            #################################################################
+            self.upload_zarr_store_to_s3(
+                output_bucket_name=output_bucket_name,
+                local_directory=tempdir.name,  # TODO: need to use .name or problem
+                object_prefix=zarr_prefix,
+                cruise_name=cruise_name,
+            )
+            print("Done creating cruise level zarr store.")
+            #################################################################
+        except Exception as err:
+            raise RuntimeError(
+                f"Problem trying to create new cruise model store, {err}"
+            )
+        finally:
+            cleaner = Cleaner()
+            cleaner.delete_local_files()
+            print("Done creating cruise level model store")
+
+
+###########################################################
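Taken together, the new `create_cruise_level_zarr_store_level_3` reads the cruise's per-file metadata out of DynamoDB, derives the store width (total ping count) and height (depth bins at a 1 m epsilon), writes an empty level-3 Zarr store to a temp directory, and uploads it under `level_3/{ship}/{cruise}/{sensor}`. A minimal usage sketch, assuming the class is re-exported from `water_column_sonar_processing.cruise` (consistent with the `cruise/__init__.py` change listed above) and that AWS credentials plus a populated metadata table exist; the bucket and table names below are hypothetical placeholders:

    from water_column_sonar_processing.cruise import CreateEmptyZarrStoreLevel3

    creator = CreateEmptyZarrStoreLevel3()
    creator.create_cruise_level_zarr_store_level_3(
        output_bucket_name="example-output-bucket",  # hypothetical bucket
        ship_name="Henry_B._Bigelow",  # ship/cruise/sensor taken from comments in the diff
        cruise_name="HB0806",
        sensor_name="EK60",
        table_name="example-cruise-table",  # hypothetical DynamoDB table
    )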
water_column_sonar_processing/cruise/datatree_manager.py
@@ -1,21 +1,21 @@
-### https://xarray-datatree.readthedocs.io/en/latest/data-structures.html
-import xarray as xr
-from datatree import DataTree
-
-
-class DatatreeManager:
-    #######################################################
-    def __init__(
-        self,
-    ):
-        self.dtype = "float32"
-
-    #################################################################
-    def create_datatree(
-        self,
-        input_ds,
-    ) -> None:
-        ds1 = xr.Dataset({"foo": "orange"})
-        dt = DataTree(name="root", dataset=ds1)  # create root node
-        # ds2 = xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})
-        return dt
+# ### https://xarray-datatree.readthedocs.io/en/latest/data-structures.html
+# import xarray as xr
+# from datatree import DataTree
+#
+#
+# class DatatreeManager:
+#     #######################################################
+#     def __init__(
+#         self,
+#     ):
+#         self.dtype = "float32"
+#
+#     #################################################################
+#     def create_datatree(
+#         self,
+#         input_ds,
+#     ) -> None:
+#         ds1 = xr.Dataset({"foo": "orange"})
+#         dt = DataTree(name="root", dataset=ds1)  # create root node
+#         # ds2 = xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})
+#         return dt
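The whole module is commented out rather than deleted, likely because the standalone `datatree` package it imported was archived upstream once `DataTree` was folded into xarray itself (around xarray 2024.10). If the manager is revived, a sketch of the equivalent root-node construction against the upstreamed API, assuming a recent xarray:

    import xarray as xr

    ds1 = xr.Dataset({"foo": "orange"})
    dt = xr.DataTree(name="root", dataset=ds1)  # same shape as the removed code
    print(dt)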
water_column_sonar_processing/cruise/resample_regrid.py
@@ -1,4 +1,5 @@
 import gc
+import warnings
 from pathlib import Path
 
 import numcodecs
@@ -10,13 +11,15 @@ from water_column_sonar_processing.aws import DynamoDBManager
 from water_column_sonar_processing.geometry import GeometryManager
 from water_column_sonar_processing.model import ZarrManager
 
+warnings.simplefilter("ignore", category=RuntimeWarning)
+
 numcodecs.blosc.use_threads = False
 numcodecs.blosc.set_nthreads(1)
 
 
 # TODO: when ready switch to version 3 of model spec
 # ZARR_V3_EXPERIMENTAL_API = 1
-# creates the latlon
+# creates the latlon dataset: foo = ep.consolidate.add_location(ds_Sv, echodata)
 
 
 class ResampleRegrid:
@@ -34,10 +37,13 @@ class ResampleRegrid:
         self,
         input_xr,
         ping_times,
-        all_cruise_depth_values,
-        water_level,
+        all_cruise_depth_values,  # includes water_level offset
+        water_level,  # this is the offset that will be added to each respective file
     ) -> np.ndarray:
-
+        """
+        What gets passed into interpolate data
+        """
+        print("Interpolating dataset.")
         try:
             data = np.empty(
                 (
@@ -50,31 +56,38 @@
 
             data[:] = np.nan
 
-            regrid_resample = xr.DataArray(
+            regrid_resample = xr.DataArray(  # where data will be written to
                 data=data,
                 dims=("depth", "time", "frequency"),
                 coords={
-                    "depth": all_cruise_depth_values,
+                    "depth": all_cruise_depth_values,
                     "time": ping_times,
                     "frequency": input_xr.frequency_nominal.values,
                 },
             )
 
+            # shift the input data by water_level
+            input_xr.echo_range.values = (
+                input_xr.echo_range.values + water_level
+            )  # water_level # TODO: change
+
             channels = input_xr.channel.values
             for channel in range(
                 len(channels)
             ):  # ?TODO: leaving off here, need to subset for just indices in time axis
                 gc.collect()
                 max_depths = np.nanmax(
-                    a=input_xr.echo_range.sel(channel=input_xr.channel[channel]).values
-                    + water_level,
+                    a=input_xr.echo_range.sel(channel=input_xr.channel[channel]).values,
+                    # + water_level,
                    axis=1,
                )
-                superset_of_max_depths = set(max_depths)
+                superset_of_max_depths = set(
+                    max_depths
+                )  # HB1501, D20150503-T102035.raw, TypeError: unhashable type: 'numpy.ndarray'
                 set_of_max_depths = list(
                     {x for x in superset_of_max_depths if x == x}
                 )  # removes nan's
-                # iterate through partitions of
+                # iterate through partitions of dataset with similar depths and resample
                 for select_max_depth in set_of_max_depths:
                     # TODO: for nan just skip and leave all nan's
                     select_indices = [
@@ -120,9 +133,8 @@
                 print(f"updated {len(times_select)} ping times")
                 gc.collect()
         except Exception as err:
-
-
-            print("Done interpolating data.")
+            raise RuntimeError(f"Problem finding the dynamodb table, {err}")
+            print("Done interpolating dataset.")
         return regrid_resample.values.copy()
 
     #################################################################
@@ -132,18 +144,18 @@
        cruise_name,
        sensor_name,
        table_name,
-
-        bucket_name,  # TODO: this is the same bucket
+        bucket_name,
        override_select_files=None,
+        # override_cruise_min_epsilon=None,
        endpoint_url=None,
    ) -> None:
        """
-        The goal here is to interpolate the data against the depth values already populated
+        The goal here is to interpolate the dataset against the depth values already populated
        in the existing file level model stores. We open the cruise-level store with model for
        read/write operations. We open the file-level store with Xarray to leverage tools for
-        resampling and subsetting the data.
+        resampling and subsetting the dataset.
        """
-        print("Resample Regrid, Interpolating data.")
+        print("Resample Regrid, Interpolating dataset.")
        try:
            zarr_manager = ZarrManager()
            geo_manager = GeometryManager()
@@ -192,7 +204,7 @@
                ]
            )
 
-            # Get input store
+            # Get input store — this is unadjusted for water_level
            input_xr_zarr_store = zarr_manager.open_s3_zarr_store_with_xarray(
                ship_name=ship_name,
                cruise_name=cruise_name,
@@ -202,12 +214,15 @@
                endpoint_url=endpoint_url,
            )
 
-            # This is the
+            # This is the vertical offset of the sensor related to the ocean surface
            # See https://echopype.readthedocs.io/en/stable/data-proc-additional.html
-            water_level = input_xr_zarr_store.water_level.values
+            if "water_level" in input_xr_zarr_store.keys():
+                water_level = input_xr_zarr_store.water_level.values
+            else:
+                water_level = 0.0
            #########################################################################
-            # [3] Get needed indices
-            # Offset from start index to insert new
+            # [3] Get needed time indices — along the x-axis
+            # Offset from start index to insert new dataset. Note that missing values are excluded.
            ping_time_cumsum = np.insert(
                np.cumsum(
                    cruise_df["NUM_PING_TIME_DROPNA"].dropna().to_numpy(dtype=int)
@@ -218,11 +233,6 @@
            start_ping_time_index = ping_time_cumsum[index]
            end_ping_time_index = ping_time_cumsum[index + 1]
 
-            min_echo_range = np.min(
-                (cruise_df["MIN_ECHO_RANGE"] + cruise_df["WATER_LEVEL"])
-                .dropna()
-                .astype(float)
-            )
            max_echo_range = np.max(
                (cruise_df["MAX_ECHO_RANGE"] + cruise_df["WATER_LEVEL"])
                .dropna()
@@ -233,9 +243,9 @@
            )
 
            # Note: cruise dims (depth, time, frequency)
-            all_cruise_depth_values = zarr_manager.get_depth_values(
-                min_echo_range=min_echo_range,
-                max_echo_range=max_echo_range,
+            all_cruise_depth_values = zarr_manager.get_depth_values(  # needs to integrate water_level
+                # min_echo_range=min_echo_range,
+                max_echo_range=max_echo_range,  # does it here
                cruise_min_epsilon=cruise_min_epsilon,  # remove this & integrate into min_echo_range
            )  # with offset of 7.5 meters, 0 meter measurement should now start at 7.5 meters
 
@@ -257,7 +267,9 @@
                output_bucket_name=bucket_name,
            )
 
-            input_xr = input_xr_zarr_store.isel(ping_time=indices)
+            input_xr = input_xr_zarr_store.isel(
+                ping_time=indices
+            )  # Problem with HB200802-D20080310-T174959.zarr/
 
            ping_times = input_xr.ping_time.values
            # Date format: numpy.datetime64('2007-07-20T02:10:25.845073920') converts to "1184897425.845074"
@@ -270,13 +282,11 @@
            )
 
            # --- UPDATING --- #
-            regrid_resample = (
-                self.interpolate_data(
-                    input_xr=input_xr,
-                    ping_times=ping_times,
-                    all_cruise_depth_values=all_cruise_depth_values,
-                    water_level=water_level,
-                )
+            regrid_resample = self.interpolate_data(
+                input_xr=input_xr,
+                ping_times=ping_times,
+                all_cruise_depth_values=all_cruise_depth_values,  # should accommodate the water_level already
+                water_level=water_level,  # not applied to anything yet
            )
 
            print(
@@ -296,15 +306,16 @@
            # TODO: Only checking the first channel for now. Need to average across all channels
            # in the future. See https://github.com/CI-CMG/water-column-sonar-processing/issues/11
            if "detected_seafloor_depth" in input_xr.variables:
-                print(
+                print(
+                    "Found detected_seafloor_depth, adding dataset to output store."
+                )
                detected_seafloor_depth = input_xr.detected_seafloor_depth.values
                detected_seafloor_depth[detected_seafloor_depth == 0.0] = np.nan
                # TODO: problem here: Processing file: D20070711-T210709.
 
-
-
-
-                # RuntimeWarning: Mean of empty slice detected_seafloor_depths = np.nanmean(detected_seafloor_depth, 0)
+                # Use the lowest frequencies to determine bottom
+                detected_seafloor_depths = detected_seafloor_depth[0, :]
+
                detected_seafloor_depths[detected_seafloor_depths == 0.0] = np.nan
                print(f"min depth measured: {np.nanmin(detected_seafloor_depths)}")
                print(f"max depth measured: {np.nanmax(detected_seafloor_depths)}")
@@ -326,11 +337,10 @@
            #########################################################################
            #########################################################################
        except Exception as err:
-
-            raise err
+            raise RuntimeError(f"Problem with resample_regrid, {err}")
        finally:
            print("Exiting resample_regrid.")
-            # TODO: read across times and verify
+            # TODO: read across times and verify dataset was written?
 
    #######################################################
 
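The substantive change in this file is where the vertical offset is applied: `echo_range` values are now shifted by `water_level` once, up front, instead of adding the offset inside the per-channel `np.nanmax`, and the cruise depth grid is built so it already accommodates that offset (a sensor 7.5 m below the surface means its 0 m sample sits at 7.5 m depth, per the comment above). A toy sketch of the shift-then-regrid idea in plain NumPy; the numbers are invented and `np.interp` stands in for the package's own resampling:

    import numpy as np

    water_level = 7.5  # example transducer offset from the diff's comments
    echo_range = np.array([0.0, 0.5, 1.0, 1.5])  # sensor-relative sample ranges, one ping
    sv = np.array([-70.0, -65.0, -60.0, -55.0])  # the ping's Sv values
    depth_grid = np.arange(0.0, 11.0, 1.0)  # cruise-level depth values, 1 m epsilon

    depths = echo_range + water_level  # shift to surface-relative depths
    regridded = np.interp(depth_grid, depths, sv, left=np.nan, right=np.nan)
    # grid points shallower than 7.5 m or deeper than 9.0 m stay NaN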
water_column_sonar_processing/dataset/dataset_manager.py
@@ -0,0 +1,205 @@
+from typing import Optional
+
+import numpy as np
+import xarray as xr
+import xbatcher
+
+from water_column_sonar_processing.aws import S3FSManager
+from water_column_sonar_processing.utility.constants import BatchShape
+
+
+class DatasetManager:
+    """
+    Dataset manager does three things.
+    1) Opens zarr store in s3 bucket with xarray and returns masked dataset
+    2) Loads Xarray DataSet with Xbatcher
+    3) Loads Xbatcher batches into tensorflow dataset
+    """
+
+    def __init__(
+        self,
+        bucket_name: str,
+        ship_name: str,
+        cruise_name: str,
+        sensor_name: str,
+        endpoint_url: Optional[str] = None,
+    ):
+        self.bucket_name = bucket_name
+        self.ship_name = ship_name
+        self.cruise_name = cruise_name
+        self.sensor_name = sensor_name
+        self.endpoint_url = endpoint_url
+        self.dtype = "float32"
+
+    def open_xarray_dataset(
+        self,
+        mask: bool = True,
+    ) -> xr.Dataset:
+        # Opens Zarr store in s3 bucket as Xarray Dataset and masks as needed
+        try:
+            s3_path = f"s3://{self.bucket_name}/level_2/{self.ship_name}/{self.cruise_name}/{self.sensor_name}/{self.cruise_name}.zarr"
+
+            s3fs_manager = S3FSManager(endpoint_url=self.endpoint_url)
+            store_s3_map = s3fs_manager.s3_map(s3_zarr_store_path=s3_path)
+
+            ds = xr.open_dataset(
+                filename_or_obj=store_s3_map,
+                engine="zarr",
+                # backend_kwargs={'storage_options': {'anon': True}},
+                chunks={},
+                cache=False,
+            )
+
+            # Mask all sub-bottom dataset
+            if mask:
+                return ds.where(ds.depth < ds.bottom)
+
+            return ds
+        except Exception as err:
+            raise RuntimeError(f"Problem opening Zarr store from S3 with Xarray, {err}")
+
+    def vector_indices(
+        self,
+        first_index: int,
+        last_index: int,
+        step: int,
+    ):
+        starts = np.arange(first_index, last_index, step)
+        ends = np.arange(step, last_index + 1, step)
+        return list(zip(starts, ends))
+
+    def dataset_batcher(
+        self,
+    ):
+        """
+        Opens a dataset and creates a generator that returns different chunks of data for processing.
+        # TODO: get subset of cruise
+        # TODO: if beneath bottom skip
+        # TODO: preprocess? scale/normalize?
+        # TODO: add in features
+        # TODO: pass sv dataset
+        """
+        try:
+            # open zarr store
+            # sv_dataset = self.open_xarray_dataset(mask=True)
+
+            # patch_input_dims = {"depth": 1, "time": 2, "frequency": 3}
+
+            # define bounds
+            outline_dims = {"depth": 7, "time": 4, "frequency": 2}
+
+            bottom = np.array([5, np.nan, 3, 2])  # for nan should sample all depths
+
+            for f in self.vector_indices(0, outline_dims["frequency"] + 1, 2):
+                for t in self.vector_indices(0, outline_dims["time"] + 1, 2):
+                    for d in self.vector_indices(0, outline_dims["depth"] + 1, 2):
+                        indices = f"[d: {d}, t: {t}, f: {f}]"
+
+                        if np.isnan(bottom[t]) or d > bottom[t]:
+                            print("_+_+_+subbottom_+_+_+")
+                            continue
+
+                        yield indices
+            # # generate
+            # for f in np.arange(0, outline_dims['frequency'] + 1, 2):
+            #     for t in np.arange(0, outline_dims['time'] + 1, 2):
+            #         for d in np.arange(0, outline_dims['depth'] + 1, 2):
+            #             indices = f"[d: {d}, t: {t}, f: {f}]"
+            #             # TODO: get subset of cruise
+            #             # TODO: if beneath bottom skip
+            #             if np.isnan(bottom[t]) or d > bottom[t]:
+            #                 print('_+_+_+subbottom_+_+_+')
+            #                 continue
+            #             # TODO: preprocess? scale/normalize?
+            #             # TODO: add in features
+            #             # TODO: pass sv dataset
+            #             yield indices
+
+        except Exception as err:
+            raise RuntimeError(f"Problem defining dataset_batcher, {err}")
+
+    # @deprecated("We cannot use xbatcher")
+    def setup_xbatcher(
+        self,
+        bucket_name: str,
+        ship_name: str,
+        cruise_name: str,
+        sensor_name: str,
+        endpoint_url: str = None,
+    ):
+        # -> xbatcher.generators.BatchGenerator:
+        try:
+            sv_dataset = self.open_xarray_dataset(
+                bucket_name=bucket_name,
+                ship_name=ship_name,
+                cruise_name=cruise_name,
+                sensor_name=sensor_name,
+                endpoint_url=endpoint_url,
+            )
+            patch_input_dims = dict(
+                depth=BatchShape.DEPTH.value,
+                time=BatchShape.TIME.value,
+                frequency=BatchShape.FREQUENCY.value,
+            )
+            patch_input_overlap = dict(depth=0, time=0, frequency=0)
+            batch_generator = xbatcher.generators.BatchGenerator(
+                ds=sv_dataset.Sv,  # TODO: need to get the depth out of this somehow?
+                input_dims=patch_input_dims,
+                input_overlap=patch_input_overlap,
+                # batch_dims={ "depth": 8, "time": 8, "frequency": 4 },  # no idea what this is doing
+                concat_input_dims=False,
+                preload_batch=False,  # Load each batch dynamically
+                cache=None,  # TODO: figure this out
+                # cache_preprocess=preprocess_batch,  # https://xbatcher.readthedocs.io/en/latest/user-guide/caching.html
+            )
+            return batch_generator
+        except Exception as err:
+            raise RuntimeError(f"Problem setting up xbatcher, {err}")
+
+    # @deprecated("We cannot use xbatcher")
+    # def create_keras_dataloader(
+    #     self,
+    #     bucket_name: str,
+    #     ship_name: str,
+    #     cruise_name: str,
+    #     sensor_name: str,
+    #     endpoint_url: str = None,
+    #     batch_size: int = 3,
+    # ):
+    #     pass
+    #     x_batch_generator = self.setup_xbatcher(
+    #         bucket_name=bucket_name,
+    #         ship_name=ship_name,
+    #         cruise_name=cruise_name,  # TODO: move all these to constructor
+    #         sensor_name=sensor_name,
+    #         endpoint_url=endpoint_url,
+    #     )
+    #
+    #     def transform(
+    #         x,
+    #     ):  # TODO: do clip and normalize here... [-100, 0] w mean at -65, clip?
+    #         # return x + 1e-6  # (x + 50.) / 100.
+    #         # return np.clip(x, -60, -50)
+    #         return (x + 50.) / 100.
+    #
+    #     keras_dataset = xbatcher.loaders.keras.CustomTFDataset(
+    #         X_generator=x_batch_generator,
+    #         y_generator=x_batch_generator,
+    #         transform=transform,
+    #         target_transform=transform,
+    #     )
+    #
+    #     output_signature = tensorflow.TensorSpec(
+    #         shape=(
+    #             BatchShape.DEPTH.value,  # 2
+    #             BatchShape.TIME.value,  # 3
+    #             BatchShape.FREQUENCY.value,  # 4
+    #         ),
+    #         dtype=tensorflow.float32,
+    #     )
+    #     train_dataloader = tensorflow.data.Dataset.from_generator(
+    #         generator=lambda: iter(keras_dataset),
+    #         output_signature=(output_signature, output_signature),
+    #     )
+    #
+    #     return train_dataloader.batch(batch_size=BatchShape.BATCH_SIZE.value)  # 5
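`vector_indices` is what drives the hand-rolled batcher in `dataset_batcher`: it zips evenly spaced starts and ends into window bounds. The same arithmetic shown standalone, mirroring the method body above:

    import numpy as np

    def vector_indices(first_index, last_index, step):
        starts = np.arange(first_index, last_index, step)
        ends = np.arange(step, last_index + 1, step)
        return list(zip(starts, ends))

    print(vector_indices(0, 8, 2))
    # -> [(0, 2), (2, 4), (4, 6), (6, 8)]  (tuple elements are numpy ints)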
water_column_sonar_processing/dataset/feature_manager.py
@@ -0,0 +1,32 @@
+from typing import Optional
+
+import xarray as xr
+
+
+class DatasetManager:
+    """
+    Enrich the dataset with features
+    """
+
+    def __init__(
+        self,
+        bucket_name: str,
+        ship_name: str,
+        cruise_name: str,
+        sensor_name: str,
+        endpoint_url: Optional[str] = None,
+    ):
+        self.bucket_name = bucket_name
+        self.ship_name = ship_name
+        self.cruise_name = cruise_name
+        self.sensor_name = sensor_name
+        self.endpoint_url = endpoint_url
+
+    def add_features(
+        self,
+    ) -> xr.Dataset:
+        # Opens Zarr store in s3 bucket as Xarray Dataset and masks as needed
+        try:
+            pass
+        except Exception as err:
+            raise RuntimeError(f"Problem opening Zarr store from S3 with Xarray, {err}")