water-column-sonar-processing 0.0.9__py3-none-any.whl → 26.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- water_column_sonar_processing/aws/dynamodb_manager.py +138 -59
- water_column_sonar_processing/aws/s3_manager.py +179 -141
- water_column_sonar_processing/aws/s3fs_manager.py +29 -33
- water_column_sonar_processing/aws/sqs_manager.py +1 -1
- water_column_sonar_processing/cruise/create_empty_zarr_store.py +35 -96
- water_column_sonar_processing/cruise/datatree_manager.py +21 -0
- water_column_sonar_processing/cruise/resample_regrid.py +142 -127
- water_column_sonar_processing/geometry/__init__.py +10 -2
- water_column_sonar_processing/geometry/elevation_manager.py +111 -0
- water_column_sonar_processing/geometry/geometry_manager.py +50 -49
- water_column_sonar_processing/geometry/line_simplification.py +176 -0
- water_column_sonar_processing/geometry/pmtile_generation.py +227 -223
- water_column_sonar_processing/geometry/spatiotemporal.py +106 -0
- water_column_sonar_processing/index/index_manager.py +151 -33
- water_column_sonar_processing/model/zarr_manager.py +665 -262
- water_column_sonar_processing/processing/__init__.py +3 -3
- water_column_sonar_processing/processing/raw_to_netcdf.py +320 -0
- water_column_sonar_processing/processing/raw_to_zarr.py +206 -214
- water_column_sonar_processing/utility/__init__.py +9 -2
- water_column_sonar_processing/utility/constants.py +69 -18
- water_column_sonar_processing/utility/pipeline_status.py +11 -15
- water_column_sonar_processing/utility/timestamp.py +3 -4
- water_column_sonar_processing-26.1.9.dist-info/METADATA +239 -0
- water_column_sonar_processing-26.1.9.dist-info/RECORD +34 -0
- {water_column_sonar_processing-0.0.9.dist-info → water_column_sonar_processing-26.1.9.dist-info}/WHEEL +1 -1
- {water_column_sonar_processing-0.0.9.dist-info → water_column_sonar_processing-26.1.9.dist-info/licenses}/LICENSE +1 -1
- water_column_sonar_processing/geometry/geometry_simplification.py +0 -82
- water_column_sonar_processing/process.py +0 -147
- water_column_sonar_processing/processing/cruise_sampler.py +0 -342
- water_column_sonar_processing-0.0.9.dist-info/METADATA +0 -134
- water_column_sonar_processing-0.0.9.dist-info/RECORD +0 -32
- {water_column_sonar_processing-0.0.9.dist-info → water_column_sonar_processing-26.1.9.dist-info}/top_level.txt +0 -0
water_column_sonar_processing/cruise/create_empty_zarr_store.py
@@ -1,21 +1,13 @@
 import os
+import tempfile
 
-import numcodecs
 import numpy as np
 
-from water_column_sonar_processing.
-from water_column_sonar_processing.aws import S3Manager
+from water_column_sonar_processing.utility import Constants
+from water_column_sonar_processing.aws import DynamoDBManager, S3Manager
 from water_column_sonar_processing.model import ZarrManager
 from water_column_sonar_processing.utility import Cleaner
 
-numcodecs.blosc.use_threads = False
-numcodecs.blosc.set_nthreads(1)
-
-# TEMPDIR = "/tmp"
-# TODO: when ready switch to version 3 of model spec
-# ZARR_V3_EXPERIMENTAL_API = 1
-# creates the latlon data: foo = ep.consolidate.add_location(ds_Sv, echodata)
-
 
 # TODO: change name to "CreateLocalEmptyZarrStore"
 class CreateEmptyZarrStore:
@@ -24,65 +16,37 @@ class CreateEmptyZarrStore:
         self,
     ):
         self.__overwrite = True
-        self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
-        self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
-
-    #######################################################
-
-    # TODO: move this to the s3_manager
-    def upload_zarr_store_to_s3(
-        self,
-        local_directory: str,
-        object_prefix: str,
-        cruise_name: str,
-    ) -> None:
-        print("uploading model store to s3")
-        s3_manager = S3Manager()
-        #
-        print("Starting upload with thread pool executor.")
-        # # 'all_files' is passed a list of lists: [[local_path, s3_key], [...], ...]
-        all_files = []
-        for subdir, dirs, files in os.walk(
-            f"{local_directory}/{cruise_name}.zarr_manager"
-        ):
-            for file in files:
-                local_path = os.path.join(subdir, file)
-                # 'level_2/Henry_B._Bigelow/HB0806/EK60/HB0806.model/.zattrs'
-                s3_key = f'{object_prefix}/{cruise_name}.model{local_path.split(f"{cruise_name}.model")[-1]}'
-                all_files.append([local_path, s3_key])
-        #
-        # print(all_files)
-        s3_manager.upload_files_with_thread_pool_executor(
-            all_files=all_files,
-        )
-        print("Done uploading with thread pool executor.")
-    # TODO: move to common place
+        # self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
+        # self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
 
     #######################################################
+    @staticmethod
     def create_cruise_level_zarr_store(
-
+        output_bucket_name: str,
         ship_name: str,
         cruise_name: str,
         sensor_name: str,
         table_name: str,
-        tempdir: str,
     ) -> None:
+        """
+        Initialize zarr store for the entire cruise which aggregates all the raw data.
+        All cruises will be resampled at 20 cm depth.
+        # tempdir="/tmp", # TODO: create better tmp directory for testing
+        """
+        tempdir = tempfile.TemporaryDirectory()
         try:
-            # HB0806 - 123, HB0903 - 220
             dynamo_db_manager = DynamoDBManager()
             s3_manager = S3Manager()
 
             df = dynamo_db_manager.get_table_as_df(
                 table_name=table_name,
-                ship_name=ship_name,
                 cruise_name=cruise_name,
-                sensor_name=sensor_name,
             )
 
             # TODO: filter the dataframe just for enums >= LEVEL_1_PROCESSING
             # df[df['PIPELINE_STATUS'] < PipelineStatus.LEVEL_1_PROCESSING] = np.nan
 
-            # TODO: VERIFY GEOJSON EXISTS as prerequisite!!!
+            # TODO: VERIFY GEOJSON EXISTS as prerequisite!!! ...no more geojson needed
 
             print(f"DataFrame shape: {df.shape}")
             cruise_channels = list(
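The new docstring pins every cruise to a 20 cm vertical resolution, and the hunk below drops both `cruise_min_echo_range` and the explicit `new_height` computation, leaving only the cruise-wide maximum echo range to size the depth axis. A minimal sketch of what a fixed-resolution depth grid might look like, assuming `ZarrManager.get_depth_values` builds something along these lines; `build_depth_grid` is an illustrative name, not the package's API:

```python
import numpy as np


def build_depth_grid(max_echo_range: float, resolution: float = 0.20) -> np.ndarray:
    """Hypothetical stand-in for ZarrManager.get_depth_values: a uniform depth
    axis that starts one bin below the surface and reaches past the deepest echo."""
    # use an integer bin count so float-step drift (as with np.arange) cannot occur
    n_bins = int(np.ceil(max_echo_range / resolution)) + 1
    return np.linspace(resolution, resolution * n_bins, num=n_bins, dtype=np.float32)


# A cruise whose deepest MAX_ECHO_RANGE is 502.3 m gets 2513 bins of 0.20 m,
# which would become the "depth" dimension of the cruise-level Zarr store.
print(len(build_depth_grid(502.3)))  # 2513
```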
@@ -94,89 +58,64 @@ class CreateEmptyZarrStore:
                 df["NUM_PING_TIME_DROPNA"].dropna().astype(int)
             )
 
-            # [
-
-            np.min(df["MIN_ECHO_RANGE"].dropna().astype(float))
-            )
+            # [4] max measurement resolution for the whole cruise
+            cruise_max_echo_range = np.max(df["MAX_ECHO_RANGE"].dropna().astype(float))
 
-
-            cruise_max_echo_range = float(
-                np.max(df["MAX_ECHO_RANGE"].dropna().astype(float))
-            )
-            print(
-                f"cruise_min_echo_range: {cruise_min_echo_range}, cruise_max_echo_range: {cruise_max_echo_range}"
-            )
+            print(f"cruise_max_echo_range: {cruise_max_echo_range}")
 
             # [5] get number of channels
             cruise_frequencies = [
                 float(i) for i in df["FREQUENCIES"].dropna().values.flatten()[0]
             ]
-            print(cruise_frequencies)
 
             new_width = int(consolidated_zarr_width)
-            print(f"new_width: {new_width}")
-            #################################################################
-            store_name = f"{cruise_name}.model"
-            print(store_name)
             ################################################################
-            # Delete existing
-            zarr_prefix = os.path.join(
+            # Delete any existing stores
+            zarr_prefix = os.path.join(
+                str(Constants.LEVEL_2.value), ship_name, cruise_name, sensor_name
+            )
             child_objects = s3_manager.get_child_objects(
-                bucket_name=
+                bucket_name=output_bucket_name,
                 sub_prefix=zarr_prefix,
             )
+
             if len(child_objects) > 0:
                 s3_manager.delete_nodd_objects(
+                    bucket_name=output_bucket_name,
                     objects=child_objects,
                 )
             ################################################################
             # Create new model store
             zarr_manager = ZarrManager()
-            new_height = len(
-                zarr_manager.get_depth_values(
-                    min_echo_range=cruise_min_echo_range,
-                    max_echo_range=cruise_max_echo_range,
-                )
-            )
-            print(f"new_height: {new_height}")
-
             zarr_manager.create_zarr_store(
-                path=tempdir,
+                path=tempdir.name,
                 ship_name=ship_name,
                 cruise_name=cruise_name,
                 sensor_name=sensor_name,
                 frequencies=cruise_frequencies,
                 width=new_width,
-                min_echo_range=cruise_min_echo_range,
                 max_echo_range=cruise_max_echo_range,
+                # cruise_min_epsilon=cruise_min_epsilon,
                 calibration_status=True,
             )
             #################################################################
-
-
+            # TODO: would be more elegant to create directly into s3 bucket
+            s3_manager.upload_zarr_store_to_s3(
+                output_bucket_name=output_bucket_name,
+                local_directory=tempdir.name,
                 object_prefix=zarr_prefix,
                 cruise_name=cruise_name,
             )
-            # https://noaa-wcsd-zarr-pds.s3.amazonaws.com/index.html
             #################################################################
-            #
-            # count = self.__get_file_count(store_name=store_name)
-            # #
-            # raw_zarr_files = self.__get_s3_files( # TODO: just need count
-            #     bucket_name=self.__output_bucket,
-            #     sub_prefix=os.path.join(zarr_prefix, store_name),
-            # )
-            # if len(raw_zarr_files) != count:
-            #     print(f'Problem writing {store_name} with proper count {count}.')
-            #     raise Exception("File count doesnt equal number of s3 Zarr store files.")
-            # else:
-            #     print("File counts match.")
+            # TODO: verify count of the files uploaded
             #################################################################
-            # Success
             # TODO: update enum in dynamodb
+            print("Done creating cruise level zarr store.")
             #################################################################
         except Exception as err:
-
+            raise RuntimeError(
+                f"Problem trying to create new cruise model store, {err}"
+            )
         finally:
             cleaner = Cleaner()
             cleaner.delete_local_files()
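The class-level `upload_zarr_store_to_s3` helper was deleted above and the new code calls `s3_manager.upload_zarr_store_to_s3(...)` instead, so the walk-the-store-and-upload-with-a-thread-pool pattern now lives on `S3Manager`. A generic sketch of that pattern using plain `boto3` and `concurrent.futures`, assuming standard AWS credentials; this is not the package's implementation, and the function name is illustrative:

```python
import os
from concurrent.futures import ThreadPoolExecutor

import boto3


def upload_local_zarr_store(local_store: str, bucket: str, prefix: str, max_workers: int = 16) -> None:
    """Copy every chunk and metadata file under a local '<cruise>.zarr' directory to
    s3://bucket/prefix/<cruise>.zarr/..., preserving the relative layout."""
    s3 = boto3.client("s3")  # boto3 clients are safe to share across threads
    pairs = []
    for subdir, _dirs, files in os.walk(local_store):
        for name in files:
            local_path = os.path.join(subdir, name)
            relative = os.path.relpath(local_path, start=os.path.dirname(local_store))
            pairs.append((local_path, f"{prefix}/{relative}"))
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(s3.upload_file, path, bucket, key) for path, key in pairs]
        for future in futures:
            future.result()  # re-raise the first failed upload, if any
```

With the example paths from the removed comment, a call would look something like `upload_local_zarr_store("/tmp/HB0806.zarr", "noaa-wcsd-zarr-pds", "level_2/Henry_B._Bigelow/HB0806/EK60")`.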
water_column_sonar_processing/cruise/datatree_manager.py
@@ -0,0 +1,21 @@
+# ### https://xarray-datatree.readthedocs.io/en/latest/data-structures.html
+# import xarray as xr
+# from datatree import DataTree
+#
+#
+# class DatatreeManager:
+#     #######################################################
+#     def __init__(
+#         self,
+#     ):
+#         self.dtype = "float32"
+#
+#     #################################################################
+#     def create_datatree(
+#         self,
+#         input_ds,
+#     ) -> None:
+#         ds1 = xr.Dataset({"foo": "orange"})
+#         dt = DataTree(name="root", dataset=ds1)  # create root node
+#         # ds2 = xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})
+#         return dt
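The new module is entirely commented out and still imports `DataTree` from the standalone `datatree` package; the same structure is available as `xarray.DataTree` in recent xarray releases. A small sketch of building a tree from a dict of groups, under that assumption (the per-channel grouping shown here is hypothetical, not the package's layout):

```python
import numpy as np
import xarray as xr

# Hypothetical layout: cruise metadata at the root, one group per echosounder channel.
root = xr.Dataset(attrs={"cruise_name": "HB0806", "sensor_name": "EK60"})
ch_18 = xr.Dataset({"Sv": ("ping_time", np.array([-62.1, -60.4], dtype="float32"))})
ch_38 = xr.Dataset({"Sv": ("ping_time", np.array([-71.8, -70.2], dtype="float32"))})

# from_dict maps unix-style paths to datasets; "/" is the root node
tree = xr.DataTree.from_dict(
    {
        "/": root,
        "/channel_18kHz": ch_18,
        "/channel_38kHz": ch_38,
    }
)
print(tree)
```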
water_column_sonar_processing/cruise/resample_regrid.py
@@ -1,23 +1,14 @@
 import gc
-import
+import warnings
 from pathlib import Path
 
-import numcodecs
 import numpy as np
-import pandas as pd
 import xarray as xr
 
 from water_column_sonar_processing.aws import DynamoDBManager
-from water_column_sonar_processing.geometry import GeometryManager
 from water_column_sonar_processing.model import ZarrManager
 
-
-numcodecs.blosc.set_nthreads(1)
-
-
-# TODO: when ready switch to version 3 of model spec
-# ZARR_V3_EXPERIMENTAL_API = 1
-# creates the latlon data: foo = ep.consolidate.add_location(ds_Sv, echodata)
+warnings.simplefilter("ignore", category=RuntimeWarning)
 
 
 class ResampleRegrid:
@@ -26,68 +17,55 @@ class ResampleRegrid:
         self,
     ):
         self.__overwrite = True
-        self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
-        self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
         self.dtype = "float32"
 
     #################################################################
     def interpolate_data(
         self,
-        input_xr,
-        ping_times,
-        all_cruise_depth_values,
+        input_xr: xr.Dataset,
+        ping_times: np.ndarray,
+        all_cruise_depth_values: np.ndarray,  # includes water_level offset
     ) -> np.ndarray:
-
+        """
+        Input dataset is passed in along with times and depth values to regrid to.
+        """
+        print("Interpolating dataset.")
         try:
             data = np.empty(
-                (
+                (  # Depth / Time / Frequency
                     len(all_cruise_depth_values),
                     len(ping_times),
-                    len(input_xr.frequency_nominal),
+                    len(input_xr.frequency_nominal.values),
                 ),
                 dtype=self.dtype,
             )
 
             data[:] = np.nan
 
-            regrid_resample = xr.DataArray(
+            regrid_resample = xr.DataArray(  # where data will be written to
                 data=data,
-                dims=("depth", "time", "frequency"),
                 coords={
                     "depth": all_cruise_depth_values,
                     "time": ping_times,
                     "frequency": input_xr.frequency_nominal.values,
                 },
+                dims=("depth", "time", "frequency"),
+                name="Sv",
             )
 
             channels = input_xr.channel.values
             for channel in range(
                 len(channels)
-            ):  # TODO: leaving off here, need to subset for just indices in time axis
-
-                np.nanmax(
-                    input_xr.echo_range.sel(
-                        channel=input_xr.channel[channel]
-                    ).values
-                )
-            )
-            #
+            ):  # ?TODO: leaving off here, need to subset for just indices in time axis
+                gc.collect()
                 max_depths = np.nanmax(
-                    a=input_xr.
+                    a=input_xr.depth.sel(channel=input_xr.channel[channel]).values,
                     axis=1,
                 )
-                superset_of_max_depths = set(
-                    np.nanmax(
-                        input_xr.echo_range.sel(
-                            channel=input_xr.channel[channel]
-                        ).values,
-                        1,
-                    )
-                )
+                superset_of_max_depths = set(max_depths)
                 set_of_max_depths = list(
                     {x for x in superset_of_max_depths if x == x}
-                )  #
-                # iterate through partitions of data with similar depths and resample
+                )  # To speed things up resample in groups denoted by max_depth
                 for select_max_depth in set_of_max_depths:
                     # TODO: for nan just skip and leave all nan's
                     select_indices = [
@@ -96,34 +74,35 @@ class ResampleRegrid:
                         if max_depths[i] == select_max_depth
                     ]
 
-                    # now create new DataArray with proper dimension and indices
-                    # data_select = input_xr.Sv.sel(
-                    #     channel=input_xr.channel[channel]
-                    # ).values[select_indices, :].T  # TODO: dont like this transpose
                     data_select = input_xr.Sv.sel(channel=input_xr.channel[channel])[
                         select_indices, :
                     ].T.values
-                    # change from ".values[select_indices, :].T" to "[select_indices, :].values.T"
 
                     times_select = input_xr.ping_time.values[select_indices]
-
-                    channel=input_xr.channel[channel]
-
-
-
+                    depths_all = input_xr.depth.sel(
+                        channel=input_xr.channel[channel],
+                        ping_time=input_xr.ping_time[select_indices[0]],
+                    ).values
+                    depths_select = depths_all[~np.isnan(depths_all)]
 
                     da_select = xr.DataArray(
-                        data=data_select,
+                        data=data_select[: len(depths_select), :],
                         dims=("depth", "time"),
                         coords={
                             "depth": depths_select,
                             "time": times_select,
                         },
-                    ).dropna(dim="depth")
-                    resampled = da_select.interp(
-                        depth=all_cruise_depth_values, method="nearest"
                     )
-
+
+                    resampled = (
+                        da_select.interp(  # TODO: problem here w D20070712-T152416.raw
+                            depth=all_cruise_depth_values,
+                            method="nearest",
+                            assume_sorted=True,
+                        )
+                    )
+
+                    ### write to outptut ###
                     regrid_resample.loc[
                         dict(
                             time=times_select,
@@ -131,11 +110,15 @@ class ResampleRegrid:
                         )
                     ] = resampled
                     print(f"updated {len(times_select)} ping times")
+                    gc.collect()
+            return (
+                regrid_resample.values.copy()
+            )  # gets passed back wo depth, might need to include?
         except Exception as err:
-
-
-
-
+            raise RuntimeError(f"Problem finding the dynamodb table, {err}")
+        finally:
+            gc.collect()
+            print("Done interpolating dataset.")
 
     #################################################################
     def resample_regrid(
@@ -144,66 +127,79 @@ class ResampleRegrid:
         cruise_name,
         sensor_name,
         table_name,
+        bucket_name,
+        override_select_files=None,
+        # override_cruise_min_epsilon=None,
+        endpoint_url=None,
     ) -> None:
         """
-        The goal here is to interpolate the
+        The goal here is to interpolate the dataset against the depth values already populated
         in the existing file level model stores. We open the cruise-level store with model for
         read/write operations. We open the file-level store with Xarray to leverage tools for
-        resampling and subsetting the
+        resampling and subsetting the dataset.
         """
-        print("Interpolating
+        print("Resample Regrid, Interpolating dataset.")
         try:
             zarr_manager = ZarrManager()
-            #
-
-            # get model store
+            # geo_manager = GeometryManager()
+
             output_zarr_store = zarr_manager.open_s3_zarr_store_with_zarr(
                 ship_name=ship_name,
                 cruise_name=cruise_name,
                 sensor_name=sensor_name,
-
+                output_bucket_name=bucket_name,
+                endpoint_url=endpoint_url,
             )
 
-            # get dynamo stuff
             dynamo_db_manager = DynamoDBManager()
             cruise_df = dynamo_db_manager.get_table_as_df(
-                ship_name=ship_name,
                 cruise_name=cruise_name,
-                sensor_name=sensor_name,
                 table_name=table_name,
             )
 
             #########################################################
             #########################################################
-            # TODO: iterate files here
             all_file_names = cruise_df["FILE_NAME"]
+
+            if override_select_files is not None:
+                all_file_names = override_select_files
+
+            # Iterate files
             for file_name in all_file_names:
                 gc.collect()
                 file_name_stem = Path(file_name).stem
-                # file_name_stem = "D20070724-T151330"
                 print(f"Processing file: {file_name_stem}.")
-
-
+
+                if f"{file_name_stem}.raw" not in list(cruise_df["FILE_NAME"]):
+                    raise Exception("Raw file file_stem not found in dynamodb.")
 
                 # status = PipelineStatus['LEVEL_1_PROCESSING']
                 # TODO: filter rows by enum success, filter the dataframe just for enums >= LEVEL_1_PROCESSING
                 # df[df['PIPELINE_STATUS'] < PipelineStatus.LEVEL_1_PROCESSING] = np.nan
 
                 # Get index from all cruise files. Note: should be based on which are included in cruise.
-                index =
-                    cruise_df["FILE_NAME"] == f"{file_name_stem}.raw"
-
+                index = int(
+                    cruise_df.index[cruise_df["FILE_NAME"] == f"{file_name_stem}.raw"][
+                        0
+                    ]
+                )
 
-                #
+                # Get input store
                 input_xr_zarr_store = zarr_manager.open_s3_zarr_store_with_xarray(
                     ship_name=ship_name,
                     cruise_name=cruise_name,
                     sensor_name=sensor_name,
                     file_name_stem=file_name_stem,
+                    bucket_name=bucket_name,
+                    endpoint_url=endpoint_url,
                 )
+
+                # This is the vertical offset of the sensor related to the ocean surface
+                # See https://echopype.readthedocs.io/en/stable/data-proc-additional.html
+                # Ignoring water-level for now
                 #########################################################################
-                # [3] Get needed indices
-                # Offset from start index to insert new
+                # [3] Get needed time indices — along the x-axis
+                # Offset from start index to insert new dataset. Note that missing values are excluded.
                 ping_time_cumsum = np.insert(
                     np.cumsum(
                         cruise_df["NUM_PING_TIME_DROPNA"].dropna().to_numpy(dtype=int)
@@ -214,85 +210,104 @@ class ResampleRegrid:
                 start_ping_time_index = ping_time_cumsum[index]
                 end_ping_time_index = ping_time_cumsum[index + 1]
 
-
-
-
-                print(
-                    "Creating empty ndarray for Sv data."
-                )  # Note: cruise_zarr dimensions are (depth, time, frequency)
-                cruise_sv_subset = np.empty(
-                    shape=output_zarr_store.Sv[
-                        :, start_ping_time_index:end_ping_time_index, :
-                    ].shape
+                max_echo_range = np.max(
+                    cruise_df["MAX_ECHO_RANGE"].dropna().astype(np.float32)
                 )
-
+                # cruise_min_epsilon = np.min(
+                #     cruise_df["MIN_ECHO_RANGE"].dropna().astype(float)
+                # )  # TODO: currently overwriting to 0.25 m
 
                 all_cruise_depth_values = zarr_manager.get_depth_values(
-
+                    max_echo_range=max_echo_range,
+                    # cruise_min_epsilon=cruise_min_epsilon,
                 )
 
-
-
+                if set(
+                    input_xr_zarr_store.Sv.dims
+                ) != {  # Cruise dimensions are: (depth, time, frequency)
                     "channel",
                     "ping_time",
                     "range_sample",
                 }:
                     raise Exception("Xarray dimensions are not as expected.")
 
-                #
-
-
-
-
-
-
-
+                # indices, geospatial = geo_manager.read_s3_geo_json(  # TODO: remove this!!!!
+                #     ship_name=ship_name,
+                #     cruise_name=cruise_name,
+                #     sensor_name=sensor_name,
+                #     file_name_stem=file_name_stem,
+                #     input_xr_zarr_store=input_xr_zarr_store,
+                #     endpoint_url=endpoint_url,
+                #     output_bucket_name=bucket_name,
+                # )
 
-                input_xr = input_xr_zarr_store.isel(ping_time=indices)
+                input_xr = input_xr_zarr_store  # .isel(ping_time=indices)
 
                 ping_times = input_xr.ping_time.values
-
-
-                    (pd.Timestamp(i) - pd.Timestamp("1970-01-01")) / pd.Timedelta("1s")
-                    for i in ping_times
-                ]
-                output_zarr_store.time[start_ping_time_index:end_ping_time_index] = (
-                    epoch_seconds
+                output_zarr_store["time"][start_ping_time_index:end_ping_time_index] = (
+                    input_xr.ping_time.data
                 )
 
-                # --- UPDATING --- #
-
+                # --- UPDATING --- #  # TODO: problem, this returns dimensionless array
                 regrid_resample = self.interpolate_data(
                     input_xr=input_xr,
                     ping_times=ping_times,
-                    all_cruise_depth_values=all_cruise_depth_values,
+                    all_cruise_depth_values=all_cruise_depth_values,  # should accommodate the water_level already
                 )
 
                 print(
                     f"start_ping_time_index: {start_ping_time_index}, end_ping_time_index: {end_ping_time_index}"
                 )
-
                 #########################################################################
                 # write Sv values to cruise-level-model-store
-                for channel in range(
-                    len(input_xr.channel.values)
-                ):  # doesn't like being written in one fell swoop :(
-                    output_zarr_store.Sv[
-                        :, start_ping_time_index:end_ping_time_index, channel
-                    ] = regrid_resample[:, :, channel]
 
+                for fff in range(regrid_resample.shape[-1]):
+                    output_zarr_store["Sv"][
+                        : regrid_resample[:, :, fff].shape[0],
+                        start_ping_time_index:end_ping_time_index,
+                        fff,
+                    ] = regrid_resample[:, :, fff]
+                #########################################################################
+                # in the future. See https://github.com/CI-CMG/water-column-sonar-processing/issues/11
+                if "detected_seafloor_depth" in list(input_xr.variables):
+                    print("Adding detected_seafloor_depth to output")
+                    detected_seafloor_depth = input_xr.detected_seafloor_depth.values
+                    detected_seafloor_depth[detected_seafloor_depth == 0.0] = np.nan
+
+                    # As requested, use the lowest frequencies to determine bottom
+                    detected_seafloor_depths = detected_seafloor_depth[0, :]
+
+                    detected_seafloor_depths[detected_seafloor_depths == 0.0] = np.nan
+                    print(f"min depth measured: {np.nanmin(detected_seafloor_depths)}")
+                    print(f"max depth measured: {np.nanmax(detected_seafloor_depths)}")
+                    output_zarr_store["bottom"][
+                        start_ping_time_index:end_ping_time_index
+                    ] = detected_seafloor_depths
+                #
                 #########################################################################
                 # [5] write subset of latitude/longitude
-                output_zarr_store
+                # output_zarr_store["latitude"][
+                #     start_ping_time_index:end_ping_time_index
+                # ] = geospatial.dropna()[
+                #     "latitude"
+                # ].values  # TODO: get from ds_sv directly, dont need geojson anymore
+                # output_zarr_store["longitude"][
+                #     start_ping_time_index:end_ping_time_index
+                # ] = geospatial.dropna()["longitude"].values
+                #########################################################################
+                output_zarr_store["latitude"][
                     start_ping_time_index:end_ping_time_index
-                ] =
-                output_zarr_store
+                ] = input_xr_zarr_store.latitude.dropna(dim="ping_time").values
+                output_zarr_store["longitude"][
                     start_ping_time_index:end_ping_time_index
-                ] =
+                ] = input_xr_zarr_store.longitude.dropna(dim="ping_time").values
+                #########################################################################
         except Exception as err:
-
-
-
+            raise RuntimeError(f"Problem with resample_regrid, {err}")
+        finally:
+            print("Exiting resample_regrid.")
+            # TODO: read across times and verify dataset was written?
+            gc.collect()
 
     #######################################################
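The reworked `interpolate_data` regrids each file's Sv samples onto the shared cruise depth axis with a nearest-neighbour lookup (`method="nearest"`, `assume_sorted=True`), leaving NaNs wherever a target depth falls outside the file's measured range. A self-contained sketch of that single step on synthetic data (array sizes, values, and the 0.20 m target grid here are illustrative; scipy is required for xarray's `interp`):

```python
import numpy as np
import xarray as xr

# One channel of a single file: 5 depth samples x 3 pings (synthetic values).
file_depths = np.array([0.5, 1.0, 1.5, 2.0, 2.5], dtype="float32")
ping_times = np.array(
    ["2007-07-12T15:24:16", "2007-07-12T15:24:17", "2007-07-12T15:24:18"],
    dtype="datetime64[ns]",
)
sv = xr.DataArray(
    np.random.default_rng(0).uniform(-80.0, -30.0, size=(5, 3)).astype("float32"),
    dims=("depth", "time"),
    coords={"depth": file_depths, "time": ping_times},
    name="Sv",
)

# Cruise-level depth axis at 0.20 m. Nearest-neighbour interpolation snaps each
# target depth to the closest measured sample; depths outside [0.5, 2.5] become NaN,
# matching the NaN-prefilled regrid_resample array in the diff above.
cruise_depths = np.arange(0.2, 3.01, 0.2, dtype="float32")
resampled = sv.interp(depth=cruise_depths, method="nearest", assume_sorted=True)

print(resampled.shape)                       # (15, 3) -> (depth, time)
print(int(resampled.isnull().sum().item()))  # cells whose depth lies outside the file's range
```

The resampled slab is then written into the cruise store at the `[start_ping_time_index:end_ping_time_index]` offset computed from the cumulative sum of `NUM_PING_TIME_DROPNA`, one frequency at a time.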