water-column-sonar-processing 25.1.7__py3-none-any.whl → 25.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of water-column-sonar-processing has been flagged as possibly problematic.
- water_column_sonar_processing/aws/dynamodb_manager.py +27 -32
- water_column_sonar_processing/aws/s3_manager.py +52 -64
- water_column_sonar_processing/aws/s3fs_manager.py +3 -9
- water_column_sonar_processing/cruise/create_empty_zarr_store.py +14 -14
- water_column_sonar_processing/cruise/datatree_manager.py +3 -6
- water_column_sonar_processing/cruise/resample_regrid.py +67 -49
- water_column_sonar_processing/geometry/__init__.py +7 -2
- water_column_sonar_processing/geometry/elevation_manager.py +16 -17
- water_column_sonar_processing/geometry/geometry_manager.py +25 -25
- water_column_sonar_processing/geometry/line_simplification.py +150 -0
- water_column_sonar_processing/geometry/pmtile_generation.py +99 -64
- water_column_sonar_processing/index/index_manager.py +67 -32
- water_column_sonar_processing/model/zarr_manager.py +32 -21
- water_column_sonar_processing/process.py +15 -13
- water_column_sonar_processing/processing/__init__.py +2 -2
- water_column_sonar_processing/processing/batch_downloader.py +66 -41
- water_column_sonar_processing/processing/raw_to_zarr.py +121 -82
- water_column_sonar_processing/utility/constants.py +10 -1
- water_column_sonar_processing/utility/pipeline_status.py +11 -15
- {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.0.dist-info}/METADATA +21 -12
- water_column_sonar_processing-25.3.0.dist-info/RECORD +34 -0
- {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.0.dist-info}/WHEEL +1 -1
- water_column_sonar_processing/geometry/geometry_simplification.py +0 -82
- water_column_sonar_processing-25.1.7.dist-info/RECORD +0 -34
- {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.0.dist-info/licenses}/LICENSE +0 -0
- {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.0.dist-info}/top_level.txt +0 -0
(In the hunks below, a bare "-" marks a removed line whose original text was not captured by this diff view.)

water_column_sonar_processing/processing/batch_downloader.py

@@ -1,10 +1,13 @@
-
+from typing import Optional
+
 import numpy as np
 import pandas as pd
+import xarray as xr
 import xbatcher
-
+
 # s3fs.core.setup_logging("DEBUG")
 
+
 class BatchDownloader:
     """
     Uses the xbatcher XbatchDownloader to download data from an xarray dataset. Connection
@@ -12,13 +15,13 @@ class BatchDownloader:
     """
 
     def __init__(
-
-
-
-
-
-
-
+        self,
+        bucket_name: Optional[str] = "noaa-wcsd-zarr-pds",
+        ship_name: Optional[str] = "Henry_B._Bigelow",
+        cruise_name: Optional[str] = "HB0707",
+        sensor_name: Optional[str] = "EK60",
+        patch_dims: Optional[int] = 64,  # TODO: change to 64
+        # input_steps: Optional[int] = 3,
     ):
         self.bucket_name = bucket_name
         self.ship_name = ship_name
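With the typed defaults above, the downloader can be constructed with no arguments. A minimal usage sketch, assuming BatchDownloader is re-exported from the processing package (this diff shows processing/__init__.py changed but not its contents):

from water_column_sonar_processing.processing import BatchDownloader

downloader = BatchDownloader()  # defaults: noaa-wcsd-zarr-pds / Henry_B._Bigelow / HB0707 / EK60
cruise = downloader.get_s3_zarr_store()  # anonymous read of the Level-2 Zarr store
print(cruise)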
@@ -28,7 +31,7 @@ class BatchDownloader:
 
     # TODO: move this to the s3fs module
     def get_s3_zarr_store(self) -> xr.Dataset:
-        """
+        """Returns an Xarray Dataset"""
         s3_zarr_store_path = f"{self.bucket_name}/level_2/{self.ship_name}/{self.cruise_name}/{self.sensor_name}/{self.cruise_name}.zarr"
         # Info about the HB0707 cruise:
         # Time: ["2007-07-11T18:20:33.657573888", "2007-07-11T18:20:53.657573888", "2007-07-13T00:55:17.454448896"]
@@ -40,7 +43,9 @@
         # store = s3fs.S3Map(root=s3_zarr_store_path, s3=s3_file_system, check=False)
 
         # return xr.open_zarr(store=f"s3://{s3_zarr_store_path}", consolidated=True, storage_options={'anon': True})
-        return xr.open_dataset(f"s3://{s3_zarr_store_path}", engine="zarr", storage_options={"anon": True})
+        return xr.open_dataset(
+            f"s3://{s3_zarr_store_path}", engine="zarr", storage_options={"anon": True}
+        )
         # return xr.open_zarr(store, consolidated=True)
 
     def get_toy_batch_generator(self) -> xbatcher.BatchGenerator:
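The rewrapped return is the standard anonymous-read pattern for a public Zarr store on s3. A self-contained sketch of the same call outside the class, with the path assembled from the defaults above (reading s3:// URLs this way requires s3fs to be installed):

import xarray as xr

path = "noaa-wcsd-zarr-pds/level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.zarr"
cruise = xr.open_dataset(f"s3://{path}", engine="zarr", storage_options={"anon": True})
print(cruise.Sv.shape)  # volume backscattering strength, (depth, time, frequency)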
@@ -48,14 +53,12 @@
         Returns a BatchGenerator with subsets of Sv data
         Note: this is synthetic data, for a smaller toy example
         """
-        depth = np.arange(1, 21)
-        time = pd.date_range(start="2025-01-01", end="2025-01-31", freq=
-        frequency = [1_000, 2_000, 3_000]
-        Sv = np.random.rand(len(depth), len(time), len(frequency))
+        depth = np.arange(1, 21)  # N meters
+        time = pd.date_range(start="2025-01-01", end="2025-01-31", freq="D")  # N days
+        frequency = [1_000, 2_000, 3_000]  # N frequencies
+        Sv = np.random.rand(len(depth), len(time), len(frequency))  # synthetic data
         cruise = xr.Dataset(
-            data_vars={
-                "Sv": (["depth", "time", "frequency"], Sv)
-            },
+            data_vars={"Sv": (["depth", "time", "frequency"], Sv)},
             coords={
                 "depth": depth,
                 "time": time,
@@ -66,28 +69,45 @@ class BatchDownloader:
         batch_generator = xbatcher.BatchGenerator(
             ds=cruise,
             # get samples that are shaped 10x10x3
-            input_dims={
+            input_dims={
+                "depth": 10,
+                "time": 10,
+                "frequency": cruise.frequency.shape[0],
+            },  # A dictionary specifying the size of the inputs in each dimension, e.g. ``{'lat': 30, 'lon': 30}`` These are the dimensions the ML library will see. All other dimensions will be stacked into one dimension called ``sample``.
             # no overlap between samples
-            input_overlap={
+            input_overlap={
+                "depth": 0,
+                "time": 0,
+                "frequency": 0,
+            },  # Zero means no overlap. A dictionary specifying the overlap along each dimension
         )
         return batch_generator
 
     def get_s3_batch_generator(self) -> xbatcher.BatchGenerator:
-        """
+        """Returns a BatchGenerator with subsets of Sv data from s3 Zarr store"""
         cruise = self.get_s3_zarr_store()
 
         # TODO: temporarily limits to a smaller slice of the data
-        cruise_select = (
-            .where(cruise.depth < 100
-
+        cruise_select = (
+            cruise.where(cruise.depth < 100.0, drop=True).sel(
+                time=slice("2007-07-11T18:20:33", "2007-07-11T18:20:53")
+            )
             # .sel(time=slice("2007-07-11T18:20:00", "2007-07-11T19:20:00"))
         )
-        print(cruise_select.Sv.shape)
+        print(cruise_select.Sv.shape)  # (526 depth, 21 time, 4 freq)
 
         batch_generator = xbatcher.BatchGenerator(
             ds=cruise_select,
-            input_dims={
-
+            input_dims={
+                "depth": 10,
+                "time": 10,
+                "frequency": cruise.frequency.shape[0],
+            },  # A dictionary specifying the size of the inputs in each dimension, e.g. ``{'lat': 30, 'lon': 30}`` These are the dimensions the ML library will see. All other dimensions will be stacked into one dimension called ``sample``.
+            input_overlap={
+                "depth": 0,
+                "time": 0,
+                "frequency": 0,
+            },  # Zero means no overlap. A dictionary specifying the overlap along each dimension
             preload_batch=False,
         )
 
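To make the reshaped calls above concrete, here is a minimal, runnable sketch of building and iterating an xbatcher BatchGenerator over a synthetic Sv cube like the toy example (dimension sizes are illustrative):

import numpy as np
import pandas as pd
import xarray as xr
import xbatcher

depth = np.arange(1, 21)
time = pd.date_range(start="2025-01-01", end="2025-01-31", freq="D")
frequency = [1_000, 2_000, 3_000]
Sv = np.random.rand(len(depth), len(time), len(frequency))
cruise = xr.Dataset(
    data_vars={"Sv": (["depth", "time", "frequency"], Sv)},
    coords={"depth": depth, "time": time, "frequency": frequency},
)
bgen = xbatcher.BatchGenerator(
    ds=cruise,
    input_dims={"depth": 10, "time": 10, "frequency": len(frequency)},
    input_overlap={"depth": 0, "time": 0, "frequency": 0},
)
for batch in bgen:
    print(batch.Sv.shape)  # each batch is an xarray.Dataset patch, e.g. (10, 10, 3)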
@@ -104,15 +124,22 @@ class BatchDownloader:
         cruise = self.get_s3_zarr_store()
 
         # TODO: temporarily limits to a smaller slice of the data
-        cruise_select = (cruise
-
-            .sel(time=slice("2007-07-11T18:20:33", "2007-07-11T18:20:53"))
+        cruise_select = cruise.where(cruise.depth < 100.0, drop=True).sel(
+            time=slice("2007-07-11T18:20:33", "2007-07-11T18:20:53")
         )
-        print(cruise_select.Sv.shape)
+        print(cruise_select.Sv.shape)  # (526 depth, 21 time, 4 freq)
         batch_generator = xbatcher.BatchGenerator(
             ds=cruise_select,
-            input_dims={
-
+            input_dims={
+                "depth": 10,
+                "time": 10,
+                "frequency": cruise.frequency.shape[0],
+            },  # A dictionary specifying the size of the inputs in each dimension, e.g. ``{'lat': 30, 'lon': 30}`` These are the dimensions the ML library will see. All other dimensions will be stacked into one dimension called ``sample``.
+            input_overlap={
+                "depth": 0,
+                "time": 0,
+                "frequency": 0,
+            },  # Zero means no overlap. A dictionary specifying the overlap along each dimension
             preload_batch=True,
         )
 
@@ -121,12 +148,10 @@
         return batch_generator
         # https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator
 
-        """
-        (105, 21, 4)
 
-
-
-
-
-
+        # (105, 21, 4)
+        # depth-start: 0.1899999976158142, depth-end: 1.899999976158142
+        # time-start: 2007-07-11T18:20:33.657573888, time-end: 2007-07-11T18:20:42.657573888
+        # frequency-start: 18000.0, frequency-end: 200000.0
+        # (10, 10, 4)
+        # np.nanmean: -53.70000076293945
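The TensorFlow URL kept above hints at how these generators could feed a training loop. A hedged sketch (TensorFlow is not a dependency shown in this diff; the (10, 10, 4) patch shape comes from the comments above):

import numpy as np
import tensorflow as tf

from water_column_sonar_processing.processing import BatchDownloader

batch_generator = BatchDownloader().get_s3_batch_generator()

def gen():
    for batch in batch_generator:
        yield batch.Sv.values.astype(np.float32)  # one (10, 10, 4) Sv patch per batch

dataset = tf.data.Dataset.from_generator(
    gen,
    output_signature=tf.TensorSpec(shape=(10, 10, 4), dtype=tf.float32),
)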
water_column_sonar_processing/processing/raw_to_zarr.py

@@ -1,25 +1,26 @@
 import gc
 import os
+from datetime import datetime
+from pathlib import Path  # , PurePath
+
 import echopype as ep
 import numcodecs
 import numpy as np
 from numcodecs import Blosc
-from datetime import datetime
-from pathlib import Path  # , PurePath
 
 from water_column_sonar_processing.aws import DynamoDBManager, S3Manager
 from water_column_sonar_processing.geometry import GeometryManager
-from water_column_sonar_processing.utility import Cleaner
+from water_column_sonar_processing.utility import Cleaner
 
 
 # This code is getting copied from echofish-aws-raw-to-zarr-lambda
 class RawToZarr:
     #######################################################
     def __init__(
-
-
-
-
+        self,
+        # output_bucket_access_key,
+        # output_bucket_secret_access_key,
+        # # overwrite_existing_zarr_store,
     ):
         # TODO: revert to Blosc.BITSHUFFLE, troubleshooting misc error
         self.__compressor = Blosc(cname="zstd", clevel=2)  # shuffle=Blosc.NOSHUFFLE
@@ -33,45 +34,47 @@ class RawToZarr:
     ############################################################################
     ############################################################################
     def __zarr_info_to_table(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self,
+        # output_bucket_name,
+        table_name,
+        ship_name,
+        cruise_name,
+        sensor_name,
+        file_name,
+        # zarr_path,
+        min_echo_range,
+        max_echo_range,
+        num_ping_time_dropna,
+        start_time,
+        end_time,
+        frequencies,
+        channels,
+        water_level,
     ):
-        print(
+        print("Writing Zarr information to DynamoDB table.")
         dynamodb_manager = DynamoDBManager()
         dynamodb_manager.update_item(
             table_name=table_name,
             key={
-
-
+                "FILE_NAME": {"S": file_name},  # Partition Key
+                "CRUISE_NAME": {"S": cruise_name},  # Sort Key
             },
             expression_attribute_names={
-
-
+                "#CH": "CHANNELS",
+                "#ET": "END_TIME",
                 # "#ED": "ERROR_DETAIL",
-
-
-
-
-                "#PS": "PIPELINE_STATUS",
+                "#FR": "FREQUENCIES",
+                "#MA": "MAX_ECHO_RANGE",
+                "#MI": "MIN_ECHO_RANGE",
+                "#ND": "NUM_PING_TIME_DROPNA",
+                # "#PS": "PIPELINE_STATUS",
                 "#PT": "PIPELINE_TIME",
                 "#SE": "SENSOR_NAME",
                 "#SH": "SHIP_NAME",
-
-
-
+                "#ST": "START_TIME",
+                # "#ZB": "ZARR_BUCKET",
+                # "#ZP": "ZARR_PATH",
+                "#WL": "WATER_LEVEL",
             },
             expression_attribute_values={
                 ":ch": {"L": [{"S": i} for i in channels]},
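For readers unfamiliar with DynamoDB's expression syntax, the aliases above map onto a boto3 update_item call roughly as follows. Table and key values here are hypothetical, and the diff does not show how the DynamoDBManager wrapper forwards these arguments:

import boto3

client = boto3.client("dynamodb")
client.update_item(
    TableName="echofish-water-column-sonar",  # hypothetical table name
    Key={
        "FILE_NAME": {"S": "D20070711-T182033.raw"},  # partition key (hypothetical file)
        "CRUISE_NAME": {"S": "HB0707"},  # sort key
    },
    # "#WL" aliases the attribute name; ":wl" carries the value as a numeric string
    ExpressionAttributeNames={"#WL": "WATER_LEVEL"},
    ExpressionAttributeValues={":wl": {"N": "3.14"}},
    UpdateExpression="SET #WL = :wl",
)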
@@ -82,13 +85,14 @@ class RawToZarr:
                 ":mi": {"N": str(np.round(min_echo_range, 4))},
                 ":nd": {"N": str(num_ping_time_dropna)},
                 # ":ps": {"S": "PROCESSING_RESAMPLE_AND_WRITE_TO_ZARR_STORE"},
-                ":ps": {"S": PipelineStatus.LEVEL_1_PROCESSING.name},
+                # ":ps": {"S": PipelineStatus.LEVEL_1_PROCESSING.name},
                 ":pt": {"S": datetime.now().isoformat(timespec="seconds") + "Z"},
                 ":se": {"S": sensor_name},
                 ":sh": {"S": ship_name},
                 ":st": {"S": start_time},
-                ":zb": {"S": output_bucket_name},
-                ":zp": {"S": zarr_path},
+                ":wl": {"N": str(np.round(water_level, 2))},
+                # ":zb": {"S": output_bucket_name},
+                # ":zp": {"S": zarr_path},
             },
             update_expression=(
                 "SET "
@@ -99,30 +103,31 @@ class RawToZarr:
                 "#MA = :ma, "
                 "#MI = :mi, "
                 "#ND = :nd, "
-                "#PS = :ps, "
+                # "#PS = :ps, "
                 "#PT = :pt, "
                 "#SE = :se, "
                 "#SH = :sh, "
                 "#ST = :st, "
-                "#ZB = :zb, "
-                "#ZP = :zp"
+                "#WL = :wl"
+                # "#ZB = :zb, "
+                # "#ZP = :zp"
             ),
         )
-        print(
+        print("Done writing Zarr information to DynamoDB table.")
 
     ############################################################################
     ############################################################################
     ############################################################################
     def __upload_files_to_output_bucket(
-
-
-
-
-
+        self,
+        output_bucket_name,
+        local_directory,
+        object_prefix,
+        endpoint_url,
     ):
         # Note: this will be passed credentials if using NODD
         s3_manager = S3Manager(endpoint_url=endpoint_url)
-        print(
+        print("Uploading files using thread pool executor.")
         all_files = []
         for subdir, dirs, files in os.walk(local_directory):
             for file in files:
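The "thread pool executor" message refers to fanning out one upload per file discovered by the os.walk above. A sketch of that pattern with plain boto3 (the actual S3Manager upload method is not shown in this diff):

import os
from concurrent.futures import ThreadPoolExecutor

import boto3

s3_client = boto3.client("s3")

def upload_zarr_store(local_directory: str, bucket_name: str, object_prefix: str) -> None:
    # walk the local store once, then upload the chunk files concurrently
    all_files = []
    for subdir, dirs, files in os.walk(local_directory):
        for file in files:
            local_path = os.path.join(subdir, file)
            rel = os.path.relpath(local_path, start=os.path.dirname(local_directory) or ".")
            all_files.append((local_path, f"{object_prefix}{rel}"))
    with ThreadPoolExecutor(max_workers=8) as executor:
        for local_path, key in all_files:
            executor.submit(s3_client.upload_file, local_path, bucket_name, key)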
@@ -138,38 +143,50 @@ class RawToZarr:
 
     ############################################################################
     def raw_to_zarr(
-
-
-
-
-
-
-
-
-
-
+        self,
+        table_name,
+        input_bucket_name,
+        output_bucket_name,
+        ship_name,
+        cruise_name,
+        sensor_name,
+        raw_file_name,
+        endpoint_url=None,
+        include_bot=True,
     ):
         """
         Downloads the raw files, processes them with echopype, writes geojson, and uploads files
         to the nodd bucket.
         """
-        print(f
+        print(f"Opening raw: {raw_file_name} and creating zarr store.")
         geometry_manager = GeometryManager()
         cleaner = Cleaner()
-        cleaner.delete_local_files(
+        cleaner.delete_local_files(
+            file_types=["*.zarr", "*.json"]
+        )  # TODO: include bot and raw?
 
         s3_manager = S3Manager(endpoint_url=endpoint_url)
-        s3_file_path =
+        s3_file_path = (
+            f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/{raw_file_name}"
+        )
         bottom_file_name = f"{Path(raw_file_name).stem}.bot"
-        s3_bottom_file_path =
-
+        s3_bottom_file_path = (
+            f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/{bottom_file_name}"
+        )
+        s3_manager.download_file(
+            bucket_name=input_bucket_name, key=s3_file_path, file_name=raw_file_name
+        )
         # TODO: add the bottom file
         if include_bot:
-            s3_manager.download_file(
+            s3_manager.download_file(
+                bucket_name=input_bucket_name,
+                key=s3_bottom_file_path,
+                file_name=bottom_file_name,
+            )
 
         try:
             gc.collect()
-            print(
+            print("Opening raw file with echopype.")
             # s3_file_path = f"s3://{bucket_name}/data/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}"
             # s3_file_path = Path(f"s3://noaa-wcsd-pds/data/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}")
             echodata = ep.open_raw(
@@ -180,14 +197,20 @@
                 # max_chunk_size=300,
                 # storage_options={'anon': True } # 'endpoint_url': self.endpoint_url} # this was creating problems
             )
-            print(
+            print("Compute volume backscattering strength (Sv) from raw data.")
             ds_sv = ep.calibrate.compute_Sv(echodata)
+            ds_sv = ep.consolidate.add_depth(
+                ds_sv, echodata
+            )  # TODO: consolidate with other depth values
+            water_level = ds_sv["water_level"].values
             gc.collect()
-            print(
+            print("Done computing volume backscatter strength (Sv) from raw data.")
             # Note: detected_seafloor_depth is located at echodata.vendor.detected_seafloor_depth
             # but is not written out with ds_sv
             if "detected_seafloor_depth" in list(echodata.vendor.variables):
-                ds_sv["detected_seafloor_depth"] = echodata.vendor.detected_seafloor_depth
+                ds_sv["detected_seafloor_depth"] = (
+                    echodata.vendor.detected_seafloor_depth
+                )
             #
             frequencies = echodata.environment.frequency_nominal.values
             #################################################################
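The inserted add_depth call slots into echopype's usual calibration flow. A minimal end-to-end sketch of that flow on a local EK60 file (file name hypothetical; exact keyword signatures vary across echopype versions):

import echopype as ep

echodata = ep.open_raw("D20070711-T182033.raw", sonar_model="EK60")
ds_sv = ep.calibrate.compute_Sv(echodata)             # volume backscattering strength
ds_sv = ep.consolidate.add_depth(ds_sv, echodata)     # adds depth (and water_level) to the dataset
ds_sv = ep.consolidate.add_location(ds_sv, echodata)  # adds latitude/longitude from GPS data
print(ds_sv["water_level"].values)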
@@ -200,10 +223,12 @@
                 sensor_name=sensor_name,
                 file_name=raw_file_name,
                 endpoint_url=endpoint_url,
-                write_geojson=True
+                write_geojson=True,
             )
             ds_sv = ep.consolidate.add_location(ds_sv, echodata)
-            ds_sv.latitude.values = lat  # overwriting echopype gps values to include missing values
+            ds_sv.latitude.values = (
+                lat  # overwriting echopype gps values to include missing values
+            )
             ds_sv.longitude.values = lon
             # gps_data, lat, lon = self.__get_gps_data(echodata=echodata)
             #################################################################
@@ -216,8 +241,12 @@
             # This is the number of missing values found throughout the lat/lon
             num_ping_time_dropna = lat[~np.isnan(lat)].shape[0]  # symmetric to lon
             #
-            start_time = np.datetime_as_string(ds_sv.ping_time.values[0], unit="ms") + "Z"
-            end_time = np.datetime_as_string(ds_sv.ping_time.values[-1], unit="ms") + "Z"
+            start_time = (
+                np.datetime_as_string(ds_sv.ping_time.values[0], unit="ms") + "Z"
+            )
+            end_time = (
+                np.datetime_as_string(ds_sv.ping_time.values[-1], unit="ms") + "Z"
+            )
             channels = list(ds_sv.channel.values)
             #
             #################################################################
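np.datetime_as_string truncates a datetime64 value to the requested unit, and the appended "Z" marks the string as UTC, e.g.:

import numpy as np

ping_time = np.datetime64("2007-07-11T18:20:33.657573888")
print(np.datetime_as_string(ping_time, unit="ms") + "Z")  # 2007-07-11T18:20:33.657Z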
@@ -225,7 +254,9 @@
             store_name = f"{Path(raw_file_name).stem}.zarr"
             # Sv = ds_sv.Sv
             # ds_sv['Sv'] = Sv.astype('int32', copy=False)
-            ds_sv.to_zarr(
+            ds_sv.to_zarr(
+                store=store_name
+            )  # ds_sv.Sv.sel(channel=ds_sv.channel.values[0]).shape
             gc.collect()
             #################################################################
             output_zarr_prefix = f"level_1/{ship_name}/{cruise_name}/{sensor_name}/"
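__init__ keeps a Blosc zstd compressor (see the first raw_to_zarr.py hunk), although this hunk does not show to_zarr being given an encoding. Wiring it in would look roughly like this zarr-python 2.x style sketch:

import numpy as np
import xarray as xr
from numcodecs import Blosc

compressor = Blosc(cname="zstd", clevel=2)
ds = xr.Dataset({"Sv": (("depth", "ping_time"), np.random.rand(4, 3))})
# per-variable encoding selects the compressor used for the on-disk Zarr chunks
ds.to_zarr(store="example.zarr", encoding={"Sv": {"compressor": compressor}})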
@@ -237,7 +268,9 @@
                 sub_prefix=f"level_1/{ship_name}/{cruise_name}/{sensor_name}/{Path(raw_file_name).stem}.zarr",
             )
             if len(child_objects) > 0:
-                print(
+                print(
+                    "Zarr store data already exists in s3, deleting existing and continuing."
+                )
                 s3_manager.delete_nodd_objects(
                     bucket_name=output_bucket_name,
                     objects=child_objects,
@@ -247,37 +280,42 @@
                 output_bucket_name=output_bucket_name,
                 local_directory=store_name,
                 object_prefix=output_zarr_prefix,
-                endpoint_url=endpoint_url
+                endpoint_url=endpoint_url,
             )
             #################################################################
             self.__zarr_info_to_table(
-                output_bucket_name=output_bucket_name,
+                # output_bucket_name=output_bucket_name,
                 table_name=table_name,
                 ship_name=ship_name,
                 cruise_name=cruise_name,
                 sensor_name=sensor_name,
                 file_name=raw_file_name,
-                zarr_path=os.path.join(output_zarr_prefix, store_name),
+                # zarr_path=os.path.join(output_zarr_prefix, store_name),
                 min_echo_range=min_echo_range,
                 max_echo_range=max_echo_range,
                 num_ping_time_dropna=num_ping_time_dropna,
                 start_time=start_time,
                 end_time=end_time,
                 frequencies=frequencies,
-                channels=channels
+                channels=channels,
+                water_level=water_level,
             )
             #######################################################################
             # TODO: verify count of objects matches, publish message, update status
             #######################################################################
-            print(
+            print("Finished raw-to-zarr conversion.")
         except Exception as err:
-            print(
+            print(
+                f"Exception encountered creating local Zarr store with echopype: {err}"
+            )
             raise RuntimeError(f"Problem creating local Zarr store, {err}")
         finally:
             gc.collect()
             print("Finally.")
-            cleaner.delete_local_files(
-
+            cleaner.delete_local_files(
+                file_types=["*.raw", "*.bot", "*.zarr", "*.json"]
+            )
+            print("Done creating local zarr store.")
 
     ############################################################################
     # TODO: does this get called?
@@ -365,5 +403,6 @@ class RawToZarr:
 
     ############################################################################
 
+
 ################################################################################
 ############################################################################
water_column_sonar_processing/utility/constants.py

@@ -9,7 +9,16 @@ class Constants(Flag):
     # chunk size is ~1.3 kB, HB0902 cruise takes ~30 seconds to load all time/lat/lon data
     # NOTE: larger value here will speed up the TurfJS download of data in the UI
     # Problem interpolating the data: cannot reshape array of size 65536 into shape...
-    SPATIOTEMPORAL_CHUNK_SIZE =
+    SPATIOTEMPORAL_CHUNK_SIZE = int(2**16) - 1024  # e.g. int(2**14)
+    # TODO: create test for SPATIOTEMPORAL_CHUNK_SIZE with requirement!
+
+    LEVEL_0 = "raw"
+    LEVEL_1 = "level_1"
+    LEVEL_2 = "level_2"
+    LEVEL_3 = "level_3"
+
+    EK60 = "EK60"  # TODO: use for "instrument"
+    EK80 = "EK80"
 
 
 class Coordinates(Enum):
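A quick check of the new constant against the 2**16 element boundary named in the reshape error above:

chunk = int(2**16) - 1024
print(chunk)          # 64512
print(chunk < 2**16)  # True, safely below the 65536-element boundary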
water_column_sonar_processing/utility/pipeline_status.py

@@ -107,19 +107,15 @@ class PipelineStatus(Flag):
     # Status.LEVEL_1_PROCESSING.value < Status.LEVEL_2_PROCESSING.value
 
     # https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-cloudformation-stack.html
-    """
-    CREATE_IN_PROGRESS | CREATE_FAILED | CREATE_COMPLETE |
-    ROLLBACK_IN_PROGRESS | ROLLBACK_FAILED | ROLLBACK_COMPLETE |
-    DELETE_IN_PROGRESS | DELETE_FAILED | DELETE_COMPLETE |
-    UPDATE_IN_PROGRESS | UPDATE_COMPLETE_CLEANUP_IN_PROGRESS | UPDATE_COMPLETE |
-    UPDATE_FAILED | UPDATE_ROLLBACK_IN_PROGRESS | UPDATE_ROLLBACK_FAILED |
-    UPDATE_ROLLBACK_COMPLETE_CLEANUP_IN_PROGRESS | UPDATE_ROLLBACK_COMPLETE |
-    REVIEW_IN_PROGRESS | IMPORT_IN_PROGRESS | IMPORT_COMPLETE |
-    IMPORT_ROLLBACK_IN_PROGRESS | IMPORT_ROLLBACK_FAILED | IMPORT_ROLLBACK_COMPLETE
 
-
-
-
-
-
-
+    # CREATE_IN_PROGRESS | CREATE_FAILED | CREATE_COMPLETE |
+    # ROLLBACK_IN_PROGRESS | ROLLBACK_FAILED | ROLLBACK_COMPLETE |
+    # DELETE_IN_PROGRESS | DELETE_FAILED | DELETE_COMPLETE |
+    # UPDATE_IN_PROGRESS | UPDATE_COMPLETE_CLEANUP_IN_PROGRESS | UPDATE_COMPLETE |
+    # UPDATE_FAILED | UPDATE_ROLLBACK_IN_PROGRESS | UPDATE_ROLLBACK_FAILED |
+    # UPDATE_ROLLBACK_COMPLETE_CLEANUP_IN_PROGRESS | UPDATE_ROLLBACK_COMPLETE |
+    # REVIEW_IN_PROGRESS | IMPORT_IN_PROGRESS | IMPORT_COMPLETE |
+    # IMPORT_ROLLBACK_IN_PROGRESS | IMPORT_ROLLBACK_FAILED | IMPORT_ROLLBACK_COMPLETE
+    # failure - noun -
+    # failed - verb - "verbs should be avoided"
+    # success - noun
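The value comparison in this hunk's first context line works because Flag members carry increasing integer values. A minimal sketch (member names borrowed from that comment; the values assigned by auto() are illustrative):

from enum import Flag, auto

class Status(Flag):
    LEVEL_1_PROCESSING = auto()  # 1
    LEVEL_2_PROCESSING = auto()  # 2

# auto() assigns increasing powers of two, so later stages compare as "greater"
assert Status.LEVEL_1_PROCESSING.value < Status.LEVEL_2_PROCESSING.value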