water-column-sonar-processing 25.3.2-py3-none-any.whl → 25.8.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of water-column-sonar-processing might be problematic.

Files changed (32)
  1. water_column_sonar_processing/aws/dynamodb_manager.py +6 -6
  2. water_column_sonar_processing/aws/s3_manager.py +95 -90
  3. water_column_sonar_processing/aws/s3fs_manager.py +5 -3
  4. water_column_sonar_processing/aws/sqs_manager.py +1 -1
  5. water_column_sonar_processing/cruise/__init__.py +2 -1
  6. water_column_sonar_processing/cruise/create_empty_zarr_store.py +49 -43
  7. water_column_sonar_processing/cruise/create_empty_zarr_store_level_3.py +161 -0
  8. water_column_sonar_processing/cruise/datatree_manager.py +21 -21
  9. water_column_sonar_processing/cruise/resample_regrid.py +57 -47
  10. water_column_sonar_processing/dataset/__init__.py +3 -0
  11. water_column_sonar_processing/dataset/dataset_manager.py +205 -0
  12. water_column_sonar_processing/dataset/feature_manager.py +32 -0
  13. water_column_sonar_processing/geometry/geometry_manager.py +11 -12
  14. water_column_sonar_processing/geometry/line_simplification.py +26 -1
  15. water_column_sonar_processing/geometry/pmtile_generation.py +211 -247
  16. water_column_sonar_processing/index/index_manager.py +18 -17
  17. water_column_sonar_processing/model/zarr_manager.py +504 -256
  18. water_column_sonar_processing/processing/__init__.py +3 -2
  19. water_column_sonar_processing/processing/batch_downloader.py +11 -11
  20. water_column_sonar_processing/processing/raw_to_netcdf.py +319 -0
  21. water_column_sonar_processing/processing/raw_to_zarr.py +41 -31
  22. water_column_sonar_processing/utility/__init__.py +9 -2
  23. water_column_sonar_processing/utility/cleaner.py +1 -2
  24. water_column_sonar_processing/utility/constants.py +26 -7
  25. water_column_sonar_processing/utility/timestamp.py +1 -0
  26. water_column_sonar_processing-25.8.0.dist-info/METADATA +162 -0
  27. water_column_sonar_processing-25.8.0.dist-info/RECORD +39 -0
  28. {water_column_sonar_processing-25.3.2.dist-info → water_column_sonar_processing-25.8.0.dist-info}/WHEEL +1 -1
  29. water_column_sonar_processing-25.3.2.dist-info/licenses/LICENSE → water_column_sonar_processing-25.8.0.dist-info/licenses/LICENSE-MIT +1 -1
  30. water_column_sonar_processing-25.3.2.dist-info/METADATA +0 -170
  31. water_column_sonar_processing-25.3.2.dist-info/RECORD +0 -34
  32. {water_column_sonar_processing-25.3.2.dist-info → water_column_sonar_processing-25.8.0.dist-info}/top_level.txt +0 -0
water_column_sonar_processing/processing/__init__.py
@@ -1,5 +1,6 @@
  # from .cruise_sampler import CruiseSampler
  from .batch_downloader import BatchDownloader
- from .raw_to_zarr import RawToZarr
+ from .raw_to_netcdf import RawToNetCDF
+ from .raw_to_zarr import RawToZarr, get_water_level

- __all__ = ["RawToZarr", "BatchDownloader"]
+ __all__ = ["RawToZarr", "get_water_level", "RawToNetCDF", "BatchDownloader"]
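For reference, a minimal import sketch (not taken from the package's own docs) of the names the reworked processing subpackage exposes after this change; the list simply mirrors the new __all__ above.

    from water_column_sonar_processing.processing import (
        BatchDownloader,
        RawToNetCDF,
        RawToZarr,
        get_water_level,
    )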
water_column_sonar_processing/processing/batch_downloader.py
@@ -10,7 +10,7 @@ import xbatcher

  class BatchDownloader:
  """
- Uses the xbatcher XbatchDownloader to download data from an xarray dataset. Connection
+ Uses the xbatcher XbatchDownloader to download dataset from an xarray dataset. Connection
  is established
  """

@@ -50,13 +50,13 @@ class BatchDownloader:
  def get_toy_batch_generator(self) -> xbatcher.BatchGenerator:
  """
- Returns a BatchGenerator with subsets of Sv data
- Note: this is synthetic data, for a smaller toy example
+ Returns a BatchGenerator with subsets of Sv dataset
+ Note: this is synthetic dataset, for a smaller toy example
  """
  depth = np.arange(1, 21) # N meters
  time = pd.date_range(start="2025-01-01", end="2025-01-31", freq="D") # N days
  frequency = [1_000, 2_000, 3_000] # N frequencies
- Sv = np.random.rand(len(depth), len(time), len(frequency)) # synthetic data
+ Sv = np.random.rand(len(depth), len(time), len(frequency)) # synthetic dataset
  cruise = xr.Dataset(
  data_vars={"Sv": (["depth", "time", "frequency"], Sv)},
  coords={
@@ -84,10 +84,10 @@ class BatchDownloader:
  return batch_generator

  def get_s3_batch_generator(self) -> xbatcher.BatchGenerator:
- """Returns a BatchGenerator with subsets of Sv data from s3 Zarr store"""
+ """Returns a BatchGenerator with subsets of Sv dataset from s3 Zarr store"""
  cruise = self.get_s3_zarr_store()

- # TODO: temporarily limits to a smaller slice of the data
+ # TODO: temporarily limits to a smaller slice of the dataset
  cruise_select = (
  cruise.where(cruise.depth < 100.0, drop=True).sel(
  time=slice("2007-07-11T18:20:33", "2007-07-11T18:20:53")
@@ -111,19 +111,19 @@ class BatchDownloader:
  preload_batch=False,
  )

- # TODO: need to raise exception if all the data is nan
+ # TODO: need to raise exception if all the dataset is nan

  return batch_generator
  # https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator

  def get_s3_manual_batch_generator(self):
  """
- Using just xarray (no xbatcher), iterate through the data and generate batches.
- Returns a BatchGenerator with subsets of Sv data from s3 Zarr store.
+ Using just xarray (no xbatcher), iterate through the dataset and generate batches.
+ Returns a BatchGenerator with subsets of Sv dataset from s3 Zarr store.
  """
  cruise = self.get_s3_zarr_store()

- # TODO: temporarily limits to a smaller slice of the data
+ # TODO: temporarily limits to a smaller slice of the dataset
  cruise_select = cruise.where(cruise.depth < 100.0, drop=True).sel(
  time=slice("2007-07-11T18:20:33", "2007-07-11T18:20:53")
  )
@@ -143,7 +143,7 @@ class BatchDownloader:
  preload_batch=True,
  )

- # TODO: need to raise exception if all the data is nan
+ # TODO: need to raise exception if all the dataset is nan

  return batch_generator
  # https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator
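To make the toy generator above concrete, here is a hedged sketch of the same pattern using xbatcher directly; the input_dims sizes are assumptions, since the chunking arguments are not shown in this diff.

    import numpy as np
    import pandas as pd
    import xarray as xr
    import xbatcher

    # Synthetic cruise matching get_toy_batch_generator(): 20 depth bins, 31 days, 3 channels
    depth = np.arange(1, 21)
    time = pd.date_range(start="2025-01-01", end="2025-01-31", freq="D")
    frequency = [1_000, 2_000, 3_000]
    Sv = np.random.rand(len(depth), len(time), len(frequency))

    cruise = xr.Dataset(
        data_vars={"Sv": (["depth", "time", "frequency"], Sv)},
        coords={"depth": depth, "time": time, "frequency": frequency},
    )

    # Hypothetical batch shape; the real sizes are set elsewhere in the class
    batch_generator = xbatcher.BatchGenerator(cruise, input_dims={"depth": 10, "time": 5})
    for batch in batch_generator:
        print(batch["Sv"].shape)  # each batch is a small xarray subset of the cruise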
water_column_sonar_processing/processing/raw_to_netcdf.py
@@ -0,0 +1,319 @@
+ import gc
+ import os
+ from datetime import datetime
+ from pathlib import Path # , PurePath
+
+ import echopype as ep
+ import numcodecs
+ import numpy as np
+ from numcodecs import Blosc
+
+ from water_column_sonar_processing.aws import DynamoDBManager, S3Manager
+ from water_column_sonar_processing.geometry import GeometryManager
+ from water_column_sonar_processing.utility import Cleaner
+
+
+ # This code is getting copied from echofish-aws-raw-to-zarr-lambda
+ class RawToNetCDF:
+ #######################################################
+ def __init__(
+ self,
+ # output_bucket_access_key,
+ # output_bucket_secret_access_key,
+ # # overwrite_existing_zarr_store,
+ ):
+ # TODO: revert to Blosc.BITSHUFFLE, troubleshooting misc error
+ self.__compressor = Blosc(cname="zstd", clevel=2) # shuffle=Blosc.NOSHUFFLE
+ self.__overwrite = True
+ self.__num_threads = numcodecs.blosc.get_nthreads()
+ # self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
+ # self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
+ # self.__table_name = table_name
+ # # self.__overwrite_existing_zarr_store = overwrite_existing_zarr_store
+
+ ############################################################################
+ ############################################################################
+ def __netcdf_info_to_table(
+ self,
+ # output_bucket_name,
+ table_name,
+ ship_name,
+ cruise_name,
+ sensor_name,
+ file_name,
+ # zarr_path,
+ min_echo_range,
+ max_echo_range,
+ num_ping_time_dropna,
+ start_time,
+ end_time,
+ frequencies,
+ channels,
+ water_level,
+ ):
+ print("Writing Zarr information to DynamoDB table.")
+ dynamodb_manager = DynamoDBManager()
+ dynamodb_manager.update_item(
+ table_name=table_name,
+ key={
+ "FILE_NAME": {"S": file_name}, # Partition Key
+ "CRUISE_NAME": {"S": cruise_name}, # Sort Key
+ },
+ expression_attribute_names={
+ "#CH": "CHANNELS",
+ "#ET": "END_TIME",
+ # "#ED": "ERROR_DETAIL",
+ "#FR": "FREQUENCIES",
+ "#MA": "MAX_ECHO_RANGE",
+ "#MI": "MIN_ECHO_RANGE",
+ "#ND": "NUM_PING_TIME_DROPNA",
+ # "#PS": "PIPELINE_STATUS",
+ "#PT": "PIPELINE_TIME",
+ "#SE": "SENSOR_NAME",
+ "#SH": "SHIP_NAME",
+ "#ST": "START_TIME",
+ # "#ZB": "ZARR_BUCKET",
+ # "#ZP": "ZARR_PATH",
+ "#WL": "WATER_LEVEL",
+ },
+ expression_attribute_values={
+ ":ch": {"L": [{"S": i} for i in channels]},
+ ":et": {"S": end_time},
+ # ":ed": {"S": ""},
+ ":fr": {"L": [{"N": str(i)} for i in frequencies]},
+ ":ma": {"N": str(np.round(max_echo_range, 4))},
+ ":mi": {"N": str(np.round(min_echo_range, 4))},
+ ":nd": {"N": str(num_ping_time_dropna)},
+ # ":ps": {"S": "PROCESSING_RESAMPLE_AND_WRITE_TO_ZARR_STORE"},
+ # ":ps": {"S": PipelineStatus.LEVEL_1_PROCESSING.name},
+ ":pt": {"S": datetime.now().isoformat(timespec="seconds") + "Z"},
+ ":se": {"S": sensor_name},
+ ":sh": {"S": ship_name},
+ ":st": {"S": start_time},
+ ":wl": {"N": str(np.round(water_level, 2))},
+ # ":zb": {"S": output_bucket_name},
+ # ":zp": {"S": zarr_path},
+ },
+ update_expression=(
+ "SET "
+ "#CH = :ch, "
+ "#ET = :et, "
+ # "#ED = :ed, "
+ "#FR = :fr, "
+ "#MA = :ma, "
+ "#MI = :mi, "
+ "#ND = :nd, "
+ # "#PS = :ps, "
+ "#PT = :pt, "
+ "#SE = :se, "
+ "#SH = :sh, "
+ "#ST = :st, "
+ "#WL = :wl"
+ # "#ZB = :zb, "
+ # "#ZP = :zp"
+ ),
+ )
+ print("Done writing Zarr information to DynamoDB table.")
+
+ ############################################################################
+ ############################################################################
+ ############################################################################
+ def __upload_files_to_output_bucket(
+ self,
+ output_bucket_name,
+ local_directory,
+ object_prefix,
+ endpoint_url,
+ ):
+ # Note: this will be passed credentials if using NODD
+ s3_manager = S3Manager(endpoint_url=endpoint_url)
+ print("Uploading files using thread pool executor.")
+ all_files = []
+ for subdir, dirs, files in os.walk(local_directory):
+ for file in files:
+ local_path = os.path.join(subdir, file)
+ s3_key = os.path.join(object_prefix, local_path)
+ all_files.append([local_path, s3_key])
+ # all_files
+ all_uploads = s3_manager.upload_files_with_thread_pool_executor(
+ output_bucket_name=output_bucket_name,
+ all_files=all_files,
+ )
+ return all_uploads
+
+ def __upload_file_to_output_bucket(
+ self,
+ output_bucket_name,
+ local_directory,
+ object_prefix,
+ endpoint_url,
+ ):
+ # Note: this will be passed credentials if using NODD
+ s3_manager = S3Manager(endpoint_url=endpoint_url)
+ print("Uploading files using thread pool executor.")
+ all_files = [local_directory]
+ all_uploads = s3_manager.upload_files_with_thread_pool_executor(
+ output_bucket_name=output_bucket_name,
+ all_files=all_files,
+ )
+ return all_uploads
+
+ ############################################################################
+ def raw_to_netcdf(
+ self,
+ table_name,
+ input_bucket_name,
+ output_bucket_name,
+ ship_name,
+ cruise_name,
+ sensor_name,
+ raw_file_name,
+ endpoint_url=None,
+ include_bot=True,
+ ):
+ """
+ Downloads the raw files, processes them with echopype, and uploads files
+ to the nodd bucket.
+
+ Needs to create two files, one echopype opened file, one is Sv calibrated file
+ """
+ print(f"Opening raw: {raw_file_name} and creating netcdf.")
+ try:
+ geometry_manager = GeometryManager()
+ cleaner = Cleaner()
+ cleaner.delete_local_files(
+ file_types=["*.nc", "*.json"]
+ ) # TODO: include bot and raw?
+
+ s3_manager = S3Manager(endpoint_url=endpoint_url)
+ s3_file_path = (
+ f"dataset/raw/{ship_name}/{cruise_name}/{sensor_name}/{raw_file_name}"
+ )
+ bottom_file_name = f"{Path(raw_file_name).stem}.bot"
+ s3_bottom_file_path = f"dataset/raw/{ship_name}/{cruise_name}/{sensor_name}/{bottom_file_name}"
+ s3_manager.download_file(
+ bucket_name=input_bucket_name, key=s3_file_path, file_name=raw_file_name
+ )
+ # TODO: add the bottom file
+ if include_bot:
+ s3_manager.download_file(
+ bucket_name=input_bucket_name,
+ key=s3_bottom_file_path,
+ file_name=bottom_file_name,
+ )
+
+ gc.collect()
+ print("Opening raw file with echopype.")
+ # s3_file_path = f"s3://{bucket_name}/dataset/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}"
+ # s3_file_path = Path(f"s3://noaa-wcsd-pds/dataset/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}")
+ echodata = ep.open_raw(
+ raw_file=raw_file_name,
+ sonar_model=sensor_name,
+ include_bot=include_bot,
+ )
+
+ netcdf_name = f"{Path(raw_file_name).stem}.nc"
+ # Xarray Dataset to netcdf
+ echodata.to_netcdf(
+ save_path=netcdf_name,
+ compress=True,
+ overwrite=True,
+ )
+
+ print("Compute volume backscattering strength (Sv) from raw dataset.")
+ ds_sv = ep.calibrate.compute_Sv(echodata)
+ ds_sv = ep.consolidate.add_depth(
+ ds_sv, echodata
+ ) # TODO: consolidate with other depth values
+ # water_level = ds_sv["water_level"].values
+ gc.collect()
+ print("Done computing volume backscatter strength (Sv) from raw dataset.")
+ # Note: detected_seafloor_depth is located at echodata.vendor.detected_seafloor_depth
+ # but is not written out with ds_sv
+ if "detected_seafloor_depth" in list(echodata.vendor.variables):
+ ds_sv["detected_seafloor_depth"] = (
+ echodata.vendor.detected_seafloor_depth
+ )
+ #
+ # frequencies = echodata.environment.frequency_nominal.values
+ #################################################################
+ # Get GPS coordinates, just overwrite the lat lon values
+ gps_data, lat, lon = geometry_manager.read_echodata_gps_data(
+ echodata=echodata,
+ output_bucket_name=output_bucket_name,
+ ship_name=ship_name,
+ cruise_name=cruise_name,
+ sensor_name=sensor_name,
+ file_name=raw_file_name,
+ endpoint_url=endpoint_url,
+ write_geojson=False,
+ )
+ ds_sv = ep.consolidate.add_location(ds_sv, echodata)
+ ds_sv.latitude.values = (
+ lat # overwriting echopype gps values to include missing values
+ )
+ ds_sv.longitude.values = lon
+ # gps_data, lat, lon = self.__get_gps_data(echodata=echodata)
+
+ # Create the netcdf
+ netcdf_name_computed_Sv = f"{Path(raw_file_name).stem}_computed_Sv.nc"
+
+ # Xarray Dataset to netcdf
+ ds_sv.to_netcdf(
+ path=netcdf_name_computed_Sv,
+ mode="w",
+ )
+ gc.collect()
+ #################################################################
+ # output_netcdf_prefix = f"level_1/{ship_name}/{cruise_name}/{sensor_name}/"
+ #################################################################
+ # If netcdf already exists then delete
+ s3_manager = S3Manager(endpoint_url=endpoint_url)
+ child_objects = s3_manager.get_child_objects(
+ bucket_name=output_bucket_name,
+ sub_prefix=f"level_1/{ship_name}/{cruise_name}/{sensor_name}/{Path(raw_file_name).stem}.nc",
+ )
+ if len(child_objects) > 0:
+ print(
+ "NetCDF dataset already exists in s3, deleting existing and continuing."
+ )
+ s3_manager.delete_nodd_objects(
+ bucket_name=output_bucket_name,
+ objects=child_objects,
+ )
+ child_objects_computed_Sv = s3_manager.get_child_objects(
+ bucket_name=output_bucket_name,
+ sub_prefix=f"level_1/{ship_name}/{cruise_name}/{sensor_name}/{Path(raw_file_name).stem}_computed_Sv.nc",
+ )
+ if len(child_objects_computed_Sv) > 0:
+ print("data already exists in s3, deleting existing and continuing.")
+ s3_manager.delete_nodd_objects(
+ bucket_name=output_bucket_name,
+ objects=child_objects_computed_Sv,
+ )
+ #################################################################
+ s3_manager.upload_file(
+ filename=netcdf_name,
+ bucket_name=output_bucket_name,
+ key=f"level_1/{ship_name}/{cruise_name}/{sensor_name}/{Path(raw_file_name).stem}.nc",
+ )
+ s3_manager.upload_file(
+ filename=netcdf_name_computed_Sv,
+ bucket_name=output_bucket_name,
+ key=f"level_1/{ship_name}/{cruise_name}/{sensor_name}/{Path(raw_file_name).stem}_computed_Sv.nc",
+ )
+ except Exception as err:
+ print(f"Exception encountered creating local netcdf with echopype: {err}")
+ raise RuntimeError(f"Problem creating local netcdf, {err}")
+ finally:
+ gc.collect()
+ cleaner.delete_local_files(
+ file_types=["*.raw", "*.bot", "*.zarr", "*.nc", "*.json"]
+ )
+ print("Done creating local zarr store.")
+
+ ############################################################################
+
+
+ ################################################################################
+ ############################################################################
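A hedged usage sketch for the new RawToNetCDF class follows; the parameter names come from the signature above, but the bucket, table, and file values are placeholders (the Bigelow/HB0706 examples appear only in comments elsewhere in this diff).

    from water_column_sonar_processing.processing import RawToNetCDF

    raw_to_netcdf = RawToNetCDF()
    raw_to_netcdf.raw_to_netcdf(
        table_name="example-pipeline-table",         # hypothetical DynamoDB table
        input_bucket_name="noaa-wcsd-pds",           # source bucket holding the .raw/.bot files
        output_bucket_name="example-output-bucket",  # hypothetical level_1 destination bucket
        ship_name="Henry_B._Bigelow",
        cruise_name="HB0706",
        sensor_name="EK60",
        raw_file_name="D20070724-T042400.raw",
        include_bot=True,                            # also fetch the .bot bottom file
    )
    # On success, two files are uploaded under level_1/{ship}/{cruise}/{sensor}/:
    # D20070724-T042400.nc and D20070724-T042400_computed_Sv.nc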
water_column_sonar_processing/processing/raw_to_zarr.py
@@ -1,7 +1,7 @@
  import gc
  import os
  from datetime import datetime
- from pathlib import Path # , PurePath
+ from pathlib import Path

  import echopype as ep
  import numcodecs
@@ -13,6 +13,16 @@ from water_column_sonar_processing.geometry import GeometryManager
  from water_column_sonar_processing.utility import Cleaner


+ def get_water_level(ds):
+ """
+ needs to be mocked up so thats why this is broken out
+ """
+ if "water_level" in ds.keys():
+ return ds.water_level.values
+ else:
+ return 0.0
+
+
  # This code is getting copied from echofish-aws-raw-to-zarr-lambda
  class RawToZarr:
  #######################################################
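The docstring notes that get_water_level is broken out so it can be mocked in tests; a small illustrative sketch of its two branches (the datasets here are made up, not from the package's test suite):

    import xarray as xr
    from water_column_sonar_processing.processing import get_water_level

    with_level = xr.Dataset({"water_level": 1.5})
    print(get_water_level(with_level))     # -> 1.5, taken from ds.water_level.values

    without_level = xr.Dataset()
    print(get_water_level(without_level))  # -> 0.0 fallback when the variable is absent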
@@ -35,13 +45,11 @@ class RawToZarr:
  ############################################################################
  def __zarr_info_to_table(
  self,
- # output_bucket_name,
  table_name,
  ship_name,
  cruise_name,
- sensor_name,
+ sensor_name, # : Constants, TODO: convert to enum
  file_name,
- # zarr_path,
  min_echo_range,
  max_echo_range,
  num_ping_time_dropna,
@@ -67,13 +75,10 @@ class RawToZarr:
  "#MA": "MAX_ECHO_RANGE",
  "#MI": "MIN_ECHO_RANGE",
  "#ND": "NUM_PING_TIME_DROPNA",
- # "#PS": "PIPELINE_STATUS",
  "#PT": "PIPELINE_TIME",
  "#SE": "SENSOR_NAME",
  "#SH": "SHIP_NAME",
  "#ST": "START_TIME",
- # "#ZB": "ZARR_BUCKET",
- # "#ZP": "ZARR_PATH",
  "#WL": "WATER_LEVEL",
  },
  expression_attribute_values={
@@ -84,33 +89,25 @@ class RawToZarr:
  ":ma": {"N": str(np.round(max_echo_range, 4))},
  ":mi": {"N": str(np.round(min_echo_range, 4))},
  ":nd": {"N": str(num_ping_time_dropna)},
- # ":ps": {"S": "PROCESSING_RESAMPLE_AND_WRITE_TO_ZARR_STORE"},
- # ":ps": {"S": PipelineStatus.LEVEL_1_PROCESSING.name},
  ":pt": {"S": datetime.now().isoformat(timespec="seconds") + "Z"},
  ":se": {"S": sensor_name},
  ":sh": {"S": ship_name},
  ":st": {"S": start_time},
  ":wl": {"N": str(np.round(water_level, 2))},
- # ":zb": {"S": output_bucket_name},
- # ":zp": {"S": zarr_path},
  },
  update_expression=(
  "SET "
  "#CH = :ch, "
  "#ET = :et, "
- # "#ED = :ed, "
  "#FR = :fr, "
  "#MA = :ma, "
  "#MI = :mi, "
  "#ND = :nd, "
- # "#PS = :ps, "
  "#PT = :pt, "
  "#SE = :se, "
  "#SH = :sh, "
  "#ST = :st, "
  "#WL = :wl"
- # "#ZB = :zb, "
- # "#ZP = :zp"
  ),
  )
  print("Done writing Zarr information to DynamoDB table.")
@@ -120,16 +117,20 @@ class RawToZarr:
  ############################################################################
  def __upload_files_to_output_bucket(
  self,
- output_bucket_name,
- local_directory,
- object_prefix,
+ output_bucket_name: str,
+ local_directory: str, # e.g. 'D20070724-T042400.zarr' # TODO: problem: if this is not in the current directory
+ object_prefix: str, # e.g. "level_1/Henry_B._Bigelow/HB0706/EK60/"
  endpoint_url,
  ):
  # Note: this will be passed credentials if using NODD
+ # TODO: this will not work if the local_directory is anywhere other than the current folder
+ # see test_s3_manager test_upload...pool_executor for solution
  s3_manager = S3Manager(endpoint_url=endpoint_url)
  print("Uploading files using thread pool executor.")
  all_files = []
- for subdir, dirs, files in os.walk(local_directory):
+ for subdir, dirs, files in os.walk(
+ local_directory
+ ): # os.path.basename(s3_manager_test_path.joinpath("HB0707.zarr/"))
  for file in files:
  local_path = os.path.join(subdir, file)
  s3_key = os.path.join(object_prefix, local_path)
@@ -141,6 +142,8 @@ class RawToZarr:
  )
  return all_uploads

+ ############################################################################
+
  ############################################################################
  def raw_to_zarr(
  self,
@@ -167,11 +170,11 @@ class RawToZarr:

  s3_manager = S3Manager(endpoint_url=endpoint_url)
  s3_file_path = (
- f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/{raw_file_name}"
+ f"dataset/raw/{ship_name}/{cruise_name}/{sensor_name}/{raw_file_name}"
  )
  bottom_file_name = f"{Path(raw_file_name).stem}.bot"
  s3_bottom_file_path = (
- f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/{bottom_file_name}"
+ f"dataset/raw/{ship_name}/{cruise_name}/{sensor_name}/{bottom_file_name}"
  )
  s3_manager.download_file(
  bucket_name=input_bucket_name, key=s3_file_path, file_name=raw_file_name
@@ -187,8 +190,8 @@ class RawToZarr:
  try:
  gc.collect()
  print("Opening raw file with echopype.")
- # s3_file_path = f"s3://{bucket_name}/data/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}"
- # s3_file_path = Path(f"s3://noaa-wcsd-pds/data/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}")
+ # s3_file_path = f"s3://{bucket_name}/dataset/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}"
+ # s3_file_path = Path(f"s3://noaa-wcsd-pds/dataset/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}")
  echodata = ep.open_raw(
  raw_file=raw_file_name,
  sonar_model=sensor_name,
@@ -197,14 +200,16 @@ class RawToZarr:
  # max_chunk_size=300,
  # storage_options={'anon': True } # 'endpoint_url': self.endpoint_url} # this was creating problems
  )
- print("Compute volume backscattering strength (Sv) from raw data.")
+ print("Compute volume backscattering strength (Sv) from raw dataset.")
  ds_sv = ep.calibrate.compute_Sv(echodata)
  ds_sv = ep.consolidate.add_depth(
  ds_sv, echodata
  ) # TODO: consolidate with other depth values
- water_level = ds_sv["water_level"].values
+
+ water_level = get_water_level(ds_sv)
+
  gc.collect()
- print("Done computing volume backscatter strength (Sv) from raw data.")
+ print("Done computing volume backscatter strength (Sv) from raw dataset.")
  # Note: detected_seafloor_depth is located at echodata.vendor.detected_seafloor_depth
  # but is not written out with ds_sv
  if "detected_seafloor_depth" in list(echodata.vendor.variables):
@@ -237,7 +242,14 @@ class RawToZarr:
  # TODO revert this so that smaller diffs can be used
  # The most minimum the resolution can be is as small as 0.25 meters
  min_echo_range = np.round(np.nanmin(np.diff(ds_sv.echo_range.values)), 2)
+ # For the HB0710 cruise the depths vary from 499.7215 @19cm to 2999.4805 @ 1cm. Moving that back
+ # inline with the
+ min_echo_range = np.max(
+ [0.20, min_echo_range]
+ ) # TODO: experiment with 0.25 and 0.50
+
  max_echo_range = float(np.nanmax(ds_sv.echo_range))
+
  # This is the number of missing values found throughout the lat/lon
  num_ping_time_dropna = lat[~np.isnan(lat)].shape[0] # symmetric to lon
  #
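A quick worked example of the new clamp on vertical resolution: np.max([0.20, min_echo_range]) keeps bins no finer than 20 cm, so the 1 cm spacing mentioned for HB0710 is coarsened while anything already coarser passes through unchanged.

    import numpy as np

    print(np.max([0.20, 0.01]))  # 0.2 -> 1 cm native spacing is clamped up to 20 cm
    print(np.max([0.20, 0.19]))  # 0.2 -> 19 cm spacing is also raised to the floor
    print(np.max([0.20, 0.50]))  # 0.5 -> coarser native spacing is left alone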
@@ -269,7 +281,7 @@ class RawToZarr:
  )
  if len(child_objects) > 0:
  print(
- "Zarr store data already exists in s3, deleting existing and continuing."
+ "Zarr store dataset already exists in s3, deleting existing and continuing."
  )
  s3_manager.delete_nodd_objects(
  bucket_name=output_bucket_name,
@@ -284,13 +296,11 @@ class RawToZarr:
  )
  #################################################################
  self.__zarr_info_to_table(
- # output_bucket_name=output_bucket_name,
  table_name=table_name,
  ship_name=ship_name,
  cruise_name=cruise_name,
  sensor_name=sensor_name,
  file_name=raw_file_name,
- # zarr_path=os.path.join(output_zarr_prefix, store_name),
  min_echo_range=min_echo_range,
  max_echo_range=max_echo_range,
  num_ping_time_dropna=num_ping_time_dropna,
@@ -334,7 +344,7 @@ class RawToZarr:
  # #######################################################################
  # store_name = f"{os.path.splitext(input_file_name)[0]}.zarr"
  # output_zarr_prefix = f"level_1/{ship_name}/{cruise_name}/{sensor_name}"
- # bucket_key = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/{input_file_name}"
+ # bucket_key = f"dataset/raw/{ship_name}/{cruise_name}/{sensor_name}/{input_file_name}"
  # zarr_prefix = os.path.join("level_1", ship_name, cruise_name, sensor_name)
  # #
  # os.chdir(TEMPDIR) # Lambdas require use of temp directory
@@ -348,7 +358,7 @@ class RawToZarr:
  # secret_access_key=self.__output_bucket_secret_access_key
  # )
  # if len(s3_objects) > 0:
- # print('Zarr store data already exists in s3, deleting existing and continuing.')
+ # print('Zarr store dataset already exists in s3, deleting existing and continuing.')
  # self.__s3.delete_objects(
  # bucket_name=self.__output_bucket,
  # objects=s3_objects,
water_column_sonar_processing/utility/__init__.py
@@ -1,6 +1,13 @@
  from .cleaner import Cleaner
- from .constants import Constants, Coordinates
+ from .constants import Constants, Coordinates, Instruments
  from .pipeline_status import PipelineStatus
  from .timestamp import Timestamp

- __all__ = ["Cleaner", "Constants", "Coordinates", "PipelineStatus", "Timestamp"]
+ __all__ = [
+ "Cleaner",
+ "Instruments",
+ "Constants",
+ "Coordinates",
+ "PipelineStatus",
+ "Timestamp",
+ ]
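For reference, the updated utility exports can be imported together; Instruments is the newly exposed name (its definition lives in constants.py, which is not shown in this excerpt).

    from water_column_sonar_processing.utility import (
        Cleaner,
        Constants,
        Coordinates,
        Instruments,
        PipelineStatus,
        Timestamp,
    )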
water_column_sonar_processing/utility/cleaner.py
@@ -5,8 +5,7 @@ import shutil

  ###########################################################
  class Cleaner:
- @staticmethod
- def delete_local_files(file_types=["*.raw*", "*.model"]): # '*.json'
+ def delete_local_files(self, file_types=["*.raw*", "*.model"]): # '*.json'
  # TODO: add .zarr to this
  print("Deleting all local raw and model files")
  for i in file_types:
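Since delete_local_files is no longer a @staticmethod, callers now need a Cleaner instance; a minimal sketch matching how raw_to_netcdf.py above already uses it:

    from water_column_sonar_processing.utility import Cleaner

    cleaner = Cleaner()
    # glob patterns to remove from the current working directory
    cleaner.delete_local_files(file_types=["*.raw", "*.bot", "*.nc", "*.json"])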