water-column-sonar-processing 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of water-column-sonar-processing might be problematic. Click here for more details.

Files changed (21) hide show
  1. water_column_sonar_processing/__init__.py +4 -5
  2. water_column_sonar_processing/aws/dynamodb_manager.py +149 -43
  3. water_column_sonar_processing/aws/s3_manager.py +71 -37
  4. water_column_sonar_processing/cruise/create_empty_zarr_store.py +6 -4
  5. water_column_sonar_processing/cruise/resample_regrid.py +3 -3
  6. water_column_sonar_processing/geometry/geometry_manager.py +21 -6
  7. water_column_sonar_processing/geometry/pmtile_generation.py +202 -13
  8. water_column_sonar_processing/index/index_manager.py +25 -13
  9. water_column_sonar_processing/model/zarr_manager.py +26 -25
  10. water_column_sonar_processing/process.py +4 -4
  11. water_column_sonar_processing/processing/__init__.py +4 -0
  12. water_column_sonar_processing/processing/cruise_sampler.py +342 -0
  13. water_column_sonar_processing/processing/raw_to_zarr.py +349 -0
  14. water_column_sonar_processing/utility/cleaner.py +1 -0
  15. water_column_sonar_processing/utility/constants.py +6 -2
  16. {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-0.0.8.dist-info}/METADATA +21 -10
  17. water_column_sonar_processing-0.0.8.dist-info/RECORD +32 -0
  18. {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-0.0.8.dist-info}/WHEEL +1 -1
  19. water_column_sonar_processing-0.0.6.dist-info/RECORD +0 -29
  20. {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-0.0.8.dist-info}/LICENSE +0 -0
  21. {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-0.0.8.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,342 @@
1
+ import gc
2
+ import os
3
+ import echopype as ep
4
+ import numpy as np
5
+ from numcodecs import Blosc
6
+
7
+ from src.water_column_sonar_processing.utility import Cleaner
8
+
9
+ TEMPDIR = "/tmp"
10
+
11
+
12
# This code is getting copied from echofish-aws-raw-to-zarr-lambda
class CruiseSampler:
    """Converts raw water-column sonar files to Zarr stores and records
    per-file metadata.

    NOTE(review): methods below read self.__s3, self.__dynamo,
    self.__table_name, self.__output_bucket and the output-bucket
    credentials, none of which are initialized in this constructor —
    confirm where they are meant to be injected before using this class.
    """
    #######################################################
    def __init__(
        self,
    ):
        """Set up the Zarr compressor and read the input bucket name from
        the environment (INPUT_BUCKET_NAME; None when unset)."""
        # TODO: revert to Blosc.BITSHUFFLE, troubleshooting misc error
        self.__compressor = Blosc(cname="zstd", clevel=2) # shuffle=Blosc.NOSHUFFLE
        self.bucket_name = os.environ.get("INPUT_BUCKET_NAME")
        # self.__s3 = s3_operations
22
+
23
+ ############################################################################
24
+ ############################################################################
25
+ def __zarr_info_to_table(
26
+ self,
27
+ file_name,
28
+ cruise_name,
29
+ zarr_path,
30
+ min_echo_range,
31
+ max_echo_range,
32
+ num_ping_time_dropna,
33
+ start_time,
34
+ end_time,
35
+ frequencies,
36
+ channels
37
+ ):
38
+ print('Writing Zarr information to DynamoDB table.')
39
+ self.__dynamo.update_item(
40
+ table_name=self.__table_name,
41
+ key={
42
+ 'FILE_NAME': {'S': file_name}, # Partition Key
43
+ 'CRUISE_NAME': {'S': cruise_name}, # Sort Key
44
+ # TODO: should be FILE_NAME & SENSOR_NAME so they are truely unique for when two sensors are processed within one cruise
45
+ },
46
+ expression='SET #ZB = :zb, #ZP = :zp, #MINER = :miner, #MAXER = :maxer, #P = :p, #ST = :st, #ET = :et, #F = :f, #C = :c',
47
+ attribute_names={
48
+ '#ZB': 'ZARR_BUCKET',
49
+ '#ZP': 'ZARR_PATH',
50
+ '#MINER': 'MIN_ECHO_RANGE',
51
+ '#MAXER': 'MAX_ECHO_RANGE',
52
+ '#P': 'NUM_PING_TIME_DROPNA',
53
+ '#ST': 'START_TIME',
54
+ '#ET': 'END_TIME',
55
+ '#F': 'FREQUENCIES',
56
+ '#C': 'CHANNELS',
57
+ },
58
+ attribute_values={
59
+ ':zb': {
60
+ 'S': self.__output_bucket
61
+ },
62
+ ':zp': {
63
+ 'S': zarr_path
64
+ },
65
+ ':miner': {
66
+ 'N': str(np.round(min_echo_range, 4))
67
+ },
68
+ ':maxer': {
69
+ 'N': str(np.round(max_echo_range, 4))
70
+ },
71
+ ':p': {
72
+ 'N': str(num_ping_time_dropna)
73
+ },
74
+ ':st': {
75
+ 'S': start_time
76
+ },
77
+ ':et': {
78
+ 'S': end_time
79
+ },
80
+ ':f': {
81
+ 'L': [{'N': str(i)} for i in frequencies]
82
+ },
83
+ ':c': {
84
+ 'L': [{'S': i} for i in channels]
85
+ }
86
+ }
87
+ )
88
+
89
+ ############################################################################
90
+ ############################################################################
91
+ ############################################################################
92
+ def raw_to_zarr(
93
+ self,
94
+ ship_name,
95
+ cruise_name,
96
+ sensor_name,
97
+ file_name,
98
+ ):
99
+ print(f'Opening raw: {file_name} and creating zarr store.')
100
+ geometry_manager = GeometryManager()
101
+ try:
102
+ gc.collect()
103
+ print('Opening raw file with echopype.')
104
+ bucket_name="test_input_bucket" # noaa-wcsd-pds
105
+ s3_file_path = f"s3://{bucket_name}/data/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}"
106
+ # s3_file_path = Path(f"s3://noaa-wcsd-pds/data/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}")
107
+ # TODO: add the bottom file here
108
+ echodata = ep.open_raw(
109
+ raw_file=s3_file_path,
110
+ sonar_model=sensor_name,
111
+ # include_bot=True,
112
+ use_swap=True,
113
+ # max_chunk_size=100,
114
+ # storage_options={'anon': True} # this was creating problems
115
+ )
116
+ print('Compute volume backscattering strength (Sv) from raw data.')
117
+ ds_sv = ep.calibrate.compute_Sv(echodata)
118
+ print('Done computing volume backscattering strength (Sv) from raw data.')
119
+ frequencies = echodata.environment.frequency_nominal.values
120
+ #################################################################
121
+ # Get GPS coordinates
122
+ gps_data, lat, lon = geometry_manager.read_echodata_gps_data(
123
+ echodata=echodata,
124
+ ship_name=ship_name,
125
+ cruise_name=cruise_name,
126
+ sensor_name=sensor_name,
127
+ file_name=file_name,
128
+ write_geojson=True
129
+ )
130
+ # gps_data, lat, lon = self.__get_gps_data(echodata=echodata)
131
+ #################################################################
132
+ # Technically the min_echo_range would be 0 m.
133
+ # TODO: this var name is supposed to represent minimum resolution of depth measurements
134
+ # The most minimum the resolution can be is as small as 0.25 meters
135
+ min_echo_range = np.maximum(0.25, np.nanmin(np.diff(ds_sv.echo_range.values)))
136
+ max_echo_range = float(np.nanmax(ds_sv.echo_range))
137
+ #
138
+ num_ping_time_dropna = lat[~np.isnan(lat)].shape[0] # symmetric to lon
139
+ #
140
+ start_time = np.datetime_as_string(ds_sv.ping_time.values[0], unit='ms') + "Z"
141
+ end_time = np.datetime_as_string(ds_sv.ping_time.values[-1], unit='ms') + "Z"
142
+ channels = list(ds_sv.channel.values)
143
+ #
144
+ #################################################################
145
+ # Create the zarr store
146
+ ds_sv.to_zarr(store=store_name)
147
+ #################################################################
148
+ print('Note: Adding GeoJSON inside Zarr store')
149
+ self.__write_geojson_to_file(store_name=store_name, data=gps_data)
150
+ #################################################################
151
+ self.__zarr_info_to_table(
152
+ file_name=raw_file_name,
153
+ cruise_name=cruise_name,
154
+ zarr_path=os.path.join(output_zarr_prefix, store_name),
155
+ min_echo_range=min_echo_range,
156
+ max_echo_range=max_echo_range,
157
+ num_ping_time_dropna=num_ping_time_dropna,
158
+ start_time=start_time,
159
+ end_time=end_time,
160
+ frequencies=frequencies,
161
+ channels=channels
162
+ )
163
+ except Exception as err:
164
+ print(f'Exception encountered creating local Zarr store with echopype: {err}')
165
+ raise RuntimeError(f"Problem creating local Zarr store, {err}")
166
+ print('Done creating local zarr store.')
167
+
168
+ ############################################################################
169
+ def __upload_files_to_output_bucket(
170
+ self,
171
+ local_directory,
172
+ object_prefix,
173
+ ):
174
+ # Note: this will be passed credentials if using NODD
175
+ print('Uploading files using thread pool executor.')
176
+ all_files = []
177
+ for subdir, dirs, files in os.walk(local_directory):
178
+ for file in files:
179
+ local_path = os.path.join(subdir, file)
180
+ s3_key = os.path.join(object_prefix, local_path)
181
+ all_files.append([local_path, s3_key])
182
+ # all_files
183
+ all_uploads = self.__s3.upload_files_with_thread_pool_executor(
184
+ bucket_name=self.__output_bucket,
185
+ all_files=all_files,
186
+ access_key_id=self.__output_bucket_access_key,
187
+ secret_access_key=self.__output_bucket_secret_access_key
188
+ )
189
+ return all_uploads
190
+
191
+ ############################################################################
192
+ def execute(self, input_message):
193
+ ship_name = input_message['shipName']
194
+ cruise_name = input_message['cruiseName']
195
+ sensor_name = input_message['sensorName']
196
+ input_file_name = input_message['fileName']
197
+ #
198
+ try:
199
+ self.__update_processing_status(
200
+ file_name=input_file_name,
201
+ cruise_name=cruise_name,
202
+ pipeline_status="PROCESSING_RAW_TO_ZARR"
203
+ )
204
+ #######################################################################
205
+ store_name = f"{os.path.splitext(input_file_name)[0]}.zarr"
206
+ output_zarr_prefix = f"level_1/{ship_name}/{cruise_name}/{sensor_name}"
207
+ bucket_key = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/{input_file_name}"
208
+ zarr_prefix = os.path.join("level_1", ship_name, cruise_name, sensor_name)
209
+ #
210
+ os.chdir(TEMPDIR) # Lambdas require use of temp directory
211
+ #######################################################################
212
+ #######################################################################
213
+ # Check if zarr store already exists
214
+ s3_objects = self.__s3.list_objects(
215
+ bucket_name=self.__output_bucket,
216
+ prefix=f"{zarr_prefix}/{os.path.splitext(input_file_name)[0]}.zarr/",
217
+ access_key_id=self.__output_bucket_access_key,
218
+ secret_access_key=self.__output_bucket_secret_access_key
219
+ )
220
+ if len(s3_objects) > 0:
221
+ print('Zarr store data already exists in s3, deleting existing and continuing.')
222
+ self.__s3.delete_objects(
223
+ bucket_name=self.__output_bucket,
224
+ objects=s3_objects,
225
+ access_key_id=self.__output_bucket_access_key,
226
+ secret_access_key=self.__output_bucket_secret_access_key
227
+ )
228
+ #######################################################################
229
+ # self.__delete_all_local_raw_and_zarr_files()
230
+ Cleaner.delete_local_files(file_types=["*.raw*", "*.zarr"])
231
+ self.__s3.download_file(
232
+ bucket_name=self.__input_bucket,
233
+ key=bucket_key,
234
+ file_name=input_file_name
235
+ )
236
+ self.__create_local_zarr_store(
237
+ raw_file_name=input_file_name,
238
+ cruise_name=cruise_name,
239
+ sensor_name=sensor_name,
240
+ output_zarr_prefix=output_zarr_prefix,
241
+ store_name=store_name
242
+ )
243
+ #######################################################################
244
+ self.__upload_files_to_output_bucket(store_name, output_zarr_prefix)
245
+ #######################################################################
246
+ # # TODO: verify count of objects matches
247
+ # s3_objects = self.__s3.list_objects(
248
+ # bucket_name=self.__output_bucket,
249
+ # prefix=f"{zarr_prefix}/{os.path.splitext(input_file_name)[0]}.zarr/",
250
+ # access_key_id=self.__output_bucket_access_key,
251
+ # secret_access_key=self.__output_bucket_secret_access_key
252
+ # )
253
+ #######################################################################
254
+ self.__update_processing_status(
255
+ file_name=input_file_name,
256
+ cruise_name=cruise_name,
257
+ pipeline_status='SUCCESS_RAW_TO_ZARR'
258
+ )
259
+ #######################################################################
260
+ self.__publish_done_message(input_message)
261
+ #######################################################################
262
+ # except Exception as err:
263
+ # print(f'Exception encountered: {err}')
264
+ # self.__update_processing_status(
265
+ # file_name=input_file_name,
266
+ # cruise_name=cruise_name,
267
+ # pipeline_status='FAILURE_RAW_TO_ZARR',
268
+ # error_message=str(err),
269
+ # )
270
+ finally:
271
+ self.__delete_all_local_raw_and_zarr_files()
272
+ #######################################################################
273
+
274
+ ############################################################################
275
+
276
+ ################################################################################
277
+ ############################################################################
278
+ # TODO: DELETE
279
+ # def __get_gps_data(
280
+ # self,
281
+ # echodata: ep.echodata.echodata.EchoData
282
+ # ) -> tuple:
283
+ # print('Getting GPS data.')
284
+ # try:
285
+ # # if 'latitude' not in echodata.platform.variables and 'longitude' not in echodata.platform.variables:
286
+ # # raise KeyError;
287
+ # assert( # TODO: raise error, e.g. KeyError
288
+ # 'latitude' in echodata.platform.variables and 'longitude' in echodata.platform.variables
289
+ # ), "Problem: GPS coordinates not found in echodata."
290
+ # latitude = echodata.platform.latitude.values
291
+ # longitude = echodata.platform.longitude.values # len(longitude) == 14691
292
+ # # RE: time coordinates: https://github.com/OSOceanAcoustics/echopype/issues/656#issue-1219104771
293
+ # assert(
294
+ # 'time1' in echodata.platform.variables and 'time1' in echodata.environment.variables
295
+ # ), "Problem: Time coordinate not found in echodata."
296
+ # # 'nmea_times' are times from the nmea datalogger associated with GPS
297
+ # # nmea times, unlike env times, can be sorted
298
+ # nmea_times = np.sort(echodata.platform.time1.values)
299
+ # # 'time1' are times from the echosounder associated with transducer measurement
300
+ # time1 = echodata.environment.time1.values
301
+ # # Align 'sv_times' to 'nmea_times'
302
+ # assert(
303
+ # np.all(time1[:-1] <= time1[1:]) and np.all(nmea_times[:-1] <= nmea_times[1:])
304
+ # ), "Problem: NMEA time stamps are not sorted."
305
+ # # Finds the indices where 'v' can be inserted just to the right of 'a'
306
+ # indices = np.searchsorted(a=nmea_times, v=time1, side="right") - 1
307
+ # #
308
+ # lat = latitude[indices]
309
+ # lat[indices < 0] = np.nan # values recorded before indexing are set to nan
310
+ # lon = longitude[indices]
311
+ # lon[indices < 0] = np.nan
312
+ # if len(lat) < 2 or len(lon) < 2:
313
+ # raise Exception("There was not enough data in lat or lon to create geojson.")
314
+ # assert( # TODO: raise ValueError
315
+ # np.all(lat[~np.isnan(lat)] >= -90.) and np.all(lat[~np.isnan(lat)] <= 90.) and np.all(lon[~np.isnan(lon)] >= -180.) and np.all(lon[~np.isnan(lon)] <= 180.)
316
+ # ), "Problem: Data falls outside GPS bounds!"
317
+ # # TODO: check for visits to null island
318
+ # # https://osoceanacoustics.github.io/echopype-examples/echopype_tour.html
319
+ # print(np.count_nonzero(np.isnan(lat)))
320
+ # print(np.count_nonzero(np.isnan(lon)))
321
+ # if len(lat[~np.isnan(lat)]) < 1:
322
+ # raise RuntimeError(f"Problem all data is NaN.")
323
+ # time1 = time1[~np.isnan(lat)]
324
+ # lat = lat[~np.isnan(lat)]
325
+ # lon = lon[~np.isnan(lon)]
326
+ # #
327
+ # gps_df = pd.DataFrame({
328
+ # 'latitude': lat,
329
+ # 'longitude': lon,
330
+ # 'time1': time1
331
+ # }).set_index(['time1'])
332
+ # gps_gdf = geopandas.GeoDataFrame(
333
+ # gps_df,
334
+ # geometry=geopandas.points_from_xy(gps_df['longitude'], gps_df['latitude']),
335
+ # crs="epsg:4326" # TODO: does this sound right?
336
+ # )
337
+ # # GeoJSON FeatureCollection with IDs as "time1"
338
+ # geo_json = gps_gdf.to_json()
339
+ # except Exception as err:
340
+ # print(f'Exception encountered creating local Zarr store with echopype: {err}')
341
+ # raise
342
+ # return geo_json, lat, lon