water-column-sonar-processing 0.0.13__py3-none-any.whl → 24.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of water-column-sonar-processing has been flagged as potentially problematic.

@@ -16,6 +16,7 @@ class S3FSManager:
         # self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
         self.s3_region = os.environ.get("AWS_REGION", default="us-east-1")
         self.s3fs = s3fs.S3FileSystem(
+            asynchronous=False,
             endpoint_url=endpoint_url,
             key=os.environ.get("OUTPUT_BUCKET_ACCESS_KEY"),
             secret=os.environ.get("OUTPUT_BUCKET_SECRET_ACCESS_KEY"),
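For context, a minimal sketch of the synchronous filesystem this constructor now configures explicitly (`asynchronous=False` is already the `s3fs` default, so the change only makes the blocking mode explicit; the bucket and prefix below are placeholders, not values from the package):

```python
import os

import s3fs

# Blocking filesystem handle, mirroring the constructor above.
fs = s3fs.S3FileSystem(
    asynchronous=False,  # use the blocking API (fs.ls, fs.get, ...)
    key=os.environ.get("OUTPUT_BUCKET_ACCESS_KEY"),
    secret=os.environ.get("OUTPUT_BUCKET_SECRET_ACCESS_KEY"),
)

# List objects under a placeholder bucket/prefix.
print(fs.ls("my-example-bucket/some/prefix"))
```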
@@ -1,4 +1,5 @@
 import os
+import tempfile
 
 import numcodecs
 import numpy as np
@@ -11,7 +12,6 @@ from water_column_sonar_processing.utility import Cleaner
 numcodecs.blosc.use_threads = False
 numcodecs.blosc.set_nthreads(1)
 
-# TEMPDIR = "/tmp"
 # TODO: when ready switch to version 3 of model spec
 # ZARR_V3_EXPERIMENTAL_API = 1
 # creates the latlon data: foo = ep.consolidate.add_location(ds_Sv, echodata)
@@ -61,7 +61,6 @@ class CreateEmptyZarrStore:
     # TODO: move to common place
 
     #######################################################
-    # @classmethod
     def create_cruise_level_zarr_store(
         self,
         output_bucket_name: str,
@@ -69,8 +68,8 @@ class CreateEmptyZarrStore:
         cruise_name: str,
         sensor_name: str,
         table_name: str,
-        tempdir: str,
     ) -> None:
+        tempdir = tempfile.TemporaryDirectory()
         try:
             # HB0806 - 123, HB0903 - 220
             dynamo_db_manager = DynamoDBManager()
@@ -146,7 +145,7 @@ class CreateEmptyZarrStore:
             print(f"new_height: {new_height}")
 
             zarr_manager.create_zarr_store(
-                path=tempdir,
+                path=tempdir.name,  # TODO: need to use .name or problem
                 ship_name=ship_name,
                 cruise_name=cruise_name,
                 sensor_name=sensor_name,
@@ -159,7 +158,7 @@ class CreateEmptyZarrStore:
             #################################################################
             self.upload_zarr_store_to_s3(
                 output_bucket_name=output_bucket_name,
-                local_directory=tempdir,
+                local_directory=tempdir.name,  # TODO: need to use .name or problem
                 object_prefix=zarr_prefix,
                 cruise_name=cruise_name,
            )
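The `tempdir.name` TODOs above flag a real gotcha: a `TemporaryDirectory` object is not a path string, and the directory is deleted as soon as the object is garbage collected. A minimal sketch of the safer context-manager form from the standard library (the function name and body here are illustrative, not from the package):

```python
import tempfile


def create_store_safely() -> None:
    # The context manager yields the path string directly and
    # guarantees the directory survives for the whole block.
    with tempfile.TemporaryDirectory() as tempdir_path:
        print(f"writing zarr store under {tempdir_path}")
        # ... create the store and upload it to S3 here, while the
        # directory is still guaranteed to exist ...
    # The directory and its contents are removed at this point.
```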
@@ -0,0 +1,24 @@
+### https://xarray-datatree.readthedocs.io/en/latest/data-structures.html
+import numpy as np
+from datatree import DataTree
+import xarray as xr
+
+class DatatreeManager:
+    #######################################################
+    def __init__(
+        self,
+    ):
+        self.dtype = "float32"
+
+    #################################################################
+    def create_datatree(
+        self,
+        input_ds,
+    ) -> None:
+        ds1 = xr.Dataset({"foo": "orange"})
+        dt = DataTree(name="root", data=ds1)  # create root node
+        ds2 = xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})
+        return dt
+
+
+
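Note that in `create_datatree` above, `ds2` is built but never attached, and the `-> None` annotation disagrees with the `return dt`. A small self-contained sketch, assuming the archived `xarray-datatree` API that the module imports (where `DataTree` accepts `name`, `parent`, and `data`):

```python
import xarray as xr
from datatree import DataTree  # archived xarray-datatree; newer xarray ships xr.DataTree

ds1 = xr.Dataset({"foo": "orange"})
ds2 = xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})

root = DataTree(name="root", data=ds1)
# Attach ds2 as a child node so it is actually part of the tree.
DataTree(name="child", parent=root, data=ds2)

print(root)  # prints the root node and its "child" subtree
```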
@@ -281,12 +281,7 @@ class ResampleRegrid:
             print(f"start_ping_time_index: {start_ping_time_index}, end_ping_time_index: {end_ping_time_index}")
             #########################################################################
             # write Sv values to cruise-level-model-store
-            for channel in range(
-                len(input_xr.channel.values)
-            ):  # does not like being written in one fell swoop :(
-                output_zarr_store.Sv[
-                    :, start_ping_time_index:end_ping_time_index, channel
-                ] = regrid_resample[:, :, channel]
+            output_zarr_store.Sv[:, start_ping_time_index:end_ping_time_index, :] = regrid_resample.values
 
             #########################################################################
             # [5] write subset of latitude/longitude
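A minimal sketch of the difference, using plain `zarr` and `numpy` with made-up shapes: the old loop issued one region write per channel, while the new form assigns the whole slab in a single call and lets zarr update each affected chunk once.

```python
import numpy as np
import zarr

n_depth, n_time, n_freq = 100, 50, 4
sv = zarr.zeros(shape=(n_depth, 1000, n_freq), chunks=(100, 100, 1), dtype="float32")
block = np.random.rand(n_depth, n_time, n_freq).astype("float32")
start, end = 200, 250

# Old pattern: one region write per channel.
for channel in range(n_freq):
    sv[:, start:end, channel] = block[:, :, channel]

# New pattern: a single region write covering all channels.
sv[:, start:end, :] = block
```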
@@ -300,27 +295,27 @@ class ResampleRegrid:
             #########################################################################
             # TODO: add the "detected_seafloor_depth/" to the
             # L2 cruise dataarrays
-            # TODO: make bottom optional if 'detected_seafloor_depth' in input_xr.variables:
+            # TODO: make bottom optional
             # TODO: Only checking the first channel for now. Need to average across all channels
             # in the future. See https://github.com/CI-CMG/water-column-sonar-processing/issues/11
-            # detected_seafloor_depths = input_xr.detected_seafloor_depth.values[0, :]  # note can include nans?
-            detected_seafloor_depth = input_xr.detected_seafloor_depth.values
-            detected_seafloor_depth[detected_seafloor_depth == 0.] = np.nan
-            detected_seafloor_depths = np.nanmean(detected_seafloor_depth, 0)
-            detected_seafloor_depths[detected_seafloor_depths == 0.] = np.nan
-            print(f"min depth measured: {np.nanmin(detected_seafloor_depths)}")
-            print(f"max depth measured: {np.nanmax(detected_seafloor_depths)}")
-            # available_indices = np.argwhere(np.isnan(geospatial['latitude'].values))
-            output_zarr_store.bottom[
-                start_ping_time_index:end_ping_time_index
-            ] = detected_seafloor_depths
+            if 'detected_seafloor_depth' in input_xr.variables:
+                print('Found detected_seafloor_depth, adding data to output store.')
+                detected_seafloor_depth = input_xr.detected_seafloor_depth.values
+                detected_seafloor_depth[detected_seafloor_depth == 0.] = np.nan
+                # TODO: problem here: Processing file: D20070711-T210709.
+                detected_seafloor_depths = np.nanmean(detected_seafloor_depth, 0)  # RuntimeWarning: Mean of empty slice
+                detected_seafloor_depths[detected_seafloor_depths == 0.] = np.nan
+                print(f"min depth measured: {np.nanmin(detected_seafloor_depths)}")
+                print(f"max depth measured: {np.nanmax(detected_seafloor_depths)}")
+                # available_indices = np.argwhere(np.isnan(geospatial['latitude'].values))
+                output_zarr_store.bottom[
+                    start_ping_time_index:end_ping_time_index
+                ] = detected_seafloor_depths
             #########################################################################
             #########################################################################
         except Exception as err:
             print(f"Problem interpolating the data: {err}")
             raise err
-        # else:
-        #     pass
         finally:
             print("Done interpolating data.")
             # TODO: read across times and verify data was written?
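The `Mean of empty slice` warning noted in the hunk above fires whenever an entire column is NaN; `np.nanmean` still returns NaN for that column, which is the desired value here. A small sketch of making that case explicit and silencing the warning (plain numpy, illustrative data):

```python
import warnings

import numpy as np

depths = np.array([[np.nan, 3.0], [np.nan, 5.0]])  # first column is all NaN

with warnings.catch_warnings():
    # All-NaN columns trigger "RuntimeWarning: Mean of empty slice"
    # and yield NaN, which is what we want to store.
    warnings.simplefilter("ignore", category=RuntimeWarning)
    column_means = np.nanmean(depths, axis=0)

print(column_means)  # [nan  4.]
```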
@@ -1,5 +1,6 @@
+from .elevation_manager import ElevationManager
 from .geometry_manager import GeometryManager
 from .geometry_simplification import GeometrySimplification
 from .pmtile_generation import PMTileGeneration
 
-__all__ = ["GeometryManager", "GeometrySimplification", "PMTileGeneration"]
+__all__ = ["ElevationManager", "GeometryManager", "GeometrySimplification", "PMTileGeneration"]
@@ -0,0 +1,112 @@
+"""
+https://gis.ngdc.noaa.gov/arcgis/rest/services/DEM_mosaics/DEM_global_mosaic/ImageServer/identify?geometry=-31.70235%2C13.03332&geometryType=esriGeometryPoint&returnGeometry=false&returnCatalogItems=false&f=json
+
+https://gis.ngdc.noaa.gov/arcgis/rest/services/DEM_mosaics/DEM_global_mosaic/ImageServer/
+identify?
+geometry=-31.70235%2C13.03332
+&geometryType=esriGeometryPoint
+&returnGeometry=false
+&returnCatalogItems=false
+&f=json
+{"objectId":0,"name":"Pixel","value":"-5733","location":{"x":-31.702349999999999,"y":13.03332,"spatialReference":{"wkid":4326,"latestWkid":4326}},"properties":null,"catalogItems":null,"catalogItemVisibilities":[]}
+-5733
+
+(base) rudy:deleteME rudy$ curl https://api.opentopodata.org/v1/gebco2020?locations=13.03332,-31.70235
+{
+  "results": [
+    {
+      "dataset": "gebco2020",
+      "elevation": -5729.0,
+      "location": {
+        "lat": 13.03332,
+        "lng": -31.70235
+      }
+    }
+  ],
+  "status": "OK"
+}
+"""
+import json
+import time
+
+import requests
+from collections.abc import Generator
+
+def chunked(
+    ll: list,
+    n: int
+) -> Generator:
+    # Yields successively n-sized chunks from ll.
+    for i in range(0, len(ll), n):
+        yield ll[i : i + n]
+
+
+class ElevationManager:
+    #######################################################
+    def __init__(
+        self,
+    ):
+        self.DECIMAL_PRECISION = 5  # precision for GPS coordinates
+        self.TIMOUT_SECONDS = 10
+
+    #######################################################
+    def get_arcgis_elevation(
+        self,
+        lngs: list,
+        lats: list,
+        chunk_size: int = 500,  # I think this is the api limit
+    ) -> int:
+        # Reference: https://developers.arcgis.com/rest/services-reference/enterprise/map-to-image/
+        # Info: https://www.arcgis.com/home/item.html?id=c876e3c96a8642ab8557646a3b4fa0ff
+        ### 'https://gis.ngdc.noaa.gov/arcgis/rest/services/DEM_mosaics/DEM_global_mosaic/ImageServer/identify?geometry={"points":[[-31.70235,13.03332],[-32.70235,14.03332]]}&geometryType=esriGeometryMultipoint&returnGeometry=false&returnCatalogItems=false&f=json'
+        if len(lngs) != len(lats):
+            raise ValueError("lngs and lats must have same length")
+
+        geometryType = "esriGeometryMultipoint"  # TODO: allow single point?
+
+        depths = []
+
+        list_of_points = [list(elem) for elem in list(zip(lngs, lats))]
+        for chunk in chunked(list_of_points, chunk_size):
+            time.sleep(0.1)
+            # order: (lng, lat)
+            geometry = f'{{"points":{str(chunk)}}}'
+            url = f'https://gis.ngdc.noaa.gov/arcgis/rest/services/DEM_mosaics/DEM_global_mosaic/ImageServer/identify?geometry={geometry}&geometryType={geometryType}&returnGeometry=false&returnCatalogItems=false&f=json'
+            result = requests.get(url, timeout=self.TIMOUT_SECONDS)
+            res = json.loads(result.content.decode('utf8'))
+            if 'results' in res:
+                for element in res['results']:
+                    depths.append(float(element['value']))
+            elif 'value' in res:
+                depths.append(float(res['value']))
+
+        return depths
+
+    # def get_gebco_bathymetry_elevation(self) -> int:
+    #     # Documentation: https://www.opentopodata.org/datasets/gebco2020/
+    #     latitude = 13.03332
+    #     longitude = -31.70235
+    #     dataset = "gebco2020"
+    #     url = f"https://api.opentopodata.org/v1/{dataset}?locations={latitude},{longitude}"
+    #     pass
+
+    # def get_elevation(
+    #     self,
+    #     df,
+    #     lat_column,
+    #     lon_column,
+    # ) -> int:
+    #     """Query service using lat, lon. add the elevation values as a new column."""
+    #     url = r'https://epqs.nationalmap.gov/v1/json?'
+    #     elevations = []
+    #     for lat, lon in zip(df[lat_column], df[lon_column]):
+    #         # define rest query params
+    #         params = {
+    #             'output': 'json',
+    #             'x': lon,
+    #             'y': lat,
+    #             'units': 'Meters'
+    #         }
+    #         result = requests.get((url + urllib.parse.urlencode(params)))
+    #         elevations.append(result.json()['value'])
+    #     return elevations
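A hedged usage sketch for the new manager; the import path follows the `geometry/__init__.py` change above, the coordinates come from the module docstring, and actual output depends on the live NOAA service:

```python
from water_column_sonar_processing.geometry import ElevationManager

elevation_manager = ElevationManager()
depths = elevation_manager.get_arcgis_elevation(
    lngs=[-31.70235, -32.70235],
    lats=[13.03332, 14.03332],
    chunk_size=500,  # requests are batched to stay under the API limit
)
print(depths)  # e.g. [-5733.0, ...], meters relative to sea level
```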
@@ -7,13 +7,20 @@ from concurrent.futures import as_completed
 from water_column_sonar_processing.aws import S3Manager
 
 
+MAX_POOL_CONNECTIONS = 64
+MAX_CONCURRENCY = 64
+MAX_WORKERS = 64
+GB = 1024**3
+
+
 class IndexManager:
+    # TODO: index into dynamodb instead of csv files
 
     def __init__(self, input_bucket_name, calibration_bucket, calibration_key):
         self.input_bucket_name = input_bucket_name
         self.calibration_bucket = calibration_bucket
         self.calibration_key = calibration_key
-        self.s3_manager = S3Manager()
+        self.s3_manager = S3Manager()  # TODO: make anonymous?
 
     #################################################################
     def list_ships(
@@ -50,6 +57,9 @@ class IndexManager:
         self,
         cruise_prefixes,
     ):
+        """
+        This returns a list of ek60 prefixed cruises.
+        """
         cruise_sensors = []  # includes all sensor types
         for cruise_prefix in cruise_prefixes:
             page_iterator = self.s3_manager.paginator.paginate(
@@ -67,9 +77,12 @@ class IndexManager:
         cruise_name,
         sensor_name,
     ):
+        # Gets all raw files for a cruise under the given prefix
         prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
         page_iterator = self.s3_manager.paginator.paginate(
-            Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
+            Bucket=self.input_bucket_name,
+            Prefix=prefix,
+            Delimiter="/"
         )
         all_files = []
         for page in page_iterator:
@@ -77,6 +90,57 @@ class IndexManager:
                 all_files.extend([i["Key"] for i in page["Contents"]])
         return [i for i in all_files if i.endswith(".raw")]
 
+    def get_first_raw_file(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+    ):
+        # Same as above but only needs to get the first raw file
+        # because we are only interested in the first datagram of one file
+        prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
+        # page_iterator = self.s3_manager.paginator.paginate(
+        #     Bucket=self.input_bucket_name,
+        #     Prefix=prefix,
+        #     Delimiter="/",
+        #     PaginationConfig={'MaxItems': 5}
+        # )  # TODO: this can create a problem if there is a non raw file returned first
+        ### filter with JMESPath expressions ###
+        page_iterator = self.s3_manager.paginator.paginate(
+            Bucket=self.input_bucket_name,
+            Prefix=prefix,
+            Delimiter="/",
+        )
+        # page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
+        page_iterator = page_iterator.search(expression="Contents[?contains(Key, '.raw')] ")
+        for res in page_iterator:
+            if "Key" in res:
+                return res["Key"]
+        # else raise exception?
+
+    # DSJ0604-D20060406-T050022.bot 2kB == 2152 'Size'
+
+    def get_files_under_size(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+    ):
+        # THIS isn't used, just playing with JMES paths spec
+        prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"
+        ### filter with JMESPath expressions ###
+        page_iterator = self.s3_manager.paginator.paginate(
+            Bucket=self.input_bucket_name,
+            Prefix=prefix,
+            Delimiter="/",
+        )
+        page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
+        all_files = []
+        for page in page_iterator:
+            if "Contents" in page.keys():
+                all_files.extend([i["Key"] for i in page["Contents"]])
+        return [i for i in all_files if i.endswith(".raw")]
+
     #################################################################
     def get_raw_files_csv(
         self,
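The JMESPath filtering used above is a documented boto3 paginator feature: `paginate(...).search(expression)` yields individual matching elements across all pages instead of whole pages. A minimal standalone sketch (bucket and prefix are placeholders):

```python
import boto3

s3_client = boto3.client("s3")
paginator = s3_client.get_paginator("list_objects_v2")
page_iterator = paginator.paginate(Bucket="my-example-bucket", Prefix="data/raw/")

# Yields one S3 object record per match, across all pages.
for obj in page_iterator.search("Contents[?contains(Key, '.raw')]"):
    print(obj["Key"], obj["Size"])
```

Note that after calling `search()` the iterator yields object records, not pages, which is why `get_first_raw_file` above checks `"Key" in res` directly.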
@@ -102,6 +166,29 @@ class IndexManager:
         df.to_csv(f"{ship_name}_{cruise_name}.csv", index=False, header=False, sep=" ")
         print("done")
 
+    def get_raw_files_list(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+    ):
+        # gets all raw files in cruise and returns a list of dicts
+        raw_files = self.get_raw_files(
+            ship_name=ship_name,
+            cruise_name=cruise_name,
+            sensor_name=sensor_name
+        )
+        files_list = [
+            {
+                "ship_name": ship_name,
+                "cruise_name": cruise_name,
+                "sensor_name": sensor_name,
+                "file_name": os.path.basename(raw_file),
+            }
+            for raw_file in raw_files
+        ]
+        return files_list
+
     #################################################################
     def get_subset_ek60_prefix(  # TODO: is this used?
         self,
@@ -169,16 +256,14 @@ class IndexManager:
         return first_datagram
 
     #################################################################
-    def get_subset_datagrams(
+    def get_subset_datagrams(  # TODO: is this getting used
         self,
         df: pd.DataFrame
     ) -> list:
         print("getting subset of datagrams")
-        select_keys = list(
-            df[["KEY", "CRUISE"]].drop_duplicates(subset="CRUISE")["KEY"].values
-        )
+        select_keys = df[["KEY", "CRUISE"]].drop_duplicates(subset="CRUISE")["KEY"].values.tolist()
         all_datagrams = []
-        with ThreadPoolExecutor(max_workers=self.max_pool_connections) as executor:
+        with ThreadPoolExecutor(max_workers=MAX_POOL_CONNECTIONS) as executor:
             futures = [
                 executor.submit(self.scan_datagram, select_key)
                 for select_key in select_keys
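A minimal sketch of the fan-out pattern used here, with the module-level worker cap introduced above and `as_completed` from the file's imports; the worker function below is an illustrative stand-in for the real datagram scan:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

MAX_POOL_CONNECTIONS = 64  # module-level cap, as in the diff above


def scan_datagram(key: str) -> dict:
    # Placeholder for the real scan of one S3 object's first datagram.
    return {"key": key}


select_keys = ["a.raw", "b.raw", "c.raw"]
all_datagrams = []
with ThreadPoolExecutor(max_workers=MAX_POOL_CONNECTIONS) as executor:
    futures = [executor.submit(scan_datagram, key) for key in select_keys]
    for future in as_completed(futures):
        all_datagrams.append(future.result())

print(all_datagrams)
```

Switching from `self.max_pool_connections` to the module constant also removes a dependency on instance state that the earlier code never initialized in this class.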
@@ -1,4 +1,3 @@
-import os
 import numcodecs
 import numpy as np
 import xarray as xr
@@ -48,6 +47,9 @@ class ZarrManager:
             endpoint=True,
         )
 
+        if np.any(np.isnan(all_cruise_depth_values)):
+            raise Exception('Problem depth values returned were NaN.')
+
         print("Done getting depth values.")
         return all_cruise_depth_values.round(decimals=2)
 
@@ -67,10 +69,10 @@ class ZarrManager:
         print(
             f"Creating local zarr_manager store at {cruise_name}.zarr for ship {ship_name}"
         )
-
-        # There should be no repeated frequencies
-        assert len(frequencies) == len(set(frequencies))
-        # TODO: eventually switch coordinate to "channel"
+        # There can not currently be repeated frequencies
+        # TODO: eventually switch coordinate to "channel" because frequencies can repeat
+        if len(frequencies) != len(set(frequencies)):
+            raise Exception("Number of frequencies does not match number of channels")
 
         print(f"Debugging number of threads: {self.__num_threads}")
 
@@ -118,8 +120,9 @@ class ZarrManager:
             fill_value=np.nan,
             overwrite=self.__overwrite,
         )
-        # TODO: change to exception
-        assert not np.any(np.isnan(depth_values))
+
+        if np.any(np.isnan(depth_values)):
+            raise Exception('Some depth values returned were NaN.')
 
         root.depth.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.DEPTH.value]
 
@@ -219,7 +222,8 @@ class ZarrManager:
         root.create_dataset(
             name=Coordinates.SV.value,
             shape=(len(depth_values), width, len(frequencies)),
-            chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, len(frequencies)),
+            # chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, len(frequencies)),
+            chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, 1),  # 256x256x1 <- speed up for alex
             dtype=np.dtype(
                 Coordinates.SV_DTYPE.value
             ),  # TODO: try to experiment with 'float16'
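For intuition on this chunking change, a small sketch with plain `zarr` and illustrative shapes: with `(256, 256, 1)` chunks, reading a single-frequency tile touches exactly one chunk, instead of pulling every frequency along with it as the old `(TILE, TILE, n_freq)` layout did.

```python
import zarr

n_depth, n_time, n_freq = 1024, 4096, 4
sv = zarr.zeros(
    shape=(n_depth, n_time, n_freq),
    chunks=(256, 256, 1),  # one frequency per chunk, as in the diff
    dtype="float32",
)

# A 256x256 tile of one frequency maps to a single chunk on disk.
tile = sv[0:256, 0:256, 2]
print(tile.shape, sv.nchunks)  # (256, 256) and the total chunk count
```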
@@ -246,11 +250,12 @@ class ZarrManager:
         #
         root.attrs["processing_software_name"] = Coordinates.PROJECT_NAME.value
         root.attrs["processing_software_version"] = (
-            "0.0.13"  # TODO: get programmatically, echopype>utils>prov.py
+            "24.01.01"  # TODO: get programmatically, echopype>utils>prov.py
         )
         root.attrs["processing_software_time"] = Timestamp.get_timestamp()
         #
         root.attrs["calibration_status"] = calibration_status
+        root.attrs["tile_size"] = Constants.TILE_SIZE.value
 
         zarr.consolidate_metadata(store)
         #####################################################################
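Because the metadata is consolidated above, the new `tile_size` attribute can be read back without touching individual chunk files. A hedged sketch (the store path is a placeholder):

```python
import zarr

# Open the consolidated store read-only; a single .zmetadata read
# exposes all group and array attributes.
root = zarr.open_consolidated("example_cruise.zarr", mode="r")
print(root.attrs["tile_size"])                    # e.g. 256
print(root.attrs["processing_software_version"])  # e.g. "24.01.01"
```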
@@ -1,4 +1,5 @@
-from .cruise_sampler import CruiseSampler
+# from .cruise_sampler import CruiseSampler
 from .raw_to_zarr import RawToZarr
+from .batch_downloader import BatchDownloader
 
-__all__ = ["CruiseSampler", "RawToZarr"]
+__all__ = ["RawToZarr", "BatchDownloader"]
@@ -0,0 +1,132 @@
+import xarray as xr
+import numpy as np
+import pandas as pd
+import xbatcher
+from typing import Optional
+# s3fs.core.setup_logging("DEBUG")
+
+class BatchDownloader:
+    """
+    Uses the xbatcher XbatchDownloader to download data from an xarray dataset. Connection
+    is established
+    """
+
+    def __init__(
+        self,
+        bucket_name: Optional[str] = "noaa-wcsd-zarr-pds",
+        ship_name: Optional[str] = "Henry_B._Bigelow",
+        cruise_name: Optional[str] = "HB0707",
+        sensor_name: Optional[str] = "EK60",
+        patch_dims: Optional[int] = 64,  # TODO: change to 64
+        # input_steps: Optional[int] = 3,
+    ):
+        self.bucket_name = bucket_name
+        self.ship_name = ship_name
+        self.cruise_name = cruise_name
+        self.sensor_name = sensor_name
+        self.patch_dims = patch_dims
+
+    # TODO: move this to the s3fs module
+    def get_s3_zarr_store(self) -> xr.Dataset:
+        """ Returns an Xarray Dataset """
+        s3_zarr_store_path = f"{self.bucket_name}/level_2/{self.ship_name}/{self.cruise_name}/{self.sensor_name}/{self.cruise_name}.zarr"
+        # Info about the HB0707 cruise:
+        # Time: ["2007-07-11T18:20:33.657573888", "2007-07-11T18:20:53.657573888", "2007-07-13T00:55:17.454448896"]
+        # Frequency: [ 18000.  38000. 120000. 200000.]
+        # Depth: [0.19, 999.74]
+
+        # Needed to override credentials for github actions
+        # s3_file_system = s3fs.S3FileSystem(anon=True)
+        # store = s3fs.S3Map(root=s3_zarr_store_path, s3=s3_file_system, check=False)
+
+        # return xr.open_zarr(store=f"s3://{s3_zarr_store_path}", consolidated=True, storage_options={'anon': True})
+        return xr.open_dataset(f"s3://{s3_zarr_store_path}", engine="zarr", storage_options={'anon': True})
+        # return xr.open_zarr(store, consolidated=True)
+
+    def get_toy_batch_generator(self) -> xbatcher.BatchGenerator:
+        """
+        Returns a BatchGenerator with subsets of Sv data
+        Note: this is synthetic data, for a smaller toy example
+        """
+        depth = np.arange(1, 21)  # N meters
+        time = pd.date_range(start="2025-01-01", end="2025-01-31", freq='D')  # N days
+        frequency = [1_000, 2_000, 3_000]  # N frequencies
+        Sv = np.random.rand(len(depth), len(time), len(frequency))  # synthetic data
+        cruise = xr.Dataset(
+            data_vars={
+                "Sv": (["depth", "time", "frequency"], Sv)
+            },
+            coords={
+                "depth": depth,
+                "time": time,
+                "frequency": frequency,
+            },
+            attrs=dict(description="Toy Example"),
+        )
+        batch_generator = xbatcher.BatchGenerator(
+            ds=cruise,
+            # get samples that are shaped 10x10x3
+            input_dims={'depth': 10, 'time': 10, 'frequency': cruise.frequency.shape[0]},  # A dictionary specifying the size of the inputs in each dimension, e.g. ``{'lat': 30, 'lon': 30}`` These are the dimensions the ML library will see. All other dimensions will be stacked into one dimension called ``sample``.
+            # no overlap between samples
+            input_overlap={'depth': 0, 'time': 0, 'frequency': 0},  # Zero means no overlap. A dictionary specifying the overlap along each dimension
+        )
+        return batch_generator
+
+    def get_s3_batch_generator(self) -> xbatcher.BatchGenerator:
+        """ Returns a BatchGenerator with subsets of Sv data from s3 Zarr store """
+        cruise = self.get_s3_zarr_store()
+
+        # TODO: temporarily limits to a smaller slice of the data
+        cruise_select = (cruise
+            .where(cruise.depth < 100., drop=True)
+            .sel(time=slice("2007-07-11T18:20:33", "2007-07-11T18:20:53"))
+            # .sel(time=slice("2007-07-11T18:20:00", "2007-07-11T19:20:00"))
+        )
+        print(cruise_select.Sv.shape)  # (526 depth, 21 time, 4 freq)
+
+        batch_generator = xbatcher.BatchGenerator(
+            ds=cruise_select,
+            input_dims={'depth': 10, 'time': 10, 'frequency': cruise.frequency.shape[0]},  # A dictionary specifying the size of the inputs in each dimension, e.g. ``{'lat': 30, 'lon': 30}`` These are the dimensions the ML library will see. All other dimensions will be stacked into one dimension called ``sample``.
+            input_overlap={'depth': 0, 'time': 0, 'frequency': 0},  # Zero means no overlap. A dictionary specifying the overlap along each dimension
+            preload_batch=False,
+        )
+
+        # TODO: need to raise exception if all the data is nan
+
+        return batch_generator
+        # https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator
+
+    def get_s3_manual_batch_generator(self):
+        """
+        Using just xarray (no xbatcher), iterate through the data and generate batches.
+        Returns a BatchGenerator with subsets of Sv data from s3 Zarr store.
+        """
+        cruise = self.get_s3_zarr_store()
+
+        # TODO: temporarily limits to a smaller slice of the data
+        cruise_select = (cruise
+            .where(cruise.depth < 100., drop=True)
+            .sel(time=slice("2007-07-11T18:20:33", "2007-07-11T18:20:53"))
+        )
+        print(cruise_select.Sv.shape)  # (526 depth, 21 time, 4 freq)
+        batch_generator = xbatcher.BatchGenerator(
+            ds=cruise_select,
+            input_dims={'depth': 10, 'time': 10, 'frequency': cruise.frequency.shape[0]},  # A dictionary specifying the size of the inputs in each dimension, e.g. ``{'lat': 30, 'lon': 30}`` These are the dimensions the ML library will see. All other dimensions will be stacked into one dimension called ``sample``.
+            input_overlap={'depth': 0, 'time': 0, 'frequency': 0},  # Zero means no overlap. A dictionary specifying the overlap along each dimension
+            preload_batch=True,
+        )
+
+        # TODO: need to raise exception if all the data is nan
+
+        return batch_generator
+        # https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator
+
+    """
+    (105, 21, 4)
+
+    depth-start: 0.1899999976158142, depth-end: 1.899999976158142
+    time-start: 2007-07-11T18:20:33.657573888, time-end: 2007-07-11T18:20:42.657573888
+    frequency-start: 18000.0, frequency-end: 200000.0
+    (10, 10, 4)
+    np.nanmean: -53.70000076293945
+    """
@@ -11,8 +11,6 @@ from water_column_sonar_processing.aws import DynamoDBManager, S3Manager
 from water_column_sonar_processing.geometry import GeometryManager
 from water_column_sonar_processing.utility import Cleaner, PipelineStatus
 
-TEMPDIR = "/tmp"
-
 
 # This code is getting copied from echofish-aws-raw-to-zarr-lambda
 class RawToZarr:
@@ -3,11 +3,12 @@ from enum import Enum, Flag, unique
 
 @unique
 class Constants(Flag):
-    TILE_SIZE = 2048
+    TILE_SIZE = 256  # TODO: add tile size to metadata?
+
     # Average https://noaa-wcsd-zarr-pds.s3.us-east-1.amazonaws.com/level_2/Henry_B._Bigelow/HB0902/EK60/HB0902.zarr/time/927
     # chunk size is ~1.3 kB, HB0902 cruise takes ~30 seconds to load all time/lat/lon data
     # NOTE: larger value here will speed up the TurfJS download of data in the UI
-    SPATIOTEMPORAL_CHUNK_SIZE = int(1e5)  # 2**17
+    SPATIOTEMPORAL_CHUNK_SIZE = int(2**16)  # 2**17
 
 
 class Coordinates(Enum):
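Back-of-envelope arithmetic on the new value (mine, not from the package, and assuming float64 coordinate values):

```python
# Chunk byte sizes for the spatiotemporal coordinates, assuming 8-byte floats.
OLD_CHUNK = int(1e5)  # previous setting
NEW_CHUNK = 2**16     # 65536, the new setting

for n in (OLD_CHUNK, NEW_CHUNK):
    print(n, n * 8 / 1024, "KiB per chunk")  # 100000 -> 781.25 KiB, 65536 -> 512.0 KiB
```

The power-of-two size aligns chunk boundaries more predictably, while staying in the same order of magnitude as before.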
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: water_column_sonar_processing
-Version: 0.0.13
+Version: 24.1.1
 Summary: A processing tool for water column sonar data.
 Author-email: Rudy Klucik <rudy.klucik@noaa.gov>
 Project-URL: Homepage, https://github.com/CI-CMG/water-column-sonar-processing
@@ -8,7 +8,7 @@ Project-URL: Issues, https://github.com/CI-CMG/water-column-sonar-processing/iss
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
-Requires-Python: >=3.10
+Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: aiobotocore==2.15.2
@@ -26,26 +26,19 @@ Requires-Dist: pandas==2.2.3
 Requires-Dist: pyarrow==18.1.0
 Requires-Dist: python-dotenv==1.0.1
 Requires-Dist: requests==2.32.3
-Requires-Dist: s3fs==2023.12.1
+Requires-Dist: s3fs==2024.2.0
 Requires-Dist: scipy==1.14.1
 Requires-Dist: setuptools
 Requires-Dist: shapely==2.0.3
 Requires-Dist: typing-extensions==4.10.0
 Requires-Dist: xarray==2024.10.0
+Requires-Dist: xbatcher==0.4.0
 Requires-Dist: zarr==2.18.3
 
 # Water Column Sonar Processing
 Processing tool for converting L0 data to L1 and L2 as well as generating geospatial information
 
-![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/CI-CMG/water-column-sonar-processing/test_action.yaml)
-
-![GitHub License](https://img.shields.io/github/license/CI-CMG/water-column-sonar-processing)
-
-![PyPI - Implementation](https://img.shields.io/pypi/v/water-column-sonar-processing?color=black)
-
-![PyPI - Downloads](https://img.shields.io/pypi/dd/water-column-sonar-processing)
-
-![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/CI-CMG/water-column-sonar-processing) ![GitHub repo size](https://img.shields.io/github/repo-size/CI-CMG/water-column-sonar-processing)
+![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/CI-CMG/water-column-sonar-processing/test_action.yaml?color=black) ![PyPI - Implementation](https://img.shields.io/pypi/v/water-column-sonar-processing?color=black) ![GitHub License](https://img.shields.io/github/license/CI-CMG/water-column-sonar-processing?color=black) ![PyPI - Downloads](https://img.shields.io/pypi/dd/water-column-sonar-processing?color=black) ![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/CI-CMG/water-column-sonar-processing?color=black) ![GitHub repo size](https://img.shields.io/github/repo-size/CI-CMG/water-column-sonar-processing?color=black)
 
 # Setting up the Python Environment
 > Python 3.10.12
@@ -103,12 +96,6 @@ or
 Following this tutorial:
 https://packaging.python.org/en/latest/tutorials/packaging-projects/
 
-# To Publish To PROD
-```commandline
-python -m build
-python -m twine upload --repository pypi dist/*
-```
-
 # Pre Commit Hook
 see here for installation: https://pre-commit.com/
 https://dev.to/rafaelherik/using-trufflehog-and-pre-commit-hook-to-prevent-secret-exposure-edo
@@ -133,13 +120,29 @@ https://colab.research.google.com/drive/1KiLMueXiz9WVB9o4RuzYeGjNZ6PsZU7a#scroll
 # Tag a Release
 Step 1 --> increment the semantic version in the zarr_manager.py "metadata" & the "pyproject.toml"
 ```commandline
-git tag "v0.0.13" -a
+git tag -a v24.01.01 -m "Releasing version v24.01.01"
 ```
-Step 3 --> enter description
+
 ```commandline
 git push origin --tags
 ```
 
+# To Publish To PROD
+```commandline
+python -m build
+python -m twine upload --repository pypi dist/*
+```
+
 # TODO:
 add https://pypi.org/project/setuptools-scm/
 for extracting the version
+
+# Security scanning
+> bandit -r water_column_sonar_processing/
+
+# Data Debugging
+Experimental Plotting in Xarray (hvPlot):
+https://colab.research.google.com/drive/18vrI9LAip4xRGEX6EvnuVFp35RAiVYwU#scrollTo=q9_j9p2yXsLV
+
+HB0707 Cruise zoomable:
+https://hb0707.s3.us-east-1.amazonaws.com/index.html
@@ -3,31 +3,32 @@ water_column_sonar_processing/process.py,sha256=-yQtK3rnZq6lGAr3q02zLDe1NuMH9c0P
 water_column_sonar_processing/aws/__init__.py,sha256=KJqK8oYMn-u8n8i-Jp_lG5BvCOTjwWSjWP8yAyDlWVo,297
 water_column_sonar_processing/aws/dynamodb_manager.py,sha256=LQ3eh7Zf1fBLG-RKovod9KbQwhE-0Qdq1JPk4Ro5bdo,10252
 water_column_sonar_processing/aws/s3_manager.py,sha256=-PCiW7YF31nGIPa1oVOVTzjTSExAAkT_IyNNnvWv2HU,16214
-water_column_sonar_processing/aws/s3fs_manager.py,sha256=d7p9Sx-ocooKzHjVJVCawnXSGv6BpmKvvN9uhzilglw,2529
+water_column_sonar_processing/aws/s3fs_manager.py,sha256=Vo-DXj6vgb8t1l4LdtNu7JCtq_RfFsnl33RuGeBUXhk,2561
 water_column_sonar_processing/aws/sns_manager.py,sha256=Dp9avG5VSugSWPR1dZ-askuAw1fCZkNUHbOUP65iR-k,1867
 water_column_sonar_processing/aws/sqs_manager.py,sha256=NSUrWmnSC8h8Gf7gT0U8zFaQQ-yX89h0Q0mDLKGqp2Y,1597
 water_column_sonar_processing/cruise/__init__.py,sha256=H5hW0JMORuaFvQk_R31B4VL8RnRyKeanOOiWmqEMZJk,156
-water_column_sonar_processing/cruise/create_empty_zarr_store.py,sha256=1IehrlhMAS5XAl7DLdQI4jIMSY9ZNLiW4YdcBEwYkbc,7679
-water_column_sonar_processing/cruise/experiment_datatree.py,sha256=K6Uq_36Rygw5oFF8zWavEwb1x8D27lJv5G3j0B59agE,243
-water_column_sonar_processing/cruise/resample_regrid.py,sha256=XpGRs8nWspWuVoXBEV6VNVJSMlr3_IjnKlN1dK6dEA4,14292
-water_column_sonar_processing/geometry/__init__.py,sha256=_ol5nI8AL30pYXeAh5rtP7YmQggitPC6LA_kuTfPJ0Q,231
+water_column_sonar_processing/cruise/create_empty_zarr_store.py,sha256=ZsFQTDA0gXfQHlxDsXBGD1qQ0ipmx4kS81DcY6ml5Ew,7767
+water_column_sonar_processing/cruise/datatree_manager.py,sha256=Qy4dZCW8_q31lbjxbMsx3JtBS4BvQT17_2P0QD1RQcY,639
+water_column_sonar_processing/cruise/resample_regrid.py,sha256=gz_uP-mBD4JSBRBr69ZvsfmXX4yyBdRG9-P1z3If43E,14246
+water_column_sonar_processing/geometry/__init__.py,sha256=GIzzc-_7pwEwbOkGpc4i_fmjWI5ymllXqzdHq_d3Rio,299
+water_column_sonar_processing/geometry/elevation_manager.py,sha256=eq9w691WJknPwWYkvO3giKTPleIxCVc2tMGR0e8ZRxQ,4267
 water_column_sonar_processing/geometry/geometry_manager.py,sha256=nz5T1vCDWHYIfQ853EqKYHDetTul7jRWS3y8Evep8QU,10855
 water_column_sonar_processing/geometry/geometry_simplification.py,sha256=im1HG9nfYIerQv3w-PUHzphw2B7aGgnsA3Zcdy2oTmA,3016
 water_column_sonar_processing/geometry/pmtile_generation.py,sha256=7Lm08Jr6YaM4nYmexClxbIMOqSV1teo9wMm6dfjFuNA,12384
 water_column_sonar_processing/index/__init__.py,sha256=izEObsKiOoIJ0kZCFhvaYsBd6Ga71XJxnogjrNInw68,68
-water_column_sonar_processing/index/index_manager.py,sha256=YS6y_THfGAZpjfBZOj5n8O1aY_BnBYS781eNHfhpip0,11239
+water_column_sonar_processing/index/index_manager.py,sha256=qsS6rKObJlFXKyzRuT1bk2_qW1YagW-Fg_AkQ1U_KRs,14213
 water_column_sonar_processing/model/__init__.py,sha256=FXaCdbPqxp0ogmZm9NplRirqpgMiYs1iRYgJbFbbX2Y,65
-water_column_sonar_processing/model/zarr_manager.py,sha256=LoL8vOnEl2r_Jhu4l30p6AgfUZg1tW5aBydHx_BZAZg,15068
-water_column_sonar_processing/processing/__init__.py,sha256=UwdB3BnoUxy4q3k9-ZjBF6KzmCWVDcqbcArTeHgmvGA,118
-water_column_sonar_processing/processing/cruise_sampler.py,sha256=hadPrnH5nz7_oG_4pND7YbMFH6NMR9d6p3xAXedtKU8,15927
-water_column_sonar_processing/processing/raw_to_zarr.py,sha256=agbb2A0BWf7D4b5u-mYOBN_VyjRVjOdQM2aeRGBweWw,17617
+water_column_sonar_processing/model/zarr_manager.py,sha256=Sgh8wXhjTgvQ_UlHGALIbUQA9d7ESdpAT2hJIavpXwM,15507
+water_column_sonar_processing/processing/__init__.py,sha256=tdpSfwnY6lbAS_yBTu4aG0SjPgCKqh6LAFvIj_t3j3U,168
+water_column_sonar_processing/processing/batch_downloader.py,sha256=qXoruHdbgzAolmroK6eRn9bWgeHFgaVQLwhJ6X5oHRE,6299
+water_column_sonar_processing/processing/raw_to_zarr.py,sha256=Sn0_zBT7yYP6abbSTlQBPA6iZSBxeVqPYYSgoroiBEU,17599
 water_column_sonar_processing/utility/__init__.py,sha256=yDObMOL0_OxKWet5wffK2-XVJgoE9iwiY2q04GZrtBQ,234
 water_column_sonar_processing/utility/cleaner.py,sha256=bNbs-hopWxtKAFBK0Eu18xdRErZCGZvtla3j-1bTwQw,619
-water_column_sonar_processing/utility/constants.py,sha256=EbzsorvYKadsPjuutRjQKKByGibhFm0Gw6D-Sp2ZD3I,2143
+water_column_sonar_processing/utility/constants.py,sha256=AD6RlDrJRVN1GYwRvo7cunLhrdC0F8CyOlbkB_GxL-s,2180
 water_column_sonar_processing/utility/pipeline_status.py,sha256=O-0SySqdRGJ6bs3zQe1NV9vkOpmsRM7zj5QoHgzYioY,4395
 water_column_sonar_processing/utility/timestamp.py,sha256=bO0oir7KxxoEHPGRkz9FCBfOligkocUyRiWRzAq8fnU,361
-water_column_sonar_processing-0.0.13.dist-info/LICENSE,sha256=lz4IpJ5_adG3S0ali-WaIpQFVTnEAOucMDQPECUVEYw,1110
-water_column_sonar_processing-0.0.13.dist-info/METADATA,sha256=MUkVn5e1wkAFUAYpk25V02yNCeYNmwBsyib788i2ibg,5087
-water_column_sonar_processing-0.0.13.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-water_column_sonar_processing-0.0.13.dist-info/top_level.txt,sha256=aRYU4A7RNBlNrL4vzjytFAir3BNnmOgsvIGKKA36tg4,30
-water_column_sonar_processing-0.0.13.dist-info/RECORD,,
+water_column_sonar_processing-24.1.1.dist-info/LICENSE,sha256=lz4IpJ5_adG3S0ali-WaIpQFVTnEAOucMDQPECUVEYw,1110
+water_column_sonar_processing-24.1.1.dist-info/METADATA,sha256=Bym-EHrC46s9vFs9eN-nqZisesp5r5AFOwCckUVULS8,5474
+water_column_sonar_processing-24.1.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+water_column_sonar_processing-24.1.1.dist-info/top_level.txt,sha256=aRYU4A7RNBlNrL4vzjytFAir3BNnmOgsvIGKKA36tg4,30
+water_column_sonar_processing-24.1.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.6.0)
+Generator: setuptools (75.8.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
@@ -1,13 +0,0 @@
-from datatree import DataTree
-
-ds1 = xr.Dataset({"foo": "orange"})
-
-dt = DataTree(name="root", data=ds1)  # create root node
-
-dt
-Out[4]:
-DataTree('root', parent=None)
-Dimensions:  ()
-Data variables:
-    foo      <U6 24B 'orange'
-
@@ -1,342 +0,0 @@
-import gc
-import os
-import echopype as ep
-import numpy as np
-from numcodecs import Blosc
-
-from water_column_sonar_processing.utility import Cleaner
-
-TEMPDIR = "/tmp"
-
-
-# This code is getting copied from echofish-aws-raw-to-zarr-lambda
-class CruiseSampler:
-    #######################################################
-    def __init__(
-        self,
-    ):
-        # TODO: revert to Blosc.BITSHUFFLE, troubleshooting misc error
-        self.__compressor = Blosc(cname="zstd", clevel=2)  # shuffle=Blosc.NOSHUFFLE
-        self.bucket_name = os.environ.get("INPUT_BUCKET_NAME")
-        # self.__s3 = s3_operations
-
-    ############################################################################
-    ############################################################################
-    def __zarr_info_to_table(
-        self,
-        file_name,
-        cruise_name,
-        zarr_path,
-        min_echo_range,
-        max_echo_range,
-        num_ping_time_dropna,
-        start_time,
-        end_time,
-        frequencies,
-        channels
-    ):
-        print('Writing Zarr information to DynamoDB table.')
-        self.__dynamo.update_item(
-            table_name=self.__table_name,
-            key={
-                'FILE_NAME': {'S': file_name},  # Partition Key
-                'CRUISE_NAME': {'S': cruise_name},  # Sort Key
-                # TODO: should be FILE_NAME & SENSOR_NAME so they are truely unique for when two sensors are processed within one cruise
-            },
-            expression='SET #ZB = :zb, #ZP = :zp, #MINER = :miner, #MAXER = :maxer, #P = :p, #ST = :st, #ET = :et, #F = :f, #C = :c',
-            attribute_names={
-                '#ZB': 'ZARR_BUCKET',
-                '#ZP': 'ZARR_PATH',
-                '#MINER': 'MIN_ECHO_RANGE',
-                '#MAXER': 'MAX_ECHO_RANGE',
-                '#P': 'NUM_PING_TIME_DROPNA',
-                '#ST': 'START_TIME',
-                '#ET': 'END_TIME',
-                '#F': 'FREQUENCIES',
-                '#C': 'CHANNELS',
-            },
-            attribute_values={
-                ':zb': {
-                    'S': self.__output_bucket
-                },
-                ':zp': {
-                    'S': zarr_path
-                },
-                ':miner': {
-                    'N': str(np.round(min_echo_range, 4))
-                },
-                ':maxer': {
-                    'N': str(np.round(max_echo_range, 4))
-                },
-                ':p': {
-                    'N': str(num_ping_time_dropna)
-                },
-                ':st': {
-                    'S': start_time
-                },
-                ':et': {
-                    'S': end_time
-                },
-                ':f': {
-                    'L': [{'N': str(i)} for i in frequencies]
-                },
-                ':c': {
-                    'L': [{'S': i} for i in channels]
-                }
-            }
-        )
-
-    ############################################################################
-    ############################################################################
-    ############################################################################
-    def raw_to_zarr(
-        self,
-        ship_name,
-        cruise_name,
-        sensor_name,
-        file_name,
-    ):
-        print(f'Opening raw: {file_name} and creating zarr store.')
-        geometry_manager = GeometryManager()
-        try:
-            gc.collect()
-            print('Opening raw file with echopype.')
-            bucket_name = "test_input_bucket"  # noaa-wcsd-pds
-            s3_file_path = f"s3://{bucket_name}/data/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}"
-            # s3_file_path = Path(f"s3://noaa-wcsd-pds/data/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}")
-            # TODO: add the bottom file here
-            echodata = ep.open_raw(
-                raw_file=s3_file_path,
-                sonar_model=sensor_name,
-                # include_bot=True,
-                use_swap=True,
-                # max_chunk_size=100,
-                # storage_options={'anon': True}  # this was creating problems
-            )
-            print('Compute volume backscattering strength (Sv) from raw data.')
-            ds_sv = ep.calibrate.compute_Sv(echodata)
-            print('Done computing volume backscattering strength (Sv) from raw data.')
-            frequencies = echodata.environment.frequency_nominal.values
-            #################################################################
-            # Get GPS coordinates
-            gps_data, lat, lon = geometry_manager.read_echodata_gps_data(
-                echodata=echodata,
-                ship_name=ship_name,
-                cruise_name=cruise_name,
-                sensor_name=sensor_name,
-                file_name=file_name,
-                write_geojson=True
-            )
-            # gps_data, lat, lon = self.__get_gps_data(echodata=echodata)
-            #################################################################
-            # Technically the min_echo_range would be 0 m.
-            # TODO: this var name is supposed to represent minimum resolution of depth measurements
-            # The most minimum the resolution can be is as small as 0.25 meters
-            min_echo_range = np.maximum(0.25, np.nanmin(np.diff(ds_sv.echo_range.values)))
-            max_echo_range = float(np.nanmax(ds_sv.echo_range))
-            #
-            num_ping_time_dropna = lat[~np.isnan(lat)].shape[0]  # symmetric to lon
-            #
-            start_time = np.datetime_as_string(ds_sv.ping_time.values[0], unit='ms') + "Z"
-            end_time = np.datetime_as_string(ds_sv.ping_time.values[-1], unit='ms') + "Z"
-            channels = list(ds_sv.channel.values)
-            #
-            #################################################################
-            # Create the zarr store
-            ds_sv.to_zarr(store=store_name)
-            #################################################################
-            print('Note: Adding GeoJSON inside Zarr store')
-            self.__write_geojson_to_file(store_name=store_name, data=gps_data)
-            #################################################################
-            self.__zarr_info_to_table(
-                file_name=raw_file_name,
-                cruise_name=cruise_name,
-                zarr_path=os.path.join(output_zarr_prefix, store_name),
-                min_echo_range=min_echo_range,
-                max_echo_range=max_echo_range,
-                num_ping_time_dropna=num_ping_time_dropna,
-                start_time=start_time,
-                end_time=end_time,
-                frequencies=frequencies,
-                channels=channels
-            )
-        except Exception as err:
-            print(f'Exception encountered creating local Zarr store with echopype: {err}')
-            raise RuntimeError(f"Problem creating local Zarr store, {err}")
-        print('Done creating local zarr store.')
-
-    ############################################################################
-    def __upload_files_to_output_bucket(
-        self,
-        local_directory,
-        object_prefix,
-    ):
-        # Note: this will be passed credentials if using NODD
-        print('Uploading files using thread pool executor.')
-        all_files = []
-        for subdir, dirs, files in os.walk(local_directory):
-            for file in files:
-                local_path = os.path.join(subdir, file)
-                s3_key = os.path.join(object_prefix, local_path)
-                all_files.append([local_path, s3_key])
-        # all_files
-        all_uploads = self.__s3.upload_files_with_thread_pool_executor(
-            bucket_name=self.__output_bucket,
-            all_files=all_files,
-            access_key_id=self.__output_bucket_access_key,
-            secret_access_key=self.__output_bucket_secret_access_key
-        )
-        return all_uploads
-
-    ############################################################################
-    def execute(self, input_message):
-        ship_name = input_message['shipName']
-        cruise_name = input_message['cruiseName']
-        sensor_name = input_message['sensorName']
-        input_file_name = input_message['fileName']
-        #
-        try:
-            self.__update_processing_status(
-                file_name=input_file_name,
-                cruise_name=cruise_name,
-                pipeline_status="PROCESSING_RAW_TO_ZARR"
-            )
-            #######################################################################
-            store_name = f"{os.path.splitext(input_file_name)[0]}.zarr"
-            output_zarr_prefix = f"level_1/{ship_name}/{cruise_name}/{sensor_name}"
-            bucket_key = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/{input_file_name}"
-            zarr_prefix = os.path.join("level_1", ship_name, cruise_name, sensor_name)
-            #
-            os.chdir(TEMPDIR)  # Lambdas require use of temp directory
-            #######################################################################
-            #######################################################################
-            # Check if zarr store already exists
-            s3_objects = self.__s3.list_objects(
-                bucket_name=self.__output_bucket,
-                prefix=f"{zarr_prefix}/{os.path.splitext(input_file_name)[0]}.zarr/",
-                access_key_id=self.__output_bucket_access_key,
-                secret_access_key=self.__output_bucket_secret_access_key
-            )
-            if len(s3_objects) > 0:
-                print('Zarr store data already exists in s3, deleting existing and continuing.')
-                self.__s3.delete_objects(
-                    bucket_name=self.__output_bucket,
-                    objects=s3_objects,
-                    access_key_id=self.__output_bucket_access_key,
-                    secret_access_key=self.__output_bucket_secret_access_key
-                )
-            #######################################################################
-            # self.__delete_all_local_raw_and_zarr_files()
-            Cleaner.delete_local_files(file_types=["*.raw*", "*.zarr"])
-            self.__s3.download_file(
-                bucket_name=self.__input_bucket,
-                key=bucket_key,
-                file_name=input_file_name
-            )
-            self.__create_local_zarr_store(
-                raw_file_name=input_file_name,
-                cruise_name=cruise_name,
-                sensor_name=sensor_name,
-                output_zarr_prefix=output_zarr_prefix,
-                store_name=store_name
-            )
-            #######################################################################
-            self.__upload_files_to_output_bucket(store_name, output_zarr_prefix)
-            #######################################################################
-            # # TODO: verify count of objects matches
-            # s3_objects = self.__s3.list_objects(
-            #     bucket_name=self.__output_bucket,
-            #     prefix=f"{zarr_prefix}/{os.path.splitext(input_file_name)[0]}.zarr/",
-            #     access_key_id=self.__output_bucket_access_key,
-            #     secret_access_key=self.__output_bucket_secret_access_key
-            # )
-            #######################################################################
-            self.__update_processing_status(
-                file_name=input_file_name,
-                cruise_name=cruise_name,
-                pipeline_status='SUCCESS_RAW_TO_ZARR'
-            )
-            #######################################################################
-            self.__publish_done_message(input_message)
-            #######################################################################
-        # except Exception as err:
-        #     print(f'Exception encountered: {err}')
-        #     self.__update_processing_status(
-        #         file_name=input_file_name,
-        #         cruise_name=cruise_name,
-        #         pipeline_status='FAILURE_RAW_TO_ZARR',
-        #         error_message=str(err),
-        #     )
-        finally:
-            self.__delete_all_local_raw_and_zarr_files()
-        #######################################################################
-
-    ############################################################################
-
-    ################################################################################
-    ############################################################################
-    # TODO: DELETE
-    # def __get_gps_data(
-    #     self,
-    #     echodata: ep.echodata.echodata.EchoData
-    # ) -> tuple:
-    #     print('Getting GPS data.')
-    #     try:
-    #         # if 'latitude' not in echodata.platform.variables and 'longitude' not in echodata.platform.variables:
-    #         #     raise KeyError;
-    #         assert(  # TODO: raise error, e.g. KeyError
-    #             'latitude' in echodata.platform.variables and 'longitude' in echodata.platform.variables
-    #         ), "Problem: GPS coordinates not found in echodata."
-    #         latitude = echodata.platform.latitude.values
-    #         longitude = echodata.platform.longitude.values  # len(longitude) == 14691
-    #         # RE: time coordinates: https://github.com/OSOceanAcoustics/echopype/issues/656#issue-1219104771
-    #         assert(
-    #             'time1' in echodata.platform.variables and 'time1' in echodata.environment.variables
-    #         ), "Problem: Time coordinate not found in echodata."
-    #         # 'nmea_times' are times from the nmea datalogger associated with GPS
-    #         # nmea times, unlike env times, can be sorted
-    #         nmea_times = np.sort(echodata.platform.time1.values)
-    #         # 'time1' are times from the echosounder associated with transducer measurement
-    #         time1 = echodata.environment.time1.values
-    #         # Align 'sv_times' to 'nmea_times'
-    #         assert(
-    #             np.all(time1[:-1] <= time1[1:]) and np.all(nmea_times[:-1] <= nmea_times[1:])
-    #         ), "Problem: NMEA time stamps are not sorted."
-    #         # Finds the indices where 'v' can be inserted just to the right of 'a'
-    #         indices = np.searchsorted(a=nmea_times, v=time1, side="right") - 1
-    #         #
-    #         lat = latitude[indices]
-    #         lat[indices < 0] = np.nan  # values recorded before indexing are set to nan
-    #         lon = longitude[indices]
-    #         lon[indices < 0] = np.nan
-    #         if len(lat) < 2 or len(lon) < 2:
-    #             raise Exception("There was not enough data in lat or lon to create geojson.")
-    #         assert(  # TODO: raise ValueError
-    #             np.all(lat[~np.isnan(lat)] >= -90.) and np.all(lat[~np.isnan(lat)] <= 90.) and np.all(lon[~np.isnan(lon)] >= -180.) and np.all(lon[~np.isnan(lon)] <= 180.)
-    #         ), "Problem: Data falls outside GPS bounds!"
-    #         # TODO: check for visits to null island
-    #         # https://osoceanacoustics.github.io/echopype-examples/echopype_tour.html
-    #         print(np.count_nonzero(np.isnan(lat)))
-    #         print(np.count_nonzero(np.isnan(lon)))
-    #         if len(lat[~np.isnan(lat)]) < 1:
-    #             raise RuntimeError(f"Problem all data is NaN.")
-    #         time1 = time1[~np.isnan(lat)]
-    #         lat = lat[~np.isnan(lat)]
-    #         lon = lon[~np.isnan(lon)]
-    #         #
-    #         gps_df = pd.DataFrame({
-    #             'latitude': lat,
-    #             'longitude': lon,
-    #             'time1': time1
-    #         }).set_index(['time1'])
-    #         gps_gdf = geopandas.GeoDataFrame(
-    #             gps_df,
-    #             geometry=geopandas.points_from_xy(gps_df['longitude'], gps_df['latitude']),
-    #             crs="epsg:4326"  # TODO: does this sound right?
-    #         )
-    #         # GeoJSON FeatureCollection with IDs as "time1"
-    #         geo_json = gps_gdf.to_json()
-    #     except Exception as err:
-    #         print(f'Exception encountered creating local Zarr store with echopype: {err}')
-    #         raise
-    #     return geo_json, lat, lon