water-column-sonar-processing 25.1.7__py3-none-any.whl → 25.3.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of water-column-sonar-processing might be problematic.

Files changed (26)
  1. water_column_sonar_processing/aws/dynamodb_manager.py +27 -32
  2. water_column_sonar_processing/aws/s3_manager.py +52 -64
  3. water_column_sonar_processing/aws/s3fs_manager.py +3 -9
  4. water_column_sonar_processing/cruise/create_empty_zarr_store.py +14 -14
  5. water_column_sonar_processing/cruise/datatree_manager.py +3 -6
  6. water_column_sonar_processing/cruise/resample_regrid.py +67 -49
  7. water_column_sonar_processing/geometry/__init__.py +7 -2
  8. water_column_sonar_processing/geometry/elevation_manager.py +16 -17
  9. water_column_sonar_processing/geometry/geometry_manager.py +25 -25
  10. water_column_sonar_processing/geometry/line_simplification.py +150 -0
  11. water_column_sonar_processing/geometry/pmtile_generation.py +99 -64
  12. water_column_sonar_processing/index/index_manager.py +67 -32
  13. water_column_sonar_processing/model/zarr_manager.py +32 -21
  14. water_column_sonar_processing/process.py +15 -13
  15. water_column_sonar_processing/processing/__init__.py +2 -2
  16. water_column_sonar_processing/processing/batch_downloader.py +66 -41
  17. water_column_sonar_processing/processing/raw_to_zarr.py +121 -82
  18. water_column_sonar_processing/utility/constants.py +10 -1
  19. water_column_sonar_processing/utility/pipeline_status.py +11 -15
  20. {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.0.dist-info}/METADATA +21 -12
  21. water_column_sonar_processing-25.3.0.dist-info/RECORD +34 -0
  22. {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.0.dist-info}/WHEEL +1 -1
  23. water_column_sonar_processing/geometry/geometry_simplification.py +0 -82
  24. water_column_sonar_processing-25.1.7.dist-info/RECORD +0 -34
  25. {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.0.dist-info/licenses}/LICENSE +0 -0
  26. {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.0.dist-info}/top_level.txt +0 -0
@@ -1,15 +1,14 @@
  import glob
  import os
+ from concurrent.futures import ThreadPoolExecutor, as_completed
  from pathlib import Path
+
  import fiona
- import s3fs
+ import geopandas
+ import geopandas as gpd
  import numpy as np
  import pandas as pd
  import xarray as xr
- import geopandas
- import geopandas as gpd
- import pyogrio
- from concurrent.futures import ThreadPoolExecutor, as_completed
  from shapely.geometry import LineString

  MAX_POOL_CONNECTIONS = 64
@@ -19,6 +18,16 @@ GB = 1024**3


  class PMTileGeneration(object):
+ """
+ TODO: need to
+ - iterate through the zarr stores for all cruises
+ - generate geojson in geopandas df
+ - consolidate into singular df, one cruise per row
+ - export as _shape?_ file
+ - document next steps creating pmtiles with linux commands
+ - upload to s3
+ """
+
  #######################################################
  def __init__(
  self,
@@ -85,13 +94,20 @@ class PMTileGeneration(object):
  ship_name,
  cruise_names,
  ):
+ # TODO: NOT USED ANYWHERE
  total_size = 0
- s3_fs = s3fs.S3FileSystem(anon=True)
+ # s3_fs = s3fs.S3FileSystem(anon=True)
  for cruise_name in cruise_names:
- path_to_zarr_store = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
- zarr_store = s3fs.S3Map(root=path_to_zarr_store, s3=s3_fs)
- xr_store = xr.open_zarr(store=zarr_store, consolidated=None)
- print(f'Cruise: {cruise_name}, shape: {xr_store.time.shape[0]}')
+ s3_path = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
+ # zarr_store = s3fs.S3Map(root=s3_path, s3=s3_fs)
+ xr_store = xr.open_dataset(
+ filename_or_obj=s3_path,
+ engine="zarr",
+ storage_options={"anon": True},
+ chunks={}, # this allows the engine to define the chunk scheme
+ cache=True,
+ )
+ print(f"Cruise: {cruise_name}, shape: {xr_store.time.shape[0]}")
  total_size = total_size + xr_store.time.shape[0]

  def get_geospatial_info_from_zarr_store(
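The hunk above replaces the s3fs.S3Map read path with xarray's zarr engine. A minimal, standalone sketch of the new access pattern, assuming only the public noaa-wcsd-zarr-pds layout shown in the diff (the HB0707 cruise name is used purely as an illustration):

    import xarray as xr

    s3_path = "s3://noaa-wcsd-zarr-pds/level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.zarr"
    ds = xr.open_dataset(
        filename_or_obj=s3_path,
        engine="zarr",
        storage_options={"anon": True},  # public bucket, no credentials required
        chunks={},  # let the engine use the on-disk chunking
    )
    print(ds.time.shape[0])  # number of pings recorded for the cruise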
@@ -102,40 +118,51 @@ class PMTileGeneration(object):
  """
  Open Zarr store, create geometry, write to geojson, return name
  """
- s3_fs = s3fs.S3FileSystem(anon=True)
+ # s3_fs = s3fs.S3FileSystem(anon=True)
  gps_gdf = geopandas.GeoDataFrame(
  columns=["id", "ship", "cruise", "sensor", "geometry"],
  geometry="geometry",
- crs="EPSG:4326"
+ crs="EPSG:4326",
  )
- path_to_zarr_store = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
- # file_name = os.path.normpath(path_to_zarr_store).split(os.sep)[-1]
- # file_stem = os.path.splitext(os.path.basename(file_name))[0]
- zarr_store = s3fs.S3Map(root=path_to_zarr_store, s3=s3_fs)
- # ---Open Zarr Store--- #
+ s3_path = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
  # TODO: try-except to allow failures
- print('opening store')
- # xr_store = xr.open_zarr(store=zarr_store, consolidated=False)
- xr_store = xr.open_zarr(store=zarr_store, consolidated=None)
+ print("opening store")
+ xr_store = xr.open_dataset(
+ filename_or_obj=s3_path,
+ engine="zarr",
+ storage_options={"anon": True},
+ chunks={}, # this allows the engine to define the chunk scheme
+ cache=True,
+ )
  print(xr_store.Sv.shape)
  # ---Read Zarr Store Time/Latitude/Longitude--- #
  latitude = xr_store.latitude.values
  longitude = xr_store.longitude.values
  if np.isnan(latitude).any() or np.isnan(longitude).any():
- print(f'there was missing lat-lon data for {cruise_name}')
+ print(f"there was missing lat-lon data for {cruise_name}")
  return None
  # ---Add To GeoPandas Dataframe--- #
  # TODO: experiment with tolerance "0.001"
- geom = LineString(list(zip(longitude, latitude))).simplify(tolerance=0.001, preserve_topology=True)
- gps_gdf.loc[0] = (0, "Henry_B._Bigelow", cruise_name, "EK60", geom) # (ship, cruise, sensor, geometry)
- gps_gdf.set_index('id', inplace=True)
- gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON") #, engine="pyogrio")
+ geom = LineString(list(zip(longitude, latitude))).simplify(
+ tolerance=0.001, preserve_topology=True
+ )
+ gps_gdf.loc[0] = (
+ 0,
+ "Henry_B._Bigelow",
+ cruise_name,
+ "EK60",
+ geom,
+ ) # (ship, cruise, sensor, geometry)
+ gps_gdf.set_index("id", inplace=True)
+ gps_gdf.to_file(
+ f"dataframe_{cruise_name}.geojson", driver="GeoJSON"
+ ) # , engine="pyogrio")
  return cruise_name

  #######################################################
  def open_zarr_stores_with_thread_pool_executor(
- self,
- cruises: list,
+ self,
+ cruises: list,
  ):
  # 'cruises' is a list of cruises to process
  completed_cruises = []
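The per-cruise GeoJSON step in the hunk above can be exercised on its own. This sketch uses a synthetic track in place of the latitude/longitude arrays read from the store and keeps the same simplify tolerance; the cruise metadata values are illustrative only:

    import geopandas as gpd
    import numpy as np
    from shapely.geometry import LineString

    # Synthetic track standing in for the arrays read from the Zarr store.
    longitude = np.linspace(-72.49, -72.40, 500)
    latitude = np.linspace(40.33, 40.40, 500)

    geom = LineString(list(zip(longitude, latitude))).simplify(
        tolerance=0.001, preserve_topology=True
    )
    gdf = gpd.GeoDataFrame(
        {"id": [0], "ship": ["Henry_B._Bigelow"], "cruise": ["HB0707"], "sensor": ["EK60"]},
        geometry=[geom],
        crs="EPSG:4326",
    ).set_index("id")
    gdf.to_file("dataframe_HB0707.geojson", driver="GeoJSON")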
@@ -156,37 +183,46 @@ class PMTileGeneration(object):
  except Exception as err:
  print(err)
  print("Done opening zarr stores using thread pool.")
- return completed_cruises # Took ~12 minutes
+ return completed_cruises # Took ~12 minutes

  #######################################################
  # https://docs.protomaps.com/pmtiles/create
- def aggregate_geojson_into_dataframe(
- self
- ):
+ def aggregate_geojson_into_dataframe(self):
  """
  iterate through cruises, threadpoolexecute geojson creation, aggregate geojson files into df,
  """
  gps_gdf = geopandas.GeoDataFrame(
  columns=["id", "ship", "cruise", "sensor", "geometry"],
  geometry="geometry",
- crs="EPSG:4326"
+ crs="EPSG:4326",
  )

- file_type = 'dataframe_*.geojson'
+ file_type = "dataframe_*.geojson"
  geojson_files = glob.glob(file_type)
  for jjj in range(len(geojson_files)):
  print(jjj)
  geom = geopandas.read_file(geojson_files[jjj])
- gps_gdf.loc[jjj] = (jjj, geom.ship[0], geom.cruise[0], geom.sensor[0], geom.geometry[0])
- #gps_gdf.loc[0] = (0, "Henry_B._Bigelow", cruise_name, "EK60", geom) # (ship, cruise, sensor, geometry)
+ gps_gdf.loc[jjj] = (
+ jjj,
+ geom.ship[0],
+ geom.cruise[0],
+ geom.sensor[0],
+ geom.geometry[0],
+ )
+ # gps_gdf.loc[0] = (0, "Henry_B._Bigelow", cruise_name, "EK60", geom) # (ship, cruise, sensor, geometry)
  print(gps_gdf)
- gps_gdf.set_index('id', inplace=True)
- gps_gdf.to_file(f"data.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
+ gps_gdf.set_index("id", inplace=True)
+ gps_gdf.to_file(
+ "data.geojson",
+ driver="GeoJSON",
+ engine="pyogrio",
+ layer_options={"ID_GENERATE": "YES"},
+ )
  return list(gps_gdf.cruise)

  # gps_gdf.loc[iii] = (iii, "Henry_B._Bigelow", cruise_name, "EK60", geom) # (ship, cruise, sensor, geometry)
- #print('writing to file')
- #print(gps_gdf)
+ # print('writing to file')
+ # print(gps_gdf)
  # gps_gdf.set_index('id', inplace=True)
  # gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
  # https://gdal.org/en/latest/drivers/vector/jsonfg.html
@@ -198,25 +234,25 @@ class PMTileGeneration(object):
  # )
  # gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON", engine="pyogrio", id_generate=True)

+
  # print(fiona.supported_drivers) # {'DXF': 'rw', 'CSV': 'raw', 'OpenFileGDB': 'raw', 'ESRIJSON': 'r', 'ESRI Shapefile': 'raw', 'FlatGeobuf': 'raw', 'GeoJSON': 'raw', 'GeoJSONSeq': 'raw', 'GPKG': 'raw', 'GML': 'rw', 'OGR_GMT': 'rw', 'GPX': 'rw', 'MapInfo File': 'raw', 'DGN': 'raw', 'S57': 'r', 'SQLite': 'raw', 'TopoJSON': 'r'}
- #gps_gdf.to_file('dataframe.shp', crs="EPSG:4326", engine="fiona")
+ # gps_gdf.to_file('dataframe.shp', crs="EPSG:4326", engine="fiona")
  # Convert geojson feature collection to pmtiles
- #gps_gdf.to_file("dataframe.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona")
- #print("done")
+ # gps_gdf.to_file("dataframe.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona")
+ # print("done")
  # ---Export Shapefile--- #


-
- #gps_gdf.set_geometry(col='geometry', inplace=True)
- #gps_gdf.__geo_interface__
- #gps_gdf.set_index('id', inplace=True)
- #gps_gdf.to_file(f"dataframe3.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona", index=True)
+ # gps_gdf.set_geometry(col='geometry', inplace=True)
+ # gps_gdf.__geo_interface__
+ # gps_gdf.set_index('id', inplace=True)
+ # gps_gdf.to_file(f"dataframe3.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona", index=True)

  ### this gives the right layer id values
- #gps_gdf.to_file(f"dataframe6.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
+ # gps_gdf.to_file(f"dataframe6.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
  # jq '{"type": "FeatureCollection", "features": [.[] | .features[]]}' --slurp input*.geojson > output.geojson
- #tippecanoe -zg --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises output.geojson
- #tippecanoe -zg --convert-stringified-ids-to-numbers --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises dataframe*.geojson
+ # tippecanoe -zg --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises output.geojson
+ # tippecanoe -zg --convert-stringified-ids-to-numbers --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises dataframe*.geojson
  # {
  # "type": "FeatureCollection",
  # "name": "dataframe5",
@@ -226,19 +262,19 @@ class PMTileGeneration(object):
  # { "type": "Feature", "id": 2, "properties": { "id": 2, "ship": "Henry_B._Bigelow", "cruise": "HB0710", "sensor": "EK60" }, "geometry": { "type": "LineString", "coordinates": [ [ -72.489486694335938, 40.331901550292969 ], [ -72.490760803222656, 40.33099365234375 ] ] } }
  # ]
  # }
- """
- # https://docs.protomaps.com/pmtiles/create
- #ogr2ogr -t_srs EPSG:4326 data.geojson dataframe.shp
- # Only need to do the second one here...
- tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises dataframe.geojson
- tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
- # used this to combine all the geojson files into single pmtile file (2024-12-03):
- tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson

- TODO:
- run each one of the cruises in a separate ospool workflow.
- each process gets own store
- """
+ # # https://docs.protomaps.com/pmtiles/create
+ # #ogr2ogr -t_srs EPSG:4326 data.geojson dataframe.shp
+ # # Only need to do the second one here...
+ # tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises dataframe.geojson
+ # tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
+ # # used this to combine all the geojson files into single pmtile file (2024-12-03):
+ # tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
+ #
+ # TODO:
+ # run each one of the cruises in a separate ospool workflow.
+ # each process gets own store
+
  ###########################################################

  # s3_manager = S3Manager() # endpoint_url=endpoint_url)
@@ -258,5 +294,4 @@ TODO:
  # print(ds_zarr.Sv.shape)


-
- total = [246847, 89911, 169763, 658047, 887640, 708771, 187099, 3672813, 4095002, 763268, 162727, 189454, 1925270, 3575857, 1031920, 1167590, 3737415, 4099957, 3990725, 3619996, 3573052, 2973090, 55851, 143192, 1550164, 3692819, 668400, 489735, 393260, 1311234, 242989, 4515760, 1303091, 704663, 270645, 3886437, 4204381, 1062090, 428639, 541455, 4206506, 298561, 1279329, 137416, 139836, 228947, 517949]
+ # total = [246847, 89911, 169763, 658047, 887640, 708771, 187099, 3672813, 4095002, 763268, 162727, 189454, 1925270, 3575857, 1031920, 1167590, 3737415, 4099957, 3990725, 3619996, 3573052, 2973090, 55851, 143192, 1550164, 3692819, 668400, 489735, 393260, 1311234, 242989, 4515760, 1303091, 704663, 270645, 3886437, 4204381, 1062090, 428639, 541455, 4206506, 298561, 1279329, 137416, 139836, 228947, 517949]
@@ -1,11 +1,14 @@
  import os
  import re
- import pandas as pd
+ from concurrent.futures import ThreadPoolExecutor, as_completed
  from datetime import datetime
- from concurrent.futures import ThreadPoolExecutor
- from concurrent.futures import as_completed
- from water_column_sonar_processing.aws import S3Manager
+ from hashlib import sha256
+
+ import networkx as nx
+ import numpy as np
+ import pandas as pd

+ from water_column_sonar_processing.aws import S3Manager

  MAX_POOL_CONNECTIONS = 64
  MAX_CONCURRENCY = 64
@@ -19,8 +22,8 @@ class IndexManager:
  def __init__(self, input_bucket_name, calibration_bucket, calibration_key):
  self.input_bucket_name = input_bucket_name
  self.calibration_bucket = calibration_bucket
- self.calibration_key = calibration_key
- self.s3_manager = S3Manager() # TODO: make anonymous?
+ self.calibration_key = calibration_key # TODO: make optional?
+ self.s3_manager = S3Manager() # TODO: make anonymous?

  #################################################################
  def list_ships(
@@ -80,9 +83,7 @@ class IndexManager:
  # Gets all raw files for a cruise under the given prefix
  prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/" # Note no forward slash at beginning
  page_iterator = self.s3_manager.paginator.paginate(
- Bucket=self.input_bucket_name,
- Prefix=prefix,
- Delimiter="/"
+ Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
  )
  all_files = []
  for page in page_iterator:
@@ -112,7 +113,9 @@ class IndexManager:
  Delimiter="/",
  )
  # page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
- page_iterator = page_iterator.search(expression="Contents[?contains(Key, '.raw')] ")
+ page_iterator = page_iterator.search(
+ expression="Contents[?contains(Key, '.raw')] "
+ )
  for res in page_iterator:
  if "Key" in res:
  return res["Key"]
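For reference, the JMESPath filter used above works directly against boto3's paginator. A minimal sketch assuming anonymous access to the public noaa-wcsd-pds bucket, with an illustrative cruise prefix:

    import boto3
    from botocore import UNSIGNED
    from botocore.config import Config

    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
    pages = s3.get_paginator("list_objects_v2").paginate(
        Bucket="noaa-wcsd-pds",
        Prefix="data/raw/Henry_B._Bigelow/HB0707/EK60/",
        Delimiter="/",
    )
    # Same JMESPath expression as above: keep only .raw objects.
    for obj in pages.search("Contents[?contains(Key, '.raw')]"):
        if obj and "Key" in obj:
            print(obj["Key"])  # first matching raw file
            break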
@@ -149,9 +152,7 @@ class IndexManager:
  sensor_name,
  ):
  raw_files = self.get_raw_files(
- ship_name=ship_name,
- cruise_name=cruise_name,
- sensor_name=sensor_name
+ ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
  )
  files_list = [
  {
@@ -174,9 +175,7 @@ class IndexManager:
  ):
  # gets all raw files in cruise and returns a list of dicts
  raw_files = self.get_raw_files(
- ship_name=ship_name,
- cruise_name=cruise_name,
- sensor_name=sensor_name
+ ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
  )
  files_list = [
  {
@@ -190,10 +189,9 @@ class IndexManager:
  return files_list

  #################################################################
- def get_subset_ek60_prefix( # TODO: is this used?
- self,
- df: pd.DataFrame
- ) -> pd.DataFrame:
+ def get_subset_ek60_prefix(
+ self, df: pd.DataFrame
+ ) -> pd.DataFrame: # TODO: is this used?
  # Returns all objects with 'EK60' in prefix of file path
  # Note that this can include 'EK80' data that are false-positives
  # in dataframe with ['key', 'filename', 'ship', 'cruise', 'sensor', 'size', 'date', 'datagram']
@@ -237,10 +235,7 @@ class IndexManager:
  return pd.DataFrame(objects)

  #################################################################
- def scan_datagram(
- self,
- select_key: str
- ) -> list:
+ def scan_datagram(self, select_key: str) -> list:
  # Reads the first 8 bytes of S3 file. Used to determine if ek60 or ek80
  # Note: uses boto3 session instead of boto3 client: https://github.com/boto/boto3/issues/801
  # select_key = 'data/raw/Albatross_Iv/AL0403/EK60/L0005-D20040302-T200108-EK60.raw'
@@ -256,12 +251,15 @@ class IndexManager:
  return first_datagram

  #################################################################
- def get_subset_datagrams( # TODO: is this getting used
- self,
- df: pd.DataFrame
- ) -> list:
+ def get_subset_datagrams(
+ self, df: pd.DataFrame
+ ) -> list: # TODO: is this getting used
  print("getting subset of datagrams")
- select_keys = df[["KEY", "CRUISE"]].drop_duplicates(subset="CRUISE")["KEY"].values.tolist()
+ select_keys = (
+ df[["KEY", "CRUISE"]]
+ .drop_duplicates(subset="CRUISE")["KEY"]
+ .values.tolist()
+ )
  all_datagrams = []
  with ThreadPoolExecutor(max_workers=MAX_POOL_CONNECTIONS) as executor:
  futures = [
@@ -276,9 +274,7 @@ class IndexManager:

  #################################################################
  def get_ek60_objects(
- self,
- df: pd.DataFrame,
- subset_datagrams: list
+ self, df: pd.DataFrame, subset_datagrams: list
  ) -> pd.DataFrame:
  # for each key write datagram value to all other files in same cruise
  for subset_datagram in subset_datagrams:
@@ -346,3 +342,42 @@ class IndexManager:
  # end_time = datetime.now() # used for benchmarking
  # print(start_time)
  # print(end_time)
+
+ # TODO: wip
+ def build_merkle_tree(self):
+ G = nx.DiGraph()
+ # https://noaa-wcsd-pds.s3.amazonaws.com/index.html#data/raw/Henry_B._Bigelow/HB0707/
+ ship_name = "Henry_B._Bigelow"
+ cruise_name = "HB0707"
+ # cruise_name = "HB0805"
+ prefix = f"data/raw/{ship_name}/{cruise_name}/"
+ # prefix = f"data/raw/{ship_name}/"
+ page_iterator = self.s3_manager.paginator.paginate(
+ Bucket=self.input_bucket_name,
+ Prefix=prefix,
+ )
+ for page in page_iterator:
+ for contents in page["Contents"]:
+ obj_key = contents["Key"]
+ # https://datatracker.ietf.org/doc/html/rfc7232#section-2.3
+ obj_etag = contents["ETag"].split('"')[1] # properties
+ obj_size = contents["Size"]
+ basename = os.path.basename(obj_key)
+ G.add_node(
+ node_for_adding=basename, ETag=obj_etag, Size=obj_size, Key=obj_key
+ ) # TODO: add parent hash
+ split_path = os.path.normpath(obj_key).split(os.path.sep)
+ # split_path: ['data', 'raw', 'Henry_B._Bigelow', 'HB0707', 'EK60', 'D20070712-T004447.raw']
+ for previous, current in zip(split_path, split_path[1:]):
+ if not G.has_edge(previous, current):
+ G.add_edge(previous, current)
+ # print(G)
+ etag_set = frozenset(
+ [k for j, k in list(G.nodes.data("ETag")) if k is not None]
+ )
+ new_hash = sha256(str(etag_set.__hash__()).encode("utf-8")).hexdigest()
+ total_size = [k for j, k in list(G.nodes.data("Size")) if k is not None]
+ print(np.sum(total_size)) # 22.24 Terabytes in Henry_B._Bigelow cruises
+ print(" ")
+ print(new_hash)
+ return new_hash
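A minimal sketch of the fingerprinting idea behind build_merkle_tree above: fold per-object ETags into a single cruise-level digest. The ETag values here are placeholders; sorting them before hashing keeps the digest independent of listing order and of Python's per-process hash randomization:

    from hashlib import sha256

    # Placeholder ETags; in the real method these come from the S3 listing.
    etags = [
        "9a0364b9e99bb480dd25e1f0284c8555",
        "d41d8cd98f00b204e9800998ecf8427e",
    ]
    cruise_fingerprint = sha256("".join(sorted(etags)).encode("utf-8")).hexdigest()
    print(cruise_fingerprint)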
@@ -1,17 +1,16 @@
+ import importlib.metadata
+
  import numcodecs
  import numpy as np
  import xarray as xr
  import zarr
- import importlib.metadata
  from numcodecs import Blosc

  from water_column_sonar_processing.aws import S3FSManager
- from water_column_sonar_processing.utility import Constants
- from water_column_sonar_processing.utility import Timestamp
- from water_column_sonar_processing.utility import Coordinates
+ from water_column_sonar_processing.utility import Constants, Coordinates, Timestamp

- numcodecs.blosc.use_threads = False
- numcodecs.blosc.set_nthreads(1)
+ # numcodecs.blosc.use_threads = False
+ # numcodecs.blosc.set_nthreads(1)


  # TODO: when ready switch to version 3 of model spec
@@ -36,20 +35,22 @@ class ZarrManager:
  self,
  min_echo_range: float = 1.0, # minimum depth measured (zero non-inclusive) from whole cruise
  max_echo_range: float = 100.0, # maximum depth measured from whole cruise
+ cruise_min_epsilon: float = 0.25, # resolution between subsequent measurements
  ):
  # Gets the set of depth values that will be used when resampling and
  # regridding the data to a cruise level model store.
  # Note: returned values do not start at zero.
+ # For more info see here: https://echopype.readthedocs.io/en/stable/data-proc-additional.html
  print("Getting depth values.")
  all_cruise_depth_values = np.linspace(
  start=min_echo_range,
  stop=max_echo_range,
- num=int(max_echo_range / min_echo_range) + 1,
+ num=int((max_echo_range - min_echo_range) / cruise_min_epsilon) + 1,
  endpoint=True,
- )
+ ) # np.arange(min_echo_range, max_echo_range, step=min_echo_range) # this is worse

  if np.any(np.isnan(all_cruise_depth_values)):
- raise Exception('Problem depth values returned were NaN.')
+ raise Exception("Problem depth values returned were NaN.")

  print("Done getting depth values.")
  return all_cruise_depth_values.round(decimals=2)
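A quick worked check of the new depth-grid sizing, using the defaults shown above (min_echo_range=1.0, max_echo_range=100.0, cruise_min_epsilon=0.25):

    import numpy as np

    min_echo_range, max_echo_range, cruise_min_epsilon = 1.0, 100.0, 0.25
    depths = np.linspace(
        start=min_echo_range,
        stop=max_echo_range,
        num=int((max_echo_range - min_echo_range) / cruise_min_epsilon) + 1,
        endpoint=True,
    ).round(decimals=2)
    print(depths[:4])   # [1.   1.25 1.5  1.75]
    print(depths.size)  # 397 depth bins; the previous num=int(max/min) + 1 formula gave 101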
@@ -57,7 +58,7 @@ class ZarrManager:
  #######################################################
  def create_zarr_store(
  self,
- path: str, # 'level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.model/tmp/HB0707.zarr/.zattrs'
+ path: str, # 'level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.model/tmp/HB0707.zarr/.zattrs'
  ship_name: str,
  cruise_name: str,
  sensor_name: str,
@@ -65,6 +66,7 @@ class ZarrManager:
  width: int, # TODO: needs better name... "ping_time"
  min_echo_range: float, # smallest resolution in meters
  max_echo_range: float,
+ cruise_min_epsilon: float,
  calibration_status: bool = False, # Assume uncalibrated
  ) -> str:
  print(
@@ -105,7 +107,9 @@ class ZarrManager:
  #####################################################################
  # --- Coordinate: Depth --- #
  depth_values = self.get_depth_values(
- min_echo_range=min_echo_range, max_echo_range=max_echo_range
+ min_echo_range=min_echo_range,
+ max_echo_range=max_echo_range,
+ cruise_min_epsilon=cruise_min_epsilon,
  )

  root.create_dataset(
@@ -123,7 +127,7 @@ class ZarrManager:
  )

  if np.any(np.isnan(depth_values)):
- raise Exception('Some depth values returned were NaN.')
+ raise Exception("Some depth values returned were NaN.")

  root.depth.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.DEPTH.value]

@@ -171,7 +175,9 @@ class ZarrManager:

  root.longitude.attrs["units"] = Coordinates.LONGITUDE_UNITS.value
  root.longitude.attrs["long_name"] = Coordinates.LONGITUDE_LONG_NAME.value
- root.longitude.attrs["standard_name"] = Coordinates.LONGITUDE_STANDARD_NAME.value
+ root.longitude.attrs["standard_name"] = (
+ Coordinates.LONGITUDE_STANDARD_NAME.value
+ )

  #####################################################################
  # TODO: verify adding this variable for where the bottom was detected
@@ -224,7 +230,11 @@ class ZarrManager:
  name=Coordinates.SV.value,
  shape=(len(depth_values), width, len(frequencies)),
  # chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, len(frequencies)),
- chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, 1), # 256x256x1 <- speed up for alex
+ chunks=(
+ Constants.TILE_SIZE.value,
+ Constants.TILE_SIZE.value,
+ 1,
+ ), # 256x256x1 <- speed up for alex
  dtype=np.dtype(
  Coordinates.SV_DTYPE.value
  ), # TODO: try to experiment with 'float16'
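The 256x256x1 chunking above stores one frequency per chunk so depth-by-time tiles can be fetched independently. A standalone sketch of the same layout with made-up dimensions (the real store uses the cruise's depth/ping/frequency counts and Blosc compression):

    import numpy as np
    import zarr

    root = zarr.open("example.zarr", mode="w")
    sv = root.create_dataset(
        "Sv",
        shape=(1000, 5000, 4),  # (depth, ping_time, frequency), illustrative sizes
        chunks=(256, 256, 1),   # one frequency per chunk, 256x256 depth-by-time tiles
        dtype="float32",
        fill_value=np.nan,
    )
    print(sv.chunks)  # (256, 256, 1)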
@@ -251,7 +261,9 @@ class ZarrManager:
  #
  root.attrs["processing_software_name"] = Coordinates.PROJECT_NAME.value

- current_project_version = importlib.metadata.version('water_column_sonar_processing')
+ current_project_version = importlib.metadata.version(
+ "water_column_sonar_processing"
+ )
  root.attrs["processing_software_version"] = current_project_version
  root.attrs["processing_software_time"] = Timestamp.get_timestamp()
  #
@@ -317,16 +329,14 @@ class ZarrManager:
  input_bucket_name: str,
  endpoint_url=None,
  ) -> xr.Dataset:
- print("Opening L1 Zarr store in S3 with Xarray.") # TODO: Is this only used for reading from?
+ print(
+ "Opening L1 Zarr store in S3 with Xarray."
+ ) # TODO: Is this only used for reading from?
  try:
  zarr_path = f"s3://{input_bucket_name}/level_1/{ship_name}/{cruise_name}/{sensor_name}/{file_name_stem}.zarr"
  s3fs_manager = S3FSManager(endpoint_url=endpoint_url)
  store_s3_map = s3fs_manager.s3_map(s3_zarr_store_path=zarr_path)
- ds = xr.open_dataset(
- filename_or_obj=store_s3_map,
- engine="zarr",
- chunks={}
- )
+ ds = xr.open_dataset(filename_or_obj=store_s3_map, engine="zarr", chunks={})
  except Exception as err:
  print("Problem opening Zarr store in S3 as Xarray.")
  raise err
@@ -353,6 +363,7 @@ class ZarrManager:
  raise err
  print("Done opening Zarr store in S3 as Xarray.")
  return ds
+
  ############################################################################

  #######################################################
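The open_dataset call above reaches the Level-1 store through the package's S3FSManager wrapper; the equivalent direct s3fs + xarray form, sketched here with an illustrative bucket and file stem, looks like this:

    import s3fs
    import xarray as xr

    # Illustrative bucket/path; assumes the store is readable anonymously.
    s3 = s3fs.S3FileSystem(anon=True)
    store = s3fs.S3Map(
        root="s3://example-bucket/level_1/Henry_B._Bigelow/HB0707/EK60/D20070712-T004447.zarr",
        s3=s3,
    )
    ds = xr.open_dataset(filename_or_obj=store, engine="zarr", chunks={})
    print(ds)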
@@ -3,10 +3,12 @@ import os

  import numpy as np

- from water_column_sonar_processing.aws import DynamoDBManager
- from water_column_sonar_processing.aws import S3Manager
- from water_column_sonar_processing.aws import S3FSManager
- from water_column_sonar_processing.aws import SNSManager
+ from water_column_sonar_processing.aws import (
+ DynamoDBManager,
+ S3FSManager,
+ S3Manager,
+ SNSManager,
+ )


  ###########################################################
@@ -23,9 +25,9 @@ class Process:
  # self.output_bucket_secret_access_key = ?

  def execute(self):
- input_s3_manager = (
- S3Manager()
- ) # TODO: Need to allow passing in of credentials when writing to protected bucket
+ # input_s3_manager = (
+ # S3Manager()
+ # ) # TODO: Need to allow passing in of credentials when writing to protected bucket
  s3fs_manager = S3FSManager() # TODO: delete this
  print(s3fs_manager) # TODO: delete this
  output_s3_manager = S3Manager()
@@ -76,8 +78,8 @@ class Process:
  "#SE": "SENSOR_NAME",
  "#SH": "SHIP_NAME",
  "#ST": "START_TIME",
- "#ZB": "ZARR_BUCKET",
- "#ZP": "ZARR_PATH",
+ # "#ZB": "ZARR_BUCKET",
+ # "#ZP": "ZARR_PATH",
  },
  expression_attribute_values={
  ":ch": {"L": [{"S": i} for i in test_channels]},
@@ -92,10 +94,10 @@ class Process:
  ":se": {"S": sensor_name},
  ":sh": {"S": ship_name},
  ":st": {"S": "2006-04-06T11:34:07.288Z"},
- ":zb": {"S": "r2d2-dev-echofish2-118234403147-echofish-dev-output"},
- ":zp": {
- "S": "level_1/David_Starr_Jordan/DS0604/EK60/DSJ0604-D20060406-T113407.model"
- },
+ # ":zb": {"S": "r2d2-dev-echofish2-118234403147-echofish-dev-output"},
+ # ":zp": {
+ # "S": "level_1/David_Starr_Jordan/DS0604/EK60/DSJ0604-D20060406-T113407.model"
+ # },
  },
  update_expression=(
  "SET "
@@ -1,5 +1,5 @@
  # from .cruise_sampler import CruiseSampler
- from .raw_to_zarr import RawToZarr
  from .batch_downloader import BatchDownloader
+ from .raw_to_zarr import RawToZarr

- __all__ = ["RawToZarr", "BatchDownloader"]
+ __all__ = ["RawToZarr", "BatchDownloader"]