water-column-sonar-processing 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. water_column_sonar_processing/__init__.py +2 -5
  2. water_column_sonar_processing/aws/__init__.py +2 -2
  3. water_column_sonar_processing/aws/dynamodb_manager.py +149 -43
  4. water_column_sonar_processing/aws/s3_manager.py +71 -37
  5. water_column_sonar_processing/cruise/create_empty_zarr_store.py +6 -4
  6. water_column_sonar_processing/cruise/resample_regrid.py +3 -3
  7. water_column_sonar_processing/geometry/geometry_manager.py +21 -6
  8. water_column_sonar_processing/geometry/pmtile_generation.py +200 -13
  9. water_column_sonar_processing/index/index_manager.py +25 -13
  10. water_column_sonar_processing/model/zarr_manager.py +27 -25
  11. water_column_sonar_processing/process.py +4 -4
  12. water_column_sonar_processing/processing/__init__.py +4 -0
  13. water_column_sonar_processing/processing/cruise_sampler.py +342 -0
  14. water_column_sonar_processing/processing/raw_to_zarr.py +349 -0
  15. water_column_sonar_processing/utility/__init__.py +2 -2
  16. water_column_sonar_processing/utility/cleaner.py +1 -0
  17. water_column_sonar_processing/utility/constants.py +6 -2
  18. {water_column_sonar_processing-0.0.7.dist-info → water_column_sonar_processing-0.0.9.dist-info}/METADATA +20 -10
  19. water_column_sonar_processing-0.0.9.dist-info/RECORD +32 -0
  20. {water_column_sonar_processing-0.0.7.dist-info → water_column_sonar_processing-0.0.9.dist-info}/WHEEL +1 -1
  21. water_column_sonar_processing-0.0.7.dist-info/RECORD +0 -29
  22. {water_column_sonar_processing-0.0.7.dist-info → water_column_sonar_processing-0.0.9.dist-info}/LICENSE +0 -0
  23. {water_column_sonar_processing-0.0.7.dist-info → water_column_sonar_processing-0.0.9.dist-info}/top_level.txt +0 -0
water_column_sonar_processing/geometry/pmtile_generation.py

@@ -1,14 +1,22 @@
+ import glob
  import os
  from pathlib import Path
-
- # from shapely import wkt
- # import json
- # from shapely.geometry import shape, GeometryCollection
  import fiona
- import geopandas
+ import s3fs
+ import numpy as np
  import pandas as pd
+ import xarray as xr
+ import geopandas
+ import geopandas as gpd
+ import pyogrio
+ from concurrent.futures import ThreadPoolExecutor, as_completed
  from shapely.geometry import LineString

+ MAX_POOL_CONNECTIONS = 64
+ MAX_CONCURRENCY = 64
+ MAX_WORKERS = 64
+ GB = 1024**3
+

  class PMTileGeneration(object):
      #######################################################
@@ -18,34 +26,35 @@ class PMTileGeneration(object):
          print("123")

      #######################################################
+     # This uses a local collection of file-level geojson files to create the data
      def generate_geojson_feature_collection(self):
          # This was used to read from noaa-wcsd-model-pds bucket geojson files and then to
          # generate the geopandas dataframe which could be exported to another comprehensive
          # geojson file. That
          result = list(Path("/Users/r2d2/Documents/echofish/geojson").rglob("*.json"))
          # result = result[:100]
-         iii = 0
+         jjj = 0
          pieces = []
-         for iii in range(len(result)):
-             file_name = os.path.normpath(result[iii]).split(os.sep)[-1]
+         for jjj in range(len(result)):
+             file_name = os.path.normpath(result[jjj]).split(os.sep)[-1]
              file_stem = os.path.splitext(os.path.basename(file_name))[0]
-             geom = geopandas.read_file(result[iii]).iloc[0]["geometry"]
+             geom = gpd.read_file(result[jjj]).iloc[0]["geometry"]
              # TDOO: Filter (0,0) coordinates
              if len(geom.coords.xy[0]) < 2:
                  continue
              geom = LineString(list(zip(geom.coords.xy[1], geom.coords.xy[0])))
              pieces.append(
                  {
-                     "ship_name": os.path.normpath(result[iii]).split(os.sep)[-4],
-                     "cruise_name": os.path.normpath(result[iii]).split(os.sep)[-3],
+                     "ship_name": os.path.normpath(result[jjj]).split(os.sep)[-4],
+                     "cruise_name": os.path.normpath(result[jjj]).split(os.sep)[-3],
                      "file_stem": file_stem,
-                     "file_path": result[iii],
+                     "file_path": result[jjj],
                      "geom": geom,
                  }
              )
          df = pd.DataFrame(pieces)
          print(df)
-         gps_gdf = geopandas.GeoDataFrame(
+         gps_gdf = gpd.GeoDataFrame(
              data=df[
                  ["ship_name", "cruise_name", "file_stem"]
              ], # try again with file_stem
@@ -70,6 +79,184 @@ class PMTileGeneration(object):
          """

      #######################################################
+     # TODO: temporary using this to get info
+     def get_info_from_zarr_store(
+         self,
+         ship_name,
+         cruise_names,
+     ):
+         total_size = 0
+         s3_fs = s3fs.S3FileSystem(anon=True)
+         for cruise_name in cruise_names:
+             path_to_zarr_store = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
+             zarr_store = s3fs.S3Map(root=path_to_zarr_store, s3=s3_fs)
+             xr_store = xr.open_zarr(store=zarr_store, consolidated=None)
+             print(f'Cruise: {cruise_name}, shape: {xr_store.time.shape[0]}')
+             total_size = total_size + xr_store.time.shape[0]
+
+     def get_geospatial_info_from_zarr_store(
+         self,
+         ship_name,
+         cruise_name,
+     ):
+         """
+         Open Zarr store, create geometry, write to geojson, return name
+         """
+         s3_fs = s3fs.S3FileSystem(anon=True)
+         gps_gdf = geopandas.GeoDataFrame(
+             columns=["id", "ship", "cruise", "sensor", "geometry"],
+             geometry="geometry",
+             crs="EPSG:4326"
+         )
+         path_to_zarr_store = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
+         # file_name = os.path.normpath(path_to_zarr_store).split(os.sep)[-1]
+         # file_stem = os.path.splitext(os.path.basename(file_name))[0]
+         zarr_store = s3fs.S3Map(root=path_to_zarr_store, s3=s3_fs)
+         # ---Open Zarr Store--- #
+         # TODO: try-except to allow failures
+         print('opening store')
+         # xr_store = xr.open_zarr(store=zarr_store, consolidated=False)
+         xr_store = xr.open_zarr(store=zarr_store, consolidated=None)
+         print(xr_store.Sv.shape)
+         # ---Read Zarr Store Time/Latitude/Longitude--- #
+         latitude = xr_store.latitude.values
+         longitude = xr_store.longitude.values
+         if np.isnan(latitude).any() or np.isnan(longitude).any():
+             print(f'there was missing lat-lon data for {cruise_name}')
+             return None
+         # ---Add To GeoPandas Dataframe--- #
+         # TODO: experiment with tolerance "0.001"
+         geom = LineString(list(zip(longitude, latitude))).simplify(tolerance=0.001, preserve_topology=True)
+         gps_gdf.loc[0] = (0, "Henry_B._Bigelow", cruise_name, "EK60", geom) # (ship, cruise, sensor, geometry)
+         gps_gdf.set_index('id', inplace=True)
+         gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON") #, engine="pyogrio")
+         return cruise_name
+
+     #######################################################
+     def open_zarr_stores_with_thread_pool_executor(
+         self,
+         cruises: list,
+     ):
+         # 'cruises' is a list of cruises to process
+         completed_cruises = []
+         try:
+             with ThreadPoolExecutor(max_workers=32) as executor:
+                 futures = [
+                     executor.submit(
+                         self.get_geospatial_info_from_zarr_store,
+                         "Henry_B._Bigelow", # ship_name
+                         cruise, # cruise_name
+                     )
+                     for cruise in cruises
+                 ]
+                 for future in as_completed(futures):
+                     result = future.result()
+                     if result:
+                         completed_cruises.extend([result])
+         except Exception as err:
+             print(err)
+         print("Done opening zarr stores using thread pool.")
+         return completed_cruises # Took ~12 minutes
+
+     #######################################################
+     # https://docs.protomaps.com/pmtiles/create
+     def aggregate_geojson_into_dataframe(
+         self
+     ):
+         """
+         iterate through cruises, threadpoolexecute geojson creation, aggregate geojson files into df,
+         """
+         gps_gdf = geopandas.GeoDataFrame(
+             columns=["id", "ship", "cruise", "sensor", "geometry"],
+             geometry="geometry",
+             crs="EPSG:4326"
+         )
+
+         file_type = 'dataframe_*.geojson'
+         geojson_files = glob.glob(file_type)
+         for jjj in range(len(geojson_files)):
+             print(jjj)
+             geom = geopandas.read_file(geojson_files[jjj])
+             gps_gdf.loc[jjj] = (jjj, geom.ship[0], geom.cruise[0], geom.sensor[0], geom.geometry[0])
+             #gps_gdf.loc[0] = (0, "Henry_B._Bigelow", cruise_name, "EK60", geom) # (ship, cruise, sensor, geometry)
+         print(gps_gdf)
+         gps_gdf.set_index('id', inplace=True)
+         gps_gdf.to_file(f"data.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
+         return list(gps_gdf.cruise)
+
+         # gps_gdf.loc[iii] = (iii, "Henry_B._Bigelow", cruise_name, "EK60", geom) # (ship, cruise, sensor, geometry)
+         #print('writing to file')
+         #print(gps_gdf)
+         # gps_gdf.set_index('id', inplace=True)
+         # gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
+         # https://gdal.org/en/latest/drivers/vector/jsonfg.html
+         # gps_gdf.to_file(
+         #     f"data.geojson",
+         #     driver="GeoJSON",
+         #     engine="pyogrio",
+         #     layer_options={"ID_FIELD": "id"}
+         # )
+         # gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON", engine="pyogrio", id_generate=True)

+         # print(fiona.supported_drivers) # {'DXF': 'rw', 'CSV': 'raw', 'OpenFileGDB': 'raw', 'ESRIJSON': 'r', 'ESRI Shapefile': 'raw', 'FlatGeobuf': 'raw', 'GeoJSON': 'raw', 'GeoJSONSeq': 'raw', 'GPKG': 'raw', 'GML': 'rw', 'OGR_GMT': 'rw', 'GPX': 'rw', 'MapInfo File': 'raw', 'DGN': 'raw', 'S57': 'r', 'SQLite': 'raw', 'TopoJSON': 'r'}
+         #gps_gdf.to_file('dataframe.shp', crs="EPSG:4326", engine="fiona")
+         # Convert geojson feature collection to pmtiles
+         #gps_gdf.to_file("dataframe.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona")
+         #print("done")
+         # ---Export Shapefile--- #

+
+
+         #gps_gdf.set_geometry(col='geometry', inplace=True)
+         #gps_gdf.__geo_interface__
+         #gps_gdf.set_index('id', inplace=True)
+         #gps_gdf.to_file(f"dataframe3.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona", index=True)
+
+         ### this gives the right layer id values
+         #gps_gdf.to_file(f"dataframe6.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
+         # jq '{"type": "FeatureCollection", "features": [.[] | .features[]]}' --slurp input*.geojson > output.geojson
+         #tippecanoe -zg --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises output.geojson
+         #tippecanoe -zg --convert-stringified-ids-to-numbers --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises dataframe*.geojson
+         # {
+         # "type": "FeatureCollection",
+         # "name": "dataframe5",
+         # "features": [
+         # { "type": "Feature", "id": 0, "properties": { "id": 0, "ship": "Henry_B._Bigelow", "cruise": "HB0706", "sensor": "EK60" }, "geometry": { "type": "LineString", "coordinates": [ [ -72.120498657226562, 39.659671783447266 ], [ -72.120773315429688, 39.660198211669922 ] ] } },
+         # { "type": "Feature", "id": 1, "properties": { "id": 1, "ship": "Henry_B._Bigelow", "cruise": "HB0707", "sensor": "EK60" }, "geometry": { "type": "LineString", "coordinates": [ [ -71.797836303710938, 41.003166198730469 ], [ -71.797996520996094, 41.002998352050781 ], [ -71.798583984375, 41.002994537353516 ] ] } },
+         # { "type": "Feature", "id": 2, "properties": { "id": 2, "ship": "Henry_B._Bigelow", "cruise": "HB0710", "sensor": "EK60" }, "geometry": { "type": "LineString", "coordinates": [ [ -72.489486694335938, 40.331901550292969 ], [ -72.490760803222656, 40.33099365234375 ] ] } }
+         # ]
+         # }
+         """
+         # https://docs.protomaps.com/pmtiles/create
+         #ogr2ogr -t_srs EPSG:4326 data.geojson dataframe.shp
+         # Only need to do the second one here...
+         tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises dataframe.geojson
+         tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
+         # used this to combine all the geojson files into single pmtile file (2024-12-03):
+         tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
+
+         TODO:
+         run each one of the cruises in a separate ospool workflow.
+         each process gets own store
+         """
  ###########################################################
+
+ # s3_manager = S3Manager()  # endpoint_url=endpoint_url)
+ # # s3fs_manager = S3FSManager()
+ # # input_bucket_name = "test_input_bucket"
+ # # s3_manager.create_bucket(bucket_name=input_bucket_name)
+ # ship_name = "Henry_B._Bigelow"
+ # cruise_name = "HB0706"
+ # sensor_name = "EK60"
+ #
+ # # ---Scan Bucket For All Zarr Stores--- #
+ # # https://noaa-wcsd-zarr-pds.s3.amazonaws.com/index.html#level_2/Henry_B._Bigelow/HB0706/EK60/HB0706.zarr/
+ # path_to_zarr_store = f"s3://noaa-wcsd-zarr-pds/level_2/Henry_B._Bigelow/HB0706/EK60/HB0706.zarr"
+ # s3 = s3fs.S3FileSystem()
+ # zarr_store = s3fs.S3Map(path_to_zarr_store, s3=s3)
+ # ds_zarr = xr.open_zarr(zarr_store, consolidated=None)
+ # print(ds_zarr.Sv.shape)
+
+
+
+ total = [246847, 89911, 169763, 658047, 887640, 708771, 187099, 3672813, 4095002, 763268, 162727, 189454, 1925270, 3575857, 1031920, 1167590, 3737415, 4099957, 3990725, 3619996, 3573052, 2973090, 55851, 143192, 1550164, 3692819, 668400, 489735, 393260, 1311234, 242989, 4515760, 1303091, 704663, 270645, 3886437, 4204381, 1062090, 428639, 541455, 4206506, 298561, 1279329, 137416, 139836, 228947, 517949]
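The three new methods above are designed to chain together: each cruise's Zarr store on s3://noaa-wcsd-zarr-pds is opened, its simplified navigation track is written to dataframe_&lt;cruise&gt;.geojson, the per-cruise files are merged into data.geojson, and tippecanoe (per the protomaps link in the code) converts the merged file to PMTiles. A minimal driver sketch of that flow is below; the cruise list is illustrative (taken from the HB0706/HB0707/HB0710 examples in the commented-out feature collection), and the tippecanoe step runs in a shell, not in this package.

from water_column_sonar_processing.geometry.pmtile_generation import PMTileGeneration

pmtile = PMTileGeneration()
cruises = ["HB0706", "HB0707", "HB0710"]  # illustrative Henry_B._Bigelow cruises

# Step 1: write one simplified-track GeoJSON per cruise (dataframe_<cruise>.geojson),
# opening the public Zarr stores in parallel with a thread pool.
completed = pmtile.open_zarr_stores_with_thread_pool_executor(cruises=cruises)

# Step 2: merge the per-cruise files into a single feature collection (data.geojson).
pmtile.aggregate_geojson_into_dataframe()

# Step 3 (shell), as noted in the module's comments:
#   tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises \
#       --coalesce-densest-as-needed --extend-zooms-if-still-dropping data.geojson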
water_column_sonar_processing/index/index_manager.py

@@ -4,7 +4,7 @@ import pandas as pd
  from datetime import datetime
  from concurrent.futures import ThreadPoolExecutor
  from concurrent.futures import as_completed
- from water_column_sonar_processing.aws.s3_manager import S3Manager
+ from water_column_sonar_processing.aws import S3Manager


  class IndexManager:
@@ -16,12 +16,10 @@ class IndexManager:
          self.s3_manager = S3Manager()

      #################################################################
-
      def list_ships(
          self,
          prefix="data/raw/",
      ):
-         # s3_client = self.s3_manager.s3_client
          page_iterator = self.s3_manager.paginator.paginate(
              Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
          )
@@ -79,6 +77,7 @@ class IndexManager:
              all_files.extend([i["Key"] for i in page["Contents"]])
          return [i for i in all_files if i.endswith(".raw")]

+     #################################################################
      def get_raw_files_csv(
          self,
          ship_name,
@@ -86,7 +85,9 @@ class IndexManager:
          sensor_name,
      ):
          raw_files = self.get_raw_files(
-             ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
+             ship_name=ship_name,
+             cruise_name=cruise_name,
+             sensor_name=sensor_name
          )
          files_list = [
              {
@@ -102,7 +103,10 @@ class IndexManager:
          print("done")

      #################################################################
-     def get_subset_ek60_prefix(self, df: pd.DataFrame) -> pd.DataFrame:
+     def get_subset_ek60_prefix(  # TODO: is this used?
+         self,
+         df: pd.DataFrame
+     ) -> pd.DataFrame:
          # Returns all objects with 'EK60' in prefix of file path
          # Note that this can include 'EK80' data that are false-positives
          # in dataframe with ['key', 'filename', 'ship', 'cruise', 'sensor', 'size', 'date', 'datagram']
@@ -119,13 +123,13 @@ class IndexManager:
                  2:5
              ]  # 'Okeanos_Explorer', 'EX1608', 'EK60'
              if (
-                 re.search("[D](\d{8})", filename) is not None
-                 and re.search("[T](\d{6})", filename) is not None
+                 re.search("[D](\\d{8})", filename) is not None
+                 and re.search("[T](\\d{6})", filename) is not None
              ):
                  # Parse date if possible e.g.: 'data/raw/Henry_B._Bigelow/HB1006/EK60/HBB-D20100723-T025105.raw'
                  # and 'data/raw/Henry_B._Bigelow/HB1802/EK60/D20180513-T150250.raw'
-                 date_substring = re.search("[D](\d{8})", filename).group(1)
-                 time_substring = re.search("[T](\d{6})", filename).group(1)
+                 date_substring = re.search("[D](\\d{8})", filename).group(1)
+                 time_substring = re.search("[T](\\d{6})", filename).group(1)
                  date_string = datetime.strptime(
                      f"{date_substring}{time_substring}", "%Y%m%d%H%M%S"
                  )
@@ -146,7 +150,10 @@ class IndexManager:
          return pd.DataFrame(objects)

      #################################################################
-     def scan_datagram(self, select_key: str) -> list:
+     def scan_datagram(
+         self,
+         select_key: str
+     ) -> list:
          # Reads the first 8 bytes of S3 file. Used to determine if ek60 or ek80
          # Note: uses boto3 session instead of boto3 client: https://github.com/boto/boto3/issues/801
          # select_key = 'data/raw/Albatross_Iv/AL0403/EK60/L0005-D20040302-T200108-EK60.raw'
@@ -162,7 +169,10 @@ class IndexManager:
          return first_datagram

      #################################################################
-     def get_subset_datagrams(self, df: pd.DataFrame) -> list:
+     def get_subset_datagrams(
+         self,
+         df: pd.DataFrame
+     ) -> list:
          print("getting subset of datagrams")
          select_keys = list(
              df[["KEY", "CRUISE"]].drop_duplicates(subset="CRUISE")["KEY"].values
@@ -181,7 +191,9 @@ class IndexManager:

      #################################################################
      def get_ek60_objects(
-         self, df: pd.DataFrame, subset_datagrams: list
+         self,
+         df: pd.DataFrame,
+         subset_datagrams: list
      ) -> pd.DataFrame:
          # for each key write datagram value to all other files in same cruise
          for subset_datagram in subset_datagrams:
@@ -195,7 +207,7 @@ class IndexManager:
          return df.loc[df["DATAGRAM"] == "CON0"]

      #################################################################
-     def get_calibration_information(  # tested
+     def get_calibration_information(
          self,
      ) -> pd.DataFrame:
          # Calibration data generated by data manager currently located here:
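Aside from import-path and formatting changes, the functional change in this file is the regex escaping: "[D](\d{8})" becomes "[D](\\d{8})", so the backslash reaches the re module explicitly instead of relying on Python's deprecated handling of invalid escape sequences in plain string literals; the pattern matched is unchanged. A standalone sketch of the same filename parsing, using the example key from the code comments:

import re
from datetime import datetime

# Example key taken from the comments in get_subset_ek60_prefix above.
key = "data/raw/Henry_B._Bigelow/HB1006/EK60/HBB-D20100723-T025105.raw"
filename = key.split("/")[-1]

if (
    re.search("[D](\\d{8})", filename) is not None
    and re.search("[T](\\d{6})", filename) is not None
):
    date_substring = re.search("[D](\\d{8})", filename).group(1)  # "20100723"
    time_substring = re.search("[T](\\d{6})", filename).group(1)  # "025105"
    print(datetime.strptime(f"{date_substring}{time_substring}", "%Y%m%d%H%M%S"))
    # 2010-07-23 02:51:05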
water_column_sonar_processing/model/zarr_manager.py

@@ -1,14 +1,14 @@
  import os
-
  import numcodecs
  import numpy as np
  import xarray as xr
  import zarr
  from numcodecs import Blosc

- from water_column_sonar_processing.aws.s3fs_manager import S3FSManager
- from water_column_sonar_processing.utility.constants import Constants, Coordinates
- from water_column_sonar_processing.utility.timestamp import Timestamp
+ from water_column_sonar_processing.aws import S3FSManager
+ from water_column_sonar_processing.utility import Constants
+ from water_column_sonar_processing.utility import Timestamp
+ from water_column_sonar_processing.utility import Coordinates

  numcodecs.blosc.use_threads = False
  numcodecs.blosc.set_nthreads(1)
@@ -32,8 +32,8 @@ class ZarrManager:
          self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")

      #######################################################
-     @staticmethod
      def get_depth_values(
+         self,
          min_echo_range: float = 1.0,  # minimum depth measured (zero non-inclusive) from whole cruise
          max_echo_range: float = 100.0,  # maximum depth measured from whole cruise
      ):
@@ -85,12 +85,11 @@ class ZarrManager:
              name=Coordinates.TIME.value,
              data=np.repeat(0.0, width),
              shape=width,
-             chunks=(
-                 Constants.TILE_SIZE.value,
-             ),  # TODO: the chunking scheme doesn't seem to be working here
+             chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
+             # Constants.TILE_SIZE.value,
+             # ),  # TODO: the chunking scheme doesn't seem to be working here
              dtype=np.dtype(Coordinates.TIME_DTYPE.value),
              compressor=self.__compressor,
-             # fill_value=0.,
              fill_value=np.nan,  # TODO: do i want nan's?
              overwrite=self.__overwrite,
          )
@@ -113,12 +112,12 @@ class ZarrManager:
              # TODO: verify that these values are correct
              data=depth_values,
              shape=len(depth_values),
-             chunks=Constants.TILE_SIZE.value,
+             chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
              dtype=np.dtype(
                  Coordinates.DEPTH_DTYPE.value
              ),  # float16 == 2 significant digits would be ideal
              compressor=self.__compressor,
-             # fill_value=np.nan,
+             fill_value=np.nan,
              overwrite=self.__overwrite,
          )
          # TODO: change to exception
@@ -133,15 +132,16 @@ class ZarrManager:
          # --- Coordinate: Latitude --- #
          root.create_dataset(
              name=Coordinates.LATITUDE.value,
-             data=np.repeat(0.0, width),
+             # data=np.repeat(0.0, width),
              shape=width,
-             chunks=Constants.TILE_SIZE.value,
+             chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
              dtype=np.dtype(Coordinates.LATITUDE_DTYPE.value),
              compressor=self.__compressor,
-             fill_value=0.0,
+             fill_value=np.nan,
              overwrite=self.__overwrite,
          )

+         # Note: LATITUDE is indexed by TIME
          root.latitude.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]

          root.latitude.attrs["long_name"] = Coordinates.LATITUDE_LONG_NAME.value
@@ -151,15 +151,16 @@ class ZarrManager:
          # --- Coordinate: Longitude --- #
          root.create_dataset(
              name=Coordinates.LONGITUDE.value,
-             data=np.repeat(0.0, width),  # root.longitude[:] = np.nan
+             # data=np.repeat(0.0, width),  # root.longitude[:] = np.nan
              shape=width,
-             chunks=Constants.TILE_SIZE.value,
+             chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
              dtype=np.dtype(Coordinates.LONGITUDE_DTYPE.value),
              compressor=self.__compressor,
-             fill_value=0.0,
+             fill_value=np.nan,
              overwrite=self.__overwrite,
          )

+         # Note: LONGITUDE is indexed by TIME
          root.longitude.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]

          root.longitude.attrs["long_name"] = Coordinates.LONGITUDE_LONG_NAME.value
@@ -170,19 +171,20 @@ class ZarrManager:
          # --- Coordinate: Bottom --- #
          root.create_dataset(
              name=Coordinates.BOTTOM.value,
-             # data=np.repeat(0.0, width),  # root.longitude[:] = np.nan
+             data=np.repeat(0.0, width),  # root.longitude[:] = np.nan
              shape=width,
-             chunks=Constants.TILE_SIZE.value,
+             chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
              dtype=np.dtype(Coordinates.BOTTOM_DTYPE.value),
              compressor=self.__compressor,
-             fill_value=np.nan,
+             fill_value=0.0,
              overwrite=self.__overwrite,
          )

-         root.longitude.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
+         # BOTTOM is indexed by TIME
+         root.bottom.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]

-         root.longitude.attrs["long_name"] = Coordinates.LONGITUDE_LONG_NAME.value
-         root.longitude.attrs["units"] = Coordinates.LONGITUDE_UNITS.value
+         root.bottom.attrs["long_name"] = Coordinates.BOTTOM_LONG_NAME.value
+         root.bottom.attrs["units"] = Coordinates.BOTTOM_UNITS.value

          #####################################################################
          # --- Coordinate: Frequency --- #
@@ -190,7 +192,7 @@ class ZarrManager:
              name=Coordinates.FREQUENCY.value,
              data=frequencies,
              shape=len(frequencies),
-             chunks=1,
+             chunks=len(frequencies),
              dtype=np.dtype(Coordinates.FREQUENCY_DTYPE.value),
              compressor=self.__compressor,
              fill_value=0.0,
@@ -213,7 +215,7 @@ class ZarrManager:
          root.create_dataset(
              name=Coordinates.SV.value,
              shape=(len(depth_values), width, len(frequencies)),
-             chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, 1),
+             chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, len(frequencies)),
              dtype=np.dtype(
                  Coordinates.SV_DTYPE.value
              ),  # TODO: try to experiment with 'float16'
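The recurring change in this file is the chunking scheme: the one-dimensional time/depth/latitude/longitude/bottom arrays now use Constants.SPATIOTEMPORAL_CHUNK_SIZE instead of Constants.TILE_SIZE, the frequency coordinate and the frequency axis of Sv are chunked so all frequencies live in a single chunk, and latitude/longitude switch to NaN fill values. A minimal sketch of the resulting layout, using the same zarr-python 2.x style create_dataset calls as the module and illustrative stand-in values for the two constants (the real values live in water_column_sonar_processing.utility.constants):

import numpy as np
import zarr

TILE_SIZE = 512                   # stand-in for Constants.TILE_SIZE.value
SPATIOTEMPORAL_CHUNK_SIZE = 2048  # stand-in for Constants.SPATIOTEMPORAL_CHUNK_SIZE.value
width = 10_000                    # number of pings (time dimension)
depth_bins = 1_000
frequencies = [18_000.0, 38_000.0, 120_000.0, 200_000.0]

root = zarr.group()
root.create_dataset(
    name="time",
    shape=width,
    chunks=SPATIOTEMPORAL_CHUNK_SIZE,  # 1-D coordinates use the larger chunk size
    dtype="float64",
    fill_value=np.nan,
)
root.create_dataset(
    name="frequency",
    data=np.array(frequencies),
    shape=len(frequencies),
    chunks=len(frequencies),           # all frequencies in one chunk
    dtype="float64",
    fill_value=0.0,
)
root.create_dataset(
    name="Sv",
    shape=(depth_bins, width, len(frequencies)),
    chunks=(TILE_SIZE, TILE_SIZE, len(frequencies)),  # tile depth/time, keep frequency whole
    dtype="float32",
    fill_value=np.nan,
)
print(root["Sv"].chunks)  # (512, 512, 4)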
water_column_sonar_processing/process.py

@@ -3,10 +3,10 @@ import os

  import numpy as np

- from water_column_sonar_processing.aws.dynamodb_manager import DynamoDBManager
- from water_column_sonar_processing.aws.s3_manager import S3Manager
- from water_column_sonar_processing.aws.s3fs_manager import S3FSManager
- from water_column_sonar_processing.aws.sns_manager import SNSManager
+ from water_column_sonar_processing.aws import DynamoDBManager
+ from water_column_sonar_processing.aws import S3Manager
+ from water_column_sonar_processing.aws import S3FSManager
+ from water_column_sonar_processing.aws import SNSManager


  ###########################################################
water_column_sonar_processing/processing/__init__.py

@@ -0,0 +1,4 @@
+ from .cruise_sampler import CruiseSampler
+ from .raw_to_zarr import RawToZarr
+
+ __all__ = ["CruiseSampler", "RawToZarr"]
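The new processing subpackage re-exports both classes, matching the flattened aws and utility imports used elsewhere in this release, so downstream code can pull them in with a single import (constructor signatures are not shown in this diff):

# CruiseSampler and RawToZarr come from the cruise_sampler.py and raw_to_zarr.py modules added in 0.0.9.
from water_column_sonar_processing.processing import CruiseSampler, RawToZarr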