water_column_sonar_processing-0.0.6-py3-none-any.whl → water_column_sonar_processing-0.0.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of water-column-sonar-processing has been flagged as possibly problematic.

Files changed (21)
  1. water_column_sonar_processing/__init__.py +4 -5
  2. water_column_sonar_processing/aws/dynamodb_manager.py +149 -43
  3. water_column_sonar_processing/aws/s3_manager.py +71 -37
  4. water_column_sonar_processing/cruise/create_empty_zarr_store.py +6 -4
  5. water_column_sonar_processing/cruise/resample_regrid.py +3 -3
  6. water_column_sonar_processing/geometry/geometry_manager.py +21 -6
  7. water_column_sonar_processing/geometry/pmtile_generation.py +202 -13
  8. water_column_sonar_processing/index/index_manager.py +25 -13
  9. water_column_sonar_processing/model/zarr_manager.py +26 -25
  10. water_column_sonar_processing/process.py +4 -4
  11. water_column_sonar_processing/processing/__init__.py +4 -0
  12. water_column_sonar_processing/processing/cruise_sampler.py +342 -0
  13. water_column_sonar_processing/processing/raw_to_zarr.py +349 -0
  14. water_column_sonar_processing/utility/cleaner.py +1 -0
  15. water_column_sonar_processing/utility/constants.py +6 -2
  16. {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-0.0.8.dist-info}/METADATA +21 -10
  17. water_column_sonar_processing-0.0.8.dist-info/RECORD +32 -0
  18. {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-0.0.8.dist-info}/WHEEL +1 -1
  19. water_column_sonar_processing-0.0.6.dist-info/RECORD +0 -29
  20. {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-0.0.8.dist-info}/LICENSE +0 -0
  21. {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-0.0.8.dist-info}/top_level.txt +0 -0
water_column_sonar_processing/geometry/pmtile_generation.py

@@ -1,14 +1,24 @@
+import glob
 import os
 from pathlib import Path
-
-# from shapely import wkt
-# import json
-# from shapely.geometry import shape, GeometryCollection
 import fiona
-import geopandas
+import s3fs
+import numpy as np
 import pandas as pd
+import xarray as xr
+import geopandas
+import geopandas as gpd
+import pyogrio
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from shapely.geometry import LineString

+from src.water_column_sonar_processing.aws import S3Manager, S3FSManager
+
+MAX_POOL_CONNECTIONS = 64
+MAX_CONCURRENCY = 64
+MAX_WORKERS = 64
+GB = 1024**3
+

 class PMTileGeneration(object):
     #######################################################
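One consequence of the added `from src.water_column_sonar_processing.aws import ...` lines (repeated in several modules below): that path only resolves in a source checkout where the `src/` directory itself sits on `sys.path`. An installed wheel ships no top-level `src` package, so the import fails at runtime. A minimal sketch of the failure and the conventional alternatives (a sketch only, not the maintainer's confirmed intent):

import sys

# In an installed wheel the package root is water_column_sonar_processing,
# not src, so the 0.0.8-style absolute import raises at import time:
try:
    from src.water_column_sonar_processing.aws import S3Manager
except ModuleNotFoundError as err:
    print(err, file=sys.stderr)  # No module named 'src'

# Conventional alternatives: import against the installed package name...
from water_column_sonar_processing.aws import S3Manager
# ...or, from inside the package itself, a relative import:
# from ..aws import S3Manager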
@@ -18,34 +28,35 @@ class PMTileGeneration(object):
         print("123")

     #######################################################
+    # This uses a local collection of file-level geojson files to create the data
     def generate_geojson_feature_collection(self):
         # This was used to read from noaa-wcsd-model-pds bucket geojson files and then to
         # generate the geopandas dataframe which could be exported to another comprehensive
         # geojson file. That
         result = list(Path("/Users/r2d2/Documents/echofish/geojson").rglob("*.json"))
         # result = result[:100]
-        iii = 0
+        jjj = 0
         pieces = []
-        for iii in range(len(result)):
-            file_name = os.path.normpath(result[iii]).split(os.sep)[-1]
+        for jjj in range(len(result)):
+            file_name = os.path.normpath(result[jjj]).split(os.sep)[-1]
             file_stem = os.path.splitext(os.path.basename(file_name))[0]
-            geom = geopandas.read_file(result[iii]).iloc[0]["geometry"]
+            geom = gpd.read_file(result[jjj]).iloc[0]["geometry"]
             # TDOO: Filter (0,0) coordinates
             if len(geom.coords.xy[0]) < 2:
                 continue
             geom = LineString(list(zip(geom.coords.xy[1], geom.coords.xy[0])))
             pieces.append(
                 {
-                    "ship_name": os.path.normpath(result[iii]).split(os.sep)[-4],
-                    "cruise_name": os.path.normpath(result[iii]).split(os.sep)[-3],
+                    "ship_name": os.path.normpath(result[jjj]).split(os.sep)[-4],
+                    "cruise_name": os.path.normpath(result[jjj]).split(os.sep)[-3],
                     "file_stem": file_stem,
-                    "file_path": result[iii],
+                    "file_path": result[jjj],
                     "geom": geom,
                 }
             )
         df = pd.DataFrame(pieces)
         print(df)
-        gps_gdf = geopandas.GeoDataFrame(
+        gps_gdf = gpd.GeoDataFrame(
             data=df[
                 ["ship_name", "cruise_name", "file_stem"]
             ],  # try again with file_stem
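For comparison, the repeated `os.path.normpath(...).split(os.sep)` indexing above assumes a fixed `ship/cruise/sensor/file.json` depth; a pathlib-based sketch of the same field extraction (the example path is hypothetical, mirroring the layout the loop assumes):

from pathlib import Path

# Hypothetical path following the ship/cruise/sensor/file layout assumed above
path = Path("/Users/r2d2/Documents/echofish/geojson/Henry_B._Bigelow/HB0706/EK60/D20070711-T182032.json")
ship_name = path.parts[-4]    # "Henry_B._Bigelow"
cruise_name = path.parts[-3]  # "HB0706"
file_stem = path.stem         # "D20070711-T182032"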
@@ -70,6 +81,184 @@ class PMTileGeneration(object):
         """

     #######################################################
+    # TODO: temporary using this to get info
+    def get_info_from_zarr_store(
+        self,
+        ship_name,
+        cruise_names,
+    ):
+        total_size = 0
+        s3_fs = s3fs.S3FileSystem(anon=True)
+        for cruise_name in cruise_names:
+            path_to_zarr_store = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
+            zarr_store = s3fs.S3Map(root=path_to_zarr_store, s3=s3_fs)
+            xr_store = xr.open_zarr(store=zarr_store, consolidated=None)
+            print(f'Cruise: {cruise_name}, shape: {xr_store.time.shape[0]}')
+            total_size = total_size + xr_store.time.shape[0]
+
+    def get_geospatial_info_from_zarr_store(
+        self,
+        ship_name,
+        cruise_name,
+    ):
+        """
+        Open Zarr store, create geometry, write to geojson, return name
+        """
+        s3_fs = s3fs.S3FileSystem(anon=True)
+        gps_gdf = geopandas.GeoDataFrame(
+            columns=["id", "ship", "cruise", "sensor", "geometry"],
+            geometry="geometry",
+            crs="EPSG:4326"
+        )
+        path_to_zarr_store = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
+        # file_name = os.path.normpath(path_to_zarr_store).split(os.sep)[-1]
+        # file_stem = os.path.splitext(os.path.basename(file_name))[0]
+        zarr_store = s3fs.S3Map(root=path_to_zarr_store, s3=s3_fs)
+        # ---Open Zarr Store--- #
+        # TODO: try-except to allow failures
+        print('opening store')
+        # xr_store = xr.open_zarr(store=zarr_store, consolidated=False)
+        xr_store = xr.open_zarr(store=zarr_store, consolidated=None)
+        print(xr_store.Sv.shape)
+        # ---Read Zarr Store Time/Latitude/Longitude--- #
+        latitude = xr_store.latitude.values
+        longitude = xr_store.longitude.values
+        if np.isnan(latitude).any() or np.isnan(longitude).any():
+            print(f'there was missing lat-lon data for {cruise_name}')
+            return None
+        # ---Add To GeoPandas Dataframe--- #
+        # TODO: experiment with tolerance "0.001"
+        geom = LineString(list(zip(longitude, latitude))).simplify(tolerance=0.001, preserve_topology=True)
+        gps_gdf.loc[0] = (0, "Henry_B._Bigelow", cruise_name, "EK60", geom)  # (ship, cruise, sensor, geometry)
+        gps_gdf.set_index('id', inplace=True)
+        gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON")  # , engine="pyogrio")
+        return cruise_name

+    #######################################################
+    def open_zarr_stores_with_thread_pool_executor(
+        self,
+        cruises: list,
+    ):
+        # 'cruises' is a list of cruises to process
+        completed_cruises = []
+        try:
+            with ThreadPoolExecutor(max_workers=32) as executor:
+                futures = [
+                    executor.submit(
+                        self.get_geospatial_info_from_zarr_store,
+                        "Henry_B._Bigelow",  # ship_name
+                        cruise,  # cruise_name
+                    )
+                    for cruise in cruises
+                ]
+                for future in as_completed(futures):
+                    result = future.result()
+                    if result:
+                        completed_cruises.extend([result])
+        except Exception as err:
+            print(err)
+        print("Done opening zarr stores using thread pool.")
+        return completed_cruises  # Took ~12 minutes

+    #######################################################
+    # https://docs.protomaps.com/pmtiles/create
+    def aggregate_geojson_into_dataframe(
+        self
+    ):
+        """
+        iterate through cruises, threadpoolexecute geojson creation, aggregate geojson files into df,
+        """
+        gps_gdf = geopandas.GeoDataFrame(
+            columns=["id", "ship", "cruise", "sensor", "geometry"],
+            geometry="geometry",
+            crs="EPSG:4326"
+        )
+
+        file_type = 'dataframe_*.geojson'
+        geojson_files = glob.glob(file_type)
+        for jjj in range(len(geojson_files)):
+            print(jjj)
+            geom = geopandas.read_file(geojson_files[jjj])
+            gps_gdf.loc[jjj] = (jjj, geom.ship[0], geom.cruise[0], geom.sensor[0], geom.geometry[0])
+            # gps_gdf.loc[0] = (0, "Henry_B._Bigelow", cruise_name, "EK60", geom)  # (ship, cruise, sensor, geometry)
+        print(gps_gdf)
+        gps_gdf.set_index('id', inplace=True)
+        gps_gdf.to_file(f"data.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
+        return list(gps_gdf.cruise)
+
+        # gps_gdf.loc[iii] = (iii, "Henry_B._Bigelow", cruise_name, "EK60", geom)  # (ship, cruise, sensor, geometry)
+        # print('writing to file')
+        # print(gps_gdf)
+        # gps_gdf.set_index('id', inplace=True)
+        # gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
+        # https://gdal.org/en/latest/drivers/vector/jsonfg.html
+        # gps_gdf.to_file(
+        #     f"data.geojson",
+        #     driver="GeoJSON",
+        #     engine="pyogrio",
+        #     layer_options={"ID_FIELD": "id"}
+        # )
+        # gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON", engine="pyogrio", id_generate=True)
+
+        # print(fiona.supported_drivers)  # {'DXF': 'rw', 'CSV': 'raw', 'OpenFileGDB': 'raw', 'ESRIJSON': 'r', 'ESRI Shapefile': 'raw', 'FlatGeobuf': 'raw', 'GeoJSON': 'raw', 'GeoJSONSeq': 'raw', 'GPKG': 'raw', 'GML': 'rw', 'OGR_GMT': 'rw', 'GPX': 'rw', 'MapInfo File': 'raw', 'DGN': 'raw', 'S57': 'r', 'SQLite': 'raw', 'TopoJSON': 'r'}
+        # gps_gdf.to_file('dataframe.shp', crs="EPSG:4326", engine="fiona")
+        # Convert geojson feature collection to pmtiles
+        # gps_gdf.to_file("dataframe.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona")
+        # print("done")
+        # ---Export Shapefile--- #
+
+
+
+        # gps_gdf.set_geometry(col='geometry', inplace=True)
+        # gps_gdf.__geo_interface__
+        # gps_gdf.set_index('id', inplace=True)
+        # gps_gdf.to_file(f"dataframe3.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona", index=True)
+
+        ### this gives the right layer id values
+        # gps_gdf.to_file(f"dataframe6.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
+        # jq '{"type": "FeatureCollection", "features": [.[] | .features[]]}' --slurp input*.geojson > output.geojson
+        # tippecanoe -zg --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises output.geojson
+        # tippecanoe -zg --convert-stringified-ids-to-numbers --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises dataframe*.geojson
+        # {
+        #     "type": "FeatureCollection",
+        #     "name": "dataframe5",
+        #     "features": [
+        #         { "type": "Feature", "id": 0, "properties": { "id": 0, "ship": "Henry_B._Bigelow", "cruise": "HB0706", "sensor": "EK60" }, "geometry": { "type": "LineString", "coordinates": [ [ -72.120498657226562, 39.659671783447266 ], [ -72.120773315429688, 39.660198211669922 ] ] } },
+        #         { "type": "Feature", "id": 1, "properties": { "id": 1, "ship": "Henry_B._Bigelow", "cruise": "HB0707", "sensor": "EK60" }, "geometry": { "type": "LineString", "coordinates": [ [ -71.797836303710938, 41.003166198730469 ], [ -71.797996520996094, 41.002998352050781 ], [ -71.798583984375, 41.002994537353516 ] ] } },
+        #         { "type": "Feature", "id": 2, "properties": { "id": 2, "ship": "Henry_B._Bigelow", "cruise": "HB0710", "sensor": "EK60" }, "geometry": { "type": "LineString", "coordinates": [ [ -72.489486694335938, 40.331901550292969 ], [ -72.490760803222656, 40.33099365234375 ] ] } }
+        #     ]
+        # }
+    """
+    # https://docs.protomaps.com/pmtiles/create
+    # ogr2ogr -t_srs EPSG:4326 data.geojson dataframe.shp
+    # Only need to do the second one here...
+    tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises dataframe.geojson
+    tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
+    # used this to combine all the geojson files into single pmtile file (2024-12-03):
+    tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson

+    TODO:
+    run each one of the cruises in a separate ospool workflow.
+    each process gets own store
+    """
 ###########################################################
+
+# s3_manager = S3Manager()  # endpoint_url=endpoint_url)
+# # s3fs_manager = S3FSManager()
+# # input_bucket_name = "test_input_bucket"
+# # s3_manager.create_bucket(bucket_name=input_bucket_name)
+# ship_name = "Henry_B._Bigelow"
+# cruise_name = "HB0706"
+# sensor_name = "EK60"
+#
+# # ---Scan Bucket For All Zarr Stores--- #
+# # https://noaa-wcsd-zarr-pds.s3.amazonaws.com/index.html#level_2/Henry_B._Bigelow/HB0706/EK60/HB0706.zarr/
+# path_to_zarr_store = f"s3://noaa-wcsd-zarr-pds/level_2/Henry_B._Bigelow/HB0706/EK60/HB0706.zarr"
+# s3 = s3fs.S3FileSystem()
+# zarr_store = s3fs.S3Map(path_to_zarr_store, s3=s3)
+# ds_zarr = xr.open_zarr(zarr_store, consolidated=None)
+# print(ds_zarr.Sv.shape)
+
+
+
+total = [246847, 89911, 169763, 658047, 887640, 708771, 187099, 3672813, 4095002, 763268, 162727, 189454, 1925270, 3575857, 1031920, 1167590, 3737415, 4099957, 3990725, 3619996, 3573052, 2973090, 55851, 143192, 1550164, 3692819, 668400, 489735, 393260, 1311234, 242989, 4515760, 1303091, 704663, 270645, 3886437, 4204381, 1062090, 428639, 541455, 4206506, 298561, 1279329, 137416, 139836, 228947, 517949]
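Taken together, the new methods form a small pipeline: fan per-cruise GeoJSON extraction out over a thread pool, aggregate the `dataframe_*.geojson` outputs into one feature collection, then tile it with tippecanoe. A usage sketch (the import path follows the wheel layout; the cruise list and tippecanoe flags are copied from the comments above, not verified here):

from water_column_sonar_processing.geometry.pmtile_generation import PMTileGeneration

pmtile = PMTileGeneration()
# Each worker opens one cruise's Zarr store and writes dataframe_<cruise>.geojson
done = pmtile.open_zarr_stores_with_thread_pool_executor(cruises=["HB0706", "HB0707", "HB0710"])
# Merge the per-cruise files into data.geojson (returns the cruise names found)
pmtile.aggregate_geojson_into_dataframe()

# Then, per the docstring, from the shell:
#   tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises \
#       --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson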
water_column_sonar_processing/index/index_manager.py

@@ -4,7 +4,7 @@ import pandas as pd
 from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor
 from concurrent.futures import as_completed
-from water_column_sonar_processing.aws.s3_manager import S3Manager
+from src.water_column_sonar_processing.aws.s3_manager import S3Manager


 class IndexManager:
@@ -16,12 +16,10 @@ class IndexManager:
         self.s3_manager = S3Manager()

     #################################################################
-
     def list_ships(
         self,
         prefix="data/raw/",
     ):
-        # s3_client = self.s3_manager.s3_client
         page_iterator = self.s3_manager.paginator.paginate(
             Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
         )
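`list_ships` leans on the ListObjectsV2 paginator with `Delimiter="/"`, which collapses everything under `data/raw/` into one `CommonPrefixes` entry per ship rather than listing every key. A standalone sketch of that pattern (the bucket name is illustrative; the class reads its bucket from configuration):

import boto3

s3_client = boto3.client("s3")
paginator = s3_client.get_paginator("list_objects_v2")

# Delimiter="/" stops listing at the next path segment, so each ship
# folder surfaces once as a CommonPrefixes entry instead of key-by-key.
for page in paginator.paginate(Bucket="example-bucket", Prefix="data/raw/", Delimiter="/"):
    for common_prefix in page.get("CommonPrefixes", []):
        print(common_prefix["Prefix"])  # e.g. "data/raw/Henry_B._Bigelow/"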
@@ -79,6 +77,7 @@ class IndexManager:
             all_files.extend([i["Key"] for i in page["Contents"]])
         return [i for i in all_files if i.endswith(".raw")]

+    #################################################################
     def get_raw_files_csv(
         self,
         ship_name,
@@ -86,7 +85,9 @@ class IndexManager:
         sensor_name,
     ):
         raw_files = self.get_raw_files(
-            ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
+            ship_name=ship_name,
+            cruise_name=cruise_name,
+            sensor_name=sensor_name
         )
         files_list = [
             {
@@ -102,7 +103,10 @@ class IndexManager:
         print("done")

     #################################################################
-    def get_subset_ek60_prefix(self, df: pd.DataFrame) -> pd.DataFrame:
+    def get_subset_ek60_prefix(  # TODO: is this used?
+        self,
+        df: pd.DataFrame
+    ) -> pd.DataFrame:
         # Returns all objects with 'EK60' in prefix of file path
         # Note that this can include 'EK80' data that are false-positives
         # in dataframe with ['key', 'filename', 'ship', 'cruise', 'sensor', 'size', 'date', 'datagram']
@@ -119,13 +123,13 @@ class IndexManager:
                 2:5
             ]  # 'Okeanos_Explorer', 'EX1608', 'EK60'
             if (
-                re.search("[D](\d{8})", filename) is not None
-                and re.search("[T](\d{6})", filename) is not None
+                re.search("[D](\\d{8})", filename) is not None
+                and re.search("[T](\\d{6})", filename) is not None
             ):
                 # Parse date if possible e.g.: 'data/raw/Henry_B._Bigelow/HB1006/EK60/HBB-D20100723-T025105.raw'
                 # and 'data/raw/Henry_B._Bigelow/HB1802/EK60/D20180513-T150250.raw'
-                date_substring = re.search("[D](\d{8})", filename).group(1)
-                time_substring = re.search("[T](\d{6})", filename).group(1)
+                date_substring = re.search("[D](\\d{8})", filename).group(1)
+                time_substring = re.search("[T](\\d{6})", filename).group(1)
                 date_string = datetime.strptime(
                     f"{date_substring}{time_substring}", "%Y%m%d%H%M%S"
                 )
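The `\d` → `\\d` edits here do not change the compiled pattern; they silence the invalid-escape warning that plain string literals like `"[D](\d{8})"` produce on newer Python (a SyntaxWarning as of 3.12). Raw strings are the usual third option:

import re

filename = "HBB-D20100723-T025105.raw"
# All three spellings compile to the same regex:
#   "[D](\d{8})"   - works, but \d is an invalid string escape (warns on newer Python)
#   "[D](\\d{8})"  - escaped backslash, as adopted in this release
#   r"[D](\d{8})"  - raw string, usually the idiomatic choice
print(re.search("[D](\\d{8})", filename).group(1))  # 20100723
print(re.search(r"[T](\d{6})", filename).group(1))  # 025105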
@@ -146,7 +150,10 @@ class IndexManager:
         return pd.DataFrame(objects)

     #################################################################
-    def scan_datagram(self, select_key: str) -> list:
+    def scan_datagram(
+        self,
+        select_key: str
+    ) -> list:
         # Reads the first 8 bytes of S3 file. Used to determine if ek60 or ek80
         # Note: uses boto3 session instead of boto3 client: https://github.com/boto/boto3/issues/801
         # select_key = 'data/raw/Albatross_Iv/AL0403/EK60/L0005-D20040302-T200108-EK60.raw'
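Since `scan_datagram` needs only the first 8 bytes of each `.raw` file to tell EK60 from EK80, the usual S3 approach is a ranged GET rather than a full download. A sketch under stated assumptions (the bucket name and header layout are assumptions, not taken from this diff; in Simrad RAW files a 4-byte datagram type follows a 4-byte length field):

import boto3

s3 = boto3.Session().client("s3")  # session-based client, per the comment above
response = s3.get_object(
    Bucket="example-bucket",  # hypothetical
    Key="data/raw/Albatross_Iv/AL0403/EK60/L0005-D20040302-T200108-EK60.raw",
    Range="bytes=0-7",  # fetch only the first 8 bytes
)
header = response["Body"].read()
datagram_type = header[4:8].decode("ascii", errors="ignore")  # assumed layout
print(datagram_type)  # "CON0" suggests EK60; "XML0" suggests EK80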
@@ -162,7 +169,10 @@ class IndexManager:
         return first_datagram

     #################################################################
-    def get_subset_datagrams(self, df: pd.DataFrame) -> list:
+    def get_subset_datagrams(
+        self,
+        df: pd.DataFrame
+    ) -> list:
         print("getting subset of datagrams")
         select_keys = list(
             df[["KEY", "CRUISE"]].drop_duplicates(subset="CRUISE")["KEY"].values
@@ -181,7 +191,9 @@ class IndexManager:

     #################################################################
     def get_ek60_objects(
-        self, df: pd.DataFrame, subset_datagrams: list
+        self,
+        df: pd.DataFrame,
+        subset_datagrams: list
     ) -> pd.DataFrame:
         # for each key write datagram value to all other files in same cruise
         for subset_datagram in subset_datagrams:
@@ -195,7 +207,7 @@ class IndexManager:
         return df.loc[df["DATAGRAM"] == "CON0"]

     #################################################################
-    def get_calibration_information(  # tested
+    def get_calibration_information(
         self,
     ) -> pd.DataFrame:
         # Calibration data generated by data manager currently located here:
water_column_sonar_processing/model/zarr_manager.py

@@ -1,14 +1,13 @@
 import os
-
 import numcodecs
 import numpy as np
 import xarray as xr
 import zarr
 from numcodecs import Blosc

-from water_column_sonar_processing.aws.s3fs_manager import S3FSManager
-from water_column_sonar_processing.utility.constants import Constants, Coordinates
-from water_column_sonar_processing.utility.timestamp import Timestamp
+from src.water_column_sonar_processing.aws.s3fs_manager import S3FSManager
+from src.water_column_sonar_processing.utility.constants import Constants, Coordinates
+from src.water_column_sonar_processing.utility.timestamp import Timestamp

 numcodecs.blosc.use_threads = False
 numcodecs.blosc.set_nthreads(1)
@@ -32,8 +31,8 @@ class ZarrManager:
         self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")

     #######################################################
-    @staticmethod
     def get_depth_values(
+        self,
         min_echo_range: float = 1.0,  # minimum depth measured (zero non-inclusive) from whole cruise
         max_echo_range: float = 100.0,  # maximum depth measured from whole cruise
     ):
@@ -85,12 +84,11 @@ class ZarrManager:
             name=Coordinates.TIME.value,
             data=np.repeat(0.0, width),
             shape=width,
-            chunks=(
-                Constants.TILE_SIZE.value,
-            ),  # TODO: the chunking scheme doesn't seem to be working here
+            chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
+            # Constants.TILE_SIZE.value,
+            # ),  # TODO: the chunking scheme doesn't seem to be working here
             dtype=np.dtype(Coordinates.TIME_DTYPE.value),
             compressor=self.__compressor,
-            # fill_value=0.,
             fill_value=np.nan,  # TODO: do i want nan's?
             overwrite=self.__overwrite,
         )
@@ -113,12 +111,12 @@ class ZarrManager:
             # TODO: verify that these values are correct
             data=depth_values,
             shape=len(depth_values),
-            chunks=Constants.TILE_SIZE.value,
+            chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
             dtype=np.dtype(
                 Coordinates.DEPTH_DTYPE.value
             ),  # float16 == 2 significant digits would be ideal
             compressor=self.__compressor,
-            # fill_value=np.nan,
+            fill_value=np.nan,
             overwrite=self.__overwrite,
         )
         # TODO: change to exception
@@ -133,15 +131,16 @@ class ZarrManager:
         # --- Coordinate: Latitude --- #
         root.create_dataset(
             name=Coordinates.LATITUDE.value,
-            data=np.repeat(0.0, width),
+            # data=np.repeat(0.0, width),
             shape=width,
-            chunks=Constants.TILE_SIZE.value,
+            chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
             dtype=np.dtype(Coordinates.LATITUDE_DTYPE.value),
             compressor=self.__compressor,
-            fill_value=0.0,
+            fill_value=np.nan,
             overwrite=self.__overwrite,
         )

+        # Note: LATITUDE is indexed by TIME
         root.latitude.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]

         root.latitude.attrs["long_name"] = Coordinates.LATITUDE_LONG_NAME.value
@@ -151,15 +150,16 @@ class ZarrManager:
         # --- Coordinate: Longitude --- #
         root.create_dataset(
             name=Coordinates.LONGITUDE.value,
-            data=np.repeat(0.0, width),  # root.longitude[:] = np.nan
+            # data=np.repeat(0.0, width),  # root.longitude[:] = np.nan
             shape=width,
-            chunks=Constants.TILE_SIZE.value,
+            chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
             dtype=np.dtype(Coordinates.LONGITUDE_DTYPE.value),
             compressor=self.__compressor,
-            fill_value=0.0,
+            fill_value=np.nan,
             overwrite=self.__overwrite,
         )

+        # Note: LONGITUDE is indexed by TIME
         root.longitude.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]

         root.longitude.attrs["long_name"] = Coordinates.LONGITUDE_LONG_NAME.value
@@ -170,19 +170,20 @@ class ZarrManager:
         # --- Coordinate: Bottom --- #
         root.create_dataset(
             name=Coordinates.BOTTOM.value,
-            # data=np.repeat(0.0, width),  # root.longitude[:] = np.nan
+            data=np.repeat(0.0, width),  # root.longitude[:] = np.nan
             shape=width,
-            chunks=Constants.TILE_SIZE.value,
+            chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
             dtype=np.dtype(Coordinates.BOTTOM_DTYPE.value),
             compressor=self.__compressor,
-            fill_value=np.nan,
+            fill_value=0.0,
             overwrite=self.__overwrite,
         )

-        root.longitude.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
+        # BOTTOM is indexed by TIME
+        root.bottom.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]

-        root.longitude.attrs["long_name"] = Coordinates.LONGITUDE_LONG_NAME.value
-        root.longitude.attrs["units"] = Coordinates.LONGITUDE_UNITS.value
+        root.bottom.attrs["long_name"] = Coordinates.BOTTOM_LONG_NAME.value
+        root.bottom.attrs["units"] = Coordinates.BOTTOM_UNITS.value

     #####################################################################
     # --- Coordinate: Frequency --- #
@@ -190,7 +191,7 @@ class ZarrManager:
             name=Coordinates.FREQUENCY.value,
             data=frequencies,
             shape=len(frequencies),
-            chunks=1,
+            chunks=len(frequencies),
             dtype=np.dtype(Coordinates.FREQUENCY_DTYPE.value),
             compressor=self.__compressor,
             fill_value=0.0,
@@ -213,7 +214,7 @@ class ZarrManager:
         root.create_dataset(
             name=Coordinates.SV.value,
             shape=(len(depth_values), width, len(frequencies)),
-            chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, 1),
+            chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, len(frequencies)),
             dtype=np.dtype(
                 Coordinates.SV_DTYPE.value
             ),  # TODO: try to experiment with 'float16'
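The chunking edits all point one way: coordinate arrays move to a dedicated `SPATIOTEMPORAL_CHUNK_SIZE`, and the frequency axis of both the `frequency` array and the `Sv` cube now fits in a single chunk, so reading one (depth, time) tile touches one object per tile instead of one per channel. A self-contained sketch of the effect (the shapes and the 512 tile size are illustrative stand-ins for the package's `Constants`, not its actual values):

import numpy as np
import zarr

depth_bins, pings, channels = 4096, 100_000, 4
tile = 512  # stand-in for Constants.TILE_SIZE.value

root = zarr.group()  # in-memory store, for illustration
sv = root.create_dataset(
    "Sv",
    shape=(depth_bins, pings, channels),
    chunks=(tile, tile, channels),  # all channels live in one chunk
    dtype=np.float32,
    fill_value=np.nan,
)
print(sv.chunks)   # (512, 512, 4): a tile read hits one chunk, not `channels` chunks
print(sv.nchunks)  # total number of chunks across the cube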
water_column_sonar_processing/process.py

@@ -3,10 +3,10 @@ import os

 import numpy as np

-from water_column_sonar_processing.aws.dynamodb_manager import DynamoDBManager
-from water_column_sonar_processing.aws.s3_manager import S3Manager
-from water_column_sonar_processing.aws.s3fs_manager import S3FSManager
-from water_column_sonar_processing.aws.sns_manager import SNSManager
+from src.water_column_sonar_processing.aws.dynamodb_manager import DynamoDBManager
+from src.water_column_sonar_processing.aws.s3_manager import S3Manager
+from src.water_column_sonar_processing.aws.s3fs_manager import S3FSManager
+from src.water_column_sonar_processing.aws.sns_manager import SNSManager


 ###########################################################
water_column_sonar_processing/processing/__init__.py (new file)

@@ -0,0 +1,4 @@
+from .cruise_sampler import CruiseSampler
+from .raw_to_zarr import RawToZarr
+
+__all__ = ["CruiseSampler", "RawToZarr"]
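With these re-exports, downstream code can import the two new classes from the subpackage directly; their constructor signatures are not visible in this diff, so the sketch stops at the import:

# Assumes the installed package imports cleanly (see the src.-prefix caveat above)
from water_column_sonar_processing.processing import CruiseSampler, RawToZarr

print(CruiseSampler.__name__, RawToZarr.__name__)  # CruiseSampler RawToZarr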