water-column-sonar-processing 25.11.1__py3-none-any.whl → 26.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (22)
  1. water_column_sonar_processing/aws/s3_manager.py +2 -4
  2. water_column_sonar_processing/aws/s3fs_manager.py +1 -9
  3. water_column_sonar_processing/cruise/create_empty_zarr_store.py +19 -81
  4. water_column_sonar_processing/cruise/resample_regrid.py +88 -104
  5. water_column_sonar_processing/geometry/__init__.py +2 -0
  6. water_column_sonar_processing/geometry/elevation_manager.py +2 -2
  7. water_column_sonar_processing/geometry/geometry_manager.py +11 -13
  8. water_column_sonar_processing/geometry/line_simplification.py +10 -10
  9. water_column_sonar_processing/geometry/pmtile_generation.py +8 -3
  10. water_column_sonar_processing/geometry/spatiotemporal.py +106 -0
  11. water_column_sonar_processing/index/index_manager.py +43 -46
  12. water_column_sonar_processing/model/zarr_manager.py +533 -514
  13. water_column_sonar_processing/processing/raw_to_zarr.py +45 -139
  14. water_column_sonar_processing/utility/cleaner.py +2 -1
  15. water_column_sonar_processing/utility/constants.py +29 -29
  16. water_column_sonar_processing-26.1.14.dist-info/METADATA +240 -0
  17. {water_column_sonar_processing-25.11.1.dist-info → water_column_sonar_processing-26.1.14.dist-info}/RECORD +20 -20
  18. water_column_sonar_processing/process.py +0 -149
  19. water_column_sonar_processing-25.11.1.dist-info/METADATA +0 -182
  20. {water_column_sonar_processing-25.11.1.dist-info → water_column_sonar_processing-26.1.14.dist-info}/WHEEL +0 -0
  21. {water_column_sonar_processing-25.11.1.dist-info → water_column_sonar_processing-26.1.14.dist-info}/licenses/LICENSE +0 -0
  22. {water_column_sonar_processing-25.11.1.dist-info → water_column_sonar_processing-26.1.14.dist-info}/top_level.txt +0 -0
@@ -1,18 +1,21 @@
- import importlib.metadata
+ import os
+ from importlib import metadata
+ from typing import Optional
 
  import numpy as np
  import xarray as xr
  import zarr
  from zarr.codecs import BloscCodec, BloscShuffle
- from zarr.storage import LocalStore
+ from zarr.core.group import Group
 
- from water_column_sonar_processing.aws import S3FSManager
  from water_column_sonar_processing.utility import Constants, Coordinates, Timestamp
 
- # TODO: change clevel to 9?!
- compressor = BloscCodec(cname="zstd", clevel=9, shuffle=BloscShuffle.shuffle)
-
- # TODO: when ready switch to version 3 of model spec
+ # https://zarr-specs.readthedocs.io/en/latest/v3/codecs/blosc/index.html
+ compressors = BloscCodec(
+     cname="zstd",
+     clevel=9,
+     shuffle=BloscShuffle.bitshuffle,
+ )
 
 
  # creates the latlon dataset: foo = ep.consolidate.add_location(ds_Sv, echodata)
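Note on the hunk above: the module-level codec switches from byte-level BloscShuffle.shuffle to BloscShuffle.bitshuffle and is renamed compressor to compressors, matching the plural keyword that zarr-python 3 arrays take. A minimal standalone sketch of the new configuration, assuming zarr-python >= 3 (the store path is hypothetical):

    import numpy as np
    import zarr
    from zarr.codecs import BloscCodec, BloscShuffle

    # zstd at maximum compression; bit-shuffle transposes individual bits across
    # elements, which often compresses smooth float32 echogram data better than
    # byte-shuffle at the cost of a little CPU.
    compressors = BloscCodec(cname="zstd", clevel=9, shuffle=BloscShuffle.bitshuffle)

    arr = zarr.create_array(
        store="/tmp/demo.zarr",  # hypothetical local path
        shape=(1024, 1024),
        chunks=(512, 512),
        dtype=np.float32,
        compressors=compressors,
        overwrite=True,
    )
    arr[:] = np.random.default_rng(0).random((1024, 1024), dtype=np.float32)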
@@ -20,33 +23,32 @@ class ZarrManager:
      #######################################################
      def __init__(
          self,
+         # endpoint_url: Optional[str] = None,
      ):
          self.__overwrite = True
+         self.key = os.environ.get("OUTPUT_BUCKET_ACCESS_KEY")
+         self.secret = os.environ.get("OUTPUT_BUCKET_SECRET_ACCESS_KEY")
 
      #######################################################
+     @staticmethod
      def get_depth_values(
-         self,
-         # min_echo_range: float, # minimum depth measured (zero non-inclusive) from whole cruise
          max_echo_range: float, # maximum depth measured from whole cruise
-         cruise_min_epsilon: float = 0.25, # resolution between subsequent measurements
-     ): # TODO: define return type
+         cruise_min_epsilon: float = 0.20, # delta subsequent measurements
+     ) -> np.ndarray[tuple]:
          # Gets the set of depth values that will be used when resampling and
          # regridding the dataset to a cruise level model store.
          # Note: returned values start at zero!
          # For more info see here: https://echopype.readthedocs.io/en/stable/data-proc-additional.html
-         print("Computing depth values.")
          all_cruise_depth_values = np.linspace( # TODO: PROBLEM HERE
-             start=0, # just start it at zero
-             stop=max_echo_range,
-             num=int(max_echo_range / cruise_min_epsilon)
-             + 1, # int(np.ceil(max_echo_range / cruise_min_epsilon))?
+             start=0, # start it at zero
+             stop=np.ceil(max_echo_range), # round up
+             num=int(np.ceil(max_echo_range) / cruise_min_epsilon) + 1,
              endpoint=True,
-         ) # np.arange(min_echo_range, max_echo_range, step=min_echo_range) # this is worse
+         )
 
          if np.any(np.isnan(all_cruise_depth_values)):
              raise Exception("Problem depth values returned were NaN.")
 
-         print("Done computing depth values.")
          return all_cruise_depth_values.round(decimals=2)
 
      #######################################################
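Worked example for the new get_depth_values arithmetic: rounding the stop value up to a whole meter makes the grid spacing land exactly on cruise_min_epsilon. With a hypothetical deepest sample of 249.7 m and the new 0.20 m default:

    import numpy as np

    max_echo_range = 249.7     # hypothetical deepest measurement in the cruise, meters
    cruise_min_epsilon = 0.20  # target vertical resolution, meters

    stop = np.ceil(max_echo_range)                               # 250.0
    num = int(np.ceil(max_echo_range) / cruise_min_epsilon) + 1  # 1250 + 1 = 1251
    depths = np.linspace(start=0, stop=stop, num=num, endpoint=True).round(decimals=2)
    print(depths[:4], depths[-1])  # [0.  0.2 0.4 0.6] 250.0

Under the old formula the spacing was max_echo_range / (num - 1), which for 249.7 m and the old 0.25 default works out to roughly 0.2502 m rather than an exact 0.25 m step; taking the ceiling of the stop value removes that drift.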
@@ -56,667 +58,684 @@ class ZarrManager:
          ship_name: str,
          cruise_name: str,
          sensor_name: str,
-         frequencies: list, # units in Hz
-         width: int, # TODO: needs better name... "ping_time"
-         # min_echo_range: float,
+         frequencies: list, # units in Hz, type(frequencies) == np.ndarray
+         width: int,
          max_echo_range: float,
-         cruise_min_epsilon: float, # smallest resolution in meters
          calibration_status: bool = False, # Assume uncalibrated
      ) -> str:
+         """
+         Creates a new zarr store in a local temporary directory(?)
+         This includes the water_level on top of the max_echo_range already, nothing extra needs to be done.
+         """
          try:
-             # TODO: problem throwing exceptions here
-             print(
-                 f"Creating local zarr_manager store at {cruise_name}.zarr for ship {ship_name}"
-             )
-             # There can not currently be repeated frequencies
-             # TODO: eventually switch coordinate to "channel" because frequencies can repeat
+             print(f"Creating local zarr store, {cruise_name}.zarr for ship {ship_name}")
              if len(frequencies) != len(set(frequencies)):
                  raise Exception(
                      "Number of frequencies does not match number of channels"
                  )
 
              zarr_path = f"{path}/{cruise_name}.zarr"
-             # store = zarr.DirectoryStore(path=zarr_path, normalize_keys=False)
-             ### https://zarr.readthedocs.io/en/latest/user-guide/groups/ ###
-             # store = zarr.group(path=zarr_path)
-             store = LocalStore(root=zarr_path)
-             root = zarr.group(
-                 store=store, # zarr_path,
-                 overwrite=self.__overwrite, # cache_attrs=True
-                 zarr_format=3,
+             #####################################################################
+             frequencies = np.array(
+                 frequencies, dtype=np.dtype(Coordinates.FREQUENCY_DTYPE.value)
              )
-
+             #####################################################################
+             # Define the chunk sizes and the encoding
+             depth_chunk_shape = (Constants.TILE_SIZE.value,)
+             time_chunk_shape = (Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,)
+             frequency_chunk_shape = (len(frequencies),)
+             latitude_chunk_shape = (Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,)
+             longitude_chunk_shape = (Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,)
+             bottom_chunk_shape = (Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,)
+             speed_chunk_shape = (Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,)
+             distance_chunk_shape = (Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,)
+             sv_chunk_shape = (Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, 1)
+             #####################################################################
+             root = zarr.create_group(store=zarr_path, zarr_format=3, overwrite=True)
              #####################################################################
              # --- Coordinate: Time --- #
              # https://zarr.readthedocs.io/en/stable/spec/v2.html#data-type-encoding
-             time_data = np.repeat(0.0, width)
-             time_data.astype(np.dtype(Coordinates.TIME_DTYPE.value), copy=False)
-
-             time = root.create_array( # deprecated: Use Group.create_array instead.
+             # "data_type": "int64", "fill_value": 0, "units": "nanoseconds since 1970-01-01", "calendar": "proleptic_gregorian"
+             #
+             time_values = np.repeat(0.0, width)
+             time_values.astype(np.dtype(Coordinates.TIME_DTYPE.value))
+             root.create_array(
                  name=Coordinates.TIME.value,
-                 data=time_data,
-                 # shape=width,
-                 chunks=(Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,),
+                 # shape=width_indices,
                  # dtype=np.dtype(Coordinates.TIME_DTYPE.value),
-                 compressors=compressor,
+                 data=time_values,
+                 chunks=time_chunk_shape,
+                 compressors=compressors,
                  fill_value=np.nan,
-                 overwrite=self.__overwrite,
-                 dimension_names=(Coordinates.TIME.value,),
+                 attributes=dict(
+                     calendar=Coordinates.TIME_CALENDAR.value,
+                     units=Coordinates.TIME_UNITS.value,
+                     long_name=Coordinates.TIME_LONG_NAME.value,
+                     standard_name=Coordinates.TIME_STANDARD_NAME.value,
+                 ),
+                 dimension_names=[Coordinates.TIME.value],
+                 overwrite=True,
              )
-
-             # time.metadata.dimension_names = (Coordinates.TIME.value,)
-
-             time.attrs["calendar"] = Coordinates.TIME_CALENDAR.value
-             time.attrs["units"] = Coordinates.TIME_UNITS.value
-             time.attrs["long_name"] = Coordinates.TIME_LONG_NAME.value
-             time.attrs["standard_name"] = Coordinates.TIME_STANDARD_NAME.value
-
              #####################################################################
-             # --- Coordinate: Depth --- #
-             depth_data = self.get_depth_values(
-                 # min_echo_range=min_echo_range,
+             #####################################################################
+             # # --- Coordinate: Depth --- #
+             depth_data_values = self.get_depth_values(
                  max_echo_range=max_echo_range,
-                 cruise_min_epsilon=cruise_min_epsilon,
              )
              depth_data = np.array(
-                 depth_data, dtype=np.dtype(Coordinates.DEPTH_DTYPE.value)
+                 depth_data_values, dtype=Coordinates.DEPTH_DTYPE.value
              )
-
-             depth = root.create_array(
+             root.create_array(
                  name=Coordinates.DEPTH.value,
-                 # TODO: verify that these values are correct
+                 # shape=depth_indices,
+                 # dtype=np.dtype(Coordinates.DEPTH_DTYPE.value),
                  data=depth_data,
-                 # shape=len(depth_values),
-                 chunks=(Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,),
-                 # dtype=np.dtype(
-                 #     Coordinates.DEPTH_DTYPE.value
-                 # ), # float16 == 2 significant digits would be ideal
-                 compressors=compressor,
-                 fill_value=np.nan,
-                 overwrite=self.__overwrite,
-                 dimension_names=(Coordinates.DEPTH.value,),
-             )
-
-             if np.any(np.isnan(depth_data)):
-                 raise Exception("Some depth values returned were NaN.")
-
-             # depth.metadata.dimension_names = (Coordinates.DEPTH.value,)
-
-             depth.attrs["units"] = Coordinates.DEPTH_UNITS.value
-             depth.attrs["long_name"] = Coordinates.DEPTH_LONG_NAME.value
-             depth.attrs["standard_name"] = Coordinates.DEPTH_STANDARD_NAME.value
-
-             #####################################################################
-             # --- Coordinate: Latitude --- #
-             gps_data = np.array(
-                 np.repeat(np.nan, width),
-                 dtype=np.dtype(Coordinates.LATITUDE_DTYPE.value),
+                 chunks=depth_chunk_shape,
+                 compressors=compressors,
+                 # fill_value=np.nan,
+                 attributes=dict(
+                     units=Coordinates.DEPTH_UNITS.value,
+                     long_name=Coordinates.DEPTH_LONG_NAME.value,
+                     standard_name=Coordinates.DEPTH_STANDARD_NAME.value,
+                 ),
+                 dimension_names=[Coordinates.DEPTH.value], # TODO: is this right
+                 overwrite=True,
              )
-
-             latitude = root.create_array(
+             # #####################################################################
+             # # --- Coordinate: Latitude --- #
+             # latitude_values = np.rep(np.nan, width_indices)
+             # latitude_values.astype(np.dtype(Coordinates.LATITUDE_DTYPE.value))
+             root.create_array(
                  name=Coordinates.LATITUDE.value,
-                 # dataset=np.repeat(0.0, width), # root.longitude[:] = np.nan
-                 data=gps_data,
-                 # shape=width,
-                 chunks=(Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,),
-                 # dtype=np.dtype(Coordinates.LATITUDE_DTYPE.value),
-                 compressors=compressor,
+                 shape=width,
+                 dtype=np.dtype(Coordinates.LATITUDE_DTYPE.value),
+                 # data=latitude_values,
+                 chunks=latitude_chunk_shape,
+                 compressors=compressors,
                  fill_value=np.nan,
-                 overwrite=self.__overwrite,
-                 dimension_names=(Coordinates.TIME.value,),
+                 attributes=dict(
+                     units=Coordinates.LATITUDE_UNITS.value,
+                     long_name=Coordinates.LATITUDE_LONG_NAME.value,
+                     standard_name=Coordinates.LATITUDE_STANDARD_NAME.value,
+                 ),
+                 dimension_names=[Coordinates.TIME.value],
+                 overwrite=True,
              )
-
-             # Note: LATITUDE is indexed by TIME
-             # latitude.metadata.dimension_names = (Coordinates.TIME.value,)
-
-             latitude.attrs["units"] = Coordinates.LATITUDE_UNITS.value
-             latitude.attrs["long_name"] = Coordinates.LATITUDE_LONG_NAME.value
-             latitude.attrs["standard_name"] = Coordinates.LATITUDE_STANDARD_NAME.value
-
-             #####################################################################
-             # --- Coordinate: Longitude --- #
-             longitude = root.create_array(
+             # #####################################################################
+             # # --- Coordinate: Longitude --- #
+             # longitude_values = np.arange(0, width_indices)
+             # longitude_values.astype(np.dtype(Coordinates.LONGITUDE_DTYPE.value))
+             root.create_array(
                  name=Coordinates.LONGITUDE.value,
-                 # dataset=np.repeat(0.0, width), # root.longitude[:] = np.nan
-                 data=gps_data,
-                 # shape=width,
-                 chunks=(Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,),
-                 # dtype=np.dtype(Coordinates.LONGITUDE_DTYPE.value),
-                 compressors=compressor,
+                 shape=width,
+                 dtype=np.dtype(Coordinates.LONGITUDE_DTYPE.value),
+                 # data=longitude_values,
+                 chunks=longitude_chunk_shape,
+                 compressors=compressors,
                  fill_value=np.nan,
-                 overwrite=self.__overwrite,
-                 dimension_names=(Coordinates.TIME.value,),
-             )
-
-             # Note: LONGITUDE is indexed by TIME
-             # longitude.metadata.dimension_names = (Coordinates.TIME.value,)
-
-             longitude.attrs["units"] = Coordinates.LONGITUDE_UNITS.value
-             longitude.attrs["long_name"] = Coordinates.LONGITUDE_LONG_NAME.value
-             longitude.attrs["standard_name"] = Coordinates.LONGITUDE_STANDARD_NAME.value
-
-             #####################################################################
-             # TODO: verify adding this variable for where the bottom was detected
-             # --- Coordinate: Bottom --- #
-             bottom_data = np.array(
-                 np.repeat(np.nan, width), dtype=np.dtype(Coordinates.BOTTOM_DTYPE.value)
+                 attributes=dict(
+                     units=Coordinates.LONGITUDE_UNITS.value,
+                     long_name=Coordinates.LONGITUDE_LONG_NAME.value,
+                     standard_name=Coordinates.LONGITUDE_STANDARD_NAME.value,
+                 ),
+                 dimension_names=[
+                     Coordinates.TIME.value
+                 ], # Note: LONGITUDE is indexed by TIME
+                 overwrite=True,
              )
-
-             bottom = root.create_array(
+             # #####################################################################
+             # # --- Coordinate: Bottom --- #
+             # bottom_values = np.repeat(12.34, width_indices)
+             # bottom_values.astype(np.dtype(Coordinates.BOTTOM_DTYPE.value))
+             root.create_array(
                  name=Coordinates.BOTTOM.value,
-                 data=bottom_data,
-                 # shape=width,
-                 chunks=(Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,),
-                 # dtype=np.dtype(Coordinates.BOTTOM_DTYPE.value),
-                 compressors=compressor,
+                 shape=width,
+                 dtype=np.dtype(Coordinates.BOTTOM_DTYPE.value),
+                 # data=bottom_values,
+                 chunks=bottom_chunk_shape,
+                 compressors=compressors,
                  fill_value=np.nan,
-                 overwrite=self.__overwrite,
-                 dimension_names=(Coordinates.TIME.value,),
+                 attributes=dict(
+                     units=Coordinates.BOTTOM_UNITS.value,
+                     long_name=Coordinates.BOTTOM_LONG_NAME.value,
+                     standard_name=Coordinates.BOTTOM_STANDARD_NAME.value,
+                 ),
+                 dimension_names=[Coordinates.TIME.value], # Note: _ is indexed by TIME
+                 overwrite=True,
              )
-
-             # BOTTOM is indexed by TIME
-             # bottom.metadata.dimension_names = (Coordinates.TIME.value,)
-
-             bottom.attrs["units"] = Coordinates.BOTTOM_UNITS.value
-             bottom.attrs["long_name"] = Coordinates.BOTTOM_LONG_NAME.value
-             bottom.attrs["standard_name"] = Coordinates.BOTTOM_STANDARD_NAME.value
-
-             #####################################################################
-             # TODO: verify adding this variable with test
-             # --- Coordinate: Speed --- #
-             speed_data = np.repeat(np.nan, width)
-             speed_data.astype(np.dtype(Coordinates.SPEED_DTYPE.value), copy=False)
-
-             speed = root.create_array(
+             # #####################################################################
+             # # --- Coordinate: Speed --- #
+             # speed_values = np.repeat(5.67, width_indices)
+             # speed_values.astype(np.dtype(Coordinates.SPEED_DTYPE.value))
+             root.create_array(
                  name=Coordinates.SPEED.value,
-                 data=np.repeat(np.nan, width), # root.longitude[:] = np.nan
-                 # shape=width,
-                 chunks=(Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,),
-                 # dtype=np.dtype(Coordinates.SPEED_DTYPE.value),
-                 compressors=compressor,
+                 shape=width,
+                 dtype=np.dtype(Coordinates.SPEED_DTYPE.value),
+                 # data=speed_values,
+                 chunks=speed_chunk_shape,
+                 compressors=compressors,
                  fill_value=np.nan,
-                 overwrite=self.__overwrite,
-                 dimension_names=(Coordinates.TIME.value,), # NOTE: 'TIME'
+                 attributes=dict(
+                     units=Coordinates.SPEED_UNITS.value,
+                     long_name=Coordinates.SPEED_LONG_NAME.value,
+                     standard_name=Coordinates.SPEED_STANDARD_NAME.value,
+                 ),
+                 dimension_names=[Coordinates.TIME.value], # Note: _ is indexed by TIME
+                 overwrite=True,
              )
-
-             # SPEED is indexed by TIME
-             # speed.metadata.dimension_names = (Coordinates.TIME.value,)
-
-             speed.attrs["units"] = Coordinates.SPEED_UNITS.value
-             speed.attrs["long_name"] = Coordinates.SPEED_LONG_NAME.value
-             speed.attrs["standard_name"] = Coordinates.SPEED_STANDARD_NAME.value
-
-             #####################################################################
-             # TODO: verify adding this variable with test
-             # --- Coordinate: Speed --- #
-             distance_data = np.repeat(np.nan, width)
-             distance_data.astype(np.dtype(Coordinates.DISTANCE_DTYPE.value), copy=False)
-
-             distance = root.create_array(
+             # #####################################################################
+             # # --- Coordinate: Distance --- #
+             # distance_values = np.repeat(8.90, width_indices)
+             # distance_values.astype(np.dtype(Coordinates.DISTANCE_DTYPE.value))
+             root.create_array(
                  name=Coordinates.DISTANCE.value,
-                 data=np.repeat(np.nan, width), # root.longitude[:] = np.nan
-                 # shape=width,
-                 chunks=(Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,),
-                 # dtype=np.dtype(Coordinates.SPEED_DTYPE.value),
-                 compressors=compressor,
+                 shape=width,
+                 dtype=np.dtype(Coordinates.DISTANCE_DTYPE.value),
+                 # data=distance_values,
+                 chunks=distance_chunk_shape,
+                 compressors=compressors,
                  fill_value=np.nan,
-                 overwrite=self.__overwrite,
-                 dimension_names=(Coordinates.TIME.value,), # NOTE: 'TIME'
-             )
-
-             # DISTANCE is indexed by TIME
-             # distance.metadata.dimension_names = (Coordinates.TIME.value,)
-
-             distance.attrs["units"] = Coordinates.DISTANCE_UNITS.value
-             distance.attrs["long_name"] = Coordinates.DISTANCE_LONG_NAME.value
-             distance.attrs["standard_name"] = Coordinates.DISTANCE_STANDARD_NAME.value
-
-             #####################################################################
-             # --- Coordinate: Frequency --- #
-             frequency_data = np.array(
-                 frequencies, dtype=np.dtype(Coordinates.FREQUENCY_DTYPE.value)
+                 attributes=dict(
+                     units=Coordinates.DISTANCE_UNITS.value,
+                     long_name=Coordinates.DISTANCE_LONG_NAME.value,
+                     standard_name=Coordinates.DISTANCE_STANDARD_NAME.value,
+                 ),
+                 dimension_names=[Coordinates.TIME.value], # Note: _ is indexed by TIME
+                 overwrite=True,
              )
-             # frequency_data.astype(np.dtype(Coordinates.FREQUENCY_DTYPE.value), copy=False)
-
-             frequency = root.create_array(
+             # #####################################################################
+             # # --- Coordinate: Frequency --- #
+             root.create_array(
                  name=Coordinates.FREQUENCY.value,
-                 data=frequency_data,
-                 # shape=len(frequencies),
-                 chunks=(len(frequencies),),
+                 # shape=frequency_indices,
                  # dtype=np.dtype(Coordinates.FREQUENCY_DTYPE.value),
-                 compressors=compressor,
-                 fill_value=0.0,
-                 overwrite=self.__overwrite,
-                 dimension_names=(Coordinates.FREQUENCY.value,),
+                 data=frequencies,
+                 # chunks=(Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,),
+                 chunks=frequency_chunk_shape,
+                 compressors=compressors,
+                 # fill_value=0,
+                 attributes=dict(
+                     units=Coordinates.FREQUENCY_UNITS.value,
+                     long_name=Coordinates.FREQUENCY_LONG_NAME.value,
+                     standard_name=Coordinates.FREQUENCY_STANDARD_NAME.value,
+                 ),
+                 dimension_names=[Coordinates.FREQUENCY.value],
+                 overwrite=True,
              )
-
-             # TODO: best coordinate would be channel with str type
-             # frequency.metadata.dimension_names = (Coordinates.FREQUENCY.value,)
-
-             frequency.attrs["units"] = Coordinates.FREQUENCY_UNITS.value
-             frequency.attrs["long_name"] = Coordinates.FREQUENCY_LONG_NAME.value
-             frequency.attrs["standard_name"] = Coordinates.FREQUENCY_STANDARD_NAME.value
-
-             #####################################################################
-             # --- Sv Data --- #
-             sv = root.create_array(
+             # #####################################################################
+             # # --- Sv Data --- #
+             root.create_array(
                  name=Coordinates.SV.value,
                  shape=(len(depth_data), width, len(frequencies)),
-                 chunks=(
-                     Constants.TILE_SIZE.value,
-                     Constants.TILE_SIZE.value,
-                     1,
-                 ),
                  dtype=np.dtype(Coordinates.SV_DTYPE.value),
-                 compressors=compressor,
+                 # data=,
+                 chunks=sv_chunk_shape,
+                 compressors=compressors,
                  fill_value=np.nan,
-                 overwrite=self.__overwrite,
-                 dimension_names=(
+                 attributes=dict(
+                     units=Coordinates.SV_UNITS.value,
+                     long_name=Coordinates.SV_LONG_NAME.value,
+                     standard_name=Coordinates.SV_STANDARD_NAME.value,
+                 ),
+                 dimension_names=[
                      Coordinates.DEPTH.value,
                      Coordinates.TIME.value,
                      Coordinates.FREQUENCY.value,
-                 ),
+                 ],
+                 overwrite=True,
              )
-             # sv.metadata.dimension_names = (
-             #     Coordinates.DEPTH.value,
-             #     Coordinates.TIME.value,
-             #     Coordinates.FREQUENCY.value,
-             # )
-             # sv.attrs["_ARRAY_DIMENSIONS"] = [
-             #     Coordinates.DEPTH.value,
-             #     Coordinates.TIME.value,
-             #     Coordinates.FREQUENCY.value,
-             # ]
-
-             sv.attrs["units"] = Coordinates.SV_UNITS.value
-             sv.attrs["long_name"] = Coordinates.SV_LONG_NAME.value
-             sv.attrs["tile_size"] = Constants.TILE_SIZE.value
-
              #####################################################################
-             # --- Metadata --- #
+             # # --- Metadata --- #
              root.attrs["ship_name"] = ship_name
              root.attrs["cruise_name"] = cruise_name
              root.attrs["sensor_name"] = sensor_name
              #
              root.attrs["processing_software_name"] = Coordinates.PROJECT_NAME.value
-
-             # NOTE: for the version to be parsable you need to build the python package
-             # locally first.
-             current_project_version = importlib.metadata.version(
+             # NOTE: for the version to be parsable you need to build the python package locally first.
+             root.attrs["processing_software_version"] = metadata.version(
                  "water-column-sonar-processing"
              )
-             root.attrs["processing_software_version"] = current_project_version
             root.attrs["processing_software_time"] = Timestamp.get_timestamp()
              #
              root.attrs["calibration_status"] = calibration_status
             root.attrs["tile_size"] = Constants.TILE_SIZE.value
-
-             # TODO: ZarrUserWarning: Consolidated metadata is currently not part in the Zarr format 3 specification. It may not be supported by other zarr implementations and may change in the future.
-             # zarr.consolidate_metadata(zarr_path)
-             #####################################################################
-             """
-             # zzz = zarr.open('https://echofish-dev-master-118234403147-echofish-zarr-store.s3.us-west-2.amazonaws.com/GU1002_resample.zarr')
-             # zzz.time[0] = 1274979445.423
-             # Initialize all to origin time, will be overwritten late
-             """
+             #
              return zarr_path
          except Exception as err:
              raise RuntimeError(f"Problem trying to create zarr store, {err}")
-         # finally:
-         #     cleaner = Cleaner()
-         #     cleaner.delete_local_files()
-         # TODO: should delete zarr store in temp directory too?
 
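Reviewer note on the rewritten create_zarr_store: zarr-python 3 lets attributes and dimension_names be supplied at creation time, replacing the old pattern of assigning arr.attrs[...] after the array exists. A minimal sketch of one coordinate array under that API, assuming zarr-python >= 3 (names and sizes hypothetical):

    import numpy as np
    import zarr

    root = zarr.create_group(store="/tmp/cruise_demo.zarr", zarr_format=3, overwrite=True)
    root.create_array(
        name="latitude",
        shape=(1000,),             # hypothetical ping count ("width")
        dtype=np.float32,
        chunks=(1000,),
        fill_value=np.nan,         # NaN marks pings with no GPS fix written yet
        attributes=dict(units="degrees_north", standard_name="latitude"),
        dimension_names=["time"],  # latitude is indexed by the time coordinate
        overwrite=True,
    )

Declaring arrays by shape/dtype instead of materializing np.repeat(np.nan, width) buffers, as the old code did, also avoids allocating cruise-length arrays just to write fill values.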
-     #######################################################
-     #
-     # LEVEL 3 - LEVEL 3 - LEVEL 3 - LEVEL 3 # TODO: move to separate project for zarr 3?
-     #
-     # def create_zarr_store_level_3(
-     #     self,
-     #     path: str, # 'level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.model/tmp/HB0707.zarr/.zattrs'
-     #     ship_name: str,
-     #     cruise_name: str,
-     #     sensor_name: str,
-     #     frequencies: list, # units in Hz
-     #     width: int, # TODO: needs better name... "ping_time"
-     #     min_echo_range: float, # smallest resolution in meters --> 1.0 meters
-     #     max_echo_range: float,
-     #     cruise_min_epsilon: float,
-     #     calibration_status: bool = False, # Assume uncalibrated
+     # #######################################################
+     # def create_zarr_store_old(
+     #     self,
+     #     path: str, # 'level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.model/tmp/HB0707.zarr/.zattrs'
+     #     ship_name: str,
+     #     cruise_name: str,
+     #     sensor_name: str,
+     #     frequencies: list, # units in Hz
+     #     width: int,
+     #     max_echo_range: float,
+     #     # cruise_min_epsilon: float, # smallest resolution in meters
+     #     calibration_status: bool = False, # Assume uncalibrated
      # ) -> str:
-     #     compressor = Blosc(cname="zstd", clevel=9, shuffle=1)
-     #     TILE_SIZE = 1024
+     #     """
+     #     Creates a new zarr store in a local temporary directory(?)
+     #     """
      #     try:
-     #         # TODO: problem throwing exceptions here
-     #         print(
-     #             f"Creating level 3 local zarr_manager store at {cruise_name}.zarr for ship {ship_name}"
-     #         )
+     #         print(f"Creating local zarr store, {cruise_name}.zarr for ship {ship_name}")
      #         if len(frequencies) != len(set(frequencies)):
      #             raise Exception(
      #                 "Number of frequencies does not match number of channels"
      #             )
      #
-     #         # print(f"Debugging number of threads: {self.__num_threads}")
-     #
      #         zarr_path = f"{path}/{cruise_name}.zarr"
-     #         store = zarr.DirectoryStore(path=zarr_path, normalize_keys=False)
-     #         root = zarr.group(store=store, overwrite=self.__overwrite, cache_attrs=True)
-     #
      #         #####################################################################
-     #         # --- Coordinate: Time --- #
-     #         # https://zarr.readthedocs.io/en/stable/spec/v2.html#data-type-encoding
-     #         time = root.create_array(
-     #             name=Coordinates.TIME.value,
-     #             data=np.repeat(0.0, width),
-     #             shape=width,
-     #             chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
-     #             dtype=np.dtype(Coordinates.TIME_DTYPE.value),
-     #             compressor=compressor,
-     #             # fill_value=np.nan,
-     #             overwrite=self.__overwrite,
-     #         )
-     #
-     #         time.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
-     #         time.attrs["calendar"] = Coordinates.TIME_CALENDAR.value
-     #         time.attrs["units"] = Coordinates.TIME_UNITS.value
-     #         time.attrs["long_name"] = Coordinates.TIME_LONG_NAME.value
-     #         time.attrs["standard_name"] = Coordinates.TIME_STANDARD_NAME.value
+     #         # Define the chunk sizes and the encoding
+     #         # 1_000_000 data points for quickest download
+     #         spatiotemporal_chunk_size = int(1e6)
+     #         depth_chunk_shape = (512,)
+     #         time_chunk_shape = (spatiotemporal_chunk_size,)
+     #         frequency_chunk_shape = (len(frequencies),)
+     #         latitude_chunk_shape = (spatiotemporal_chunk_size,)
+     #         longitude_chunk_shape = (spatiotemporal_chunk_size,)
+     #         bottom_chunk_shape = (spatiotemporal_chunk_size,)
+     #         speed_chunk_shape = (spatiotemporal_chunk_size,)
+     #         distance_chunk_shape = (spatiotemporal_chunk_size,)
+     #         sv_chunk_shape = (512, 512, 1) # TODO: move to constants
      #
      #         #####################################################################
-     #         # --- Coordinate: Depth --- #
-     #         depth_values = self.get_depth_values(
-     #             # min_echo_range=min_echo_range,
+     #         ##### Depth #####
+     #         depth_data_values = self.get_depth_values(
      #             max_echo_range=max_echo_range,
-     #             cruise_min_epsilon=cruise_min_epsilon,
      #         )
      #
-     #         root.create_dataset(
+     #         depth_data = np.array(
+     #             depth_data_values, dtype=Coordinates.DEPTH_DTYPE.value
+     #         )
+     #         depth_da = xr.DataArray(
+     #             data=depth_data,
+     #             dims=Coordinates.DEPTH.value,
      #             name=Coordinates.DEPTH.value,
-     #             # TODO: verify that these values are correct
-     #             data=depth_values,
-     #             shape=len(depth_values),
-     #             chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
-     #             dtype=np.dtype(
-     #                 Coordinates.DEPTH_DTYPE.value # TODO: convert to integers and only get whole number depths
-     #             ), # float16 == 2 significant digits would be ideal
-     #             compressor=compressor,
-     #             # fill_value=np.nan,
-     #             overwrite=self.__overwrite,
+     #             attrs=dict(
+     #                 units=Coordinates.DEPTH_UNITS.value,
+     #                 long_name=Coordinates.DEPTH_LONG_NAME.value,
+     #                 standard_name=Coordinates.DEPTH_STANDARD_NAME.value,
+     #             ),
      #         )
      #
-     #         if np.any(np.isnan(depth_values)):
-     #             raise Exception("Some depth values returned were NaN.")
-     #
-     #         root.depth.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.DEPTH.value]
-     #         root.depth.attrs["units"] = Coordinates.DEPTH_UNITS.value
-     #         root.depth.attrs["long_name"] = Coordinates.DEPTH_LONG_NAME.value
-     #         root.depth.attrs["standard_name"] = Coordinates.DEPTH_STANDARD_NAME.value
+     #         ##### Time #####
+     #         # https://zarr.readthedocs.io/en/stable/spec/v2.html#data-type-encoding
+     #         time_data = np.array(
+     #             np.repeat(np.datetime64(0, "ns"), width),
+     #             dtype="datetime64[ns]",
+     #         )
+     #         time_da = xr.DataArray(
+     #             data=time_data,
+     #             dims=Coordinates.TIME.value,
+     #             name=Coordinates.TIME.value,
+     #             attrs=dict(
+     #                 # Note: cal & units are written automatically by xarray
+     #                 # calendar="proleptic_gregorian",
+     #                 # units="seconds since 1970-01-01 00:00:00",
+     #                 long_name=Coordinates.TIME_LONG_NAME.value,
+     #                 standard_name=Coordinates.TIME_STANDARD_NAME.value,
+     #             ),
+     #         )
      #
-     #         #####################################################################
-     #         # --- Coordinate: Latitude --- #
-     #         root.create_dataset(
-     #             name=Coordinates.LATITUDE.value,
-     #             # dataset=np.repeat(0.0, width), # root.longitude[:] = np.nan
-     #             data=np.repeat(np.nan, width),
-     #             shape=width,
-     #             chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
-     #             dtype=np.dtype(Coordinates.LATITUDE_DTYPE.value),
-     #             compressor=compressor,
-     #             fill_value=np.nan, # needs to be nan to validate if any missing
-     #             overwrite=self.__overwrite,
+     #         ##### Frequency #####
+     #         frequency_data = np.array(
+     #             frequencies,
+     #             dtype=np.dtype(Coordinates.FREQUENCY_DTYPE.value),
+     #         )
+     #         frequency_da = xr.DataArray(
+     #             data=frequency_data,
+     #             dims=Coordinates.FREQUENCY.value,
+     #             name=Coordinates.FREQUENCY.value,
+     #             attrs=dict(
+     #                 units=Coordinates.FREQUENCY_UNITS.value,
+     #                 long_name=Coordinates.FREQUENCY_LONG_NAME.value,
+     #                 standard_name=Coordinates.FREQUENCY_STANDARD_NAME.value,
+     #             ),
      #         )
      #
-     #         # Note: LATITUDE is indexed by TIME
-     #         root.latitude.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
-     #         root.latitude.attrs["units"] = Coordinates.LATITUDE_UNITS.value
-     #         root.latitude.attrs["long_name"] = Coordinates.LATITUDE_LONG_NAME.value
-     #         root.latitude.attrs["standard_name"] = (
-     #             Coordinates.LATITUDE_STANDARD_NAME.value
+     #         ##### Latitude #####
+     #         gps_data = np.array(
+     #             np.repeat(np.nan, width),
+     #             dtype=np.dtype(Coordinates.LATITUDE_DTYPE.value),
      #         )
+     #         latitude_da = xr.DataArray(
+     #             data=gps_data,
+     #             coords=dict(
+     #                 time=time_da,
+     #             ),
+     #             dims=Coordinates.TIME.value, # Note: "TIME"
+     #             name=Coordinates.LATITUDE.value,
+     #             attrs=dict(
+     #                 units=Coordinates.LATITUDE_UNITS.value,
+     #                 long_name=Coordinates.LATITUDE_LONG_NAME.value,
+     #                 standard_name=Coordinates.LATITUDE_STANDARD_NAME.value,
+     #             ),
+     #         ) # Note: LATITUDE is indexed by TIME
      #
-     #         #####################################################################
-     #         # --- Coordinate: Longitude --- #
-     #         root.create_dataset(
+     #         ##### Longitude #####
+     #         longitude_da = xr.DataArray(
+     #             data=gps_data,
+     #             coords=dict(
+     #                 time=time_da,
+     #             ),
+     #             dims=Coordinates.TIME.value, # Note: "TIME"
      #             name=Coordinates.LONGITUDE.value,
-     #             # dataset=np.repeat(0.0, width), # root.longitude[:] = np.nan
-     #             data=np.repeat(np.nan, width),
-     #             shape=width,
-     #             chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
-     #             dtype=np.dtype(Coordinates.LONGITUDE_DTYPE.value),
-     #             compressor=compressor,
-     #             fill_value=np.nan,
-     #             overwrite=self.__overwrite,
-     #         )
+     #             attrs=dict(
+     #                 units=Coordinates.LONGITUDE_UNITS.value,
+     #                 long_name=Coordinates.LONGITUDE_LONG_NAME.value,
+     #                 standard_name=Coordinates.LONGITUDE_STANDARD_NAME.value,
+     #             ),
+     #         ) # Note: LONGITUDE is indexed by TIME
      #
-     #         # Note: LONGITUDE is indexed by TIME
-     #         root.longitude.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
-     #         root.longitude.attrs["units"] = Coordinates.LONGITUDE_UNITS.value
-     #         root.longitude.attrs["long_name"] = Coordinates.LONGITUDE_LONG_NAME.value
-     #         root.longitude.attrs["standard_name"] = (
-     #             Coordinates.LONGITUDE_STANDARD_NAME.value
+     #         ##### Bottom #####
+     #         bottom_data = np.array(
+     #             np.repeat(np.nan, width), dtype=np.dtype(Coordinates.BOTTOM_DTYPE.value)
      #         )
-     #
-     #         #####################################################################
-     #         # TODO: verify adding this variable for where the bottom was detected
-     #         # --- Coordinate: Bottom --- #
-     #         root.create_dataset(
+     #         bottom_da = xr.DataArray(
+     #             data=bottom_data,
+     #             coords=dict(
+     #                 time=time_da,
+     #             ),
+     #             dims=Coordinates.TIME.value, # Note: "TIME"
      #             name=Coordinates.BOTTOM.value,
-     #             data=np.repeat(0.0, width), # root.longitude[:] = np.nan
-     #             shape=width,
-     #             chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
-     #             dtype=np.dtype(
-     #                 Coordinates.BOTTOM_DTYPE.value
-     #             ), # TODO: should also only be integers
-     #             compressor=compressor,
-     #             fill_value=0.0,
-     #             overwrite=self.__overwrite,
+     #             attrs=dict(
+     #                 units=Coordinates.BOTTOM_UNITS.value,
+     #                 long_name=Coordinates.BOTTOM_LONG_NAME.value,
+     #                 standard_name=Coordinates.BOTTOM_STANDARD_NAME.value,
+     #             ),
      #         )
      #
-     #         # BOTTOM is indexed by TIME
-     #         root.bottom.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
-     #         root.bottom.attrs["units"] = Coordinates.BOTTOM_UNITS.value
-     #         root.bottom.attrs["long_name"] = Coordinates.BOTTOM_LONG_NAME.value
-     #         root.bottom.attrs["standard_name"] = Coordinates.BOTTOM_STANDARD_NAME.value
-     #
-     #         #####################################################################
-     #         # TODO: verify adding this variable with test
-     #         # --- Coordinate: Speed --- #
-     #         root.create_dataset(
+     #         ##### Speed #####
+     #         speed_data = np.array(
+     #             np.repeat(np.nan, width), dtype=np.dtype(Coordinates.SPEED_DTYPE.value)
+     #         )
+     #         speed_da = xr.DataArray(
+     #             data=speed_data,
+     #             coords=dict(
+     #                 time=time_da,
+     #             ),
+     #             dims=Coordinates.TIME.value, # Note: "TIME"
      #             name=Coordinates.SPEED.value,
-     #             data=np.repeat(np.nan, width), # root.longitude[:] = np.nan
-     #             shape=width,
-     #             chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
-     #             dtype=np.dtype(Coordinates.SPEED_DTYPE.value), # TODO: also round?
-     #             compressor=compressor,
-     #             fill_value=np.nan,
-     #             overwrite=self.__overwrite,
+     #             attrs=dict(
+     #                 units=Coordinates.SPEED_UNITS.value,
+     #                 long_name=Coordinates.SPEED_LONG_NAME.value,
+     #                 standard_name=Coordinates.SPEED_STANDARD_NAME.value,
+     #             ),
      #         )
      #
-     #         # SPEED is indexed by TIME
-     #         root.speed.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
-     #         root.speed.attrs["units"] = Coordinates.SPEED_UNITS.value
-     #         root.speed.attrs["long_name"] = Coordinates.SPEED_LONG_NAME.value
-     #         root.speed.attrs["standard_name"] = Coordinates.SPEED_STANDARD_NAME.value
-     #
-     #         #####################################################################
-     #         # --- Coordinate: Frequency --- #
-     #         root.create_dataset(
-     #             name=Coordinates.FREQUENCY.value,
-     #             data=frequencies,
-     #             shape=len(frequencies),
-     #             chunks=len(frequencies),
-     #             dtype=np.dtype(Coordinates.FREQUENCY_DTYPE.value),
-     #             compressor=compressor,
-     #             fill_value=0.0,
-     #             overwrite=self.__overwrite,
+     #         ##### Distance #####
+     #         distance_data = np.array(
+     #             np.repeat(np.nan, width),
+     #             dtype=np.dtype(Coordinates.DISTANCE_DTYPE.value),
+     #         )
+     #         distance_da = xr.DataArray(
+     #             data=distance_data,
+     #             coords=dict(
+     #                 time=time_da,
+     #             ),
+     #             dims=Coordinates.TIME.value, # Note: "TIME"
+     #             name=Coordinates.DISTANCE.value,
+     #             attrs=dict(
+     #                 units=Coordinates.DISTANCE_UNITS.value,
+     #                 long_name=Coordinates.DISTANCE_LONG_NAME.value,
+     #                 standard_name=Coordinates.DISTANCE_STANDARD_NAME.value,
+     #             ),
      #         )
      #
-     #         # TODO: best coordinate would be channel with str type
-     #         root.frequency.attrs["_ARRAY_DIMENSIONS"] = [
-     #             Coordinates.FREQUENCY.value
-     #         ] # TODO: is this correct
-     #         root.frequency.attrs["units"] = Coordinates.FREQUENCY_UNITS.value
-     #         root.frequency.attrs["long_name"] = Coordinates.FREQUENCY_LONG_NAME.value
-     #         root.frequency.attrs["standard_name"] = (
-     #             Coordinates.FREQUENCY_STANDARD_NAME.value
+     #         ##### Sv #####
+     #         gc.collect()
+     #         # sv_data = np.empty(
+     #         #     (len(depth_data), width, len(frequencies)),
+     #         #     # (2501, 4_100_782, 4), # large cruise used for testing
+     #         #     dtype=np.dtype(Coordinates.SV_DTYPE.value),
+     #         # )
+     #         sv_data = np.full(
+     #             (len(depth_data), width, len(frequencies)),
+     #             np.nan,
+     #             dtype=np.dtype(Coordinates.SV_DTYPE.value),
      #         )
+     #         print(f"one: {sys.getsizeof(sv_data)}")
+     #         # sv_data[:] = np.nan # initialize all
      #
-     #         #####################################################################
-     #         # --- Sv Data --- #
-     #         root.create_dataset(
+     #         sv_da = xr.DataArray(
+     #             data=sv_data,
+     #             coords=dict(
+     #                 depth=depth_da,
+     #                 time=time_da,
+     #                 frequency=frequency_da,
+     #                 #
+     #                 latitude=latitude_da,
+     #                 longitude=longitude_da,
+     #                 bottom=bottom_da,
+     #                 speed=speed_da,
+     #                 distance=distance_da,
+     #             ),
+     #             dims=( # Depth * Time * Frequency
+     #                 Coordinates.DEPTH.value,
+     #                 Coordinates.TIME.value,
+     #                 Coordinates.FREQUENCY.value,
+     #             ),
      #             name=Coordinates.SV.value,
-     #             shape=(len(depth_values), width, len(frequencies)),
-     #             chunks=(
-     #                 TILE_SIZE,
-     #                 TILE_SIZE,
-     #                 len(frequencies),
+     #             attrs=dict(
+     #                 units=Coordinates.SV_UNITS.value,
+     #                 long_name=Coordinates.SV_LONG_NAME.value,
+     #                 standard_name=Coordinates.SV_STANDARD_NAME.value,
+     #                 tiles_size=Constants.TILE_SIZE.value,
+     #                 _FillValue=np.nan,
      #             ),
-     #             dtype=np.dtype("int8"), # Coordinates.SV_DTYPE.value
-     #             compressor=compressor, # TODO: get compression working?!
-     #             # fill_value=np.nan,
-     #             overwrite=self.__overwrite,
      #         )
-     #
-     #         root.Sv.attrs["_ARRAY_DIMENSIONS"] = [
-     #             Coordinates.DEPTH.value,
-     #             Coordinates.TIME.value,
-     #             Coordinates.FREQUENCY.value,
-     #         ]
-     #         root.Sv.attrs["units"] = Coordinates.SV_UNITS.value
-     #         root.Sv.attrs["long_name"] = Coordinates.SV_LONG_NAME.value
-     #         root.Sv.attrs["tile_size"] = TILE_SIZE
-     #
+     #         print(f"two: {sys.getsizeof(sv_data)}") # getting to at least here
+     #         del sv_data
+     #         sv_da.encoding = {"compressors": [compressor], "chunks": sv_chunk_shape}
+     #         # sv_da = sv_da.astype(np.float32) # was crashing here
+     #         gc.collect()
      #         #####################################################################
-     #         # --- Metadata --- #
-     #         root.attrs["ship_name"] = ship_name
-     #         root.attrs["cruise_name"] = cruise_name
-     #         root.attrs["sensor_name"] = sensor_name
-     #         #
-     #         root.attrs["processing_software_name"] = Coordinates.PROJECT_NAME.value
-     #
-     #         current_project_version = importlib.metadata.version(
-     #             "water_column_sonar_processing"
+     #         ### Now create the xarray.Dataset
+     #         ds = xr.Dataset(
+     #             data_vars=dict(
+     #                 Sv=sv_da,
+     #                 #
+     #                 bottom=bottom_da,
+     #                 speed=speed_da,
+     #                 distance=distance_da,
+     #             ),
+     #             coords=dict(
+     #                 depth=depth_da,
+     #                 time=time_da,
+     #                 frequency=frequency_da,
+     #                 #
+     #                 latitude=latitude_da,
+     #                 longitude=longitude_da,
+     #             ),
+     #             attrs=dict(
+     #                 # --- Metadata --- #
+     #                 ship_name=ship_name,
+     #                 cruise_name=cruise_name,
+     #                 sensor_name=sensor_name,
+     #                 processing_software_name=Coordinates.PROJECT_NAME.value,
+     #                 # NOTE: for the version to be parsable you need to build the python package
+     #                 # locally first.
+     #                 processing_software_version=importlib.metadata.version(
+     #                     "water-column-sonar-processing"
+     #                 ),
+     #                 processing_software_time=Timestamp.get_timestamp(),
+     #                 calibration_status=calibration_status,
+     #                 tile_size=Constants.TILE_SIZE.value,
+     #             ),
+     #         )
+     #         del sv_da
+     #         gc.collect()
+     #         print(f"three: {sys.getsizeof(ds)}")
+     #         #####################################################################
+     #         encodings = dict(
+     #             depth={
+     #                 "compressors": [compressor],
+     #                 "chunks": depth_chunk_shape,
+     #             },
+     #             time={
+     #                 "compressors": [compressor],
+     #                 "chunks": time_chunk_shape,
+     #                 "units": Coordinates.TIME_UNITS.value,
+     #             },
+     #             frequency={
+     #                 "compressors": [compressor],
+     #                 "chunks": frequency_chunk_shape,
+     #             },
+     #             latitude={
+     #                 "compressors": [compressor],
+     #                 "chunks": latitude_chunk_shape,
+     #             },
+     #             longitude={
+     #                 "compressors": [compressor],
+     #                 "chunks": longitude_chunk_shape,
+     #             },
+     #             bottom={
+     #                 "compressors": [compressor],
+     #                 "chunks": bottom_chunk_shape,
+     #             },
+     #             speed={
+     #                 "compressors": [compressor],
+     #                 "chunks": speed_chunk_shape,
+     #             },
+     #             distance={
+     #                 "compressors": [compressor],
+     #                 "chunks": distance_chunk_shape,
+     #             },
+     #             Sv={
+     #                 "compressors": [compressor],
+     #                 "chunks": sv_chunk_shape,
+     #             },
+     #         )
+     #         gc.collect()
+     #         ds.to_zarr(
+     #             store=zarr_path,
+     #             mode="w", # “w” means create (overwrite if exists)
+     #             encoding=encodings,
+     #             consolidated=False,
+     #             safe_chunks=False,
+     #             align_chunks=True,
+     #             zarr_format=3,
+     #             write_empty_chunks=False, # Might need to change this
      #         )
-     #         root.attrs["processing_software_version"] = current_project_version
-     #         root.attrs["processing_software_time"] = Timestamp.get_timestamp()
-     #         #
-     #         # TODO: add level somewhere?
-     #         #
-     #         root.attrs["calibration_status"] = calibration_status
-     #         root.attrs["tile_size"] = TILE_SIZE
-     #
-     #         zarr.consolidate_metadata(store)
      #         #####################################################################
      #         return zarr_path
      #     except Exception as err:
-     #         raise RuntimeError(f"Problem trying to create level 3 zarr store, {err}")
+     #         raise RuntimeError(f"Problem trying to create zarr store, {err}")
      #     # finally:
      #     #     cleaner = Cleaner()
      #     #     cleaner.delete_local_files()
      #     # TODO: should delete zarr store in temp directory too?
 
- ############################################################################
619
- # def update_zarr_store(
620
- # self,
621
- # path: str,
622
- # ship_name: str,
623
- # cruise_name: str, # TODO: just pass stem
624
- # sensor_name: str,
625
- # ) -> None:
626
- # """
627
- # Opens an existing Zarr store living in a s3 bucket for the purpose
628
- # of updating just a subset of the cruise-level Zarr store associated
629
- # with a file-level Zarr store.
630
- # """
631
- # pass
632
-
633
633
  ############################################################################
634
634
  def open_s3_zarr_store_with_zarr(
635
635
  self,
636
636
  ship_name: str,
637
637
  cruise_name: str,
638
638
  sensor_name: str,
639
- # zarr_synchronizer: Union[str, None] = None, # TODO:
640
639
  output_bucket_name: str,
641
- endpoint_url=None,
642
- ): # -> zarr.hierarchy.Group:
640
+ endpoint_url: Optional[str] = None,
641
+ ) -> Group:
643
642
  # Mounts a Zarr store using pythons Zarr implementation. The mounted store
644
643
  # will have read/write privileges so that store can be updated.
645
644
  print("Opening L2 Zarr store with Zarr for writing.")
646
645
  try:
647
- s3fs_manager = S3FSManager(endpoint_url=endpoint_url)
648
- root = f"{output_bucket_name}/level_2/{ship_name}/{cruise_name}/{sensor_name}/{cruise_name}.zarr"
649
- store = s3fs_manager.s3_map(s3_zarr_store_path=root)
650
- # synchronizer = model.ProcessSynchronizer(f"/tmp/{ship_name}_{cruise_name}.sync")
651
- cruise_zarr = zarr.open(store=store, mode="r+")
652
- except Exception as err: # Failure
653
- raise RuntimeError(
654
- f"Exception encountered opening Zarr store with Zarr, {err}"
646
+ level = str(Constants.LEVEL_2.value)
647
+ store = f"s3://{output_bucket_name}/{level}/{ship_name}/{cruise_name}/{sensor_name}/{cruise_name}.zarr"
648
+ print(f"endpoint url: {endpoint_url}")
649
+ cruise_zarr = zarr.open(
650
+ store=store,
651
+ mode="r+",
652
+ zarr_format=3,
653
+ storage_options={
654
+ "endpoint_url": endpoint_url,
655
+ "key": self.key,
656
+ "secret": self.secret,
657
+ },
655
658
  )
656
- print("Done opening Zarr store with Zarr.")
657
- return cruise_zarr
659
+ print("Done opening store with Zarr.")
660
+ return cruise_zarr
661
+ except Exception as err: # Failure
662
+ raise RuntimeError(f"Exception encountered opening store with Zarr, {err}")
658
663
 
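The rewritten open_s3_zarr_store_with_zarr drops the S3FSManager dependency and instead hands storage_options to zarr.open, which forwards them to the underlying fsspec/s3fs filesystem. A sketch of the same pattern outside the class, assuming zarr-python >= 3 with s3fs installed (bucket and prefix are hypothetical):

    import os
    import zarr

    cruise_zarr = zarr.open(
        store="s3://example-bucket/level_2/Ship_Name/CRUISE/EK60/CRUISE.zarr",  # hypothetical
        mode="r+",      # read/write, so resampled tiles can be written back
        zarr_format=3,
        storage_options={  # forwarded to s3fs; mirrors the env vars read in __init__
            "key": os.environ.get("OUTPUT_BUCKET_ACCESS_KEY"),
            "secret": os.environ.get("OUTPUT_BUCKET_SECRET_ACCESS_KEY"),
        },
    )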
-     ############################################################################
+     ###########################################################################
+     @staticmethod
      def open_s3_zarr_store_with_xarray(
-         self,
          ship_name: str,
          cruise_name: str,
          sensor_name: str,
          file_name_stem: str,
-         input_bucket_name: str,
-         endpoint_url=None,
+         bucket_name: str,
+         # level: str, # TODO: add level
+         endpoint_url: Optional[str] = None, # needed for moto testing
      ) -> xr.Dataset:
-         print(
-             "Opening L1 Zarr store in S3 with Xarray."
-         ) # TODO: Is this only used for reading from?
+         print("Opening L1 Zarr store in S3 with Xarray.")
          try:
-             zarr_path = f"s3://{input_bucket_name}/level_1/{ship_name}/{cruise_name}/{sensor_name}/{file_name_stem}.zarr"
-             s3fs_manager = S3FSManager(endpoint_url=endpoint_url)
-             store_s3_map = s3fs_manager.s3_map(s3_zarr_store_path=zarr_path)
-             ds = xr.open_dataset(filename_or_obj=store_s3_map, engine="zarr", chunks={})
+             zarr_path = f"s3://{bucket_name}/level_1/{ship_name}/{cruise_name}/{sensor_name}/{file_name_stem}.zarr"
+             kwargs = {"consolidated": False}
+             ds = xr.open_dataset(
+                 filename_or_obj=zarr_path,
+                 engine="zarr",
+                 backend_kwargs={
+                     "storage_options": {
+                         "endpoint_url": endpoint_url,
+                         "anon": True,
+                     },
+                 },
+                 **kwargs,
+             )
              return ds
          except Exception as err:
              raise RuntimeError(f"Problem opening Zarr store in S3 as Xarray, {err}")
-         finally:
-             print("Exiting opening Zarr store in S3 as Xarray.")
 
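Both xarray helpers now open the store by its s3:// URL and forward storage_options via backend_kwargs; consolidated=False is passed explicitly because these Zarr v3 stores are written without consolidated metadata. A sketch of an anonymous read, assuming xarray with the zarr engine and s3fs installed (URL hypothetical):

    import xarray as xr

    ds = xr.open_dataset(
        filename_or_obj="s3://example-bucket/level_1/Ship_Name/CRUISE/EK60/FILE.zarr",
        engine="zarr",
        backend_kwargs={"storage_options": {"anon": True}},  # public-bucket read
        consolidated=False,  # store has no consolidated metadata
    )
    print(ds)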
+     ###########################################################################
+     # TODO: can this be consolidated with above
+     @staticmethod
      def open_l2_zarr_store_with_xarray(
-         self,
          ship_name: str,
          cruise_name: str,
          sensor_name: str,
          bucket_name: str,
-         endpoint_url=None,
+         endpoint_url: Optional[str] = None, # needed for moto testing
      ) -> xr.Dataset:
          print("Opening L2 Zarr store in S3 with Xarray.")
          try:
-             zarr_path = f"s3://{bucket_name}/level_2/{ship_name}/{cruise_name}/{sensor_name}/{cruise_name}.zarr"
-             s3fs_manager = S3FSManager(endpoint_url=endpoint_url)
-             store_s3_map = s3fs_manager.s3_map(s3_zarr_store_path=zarr_path)
+             level = str(Constants.LEVEL_2.value)
+             zarr_path = f"s3://{bucket_name}/{level}/{ship_name}/{cruise_name}/{sensor_name}/{cruise_name}.zarr"
+             kwargs = {"consolidated": False}
              ds = xr.open_dataset(
-                 filename_or_obj=store_s3_map,
+                 filename_or_obj=zarr_path,
                  engine="zarr",
+                 backend_kwargs={
+                     "storage_options": {
+                         "endpoint_url": endpoint_url,
+                         "anon": True,
+                     }
+                 },
+                 **kwargs,
              )
+             return ds
          except Exception as err:
              raise RuntimeError(f"Problem opening Zarr store in S3 as Xarray, {err}")
-         print("Done opening Zarr store in S3 as Xarray.")
-         return ds
 
-     ############################################################################
+     ###########################################################################
 
-     #######################################################
+     ###########################################################################
      # def create_process_synchronizer(self):
      #     # TODO: explore aws redis options
      #     pass
 
-     #######################################################
+     ###########################################################################
      # def verify_cruise_store_data(self):
      #     # TODO: run a check on a finished model store to ensure that
      #     #     none of the time, latitude, longitude, or depth values
      #     #     are NaN.
      #     pass
 
-     #######################################################
+     ###########################################################################
 
 
  ###########################################################