water-column-sonar-processing 0.0.6__py3-none-any.whl → 26.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. water_column_sonar_processing/__init__.py +2 -5
  2. water_column_sonar_processing/aws/__init__.py +2 -2
  3. water_column_sonar_processing/aws/dynamodb_manager.py +257 -72
  4. water_column_sonar_processing/aws/s3_manager.py +184 -112
  5. water_column_sonar_processing/aws/s3fs_manager.py +29 -33
  6. water_column_sonar_processing/aws/sqs_manager.py +1 -1
  7. water_column_sonar_processing/cruise/create_empty_zarr_store.py +38 -97
  8. water_column_sonar_processing/cruise/datatree_manager.py +21 -0
  9. water_column_sonar_processing/cruise/resample_regrid.py +144 -129
  10. water_column_sonar_processing/geometry/__init__.py +10 -2
  11. water_column_sonar_processing/geometry/elevation_manager.py +111 -0
  12. water_column_sonar_processing/geometry/geometry_manager.py +60 -44
  13. water_column_sonar_processing/geometry/line_simplification.py +176 -0
  14. water_column_sonar_processing/geometry/pmtile_generation.py +242 -51
  15. water_column_sonar_processing/geometry/spatiotemporal.py +106 -0
  16. water_column_sonar_processing/index/index_manager.py +157 -27
  17. water_column_sonar_processing/model/zarr_manager.py +663 -258
  18. water_column_sonar_processing/processing/__init__.py +4 -0
  19. water_column_sonar_processing/processing/raw_to_netcdf.py +320 -0
  20. water_column_sonar_processing/processing/raw_to_zarr.py +341 -0
  21. water_column_sonar_processing/utility/__init__.py +9 -2
  22. water_column_sonar_processing/utility/cleaner.py +1 -0
  23. water_column_sonar_processing/utility/constants.py +69 -14
  24. water_column_sonar_processing/utility/pipeline_status.py +11 -15
  25. water_column_sonar_processing/utility/timestamp.py +3 -4
  26. water_column_sonar_processing-26.1.9.dist-info/METADATA +239 -0
  27. water_column_sonar_processing-26.1.9.dist-info/RECORD +34 -0
  28. {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-26.1.9.dist-info}/WHEEL +1 -1
  29. {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-26.1.9.dist-info/licenses}/LICENSE +1 -1
  30. water_column_sonar_processing/geometry/geometry_simplification.py +0 -82
  31. water_column_sonar_processing/process.py +0 -147
  32. water_column_sonar_processing-0.0.6.dist-info/METADATA +0 -123
  33. water_column_sonar_processing-0.0.6.dist-info/RECORD +0 -29
  34. {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-26.1.9.dist-info}/top_level.txt +0 -0
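The diff below covers water_column_sonar_processing/model/zarr_manager.py, the largest change in this release: the store layout moves from the zarr v2 API (numcodecs.Blosc, zarr.DirectoryStore, create_dataset, _ARRAY_DIMENSIONS attributes) to zarr-python 3 (BloscCodec, zarr.create_group, create_array with dimension_names). A minimal sketch of the new pattern, mirroring the calls the diff introduces; the array name, length, dtype, and store path here are illustrative, not taken from the package:

    import numpy as np
    import zarr
    from zarr.codecs import BloscCodec, BloscShuffle

    # Same codec configuration the new zarr_manager.py defines at module level.
    compressors = BloscCodec(cname="zstd", clevel=9, shuffle=BloscShuffle.bitshuffle)

    # zarr v3: create a group, then arrays; dimension_names replaces the v2-era
    # _ARRAY_DIMENSIONS attribute, and attributes are set at creation time.
    root = zarr.create_group(store="/tmp/example.zarr", zarr_format=3, overwrite=True)
    root.create_array(
        name="latitude",            # illustrative array name
        shape=1024,                 # illustrative length ("width" in the diff)
        dtype=np.dtype("float32"),  # stands in for Coordinates.LATITUDE_DTYPE
        chunks=(512,),
        compressor=compressors,     # keyword as used in the diff; zarr 3 also accepts compressors=
        fill_value=np.nan,
        attributes=dict(units="degrees_north", long_name="Latitude"),
        dimension_names=["time"],   # latitude is indexed by time
        overwrite=True,
    )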
@@ -1,274 +1,633 @@
  import os
+ from importlib import metadata
+ from typing import Optional
 
- import numcodecs
  import numpy as np
  import xarray as xr
  import zarr
- from numcodecs import Blosc
+ from zarr.codecs import BloscCodec, BloscShuffle
+ from zarr.core.group import Group
 
- from water_column_sonar_processing.aws.s3fs_manager import S3FSManager
- from water_column_sonar_processing.utility.constants import Constants, Coordinates
- from water_column_sonar_processing.utility.timestamp import Timestamp
+ from water_column_sonar_processing.utility import Constants, Coordinates, Timestamp
 
- numcodecs.blosc.use_threads = False
- numcodecs.blosc.set_nthreads(1)
+ # https://zarr-specs.readthedocs.io/en/latest/v3/codecs/blosc/index.html
+ compressors = BloscCodec(
+     cname="zstd",
+     clevel=9,
+     shuffle=BloscShuffle.bitshuffle,
+ )
 
 
- # TODO: when ready switch to version 3 of model spec
- # ZARR_V3_EXPERIMENTAL_API = 1
-
-
- # creates the latlon data: foo = ep.consolidate.add_location(ds_Sv, echodata)
+ # creates the latlon dataset: foo = ep.consolidate.add_location(ds_Sv, echodata)
  class ZarrManager:
      #######################################################
      def __init__(
          self,
+         # endpoint_url: Optional[str] = None,
      ):
-         # TODO: revert to Blosc.BITSHUFFLE, troubleshooting misc error
-         self.__compressor = Blosc(cname="zstd", clevel=2)  # shuffle=Blosc.NOSHUFFLE
          self.__overwrite = True
-         self.__num_threads = numcodecs.blosc.get_nthreads()
-         self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
-         self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
+         self.key = os.environ.get("OUTPUT_BUCKET_ACCESS_KEY")
+         self.secret = os.environ.get("OUTPUT_BUCKET_SECRET_ACCESS_KEY")
 
      #######################################################
      @staticmethod
      def get_depth_values(
-         min_echo_range: float = 1.0,  # minimum depth measured (zero non-inclusive) from whole cruise
-         max_echo_range: float = 100.0,  # maximum depth measured from whole cruise
-     ):
+         max_echo_range: float,  # maximum depth measured from whole cruise
+         cruise_min_epsilon: float = 0.20,  # delta subsequent measurements
+     ) -> np.ndarray[tuple]:
          # Gets the set of depth values that will be used when resampling and
-         # regridding the data to a cruise level model store.
-         # Note: returned values do not start at zero.
-         print("Getting depth values.")
-         all_cruise_depth_values = np.linspace(
-             start=min_echo_range,
-             stop=max_echo_range,
-             num=int(max_echo_range / min_echo_range) + 1,
+         # regridding the dataset to a cruise level model store.
+         # Note: returned values start at zero!
+         # For more info see here: https://echopype.readthedocs.io/en/stable/data-proc-additional.html
+         all_cruise_depth_values = np.linspace(  # TODO: PROBLEM HERE
+             start=0,  # start it at zero
+             stop=np.ceil(max_echo_range),  # round up
+             num=int(np.ceil(max_echo_range) / cruise_min_epsilon) + 1,
              endpoint=True,
          )
 
-         print("Done getting depth values.")
+         if np.any(np.isnan(all_cruise_depth_values)):
+             raise Exception("Problem depth values returned were NaN.")
+
          return all_cruise_depth_values.round(decimals=2)
 
      #######################################################
      def create_zarr_store(
          self,
-         path: str,
+         path: str,  # 'level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.model/tmp/HB0707.zarr/.zattrs'
          ship_name: str,
          cruise_name: str,
          sensor_name: str,
-         frequencies: list,  # units in Hz
-         width: int,  # TODO: needs better name... "ping_time"
-         min_echo_range: float,  # smallest resolution in meters
+         frequencies: list,  # units in Hz, type(frequencies) == np.ndarray
+         width: int,
          max_echo_range: float,
          calibration_status: bool = False,  # Assume uncalibrated
      ) -> str:
-         print(
-             f"Creating local zarr_manager store at {cruise_name}.zarr for ship {ship_name}"
-         )
-
-         # There should be no repeated frequencies
-         assert len(frequencies) == len(set(frequencies))
-         # TODO: eventually switch coordinate to "channel"
-
-         print(f"Debugging number of threads: {self.__num_threads}")
-
-         zarr_path = f"{path}/{cruise_name}.zarr"
-         store = zarr.DirectoryStore(path=zarr_path, normalize_keys=False)
-         root = zarr.group(store=store, overwrite=self.__overwrite, cache_attrs=True)
-
-         #####################################################################
-         # --- Coordinate: Time --- #
-         # https://zarr.readthedocs.io/en/stable/spec/v2.html#data-type-encoding
-         root.create_dataset(
-             name=Coordinates.TIME.value,
-             data=np.repeat(0.0, width),
-             shape=width,
-             chunks=(
-                 Constants.TILE_SIZE.value,
-             ),  # TODO: the chunking scheme doesn't seem to be working here
-             dtype=np.dtype(Coordinates.TIME_DTYPE.value),
-             compressor=self.__compressor,
-             # fill_value=0.,
-             fill_value=np.nan,  # TODO: do i want nan's?
-             overwrite=self.__overwrite,
-         )
-
-         root.time.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
-
-         root.time.attrs["calendar"] = Coordinates.TIME_CALENDAR.value
-         root.time.attrs["units"] = Coordinates.TIME_UNITS.value
-         root.time.attrs["long_name"] = Coordinates.TIME_LONG_NAME.value
-         root.time.attrs["standard_name"] = Coordinates.TIME_STANDARD_NAME.value
-
-         #####################################################################
-         # --- Coordinate: Depth --- #
-         depth_values = self.get_depth_values(
-             min_echo_range=min_echo_range, max_echo_range=max_echo_range
-         )
-
-         root.create_dataset(
-             name=Coordinates.DEPTH.value,
-             # TODO: verify that these values are correct
-             data=depth_values,
-             shape=len(depth_values),
-             chunks=Constants.TILE_SIZE.value,
-             dtype=np.dtype(
-                 Coordinates.DEPTH_DTYPE.value
-             ),  # float16 == 2 significant digits would be ideal
-             compressor=self.__compressor,
-             # fill_value=np.nan,
-             overwrite=self.__overwrite,
-         )
-         # TODO: change to exception
-         assert not np.any(np.isnan(depth_values))
-
-         root.depth.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.DEPTH.value]
-
-         root.depth.attrs["long_name"] = Coordinates.DEPTH_LONG_NAME.value
-         root.depth.attrs["units"] = Coordinates.DEPTH_UNITS.value
-
-         #####################################################################
-         # --- Coordinate: Latitude --- #
-         root.create_dataset(
-             name=Coordinates.LATITUDE.value,
-             data=np.repeat(0.0, width),
-             shape=width,
-             chunks=Constants.TILE_SIZE.value,
-             dtype=np.dtype(Coordinates.LATITUDE_DTYPE.value),
-             compressor=self.__compressor,
-             fill_value=0.0,
-             overwrite=self.__overwrite,
-         )
-
-         root.latitude.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
-
-         root.latitude.attrs["long_name"] = Coordinates.LATITUDE_LONG_NAME.value
-         root.latitude.attrs["units"] = Coordinates.LATITUDE_UNITS.value
-
-         #####################################################################
-         # --- Coordinate: Longitude --- #
-         root.create_dataset(
-             name=Coordinates.LONGITUDE.value,
-             data=np.repeat(0.0, width),  # root.longitude[:] = np.nan
-             shape=width,
-             chunks=Constants.TILE_SIZE.value,
-             dtype=np.dtype(Coordinates.LONGITUDE_DTYPE.value),
-             compressor=self.__compressor,
-             fill_value=0.0,
-             overwrite=self.__overwrite,
-         )
-
-         root.longitude.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
-
-         root.longitude.attrs["long_name"] = Coordinates.LONGITUDE_LONG_NAME.value
-         root.longitude.attrs["units"] = Coordinates.LONGITUDE_UNITS.value
-
-         #####################################################################
-         # TODO: verify adding this variable for where the bottom was detected
-         # --- Coordinate: Bottom --- #
-         root.create_dataset(
-             name=Coordinates.BOTTOM.value,
-             # data=np.repeat(0.0, width),  # root.longitude[:] = np.nan
-             shape=width,
-             chunks=Constants.TILE_SIZE.value,
-             dtype=np.dtype(Coordinates.BOTTOM_DTYPE.value),
-             compressor=self.__compressor,
-             fill_value=np.nan,
-             overwrite=self.__overwrite,
-         )
-
-         root.longitude.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
-
-         root.longitude.attrs["long_name"] = Coordinates.LONGITUDE_LONG_NAME.value
-         root.longitude.attrs["units"] = Coordinates.LONGITUDE_UNITS.value
-
-         #####################################################################
-         # --- Coordinate: Frequency --- #
-         root.create_dataset(
-             name=Coordinates.FREQUENCY.value,
-             data=frequencies,
-             shape=len(frequencies),
-             chunks=1,
-             dtype=np.dtype(Coordinates.FREQUENCY_DTYPE.value),
-             compressor=self.__compressor,
-             fill_value=0.0,
-             overwrite=self.__overwrite,
-         )
-
-         # TODO: best coordinate would be channel with str type
-         root.frequency.attrs["_ARRAY_DIMENSIONS"] = [
-             Coordinates.FREQUENCY.value
-         ]  # TODO: is this correct
-
-         root.frequency.attrs["long_name"] = Coordinates.FREQUENCY_LONG_NAME.value
-         root.frequency.attrs["standard_name"] = (
-             Coordinates.FREQUENCY_STANDARD_NAME.value
-         )
-         root.frequency.attrs["units"] = Coordinates.FREQUENCY_UNITS.value
-
-         #####################################################################
-         # --- Sv Data --- #
-         root.create_dataset(
-             name=Coordinates.SV.value,
-             shape=(len(depth_values), width, len(frequencies)),
-             chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, 1),
-             dtype=np.dtype(
-                 Coordinates.SV_DTYPE.value
-             ),  # TODO: try to experiment with 'float16'
-             compressor=self.__compressor,
-             fill_value=np.nan,
-             overwrite=self.__overwrite,
-         )
-
-         root.Sv.attrs["_ARRAY_DIMENSIONS"] = [
-             Coordinates.DEPTH.value,
-             Coordinates.TIME.value,
-             Coordinates.FREQUENCY.value,
-         ]
-
-         root.Sv.attrs["long_name"] = Coordinates.SV_LONG_NAME.value
-         root.Sv.attrs["units"] = Coordinates.SV_UNITS.value
-         root.Sv.attrs["tile_size"] = Constants.TILE_SIZE.value
-
-         #####################################################################
-         # --- Metadata --- #
-         root.attrs["ship_name"] = ship_name
-         root.attrs["cruise_name"] = cruise_name
-         root.attrs["sensor_name"] = sensor_name
-         #
-         root.attrs["processing_software_name"] = Coordinates.PROJECT_NAME.value
-         root.attrs["processing_software_version"] = (
-             "0.0.6"  # TODO: get programmatically
-         )
-         root.attrs["processing_software_time"] = Timestamp.get_timestamp()
-         #
-         root.attrs["calibration_status"] = calibration_status
-
-         zarr.consolidate_metadata(store)
-         #####################################################################
          """
-         # zzz = zarr.open('https://echofish-dev-master-118234403147-echofish-zarr-store.s3.us-west-2.amazonaws.com/GU1002_resample.zarr')
-         # zzz.time[0] = 1274979445.423
-         # Initialize all to origin time, will be overwritten late
+         Creates a new zarr store in a local temporary directory(?)
          """
-         return zarr_path
+         try:
+             print(f"Creating local zarr store, {cruise_name}.zarr for ship {ship_name}")
+             if len(frequencies) != len(set(frequencies)):
+                 raise Exception(
+                     "Number of frequencies does not match number of channels"
+                 )
+
+             zarr_path = f"{path}/{cruise_name}.zarr"
+             #####################################################################
+             frequencies = np.array(
+                 frequencies, dtype=np.dtype(Coordinates.FREQUENCY_DTYPE.value)
+             )
+             #####################################################################
+             # Define the chunk sizes and the encoding
+             depth_chunk_shape = (Constants.TILE_SIZE.value,)
+             time_chunk_shape = (Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,)
+             frequency_chunk_shape = (len(frequencies),)
+             latitude_chunk_shape = (Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,)
+             longitude_chunk_shape = (Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,)
+             bottom_chunk_shape = (Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,)
+             speed_chunk_shape = (Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,)
+             distance_chunk_shape = (Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,)
+             sv_chunk_shape = (Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, 1)
+             #####################################################################
+             root = zarr.create_group(store=zarr_path, zarr_format=3, overwrite=True)
+             #####################################################################
+             # --- Coordinate: Time --- #
+             # https://zarr.readthedocs.io/en/stable/spec/v2.html#data-type-encoding
+             # "data_type": "int64", "fill_value": 0, "units": "nanoseconds since 1970-01-01", "calendar": "proleptic_gregorian"
+             #
+             time_values = np.repeat(0.0, width)
+             time_values.astype(np.dtype(Coordinates.TIME_DTYPE.value))
+             root.create_array(
+                 name=Coordinates.TIME.value,
+                 # shape=width_indices,
+                 # dtype=np.dtype(Coordinates.TIME_DTYPE.value),
+                 data=time_values,
+                 chunks=time_chunk_shape,
+                 compressor=compressors,
+                 fill_value=np.nan,
+                 attributes=dict(
+                     calendar=Coordinates.TIME_CALENDAR.value,
+                     units=Coordinates.TIME_UNITS.value,
+                     long_name=Coordinates.TIME_LONG_NAME.value,
+                     standard_name=Coordinates.TIME_STANDARD_NAME.value,
+                 ),
+                 dimension_names=[Coordinates.TIME.value],
+                 overwrite=True,
+             )
+             #####################################################################
+             #####################################################################
+             # # --- Coordinate: Depth --- #
+             depth_data_values = self.get_depth_values(
+                 max_echo_range=max_echo_range,
+             )
+             depth_data = np.array(
+                 depth_data_values, dtype=Coordinates.DEPTH_DTYPE.value
+             )
+             root.create_array(
+                 name=Coordinates.DEPTH.value,
+                 # shape=depth_indices,
+                 # dtype=np.dtype(Coordinates.DEPTH_DTYPE.value),
+                 data=depth_data,
+                 chunks=depth_chunk_shape,
+                 compressor=compressors,
+                 # fill_value=np.nan,
+                 attributes=dict(
+                     units=Coordinates.DEPTH_UNITS.value,
+                     long_name=Coordinates.DEPTH_LONG_NAME.value,
+                     standard_name=Coordinates.DEPTH_STANDARD_NAME.value,
+                 ),
+                 dimension_names=[Coordinates.DEPTH.value],  # TODO: is this right
+                 overwrite=True,
+             )
+             # #####################################################################
+             # # --- Coordinate: Latitude --- #
+             # latitude_values = np.rep(np.nan, width_indices)
+             # latitude_values.astype(np.dtype(Coordinates.LATITUDE_DTYPE.value))
+             root.create_array(
+                 name=Coordinates.LATITUDE.value,
+                 shape=width,
+                 dtype=np.dtype(Coordinates.LATITUDE_DTYPE.value),
+                 # data=latitude_values,
+                 chunks=latitude_chunk_shape,
+                 compressor=compressors,
+                 fill_value=np.nan,
+                 attributes=dict(
+                     units=Coordinates.LATITUDE_UNITS.value,
+                     long_name=Coordinates.LATITUDE_LONG_NAME.value,
+                     standard_name=Coordinates.LATITUDE_STANDARD_NAME.value,
+                 ),
+                 dimension_names=[Coordinates.TIME.value],
+                 overwrite=True,
+             )
+             # #####################################################################
+             # # --- Coordinate: Longitude --- #
+             # longitude_values = np.arange(0, width_indices)
+             # longitude_values.astype(np.dtype(Coordinates.LONGITUDE_DTYPE.value))
+             root.create_array(
+                 name=Coordinates.LONGITUDE.value,
+                 shape=width,
+                 dtype=np.dtype(Coordinates.LONGITUDE_DTYPE.value),
+                 # data=longitude_values,
+                 chunks=longitude_chunk_shape,
+                 compressor=compressors,
+                 fill_value=np.nan,
+                 attributes=dict(
+                     units=Coordinates.LONGITUDE_UNITS.value,
+                     long_name=Coordinates.LONGITUDE_LONG_NAME.value,
+                     standard_name=Coordinates.LONGITUDE_STANDARD_NAME.value,
+                 ),
+                 dimension_names=[
+                     Coordinates.TIME.value
+                 ],  # Note: LONGITUDE is indexed by TIME
+                 overwrite=True,
+             )
+             # #####################################################################
+             # # --- Coordinate: Bottom --- #
+             # bottom_values = np.repeat(12.34, width_indices)
+             # bottom_values.astype(np.dtype(Coordinates.BOTTOM_DTYPE.value))
+             root.create_array(
+                 name=Coordinates.BOTTOM.value,
+                 shape=width,
+                 dtype=np.dtype(Coordinates.BOTTOM_DTYPE.value),
+                 # data=bottom_values,
+                 chunks=bottom_chunk_shape,
+                 compressor=compressors,
+                 fill_value=np.nan,
+                 attributes=dict(
+                     units=Coordinates.BOTTOM_UNITS.value,
+                     long_name=Coordinates.BOTTOM_LONG_NAME.value,
+                     standard_name=Coordinates.BOTTOM_STANDARD_NAME.value,
+                 ),
+                 dimension_names=[Coordinates.TIME.value],  # Note: _ is indexed by TIME
+                 overwrite=True,
+             )
+             # #####################################################################
+             # # --- Coordinate: Speed --- #
+             # speed_values = np.repeat(5.67, width_indices)
+             # speed_values.astype(np.dtype(Coordinates.SPEED_DTYPE.value))
+             root.create_array(
+                 name=Coordinates.SPEED.value,
+                 shape=width,
+                 dtype=np.dtype(Coordinates.SPEED_DTYPE.value),
+                 # data=speed_values,
+                 chunks=speed_chunk_shape,
+                 compressor=compressors,
+                 fill_value=np.nan,
+                 attributes=dict(
+                     units=Coordinates.SPEED_UNITS.value,
+                     long_name=Coordinates.SPEED_LONG_NAME.value,
+                     standard_name=Coordinates.SPEED_STANDARD_NAME.value,
+                 ),
+                 dimension_names=[Coordinates.TIME.value],  # Note: _ is indexed by TIME
+                 overwrite=True,
+             )
+             # #####################################################################
+             # # --- Coordinate: Distance --- #
+             # distance_values = np.repeat(8.90, width_indices)
+             # distance_values.astype(np.dtype(Coordinates.DISTANCE_DTYPE.value))
+             root.create_array(
+                 name=Coordinates.DISTANCE.value,
+                 shape=width,
+                 dtype=np.dtype(Coordinates.DISTANCE_DTYPE.value),
+                 # data=distance_values,
+                 chunks=distance_chunk_shape,
+                 compressor=compressors,
+                 fill_value=np.nan,
+                 attributes=dict(
+                     units=Coordinates.DISTANCE_UNITS.value,
+                     long_name=Coordinates.DISTANCE_LONG_NAME.value,
+                     standard_name=Coordinates.DISTANCE_STANDARD_NAME.value,
+                 ),
+                 dimension_names=[Coordinates.TIME.value],  # Note: _ is indexed by TIME
+                 overwrite=True,
+             )
+             # #####################################################################
+             # # --- Coordinate: Frequency --- #
+             root.create_array(
+                 name=Coordinates.FREQUENCY.value,
+                 # shape=frequency_indices,
+                 # dtype=np.dtype(Coordinates.FREQUENCY_DTYPE.value),
+                 data=frequencies,
+                 # chunks=(Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,),
+                 chunks=frequency_chunk_shape,
+                 compressor=compressors,
+                 # fill_value=0,
+                 attributes=dict(
+                     units=Coordinates.FREQUENCY_UNITS.value,
+                     long_name=Coordinates.FREQUENCY_LONG_NAME.value,
+                     standard_name=Coordinates.FREQUENCY_STANDARD_NAME.value,
+                 ),
+                 dimension_names=[Coordinates.FREQUENCY.value],
+                 overwrite=True,
+             )
+             # #####################################################################
+             # # --- Sv Data --- #
+             root.create_array(
+                 name=Coordinates.SV.value,
+                 shape=(len(depth_data), width, len(frequencies)),
+                 dtype=np.dtype(Coordinates.SV_DTYPE.value),
+                 # data=,
+                 chunks=sv_chunk_shape,
+                 compressor=compressors,
+                 fill_value=np.nan,
+                 attributes=dict(
+                     units=Coordinates.SV_UNITS.value,
+                     long_name=Coordinates.SV_LONG_NAME.value,
+                     standard_name=Coordinates.SV_STANDARD_NAME.value,
+                 ),
+                 dimension_names=[
+                     Coordinates.DEPTH.value,
+                     Coordinates.TIME.value,
+                     Coordinates.FREQUENCY.value,
+                 ],
+                 overwrite=True,
+             )
+             #####################################################################
+             # # --- Metadata --- #
+             root.attrs["ship_name"] = ship_name
+             root.attrs["cruise_name"] = cruise_name
+             root.attrs["sensor_name"] = sensor_name
+             #
+             root.attrs["processing_software_name"] = Coordinates.PROJECT_NAME.value
+             # NOTE: for the version to be parsable you need to build the python package locally first.
+             root.attrs["processing_software_version"] = metadata.version(
+                 "water-column-sonar-processing"
+             )
+             root.attrs["processing_software_time"] = Timestamp.get_timestamp()
+             #
+             root.attrs["calibration_status"] = calibration_status
+             root.attrs["tile_size"] = Constants.TILE_SIZE.value
+             #
+             return zarr_path
+         except Exception as err:
+             raise RuntimeError(f"Problem trying to create zarr store, {err}")
 
-     ############################################################################
-     # def update_zarr_store(
+     # #######################################################
+     # def create_zarr_store_old(
      #     self,
-     #     path: str,
+     #     path: str,  # 'level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.model/tmp/HB0707.zarr/.zattrs'
      #     ship_name: str,
-     #     cruise_name: str,  # TODO: just pass stem
+     #     cruise_name: str,
      #     sensor_name: str,
-     # ) -> None:
+     #     frequencies: list,  # units in Hz
+     #     width: int,
+     #     max_echo_range: float,
+     #     # cruise_min_epsilon: float,  # smallest resolution in meters
+     #     calibration_status: bool = False,  # Assume uncalibrated
+     # ) -> str:
      #     """
-     #     Opens an existing Zarr store living in a s3 bucket for the purpose
-     #     of updating just a subset of the cruise-level Zarr store associated
-     #     with a file-level Zarr store.
+     #     Creates a new zarr store in a local temporary directory(?)
      #     """
-     #     pass
+     #     try:
+     #         print(f"Creating local zarr store, {cruise_name}.zarr for ship {ship_name}")
+     #         if len(frequencies) != len(set(frequencies)):
+     #             raise Exception(
+     #                 "Number of frequencies does not match number of channels"
+     #             )
+     #
+     #         zarr_path = f"{path}/{cruise_name}.zarr"
+     #         #####################################################################
+     #         # Define the chunk sizes and the encoding
+     #         # 1_000_000 data points for quickest download
+     #         spatiotemporal_chunk_size = int(1e6)
+     #         depth_chunk_shape = (512,)
+     #         time_chunk_shape = (spatiotemporal_chunk_size,)
+     #         frequency_chunk_shape = (len(frequencies),)
+     #         latitude_chunk_shape = (spatiotemporal_chunk_size,)
+     #         longitude_chunk_shape = (spatiotemporal_chunk_size,)
+     #         bottom_chunk_shape = (spatiotemporal_chunk_size,)
+     #         speed_chunk_shape = (spatiotemporal_chunk_size,)
+     #         distance_chunk_shape = (spatiotemporal_chunk_size,)
+     #         sv_chunk_shape = (512, 512, 1)  # TODO: move to constants
+     #
+     #         #####################################################################
+     #         ##### Depth #####
+     #         depth_data_values = self.get_depth_values(
+     #             max_echo_range=max_echo_range,
+     #         )
+     #
+     #         depth_data = np.array(
+     #             depth_data_values, dtype=Coordinates.DEPTH_DTYPE.value
+     #         )
+     #         depth_da = xr.DataArray(
+     #             data=depth_data,
+     #             dims=Coordinates.DEPTH.value,
+     #             name=Coordinates.DEPTH.value,
+     #             attrs=dict(
+     #                 units=Coordinates.DEPTH_UNITS.value,
+     #                 long_name=Coordinates.DEPTH_LONG_NAME.value,
+     #                 standard_name=Coordinates.DEPTH_STANDARD_NAME.value,
+     #             ),
+     #         )
+     #
+     #         ##### Time #####
+     #         # https://zarr.readthedocs.io/en/stable/spec/v2.html#data-type-encoding
+     #         time_data = np.array(
+     #             np.repeat(np.datetime64(0, "ns"), width),
+     #             dtype="datetime64[ns]",
+     #         )
+     #         time_da = xr.DataArray(
+     #             data=time_data,
+     #             dims=Coordinates.TIME.value,
+     #             name=Coordinates.TIME.value,
+     #             attrs=dict(
+     #                 # Note: cal & units are written automatically by xarray
+     #                 # calendar="proleptic_gregorian",
+     #                 # units="seconds since 1970-01-01 00:00:00",
+     #                 long_name=Coordinates.TIME_LONG_NAME.value,
+     #                 standard_name=Coordinates.TIME_STANDARD_NAME.value,
+     #             ),
+     #         )
+     #
+     #         ##### Frequency #####
+     #         frequency_data = np.array(
+     #             frequencies,
+     #             dtype=np.dtype(Coordinates.FREQUENCY_DTYPE.value),
+     #         )
+     #         frequency_da = xr.DataArray(
+     #             data=frequency_data,
+     #             dims=Coordinates.FREQUENCY.value,
+     #             name=Coordinates.FREQUENCY.value,
+     #             attrs=dict(
+     #                 units=Coordinates.FREQUENCY_UNITS.value,
+     #                 long_name=Coordinates.FREQUENCY_LONG_NAME.value,
+     #                 standard_name=Coordinates.FREQUENCY_STANDARD_NAME.value,
+     #             ),
+     #         )
+     #
+     #         ##### Latitude #####
+     #         gps_data = np.array(
+     #             np.repeat(np.nan, width),
+     #             dtype=np.dtype(Coordinates.LATITUDE_DTYPE.value),
+     #         )
+     #         latitude_da = xr.DataArray(
+     #             data=gps_data,
+     #             coords=dict(
+     #                 time=time_da,
+     #             ),
+     #             dims=Coordinates.TIME.value,  # Note: "TIME"
+     #             name=Coordinates.LATITUDE.value,
+     #             attrs=dict(
+     #                 units=Coordinates.LATITUDE_UNITS.value,
+     #                 long_name=Coordinates.LATITUDE_LONG_NAME.value,
+     #                 standard_name=Coordinates.LATITUDE_STANDARD_NAME.value,
+     #             ),
+     #         )  # Note: LATITUDE is indexed by TIME
+     #
+     #         ##### Longitude #####
+     #         longitude_da = xr.DataArray(
+     #             data=gps_data,
+     #             coords=dict(
+     #                 time=time_da,
+     #             ),
+     #             dims=Coordinates.TIME.value,  # Note: "TIME"
+     #             name=Coordinates.LONGITUDE.value,
+     #             attrs=dict(
+     #                 units=Coordinates.LONGITUDE_UNITS.value,
+     #                 long_name=Coordinates.LONGITUDE_LONG_NAME.value,
+     #                 standard_name=Coordinates.LONGITUDE_STANDARD_NAME.value,
+     #             ),
+     #         )  # Note: LONGITUDE is indexed by TIME
+     #
+     #         ##### Bottom #####
+     #         bottom_data = np.array(
+     #             np.repeat(np.nan, width), dtype=np.dtype(Coordinates.BOTTOM_DTYPE.value)
+     #         )
+     #         bottom_da = xr.DataArray(
+     #             data=bottom_data,
+     #             coords=dict(
+     #                 time=time_da,
+     #             ),
+     #             dims=Coordinates.TIME.value,  # Note: "TIME"
+     #             name=Coordinates.BOTTOM.value,
+     #             attrs=dict(
+     #                 units=Coordinates.BOTTOM_UNITS.value,
+     #                 long_name=Coordinates.BOTTOM_LONG_NAME.value,
+     #                 standard_name=Coordinates.BOTTOM_STANDARD_NAME.value,
+     #             ),
+     #         )
+     #
+     #         ##### Speed #####
+     #         speed_data = np.array(
+     #             np.repeat(np.nan, width), dtype=np.dtype(Coordinates.SPEED_DTYPE.value)
+     #         )
+     #         speed_da = xr.DataArray(
+     #             data=speed_data,
+     #             coords=dict(
+     #                 time=time_da,
+     #             ),
+     #             dims=Coordinates.TIME.value,  # Note: "TIME"
+     #             name=Coordinates.SPEED.value,
+     #             attrs=dict(
+     #                 units=Coordinates.SPEED_UNITS.value,
+     #                 long_name=Coordinates.SPEED_LONG_NAME.value,
+     #                 standard_name=Coordinates.SPEED_STANDARD_NAME.value,
+     #             ),
+     #         )
+     #
+     #         ##### Distance #####
+     #         distance_data = np.array(
+     #             np.repeat(np.nan, width),
+     #             dtype=np.dtype(Coordinates.DISTANCE_DTYPE.value),
+     #         )
+     #         distance_da = xr.DataArray(
+     #             data=distance_data,
+     #             coords=dict(
+     #                 time=time_da,
+     #             ),
+     #             dims=Coordinates.TIME.value,  # Note: "TIME"
+     #             name=Coordinates.DISTANCE.value,
+     #             attrs=dict(
+     #                 units=Coordinates.DISTANCE_UNITS.value,
+     #                 long_name=Coordinates.DISTANCE_LONG_NAME.value,
+     #                 standard_name=Coordinates.DISTANCE_STANDARD_NAME.value,
+     #             ),
+     #         )
+     #
+     #         ##### Sv #####
+     #         gc.collect()
+     #         # sv_data = np.empty(
+     #         #     (len(depth_data), width, len(frequencies)),
+     #         #     # (2501, 4_100_782, 4),  # large cruise used for testing
+     #         #     dtype=np.dtype(Coordinates.SV_DTYPE.value),
+     #         # )
+     #         sv_data = np.full(
+     #             (len(depth_data), width, len(frequencies)),
+     #             np.nan,
+     #             dtype=np.dtype(Coordinates.SV_DTYPE.value),
+     #         )
+     #         print(f"one: {sys.getsizeof(sv_data)}")
+     #         # sv_data[:] = np.nan  # initialize all
+     #
+     #         sv_da = xr.DataArray(
+     #             data=sv_data,
+     #             coords=dict(
+     #                 depth=depth_da,
+     #                 time=time_da,
+     #                 frequency=frequency_da,
+     #                 #
+     #                 latitude=latitude_da,
+     #                 longitude=longitude_da,
+     #                 bottom=bottom_da,
+     #                 speed=speed_da,
+     #                 distance=distance_da,
+     #             ),
+     #             dims=(  # Depth * Time * Frequency
+     #                 Coordinates.DEPTH.value,
+     #                 Coordinates.TIME.value,
+     #                 Coordinates.FREQUENCY.value,
+     #             ),
+     #             name=Coordinates.SV.value,
+     #             attrs=dict(
+     #                 units=Coordinates.SV_UNITS.value,
+     #                 long_name=Coordinates.SV_LONG_NAME.value,
+     #                 standard_name=Coordinates.SV_STANDARD_NAME.value,
+     #                 tiles_size=Constants.TILE_SIZE.value,
+     #                 _FillValue=np.nan,
+     #             ),
+     #         )
+     #         print(f"two: {sys.getsizeof(sv_data)}")  # getting to at least here
+     #         del sv_data
+     #         sv_da.encoding = {"compressors": [compressor], "chunks": sv_chunk_shape}
+     #         # sv_da = sv_da.astype(np.float32)  # was crashing here
+     #         gc.collect()
+     #         #####################################################################
+     #         ### Now create the xarray.Dataset
+     #         ds = xr.Dataset(
+     #             data_vars=dict(
+     #                 Sv=sv_da,
+     #                 #
+     #                 bottom=bottom_da,
+     #                 speed=speed_da,
+     #                 distance=distance_da,
+     #             ),
+     #             coords=dict(
+     #                 depth=depth_da,
+     #                 time=time_da,
+     #                 frequency=frequency_da,
+     #                 #
+     #                 latitude=latitude_da,
+     #                 longitude=longitude_da,
+     #             ),
+     #             attrs=dict(
+     #                 # --- Metadata --- #
+     #                 ship_name=ship_name,
+     #                 cruise_name=cruise_name,
+     #                 sensor_name=sensor_name,
+     #                 processing_software_name=Coordinates.PROJECT_NAME.value,
+     #                 # NOTE: for the version to be parsable you need to build the python package
+     #                 # locally first.
+     #                 processing_software_version=importlib.metadata.version(
+     #                     "water-column-sonar-processing"
+     #                 ),
+     #                 processing_software_time=Timestamp.get_timestamp(),
+     #                 calibration_status=calibration_status,
+     #                 tile_size=Constants.TILE_SIZE.value,
+     #             ),
+     #         )
+     #         del sv_da
+     #         gc.collect()
+     #         print(f"three: {sys.getsizeof(ds)}")
+     #         #####################################################################
+     #         encodings = dict(
+     #             depth={
+     #                 "compressors": [compressor],
+     #                 "chunks": depth_chunk_shape,
+     #             },
+     #             time={
+     #                 "compressors": [compressor],
+     #                 "chunks": time_chunk_shape,
+     #                 "units": Coordinates.TIME_UNITS.value,
+     #             },
+     #             frequency={
+     #                 "compressors": [compressor],
+     #                 "chunks": frequency_chunk_shape,
+     #             },
+     #             latitude={
+     #                 "compressors": [compressor],
+     #                 "chunks": latitude_chunk_shape,
+     #             },
+     #             longitude={
+     #                 "compressors": [compressor],
+     #                 "chunks": longitude_chunk_shape,
+     #             },
+     #             bottom={
+     #                 "compressors": [compressor],
+     #                 "chunks": bottom_chunk_shape,
+     #             },
+     #             speed={
+     #                 "compressors": [compressor],
+     #                 "chunks": speed_chunk_shape,
+     #             },
+     #             distance={
+     #                 "compressors": [compressor],
+     #                 "chunks": distance_chunk_shape,
+     #             },
+     #             Sv={
+     #                 "compressors": [compressor],
+     #                 "chunks": sv_chunk_shape,
+     #             },
+     #         )
+     #         gc.collect()
+     #         ds.to_zarr(
+     #             store=zarr_path,
+     #             mode="w",  # “w” means create (overwrite if exists)
+     #             encoding=encodings,
+     #             consolidated=False,
+     #             safe_chunks=False,
+     #             align_chunks=True,
+     #             zarr_format=3,
+     #             write_empty_chunks=False,  # Might need to change this
+     #         )
+     #         #####################################################################
+     #         return zarr_path
+     #     except Exception as err:
+     #         raise RuntimeError(f"Problem trying to create zarr store, {err}")
+     #     # finally:
+     #     #     cleaner = Cleaner()
+     #     #     cleaner.delete_local_files()
+     #     # TODO: should delete zarr store in temp directory too?
 
      ############################################################################
      def open_s3_zarr_store_with_zarr(
@@ -276,60 +635,106 @@ class ZarrManager:
          ship_name: str,
          cruise_name: str,
          sensor_name: str,
-         # zarr_synchronizer: Union[str, None] = None,
-     ):
+         output_bucket_name: str,
+         endpoint_url: Optional[str] = None,
+     ) -> Group:
          # Mounts a Zarr store using pythons Zarr implementation. The mounted store
          # will have read/write privileges so that store can be updated.
-         print("Opening Zarr store with Zarr.")
+         print("Opening L2 Zarr store with Zarr for writing.")
          try:
-             s3fs_manager = S3FSManager()
-             root = f"{self.output_bucket_name}/level_2/{ship_name}/{cruise_name}/{sensor_name}/{cruise_name}.zarr"
-             store = s3fs_manager.s3_map(s3_zarr_store_path=root)
-             # synchronizer = model.ProcessSynchronizer(f"/tmp/{ship_name}_{cruise_name}.sync")
-             cruise_zarr = zarr.open(store=store, mode="r+")
+             level = str(Constants.LEVEL_2.value)
+             store = f"s3://{output_bucket_name}/{level}/{ship_name}/{cruise_name}/{sensor_name}/{cruise_name}.zarr"
+             print(f"endpoint url: {endpoint_url}")
+             cruise_zarr = zarr.open(
+                 store=store,
+                 mode="r+",
+                 zarr_format=3,
+                 storage_options={
+                     "endpoint_url": endpoint_url,
+                     "key": self.key,
+                     "secret": self.secret,
+                 },
+             )
+             print("Done opening store with Zarr.")
+             return cruise_zarr
          except Exception as err:  # Failure
-             print(f"Exception encountered opening Zarr store with Zarr.: {err}")
-             raise
-         print("Done opening Zarr store with Zarr.")
-         return cruise_zarr
+             raise RuntimeError(f"Exception encountered opening store with Zarr, {err}")
 
-     ############################################################################
+     ###########################################################################
+     @staticmethod
      def open_s3_zarr_store_with_xarray(
-         self,
          ship_name: str,
          cruise_name: str,
          sensor_name: str,
          file_name_stem: str,
+         bucket_name: str,
+         # level: str,  # TODO: add level
+         endpoint_url: Optional[str] = None,  # needed for moto testing
      ) -> xr.Dataset:
-         print("Opening Zarr store in S3 as Xarray.")
+         print("Opening L1 Zarr store in S3 with Xarray.")
          try:
-             zarr_path = f"s3://{self.output_bucket_name}/level_1/{ship_name}/{cruise_name}/{sensor_name}/{file_name_stem}.zarr"
-             s3fs_manager = S3FSManager()
-             store_s3_map = s3fs_manager.s3_map(s3_zarr_store_path=zarr_path)
-             ds = xr.open_zarr(
-                 store=store_s3_map, consolidated=None
-             )  # synchronizer=SYNCHRONIZER
+             zarr_path = f"s3://{bucket_name}/level_1/{ship_name}/{cruise_name}/{sensor_name}/{file_name_stem}.zarr"
+             kwargs = {"consolidated": False}
+             ds = xr.open_dataset(
+                 filename_or_obj=zarr_path,
+                 engine="zarr",
+                 backend_kwargs={
+                     "storage_options": {
+                         "endpoint_url": endpoint_url,
+                         "anon": True,
+                     },
+                 },
+                 **kwargs,
+             )
+             return ds
          except Exception as err:
-             print("Problem opening Zarr store in S3 as Xarray.")
-             raise err
-         print("Done opening Zarr store in S3 as Xarray.")
-         return ds
+             raise RuntimeError(f"Problem opening Zarr store in S3 as Xarray, {err}")
 
-     ############################################################################
+     ###########################################################################
+     # TODO: can this be consolidated with above
+     @staticmethod
+     def open_l2_zarr_store_with_xarray(
+         ship_name: str,
+         cruise_name: str,
+         sensor_name: str,
+         bucket_name: str,
+         endpoint_url: Optional[str] = None,  # needed for moto testing
+     ) -> xr.Dataset:
+         print("Opening L2 Zarr store in S3 with Xarray.")
+         try:
+             level = str(Constants.LEVEL_2.value)
+             zarr_path = f"s3://{bucket_name}/{level}/{ship_name}/{cruise_name}/{sensor_name}/{cruise_name}.zarr"
+             kwargs = {"consolidated": False}
+             ds = xr.open_dataset(
+                 filename_or_obj=zarr_path,
+                 engine="zarr",
+                 backend_kwargs={
+                     "storage_options": {
+                         "endpoint_url": endpoint_url,
+                         "anon": True,
+                     }
+                 },
+                 **kwargs,
+             )
+             return ds
+         except Exception as err:
+             raise RuntimeError(f"Problem opening Zarr store in S3 as Xarray, {err}")
 
-     #######################################################
+     ###########################################################################
+
+     ###########################################################################
      # def create_process_synchronizer(self):
      #     # TODO: explore aws redis options
      #     pass
 
-     #######################################################
+     ###########################################################################
      # def verify_cruise_store_data(self):
      #     # TODO: run a check on a finished model store to ensure that
      #     #     none of the time, latitude, longitude, or depth values
      #     #     are NaN.
      #     pass
 
-     #######################################################
+     ###########################################################################
 
 
  ###########################################################
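For reference, reading back a cruise-level (level_2) store produced by the new ZarrManager mirrors the open_l2_zarr_store_with_xarray helper added above. A sketch under assumed names: the bucket is hypothetical, while the ship/cruise/sensor path segments follow the convention shown in the diff's comments:

    import xarray as xr

    # Anonymous S3 read of an unconsolidated zarr v3 store, matching the
    # xr.open_dataset(engine="zarr") pattern this version adopts.
    ds = xr.open_dataset(
        filename_or_obj="s3://example-bucket/level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.zarr",
        engine="zarr",
        backend_kwargs={"storage_options": {"anon": True}},
        consolidated=False,
    )
    print(ds["Sv"].shape)  # (depth, time, frequency)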