water-column-sonar-processing 0.0.9__py3-none-any.whl → 26.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. water_column_sonar_processing/aws/dynamodb_manager.py +138 -59
  2. water_column_sonar_processing/aws/s3_manager.py +179 -141
  3. water_column_sonar_processing/aws/s3fs_manager.py +29 -33
  4. water_column_sonar_processing/aws/sqs_manager.py +1 -1
  5. water_column_sonar_processing/cruise/create_empty_zarr_store.py +35 -96
  6. water_column_sonar_processing/cruise/datatree_manager.py +21 -0
  7. water_column_sonar_processing/cruise/resample_regrid.py +142 -127
  8. water_column_sonar_processing/geometry/__init__.py +10 -2
  9. water_column_sonar_processing/geometry/elevation_manager.py +111 -0
  10. water_column_sonar_processing/geometry/geometry_manager.py +50 -49
  11. water_column_sonar_processing/geometry/line_simplification.py +176 -0
  12. water_column_sonar_processing/geometry/pmtile_generation.py +227 -223
  13. water_column_sonar_processing/geometry/spatiotemporal.py +106 -0
  14. water_column_sonar_processing/index/index_manager.py +151 -33
  15. water_column_sonar_processing/model/zarr_manager.py +665 -262
  16. water_column_sonar_processing/processing/__init__.py +3 -3
  17. water_column_sonar_processing/processing/raw_to_netcdf.py +320 -0
  18. water_column_sonar_processing/processing/raw_to_zarr.py +206 -214
  19. water_column_sonar_processing/utility/__init__.py +9 -2
  20. water_column_sonar_processing/utility/constants.py +69 -18
  21. water_column_sonar_processing/utility/pipeline_status.py +11 -15
  22. water_column_sonar_processing/utility/timestamp.py +3 -4
  23. water_column_sonar_processing-26.1.9.dist-info/METADATA +239 -0
  24. water_column_sonar_processing-26.1.9.dist-info/RECORD +34 -0
  25. {water_column_sonar_processing-0.0.9.dist-info → water_column_sonar_processing-26.1.9.dist-info}/WHEEL +1 -1
  26. {water_column_sonar_processing-0.0.9.dist-info → water_column_sonar_processing-26.1.9.dist-info/licenses}/LICENSE +1 -1
  27. water_column_sonar_processing/geometry/geometry_simplification.py +0 -82
  28. water_column_sonar_processing/process.py +0 -147
  29. water_column_sonar_processing/processing/cruise_sampler.py +0 -342
  30. water_column_sonar_processing-0.0.9.dist-info/METADATA +0 -134
  31. water_column_sonar_processing-0.0.9.dist-info/RECORD +0 -32
  32. {water_column_sonar_processing-0.0.9.dist-info → water_column_sonar_processing-26.1.9.dist-info}/top_level.txt +0 -0
@@ -1,276 +1,633 @@
1
1
  import os
2
- import numcodecs
2
+ from importlib import metadata
3
+ from typing import Optional
4
+
3
5
  import numpy as np
4
6
  import xarray as xr
5
7
  import zarr
6
- from numcodecs import Blosc
7
-
8
- from water_column_sonar_processing.aws import S3FSManager
9
- from water_column_sonar_processing.utility import Constants
10
- from water_column_sonar_processing.utility import Timestamp
11
- from water_column_sonar_processing.utility import Coordinates
8
+ from zarr.codecs import BloscCodec, BloscShuffle
9
+ from zarr.core.group import Group
12
10
 
13
- numcodecs.blosc.use_threads = False
14
- numcodecs.blosc.set_nthreads(1)
11
+ from water_column_sonar_processing.utility import Constants, Coordinates, Timestamp
15
12
 
13
+ # https://zarr-specs.readthedocs.io/en/latest/v3/codecs/blosc/index.html
14
+ compressors = BloscCodec(
15
+ cname="zstd",
16
+ clevel=9,
17
+ shuffle=BloscShuffle.bitshuffle,
18
+ )
16
19
 
17
- # TODO: when ready switch to version 3 of model spec
18
- # ZARR_V3_EXPERIMENTAL_API = 1
19
20
 
20
-
21
- # creates the latlon data: foo = ep.consolidate.add_location(ds_Sv, echodata)
21
+ # creates the latlon dataset: foo = ep.consolidate.add_location(ds_Sv, echodata)
22
22
  class ZarrManager:
23
23
  #######################################################
24
24
  def __init__(
25
25
  self,
26
+ # endpoint_url: Optional[str] = None,
26
27
  ):
27
- # TODO: revert to Blosc.BITSHUFFLE, troubleshooting misc error
28
- self.__compressor = Blosc(cname="zstd", clevel=2) # shuffle=Blosc.NOSHUFFLE
29
28
  self.__overwrite = True
30
- self.__num_threads = numcodecs.blosc.get_nthreads()
31
- self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
32
- self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
29
+ self.key = os.environ.get("OUTPUT_BUCKET_ACCESS_KEY")
30
+ self.secret = os.environ.get("OUTPUT_BUCKET_SECRET_ACCESS_KEY")
33
31
 
34
32
  #######################################################
33
+ @staticmethod
35
34
  def get_depth_values(
36
- self,
37
- min_echo_range: float = 1.0, # minimum depth measured (zero non-inclusive) from whole cruise
38
- max_echo_range: float = 100.0, # maximum depth measured from whole cruise
39
- ):
35
+ max_echo_range: float, # maximum depth measured from whole cruise
36
+ cruise_min_epsilon: float = 0.20, # delta subsequent measurements
37
+ ) -> np.ndarray[tuple]:
40
38
  # Gets the set of depth values that will be used when resampling and
41
- # regridding the data to a cruise level model store.
42
- # Note: returned values do not start at zero.
43
- print("Getting depth values.")
44
- all_cruise_depth_values = np.linspace(
45
- start=min_echo_range,
46
- stop=max_echo_range,
47
- num=int(max_echo_range / min_echo_range) + 1,
39
+ # regridding the dataset to a cruise level model store.
40
+ # Note: returned values start at zero!
41
+ # For more info see here: https://echopype.readthedocs.io/en/stable/data-proc-additional.html
42
+ all_cruise_depth_values = np.linspace( # TODO: PROBLEM HERE
43
+ start=0, # start it at zero
44
+ stop=np.ceil(max_echo_range), # round up
45
+ num=int(np.ceil(max_echo_range) / cruise_min_epsilon) + 1,
48
46
  endpoint=True,
49
47
  )
50
48
 
51
- print("Done getting depth values.")
49
+ if np.any(np.isnan(all_cruise_depth_values)):
50
+ raise Exception("Problem depth values returned were NaN.")
51
+
52
52
  return all_cruise_depth_values.round(decimals=2)
53
53
 
54
54
  #######################################################
55
55
  def create_zarr_store(
56
56
  self,
57
- path: str,
57
+ path: str, # 'level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.model/tmp/HB0707.zarr/.zattrs'
58
58
  ship_name: str,
59
59
  cruise_name: str,
60
60
  sensor_name: str,
61
- frequencies: list, # units in Hz
62
- width: int, # TODO: needs better name... "ping_time"
63
- min_echo_range: float, # smallest resolution in meters
61
+ frequencies: list, # units in Hz, type(frequencies) == np.ndarray
62
+ width: int,
64
63
  max_echo_range: float,
65
64
  calibration_status: bool = False, # Assume uncalibrated
66
65
  ) -> str:
67
- print(
68
- f"Creating local zarr_manager store at {cruise_name}.zarr for ship {ship_name}"
69
- )
70
-
71
- # There should be no repeated frequencies
72
- assert len(frequencies) == len(set(frequencies))
73
- # TODO: eventually switch coordinate to "channel"
74
-
75
- print(f"Debugging number of threads: {self.__num_threads}")
76
-
77
- zarr_path = f"{path}/{cruise_name}.zarr"
78
- store = zarr.DirectoryStore(path=zarr_path, normalize_keys=False)
79
- root = zarr.group(store=store, overwrite=self.__overwrite, cache_attrs=True)
80
-
81
- #####################################################################
82
- # --- Coordinate: Time --- #
83
- # https://zarr.readthedocs.io/en/stable/spec/v2.html#data-type-encoding
84
- root.create_dataset(
85
- name=Coordinates.TIME.value,
86
- data=np.repeat(0.0, width),
87
- shape=width,
88
- chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
89
- # Constants.TILE_SIZE.value,
90
- #), # TODO: the chunking scheme doesn't seem to be working here
91
- dtype=np.dtype(Coordinates.TIME_DTYPE.value),
92
- compressor=self.__compressor,
93
- fill_value=np.nan, # TODO: do i want nan's?
94
- overwrite=self.__overwrite,
95
- )
96
-
97
- root.time.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
98
-
99
- root.time.attrs["calendar"] = Coordinates.TIME_CALENDAR.value
100
- root.time.attrs["units"] = Coordinates.TIME_UNITS.value
101
- root.time.attrs["long_name"] = Coordinates.TIME_LONG_NAME.value
102
- root.time.attrs["standard_name"] = Coordinates.TIME_STANDARD_NAME.value
103
-
104
- #####################################################################
105
- # --- Coordinate: Depth --- #
106
- depth_values = self.get_depth_values(
107
- min_echo_range=min_echo_range, max_echo_range=max_echo_range
108
- )
109
-
110
- root.create_dataset(
111
- name=Coordinates.DEPTH.value,
112
- # TODO: verify that these values are correct
113
- data=depth_values,
114
- shape=len(depth_values),
115
- chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
116
- dtype=np.dtype(
117
- Coordinates.DEPTH_DTYPE.value
118
- ), # float16 == 2 significant digits would be ideal
119
- compressor=self.__compressor,
120
- fill_value=np.nan,
121
- overwrite=self.__overwrite,
122
- )
123
- # TODO: change to exception
124
- assert not np.any(np.isnan(depth_values))
125
-
126
- root.depth.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.DEPTH.value]
127
-
128
- root.depth.attrs["long_name"] = Coordinates.DEPTH_LONG_NAME.value
129
- root.depth.attrs["units"] = Coordinates.DEPTH_UNITS.value
130
-
131
- #####################################################################
132
- # --- Coordinate: Latitude --- #
133
- root.create_dataset(
134
- name=Coordinates.LATITUDE.value,
135
- # data=np.repeat(0.0, width),
136
- shape=width,
137
- chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
138
- dtype=np.dtype(Coordinates.LATITUDE_DTYPE.value),
139
- compressor=self.__compressor,
140
- fill_value=np.nan,
141
- overwrite=self.__overwrite,
142
- )
143
-
144
- # Note: LATITUDE is indexed by TIME
145
- root.latitude.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
146
-
147
- root.latitude.attrs["long_name"] = Coordinates.LATITUDE_LONG_NAME.value
148
- root.latitude.attrs["units"] = Coordinates.LATITUDE_UNITS.value
149
-
150
- #####################################################################
151
- # --- Coordinate: Longitude --- #
152
- root.create_dataset(
153
- name=Coordinates.LONGITUDE.value,
154
- # data=np.repeat(0.0, width), # root.longitude[:] = np.nan
155
- shape=width,
156
- chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
157
- dtype=np.dtype(Coordinates.LONGITUDE_DTYPE.value),
158
- compressor=self.__compressor,
159
- fill_value=np.nan,
160
- overwrite=self.__overwrite,
161
- )
162
-
163
- # Note: LONGITUDE is indexed by TIME
164
- root.longitude.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
165
-
166
- root.longitude.attrs["long_name"] = Coordinates.LONGITUDE_LONG_NAME.value
167
- root.longitude.attrs["units"] = Coordinates.LONGITUDE_UNITS.value
168
-
169
- #####################################################################
170
- # TODO: verify adding this variable for where the bottom was detected
171
- # --- Coordinate: Bottom --- #
172
- root.create_dataset(
173
- name=Coordinates.BOTTOM.value,
174
- data=np.repeat(0.0, width), # root.longitude[:] = np.nan
175
- shape=width,
176
- chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
177
- dtype=np.dtype(Coordinates.BOTTOM_DTYPE.value),
178
- compressor=self.__compressor,
179
- fill_value=0.0,
180
- overwrite=self.__overwrite,
181
- )
182
-
183
- # BOTTOM is indexed by TIME
184
- root.bottom.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
185
-
186
- root.bottom.attrs["long_name"] = Coordinates.BOTTOM_LONG_NAME.value
187
- root.bottom.attrs["units"] = Coordinates.BOTTOM_UNITS.value
188
-
189
- #####################################################################
190
- # --- Coordinate: Frequency --- #
191
- root.create_dataset(
192
- name=Coordinates.FREQUENCY.value,
193
- data=frequencies,
194
- shape=len(frequencies),
195
- chunks=len(frequencies),
196
- dtype=np.dtype(Coordinates.FREQUENCY_DTYPE.value),
197
- compressor=self.__compressor,
198
- fill_value=0.0,
199
- overwrite=self.__overwrite,
200
- )
201
-
202
- # TODO: best coordinate would be channel with str type
203
- root.frequency.attrs["_ARRAY_DIMENSIONS"] = [
204
- Coordinates.FREQUENCY.value
205
- ] # TODO: is this correct
206
-
207
- root.frequency.attrs["long_name"] = Coordinates.FREQUENCY_LONG_NAME.value
208
- root.frequency.attrs["standard_name"] = (
209
- Coordinates.FREQUENCY_STANDARD_NAME.value
210
- )
211
- root.frequency.attrs["units"] = Coordinates.FREQUENCY_UNITS.value
212
-
213
- #####################################################################
214
- # --- Sv Data --- #
215
- root.create_dataset(
216
- name=Coordinates.SV.value,
217
- shape=(len(depth_values), width, len(frequencies)),
218
- chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, len(frequencies)),
219
- dtype=np.dtype(
220
- Coordinates.SV_DTYPE.value
221
- ), # TODO: try to experiment with 'float16'
222
- compressor=self.__compressor,
223
- fill_value=np.nan,
224
- overwrite=self.__overwrite,
225
- )
226
-
227
- root.Sv.attrs["_ARRAY_DIMENSIONS"] = [
228
- Coordinates.DEPTH.value,
229
- Coordinates.TIME.value,
230
- Coordinates.FREQUENCY.value,
231
- ]
232
-
233
- root.Sv.attrs["long_name"] = Coordinates.SV_LONG_NAME.value
234
- root.Sv.attrs["units"] = Coordinates.SV_UNITS.value
235
- root.Sv.attrs["tile_size"] = Constants.TILE_SIZE.value
236
-
237
- #####################################################################
238
- # --- Metadata --- #
239
- root.attrs["ship_name"] = ship_name
240
- root.attrs["cruise_name"] = cruise_name
241
- root.attrs["sensor_name"] = sensor_name
242
- #
243
- root.attrs["processing_software_name"] = Coordinates.PROJECT_NAME.value
244
- root.attrs["processing_software_version"] = (
245
- "0.0.6" # TODO: get programmatically
246
- )
247
- root.attrs["processing_software_time"] = Timestamp.get_timestamp()
248
- #
249
- root.attrs["calibration_status"] = calibration_status
250
-
251
- zarr.consolidate_metadata(store)
252
- #####################################################################
253
66
  """
254
- # zzz = zarr.open('https://echofish-dev-master-118234403147-echofish-zarr-store.s3.us-west-2.amazonaws.com/GU1002_resample.zarr')
255
- # zzz.time[0] = 1274979445.423
256
- # Initialize all to origin time, will be overwritten late
67
+ Creates a new zarr store in a local temporary directory(?)
257
68
  """
258
- return zarr_path
69
+ try:
70
+ print(f"Creating local zarr store, {cruise_name}.zarr for ship {ship_name}")
71
+ if len(frequencies) != len(set(frequencies)):
72
+ raise Exception(
73
+ "Number of frequencies does not match number of channels"
74
+ )
75
+
76
+ zarr_path = f"{path}/{cruise_name}.zarr"
77
+ #####################################################################
78
+ frequencies = np.array(
79
+ frequencies, dtype=np.dtype(Coordinates.FREQUENCY_DTYPE.value)
80
+ )
81
+ #####################################################################
82
+ # Define the chunk sizes and the encoding
83
+ depth_chunk_shape = (Constants.TILE_SIZE.value,)
84
+ time_chunk_shape = (Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,)
85
+ frequency_chunk_shape = (len(frequencies),)
86
+ latitude_chunk_shape = (Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,)
87
+ longitude_chunk_shape = (Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,)
88
+ bottom_chunk_shape = (Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,)
89
+ speed_chunk_shape = (Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,)
90
+ distance_chunk_shape = (Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,)
91
+ sv_chunk_shape = (Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, 1)
92
+ #####################################################################
93
+ root = zarr.create_group(store=zarr_path, zarr_format=3, overwrite=True)
94
+ #####################################################################
95
+ # --- Coordinate: Time --- #
96
+ # https://zarr.readthedocs.io/en/stable/spec/v2.html#data-type-encoding
97
+ # "data_type": "int64", "fill_value": 0, "units": "nanoseconds since 1970-01-01", "calendar": "proleptic_gregorian"
98
+ #
99
+ time_values = np.repeat(0.0, width)
100
+ time_values.astype(np.dtype(Coordinates.TIME_DTYPE.value))
101
+ root.create_array(
102
+ name=Coordinates.TIME.value,
103
+ # shape=width_indices,
104
+ # dtype=np.dtype(Coordinates.TIME_DTYPE.value),
105
+ data=time_values,
106
+ chunks=time_chunk_shape,
107
+ compressor=compressors,
108
+ fill_value=np.nan,
109
+ attributes=dict(
110
+ calendar=Coordinates.TIME_CALENDAR.value,
111
+ units=Coordinates.TIME_UNITS.value,
112
+ long_name=Coordinates.TIME_LONG_NAME.value,
113
+ standard_name=Coordinates.TIME_STANDARD_NAME.value,
114
+ ),
115
+ dimension_names=[Coordinates.TIME.value],
116
+ overwrite=True,
117
+ )
118
+ #####################################################################
119
+ #####################################################################
120
+ # # --- Coordinate: Depth --- #
121
+ depth_data_values = self.get_depth_values(
122
+ max_echo_range=max_echo_range,
123
+ )
124
+ depth_data = np.array(
125
+ depth_data_values, dtype=Coordinates.DEPTH_DTYPE.value
126
+ )
127
+ root.create_array(
128
+ name=Coordinates.DEPTH.value,
129
+ # shape=depth_indices,
130
+ # dtype=np.dtype(Coordinates.DEPTH_DTYPE.value),
131
+ data=depth_data,
132
+ chunks=depth_chunk_shape,
133
+ compressor=compressors,
134
+ # fill_value=np.nan,
135
+ attributes=dict(
136
+ units=Coordinates.DEPTH_UNITS.value,
137
+ long_name=Coordinates.DEPTH_LONG_NAME.value,
138
+ standard_name=Coordinates.DEPTH_STANDARD_NAME.value,
139
+ ),
140
+ dimension_names=[Coordinates.DEPTH.value], # TODO: is this right
141
+ overwrite=True,
142
+ )
143
+ # #####################################################################
144
+ # # --- Coordinate: Latitude --- #
145
+ # latitude_values = np.rep(np.nan, width_indices)
146
+ # latitude_values.astype(np.dtype(Coordinates.LATITUDE_DTYPE.value))
147
+ root.create_array(
148
+ name=Coordinates.LATITUDE.value,
149
+ shape=width,
150
+ dtype=np.dtype(Coordinates.LATITUDE_DTYPE.value),
151
+ # data=latitude_values,
152
+ chunks=latitude_chunk_shape,
153
+ compressor=compressors,
154
+ fill_value=np.nan,
155
+ attributes=dict(
156
+ units=Coordinates.LATITUDE_UNITS.value,
157
+ long_name=Coordinates.LATITUDE_LONG_NAME.value,
158
+ standard_name=Coordinates.LATITUDE_STANDARD_NAME.value,
159
+ ),
160
+ dimension_names=[Coordinates.TIME.value],
161
+ overwrite=True,
162
+ )
163
+ # #####################################################################
164
+ # # --- Coordinate: Longitude --- #
165
+ # longitude_values = np.arange(0, width_indices)
166
+ # longitude_values.astype(np.dtype(Coordinates.LONGITUDE_DTYPE.value))
167
+ root.create_array(
168
+ name=Coordinates.LONGITUDE.value,
169
+ shape=width,
170
+ dtype=np.dtype(Coordinates.LONGITUDE_DTYPE.value),
171
+ # data=longitude_values,
172
+ chunks=longitude_chunk_shape,
173
+ compressor=compressors,
174
+ fill_value=np.nan,
175
+ attributes=dict(
176
+ units=Coordinates.LONGITUDE_UNITS.value,
177
+ long_name=Coordinates.LONGITUDE_LONG_NAME.value,
178
+ standard_name=Coordinates.LONGITUDE_STANDARD_NAME.value,
179
+ ),
180
+ dimension_names=[
181
+ Coordinates.TIME.value
182
+ ], # Note: LONGITUDE is indexed by TIME
183
+ overwrite=True,
184
+ )
185
+ # #####################################################################
186
+ # # --- Coordinate: Bottom --- #
187
+ # bottom_values = np.repeat(12.34, width_indices)
188
+ # bottom_values.astype(np.dtype(Coordinates.BOTTOM_DTYPE.value))
189
+ root.create_array(
190
+ name=Coordinates.BOTTOM.value,
191
+ shape=width,
192
+ dtype=np.dtype(Coordinates.BOTTOM_DTYPE.value),
193
+ # data=bottom_values,
194
+ chunks=bottom_chunk_shape,
195
+ compressor=compressors,
196
+ fill_value=np.nan,
197
+ attributes=dict(
198
+ units=Coordinates.BOTTOM_UNITS.value,
199
+ long_name=Coordinates.BOTTOM_LONG_NAME.value,
200
+ standard_name=Coordinates.BOTTOM_STANDARD_NAME.value,
201
+ ),
202
+ dimension_names=[Coordinates.TIME.value], # Note: _ is indexed by TIME
203
+ overwrite=True,
204
+ )
205
+ # #####################################################################
206
+ # # --- Coordinate: Speed --- #
207
+ # speed_values = np.repeat(5.67, width_indices)
208
+ # speed_values.astype(np.dtype(Coordinates.SPEED_DTYPE.value))
209
+ root.create_array(
210
+ name=Coordinates.SPEED.value,
211
+ shape=width,
212
+ dtype=np.dtype(Coordinates.SPEED_DTYPE.value),
213
+ # data=speed_values,
214
+ chunks=speed_chunk_shape,
215
+ compressor=compressors,
216
+ fill_value=np.nan,
217
+ attributes=dict(
218
+ units=Coordinates.SPEED_UNITS.value,
219
+ long_name=Coordinates.SPEED_LONG_NAME.value,
220
+ standard_name=Coordinates.SPEED_STANDARD_NAME.value,
221
+ ),
222
+ dimension_names=[Coordinates.TIME.value], # Note: _ is indexed by TIME
223
+ overwrite=True,
224
+ )
225
+ # #####################################################################
226
+ # # --- Coordinate: Distance --- #
227
+ # distance_values = np.repeat(8.90, width_indices)
228
+ # distance_values.astype(np.dtype(Coordinates.DISTANCE_DTYPE.value))
229
+ root.create_array(
230
+ name=Coordinates.DISTANCE.value,
231
+ shape=width,
232
+ dtype=np.dtype(Coordinates.DISTANCE_DTYPE.value),
233
+ # data=distance_values,
234
+ chunks=distance_chunk_shape,
235
+ compressor=compressors,
236
+ fill_value=np.nan,
237
+ attributes=dict(
238
+ units=Coordinates.DISTANCE_UNITS.value,
239
+ long_name=Coordinates.DISTANCE_LONG_NAME.value,
240
+ standard_name=Coordinates.DISTANCE_STANDARD_NAME.value,
241
+ ),
242
+ dimension_names=[Coordinates.TIME.value], # Note: _ is indexed by TIME
243
+ overwrite=True,
244
+ )
245
+ # #####################################################################
246
+ # # --- Coordinate: Frequency --- #
247
+ root.create_array(
248
+ name=Coordinates.FREQUENCY.value,
249
+ # shape=frequency_indices,
250
+ # dtype=np.dtype(Coordinates.FREQUENCY_DTYPE.value),
251
+ data=frequencies,
252
+ # chunks=(Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,),
253
+ chunks=frequency_chunk_shape,
254
+ compressor=compressors,
255
+ # fill_value=0,
256
+ attributes=dict(
257
+ units=Coordinates.FREQUENCY_UNITS.value,
258
+ long_name=Coordinates.FREQUENCY_LONG_NAME.value,
259
+ standard_name=Coordinates.FREQUENCY_STANDARD_NAME.value,
260
+ ),
261
+ dimension_names=[Coordinates.FREQUENCY.value],
262
+ overwrite=True,
263
+ )
264
+ # #####################################################################
265
+ # # --- Sv Data --- #
266
+ root.create_array(
267
+ name=Coordinates.SV.value,
268
+ shape=(len(depth_data), width, len(frequencies)),
269
+ dtype=np.dtype(Coordinates.SV_DTYPE.value),
270
+ # data=,
271
+ chunks=sv_chunk_shape,
272
+ compressor=compressors,
273
+ fill_value=np.nan,
274
+ attributes=dict(
275
+ units=Coordinates.SV_UNITS.value,
276
+ long_name=Coordinates.SV_LONG_NAME.value,
277
+ standard_name=Coordinates.SV_STANDARD_NAME.value,
278
+ ),
279
+ dimension_names=[
280
+ Coordinates.DEPTH.value,
281
+ Coordinates.TIME.value,
282
+ Coordinates.FREQUENCY.value,
283
+ ],
284
+ overwrite=True,
285
+ )
286
+ #####################################################################
287
+ # # --- Metadata --- #
288
+ root.attrs["ship_name"] = ship_name
289
+ root.attrs["cruise_name"] = cruise_name
290
+ root.attrs["sensor_name"] = sensor_name
291
+ #
292
+ root.attrs["processing_software_name"] = Coordinates.PROJECT_NAME.value
293
+ # NOTE: for the version to be parsable you need to build the python package locally first.
294
+ root.attrs["processing_software_version"] = metadata.version(
295
+ "water-column-sonar-processing"
296
+ )
297
+ root.attrs["processing_software_time"] = Timestamp.get_timestamp()
298
+ #
299
+ root.attrs["calibration_status"] = calibration_status
300
+ root.attrs["tile_size"] = Constants.TILE_SIZE.value
301
+ #
302
+ return zarr_path
303
+ except Exception as err:
304
+ raise RuntimeError(f"Problem trying to create zarr store, {err}")
259
305
 
260
- ############################################################################
261
- # def update_zarr_store(
306
+ # #######################################################
307
+ # def create_zarr_store_old(
262
308
  # self,
263
- # path: str,
309
+ # path: str, # 'level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.model/tmp/HB0707.zarr/.zattrs'
264
310
  # ship_name: str,
265
- # cruise_name: str, # TODO: just pass stem
311
+ # cruise_name: str,
266
312
  # sensor_name: str,
267
- # ) -> None:
313
+ # frequencies: list, # units in Hz
314
+ # width: int,
315
+ # max_echo_range: float,
316
+ # # cruise_min_epsilon: float, # smallest resolution in meters
317
+ # calibration_status: bool = False, # Assume uncalibrated
318
+ # ) -> str:
268
319
  # """
269
- # Opens an existing Zarr store living in a s3 bucket for the purpose
270
- # of updating just a subset of the cruise-level Zarr store associated
271
- # with a file-level Zarr store.
320
+ # Creates a new zarr store in a local temporary directory(?)
272
321
  # """
273
- # pass
322
+ # try:
323
+ # print(f"Creating local zarr store, {cruise_name}.zarr for ship {ship_name}")
324
+ # if len(frequencies) != len(set(frequencies)):
325
+ # raise Exception(
326
+ # "Number of frequencies does not match number of channels"
327
+ # )
328
+ #
329
+ # zarr_path = f"{path}/{cruise_name}.zarr"
330
+ # #####################################################################
331
+ # # Define the chunk sizes and the encoding
332
+ # # 1_000_000 data points for quickest download
333
+ # spatiotemporal_chunk_size = int(1e6)
334
+ # depth_chunk_shape = (512,)
335
+ # time_chunk_shape = (spatiotemporal_chunk_size,)
336
+ # frequency_chunk_shape = (len(frequencies),)
337
+ # latitude_chunk_shape = (spatiotemporal_chunk_size,)
338
+ # longitude_chunk_shape = (spatiotemporal_chunk_size,)
339
+ # bottom_chunk_shape = (spatiotemporal_chunk_size,)
340
+ # speed_chunk_shape = (spatiotemporal_chunk_size,)
341
+ # distance_chunk_shape = (spatiotemporal_chunk_size,)
342
+ # sv_chunk_shape = (512, 512, 1) # TODO: move to constants
343
+ #
344
+ # #####################################################################
345
+ # ##### Depth #####
346
+ # depth_data_values = self.get_depth_values(
347
+ # max_echo_range=max_echo_range,
348
+ # )
349
+ #
350
+ # depth_data = np.array(
351
+ # depth_data_values, dtype=Coordinates.DEPTH_DTYPE.value
352
+ # )
353
+ # depth_da = xr.DataArray(
354
+ # data=depth_data,
355
+ # dims=Coordinates.DEPTH.value,
356
+ # name=Coordinates.DEPTH.value,
357
+ # attrs=dict(
358
+ # units=Coordinates.DEPTH_UNITS.value,
359
+ # long_name=Coordinates.DEPTH_LONG_NAME.value,
360
+ # standard_name=Coordinates.DEPTH_STANDARD_NAME.value,
361
+ # ),
362
+ # )
363
+ #
364
+ # ##### Time #####
365
+ # # https://zarr.readthedocs.io/en/stable/spec/v2.html#data-type-encoding
366
+ # time_data = np.array(
367
+ # np.repeat(np.datetime64(0, "ns"), width),
368
+ # dtype="datetime64[ns]",
369
+ # )
370
+ # time_da = xr.DataArray(
371
+ # data=time_data,
372
+ # dims=Coordinates.TIME.value,
373
+ # name=Coordinates.TIME.value,
374
+ # attrs=dict(
375
+ # # Note: cal & units are written automatically by xarray
376
+ # # calendar="proleptic_gregorian",
377
+ # # units="seconds since 1970-01-01 00:00:00",
378
+ # long_name=Coordinates.TIME_LONG_NAME.value,
379
+ # standard_name=Coordinates.TIME_STANDARD_NAME.value,
380
+ # ),
381
+ # )
382
+ #
383
+ # ##### Frequency #####
384
+ # frequency_data = np.array(
385
+ # frequencies,
386
+ # dtype=np.dtype(Coordinates.FREQUENCY_DTYPE.value),
387
+ # )
388
+ # frequency_da = xr.DataArray(
389
+ # data=frequency_data,
390
+ # dims=Coordinates.FREQUENCY.value,
391
+ # name=Coordinates.FREQUENCY.value,
392
+ # attrs=dict(
393
+ # units=Coordinates.FREQUENCY_UNITS.value,
394
+ # long_name=Coordinates.FREQUENCY_LONG_NAME.value,
395
+ # standard_name=Coordinates.FREQUENCY_STANDARD_NAME.value,
396
+ # ),
397
+ # )
398
+ #
399
+ # ##### Latitude #####
400
+ # gps_data = np.array(
401
+ # np.repeat(np.nan, width),
402
+ # dtype=np.dtype(Coordinates.LATITUDE_DTYPE.value),
403
+ # )
404
+ # latitude_da = xr.DataArray(
405
+ # data=gps_data,
406
+ # coords=dict(
407
+ # time=time_da,
408
+ # ),
409
+ # dims=Coordinates.TIME.value, # Note: "TIME"
410
+ # name=Coordinates.LATITUDE.value,
411
+ # attrs=dict(
412
+ # units=Coordinates.LATITUDE_UNITS.value,
413
+ # long_name=Coordinates.LATITUDE_LONG_NAME.value,
414
+ # standard_name=Coordinates.LATITUDE_STANDARD_NAME.value,
415
+ # ),
416
+ # ) # Note: LATITUDE is indexed by TIME
417
+ #
418
+ # ##### Longitude #####
419
+ # longitude_da = xr.DataArray(
420
+ # data=gps_data,
421
+ # coords=dict(
422
+ # time=time_da,
423
+ # ),
424
+ # dims=Coordinates.TIME.value, # Note: "TIME"
425
+ # name=Coordinates.LONGITUDE.value,
426
+ # attrs=dict(
427
+ # units=Coordinates.LONGITUDE_UNITS.value,
428
+ # long_name=Coordinates.LONGITUDE_LONG_NAME.value,
429
+ # standard_name=Coordinates.LONGITUDE_STANDARD_NAME.value,
430
+ # ),
431
+ # ) # Note: LONGITUDE is indexed by TIME
432
+ #
433
+ # ##### Bottom #####
434
+ # bottom_data = np.array(
435
+ # np.repeat(np.nan, width), dtype=np.dtype(Coordinates.BOTTOM_DTYPE.value)
436
+ # )
437
+ # bottom_da = xr.DataArray(
438
+ # data=bottom_data,
439
+ # coords=dict(
440
+ # time=time_da,
441
+ # ),
442
+ # dims=Coordinates.TIME.value, # Note: "TIME"
443
+ # name=Coordinates.BOTTOM.value,
444
+ # attrs=dict(
445
+ # units=Coordinates.BOTTOM_UNITS.value,
446
+ # long_name=Coordinates.BOTTOM_LONG_NAME.value,
447
+ # standard_name=Coordinates.BOTTOM_STANDARD_NAME.value,
448
+ # ),
449
+ # )
450
+ #
451
+ # ##### Speed #####
452
+ # speed_data = np.array(
453
+ # np.repeat(np.nan, width), dtype=np.dtype(Coordinates.SPEED_DTYPE.value)
454
+ # )
455
+ # speed_da = xr.DataArray(
456
+ # data=speed_data,
457
+ # coords=dict(
458
+ # time=time_da,
459
+ # ),
460
+ # dims=Coordinates.TIME.value, # Note: "TIME"
461
+ # name=Coordinates.SPEED.value,
462
+ # attrs=dict(
463
+ # units=Coordinates.SPEED_UNITS.value,
464
+ # long_name=Coordinates.SPEED_LONG_NAME.value,
465
+ # standard_name=Coordinates.SPEED_STANDARD_NAME.value,
466
+ # ),
467
+ # )
468
+ #
469
+ # ##### Distance #####
470
+ # distance_data = np.array(
471
+ # np.repeat(np.nan, width),
472
+ # dtype=np.dtype(Coordinates.DISTANCE_DTYPE.value),
473
+ # )
474
+ # distance_da = xr.DataArray(
475
+ # data=distance_data,
476
+ # coords=dict(
477
+ # time=time_da,
478
+ # ),
479
+ # dims=Coordinates.TIME.value, # Note: "TIME"
480
+ # name=Coordinates.DISTANCE.value,
481
+ # attrs=dict(
482
+ # units=Coordinates.DISTANCE_UNITS.value,
483
+ # long_name=Coordinates.DISTANCE_LONG_NAME.value,
484
+ # standard_name=Coordinates.DISTANCE_STANDARD_NAME.value,
485
+ # ),
486
+ # )
487
+ #
488
+ # ##### Sv #####
489
+ # gc.collect()
490
+ # # sv_data = np.empty(
491
+ # # (len(depth_data), width, len(frequencies)),
492
+ # # # (2501, 4_100_782, 4), # large cruise used for testing
493
+ # # dtype=np.dtype(Coordinates.SV_DTYPE.value),
494
+ # # )
495
+ # sv_data = np.full(
496
+ # (len(depth_data), width, len(frequencies)),
497
+ # np.nan,
498
+ # dtype=np.dtype(Coordinates.SV_DTYPE.value),
499
+ # )
500
+ # print(f"one: {sys.getsizeof(sv_data)}")
501
+ # # sv_data[:] = np.nan # initialize all
502
+ #
503
+ # sv_da = xr.DataArray(
504
+ # data=sv_data,
505
+ # coords=dict(
506
+ # depth=depth_da,
507
+ # time=time_da,
508
+ # frequency=frequency_da,
509
+ # #
510
+ # latitude=latitude_da,
511
+ # longitude=longitude_da,
512
+ # bottom=bottom_da,
513
+ # speed=speed_da,
514
+ # distance=distance_da,
515
+ # ),
516
+ # dims=( # Depth * Time * Frequency
517
+ # Coordinates.DEPTH.value,
518
+ # Coordinates.TIME.value,
519
+ # Coordinates.FREQUENCY.value,
520
+ # ),
521
+ # name=Coordinates.SV.value,
522
+ # attrs=dict(
523
+ # units=Coordinates.SV_UNITS.value,
524
+ # long_name=Coordinates.SV_LONG_NAME.value,
525
+ # standard_name=Coordinates.SV_STANDARD_NAME.value,
526
+ # tiles_size=Constants.TILE_SIZE.value,
527
+ # _FillValue=np.nan,
528
+ # ),
529
+ # )
530
+ # print(f"two: {sys.getsizeof(sv_data)}") # getting to at least here
531
+ # del sv_data
532
+ # sv_da.encoding = {"compressors": [compressor], "chunks": sv_chunk_shape}
533
+ # # sv_da = sv_da.astype(np.float32) # was crashing here
534
+ # gc.collect()
535
+ # #####################################################################
536
+ # ### Now create the xarray.Dataset
537
+ # ds = xr.Dataset(
538
+ # data_vars=dict(
539
+ # Sv=sv_da,
540
+ # #
541
+ # bottom=bottom_da,
542
+ # speed=speed_da,
543
+ # distance=distance_da,
544
+ # ),
545
+ # coords=dict(
546
+ # depth=depth_da,
547
+ # time=time_da,
548
+ # frequency=frequency_da,
549
+ # #
550
+ # latitude=latitude_da,
551
+ # longitude=longitude_da,
552
+ # ),
553
+ # attrs=dict(
554
+ # # --- Metadata --- #
555
+ # ship_name=ship_name,
556
+ # cruise_name=cruise_name,
557
+ # sensor_name=sensor_name,
558
+ # processing_software_name=Coordinates.PROJECT_NAME.value,
559
+ # # NOTE: for the version to be parsable you need to build the python package
560
+ # # locally first.
561
+ # processing_software_version=importlib.metadata.version(
562
+ # "water-column-sonar-processing"
563
+ # ),
564
+ # processing_software_time=Timestamp.get_timestamp(),
565
+ # calibration_status=calibration_status,
566
+ # tile_size=Constants.TILE_SIZE.value,
567
+ # ),
568
+ # )
569
+ # del sv_da
570
+ # gc.collect()
571
+ # print(f"three: {sys.getsizeof(ds)}")
572
+ # #####################################################################
573
+ # encodings = dict(
574
+ # depth={
575
+ # "compressors": [compressor],
576
+ # "chunks": depth_chunk_shape,
577
+ # },
578
+ # time={
579
+ # "compressors": [compressor],
580
+ # "chunks": time_chunk_shape,
581
+ # "units": Coordinates.TIME_UNITS.value,
582
+ # },
583
+ # frequency={
584
+ # "compressors": [compressor],
585
+ # "chunks": frequency_chunk_shape,
586
+ # },
587
+ # latitude={
588
+ # "compressors": [compressor],
589
+ # "chunks": latitude_chunk_shape,
590
+ # },
591
+ # longitude={
592
+ # "compressors": [compressor],
593
+ # "chunks": longitude_chunk_shape,
594
+ # },
595
+ # bottom={
596
+ # "compressors": [compressor],
597
+ # "chunks": bottom_chunk_shape,
598
+ # },
599
+ # speed={
600
+ # "compressors": [compressor],
601
+ # "chunks": speed_chunk_shape,
602
+ # },
603
+ # distance={
604
+ # "compressors": [compressor],
605
+ # "chunks": distance_chunk_shape,
606
+ # },
607
+ # Sv={
608
+ # "compressors": [compressor],
609
+ # "chunks": sv_chunk_shape,
610
+ # },
611
+ # )
612
+ # gc.collect()
613
+ # ds.to_zarr(
614
+ # store=zarr_path,
615
+ # mode="w", # “w” means create (overwrite if exists)
616
+ # encoding=encodings,
617
+ # consolidated=False,
618
+ # safe_chunks=False,
619
+ # align_chunks=True,
620
+ # zarr_format=3,
621
+ # write_empty_chunks=False, # Might need to change this
622
+ # )
623
+ # #####################################################################
624
+ # return zarr_path
625
+ # except Exception as err:
626
+ # raise RuntimeError(f"Problem trying to create zarr store, {err}")
627
+ # # finally:
628
+ # # cleaner = Cleaner()
629
+ # # cleaner.delete_local_files()
630
+ # # TODO: should delete zarr store in temp directory too?
274
631
 
275
632
  ############################################################################
276
633
  def open_s3_zarr_store_with_zarr(
@@ -278,60 +635,106 @@ class ZarrManager:
278
635
  ship_name: str,
279
636
  cruise_name: str,
280
637
  sensor_name: str,
281
- # zarr_synchronizer: Union[str, None] = None,
282
- ):
638
+ output_bucket_name: str,
639
+ endpoint_url: Optional[str] = None,
640
+ ) -> Group:
283
641
  # Mounts a Zarr store using pythons Zarr implementation. The mounted store
284
642
  # will have read/write privileges so that store can be updated.
285
- print("Opening Zarr store with Zarr.")
643
+ print("Opening L2 Zarr store with Zarr for writing.")
286
644
  try:
287
- s3fs_manager = S3FSManager()
288
- root = f"{self.output_bucket_name}/level_2/{ship_name}/{cruise_name}/{sensor_name}/{cruise_name}.zarr"
289
- store = s3fs_manager.s3_map(s3_zarr_store_path=root)
290
- # synchronizer = model.ProcessSynchronizer(f"/tmp/{ship_name}_{cruise_name}.sync")
291
- cruise_zarr = zarr.open(store=store, mode="r+")
645
+ level = str(Constants.LEVEL_2.value)
646
+ store = f"s3://{output_bucket_name}/{level}/{ship_name}/{cruise_name}/{sensor_name}/{cruise_name}.zarr"
647
+ print(f"endpoint url: {endpoint_url}")
648
+ cruise_zarr = zarr.open(
649
+ store=store,
650
+ mode="r+",
651
+ zarr_format=3,
652
+ storage_options={
653
+ "endpoint_url": endpoint_url,
654
+ "key": self.key,
655
+ "secret": self.secret,
656
+ },
657
+ )
658
+ print("Done opening store with Zarr.")
659
+ return cruise_zarr
292
660
  except Exception as err: # Failure
293
- print(f"Exception encountered opening Zarr store with Zarr.: {err}")
294
- raise
295
- print("Done opening Zarr store with Zarr.")
296
- return cruise_zarr
661
+ raise RuntimeError(f"Exception encountered opening store with Zarr, {err}")
297
662
 
298
- ############################################################################
663
+ ###########################################################################
664
@staticmethod
def open_s3_zarr_store_with_xarray(
    ship_name: str,
    cruise_name: str,
    sensor_name: str,
    file_name_stem: str,
    bucket_name: str,
    # level: str, # TODO: add level
    endpoint_url: Optional[str] = None,  # needed for moto testing
) -> xr.Dataset:
    """Open a file-level (L1) Zarr store in S3 as an xarray Dataset.

    The store path is
    ``s3://<bucket_name>/level_1/<ship_name>/<cruise_name>/<sensor_name>/<file_name_stem>.zarr``.
    It is opened with ``anon=True`` and without consolidated metadata.

    Args:
        ship_name: Ship component of the store path.
        cruise_name: Cruise component of the store path.
        sensor_name: Sensor component of the store path.
        file_name_stem: Store name without the ``.zarr`` suffix.
        bucket_name: Bucket that holds the store.
        endpoint_url: Optional S3 endpoint override (needed for moto
            testing).

    Returns:
        The dataset returned by ``xr.open_dataset``.

    Raises:
        RuntimeError: If the store cannot be opened. The original
            exception is attached as ``__cause__``.
    """
    print("Opening L1 Zarr store in S3 with Xarray.")
    try:
        zarr_path = f"s3://{bucket_name}/level_1/{ship_name}/{cruise_name}/{sensor_name}/{file_name_stem}.zarr"
        return xr.open_dataset(
            filename_or_obj=zarr_path,
            engine="zarr",
            backend_kwargs={
                "storage_options": {
                    "endpoint_url": endpoint_url,
                    "anon": True,
                },
            },
            consolidated=False,
        )
    except Exception as err:
        # Chain explicitly so the root cause stays visible in tracebacks.
        raise RuntimeError(
            f"Problem opening Zarr store in S3 as Xarray, {err}"
        ) from err
319
692
 
320
- ############################################################################
693
+ ###########################################################################
694
+ # TODO: can this be consolidated with above
695
@staticmethod
def open_l2_zarr_store_with_xarray(
    ship_name: str,
    cruise_name: str,
    sensor_name: str,
    bucket_name: str,
    endpoint_url: Optional[str] = None,  # needed for moto testing
) -> xr.Dataset:
    """Open the cruise-level (L2) Zarr store in S3 as an xarray Dataset.

    The store path is
    ``s3://<bucket_name>/<level>/<ship_name>/<cruise_name>/<sensor_name>/<cruise_name>.zarr``
    where ``<level>`` is ``str(Constants.LEVEL_2.value)``. It is opened with
    ``anon=True`` and without consolidated metadata.

    Args:
        ship_name: Ship component of the store path.
        cruise_name: Cruise component of the store path; also used as the
            ``<cruise_name>.zarr`` store name.
        sensor_name: Sensor component of the store path.
        bucket_name: Bucket that holds the store.
        endpoint_url: Optional S3 endpoint override (needed for moto
            testing).

    Returns:
        The dataset returned by ``xr.open_dataset``.

    Raises:
        RuntimeError: If the store cannot be opened. The original
            exception is attached as ``__cause__``.
    """
    print("Opening L2 Zarr store in S3 with Xarray.")
    try:
        level = str(Constants.LEVEL_2.value)
        zarr_path = f"s3://{bucket_name}/{level}/{ship_name}/{cruise_name}/{sensor_name}/{cruise_name}.zarr"
        return xr.open_dataset(
            filename_or_obj=zarr_path,
            engine="zarr",
            backend_kwargs={
                "storage_options": {
                    "endpoint_url": endpoint_url,
                    "anon": True,
                },
            },
            consolidated=False,
        )
    except Exception as err:
        # Chain explicitly so the root cause stays visible in tracebacks.
        raise RuntimeError(
            f"Problem opening Zarr store in S3 as Xarray, {err}"
        ) from err
321
722
 
322
- #######################################################
723
+ ###########################################################################
724
+
725
+ ###########################################################################
323
726
  # def create_process_synchronizer(self):
324
727
  # # TODO: explore aws redis options
325
728
  # pass
326
729
 
327
- #######################################################
730
+ ###########################################################################
328
731
  # def verify_cruise_store_data(self):
329
732
  # # TODO: run a check on a finished model store to ensure that
330
733
  # # none of the time, latitude, longitude, or depth values
331
734
  # # are NaN.
332
735
  # pass
333
736
 
334
- #######################################################
737
+ ###########################################################################
335
738
 
336
739
 
337
740
  ###########################################################