water-column-sonar-processing 0.0.11__py3-none-any.whl → 0.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of water-column-sonar-processing has been flagged as possibly problematic; see the package's registry page for details.

water_column_sonar_processing/aws/dynamodb_manager.py

@@ -121,7 +121,7 @@ class DynamoDBManager:
  UpdateExpression=update_expression,
  )
  status_code = response["ResponseMetadata"]["HTTPStatusCode"]
- print(f"HTTPStatusCode: {status_code}")
+ # print(f"HTTPStatusCode: {status_code}")
  # assert status_code == 200, "Problem, unable to update dynamodb table."
  # assert response['ConsumedCapacity']['TableName'] == table_name
  except Exception as err:

water_column_sonar_processing/aws/s3_manager.py

@@ -1,6 +1,7 @@
  import json
  import os
  import boto3
+ from typing import Optional
  from collections.abc import Generator
  from concurrent.futures import ThreadPoolExecutor, as_completed

@@ -29,9 +30,11 @@ class S3Manager:
  #####################################################################
  def __init__(
  self,
+ endpoint_url: Optional[str] = None,
  ):
- self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
- self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
+ self.endpoint_url = endpoint_url
+ # self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
+ # self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
  self.s3_region = os.environ.get("AWS_REGION", default="us-east-1")
  self.s3_client_config = Config(max_pool_connections=MAX_POOL_CONNECTIONS)
  self.s3_transfer_config = TransferConfig(
@@ -49,6 +52,7 @@ class S3Manager:
  service_name="s3",
  config=self.s3_client_config,
  region_name=self.s3_region,
+ endpoint_url=self.endpoint_url,
  )
  self.s3_resource = boto3.resource(
  service_name="s3",
@@ -64,11 +68,13 @@ class S3Manager:
  service_name="s3",
  config=self.s3_client_config,
  region_name=self.s3_region,
+ endpoint_url=self.endpoint_url,
  )
  self.s3_resource_noaa_wcsd_zarr_pds = self.s3_session_noaa_wcsd_zarr_pds.resource(
  service_name="s3",
  config=self.s3_client_config,
  region_name=self.s3_region,
+ endpoint_url=self.endpoint_url,
  )
  self.paginator = self.s3_client.get_paginator('list_objects_v2')
  self.paginator_noaa_wcsd_zarr_pds = self.s3_client_noaa_wcsd_zarr_pds.get_paginator('list_objects_v2')
@@ -85,13 +91,31 @@ class S3Manager:
  self,
  bucket_name: str,
  ):
- self.s3_client.create_bucket(
+ """
+ Note: this function is only really meant to be used for creating test
+ buckets. It allows public read of all objects.
+ """
+ # https://github.com/aodn/aodn_cloud_optimised/blob/e5035495e782783cc8b9e58711d63ed466420350/test_aodn_cloud_optimised/test_schema.py#L7
+ public_policy = {
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Effect": "Allow",
+ "Principal": "*",
+ "Action": "s3:GetObject",
+ "Resource": f"arn:aws:s3:::{bucket_name}/*",
+ }
+ ],
+ }
+ response1 = self.s3_client.create_bucket(
  Bucket=bucket_name,
- # Required when region is different then us-east-1
- #
- # TODO: if region is us-east-1, don't include this line somehow
- # CreateBucketConfiguration={'LocationConstraint': self.__s3_region}
+ ACL='public-read'
  )
+ print(response1)
+ # response = self.s3_client.put_bucket_policy(
+ # Bucket=bucket_name, Policy=json.dumps(public_policy)
+ # )
+ # print(response)

  #####################################################################
  def list_buckets(self):
@@ -156,6 +180,7 @@ class S3Manager:
  self,
  local_directory,
  remote_directory,
+ output_bucket_name,
  ):
  # Right now this is just for uploading a model store to s3
  print("Uploading files to output bucket.")
@@ -173,7 +198,7 @@ class S3Manager:
  all_files.append([local_path, s3_key])

  all_uploads = self.upload_files_with_thread_pool_executor(
- output_bucket_name=self.output_bucket_name,
+ output_bucket_name=output_bucket_name,
  all_files=all_files,
  )
  print("Done uploading files to output bucket.")
@@ -228,8 +253,8 @@ class S3Manager:
  # ):
  # # Returns a list of key strings for each object in bucket defined by prefix
  # keys = []
- # page_iterator = self.paginator_noaa_wcsd_zarr_pds.paginate(Bucket=self.output_bucket_name, Prefix=prefix):
- # for page in paginator.paginate(Bucket=self.output_bucket_name, Prefix=prefix):
+ # page_iterator = self.paginator_noaa_wcsd_zarr_pds.paginate(Bucket=output_bucket_name, Prefix=prefix):
+ # for page in paginator.paginate(Bucket=output_bucket_name, Prefix=prefix):
  # if "Contents" in page.keys():
  # keys.extend([k["Key"] for k in page["Contents"]])
  # return keys
@@ -371,7 +396,6 @@ class S3Manager:
  print(f"Problem was encountered while deleting objects: {err}")

  #####################################################################
- # not used TODO: remove
  def put(self, bucket_name, key, body): # noaa-wcsd-model-pds
  self.s3_client.put_object(Bucket=bucket_name, Key=key, Body=body) # "Body" can be a file

@@ -382,10 +406,12 @@ class S3Manager:
  cruise_name,
  sensor_name,
  file_name_stem,
+ output_bucket_name,
  ) -> str:
  try:
- content_object = self.s3_resource_noaa_wcsd_zarr_pds.Object(
- bucket_name=self.output_bucket_name,
+ resource = self.s3_resource_noaa_wcsd_zarr_pds
+ content_object = resource.Object(
+ bucket_name=output_bucket_name,
  key=f"spatial/geojson/{ship_name}/{cruise_name}/{sensor_name}/{file_name_stem}.json",
  ).get()
  file_content = content_object["Body"].read().decode("utf-8")
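
The changes above rework S3Manager so that bucket names are passed per call instead of being read from the INPUT_BUCKET_NAME / OUTPUT_BUCKET_NAME environment variables, and the constructor gains an optional endpoint_url that lets tests point the clients at a local S3 stand-in (for example a moto server). A minimal sketch of the reworked calls, not taken from the package itself; the endpoint URL, bucket name, and keys below are hypothetical:

```python
# Sketch only: endpoint/bucket/key values are made up for illustration.
from water_column_sonar_processing.aws.s3_manager import S3Manager

# Point the boto3 clients at a local S3 stand-in; omit endpoint_url for real AWS.
s3_manager = S3Manager(endpoint_url="http://127.0.0.1:5000")

# Object existence checks take an explicit bucket name.
found = s3_manager.check_if_object_exists(
    bucket_name="example-output-bucket",
    key_name="spatial/geojson/Ship_Name/CRUISE/EK60/FILE.json",
)
print(found)

# Uploads now take the destination bucket explicitly as well;
# 'all_files' is a list of [local_path, s3_key] pairs.
s3_manager.upload_files_with_thread_pool_executor(
    output_bucket_name="example-output-bucket",
    all_files=[["/tmp/example.txt", "level_1/example.txt"]],
)
```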

water_column_sonar_processing/aws/s3fs_manager.py

@@ -1,5 +1,5 @@
  import os
-
+ from typing import Optional
  import s3fs

  # TODO: S3FS_LOGGING_LEVEL=DEBUG
@@ -9,37 +9,25 @@ class S3FSManager:
  #####################################################################
  def __init__(
  self,
+ endpoint_url: Optional[str] = None,
  ):
- self.__s3_region = os.environ.get("AWS_REGION", default="us-east-1")
+ self.endpoint_url = endpoint_url
+ # self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
+ # self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
+ self.s3_region = os.environ.get("AWS_REGION", default="us-east-1")
  self.s3fs = s3fs.S3FileSystem(
+ endpoint_url=endpoint_url,
  key=os.environ.get("OUTPUT_BUCKET_ACCESS_KEY"),
  secret=os.environ.get("OUTPUT_BUCKET_SECRET_ACCESS_KEY"),
- # asynchronous=True
- # use_ssl=False,
- # skip_instance_cache=True,
- # default_block_size='100MB', # if no specific value is given at all time. The built-in default is 5MB
- # client_kwargs={
- # "region_name": self.__s3_region
- # }
  )

- #####################################################################
- def add_file(self, filename):
- full_path = f"{os.getenv('OUTPUT_BUCKET_NAME')}/testing/{filename}"
- print(full_path)
-
- self.s3fs.touch(full_path)
- ff = self.s3fs.ls(f"{os.getenv('OUTPUT_BUCKET_NAME')}/")
-
- print(ff)
-
- #####################################################################
- def upload_data(self, bucket_name, file_path, prefix):
- # TODO: this works in theory but use boto3 to upload files
- s3_path = f"s3://{bucket_name}/{prefix}/"
- s3_file_system = self.s3fs
- s3_file_system.put(file_path, s3_path, recursive=True)
-
+ # s3_fs = s3fs.S3FileSystem( # TODO: use s3fs_manager?
+ # anon=True,
+ # client_kwargs={
+ # "endpoint_url": moto_server,
+ # "region_name": "us-east-1",
+ # },
+ # )
  #####################################################################
  def s3_map(
  self,
@@ -49,20 +37,39 @@ class S3FSManager:
  # create=False, not false because will be writing
  # return s3fs.S3Map(root=s3_zarr_store_path, s3=self.s3fs, check=True)
  return s3fs.S3Map(
- root=s3_zarr_store_path, s3=self.s3fs
+ root=s3_zarr_store_path,
+ s3=self.s3fs
  ) # create=False, not false because will be writing

+ #####################################################################
+ # def add_file(self, filename):
+ # full_path = f"{os.getenv('OUTPUT_BUCKET_NAME')}/testing/{filename}"
+ # print(full_path)
+ #
+ # self.s3fs.touch(full_path)
+ # ff = self.s3fs.ls(f"{os.getenv('OUTPUT_BUCKET_NAME')}/")
+ #
+ # print(ff)
+
+ #####################################################################
+ def upload_data(
+ self,
+ bucket_name,
+ file_path,
+ prefix
+ ):
+ # TODO: this works in theory but use boto3 to upload files
+ s3_path = f"s3://{bucket_name}/{prefix}/"
+ s3_file_system = self.s3fs
+ s3_file_system.put(file_path, s3_path, recursive=True)
+
  #####################################################################
  def exists(
  self,
- geo_json_s3_path,
+ s3_path,
  ):
- s3_file_system = self.s3fs
- return s3_file_system.exists(path=geo_json_s3_path)
+ # s3_file_system =
+ return self.s3fs.exists(s3_path)
+

  #####################################################################
- # def put(
- # self
- # ):
- # s3_file_system = self.s3fs
- # return
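
S3FSManager follows the same pattern: it now takes an optional endpoint_url, upload_data remains available as a method, and exists() accepts a generic s3_path rather than geo_json_s3_path. A minimal sketch, not from the package; the endpoint and store paths are hypothetical, and credentials are still read from OUTPUT_BUCKET_ACCESS_KEY / OUTPUT_BUCKET_SECRET_ACCESS_KEY as shown in the constructor above:

```python
# Sketch only: endpoint and store paths are made up for illustration.
from water_column_sonar_processing.aws.s3fs_manager import S3FSManager

s3fs_manager = S3FSManager(endpoint_url="http://127.0.0.1:5000")  # omit for real AWS

# Mount a Zarr store as a mutable mapping usable by zarr/xarray.
store = s3fs_manager.s3_map(
    s3_zarr_store_path="example-bucket/level_2/Ship_Name/CRUISE/EK60/CRUISE.zarr"
)

# Generic existence check (previously limited to GeoJSON paths).
print(s3fs_manager.exists(s3_path="example-bucket/level_2/Ship_Name/CRUISE/EK60/CRUISE.zarr/.zattrs"))
```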

water_column_sonar_processing/cruise/create_empty_zarr_store.py

@@ -24,14 +24,14 @@ class CreateEmptyZarrStore:
  self,
  ):
  self.__overwrite = True
- self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
- self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
+ # self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
+ # self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")

  #######################################################
-
  # TODO: move this to the s3_manager
  def upload_zarr_store_to_s3(
  self,
+ output_bucket_name: str,
  local_directory: str,
  object_prefix: str,
  cruise_name: str,
@@ -43,24 +43,28 @@ class CreateEmptyZarrStore:
  # # 'all_files' is passed a list of lists: [[local_path, s3_key], [...], ...]
  all_files = []
  for subdir, dirs, files in os.walk(
- f"{local_directory}/{cruise_name}.zarr_manager"
+ f"{local_directory}/{cruise_name}.zarr"
  ):
  for file in files:
  local_path = os.path.join(subdir, file)
- # 'level_2/Henry_B._Bigelow/HB0806/EK60/HB0806.model/..zattrs'
- s3_key = f'{object_prefix}/{cruise_name}.model{local_path.split(f"{cruise_name}.model")[-1]}'
+ # TODO: find a better method for splitting strings here:
+ # 'level_2/Henry_B._Bigelow/HB0806/EK60/HB0806.zarr/.zattrs'
+ s3_key = f'{object_prefix}/{cruise_name}.zarr{local_path.split(f"{cruise_name}.zarr")[-1]}'
  all_files.append([local_path, s3_key])
  #
  # print(all_files)
  s3_manager.upload_files_with_thread_pool_executor(
+ output_bucket_name=output_bucket_name,
  all_files=all_files,
  )
  print("Done uploading with thread pool executor.")
  # TODO: move to common place

  #######################################################
+ # @classmethod
  def create_cruise_level_zarr_store(
  self,
+ output_bucket_name: str,
  ship_name: str,
  cruise_name: str,
  sensor_name: str,
@@ -116,17 +120,18 @@ class CreateEmptyZarrStore:
  new_width = int(consolidated_zarr_width)
  print(f"new_width: {new_width}")
  #################################################################
- store_name = f"{cruise_name}.model"
+ store_name = f"{cruise_name}.zarr"
  print(store_name)
  ################################################################
  # Delete existing model store if it exists
  zarr_prefix = os.path.join("level_2", ship_name, cruise_name, sensor_name)
  child_objects = s3_manager.get_child_objects(
- bucket_name=self.output_bucket_name,
+ bucket_name=output_bucket_name,
  sub_prefix=zarr_prefix,
  )
  if len(child_objects) > 0:
  s3_manager.delete_nodd_objects(
+ bucket_name=output_bucket_name,
  objects=child_objects,
  )
  ################################################################
@@ -153,6 +158,7 @@ class CreateEmptyZarrStore:
  )
  #################################################################
  self.upload_zarr_store_to_s3(
+ output_bucket_name=output_bucket_name,
  local_directory=tempdir,
  object_prefix=zarr_prefix,
  cruise_name=cruise_name,
@@ -174,6 +180,7 @@ class CreateEmptyZarrStore:
  #################################################################
  # Success
  # TODO: update enum in dynamodb
+ print("Done creating cruise level zarr store.")
  #################################################################
  except Exception as err:
  print(f"Problem trying to create new cruise model store: {err}")

water_column_sonar_processing/cruise/experiment_datatree.py

@@ -0,0 +1,13 @@
+ from datatree import DataTree
+
+ ds1 = xr.Dataset({"foo": "orange"})
+
+ dt = DataTree(name="root", data=ds1) # create root node
+
+ dt
+ Out[4]:
+ DataTree('root', parent=None)
+ Dimensions: ()
+ Data variables:
+ foo <U6 24B 'orange'
+
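
The new cruise/experiment_datatree.py above is a scratch snippet: it pastes interpreter output inline and uses xr without importing it. A self-contained version of the same experiment (a sketch, not the shipped file), with the missing import added and the pasted REPL output moved into a comment; it assumes xarray and the datatree package are installed:

```python
import xarray as xr
from datatree import DataTree

ds1 = xr.Dataset({"foo": "orange"})
dt = DataTree(name="root", data=ds1)  # create root node
print(dt)
# Expected output (roughly):
# DataTree('root', parent=None)
#     Dimensions:  ()
#     Data variables:
#         foo      <U6 24B 'orange'
```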

water_column_sonar_processing/cruise/resample_regrid.py

@@ -26,8 +26,8 @@ class ResampleRegrid:
  self,
  ):
  self.__overwrite = True
- self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
- self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
+ # self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
+ # self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
  self.dtype = "float32"

  #################################################################
@@ -144,6 +144,10 @@ class ResampleRegrid:
  cruise_name,
  sensor_name,
  table_name,
+ # TODO: file_name?,
+ bucket_name, # TODO: this is the same bucket
+ override_select_files=None,
+ endpoint_url=None
  ) -> None:
  """
  The goal here is to interpolate the data against the depth values already populated
@@ -151,17 +155,17 @@ class ResampleRegrid:
  read/write operations. We open the file-level store with Xarray to leverage tools for
  resampling and subsetting the data.
  """
- print("Interpolating data.")
+ print("Resample Regrid, Interpolating data.")
  try:
  zarr_manager = ZarrManager()
- # s3_manager = S3Manager()
  geo_manager = GeometryManager()
- # get model store
+
  output_zarr_store = zarr_manager.open_s3_zarr_store_with_zarr(
  ship_name=ship_name,
  cruise_name=cruise_name,
  sensor_name=sensor_name,
- # zarr_synchronizer=? # TODO: pass in for parallelization
+ output_bucket_name=bucket_name,
+ endpoint_url=endpoint_url,
  )

  # get dynamo stuff
@@ -175,15 +179,19 @@ class ResampleRegrid:

  #########################################################
  #########################################################
- # TODO: iterate files here
  all_file_names = cruise_df["FILE_NAME"]
+
+ if override_select_files is not None:
+ all_file_names = override_select_files
+
+ # Iterate files
  for file_name in all_file_names:
  gc.collect()
  file_name_stem = Path(file_name).stem
- # file_name_stem = "D20070724-T151330"
  print(f"Processing file: {file_name_stem}.")
- # if f"{file_name_stem}.raw" not in list(cruise_df['FILE_NAME']):
- # raise Exception(f"Raw file file_stem not found in dynamodb.")
+
+ if f"{file_name_stem}.raw" not in list(cruise_df['FILE_NAME']):
+ raise Exception(f"Raw file file_stem not found in dynamodb.")

  # status = PipelineStatus['LEVEL_1_PROCESSING']
  # TODO: filter rows by enum success, filter the dataframe just for enums >= LEVEL_1_PROCESSING
@@ -200,6 +208,8 @@ class ResampleRegrid:
  cruise_name=cruise_name,
  sensor_name=sensor_name,
  file_name_stem=file_name_stem,
+ input_bucket_name=bucket_name,
+ endpoint_url=endpoint_url,
  )
  #########################################################################
  # [3] Get needed indices
@@ -225,11 +235,11 @@ class ResampleRegrid:
  :, start_ping_time_index:end_ping_time_index, :
  ].shape
  )
- cruise_sv_subset[:, :, :] = np.nan # (5208, 9778, 4)
+ cruise_sv_subset[:, :, :] = np.nan

  all_cruise_depth_values = zarr_manager.get_depth_values(
  min_echo_range=min_echo_range, max_echo_range=max_echo_range
- )
+ ) # (5262,) and

  print(" ".join(list(input_xr_zarr_store.Sv.dims)))
  if set(input_xr_zarr_store.Sv.dims) != {
@@ -239,13 +249,14 @@ class ResampleRegrid:
  }:
  raise Exception("Xarray dimensions are not as expected.")

- # get geojson
  indices, geospatial = geo_manager.read_s3_geo_json(
  ship_name=ship_name,
  cruise_name=cruise_name,
  sensor_name=sensor_name,
  file_name_stem=file_name_stem,
  input_xr_zarr_store=input_xr_zarr_store,
+ endpoint_url=endpoint_url,
+ output_bucket_name=bucket_name,
  )

  input_xr = input_xr_zarr_store.isel(ping_time=indices)
@@ -261,22 +272,18 @@ class ResampleRegrid:
  )

  # --- UPDATING --- #
-
  regrid_resample = self.interpolate_data(
  input_xr=input_xr,
  ping_times=ping_times,
  all_cruise_depth_values=all_cruise_depth_values,
  )

- print(
- f"start_ping_time_index: {start_ping_time_index}, end_ping_time_index: {end_ping_time_index}"
- )
-
+ print(f"start_ping_time_index: {start_ping_time_index}, end_ping_time_index: {end_ping_time_index}")
  #########################################################################
  # write Sv values to cruise-level-model-store
  for channel in range(
  len(input_xr.channel.values)
- ): # doesn't like being written in one fell swoop :(
+ ): # does not like being written in one fell swoop :(
  output_zarr_store.Sv[
  :, start_ping_time_index:end_ping_time_index, channel
  ] = regrid_resample[:, :, channel]
@@ -285,14 +292,38 @@ class ResampleRegrid:
  # [5] write subset of latitude/longitude
  output_zarr_store.latitude[
  start_ping_time_index:end_ping_time_index
- ] = geospatial.dropna()["latitude"].values
+ ] = geospatial.dropna()["latitude"].values # TODO: get from ds_sv directly, dont need geojson anymore
  output_zarr_store.longitude[
  start_ping_time_index:end_ping_time_index
  ] = geospatial.dropna()["longitude"].values
+
+ #########################################################################
+ # TODO: add the "detected_seafloor_depth/" to the
+ # L2 cruise dataarrays
+ # TODO: make bottom optional if 'detected_seafloor_depth' in input_xr.variables:
+ # TODO: Only checking the first channel for now. Need to average across all channels
+ # in the future. See https://github.com/CI-CMG/water-column-sonar-processing/issues/11
+ # detected_seafloor_depths = input_xr.detected_seafloor_depth.values[0, :] # note can include nans?
+ detected_seafloor_depth = input_xr.detected_seafloor_depth.values
+ detected_seafloor_depth[detected_seafloor_depth == 0.] = np.nan
+ detected_seafloor_depths = np.nanmean(detected_seafloor_depth, 0)
+ detected_seafloor_depths[detected_seafloor_depths == 0.] = np.nan
+ print(f"min depth measured: {np.nanmin(detected_seafloor_depths)}")
+ print(f"max depth measured: {np.nanmax(detected_seafloor_depths)}")
+ #available_indices = np.argwhere(np.isnan(geospatial['latitude'].values))
+ output_zarr_store.bottom[
+ start_ping_time_index:end_ping_time_index
+ ] = detected_seafloor_depths
+ #########################################################################
+ #########################################################################
  except Exception as err:
  print(f"Problem interpolating the data: {err}")
  raise err
- print("Done interpolating data.")
+ # else:
+ # pass
+ finally:
+ print("Done interpolating data.")
+ # TODO: read across times and verify data was written?

  #######################################################

water_column_sonar_processing/geometry/geometry_manager.py

@@ -38,6 +38,7 @@ class GeometryManager:
  cruise_name,
  sensor_name,
  file_name,
+ endpoint_url=None,
  write_geojson=True,
  ) -> tuple:
  file_name_stem = Path(file_name).stem
@@ -61,7 +62,7 @@ class GeometryManager:
  time1 = echodata.environment.time1.values

  if len(nmea_times) < len(time1):
- raise Exception(
+ raise Exception( # TODO: explore this logic further...
  "Problem: Not enough NMEA times available to extrapolate time1."
  )

@@ -137,7 +138,7 @@ class GeometryManager:
  )

  print("Checking s3 and deleting any existing GeoJSON file.")
- s3_manager = S3Manager()
+ s3_manager = S3Manager(endpoint_url=endpoint_url)
  geojson_object_exists = s3_manager.check_if_object_exists(
  bucket_name=output_bucket_name,
  key_name=f"{geo_json_prefix}/{geo_json_name}"
@@ -180,7 +181,8 @@ class GeometryManager:
  raise
  # Note: returned lat/lon values can include np.nan because they need to be aligned with
  # the Sv data! GeoJSON needs simplification but has been filtered.
- return gps_df.index.values, gps_df.latitude.values, gps_df.longitude.values
+ # return gps_df.index.values, gps_df.latitude.values, gps_df.longitude.values
+ return gps_df.index.values, lat, lon
  # TODO: if geojson is already returned with 0,0, the return here
  # can include np.nan values?

@@ -192,14 +194,18 @@ class GeometryManager:
  sensor_name,
  file_name_stem,
  input_xr_zarr_store,
+ endpoint_url,
+ output_bucket_name,
  ):
  try:
- s3_manager = S3Manager()
+ s3_manager = S3Manager(endpoint_url=endpoint_url)
  geo_json = s3_manager.read_s3_json(
  ship_name=ship_name,
  cruise_name=cruise_name,
  sensor_name=sensor_name,
  file_name_stem=file_name_stem,
+ output_bucket_name=output_bucket_name,
+
  )
  ###
  geospatial = geopandas.GeoDataFrame.from_features(

water_column_sonar_processing/model/zarr_manager.py

@@ -28,8 +28,8 @@ class ZarrManager:
  self.__compressor = Blosc(cname="zstd", clevel=2) # shuffle=Blosc.NOSHUFFLE
  self.__overwrite = True
  self.__num_threads = numcodecs.blosc.get_nthreads()
- self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
- self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
+ # self.input_bucket_name = os.environ.get("INPUT_BUCKET_NAME")
+ # self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")

  #######################################################
  def get_depth_values(
@@ -54,7 +54,7 @@ class ZarrManager:
  #######################################################
  def create_zarr_store(
  self,
- path: str,
+ path: str, # 'level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.model/tmp/HB0707.zarr/.zattrs'
  ship_name: str,
  cruise_name: str,
  sensor_name: str,
@@ -246,7 +246,7 @@ class ZarrManager:
  #
  root.attrs["processing_software_name"] = Coordinates.PROJECT_NAME.value
  root.attrs["processing_software_version"] = (
- "0.0.9" # TODO: get programmatically, echopype>utils>prov.py
+ "0.0.13" # TODO: get programmatically, echopype>utils>prov.py
  )
  root.attrs["processing_software_time"] = Timestamp.get_timestamp()
  #
@@ -282,14 +282,16 @@ class ZarrManager:
  ship_name: str,
  cruise_name: str,
  sensor_name: str,
- # zarr_synchronizer: Union[str, None] = None,
+ # zarr_synchronizer: Union[str, None] = None, # TODO:
+ output_bucket_name: str,
+ endpoint_url=None,
  ):
  # Mounts a Zarr store using pythons Zarr implementation. The mounted store
  # will have read/write privileges so that store can be updated.
- print("Opening Zarr store with Zarr.")
+ print("Opening L2 Zarr store with Zarr for writing.")
  try:
- s3fs_manager = S3FSManager()
- root = f"{self.output_bucket_name}/level_2/{ship_name}/{cruise_name}/{sensor_name}/{cruise_name}.zarr"
+ s3fs_manager = S3FSManager(endpoint_url=endpoint_url)
+ root = f"{output_bucket_name}/level_2/{ship_name}/{cruise_name}/{sensor_name}/{cruise_name}.zarr"
  store = s3fs_manager.s3_map(s3_zarr_store_path=root)
  # synchronizer = model.ProcessSynchronizer(f"/tmp/{ship_name}_{cruise_name}.sync")
  cruise_zarr = zarr.open(store=store, mode="r+")
@@ -306,11 +308,13 @@ class ZarrManager:
  cruise_name: str,
  sensor_name: str,
  file_name_stem: str,
+ input_bucket_name: str,
+ endpoint_url=None,
  ) -> xr.Dataset:
- print("Opening Zarr store in S3 as Xarray.")
+ print("Opening L1 Zarr store in S3 with Xarray.")
  try:
- zarr_path = f"s3://{self.output_bucket_name}/level_1/{ship_name}/{cruise_name}/{sensor_name}/{file_name_stem}.zarr"
- s3fs_manager = S3FSManager()
+ zarr_path = f"s3://{input_bucket_name}/level_1/{ship_name}/{cruise_name}/{sensor_name}/{file_name_stem}.zarr"
+ s3fs_manager = S3FSManager(endpoint_url=endpoint_url)
  store_s3_map = s3fs_manager.s3_map(s3_zarr_store_path=zarr_path)
  ds = xr.open_zarr(
  store=store_s3_map, consolidated=None
@@ -321,6 +325,25 @@
  print("Done opening Zarr store in S3 as Xarray.")
  return ds

+ def open_l2_zarr_store_with_xarray(
+ self,
+ ship_name: str,
+ cruise_name: str,
+ sensor_name: str,
+ bucket_name: str,
+ endpoint_url=None,
+ ) -> xr.Dataset:
+ print("Opening L2 Zarr store in S3 with Xarray.")
+ try:
+ zarr_path = f"s3://{bucket_name}/level_2/{ship_name}/{cruise_name}/{sensor_name}/{cruise_name}.zarr"
+ s3fs_manager = S3FSManager(endpoint_url=endpoint_url)
+ store_s3_map = s3fs_manager.s3_map(s3_zarr_store_path=zarr_path)
+ ds = xr.open_zarr(store=store_s3_map, consolidated=None)
+ except Exception as err:
+ print("Problem opening Zarr store in S3 as Xarray.")
+ raise err
+ print("Done opening Zarr store in S3 as Xarray.")
+ return ds
  ############################################################################

  #######################################################
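
Besides threading output_bucket_name and endpoint_url through the existing helpers, ZarrManager gains open_l2_zarr_store_with_xarray(), which opens the cruise-level (L2) store with Xarray alongside the existing L1 helper. A minimal sketch of a call, not from the package; the bucket name is hypothetical and the ship/cruise/sensor values follow the level_2/{ship}/{cruise}/{sensor}/{cruise}.zarr layout used throughout the diff:

```python
# Sketch only: bucket and cruise identifiers are made up for illustration.
from water_column_sonar_processing.model.zarr_manager import ZarrManager

zarr_manager = ZarrManager()
ds = zarr_manager.open_l2_zarr_store_with_xarray(
    ship_name="Henry_B._Bigelow",
    cruise_name="HB0707",
    sensor_name="EK60",
    bucket_name="example-output-bucket",
    endpoint_url=None,  # or a local endpoint when testing against an S3 stand-in
)
print(ds)
```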

water_column_sonar_processing/processing/raw_to_zarr.py

@@ -120,9 +120,10 @@ class RawToZarr:
  output_bucket_name,
  local_directory,
  object_prefix,
+ endpoint_url,
  ):
  # Note: this will be passed credentials if using NODD
- s3_manager = S3Manager()
+ s3_manager = S3Manager(endpoint_url=endpoint_url)
  print('Uploading files using thread pool executor.')
  all_files = []
  for subdir, dirs, files in os.walk(local_directory):
@@ -147,6 +148,8 @@ class RawToZarr:
  cruise_name,
  sensor_name,
  raw_file_name,
+ endpoint_url=None,
+ include_bot=True,
  ):
  """
  Downloads the raw files, processes them with echopype, writes geojson, and uploads files
@@ -157,12 +160,14 @@ class RawToZarr:
  cleaner = Cleaner()
  cleaner.delete_local_files(file_types=["*.zarr", "*.json"]) # TODO: include bot and raw?

- s3_manager = S3Manager()
+ s3_manager = S3Manager(endpoint_url=endpoint_url)
  s3_file_path = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/{raw_file_name}"
  bottom_file_name = f"{Path(raw_file_name).stem}.bot"
  s3_bottom_file_path = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/{bottom_file_name}"
  s3_manager.download_file(bucket_name=input_bucket_name, key=s3_file_path, file_name=raw_file_name)
- s3_manager.download_file(bucket_name=input_bucket_name, key=s3_bottom_file_path, file_name=bottom_file_name)
+ # TODO: add the bottom file
+ if include_bot:
+ s3_manager.download_file(bucket_name=input_bucket_name, key=s3_bottom_file_path, file_name=bottom_file_name)

  try:
  gc.collect()
@@ -172,13 +177,14 @@ class RawToZarr:
  echodata = ep.open_raw(
  raw_file=raw_file_name,
  sonar_model=sensor_name,
- include_bot=True,
- use_swap=True,
- # max_chunk_size=100,
+ include_bot=include_bot,
+ # use_swap=True,
+ # max_chunk_size=300,
  # storage_options={'anon': True } # 'endpoint_url': self.endpoint_url} # this was creating problems
  )
  print('Compute volume backscattering strength (Sv) from raw data.')
  ds_sv = ep.calibrate.compute_Sv(echodata)
+ gc.collect()
  print('Done computing volume backscatter strength (Sv) from raw data.')
  # Note: detected_seafloor_depth is located at echodata.vendor.detected_seafloor_depth
  # but is not written out with ds_sv
@@ -195,8 +201,12 @@ class RawToZarr:
  cruise_name=cruise_name,
  sensor_name=sensor_name,
  file_name=raw_file_name,
+ endpoint_url=endpoint_url,
  write_geojson=True
  )
+ ds_sv = ep.consolidate.add_location(ds_sv, echodata)
+ ds_sv.latitude.values = lat # overwriting echopype gps values to include missing values
+ ds_sv.longitude.values = lon
  # gps_data, lat, lon = self.__get_gps_data(echodata=echodata)
  #################################################################
  # Technically the min_echo_range would be 0 m.
@@ -215,12 +225,15 @@ class RawToZarr:
  #################################################################
  # Create the zarr store
  store_name = f"{Path(raw_file_name).stem}.zarr"
- ds_sv.to_zarr(store=store_name)
+ # Sv = ds_sv.Sv
+ # ds_sv['Sv'] = Sv.astype('int32', copy=False)
+ ds_sv.to_zarr(store=store_name) # ds_sv.Sv.sel(channel=ds_sv.channel.values[0]).shape
+ gc.collect()
  #################################################################
  output_zarr_prefix = f"level_1/{ship_name}/{cruise_name}/{sensor_name}/"
  #################################################################
  # If zarr store already exists then delete
- s3_manager = S3Manager()
+ s3_manager = S3Manager(endpoint_url=endpoint_url)
  child_objects = s3_manager.get_child_objects(
  bucket_name=output_bucket_name,
  sub_prefix=f"level_1/{ship_name}/{cruise_name}/{sensor_name}/{Path(raw_file_name).stem}.zarr",
@@ -235,7 +248,8 @@ class RawToZarr:
  self.__upload_files_to_output_bucket(
  output_bucket_name=output_bucket_name,
  local_directory=store_name,
- object_prefix=output_zarr_prefix
+ object_prefix=output_zarr_prefix,
+ endpoint_url=endpoint_url
  )
  #################################################################
  self.__zarr_info_to_table(
@@ -262,6 +276,7 @@ class RawToZarr:
  print(f'Exception encountered creating local Zarr store with echopype: {err}')
  raise RuntimeError(f"Problem creating local Zarr store, {err}")
  finally:
+ gc.collect()
  print("Finally.")
  cleaner.delete_local_files(file_types=["*.raw", "*.bot", "*.zarr", "*.json"])
  print('Done creating local zarr store.')

water_column_sonar_processing-0.0.11.dist-info/METADATA → water_column_sonar_processing-0.0.13.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: water_column_sonar_processing
- Version: 0.0.11
+ Version: 0.0.13
  Summary: A processing tool for water column sonar data.
  Author-email: Rudy Klucik <rudy.klucik@noaa.gov>
  Project-URL: Homepage, https://github.com/CI-CMG/water-column-sonar-processing
@@ -28,7 +28,7 @@ Requires-Dist: python-dotenv==1.0.1
  Requires-Dist: requests==2.32.3
  Requires-Dist: s3fs==2023.12.1
  Requires-Dist: scipy==1.14.1
- Requires-Dist: setuptools==75.6.0
+ Requires-Dist: setuptools
  Requires-Dist: shapely==2.0.3
  Requires-Dist: typing-extensions==4.10.0
  Requires-Dist: xarray==2024.10.0
@@ -37,6 +37,16 @@ Requires-Dist: zarr==2.18.3
  # Water Column Sonar Processing
  Processing tool for converting L0 data to L1 and L2 as well as generating geospatial information

+ ![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/CI-CMG/water-column-sonar-processing/test_action.yaml)
+
+ ![GitHub License](https://img.shields.io/github/license/CI-CMG/water-column-sonar-processing)
+
+ ![PyPI - Implementation](https://img.shields.io/pypi/v/water-column-sonar-processing?color=black)
+
+ ![PyPI - Downloads](https://img.shields.io/pypi/dd/water-column-sonar-processing)
+
+ ![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/CI-CMG/water-column-sonar-processing) ![GitHub repo size](https://img.shields.io/github/repo-size/CI-CMG/water-column-sonar-processing)
+
  # Setting up the Python Environment
  > Python 3.10.12

@@ -93,20 +103,6 @@ or
  Following this tutorial:
  https://packaging.python.org/en/latest/tutorials/packaging-projects/

- # To Publish To TEST
- ```commandline
- python -m build
- # python -m build --sdist
- # python -m build --wheel
- python -m twine upload --repository testpypi dist/*
- pytho -m pip install --index-url https://test.pypi.org/simple/ hello-pypi-rudy-klucik
- python
- ```
- ```
- from water-column-sonar-processing import ZarrManager
- example.add_one(2)
- ```
-
  # To Publish To PROD
  ```commandline
  python -m build
@@ -134,6 +130,15 @@ https://colab.research.google.com/drive/1KiLMueXiz9WVB9o4RuzYeGjNZ6PsZU7a#scroll
  5 failed, 35 passed, 3 skipped, 1 warning in 9.71s
  3 failed, 38 passed, 3 skipped, 1 warning in 7.24s

+ # Tag a Release
+ Step 1 --> increment the semantic version in the zarr_manager.py "metadata" & the "pyproject.toml"
+ ```commandline
+ git tag "v0.0.13" -a
+ ```
+ Step 3 --> enter description
+ ```commandline
+ git push origin --tags
+ ```

  # TODO:
  add https://pypi.org/project/setuptools-scm/

water_column_sonar_processing-0.0.11.dist-info/RECORD → water_column_sonar_processing-0.0.13.dist-info/RECORD

@@ -1,32 +1,33 @@
  water_column_sonar_processing/__init__.py,sha256=fvRK4uFo_A0l7w_T4yckvDqJ3wMUq4JB3VVPXqWfewE,226
  water_column_sonar_processing/process.py,sha256=-yQtK3rnZq6lGAr3q02zLDe1NuMH9c0PiUOxKzG_r18,5386
  water_column_sonar_processing/aws/__init__.py,sha256=KJqK8oYMn-u8n8i-Jp_lG5BvCOTjwWSjWP8yAyDlWVo,297
- water_column_sonar_processing/aws/dynamodb_manager.py,sha256=gMDAXLE_p_nKmNZYICKA9T56PYDqtXBySlysSOVnWrI,10250
- water_column_sonar_processing/aws/s3_manager.py,sha256=kS48Vu_jE_fOKbwKOhCLWKDSqHzOGVEdZ_Lc4MaMCfA,15291
- water_column_sonar_processing/aws/s3fs_manager.py,sha256=thVJPQKhbvF1g-Ue3BYgwazFOFDYOICIEJx4zkXBQ1E,2381
+ water_column_sonar_processing/aws/dynamodb_manager.py,sha256=LQ3eh7Zf1fBLG-RKovod9KbQwhE-0Qdq1JPk4Ro5bdo,10252
+ water_column_sonar_processing/aws/s3_manager.py,sha256=-PCiW7YF31nGIPa1oVOVTzjTSExAAkT_IyNNnvWv2HU,16214
+ water_column_sonar_processing/aws/s3fs_manager.py,sha256=d7p9Sx-ocooKzHjVJVCawnXSGv6BpmKvvN9uhzilglw,2529
  water_column_sonar_processing/aws/sns_manager.py,sha256=Dp9avG5VSugSWPR1dZ-askuAw1fCZkNUHbOUP65iR-k,1867
  water_column_sonar_processing/aws/sqs_manager.py,sha256=NSUrWmnSC8h8Gf7gT0U8zFaQQ-yX89h0Q0mDLKGqp2Y,1597
  water_column_sonar_processing/cruise/__init__.py,sha256=H5hW0JMORuaFvQk_R31B4VL8RnRyKeanOOiWmqEMZJk,156
- water_column_sonar_processing/cruise/create_empty_zarr_store.py,sha256=uQiZoKm16jD0SUuXmhuPryxdE-6bUc6BlCi2UtmzUpw,7318
- water_column_sonar_processing/cruise/resample_regrid.py,sha256=4Tw6Ro9mQZOr0uIph6foz6a1OeFAZW0SMUT_asIwvKw,12309
+ water_column_sonar_processing/cruise/create_empty_zarr_store.py,sha256=1IehrlhMAS5XAl7DLdQI4jIMSY9ZNLiW4YdcBEwYkbc,7679
+ water_column_sonar_processing/cruise/experiment_datatree.py,sha256=K6Uq_36Rygw5oFF8zWavEwb1x8D27lJv5G3j0B59agE,243
+ water_column_sonar_processing/cruise/resample_regrid.py,sha256=XpGRs8nWspWuVoXBEV6VNVJSMlr3_IjnKlN1dK6dEA4,14292
  water_column_sonar_processing/geometry/__init__.py,sha256=_ol5nI8AL30pYXeAh5rtP7YmQggitPC6LA_kuTfPJ0Q,231
- water_column_sonar_processing/geometry/geometry_manager.py,sha256=0Q9IRiBr6XvxUg5M2vCPtUhbnYnwa5pJI1ayfWXMgMs,10587
+ water_column_sonar_processing/geometry/geometry_manager.py,sha256=nz5T1vCDWHYIfQ853EqKYHDetTul7jRWS3y8Evep8QU,10855
  water_column_sonar_processing/geometry/geometry_simplification.py,sha256=im1HG9nfYIerQv3w-PUHzphw2B7aGgnsA3Zcdy2oTmA,3016
  water_column_sonar_processing/geometry/pmtile_generation.py,sha256=7Lm08Jr6YaM4nYmexClxbIMOqSV1teo9wMm6dfjFuNA,12384
  water_column_sonar_processing/index/__init__.py,sha256=izEObsKiOoIJ0kZCFhvaYsBd6Ga71XJxnogjrNInw68,68
  water_column_sonar_processing/index/index_manager.py,sha256=YS6y_THfGAZpjfBZOj5n8O1aY_BnBYS781eNHfhpip0,11239
  water_column_sonar_processing/model/__init__.py,sha256=FXaCdbPqxp0ogmZm9NplRirqpgMiYs1iRYgJbFbbX2Y,65
- water_column_sonar_processing/model/zarr_manager.py,sha256=TbcVux-GWfX4XJ7UT20E7dI_h_islrKsGtjx_VwSsLg,14003
+ water_column_sonar_processing/model/zarr_manager.py,sha256=LoL8vOnEl2r_Jhu4l30p6AgfUZg1tW5aBydHx_BZAZg,15068
  water_column_sonar_processing/processing/__init__.py,sha256=UwdB3BnoUxy4q3k9-ZjBF6KzmCWVDcqbcArTeHgmvGA,118
  water_column_sonar_processing/processing/cruise_sampler.py,sha256=hadPrnH5nz7_oG_4pND7YbMFH6NMR9d6p3xAXedtKU8,15927
- water_column_sonar_processing/processing/raw_to_zarr.py,sha256=OPu4CoIlHQFW38iY4DLe5A5Ttrdz4NXtjYThrB-FuPs,16874
+ water_column_sonar_processing/processing/raw_to_zarr.py,sha256=agbb2A0BWf7D4b5u-mYOBN_VyjRVjOdQM2aeRGBweWw,17617
  water_column_sonar_processing/utility/__init__.py,sha256=yDObMOL0_OxKWet5wffK2-XVJgoE9iwiY2q04GZrtBQ,234
  water_column_sonar_processing/utility/cleaner.py,sha256=bNbs-hopWxtKAFBK0Eu18xdRErZCGZvtla3j-1bTwQw,619
  water_column_sonar_processing/utility/constants.py,sha256=EbzsorvYKadsPjuutRjQKKByGibhFm0Gw6D-Sp2ZD3I,2143
  water_column_sonar_processing/utility/pipeline_status.py,sha256=O-0SySqdRGJ6bs3zQe1NV9vkOpmsRM7zj5QoHgzYioY,4395
  water_column_sonar_processing/utility/timestamp.py,sha256=bO0oir7KxxoEHPGRkz9FCBfOligkocUyRiWRzAq8fnU,361
- water_column_sonar_processing-0.0.11.dist-info/LICENSE,sha256=lz4IpJ5_adG3S0ali-WaIpQFVTnEAOucMDQPECUVEYw,1110
- water_column_sonar_processing-0.0.11.dist-info/METADATA,sha256=KFkI1367kV7L7pl8SIK4UFwUVJvUCHkRTPwBCqpnxWA,4566
- water_column_sonar_processing-0.0.11.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- water_column_sonar_processing-0.0.11.dist-info/top_level.txt,sha256=aRYU4A7RNBlNrL4vzjytFAir3BNnmOgsvIGKKA36tg4,30
- water_column_sonar_processing-0.0.11.dist-info/RECORD,,
+ water_column_sonar_processing-0.0.13.dist-info/LICENSE,sha256=lz4IpJ5_adG3S0ali-WaIpQFVTnEAOucMDQPECUVEYw,1110
+ water_column_sonar_processing-0.0.13.dist-info/METADATA,sha256=MUkVn5e1wkAFUAYpk25V02yNCeYNmwBsyib788i2ibg,5087
+ water_column_sonar_processing-0.0.13.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ water_column_sonar_processing-0.0.13.dist-info/top_level.txt,sha256=aRYU4A7RNBlNrL4vzjytFAir3BNnmOgsvIGKKA36tg4,30
+ water_column_sonar_processing-0.0.13.dist-info/RECORD,,