water-column-sonar-processing 0.0.1__py3-none-any.whl → 25.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of water-column-sonar-processing has been flagged as a potentially problematic release.

Files changed (60)
  1. water_column_sonar_processing/__init__.py +13 -0
  2. water_column_sonar_processing/aws/__init__.py +7 -0
  3. water_column_sonar_processing/aws/dynamodb_manager.py +355 -0
  4. water_column_sonar_processing/aws/s3_manager.py +420 -0
  5. water_column_sonar_processing/aws/s3fs_manager.py +72 -0
  6. {model → water_column_sonar_processing}/aws/sns_manager.py +10 -21
  7. {model → water_column_sonar_processing}/aws/sqs_manager.py +11 -19
  8. water_column_sonar_processing/cruise/__init__.py +4 -0
  9. water_column_sonar_processing/cruise/create_empty_zarr_store.py +191 -0
  10. water_column_sonar_processing/cruise/datatree_manager.py +21 -0
  11. water_column_sonar_processing/cruise/resample_regrid.py +339 -0
  12. water_column_sonar_processing/geometry/__init__.py +11 -0
  13. water_column_sonar_processing/geometry/elevation_manager.py +111 -0
  14. water_column_sonar_processing/geometry/geometry_manager.py +243 -0
  15. water_column_sonar_processing/geometry/line_simplification.py +176 -0
  16. water_column_sonar_processing/geometry/pmtile_generation.py +261 -0
  17. water_column_sonar_processing/index/__init__.py +3 -0
  18. water_column_sonar_processing/index/index_manager.py +384 -0
  19. water_column_sonar_processing/model/__init__.py +3 -0
  20. water_column_sonar_processing/model/zarr_manager.py +722 -0
  21. water_column_sonar_processing/process.py +149 -0
  22. water_column_sonar_processing/processing/__init__.py +4 -0
  23. water_column_sonar_processing/processing/raw_to_netcdf.py +320 -0
  24. water_column_sonar_processing/processing/raw_to_zarr.py +425 -0
  25. water_column_sonar_processing/utility/__init__.py +13 -0
  26. {model → water_column_sonar_processing}/utility/cleaner.py +7 -8
  27. water_column_sonar_processing/utility/constants.py +118 -0
  28. {model → water_column_sonar_processing}/utility/pipeline_status.py +47 -24
  29. water_column_sonar_processing/utility/timestamp.py +12 -0
  30. water_column_sonar_processing-25.11.1.dist-info/METADATA +182 -0
  31. water_column_sonar_processing-25.11.1.dist-info/RECORD +34 -0
  32. {water_column_sonar_processing-0.0.1.dist-info → water_column_sonar_processing-25.11.1.dist-info}/WHEEL +1 -1
  33. {water_column_sonar_processing-0.0.1.dist-info → water_column_sonar_processing-25.11.1.dist-info/licenses}/LICENSE +1 -1
  34. water_column_sonar_processing-25.11.1.dist-info/top_level.txt +1 -0
  35. __init__.py +0 -0
  36. model/__init__.py +0 -0
  37. model/aws/__init__.py +0 -0
  38. model/aws/dynamodb_manager.py +0 -149
  39. model/aws/s3_manager.py +0 -356
  40. model/aws/s3fs_manager.py +0 -74
  41. model/cruise/__init__.py +0 -0
  42. model/cruise/create_empty_zarr_store.py +0 -166
  43. model/cruise/resample_regrid.py +0 -248
  44. model/geospatial/__init__.py +0 -0
  45. model/geospatial/geometry_manager.py +0 -194
  46. model/geospatial/geometry_simplification.py +0 -81
  47. model/geospatial/pmtile_generation.py +0 -74
  48. model/index/__init__.py +0 -0
  49. model/index/index.py +0 -228
  50. model/model.py +0 -138
  51. model/utility/__init__.py +0 -0
  52. model/utility/constants.py +0 -56
  53. model/utility/timestamp.py +0 -12
  54. model/zarr/__init__.py +0 -0
  55. model/zarr/bar.py +0 -28
  56. model/zarr/foo.py +0 -11
  57. model/zarr/zarr_manager.py +0 -298
  58. water_column_sonar_processing-0.0.1.dist-info/METADATA +0 -89
  59. water_column_sonar_processing-0.0.1.dist-info/RECORD +0 -32
  60. water_column_sonar_processing-0.0.1.dist-info/top_level.txt +0 -2
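
The listing above shows the 0.0.1 flat `model` package being removed and its contents reorganized under a `water_column_sonar_processing` top-level package, so import paths change with this release. A minimal sketch of the move; the new paths are taken from the diff below, while the old class locations are inferred from the deleted module names and are not verified here:

# 25.11.1 import paths, per the new package layout shown above
from water_column_sonar_processing.aws import S3Manager
from water_column_sonar_processing.model import ZarrManager

# 0.0.1 equivalents lived under the flat "model" package (assumed class names):
# from model.aws.s3_manager import S3Manager
# from model.zarr.zarr_manager import ZarrManager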
water_column_sonar_processing/index/index_manager.py
@@ -0,0 +1,384 @@
+ import os
+ import re
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from datetime import datetime
+ from hashlib import sha256
+
+ import networkx as nx
+ import numpy as np
+ import pandas as pd
+
+ from water_column_sonar_processing.aws import S3Manager
+
+ MAX_POOL_CONNECTIONS = 64
+ MAX_CONCURRENCY = 64
+ MAX_WORKERS = 64
+ GB = 1024**3
+
+
+ class IndexManager:
+     # TODO: index into dynamodb instead of csv files
+
+     def __init__(self, input_bucket_name, calibration_bucket, calibration_key):
+         self.input_bucket_name = input_bucket_name
+         self.calibration_bucket = calibration_bucket
+         self.calibration_key = calibration_key  # TODO: make optional?
+         self.s3_manager = S3Manager()  # TODO: make anonymous?
+
+     #################################################################
+     def list_ships(
+         self,
+         prefix="data/raw/",
+     ):
+         page_iterator = self.s3_manager.paginator.paginate(
+             Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
+         )
+         # common_prefixes = s3_client.list_objects(Bucket=self.input_bucket_name, Prefix=prefix, Delimiter='/')
+         # print(common_prefixes)
+         ships = []
+         for page in page_iterator:
+             if "CommonPrefixes" in page:  # ship folders come back as common prefixes, not "Contents"
+                 ships.extend([k["Prefix"] for k in page["CommonPrefixes"]])
+         return ships  # ~76 ships
+
+     #################################################################
+     def list_cruises(
+         self,
+         ship_prefixes,  # e.g. 'data/raw/Alaska_Knight/'
+     ):
+         cruises = []
+         for ship_prefix in ship_prefixes:
+             page_iterator = self.s3_manager.paginator.paginate(
+                 Bucket=self.input_bucket_name, Prefix=ship_prefix, Delimiter="/"
+             )
+             for page in page_iterator:
+                 cruises.extend([k["Prefix"] for k in page["CommonPrefixes"]])
+         return cruises  # ~1204 cruises
+
+     #################################################################
+     def list_ek60_cruises(
+         self,
+         cruise_prefixes,
+     ):
+         """
+         This returns a list of ek60 prefixed cruises.
+         """
+         cruise_sensors = []  # includes all sensor types
+         for cruise_prefix in cruise_prefixes:
+             page_iterator = self.s3_manager.paginator.paginate(
+                 Bucket=self.input_bucket_name, Prefix=cruise_prefix, Delimiter="/"
+             )
+             for page in page_iterator:
+                 cruise_sensors.extend([k["Prefix"] for k in page["CommonPrefixes"]])
+         # Note: these are "EK60" by prefix. They still need to be verified by scanning the datagram.
+         return [i for i in cruise_sensors if "/EK60/" in i]  # ~447 different cruises
+
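
Taken together, list_ships, list_cruises, and list_ek60_cruises walk the data/raw/<ship>/<cruise>/<sensor>/ hierarchy one delimiter level at a time. A minimal usage sketch, assuming the public NOAA bucket and calibration CSV referenced in the comments further down; the argument values are illustrative, not part of this diff:

index_manager = IndexManager(
    input_bucket_name="noaa-wcsd-pds",         # public archive referenced in build_merkle_tree()
    calibration_bucket="noaa-wcsd-pds-index",  # see get_calibration_information() below
    calibration_key="calibrated_crusies.csv",
)
ship_prefixes = index_manager.list_ships()                         # e.g. 'data/raw/Henry_B._Bigelow/'
cruise_prefixes = index_manager.list_cruises(ship_prefixes)        # e.g. 'data/raw/Henry_B._Bigelow/HB0707/'
ek60_prefixes = index_manager.list_ek60_cruises(cruise_prefixes)   # prefix match only, still unverified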
+     #################################################################
+     def get_raw_files(
+         self,
+         ship_name,
+         cruise_name,
+         sensor_name,
+     ):
+         # Gets all raw files for a cruise under the given prefix
+         prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
+         page_iterator = self.s3_manager.paginator.paginate(
+             Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
+         )
+         all_files = []
+         for page in page_iterator:
+             if "Contents" in page.keys():
+                 all_files.extend([i["Key"] for i in page["Contents"]])
+         return [i for i in all_files if i.endswith(".raw")]
+
+     def get_first_raw_file(
+         self,
+         ship_name,
+         cruise_name,
+         sensor_name,
+     ):
+         # Same as above but only needs to get the first raw file
+         # because we are only interested in the first datagram of one file
+         # TODO: "dataset?"
+         prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
+         # page_iterator = self.s3_manager.paginator.paginate(
+         #     Bucket=self.input_bucket_name,
+         #     Prefix=prefix,
+         #     Delimiter="/",
+         #     PaginationConfig={ 'MaxItems': 5 }
+         # )  # TODO: this can create a problem if there is a non raw file returned first
+         ### filter with JMESPath expressions ###
+         page_iterator = self.s3_manager.paginator.paginate(
+             Bucket=self.input_bucket_name,
+             Prefix=prefix,
+             Delimiter="/",
+         )
+         # page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
+         page_iterator = page_iterator.search(
+             expression="Contents[?contains(Key, '.raw')] "
+         )
+         for res in page_iterator:
+             if "Key" in res:
+                 return res["Key"]
+         # else raise exception?
+
+     # DSJ0604-D20060406-T050022.bot 2kB == 2152 'Size'
+
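
get_first_raw_file leans on boto3's paginator .search(), which applies a JMESPath expression client-side and yields the matching object records one at a time. A standalone sketch of what that expression selects, using the jmespath package that ships with boto3; the page contents below are made up for illustration:

import jmespath

page = {
    "Contents": [
        {"Key": "data/raw/Ship/CR01/EK60/README.txt", "Size": 120},
        {"Key": "data/raw/Ship/CR01/EK60/D20070712-T004447.raw", "Size": 123456789},
    ]
}
matches = jmespath.search("Contents[?contains(Key, '.raw')]", page)
print(matches[0]["Key"])  # first object whose key contains '.raw', as in get_first_raw_file()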
+     def get_files_under_size(
+         self,
+         ship_name,
+         cruise_name,
+         sensor_name,
+     ):
+         # THIS isn't used, just playing with JMES paths spec
+         prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"
+         ### filter with JMESPath expressions ###
+         page_iterator = self.s3_manager.paginator.paginate(
+             Bucket=self.input_bucket_name,
+             Prefix=prefix,
+             Delimiter="/",
+         )
+         filtered_objects = page_iterator.search("Contents[?Size < `2200`][]")
+         all_files = []
+         for filtered_object in filtered_objects:  # search() yields matching objects directly, not pages
+             if filtered_object is not None:
+                 all_files.append(filtered_object["Key"])
+         return [i for i in all_files if i.endswith(".raw")]
+
+     #################################################################
+     def get_raw_files_csv(
+         self,
+         ship_name,
+         cruise_name,
+         sensor_name,
+     ):
+         raw_files = self.get_raw_files(
+             ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
+         )
+         files_list = [
+             {
+                 "ship_name": ship_name,
+                 "cruise_name": cruise_name,
+                 "sensor_name": sensor_name,
+                 "file_name": os.path.basename(raw_file),
+             }
+             for raw_file in raw_files
+         ]
+         df = pd.DataFrame(files_list)
+         df.to_csv(f"{ship_name}_{cruise_name}.csv", index=False, header=False, sep=" ")
+         print("done")
+
+     def get_raw_files_list(
+         self,
+         ship_name,
+         cruise_name,
+         sensor_name,
+     ):
+         # gets all raw files in cruise and returns a list of dicts
+         raw_files = self.get_raw_files(
+             ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
+         )
+         files_list = [
+             {
+                 "ship_name": ship_name,
+                 "cruise_name": cruise_name,
+                 "sensor_name": sensor_name,
+                 "file_name": os.path.basename(raw_file),
+             }
+             for raw_file in raw_files
+         ]
+         return files_list
+
+     #################################################################
+     def get_subset_ek60_prefix(
+         self, df: pd.DataFrame
+     ) -> pd.DataFrame:  # TODO: is this used?
+         # Returns all objects with 'EK60' in prefix of file path
+         # Note that this can include 'EK80' dataset that are false-positives
+         # in dataframe with ['key', 'filename', 'ship', 'cruise', 'sensor', 'size', 'date', 'datagram']
+         print("getting subset of ek60 dataset by prefix")
+         objects = []
+         for row in df.itertuples():
+             row_split = row[1].split(os.sep)
+             if len(row_split) == 6:
+                 filename = os.path.basename(
+                     row[1]
+                 )  # 'EX1608_EK60-D20161205-T040300.raw'
+                 if filename.endswith(".raw"):
+                     ship_name, cruise_name, sensor_name = row_split[
+                         2:5
+                     ]  # 'Okeanos_Explorer', 'EX1608', 'EK60'
+                     if (
+                         re.search("[D](\\d{8})", filename) is not None
+                         and re.search("[T](\\d{6})", filename) is not None
+                     ):
+                         # Parse date if possible e.g.: 'data/raw/Henry_B._Bigelow/HB1006/EK60/HBB-D20100723-T025105.raw'
+                         # and 'data/raw/Henry_B._Bigelow/HB1802/EK60/D20180513-T150250.raw'
+                         date_substring = re.search("[D](\\d{8})", filename).group(1)
+                         time_substring = re.search("[T](\\d{6})", filename).group(1)
+                         date_string = datetime.strptime(
+                             f"{date_substring}{time_substring}", "%Y%m%d%H%M%S"
+                         )
+                     else:  # otherwise use current date
+                         date_string = f"{datetime.utcnow().isoformat()[:19]}Z"
+                     objects.append(
+                         {
+                             "KEY": row[1],
+                             "FILENAME": filename,
+                             "SHIP": ship_name,
+                             "CRUISE": cruise_name,
+                             "SENSOR": sensor_name,
+                             "SIZE": row[2],
+                             "DATE": date_string,
+                             "DATAGRAM": None,
+                         }
+                     )
+         return pd.DataFrame(objects)
+
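
The date handling above is easiest to see on one of the example filenames from the comments; a standalone sketch with no S3 access:

import re
from datetime import datetime

filename = "EX1608_EK60-D20161205-T040300.raw"                 # example name from the comments above
date_substring = re.search("[D](\\d{8})", filename).group(1)   # '20161205'
time_substring = re.search("[T](\\d{6})", filename).group(1)   # '040300'
parsed = datetime.strptime(f"{date_substring}{time_substring}", "%Y%m%d%H%M%S")
print(parsed.isoformat())                                      # 2016-12-05T04:03:00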
+     #################################################################
+     def scan_datagram(self, select_key: str) -> list:
+         # Reads bytes 3-7 of the S3 object (the first datagram's type field). Used to determine if ek60 or ek80
+         # Note: uses boto3 session instead of boto3 client: https://github.com/boto/boto3/issues/801
+         # select_key = 'data/raw/Albatross_Iv/AL0403/EK60/L0005-D20040302-T200108-EK60.raw'
+         s3_resource = self.s3_manager.s3_resource
+         obj = s3_resource.Object(
+             bucket_name=self.input_bucket_name, key=select_key
+         )  # XML0
+         first_datagram = (
+             obj.get(Range="bytes=3-7")["Body"].read().decode().strip("\x00")
+         )
+         ### EK60 dataset are denoted by 'CON0' ###
+         # Return a list of dicts, matching the "-> list" annotation and what get_ek60_objects() expects
+         return [{"KEY": select_key, "DATAGRAM": first_datagram}]
+
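
scan_datagram sniffs the datagram type near the start of a .raw file: EK60 recordings open with a 'CON0' configuration datagram, EK80 recordings with 'XML0'. A local-file sketch of the same check; the 4-byte length / 4-byte type layout is the conventional Simrad header, so treat the offsets as an assumption rather than something verified by this diff:

def sniff_datagram(path: str) -> str:
    # First 4 bytes: datagram length; next 4 bytes: datagram type ('CON0' -> EK60, 'XML0' -> EK80).
    with open(path, "rb") as f:
        header = f.read(8)
    return header[4:8].decode("ascii")

# sniff_datagram("D20070712-T004447.raw")  ->  'CON0' for an EK60 file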
+     #################################################################
+     def get_subset_datagrams(
+         self, df: pd.DataFrame
+     ) -> list:  # TODO: is this getting used
+         print("getting subset of datagrams")
+         select_keys = (
+             df[["KEY", "CRUISE"]]
+             .drop_duplicates(subset="CRUISE")["KEY"]
+             .values.tolist()
+         )
+         all_datagrams = []
+         with ThreadPoolExecutor(max_workers=MAX_POOL_CONNECTIONS) as executor:
+             futures = [
+                 executor.submit(self.scan_datagram, select_key)
+                 for select_key in select_keys
+             ]
+             for future in as_completed(futures):
+                 result = future.result()
+                 if result:
+                     all_datagrams.extend(result)
+         return all_datagrams
+
+     #################################################################
+     def get_ek60_objects(
+         self, df: pd.DataFrame, subset_datagrams: list
+     ) -> pd.DataFrame:
+         # for each key write datagram value to all other files in same cruise
+         for subset_datagram in subset_datagrams:
+             if subset_datagram["DATAGRAM"] == "CON0":
+                 select_cruise = df.loc[df["KEY"] == subset_datagram["KEY"]][
+                     "CRUISE"
+                 ].iloc[0]
+                 df.loc[df["CRUISE"] == select_cruise, ["DATAGRAM"]] = subset_datagram[
+                     "DATAGRAM"
+                 ]
+         return df.loc[df["DATAGRAM"] == "CON0"]
+
+     #################################################################
+     def get_calibration_information(
+         self,
+     ) -> pd.DataFrame:
+         # Calibration dataset generated by dataset manager currently located here:
+         # https://noaa-wcsd-pds-index.s3.amazonaws.com/calibrated_crusies.csv
+         # Note: Data are either:
+         #     [1] Calibrated w/ calibration dataset
+         #     [2] Calibrated w/o calibration dataset
+         #     [3] uncalibrated
+         response = self.s3_manager.get_object(
+             bucket_name=self.calibration_bucket, key_name=self.calibration_key
+         )
+         calibration_statuses = pd.read_csv(response.get("Body"))
+         calibration_statuses["DATASET_NAME"] = calibration_statuses[
+             "DATASET_NAME"
+         ].apply(lambda x: x.split("_EK60")[0])
+         calibration_statuses["CAL_STATE"] = calibration_statuses["CAL_STATE"].apply(
+             lambda x: x.find("Calibrated") >= 0
+         )
+         return calibration_statuses
+
+     #################################################################
+     # def index(  # TODO: get rid of this?
+     #     self
+     # ):
+     #     start_time = datetime.now()  # used for benchmarking
+     #     # Get all object in public dataset bucket
+     #     all_objects = self.get_all_objects()
+     #     #
+     #     subset_ek60_by_prefix = self.get_subset_ek60_prefix(
+     #         df=all_objects[all_objects['Key'].str.contains('EK60')][['Key', 'Size']]
+     #     )
+     #     #
+     #     subset_datagrams = self.get_subset_datagrams(df=subset_ek60_by_prefix)
+     #     print("done getting subset of datagrams")
+     #     ek60_objects = self.get_ek60_objects(subset_ek60_by_prefix, subset_datagrams)
+     #     print("done getting ek60_objects")
+     #     print(start_time)
+     #     calibration_status = self.get_calibration_information(s3)
+     #     cruise_names = list(set(ek60_objects['CRUISE']))
+     #     cruise_names.sort()
+     #     for cruise_name in cruise_names:  # ~322 cruises
+     #         cruise_data = ek60_objects.groupby('CRUISE').get_group(cruise_name)
+     #         ship = cruise_data['SHIP'].iloc[0]
+     #         sensor = cruise_data['SENSOR'].iloc[0]
+     #         datagram = cruise_data['DATAGRAM'].iloc[0]
+     #         file_count = cruise_data.shape[0]
+     #         total_size = np.sum(cruise_data['SIZE'])
+     #         calibrated = cruise_name in calibration_status['DATASET_NAME'].unique()  # ~276 entries
+     #         start_date = np.min(cruise_data['DATE']).isoformat(timespec="seconds") + "Z"
+     #         end_date = np.max(cruise_data['DATE']).isoformat(timespec="seconds") + "Z"
+     #     end_time = datetime.now()  # used for benchmarking
+     #     print(start_time)
+     #     print(end_time)
+
+     # TODO: wip
+     def build_merkle_tree(self):
+         G = nx.DiGraph()
+         # https://noaa-wcsd-pds.s3.amazonaws.com/index.html#data/raw/Henry_B._Bigelow/HB0707/
+         ship_name = "Henry_B._Bigelow"
+         cruise_name = "HB0707"
+         # cruise_name = "HB0805"
+         prefix = f"data/raw/{ship_name}/{cruise_name}/"
+         # prefix = f"data/raw/{ship_name}/"
+         page_iterator = self.s3_manager.paginator.paginate(
+             Bucket=self.input_bucket_name,
+             Prefix=prefix,
+         )
+         for page in page_iterator:
+             for contents in page["Contents"]:
+                 obj_key = contents["Key"]
+                 # https://datatracker.ietf.org/doc/html/rfc7232#section-2.3
+                 obj_etag = contents["ETag"].split('"')[1]  # properties
+                 obj_size = contents["Size"]
+                 basename = os.path.basename(obj_key)
+                 G.add_node(
+                     node_for_adding=basename, ETag=obj_etag, Size=obj_size, Key=obj_key
+                 )  # TODO: add parent hash
+                 split_path = os.path.normpath(obj_key).split(os.path.sep)
+                 # split_path: ['dataset', 'raw', 'Henry_B._Bigelow', 'HB0707', 'EK60', 'D20070712-T004447.raw']
+                 for previous, current in zip(split_path, split_path[1:]):
+                     if not G.has_edge(previous, current):
+                         G.add_edge(previous, current)
+         # print(G)
+         etag_set = frozenset(
+             [k for j, k in list(G.nodes.data("ETag")) if k is not None]
+         )
+         new_hash = sha256(str(etag_set.__hash__()).encode("utf-8")).hexdigest()
+         total_size = [k for j, k in list(G.nodes.data("Size")) if k is not None]
+         print(np.sum(total_size))  # 22.24 Terabytes in Henry_B._Bigelow cruises
+         print(" ")
+         print(new_hash)
+         return new_hash
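
One caveat on the hashing step in build_merkle_tree: it feeds Python's frozenset.__hash__() into sha256, and string hashing is randomized per interpreter process, so the resulting digest is generally not reproducible across runs. A deterministic alternative sketch hashes the sorted ETags directly; combine_etags is a hypothetical helper, not part of this package:

from hashlib import sha256

def combine_etags(etags):
    # Stable across processes: hash the sorted ETag strings themselves rather than hash(frozenset(...)).
    digest = sha256()
    for etag in sorted(etags):
        digest.update(etag.encode("utf-8"))
    return digest.hexdigest()

# combine_etags(["9bb58f26192e4ba00f01e2e7b136bbd8", "0cc175b9c0f1b6a831c399e269772661"])
# returns the same hex digest on every run.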
water_column_sonar_processing/model/__init__.py
@@ -0,0 +1,3 @@
+ from .zarr_manager import ZarrManager
+
+ __all__ = ["ZarrManager"]
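
The new model/__init__.py simply re-exports ZarrManager, so downstream code can import it from the subpackage rather than the module. A small sketch; both paths resolve to the same class:

from water_column_sonar_processing.model import ZarrManager
from water_column_sonar_processing.model.zarr_manager import ZarrManager as ZarrManagerDirect

assert ZarrManager is ZarrManagerDirect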