water-column-sonar-processing 0.0.9__py3-none-any.whl → 26.1.9__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as they appear in their public registry. It is provided for informational purposes only.
Files changed (32)
  1. water_column_sonar_processing/aws/dynamodb_manager.py +138 -59
  2. water_column_sonar_processing/aws/s3_manager.py +179 -141
  3. water_column_sonar_processing/aws/s3fs_manager.py +29 -33
  4. water_column_sonar_processing/aws/sqs_manager.py +1 -1
  5. water_column_sonar_processing/cruise/create_empty_zarr_store.py +35 -96
  6. water_column_sonar_processing/cruise/datatree_manager.py +21 -0
  7. water_column_sonar_processing/cruise/resample_regrid.py +142 -127
  8. water_column_sonar_processing/geometry/__init__.py +10 -2
  9. water_column_sonar_processing/geometry/elevation_manager.py +111 -0
  10. water_column_sonar_processing/geometry/geometry_manager.py +50 -49
  11. water_column_sonar_processing/geometry/line_simplification.py +176 -0
  12. water_column_sonar_processing/geometry/pmtile_generation.py +227 -223
  13. water_column_sonar_processing/geometry/spatiotemporal.py +106 -0
  14. water_column_sonar_processing/index/index_manager.py +151 -33
  15. water_column_sonar_processing/model/zarr_manager.py +665 -262
  16. water_column_sonar_processing/processing/__init__.py +3 -3
  17. water_column_sonar_processing/processing/raw_to_netcdf.py +320 -0
  18. water_column_sonar_processing/processing/raw_to_zarr.py +206 -214
  19. water_column_sonar_processing/utility/__init__.py +9 -2
  20. water_column_sonar_processing/utility/constants.py +69 -18
  21. water_column_sonar_processing/utility/pipeline_status.py +11 -15
  22. water_column_sonar_processing/utility/timestamp.py +3 -4
  23. water_column_sonar_processing-26.1.9.dist-info/METADATA +239 -0
  24. water_column_sonar_processing-26.1.9.dist-info/RECORD +34 -0
  25. {water_column_sonar_processing-0.0.9.dist-info → water_column_sonar_processing-26.1.9.dist-info}/WHEEL +1 -1
  26. {water_column_sonar_processing-0.0.9.dist-info → water_column_sonar_processing-26.1.9.dist-info/licenses}/LICENSE +1 -1
  27. water_column_sonar_processing/geometry/geometry_simplification.py +0 -82
  28. water_column_sonar_processing/process.py +0 -147
  29. water_column_sonar_processing/processing/cruise_sampler.py +0 -342
  30. water_column_sonar_processing-0.0.9.dist-info/METADATA +0 -134
  31. water_column_sonar_processing-0.0.9.dist-info/RECORD +0 -32
  32. {water_column_sonar_processing-0.0.9.dist-info → water_column_sonar_processing-26.1.9.dist-info}/top_level.txt +0 -0
@@ -1,19 +1,27 @@
  import os
  import re
- import pandas as pd
+ from concurrent.futures import ThreadPoolExecutor, as_completed
  from datetime import datetime
- from concurrent.futures import ThreadPoolExecutor
- from concurrent.futures import as_completed
+
+ # import networkx as nx
+ import pandas as pd
+
  from water_column_sonar_processing.aws import S3Manager
 
+ MAX_POOL_CONNECTIONS = 64
+ MAX_CONCURRENCY = 64
+ MAX_WORKERS = 64
+ GB = 1024**3
+
 
  class IndexManager:
+     # TODO: index into dynamodb instead of csv files
 
      def __init__(self, input_bucket_name, calibration_bucket, calibration_key):
          self.input_bucket_name = input_bucket_name
          self.calibration_bucket = calibration_bucket
-         self.calibration_key = calibration_key
-         self.s3_manager = S3Manager()
+         self.calibration_key = calibration_key  # TODO: make optional?
+         self.s3_manager = S3Manager()  # TODO: make anonymous?
 
      #################################################################
      def list_ships(
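The new module-level constants replace a per-instance setting (the old code further down referenced `self.max_pool_connections`). This hunk does not show where they are consumed; a minimal sketch, assuming the usual boto3 pattern of sizing the client's connection pool to match the thread pool:

    import boto3
    from botocore.config import Config

    MAX_POOL_CONNECTIONS = 64  # mirrors the constant added above

    # A pool of 64 connections lets a ThreadPoolExecutor with up to
    # 64 workers share one client without pool-exhaustion warnings.
    s3_client = boto3.client(
        "s3",
        config=Config(max_pool_connections=MAX_POOL_CONNECTIONS),
    )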
@@ -50,6 +58,9 @@ class IndexManager:
          self,
          cruise_prefixes,
      ):
+         """
+         This returns a list of ek60 prefixed cruises.
+         """
          cruise_sensors = []  # includes all sensor types
          for cruise_prefix in cruise_prefixes:
              page_iterator = self.s3_manager.paginator.paginate(
@@ -67,6 +78,7 @@ class IndexManager:
          cruise_name,
          sensor_name,
      ):
+         # Gets all raw files for a cruise under the given prefix
          prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
          page_iterator = self.s3_manager.paginator.paginate(
              Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
@@ -77,6 +89,61 @@ class IndexManager:
                  all_files.extend([i["Key"] for i in page["Contents"]])
          return [i for i in all_files if i.endswith(".raw")]
 
+     def get_first_raw_file(
+         self,
+         ship_name,
+         cruise_name,
+         sensor_name,
+     ):
+         # Same as above but only needs to get the first raw file
+         # because we are only interested in the first datagram of one file
+         # TODO: "dataset?"
+         prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
+         # page_iterator = self.s3_manager.paginator.paginate(
+         #     Bucket=self.input_bucket_name,
+         #     Prefix=prefix,
+         #     Delimiter="/",
+         #     PaginationConfig={ 'MaxItems': 5 }
+         # )  # TODO: this can create a problem if there is a non raw file returned first
+         ### filter with JMESPath expressions ###
+         page_iterator = self.s3_manager.paginator.paginate(
+             Bucket=self.input_bucket_name,
+             Prefix=prefix,
+             Delimiter="/",
+         )
+         # page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
+         page_iterator = page_iterator.search(
+             expression="Contents[?contains(Key, '.raw')] "
+         )
+         for res in page_iterator:
+             if "Key" in res:
+                 return res["Key"]
+         return None
+         # else raise exception?
+
+         # DSJ0604-D20060406-T050022.bot 2kB == 2152 'Size'
+
+     def get_files_under_size(
+         self,
+         ship_name,
+         cruise_name,
+         sensor_name,
+     ):
+         # THIS isn't used, just playing with JMES paths spec
+         prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"
+         ### filter with JMESPath expressions ###
+         page_iterator = self.s3_manager.paginator.paginate(
+             Bucket=self.input_bucket_name,
+             Prefix=prefix,
+             Delimiter="/",
+         )
+         page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
+         all_files = []
+         for page in page_iterator:
+             if "Contents" in page.keys():
+                 all_files.extend([i["Key"] for i in page["Contents"]])
+         return [i for i in all_files if i.endswith(".raw")]
+
      #################################################################
      def get_raw_files_csv(
          self,
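The new `get_first_raw_file` leans on botocore's `PageIterator.search()`, which applies a JMESPath expression across pages and yields matching elements lazily, so iteration can stop at the first hit instead of listing the whole prefix. A self-contained sketch of the same pattern against a plain boto3 client (bucket and prefix names here are illustrative, not from the package):

    import boto3

    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket="example-bucket", Prefix="data/raw/")

    # search() flattens the pages and filters them with JMESPath.
    for obj in pages.search("Contents[?contains(Key, '.raw')]"):
        if obj:  # pages without Contents yield None
            print(obj["Key"])  # first matching key
            break

Note that `contains(Key, '.raw')` matches '.raw' anywhere in the key; JMESPath's `ends_with(Key, '.raw')` would be the stricter suffix test.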
@@ -85,9 +152,7 @@ class IndexManager:
          sensor_name,
      ):
          raw_files = self.get_raw_files(
-             ship_name=ship_name,
-             cruise_name=cruise_name,
-             sensor_name=sensor_name
+             ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
          )
          files_list = [
              {
@@ -102,15 +167,34 @@ class IndexManager:
          df.to_csv(f"{ship_name}_{cruise_name}.csv", index=False, header=False, sep=" ")
          print("done")
 
-     #################################################################
-     def get_subset_ek60_prefix(  # TODO: is this used?
+     def get_raw_files_list(
          self,
-         df: pd.DataFrame
-     ) -> pd.DataFrame:
+         ship_name,
+         cruise_name,
+         sensor_name,
+     ):
+         # gets all raw files in cruise and returns a list of dicts
+         raw_files = self.get_raw_files(
+             ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
+         )
+         files_list = [
+             {
+                 "ship_name": ship_name,
+                 "cruise_name": cruise_name,
+                 "sensor_name": sensor_name,
+                 "file_name": os.path.basename(raw_file),
+             }
+             for raw_file in raw_files
+         ]
+         return files_list
+
+     #################################################################
+     @staticmethod
+     def get_subset_ek60_prefix(df: pd.DataFrame) -> pd.DataFrame:  # TODO: is this used?
          # Returns all objects with 'EK60' in prefix of file path
-         # Note that this can include 'EK80' data that are false-positives
+         # Note that this can include 'EK80' dataset that are false-positives
          # in dataframe with ['key', 'filename', 'ship', 'cruise', 'sensor', 'size', 'date', 'datagram']
-         print("getting subset of ek60 data by prefix")
+         print("getting subset of ek60 dataset by prefix")
          objects = []
          for row in df.itertuples():
              row_split = row[1].split(os.sep)
@@ -150,10 +234,7 @@ class IndexManager:
          return pd.DataFrame(objects)
 
      #################################################################
-     def scan_datagram(
-         self,
-         select_key: str
-     ) -> list:
+     def scan_datagram(self, select_key: str) -> list:
          # Reads the first 8 bytes of S3 file. Used to determine if ek60 or ek80
          # Note: uses boto3 session instead of boto3 client: https://github.com/boto/boto3/issues/801
          # select_key = 'data/raw/Albatross_Iv/AL0403/EK60/L0005-D20040302-T200108-EK60.raw'
@@ -165,20 +246,21 @@ class IndexManager:
              obj.get(Range="bytes=3-7")["Body"].read().decode().strip("\x00")
          )
          # return [{'KEY': select_key, 'DATAGRAM': first_datagram}]
-         ### EK60 data are denoted by 'CON0' ###
+         ### EK60 dataset are denoted by 'CON0' ###
          return first_datagram
 
      #################################################################
      def get_subset_datagrams(
-         self,
-         df: pd.DataFrame
-     ) -> list:
+         self, df: pd.DataFrame
+     ) -> list:  # TODO: is this getting used
          print("getting subset of datagrams")
-         select_keys = list(
-             df[["KEY", "CRUISE"]].drop_duplicates(subset="CRUISE")["KEY"].values
+         select_keys = (
+             df[["KEY", "CRUISE"]]
+             .drop_duplicates(subset="CRUISE")["KEY"]
+             .values.tolist()
          )
          all_datagrams = []
-         with ThreadPoolExecutor(max_workers=self.max_pool_connections) as executor:
+         with ThreadPoolExecutor(max_workers=MAX_POOL_CONNECTIONS) as executor:
              futures = [
                  executor.submit(self.scan_datagram, select_key)
                  for select_key in select_keys
190
272
  return all_datagrams
191
273
 
192
274
  #################################################################
193
- def get_ek60_objects(
194
- self,
195
- df: pd.DataFrame,
196
- subset_datagrams: list
197
- ) -> pd.DataFrame:
275
+ @staticmethod
276
+ def get_ek60_objects(df: pd.DataFrame, subset_datagrams: list) -> pd.DataFrame:
198
277
  # for each key write datagram value to all other files in same cruise
199
278
  for subset_datagram in subset_datagrams:
200
279
  if subset_datagram["DATAGRAM"] == "CON0":
@@ -210,11 +289,11 @@ class IndexManager:
210
289
  def get_calibration_information(
211
290
  self,
212
291
  ) -> pd.DataFrame:
213
- # Calibration data generated by data manager currently located here:
292
+ # Calibration dataset generated by dataset manager currently located here:
214
293
  # https://noaa-wcsd-pds-index.s3.amazonaws.com/calibrated_crusies.csv
215
294
  # Note: Data are either:
216
- # [1] Calibrated w/ calibration data
217
- # [2] Calibrated w/o calibration data
295
+ # [1] Calibrated w/ calibration dataset
296
+ # [2] Calibrated w/o calibration dataset
218
297
  # [3] uncalibrated
219
298
  response = self.s3_manager.get_object(
220
299
  bucket_name=self.calibration_bucket, key_name=self.calibration_key
@@ -261,3 +340,42 @@ class IndexManager:
261
340
  # end_time = datetime.now() # used for benchmarking
262
341
  # print(start_time)
263
342
  # print(end_time)
343
+
344
+ # TODO: wip
345
+ # def build_merkle_tree(self):
346
+ # G = nx.DiGraph()
347
+ # # https://noaa-wcsd-pds.s3.amazonaws.com/index.html#data/raw/Henry_B._Bigelow/HB0707/
348
+ # ship_name = "Henry_B._Bigelow"
349
+ # cruise_name = "HB0707"
350
+ # # cruise_name = "HB0805"
351
+ # prefix = f"data/raw/{ship_name}/{cruise_name}/"
352
+ # # prefix = f"data/raw/{ship_name}/"
353
+ # page_iterator = self.s3_manager.paginator.paginate(
354
+ # Bucket=self.input_bucket_name,
355
+ # Prefix=prefix,
356
+ # )
357
+ # for page in page_iterator:
358
+ # for contents in page["Contents"]:
359
+ # obj_key = contents["Key"]
360
+ # # https://datatracker.ietf.org/doc/html/rfc7232#section-2.3
361
+ # obj_etag = contents["ETag"].split('"')[1] # properties
362
+ # obj_size = contents["Size"]
363
+ # basename = os.path.basename(obj_key)
364
+ # G.add_node(
365
+ # node_for_adding=basename, ETag=obj_etag, Size=obj_size, Key=obj_key
366
+ # ) # TODO: add parent hash
367
+ # split_path = os.path.normpath(obj_key).split(os.path.sep)
368
+ # # split_path: ['dataset', 'raw', 'Henry_B._Bigelow', 'HB0707', 'EK60', 'D20070712-T004447.raw']
369
+ # for previous, current in zip(split_path, split_path[1:]):
370
+ # if not G.has_edge(previous, current):
371
+ # G.add_edge(previous, current)
372
+ # # print(G)
373
+ # etag_set = frozenset(
374
+ # [k for j, k in list(G.nodes.data("ETag")) if k is not None]
375
+ # )
376
+ # new_hash = sha256(str(etag_set.__hash__()).encode("utf-8")).hexdigest()
377
+ # total_size = [k for j, k in list(G.nodes.data("Size")) if k is not None]
378
+ # print(np.sum(total_size)) # 22.24 Terabytes in Henry_B._Bigelow cruises
379
+ # print(" ")
380
+ # print(new_hash)
381
+ # return new_hash
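One caveat on the commented-out `build_merkle_tree` work in progress: it derives the cruise-level digest from `frozenset.__hash__()`, and because Python randomizes string hashes per process, that digest would differ from run to run (and, as far as this diff shows, `sha256` and `np` are never imported). A deterministic alternative, offered here as a sketch rather than anything in the package, is to sort the ETags and hash their concatenation:

    from hashlib import sha256

    def cruise_level_hash(etags):
        # Sorting makes the digest independent of listing order and
        # reproducible across processes, unlike frozenset.__hash__().
        joined = "\n".join(sorted(etags)).encode("utf-8")
        return sha256(joined).hexdigest()

    print(cruise_level_hash(["0a1b2c", "3d4e5f"]))  # illustrative ETags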