water-column-sonar-processing 0.0.6__py3-none-any.whl → 26.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. water_column_sonar_processing/__init__.py +2 -5
  2. water_column_sonar_processing/aws/__init__.py +2 -2
  3. water_column_sonar_processing/aws/dynamodb_manager.py +257 -72
  4. water_column_sonar_processing/aws/s3_manager.py +184 -112
  5. water_column_sonar_processing/aws/s3fs_manager.py +29 -33
  6. water_column_sonar_processing/aws/sqs_manager.py +1 -1
  7. water_column_sonar_processing/cruise/create_empty_zarr_store.py +38 -97
  8. water_column_sonar_processing/cruise/datatree_manager.py +21 -0
  9. water_column_sonar_processing/cruise/resample_regrid.py +144 -129
  10. water_column_sonar_processing/geometry/__init__.py +10 -2
  11. water_column_sonar_processing/geometry/elevation_manager.py +111 -0
  12. water_column_sonar_processing/geometry/geometry_manager.py +60 -44
  13. water_column_sonar_processing/geometry/line_simplification.py +176 -0
  14. water_column_sonar_processing/geometry/pmtile_generation.py +242 -51
  15. water_column_sonar_processing/geometry/spatiotemporal.py +106 -0
  16. water_column_sonar_processing/index/index_manager.py +157 -27
  17. water_column_sonar_processing/model/zarr_manager.py +663 -258
  18. water_column_sonar_processing/processing/__init__.py +4 -0
  19. water_column_sonar_processing/processing/raw_to_netcdf.py +320 -0
  20. water_column_sonar_processing/processing/raw_to_zarr.py +341 -0
  21. water_column_sonar_processing/utility/__init__.py +9 -2
  22. water_column_sonar_processing/utility/cleaner.py +1 -0
  23. water_column_sonar_processing/utility/constants.py +69 -14
  24. water_column_sonar_processing/utility/pipeline_status.py +11 -15
  25. water_column_sonar_processing/utility/timestamp.py +3 -4
  26. water_column_sonar_processing-26.1.9.dist-info/METADATA +239 -0
  27. water_column_sonar_processing-26.1.9.dist-info/RECORD +34 -0
  28. {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-26.1.9.dist-info}/WHEEL +1 -1
  29. {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-26.1.9.dist-info/licenses}/LICENSE +1 -1
  30. water_column_sonar_processing/geometry/geometry_simplification.py +0 -82
  31. water_column_sonar_processing/process.py +0 -147
  32. water_column_sonar_processing-0.0.6.dist-info/METADATA +0 -123
  33. water_column_sonar_processing-0.0.6.dist-info/RECORD +0 -29
  34. {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-26.1.9.dist-info}/top_level.txt +0 -0
@@ -1,27 +1,33 @@
 import os
 import re
-import pandas as pd
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor
-from concurrent.futures import as_completed
-from water_column_sonar_processing.aws.s3_manager import S3Manager
+
+# import networkx as nx
+import pandas as pd
+
+from water_column_sonar_processing.aws import S3Manager
+
+MAX_POOL_CONNECTIONS = 64
+MAX_CONCURRENCY = 64
+MAX_WORKERS = 64
+GB = 1024**3
 
 
 class IndexManager:
+    # TODO: index into dynamodb instead of csv files
 
     def __init__(self, input_bucket_name, calibration_bucket, calibration_key):
         self.input_bucket_name = input_bucket_name
         self.calibration_bucket = calibration_bucket
-        self.calibration_key = calibration_key
-        self.s3_manager = S3Manager()
+        self.calibration_key = calibration_key  # TODO: make optional?
+        self.s3_manager = S3Manager()  # TODO: make anonymous?
 
     #################################################################
-
     def list_ships(
         self,
         prefix="data/raw/",
     ):
-        # s3_client = self.s3_manager.s3_client
         page_iterator = self.s3_manager.paginator.paginate(
             Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
         )
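The new module-level constants suggest tuning for highly parallel S3 access. A minimal sketch of how constants like these are typically wired into boto3 follows; `S3Manager`'s internals are not shown in this diff, so the wiring below is an assumption, not the package's actual code:

```python
# Hypothetical illustration only -- S3Manager is opaque in this diff.
import boto3
from boto3.s3.transfer import TransferConfig
from botocore.config import Config

MAX_POOL_CONNECTIONS = 64  # urllib3 connection pool size per client
MAX_CONCURRENCY = 64       # parallel threads per multipart transfer
GB = 1024**3

client = boto3.client(
    "s3",
    config=Config(max_pool_connections=MAX_POOL_CONNECTIONS),
)
transfer_config = TransferConfig(
    multipart_threshold=1 * GB,   # switch to multipart above 1 GiB
    max_concurrency=MAX_CONCURRENCY,
)
```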
@@ -52,6 +58,9 @@ class IndexManager:
         self,
         cruise_prefixes,
     ):
+        """
+        This returns a list of ek60 prefixed cruises.
+        """
         cruise_sensors = []  # includes all sensor types
         for cruise_prefix in cruise_prefixes:
             page_iterator = self.s3_manager.paginator.paginate(
@@ -69,6 +78,7 @@
         cruise_name,
         sensor_name,
     ):
+        # Gets all raw files for a cruise under the given prefix
         prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
         page_iterator = self.s3_manager.paginator.paginate(
             Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
@@ -79,6 +89,62 @@
                 all_files.extend([i["Key"] for i in page["Contents"]])
         return [i for i in all_files if i.endswith(".raw")]
 
+    def get_first_raw_file(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+    ):
+        # Same as above but only needs to get the first raw file
+        # because we are only interested in the first datagram of one file
+        # TODO: "dataset?"
+        prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
+        # page_iterator = self.s3_manager.paginator.paginate(
+        #     Bucket=self.input_bucket_name,
+        #     Prefix=prefix,
+        #     Delimiter="/",
+        #     PaginationConfig={ 'MaxItems': 5 }
+        # )  # TODO: this can create a problem if there is a non raw file returned first
+        ### filter with JMESPath expressions ###
+        page_iterator = self.s3_manager.paginator.paginate(
+            Bucket=self.input_bucket_name,
+            Prefix=prefix,
+            Delimiter="/",
+        )
+        # page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
+        page_iterator = page_iterator.search(
+            expression="Contents[?contains(Key, '.raw')] "
+        )
+        for res in page_iterator:
+            if "Key" in res:
+                return res["Key"]
+        return None
+        # else raise exception?
+
+    # DSJ0604-D20060406-T050022.bot 2kB == 2152 'Size'
+
+    def get_files_under_size(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+    ):
+        # THIS isn't used, just playing with JMES paths spec
+        prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"
+        ### filter with JMESPath expressions ###
+        page_iterator = self.s3_manager.paginator.paginate(
+            Bucket=self.input_bucket_name,
+            Prefix=prefix,
+            Delimiter="/",
+        )
+        page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
+        all_files = []
+        for page in page_iterator:
+            if "Contents" in page.keys():
+                all_files.extend([i["Key"] for i in page["Contents"]])
+        return [i for i in all_files if i.endswith(".raw")]
+
+    #################################################################
     def get_raw_files_csv(
         self,
         ship_name,
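The `get_first_raw_file` addition relies on boto3's JMESPath filtering: `PageIterator.search()` evaluates the expression against each page and yields matching elements lazily, so iteration can stop at the first `.raw` key without listing the whole prefix. A self-contained sketch of the same pattern (bucket and prefix are placeholders):

```python
# Minimal sketch of boto3's JMESPath paginator filtering.
import boto3

paginator = boto3.client("s3").get_paginator("list_objects_v2")
pages = paginator.paginate(Bucket="example-bucket", Prefix="data/raw/")

# Yields each object dict whose Key contains ".raw", lazily across pages.
for obj in pages.search("Contents[?contains(Key, '.raw')]"):
    if obj:  # search() can yield None for pages with no matches
        print(obj["Key"], obj["Size"])
        break  # first match only, mirroring get_first_raw_file
```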
@@ -101,12 +167,34 @@
         df.to_csv(f"{ship_name}_{cruise_name}.csv", index=False, header=False, sep=" ")
         print("done")
 
+    def get_raw_files_list(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+    ):
+        # gets all raw files in cruise and returns a list of dicts
+        raw_files = self.get_raw_files(
+            ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
+        )
+        files_list = [
+            {
+                "ship_name": ship_name,
+                "cruise_name": cruise_name,
+                "sensor_name": sensor_name,
+                "file_name": os.path.basename(raw_file),
+            }
+            for raw_file in raw_files
+        ]
+        return files_list
+
     #################################################################
-    def get_subset_ek60_prefix(self, df: pd.DataFrame) -> pd.DataFrame:
+    @staticmethod
+    def get_subset_ek60_prefix(df: pd.DataFrame) -> pd.DataFrame:  # TODO: is this used?
         # Returns all objects with 'EK60' in prefix of file path
-        # Note that this can include 'EK80' data that are false-positives
+        # Note that this can include 'EK80' dataset that are false-positives
         # in dataframe with ['key', 'filename', 'ship', 'cruise', 'sensor', 'size', 'date', 'datagram']
-        print("getting subset of ek60 data by prefix")
+        print("getting subset of ek60 dataset by prefix")
         objects = []
         for row in df.itertuples():
             row_split = row[1].split(os.sep)
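The class-level TODO proposes indexing into DynamoDB instead of CSV files, and the dicts produced by `get_raw_files_list` are already shaped like items for such a table. A hypothetical sketch of that step; the table name and key schema are assumptions, not part of this package:

```python
# Hypothetical illustration of the "index into dynamodb" TODO.
import boto3

def index_files(files_list, table_name="water-column-sonar-index"):
    table = boto3.resource("dynamodb").Table(table_name)
    # batch_writer buffers writes and retries unprocessed items automatically
    with table.batch_writer() as batch:
        for item in files_list:
            batch.put_item(Item=item)
```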
@@ -119,13 +207,13 @@
                 2:5
             ]  # 'Okeanos_Explorer', 'EX1608', 'EK60'
             if (
-                re.search("[D](\d{8})", filename) is not None
-                and re.search("[T](\d{6})", filename) is not None
+                re.search("[D](\\d{8})", filename) is not None
+                and re.search("[T](\\d{6})", filename) is not None
             ):
                 # Parse date if possible e.g.: 'data/raw/Henry_B._Bigelow/HB1006/EK60/HBB-D20100723-T025105.raw'
                 # and 'data/raw/Henry_B._Bigelow/HB1802/EK60/D20180513-T150250.raw'
-                date_substring = re.search("[D](\d{8})", filename).group(1)
-                time_substring = re.search("[T](\d{6})", filename).group(1)
+                date_substring = re.search("[D](\\d{8})", filename).group(1)
+                time_substring = re.search("[T](\\d{6})", filename).group(1)
                 date_string = datetime.strptime(
                     f"{date_substring}{time_substring}", "%Y%m%d%H%M%S"
                 )
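The `\d` → `\\d` change silences Python's invalid-escape-sequence warning in plain string literals; an equivalent and more idiomatic fix is a raw string. A small self-contained example of the same parse, using a filename taken from the comments above:

```python
# Raw strings avoid doubling backslashes in regex patterns.
import re
from datetime import datetime

filename = "HBB-D20100723-T025105.raw"
date_substring = re.search(r"D(\d{8})", filename).group(1)  # "20100723"
time_substring = re.search(r"T(\d{6})", filename).group(1)  # "025105"
parsed = datetime.strptime(f"{date_substring}{time_substring}", "%Y%m%d%H%M%S")
```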
@@ -158,17 +246,21 @@
             obj.get(Range="bytes=3-7")["Body"].read().decode().strip("\x00")
         )
         # return [{'KEY': select_key, 'DATAGRAM': first_datagram}]
-        ### EK60 data are denoted by 'CON0' ###
+        ### EK60 dataset are denoted by 'CON0' ###
         return first_datagram
 
     #################################################################
-    def get_subset_datagrams(self, df: pd.DataFrame) -> list:
+    def get_subset_datagrams(
+        self, df: pd.DataFrame
+    ) -> list:  # TODO: is this getting used
         print("getting subset of datagrams")
-        select_keys = list(
-            df[["KEY", "CRUISE"]].drop_duplicates(subset="CRUISE")["KEY"].values
+        select_keys = (
+            df[["KEY", "CRUISE"]]
+            .drop_duplicates(subset="CRUISE")["KEY"]
+            .values.tolist()
         )
         all_datagrams = []
-        with ThreadPoolExecutor(max_workers=self.max_pool_connections) as executor:
+        with ThreadPoolExecutor(max_workers=MAX_POOL_CONNECTIONS) as executor:
             futures = [
                 executor.submit(self.scan_datagram, select_key)
                 for select_key in select_keys
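This refactor swaps the instance attribute `self.max_pool_connections` for the new module constant and keeps the submit/as_completed fan-out: one task per key, results collected in completion order. The pattern, reduced to a self-contained sketch:

```python
# Fan-out sketch: `scan` is a stand-in for scan_datagram.
from concurrent.futures import ThreadPoolExecutor, as_completed

def scan(key):
    return {"KEY": key, "DATAGRAM": "CON0"}

keys = ["a.raw", "b.raw", "c.raw"]
results = []
with ThreadPoolExecutor(max_workers=64) as executor:
    futures = [executor.submit(scan, key) for key in keys]
    for future in as_completed(futures):  # yields as tasks finish
        results.append(future.result())
```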
@@ -180,9 +272,8 @@
         return all_datagrams
 
     #################################################################
-    def get_ek60_objects(
-        self, df: pd.DataFrame, subset_datagrams: list
-    ) -> pd.DataFrame:
+    @staticmethod
+    def get_ek60_objects(df: pd.DataFrame, subset_datagrams: list) -> pd.DataFrame:
         # for each key write datagram value to all other files in same cruise
         for subset_datagram in subset_datagrams:
             if subset_datagram["DATAGRAM"] == "CON0":
@@ -195,14 +286,14 @@
         return df.loc[df["DATAGRAM"] == "CON0"]
 
     #################################################################
-    def get_calibration_information(  # tested
+    def get_calibration_information(
         self,
     ) -> pd.DataFrame:
-        # Calibration data generated by data manager currently located here:
+        # Calibration dataset generated by dataset manager currently located here:
         # https://noaa-wcsd-pds-index.s3.amazonaws.com/calibrated_crusies.csv
         # Note: Data are either:
-        # [1] Calibrated w/ calibration data
-        # [2] Calibrated w/o calibration data
+        # [1] Calibrated w/ calibration dataset
+        # [2] Calibrated w/o calibration dataset
         # [3] uncalibrated
         response = self.s3_manager.get_object(
             bucket_name=self.calibration_bucket, key_name=self.calibration_key
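`get_calibration_information` fetches the CSV noted in the comment above. Assuming `S3Manager.get_object` returns the raw boto3 response dict (its internals are not shown in this diff), the likely parsing step looks like this hedged sketch, using the bucket and key from the commented URL:

```python
# Hedged sketch: parse an S3 CSV body into a DataFrame.
import io
import boto3
import pandas as pd

response = boto3.client("s3").get_object(
    Bucket="noaa-wcsd-pds-index", Key="calibrated_crusies.csv"
)
calibration_df = pd.read_csv(io.BytesIO(response["Body"].read()))
```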
@@ -249,3 +340,42 @@
         # end_time = datetime.now()  # used for benchmarking
         # print(start_time)
         # print(end_time)
+
+    # TODO: wip
+    # def build_merkle_tree(self):
+    #     G = nx.DiGraph()
+    #     # https://noaa-wcsd-pds.s3.amazonaws.com/index.html#data/raw/Henry_B._Bigelow/HB0707/
+    #     ship_name = "Henry_B._Bigelow"
+    #     cruise_name = "HB0707"
+    #     # cruise_name = "HB0805"
+    #     prefix = f"data/raw/{ship_name}/{cruise_name}/"
+    #     # prefix = f"data/raw/{ship_name}/"
+    #     page_iterator = self.s3_manager.paginator.paginate(
+    #         Bucket=self.input_bucket_name,
+    #         Prefix=prefix,
+    #     )
+    #     for page in page_iterator:
+    #         for contents in page["Contents"]:
+    #             obj_key = contents["Key"]
+    #             # https://datatracker.ietf.org/doc/html/rfc7232#section-2.3
+    #             obj_etag = contents["ETag"].split('"')[1]  # properties
+    #             obj_size = contents["Size"]
+    #             basename = os.path.basename(obj_key)
+    #             G.add_node(
+    #                 node_for_adding=basename, ETag=obj_etag, Size=obj_size, Key=obj_key
+    #             )  # TODO: add parent hash
+    #             split_path = os.path.normpath(obj_key).split(os.path.sep)
+    #             # split_path: ['dataset', 'raw', 'Henry_B._Bigelow', 'HB0707', 'EK60', 'D20070712-T004447.raw']
+    #             for previous, current in zip(split_path, split_path[1:]):
+    #                 if not G.has_edge(previous, current):
+    #                     G.add_edge(previous, current)
+    #     # print(G)
+    #     etag_set = frozenset(
+    #         [k for j, k in list(G.nodes.data("ETag")) if k is not None]
+    #     )
+    #     new_hash = sha256(str(etag_set.__hash__()).encode("utf-8")).hexdigest()
+    #     total_size = [k for j, k in list(G.nodes.data("Size")) if k is not None]
+    #     print(np.sum(total_size))  # 22.24 Terabytes in Henry_B._Bigelow cruises
+    #     print(" ")
+    #     print(new_hash)
+    #     return new_hash
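One caveat on the commented-out draft: `frozenset.__hash__()` ultimately depends on string hashes, which are salted per interpreter run (`PYTHONHASHSEED`), so the resulting digest would differ between runs. A deterministic alternative with the same intent (fingerprint a set of ETags), sketched below:

```python
# Deterministic set fingerprint: sort the ETags, then digest the joined bytes.
from hashlib import sha256

def cruise_fingerprint(etags: list[str]) -> str:
    return sha256("".join(sorted(etags)).encode("utf-8")).hexdigest()

# The same input set always yields the same fingerprint, across runs:
assert cruise_fingerprint(["abc", "def"]) == cruise_fingerprint(["def", "abc"])
```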