water-column-sonar-processing 0.0.6__py3-none-any.whl → 26.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- water_column_sonar_processing/__init__.py +2 -5
- water_column_sonar_processing/aws/__init__.py +2 -2
- water_column_sonar_processing/aws/dynamodb_manager.py +257 -72
- water_column_sonar_processing/aws/s3_manager.py +184 -112
- water_column_sonar_processing/aws/s3fs_manager.py +29 -33
- water_column_sonar_processing/aws/sqs_manager.py +1 -1
- water_column_sonar_processing/cruise/create_empty_zarr_store.py +38 -97
- water_column_sonar_processing/cruise/datatree_manager.py +21 -0
- water_column_sonar_processing/cruise/resample_regrid.py +144 -129
- water_column_sonar_processing/geometry/__init__.py +10 -2
- water_column_sonar_processing/geometry/elevation_manager.py +111 -0
- water_column_sonar_processing/geometry/geometry_manager.py +60 -44
- water_column_sonar_processing/geometry/line_simplification.py +176 -0
- water_column_sonar_processing/geometry/pmtile_generation.py +242 -51
- water_column_sonar_processing/geometry/spatiotemporal.py +106 -0
- water_column_sonar_processing/index/index_manager.py +157 -27
- water_column_sonar_processing/model/zarr_manager.py +663 -258
- water_column_sonar_processing/processing/__init__.py +4 -0
- water_column_sonar_processing/processing/raw_to_netcdf.py +320 -0
- water_column_sonar_processing/processing/raw_to_zarr.py +341 -0
- water_column_sonar_processing/utility/__init__.py +9 -2
- water_column_sonar_processing/utility/cleaner.py +1 -0
- water_column_sonar_processing/utility/constants.py +69 -14
- water_column_sonar_processing/utility/pipeline_status.py +11 -15
- water_column_sonar_processing/utility/timestamp.py +3 -4
- water_column_sonar_processing-26.1.9.dist-info/METADATA +239 -0
- water_column_sonar_processing-26.1.9.dist-info/RECORD +34 -0
- {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-26.1.9.dist-info}/WHEEL +1 -1
- {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-26.1.9.dist-info/licenses}/LICENSE +1 -1
- water_column_sonar_processing/geometry/geometry_simplification.py +0 -82
- water_column_sonar_processing/process.py +0 -147
- water_column_sonar_processing-0.0.6.dist-info/METADATA +0 -123
- water_column_sonar_processing-0.0.6.dist-info/RECORD +0 -29
- {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-26.1.9.dist-info}/top_level.txt +0 -0
water_column_sonar_processing/index/index_manager.py

@@ -1,27 +1,33 @@
 import os
 import re
-import
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
-
-
-
+
+# import networkx as nx
+import pandas as pd
+
+from water_column_sonar_processing.aws import S3Manager
+
+MAX_POOL_CONNECTIONS = 64
+MAX_CONCURRENCY = 64
+MAX_WORKERS = 64
+GB = 1024**3
 
 
 class IndexManager:
+    # TODO: index into dynamodb instead of csv files
 
     def __init__(self, input_bucket_name, calibration_bucket, calibration_key):
         self.input_bucket_name = input_bucket_name
         self.calibration_bucket = calibration_bucket
-        self.calibration_key = calibration_key
-        self.s3_manager = S3Manager()
+        self.calibration_key = calibration_key  # TODO: make optional?
+        self.s3_manager = S3Manager()  # TODO: make anonymous?
 
     #################################################################
-
     def list_ships(
         self,
         prefix="data/raw/",
     ):
-        # s3_client = self.s3_manager.s3_client
         page_iterator = self.s3_manager.paginator.paginate(
             Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
         )
@@ -52,6 +58,9 @@ class IndexManager:
         self,
         cruise_prefixes,
     ):
+        """
+        This returns a list of ek60 prefixed cruises.
+        """
         cruise_sensors = []  # includes all sensor types
         for cruise_prefix in cruise_prefixes:
             page_iterator = self.s3_manager.paginator.paginate(
@@ -69,6 +78,7 @@ class IndexManager:
         cruise_name,
         sensor_name,
     ):
+        # Gets all raw files for a cruise under the given prefix
         prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
         page_iterator = self.s3_manager.paginator.paginate(
             Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
@@ -79,6 +89,62 @@ class IndexManager:
                 all_files.extend([i["Key"] for i in page["Contents"]])
         return [i for i in all_files if i.endswith(".raw")]
 
+    def get_first_raw_file(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+    ):
+        # Same as above but only needs to get the first raw file
+        # because we are only interested in the first datagram of one file
+        # TODO: "dataset?"
+        prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
+        # page_iterator = self.s3_manager.paginator.paginate(
+        #     Bucket=self.input_bucket_name,
+        #     Prefix=prefix,
+        #     Delimiter="/",
+        #     PaginationConfig={ 'MaxItems': 5 }
+        # )  # TODO: this can create a problem if there is a non raw file returned first
+        ### filter with JMESPath expressions ###
+        page_iterator = self.s3_manager.paginator.paginate(
+            Bucket=self.input_bucket_name,
+            Prefix=prefix,
+            Delimiter="/",
+        )
+        # page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
+        page_iterator = page_iterator.search(
+            expression="Contents[?contains(Key, '.raw')] "
+        )
+        for res in page_iterator:
+            if "Key" in res:
+                return res["Key"]
+        return None
+        # else raise exception?
+
+        # DSJ0604-D20060406-T050022.bot 2kB == 2152 'Size'
+
+    def get_files_under_size(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+    ):
+        # THIS isn't used, just playing with JMES paths spec
+        prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"
+        ### filter with JMESPath expressions ###
+        page_iterator = self.s3_manager.paginator.paginate(
+            Bucket=self.input_bucket_name,
+            Prefix=prefix,
+            Delimiter="/",
+        )
+        page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
+        all_files = []
+        for page in page_iterator:
+            if "Contents" in page.keys():
+                all_files.extend([i["Key"] for i in page["Contents"]])
+        return [i for i in all_files if i.endswith(".raw")]
+
+    #################################################################
     def get_raw_files_csv(
         self,
         ship_name,
@@ -101,12 +167,34 @@ class IndexManager:
         df.to_csv(f"{ship_name}_{cruise_name}.csv", index=False, header=False, sep=" ")
         print("done")
 
+    def get_raw_files_list(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+    ):
+        # gets all raw files in cruise and returns a list of dicts
+        raw_files = self.get_raw_files(
+            ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
+        )
+        files_list = [
+            {
+                "ship_name": ship_name,
+                "cruise_name": cruise_name,
+                "sensor_name": sensor_name,
+                "file_name": os.path.basename(raw_file),
+            }
+            for raw_file in raw_files
+        ]
+        return files_list
+
     #################################################################
-
+    @staticmethod
+    def get_subset_ek60_prefix(df: pd.DataFrame) -> pd.DataFrame:  # TODO: is this used?
         # Returns all objects with 'EK60' in prefix of file path
-        # Note that this can include 'EK80'
+        # Note that this can include 'EK80' dataset that are false-positives
         # in dataframe with ['key', 'filename', 'ship', 'cruise', 'sensor', 'size', 'date', 'datagram']
-        print("getting subset of ek60
+        print("getting subset of ek60 dataset by prefix")
         objects = []
         for row in df.itertuples():
             row_split = row[1].split(os.sep)
@@ -119,13 +207,13 @@ class IndexManager:
                     2:5
                 ]  # 'Okeanos_Explorer', 'EX1608', 'EK60'
                 if (
-                    re.search("[D](
-                    and re.search("[T](
+                    re.search("[D](\\d{8})", filename) is not None
+                    and re.search("[T](\\d{6})", filename) is not None
                 ):
                     # Parse date if possible e.g.: 'data/raw/Henry_B._Bigelow/HB1006/EK60/HBB-D20100723-T025105.raw'
                     # and 'data/raw/Henry_B._Bigelow/HB1802/EK60/D20180513-T150250.raw'
-                    date_substring = re.search("[D](
-                    time_substring = re.search("[T](
+                    date_substring = re.search("[D](\\d{8})", filename).group(1)
+                    time_substring = re.search("[T](\\d{6})", filename).group(1)
                     date_string = datetime.strptime(
                         f"{date_substring}{time_substring}", "%Y%m%d%H%M%S"
                     )
@@ -158,17 +246,21 @@ class IndexManager:
             obj.get(Range="bytes=3-7")["Body"].read().decode().strip("\x00")
         )
         # return [{'KEY': select_key, 'DATAGRAM': first_datagram}]
-        ### EK60
+        ### EK60 dataset are denoted by 'CON0' ###
         return first_datagram
 
     #################################################################
-    def get_subset_datagrams(
+    def get_subset_datagrams(
+        self, df: pd.DataFrame
+    ) -> list:  # TODO: is this getting used
         print("getting subset of datagrams")
-        select_keys =
-        df[["KEY", "CRUISE"]]
+        select_keys = (
+            df[["KEY", "CRUISE"]]
+            .drop_duplicates(subset="CRUISE")["KEY"]
+            .values.tolist()
         )
         all_datagrams = []
-        with ThreadPoolExecutor(max_workers=
+        with ThreadPoolExecutor(max_workers=MAX_POOL_CONNECTIONS) as executor:
             futures = [
                 executor.submit(self.scan_datagram, select_key)
                 for select_key in select_keys
@@ -180,9 +272,8 @@ class IndexManager:
         return all_datagrams
 
     #################################################################
-
-
-    ) -> pd.DataFrame:
+    @staticmethod
+    def get_ek60_objects(df: pd.DataFrame, subset_datagrams: list) -> pd.DataFrame:
         # for each key write datagram value to all other files in same cruise
         for subset_datagram in subset_datagrams:
             if subset_datagram["DATAGRAM"] == "CON0":
@@ -195,14 +286,14 @@ class IndexManager:
         return df.loc[df["DATAGRAM"] == "CON0"]
 
     #################################################################
-    def get_calibration_information(
+    def get_calibration_information(
         self,
     ) -> pd.DataFrame:
-        # Calibration
+        # Calibration dataset generated by dataset manager currently located here:
         # https://noaa-wcsd-pds-index.s3.amazonaws.com/calibrated_crusies.csv
         # Note: Data are either:
-        # [1] Calibrated w/ calibration
-        # [2] Calibrated w/o calibration
+        # [1] Calibrated w/ calibration dataset
+        # [2] Calibrated w/o calibration dataset
         # [3] uncalibrated
         response = self.s3_manager.get_object(
             bucket_name=self.calibration_bucket, key_name=self.calibration_key
@@ -249,3 +340,42 @@ class IndexManager:
         # end_time = datetime.now()  # used for benchmarking
         # print(start_time)
         # print(end_time)
+
+    # TODO: wip
+    # def build_merkle_tree(self):
+    #     G = nx.DiGraph()
+    #     # https://noaa-wcsd-pds.s3.amazonaws.com/index.html#data/raw/Henry_B._Bigelow/HB0707/
+    #     ship_name = "Henry_B._Bigelow"
+    #     cruise_name = "HB0707"
+    #     # cruise_name = "HB0805"
+    #     prefix = f"data/raw/{ship_name}/{cruise_name}/"
+    #     # prefix = f"data/raw/{ship_name}/"
+    #     page_iterator = self.s3_manager.paginator.paginate(
+    #         Bucket=self.input_bucket_name,
+    #         Prefix=prefix,
+    #     )
+    #     for page in page_iterator:
+    #         for contents in page["Contents"]:
+    #             obj_key = contents["Key"]
+    #             # https://datatracker.ietf.org/doc/html/rfc7232#section-2.3
+    #             obj_etag = contents["ETag"].split('"')[1]  # properties
+    #             obj_size = contents["Size"]
+    #             basename = os.path.basename(obj_key)
+    #             G.add_node(
+    #                 node_for_adding=basename, ETag=obj_etag, Size=obj_size, Key=obj_key
+    #             )  # TODO: add parent hash
+    #             split_path = os.path.normpath(obj_key).split(os.path.sep)
+    #             # split_path: ['dataset', 'raw', 'Henry_B._Bigelow', 'HB0707', 'EK60', 'D20070712-T004447.raw']
+    #             for previous, current in zip(split_path, split_path[1:]):
+    #                 if not G.has_edge(previous, current):
+    #                     G.add_edge(previous, current)
+    #     # print(G)
+    #     etag_set = frozenset(
+    #         [k for j, k in list(G.nodes.data("ETag")) if k is not None]
+    #     )
+    #     new_hash = sha256(str(etag_set.__hash__()).encode("utf-8")).hexdigest()
+    #     total_size = [k for j, k in list(G.nodes.data("Size")) if k is not None]
+    #     print(np.sum(total_size))  # 22.24 Terabytes in Henry_B._Bigelow cruises
+    #     print(" ")
+    #     print(new_hash)
+    #     return new_hash