water-column-sonar-processing 0.0.9__py3-none-any.whl → 26.1.9__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between versions as they appear in the public registry.
- water_column_sonar_processing/aws/dynamodb_manager.py +138 -59
- water_column_sonar_processing/aws/s3_manager.py +179 -141
- water_column_sonar_processing/aws/s3fs_manager.py +29 -33
- water_column_sonar_processing/aws/sqs_manager.py +1 -1
- water_column_sonar_processing/cruise/create_empty_zarr_store.py +35 -96
- water_column_sonar_processing/cruise/datatree_manager.py +21 -0
- water_column_sonar_processing/cruise/resample_regrid.py +142 -127
- water_column_sonar_processing/geometry/__init__.py +10 -2
- water_column_sonar_processing/geometry/elevation_manager.py +111 -0
- water_column_sonar_processing/geometry/geometry_manager.py +50 -49
- water_column_sonar_processing/geometry/line_simplification.py +176 -0
- water_column_sonar_processing/geometry/pmtile_generation.py +227 -223
- water_column_sonar_processing/geometry/spatiotemporal.py +106 -0
- water_column_sonar_processing/index/index_manager.py +151 -33
- water_column_sonar_processing/model/zarr_manager.py +665 -262
- water_column_sonar_processing/processing/__init__.py +3 -3
- water_column_sonar_processing/processing/raw_to_netcdf.py +320 -0
- water_column_sonar_processing/processing/raw_to_zarr.py +206 -214
- water_column_sonar_processing/utility/__init__.py +9 -2
- water_column_sonar_processing/utility/constants.py +69 -18
- water_column_sonar_processing/utility/pipeline_status.py +11 -15
- water_column_sonar_processing/utility/timestamp.py +3 -4
- water_column_sonar_processing-26.1.9.dist-info/METADATA +239 -0
- water_column_sonar_processing-26.1.9.dist-info/RECORD +34 -0
- {water_column_sonar_processing-0.0.9.dist-info → water_column_sonar_processing-26.1.9.dist-info}/WHEEL +1 -1
- {water_column_sonar_processing-0.0.9.dist-info → water_column_sonar_processing-26.1.9.dist-info/licenses}/LICENSE +1 -1
- water_column_sonar_processing/geometry/geometry_simplification.py +0 -82
- water_column_sonar_processing/process.py +0 -147
- water_column_sonar_processing/processing/cruise_sampler.py +0 -342
- water_column_sonar_processing-0.0.9.dist-info/METADATA +0 -134
- water_column_sonar_processing-0.0.9.dist-info/RECORD +0 -32
- {water_column_sonar_processing-0.0.9.dist-info → water_column_sonar_processing-26.1.9.dist-info}/top_level.txt +0 -0
water_column_sonar_processing/index/index_manager.py
@@ -1,19 +1,27 @@
 import os
 import re
-import
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
-
-
+
+# import networkx as nx
+import pandas as pd
+
 from water_column_sonar_processing.aws import S3Manager
 
+MAX_POOL_CONNECTIONS = 64
+MAX_CONCURRENCY = 64
+MAX_WORKERS = 64
+GB = 1024**3
+
 
 class IndexManager:
+    # TODO: index into dynamodb instead of csv files
 
     def __init__(self, input_bucket_name, calibration_bucket, calibration_key):
         self.input_bucket_name = input_bucket_name
         self.calibration_bucket = calibration_bucket
-        self.calibration_key = calibration_key
-        self.s3_manager = S3Manager()
+        self.calibration_key = calibration_key  # TODO: make optional?
+        self.s3_manager = S3Manager()  # TODO: make anonymous?
 
     #################################################################
     def list_ships(
@@ -50,6 +58,9 @@ class IndexManager:
         self,
         cruise_prefixes,
     ):
+        """
+        This returns a list of ek60 prefixed cruises.
+        """
         cruise_sensors = []  # includes all sensor types
         for cruise_prefix in cruise_prefixes:
             page_iterator = self.s3_manager.paginator.paginate(
@@ -67,6 +78,7 @@ class IndexManager:
         cruise_name,
         sensor_name,
     ):
+        # Gets all raw files for a cruise under the given prefix
         prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
         page_iterator = self.s3_manager.paginator.paginate(
             Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
@@ -77,6 +89,61 @@ class IndexManager:
                 all_files.extend([i["Key"] for i in page["Contents"]])
         return [i for i in all_files if i.endswith(".raw")]
 
+    def get_first_raw_file(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+    ):
+        # Same as above but only needs to get the first raw file
+        # because we are only interested in the first datagram of one file
+        # TODO: "dataset?"
+        prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
+        # page_iterator = self.s3_manager.paginator.paginate(
+        #     Bucket=self.input_bucket_name,
+        #     Prefix=prefix,
+        #     Delimiter="/",
+        #     PaginationConfig={ 'MaxItems': 5 }
+        # )  # TODO: this can create a problem if there is a non raw file returned first
+        ### filter with JMESPath expressions ###
+        page_iterator = self.s3_manager.paginator.paginate(
+            Bucket=self.input_bucket_name,
+            Prefix=prefix,
+            Delimiter="/",
+        )
+        # page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
+        page_iterator = page_iterator.search(
+            expression="Contents[?contains(Key, '.raw')] "
+        )
+        for res in page_iterator:
+            if "Key" in res:
+                return res["Key"]
+        return None
+        # else raise exception?
+
+    # DSJ0604-D20060406-T050022.bot 2kB == 2152 'Size'
+
+    def get_files_under_size(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+    ):
+        # THIS isn't used, just playing with JMES paths spec
+        prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"
+        ### filter with JMESPath expressions ###
+        page_iterator = self.s3_manager.paginator.paginate(
+            Bucket=self.input_bucket_name,
+            Prefix=prefix,
+            Delimiter="/",
+        )
+        page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
+        all_files = []
+        for page in page_iterator:
+            if "Contents" in page.keys():
+                all_files.extend([i["Key"] for i in page["Contents"]])
+        return [i for i in all_files if i.endswith(".raw")]
+
     #################################################################
     def get_raw_files_csv(
         self,
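The new get_first_raw_file and get_files_under_size methods filter the listing with JMESPath expressions via botocore's PageIterator.search, which applies the expression to every page and yields matching elements one at a time. A minimal standalone sketch of the same pattern, assuming a hypothetical bucket and prefix:

```python
import boto3

s3_client = boto3.client("s3")
paginator = s3_client.get_paginator("list_objects_v2")
page_iterator = paginator.paginate(
    Bucket="example-bucket",              # hypothetical bucket
    Prefix="data/raw/Ship/Cruise/EK60/",  # hypothetical prefix
    Delimiter="/",
)

# search() yields individual matching elements rather than whole pages;
# a page with no "Contents" yields None, hence the guard below.
for obj in page_iterator.search("Contents[?contains(Key, '.raw')]"):
    if obj:
        print(obj["Key"], obj["Size"])
```

The commented-out size filter works the same way; in JMESPath, the number in Contents[?Size < `2200`][] is a numeric literal and must be written in backticks.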
@@ -85,9 +152,7 @@ class IndexManager:
         sensor_name,
     ):
         raw_files = self.get_raw_files(
-            ship_name=ship_name,
-            cruise_name=cruise_name,
-            sensor_name=sensor_name
+            ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
         )
         files_list = [
             {
@@ -102,15 +167,34 @@ class IndexManager:
         df.to_csv(f"{ship_name}_{cruise_name}.csv", index=False, header=False, sep=" ")
         print("done")
 
-
-    def get_subset_ek60_prefix(  # TODO: is this used?
+    def get_raw_files_list(
         self,
-
-
+        ship_name,
+        cruise_name,
+        sensor_name,
+    ):
+        # gets all raw files in cruise and returns a list of dicts
+        raw_files = self.get_raw_files(
+            ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
+        )
+        files_list = [
+            {
+                "ship_name": ship_name,
+                "cruise_name": cruise_name,
+                "sensor_name": sensor_name,
+                "file_name": os.path.basename(raw_file),
+            }
+            for raw_file in raw_files
+        ]
+        return files_list
+
+    #################################################################
+    @staticmethod
+    def get_subset_ek60_prefix(df: pd.DataFrame) -> pd.DataFrame:  # TODO: is this used?
         # Returns all objects with 'EK60' in prefix of file path
-        # Note that this can include 'EK80'
+        # Note that this can include 'EK80' dataset that are false-positives
         # in dataframe with ['key', 'filename', 'ship', 'cruise', 'sensor', 'size', 'date', 'datagram']
-        print("getting subset of ek60
+        print("getting subset of ek60 dataset by prefix")
         objects = []
         for row in df.itertuples():
             row_split = row[1].split(os.sep)
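get_subset_ek60_prefix recovers ship, cruise, and sensor from the path segments of each object key. A sketch of that parsing, using the example key quoted in scan_datagram below; note that S3 keys always use '/', so splitting on os.sep only matches it on POSIX hosts:

```python
# Key layout used throughout this module: data/raw/{ship}/{cruise}/{sensor}/{file}
key = "data/raw/Albatross_Iv/AL0403/EK60/L0005-D20040302-T200108-EK60.raw"

_, _, ship, cruise, sensor, filename = key.split("/")
print(ship, cruise, sensor, filename)
# Albatross_Iv AL0403 EK60 L0005-D20040302-T200108-EK60.raw
```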
@@ -150,10 +234,7 @@ class IndexManager:
         return pd.DataFrame(objects)
 
     #################################################################
-    def scan_datagram(
-        self,
-        select_key: str
-    ) -> list:
+    def scan_datagram(self, select_key: str) -> list:
         # Reads the first 8 bytes of S3 file. Used to determine if ek60 or ek80
         # Note: uses boto3 session instead of boto3 client: https://github.com/boto/boto3/issues/801
         # select_key = 'data/raw/Albatross_Iv/AL0403/EK60/L0005-D20040302-T200108-EK60.raw'
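scan_datagram avoids downloading whole files by issuing a ranged GET for just the datagram-type bytes; 'CON0' marks an EK60 file, while EK80 files typically begin with an 'XML0' datagram. A hedged sketch of that read, with a hypothetical bucket name:

```python
import boto3

# A boto3 Session (rather than a shared client) per the linked issue;
# the bucket name is hypothetical, the key is the example from the comment above.
session = boto3.Session()
s3_resource = session.resource("s3")
obj = s3_resource.Object(
    "example-bucket",
    "data/raw/Albatross_Iv/AL0403/EK60/L0005-D20040302-T200108-EK60.raw",
)

# HTTP Range request: fetch only bytes 3-7 of the object
first_datagram = obj.get(Range="bytes=3-7")["Body"].read().decode().strip("\x00")
print(first_datagram)  # 'CON0' indicates an EK60 file
```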
@@ -165,20 +246,21 @@ class IndexManager:
             obj.get(Range="bytes=3-7")["Body"].read().decode().strip("\x00")
         )
         # return [{'KEY': select_key, 'DATAGRAM': first_datagram}]
-        ### EK60
+        ### EK60 dataset are denoted by 'CON0' ###
         return first_datagram
 
     #################################################################
     def get_subset_datagrams(
-        self,
-
-    ) -> list:
+        self, df: pd.DataFrame
+    ) -> list:  # TODO: is this getting used
         print("getting subset of datagrams")
-        select_keys =
-        df[["KEY", "CRUISE"]]
+        select_keys = (
+            df[["KEY", "CRUISE"]]
+            .drop_duplicates(subset="CRUISE")["KEY"]
+            .values.tolist()
         )
         all_datagrams = []
-        with ThreadPoolExecutor(max_workers=
+        with ThreadPoolExecutor(max_workers=MAX_POOL_CONNECTIONS) as executor:
             futures = [
                 executor.submit(self.scan_datagram, select_key)
                 for select_key in select_keys
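get_subset_datagrams fans the per-cruise scans out across a thread pool sized by the new MAX_POOL_CONNECTIONS constant, a good fit for this I/O-bound work where each thread mostly waits on S3. A self-contained sketch of the pattern, with a stub in place of the ranged read:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

MAX_POOL_CONNECTIONS = 64  # mirrors the module-level constant


def scan_datagram(select_key: str) -> str:
    return "CON0"  # stub standing in for the ranged S3 read


# hypothetical one-key-per-cruise selection
select_keys = [f"data/raw/Ship/CRUISE{i:02d}/EK60/file.raw" for i in range(3)]

all_datagrams = []
with ThreadPoolExecutor(max_workers=MAX_POOL_CONNECTIONS) as executor:
    futures = [executor.submit(scan_datagram, key) for key in select_keys]
    for future in as_completed(futures):  # results arrive as workers finish
        all_datagrams.append(future.result())
print(all_datagrams)
```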
@@ -190,11 +272,8 @@ class IndexManager:
         return all_datagrams
 
     #################################################################
-
-
-        df: pd.DataFrame,
-        subset_datagrams: list
-    ) -> pd.DataFrame:
+    @staticmethod
+    def get_ek60_objects(df: pd.DataFrame, subset_datagrams: list) -> pd.DataFrame:
         # for each key write datagram value to all other files in same cruise
         for subset_datagram in subset_datagrams:
             if subset_datagram["DATAGRAM"] == "CON0":
@@ -210,11 +289,11 @@ class IndexManager:
     def get_calibration_information(
         self,
     ) -> pd.DataFrame:
-        # Calibration
+        # Calibration dataset generated by dataset manager currently located here:
         # https://noaa-wcsd-pds-index.s3.amazonaws.com/calibrated_crusies.csv
         # Note: Data are either:
-        #  [1] Calibrated w/ calibration
-        #  [2] Calibrated w/o calibration
+        #  [1] Calibrated w/ calibration dataset
+        #  [2] Calibrated w/o calibration dataset
         #  [3] uncalibrated
         response = self.s3_manager.get_object(
             bucket_name=self.calibration_bucket, key_name=self.calibration_key
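The calibration CSV referenced above is public, so the same information can be loaded straight into pandas, assuming the object is a plain CSV:

```python
import pandas as pd

# Public index object named in the comment above
url = "https://noaa-wcsd-pds-index.s3.amazonaws.com/calibrated_crusies.csv"
calibration_df = pd.read_csv(url)  # column layout assumed from the notes above
print(calibration_df.head())
```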
@@ -261,3 +340,42 @@ class IndexManager:
         # end_time = datetime.now()  # used for benchmarking
         # print(start_time)
         # print(end_time)
+
+    # TODO: wip
+    # def build_merkle_tree(self):
+    #     G = nx.DiGraph()
+    #     # https://noaa-wcsd-pds.s3.amazonaws.com/index.html#data/raw/Henry_B._Bigelow/HB0707/
+    #     ship_name = "Henry_B._Bigelow"
+    #     cruise_name = "HB0707"
+    #     # cruise_name = "HB0805"
+    #     prefix = f"data/raw/{ship_name}/{cruise_name}/"
+    #     # prefix = f"data/raw/{ship_name}/"
+    #     page_iterator = self.s3_manager.paginator.paginate(
+    #         Bucket=self.input_bucket_name,
+    #         Prefix=prefix,
+    #     )
+    #     for page in page_iterator:
+    #         for contents in page["Contents"]:
+    #             obj_key = contents["Key"]
+    #             # https://datatracker.ietf.org/doc/html/rfc7232#section-2.3
+    #             obj_etag = contents["ETag"].split('"')[1]  # properties
+    #             obj_size = contents["Size"]
+    #             basename = os.path.basename(obj_key)
+    #             G.add_node(
+    #                 node_for_adding=basename, ETag=obj_etag, Size=obj_size, Key=obj_key
+    #             )  # TODO: add parent hash
+    #             split_path = os.path.normpath(obj_key).split(os.path.sep)
+    #             # split_path: ['dataset', 'raw', 'Henry_B._Bigelow', 'HB0707', 'EK60', 'D20070712-T004447.raw']
+    #             for previous, current in zip(split_path, split_path[1:]):
+    #                 if not G.has_edge(previous, current):
+    #                     G.add_edge(previous, current)
+    #     # print(G)
+    #     etag_set = frozenset(
+    #         [k for j, k in list(G.nodes.data("ETag")) if k is not None]
+    #     )
+    #     new_hash = sha256(str(etag_set.__hash__()).encode("utf-8")).hexdigest()
+    #     total_size = [k for j, k in list(G.nodes.data("Size")) if k is not None]
+    #     print(np.sum(total_size))  # 22.24 Terabytes in Henry_B._Bigelow cruises
+    #     print(" ")
+    #     print(new_hash)
+    #     return new_hash
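The commented-out build_merkle_tree derives one digest for a cruise by hashing frozenset.__hash__() over the collected ETags; that value is not stable across Python processes because string hashing is randomized, so a deterministic variant sorts the ETags before hashing. A hedged, self-contained sketch of that idea with made-up ETag values:

```python
from hashlib import sha256

# Hypothetical per-object ETags gathered from a cruise listing
etags = [
    "9b2d4c0e7a1f3b5d8e6c4a2f0b9d7e5c",
    "1f4e7a9c2b5d8f0a3c6e9b1d4f7a0c3e",
    "7c0a3f6b9e2d5c8f1a4b7e0d3c6f9a2b",
]

# Sorting makes the digest order-independent and repeatable across runs
cruise_digest = sha256("".join(sorted(etags)).encode("utf-8")).hexdigest()
print(cruise_digest)
```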