water-column-sonar-processing 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- water_column_sonar_processing/__init__.py +16 -0
- water_column_sonar_processing/aws/__init__.py +7 -0
- {aws_manager → water_column_sonar_processing/aws}/dynamodb_manager.py +71 -50
- {aws_manager → water_column_sonar_processing/aws}/s3_manager.py +120 -130
- {aws_manager → water_column_sonar_processing/aws}/s3fs_manager.py +13 -19
- {aws_manager → water_column_sonar_processing/aws}/sns_manager.py +10 -21
- {aws_manager → water_column_sonar_processing/aws}/sqs_manager.py +10 -18
- water_column_sonar_processing/cruise/__init__.py +4 -0
- {cruise → water_column_sonar_processing/cruise}/create_empty_zarr_store.py +62 -44
- {cruise → water_column_sonar_processing/cruise}/resample_regrid.py +117 -66
- water_column_sonar_processing/geometry/__init__.py +5 -0
- {geometry_manager → water_column_sonar_processing/geometry}/geometry_manager.py +80 -49
- {geometry_manager → water_column_sonar_processing/geometry}/geometry_simplification.py +13 -12
- {geometry_manager → water_column_sonar_processing/geometry}/pmtile_generation.py +25 -24
- water_column_sonar_processing/index/__init__.py +3 -0
- {index_manager → water_column_sonar_processing/index}/index_manager.py +106 -82
- water_column_sonar_processing/model/__init__.py +3 -0
- {zarr_manager → water_column_sonar_processing/model}/zarr_manager.py +119 -83
- water_column_sonar_processing/process.py +147 -0
- water_column_sonar_processing/utility/__init__.py +6 -0
- {utility → water_column_sonar_processing/utility}/cleaner.py +6 -7
- water_column_sonar_processing/utility/constants.py +63 -0
- {utility → water_column_sonar_processing/utility}/pipeline_status.py +37 -10
- {utility → water_column_sonar_processing/utility}/timestamp.py +3 -2
- {water_column_sonar_processing-0.0.4.dist-info → water_column_sonar_processing-0.0.6.dist-info}/METADATA +31 -1
- water_column_sonar_processing-0.0.6.dist-info/RECORD +29 -0
- water_column_sonar_processing-0.0.6.dist-info/top_level.txt +1 -0
- __init__.py +0 -0
- aws_manager/__init__.py +0 -4
- cruise/__init__.py +0 -0
- geometry_manager/__init__.py +0 -0
- index_manager/__init__.py +0 -0
- model.py +0 -140
- utility/__init__.py +0 -0
- utility/constants.py +0 -56
- water_column_sonar_processing-0.0.4.dist-info/RECORD +0 -29
- water_column_sonar_processing-0.0.4.dist-info/top_level.txt +0 -8
- zarr_manager/__init__.py +0 -0
- {water_column_sonar_processing-0.0.4.dist-info → water_column_sonar_processing-0.0.6.dist-info}/LICENSE +0 -0
- {water_column_sonar_processing-0.0.4.dist-info → water_column_sonar_processing-0.0.6.dist-info}/WHEEL +0 -0
|
@@ -4,101 +4,105 @@ import pandas as pd
|
|
|
4
4
|
from datetime import datetime
|
|
5
5
|
from concurrent.futures import ThreadPoolExecutor
|
|
6
6
|
from concurrent.futures import as_completed
|
|
7
|
-
from
|
|
7
|
+
from water_column_sonar_processing.aws.s3_manager import S3Manager
|
|
8
|
+
|
|
8
9
|
|
|
9
10
|
class IndexManager:
|
|
10
11
|
|
|
11
|
-
def __init__(
|
|
12
|
-
self,
|
|
13
|
-
input_bucket_name,
|
|
14
|
-
calibration_bucket,
|
|
15
|
-
calibration_key
|
|
16
|
-
):
|
|
12
|
+
def __init__(self, input_bucket_name, calibration_bucket, calibration_key):
|
|
17
13
|
self.input_bucket_name = input_bucket_name
|
|
18
14
|
self.calibration_bucket = calibration_bucket
|
|
19
15
|
self.calibration_key = calibration_key
|
|
20
16
|
self.s3_manager = S3Manager()
|
|
21
17
|
|
|
22
18
|
#################################################################
|
|
19
|
+
|
|
23
20
|
def list_ships(
|
|
24
|
-
|
|
25
|
-
|
|
21
|
+
self,
|
|
22
|
+
prefix="data/raw/",
|
|
26
23
|
):
|
|
27
24
|
# s3_client = self.s3_manager.s3_client
|
|
28
|
-
page_iterator = self.s3_manager.paginator.paginate(
|
|
25
|
+
page_iterator = self.s3_manager.paginator.paginate(
|
|
26
|
+
Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
|
|
27
|
+
)
|
|
29
28
|
# common_prefixes = s3_client.list_objects(Bucket=self.input_bucket_name, Prefix=prefix, Delimiter='/')
|
|
30
29
|
# print(common_prefixes)
|
|
31
30
|
ships = []
|
|
32
31
|
for page in page_iterator:
|
|
33
|
-
if
|
|
34
|
-
ships.extend([k[
|
|
32
|
+
if "Contents" in page.keys():
|
|
33
|
+
ships.extend([k["Prefix"] for k in page["CommonPrefixes"]])
|
|
35
34
|
return ships # ~76 ships
|
|
36
35
|
|
|
37
36
|
#################################################################
|
|
38
37
|
def list_cruises(
|
|
39
|
-
|
|
40
|
-
|
|
38
|
+
self,
|
|
39
|
+
ship_prefixes, # e.g. 'data/raw/Alaska_Knight/'
|
|
41
40
|
):
|
|
42
41
|
cruises = []
|
|
43
42
|
for ship_prefix in ship_prefixes:
|
|
44
|
-
page_iterator = self.s3_manager.paginator.paginate(
|
|
43
|
+
page_iterator = self.s3_manager.paginator.paginate(
|
|
44
|
+
Bucket=self.input_bucket_name, Prefix=ship_prefix, Delimiter="/"
|
|
45
|
+
)
|
|
45
46
|
for page in page_iterator:
|
|
46
|
-
cruises.extend([k[
|
|
47
|
+
cruises.extend([k["Prefix"] for k in page["CommonPrefixes"]])
|
|
47
48
|
return cruises # ~1204 cruises
|
|
48
49
|
|
|
49
50
|
#################################################################
|
|
50
51
|
def list_ek60_cruises(
|
|
51
|
-
|
|
52
|
-
|
|
52
|
+
self,
|
|
53
|
+
cruise_prefixes,
|
|
53
54
|
):
|
|
54
55
|
cruise_sensors = [] # includes all sensor types
|
|
55
56
|
for cruise_prefix in cruise_prefixes:
|
|
56
|
-
page_iterator = self.s3_manager.paginator.paginate(
|
|
57
|
+
page_iterator = self.s3_manager.paginator.paginate(
|
|
58
|
+
Bucket=self.input_bucket_name, Prefix=cruise_prefix, Delimiter="/"
|
|
59
|
+
)
|
|
57
60
|
for page in page_iterator:
|
|
58
|
-
cruise_sensors.extend([k[
|
|
61
|
+
cruise_sensors.extend([k["Prefix"] for k in page["CommonPrefixes"]])
|
|
59
62
|
# Note: these are "EK60" by prefix. They still need to be verified by scanning the datagram.
|
|
60
|
-
return [i for i in cruise_sensors if
|
|
63
|
+
return [i for i in cruise_sensors if "/EK60/" in i] # ~447 different cruises
|
|
61
64
|
|
|
62
65
|
#################################################################
|
|
63
66
|
def get_raw_files(
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
67
|
+
self,
|
|
68
|
+
ship_name,
|
|
69
|
+
cruise_name,
|
|
70
|
+
sensor_name,
|
|
68
71
|
):
|
|
69
72
|
prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/" # Note no forward slash at beginning
|
|
70
|
-
page_iterator = self.s3_manager.paginator.paginate(
|
|
73
|
+
page_iterator = self.s3_manager.paginator.paginate(
|
|
74
|
+
Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
|
|
75
|
+
)
|
|
71
76
|
all_files = []
|
|
72
77
|
for page in page_iterator:
|
|
73
|
-
if
|
|
74
|
-
all_files.extend([i[
|
|
75
|
-
return [i for i in all_files if i.endswith(
|
|
78
|
+
if "Contents" in page.keys():
|
|
79
|
+
all_files.extend([i["Key"] for i in page["Contents"]])
|
|
80
|
+
return [i for i in all_files if i.endswith(".raw")]
|
|
76
81
|
|
|
77
82
|
def get_raw_files_csv(
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
83
|
+
self,
|
|
84
|
+
ship_name,
|
|
85
|
+
cruise_name,
|
|
86
|
+
sensor_name,
|
|
82
87
|
):
|
|
83
|
-
raw_files = self.get_raw_files(
|
|
88
|
+
raw_files = self.get_raw_files(
|
|
89
|
+
ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
|
|
90
|
+
)
|
|
84
91
|
files_list = [
|
|
85
92
|
{
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
}
|
|
93
|
+
"ship_name": ship_name,
|
|
94
|
+
"cruise_name": cruise_name,
|
|
95
|
+
"sensor_name": sensor_name,
|
|
96
|
+
"file_name": os.path.basename(raw_file),
|
|
97
|
+
}
|
|
98
|
+
for raw_file in raw_files
|
|
91
99
|
]
|
|
92
100
|
df = pd.DataFrame(files_list)
|
|
93
|
-
df.to_csv(f
|
|
94
|
-
print(
|
|
95
|
-
|
|
101
|
+
df.to_csv(f"{ship_name}_{cruise_name}.csv", index=False, header=False, sep=" ")
|
|
102
|
+
print("done")
|
|
96
103
|
|
|
97
104
|
#################################################################
|
|
98
|
-
def get_subset_ek60_prefix(
|
|
99
|
-
self,
|
|
100
|
-
df: pd.DataFrame
|
|
101
|
-
) -> pd.DataFrame:
|
|
105
|
+
def get_subset_ek60_prefix(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
102
106
|
# Returns all objects with 'EK60' in prefix of file path
|
|
103
107
|
# Note that this can include 'EK80' data that are false-positives
|
|
104
108
|
# in dataframe with ['key', 'filename', 'ship', 'cruise', 'sensor', 'size', 'date', 'datagram']
|
|
@@ -107,56 +111,68 @@ class IndexManager:
|
|
|
107
111
|
for row in df.itertuples():
|
|
108
112
|
row_split = row[1].split(os.sep)
|
|
109
113
|
if len(row_split) == 6:
|
|
110
|
-
filename = os.path.basename(
|
|
114
|
+
filename = os.path.basename(
|
|
115
|
+
row[1]
|
|
116
|
+
) # 'EX1608_EK60-D20161205-T040300.raw'
|
|
111
117
|
if filename.endswith(".raw"):
|
|
112
|
-
ship_name, cruise_name, sensor_name = row_split[
|
|
113
|
-
|
|
118
|
+
ship_name, cruise_name, sensor_name = row_split[
|
|
119
|
+
2:5
|
|
120
|
+
] # 'Okeanos_Explorer', 'EX1608', 'EK60'
|
|
121
|
+
if (
|
|
122
|
+
re.search("[D](\d{8})", filename) is not None
|
|
123
|
+
and re.search("[T](\d{6})", filename) is not None
|
|
124
|
+
):
|
|
114
125
|
# Parse date if possible e.g.: 'data/raw/Henry_B._Bigelow/HB1006/EK60/HBB-D20100723-T025105.raw'
|
|
115
126
|
# and 'data/raw/Henry_B._Bigelow/HB1802/EK60/D20180513-T150250.raw'
|
|
116
127
|
date_substring = re.search("[D](\d{8})", filename).group(1)
|
|
117
128
|
time_substring = re.search("[T](\d{6})", filename).group(1)
|
|
118
|
-
date_string = datetime.strptime(
|
|
129
|
+
date_string = datetime.strptime(
|
|
130
|
+
f"{date_substring}{time_substring}", "%Y%m%d%H%M%S"
|
|
131
|
+
)
|
|
119
132
|
else: # otherwise use current date
|
|
120
133
|
date_string = f"{datetime.utcnow().isoformat()[:19]}Z"
|
|
121
134
|
objects.append(
|
|
122
135
|
{
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
136
|
+
"KEY": row[1],
|
|
137
|
+
"FILENAME": filename,
|
|
138
|
+
"SHIP": ship_name,
|
|
139
|
+
"CRUISE": cruise_name,
|
|
140
|
+
"SENSOR": sensor_name,
|
|
141
|
+
"SIZE": row[2],
|
|
142
|
+
"DATE": date_string,
|
|
143
|
+
"DATAGRAM": None,
|
|
131
144
|
}
|
|
132
145
|
)
|
|
133
146
|
return pd.DataFrame(objects)
|
|
134
147
|
|
|
135
148
|
#################################################################
|
|
136
|
-
def scan_datagram(
|
|
137
|
-
self,
|
|
138
|
-
select_key: str
|
|
139
|
-
) -> list:
|
|
149
|
+
def scan_datagram(self, select_key: str) -> list:
|
|
140
150
|
# Reads the first 8 bytes of S3 file. Used to determine if ek60 or ek80
|
|
141
151
|
# Note: uses boto3 session instead of boto3 client: https://github.com/boto/boto3/issues/801
|
|
142
152
|
# select_key = 'data/raw/Albatross_Iv/AL0403/EK60/L0005-D20040302-T200108-EK60.raw'
|
|
143
153
|
s3_resource = self.s3_manager.s3_resource
|
|
144
|
-
obj = s3_resource.Object(
|
|
145
|
-
|
|
154
|
+
obj = s3_resource.Object(
|
|
155
|
+
bucket_name=self.input_bucket_name, key=select_key
|
|
156
|
+
) # XML0
|
|
157
|
+
first_datagram = (
|
|
158
|
+
obj.get(Range="bytes=3-7")["Body"].read().decode().strip("\x00")
|
|
159
|
+
)
|
|
146
160
|
# return [{'KEY': select_key, 'DATAGRAM': first_datagram}]
|
|
147
161
|
### EK60 data are denoted by 'CON0' ###
|
|
148
162
|
return first_datagram
|
|
149
163
|
|
|
150
164
|
#################################################################
|
|
151
|
-
def get_subset_datagrams(
|
|
152
|
-
self,
|
|
153
|
-
df: pd.DataFrame
|
|
154
|
-
) -> list:
|
|
165
|
+
def get_subset_datagrams(self, df: pd.DataFrame) -> list:
|
|
155
166
|
print("getting subset of datagrams")
|
|
156
|
-
select_keys = list(
|
|
167
|
+
select_keys = list(
|
|
168
|
+
df[["KEY", "CRUISE"]].drop_duplicates(subset="CRUISE")["KEY"].values
|
|
169
|
+
)
|
|
157
170
|
all_datagrams = []
|
|
158
171
|
with ThreadPoolExecutor(max_workers=self.max_pool_connections) as executor:
|
|
159
|
-
futures = [
|
|
172
|
+
futures = [
|
|
173
|
+
executor.submit(self.scan_datagram, select_key)
|
|
174
|
+
for select_key in select_keys
|
|
175
|
+
]
|
|
160
176
|
for future in as_completed(futures):
|
|
161
177
|
result = future.result()
|
|
162
178
|
if result:
|
|
@@ -165,20 +181,22 @@ class IndexManager:
|
|
|
165
181
|
|
|
166
182
|
#################################################################
|
|
167
183
|
def get_ek60_objects(
|
|
168
|
-
|
|
169
|
-
df: pd.DataFrame,
|
|
170
|
-
subset_datagrams: list
|
|
184
|
+
self, df: pd.DataFrame, subset_datagrams: list
|
|
171
185
|
) -> pd.DataFrame:
|
|
172
186
|
# for each key write datagram value to all other files in same cruise
|
|
173
187
|
for subset_datagram in subset_datagrams:
|
|
174
|
-
if subset_datagram[
|
|
175
|
-
select_cruise = df.loc[df[
|
|
176
|
-
|
|
177
|
-
|
|
188
|
+
if subset_datagram["DATAGRAM"] == "CON0":
|
|
189
|
+
select_cruise = df.loc[df["KEY"] == subset_datagram["KEY"]][
|
|
190
|
+
"CRUISE"
|
|
191
|
+
].iloc[0]
|
|
192
|
+
df.loc[df["CRUISE"] == select_cruise, ["DATAGRAM"]] = subset_datagram[
|
|
193
|
+
"DATAGRAM"
|
|
194
|
+
]
|
|
195
|
+
return df.loc[df["DATAGRAM"] == "CON0"]
|
|
178
196
|
|
|
179
197
|
#################################################################
|
|
180
198
|
def get_calibration_information( # tested
|
|
181
|
-
|
|
199
|
+
self,
|
|
182
200
|
) -> pd.DataFrame:
|
|
183
201
|
# Calibration data generated by data manager currently located here:
|
|
184
202
|
# https://noaa-wcsd-pds-index.s3.amazonaws.com/calibrated_crusies.csv
|
|
@@ -186,14 +204,20 @@ class IndexManager:
|
|
|
186
204
|
# [1] Calibrated w/ calibration data
|
|
187
205
|
# [2] Calibrated w/o calibration data
|
|
188
206
|
# [3] uncalibrated
|
|
189
|
-
response = self.s3_manager.get_object(
|
|
207
|
+
response = self.s3_manager.get_object(
|
|
208
|
+
bucket_name=self.calibration_bucket, key_name=self.calibration_key
|
|
209
|
+
)
|
|
190
210
|
calibration_statuses = pd.read_csv(response.get("Body"))
|
|
191
|
-
calibration_statuses[
|
|
192
|
-
|
|
211
|
+
calibration_statuses["DATASET_NAME"] = calibration_statuses[
|
|
212
|
+
"DATASET_NAME"
|
|
213
|
+
].apply(lambda x: x.split("_EK60")[0])
|
|
214
|
+
calibration_statuses["CAL_STATE"] = calibration_statuses["CAL_STATE"].apply(
|
|
215
|
+
lambda x: x.find("Calibrated") >= 0
|
|
216
|
+
)
|
|
193
217
|
return calibration_statuses
|
|
194
218
|
|
|
195
219
|
#################################################################
|
|
196
|
-
# def
|
|
220
|
+
# def index( # TODO: get rid of this?
|
|
197
221
|
# self
|
|
198
222
|
# ):
|
|
199
223
|
# start_time = datetime.now() # used for benchmarking
|