water-column-sonar-processing 25.1.7__py3-none-any.whl → 25.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of water-column-sonar-processing might be problematic.
- water_column_sonar_processing/aws/dynamodb_manager.py +27 -32
- water_column_sonar_processing/aws/s3_manager.py +52 -64
- water_column_sonar_processing/aws/s3fs_manager.py +3 -9
- water_column_sonar_processing/cruise/create_empty_zarr_store.py +14 -14
- water_column_sonar_processing/cruise/datatree_manager.py +3 -6
- water_column_sonar_processing/cruise/resample_regrid.py +67 -49
- water_column_sonar_processing/geometry/__init__.py +7 -2
- water_column_sonar_processing/geometry/elevation_manager.py +16 -17
- water_column_sonar_processing/geometry/geometry_manager.py +25 -25
- water_column_sonar_processing/geometry/line_simplification.py +150 -0
- water_column_sonar_processing/geometry/pmtile_generation.py +99 -64
- water_column_sonar_processing/index/index_manager.py +67 -32
- water_column_sonar_processing/model/zarr_manager.py +32 -21
- water_column_sonar_processing/process.py +15 -13
- water_column_sonar_processing/processing/__init__.py +2 -2
- water_column_sonar_processing/processing/batch_downloader.py +66 -41
- water_column_sonar_processing/processing/raw_to_zarr.py +121 -82
- water_column_sonar_processing/utility/constants.py +10 -1
- water_column_sonar_processing/utility/pipeline_status.py +11 -15
- {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.0.dist-info}/METADATA +21 -12
- water_column_sonar_processing-25.3.0.dist-info/RECORD +34 -0
- {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.0.dist-info}/WHEEL +1 -1
- water_column_sonar_processing/geometry/geometry_simplification.py +0 -82
- water_column_sonar_processing-25.1.7.dist-info/RECORD +0 -34
- {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.0.dist-info/licenses}/LICENSE +0 -0
- {water_column_sonar_processing-25.1.7.dist-info → water_column_sonar_processing-25.3.0.dist-info}/top_level.txt +0 -0
water_column_sonar_processing/geometry/pmtile_generation.py

@@ -1,15 +1,14 @@
 import glob
 import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
+
 import fiona
-import
+import geopandas
+import geopandas as gpd
 import numpy as np
 import pandas as pd
 import xarray as xr
-import geopandas
-import geopandas as gpd
-import pyogrio
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from shapely.geometry import LineString
 
 MAX_POOL_CONNECTIONS = 64
@@ -19,6 +18,16 @@ GB = 1024**3
 
 
 class PMTileGeneration(object):
+    """
+    TODO: need to
+    - iterate through the zarr stores for all cruises
+    - generate geojson in geopandas df
+    - consolidate into singular df, one cruise per row
+    - export as _shape?_ file
+    - document next steps creating pmtiles with linux commands
+    - upload to s3
+    """
+
     #######################################################
     def __init__(
         self,
@@ -85,13 +94,20 @@ class PMTileGeneration(object):
         ship_name,
         cruise_names,
     ):
+        # TODO: NOT USED ANYWHERE
         total_size = 0
-        s3_fs = s3fs.S3FileSystem(anon=True)
+        # s3_fs = s3fs.S3FileSystem(anon=True)
         for cruise_name in cruise_names:
-
-            zarr_store = s3fs.S3Map(root=
-            xr_store = xr.
-
+            s3_path = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
+            # zarr_store = s3fs.S3Map(root=s3_path, s3=s3_fs)
+            xr_store = xr.open_dataset(
+                filename_or_obj=s3_path,
+                engine="zarr",
+                storage_options={"anon": True},
+                chunks={},  # this allows the engine to define the chunk scheme
+                cache=True,
+            )
+            print(f"Cruise: {cruise_name}, shape: {xr_store.time.shape[0]}")
             total_size = total_size + xr_store.time.shape[0]
 
     def get_geospatial_info_from_zarr_store(
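For context, a minimal standalone sketch of the access pattern this hunk introduces: opening a public level-2 Zarr store anonymously through xarray's zarr engine rather than wrapping the bucket in s3fs.S3Map. The cruise name is illustrative, and s3fs/fsspec must be installed for storage_options to take effect.

# Sketch only: the new read path, assuming the public noaa-wcsd-zarr-pds layout shown above.
import xarray as xr

s3_path = "s3://noaa-wcsd-zarr-pds/level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.zarr"
ds = xr.open_dataset(
    filename_or_obj=s3_path,
    engine="zarr",
    storage_options={"anon": True},  # public bucket, no credentials required
    chunks={},  # defer to the on-disk chunk scheme
)
print(ds.time.shape)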
@@ -102,40 +118,51 @@ class PMTileGeneration(object):
         """
         Open Zarr store, create geometry, write to geojson, return name
         """
-        s3_fs = s3fs.S3FileSystem(anon=True)
+        # s3_fs = s3fs.S3FileSystem(anon=True)
         gps_gdf = geopandas.GeoDataFrame(
             columns=["id", "ship", "cruise", "sensor", "geometry"],
             geometry="geometry",
-            crs="EPSG:4326"
+            crs="EPSG:4326",
         )
-
-        # file_name = os.path.normpath(path_to_zarr_store).split(os.sep)[-1]
-        # file_stem = os.path.splitext(os.path.basename(file_name))[0]
-        zarr_store = s3fs.S3Map(root=path_to_zarr_store, s3=s3_fs)
-        # ---Open Zarr Store--- #
+        s3_path = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
         # TODO: try-except to allow failures
-        print(
-
-
+        print("opening store")
+        xr_store = xr.open_dataset(
+            filename_or_obj=s3_path,
+            engine="zarr",
+            storage_options={"anon": True},
+            chunks={},  # this allows the engine to define the chunk scheme
+            cache=True,
+        )
         print(xr_store.Sv.shape)
         # ---Read Zarr Store Time/Latitude/Longitude--- #
         latitude = xr_store.latitude.values
         longitude = xr_store.longitude.values
         if np.isnan(latitude).any() or np.isnan(longitude).any():
-            print(f
+            print(f"there was missing lat-lon data for {cruise_name}")
             return None
         # ---Add To GeoPandas Dataframe--- #
         # TODO: experiment with tolerance "0.001"
-        geom = LineString(list(zip(longitude, latitude))).simplify(
-
-
-        gps_gdf.
+        geom = LineString(list(zip(longitude, latitude))).simplify(
+            tolerance=0.001, preserve_topology=True
+        )
+        gps_gdf.loc[0] = (
+            0,
+            "Henry_B._Bigelow",
+            cruise_name,
+            "EK60",
+            geom,
+        )  # (ship, cruise, sensor, geometry)
+        gps_gdf.set_index("id", inplace=True)
+        gps_gdf.to_file(
+            f"dataframe_{cruise_name}.geojson", driver="GeoJSON"
+        )  # , engine="pyogrio")
        return cruise_name
 
     #######################################################
     def open_zarr_stores_with_thread_pool_executor(
-
-
+        self,
+        cruises: list,
     ):
         # 'cruises' is a list of cruises to process
         completed_cruises = []
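A self-contained sketch of the geometry step in the hunk above: build a track LineString from longitude/latitude arrays, simplify it with the same 0.001 tolerance, and write a one-row GeoDataFrame to GeoJSON. The coordinates below are made up for illustration.

# Sketch of the per-cruise geometry step; coordinates are illustrative.
import geopandas
from shapely.geometry import LineString

longitude = [-72.4894, -72.4907, -72.4921]
latitude = [40.3319, 40.3309, 40.3300]

geom = LineString(list(zip(longitude, latitude))).simplify(
    tolerance=0.001, preserve_topology=True
)
gps_gdf = geopandas.GeoDataFrame(
    columns=["id", "ship", "cruise", "sensor", "geometry"],
    geometry="geometry",
    crs="EPSG:4326",
)
gps_gdf.loc[0] = (0, "Henry_B._Bigelow", "HB0707", "EK60", geom)
gps_gdf.set_index("id", inplace=True)
gps_gdf.to_file("dataframe_HB0707.geojson", driver="GeoJSON")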
@@ -156,37 +183,46 @@ class PMTileGeneration(object):
             except Exception as err:
                 print(err)
         print("Done opening zarr stores using thread pool.")
-        return completed_cruises
+        return completed_cruises  # Took ~12 minutes
 
     #######################################################
     # https://docs.protomaps.com/pmtiles/create
-    def aggregate_geojson_into_dataframe(
-        self
-    ):
+    def aggregate_geojson_into_dataframe(self):
         """
         iterate through cruises, threadpoolexecute geojson creation, aggregate geojson files into df,
         """
         gps_gdf = geopandas.GeoDataFrame(
             columns=["id", "ship", "cruise", "sensor", "geometry"],
             geometry="geometry",
-            crs="EPSG:4326"
+            crs="EPSG:4326",
         )
 
-        file_type =
+        file_type = "dataframe_*.geojson"
         geojson_files = glob.glob(file_type)
         for jjj in range(len(geojson_files)):
             print(jjj)
             geom = geopandas.read_file(geojson_files[jjj])
-            gps_gdf.loc[jjj] = (
-
+            gps_gdf.loc[jjj] = (
+                jjj,
+                geom.ship[0],
+                geom.cruise[0],
+                geom.sensor[0],
+                geom.geometry[0],
+            )
+            # gps_gdf.loc[0] = (0, "Henry_B._Bigelow", cruise_name, "EK60", geom)  # (ship, cruise, sensor, geometry)
         print(gps_gdf)
-        gps_gdf.set_index(
-        gps_gdf.to_file(
+        gps_gdf.set_index("id", inplace=True)
+        gps_gdf.to_file(
+            "data.geojson",
+            driver="GeoJSON",
+            engine="pyogrio",
+            layer_options={"ID_GENERATE": "YES"},
+        )
         return list(gps_gdf.cruise)
 
         # gps_gdf.loc[iii] = (iii, "Henry_B._Bigelow", cruise_name, "EK60", geom)  # (ship, cruise, sensor, geometry)
-        #print('writing to file')
-        #print(gps_gdf)
+        # print('writing to file')
+        # print(gps_gdf)
         # gps_gdf.set_index('id', inplace=True)
         # gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
         # https://gdal.org/en/latest/drivers/vector/jsonfg.html
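The loop above fills the aggregate GeoDataFrame one row at a time; an equivalent sketch using pd.concat (the same idea, not the package's code) would be:

# Hedged alternative sketch: concatenate the per-cruise GeoJSON files in one call.
import glob

import geopandas
import pandas as pd

geojson_files = glob.glob("dataframe_*.geojson")
gps_gdf = pd.concat(
    [geopandas.read_file(f) for f in geojson_files], ignore_index=True
)  # concat of GeoDataFrames stays a GeoDataFrame
gps_gdf.to_file(
    "data.geojson",
    driver="GeoJSON",
    engine="pyogrio",
    layer_options={"ID_GENERATE": "YES"},  # numeric feature ids, as in the diff
)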
@@ -198,25 +234,25 @@ class PMTileGeneration(object):
         # )
         # gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON", engine="pyogrio", id_generate=True)
 
+
         # print(fiona.supported_drivers)  # {'DXF': 'rw', 'CSV': 'raw', 'OpenFileGDB': 'raw', 'ESRIJSON': 'r', 'ESRI Shapefile': 'raw', 'FlatGeobuf': 'raw', 'GeoJSON': 'raw', 'GeoJSONSeq': 'raw', 'GPKG': 'raw', 'GML': 'rw', 'OGR_GMT': 'rw', 'GPX': 'rw', 'MapInfo File': 'raw', 'DGN': 'raw', 'S57': 'r', 'SQLite': 'raw', 'TopoJSON': 'r'}
-        #gps_gdf.to_file('dataframe.shp', crs="EPSG:4326", engine="fiona")
+        # gps_gdf.to_file('dataframe.shp', crs="EPSG:4326", engine="fiona")
         # Convert geojson feature collection to pmtiles
-        #gps_gdf.to_file("dataframe.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona")
-        #print("done")
+        # gps_gdf.to_file("dataframe.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona")
+        # print("done")
         # ---Export Shapefile--- #
 
 
-
-        #gps_gdf.
-        #gps_gdf.
-        #gps_gdf.
-        #gps_gdf.to_file(f"dataframe3.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona", index=True)
+        # gps_gdf.set_geometry(col='geometry', inplace=True)
+        # gps_gdf.__geo_interface__
+        # gps_gdf.set_index('id', inplace=True)
+        # gps_gdf.to_file(f"dataframe3.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona", index=True)
 
         ### this gives the right layer id values
-        #gps_gdf.to_file(f"dataframe6.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
+        # gps_gdf.to_file(f"dataframe6.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
         # jq '{"type": "FeatureCollection", "features": [.[] | .features[]]}' --slurp input*.geojson > output.geojson
-        #tippecanoe -zg --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises output.geojson
-        #tippecanoe -zg --convert-stringified-ids-to-numbers --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises dataframe*.geojson
+        # tippecanoe -zg --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises output.geojson
+        # tippecanoe -zg --convert-stringified-ids-to-numbers --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises dataframe*.geojson
         # {
         #     "type": "FeatureCollection",
         #     "name": "dataframe5",
@@ -226,19 +262,19 @@ class PMTileGeneration(object):
         # { "type": "Feature", "id": 2, "properties": { "id": 2, "ship": "Henry_B._Bigelow", "cruise": "HB0710", "sensor": "EK60" }, "geometry": { "type": "LineString", "coordinates": [ [ -72.489486694335938, 40.331901550292969 ], [ -72.490760803222656, 40.33099365234375 ] ] } }
         # ]
         # }
-        """
-        # https://docs.protomaps.com/pmtiles/create
-        #ogr2ogr -t_srs EPSG:4326 data.geojson dataframe.shp
-        # Only need to do the second one here...
-        tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises dataframe.geojson
-        tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
-        # used this to combine all the geojson files into single pmtile file (2024-12-03):
-        tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
 
-
-
-
-
+        # # https://docs.protomaps.com/pmtiles/create
+        # #ogr2ogr -t_srs EPSG:4326 data.geojson dataframe.shp
+        # # Only need to do the second one here...
+        # tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises dataframe.geojson
+        # tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
+        # # used this to combine all the geojson files into single pmtile file (2024-12-03):
+        # tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
+        #
+        # TODO:
+        #   run each one of the cruises in a separate ospool workflow.
+        #   each process gets own store
+
     ###########################################################
 
     # s3_manager = S3Manager()  # endpoint_url=endpoint_url)
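The commented-out tippecanoe commands above run outside Python. A hedged sketch of driving the same GeoJSON-to-PMTiles conversion from Python, assuming tippecanoe is on PATH and data.geojson exists:

# Sketch only: wrap the tippecanoe invocation from the comments above in subprocess.
import subprocess

subprocess.run(
    [
        "tippecanoe",
        "-zg",
        "--projection=EPSG:4326",
        "-o", "data.pmtiles",
        "-l", "cruises",
        "--coalesce-densest-as-needed",
        "--extend-zooms-if-still-dropping",
        "data.geojson",
    ],
    check=True,  # raise if tippecanoe exits non-zero
)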
@@ -258,5 +294,4 @@ TODO:
 # print(ds_zarr.Sv.shape)
 
 
-
-total = [246847, 89911, 169763, 658047, 887640, 708771, 187099, 3672813, 4095002, 763268, 162727, 189454, 1925270, 3575857, 1031920, 1167590, 3737415, 4099957, 3990725, 3619996, 3573052, 2973090, 55851, 143192, 1550164, 3692819, 668400, 489735, 393260, 1311234, 242989, 4515760, 1303091, 704663, 270645, 3886437, 4204381, 1062090, 428639, 541455, 4206506, 298561, 1279329, 137416, 139836, 228947, 517949]
+# total = [246847, 89911, 169763, 658047, 887640, 708771, 187099, 3672813, 4095002, 763268, 162727, 189454, 1925270, 3575857, 1031920, 1167590, 3737415, 4099957, 3990725, 3619996, 3573052, 2973090, 55851, 143192, 1550164, 3692819, 668400, 489735, 393260, 1311234, 242989, 4515760, 1303091, 704663, 270645, 3886437, 4204381, 1062090, 428639, 541455, 4206506, 298561, 1279329, 137416, 139836, 228947, 517949]
water_column_sonar_processing/index/index_manager.py

@@ -1,11 +1,14 @@
 import os
 import re
-import
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
-from
-
-
+from hashlib import sha256
+
+import networkx as nx
+import numpy as np
+import pandas as pd
 
+from water_column_sonar_processing.aws import S3Manager
 
 MAX_POOL_CONNECTIONS = 64
 MAX_CONCURRENCY = 64
@@ -19,8 +22,8 @@ class IndexManager:
     def __init__(self, input_bucket_name, calibration_bucket, calibration_key):
         self.input_bucket_name = input_bucket_name
         self.calibration_bucket = calibration_bucket
-        self.calibration_key = calibration_key
-        self.s3_manager = S3Manager()
+        self.calibration_key = calibration_key  # TODO: make optional?
+        self.s3_manager = S3Manager()  # TODO: make anonymous?
 
     #################################################################
     def list_ships(
@@ -80,9 +83,7 @@ class IndexManager:
         # Gets all raw files for a cruise under the given prefix
         prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
         page_iterator = self.s3_manager.paginator.paginate(
-            Bucket=self.input_bucket_name,
-            Prefix=prefix,
-            Delimiter="/"
+            Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
         )
         all_files = []
         for page in page_iterator:
@@ -112,7 +113,9 @@ class IndexManager:
             Delimiter="/",
         )
         # page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
-        page_iterator = page_iterator.search(
+        page_iterator = page_iterator.search(
+            expression="Contents[?contains(Key, '.raw')] "
+        )
         for res in page_iterator:
             if "Key" in res:
                 return res["Key"]
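The .search() call completed in this hunk filters paginated listings client-side with a JMESPath expression. A minimal sketch against the public bucket referenced elsewhere in this diff (prefix is an example):

# Sketch only: boto3 PageIterator.search applies a JMESPath expression to each page.
import boto3

client = boto3.client("s3")
paginator = client.get_paginator("list_objects_v2")
pages = paginator.paginate(
    Bucket="noaa-wcsd-pds",
    Prefix="data/raw/Henry_B._Bigelow/HB0707/EK60/",
)
for key in pages.search("Contents[?contains(Key, '.raw')].Key"):
    print(key)  # the first matching key is enough to sample a cruise's raw files
    break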
@@ -149,9 +152,7 @@ class IndexManager:
         sensor_name,
     ):
         raw_files = self.get_raw_files(
-            ship_name=ship_name,
-            cruise_name=cruise_name,
-            sensor_name=sensor_name
+            ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
         )
         files_list = [
             {
@@ -174,9 +175,7 @@ class IndexManager:
     ):
         # gets all raw files in cruise and returns a list of dicts
         raw_files = self.get_raw_files(
-            ship_name=ship_name,
-            cruise_name=cruise_name,
-            sensor_name=sensor_name
+            ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
         )
         files_list = [
             {
@@ -190,10 +189,9 @@ class IndexManager:
         return files_list
 
     #################################################################
-    def get_subset_ek60_prefix(
-        self,
-
-    ) -> pd.DataFrame:
+    def get_subset_ek60_prefix(
+        self, df: pd.DataFrame
+    ) -> pd.DataFrame:  # TODO: is this used?
         # Returns all objects with 'EK60' in prefix of file path
         # Note that this can include 'EK80' data that are false-positives
         # in dataframe with ['key', 'filename', 'ship', 'cruise', 'sensor', 'size', 'date', 'datagram']
@@ -237,10 +235,7 @@ class IndexManager:
         return pd.DataFrame(objects)
 
     #################################################################
-    def scan_datagram(
-        self,
-        select_key: str
-    ) -> list:
+    def scan_datagram(self, select_key: str) -> list:
         # Reads the first 8 bytes of S3 file. Used to determine if ek60 or ek80
         # Note: uses boto3 session instead of boto3 client: https://github.com/boto/boto3/issues/801
         # select_key = 'data/raw/Albatross_Iv/AL0403/EK60/L0005-D20040302-T200108-EK60.raw'
@@ -256,12 +251,15 @@ class IndexManager:
         return first_datagram
 
     #################################################################
-    def get_subset_datagrams(
-        self,
-
-    ) -> list:
+    def get_subset_datagrams(
+        self, df: pd.DataFrame
+    ) -> list:  # TODO: is this getting used
         print("getting subset of datagrams")
-        select_keys =
+        select_keys = (
+            df[["KEY", "CRUISE"]]
+            .drop_duplicates(subset="CRUISE")["KEY"]
+            .values.tolist()
+        )
         all_datagrams = []
         with ThreadPoolExecutor(max_workers=MAX_POOL_CONNECTIONS) as executor:
             futures = [
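A worked example of the select_keys chain completed above, which keeps one KEY per CRUISE so only a single raw file per cruise is scanned for its datagram type:

# Worked example with a tiny illustrative DataFrame.
import pandas as pd

df = pd.DataFrame(
    {
        "KEY": ["a/1.raw", "a/2.raw", "b/1.raw"],
        "CRUISE": ["HB0707", "HB0707", "HB0805"],
    }
)
select_keys = (
    df[["KEY", "CRUISE"]].drop_duplicates(subset="CRUISE")["KEY"].values.tolist()
)
print(select_keys)  # ['a/1.raw', 'b/1.raw'] -- first key per cruise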
@@ -276,9 +274,7 @@ class IndexManager:
 
     #################################################################
     def get_ek60_objects(
-        self,
-        df: pd.DataFrame,
-        subset_datagrams: list
+        self, df: pd.DataFrame, subset_datagrams: list
     ) -> pd.DataFrame:
         # for each key write datagram value to all other files in same cruise
         for subset_datagram in subset_datagrams:
@@ -346,3 +342,42 @@ class IndexManager:
     # end_time = datetime.now()  # used for benchmarking
     # print(start_time)
     # print(end_time)
+
+    # TODO: wip
+    def build_merkle_tree(self):
+        G = nx.DiGraph()
+        # https://noaa-wcsd-pds.s3.amazonaws.com/index.html#data/raw/Henry_B._Bigelow/HB0707/
+        ship_name = "Henry_B._Bigelow"
+        cruise_name = "HB0707"
+        # cruise_name = "HB0805"
+        prefix = f"data/raw/{ship_name}/{cruise_name}/"
+        # prefix = f"data/raw/{ship_name}/"
+        page_iterator = self.s3_manager.paginator.paginate(
+            Bucket=self.input_bucket_name,
+            Prefix=prefix,
+        )
+        for page in page_iterator:
+            for contents in page["Contents"]:
+                obj_key = contents["Key"]
+                # https://datatracker.ietf.org/doc/html/rfc7232#section-2.3
+                obj_etag = contents["ETag"].split('"')[1]  # properties
+                obj_size = contents["Size"]
+                basename = os.path.basename(obj_key)
+                G.add_node(
+                    node_for_adding=basename, ETag=obj_etag, Size=obj_size, Key=obj_key
+                )  # TODO: add parent hash
+                split_path = os.path.normpath(obj_key).split(os.path.sep)
+                # split_path: ['data', 'raw', 'Henry_B._Bigelow', 'HB0707', 'EK60', 'D20070712-T004447.raw']
+                for previous, current in zip(split_path, split_path[1:]):
+                    if not G.has_edge(previous, current):
+                        G.add_edge(previous, current)
+        # print(G)
+        etag_set = frozenset(
+            [k for j, k in list(G.nodes.data("ETag")) if k is not None]
+        )
+        new_hash = sha256(str(etag_set.__hash__()).encode("utf-8")).hexdigest()
+        total_size = [k for j, k in list(G.nodes.data("Size")) if k is not None]
+        print(np.sum(total_size))  # 22.24 Terabytes in Henry_B._Bigelow cruises
+        print(" ")
+        print(new_hash)
+        return new_hash
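One caveat on the new build_merkle_tree: Python salts str hashes per process (PYTHONHASHSEED), so a digest derived from frozenset.__hash__ over ETag strings changes between interpreter runs. A run-stable sketch would hash the sorted ETags directly:

# Hedged sketch of a reproducible digest; ETag values are placeholders.
from hashlib import sha256

etags = ["etag-aaa", "etag-bbb", "etag-ccc"]  # stand-ins for S3 ETag strings
digest = sha256("".join(sorted(etags)).encode("utf-8")).hexdigest()
print(digest)  # identical across runs for the same set of ETags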
water_column_sonar_processing/model/zarr_manager.py

@@ -1,17 +1,16 @@
+import importlib.metadata
+
 import numcodecs
 import numpy as np
 import xarray as xr
 import zarr
-import importlib.metadata
 from numcodecs import Blosc
 
 from water_column_sonar_processing.aws import S3FSManager
-from water_column_sonar_processing.utility import Constants
-from water_column_sonar_processing.utility import Timestamp
-from water_column_sonar_processing.utility import Coordinates
+from water_column_sonar_processing.utility import Constants, Coordinates, Timestamp
 
-numcodecs.blosc.use_threads = False
-numcodecs.blosc.set_nthreads(1)
+# numcodecs.blosc.use_threads = False
+# numcodecs.blosc.set_nthreads(1)
 
 
 # TODO: when ready switch to version 3 of model spec
@@ -36,20 +35,22 @@ class ZarrManager:
         self,
         min_echo_range: float = 1.0,  # minimum depth measured (zero non-inclusive) from whole cruise
         max_echo_range: float = 100.0,  # maximum depth measured from whole cruise
+        cruise_min_epsilon: float = 0.25,  # resolution between subsequent measurements
     ):
         # Gets the set of depth values that will be used when resampling and
         # regridding the data to a cruise level model store.
         # Note: returned values do not start at zero.
+        # For more info see here: https://echopype.readthedocs.io/en/stable/data-proc-additional.html
         print("Getting depth values.")
         all_cruise_depth_values = np.linspace(
             start=min_echo_range,
             stop=max_echo_range,
-            num=int(max_echo_range
+            num=int((max_echo_range - min_echo_range) / cruise_min_epsilon) + 1,
             endpoint=True,
-        )
+        )  # np.arange(min_echo_range, max_echo_range, step=min_echo_range)  # this is worse
 
         if np.any(np.isnan(all_cruise_depth_values)):
-            raise Exception(
+            raise Exception("Problem depth values returned were NaN.")
 
         print("Done getting depth values.")
         return all_cruise_depth_values.round(decimals=2)
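A worked example of the corrected num argument: with the defaults min_echo_range=1.0, max_echo_range=100.0, and cruise_min_epsilon=0.25, num = int((100.0 - 1.0) / 0.25) + 1 = 397, which yields a grid spaced exactly 0.25 m apart.

# Worked example of the depth grid using the default values above.
import numpy as np

depths = np.linspace(start=1.0, stop=100.0, num=397, endpoint=True).round(decimals=2)
print(depths[:4])  # [1.   1.25 1.5  1.75]
print(depths[-1])  # 100.0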
@@ -57,7 +58,7 @@ class ZarrManager:
     #######################################################
     def create_zarr_store(
         self,
-        path: str,
+        path: str,  # 'level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.model/tmp/HB0707.zarr/.zattrs'
         ship_name: str,
         cruise_name: str,
         sensor_name: str,
@@ -65,6 +66,7 @@ class ZarrManager:
         width: int,  # TODO: needs better name... "ping_time"
         min_echo_range: float,  # smallest resolution in meters
         max_echo_range: float,
+        cruise_min_epsilon: float,
         calibration_status: bool = False,  # Assume uncalibrated
     ) -> str:
         print(
@@ -105,7 +107,9 @@ class ZarrManager:
         #####################################################################
         # --- Coordinate: Depth --- #
         depth_values = self.get_depth_values(
-            min_echo_range=min_echo_range,
+            min_echo_range=min_echo_range,
+            max_echo_range=max_echo_range,
+            cruise_min_epsilon=cruise_min_epsilon,
         )
 
         root.create_dataset(
@@ -123,7 +127,7 @@ class ZarrManager:
         )
 
         if np.any(np.isnan(depth_values)):
-            raise Exception(
+            raise Exception("Some depth values returned were NaN.")
 
         root.depth.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.DEPTH.value]
 
@@ -171,7 +175,9 @@ class ZarrManager:
 
         root.longitude.attrs["units"] = Coordinates.LONGITUDE_UNITS.value
         root.longitude.attrs["long_name"] = Coordinates.LONGITUDE_LONG_NAME.value
-        root.longitude.attrs["standard_name"] =
+        root.longitude.attrs["standard_name"] = (
+            Coordinates.LONGITUDE_STANDARD_NAME.value
+        )
 
         #####################################################################
         # TODO: verify adding this variable for where the bottom was detected
@@ -224,7 +230,11 @@ class ZarrManager:
             name=Coordinates.SV.value,
             shape=(len(depth_values), width, len(frequencies)),
             # chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, len(frequencies)),
-            chunks=(
+            chunks=(
+                Constants.TILE_SIZE.value,
+                Constants.TILE_SIZE.value,
+                1,
+            ),  # 256x256x1 <- speed up for alex
             dtype=np.dtype(
                 Coordinates.SV_DTYPE.value
             ),  # TODO: try to experiment with 'float16'
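Why the (256, 256, 1) chunking helps: a map-tile read of a single frequency now touches exactly one chunk instead of a chunk spanning every frequency. A sketch using the zarr-python 2.x API; store name and shape are illustrative, not the package's layout:

# Sketch only: per-frequency chunking means single-frequency tile reads hit one chunk.
import numpy as np
import zarr

root = zarr.open("example.zarr", mode="w")  # local store for illustration
sv = root.create_dataset(
    "Sv",
    shape=(4096, 4096, 4),  # (depth, ping_time, frequency)
    chunks=(256, 256, 1),   # one frequency slab per chunk
    dtype="float32",
    fill_value=np.nan,
)
tile = sv[0:256, 0:256, 2]  # reads exactly one 256x256 chunk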
@@ -251,7 +261,9 @@ class ZarrManager:
         #
         root.attrs["processing_software_name"] = Coordinates.PROJECT_NAME.value
 
-        current_project_version = importlib.metadata.version(
+        current_project_version = importlib.metadata.version(
+            "water_column_sonar_processing"
+        )
         root.attrs["processing_software_version"] = current_project_version
         root.attrs["processing_software_time"] = Timestamp.get_timestamp()
         #
@@ -317,16 +329,14 @@ class ZarrManager:
         input_bucket_name: str,
         endpoint_url=None,
     ) -> xr.Dataset:
-        print(
+        print(
+            "Opening L1 Zarr store in S3 with Xarray."
+        )  # TODO: Is this only used for reading from?
         try:
             zarr_path = f"s3://{input_bucket_name}/level_1/{ship_name}/{cruise_name}/{sensor_name}/{file_name_stem}.zarr"
             s3fs_manager = S3FSManager(endpoint_url=endpoint_url)
             store_s3_map = s3fs_manager.s3_map(s3_zarr_store_path=zarr_path)
-            ds = xr.open_dataset(
-                filename_or_obj=store_s3_map,
-                engine="zarr",
-                chunks={}
-            )
+            ds = xr.open_dataset(filename_or_obj=store_s3_map, engine="zarr", chunks={})
         except Exception as err:
             print("Problem opening Zarr store in S3 as Xarray.")
             raise err
@@ -353,6 +363,7 @@ class ZarrManager:
             raise err
         print("Done opening Zarr store in S3 as Xarray.")
         return ds
+
     ############################################################################
 
     #######################################################
water_column_sonar_processing/process.py

@@ -3,10 +3,12 @@ import os
 
 import numpy as np
 
-from water_column_sonar_processing.aws import
-
-
-
+from water_column_sonar_processing.aws import (
+    DynamoDBManager,
+    S3FSManager,
+    S3Manager,
+    SNSManager,
+)
 
 
 ###########################################################
@@ -23,9 +25,9 @@ class Process:
         # self.output_bucket_secret_access_key = ?
 
     def execute(self):
-        input_s3_manager = (
-
-        )  # TODO: Need to allow passing in of credentials when writing to protected bucket
+        # input_s3_manager = (
+        #     S3Manager()
+        # )  # TODO: Need to allow passing in of credentials when writing to protected bucket
         s3fs_manager = S3FSManager()  # TODO: delete this
         print(s3fs_manager)  # TODO: delete this
         output_s3_manager = S3Manager()
@@ -76,8 +78,8 @@ class Process:
                 "#SE": "SENSOR_NAME",
                 "#SH": "SHIP_NAME",
                 "#ST": "START_TIME",
-                "#ZB": "ZARR_BUCKET",
-                "#ZP": "ZARR_PATH",
+                # "#ZB": "ZARR_BUCKET",
+                # "#ZP": "ZARR_PATH",
             },
             expression_attribute_values={
                 ":ch": {"L": [{"S": i} for i in test_channels]},
@@ -92,10 +94,10 @@ class Process:
                 ":se": {"S": sensor_name},
                 ":sh": {"S": ship_name},
                 ":st": {"S": "2006-04-06T11:34:07.288Z"},
-                ":zb": {"S": "r2d2-dev-echofish2-118234403147-echofish-dev-output"},
-                ":zp": {
-
-                },
+                # ":zb": {"S": "r2d2-dev-echofish2-118234403147-echofish-dev-output"},
+                # ":zp": {
+                #     "S": "level_1/David_Starr_Jordan/DS0604/EK60/DSJ0604-D20060406-T113407.model"
+                # },
             },
             update_expression=(
                 "SET "