water-column-sonar-processing 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- water_column_sonar_processing/__init__.py +2 -5
- water_column_sonar_processing/aws/__init__.py +2 -2
- water_column_sonar_processing/aws/dynamodb_manager.py +149 -43
- water_column_sonar_processing/aws/s3_manager.py +71 -37
- water_column_sonar_processing/cruise/create_empty_zarr_store.py +6 -4
- water_column_sonar_processing/cruise/resample_regrid.py +3 -3
- water_column_sonar_processing/geometry/geometry_manager.py +21 -6
- water_column_sonar_processing/geometry/pmtile_generation.py +200 -13
- water_column_sonar_processing/index/index_manager.py +25 -13
- water_column_sonar_processing/model/zarr_manager.py +27 -25
- water_column_sonar_processing/process.py +4 -4
- water_column_sonar_processing/processing/__init__.py +4 -0
- water_column_sonar_processing/processing/cruise_sampler.py +342 -0
- water_column_sonar_processing/processing/raw_to_zarr.py +349 -0
- water_column_sonar_processing/utility/__init__.py +2 -2
- water_column_sonar_processing/utility/cleaner.py +1 -0
- water_column_sonar_processing/utility/constants.py +6 -2
- {water_column_sonar_processing-0.0.7.dist-info → water_column_sonar_processing-0.0.9.dist-info}/METADATA +20 -10
- water_column_sonar_processing-0.0.9.dist-info/RECORD +32 -0
- {water_column_sonar_processing-0.0.7.dist-info → water_column_sonar_processing-0.0.9.dist-info}/WHEEL +1 -1
- water_column_sonar_processing-0.0.7.dist-info/RECORD +0 -29
- {water_column_sonar_processing-0.0.7.dist-info → water_column_sonar_processing-0.0.9.dist-info}/LICENSE +0 -0
- {water_column_sonar_processing-0.0.7.dist-info → water_column_sonar_processing-0.0.9.dist-info}/top_level.txt +0 -0
water_column_sonar_processing/geometry/pmtile_generation.py

@@ -1,14 +1,22 @@
+import glob
 import os
 from pathlib import Path
-
-# from shapely import wkt
-# import json
-# from shapely.geometry import shape, GeometryCollection
 import fiona
-import
+import s3fs
+import numpy as np
 import pandas as pd
+import xarray as xr
+import geopandas
+import geopandas as gpd
+import pyogrio
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from shapely.geometry import LineString
 
+MAX_POOL_CONNECTIONS = 64
+MAX_CONCURRENCY = 64
+MAX_WORKERS = 64
+GB = 1024**3
+
 
 class PMTileGeneration(object):
     #######################################################
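The four module constants above are defined in this hunk but not used by it. Assuming they are meant to bound S3 connection pooling, transfer concurrency, and thread fan-out elsewhere in the package (a guess; their call sites are not shown here), typical wiring would look like this sketch:

    # Hypothetical wiring for the new constants; the diff defines them
    # but this hunk does not show where they are consumed.
    import boto3
    from botocore.config import Config
    from concurrent.futures import ThreadPoolExecutor

    MAX_POOL_CONNECTIONS = 64
    MAX_WORKERS = 64
    GB = 1024**3

    s3_client = boto3.client(
        "s3",
        config=Config(max_pool_connections=MAX_POOL_CONNECTIONS),
    )
    executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
    multipart_threshold = 5 * GB  # GB = 1024**3 keeps byte arithmetic readable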
@@ -18,34 +26,35 @@ class PMTileGeneration(object):
         print("123")
 
     #######################################################
+    # This uses a local collection of file-level geojson files to create the data
     def generate_geojson_feature_collection(self):
         # This was used to read from noaa-wcsd-model-pds bucket geojson files and then to
         # generate the geopandas dataframe which could be exported to another comprehensive
         # geojson file. That
         result = list(Path("/Users/r2d2/Documents/echofish/geojson").rglob("*.json"))
         # result = result[:100]
-
+        jjj = 0
         pieces = []
-        for
-        file_name = os.path.normpath(result[
+        for jjj in range(len(result)):
+            file_name = os.path.normpath(result[jjj]).split(os.sep)[-1]
             file_stem = os.path.splitext(os.path.basename(file_name))[0]
-            geom =
+            geom = gpd.read_file(result[jjj]).iloc[0]["geometry"]
             # TDOO: Filter (0,0) coordinates
             if len(geom.coords.xy[0]) < 2:
                 continue
             geom = LineString(list(zip(geom.coords.xy[1], geom.coords.xy[0])))
             pieces.append(
                 {
-                    "ship_name": os.path.normpath(result[
-                    "cruise_name": os.path.normpath(result[
+                    "ship_name": os.path.normpath(result[jjj]).split(os.sep)[-4],
+                    "cruise_name": os.path.normpath(result[jjj]).split(os.sep)[-3],
                     "file_stem": file_stem,
-                    "file_path": result[
+                    "file_path": result[jjj],
                     "geom": geom,
                 }
             )
         df = pd.DataFrame(pieces)
         print(df)
-        gps_gdf =
+        gps_gdf = gpd.GeoDataFrame(
             data=df[
                 ["ship_name", "cruise_name", "file_stem"]
             ],  # try again with file_stem
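The rewritten loop reads the first geometry from each GeoJSON file and rebuilds the LineString with its axis order flipped, `zip(geom.coords.xy[1], geom.coords.xy[0])`, which implies the source files store (lat, lon) pairs while GeoJSON expects (lon, lat). A minimal sketch of that swap, using an inline geometry instead of files on disk:

    from shapely.geometry import LineString

    # A track stored as (lat, lon) pairs; GeoJSON expects (lon, lat).
    track = LineString([(39.65, -72.12), (39.66, -72.13)])

    xs, ys = track.coords.xy  # here xs are latitudes, ys are longitudes
    flipped = LineString(list(zip(ys, xs)))  # the same swap as in the diff

    print(flipped)  # LINESTRING (-72.12 39.65, -72.13 39.66)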
@@ -70,6 +79,184 @@ class PMTileGeneration(object):
     """
 
     #######################################################
+    # TODO: temporary using this to get info
+    def get_info_from_zarr_store(
+        self,
+        ship_name,
+        cruise_names,
+    ):
+        total_size = 0
+        s3_fs = s3fs.S3FileSystem(anon=True)
+        for cruise_name in cruise_names:
+            path_to_zarr_store = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
+            zarr_store = s3fs.S3Map(root=path_to_zarr_store, s3=s3_fs)
+            xr_store = xr.open_zarr(store=zarr_store, consolidated=None)
+            print(f'Cruise: {cruise_name}, shape: {xr_store.time.shape[0]}')
+            total_size = total_size + xr_store.time.shape[0]
+
+    def get_geospatial_info_from_zarr_store(
+        self,
+        ship_name,
+        cruise_name,
+    ):
+        """
+        Open Zarr store, create geometry, write to geojson, return name
+        """
+        s3_fs = s3fs.S3FileSystem(anon=True)
+        gps_gdf = geopandas.GeoDataFrame(
+            columns=["id", "ship", "cruise", "sensor", "geometry"],
+            geometry="geometry",
+            crs="EPSG:4326"
+        )
+        path_to_zarr_store = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
+        # file_name = os.path.normpath(path_to_zarr_store).split(os.sep)[-1]
+        # file_stem = os.path.splitext(os.path.basename(file_name))[0]
+        zarr_store = s3fs.S3Map(root=path_to_zarr_store, s3=s3_fs)
+        # ---Open Zarr Store--- #
+        # TODO: try-except to allow failures
+        print('opening store')
+        # xr_store = xr.open_zarr(store=zarr_store, consolidated=False)
+        xr_store = xr.open_zarr(store=zarr_store, consolidated=None)
+        print(xr_store.Sv.shape)
+        # ---Read Zarr Store Time/Latitude/Longitude--- #
+        latitude = xr_store.latitude.values
+        longitude = xr_store.longitude.values
+        if np.isnan(latitude).any() or np.isnan(longitude).any():
+            print(f'there was missing lat-lon data for {cruise_name}')
+            return None
+        # ---Add To GeoPandas Dataframe--- #
+        # TODO: experiment with tolerance "0.001"
+        geom = LineString(list(zip(longitude, latitude))).simplify(tolerance=0.001, preserve_topology=True)
+        gps_gdf.loc[0] = (0, "Henry_B._Bigelow", cruise_name, "EK60", geom)  # (ship, cruise, sensor, geometry)
+        gps_gdf.set_index('id', inplace=True)
+        gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON")  # , engine="pyogrio")
+        return cruise_name
+
+    #######################################################
+    def open_zarr_stores_with_thread_pool_executor(
+        self,
+        cruises: list,
+    ):
+        # 'cruises' is a list of cruises to process
+        completed_cruises = []
+        try:
+            with ThreadPoolExecutor(max_workers=32) as executor:
+                futures = [
+                    executor.submit(
+                        self.get_geospatial_info_from_zarr_store,
+                        "Henry_B._Bigelow",  # ship_name
+                        cruise,  # cruise_name
+                    )
+                    for cruise in cruises
+                ]
+                for future in as_completed(futures):
+                    result = future.result()
+                    if result:
+                        completed_cruises.extend([result])
+        except Exception as err:
+            print(err)
+        print("Done opening zarr stores using thread pool.")
+        return completed_cruises  # Took ~12 minutes
+
+    #######################################################
+    # https://docs.protomaps.com/pmtiles/create
+    def aggregate_geojson_into_dataframe(
+        self
+    ):
+        """
+        iterate through cruises, threadpoolexecute geojson creation, aggregate geojson files into df,
+        """
+        gps_gdf = geopandas.GeoDataFrame(
+            columns=["id", "ship", "cruise", "sensor", "geometry"],
+            geometry="geometry",
+            crs="EPSG:4326"
+        )
+
+        file_type = 'dataframe_*.geojson'
+        geojson_files = glob.glob(file_type)
+        for jjj in range(len(geojson_files)):
+            print(jjj)
+            geom = geopandas.read_file(geojson_files[jjj])
+            gps_gdf.loc[jjj] = (jjj, geom.ship[0], geom.cruise[0], geom.sensor[0], geom.geometry[0])
+            # gps_gdf.loc[0] = (0, "Henry_B._Bigelow", cruise_name, "EK60", geom)  # (ship, cruise, sensor, geometry)
+        print(gps_gdf)
+        gps_gdf.set_index('id', inplace=True)
+        gps_gdf.to_file(f"data.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
+        return list(gps_gdf.cruise)
+
+        # gps_gdf.loc[iii] = (iii, "Henry_B._Bigelow", cruise_name, "EK60", geom)  # (ship, cruise, sensor, geometry)
+        # print('writing to file')
+        # print(gps_gdf)
+        # gps_gdf.set_index('id', inplace=True)
+        # gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
+        # https://gdal.org/en/latest/drivers/vector/jsonfg.html
+        # gps_gdf.to_file(
+        #     f"data.geojson",
+        #     driver="GeoJSON",
+        #     engine="pyogrio",
+        #     layer_options={"ID_FIELD": "id"}
+        # )
+        # gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON", engine="pyogrio", id_generate=True)
 
+    # print(fiona.supported_drivers)  # {'DXF': 'rw', 'CSV': 'raw', 'OpenFileGDB': 'raw', 'ESRIJSON': 'r', 'ESRI Shapefile': 'raw', 'FlatGeobuf': 'raw', 'GeoJSON': 'raw', 'GeoJSONSeq': 'raw', 'GPKG': 'raw', 'GML': 'rw', 'OGR_GMT': 'rw', 'GPX': 'rw', 'MapInfo File': 'raw', 'DGN': 'raw', 'S57': 'r', 'SQLite': 'raw', 'TopoJSON': 'r'}
+    # gps_gdf.to_file('dataframe.shp', crs="EPSG:4326", engine="fiona")
+    # Convert geojson feature collection to pmtiles
+    # gps_gdf.to_file("dataframe.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona")
+    # print("done")
+    # ---Export Shapefile--- #
 
+
+
+    # gps_gdf.set_geometry(col='geometry', inplace=True)
+    # gps_gdf.__geo_interface__
+    # gps_gdf.set_index('id', inplace=True)
+    # gps_gdf.to_file(f"dataframe3.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona", index=True)
+
+    ### this gives the right layer id values
+    # gps_gdf.to_file(f"dataframe6.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
+    # jq '{"type": "FeatureCollection", "features": [.[] | .features[]]}' --slurp input*.geojson > output.geojson
+    # tippecanoe -zg --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises output.geojson
+    # tippecanoe -zg --convert-stringified-ids-to-numbers --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises dataframe*.geojson
+    # {
+    # "type": "FeatureCollection",
+    # "name": "dataframe5",
+    # "features": [
+    # { "type": "Feature", "id": 0, "properties": { "id": 0, "ship": "Henry_B._Bigelow", "cruise": "HB0706", "sensor": "EK60" }, "geometry": { "type": "LineString", "coordinates": [ [ -72.120498657226562, 39.659671783447266 ], [ -72.120773315429688, 39.660198211669922 ] ] } },
+    # { "type": "Feature", "id": 1, "properties": { "id": 1, "ship": "Henry_B._Bigelow", "cruise": "HB0707", "sensor": "EK60" }, "geometry": { "type": "LineString", "coordinates": [ [ -71.797836303710938, 41.003166198730469 ], [ -71.797996520996094, 41.002998352050781 ], [ -71.798583984375, 41.002994537353516 ] ] } },
+    # { "type": "Feature", "id": 2, "properties": { "id": 2, "ship": "Henry_B._Bigelow", "cruise": "HB0710", "sensor": "EK60" }, "geometry": { "type": "LineString", "coordinates": [ [ -72.489486694335938, 40.331901550292969 ], [ -72.490760803222656, 40.33099365234375 ] ] } }
+    # ]
+    # }
+    """
+    # https://docs.protomaps.com/pmtiles/create
+    # ogr2ogr -t_srs EPSG:4326 data.geojson dataframe.shp
+    # Only need to do the second one here...
+    tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises dataframe.geojson
+    tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
+    # used this to combine all the geojson files into single pmtile file (2024-12-03):
+    tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
+
+    TODO:
+    run each one of the cruises in a separate ospool workflow.
+    each process gets own store
+    """
     ###########################################################
+
+    # s3_manager = S3Manager()  # endpoint_url=endpoint_url)
+    # # s3fs_manager = S3FSManager()
+    # # input_bucket_name = "test_input_bucket"
+    # # s3_manager.create_bucket(bucket_name=input_bucket_name)
+    # ship_name = "Henry_B._Bigelow"
+    # cruise_name = "HB0706"
+    # sensor_name = "EK60"
+    #
+    # # ---Scan Bucket For All Zarr Stores--- #
+    # # https://noaa-wcsd-zarr-pds.s3.amazonaws.com/index.html#level_2/Henry_B._Bigelow/HB0706/EK60/HB0706.zarr/
+    # path_to_zarr_store = f"s3://noaa-wcsd-zarr-pds/level_2/Henry_B._Bigelow/HB0706/EK60/HB0706.zarr"
+    # s3 = s3fs.S3FileSystem()
+    # zarr_store = s3fs.S3Map(path_to_zarr_store, s3=s3)
+    # ds_zarr = xr.open_zarr(zarr_store, consolidated=None)
+    # print(ds_zarr.Sv.shape)
+
+
+
+    total = [246847, 89911, 169763, 658047, 887640, 708771, 187099, 3672813, 4095002, 763268, 162727, 189454, 1925270, 3575857, 1031920, 1167590, 3737415, 4099957, 3990725, 3619996, 3573052, 2973090, 55851, 143192, 1550164, 3692819, 668400, 489735, 393260, 1311234, 242989, 4515760, 1303091, 704663, 270645, 3886437, 4204381, 1062090, 428639, 541455, 4206506, 298561, 1279329, 137416, 139836, 228947, 517949]
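Distilled, the methods added above follow one pattern: open a public Level-2 Zarr store anonymously, reduce the cruise track to a simplified LineString, and fan the per-cruise work out over a thread pool. A condensed sketch (the bucket layout, ship, and cruise names are taken from the diff; printing and GeoJSON output are trimmed):

    from concurrent.futures import ThreadPoolExecutor, as_completed

    import numpy as np
    import s3fs
    import xarray as xr
    from shapely.geometry import LineString

    def cruise_track(ship_name, cruise_name):
        # Return a simplified track for one cruise, or None if GPS data is incomplete.
        s3_fs = s3fs.S3FileSystem(anon=True)  # public bucket, no credentials needed
        store = s3fs.S3Map(
            root=f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr",
            s3=s3_fs,
        )
        ds = xr.open_zarr(store=store, consolidated=None)
        latitude, longitude = ds.latitude.values, ds.longitude.values
        if np.isnan(latitude).any() or np.isnan(longitude).any():
            return None  # mirrors the diff: skip cruises with missing lat-lon data
        # A tolerance near 0.001 degrees keeps each track small enough for tiling.
        return LineString(list(zip(longitude, latitude))).simplify(
            tolerance=0.001, preserve_topology=True
        )

    with ThreadPoolExecutor(max_workers=32) as executor:
        futures = {
            executor.submit(cruise_track, "Henry_B._Bigelow", cruise): cruise
            for cruise in ["HB0706", "HB0707"]
        }
        tracks = {futures[f]: f.result() for f in as_completed(futures)}

The per-cruise GeoJSON files are then merged and handed to tippecanoe for PMTiles generation, per the commands quoted in the block string above.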
water_column_sonar_processing/index/index_manager.py

@@ -4,7 +4,7 @@ import pandas as pd
 from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor
 from concurrent.futures import as_completed
-from water_column_sonar_processing.aws
+from water_column_sonar_processing.aws import S3Manager
 
 
 class IndexManager:
@@ -16,12 +16,10 @@ class IndexManager:
         self.s3_manager = S3Manager()
 
     #################################################################
-
     def list_ships(
         self,
         prefix="data/raw/",
     ):
-        # s3_client = self.s3_manager.s3_client
         page_iterator = self.s3_manager.paginator.paginate(
             Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
         )
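`list_ships` relies on S3 delimiter semantics: paginating with `Delimiter="/"` returns each immediate child "folder" under the prefix as a CommonPrefixes entry instead of a flat object listing. A sketch of that pattern with a bare boto3 client (the bucket name here is an assumption for illustration; `IndexManager` reads `self.input_bucket_name` from its configuration):

    import boto3

    s3_client = boto3.client("s3")
    paginator = s3_client.get_paginator("list_objects_v2")

    ships = []
    for page in paginator.paginate(
        Bucket="noaa-wcsd-pds",  # assumed bucket name, not taken from the diff
        Prefix="data/raw/",
        Delimiter="/",
    ):
        # With a delimiter, "subfolders" arrive as CommonPrefixes entries,
        # e.g. {"Prefix": "data/raw/Henry_B._Bigelow/"}.
        for common_prefix in page.get("CommonPrefixes", []):
            ships.append(common_prefix["Prefix"].split("/")[-2])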
@@ -79,6 +77,7 @@ class IndexManager:
             all_files.extend([i["Key"] for i in page["Contents"]])
         return [i for i in all_files if i.endswith(".raw")]
 
+    #################################################################
     def get_raw_files_csv(
         self,
         ship_name,
@@ -86,7 +85,9 @@
         sensor_name,
     ):
         raw_files = self.get_raw_files(
-            ship_name=ship_name,
+            ship_name=ship_name,
+            cruise_name=cruise_name,
+            sensor_name=sensor_name
         )
         files_list = [
             {
@@ -102,7 +103,10 @@
         print("done")
 
     #################################################################
-    def get_subset_ek60_prefix(
+    def get_subset_ek60_prefix(  # TODO: is this used?
+        self,
+        df: pd.DataFrame
+    ) -> pd.DataFrame:
         # Returns all objects with 'EK60' in prefix of file path
         # Note that this can include 'EK80' data that are false-positives
         # in dataframe with ['key', 'filename', 'ship', 'cruise', 'sensor', 'size', 'date', 'datagram']
@@ -119,13 +123,13 @@
                 2:5
             ]  # 'Okeanos_Explorer', 'EX1608', 'EK60'
             if (
-                re.search("[D](
-                and re.search("[T](
+                re.search("[D](\\d{8})", filename) is not None
+                and re.search("[T](\\d{6})", filename) is not None
             ):
                 # Parse date if possible e.g.: 'data/raw/Henry_B._Bigelow/HB1006/EK60/HBB-D20100723-T025105.raw'
                 # and 'data/raw/Henry_B._Bigelow/HB1802/EK60/D20180513-T150250.raw'
-                date_substring = re.search("[D](
-                time_substring = re.search("[T](
+                date_substring = re.search("[D](\\d{8})", filename).group(1)
+                time_substring = re.search("[T](\\d{6})", filename).group(1)
                 date_string = datetime.strptime(
                     f"{date_substring}{time_substring}", "%Y%m%d%H%M%S"
                 )
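The restored regular expressions pull an eight-digit date after "D" and a six-digit time after "T" out of the raw-file name, then fuse the two into one timestamp. Worked on one of the example keys quoted in the comments:

    import re
    from datetime import datetime

    filename = "HBB-D20100723-T025105.raw"

    date_substring = re.search("[D](\\d{8})", filename).group(1)  # '20100723'
    time_substring = re.search("[T](\\d{6})", filename).group(1)  # '025105'

    timestamp = datetime.strptime(f"{date_substring}{time_substring}", "%Y%m%d%H%M%S")
    print(timestamp)  # 2010-07-23 02:51:05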
@@ -146,7 +150,10 @@
         return pd.DataFrame(objects)
 
     #################################################################
-    def scan_datagram(
+    def scan_datagram(
+        self,
+        select_key: str
+    ) -> list:
         # Reads the first 8 bytes of S3 file. Used to determine if ek60 or ek80
         # Note: uses boto3 session instead of boto3 client: https://github.com/boto/boto3/issues/801
         # select_key = 'data/raw/Albatross_Iv/AL0403/EK60/L0005-D20040302-T200108-EK60.raw'
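`scan_datagram` is documented as reading only the first 8 bytes of the S3 object to classify the sensor; in Simrad raw files the four bytes after the leading length field name the first datagram, which is what the later `df["DATAGRAM"] == "CON0"` filter keys on (CON0 for EK60, XML0 for EK80). The method body is not shown in this hunk; a ranged GET is one cheap way to do it, sketched here rather than taken from the package:

    import boto3

    session = boto3.Session()  # the comments note a session is used, not a bare client
    s3_client = session.client("s3")

    def scan_datagram(bucket: str, select_key: str) -> str:
        # Ranged GET: fetch only bytes 0-7 instead of the whole multi-GB raw file.
        response = s3_client.get_object(Bucket=bucket, Key=select_key, Range="bytes=0-7")
        header = response["Body"].read()
        # Bytes 4:8 hold the first datagram type, e.g. b"CON0" (EK60) or b"XML0" (EK80).
        return header[4:8].decode("ascii", errors="replace")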
@@ -162,7 +169,10 @@
         return first_datagram
 
     #################################################################
-    def get_subset_datagrams(
+    def get_subset_datagrams(
+        self,
+        df: pd.DataFrame
+    ) -> list:
         print("getting subset of datagrams")
         select_keys = list(
             df[["KEY", "CRUISE"]].drop_duplicates(subset="CRUISE")["KEY"].values
@@ -181,7 +191,9 @@
 
     #################################################################
     def get_ek60_objects(
-        self,
+        self,
+        df: pd.DataFrame,
+        subset_datagrams: list
     ) -> pd.DataFrame:
         # for each key write datagram value to all other files in same cruise
         for subset_datagram in subset_datagrams:
@@ -195,7 +207,7 @@
         return df.loc[df["DATAGRAM"] == "CON0"]
 
     #################################################################
-    def get_calibration_information(
+    def get_calibration_information(
         self,
     ) -> pd.DataFrame:
         # Calibration data generated by data manager currently located here:
water_column_sonar_processing/model/zarr_manager.py

@@ -1,14 +1,14 @@
 import os
-
 import numcodecs
 import numpy as np
 import xarray as xr
 import zarr
 from numcodecs import Blosc
 
-from water_column_sonar_processing.aws
-from water_column_sonar_processing.utility
-from water_column_sonar_processing.utility
+from water_column_sonar_processing.aws import S3FSManager
+from water_column_sonar_processing.utility import Constants
+from water_column_sonar_processing.utility import Timestamp
+from water_column_sonar_processing.utility import Coordinates
 
 numcodecs.blosc.use_threads = False
 numcodecs.blosc.set_nthreads(1)
@@ -32,8 +32,8 @@ class ZarrManager:
         self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
 
     #######################################################
-    @staticmethod
     def get_depth_values(
+        self,
         min_echo_range: float = 1.0,  # minimum depth measured (zero non-inclusive) from whole cruise
         max_echo_range: float = 100.0,  # maximum depth measured from whole cruise
     ):
@@ -85,12 +85,11 @@ class ZarrManager:
             name=Coordinates.TIME.value,
             data=np.repeat(0.0, width),
             shape=width,
-            chunks=
-            Constants.TILE_SIZE.value,
-            ),  # TODO: the chunking scheme doesn't seem to be working here
+            chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
+            # Constants.TILE_SIZE.value,
+            # ),  # TODO: the chunking scheme doesn't seem to be working here
             dtype=np.dtype(Coordinates.TIME_DTYPE.value),
             compressor=self.__compressor,
-            # fill_value=0.,
             fill_value=np.nan,  # TODO: do i want nan's?
             overwrite=self.__overwrite,
         )
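The recurring fix in this file gives every 1-D coordinate an explicit chunk size and fill value in place of the truncated `chunks=` / `fill_value=` arguments. In isolation the pattern is (the constant values below are illustrative stand-ins for the package's `Constants` and `Coordinates` enums):

    import numpy as np
    import zarr
    from numcodecs import Blosc

    SPATIOTEMPORAL_CHUNK_SIZE = 2048  # stand-in for Constants.SPATIOTEMPORAL_CHUNK_SIZE.value
    width = 10_000                    # number of pings in the cruise

    root = zarr.group(store=zarr.MemoryStore(), overwrite=True)
    time_array = root.create_dataset(
        name="time",
        shape=width,
        chunks=SPATIOTEMPORAL_CHUNK_SIZE,
        dtype=np.dtype("float64"),
        compressor=Blosc(cname="zstd", clevel=5),
        fill_value=np.nan,  # unwritten pings read back as NaN
        overwrite=True,
    )
    print(time_array.chunks)  # (2048,)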
@@ -113,12 +112,12 @@ class ZarrManager:
             # TODO: verify that these values are correct
             data=depth_values,
             shape=len(depth_values),
-            chunks=Constants.
+            chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
             dtype=np.dtype(
                 Coordinates.DEPTH_DTYPE.value
             ),  # float16 == 2 significant digits would be ideal
             compressor=self.__compressor,
-
+            fill_value=np.nan,
             overwrite=self.__overwrite,
         )
         # TODO: change to exception
@@ -133,15 +132,16 @@ class ZarrManager:
         # --- Coordinate: Latitude --- #
         root.create_dataset(
             name=Coordinates.LATITUDE.value,
-            data=np.repeat(0.0, width),
+            # data=np.repeat(0.0, width),
             shape=width,
-            chunks=Constants.
+            chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
             dtype=np.dtype(Coordinates.LATITUDE_DTYPE.value),
             compressor=self.__compressor,
-            fill_value=
+            fill_value=np.nan,
             overwrite=self.__overwrite,
         )
 
+        # Note: LATITUDE is indexed by TIME
         root.latitude.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
 
         root.latitude.attrs["long_name"] = Coordinates.LATITUDE_LONG_NAME.value
@@ -151,15 +151,16 @@ class ZarrManager:
         # --- Coordinate: Longitude --- #
         root.create_dataset(
             name=Coordinates.LONGITUDE.value,
-            data=np.repeat(0.0, width),  # root.longitude[:] = np.nan
+            # data=np.repeat(0.0, width),  # root.longitude[:] = np.nan
             shape=width,
-            chunks=Constants.
+            chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
             dtype=np.dtype(Coordinates.LONGITUDE_DTYPE.value),
             compressor=self.__compressor,
-            fill_value=
+            fill_value=np.nan,
             overwrite=self.__overwrite,
         )
 
+        # Note: LONGITUDE is indexed by TIME
         root.longitude.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
 
         root.longitude.attrs["long_name"] = Coordinates.LONGITUDE_LONG_NAME.value
@@ -170,19 +171,20 @@ class ZarrManager:
         # --- Coordinate: Bottom --- #
         root.create_dataset(
             name=Coordinates.BOTTOM.value,
-
+            data=np.repeat(0.0, width),  # root.longitude[:] = np.nan
             shape=width,
-            chunks=Constants.
+            chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
             dtype=np.dtype(Coordinates.BOTTOM_DTYPE.value),
             compressor=self.__compressor,
-            fill_value=
+            fill_value=0.0,
             overwrite=self.__overwrite,
         )
 
-
+        # BOTTOM is indexed by TIME
+        root.bottom.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
 
-        root.
-        root.
+        root.bottom.attrs["long_name"] = Coordinates.BOTTOM_LONG_NAME.value
+        root.bottom.attrs["units"] = Coordinates.BOTTOM_UNITS.value
 
     #####################################################################
     # --- Coordinate: Frequency --- #
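The `_ARRAY_DIMENSIONS` attribute added for `bottom` is the xarray-on-Zarr convention that maps a plain Zarr array onto a named dimension; without it `xr.open_zarr` cannot tell that `bottom`, `latitude`, and `longitude` are all indexed by `time`. A small round trip under that convention:

    import numpy as np
    import xarray as xr
    import zarr

    store = zarr.MemoryStore()
    root = zarr.group(store=store, overwrite=True)

    time = root.create_dataset(name="time", shape=4, dtype="float64", fill_value=np.nan)
    time.attrs["_ARRAY_DIMENSIONS"] = ["time"]

    bottom = root.create_dataset(name="bottom", shape=4, dtype="float32", fill_value=0.0)
    bottom.attrs["_ARRAY_DIMENSIONS"] = ["time"]  # bottom is indexed by time
    bottom.attrs["units"] = "m"

    ds = xr.open_zarr(store=store, consolidated=False)
    print(ds.bottom.dims)  # ('time',)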
@@ -190,7 +192,7 @@ class ZarrManager:
             name=Coordinates.FREQUENCY.value,
             data=frequencies,
             shape=len(frequencies),
-            chunks=
+            chunks=len(frequencies),
             dtype=np.dtype(Coordinates.FREQUENCY_DTYPE.value),
             compressor=self.__compressor,
             fill_value=0.0,
@@ -213,7 +215,7 @@ class ZarrManager:
         root.create_dataset(
             name=Coordinates.SV.value,
             shape=(len(depth_values), width, len(frequencies)),
-            chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value,
+            chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, len(frequencies)),
             dtype=np.dtype(
                 Coordinates.SV_DTYPE.value
             ),  # TODO: try to experiment with 'float16'
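Pinning the third chunk dimension of `Sv` to `len(frequencies)` keeps every frequency for a given depth-by-time tile in one chunk, so rendering a tile reads a single chunk instead of one per frequency. Roughly, with illustrative sizes:

    import numpy as np
    import zarr

    TILE_SIZE = 512  # stand-in for Constants.TILE_SIZE.value
    depths, pings, frequencies = 1_000, 100_000, 4

    root = zarr.group(store=zarr.MemoryStore(), overwrite=True)
    sv = root.create_dataset(
        name="Sv",
        shape=(depths, pings, frequencies),
        chunks=(TILE_SIZE, TILE_SIZE, frequencies),  # one chunk spans all frequencies
        dtype="float32",
        fill_value=np.nan,
    )
    print(sv.chunks)   # (512, 512, 4)
    print(sv.nchunks)  # ceil(1000/512) * ceil(100000/512) * 1 == 392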
water_column_sonar_processing/process.py

@@ -3,10 +3,10 @@ import os
 
 import numpy as np
 
-from water_column_sonar_processing.aws
-from water_column_sonar_processing.aws
-from water_column_sonar_processing.aws
-from water_column_sonar_processing.aws
+from water_column_sonar_processing.aws import DynamoDBManager
+from water_column_sonar_processing.aws import S3Manager
+from water_column_sonar_processing.aws import S3FSManager
+from water_column_sonar_processing.aws import SNSManager
 
 
 ###########################################################