water-column-sonar-processing 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Potentially problematic release: this version of water-column-sonar-processing might be problematic.
- water_column_sonar_processing/__init__.py +4 -5
- water_column_sonar_processing/aws/dynamodb_manager.py +149 -43
- water_column_sonar_processing/aws/s3_manager.py +71 -37
- water_column_sonar_processing/cruise/create_empty_zarr_store.py +6 -4
- water_column_sonar_processing/cruise/resample_regrid.py +3 -3
- water_column_sonar_processing/geometry/geometry_manager.py +21 -6
- water_column_sonar_processing/geometry/pmtile_generation.py +202 -13
- water_column_sonar_processing/index/index_manager.py +25 -13
- water_column_sonar_processing/model/zarr_manager.py +26 -25
- water_column_sonar_processing/process.py +4 -4
- water_column_sonar_processing/processing/__init__.py +4 -0
- water_column_sonar_processing/processing/cruise_sampler.py +342 -0
- water_column_sonar_processing/processing/raw_to_zarr.py +349 -0
- water_column_sonar_processing/utility/cleaner.py +1 -0
- water_column_sonar_processing/utility/constants.py +6 -2
- {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-0.0.8.dist-info}/METADATA +21 -10
- water_column_sonar_processing-0.0.8.dist-info/RECORD +32 -0
- {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-0.0.8.dist-info}/WHEEL +1 -1
- water_column_sonar_processing-0.0.6.dist-info/RECORD +0 -29
- {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-0.0.8.dist-info}/LICENSE +0 -0
- {water_column_sonar_processing-0.0.6.dist-info → water_column_sonar_processing-0.0.8.dist-info}/top_level.txt +0 -0
water_column_sonar_processing/geometry/pmtile_generation.py

@@ -1,14 +1,24 @@
+import glob
 import os
 from pathlib import Path
-
-# from shapely import wkt
-# import json
-# from shapely.geometry import shape, GeometryCollection
 import fiona
-import
+import s3fs
+import numpy as np
 import pandas as pd
+import xarray as xr
+import geopandas
+import geopandas as gpd
+import pyogrio
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from shapely.geometry import LineString
 
+from src.water_column_sonar_processing.aws import S3Manager, S3FSManager
+
+MAX_POOL_CONNECTIONS = 64
+MAX_CONCURRENCY = 64
+MAX_WORKERS = 64
+GB = 1024**3
+
 
 class PMTileGeneration(object):
     #######################################################
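For orientation: the new ThreadPoolExecutor/as_completed imports and the MAX_WORKERS-style constants support a thread-pool fan-out over cruises, used further down in this file. A minimal sketch of that pattern, with an illustrative worker function and cruise names (not part of the package):

from concurrent.futures import ThreadPoolExecutor, as_completed

MAX_WORKERS = 64

def process_cruise(cruise_name: str) -> str:
    # Stand-in worker; the real code opens one Zarr store per cruise.
    return cruise_name

completed = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(process_cruise, c) for c in ["HB0706", "HB0707"]]
    for future in as_completed(futures):
        result = future.result()  # re-raises any exception from the worker
        if result:
            completed.append(result)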
@@ -18,34 +28,35 @@ class PMTileGeneration(object):
         print("123")
 
     #######################################################
+    # This uses a local collection of file-level geojson files to create the data
     def generate_geojson_feature_collection(self):
         # This was used to read from noaa-wcsd-model-pds bucket geojson files and then to
         # generate the geopandas dataframe which could be exported to another comprehensive
         # geojson file. That
         result = list(Path("/Users/r2d2/Documents/echofish/geojson").rglob("*.json"))
         # result = result[:100]
-
+        jjj = 0
         pieces = []
-        for
-        file_name = os.path.normpath(result[
+        for jjj in range(len(result)):
+            file_name = os.path.normpath(result[jjj]).split(os.sep)[-1]
             file_stem = os.path.splitext(os.path.basename(file_name))[0]
-            geom =
+            geom = gpd.read_file(result[jjj]).iloc[0]["geometry"]
             # TDOO: Filter (0,0) coordinates
             if len(geom.coords.xy[0]) < 2:
                 continue
             geom = LineString(list(zip(geom.coords.xy[1], geom.coords.xy[0])))
             pieces.append(
                 {
-                    "ship_name": os.path.normpath(result[
-                    "cruise_name": os.path.normpath(result[
+                    "ship_name": os.path.normpath(result[jjj]).split(os.sep)[-4],
+                    "cruise_name": os.path.normpath(result[jjj]).split(os.sep)[-3],
                     "file_stem": file_stem,
-                    "file_path": result[
+                    "file_path": result[jjj],
                     "geom": geom,
                 }
             )
         df = pd.DataFrame(pieces)
         print(df)
-        gps_gdf =
+        gps_gdf = gpd.GeoDataFrame(
             data=df[
                 ["ship_name", "cruise_name", "file_stem"]
             ],  # try again with file_stem
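The rewritten loop above iterates with an explicit `jjj` index, pulls each geometry out of its per-file GeoJSON with `gpd.read_file`, and swaps the coordinate order into (lon, lat). The final step, promoting the plain DataFrame into a GeoDataFrame and exporting GeoJSON, works roughly like this (a minimal sketch; the file stem and coordinates are illustrative):

import geopandas as gpd
import pandas as pd
from shapely.geometry import LineString

pieces = [
    {
        "ship_name": "Henry_B._Bigelow",
        "cruise_name": "HB0706",
        "file_stem": "D20070711-T182032",  # illustrative stem
        "geom": LineString([(-72.1205, 39.6597), (-72.1208, 39.6602)]),
    }
]
df = pd.DataFrame(pieces)
gps_gdf = gpd.GeoDataFrame(
    data=df[["ship_name", "cruise_name", "file_stem"]],
    geometry=df["geom"],
    crs="EPSG:4326",  # WGS84 lon/lat, matching the rest of the module
)
gps_gdf.to_file("feature_collection.geojson", driver="GeoJSON")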
@@ -70,6 +81,184 @@ class PMTileGeneration(object):
     """
 
     #######################################################
+    # TODO: temporary using this to get info
+    def get_info_from_zarr_store(
+        self,
+        ship_name,
+        cruise_names,
+    ):
+        total_size = 0
+        s3_fs = s3fs.S3FileSystem(anon=True)
+        for cruise_name in cruise_names:
+            path_to_zarr_store = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
+            zarr_store = s3fs.S3Map(root=path_to_zarr_store, s3=s3_fs)
+            xr_store = xr.open_zarr(store=zarr_store, consolidated=None)
+            print(f'Cruise: {cruise_name}, shape: {xr_store.time.shape[0]}')
+            total_size = total_size + xr_store.time.shape[0]
+
+    def get_geospatial_info_from_zarr_store(
+        self,
+        ship_name,
+        cruise_name,
+    ):
+        """
+        Open Zarr store, create geometry, write to geojson, return name
+        """
+        s3_fs = s3fs.S3FileSystem(anon=True)
+        gps_gdf = geopandas.GeoDataFrame(
+            columns=["id", "ship", "cruise", "sensor", "geometry"],
+            geometry="geometry",
+            crs="EPSG:4326"
+        )
+        path_to_zarr_store = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
+        # file_name = os.path.normpath(path_to_zarr_store).split(os.sep)[-1]
+        # file_stem = os.path.splitext(os.path.basename(file_name))[0]
+        zarr_store = s3fs.S3Map(root=path_to_zarr_store, s3=s3_fs)
+        # ---Open Zarr Store--- #
+        # TODO: try-except to allow failures
+        print('opening store')
+        # xr_store = xr.open_zarr(store=zarr_store, consolidated=False)
+        xr_store = xr.open_zarr(store=zarr_store, consolidated=None)
+        print(xr_store.Sv.shape)
+        # ---Read Zarr Store Time/Latitude/Longitude--- #
+        latitude = xr_store.latitude.values
+        longitude = xr_store.longitude.values
+        if np.isnan(latitude).any() or np.isnan(longitude).any():
+            print(f'there was missing lat-lon data for {cruise_name}')
+            return None
+        # ---Add To GeoPandas Dataframe--- #
+        # TODO: experiment with tolerance "0.001"
+        geom = LineString(list(zip(longitude, latitude))).simplify(tolerance=0.001, preserve_topology=True)
+        gps_gdf.loc[0] = (0, "Henry_B._Bigelow", cruise_name, "EK60", geom)  # (ship, cruise, sensor, geometry)
+        gps_gdf.set_index('id', inplace=True)
+        gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON")  # , engine="pyogrio")
+        return cruise_name
 
+    #######################################################
+    def open_zarr_stores_with_thread_pool_executor(
+        self,
+        cruises: list,
+    ):
+        # 'cruises' is a list of cruises to process
+        completed_cruises = []
+        try:
+            with ThreadPoolExecutor(max_workers=32) as executor:
+                futures = [
+                    executor.submit(
+                        self.get_geospatial_info_from_zarr_store,
+                        "Henry_B._Bigelow",  # ship_name
+                        cruise,  # cruise_name
+                    )
+                    for cruise in cruises
+                ]
+                for future in as_completed(futures):
+                    result = future.result()
+                    if result:
+                        completed_cruises.extend([result])
+        except Exception as err:
+            print(err)
+        print("Done opening zarr stores using thread pool.")
+        return completed_cruises  # Took ~12 minutes
 
+    #######################################################
+    # https://docs.protomaps.com/pmtiles/create
+    def aggregate_geojson_into_dataframe(
+        self
+    ):
+        """
+        iterate through cruises, threadpoolexecute geojson creation, aggregate geojson files into df,
+        """
+        gps_gdf = geopandas.GeoDataFrame(
+            columns=["id", "ship", "cruise", "sensor", "geometry"],
+            geometry="geometry",
+            crs="EPSG:4326"
+        )
+
+        file_type = 'dataframe_*.geojson'
+        geojson_files = glob.glob(file_type)
+        for jjj in range(len(geojson_files)):
+            print(jjj)
+            geom = geopandas.read_file(geojson_files[jjj])
+            gps_gdf.loc[jjj] = (jjj, geom.ship[0], geom.cruise[0], geom.sensor[0], geom.geometry[0])
+            # gps_gdf.loc[0] = (0, "Henry_B._Bigelow", cruise_name, "EK60", geom)  # (ship, cruise, sensor, geometry)
+        print(gps_gdf)
+        gps_gdf.set_index('id', inplace=True)
+        gps_gdf.to_file(f"data.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
+        return list(gps_gdf.cruise)
+
+        # gps_gdf.loc[iii] = (iii, "Henry_B._Bigelow", cruise_name, "EK60", geom)  # (ship, cruise, sensor, geometry)
+        # print('writing to file')
+        # print(gps_gdf)
+        # gps_gdf.set_index('id', inplace=True)
+        # gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
+        # https://gdal.org/en/latest/drivers/vector/jsonfg.html
+        # gps_gdf.to_file(
+        #     f"data.geojson",
+        #     driver="GeoJSON",
+        #     engine="pyogrio",
+        #     layer_options={"ID_FIELD": "id"}
+        # )
+        # gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON", engine="pyogrio", id_generate=True)
+
+    # print(fiona.supported_drivers)  # {'DXF': 'rw', 'CSV': 'raw', 'OpenFileGDB': 'raw', 'ESRIJSON': 'r', 'ESRI Shapefile': 'raw', 'FlatGeobuf': 'raw', 'GeoJSON': 'raw', 'GeoJSONSeq': 'raw', 'GPKG': 'raw', 'GML': 'rw', 'OGR_GMT': 'rw', 'GPX': 'rw', 'MapInfo File': 'raw', 'DGN': 'raw', 'S57': 'r', 'SQLite': 'raw', 'TopoJSON': 'r'}
+    # gps_gdf.to_file('dataframe.shp', crs="EPSG:4326", engine="fiona")
+    # Convert geojson feature collection to pmtiles
+    # gps_gdf.to_file("dataframe.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona")
+    # print("done")
+    # ---Export Shapefile--- #
+
+
+
+    # gps_gdf.set_geometry(col='geometry', inplace=True)
+    # gps_gdf.__geo_interface__
+    # gps_gdf.set_index('id', inplace=True)
+    # gps_gdf.to_file(f"dataframe3.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona", index=True)
+
+    ### this gives the right layer id values
+    # gps_gdf.to_file(f"dataframe6.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
+    # jq '{"type": "FeatureCollection", "features": [.[] | .features[]]}' --slurp input*.geojson > output.geojson
+    # tippecanoe -zg --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises output.geojson
+    # tippecanoe -zg --convert-stringified-ids-to-numbers --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises dataframe*.geojson
+    # {
+    # "type": "FeatureCollection",
+    # "name": "dataframe5",
+    # "features": [
+    # { "type": "Feature", "id": 0, "properties": { "id": 0, "ship": "Henry_B._Bigelow", "cruise": "HB0706", "sensor": "EK60" }, "geometry": { "type": "LineString", "coordinates": [ [ -72.120498657226562, 39.659671783447266 ], [ -72.120773315429688, 39.660198211669922 ] ] } },
+    # { "type": "Feature", "id": 1, "properties": { "id": 1, "ship": "Henry_B._Bigelow", "cruise": "HB0707", "sensor": "EK60" }, "geometry": { "type": "LineString", "coordinates": [ [ -71.797836303710938, 41.003166198730469 ], [ -71.797996520996094, 41.002998352050781 ], [ -71.798583984375, 41.002994537353516 ] ] } },
+    # { "type": "Feature", "id": 2, "properties": { "id": 2, "ship": "Henry_B._Bigelow", "cruise": "HB0710", "sensor": "EK60" }, "geometry": { "type": "LineString", "coordinates": [ [ -72.489486694335938, 40.331901550292969 ], [ -72.490760803222656, 40.33099365234375 ] ] } }
+    # ]
+    # }
+    """
+    # https://docs.protomaps.com/pmtiles/create
+    # ogr2ogr -t_srs EPSG:4326 data.geojson dataframe.shp
+    # Only need to do the second one here...
+    tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises dataframe.geojson
+    tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
+    # used this to combine all the geojson files into single pmtile file (2024-12-03):
+    tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
+
+    TODO:
+    run each one of the cruises in a separate ospool workflow.
+    each process gets own store
+    """
     ###########################################################
+
+    # s3_manager = S3Manager()  # endpoint_url=endpoint_url)
+    # # s3fs_manager = S3FSManager()
+    # # input_bucket_name = "test_input_bucket"
+    # # s3_manager.create_bucket(bucket_name=input_bucket_name)
+    # ship_name = "Henry_B._Bigelow"
+    # cruise_name = "HB0706"
+    # sensor_name = "EK60"
+    #
+    # # ---Scan Bucket For All Zarr Stores--- #
+    # # https://noaa-wcsd-zarr-pds.s3.amazonaws.com/index.html#level_2/Henry_B._Bigelow/HB0706/EK60/HB0706.zarr/
+    # path_to_zarr_store = f"s3://noaa-wcsd-zarr-pds/level_2/Henry_B._Bigelow/HB0706/EK60/HB0706.zarr"
+    # s3 = s3fs.S3FileSystem()
+    # zarr_store = s3fs.S3Map(path_to_zarr_store, s3=s3)
+    # ds_zarr = xr.open_zarr(zarr_store, consolidated=None)
+    # print(ds_zarr.Sv.shape)
+
+
+
+    total = [246847, 89911, 169763, 658047, 887640, 708771, 187099, 3672813, 4095002, 763268, 162727, 189454, 1925270, 3575857, 1031920, 1167590, 3737415, 4099957, 3990725, 3619996, 3573052, 2973090, 55851, 143192, 1550164, 3692819, 668400, 489735, 393260, 1311234, 242989, 4515760, 1303091, 704663, 270645, 3886437, 4204381, 1062090, 428639, 541455, 4206506, 298561, 1279329, 137416, 139836, 228947, 517949]
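Taken together, the added methods implement a three-stage pipeline: each cruise-level Zarr store in the public noaa-wcsd-zarr-pds bucket is opened anonymously, its GPS track is simplified into a LineString and written to a per-cruise GeoJSON file, and tippecanoe then merges those files into one PMTiles archive. A condensed sketch of the per-cruise stage (error handling omitted; the ship and cruise names are examples taken from the diff):

import numpy as np
import s3fs
import xarray as xr
import geopandas
from shapely.geometry import LineString

ship_name, cruise_name = "Henry_B._Bigelow", "HB0706"
s3_fs = s3fs.S3FileSystem(anon=True)  # public bucket, no credentials needed
store = s3fs.S3Map(
    root=f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr",
    s3=s3_fs,
)
ds = xr.open_zarr(store=store, consolidated=None)

longitude = ds.longitude.values
latitude = ds.latitude.values
if not (np.isnan(latitude).any() or np.isnan(longitude).any()):
    # Simplify the ship track before writing; the 0.001-degree tolerance
    # matches the value being experimented with in the diff.
    geom = LineString(list(zip(longitude, latitude))).simplify(
        tolerance=0.001, preserve_topology=True
    )
    gdf = geopandas.GeoDataFrame(
        {"ship": [ship_name], "cruise": [cruise_name], "sensor": ["EK60"]},
        geometry=[geom],
        crs="EPSG:4326",
    )
    gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON")

open_zarr_stores_with_thread_pool_executor simply fans this function out across cruises with a thread pool; the comments record that a full run took roughly 12 minutes, after which the tippecanoe command noted above combines the per-cruise GeoJSON files into a single PMTiles layer.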
water_column_sonar_processing/index/index_manager.py

@@ -4,7 +4,7 @@ import pandas as pd
 from datetime import datetime
 from concurrent.futures import ThreadPoolExecutor
 from concurrent.futures import as_completed
-from water_column_sonar_processing.aws.s3_manager import S3Manager
+from src.water_column_sonar_processing.aws.s3_manager import S3Manager
 
 
 class IndexManager:
@@ -16,12 +16,10 @@ class IndexManager:
         self.s3_manager = S3Manager()
 
     #################################################################
-
     def list_ships(
         self,
         prefix="data/raw/",
     ):
-        # s3_client = self.s3_manager.s3_client
         page_iterator = self.s3_manager.paginator.paginate(
             Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
         )
@@ -79,6 +77,7 @@ class IndexManager:
             all_files.extend([i["Key"] for i in page["Contents"]])
         return [i for i in all_files if i.endswith(".raw")]
 
+    #################################################################
     def get_raw_files_csv(
         self,
         ship_name,
@@ -86,7 +85,9 @@ class IndexManager:
         sensor_name,
     ):
         raw_files = self.get_raw_files(
-            ship_name=ship_name,
+            ship_name=ship_name,
+            cruise_name=cruise_name,
+            sensor_name=sensor_name
         )
         files_list = [
             {
@@ -102,7 +103,10 @@ class IndexManager:
         print("done")
 
     #################################################################
-    def get_subset_ek60_prefix(
+    def get_subset_ek60_prefix(  # TODO: is this used?
+        self,
+        df: pd.DataFrame
+    ) -> pd.DataFrame:
         # Returns all objects with 'EK60' in prefix of file path
         # Note that this can include 'EK80' data that are false-positives
         # in dataframe with ['key', 'filename', 'ship', 'cruise', 'sensor', 'size', 'date', 'datagram']
@@ -119,13 +123,13 @@ class IndexManager:
                 2:5
             ]  # 'Okeanos_Explorer', 'EX1608', 'EK60'
             if (
-                re.search("[D](
-                and re.search("[T](
+                re.search("[D](\\d{8})", filename) is not None
+                and re.search("[T](\\d{6})", filename) is not None
             ):
                 # Parse date if possible e.g.: 'data/raw/Henry_B._Bigelow/HB1006/EK60/HBB-D20100723-T025105.raw'
                 # and 'data/raw/Henry_B._Bigelow/HB1802/EK60/D20180513-T150250.raw'
-                date_substring = re.search("[D](
-                time_substring = re.search("[T](
+                date_substring = re.search("[D](\\d{8})", filename).group(1)
+                time_substring = re.search("[T](\\d{6})", filename).group(1)
                 date_string = datetime.strptime(
                     f"{date_substring}{time_substring}", "%Y%m%d%H%M%S"
                 )
@@ -146,7 +150,10 @@ class IndexManager:
         return pd.DataFrame(objects)
 
     #################################################################
-    def scan_datagram(
+    def scan_datagram(
+        self,
+        select_key: str
+    ) -> list:
         # Reads the first 8 bytes of S3 file. Used to determine if ek60 or ek80
         # Note: uses boto3 session instead of boto3 client: https://github.com/boto/boto3/issues/801
         # select_key = 'data/raw/Albatross_Iv/AL0403/EK60/L0005-D20040302-T200108-EK60.raw'
@@ -162,7 +169,10 @@ class IndexManager:
         return first_datagram
 
     #################################################################
-    def get_subset_datagrams(
+    def get_subset_datagrams(
+        self,
+        df: pd.DataFrame
+    ) -> list:
         print("getting subset of datagrams")
         select_keys = list(
             df[["KEY", "CRUISE"]].drop_duplicates(subset="CRUISE")["KEY"].values
@@ -181,7 +191,9 @@ class IndexManager:
 
     #################################################################
     def get_ek60_objects(
-        self,
+        self,
+        df: pd.DataFrame,
+        subset_datagrams: list
     ) -> pd.DataFrame:
         # for each key write datagram value to all other files in same cruise
         for subset_datagram in subset_datagrams:
@@ -195,7 +207,7 @@ class IndexManager:
         return df.loc[df["DATAGRAM"] == "CON0"]
 
     #################################################################
-    def get_calibration_information(
+    def get_calibration_information(
         self,
     ) -> pd.DataFrame:
         # Calibration data generated by data manager currently located here:
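The regex change above re-escapes the `\\d` character classes used to pull the `D########-T######` timestamp out of a .raw filename. A minimal sketch of that parsing logic, using a filename taken from the comments in the diff:

import re
from datetime import datetime

filename = "HBB-D20100723-T025105.raw"  # example from the diff's comments
if (
    re.search("[D](\\d{8})", filename) is not None
    and re.search("[T](\\d{6})", filename) is not None
):
    date_substring = re.search("[D](\\d{8})", filename).group(1)  # "20100723"
    time_substring = re.search("[T](\\d{6})", filename).group(1)  # "025105"
    parsed = datetime.strptime(f"{date_substring}{time_substring}", "%Y%m%d%H%M%S")
    print(parsed)  # 2010-07-23 02:51:05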
water_column_sonar_processing/model/zarr_manager.py

@@ -1,14 +1,13 @@
 import os
-
 import numcodecs
 import numpy as np
 import xarray as xr
 import zarr
 from numcodecs import Blosc
 
-from water_column_sonar_processing.aws.s3fs_manager import S3FSManager
-from water_column_sonar_processing.utility.constants import Constants, Coordinates
-from water_column_sonar_processing.utility.timestamp import Timestamp
+from src.water_column_sonar_processing.aws.s3fs_manager import S3FSManager
+from src.water_column_sonar_processing.utility.constants import Constants, Coordinates
+from src.water_column_sonar_processing.utility.timestamp import Timestamp
 
 numcodecs.blosc.use_threads = False
 numcodecs.blosc.set_nthreads(1)
@@ -32,8 +31,8 @@ class ZarrManager:
         self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
 
     #######################################################
-    @staticmethod
     def get_depth_values(
+        self,
         min_echo_range: float = 1.0,  # minimum depth measured (zero non-inclusive) from whole cruise
         max_echo_range: float = 100.0,  # maximum depth measured from whole cruise
     ):
@@ -85,12 +84,11 @@ class ZarrManager:
             name=Coordinates.TIME.value,
             data=np.repeat(0.0, width),
             shape=width,
-            chunks=
-            Constants.TILE_SIZE.value,
-            ),  # TODO: the chunking scheme doesn't seem to be working here
+            chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
+            # Constants.TILE_SIZE.value,
+            # ),  # TODO: the chunking scheme doesn't seem to be working here
             dtype=np.dtype(Coordinates.TIME_DTYPE.value),
             compressor=self.__compressor,
-            # fill_value=0.,
             fill_value=np.nan,  # TODO: do i want nan's?
             overwrite=self.__overwrite,
         )
@@ -113,12 +111,12 @@ class ZarrManager:
             # TODO: verify that these values are correct
             data=depth_values,
             shape=len(depth_values),
-            chunks=Constants.
+            chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
             dtype=np.dtype(
                 Coordinates.DEPTH_DTYPE.value
             ),  # float16 == 2 significant digits would be ideal
             compressor=self.__compressor,
-
+            fill_value=np.nan,
             overwrite=self.__overwrite,
         )
         # TODO: change to exception
@@ -133,15 +131,16 @@ class ZarrManager:
         # --- Coordinate: Latitude --- #
         root.create_dataset(
             name=Coordinates.LATITUDE.value,
-            data=np.repeat(0.0, width),
+            # data=np.repeat(0.0, width),
             shape=width,
-            chunks=Constants.
+            chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
             dtype=np.dtype(Coordinates.LATITUDE_DTYPE.value),
             compressor=self.__compressor,
-            fill_value=
+            fill_value=np.nan,
             overwrite=self.__overwrite,
         )
 
+        # Note: LATITUDE is indexed by TIME
         root.latitude.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
 
         root.latitude.attrs["long_name"] = Coordinates.LATITUDE_LONG_NAME.value
@@ -151,15 +150,16 @@ class ZarrManager:
         # --- Coordinate: Longitude --- #
         root.create_dataset(
             name=Coordinates.LONGITUDE.value,
-            data=np.repeat(0.0, width),  # root.longitude[:] = np.nan
+            # data=np.repeat(0.0, width),  # root.longitude[:] = np.nan
             shape=width,
-            chunks=Constants.
+            chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
             dtype=np.dtype(Coordinates.LONGITUDE_DTYPE.value),
             compressor=self.__compressor,
-            fill_value=
+            fill_value=np.nan,
             overwrite=self.__overwrite,
         )
 
+        # Note: LONGITUDE is indexed by TIME
         root.longitude.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
 
         root.longitude.attrs["long_name"] = Coordinates.LONGITUDE_LONG_NAME.value
@@ -170,19 +170,20 @@ class ZarrManager:
         # --- Coordinate: Bottom --- #
         root.create_dataset(
             name=Coordinates.BOTTOM.value,
-
+            data=np.repeat(0.0, width),  # root.longitude[:] = np.nan
             shape=width,
-            chunks=Constants.
+            chunks=Constants.SPATIOTEMPORAL_CHUNK_SIZE.value,
             dtype=np.dtype(Coordinates.BOTTOM_DTYPE.value),
             compressor=self.__compressor,
-            fill_value=
+            fill_value=0.0,
             overwrite=self.__overwrite,
         )
 
-
+        # BOTTOM is indexed by TIME
+        root.bottom.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.TIME.value]
 
-        root.
-        root.
+        root.bottom.attrs["long_name"] = Coordinates.BOTTOM_LONG_NAME.value
+        root.bottom.attrs["units"] = Coordinates.BOTTOM_UNITS.value
 
         #####################################################################
         # --- Coordinate: Frequency --- #
@@ -190,7 +191,7 @@ class ZarrManager:
             name=Coordinates.FREQUENCY.value,
             data=frequencies,
             shape=len(frequencies),
-            chunks=
+            chunks=len(frequencies),
             dtype=np.dtype(Coordinates.FREQUENCY_DTYPE.value),
             compressor=self.__compressor,
             fill_value=0.0,
@@ -213,7 +214,7 @@ class ZarrManager:
         root.create_dataset(
             name=Coordinates.SV.value,
             shape=(len(depth_values), width, len(frequencies)),
-            chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value,
+            chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, len(frequencies)),
             dtype=np.dtype(
                 Coordinates.SV_DTYPE.value
            ),  # TODO: try to experiment with 'float16'
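The recurring change in this file standardizes the one-dimensional time, depth, latitude, longitude, and bottom arrays on `Constants.SPATIOTEMPORAL_CHUNK_SIZE.value` and switches the geospatial coordinates to a NaN fill value, while the 3-D Sv array keeps tile-shaped chunks with the frequency dimension unchunked. A minimal sketch of what one such `create_dataset` call does, using the zarr v2-style API seen in the diff; the constant values and compressor settings here are illustrative (the real ones live in `utility/constants.py` and the manager's private compressor):

import numpy as np
import zarr
from numcodecs import Blosc

SPATIOTEMPORAL_CHUNK_SIZE = 2048  # illustrative; actual value is in Constants
width = 10_000                    # number of pings in the cruise
compressor = Blosc(cname="zstd", clevel=5, shuffle=Blosc.BITSHUFFLE)  # assumed settings

root = zarr.open_group("example.zarr", mode="w")
root.create_dataset(
    name="latitude",
    shape=width,
    chunks=SPATIOTEMPORAL_CHUNK_SIZE,
    dtype=np.dtype("float32"),
    compressor=compressor,
    fill_value=np.nan,  # unwritten chunks now read back as NaN, not 0.0
    overwrite=True,
)
# xarray reads this attribute to index latitude by the time dimension.
root.latitude.attrs["_ARRAY_DIMENSIONS"] = ["time"]

Chunking the 1-D coordinates coarsely keeps the object count down on S3, and a NaN fill distinguishes "never written" from a legitimate 0.0 latitude or longitude.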
water_column_sonar_processing/process.py

@@ -3,10 +3,10 @@ import os
 
 import numpy as np
 
-from water_column_sonar_processing.aws.dynamodb_manager import DynamoDBManager
-from water_column_sonar_processing.aws.s3_manager import S3Manager
-from water_column_sonar_processing.aws.s3fs_manager import S3FSManager
-from water_column_sonar_processing.aws.sns_manager import SNSManager
+from src.water_column_sonar_processing.aws.dynamodb_manager import DynamoDBManager
+from src.water_column_sonar_processing.aws.s3_manager import S3Manager
+from src.water_column_sonar_processing.aws.s3fs_manager import S3FSManager
+from src.water_column_sonar_processing.aws.sns_manager import SNSManager
 
 
 ###########################################################