water-column-sonar-processing 0.0.13__py3-none-any.whl → 24.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of water-column-sonar-processing might be problematic.
- water_column_sonar_processing/aws/s3fs_manager.py +1 -0
- water_column_sonar_processing/cruise/create_empty_zarr_store.py +4 -5
- water_column_sonar_processing/cruise/datatree_manager.py +24 -0
- water_column_sonar_processing/cruise/resample_regrid.py +15 -20
- water_column_sonar_processing/geometry/__init__.py +2 -1
- water_column_sonar_processing/geometry/elevation_manager.py +112 -0
- water_column_sonar_processing/index/index_manager.py +92 -7
- water_column_sonar_processing/model/zarr_manager.py +14 -9
- water_column_sonar_processing/processing/__init__.py +3 -2
- water_column_sonar_processing/processing/batch_downloader.py +132 -0
- water_column_sonar_processing/processing/raw_to_zarr.py +0 -2
- water_column_sonar_processing/utility/constants.py +3 -2
- {water_column_sonar_processing-0.0.13.dist-info → water_column_sonar_processing-24.1.1.dist-info}/METADATA +24 -21
- {water_column_sonar_processing-0.0.13.dist-info → water_column_sonar_processing-24.1.1.dist-info}/RECORD +17 -16
- {water_column_sonar_processing-0.0.13.dist-info → water_column_sonar_processing-24.1.1.dist-info}/WHEEL +1 -1
- water_column_sonar_processing/cruise/experiment_datatree.py +0 -13
- water_column_sonar_processing/processing/cruise_sampler.py +0 -342
- {water_column_sonar_processing-0.0.13.dist-info → water_column_sonar_processing-24.1.1.dist-info}/LICENSE +0 -0
- {water_column_sonar_processing-0.0.13.dist-info → water_column_sonar_processing-24.1.1.dist-info}/top_level.txt +0 -0
water_column_sonar_processing/aws/s3fs_manager.py

@@ -16,6 +16,7 @@ class S3FSManager:
         # self.output_bucket_name = os.environ.get("OUTPUT_BUCKET_NAME")
         self.s3_region = os.environ.get("AWS_REGION", default="us-east-1")
         self.s3fs = s3fs.S3FileSystem(
+            asynchronous=False,
             endpoint_url=endpoint_url,
             key=os.environ.get("OUTPUT_BUCKET_ACCESS_KEY"),
             secret=os.environ.get("OUTPUT_BUCKET_SECRET_ACCESS_KEY"),
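The only functional change here is the new `asynchronous=False` argument. A minimal sketch of what that flag means, reusing the same environment variable names the class reads (the final listing call is illustrative only):

```python
import os

import s3fs

# Sketch: asynchronous=False keeps the filesystem in blocking mode, which is what
# synchronous fsspec/zarr callers expect; async callers would pass True and drive
# the event loop themselves.
fs = s3fs.S3FileSystem(
    asynchronous=False,
    key=os.environ.get("OUTPUT_BUCKET_ACCESS_KEY"),
    secret=os.environ.get("OUTPUT_BUCKET_SECRET_ACCESS_KEY"),
)
# fs.ls("noaa-wcsd-zarr-pds/level_2/")  # example read; needs credentials or anon=True
```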
water_column_sonar_processing/cruise/create_empty_zarr_store.py

@@ -1,4 +1,5 @@
 import os
+import tempfile
 
 import numcodecs
 import numpy as np
@@ -11,7 +12,6 @@ from water_column_sonar_processing.utility import Cleaner
 numcodecs.blosc.use_threads = False
 numcodecs.blosc.set_nthreads(1)
 
-# TEMPDIR = "/tmp"
 # TODO: when ready switch to version 3 of model spec
 # ZARR_V3_EXPERIMENTAL_API = 1
 # creates the latlon data: foo = ep.consolidate.add_location(ds_Sv, echodata)
@@ -61,7 +61,6 @@ class CreateEmptyZarrStore:
     # TODO: move to common place
 
     #######################################################
-    # @classmethod
     def create_cruise_level_zarr_store(
         self,
         output_bucket_name: str,
@@ -69,8 +68,8 @@ class CreateEmptyZarrStore:
         cruise_name: str,
         sensor_name: str,
         table_name: str,
-        tempdir: str,
     ) -> None:
+        tempdir = tempfile.TemporaryDirectory()
        try:
             # HB0806 - 123, HB0903 - 220
             dynamo_db_manager = DynamoDBManager()
@@ -146,7 +145,7 @@ class CreateEmptyZarrStore:
             print(f"new_height: {new_height}")
 
             zarr_manager.create_zarr_store(
-                path=tempdir,
+                path=tempdir.name,  # TODO: need to use .name or problem
                 ship_name=ship_name,
                 cruise_name=cruise_name,
                 sensor_name=sensor_name,
@@ -159,7 +158,7 @@ class CreateEmptyZarrStore:
             #################################################################
             self.upload_zarr_store_to_s3(
                 output_bucket_name=output_bucket_name,
-                local_directory=tempdir,
+                local_directory=tempdir.name,  # TODO: need to use .name or problem
                 object_prefix=zarr_prefix,
                 cruise_name=cruise_name,
             )
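A short standalone sketch of the tempdir change above: `tempfile.TemporaryDirectory()` returns an object, not a path string, so downstream functions must be given `.name`, which is what the two `tempdir.name` replacements do.

```python
import tempfile

# TemporaryDirectory() is an object wrapper around a freshly created directory.
tempdir = tempfile.TemporaryDirectory()
print(type(tempdir))   # <class 'tempfile.TemporaryDirectory'>
print(tempdir.name)    # the actual filesystem path to hand to path-based APIs
tempdir.cleanup()      # removes the directory (also happens automatically on GC)
```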
water_column_sonar_processing/cruise/datatree_manager.py (new file)

@@ -0,0 +1,24 @@
+### https://xarray-datatree.readthedocs.io/en/latest/data-structures.html
+import numpy as np
+from datatree import DataTree
+import xarray as xr
+
+class DatatreeManager:
+    #######################################################
+    def __init__(
+        self,
+    ):
+        self.dtype = "float32"
+
+    #################################################################
+    def create_datatree(
+        self,
+        input_ds,
+    ) -> None:
+        ds1 = xr.Dataset({"foo": "orange"})
+        dt = DataTree(name="root", data=ds1)  # create root node
+        ds2 = xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})
+        return dt
+
+
+
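A sketch (not part of the package) of how the two datasets built in `create_datatree` could live in one tree, following the xarray-datatree data-structures page linked at the top of the new module; the `"child"` group name is illustrative and exact constructor helpers may vary by datatree version.

```python
import xarray as xr
from datatree import DataTree  # xarray-datatree, as imported by the new module

# DataTree.from_dict builds a root node plus named child groups
# from a {path: Dataset} mapping.
ds1 = xr.Dataset({"foo": "orange"})
ds2 = xr.Dataset({"bar": 0}, coords={"y": ("y", [0, 1, 2])})
tree = DataTree.from_dict({"/": ds1, "/child": ds2}, name="root")
print(tree)              # root holds ds1, the "child" group holds ds2
print(tree["child"].ds)  # access the child node's dataset
```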
water_column_sonar_processing/cruise/resample_regrid.py

@@ -281,12 +281,7 @@ class ResampleRegrid:
             print(f"start_ping_time_index: {start_ping_time_index}, end_ping_time_index: {end_ping_time_index}")
             #########################################################################
             # write Sv values to cruise-level-model-store
-            for channel in range(
-                len(input_xr.channel.values)
-            ):  # does not like being written in one fell swoop :(
-                output_zarr_store.Sv[
-                    :, start_ping_time_index:end_ping_time_index, channel
-                ] = regrid_resample[:, :, channel]
+            output_zarr_store.Sv[:, start_ping_time_index:end_ping_time_index, :] = regrid_resample.values
 
             #########################################################################
             # [5] write subset of latitude/longitude
@@ -300,27 +295,27 @@
             #########################################################################
             # TODO: add the "detected_seafloor_depth/" to the
             # L2 cruise dataarrays
-            # TODO: make bottom optional
+            # TODO: make bottom optional
             # TODO: Only checking the first channel for now. Need to average across all channels
             # in the future. See https://github.com/CI-CMG/water-column-sonar-processing/issues/11
-
-
-
-
-
-
-
-
-
-
-
+            if 'detected_seafloor_depth' in input_xr.variables:
+                print('Found detected_seafloor_depth, adding data to output store.')
+                detected_seafloor_depth = input_xr.detected_seafloor_depth.values
+                detected_seafloor_depth[detected_seafloor_depth == 0.] = np.nan
+                # TODO: problem here: Processing file: D20070711-T210709.
+                detected_seafloor_depths = np.nanmean(detected_seafloor_depth, 0)  # RuntimeWarning: Mean of empty slice
+                detected_seafloor_depths[detected_seafloor_depths == 0.] = np.nan
+                print(f"min depth measured: {np.nanmin(detected_seafloor_depths)}")
+                print(f"max depth measured: {np.nanmax(detected_seafloor_depths)}")
+                # available_indices = np.argwhere(np.isnan(geospatial['latitude'].values))
+                output_zarr_store.bottom[
+                    start_ping_time_index:end_ping_time_index
+                ] = detected_seafloor_depths
             #########################################################################
             #########################################################################
         except Exception as err:
             print(f"Problem interpolating the data: {err}")
             raise err
-        # else:
-        #     pass
         finally:
             print("Done interpolating data.")
             # TODO: read across times and verify data was written?
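The first hunk above replaces a per-channel loop with one slice assignment covering every channel. A standalone sketch of that write pattern against a plain Zarr array (shapes and names here are illustrative, not the cruise store):

```python
import numpy as np
import zarr

# A (depth, ping_time, channel) array chunked along depth/time.
store = zarr.zeros((100, 1000, 4), chunks=(100, 100, 1), dtype="float32")

regrid_resample = np.random.rand(100, 50, 4).astype("float32")
start, end = 200, 250

# One assignment writes the whole block for all channels at once,
# instead of looping channel by channel.
store[:, start:end, :] = regrid_resample
```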
water_column_sonar_processing/geometry/__init__.py

@@ -1,5 +1,6 @@
+from .elevation_manager import ElevationManager
 from .geometry_manager import GeometryManager
 from .geometry_simplification import GeometrySimplification
 from .pmtile_generation import PMTileGeneration
 
-__all__ = ["GeometryManager", "GeometrySimplification", "PMTileGeneration"]
+__all__ = ["ElevationManager", "GeometryManager", "GeometrySimplification", "PMTileGeneration"]
water_column_sonar_processing/geometry/elevation_manager.py (new file)

@@ -0,0 +1,112 @@
+"""
+https://gis.ngdc.noaa.gov/arcgis/rest/services/DEM_mosaics/DEM_global_mosaic/ImageServer/identify?geometry=-31.70235%2C13.03332&geometryType=esriGeometryPoint&returnGeometry=false&returnCatalogItems=false&f=json
+
+https://gis.ngdc.noaa.gov/arcgis/rest/services/DEM_mosaics/DEM_global_mosaic/ImageServer/
+identify?
+geometry=-31.70235%2C13.03332
+&geometryType=esriGeometryPoint
+&returnGeometry=false
+&returnCatalogItems=false
+&f=json
+{"objectId":0,"name":"Pixel","value":"-5733","location":{"x":-31.702349999999999,"y":13.03332,"spatialReference":{"wkid":4326,"latestWkid":4326}},"properties":null,"catalogItems":null,"catalogItemVisibilities":[]}
+-5733
+
+(base) rudy:deleteME rudy$ curl https://api.opentopodata.org/v1/gebco2020?locations=13.03332,-31.70235
+{
+  "results": [
+    {
+      "dataset": "gebco2020",
+      "elevation": -5729.0,
+      "location": {
+        "lat": 13.03332,
+        "lng": -31.70235
+      }
+    }
+  ],
+  "status": "OK"
+}
+"""
+import json
+import time
+
+import requests
+from collections.abc import Generator
+
+def chunked(
+    ll: list,
+    n: int
+) -> Generator:
+    # Yields successively n-sized chunks from ll.
+    for i in range(0, len(ll), n):
+        yield ll[i : i + n]
+
+
+class ElevationManager:
+    #######################################################
+    def __init__(
+        self,
+    ):
+        self.DECIMAL_PRECISION = 5  # precision for GPS coordinates
+        self.TIMOUT_SECONDS = 10
+
+    #######################################################
+    def get_arcgis_elevation(
+        self,
+        lngs: list,
+        lats: list,
+        chunk_size: int = 500,  # I think this is the api limit
+    ) -> int:
+        # Reference: https://developers.arcgis.com/rest/services-reference/enterprise/map-to-image/
+        # Info: https://www.arcgis.com/home/item.html?id=c876e3c96a8642ab8557646a3b4fa0ff
+        ### 'https://gis.ngdc.noaa.gov/arcgis/rest/services/DEM_mosaics/DEM_global_mosaic/ImageServer/identify?geometry={"points":[[-31.70235,13.03332],[-32.70235,14.03332]]}&geometryType=esriGeometryMultipoint&returnGeometry=false&returnCatalogItems=false&f=json'
+        if len(lngs) != len(lats):
+            raise ValueError("lngs and lats must have same length")
+
+        geometryType = "esriGeometryMultipoint"  # TODO: allow single point?
+
+        depths = []
+
+        list_of_points = [list(elem) for elem in list(zip(lngs, lats))]
+        for chunk in chunked(list_of_points, chunk_size):
+            time.sleep(0.1)
+            # order: (lng, lat)
+            geometry = f'{{"points":{str(chunk)}}}'
+            url = f'https://gis.ngdc.noaa.gov/arcgis/rest/services/DEM_mosaics/DEM_global_mosaic/ImageServer/identify?geometry={geometry}&geometryType={geometryType}&returnGeometry=false&returnCatalogItems=false&f=json'
+            result = requests.get(url, timeout=self.TIMOUT_SECONDS)
+            res = json.loads(result.content.decode('utf8'))
+            if 'results' in res:
+                for element in res['results']:
+                    depths.append(float(element['value']))
+            elif 'value' in res:
+                depths.append(float(res['value']))
+
+        return depths
+
+    # def get_gebco_bathymetry_elevation(self) -> int:
+    #     # Documentation: https://www.opentopodata.org/datasets/gebco2020/
+    #     latitude = 13.03332
+    #     longitude = -31.70235
+    #     dataset = "gebco2020"
+    #     url = f"https://api.opentopodata.org/v1/{dataset}?locations={latitude},{longitude}"
+    #     pass
+
+    # def get_elevation(
+    #     self,
+    #     df,
+    #     lat_column,
+    #     lon_column,
+    # ) -> int:
+    #     """Query service using lat, lon. add the elevation values as a new column."""
+    #     url = r'https://epqs.nationalmap.gov/v1/json?'
+    #     elevations = []
+    #     for lat, lon in zip(df[lat_column], df[lon_column]):
+    #         # define rest query params
+    #         params = {
+    #             'output': 'json',
+    #             'x': lon,
+    #             'y': lat,
+    #             'units': 'Meters'
+    #         }
+    #         result = requests.get((url + urllib.parse.urlencode(params)))
+    #         elevations.append(result.json()['value'])
+    #     return elevations
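Hypothetical usage of the new `ElevationManager` (coordinates taken from the module docstring above; requires network access to the NOAA NGDC ArcGIS service). Each returned value is a DEM elevation in meters, negative below sea level.

```python
from water_column_sonar_processing.geometry import ElevationManager

elevation_manager = ElevationManager()
depths = elevation_manager.get_arcgis_elevation(
    lngs=[-31.70235, -32.70235],
    lats=[13.03332, 14.03332],
)
print(depths)  # e.g. [-5733.0, ...] per the sample response in the docstring
```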
water_column_sonar_processing/index/index_manager.py

@@ -7,13 +7,20 @@ from concurrent.futures import as_completed
 from water_column_sonar_processing.aws import S3Manager
 
 
+MAX_POOL_CONNECTIONS = 64
+MAX_CONCURRENCY = 64
+MAX_WORKERS = 64
+GB = 1024**3
+
+
 class IndexManager:
+    # TODO: index into dynamodb instead of csv files
 
     def __init__(self, input_bucket_name, calibration_bucket, calibration_key):
         self.input_bucket_name = input_bucket_name
         self.calibration_bucket = calibration_bucket
         self.calibration_key = calibration_key
-        self.s3_manager = S3Manager()
+        self.s3_manager = S3Manager()  # TODO: make anonymous?
 
     #################################################################
     def list_ships(
@@ -50,6 +57,9 @@ class IndexManager:
         self,
         cruise_prefixes,
     ):
+        """
+        This returns a list of ek60 prefixed cruises.
+        """
         cruise_sensors = []  # includes all sensor types
         for cruise_prefix in cruise_prefixes:
             page_iterator = self.s3_manager.paginator.paginate(
@@ -67,9 +77,12 @@ class IndexManager:
         cruise_name,
         sensor_name,
     ):
+        # Gets all raw files for a cruise under the given prefix
         prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
         page_iterator = self.s3_manager.paginator.paginate(
-            Bucket=self.input_bucket_name,
+            Bucket=self.input_bucket_name,
+            Prefix=prefix,
+            Delimiter="/"
         )
         all_files = []
         for page in page_iterator:
@@ -77,6 +90,57 @@ class IndexManager:
                 all_files.extend([i["Key"] for i in page["Contents"]])
         return [i for i in all_files if i.endswith(".raw")]
 
+    def get_first_raw_file(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+    ):
+        # Same as above but only needs to get the first raw file
+        # because we are only interested in the first datagram of one file
+        prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
+        # page_iterator = self.s3_manager.paginator.paginate(
+        #     Bucket=self.input_bucket_name,
+        #     Prefix=prefix,
+        #     Delimiter="/",
+        #     PaginationConfig={ 'MaxItems': 5 }
+        # )  # TODO: this can create a problem if there is a non raw file returned first
+        ### filter with JMESPath expressions ###
+        page_iterator = self.s3_manager.paginator.paginate(
+            Bucket=self.input_bucket_name,
+            Prefix=prefix,
+            Delimiter="/",
+        )
+        # page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
+        page_iterator = page_iterator.search(expression="Contents[?contains(Key, '.raw')] ")
+        for res in page_iterator:
+            if "Key" in res:
+                return res["Key"]
+        # else raise exception?
+
+    # DSJ0604-D20060406-T050022.bot 2kB == 2152 'Size'
+
+    def get_files_under_size(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+    ):
+        # THIS isn't used, just playing with JMES paths spec
+        prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"
+        ### filter with JMESPath expressions ###
+        page_iterator = self.s3_manager.paginator.paginate(
+            Bucket=self.input_bucket_name,
+            Prefix=prefix,
+            Delimiter="/",
+        )
+        page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
+        all_files = []
+        for page in page_iterator:
+            if "Contents" in page.keys():
+                all_files.extend([i["Key"] for i in page["Contents"]])
+        return [i for i in all_files if i.endswith(".raw")]
+
     #################################################################
     def get_raw_files_csv(
         self,
@@ -102,6 +166,29 @@ class IndexManager:
         df.to_csv(f"{ship_name}_{cruise_name}.csv", index=False, header=False, sep=" ")
         print("done")
 
+    def get_raw_files_list(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+    ):
+        # gets all raw files in cruise and returns a list of dicts
+        raw_files = self.get_raw_files(
+            ship_name=ship_name,
+            cruise_name=cruise_name,
+            sensor_name=sensor_name
+        )
+        files_list = [
+            {
+                "ship_name": ship_name,
+                "cruise_name": cruise_name,
+                "sensor_name": sensor_name,
+                "file_name": os.path.basename(raw_file),
+            }
+            for raw_file in raw_files
+        ]
+        return files_list
+
     #################################################################
     def get_subset_ek60_prefix(  # TODO: is this used?
         self,
@@ -169,16 +256,14 @@ class IndexManager:
         return first_datagram
 
     #################################################################
-    def get_subset_datagrams(
+    def get_subset_datagrams(  # TODO: is this getting used
         self,
         df: pd.DataFrame
     ) -> list:
         print("getting subset of datagrams")
-        select_keys = (
-            df[["KEY", "CRUISE"]].drop_duplicates(subset="CRUISE")["KEY"].values
-        )
+        select_keys = df[["KEY", "CRUISE"]].drop_duplicates(subset="CRUISE")["KEY"].values.tolist()
         all_datagrams = []
-        with ThreadPoolExecutor(max_workers=
+        with ThreadPoolExecutor(max_workers=MAX_POOL_CONNECTIONS) as executor:
             futures = [
                 executor.submit(self.scan_datagram, select_key)
                 for select_key in select_keys
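The new `get_first_raw_file` relies on boto3's paginator `.search()`, which applies a JMESPath expression to each page. A standalone sketch of that pattern against the public NODD bucket (bucket and prefix values are illustrative; unsigned access is assumed):

```python
import boto3
from botocore import UNSIGNED
from botocore.config import Config

# Anonymous client for a public bucket; credentialed clients work the same way.
s3_client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
paginator = s3_client.get_paginator("list_objects_v2")
pages = paginator.paginate(
    Bucket="noaa-wcsd-pds",
    Prefix="data/raw/Henry_B._Bigelow/HB0707/EK60/",
    Delimiter="/",
)
# JMESPath filter: only objects whose key contains ".raw"
for obj in pages.search("Contents[?contains(Key, '.raw')]"):
    print(obj["Key"])
    break  # only the first .raw key is needed
```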
water_column_sonar_processing/model/zarr_manager.py

@@ -1,4 +1,3 @@
-import os
 import numcodecs
 import numpy as np
 import xarray as xr
@@ -48,6 +47,9 @@ class ZarrManager:
             endpoint=True,
         )
 
+        if np.any(np.isnan(all_cruise_depth_values)):
+            raise Exception('Problem depth values returned were NaN.')
+
         print("Done getting depth values.")
         return all_cruise_depth_values.round(decimals=2)
 
@@ -67,10 +69,10 @@ class ZarrManager:
         print(
             f"Creating local zarr_manager store at {cruise_name}.zarr for ship {ship_name}"
         )
-
-        #
-
-
+        # There can not currently be repeated frequencies
+        # TODO: eventually switch coordinate to "channel" because frequencies can repeat
+        if len(frequencies) != len(set(frequencies)):
+            raise Exception("Number of frequencies does not match number of channels")
 
         print(f"Debugging number of threads: {self.__num_threads}")
 
@@ -118,8 +120,9 @@ class ZarrManager:
             fill_value=np.nan,
             overwrite=self.__overwrite,
         )
-
-
+
+        if np.any(np.isnan(depth_values)):
+            raise Exception('Some depth values returned were NaN.')
 
         root.depth.attrs["_ARRAY_DIMENSIONS"] = [Coordinates.DEPTH.value]
 
@@ -219,7 +222,8 @@ class ZarrManager:
         root.create_dataset(
             name=Coordinates.SV.value,
             shape=(len(depth_values), width, len(frequencies)),
-            chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, len(frequencies)),
+            # chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, len(frequencies)),
+            chunks=(Constants.TILE_SIZE.value, Constants.TILE_SIZE.value, 1),  # 256x256x1 <- speed up for alex
             dtype=np.dtype(
                 Coordinates.SV_DTYPE.value
             ),  # TODO: try to experiment with 'float16'
@@ -246,11 +250,12 @@ class ZarrManager:
         #
         root.attrs["processing_software_name"] = Coordinates.PROJECT_NAME.value
         root.attrs["processing_software_version"] = (
-            "
+            "24.01.01"  # TODO: get programmatically, echopype>utils>prov.py
         )
         root.attrs["processing_software_time"] = Timestamp.get_timestamp()
         #
         root.attrs["calibration_status"] = calibration_status
+        root.attrs["tile_size"] = Constants.TILE_SIZE.value
 
         zarr.consolidate_metadata(store)
         #####################################################################
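The chunking change above stores each frequency channel in its own chunks, so reading a single-channel echogram tile touches far fewer bytes than with chunks spanning all channels. A small sketch with plain Zarr (array sizes are illustrative, not a real cruise):

```python
import numpy as np
import zarr

TILE_SIZE = 256  # mirrors Constants.TILE_SIZE.value

# (depth, ping_time, frequency) array chunked as 256 x 256 x 1.
sv = zarr.full(
    shape=(4096, 100_000, 4),
    fill_value=np.nan,
    chunks=(TILE_SIZE, TILE_SIZE, 1),
    dtype=np.float32,
)

# One 256x256 tile of one channel maps to exactly one chunk on disk.
tile = sv[0:256, 0:256, 2]
print(tile.shape)  # (256, 256)
```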
water_column_sonar_processing/processing/batch_downloader.py (new file)

@@ -0,0 +1,132 @@
+import xarray as xr
+import numpy as np
+import pandas as pd
+import xbatcher
+from typing import Optional
+# s3fs.core.setup_logging("DEBUG")
+
+class BatchDownloader:
+    """
+    Uses the xbatcher XbatchDownloader to download data from an xarray dataset. Connection
+    is established
+    """
+
+    def __init__(
+        self,
+        bucket_name: Optional[str] = "noaa-wcsd-zarr-pds",
+        ship_name: Optional[str] = "Henry_B._Bigelow",
+        cruise_name: Optional[str] = "HB0707",
+        sensor_name: Optional[str] = "EK60",
+        patch_dims: Optional[int] = 64,  # TODO: change to 64
+        # input_steps: Optional[int] = 3,
+    ):
+        self.bucket_name = bucket_name
+        self.ship_name = ship_name
+        self.cruise_name = cruise_name
+        self.sensor_name = sensor_name
+        self.patch_dims = patch_dims
+
+    # TODO: move this to the s3fs module
+    def get_s3_zarr_store(self) -> xr.Dataset:
+        """ Returns an Xarray Dataset """
+        s3_zarr_store_path = f"{self.bucket_name}/level_2/{self.ship_name}/{self.cruise_name}/{self.sensor_name}/{self.cruise_name}.zarr"
+        # Info about the HB0707 cruise:
+        # Time: ["2007-07-11T18:20:33.657573888", "2007-07-11T18:20:53.657573888", "2007-07-13T00:55:17.454448896"]
+        # Frequency: [ 18000.  38000. 120000. 200000.]
+        # Depth: [0.19, 999.74]
+
+        # Needed to override credentials for github actions
+        # s3_file_system = s3fs.S3FileSystem(anon=True)
+        # store = s3fs.S3Map(root=s3_zarr_store_path, s3=s3_file_system, check=False)
+
+        # return xr.open_zarr(store=f"s3://{s3_zarr_store_path}", consolidated=True, storage_options={'anon': True})
+        return xr.open_dataset(f"s3://{s3_zarr_store_path}", engine="zarr", storage_options={'anon': True})
+        # return xr.open_zarr(store, consolidated=True)
+
+    def get_toy_batch_generator(self) -> xbatcher.BatchGenerator:
+        """
+        Returns a BatchGenerator with subsets of Sv data
+        Note: this is synthetic data, for a smaller toy example
+        """
+        depth = np.arange(1, 21)  # N meters
+        time = pd.date_range(start="2025-01-01", end="2025-01-31", freq='D')  # N days
+        frequency = [1_000, 2_000, 3_000]  # N frequencies
+        Sv = np.random.rand(len(depth), len(time), len(frequency))  # synthetic data
+        cruise = xr.Dataset(
+            data_vars={
+                "Sv": (["depth", "time", "frequency"], Sv)
+            },
+            coords={
+                "depth": depth,
+                "time": time,
+                "frequency": frequency,
+            },
+            attrs=dict(description="Toy Example"),
+        )
+        batch_generator = xbatcher.BatchGenerator(
+            ds=cruise,
+            # get samples that are shaped 10x10x3
+            input_dims={ 'depth': 10, 'time': 10, 'frequency': cruise.frequency.shape[0] },  # A dictionary specifying the size of the inputs in each dimension, e.g. ``{'lat': 30, 'lon': 30}`` These are the dimensions the ML library will see. All other dimensions will be stacked into one dimension called ``sample``.
+            # no overlap between samples
+            input_overlap={ 'depth': 0, 'time': 0, 'frequency': 0 },  # Zero means no overlap. A dictionary specifying the overlap along each dimension
+        )
+        return batch_generator
+
+    def get_s3_batch_generator(self) -> xbatcher.BatchGenerator:
+        """ Returns a BatchGenerator with subsets of Sv data from s3 Zarr store """
+        cruise = self.get_s3_zarr_store()
+
+        # TODO: temporarily limits to a smaller slice of the data
+        cruise_select = (cruise
+            .where(cruise.depth < 100., drop=True)
+            .sel(time=slice("2007-07-11T18:20:33", "2007-07-11T18:20:53"))
+            # .sel(time=slice("2007-07-11T18:20:00", "2007-07-11T19:20:00"))
+        )
+        print(cruise_select.Sv.shape)  # (526 depth, 21 time, 4 freq)
+
+        batch_generator = xbatcher.BatchGenerator(
+            ds=cruise_select,
+            input_dims={ 'depth': 10, 'time': 10, 'frequency': cruise.frequency.shape[0] },  # A dictionary specifying the size of the inputs in each dimension, e.g. ``{'lat': 30, 'lon': 30}`` These are the dimensions the ML library will see. All other dimensions will be stacked into one dimension called ``sample``.
+            input_overlap={ 'depth': 0, 'time': 0, 'frequency': 0 },  # Zero means no overlap. A dictionary specifying the overlap along each dimension
+            preload_batch=False,
+        )
+
+        # TODO: need to raise exception if all the data is nan
+
+        return batch_generator
+        # https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator
+
+    def get_s3_manual_batch_generator(self):
+        """
+        Using just xarray (no xbatcher), iterate through the data and generate batches.
+        Returns a BatchGenerator with subsets of Sv data from s3 Zarr store.
+        """
+        cruise = self.get_s3_zarr_store()
+
+        # TODO: temporarily limits to a smaller slice of the data
+        cruise_select = (cruise
+            .where(cruise.depth < 100., drop=True)
+            .sel(time=slice("2007-07-11T18:20:33", "2007-07-11T18:20:53"))
+        )
+        print(cruise_select.Sv.shape)  # (526 depth, 21 time, 4 freq)
+        batch_generator = xbatcher.BatchGenerator(
+            ds=cruise_select,
+            input_dims={ 'depth': 10, 'time': 10, 'frequency': cruise.frequency.shape[0] },  # A dictionary specifying the size of the inputs in each dimension, e.g. ``{'lat': 30, 'lon': 30}`` These are the dimensions the ML library will see. All other dimensions will be stacked into one dimension called ``sample``.
+            input_overlap={ 'depth': 0, 'time': 0, 'frequency': 0 },  # Zero means no overlap. A dictionary specifying the overlap along each dimension
+            preload_batch=True,
+        )
+
+        # TODO: need to raise exception if all the data is nan
+
+        return batch_generator
+        # https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator
+
+    """
+    (105, 21, 4)
+
+    depth-start: 0.1899999976158142, depth-end: 1.899999976158142
+    time-start: 2007-07-11T18:20:33.657573888, time-end: 2007-07-11T18:20:42.657573888
+    frequency-start: 18000.0, frequency-end: 200000.0
+    (10, 10, 4)
+    np.nanmean: -53.70000076293945
+    """
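A hypothetical driver for the new `BatchDownloader`, using the synthetic toy generator so it runs without S3 access (assumes xbatcher is installed; the import path follows the module's location in the wheel):

```python
from water_column_sonar_processing.processing.batch_downloader import BatchDownloader

downloader = BatchDownloader()
batch_generator = downloader.get_toy_batch_generator()

for batch in batch_generator:
    # Each batch is an xarray.Dataset patch shaped (depth=10, time=10, frequency=3).
    print(batch.Sv.shape, float(batch.Sv.mean()))
    break  # inspect only the first patch
```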
water_column_sonar_processing/processing/raw_to_zarr.py

@@ -11,8 +11,6 @@ from water_column_sonar_processing.aws import DynamoDBManager, S3Manager
 from water_column_sonar_processing.geometry import GeometryManager
 from water_column_sonar_processing.utility import Cleaner, PipelineStatus
 
-TEMPDIR = "/tmp"
-
 
 # This code is getting copied from echofish-aws-raw-to-zarr-lambda
 class RawToZarr:
water_column_sonar_processing/utility/constants.py

@@ -3,11 +3,12 @@ from enum import Enum, Flag, unique
 
 @unique
 class Constants(Flag):
-    TILE_SIZE =
+    TILE_SIZE = 256  # TODO: add tile size to metadata?
+
     # Average https://noaa-wcsd-zarr-pds.s3.us-east-1.amazonaws.com/level_2/Henry_B._Bigelow/HB0902/EK60/HB0902.zarr/time/927
     # chunk size is ~1.3 kB, HB0902 cruise takes ~30 seconds to load all time/lat/lon data
     # NOTE: larger value here will speed up the TurfJS download of data in the UI
-    SPATIOTEMPORAL_CHUNK_SIZE = int(
+    SPATIOTEMPORAL_CHUNK_SIZE = int(2**16)  # 2**17
 
 
 class Coordinates(Enum):
water_column_sonar_processing-24.1.1.dist-info/METADATA

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.2
 Name: water_column_sonar_processing
-Version:
+Version: 24.1.1
 Summary: A processing tool for water column sonar data.
 Author-email: Rudy Klucik <rudy.klucik@noaa.gov>
 Project-URL: Homepage, https://github.com/CI-CMG/water-column-sonar-processing
@@ -8,7 +8,7 @@ Project-URL: Issues, https://github.com/CI-CMG/water-column-sonar-processing/iss
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
-Requires-Python: >=3.
+Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: aiobotocore==2.15.2
@@ -26,26 +26,19 @@ Requires-Dist: pandas==2.2.3
 Requires-Dist: pyarrow==18.1.0
 Requires-Dist: python-dotenv==1.0.1
 Requires-Dist: requests==2.32.3
-Requires-Dist: s3fs==
+Requires-Dist: s3fs==2024.2.0
 Requires-Dist: scipy==1.14.1
 Requires-Dist: setuptools
 Requires-Dist: shapely==2.0.3
 Requires-Dist: typing-extensions==4.10.0
 Requires-Dist: xarray==2024.10.0
+Requires-Dist: xbatcher==0.4.0
 Requires-Dist: zarr==2.18.3
 
 # Water Column Sonar Processing
 Processing tool for converting L0 data to L1 and L2 as well as generating geospatial information
 
-
-
-
-
-
-
-
-
- 
+     
 
 # Setting up the Python Environment
 > Python 3.10.12
@@ -103,12 +96,6 @@ or
 Following this tutorial:
 https://packaging.python.org/en/latest/tutorials/packaging-projects/
 
-# To Publish To PROD
-```commandline
-python -m build
-python -m twine upload --repository pypi dist/*
-```
-
 # Pre Commit Hook
 see here for installation: https://pre-commit.com/
 https://dev.to/rafaelherik/using-trufflehog-and-pre-commit-hook-to-prevent-secret-exposure-edo
@@ -133,13 +120,29 @@ https://colab.research.google.com/drive/1KiLMueXiz9WVB9o4RuzYeGjNZ6PsZU7a#scroll
 # Tag a Release
 Step 1 --> increment the semantic version in the zarr_manager.py "metadata" & the "pyproject.toml"
 ```commandline
-git tag "
+git tag -a v24.01.01 -m "Releasing version v24.01.01"
 ```
-
+
 ```commandline
 git push origin --tags
 ```
 
+# To Publish To PROD
+```commandline
+python -m build
+python -m twine upload --repository pypi dist/*
+```
+
 # TODO:
 add https://pypi.org/project/setuptools-scm/
 for extracting the version
+
+# Security scanning
+> bandit -r water_column_sonar_processing/
+
+# Data Debugging
+Experimental Plotting in Xarray (hvPlot):
+https://colab.research.google.com/drive/18vrI9LAip4xRGEX6EvnuVFp35RAiVYwU#scrollTo=q9_j9p2yXsLV
+
+HB0707 Cruise zoomable:
+https://hb0707.s3.us-east-1.amazonaws.com/index.html
water_column_sonar_processing-24.1.1.dist-info/RECORD

@@ -3,31 +3,32 @@ water_column_sonar_processing/process.py,sha256=-yQtK3rnZq6lGAr3q02zLDe1NuMH9c0P
 water_column_sonar_processing/aws/__init__.py,sha256=KJqK8oYMn-u8n8i-Jp_lG5BvCOTjwWSjWP8yAyDlWVo,297
 water_column_sonar_processing/aws/dynamodb_manager.py,sha256=LQ3eh7Zf1fBLG-RKovod9KbQwhE-0Qdq1JPk4Ro5bdo,10252
 water_column_sonar_processing/aws/s3_manager.py,sha256=-PCiW7YF31nGIPa1oVOVTzjTSExAAkT_IyNNnvWv2HU,16214
-water_column_sonar_processing/aws/s3fs_manager.py,sha256=
+water_column_sonar_processing/aws/s3fs_manager.py,sha256=Vo-DXj6vgb8t1l4LdtNu7JCtq_RfFsnl33RuGeBUXhk,2561
 water_column_sonar_processing/aws/sns_manager.py,sha256=Dp9avG5VSugSWPR1dZ-askuAw1fCZkNUHbOUP65iR-k,1867
 water_column_sonar_processing/aws/sqs_manager.py,sha256=NSUrWmnSC8h8Gf7gT0U8zFaQQ-yX89h0Q0mDLKGqp2Y,1597
 water_column_sonar_processing/cruise/__init__.py,sha256=H5hW0JMORuaFvQk_R31B4VL8RnRyKeanOOiWmqEMZJk,156
-water_column_sonar_processing/cruise/create_empty_zarr_store.py,sha256=
-water_column_sonar_processing/cruise/
-water_column_sonar_processing/cruise/resample_regrid.py,sha256=
-water_column_sonar_processing/geometry/__init__.py,sha256=
+water_column_sonar_processing/cruise/create_empty_zarr_store.py,sha256=ZsFQTDA0gXfQHlxDsXBGD1qQ0ipmx4kS81DcY6ml5Ew,7767
+water_column_sonar_processing/cruise/datatree_manager.py,sha256=Qy4dZCW8_q31lbjxbMsx3JtBS4BvQT17_2P0QD1RQcY,639
+water_column_sonar_processing/cruise/resample_regrid.py,sha256=gz_uP-mBD4JSBRBr69ZvsfmXX4yyBdRG9-P1z3If43E,14246
+water_column_sonar_processing/geometry/__init__.py,sha256=GIzzc-_7pwEwbOkGpc4i_fmjWI5ymllXqzdHq_d3Rio,299
+water_column_sonar_processing/geometry/elevation_manager.py,sha256=eq9w691WJknPwWYkvO3giKTPleIxCVc2tMGR0e8ZRxQ,4267
 water_column_sonar_processing/geometry/geometry_manager.py,sha256=nz5T1vCDWHYIfQ853EqKYHDetTul7jRWS3y8Evep8QU,10855
 water_column_sonar_processing/geometry/geometry_simplification.py,sha256=im1HG9nfYIerQv3w-PUHzphw2B7aGgnsA3Zcdy2oTmA,3016
 water_column_sonar_processing/geometry/pmtile_generation.py,sha256=7Lm08Jr6YaM4nYmexClxbIMOqSV1teo9wMm6dfjFuNA,12384
 water_column_sonar_processing/index/__init__.py,sha256=izEObsKiOoIJ0kZCFhvaYsBd6Ga71XJxnogjrNInw68,68
-water_column_sonar_processing/index/index_manager.py,sha256=
+water_column_sonar_processing/index/index_manager.py,sha256=qsS6rKObJlFXKyzRuT1bk2_qW1YagW-Fg_AkQ1U_KRs,14213
 water_column_sonar_processing/model/__init__.py,sha256=FXaCdbPqxp0ogmZm9NplRirqpgMiYs1iRYgJbFbbX2Y,65
-water_column_sonar_processing/model/zarr_manager.py,sha256=
-water_column_sonar_processing/processing/__init__.py,sha256=
-water_column_sonar_processing/processing/
-water_column_sonar_processing/processing/raw_to_zarr.py,sha256=
+water_column_sonar_processing/model/zarr_manager.py,sha256=Sgh8wXhjTgvQ_UlHGALIbUQA9d7ESdpAT2hJIavpXwM,15507
+water_column_sonar_processing/processing/__init__.py,sha256=tdpSfwnY6lbAS_yBTu4aG0SjPgCKqh6LAFvIj_t3j3U,168
+water_column_sonar_processing/processing/batch_downloader.py,sha256=qXoruHdbgzAolmroK6eRn9bWgeHFgaVQLwhJ6X5oHRE,6299
+water_column_sonar_processing/processing/raw_to_zarr.py,sha256=Sn0_zBT7yYP6abbSTlQBPA6iZSBxeVqPYYSgoroiBEU,17599
 water_column_sonar_processing/utility/__init__.py,sha256=yDObMOL0_OxKWet5wffK2-XVJgoE9iwiY2q04GZrtBQ,234
 water_column_sonar_processing/utility/cleaner.py,sha256=bNbs-hopWxtKAFBK0Eu18xdRErZCGZvtla3j-1bTwQw,619
-water_column_sonar_processing/utility/constants.py,sha256=
+water_column_sonar_processing/utility/constants.py,sha256=AD6RlDrJRVN1GYwRvo7cunLhrdC0F8CyOlbkB_GxL-s,2180
 water_column_sonar_processing/utility/pipeline_status.py,sha256=O-0SySqdRGJ6bs3zQe1NV9vkOpmsRM7zj5QoHgzYioY,4395
 water_column_sonar_processing/utility/timestamp.py,sha256=bO0oir7KxxoEHPGRkz9FCBfOligkocUyRiWRzAq8fnU,361
-water_column_sonar_processing-
-water_column_sonar_processing-
-water_column_sonar_processing-
-water_column_sonar_processing-
-water_column_sonar_processing-
+water_column_sonar_processing-24.1.1.dist-info/LICENSE,sha256=lz4IpJ5_adG3S0ali-WaIpQFVTnEAOucMDQPECUVEYw,1110
+water_column_sonar_processing-24.1.1.dist-info/METADATA,sha256=Bym-EHrC46s9vFs9eN-nqZisesp5r5AFOwCckUVULS8,5474
+water_column_sonar_processing-24.1.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+water_column_sonar_processing-24.1.1.dist-info/top_level.txt,sha256=aRYU4A7RNBlNrL4vzjytFAir3BNnmOgsvIGKKA36tg4,30
+water_column_sonar_processing-24.1.1.dist-info/RECORD,,
water_column_sonar_processing/processing/cruise_sampler.py (deleted file)

@@ -1,342 +0,0 @@
-import gc
-import os
-import echopype as ep
-import numpy as np
-from numcodecs import Blosc
-
-from water_column_sonar_processing.utility import Cleaner
-
-TEMPDIR = "/tmp"
-
-
-# This code is getting copied from echofish-aws-raw-to-zarr-lambda
-class CruiseSampler:
-    #######################################################
-    def __init__(
-        self,
-    ):
-        # TODO: revert to Blosc.BITSHUFFLE, troubleshooting misc error
-        self.__compressor = Blosc(cname="zstd", clevel=2)  # shuffle=Blosc.NOSHUFFLE
-        self.bucket_name = os.environ.get("INPUT_BUCKET_NAME")
-        # self.__s3 = s3_operations
-
-    ############################################################################
-    ############################################################################
-    def __zarr_info_to_table(
-        self,
-        file_name,
-        cruise_name,
-        zarr_path,
-        min_echo_range,
-        max_echo_range,
-        num_ping_time_dropna,
-        start_time,
-        end_time,
-        frequencies,
-        channels
-    ):
-        print('Writing Zarr information to DynamoDB table.')
-        self.__dynamo.update_item(
-            table_name=self.__table_name,
-            key={
-                'FILE_NAME': {'S': file_name},  # Partition Key
-                'CRUISE_NAME': {'S': cruise_name},  # Sort Key
-                # TODO: should be FILE_NAME & SENSOR_NAME so they are truely unique for when two sensors are processed within one cruise
-            },
-            expression='SET #ZB = :zb, #ZP = :zp, #MINER = :miner, #MAXER = :maxer, #P = :p, #ST = :st, #ET = :et, #F = :f, #C = :c',
-            attribute_names={
-                '#ZB': 'ZARR_BUCKET',
-                '#ZP': 'ZARR_PATH',
-                '#MINER': 'MIN_ECHO_RANGE',
-                '#MAXER': 'MAX_ECHO_RANGE',
-                '#P': 'NUM_PING_TIME_DROPNA',
-                '#ST': 'START_TIME',
-                '#ET': 'END_TIME',
-                '#F': 'FREQUENCIES',
-                '#C': 'CHANNELS',
-            },
-            attribute_values={
-                ':zb': {
-                    'S': self.__output_bucket
-                },
-                ':zp': {
-                    'S': zarr_path
-                },
-                ':miner': {
-                    'N': str(np.round(min_echo_range, 4))
-                },
-                ':maxer': {
-                    'N': str(np.round(max_echo_range, 4))
-                },
-                ':p': {
-                    'N': str(num_ping_time_dropna)
-                },
-                ':st': {
-                    'S': start_time
-                },
-                ':et': {
-                    'S': end_time
-                },
-                ':f': {
-                    'L': [{'N': str(i)} for i in frequencies]
-                },
-                ':c': {
-                    'L': [{'S': i} for i in channels]
-                }
-            }
-        )
-
-    ############################################################################
-    ############################################################################
-    ############################################################################
-    def raw_to_zarr(
-        self,
-        ship_name,
-        cruise_name,
-        sensor_name,
-        file_name,
-    ):
-        print(f'Opening raw: {file_name} and creating zarr store.')
-        geometry_manager = GeometryManager()
-        try:
-            gc.collect()
-            print('Opening raw file with echopype.')
-            bucket_name="test_input_bucket"  # noaa-wcsd-pds
-            s3_file_path = f"s3://{bucket_name}/data/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}"
-            # s3_file_path = Path(f"s3://noaa-wcsd-pds/data/raw/{ship_name}/{cruise_name}/{sensor_name}/{file_name}")
-            # TODO: add the bottom file here
-            echodata = ep.open_raw(
-                raw_file=s3_file_path,
-                sonar_model=sensor_name,
-                # include_bot=True,
-                use_swap=True,
-                # max_chunk_size=100,
-                # storage_options={'anon': True}  # this was creating problems
-            )
-            print('Compute volume backscattering strength (Sv) from raw data.')
-            ds_sv = ep.calibrate.compute_Sv(echodata)
-            print('Done computing volume backscattering strength (Sv) from raw data.')
-            frequencies = echodata.environment.frequency_nominal.values
-            #################################################################
-            # Get GPS coordinates
-            gps_data, lat, lon = geometry_manager.read_echodata_gps_data(
-                echodata=echodata,
-                ship_name=ship_name,
-                cruise_name=cruise_name,
-                sensor_name=sensor_name,
-                file_name=file_name,
-                write_geojson=True
-            )
-            # gps_data, lat, lon = self.__get_gps_data(echodata=echodata)
-            #################################################################
-            # Technically the min_echo_range would be 0 m.
-            # TODO: this var name is supposed to represent minimum resolution of depth measurements
-            # The most minimum the resolution can be is as small as 0.25 meters
-            min_echo_range = np.maximum(0.25, np.nanmin(np.diff(ds_sv.echo_range.values)))
-            max_echo_range = float(np.nanmax(ds_sv.echo_range))
-            #
-            num_ping_time_dropna = lat[~np.isnan(lat)].shape[0]  # symmetric to lon
-            #
-            start_time = np.datetime_as_string(ds_sv.ping_time.values[0], unit='ms') + "Z"
-            end_time = np.datetime_as_string(ds_sv.ping_time.values[-1], unit='ms') + "Z"
-            channels = list(ds_sv.channel.values)
-            #
-            #################################################################
-            # Create the zarr store
-            ds_sv.to_zarr(store=store_name)
-            #################################################################
-            print('Note: Adding GeoJSON inside Zarr store')
-            self.__write_geojson_to_file(store_name=store_name, data=gps_data)
-            #################################################################
-            self.__zarr_info_to_table(
-                file_name=raw_file_name,
-                cruise_name=cruise_name,
-                zarr_path=os.path.join(output_zarr_prefix, store_name),
-                min_echo_range=min_echo_range,
-                max_echo_range=max_echo_range,
-                num_ping_time_dropna=num_ping_time_dropna,
-                start_time=start_time,
-                end_time=end_time,
-                frequencies=frequencies,
-                channels=channels
-            )
-        except Exception as err:
-            print(f'Exception encountered creating local Zarr store with echopype: {err}')
-            raise RuntimeError(f"Problem creating local Zarr store, {err}")
-        print('Done creating local zarr store.')
-
-    ############################################################################
-    def __upload_files_to_output_bucket(
-        self,
-        local_directory,
-        object_prefix,
-    ):
-        # Note: this will be passed credentials if using NODD
-        print('Uploading files using thread pool executor.')
-        all_files = []
-        for subdir, dirs, files in os.walk(local_directory):
-            for file in files:
-                local_path = os.path.join(subdir, file)
-                s3_key = os.path.join(object_prefix, local_path)
-                all_files.append([local_path, s3_key])
-        # all_files
-        all_uploads = self.__s3.upload_files_with_thread_pool_executor(
-            bucket_name=self.__output_bucket,
-            all_files=all_files,
-            access_key_id=self.__output_bucket_access_key,
-            secret_access_key=self.__output_bucket_secret_access_key
-        )
-        return all_uploads
-
-    ############################################################################
-    def execute(self, input_message):
-        ship_name = input_message['shipName']
-        cruise_name = input_message['cruiseName']
-        sensor_name = input_message['sensorName']
-        input_file_name = input_message['fileName']
-        #
-        try:
-            self.__update_processing_status(
-                file_name=input_file_name,
-                cruise_name=cruise_name,
-                pipeline_status="PROCESSING_RAW_TO_ZARR"
-            )
-            #######################################################################
-            store_name = f"{os.path.splitext(input_file_name)[0]}.zarr"
-            output_zarr_prefix = f"level_1/{ship_name}/{cruise_name}/{sensor_name}"
-            bucket_key = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/{input_file_name}"
-            zarr_prefix = os.path.join("level_1", ship_name, cruise_name, sensor_name)
-            #
-            os.chdir(TEMPDIR)  # Lambdas require use of temp directory
-            #######################################################################
-            #######################################################################
-            # Check if zarr store already exists
-            s3_objects = self.__s3.list_objects(
-                bucket_name=self.__output_bucket,
-                prefix=f"{zarr_prefix}/{os.path.splitext(input_file_name)[0]}.zarr/",
-                access_key_id=self.__output_bucket_access_key,
-                secret_access_key=self.__output_bucket_secret_access_key
-            )
-            if len(s3_objects) > 0:
-                print('Zarr store data already exists in s3, deleting existing and continuing.')
-                self.__s3.delete_objects(
-                    bucket_name=self.__output_bucket,
-                    objects=s3_objects,
-                    access_key_id=self.__output_bucket_access_key,
-                    secret_access_key=self.__output_bucket_secret_access_key
-                )
-            #######################################################################
-            # self.__delete_all_local_raw_and_zarr_files()
-            Cleaner.delete_local_files(file_types=["*.raw*", "*.zarr"])
-            self.__s3.download_file(
-                bucket_name=self.__input_bucket,
-                key=bucket_key,
-                file_name=input_file_name
-            )
-            self.__create_local_zarr_store(
-                raw_file_name=input_file_name,
-                cruise_name=cruise_name,
-                sensor_name=sensor_name,
-                output_zarr_prefix=output_zarr_prefix,
-                store_name=store_name
-            )
-            #######################################################################
-            self.__upload_files_to_output_bucket(store_name, output_zarr_prefix)
-            #######################################################################
-            # # TODO: verify count of objects matches
-            # s3_objects = self.__s3.list_objects(
-            #     bucket_name=self.__output_bucket,
-            #     prefix=f"{zarr_prefix}/{os.path.splitext(input_file_name)[0]}.zarr/",
-            #     access_key_id=self.__output_bucket_access_key,
-            #     secret_access_key=self.__output_bucket_secret_access_key
-            # )
-            #######################################################################
-            self.__update_processing_status(
-                file_name=input_file_name,
-                cruise_name=cruise_name,
-                pipeline_status='SUCCESS_RAW_TO_ZARR'
-            )
-            #######################################################################
-            self.__publish_done_message(input_message)
-            #######################################################################
-        # except Exception as err:
-        #     print(f'Exception encountered: {err}')
-        #     self.__update_processing_status(
-        #         file_name=input_file_name,
-        #         cruise_name=cruise_name,
-        #         pipeline_status='FAILURE_RAW_TO_ZARR',
-        #         error_message=str(err),
-        #     )
-        finally:
-            self.__delete_all_local_raw_and_zarr_files()
-        #######################################################################
-
-    ############################################################################
-
-    ################################################################################
-    ############################################################################
-    # TODO: DELETE
-    # def __get_gps_data(
-    #     self,
-    #     echodata: ep.echodata.echodata.EchoData
-    # ) -> tuple:
-    #     print('Getting GPS data.')
-    #     try:
-    #         # if 'latitude' not in echodata.platform.variables and 'longitude' not in echodata.platform.variables:
-    #         #     raise KeyError;
-    #         assert(  # TODO: raise error, e.g. KeyError
-    #             'latitude' in echodata.platform.variables and 'longitude' in echodata.platform.variables
-    #         ), "Problem: GPS coordinates not found in echodata."
-    #         latitude = echodata.platform.latitude.values
-    #         longitude = echodata.platform.longitude.values  # len(longitude) == 14691
-    #         # RE: time coordinates: https://github.com/OSOceanAcoustics/echopype/issues/656#issue-1219104771
-    #         assert(
-    #             'time1' in echodata.platform.variables and 'time1' in echodata.environment.variables
-    #         ), "Problem: Time coordinate not found in echodata."
-    #         # 'nmea_times' are times from the nmea datalogger associated with GPS
-    #         # nmea times, unlike env times, can be sorted
-    #         nmea_times = np.sort(echodata.platform.time1.values)
-    #         # 'time1' are times from the echosounder associated with transducer measurement
-    #         time1 = echodata.environment.time1.values
-    #         # Align 'sv_times' to 'nmea_times'
-    #         assert(
-    #             np.all(time1[:-1] <= time1[1:]) and np.all(nmea_times[:-1] <= nmea_times[1:])
-    #         ), "Problem: NMEA time stamps are not sorted."
-    #         # Finds the indices where 'v' can be inserted just to the right of 'a'
-    #         indices = np.searchsorted(a=nmea_times, v=time1, side="right") - 1
-    #         #
-    #         lat = latitude[indices]
-    #         lat[indices < 0] = np.nan  # values recorded before indexing are set to nan
-    #         lon = longitude[indices]
-    #         lon[indices < 0] = np.nan
-    #         if len(lat) < 2 or len(lon) < 2:
-    #             raise Exception("There was not enough data in lat or lon to create geojson.")
-    #         assert(  # TODO: raise ValueError
-    #             np.all(lat[~np.isnan(lat)] >= -90.) and np.all(lat[~np.isnan(lat)] <= 90.) and np.all(lon[~np.isnan(lon)] >= -180.) and np.all(lon[~np.isnan(lon)] <= 180.)
-    #         ), "Problem: Data falls outside GPS bounds!"
-    #         # TODO: check for visits to null island
-    #         # https://osoceanacoustics.github.io/echopype-examples/echopype_tour.html
-    #         print(np.count_nonzero(np.isnan(lat)))
-    #         print(np.count_nonzero(np.isnan(lon)))
-    #         if len(lat[~np.isnan(lat)]) < 1:
-    #             raise RuntimeError(f"Problem all data is NaN.")
-    #         time1 = time1[~np.isnan(lat)]
-    #         lat = lat[~np.isnan(lat)]
-    #         lon = lon[~np.isnan(lon)]
-    #         #
-    #         gps_df = pd.DataFrame({
-    #             'latitude': lat,
-    #             'longitude': lon,
-    #             'time1': time1
-    #         }).set_index(['time1'])
-    #         gps_gdf = geopandas.GeoDataFrame(
-    #             gps_df,
-    #             geometry=geopandas.points_from_xy(gps_df['longitude'], gps_df['latitude']),
-    #             crs="epsg:4326"  # TODO: does this sound right?
-    #         )
-    #         # GeoJSON FeatureCollection with IDs as "time1"
-    #         geo_json = gps_gdf.to_json()
-    #     except Exception as err:
-    #         print(f'Exception encountered creating local Zarr store with echopype: {err}')
-    #         raise
-    #     # return geo_json, lat, lon
water_column_sonar_processing-24.1.1.dist-info/LICENSE: file without changes

water_column_sonar_processing-24.1.1.dist-info/top_level.txt: file without changes