water-column-sonar-processing 25.3.2__py3-none-any.whl → 25.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of water-column-sonar-processing might be problematic.

Files changed (32)
  1. water_column_sonar_processing/aws/dynamodb_manager.py +6 -6
  2. water_column_sonar_processing/aws/s3_manager.py +95 -90
  3. water_column_sonar_processing/aws/s3fs_manager.py +5 -3
  4. water_column_sonar_processing/aws/sqs_manager.py +1 -1
  5. water_column_sonar_processing/cruise/__init__.py +2 -1
  6. water_column_sonar_processing/cruise/create_empty_zarr_store.py +49 -43
  7. water_column_sonar_processing/cruise/create_empty_zarr_store_level_3.py +161 -0
  8. water_column_sonar_processing/cruise/datatree_manager.py +21 -21
  9. water_column_sonar_processing/cruise/resample_regrid.py +57 -47
  10. water_column_sonar_processing/dataset/__init__.py +3 -0
  11. water_column_sonar_processing/dataset/dataset_manager.py +205 -0
  12. water_column_sonar_processing/dataset/feature_manager.py +32 -0
  13. water_column_sonar_processing/geometry/geometry_manager.py +11 -12
  14. water_column_sonar_processing/geometry/line_simplification.py +26 -1
  15. water_column_sonar_processing/geometry/pmtile_generation.py +211 -247
  16. water_column_sonar_processing/index/index_manager.py +18 -17
  17. water_column_sonar_processing/model/zarr_manager.py +504 -256
  18. water_column_sonar_processing/processing/__init__.py +3 -2
  19. water_column_sonar_processing/processing/batch_downloader.py +11 -11
  20. water_column_sonar_processing/processing/raw_to_netcdf.py +319 -0
  21. water_column_sonar_processing/processing/raw_to_zarr.py +41 -31
  22. water_column_sonar_processing/utility/__init__.py +9 -2
  23. water_column_sonar_processing/utility/cleaner.py +1 -2
  24. water_column_sonar_processing/utility/constants.py +26 -7
  25. water_column_sonar_processing/utility/timestamp.py +1 -0
  26. water_column_sonar_processing-25.8.0.dist-info/METADATA +162 -0
  27. water_column_sonar_processing-25.8.0.dist-info/RECORD +39 -0
  28. {water_column_sonar_processing-25.3.2.dist-info → water_column_sonar_processing-25.8.0.dist-info}/WHEEL +1 -1
  29. water_column_sonar_processing-25.3.2.dist-info/licenses/LICENSE → water_column_sonar_processing-25.8.0.dist-info/licenses/LICENSE-MIT +1 -1
  30. water_column_sonar_processing-25.3.2.dist-info/METADATA +0 -170
  31. water_column_sonar_processing-25.3.2.dist-info/RECORD +0 -34
  32. {water_column_sonar_processing-25.3.2.dist-info → water_column_sonar_processing-25.8.0.dist-info}/top_level.txt +0 -0
--- a/water_column_sonar_processing/geometry/geometry_manager.py
+++ b/water_column_sonar_processing/geometry/geometry_manager.py
@@ -42,7 +42,7 @@ class GeometryManager:
         file_name_stem = Path(file_name).stem
         geo_json_name = f"{file_name_stem}.json"

-        print("Getting GPS data from echopype object.")
+        print("Getting GPS dataset from echopype object.")
         try:
             latitude = np.round(
                 echodata.platform.latitude.values, self.DECIMAL_PRECISION
@@ -56,7 +56,7 @@ class GeometryManager:
             # note that nmea_times, unlike time1, can be sorted
             nmea_times = np.sort(echodata.platform.time1.values)

-            # 'time1' are times from the echosounder associated with the data of the transducer measurement
+            # 'time1' are times from the echosounder associated with the dataset of the transducer measurement
             time1 = echodata.environment.time1.values

             if len(nmea_times) < len(time1):
@@ -98,14 +98,14 @@ class GeometryManager:

             # create requirement for minimum linestring size
             MIN_ALLOWED_SIZE = (
-                4  # don't want to process files with less than 4 data points
+                4  # don't want to process files with less than 4 dataset points
             )
             if (
                 len(lat[~np.isnan(lat)]) < MIN_ALLOWED_SIZE
                 or len(lon[~np.isnan(lon)]) < MIN_ALLOWED_SIZE
             ):
                 raise Exception(
-                    f"There was not enough data in lat or lon to create geojson, {len(lat[~np.isnan(lat)])} found, less than {MIN_ALLOWED_SIZE}."
+                    f"There was not enough dataset in lat or lon to create geojson, {len(lat[~np.isnan(lat)])} found, less than {MIN_ALLOWED_SIZE}."
                 )

             # https://osoceanacoustics.github.io/echopype-examples/echopype_tour.html
@@ -124,7 +124,7 @@ class GeometryManager:
                 crs="epsg:4326",
             )
             # Note: We set np.nan to 0,0 so downstream missing values can be omitted
-            # TODO: so what ends up here is data with corruption at null island!!!
+            # TODO: so what ends up here is dataset with corruption at null island!!!
             geo_json_line = gps_gdf.to_json()
             if write_geojson:
                 print("Creating local copy of geojson file.")
@@ -180,12 +180,12 @@ class GeometryManager:
             #################################################################
             # GeoJSON FeatureCollection with IDs as "time"
         except Exception as err:
-            print(
-                f"Exception encountered extracting gps coordinates creating geojson: {err}"
+            raise RuntimeError(
+                f"Exception encountered extracting gps coordinates creating geojson, {err}"
             )
-            raise
+
         # Note: returned lat/lon values can include np.nan because they need to be aligned with
-        # the Sv data! GeoJSON needs simplification but has been filtered.
+        # the Sv dataset! GeoJSON needs simplification but has been filtered.
         # return gps_df.index.values, gps_df.latitude.values, gps_df.longitude.values
         return gps_df.index.values, lat, lon
         # TODO: if geojson is already returned with 0,0, the return here
@@ -229,9 +229,8 @@ class GeometryManager:
             indices = np.searchsorted(a=aa, v=vv)

             return indices, geospatial
-        except Exception as err:  # Failure
-            print(f"Exception encountered reading s3 GeoJSON: {err}")
-            raise
+        except Exception as err:
+            raise RuntimeError(f"Exception encountered reading s3 GeoJSON, {err}")

     ############################################################################
     # COMES from the raw-to-zarr conversion
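
Both geometry_manager.py hunks above swap a print-and-re-raise pattern for a wrapped RuntimeError. A minimal sketch of the two idioms for comparison (function names here are illustrative, not from the package; note the new package code raises without `from err`, so the original exception survives only as implicit context rather than an explicit cause):

def old_style(read_geojson):
    try:
        return read_geojson()
    except Exception as err:
        print(f"Exception encountered reading s3 GeoJSON: {err}")
        raise  # re-raises the original exception unchanged


def new_style(read_geojson):
    try:
        return read_geojson()
    except Exception as err:
        # wrapping replaces the original type; 'from err' makes the cause explicit
        raise RuntimeError(f"Exception encountered reading s3 GeoJSON, {err}") from err
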
--- a/water_column_sonar_processing/geometry/line_simplification.py
+++ b/water_column_sonar_processing/geometry/line_simplification.py
@@ -4,6 +4,10 @@ import numpy as np
 from pykalman import KalmanFilter
 from shapely.geometry import Point

+# import hvplot.pandas
+# from holoviews import opts
+# hv.extension('bokeh')
+
 # import matplotlib.pyplot as plt


@@ -24,6 +28,16 @@ def mph_to_knots(mph_value):
     return mph_value * 0.868976


+def mps_to_knots(mps_value):
+    return mps_value * 1.94384
+
+
+###############################################################################
+# Colab Notebook:
+# https://colab.research.google.com/drive/1Ihb1x0EeYRNwGJ4Bqi4RqQQHu9-40oDk?usp=sharing#scrollTo=hIPziqVO48Xg
+###############################################################################
+
+
 # https://shapely.readthedocs.io/en/stable/reference/shapely.MultiLineString.html#shapely.MultiLineString
 class LineSimplification:
     """
@@ -89,7 +103,7 @@ class LineSimplification:
     #######################################################
     def get_speeds(
         self,
-        times: np.ndarray,  # don't really need time, do need to segment the data first
+        times: np.ndarray,  # don't really need time, do need to segment the dataset first
         latitudes: np.ndarray,
         longitudes: np.ndarray,
     ) -> np.ndarray:
@@ -147,4 +161,15 @@ class LineSimplification:
     #######################################################


+# [(-72.2001724243164, 40.51750183105469),  # latBB
+#  (-72.20023345947266, 40.51749038696289),
+#  (-72.20033264160156, 40.51750183105469),  # lonAA, latBB
+#  (-72.20030212402344, 40.517391204833984),
+#  (-72.20033264160156, 40.517330169677734),  # lonAA, latCC
+#  (-72.2003402709961, 40.51729965209961),
+#  (-72.20033264160156, 40.517330169677734),  # lonAA, latCC
+#  (-72.20040130615234, 40.5172004699707),
+#  (-72.20050048828125, 40.51716995239258),
+#  (-72.2004623413086, 40.51710891723633)]
+
 ###########################################################
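
The diff shows only the `get_speeds` signature (times, latitudes, longitudes -> np.ndarray) and a commented trace of closely spaced GPS fixes, not the implementation itself. As a rough illustration of what such a speed check could compute, here is a hedged haversine-based sketch (an assumption consistent with the signature, not the package's actual code):

import numpy as np

EARTH_RADIUS_M = 6_371_000.0


def haversine_m(lat1, lon1, lat2, lon2):
    # great-circle distance in meters between paired coordinate arrays
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    a = (
        np.sin((lat2 - lat1) / 2) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2
    )
    return 2 * EARTH_RADIUS_M * np.arcsin(np.sqrt(a))


def get_speeds_sketch(times, latitudes, longitudes):
    # seconds elapsed between consecutive fixes (times as np.datetime64)
    dt = np.diff(times).astype("timedelta64[ns]").astype(np.float64) / 1e9
    meters = haversine_m(latitudes[:-1], longitudes[:-1], latitudes[1:], longitudes[1:])
    return (meters / dt) * 1.94384  # m/s -> knots, matching mps_to_knots
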
--- a/water_column_sonar_processing/geometry/pmtile_generation.py
+++ b/water_column_sonar_processing/geometry/pmtile_generation.py
@@ -1,10 +1,4 @@
-import glob
-import os
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from pathlib import Path
-
 import fiona
-import geopandas
 import geopandas as gpd
 import numpy as np
 import pandas as pd
@@ -16,282 +10,252 @@ MAX_CONCURRENCY = 64
 MAX_WORKERS = 64
 GB = 1024**3

+bucket_name = "noaa-wcsd-zarr-pds"
+ship_name = "Henry_B._Bigelow"
+sensor_name = "EK60"
+
+# TODO: get pmtiles of all the evr points
+

 class PMTileGeneration(object):
     """
-    TODO: need to
-    - iterate through the zarr stores for all cruises
-    - generate geojson in geopandas df
-    - consolidate into singular df, one cruise per row
-    - export as _shape?_ file
-    - document next steps creating pmtiles with linux commands
-    - upload to s3
+    - iterate through the zarr stores for all cruises
+    - generate geojson in geopandas df, simplify linestrings
+    - consolidate into singular df, one cruise per row
+    - export as geojson
+    - using tippecanoe, geojson --> pmtiles w linux command
+    - upload to s3
     """

     #######################################################
     def __init__(
         self,
     ):
-        print("123")
+        self.bucket_name = "noaa-wcsd-zarr-pds"
+        self.ship_name = "Henry_B._Bigelow"
+        self.sensor_name = "EK60"

     #######################################################
-    # This uses a local collection of file-level geojson files to create the data
-    def generate_geojson_feature_collection(self):
-        # This was used to read from noaa-wcsd-model-pds bucket geojson files and then to
-        # generate the geopandas dataframe which could be exported to another comprehensive
-        # geojson file. That
-        result = list(Path("/Users/r2d2/Documents/echofish/geojson").rglob("*.json"))
-        # result = result[:100]
-        jjj = 0
-        pieces = []
-        for jjj in range(len(result)):
-            file_name = os.path.normpath(result[jjj]).split(os.sep)[-1]
-            file_stem = os.path.splitext(os.path.basename(file_name))[0]
-            geom = gpd.read_file(result[jjj]).iloc[0]["geometry"]
-            # TDOO: Filter (0,0) coordinates
-            if len(geom.coords.xy[0]) < 2:
-                continue
-            geom = LineString(list(zip(geom.coords.xy[1], geom.coords.xy[0])))
-            pieces.append(
-                {
-                    "ship_name": os.path.normpath(result[jjj]).split(os.sep)[-4],
-                    "cruise_name": os.path.normpath(result[jjj]).split(os.sep)[-3],
-                    "file_stem": file_stem,
-                    "file_path": result[jjj],
-                    "geom": geom,
-                }
-            )
-        df = pd.DataFrame(pieces)
-        print(df)
-        gps_gdf = gpd.GeoDataFrame(
-            data=df[
-                ["ship_name", "cruise_name", "file_stem"]
-            ],  # try again with file_stem
-            geometry=df["geom"],
-            crs="EPSG:4326",
-        )
-        print(fiona.supported_drivers)
-        # gps_gdf.to_file('dataframe.shp', crs='epsg:4326')
-        # Convert geojson feature collection to pmtiles
-        gps_gdf.to_file("dataframe.geojson", driver="GeoJSON", crs="epsg:4326")
-        print("done")
-        """
-        # need to eliminate visits to null island
-        tippecanoe --no-feature-limit -zg --projection=EPSG:4326 -o dataframe.pmtiles -l cruises dataframe.geojson
-
-        https://docs.protomaps.com/pmtiles/create
-        PMTiles
-        https://drive.google.com/file/d/17Bi-UIXB9IJkIz30BHpiKHXYpCOgRFge/view?usp=sharing
-
-        Viewer
-        https://protomaps.github.io/PMTiles/#map=8.91/56.0234/-166.6346
-        """
+    def check_all_cruises(self, bucket_name, cruises):
+        completed = []
+        for cruise_name in cruises:
+            print(cruise_name)
+            try:
+                zarr_store = f"{cruise_name}.zarr"
+                s3_zarr_store_path = f"{bucket_name}/level_2/{ship_name}/{cruise_name}/{sensor_name}/{zarr_store}"
+                cruise = xr.open_dataset(
+                    filename_or_obj=f"s3://{s3_zarr_store_path}",
+                    engine="zarr",
+                    storage_options={"anon": True},
+                )
+                width = cruise.Sv.shape[1]
+                height = cruise.Sv.shape[0]
+                depth = cruise.Sv.shape[2]
+                print(
+                    f"height: {height}, width: {width}, depth: {depth} = {width * height * depth}"
+                )
+                lats = cruise.latitude.to_numpy()
+                percent_done = np.count_nonzero(~np.isnan(lats)) / width
+                if percent_done != 1.0:
+                    print(
+                        f"percent done: {np.round(percent_done, 2)}, {np.count_nonzero(~np.isnan(cruise.latitude.values))}, {width}"
+                    )
+                else:
+                    completed.append(cruise_name)
+            except Exception as err:
+                raise RuntimeError(f"Problem parsing Zarr stores, {err}")
+        return completed

     #######################################################
-    # TODO: temporary using this to get info
-    def get_info_from_zarr_store(
-        self,
-        ship_name,
-        cruise_names,
-    ):
-        # TODO: NOT USED ANYWHERE
-        total_size = 0
-        # s3_fs = s3fs.S3FileSystem(anon=True)
-        for cruise_name in cruise_names:
-            s3_path = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
-            # zarr_store = s3fs.S3Map(root=s3_path, s3=s3_fs)
-            xr_store = xr.open_dataset(
-                filename_or_obj=s3_path,
+    def get_cruise_geometry(self, cruise_name, index):
+        print(cruise_name)
+        try:
+            pieces = []
+            zarr_store = f"{cruise_name}.zarr"
+            s3_zarr_store_path = f"{bucket_name}/level_2/{ship_name}/{cruise_name}/{sensor_name}/{zarr_store}"
+            cruise = xr.open_dataset(
+                filename_or_obj=f"s3://{s3_zarr_store_path}",
                 engine="zarr",
                 storage_options={"anon": True},
-                chunks={},  # this allows the engine to define the chunk scheme
+                chunks={},
                 cache=True,
             )
-            print(f"Cruise: {cruise_name}, shape: {xr_store.time.shape[0]}")
-            total_size = total_size + xr_store.time.shape[0]
-
-    def get_geospatial_info_from_zarr_store(
-        self,
-        ship_name,
-        cruise_name,
-    ):
-        """
-        Open Zarr store, create geometry, write to geojson, return name
-        """
-        # s3_fs = s3fs.S3FileSystem(anon=True)
-        gps_gdf = geopandas.GeoDataFrame(
-            columns=["id", "ship", "cruise", "sensor", "geometry"],
-            geometry="geometry",
-            crs="EPSG:4326",
-        )
-        s3_path = f"s3://noaa-wcsd-zarr-pds/level_2/{ship_name}/{cruise_name}/EK60/{cruise_name}.zarr"
-        # TODO: try-except to allow failures
-        print("opening store")
-        xr_store = xr.open_dataset(
-            filename_or_obj=s3_path,
-            engine="zarr",
-            storage_options={"anon": True},
-            chunks={},  # this allows the engine to define the chunk scheme
-            cache=True,
-        )
-        print(xr_store.Sv.shape)
-        # ---Read Zarr Store Time/Latitude/Longitude--- #
-        latitude = xr_store.latitude.values
-        longitude = xr_store.longitude.values
-        if np.isnan(latitude).any() or np.isnan(longitude).any():
-            print(f"there was missing lat-lon data for {cruise_name}")
-            return None
-        # ---Add To GeoPandas Dataframe--- #
-        # TODO: experiment with tolerance "0.001"
-        geom = LineString(list(zip(longitude, latitude))).simplify(
-            tolerance=0.001, preserve_topology=True
-        )
-        gps_gdf.loc[0] = (
-            0,
-            "Henry_B._Bigelow",
-            cruise_name,
-            "EK60",
-            geom,
-        )  # (ship, cruise, sensor, geometry)
-        gps_gdf.set_index("id", inplace=True)
-        gps_gdf.to_file(
-            f"dataframe_{cruise_name}.geojson", driver="GeoJSON"
-        )  # , engine="pyogrio")
-        return cruise_name
+            latitude_array = cruise.latitude.to_numpy()
+            longitude_array = cruise.longitude.to_numpy()
+            if np.isnan(latitude_array).any() or np.isnan(longitude_array).any():
+                raise RuntimeError(
+                    f"There was missing lat-lon dataset for, {cruise_name}"
+                )
+            geom = LineString(list(zip(longitude_array, latitude_array))).simplify(
+                tolerance=0.001,  # preserve_topology=True  # 113
+            )  # TODO: do speed check, convert linestrings to multilinestrings
+            print(len(geom.coords))
+            pieces.append(
+                {
+                    "id": index,
+                    "ship_name": ship_name,
+                    "cruise_name": cruise_name,
+                    "sensor_name": sensor_name,
+                    "geom": geom,
+                }
+            )
+            df = pd.DataFrame(pieces)
+            gps_gdf = gpd.GeoDataFrame(
+                data=df[["id", "ship_name", "cruise_name", "sensor_name"]],
+                geometry=df["geom"],
+                crs="EPSG:4326",
+            )
+            print(gps_gdf)
+            # {'DXF': 'rw', 'CSV': 'raw', 'OpenFileGDB': 'raw', 'ESRIJSON': 'r', 'ESRI Shapefile': 'raw', 'FlatGeobuf': 'raw', 'GeoJSON': 'raw', 'GeoJSONSeq': 'raw', 'GPKG': 'raw', 'GML': 'rw', 'OGR_GMT': 'rw', 'GPX': 'rw', 'MapInfo File': 'raw', 'DGN': 'raw', 'S57': 'r', 'SQLite': 'raw', 'TopoJSON': 'r'}
+            if "GeoJSON" not in fiona.supported_drivers.keys():
+                raise RuntimeError("Missing GeoJSON driver")

-    #######################################################
-    def open_zarr_stores_with_thread_pool_executor(
-        self,
-        cruises: list,
-    ):
-        # 'cruises' is a list of cruises to process
-        completed_cruises = []
-        try:
-            with ThreadPoolExecutor(max_workers=32) as executor:
-                futures = [
-                    executor.submit(
-                        self.get_geospatial_info_from_zarr_store,
-                        "Henry_B._Bigelow",  # ship_name
-                        cruise,  # cruise_name
-                    )
-                    for cruise in cruises
-                ]
-                for future in as_completed(futures):
-                    result = future.result()
-                    if result:
-                        completed_cruises.extend([result])
+            gps_gdf.set_index("id", inplace=True)
+            # gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON")  # , crs="epsg:4326")
+            return gps_gdf
         except Exception as err:
-            print(err)
-        print("Done opening zarr stores using thread pool.")
-        return completed_cruises  # Took ~12 minutes
+            raise RuntimeError(f"Problem parsing Zarr stores, {err}")

     #######################################################
-    # https://docs.protomaps.com/pmtiles/create
-    def aggregate_geojson_into_dataframe(self):
-        """
-        iterate through cruises, threadpoolexecute geojson creation, aggregate geojson files into df,
-        """
-        gps_gdf = geopandas.GeoDataFrame(
+    def aggregate_geojson_into_dataframe(self, geoms):
+        gps_gdf = gpd.GeoDataFrame(
             columns=["id", "ship", "cruise", "sensor", "geometry"],
             geometry="geometry",
             crs="EPSG:4326",
         )
-
-        file_type = "dataframe_*.geojson"
-        geojson_files = glob.glob(file_type)
-        for jjj in range(len(geojson_files)):
-            print(jjj)
-            geom = geopandas.read_file(geojson_files[jjj])
-            gps_gdf.loc[jjj] = (
-                jjj,
-                geom.ship[0],
-                geom.cruise[0],
-                geom.sensor[0],
-                geom.geometry[0],
+        for iii, geom in enumerate(geoms):
+            gps_gdf.loc[iii] = (
+                iii,
+                geom.ship_name[iii],
+                geom.cruise_name[iii],
+                geom.sensor_name[iii],
+                geom.geometry[iii],
             )
-        # gps_gdf.loc[0] = (0, "Henry_B._Bigelow", cruise_name, "EK60", geom)  # (ship, cruise, sensor, geometry)
-        print(gps_gdf)
         gps_gdf.set_index("id", inplace=True)
         gps_gdf.to_file(
-            "data.geojson",
+            filename="dataset.geojson",
             driver="GeoJSON",
-            engine="pyogrio",
+            engine="fiona",  # or "pyogrio"
             layer_options={"ID_GENERATE": "YES"},
+            crs="EPSG:4326",
+            id_generate=True,  # required for the feature click selection
         )
-        return list(gps_gdf.cruise)
-
-        # gps_gdf.loc[iii] = (iii, "Henry_B._Bigelow", cruise_name, "EK60", geom)  # (ship, cruise, sensor, geometry)
-        # print('writing to file')
-        # print(gps_gdf)
-        # gps_gdf.set_index('id', inplace=True)
-        # gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
-        # https://gdal.org/en/latest/drivers/vector/jsonfg.html
-        # gps_gdf.to_file(
-        #     f"data.geojson",
-        #     driver="GeoJSON",
-        #     engine="pyogrio",
-        #     layer_options={"ID_FIELD": "id"}
-        # )
-        # gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON", engine="pyogrio", id_generate=True)
-
+        print(gps_gdf)

-    # print(fiona.supported_drivers)  # {'DXF': 'rw', 'CSV': 'raw', 'OpenFileGDB': 'raw', 'ESRIJSON': 'r', 'ESRI Shapefile': 'raw', 'FlatGeobuf': 'raw', 'GeoJSON': 'raw', 'GeoJSONSeq': 'raw', 'GPKG': 'raw', 'GML': 'rw', 'OGR_GMT': 'rw', 'GPX': 'rw', 'MapInfo File': 'raw', 'DGN': 'raw', 'S57': 'r', 'SQLite': 'raw', 'TopoJSON': 'r'}
-    # gps_gdf.to_file('dataframe.shp', crs="EPSG:4326", engine="fiona")
-    # Convert geojson feature collection to pmtiles
-    # gps_gdf.to_file("dataframe.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona")
-    # print("done")
-    # ---Export Shapefile--- #
+    #######################################################
+    def create_collection_geojson(self):
+        cruises = [
+            "HB0706",
+            "HB0707",
+            "HB0710",
+            "HB0711",
+            "HB0802",
+            "HB0803",
+            "HB0805",
+            "HB0806",
+            "HB0807",
+            "HB0901",
+            "HB0902",
+            "HB0903",
+            "HB0904",
+            "HB0905",
+            "HB1002",
+            "HB1006",
+            "HB1102",
+            "HB1103",
+            "HB1105",
+            "HB1201",
+            "HB1206",
+            "HB1301",
+            "HB1303",
+            "HB1304",
+            "HB1401",
+            "HB1402",
+            "HB1403",
+            "HB1405",
+            "HB1501",
+            "HB1502",
+            "HB1503",
+            "HB1506",
+            "HB1507",
+            "HB1601",
+            "HB1603",
+            "HB1604",
+            "HB1701",
+            "HB1702",
+            "HB1801",
+            "HB1802",
+            "HB1803",
+            "HB1804",
+            "HB1805",
+            "HB1806",
+            "HB1901",
+            "HB1902",
+            "HB1903",
+            "HB1904",
+            "HB1906",
+            "HB1907",
+            "HB2001",
+            "HB2006",
+            "HB2007",
+            "HB20ORT",
+            "HB20TR",
+        ]
+        completed_cruises = self.check_all_cruises(
+            bucket_name=bucket_name, cruises=cruises
+        )  # TODO: threadpool this
+        ### create linestring ###
+        geometries = []
+        for jjj, completed_cruise in enumerate(
+            completed_cruises
+        ):  # TODO: threadpool this
+            geometries.append(
+                self.get_cruise_geometry(cruise_name=completed_cruise, index=jjj)
+            )
+        #
+        self.aggregate_geojson_into_dataframe(geoms=geometries)
+        #
+        print(
+            'Now run this: "tippecanoe --no-feature-limit -zg -o dataset.pmtiles -l cruises dataset.geojson --force"'
+        )
+        # # water-column-sonar-id.pmtiles
+        # linux command: "tippecanoe --no-feature-limit -zg -o water-column-sonar-id.pmtiles -l cruises dataset.geojson --force"
+        # note: 'cruises' is the name of the layer
+        # size is ~3.3 MB for the pmtiles
+        # then drag-and-drop here: https://pmtiles.io/#map=6.79/39.802/-71.51

+    #######################################################
+    # TODO: copy the .pmtiles file to the s3 bucket "noaa-wcsd-pds-index"
+    #######################################################

-    # gps_gdf.set_geometry(col='geometry', inplace=True)
-    # gps_gdf.__geo_interface__
-    # gps_gdf.set_index('id', inplace=True)
-    # gps_gdf.to_file(f"dataframe3.geojson", driver="GeoJSON", crs="EPSG:4326", engine="fiona", index=True)
+    #######################################################
+    # TODO: get threadpool working
+    # def open_zarr_stores_with_thread_pool_executor(
+    #     self,
+    #     cruises: list,
+    # ):
+    #     # 'cruises' is a list of cruises to process
+    #     completed_cruises = []
+    #     try:
+    #         with ThreadPoolExecutor(max_workers=32) as executor:
+    #             futures = [
+    #                 executor.submit(
+    #                     self.get_geospatial_info_from_zarr_store,
+    #                     "Henry_B._Bigelow",  # ship_name
+    #                     cruise,  # cruise_name
+    #                 )
+    #                 for cruise in cruises
+    #             ]
+    #             for future in as_completed(futures):
+    #                 result = future.result()
+    #                 if result:
+    #                     completed_cruises.extend([result])
+    #     except Exception as err:
+    #         raise RuntimeError(f"Problem, {err}")
+    #     print("Done opening zarr stores using thread pool.")
+    #     return completed_cruises  # Took ~12 minutes

-    ### this gives the right layer id values
-    # gps_gdf.to_file(f"dataframe6.geojson", driver="GeoJSON", engine="pyogrio", layer_options={"ID_GENERATE": "YES"})
-    # jq '{"type": "FeatureCollection", "features": [.[] | .features[]]}' --slurp input*.geojson > output.geojson
-    # tippecanoe -zg --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises output.geojson
-    # tippecanoe -zg --convert-stringified-ids-to-numbers --projection=EPSG:4326 -o water-column-sonar-id.pmtiles -l cruises dataframe*.geojson
-    # {
-    #     "type": "FeatureCollection",
-    #     "name": "dataframe5",
-    #     "features": [
-    #         { "type": "Feature", "id": 0, "properties": { "id": 0, "ship": "Henry_B._Bigelow", "cruise": "HB0706", "sensor": "EK60" }, "geometry": { "type": "LineString", "coordinates": [ [ -72.120498657226562, 39.659671783447266 ], [ -72.120773315429688, 39.660198211669922 ] ] } },
-    #         { "type": "Feature", "id": 1, "properties": { "id": 1, "ship": "Henry_B._Bigelow", "cruise": "HB0707", "sensor": "EK60" }, "geometry": { "type": "LineString", "coordinates": [ [ -71.797836303710938, 41.003166198730469 ], [ -71.797996520996094, 41.002998352050781 ], [ -71.798583984375, 41.002994537353516 ] ] } },
-    #         { "type": "Feature", "id": 2, "properties": { "id": 2, "ship": "Henry_B._Bigelow", "cruise": "HB0710", "sensor": "EK60" }, "geometry": { "type": "LineString", "coordinates": [ [ -72.489486694335938, 40.331901550292969 ], [ -72.490760803222656, 40.33099365234375 ] ] } }
-    #     ]
-    # }
+    #######################################################

-    # # https://docs.protomaps.com/pmtiles/create
-    # #ogr2ogr -t_srs EPSG:4326 data.geojson dataframe.shp
-    # # Only need to do the second one here...
-    # tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises dataframe.geojson
-    # tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
-    # # used this to combine all the geojson files into single pmtile file (2024-12-03):
-    # tippecanoe -zg --projection=EPSG:4326 -o data.pmtiles -l cruises --coalesce-densest-as-needed --extend-zooms-if-still-dropping dataframe*.geojson
-    #
-    # TODO:
-    # run each one of the cruises in a separate ospool workflow.
-    # each process gets own store


 ###########################################################
-
-# s3_manager = S3Manager()  # endpoint_url=endpoint_url)
-# # s3fs_manager = S3FSManager()
-# # input_bucket_name = "test_input_bucket"
-# # s3_manager.create_bucket(bucket_name=input_bucket_name)
-# ship_name = "Henry_B._Bigelow"
-# cruise_name = "HB0706"
-# sensor_name = "EK60"
-#
-# # ---Scan Bucket For All Zarr Stores--- #
-# # https://noaa-wcsd-zarr-pds.s3.amazonaws.com/index.html#level_2/Henry_B._Bigelow/HB0706/EK60/HB0706.zarr/
-# path_to_zarr_store = f"s3://noaa-wcsd-zarr-pds/level_2/Henry_B._Bigelow/HB0706/EK60/HB0706.zarr"
-# s3 = s3fs.S3FileSystem()
-# zarr_store = s3fs.S3Map(path_to_zarr_store, s3=s3)
-# ds_zarr = xr.open_zarr(zarr_store, consolidated=None)
-# print(ds_zarr.Sv.shape)
-
-
-# total = [246847, 89911, 169763, 658047, 887640, 708771, 187099, 3672813, 4095002, 763268, 162727, 189454, 1925270, 3575857, 1031920, 1167590, 3737415, 4099957, 3990725, 3619996, 3573052, 2973090, 55851, 143192, 1550164, 3692819, 668400, 489735, 393260, 1311234, 242989, 4515760, 1303091, 704663, 270645, 3886437, 4204381, 1062090, 428639, 541455, 4206506, 298561, 1279329, 137416, 139836, 228947, 517949]
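
The rewritten pipeline (check_all_cruises -> get_cruise_geometry -> aggregate_geojson_into_dataframe -> tippecanoe) reads the public level_2 Zarr stores anonymously with xarray. A minimal standalone sketch of that read pattern for a single cruise, assuming s3fs and a zarr backend are installed (HB0706 is one of the cruises listed in create_collection_geojson):

import numpy as np
import xarray as xr

# bucket layout as used throughout the new pmtile_generation.py code
bucket_name = "noaa-wcsd-zarr-pds"
ship_name = "Henry_B._Bigelow"
sensor_name = "EK60"
cruise_name = "HB0706"

store = f"s3://{bucket_name}/level_2/{ship_name}/{cruise_name}/{sensor_name}/{cruise_name}.zarr"
cruise = xr.open_dataset(
    filename_or_obj=store,
    engine="zarr",
    storage_options={"anon": True},  # public bucket, no credentials required
)
print(cruise.Sv.shape)  # indexed as (height, width, depth) in check_all_cruises
lats = cruise.latitude.to_numpy()
# fraction of pings with a GPS fix, as computed by check_all_cruises
print(np.count_nonzero(~np.isnan(lats)) / lats.size)
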