water-column-sonar-processing 25.11.1__py3-none-any.whl → 26.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of water-column-sonar-processing might be problematic.

Files changed (22)
  1. water_column_sonar_processing/aws/s3_manager.py +2 -4
  2. water_column_sonar_processing/aws/s3fs_manager.py +1 -9
  3. water_column_sonar_processing/cruise/create_empty_zarr_store.py +19 -81
  4. water_column_sonar_processing/cruise/resample_regrid.py +88 -104
  5. water_column_sonar_processing/geometry/__init__.py +2 -0
  6. water_column_sonar_processing/geometry/elevation_manager.py +2 -2
  7. water_column_sonar_processing/geometry/geometry_manager.py +11 -13
  8. water_column_sonar_processing/geometry/line_simplification.py +10 -10
  9. water_column_sonar_processing/geometry/pmtile_generation.py +8 -3
  10. water_column_sonar_processing/geometry/spatiotemporal.py +106 -0
  11. water_column_sonar_processing/index/index_manager.py +43 -46
  12. water_column_sonar_processing/model/zarr_manager.py +533 -514
  13. water_column_sonar_processing/processing/raw_to_zarr.py +45 -139
  14. water_column_sonar_processing/utility/cleaner.py +2 -1
  15. water_column_sonar_processing/utility/constants.py +29 -29
  16. water_column_sonar_processing-26.1.14.dist-info/METADATA +240 -0
  17. {water_column_sonar_processing-25.11.1.dist-info → water_column_sonar_processing-26.1.14.dist-info}/RECORD +20 -20
  18. water_column_sonar_processing/process.py +0 -149
  19. water_column_sonar_processing-25.11.1.dist-info/METADATA +0 -182
  20. {water_column_sonar_processing-25.11.1.dist-info → water_column_sonar_processing-26.1.14.dist-info}/WHEEL +0 -0
  21. {water_column_sonar_processing-25.11.1.dist-info → water_column_sonar_processing-26.1.14.dist-info}/licenses/LICENSE +0 -0
  22. {water_column_sonar_processing-25.11.1.dist-info → water_column_sonar_processing-26.1.14.dist-info}/top_level.txt +0 -0
water_column_sonar_processing/geometry/geometry_manager.py

@@ -1,4 +1,3 @@
-import os
 from pathlib import Path
 
 import geopandas
@@ -8,6 +7,7 @@ import pandas as pd
 from water_column_sonar_processing.aws import S3Manager
 from water_column_sonar_processing.utility import Cleaner
 
+
 # // [Decimal / Places / Degrees / Object that can be recognized at scale / N/S or E/W at equator, E/W at 23N/S, E/W at 45N/S, E/W at 67N/S]
 # // 0 1.0 1° 00′ 0″ country or large region 111.32 km 102.47 km 78.71 km 43.496 km
 # // 1 0.1 0° 06′ 0″ large city or district 11.132 km 10.247 km 7.871 km 4.3496 km
@@ -24,7 +24,7 @@ class GeometryManager:
     def __init__(
         self,
     ):
-        self.DECIMAL_PRECISION = 5  # precision for GPS coordinates
+        self.DECIMAL_PRECISION = 6  # precision for GPS coordinates
         self.SIMPLIFICATION_TOLERANCE = 0.0001  # RDP simplification to "street level"
 
     #######################################################
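Per the decimal-degrees table above, bumping DECIMAL_PRECISION from 5 to 6 tightens the nominal coordinate resolution from roughly 1.1 m to roughly 0.11 m at the equator. A minimal illustration of the rounding, with a made-up fix:

import numpy as np

longitude = -70.6789123456  # hypothetical GPS fix
print(np.round(longitude, 5))  # -70.67891  (~1.1 m resolution at the equator)
print(np.round(longitude, 6))  # -70.678912 (~0.11 m resolution at the equator)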
@@ -44,12 +44,10 @@ class GeometryManager:
 
         print("Getting GPS dataset from echopype object.")
         try:
-            latitude = np.round(
-                echodata.platform.latitude.values, self.DECIMAL_PRECISION
-            )
-            longitude = np.round(
-                echodata.platform.longitude.values, self.DECIMAL_PRECISION
-            )
+            latitude = (
+                echodata.platform.latitude.values
+            )  # TODO: DONT get values from here!
+            longitude = echodata.platform.longitude.values
 
             # RE: time coordinates: https://github.com/OSOceanAcoustics/echopype/issues/656#issue-1219104771
             # 'nmea_times' are times from the nmea datalogger associated with GPS
@@ -192,8 +190,8 @@ class GeometryManager:
         # can include np.nan values?
 
     #######################################################
+    @staticmethod
     def read_s3_geo_json(
-        self,
         ship_name,
         cruise_name,
         sensor_name,
@@ -234,10 +232,10 @@
 
     ############################################################################
     # COMES from the raw-to-zarr conversion
-    def __write_geojson_to_file(self, store_name, data) -> None:
-        print("Writing GeoJSON to file.")
-        with open(os.path.join(store_name, "geo.json"), "w") as outfile:
-            outfile.write(data)
+    # def __write_geojson_to_file(self, store_name, data) -> None:
+    #     print("Writing GeoJSON to file.")
+    #     with open(os.path.join(store_name, "geo.json"), "w") as outfile:
+    #         outfile.write(data)
 
 
 ###########################################################
water_column_sonar_processing/geometry/line_simplification.py

@@ -71,11 +71,11 @@ class LineSimplification:
         pass
 
     #######################################################
+    @staticmethod
     def kalman_filter(
-        self,
         longitudes,
         latitudes,
-    ) -> (np.ndarray, np.ndarray):
+    ):
        """
        # TODO: need to use masked array to get the right number of values
        """
@@ -102,8 +102,8 @@
         return smoothed_state_means[:, [0, 2]]
 
     #######################################################
+    @staticmethod
     def get_speeds(
-        self,
         times: np.ndarray,  # don't really need time, do need to segment the dataset first
         latitudes: np.ndarray,
         longitudes: np.ndarray,
@@ -136,13 +136,13 @@
         # returns the speed in meters per second #TODO: get speed in knots
         return speed_meters_per_second.to_numpy(dtype="float32")  # includes nan
 
-    def remove_null_island_values(
-        self,
-        epsilon=1e-5,
-    ) -> None:
-        # TODO: low priority
-        print(epsilon)
-        pass
+    # def remove_null_island_values(
+    #     self,
+    #     epsilon=1e-5,
+    # ) -> None:
+    #     # TODO: low priority
+    #     print(epsilon)
+    #     pass
 
     def break_linestring_into_multi_linestring(
         self,
water_column_sonar_processing/geometry/pmtile_generation.py

@@ -36,17 +36,20 @@ class PMTileGeneration(object):
         self.sensor_name = "EK60"
 
     #######################################################
-    def check_all_cruises(self, bucket_name, cruises):
+    @staticmethod
+    def check_all_cruises(bucket_name, cruises):
         completed = []
         for cruise_name in cruises:
             print(cruise_name)
             try:
                 zarr_store = f"{cruise_name}.zarr"
                 s3_zarr_store_path = f"{bucket_name}/level_2/{ship_name}/{cruise_name}/{sensor_name}/{zarr_store}"
+                kwargs = {"consolidated": False}
                 cruise = xr.open_dataset(
                     filename_or_obj=f"s3://{s3_zarr_store_path}",
                     engine="zarr",
                     storage_options={"anon": True},
+                    **kwargs,
                 )
                 width = cruise.Sv.shape[1]
                 height = cruise.Sv.shape[0]
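The new kwargs plumbing passes consolidated=False through to xarray's Zarr backend, which skips the consolidated-metadata (.zmetadata) lookup and the warning it otherwise emits on stores written without it. A sketch of the equivalent standalone call; the bucket and cruise names below are illustrative, not taken from the diff:

import xarray as xr

# Illustrative path; check_all_cruises assembles this from bucket_name/ship/cruise/sensor.
s3_zarr_store_path = "noaa-wcsd-zarr-pds/level_2/Henry_B._Bigelow/HB0707/EK60/HB0707.zarr"
cruise = xr.open_dataset(
    filename_or_obj=f"s3://{s3_zarr_store_path}",
    engine="zarr",
    storage_options={"anon": True},  # public-read bucket, no credentials needed
    consolidated=False,  # read per-array metadata instead of .zmetadata
)
height, width = cruise.Sv.shape[0], cruise.Sv.shape[1]  # as used for tile dimensions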
@@ -67,7 +70,8 @@ class PMTileGeneration(object):
         return completed
 
     #######################################################
-    def get_cruise_geometry(self, cruise_name, index):
+    @staticmethod
+    def get_cruise_geometry(cruise_name, index):
         print(cruise_name)
         try:
             pieces = []
@@ -117,7 +121,8 @@ class PMTileGeneration(object):
             raise RuntimeError(f"Problem parsing Zarr stores, {err}")
 
     #######################################################
-    def aggregate_geojson_into_dataframe(self, geoms):
+    @staticmethod
+    def aggregate_geojson_into_dataframe(geoms):
         gps_gdf = gpd.GeoDataFrame(
             columns=["id", "ship", "cruise", "sensor", "geometry"],
             geometry="geometry",
water_column_sonar_processing/geometry/spatiotemporal.py (new file)

@@ -0,0 +1,106 @@
+import geopandas as gpd
+import numpy as np
+import pandas as pd
+from shapely.geometry import Point
+
+from water_column_sonar_processing.model import ZarrManager
+
+
+# Convert "meters per second" to "knots"
+# meters_per_second_to_knots = lambda mps_value: mps_value * 1.94384
+
+
+class Spatiotemporal:
+    #######################################################
+    def __init__(
+        self,
+    ):
+        self.NANOSECONDS_PER_SECOND = 1e9
+        self.CUTOFF_DISTANCE_METERS = 50.0
+        self.CUTOFF_TIME_SECONDS = 10.0
+
+    #######################################################
+    @staticmethod
+    def meters_per_second_to_knots(
+        mps_value,
+    ):
+        return mps_value * 1.94384
+
+    #######################################################
+    def compute_speed_and_distance(
+        self,
+        times_ns,  #: np.ndarray[tuple[int], np.dtype[np.int64]],
+        latitudes,  #: np.ndarray,
+        longitudes,  #: np.ndarray,
+    ) -> pd.DataFrame:
+        try:
+            # fix times
+            times = np.array([np.datetime64(int(i), "ns") for i in times_ns])
+            geom = [Point(xy) for xy in zip(longitudes, latitudes)]
+            points_df = gpd.GeoDataFrame({"geometry": geom}, crs="EPSG:4326")
+            # Conversion to a rectilinear projection coordinate system where distance can be calculated with pythagorean theorem
+            # EPSG:4087, WGS 84 / World Equidistant Cylindrical
+            # https://epsg.io/4087
+            points_df.to_crs(epsg=4087, inplace=True)
+            distance_diffs = points_df.distance(points_df.geometry.shift())
+            distance_diffs[0] = distance_diffs[1]  # missing first datapoint, backfill
+            # Issue: np.max(distance_diffs) = 3397 meters
+            time_diffs_ns = np.append(0, (times[1:] - times[:-1]).astype(int))
+            time_diffs_ns[0] = time_diffs_ns[1]  # missing first datapoint, backfill
+            time_diffs_seconds = time_diffs_ns / self.NANOSECONDS_PER_SECOND
+            # Calculate the speed in knots
+            speed_meters_per_second = np.array(
+                (distance_diffs / time_diffs_ns * self.NANOSECONDS_PER_SECOND),
+                dtype=np.float32,
+            )
+            knots = self.meters_per_second_to_knots(speed_meters_per_second)
+            metrics_df = pd.DataFrame(
+                {
+                    "speed_knots": knots.astype(dtype=np.float32),
+                    "distance_meters": distance_diffs.to_numpy(dtype=np.float32),
+                    "diff_seconds": time_diffs_seconds.astype(np.float32),
+                },
+                index=times,
+            )
+            #
+            return metrics_df
+        except Exception as err:
+            raise RuntimeError(f"Exception encountered, {err}")
+
+    #######################################################
+    def add_speed_and_distance(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+        bucket_name,
+        endpoint_url=None,
+    ) -> None:
+        try:
+            zarr_manager = ZarrManager()
+            zarr_store = zarr_manager.open_s3_zarr_store_with_zarr(
+                ship_name=ship_name,
+                cruise_name=cruise_name,
+                sensor_name=sensor_name,
+                output_bucket_name=bucket_name,
+                endpoint_url=endpoint_url,
+            )
+            longitudes = zarr_store["longitude"][:]
+            latitudes = zarr_store["latitude"][:]
+            times = zarr_store["time"][:]
+            #
+            metrics_df = self.compute_speed_and_distance(
+                times_ns=times,
+                latitudes=latitudes,
+                longitudes=longitudes,
+            )
+            # Write the speed and distance to the output zarr store
+            zarr_store["speed"][:] = metrics_df.speed_knots.values
+            zarr_store["distance"][:] = metrics_df.distance_meters.values
+        except Exception as err:
+            raise RuntimeError(
+                f"Exception encountered writing the speed and distance, {err}"
+            )
+
+
+###########################################################
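compute_speed_and_distance reprojects the EPSG:4326 fixes into EPSG:4087 (World Equidistant Cylindrical, meters), so consecutive-point distances come straight from planar geometry; dividing by the time deltas and multiplying by 1.94384 yields knots. A minimal usage sketch with synthetic data, assuming the new Spatiotemporal class is exported from water_column_sonar_processing.geometry (the two-line __init__.py change above suggests it is):

import numpy as np

from water_column_sonar_processing.geometry import Spatiotemporal  # assumed export

# One GPS fix per second, stepping ~11.1 m north each time (0.0001 deg latitude).
times_ns = np.arange(4, dtype=np.int64) * 1_000_000_000  # int64 nanoseconds since epoch
latitudes = 41.0 + 0.0001 * np.arange(4)
longitudes = np.full(4, -70.0)

spatiotemporal = Spatiotemporal()
metrics_df = spatiotemporal.compute_speed_and_distance(
    times_ns=times_ns,
    latitudes=latitudes,
    longitudes=longitudes,
)
# Expect distance_meters ~11.1 and speed_knots ~21.6 (11.1 m/s * 1.94384) per row.
print(metrics_df)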
water_column_sonar_processing/index/index_manager.py

@@ -2,10 +2,8 @@ import os
 import re
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
-from hashlib import sha256
 
-import networkx as nx
-import numpy as np
+# import networkx as nx
 import pandas as pd
 
 from water_column_sonar_processing.aws import S3Manager
@@ -120,6 +118,7 @@ class IndexManager:
         for res in page_iterator:
             if "Key" in res:
                 return res["Key"]
+        return None
         # else raise exception?
 
     # DSJ0604-D20060406-T050022.bot 2kB == 2152 'Size'
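The added return None makes the no-match fall-through explicit when pagination is exhausted without finding a key. For reference, a minimal sketch of the boto3 pagination pattern the method relies on; the bucket and prefix are illustrative, and the JMESPath .search() filter is one way (an assumption, not necessarily the package's) to get per-object results that carry a "Key":

import boto3

s3_client = boto3.client("s3")
paginator = s3_client.get_paginator("list_objects_v2")
page_iterator = paginator.paginate(
    Bucket="noaa-wcsd-pds",  # illustrative; IndexManager uses its input_bucket_name
    Prefix="data/raw/Henry_B._Bigelow/HB0707/",
)
# .search() flattens pages into per-object results, each a dict with a "Key".
for res in page_iterator.search("Contents[?contains(Key, '.bot')]"):
    if res is not None and "Key" in res:
        print(res["Key"])
        break
else:
    print(None)  # mirrors the new explicit `return None` fall-through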
@@ -190,9 +189,8 @@ class IndexManager:
         return files_list
 
     #################################################################
-    def get_subset_ek60_prefix(
-        self, df: pd.DataFrame
-    ) -> pd.DataFrame:  # TODO: is this used?
+    @staticmethod
+    def get_subset_ek60_prefix(df: pd.DataFrame) -> pd.DataFrame:  # TODO: is this used?
         # Returns all objects with 'EK60' in prefix of file path
         # Note that this can include 'EK80' dataset that are false-positives
         # in dataframe with ['key', 'filename', 'ship', 'cruise', 'sensor', 'size', 'date', 'datagram']
@@ -274,9 +272,8 @@ class IndexManager:
         return all_datagrams
 
     #################################################################
-    def get_ek60_objects(
-        self, df: pd.DataFrame, subset_datagrams: list
-    ) -> pd.DataFrame:
+    @staticmethod
+    def get_ek60_objects(df: pd.DataFrame, subset_datagrams: list) -> pd.DataFrame:
         # for each key write datagram value to all other files in same cruise
         for subset_datagram in subset_datagrams:
             if subset_datagram["DATAGRAM"] == "CON0":
@@ -345,40 +342,40 @@
         # print(end_time)
 
     # TODO: wip
-    def build_merkle_tree(self):
-        G = nx.DiGraph()
-        # https://noaa-wcsd-pds.s3.amazonaws.com/index.html#data/raw/Henry_B._Bigelow/HB0707/
-        ship_name = "Henry_B._Bigelow"
-        cruise_name = "HB0707"
-        # cruise_name = "HB0805"
-        prefix = f"data/raw/{ship_name}/{cruise_name}/"
-        # prefix = f"data/raw/{ship_name}/"
-        page_iterator = self.s3_manager.paginator.paginate(
-            Bucket=self.input_bucket_name,
-            Prefix=prefix,
-        )
-        for page in page_iterator:
-            for contents in page["Contents"]:
-                obj_key = contents["Key"]
-                # https://datatracker.ietf.org/doc/html/rfc7232#section-2.3
-                obj_etag = contents["ETag"].split('"')[1]  # properties
-                obj_size = contents["Size"]
-                basename = os.path.basename(obj_key)
-                G.add_node(
-                    node_for_adding=basename, ETag=obj_etag, Size=obj_size, Key=obj_key
-                )  # TODO: add parent hash
-                split_path = os.path.normpath(obj_key).split(os.path.sep)
-                # split_path: ['dataset', 'raw', 'Henry_B._Bigelow', 'HB0707', 'EK60', 'D20070712-T004447.raw']
-                for previous, current in zip(split_path, split_path[1:]):
-                    if not G.has_edge(previous, current):
-                        G.add_edge(previous, current)
-        # print(G)
-        etag_set = frozenset(
-            [k for j, k in list(G.nodes.data("ETag")) if k is not None]
-        )
-        new_hash = sha256(str(etag_set.__hash__()).encode("utf-8")).hexdigest()
-        total_size = [k for j, k in list(G.nodes.data("Size")) if k is not None]
-        print(np.sum(total_size))  # 22.24 Terabytes in Henry_B._Bigelow cruises
-        print(" ")
-        print(new_hash)
-        return new_hash
+    # def build_merkle_tree(self):
+    #     G = nx.DiGraph()
+    #     # https://noaa-wcsd-pds.s3.amazonaws.com/index.html#data/raw/Henry_B._Bigelow/HB0707/
+    #     ship_name = "Henry_B._Bigelow"
+    #     cruise_name = "HB0707"
+    #     # cruise_name = "HB0805"
+    #     prefix = f"data/raw/{ship_name}/{cruise_name}/"
+    #     # prefix = f"data/raw/{ship_name}/"
+    #     page_iterator = self.s3_manager.paginator.paginate(
+    #         Bucket=self.input_bucket_name,
+    #         Prefix=prefix,
+    #     )
+    #     for page in page_iterator:
+    #         for contents in page["Contents"]:
+    #             obj_key = contents["Key"]
+    #             # https://datatracker.ietf.org/doc/html/rfc7232#section-2.3
+    #             obj_etag = contents["ETag"].split('"')[1]  # properties
+    #             obj_size = contents["Size"]
+    #             basename = os.path.basename(obj_key)
+    #             G.add_node(
+    #                 node_for_adding=basename, ETag=obj_etag, Size=obj_size, Key=obj_key
+    #             )  # TODO: add parent hash
+    #             split_path = os.path.normpath(obj_key).split(os.path.sep)
+    #             # split_path: ['dataset', 'raw', 'Henry_B._Bigelow', 'HB0707', 'EK60', 'D20070712-T004447.raw']
+    #             for previous, current in zip(split_path, split_path[1:]):
+    #                 if not G.has_edge(previous, current):
+    #                     G.add_edge(previous, current)
+    #     # print(G)
+    #     etag_set = frozenset(
+    #         [k for j, k in list(G.nodes.data("ETag")) if k is not None]
+    #     )
+    #     new_hash = sha256(str(etag_set.__hash__()).encode("utf-8")).hexdigest()
+    #     total_size = [k for j, k in list(G.nodes.data("Size")) if k is not None]
+    #     print(np.sum(total_size))  # 22.24 Terabytes in Henry_B._Bigelow cruises
+    #     print(" ")
+    #     print(new_hash)
+    #     return new_hash
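One note on the retired implementation: sha256(str(etag_set.__hash__())...) derives the digest from the hash of a frozenset of strings, and Python salts string hashing per process (PYTHONHASHSEED), so the fingerprint was not reproducible across runs. A sketch of a stable alternative over the same ETag inputs; the ETag values here are made up:

from hashlib import sha256

# Illustrative ETags as returned by S3 (surrounding quotes stripped, per RFC 7232).
etags = [
    "9bb58f26192e4ba00f01e2e7b136bbd8",
    "5d41402abc4b2a76b9719d911017c592",
]
# Sorting gives order-independence without relying on hash randomization,
# so the same set of objects always yields the same fingerprint.
fingerprint = sha256("".join(sorted(etags)).encode("utf-8")).hexdigest()
print(fingerprint)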