water-column-sonar-processing 0.0.1__py3-none-any.whl → 25.11.1__py3-none-any.whl
- water_column_sonar_processing/__init__.py +13 -0
- water_column_sonar_processing/aws/__init__.py +7 -0
- water_column_sonar_processing/aws/dynamodb_manager.py +355 -0
- water_column_sonar_processing/aws/s3_manager.py +420 -0
- water_column_sonar_processing/aws/s3fs_manager.py +72 -0
- {model → water_column_sonar_processing}/aws/sns_manager.py +10 -21
- {model → water_column_sonar_processing}/aws/sqs_manager.py +11 -19
- water_column_sonar_processing/cruise/__init__.py +4 -0
- water_column_sonar_processing/cruise/create_empty_zarr_store.py +191 -0
- water_column_sonar_processing/cruise/datatree_manager.py +21 -0
- water_column_sonar_processing/cruise/resample_regrid.py +339 -0
- water_column_sonar_processing/geometry/__init__.py +11 -0
- water_column_sonar_processing/geometry/elevation_manager.py +111 -0
- water_column_sonar_processing/geometry/geometry_manager.py +243 -0
- water_column_sonar_processing/geometry/line_simplification.py +176 -0
- water_column_sonar_processing/geometry/pmtile_generation.py +261 -0
- water_column_sonar_processing/index/__init__.py +3 -0
- water_column_sonar_processing/index/index_manager.py +384 -0
- water_column_sonar_processing/model/__init__.py +3 -0
- water_column_sonar_processing/model/zarr_manager.py +722 -0
- water_column_sonar_processing/process.py +149 -0
- water_column_sonar_processing/processing/__init__.py +4 -0
- water_column_sonar_processing/processing/raw_to_netcdf.py +320 -0
- water_column_sonar_processing/processing/raw_to_zarr.py +425 -0
- water_column_sonar_processing/utility/__init__.py +13 -0
- {model → water_column_sonar_processing}/utility/cleaner.py +7 -8
- water_column_sonar_processing/utility/constants.py +118 -0
- {model → water_column_sonar_processing}/utility/pipeline_status.py +47 -24
- water_column_sonar_processing/utility/timestamp.py +12 -0
- water_column_sonar_processing-25.11.1.dist-info/METADATA +182 -0
- water_column_sonar_processing-25.11.1.dist-info/RECORD +34 -0
- {water_column_sonar_processing-0.0.1.dist-info → water_column_sonar_processing-25.11.1.dist-info}/WHEEL +1 -1
- {water_column_sonar_processing-0.0.1.dist-info → water_column_sonar_processing-25.11.1.dist-info/licenses}/LICENSE +1 -1
- water_column_sonar_processing-25.11.1.dist-info/top_level.txt +1 -0
- __init__.py +0 -0
- model/__init__.py +0 -0
- model/aws/__init__.py +0 -0
- model/aws/dynamodb_manager.py +0 -149
- model/aws/s3_manager.py +0 -356
- model/aws/s3fs_manager.py +0 -74
- model/cruise/__init__.py +0 -0
- model/cruise/create_empty_zarr_store.py +0 -166
- model/cruise/resample_regrid.py +0 -248
- model/geospatial/__init__.py +0 -0
- model/geospatial/geometry_manager.py +0 -194
- model/geospatial/geometry_simplification.py +0 -81
- model/geospatial/pmtile_generation.py +0 -74
- model/index/__init__.py +0 -0
- model/index/index.py +0 -228
- model/model.py +0 -138
- model/utility/__init__.py +0 -0
- model/utility/constants.py +0 -56
- model/utility/timestamp.py +0 -12
- model/zarr/__init__.py +0 -0
- model/zarr/bar.py +0 -28
- model/zarr/foo.py +0 -11
- model/zarr/zarr_manager.py +0 -298
- water_column_sonar_processing-0.0.1.dist-info/METADATA +0 -89
- water_column_sonar_processing-0.0.1.dist-info/RECORD +0 -32
- water_column_sonar_processing-0.0.1.dist-info/top_level.txt +0 -2
water_column_sonar_processing/geometry/geometry_manager.py

@@ -0,0 +1,243 @@
+import os
+from pathlib import Path
+
+import geopandas
+import numpy as np
+import pandas as pd
+
+from water_column_sonar_processing.aws import S3Manager
+from water_column_sonar_processing.utility import Cleaner
+
+# // [Decimal / Places / Degrees / Object that can be recognized at scale / N/S or E/W at equator, E/W at 23N/S, E/W at 45N/S, E/W at 67N/S]
+# // 0 1.0 1° 00′ 0″ country or large region 111.32 km 102.47 km 78.71 km 43.496 km
+# // 1 0.1 0° 06′ 0″ large city or district 11.132 km 10.247 km 7.871 km 4.3496 km
+# // 2 0.01 0° 00′ 36″ town or village 1.1132 km 1.0247 km 787.1 m 434.96 m
+# // 3 0.001 0° 00′ 3.6″ neighborhood, street 111.32 m 102.47 m 78.71 m 43.496 m
+# // 4 0.0001 0° 00′ 0.36″ individual street, land parcel 11.132 m 10.247 m 7.871 m 4.3496 m
+# // 5 0.00001 0° 00′ 0.036″ individual trees, door entrance 1.1132 m 1.0247 m 787.1 mm 434.96 mm
+# // 6 0.000001 0° 00′ 0.0036″ individual humans 111.32 mm 102.47 mm 78.71 mm 43.496 mm
+# // 7 0.0000001 0° 00′ 0.00036″ practical limit of commercial surveying 11.132 mm 10.247 mm 7.871 mm 4.3496 mm
+
+
+class GeometryManager:
+    #######################################################
+    def __init__(
+        self,
+    ):
+        self.DECIMAL_PRECISION = 5  # precision for GPS coordinates
+        self.SIMPLIFICATION_TOLERANCE = 0.0001  # RDP simplification to "street level"
+
+    #######################################################
+    def read_echodata_gps_data(
+        self,
+        echodata,
+        output_bucket_name,
+        ship_name,
+        cruise_name,
+        sensor_name,
+        file_name,
+        endpoint_url=None,
+        write_geojson=True,
+    ) -> tuple:
+        file_name_stem = Path(file_name).stem
+        geo_json_name = f"{file_name_stem}.json"
+
+        print("Getting GPS dataset from echopype object.")
+        try:
+            latitude = np.round(
+                echodata.platform.latitude.values, self.DECIMAL_PRECISION
+            )
+            longitude = np.round(
+                echodata.platform.longitude.values, self.DECIMAL_PRECISION
+            )
+
+            # RE: time coordinates: https://github.com/OSOceanAcoustics/echopype/issues/656#issue-1219104771
+            # 'nmea_times' are times from the nmea datalogger associated with GPS
+            # note that nmea_times, unlike time1, can be sorted
+            nmea_times = np.sort(echodata.platform.time1.values)
+
+            # 'time1' are times from the echosounder associated with the dataset of the transducer measurement
+            time1 = echodata.environment.time1.values
+
+            if len(nmea_times) < len(time1):
+                raise Exception(
+                    "Problem: Not enough NMEA times available to extrapolate time1."
+                )  # TODO: explore this logic further...
+
+            # Align 'sv_times' to 'nmea_times'
+            if not (
+                np.all(time1[:-1] <= time1[1:])
+                and np.all(nmea_times[:-1] <= nmea_times[1:])
+            ):
+                raise Exception("Problem: NMEA times are not sorted.")
+
+            # Finds the indices where 'v' can be inserted just to the right of 'a'
+            indices = np.searchsorted(a=nmea_times, v=time1, side="right") - 1
+            lat = latitude[indices]
+            lat[indices < 0] = np.nan  # values recorded before indexing are set to nan
+            lon = longitude[indices]
+            lon[indices < 0] = np.nan
+
+            if not (
+                np.all(lat[~np.isnan(lat)] >= -90.0)
+                and np.all(lat[~np.isnan(lat)] <= 90.0)
+                and np.all(lon[~np.isnan(lon)] >= -180.0)
+                and np.all(lon[~np.isnan(lon)] <= 180.0)
+            ):
+                raise Exception("Problem: GPS Data falls outside allowed bounds.")
+
+            # check for visits to null island
+            null_island_indices = list(
+                set.intersection(
+                    set(np.where(np.abs(lat) < 1e-3)[0]),
+                    set(np.where(np.abs(lon) < 1e-3)[0]),
+                )
+            )
+            lat[null_island_indices] = np.nan
+            lon[null_island_indices] = np.nan
+
+            # create requirement for minimum linestring size
+            MIN_ALLOWED_SIZE = (
+                4  # don't want to process files with less than 4 dataset points
+            )
+            if (
+                len(lat[~np.isnan(lat)]) < MIN_ALLOWED_SIZE
+                or len(lon[~np.isnan(lon)]) < MIN_ALLOWED_SIZE
+            ):
+                raise Exception(
+                    f"There was not enough dataset in lat or lon to create geojson, {len(lat[~np.isnan(lat)])} found, less than {MIN_ALLOWED_SIZE}."
+                )
+
+            # https://osoceanacoustics.github.io/echopype-examples/echopype_tour.html
+            gps_df = (
+                pd.DataFrame({"latitude": lat, "longitude": lon, "time": time1})
+                .set_index(["time"])
+                .fillna(0)
+            )
+
+            # Note: We set np.nan to 0,0 so downstream missing values can be omitted
+            gps_gdf = geopandas.GeoDataFrame(
+                gps_df,
+                geometry=geopandas.points_from_xy(
+                    gps_df["longitude"], gps_df["latitude"]
+                ),
+                crs="epsg:4326",
+            )
+            # Note: We set np.nan to 0,0 so downstream missing values can be omitted
+            # TODO: so what ends up here is dataset with corruption at null island!!!
+            geo_json_line = gps_gdf.to_json()
+            if write_geojson:
+                print("Creating local copy of geojson file.")
+                with open(geo_json_name, "w") as write_file:
+                    write_file.write(
+                        geo_json_line
+                    )  # NOTE: this file can include zeros for lat lon
+
+                geo_json_prefix = (
+                    f"spatial/geojson/{ship_name}/{cruise_name}/{sensor_name}"
+                )
+
+                print("Checking s3 and deleting any existing GeoJSON file.")
+                s3_manager = S3Manager(endpoint_url=endpoint_url)
+                geojson_object_exists = s3_manager.check_if_object_exists(
+                    bucket_name=output_bucket_name,
+                    key_name=f"{geo_json_prefix}/{geo_json_name}",
+                )
+                if geojson_object_exists:
+                    print(
+                        "GeoJSON already exists in s3, deleting existing and continuing."
+                    )
+                    s3_manager.delete_nodd_object(
+                        bucket_name=output_bucket_name,
+                        key_name=f"{geo_json_prefix}/{geo_json_name}",
+                    )
+
+                print("Upload GeoJSON to s3.")
+                s3_manager.upload_nodd_file(
+                    file_name=geo_json_name,  # file_name
+                    key=f"{geo_json_prefix}/{geo_json_name}",  # key
+                    output_bucket_name=output_bucket_name,
+                )
+
+                # TODO: delete geo_json file
+                cleaner = Cleaner()
+                cleaner.delete_local_files(file_types=["*.json"])
+
+            #################################################################
+            # TODO: simplify with shapely
+            # linestring = shapely.geometry.LineString(
+            #     [xy for xy in zip(gps_gdf.longitude, gps_gdf.latitude)]
+            # )
+            # len(linestring.coords)
+            # line_simplified = linestring.simplify(
+            #     tolerance=self.SIMPLIFICATION_TOLERANCE,
+            #     preserve_topology=True
+            # )
+            # print(f"Total number of points for original linestring: {len(linestring.coords)}")
+            # print(f"Total number of points needed for the simplified linestring: {len(line_simplified.coords)}")
+            # print(line_simplified)
+            # geo_json_line_simplified = shapely.to_geojson(line_simplified)
+            #################################################################
+            # GeoJSON FeatureCollection with IDs as "time"
+        except Exception as err:
+            raise RuntimeError(
+                f"Exception encountered extracting gps coordinates creating geojson, {err}"
+            )
+
+        # Note: returned lat/lon values can include np.nan because they need to be aligned with
+        # the Sv dataset! GeoJSON needs simplification but has been filtered.
+        # return gps_df.index.values, gps_df.latitude.values, gps_df.longitude.values
+        return gps_df.index.values, lat, lon
+        # TODO: if geojson is already returned with 0,0, the return here
+        # can include np.nan values?
+
+    #######################################################
+    def read_s3_geo_json(
+        self,
+        ship_name,
+        cruise_name,
+        sensor_name,
+        file_name_stem,
+        input_xr_zarr_store,
+        endpoint_url,
+        output_bucket_name,
+    ):
+        try:
+            s3_manager = S3Manager(endpoint_url=endpoint_url)
+            geo_json = s3_manager.read_s3_json(
+                ship_name=ship_name,
+                cruise_name=cruise_name,
+                sensor_name=sensor_name,
+                file_name_stem=file_name_stem,
+                output_bucket_name=output_bucket_name,
+            )
+            ###
+            geospatial = geopandas.GeoDataFrame.from_features(
+                geo_json["features"]
+            ).set_index(pd.json_normalize(geo_json["features"])["id"].values)
+            null_island_indices = list(
+                set.intersection(
+                    set(np.where(np.abs(geospatial.latitude.values) < 1e-3)[0]),
+                    set(np.where(np.abs(geospatial.longitude.values) < 1e-3)[0]),
+                )
+            )
+            geospatial.iloc[null_island_indices] = np.nan
+            ###
+            geospatial_index = geospatial.dropna().index.values.astype("datetime64[ns]")
+            aa = input_xr_zarr_store.ping_time.values.tolist()
+            vv = geospatial_index.tolist()
+            indices = np.searchsorted(a=aa, v=vv)
+
+            return indices, geospatial
+        except Exception as err:
+            raise RuntimeError(f"Exception encountered reading s3 GeoJSON, {err}")
+
+    ############################################################################
+    # COMES from the raw-to-zarr conversion
+    def __write_geojson_to_file(self, store_name, data) -> None:
+        print("Writing GeoJSON to file.")
+        with open(os.path.join(store_name, "geo.json"), "w") as outfile:
+            outfile.write(data)
+
+
+###########################################################
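The core trick in read_echodata_gps_data above is the nearest-preceding-fix lookup: np.searchsorted(a=nmea_times, v=time1, side="right") - 1 maps every echosounder ping to the most recent GPS fix at or before it, and index -1 flags pings that precede the first fix. A minimal sketch of that alignment, using made-up timestamps rather than a real echopype object:

import numpy as np

# Hypothetical sorted GPS fix times and their latitudes.
nmea_times = np.array([0, 10, 20, 30], dtype="datetime64[s]")
latitude = np.array([40.10, 40.11, 40.12, 40.13])

# Hypothetical echosounder ping times (the real ones are echodata.environment.time1).
time1 = np.array([5, 10, 25], dtype="datetime64[s]")

# Index of the last fix at or before each ping; -1 means the ping precedes all fixes.
indices = np.searchsorted(a=nmea_times, v=time1, side="right") - 1
lat = latitude[indices]
lat[indices < 0] = np.nan  # no fix available yet for these pings

print(lat)  # [40.1  40.11 40.12]

With side="right", a ping that lands exactly on a fix time takes that fix, which is why the method insists both arrays are sorted before the lookup.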
water_column_sonar_processing/geometry/line_simplification.py

@@ -0,0 +1,176 @@
+# import json
+import geopandas as gpd
+import numpy as np
+from pykalman import KalmanFilter
+from shapely.geometry import Point
+
+# import hvplot.pandas
+# from holoviews import opts
+# hv.extension('bokeh')
+
+# import matplotlib.pyplot as plt
+
+
+# lambda for timestamp in form "yyyy-MM-ddTHH:mm:ssZ"
+# dt = lambda: datetime.now().isoformat(timespec="seconds") + "Z"
+
+# TODO: get line for example HB1906 ...save linestring to array for testing
+
+MAX_SPEED_KNOTS = 50
+
+
+# Lambert's formula ==> better accuracy than haversine
+# Lambert's formula (the formula used by the calculators above) is the method used to calculate the shortest distance along the surface of an ellipsoid. When used to approximate the Earth and calculate the distance on the Earth surface, it has an accuracy on the order of 10 meters over thousands of kilometers, which is more precise than the haversine formula.
+
+
+def mph_to_knots(mph_value):
+    """TODO:"""
+    # 1 mile per hour === 0.868976 Knots
+    return mph_value * 0.868976
+
+
+def mps_to_knots(mps_value):
+    return mps_value * 1.94384
+
+
+###############################################################################
+# Colab Notebook:
+# https://colab.research.google.com/drive/1Ihb1x0EeYRNwGJ4Bqi4RqQQHu9-40oDk?usp=sharing#scrollTo=hIPziqVO48Xg
+###############################################################################
+
+
+# https://shapely.readthedocs.io/en/stable/reference/shapely.MultiLineString.html#shapely.MultiLineString
+class LineSimplification:
+    """
+    // [Decimal / Places / Degrees / Object that can be recognized at scale / N/S or E/W at equator, E/W at 23N/S, E/W at 45N/S, E/W at 67N/S]
+    // 0 1.0 1° 00′ 0″ country or large region 111.32 km 102.47 km 78.71 km 43.496 km
+    // 1 0.1 0° 06′ 0″ large city or district 11.132 km 10.247 km 7.871 km 4.3496 km
+    // 2 0.01 0° 00′ 36″ town or village 1.1132 km 1.0247 km 787.1 m 434.96 m
+    // 3 0.001 0° 00′ 3.6″ neighborhood, street 111.32 m 102.47 m 78.71 m 43.496 m
+    // 4 0.0001 0° 00′ 0.36″ individual street, land parcel 11.132 m 10.247 m 7.871 m 4.3496 m
+    // 5 0.00001 0° 00′ 0.036″ individual trees, door entrance 1.1132 m 1.0247 m 787.1 mm 434.96 mm
+    // 6 0.000001 0° 00′ 0.0036″ individual humans 111.32 mm 102.47 mm 78.71 mm 43.496 mm
+    // 7 0.0000001 0° 00′ 0.00036″ practical limit of commercial surveying 11.132 mm 10.247 mm 7.871 mm 4.3496 mm
+    private static final int SRID = 8307;
+    private static final double simplificationTolerance = 0.0001;
+    private static final long splitGeometryMs = 900000L;
+    private static final int batchSize = 10000;
+    private static final int geoJsonPrecision = 5;
+    final int geoJsonPrecision = 5;
+    final double simplificationTolerance = 0.0001;
+    final int simplifierBatchSize = 3000;
+    final long maxCount = 0;
+    private static final double maxAllowedSpeedKnts = 60D;
+    """
+
+    # TODO: in the future move to standalone library
+    #######################################################
+    def __init__(
+        self,
+    ):
+        pass
+
+    #######################################################
+    def kalman_filter(
+        self,
+        longitudes,
+        latitudes,
+    ) -> (np.ndarray, np.ndarray):
+        """
+        # TODO: need to use masked array to get the right number of values
+        """
+        ### https://github.com/pykalman/pykalman
+        # https://stackoverflow.com/questions/43377626/how-to-use-kalman-filter-in-python-for-location-data
+        measurements = np.asarray([list(elem) for elem in zip(longitudes, latitudes)])
+        initial_state_mean = [measurements[0, 0], 0, measurements[0, 1], 0]
+        transition_matrix = [[1, 1, 0, 0], [0, 1, 0, 0], [0, 0, 1, 1], [0, 0, 0, 1]]
+        observation_matrix = [[1, 0, 0, 0], [0, 0, 1, 0]]
+
+        kf = KalmanFilter(
+            transition_matrices=transition_matrix,
+            observation_matrices=observation_matrix,
+            initial_state_mean=initial_state_mean,
+        )
+        kf = kf.em(measurements, n_iter=2)  # TODO: 5
+        (smoothed_state_means, smoothed_state_covariances) = kf.smooth(measurements)
+
+        # plt.plot(longitudes, latitudes, label="original")
+        # plt.plot(smoothed_state_means[:, 0], smoothed_state_means[:, 2], label="smoothed")
+        # plt.legend()
+        # plt.show()
+
+        return smoothed_state_means[:, [0, 2]]
+
+    #######################################################
+    def get_speeds(
+        self,
+        times: np.ndarray,  # don't really need time, do need to segment the dataset first
+        latitudes: np.ndarray,
+        longitudes: np.ndarray,
+    ) -> np.ndarray:
+        print(MAX_SPEED_KNOTS)  # TODO: too high
+        print(times[0], latitudes[0], longitudes[0])
+        # TODO: distance/time ==> need to take position2 - position1 to get speed
+
+        # get distance difference
+        geom = [Point(xy) for xy in zip(longitudes, latitudes)]
+        points_df = gpd.GeoDataFrame({"geometry": geom}, crs="EPSG:4326")
+        # Conversion to a projected (rectilinear) coordinate system where distance can be calculated with the pythagorean theorem
+        # an alternative could be to use EPSG 32663
+        points_df.to_crs(
+            epsg=3310, inplace=True
+        )  # https://gis.stackexchange.com/questions/293310/finding-distance-between-two-points-with-geoseries-distance
+        distance_diffs = points_df.distance(points_df.shift())
+        # distance_diffs_sorted = distance_diffs.sort_values(
+        #     ascending=False
+        # )  # TODO: get avg cutoff time
+        #
+        time_diffs_ns = np.append(0, (times[1:] - times[:-1]).astype(int))
+        # time_diffs_ns_sorted = np.sort(time_diffs_ns)
+        # largest time diffs HB0707 [ 17. 17.93749786 21.0781271 54.82812723 85.09374797, 113.56249805 204.87500006 216. 440.68749798 544.81249818]
+        # largest diffs HB1906 [3.01015808e+00 3.01016013e+00 3.01017805e+00 3.01018701e+00, 3.01018701e+00 3.01018906e+00 3.01019802e+00 3.01021005e+00, 3.01021005e+00 3.01021414e+00 3.01022208e+00 3.01022899e+00, 3.01024998e+00 3.01025920e+00 3.01026202e+00 3.01028096e+00, 3.01119411e+00 3.01120896e+00 3.01120998e+00 3.01120998e+00, 3.01122099e+00 3.01122790e+00 3.01122790e+00 3.01124506e+00, 3.01125197e+00 3.01128090e+00 3.01142707e+00 3.01219814e+00, 3.01221120e+00 3.01223014e+00 3.01225498e+00 3.01225882e+00, 3.01226010e+00 3.01312998e+00 3.01316096e+00 3.01321190e+00, 3.01321293e+00 3.01322880e+00 3.01322906e+00 3.01323110e+00, 3.01323213e+00 3.01323290e+00 3.01326208e+00 3.01328512e+00, 3.01418112e+00 3.01420109e+00 3.01421107e+00 3.01421184e+00, 3.01421414e+00 3.01424819e+00 3.01512883e+00 3.01516006e+00, 3.01524198e+00 3.01619917e+00 3.01623194e+00 3.01623296e+00, 3.01917594e+00 3.01921408e+00 3.01921587e+00 3.02022195e+00, 3.02025216e+00 3.02121702e+00 3.02325811e+00 3.02410291e+00, 3.02421914e+00 3.02426701e+00 3.02523776e+00 3.02718694e+00, 3.02927590e+00 3.03621606e+00 3.03826304e+00 3.34047514e+00, 3.36345114e+00 3.39148595e+00 4.36819302e+00 4.50157901e+00, 4.50315699e+00 4.50330598e+00 4.50333491e+00 4.50428416e+00, 4.50430490e+00 4.50430694e+00 4.50526387e+00 4.50530790e+00, 4.50530995e+00 4.50532301e+00 4.50533478e+00 4.50629402e+00, 4.50730701e+00 4.50825882e+00 4.50939008e+00 6.50179098e+00, 2.25025029e+01 1.39939425e+02 1.54452331e+02 1.60632653e+03, 1.74574667e+05 4.33569587e+05 4.35150475e+05 8.00044883e+05]
+        nanoseconds_per_second = 1e9
+        speed_meters_per_second = (
+            distance_diffs / time_diffs_ns * nanoseconds_per_second
+        )
+        # returns the speed in meters per second  # TODO: get speed in knots
+        return speed_meters_per_second.to_numpy(dtype="float32")  # includes nan
+
+    def remove_null_island_values(
+        self,
+        epsilon=1e-5,
+    ) -> None:
+        # TODO: low priority
+        print(epsilon)
+        pass
+
+    def break_linestring_into_multi_linestring(
+        self,
+    ) -> None:
+        # TODO: medium priority
+        # For any line-strings across the antimeridian, break into multilinestring
+        # average cadence is measurements every 1 second
+        # break when over 1 minute
+        pass
+
+    def simplify(
+        self,
+    ) -> None:
+        # TODO: medium-high priority
+        pass
+
+    #######################################################
+
+
+# [(-72.2001724243164, 40.51750183105469),  # latBB
+# (-72.20023345947266, 40.51749038696289),
+# (-72.20033264160156, 40.51750183105469),  # lonAA, latBB
+# (-72.20030212402344, 40.517391204833984),
+# (-72.20033264160156, 40.517330169677734),  # lonAA, latCC
+# (-72.2003402709961, 40.51729965209961),
+# (-72.20033264160156, 40.517330169677734),  # lonAA, latCC
+# (-72.20040130615234, 40.5172004699707),
+# (-72.20050048828125, 40.51716995239258),
+# (-72.2004623413086, 40.51710891723633)]
+
+###########################################################
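The kalman_filter method above fits a constant-velocity model (state [lon, d_lon, lat, d_lat], observations [lon, lat]) with pykalman's EM step and returns the smoothed positions from state columns 0 and 2. A self-contained usage sketch on a synthetic noisy track; the real cruise tracks are assumed and not loaded here:

import numpy as np
from pykalman import KalmanFilter

rng = np.random.default_rng(42)
t = np.linspace(0.0, 1.0, 50)
lon = -72.2 + 0.01 * t + rng.normal(0.0, 0.0005, t.size)  # noisy eastward drift
lat = 40.5 + 0.02 * t + rng.normal(0.0, 0.0005, t.size)   # noisy northward drift

measurements = np.column_stack([lon, lat])
kf = KalmanFilter(
    # state transition: each position integrates its velocity every step
    transition_matrices=[[1, 1, 0, 0], [0, 1, 0, 0], [0, 0, 1, 1], [0, 0, 0, 1]],
    # only (lon, lat) are observed, not the velocities
    observation_matrices=[[1, 0, 0, 0], [0, 0, 1, 0]],
    initial_state_mean=[measurements[0, 0], 0, measurements[0, 1], 0],
)
kf = kf.em(measurements, n_iter=2)  # learn the noise covariances
smoothed_state_means, _ = kf.smooth(measurements)
smoothed_track = smoothed_state_means[:, [0, 2]]  # (lon, lat) per fix
print(smoothed_track[:3])

Note that the method's (np.ndarray, np.ndarray) annotation is misleading: it returns a single (n, 2) array, as smoothed_track in the sketch shows.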
water_column_sonar_processing/geometry/pmtile_generation.py

@@ -0,0 +1,261 @@
+import fiona
+import geopandas as gpd
+import numpy as np
+import pandas as pd
+import xarray as xr
+from shapely.geometry import LineString
+
+MAX_POOL_CONNECTIONS = 64
+MAX_CONCURRENCY = 64
+MAX_WORKERS = 64
+GB = 1024**3
+
+bucket_name = "noaa-wcsd-zarr-pds"
+ship_name = "Henry_B._Bigelow"
+sensor_name = "EK60"
+
+# TODO: get pmtiles of all the evr points
+
+
+class PMTileGeneration(object):
+    """
+    - iterate through the zarr stores for all cruises
+    - generate geojson in geopandas df, simplify linestrings
+    - consolidate into singular df, one cruise per row
+    - export as geojson
+    - using tippecanoe, geojson --> pmtiles w linux command
+    - upload to s3
+    """
+
+    #######################################################
+    def __init__(
+        self,
+    ):
+        self.bucket_name = "noaa-wcsd-zarr-pds"
+        self.ship_name = "Henry_B._Bigelow"
+        self.sensor_name = "EK60"
+
+    #######################################################
+    def check_all_cruises(self, bucket_name, cruises):
+        completed = []
+        for cruise_name in cruises:
+            print(cruise_name)
+            try:
+                zarr_store = f"{cruise_name}.zarr"
+                s3_zarr_store_path = f"{bucket_name}/level_2/{ship_name}/{cruise_name}/{sensor_name}/{zarr_store}"
+                cruise = xr.open_dataset(
+                    filename_or_obj=f"s3://{s3_zarr_store_path}",
+                    engine="zarr",
+                    storage_options={"anon": True},
+                )
+                width = cruise.Sv.shape[1]
+                height = cruise.Sv.shape[0]
+                depth = cruise.Sv.shape[2]
+                print(
+                    f"height: {height}, width: {width}, depth: {depth} = {width * height * depth}"
+                )
+                lats = cruise.latitude.to_numpy()
+                percent_done = np.count_nonzero(~np.isnan(lats)) / width
+                if percent_done != 1.0:
+                    print(
+                        f"percent done: {np.round(percent_done, 2)}, {np.count_nonzero(~np.isnan(cruise.latitude.values))}, {width}"
+                    )
+                else:
+                    completed.append(cruise_name)
+            except Exception as err:
+                raise RuntimeError(f"Problem parsing Zarr stores, {err}")
+        return completed
+
+    #######################################################
+    def get_cruise_geometry(self, cruise_name, index):
+        print(cruise_name)
+        try:
+            pieces = []
+            zarr_store = f"{cruise_name}.zarr"
+            s3_zarr_store_path = f"{bucket_name}/level_2/{ship_name}/{cruise_name}/{sensor_name}/{zarr_store}"
+            cruise = xr.open_dataset(
+                filename_or_obj=f"s3://{s3_zarr_store_path}",
+                engine="zarr",
+                storage_options={"anon": True},
+                chunks={},
+                cache=True,
+            )
+            latitude_array = cruise.latitude.to_numpy()
+            longitude_array = cruise.longitude.to_numpy()
+            if np.isnan(latitude_array).any() or np.isnan(longitude_array).any():
+                raise RuntimeError(
+                    f"There was missing lat-lon dataset for, {cruise_name}"
+                )
+            geom = LineString(list(zip(longitude_array, latitude_array))).simplify(
+                tolerance=0.001,  # preserve_topology=True  # 113
+            )  # TODO: do speed check, convert linestrings to multilinestrings
+            print(len(geom.coords))
+            pieces.append(
+                {
+                    "id": index,
+                    "ship_name": ship_name,
+                    "cruise_name": cruise_name,
+                    "sensor_name": sensor_name,
+                    "geom": geom,
+                }
+            )
+            df = pd.DataFrame(pieces)
+            gps_gdf = gpd.GeoDataFrame(
+                data=df[["id", "ship_name", "cruise_name", "sensor_name"]],
+                geometry=df["geom"],
+                crs="EPSG:4326",
+            )
+            print(gps_gdf)
+            # {'DXF': 'rw', 'CSV': 'raw', 'OpenFileGDB': 'raw', 'ESRIJSON': 'r', 'ESRI Shapefile': 'raw', 'FlatGeobuf': 'raw', 'GeoJSON': 'raw', 'GeoJSONSeq': 'raw', 'GPKG': 'raw', 'GML': 'rw', 'OGR_GMT': 'rw', 'GPX': 'rw', 'MapInfo File': 'raw', 'DGN': 'raw', 'S57': 'r', 'SQLite': 'raw', 'TopoJSON': 'r'}
+            if "GeoJSON" not in fiona.supported_drivers.keys():
+                raise RuntimeError("Missing GeoJSON driver")
+
+            gps_gdf.set_index("id", inplace=True)
+            # gps_gdf.to_file(f"dataframe_{cruise_name}.geojson", driver="GeoJSON")  #, crs="epsg:4326")
+            return gps_gdf
+        except Exception as err:
+            raise RuntimeError(f"Problem parsing Zarr stores, {err}")
+
+    #######################################################
+    def aggregate_geojson_into_dataframe(self, geoms):
+        gps_gdf = gpd.GeoDataFrame(
+            columns=["id", "ship", "cruise", "sensor", "geometry"],
+            geometry="geometry",
+            crs="EPSG:4326",
+        )
+        for iii, geom in enumerate(geoms):
+            gps_gdf.loc[iii] = (
+                iii,
+                geom.ship_name[iii],
+                geom.cruise_name[iii],
+                geom.sensor_name[iii],
+                geom.geometry[iii],
+            )
+        gps_gdf.set_index("id", inplace=True)
+        gps_gdf.to_file(
+            filename="dataset.geojson",
+            driver="GeoJSON",
+            engine="fiona",  # or "pyogrio"
+            layer_options={"ID_GENERATE": "YES"},
+            crs="EPSG:4326",
+            id_generate=True,  # required for the feature click selection
+        )
+        print(gps_gdf)
+
+    #######################################################
+    def create_collection_geojson(self):
+        cruises = [
+            "HB0706",
+            "HB0707",
+            "HB0710",
+            "HB0711",
+            "HB0802",
+            "HB0803",
+            "HB0805",
+            "HB0806",
+            "HB0807",
+            "HB0901",
+            "HB0902",
+            "HB0903",
+            "HB0904",
+            "HB0905",
+            "HB1002",
+            "HB1006",
+            "HB1102",
+            "HB1103",
+            "HB1105",
+            "HB1201",
+            "HB1206",
+            "HB1301",
+            "HB1303",
+            "HB1304",
+            "HB1401",
+            "HB1402",
+            "HB1403",
+            "HB1405",
+            "HB1501",
+            "HB1502",
+            "HB1503",
+            "HB1506",
+            "HB1507",
+            "HB1601",
+            "HB1603",
+            "HB1604",
+            "HB1701",
+            "HB1702",
+            "HB1801",
+            "HB1802",
+            "HB1803",
+            "HB1804",
+            "HB1805",
+            "HB1806",
+            "HB1901",
+            "HB1902",
+            "HB1903",
+            "HB1904",
+            "HB1906",
+            "HB1907",
+            "HB2001",
+            "HB2006",
+            "HB2007",
+            "HB20ORT",
+            "HB20TR",
+        ]
+        completed_cruises = self.check_all_cruises(
+            bucket_name=bucket_name, cruises=cruises
+        )  # TODO: threadpool this
+        ### create linestring ###
+        geometries = []
+        for jjj, completed_cruise in enumerate(
+            completed_cruises
+        ):  # TODO: threadpool this
+            geometries.append(
+                self.get_cruise_geometry(cruise_name=completed_cruise, index=jjj)
+            )
+        #
+        self.aggregate_geojson_into_dataframe(geoms=geometries)
+        #
+        print(
+            'Now run this: "tippecanoe --no-feature-limit -zg -o dataset.pmtiles -l cruises dataset.geojson --force"'
+        )
+        # # water-column-sonar-id.pmtiles
+        # linux command: "tippecanoe --no-feature-limit -zg -o water-column-sonar-id.pmtiles -l cruises dataset.geojson --force"
+        # note: 'cruises' is the name of the layer
+        # size is ~3.3 MB for the pmtiles
+        # then drag-and-drop here: https://pmtiles.io/#map=6.79/39.802/-71.51
+
+    #######################################################
+    # TODO: copy the .pmtiles file to the s3 bucket "noaa-wcsd-pds-index"
+    #######################################################
+
+    #######################################################
+    # TODO: get threadpool working
+    # def open_zarr_stores_with_thread_pool_executor(
+    #     self,
+    #     cruises: list,
+    # ):
+    #     # 'cruises' is a list of cruises to process
+    #     completed_cruises = []
+    #     try:
+    #         with ThreadPoolExecutor(max_workers=32) as executor:
+    #             futures = [
+    #                 executor.submit(
+    #                     self.get_geospatial_info_from_zarr_store,
+    #                     "Henry_B._Bigelow",  # ship_name
+    #                     cruise,  # cruise_name
+    #                 )
+    #                 for cruise in cruises
+    #             ]
+    #             for future in as_completed(futures):
+    #                 result = future.result()
+    #                 if result:
+    #                     completed_cruises.extend([result])
+    #     except Exception as err:
+    #         raise RuntimeError(f"Problem, {err}")
+    #     print("Done opening zarr stores using thread pool.")
+    #     return completed_cruises  # Took ~12 minutes
+
+    #######################################################
+
+
+###########################################################