water-column-sonar-processing 0.0.1-py3-none-any.whl → 26.1.14-py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of water-column-sonar-processing might be problematic.

Files changed (60):
  1. water_column_sonar_processing/__init__.py +13 -0
  2. water_column_sonar_processing/aws/__init__.py +7 -0
  3. water_column_sonar_processing/aws/dynamodb_manager.py +355 -0
  4. water_column_sonar_processing/aws/s3_manager.py +418 -0
  5. water_column_sonar_processing/aws/s3fs_manager.py +64 -0
  6. {model → water_column_sonar_processing}/aws/sns_manager.py +10 -21
  7. {model → water_column_sonar_processing}/aws/sqs_manager.py +11 -19
  8. water_column_sonar_processing/cruise/__init__.py +4 -0
  9. water_column_sonar_processing/cruise/create_empty_zarr_store.py +129 -0
  10. water_column_sonar_processing/cruise/datatree_manager.py +21 -0
  11. water_column_sonar_processing/cruise/resample_regrid.py +323 -0
  12. water_column_sonar_processing/geometry/__init__.py +13 -0
  13. water_column_sonar_processing/geometry/elevation_manager.py +111 -0
  14. water_column_sonar_processing/geometry/geometry_manager.py +241 -0
  15. water_column_sonar_processing/geometry/line_simplification.py +176 -0
  16. water_column_sonar_processing/geometry/pmtile_generation.py +266 -0
  17. water_column_sonar_processing/geometry/spatiotemporal.py +106 -0
  18. water_column_sonar_processing/index/__init__.py +3 -0
  19. water_column_sonar_processing/index/index_manager.py +381 -0
  20. water_column_sonar_processing/model/__init__.py +3 -0
  21. water_column_sonar_processing/model/zarr_manager.py +741 -0
  22. water_column_sonar_processing/processing/__init__.py +4 -0
  23. water_column_sonar_processing/processing/raw_to_netcdf.py +320 -0
  24. water_column_sonar_processing/processing/raw_to_zarr.py +331 -0
  25. water_column_sonar_processing/utility/__init__.py +13 -0
  26. {model → water_column_sonar_processing}/utility/cleaner.py +7 -7
  27. water_column_sonar_processing/utility/constants.py +118 -0
  28. {model → water_column_sonar_processing}/utility/pipeline_status.py +47 -24
  29. water_column_sonar_processing/utility/timestamp.py +12 -0
  30. water_column_sonar_processing-26.1.14.dist-info/METADATA +240 -0
  31. water_column_sonar_processing-26.1.14.dist-info/RECORD +34 -0
  32. {water_column_sonar_processing-0.0.1.dist-info → water_column_sonar_processing-26.1.14.dist-info}/WHEEL +1 -1
  33. {water_column_sonar_processing-0.0.1.dist-info → water_column_sonar_processing-26.1.14.dist-info/licenses}/LICENSE +1 -1
  34. water_column_sonar_processing-26.1.14.dist-info/top_level.txt +1 -0
  35. __init__.py +0 -0
  36. model/__init__.py +0 -0
  37. model/aws/__init__.py +0 -0
  38. model/aws/dynamodb_manager.py +0 -149
  39. model/aws/s3_manager.py +0 -356
  40. model/aws/s3fs_manager.py +0 -74
  41. model/cruise/__init__.py +0 -0
  42. model/cruise/create_empty_zarr_store.py +0 -166
  43. model/cruise/resample_regrid.py +0 -248
  44. model/geospatial/__init__.py +0 -0
  45. model/geospatial/geometry_manager.py +0 -194
  46. model/geospatial/geometry_simplification.py +0 -81
  47. model/geospatial/pmtile_generation.py +0 -74
  48. model/index/__init__.py +0 -0
  49. model/index/index.py +0 -228
  50. model/model.py +0 -138
  51. model/utility/__init__.py +0 -0
  52. model/utility/constants.py +0 -56
  53. model/utility/timestamp.py +0 -12
  54. model/zarr/__init__.py +0 -0
  55. model/zarr/bar.py +0 -28
  56. model/zarr/foo.py +0 -11
  57. model/zarr/zarr_manager.py +0 -298
  58. water_column_sonar_processing-0.0.1.dist-info/METADATA +0 -89
  59. water_column_sonar_processing-0.0.1.dist-info/RECORD +0 -32
  60. water_column_sonar_processing-0.0.1.dist-info/top_level.txt +0 -2
water_column_sonar_processing/geometry/spatiotemporal.py
@@ -0,0 +1,106 @@
+ import geopandas as gpd
+ import numpy as np
+ import pandas as pd
+ from shapely.geometry import Point
+
+ from water_column_sonar_processing.model import ZarrManager
+
+
+ # Convert "meters per second" to "knots"
+ # meters_per_second_to_knots = lambda mps_value: mps_value * 1.94384
+
+
+ class Spatiotemporal:
+     #######################################################
+     def __init__(
+         self,
+     ):
+         self.NANOSECONDS_PER_SECOND = 1e9
+         self.CUTOFF_DISTANCE_METERS = 50.0
+         self.CUTOFF_TIME_SECONDS = 10.0
+
+     #######################################################
+     @staticmethod
+     def meters_per_second_to_knots(
+         mps_value,
+     ):
+         return mps_value * 1.94384
+
+     #######################################################
+     def compute_speed_and_distance(
+         self,
+         times_ns,  #: np.ndarray[tuple[int], np.dtype[np.int64]],
+         latitudes,  #: np.ndarray,
+         longitudes,  #: np.ndarray,
+     ) -> pd.DataFrame:
+         try:
+             # fix times
+             times = np.array([np.datetime64(int(i), "ns") for i in times_ns])
+             geom = [Point(xy) for xy in zip(longitudes, latitudes)]
+             points_df = gpd.GeoDataFrame({"geometry": geom}, crs="EPSG:4326")
+             # Conversion to a rectilinear projection coordinate system where distance can be calculated with the Pythagorean theorem
+             # EPSG:4087, WGS 84 / World Equidistant Cylindrical
+             # https://epsg.io/4087
+             points_df.to_crs(epsg=4087, inplace=True)
+             distance_diffs = points_df.distance(points_df.geometry.shift())
+             distance_diffs[0] = distance_diffs[1]  # missing first datapoint, backfill
+             # Issue: np.max(distance_diffs) = 3397 meters
+             time_diffs_ns = np.append(0, (times[1:] - times[:-1]).astype(int))
+             time_diffs_ns[0] = time_diffs_ns[1]  # missing first datapoint, backfill
+             time_diffs_seconds = time_diffs_ns / self.NANOSECONDS_PER_SECOND
+             # Calculate the speed in knots
+             speed_meters_per_second = np.array(
+                 (distance_diffs / time_diffs_ns * self.NANOSECONDS_PER_SECOND),
+                 dtype=np.float32,
+             )
+             knots = self.meters_per_second_to_knots(speed_meters_per_second)
+             metrics_df = pd.DataFrame(
+                 {
+                     "speed_knots": knots.astype(dtype=np.float32),
+                     "distance_meters": distance_diffs.to_numpy(dtype=np.float32),
+                     "diff_seconds": time_diffs_seconds.astype(np.float32),
+                 },
+                 index=times,
+             )
+             #
+             return metrics_df
+         except Exception as err:
+             raise RuntimeError(f"Exception encountered, {err}")
+
+     #######################################################
+     def add_speed_and_distance(
+         self,
+         ship_name,
+         cruise_name,
+         sensor_name,
+         bucket_name,
+         endpoint_url=None,
+     ) -> None:
+         try:
+             zarr_manager = ZarrManager()
+             zarr_store = zarr_manager.open_s3_zarr_store_with_zarr(
+                 ship_name=ship_name,
+                 cruise_name=cruise_name,
+                 sensor_name=sensor_name,
+                 output_bucket_name=bucket_name,
+                 endpoint_url=endpoint_url,
+             )
+             longitudes = zarr_store["longitude"][:]
+             latitudes = zarr_store["latitude"][:]
+             times = zarr_store["time"][:]
+             #
+             metrics_df = self.compute_speed_and_distance(
+                 times_ns=times,
+                 latitudes=latitudes,
+                 longitudes=longitudes,
+             )
+             # Write the speed and distance to the output zarr store
+             zarr_store["speed"][:] = metrics_df.speed_knots.values
+             zarr_store["distance"][:] = metrics_df.distance_meters.values
+         except Exception as err:
+             raise RuntimeError(
+                 f"Exception encountered writing the speed and distance, {err}"
+             )
+
+
+ ###########################################################
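
The `compute_speed_and_distance` method above needs no S3 access, only epoch-nanosecond times and coordinate arrays, so it can be exercised directly. Below is a minimal sketch with a synthetic three-fix track (all timestamps and coordinates are invented for illustration; geopandas, shapely, and pyproj must be installed). The conversion constant comes from the code itself: 1 m/s = 1.94384 knots, so 10 m/s is about 19.4 knots.

import numpy as np

from water_column_sonar_processing.geometry.spatiotemporal import Spatiotemporal

# Three GPS fixes one second apart, each roughly 15 m from the last (synthetic).
base_ns = np.int64(1_700_000_000) * np.int64(1_000_000_000)  # arbitrary epoch start
times_ns = base_ns + np.arange(3, dtype=np.int64) * 1_000_000_000
latitudes = np.array([40.0000, 40.0001, 40.0002])
longitudes = np.array([-70.0000, -70.0001, -70.0002])

metrics = Spatiotemporal().compute_speed_and_distance(
    times_ns=times_ns, latitudes=latitudes, longitudes=longitudes
)
# One row per fix: speed_knots, distance_meters, diff_seconds, indexed by time.
print(metrics)

Note that the first row is backfilled from the second (the `distance_diffs[0] = distance_diffs[1]` lines above), so no NaN appears at the start of the track.
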
water_column_sonar_processing/index/__init__.py
@@ -0,0 +1,3 @@
+ from .index_manager import IndexManager
+
+ __all__ = ["IndexManager"]
water_column_sonar_processing/index/index_manager.py
@@ -0,0 +1,381 @@
+ import os
+ import re
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from datetime import datetime
+
+ # import networkx as nx
+ import pandas as pd
+
+ from water_column_sonar_processing.aws import S3Manager
+
+ MAX_POOL_CONNECTIONS = 64
+ MAX_CONCURRENCY = 64
+ MAX_WORKERS = 64
+ GB = 1024**3
+
+
+ class IndexManager:
+     # TODO: index into dynamodb instead of csv files
+
+     def __init__(self, input_bucket_name, calibration_bucket, calibration_key):
+         self.input_bucket_name = input_bucket_name
+         self.calibration_bucket = calibration_bucket
+         self.calibration_key = calibration_key  # TODO: make optional?
+         self.s3_manager = S3Manager()  # TODO: make anonymous?
+
+     #################################################################
+     def list_ships(
+         self,
+         prefix="data/raw/",
+     ):
+         page_iterator = self.s3_manager.paginator.paginate(
+             Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
+         )
+         # common_prefixes = s3_client.list_objects(Bucket=self.input_bucket_name, Prefix=prefix, Delimiter='/')
+         # print(common_prefixes)
+         ships = []
+         for page in page_iterator:
+             if "Contents" in page.keys():
+                 ships.extend([k["Prefix"] for k in page["CommonPrefixes"]])
+         return ships  # ~76 ships
+
+     #################################################################
+     def list_cruises(
+         self,
+         ship_prefixes,  # e.g. 'data/raw/Alaska_Knight/'
+     ):
+         cruises = []
+         for ship_prefix in ship_prefixes:
+             page_iterator = self.s3_manager.paginator.paginate(
+                 Bucket=self.input_bucket_name, Prefix=ship_prefix, Delimiter="/"
+             )
+             for page in page_iterator:
+                 cruises.extend([k["Prefix"] for k in page["CommonPrefixes"]])
+         return cruises  # ~1204 cruises
+
+     #################################################################
+     def list_ek60_cruises(
+         self,
+         cruise_prefixes,
+     ):
+         """
+         This returns a list of ek60 prefixed cruises.
+         """
+         cruise_sensors = []  # includes all sensor types
+         for cruise_prefix in cruise_prefixes:
+             page_iterator = self.s3_manager.paginator.paginate(
+                 Bucket=self.input_bucket_name, Prefix=cruise_prefix, Delimiter="/"
+             )
+             for page in page_iterator:
+                 cruise_sensors.extend([k["Prefix"] for k in page["CommonPrefixes"]])
+         # Note: these are "EK60" by prefix. They still need to be verified by scanning the datagram.
+         return [i for i in cruise_sensors if "/EK60/" in i]  # ~447 different cruises
+
+     #################################################################
+     def get_raw_files(
+         self,
+         ship_name,
+         cruise_name,
+         sensor_name,
+     ):
+         # Gets all raw files for a cruise under the given prefix
+         prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
+         page_iterator = self.s3_manager.paginator.paginate(
+             Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
+         )
+         all_files = []
+         for page in page_iterator:
+             if "Contents" in page.keys():
+                 all_files.extend([i["Key"] for i in page["Contents"]])
+         return [i for i in all_files if i.endswith(".raw")]
+
+     def get_first_raw_file(
+         self,
+         ship_name,
+         cruise_name,
+         sensor_name,
+     ):
+         # Same as above but only needs to get the first raw file
+         # because we are only interested in the first datagram of one file
+         # TODO: "dataset?"
+         prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"  # Note no forward slash at beginning
+         # page_iterator = self.s3_manager.paginator.paginate(
+         #     Bucket=self.input_bucket_name,
+         #     Prefix=prefix,
+         #     Delimiter="/",
+         #     PaginationConfig={ 'MaxItems': 5 }
+         # ) # TODO: this can create a problem if there is a non raw file returned first
+         ### filter with JMESPath expressions ###
+         page_iterator = self.s3_manager.paginator.paginate(
+             Bucket=self.input_bucket_name,
+             Prefix=prefix,
+             Delimiter="/",
+         )
+         # page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
+         page_iterator = page_iterator.search(
+             expression="Contents[?contains(Key, '.raw')] "
+         )
+         for res in page_iterator:
+             if "Key" in res:
+                 return res["Key"]
+         return None
+         # else raise exception?
+
+     # DSJ0604-D20060406-T050022.bot 2kB == 2152 'Size'
+
+     def get_files_under_size(
+         self,
+         ship_name,
+         cruise_name,
+         sensor_name,
+     ):
+         # THIS isn't used, just playing with JMES paths spec
+         prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/"
+         ### filter with JMESPath expressions ###
+         page_iterator = self.s3_manager.paginator.paginate(
+             Bucket=self.input_bucket_name,
+             Prefix=prefix,
+             Delimiter="/",
+         )
+         page_iterator = page_iterator.search("Contents[?Size < `2200`][]")
+         all_files = []
+         for page in page_iterator:
+             if "Contents" in page.keys():
+                 all_files.extend([i["Key"] for i in page["Contents"]])
+         return [i for i in all_files if i.endswith(".raw")]
+
+     #################################################################
+     def get_raw_files_csv(
+         self,
+         ship_name,
+         cruise_name,
+         sensor_name,
+     ):
+         raw_files = self.get_raw_files(
+             ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
+         )
+         files_list = [
+             {
+                 "ship_name": ship_name,
+                 "cruise_name": cruise_name,
+                 "sensor_name": sensor_name,
+                 "file_name": os.path.basename(raw_file),
+             }
+             for raw_file in raw_files
+         ]
+         df = pd.DataFrame(files_list)
+         df.to_csv(f"{ship_name}_{cruise_name}.csv", index=False, header=False, sep=" ")
+         print("done")
+
+     def get_raw_files_list(
+         self,
+         ship_name,
+         cruise_name,
+         sensor_name,
+     ):
+         # gets all raw files in cruise and returns a list of dicts
+         raw_files = self.get_raw_files(
+             ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
+         )
+         files_list = [
+             {
+                 "ship_name": ship_name,
+                 "cruise_name": cruise_name,
+                 "sensor_name": sensor_name,
+                 "file_name": os.path.basename(raw_file),
+             }
+             for raw_file in raw_files
+         ]
+         return files_list
+
+     #################################################################
+     @staticmethod
+     def get_subset_ek60_prefix(df: pd.DataFrame) -> pd.DataFrame:  # TODO: is this used?
+         # Returns all objects with 'EK60' in prefix of file path
+         # Note that this can include 'EK80' dataset that are false-positives
+         # in dataframe with ['key', 'filename', 'ship', 'cruise', 'sensor', 'size', 'date', 'datagram']
+         print("getting subset of ek60 dataset by prefix")
+         objects = []
+         for row in df.itertuples():
+             row_split = row[1].split(os.sep)
+             if len(row_split) == 6:
+                 filename = os.path.basename(
+                     row[1]
+                 )  # 'EX1608_EK60-D20161205-T040300.raw'
+                 if filename.endswith(".raw"):
+                     ship_name, cruise_name, sensor_name = row_split[
+                         2:5
+                     ]  # 'Okeanos_Explorer', 'EX1608', 'EK60'
+                     if (
+                         re.search("[D](\\d{8})", filename) is not None
+                         and re.search("[T](\\d{6})", filename) is not None
+                     ):
+                         # Parse date if possible e.g.: 'data/raw/Henry_B._Bigelow/HB1006/EK60/HBB-D20100723-T025105.raw'
+                         # and 'data/raw/Henry_B._Bigelow/HB1802/EK60/D20180513-T150250.raw'
+                         date_substring = re.search("[D](\\d{8})", filename).group(1)
+                         time_substring = re.search("[T](\\d{6})", filename).group(1)
+                         date_string = datetime.strptime(
+                             f"{date_substring}{time_substring}", "%Y%m%d%H%M%S"
+                         )
+                     else:  # otherwise use current date
+                         date_string = f"{datetime.utcnow().isoformat()[:19]}Z"
+                     objects.append(
+                         {
+                             "KEY": row[1],
+                             "FILENAME": filename,
+                             "SHIP": ship_name,
+                             "CRUISE": cruise_name,
+                             "SENSOR": sensor_name,
+                             "SIZE": row[2],
+                             "DATE": date_string,
+                             "DATAGRAM": None,
+                         }
+                     )
+         return pd.DataFrame(objects)
+
+     #################################################################
+     def scan_datagram(self, select_key: str) -> list:
+         # Reads the first 8 bytes of S3 file. Used to determine if ek60 or ek80
+         # Note: uses boto3 session instead of boto3 client: https://github.com/boto/boto3/issues/801
+         # select_key = 'data/raw/Albatross_Iv/AL0403/EK60/L0005-D20040302-T200108-EK60.raw'
+         s3_resource = self.s3_manager.s3_resource
+         obj = s3_resource.Object(
+             bucket_name=self.input_bucket_name, key=select_key
+         )  # XML0
+         first_datagram = (
+             obj.get(Range="bytes=3-7")["Body"].read().decode().strip("\x00")
+         )
+         # return [{'KEY': select_key, 'DATAGRAM': first_datagram}]
+         ### EK60 dataset are denoted by 'CON0' ###
+         return first_datagram
+
+     #################################################################
+     def get_subset_datagrams(
+         self, df: pd.DataFrame
+     ) -> list:  # TODO: is this getting used
+         print("getting subset of datagrams")
+         select_keys = (
+             df[["KEY", "CRUISE"]]
+             .drop_duplicates(subset="CRUISE")["KEY"]
+             .values.tolist()
+         )
+         all_datagrams = []
+         with ThreadPoolExecutor(max_workers=MAX_POOL_CONNECTIONS) as executor:
+             futures = [
+                 executor.submit(self.scan_datagram, select_key)
+                 for select_key in select_keys
+             ]
+             for future in as_completed(futures):
+                 result = future.result()
+                 if result:
+                     all_datagrams.extend(result)
+         return all_datagrams
+
+     #################################################################
+     @staticmethod
+     def get_ek60_objects(df: pd.DataFrame, subset_datagrams: list) -> pd.DataFrame:
+         # for each key write datagram value to all other files in same cruise
+         for subset_datagram in subset_datagrams:
+             if subset_datagram["DATAGRAM"] == "CON0":
+                 select_cruise = df.loc[df["KEY"] == subset_datagram["KEY"]][
+                     "CRUISE"
+                 ].iloc[0]
+                 df.loc[df["CRUISE"] == select_cruise, ["DATAGRAM"]] = subset_datagram[
+                     "DATAGRAM"
+                 ]
+         return df.loc[df["DATAGRAM"] == "CON0"]
+
+     #################################################################
+     def get_calibration_information(
+         self,
+     ) -> pd.DataFrame:
+         # Calibration dataset generated by dataset manager currently located here:
+         # https://noaa-wcsd-pds-index.s3.amazonaws.com/calibrated_crusies.csv
+         # Note: Data are either:
+         # [1] Calibrated w/ calibration dataset
+         # [2] Calibrated w/o calibration dataset
+         # [3] uncalibrated
+         response = self.s3_manager.get_object(
+             bucket_name=self.calibration_bucket, key_name=self.calibration_key
+         )
+         calibration_statuses = pd.read_csv(response.get("Body"))
+         calibration_statuses["DATASET_NAME"] = calibration_statuses[
+             "DATASET_NAME"
+         ].apply(lambda x: x.split("_EK60")[0])
+         calibration_statuses["CAL_STATE"] = calibration_statuses["CAL_STATE"].apply(
+             lambda x: x.find("Calibrated") >= 0
+         )
+         return calibration_statuses
+
+     #################################################################
+     # def index( # TODO: get rid of this?
+     #     self
+     # ):
+     #     start_time = datetime.now() # used for benchmarking
+     #     # Get all object in public dataset bucket
+     #     all_objects = self.get_all_objects()
+     #     #
+     #     subset_ek60_by_prefix = self.get_subset_ek60_prefix(
+     #         df=all_objects[all_objects['Key'].str.contains('EK60')][['Key', 'Size']]
+     #     )
+     #     #
+     #     subset_datagrams = self.get_subset_datagrams(df=subset_ek60_by_prefix)
+     #     print("done getting subset of datagrams")
+     #     ek60_objects = self.get_ek60_objects(subset_ek60_by_prefix, subset_datagrams)
+     #     print("done getting ek60_objects")
+     #     print(start_time)
+     #     calibration_status = self.get_calibration_information(s3)
+     #     cruise_names = list(set(ek60_objects['CRUISE']))
+     #     cruise_names.sort()
+     #     for cruise_name in cruise_names: # ~322 cruises
+     #         cruise_data = ek60_objects.groupby('CRUISE').get_group(cruise_name)
+     #         ship = cruise_data['SHIP'].iloc[0]
+     #         sensor = cruise_data['SENSOR'].iloc[0]
+     #         datagram = cruise_data['DATAGRAM'].iloc[0]
+     #         file_count = cruise_data.shape[0]
+     #         total_size = np.sum(cruise_data['SIZE'])
+     #         calibrated = cruise_name in calibration_status['DATASET_NAME'].unique() # ~276 entries
+     #         start_date = np.min(cruise_data['DATE']).isoformat(timespec="seconds") + "Z"
+     #         end_date = np.max(cruise_data['DATE']).isoformat(timespec="seconds") + "Z"
+     #         end_time = datetime.now() # used for benchmarking
+     #         print(start_time)
+     #         print(end_time)
+
+     # TODO: wip
+     # def build_merkle_tree(self):
+     #     G = nx.DiGraph()
+     #     # https://noaa-wcsd-pds.s3.amazonaws.com/index.html#data/raw/Henry_B._Bigelow/HB0707/
+     #     ship_name = "Henry_B._Bigelow"
+     #     cruise_name = "HB0707"
+     #     # cruise_name = "HB0805"
+     #     prefix = f"data/raw/{ship_name}/{cruise_name}/"
+     #     # prefix = f"data/raw/{ship_name}/"
+     #     page_iterator = self.s3_manager.paginator.paginate(
+     #         Bucket=self.input_bucket_name,
+     #         Prefix=prefix,
+     #     )
+     #     for page in page_iterator:
+     #         for contents in page["Contents"]:
+     #             obj_key = contents["Key"]
+     #             # https://datatracker.ietf.org/doc/html/rfc7232#section-2.3
+     #             obj_etag = contents["ETag"].split('"')[1] # properties
+     #             obj_size = contents["Size"]
+     #             basename = os.path.basename(obj_key)
+     #             G.add_node(
+     #                 node_for_adding=basename, ETag=obj_etag, Size=obj_size, Key=obj_key
+     #             ) # TODO: add parent hash
+     #             split_path = os.path.normpath(obj_key).split(os.path.sep)
+     #             # split_path: ['dataset', 'raw', 'Henry_B._Bigelow', 'HB0707', 'EK60', 'D20070712-T004447.raw']
+     #             for previous, current in zip(split_path, split_path[1:]):
+     #                 if not G.has_edge(previous, current):
+     #                     G.add_edge(previous, current)
+     #     # print(G)
+     #     etag_set = frozenset(
+     #         [k for j, k in list(G.nodes.data("ETag")) if k is not None]
+     #     )
+     #     new_hash = sha256(str(etag_set.__hash__()).encode("utf-8")).hexdigest()
+     #     total_size = [k for j, k in list(G.nodes.data("Size")) if k is not None]
+     #     print(np.sum(total_size)) # 22.24 Terabytes in Henry_B._Bigelow cruises
+     #     print(" ")
+     #     print(new_hash)
+     #     return new_hash
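
The `scan_datagram` method above uses an S3 byte-range read to classify a raw file from its header without downloading it: bytes 3-7 cover the last byte of the 4-byte length field plus the 4-character datagram type, which the comments identify as "CON0" for EK60 and "XML0" for EK80. A standalone sketch of the same trick follows; the bucket and key are lifted from the comments in the code (they may have moved), and anonymous access to the public bucket is assumed.

import boto3
from botocore import UNSIGNED
from botocore.config import Config

# Unsigned config for a public bucket: no AWS credentials required.
s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED))
obj = s3.Object(
    bucket_name="noaa-wcsd-pds",
    key="data/raw/Albatross_Iv/AL0403/EK60/L0005-D20040302-T200108-EK60.raw",
)
# Read only 5 bytes; strip the NUL byte left over from the length field.
first_datagram = obj.get(Range="bytes=3-7")["Body"].read().decode().strip("\x00")
print(first_datagram)  # 'CON0' indicates EK60, 'XML0' indicates EK80
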
water_column_sonar_processing/model/__init__.py
@@ -0,0 +1,3 @@
+ from .zarr_manager import ZarrManager
+
+ __all__ = ["ZarrManager"]
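
Taken together, the two `__init__.py` hunks define the import surface this release expects consumers to use. A hedged sketch of that surface (constructor values are illustrative, taken from the comments in `index_manager.py` above, and not guaranteed to be current):

from water_column_sonar_processing.index import IndexManager
from water_column_sonar_processing.model import ZarrManager

# Bucket and key names mirror the public NOAA dataset referenced in the code comments.
index_manager = IndexManager(
    input_bucket_name="noaa-wcsd-pds",
    calibration_bucket="noaa-wcsd-pds-index",
    calibration_key="calibrated_crusies.csv",  # filename as published (sic)
)
zarr_manager = ZarrManager()  # also used internally by Spatiotemporal.add_speed_and_distance

One caveat worth verifying against the source: `list_ships` gates on the "Contents" key of each page but then reads "CommonPrefixes", which looks like it would return an empty list when no objects sit directly under the prefix.
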