water-column-sonar-processing 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. water_column_sonar_processing/__init__.py +16 -0
  2. water_column_sonar_processing/aws/__init__.py +7 -0
  3. {aws_manager → water_column_sonar_processing/aws}/dynamodb_manager.py +71 -50
  4. {aws_manager → water_column_sonar_processing/aws}/s3_manager.py +120 -130
  5. {aws_manager → water_column_sonar_processing/aws}/s3fs_manager.py +13 -19
  6. {aws_manager → water_column_sonar_processing/aws}/sns_manager.py +10 -21
  7. {aws_manager → water_column_sonar_processing/aws}/sqs_manager.py +10 -18
  8. water_column_sonar_processing/cruise/__init__.py +4 -0
  9. {cruise → water_column_sonar_processing/cruise}/create_empty_zarr_store.py +62 -44
  10. {cruise → water_column_sonar_processing/cruise}/resample_regrid.py +117 -66
  11. water_column_sonar_processing/geometry/__init__.py +5 -0
  12. {geometry_manager → water_column_sonar_processing/geometry}/geometry_manager.py +80 -49
  13. {geometry_manager → water_column_sonar_processing/geometry}/geometry_simplification.py +13 -12
  14. {geometry_manager → water_column_sonar_processing/geometry}/pmtile_generation.py +25 -24
  15. water_column_sonar_processing/index/__init__.py +3 -0
  16. {index_manager → water_column_sonar_processing/index}/index_manager.py +106 -82
  17. water_column_sonar_processing/model/__init__.py +3 -0
  18. {zarr_manager → water_column_sonar_processing/model}/zarr_manager.py +119 -83
  19. water_column_sonar_processing/process.py +147 -0
  20. water_column_sonar_processing/utility/__init__.py +6 -0
  21. {utility → water_column_sonar_processing/utility}/cleaner.py +6 -7
  22. water_column_sonar_processing/utility/constants.py +63 -0
  23. {utility → water_column_sonar_processing/utility}/pipeline_status.py +37 -10
  24. {utility → water_column_sonar_processing/utility}/timestamp.py +3 -2
  25. {water_column_sonar_processing-0.0.4.dist-info → water_column_sonar_processing-0.0.6.dist-info}/METADATA +31 -1
  26. water_column_sonar_processing-0.0.6.dist-info/RECORD +29 -0
  27. water_column_sonar_processing-0.0.6.dist-info/top_level.txt +1 -0
  28. __init__.py +0 -0
  29. aws_manager/__init__.py +0 -4
  30. cruise/__init__.py +0 -0
  31. geometry_manager/__init__.py +0 -0
  32. index_manager/__init__.py +0 -0
  33. model.py +0 -140
  34. utility/__init__.py +0 -0
  35. utility/constants.py +0 -56
  36. water_column_sonar_processing-0.0.4.dist-info/RECORD +0 -29
  37. water_column_sonar_processing-0.0.4.dist-info/top_level.txt +0 -8
  38. zarr_manager/__init__.py +0 -0
  39. {water_column_sonar_processing-0.0.4.dist-info → water_column_sonar_processing-0.0.6.dist-info}/LICENSE +0 -0
  40. {water_column_sonar_processing-0.0.4.dist-info → water_column_sonar_processing-0.0.6.dist-info}/WHEEL +0 -0
@@ -4,101 +4,105 @@ import pandas as pd
4
4
  from datetime import datetime
5
5
  from concurrent.futures import ThreadPoolExecutor
6
6
  from concurrent.futures import as_completed
7
- from aws_manager.s3_manager import S3Manager
7
+ from water_column_sonar_processing.aws.s3_manager import S3Manager
8
+
8
9
 
9
10
  class IndexManager:
10
11
 
11
- def __init__(
12
- self,
13
- input_bucket_name,
14
- calibration_bucket,
15
- calibration_key
16
- ):
12
+ def __init__(self, input_bucket_name, calibration_bucket, calibration_key):
17
13
  self.input_bucket_name = input_bucket_name
18
14
  self.calibration_bucket = calibration_bucket
19
15
  self.calibration_key = calibration_key
20
16
  self.s3_manager = S3Manager()
21
17
 
22
18
  #################################################################
19
+
23
20
  def list_ships(
24
- self,
25
- prefix='data/raw/',
21
+ self,
22
+ prefix="data/raw/",
26
23
  ):
27
24
  # s3_client = self.s3_manager.s3_client
28
- page_iterator = self.s3_manager.paginator.paginate(Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/")
25
+ page_iterator = self.s3_manager.paginator.paginate(
26
+ Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
27
+ )
29
28
  # common_prefixes = s3_client.list_objects(Bucket=self.input_bucket_name, Prefix=prefix, Delimiter='/')
30
29
  # print(common_prefixes)
31
30
  ships = []
32
31
  for page in page_iterator:
33
- if 'Contents' in page.keys():
34
- ships.extend([k['Prefix'] for k in page['CommonPrefixes']])
32
+ if "Contents" in page.keys():
33
+ ships.extend([k["Prefix"] for k in page["CommonPrefixes"]])
35
34
  return ships # ~76 ships
36
35
 
37
36
  #################################################################
38
37
  def list_cruises(
39
- self,
40
- ship_prefixes, # e.g. 'data/raw/Alaska_Knight/'
38
+ self,
39
+ ship_prefixes, # e.g. 'data/raw/Alaska_Knight/'
41
40
  ):
42
41
  cruises = []
43
42
  for ship_prefix in ship_prefixes:
44
- page_iterator = self.s3_manager.paginator.paginate(Bucket=self.input_bucket_name, Prefix=ship_prefix, Delimiter="/")
43
+ page_iterator = self.s3_manager.paginator.paginate(
44
+ Bucket=self.input_bucket_name, Prefix=ship_prefix, Delimiter="/"
45
+ )
45
46
  for page in page_iterator:
46
- cruises.extend([k['Prefix'] for k in page['CommonPrefixes']])
47
+ cruises.extend([k["Prefix"] for k in page["CommonPrefixes"]])
47
48
  return cruises # ~1204 cruises
48
49
 
49
50
  #################################################################
50
51
  def list_ek60_cruises(
51
- self,
52
- cruise_prefixes,
52
+ self,
53
+ cruise_prefixes,
53
54
  ):
54
55
  cruise_sensors = [] # includes all sensor types
55
56
  for cruise_prefix in cruise_prefixes:
56
- page_iterator = self.s3_manager.paginator.paginate(Bucket=self.input_bucket_name, Prefix=cruise_prefix, Delimiter="/")
57
+ page_iterator = self.s3_manager.paginator.paginate(
58
+ Bucket=self.input_bucket_name, Prefix=cruise_prefix, Delimiter="/"
59
+ )
57
60
  for page in page_iterator:
58
- cruise_sensors.extend([k['Prefix'] for k in page['CommonPrefixes']])
61
+ cruise_sensors.extend([k["Prefix"] for k in page["CommonPrefixes"]])
59
62
  # Note: these are "EK60" by prefix. They still need to be verified by scanning the datagram.
60
- return [i for i in cruise_sensors if '/EK60/' in i] # ~447 different cruises
63
+ return [i for i in cruise_sensors if "/EK60/" in i] # ~447 different cruises
61
64
 
62
65
  #################################################################
63
66
  def get_raw_files(
64
- self,
65
- ship_name,
66
- cruise_name,
67
- sensor_name,
67
+ self,
68
+ ship_name,
69
+ cruise_name,
70
+ sensor_name,
68
71
  ):
69
72
  prefix = f"data/raw/{ship_name}/{cruise_name}/{sensor_name}/" # Note no forward slash at beginning
70
- page_iterator = self.s3_manager.paginator.paginate(Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/")
73
+ page_iterator = self.s3_manager.paginator.paginate(
74
+ Bucket=self.input_bucket_name, Prefix=prefix, Delimiter="/"
75
+ )
71
76
  all_files = []
72
77
  for page in page_iterator:
73
- if 'Contents' in page.keys():
74
- all_files.extend([i['Key'] for i in page['Contents']])
75
- return [i for i in all_files if i.endswith('.raw')]
78
+ if "Contents" in page.keys():
79
+ all_files.extend([i["Key"] for i in page["Contents"]])
80
+ return [i for i in all_files if i.endswith(".raw")]
76
81
 
77
82
  def get_raw_files_csv(
78
- self,
79
- ship_name,
80
- cruise_name,
81
- sensor_name,
83
+ self,
84
+ ship_name,
85
+ cruise_name,
86
+ sensor_name,
82
87
  ):
83
- raw_files = self.get_raw_files(ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name)
88
+ raw_files = self.get_raw_files(
89
+ ship_name=ship_name, cruise_name=cruise_name, sensor_name=sensor_name
90
+ )
84
91
  files_list = [
85
92
  {
86
- 'ship_name': ship_name,
87
- 'cruise_name': cruise_name,
88
- 'sensor_name': sensor_name,
89
- 'file_name': os.path.basename(raw_file)
90
- } for raw_file in raw_files
93
+ "ship_name": ship_name,
94
+ "cruise_name": cruise_name,
95
+ "sensor_name": sensor_name,
96
+ "file_name": os.path.basename(raw_file),
97
+ }
98
+ for raw_file in raw_files
91
99
  ]
92
100
  df = pd.DataFrame(files_list)
93
- df.to_csv(f'{ship_name}_{cruise_name}.csv', index=False, header=False, sep=' ')
94
- print('done')
95
-
101
+ df.to_csv(f"{ship_name}_{cruise_name}.csv", index=False, header=False, sep=" ")
102
+ print("done")
96
103
 
97
104
  #################################################################
98
- def get_subset_ek60_prefix(
99
- self,
100
- df: pd.DataFrame
101
- ) -> pd.DataFrame:
105
+ def get_subset_ek60_prefix(self, df: pd.DataFrame) -> pd.DataFrame:
102
106
  # Returns all objects with 'EK60' in prefix of file path
103
107
  # Note that this can include 'EK80' data that are false-positives
104
108
  # in dataframe with ['key', 'filename', 'ship', 'cruise', 'sensor', 'size', 'date', 'datagram']
@@ -107,56 +111,68 @@ class IndexManager:
107
111
  for row in df.itertuples():
108
112
  row_split = row[1].split(os.sep)
109
113
  if len(row_split) == 6:
110
- filename = os.path.basename(row[1]) # 'EX1608_EK60-D20161205-T040300.raw'
114
+ filename = os.path.basename(
115
+ row[1]
116
+ ) # 'EX1608_EK60-D20161205-T040300.raw'
111
117
  if filename.endswith(".raw"):
112
- ship_name, cruise_name, sensor_name = row_split[2:5] # 'Okeanos_Explorer', 'EX1608', 'EK60'
113
- if re.search("[D](\d{8})", filename) is not None and re.search("[T](\d{6})", filename) is not None:
118
+ ship_name, cruise_name, sensor_name = row_split[
119
+ 2:5
120
+ ] # 'Okeanos_Explorer', 'EX1608', 'EK60'
121
+ if (
122
+ re.search("[D](\d{8})", filename) is not None
123
+ and re.search("[T](\d{6})", filename) is not None
124
+ ):
114
125
  # Parse date if possible e.g.: 'data/raw/Henry_B._Bigelow/HB1006/EK60/HBB-D20100723-T025105.raw'
115
126
  # and 'data/raw/Henry_B._Bigelow/HB1802/EK60/D20180513-T150250.raw'
116
127
  date_substring = re.search("[D](\d{8})", filename).group(1)
117
128
  time_substring = re.search("[T](\d{6})", filename).group(1)
118
- date_string = datetime.strptime(f'{date_substring}{time_substring}', '%Y%m%d%H%M%S')
129
+ date_string = datetime.strptime(
130
+ f"{date_substring}{time_substring}", "%Y%m%d%H%M%S"
131
+ )
119
132
  else: # otherwise use current date
120
133
  date_string = f"{datetime.utcnow().isoformat()[:19]}Z"
121
134
  objects.append(
122
135
  {
123
- 'KEY': row[1],
124
- 'FILENAME': filename,
125
- 'SHIP': ship_name,
126
- 'CRUISE': cruise_name,
127
- 'SENSOR': sensor_name,
128
- 'SIZE': row[2],
129
- 'DATE': date_string,
130
- 'DATAGRAM': None
136
+ "KEY": row[1],
137
+ "FILENAME": filename,
138
+ "SHIP": ship_name,
139
+ "CRUISE": cruise_name,
140
+ "SENSOR": sensor_name,
141
+ "SIZE": row[2],
142
+ "DATE": date_string,
143
+ "DATAGRAM": None,
131
144
  }
132
145
  )
133
146
  return pd.DataFrame(objects)
134
147
 
135
148
  #################################################################
136
- def scan_datagram(
137
- self,
138
- select_key: str
139
- ) -> list:
149
+ def scan_datagram(self, select_key: str) -> list:
140
150
  # Reads the first 8 bytes of S3 file. Used to determine if ek60 or ek80
141
151
  # Note: uses boto3 session instead of boto3 client: https://github.com/boto/boto3/issues/801
142
152
  # select_key = 'data/raw/Albatross_Iv/AL0403/EK60/L0005-D20040302-T200108-EK60.raw'
143
153
  s3_resource = self.s3_manager.s3_resource
144
- obj = s3_resource.Object(bucket_name=self.input_bucket_name, key=select_key) # XML0
145
- first_datagram = obj.get(Range='bytes=3-7')['Body'].read().decode().strip('\x00')
154
+ obj = s3_resource.Object(
155
+ bucket_name=self.input_bucket_name, key=select_key
156
+ ) # XML0
157
+ first_datagram = (
158
+ obj.get(Range="bytes=3-7")["Body"].read().decode().strip("\x00")
159
+ )
146
160
  # return [{'KEY': select_key, 'DATAGRAM': first_datagram}]
147
161
  ### EK60 data are denoted by 'CON0' ###
148
162
  return first_datagram
149
163
 
150
164
  #################################################################
151
- def get_subset_datagrams(
152
- self,
153
- df: pd.DataFrame
154
- ) -> list:
165
+ def get_subset_datagrams(self, df: pd.DataFrame) -> list:
155
166
  print("getting subset of datagrams")
156
- select_keys = list(df[['KEY', 'CRUISE']].drop_duplicates(subset='CRUISE')['KEY'].values)
167
+ select_keys = list(
168
+ df[["KEY", "CRUISE"]].drop_duplicates(subset="CRUISE")["KEY"].values
169
+ )
157
170
  all_datagrams = []
158
171
  with ThreadPoolExecutor(max_workers=self.max_pool_connections) as executor:
159
- futures = [executor.submit(self.scan_datagram, select_key) for select_key in select_keys]
172
+ futures = [
173
+ executor.submit(self.scan_datagram, select_key)
174
+ for select_key in select_keys
175
+ ]
160
176
  for future in as_completed(futures):
161
177
  result = future.result()
162
178
  if result:
@@ -165,20 +181,22 @@ class IndexManager:
165
181
 
166
182
  #################################################################
167
183
  def get_ek60_objects(
168
- self,
169
- df: pd.DataFrame,
170
- subset_datagrams: list
184
+ self, df: pd.DataFrame, subset_datagrams: list
171
185
  ) -> pd.DataFrame:
172
186
  # for each key write datagram value to all other files in same cruise
173
187
  for subset_datagram in subset_datagrams:
174
- if subset_datagram['DATAGRAM'] == 'CON0':
175
- select_cruise = df.loc[df['KEY'] == subset_datagram['KEY']]['CRUISE'].iloc[0]
176
- df.loc[df['CRUISE'] == select_cruise, ['DATAGRAM']] = subset_datagram['DATAGRAM']
177
- return df.loc[df['DATAGRAM'] == 'CON0']
188
+ if subset_datagram["DATAGRAM"] == "CON0":
189
+ select_cruise = df.loc[df["KEY"] == subset_datagram["KEY"]][
190
+ "CRUISE"
191
+ ].iloc[0]
192
+ df.loc[df["CRUISE"] == select_cruise, ["DATAGRAM"]] = subset_datagram[
193
+ "DATAGRAM"
194
+ ]
195
+ return df.loc[df["DATAGRAM"] == "CON0"]
178
196
 
179
197
  #################################################################
180
198
  def get_calibration_information( # tested
181
- self,
199
+ self,
182
200
  ) -> pd.DataFrame:
183
201
  # Calibration data generated by data manager currently located here:
184
202
  # https://noaa-wcsd-pds-index.s3.amazonaws.com/calibrated_crusies.csv
@@ -186,14 +204,20 @@ class IndexManager:
186
204
  # [1] Calibrated w/ calibration data
187
205
  # [2] Calibrated w/o calibration data
188
206
  # [3] uncalibrated
189
- response = self.s3_manager.get_object(bucket_name=self.calibration_bucket, key_name=self.calibration_key)
207
+ response = self.s3_manager.get_object(
208
+ bucket_name=self.calibration_bucket, key_name=self.calibration_key
209
+ )
190
210
  calibration_statuses = pd.read_csv(response.get("Body"))
191
- calibration_statuses['DATASET_NAME'] = calibration_statuses['DATASET_NAME'].apply(lambda x: x.split('_EK60')[0])
192
- calibration_statuses['CAL_STATE'] = calibration_statuses['CAL_STATE'].apply(lambda x: x.find('Calibrated') >= 0)
211
+ calibration_statuses["DATASET_NAME"] = calibration_statuses[
212
+ "DATASET_NAME"
213
+ ].apply(lambda x: x.split("_EK60")[0])
214
+ calibration_statuses["CAL_STATE"] = calibration_statuses["CAL_STATE"].apply(
215
+ lambda x: x.find("Calibrated") >= 0
216
+ )
193
217
  return calibration_statuses
194
218
 
195
219
  #################################################################
196
- # def index_manager( # TODO: get rid of this?
220
+ # def index( # TODO: get rid of this?
197
221
  # self
198
222
  # ):
199
223
  # start_time = datetime.now() # used for benchmarking
@@ -0,0 +1,3 @@
1
+ from .zarr_manager import ZarrManager
2
+
3
+ __all__ = ["ZarrManager"]