pydartdiags 0.0.42__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry.
- pydartdiags/matplots/__init__.py +0 -0
- pydartdiags/matplots/matplots.py +243 -0
- pydartdiags/obs_sequence/obs_sequence.py +712 -350
- pydartdiags/plots/plots.py +163 -133
- pydartdiags/stats/__init__.py +0 -0
- pydartdiags/stats/stats.py +323 -0
- pydartdiags-0.5.0.dist-info/METADATA +49 -0
- pydartdiags-0.5.0.dist-info/RECORD +14 -0
- {pydartdiags-0.0.42.dist-info → pydartdiags-0.5.0.dist-info}/WHEEL +1 -1
- pydartdiags-0.0.42.dist-info/METADATA +0 -404
- pydartdiags-0.0.42.dist-info/RECORD +0 -10
- {pydartdiags-0.0.42.dist-info → pydartdiags-0.5.0.dist-info}/LICENSE +0 -0
- {pydartdiags-0.0.42.dist-info → pydartdiags-0.5.0.dist-info}/top_level.txt +0 -0
--- pydartdiags/obs_sequence/obs_sequence.py (0.0.42)
+++ pydartdiags/obs_sequence/obs_sequence.py (0.5.0)
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 import pandas as pd
 import datetime as dt
 import numpy as np
@@ -5,75 +6,163 @@ import os
 import yaml
 import struct
 
+
+def requires_assimilation_info(func):
+    def wrapper(self, *args, **kwargs):
+        if self.has_assimilation_info:
+            return func(self, *args, **kwargs)
+        else:
+            raise ValueError(
+                "Assimilation information is required to call this function."
+            )
+
+    return wrapper
+
+
+def requires_posterior_info(func):
+    def wrapper(self, *args, **kwargs):
+        if self.has_posterior:
+            return func(self, *args, **kwargs)
+        else:
+            raise ValueError("Posterior information is required to call this function.")
+
+    return wrapper
+
+
 class obs_sequence:
-    """
+    """
+    Initialize an obs_sequence object from an ASCII or binary observation sequence file,
+    or create an empty obs_sequence object from scratch.
+
+    Args:
+        file (str): The input observation sequence ASCII or binary file.
+            If None, an empty obs_sequence object is created from scratch.
+
+    Returns:
+        An obs_sequence object
 
     Attributes:
-        df (pandas.DataFrame): DataFrame containing
-
-
-
-        types (dict): Dictionary of types in the observation sequence file.
-        copie_names (list): Names of copies in the observation sequence file.
-            Spelled 'copie' to avoid conflict with the Python built-in copy function.
+        df (pandas.DataFrame): The DataFrame containing the observation sequence data.
+        header (list): The header of the observation sequence.
+        copie_names (list): The names of the copies in the observation sequence.
+            Spelled 'copie' to avoid conflict with the Python built-in 'copy'.
             Spaces are replaced with underscores in copie_names.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        non_qc_copie_names (list): The names of the copies not including quality control,
+            e.g. observation, mean, ensemble_members
+        qc_copie_names (list): The names of the quality control copies, e.g. DART_QC
+        n_copies(int): The total number of copies in the observation sequence.
+        n_non_qc(int): The number of copies not including quality control.
+        n_qc(int): The number of quality control copies.
+        vert (dict): A dictionary mapping DART vertical coordinate types to their
+            corresponding integer values.
+
+            - undefined: 'VERTISUNDEF'
+            - surface: 'VERTISSURFACE' (value is surface elevation in meters)
+            - model level: 'VERTISLEVEL'
+            - pressure: 'VERTISPRESSURE' (in Pascals)
+            - height: 'VERTISHEIGHT' (in meters)
+            - scale height: 'VERTISSCALEHEIGHT' (unitless)
+        loc_mod (str): The location model, either 'loc3d' or 'loc1d'.
+            For 3D sphere models: latitude and longitude are in degrees in the DataFrame.
+        types (dict): Dictionary of types of observations the observation sequence,
+            e.g. {23: 'ACARS_TEMPERATURE'},
+        reverse_types (dict): Dictionary of types with keys and values reversed, e.g
+            {'ACARS_TEMPERATURE': 23}
+        synonyms_for_obs (list): List of synonyms for the observation column in the DataFrame.
+            The defualt list is
+
+            .. code-block:: python
+
+                [ 'NCEP BUFR observation',
+                'AIRS observation',
+                'GTSPP observation',
+                'SST observation',
+                'observations',
+                'WOD observation']
+
+            You can add more synonyms by providing a list of strings when
+            creating the obs_sequence object.
+
+            .. code-block:: python
+
+                obs_sequence(file, synonyms=['synonym1', 'synonym2']).df
+
+        has_assimilation_info (bool): Indicates if assimilation information is present.
+        has_posterior (bool): Indicates if posterior information is present.
+        seq (generator): Generator of observations from the observation sequence file.
+        all_obs (list): List of all observations, each observation is a list.
+            Valid when the obs_sequence is created from a file.
+            Set to None when the obs_sequence is created from scratch or multiple
+            obs_sequences are joined.
     """
-
-
-
-
-
-
-
-
-
-        -1: 'surface (m)',
-        1: 'model level',
-        2: 'pressure (Pa)',
-        3: 'height (m)',
-        4: 'scale height' }
+
+    vert = {
+        -2: "undefined",
+        -1: "surface (m)",
+        1: "model level",
+        2: "pressure (Pa)",
+        3: "height (m)",
+        4: "scale height",
+    }
 
     reversed_vert = {value: key for key, value in vert.items()}
 
-
     def __init__(self, file, synonyms=None):
-
+        """
+        Create an obs_sequence object from an ASCII or binary observation sequence file,
+        or create an empty obs_sequence object from scratch.
+
+        Args:
+            file (str): The input observation sequence ASCII or binary file.
+                If None, an empty obs_sequence object is created from scratch.
+            synonyms (list, optional): List of synonyms for the observation column in the DataFrame.
+
+        Returns:
+            an obs_sequence object
+
+        Examples:
+
+            .. code-block:: python
+
+                obs_seq = obs_sequence(file='obs_seq.final')
+
+        """
+
+        self.loc_mod = "None"
+        self.has_assimilation_info = False
+        self.has_posterior = False
         self.file = file
-        self.synonyms_for_obs = [
-
-
-
-
-
+        self.synonyms_for_obs = [
+            "NCEP BUFR observation",
+            "AIRS observation",
+            "GTSPP observation",
+            "SST observation",
+            "observations",
+            "WOD observation",
+        ]
         if synonyms:
             if isinstance(synonyms, list):
                 self.synonyms_for_obs.extend(synonyms)
             else:
                 self.synonyms_for_obs.append(synonyms)
 
+        if file is None:
+            # Early exit - for testing purposes or creating obs_seq objects from scratch
+            self.df = pd.DataFrame()
+            self.types = {}
+            self.reverse_types = {}
+            self.copie_names = []
+            self.non_qc_copie_names = []
+            self.qc_copie_names = []
+            self.n_copies = 0  # copies including qc
+            self.n_non_qc = 0  # copies not including qc
+            self.n_qc = 0  # number of qc copies
+            self.seq = []
+            self.all_obs = []
+            return
+
         module_dir = os.path.dirname(__file__)
-        self.default_composite_types = os.path.join(module_dir,"composite_types.yaml")
+        self.default_composite_types = os.path.join(module_dir, "composite_types.yaml")
 
         if self.is_binary(file):
            self.header = self.read_binary_header(file)
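Note on the decorators added above: `requires_assimilation_info` and `requires_posterior_info` gate methods on the `has_assimilation_info` and `has_posterior` flags that `__init__` now sets. A minimal sketch of the resulting behavior, using the new `file=None` constructor path (the guarded method called here is one added later in this diff):

    from pydartdiags.obs_sequence.obs_sequence import obs_sequence

    # Created from scratch: nothing was parsed, so has_assimilation_info is False.
    empty = obs_sequence(file=None)

    try:
        empty.select_failed_qcs()  # wrapped with @requires_assimilation_info
    except ValueError as err:
        print(err)  # "Assimilation information is required to call this function."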
@@ -83,35 +172,47 @@ class obs_sequence:
         self.types = self.collect_obs_types(self.header)
         self.reverse_types = {v: k for k, v in self.types.items()}
         self.copie_names, self.n_copies = self.collect_copie_names(self.header)
+        self.n_non_qc, self.n_qc = self.num_qc_non_qc(self.header)
+        self.non_qc_copie_names = self.copie_names[: self.n_non_qc]
+        self.qc_copie_names = self.copie_names[self.n_non_qc :]
 
         if self.is_binary(file):
             self.seq = self.obs_binary_reader(file, self.n_copies)
-            self.loc_mod = 'loc3d'
+            self.loc_mod = "loc3d"  # only loc3d supported for binary, & no way to check
         else:
             self.seq = self.obs_reader(file, self.n_copies)
 
-        self.all_obs = self.create_all_obs()
+        self.all_obs = self.create_all_obs()  # uses up the generator
         # at this point you know if the seq is loc3d or loc1d
-        if self.loc_mod == 'None':
-            raise ValueError("Neither 'loc3d' nor 'loc1d' could be found in the observation sequence.")
+        if self.loc_mod == "None":
+            raise ValueError(
+                "Neither 'loc3d' nor 'loc1d' could be found in the observation sequence."
+            )
         self.columns = self.column_headers()
-        self.df = pd.DataFrame(self.all_obs, columns=self.columns)
-        if self.loc_mod == 'loc3d':
-            self.df['longitude'] = np.rad2deg(self.df['longitude'])
-            self.df['latitude'] = np.rad2deg(self.df['latitude'])
+        self.df = pd.DataFrame(self.all_obs, columns=self.columns)
+        if self.loc_mod == "loc3d":
+            self.df["longitude"] = np.rad2deg(self.df["longitude"])
+            self.df["latitude"] = np.rad2deg(self.df["latitude"])
         # rename 'X observation' to observation
-        self.synonyms_for_obs = [synonym.replace(' ', '_') for synonym in self.synonyms_for_obs]
-        rename_dict = {old: 'observation' for old in self.synonyms_for_obs if old in self.df.columns}
+        self.synonyms_for_obs = [
+            synonym.replace(" ", "_") for synonym in self.synonyms_for_obs
+        ]
+        rename_dict = {
+            old: "observation"
+            for old in self.synonyms_for_obs
+            if old in self.df.columns
+        }
         self.df = self.df.rename(columns=rename_dict)
-
-        if
-
-        self.
-
+
+        # check if the assimilation info is present
+        if "prior_ensemble_mean".casefold() in map(str.casefold, self.columns):
+            self.has_assimilation_info = True
+        if "posterior_ensemble_mean".casefold() in map(str.casefold, self.columns):
+            self.has_posterior = True
 
     def create_all_obs(self):
-        """
-
+        """steps through the generator to create a
+        list of all observations in the sequence
         """
         all_obs = []
         for obs in self.seq:
@@ -120,71 +221,110 @@ class obs_sequence:
         return all_obs
 
     def obs_to_list(self, obs):
-        """put single observation into a list
-
-        discards obs_def
-        """
+        """put single observation into a list"""
         data = []
-        data.append(obs[0].split()[1])
-        data.extend(list(map(float,obs[1:self.n_copies+1])))
-        data.append(obs[self.n_copies+1])
+        data.append(obs[0].split()[1])  # obs_num
+        data.extend(list(map(float, obs[1 : self.n_copies + 1])))  # all the copies
+        data.append(obs[self.n_copies + 1])  # linked list info
         try:  # HK todo only have to check loc3d or loc1d for the first observation, the whole file is the same
-            locI = obs.index('loc3d')
-            location = obs[locI+1].split()
+            locI = obs.index("loc3d")
+            location = obs[locI + 1].split()
             data.append(float(location[0]))  # location x
             data.append(float(location[1]))  # location y
             data.append(float(location[2]))  # location z
             data.append(obs_sequence.vert[int(location[3])])
-            self.loc_mod = 'loc3d'
+            self.loc_mod = "loc3d"
         except ValueError:
             try:
-                locI = obs.index('loc1d')
-                location = obs[locI+1]
-                data.append(float(location)) # 1d location
-                self.loc_mod = 'loc1d'
+                locI = obs.index("loc1d")
+                location = obs[locI + 1]
+                data.append(float(location))  # 1d location
+                self.loc_mod = "loc1d"
             except ValueError:
-                raise ValueError("Neither 'loc3d' nor 'loc1d' could be found in the observation sequence.")
-        typeI = obs.index('kind')
+                raise ValueError(
+                    "Neither 'loc3d' nor 'loc1d' could be found in the observation sequence."
+                )
+        typeI = obs.index("kind")  # type of observation
         type_value = obs[typeI + 1]
         if not self.types:
-            data.append('Identity')
+            data.append("Identity")
         else:
-            data.append(self.types[type_value])
-
+            data.append(self.types[type_value])  # observation type
+
         # any observation specific obs def info is between here and the end of the list
+        # can be obs_def & external forward operator
+        metadata = obs[typeI + 2 : -2]
+        obs_def_metadata, external_metadata = self.split_metadata(metadata)
+        data.append(obs_def_metadata)
+        data.append(external_metadata)
+
         time = obs[-2].split()
-        data.append(int(time[0]))
-        data.append(int(time[1]))
-        data.append(convert_dart_time(int(time[0]), int(time[1])))
-
-
+        data.append(int(time[0]))  # seconds
+        data.append(int(time[1]))  # days
+        data.append(
+            convert_dart_time(int(time[0]), int(time[1]))
+        )  # datetime # HK todo what is approprate for 1d models?
+        data.append(float(obs[-1]))  # obs error variance ?convert to sd?
+
         return data
 
+    @staticmethod
+    def split_metadata(metadata):
+        """
+        Split the metadata list at the first occurrence of an element starting with 'externalF0'.
+
+        Args:
+            metadata (list of str): The metadata list to be split.
+
+        Returns:
+            tuple: Two sublists, the first containing elements before 'externalF0', and the second
+                containing 'externalF0' and all elements after it. If 'externalF0' is not found,
+                the first sublist contains the entire metadata list, and the second is empty.
+        """
+        for i, item in enumerate(metadata):
+            if item.startswith("external_FO"):
+                return metadata[:i], metadata[i:]
+        return metadata, []
+
     def list_to_obs(self, data):
+        """convert a list of data to an observation
+
+        Assuming the order of the list is obs_seq.copie_names
+
+        """
         obs = []
-        obs.append('OBS ' + str(data[0]))
-        obs.extend(data[1:self.n_copies+1]) # all the copies
-        obs.append(data[self.n_copies+1]) # linked list info
-        obs.append('obdef')
+        obs.append("OBS " + str(data[0]))  # obs_num lots of space
+        obs.extend(data[1 : self.n_copies + 1])  # all the copies
+        obs.append(data[self.n_copies + 1])  # linked list info
+        obs.append("obdef")  # TODO HK: extended_FO obs_def
         obs.append(self.loc_mod)
-        if self.loc_mod == 'loc3d':
-            obs.append(' '.join(map(str, data[self.n_copies+2:self.n_copies+5])) + ' ' + str(self.reversed_vert[data[self.n_copies+5]])) # location x, y, z, vert
-            obs.append('kind')
+        if self.loc_mod == "loc3d":
+            obs.append(
+                " ".join(map(str, data[self.n_copies + 2 : self.n_copies + 5]))
+                + " "
+                + str(self.reversed_vert[data[self.n_copies + 5]])
+            )  # location x, y, z, vert
+            obs.append("kind")  # this is type of observation
             obs.append(self.reverse_types[data[self.n_copies + 6]])  # observation type
-        elif self.loc_mod == 'loc1d':
-            obs.append(data[self.n_copies+2]) # 1d location
-            obs.append('kind')
+            # Convert metadata to a string and append !HK @todo you are not converting to string
+            obs.extend(data[self.n_copies + 7])  # metadata
+            obs.extend(data[self.n_copies + 8])  # external forward operator
+        elif self.loc_mod == "loc1d":
+            obs.append(data[self.n_copies + 2])  # 1d location
+            obs.append("kind")  # this is type of observation
             obs.append(self.reverse_types[data[self.n_copies + 3]])  # observation type
-        obs.append(' '.join(map(str, data[-4:-2]))) # seconds, days
+            obs.extend(data[self.n_copies + 4])  # metadata
+            obs.extend(data[self.n_copies + 5])  # external forward operator
+        obs.append(" ".join(map(str, data[-4:-2])))  # seconds, days
         obs.append(data[-1])  # obs error variance
 
         return obs
 
     @staticmethod
     def generate_linked_list_pattern(n):
-        """Create a list of strings with the linked list pattern for n
+        """Create a list of strings with the linked list pattern for n observations."""
         result = []
-        for i in range(n-1):
+        for i in range(n - 1):
             col1 = i if i > 0 else -1
             col2 = i + 2
             col3 = -1
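Note: the new `split_metadata` helper separates per-observation obs_def metadata from external forward-operator entries by looking for the first element that starts with 'external_FO'. A small illustration (the metadata strings below are hypothetical, not taken from a real obs_seq file):

    meta = ["obs_def_info", "external_FO1", "QTY_EXAMPLE"]
    before, after = obs_sequence.split_metadata(meta)
    # before == ["obs_def_info"]
    # after  == ["external_FO1", "QTY_EXAMPLE"]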
@@ -192,108 +332,162 @@ class obs_sequence:
         result.append(f"{n-1:<12}{'-1':<11}{'-1'}")
         return result
 
-    def write_obs_seq(self, file, df=None):
+    def write_obs_seq(self, file):
         """
         Write the observation sequence to a file.
-
-        This function writes the observation sequence to
-
-
-
-
-
-
-
-        Parameters:
+
+        This function writes the observation sequence stored in the obs_seq.DataFrame to a specified file.
+        It updates the header with the number of observations, converts coordinates back to radians
+        if necessary, drops unnecessary columns, sorts the DataFrame by time, and generates a linked
+        list pattern for reading by DART programs.
+
+        Args:
             file (str): The path to the file where the observation sequence will be written.
-            df (pandas.DataFrame, optional): A DataFrame containing the observation data. If not provided, the function uses self.header and self.all_obs.
-
-        Returns:
-            None
-
-        Examples:
-            ``obs_seq.write_obs_seq('/path/to/output/file')``
-            ``obs_seq.write_obs_seq('/path/to/output/file', df=obs_seq.df)``
-        """
-        with open(file, 'w') as f:
-
-            if df is not None:
-                # If a DataFrame is provided, update the header with the number of observations
-                num_rows = len(df)
-                replacement_string = f'num_obs: {num_rows:>10} max_num_obs: {num_rows:>10}'
-                new_header = [replacement_string if 'num_obs' in element else element for element in self.header]
-
-                for line in new_header[:-1]:
-                    f.write(str(line) + '\n')
-                first = 1
-                f.write(f"first: {first:>12} last: {num_rows:>12}\n")
-
-                # TODO HK is there something better than copying the whole thing here?
-                df_copy = df.copy() # copy since you want to change for writing.
-                # back to radians for obs_seq
-                if self.loc_mod == 'loc3d':
-                    df_copy['longitude'] = np.deg2rad(self.df['longitude'])
-                    df_copy['latitude'] = np.deg2rad(self.df['latitude'])
-                if 'bias' in df_copy.columns:
-                    df_copy = df_copy.drop(columns=['bias', 'sq_err'])
-
-                # linked list for reading by dart programs
-                df_copy = df_copy.sort_values(by=['time']) # sort the DataFrame by time
-                df_copy['obs_num'] = df.index + 1 # obs_num in time order
-                df_copy['linked_list'] = obs_sequence.generate_linked_list_pattern(len(df_copy)) # linked list pattern
-
-                def write_row(row):
-                    ob_write = self.list_to_obs(row.tolist())
-                    for line in ob_write:
-                        f.write(str(line) + '\n')
-
-                df_copy.apply(write_row, axis=1)
-
-            else:
-                # If no DataFrame is provided, use self.header and self.all_obs
-                for line in self.header:
-                    f.write(str(line) + '\n')
-                for obs in self.all_obs:
-                    ob_write = self.list_to_obs(obs)
-                    for line in ob_write:
-                        f.write(str(line) + '\n')
 
+        Notes:
+            - Longitude and latitude are converted back to radians if the location model is 'loc3d'.
+            - The 'bias' and 'sq_err' columns are dropped if they exist in the DataFrame.
+            - The DataFrame is sorted by the 'time' column.
+            - An 'obs_num' column is added to the DataFrame to number the observations in time order.
+            - A 'linked_list' column is generated to create a linked list pattern for the observations.
+
+        Example:
+            obsq.write_obs_seq('obs_seq.new')
+
+        """
+        with open(file, "w") as f:
+
+            # If a DataFrame is provided, update the header with the number of observations
+            num_rows = len(self.df)
+            replacement_string = f"num_obs: {num_rows:>10} max_num_obs: {num_rows:>10}"
+            new_header = [
+                replacement_string if "num_obs" in element else element
+                for element in self.header
+            ]
+
+            for line in new_header[:-1]:
+                f.write(str(line) + "\n")
+            first = 1
+            f.write(f"first: {first:>12} last: {num_rows:>12}\n")
+
+            # TODO HK is there something better than copying the whole thing here?
+            df_copy = self.df.copy()  # copy since you want to change for writing.
+            # back to radians for obs_seq
+            if self.loc_mod == "loc3d":
+                df_copy["longitude"] = np.deg2rad(self.df["longitude"]).round(16)
+                df_copy["latitude"] = np.deg2rad(self.df["latitude"]).round(16)
+            if "bias" in df_copy.columns:
+                df_copy = df_copy.drop(columns=["bias", "sq_err"])
+
+            # linked list for reading by dart programs
+            df_copy = df_copy.sort_values(
+                by=["time"], kind="stable"
+            )  # sort the DataFrame by time
+            df_copy["obs_num"] = self.df.index + 1  # obs_num in time order
+            df_copy["linked_list"] = obs_sequence.generate_linked_list_pattern(
+                len(df_copy)
+            )  # linked list pattern
+
+            def write_row(row):
+                ob_write = self.list_to_obs(row.tolist())
+                for line in ob_write:
+                    f.write(str(line) + "\n")
+
+            df_copy.apply(write_row, axis=1)
 
     def column_headers(self):
-        """define the columns for the dataframe
+        """define the columns for the dataframe"""
         heading = []
-        heading.append('obs_num')
+        heading.append("obs_num")
         heading.extend(self.copie_names)
-        heading.append('linked_list')
-        if self.loc_mod == 'loc3d':
-            heading.append('longitude')
-            heading.append('latitude')
-            heading.append('vertical')
-            heading.append('vert_unit')
-        elif self.loc_mod == 'loc1d':
-            heading.append('location')
-        heading.append('type')
-        heading.append('seconds')
-        heading.append('days')
-        heading.append('time')
-        heading.append('obs_err_var')
+        heading.append("linked_list")
+        if self.loc_mod == "loc3d":
+            heading.append("longitude")
+            heading.append("latitude")
+            heading.append("vertical")
+            heading.append("vert_unit")
+        elif self.loc_mod == "loc1d":
+            heading.append("location")
+        heading.append("type")
+        heading.append("metadata")
+        heading.append("external_FO")
+        heading.append("seconds")
+        heading.append("days")
+        heading.append("time")
+        heading.append("obs_err_var")
         return heading
 
+    @requires_assimilation_info
+    def select_by_dart_qc(self, dart_qc):
+        """
+        Selects rows from a DataFrame based on the DART quality control flag.
+
+        Args:
+            df (DataFrame): A pandas DataFrame.
+            dart_qc (int): The DART quality control flag to select.
+
+        Returns:
+            DataFrame: A DataFrame containing only the rows with the specified DART quality control flag.
+
+        Raises:
+            ValueError: If the DART quality control flag is not present in the DataFrame.
+        """
+        if dart_qc not in self.df["DART_quality_control"].unique():
+            raise ValueError(
+                f"DART quality control flag '{dart_qc}' not found in DataFrame."
+            )
+        else:
+            return self.df[self.df["DART_quality_control"] == dart_qc]
+
+    @requires_assimilation_info
+    def select_failed_qcs(self):
+        """
+        Select rows from the DataFrame where the DART quality control flag is greater than 0.
+
+        Returns:
+            pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag greater than 0.
+        """
+        return self.df[self.df["DART_quality_control"] > 0]
+
+    @requires_assimilation_info
+    def possible_vs_used(self):
+        """
+        Calculates the count of possible vs. used observations by type.
+
+        This function takes a DataFrame containing observation data, including a 'type' column for the observation
+        type and an 'observation' column. The number of used observations ('used'), is the total number
+        minus the observations that failed quality control checks (as determined by the `select_failed_qcs` function).
+        The result is a DataFrame with each observation type, the count of possible observations, and the count of
+        used observations.
+
+        Returns:
+            pd.DataFrame: A DataFrame with three columns: 'type', 'possible', and 'used'. 'type' is the observation type,
+            'possible' is the count of all observations of that type, and 'used' is the count of observations of that type
+            that passed quality control checks.
+        """
+        possible = self.df.groupby("type")["observation"].count()
+        possible.rename("possible", inplace=True)
+
+        failed_qcs = self.select_failed_qcs().groupby("type")["observation"].count()
+        used = possible - failed_qcs.reindex(possible.index, fill_value=0)
+        used.rename("used", inplace=True)
+
+        return pd.concat([possible, used], axis=1).reset_index()
+
     @staticmethod
     def is_binary(file):
         """Check if a file is binary file."""
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             chunk = f.read(1024)
-            if b'\0' in chunk:
+            if b"\0" in chunk:
                 return True
             return False
 
-
     @staticmethod
     def read_header(file):
         """Read the header and number of lines in the header of an ascii obs_seq file"""
         header = []
-        with open(file, 'r') as f:
+        with open(file, "r") as f:
             for line in f:
                 if "first:" in line and "last:" in line:
                     header.append(line.strip())
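Note: the QC helpers that were module-level functions taking a DataFrame in 0.0.42 are now methods operating on `self.df` (the old functions are deleted in a later hunk). A usage sketch, assuming an input file with DART QC copies (the file name is hypothetical):

    obs_seq = obs_sequence(file="obs_seq.final")

    failed = obs_seq.select_failed_qcs()    # rows with DART_quality_control > 0
    flagged = obs_seq.select_by_dart_qc(7)  # rows with one specific QC value
    counts = obs_seq.possible_vs_used()     # per-type 'possible' vs 'used' counts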
@@ -309,19 +503,19 @@ class obs_sequence:
         linecount = 0
         obs_types_definitions = -1000
         num_obs = 0
-        max_num_obs = 0
+        max_num_obs = 0
         # need to get:
         # number of obs_type_definitions
         # number of copies
         # number of qcs
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             while True:
                 # Read the record length
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
                 record = f.read(record_length)
-                if not record:
+                if not record:  # end of file
                     break
 
                 # Read the trailing record length (should match the leading one)
@@ -329,17 +523,19 @@
 
                 linecount += 1
 
-                if linecount == 3:
-                    obs_types_definitions = struct.unpack('i', record)[0]
-                    continue
+                if linecount == 3:
+                    obs_types_definitions = struct.unpack("i", record)[0]
+                    continue
 
-                if linecount == 4+obs_types_definitions:
-                    num_copies, num_qcs, num_obs, max_num_obs = struct.unpack('iiii', record)[:16]
+                if linecount == 4 + obs_types_definitions:
+                    num_copies, num_qcs, num_obs, max_num_obs = struct.unpack(
+                        "iiii", record
+                    )[:16]
                     break
-
+
             # Go back to the beginning of the file
             f.seek(0)
-
+
             for _ in range(2):
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
@@ -349,14 +545,14 @@ class obs_sequence:
                 if not record:  # end of file
                     break
 
-                obs_sequence.check_trailing_record_length(f, record_length)
-                header.append(record.decode('utf-8').strip())
+                obs_sequence.check_trailing_record_length(f, record_length)
+                header.append(record.decode("utf-8").strip())
 
             header.append(str(obs_types_definitions))
 
             # obs_types_definitions
-            for _ in range(3,4+obs_types_definitions):
-
+            for _ in range(3, 4 + obs_types_definitions):
+                # Read the record length
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
@@ -366,21 +562,24 @@ class obs_sequence:
                 if not record:  # end of file
                     break
 
-                obs_sequence.check_trailing_record_length(f, record_length)
+                obs_sequence.check_trailing_record_length(f, record_length)
 
                 if _ == 3:
-                    continue
+                    continue  # num obs_types_definitions
                 # Read an integer and a string from the record
-                integer_value = struct.unpack('i', record[:4])[0]
-                string_value = record[4:].decode('utf-8').strip()
+                integer_value = struct.unpack("i", record[:4])[0]
+                string_value = record[4:].decode("utf-8").strip()
                 header.append(f"{integer_value} {string_value}")
 
             header.append(f"num_copies: {num_copies} num_qc: {num_qcs}")
             header.append(f"num_obs: {num_obs} max_num_obs: {max_num_obs}")
-
-            #copie names
-            for _ in range(5+obs_types_definitions, 5+obs_types_definitions+num_copies+num_qcs+1):
-
+
+            # copie names
+            for _ in range(
+                5 + obs_types_definitions,
+                5 + obs_types_definitions + num_copies + num_qcs + 1,
+            ):
+                # Read the record length
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
@@ -390,26 +589,26 @@ class obs_sequence:
                 if not record:
                     break
 
-                obs_sequence.check_trailing_record_length(f, record_length)
+                obs_sequence.check_trailing_record_length(f, record_length)
 
-                if _ == 5+obs_types_definitions:
+                if _ == 5 + obs_types_definitions:
                     continue
 
                 # Read the whole record as a string
-                string_value = record.decode('utf-8').strip()
+                string_value = record.decode("utf-8").strip()
                 header.append(string_value)
 
             # first and last obs
-            # Read the record length
+            # Read the record length
             record_length = obs_sequence.read_record_length(f)
 
             # Read the actual record
             record = f.read(record_length)
-
-            obs_sequence.check_trailing_record_length(f, record_length)
+
+            obs_sequence.check_trailing_record_length(f, record_length)
 
             # Read the whole record as a two integers
-            first, last = struct.unpack('ii', record)[:8]
+            first, last = struct.unpack("ii", record)[:8]
             header.append(f"first: {first} last: {last}")
 
             return header
@@ -418,7 +617,7 @@ class obs_sequence:
     def collect_obs_types(header):
         """Create a dictionary for the observation types in the obs_seq header"""
         num_obs_types = int(header[2])
-        types = dict([x.split() for x in header[3:num_obs_types+3]])
+        types = dict([x.split() for x in header[3 : num_obs_types + 3]])
         return types
 
     @staticmethod
@@ -426,32 +625,45 @@ class obs_sequence:
         """
         Extracts the names of the copies from the header of an obs_seq file.
 
-
+        Args:
             header (list): A list of strings representing the lines in the header of the obs_seq file.
 
         Returns:
-            tuple: A tuple containing two elements:
-                - copie_names (list): A list of strings representing the copy names with underscores for spaces.
+            tuple: A tuple containing two elements:
+                - copie_names (list): A list of strings representing the copy names with underscores for spaces.
                 - len(copie_names) (int): The number of copy names.
         """
         for i, line in enumerate(header):
             if "num_obs:" in line and "max_num_obs:" in line:
-                first_copie = i+1
+                first_copie = i + 1
                 break
-        copie_names = ['_'.join(x.split()) for x in header[first_copie:-1]] # first and last is last line of header
+        copie_names = [
+            "_".join(x.split()) for x in header[first_copie:-1]
+        ]  # first and last is last line of header
         return copie_names, len(copie_names)
 
+    @staticmethod
+    def num_qc_non_qc(header):
+        """Find the number of qc and non-qc copies in the header"""
+        for line in header:
+            if "num_copies:" in line and "num_qc:" in line:
+                num_non_qc = int(line.split()[1])
+                num_qc = int(line.split()[3])
+                return num_non_qc, num_qc
+
     @staticmethod
     def obs_reader(file, n):
         """Reads the ascii obs sequence file and returns a generator of the obs"""
-        previous_line = ''
-        with open(file, 'r') as f:
+        previous_line = ""
+        with open(file, "r") as f:
             for line in f:
                 if "OBS" in line or "OBS" in previous_line:
                     if "OBS" in line:
                         obs = []
-                        obs.append(line.strip())
-                        for i in range(n+100): # number of copies + 100. Needs to be bigger than any metadata
+                        obs.append(line.strip())
+                        for i in range(
+                            n + 100
+                        ):  # number of copies + 100. Needs to be bigger than any metadata
                             try:
                                 next_line = next(f)
                             except:
@@ -464,11 +676,15 @@ class obs_sequence:
                             else:
                                 obs.append(next_line.strip())
                         yield obs
-                elif "OBS" in previous_line: # previous line is because I cannot use f.tell with next
+                elif (
+                    "OBS" in previous_line
+                ):  # previous line is because I cannot use f.tell with next
                     obs = []
-                    obs.append(previous_line.strip())
-                    obs.append(line.strip())
-                    for i in range(n+100): # number of copies + 100. Needs to be bigger than any metadata
+                    obs.append(previous_line.strip())
+                    obs.append(line.strip())
+                    for i in range(
+                        n + 100
+                    ):  # number of copies + 100. Needs to be bigger than any metadata
                         try:
                             next_line = next(f)
                         except:
@@ -485,19 +701,19 @@ class obs_sequence:
 
     @staticmethod
     def check_trailing_record_length(file, expected_length):
-
+        """Reads and checks the trailing record length from the binary file written by Fortran.
 
-
-
-
+        Args:
+            file (file): The file object.
+            expected_length (int): The expected length of the trailing record.
 
-
-
-
-
-
-
-
+        Assuming 4 bytes:
+        | Record Length (4 bytes) | Data (N bytes) | Trailing Record Length (4 bytes) |
+        """
+        trailing_record_length_bytes = file.read(4)
+        trailing_record_length = struct.unpack("i", trailing_record_length_bytes)[0]
+        if expected_length != trailing_record_length:
+            raise ValueError("Record length mismatch in Fortran binary file")
 
     @staticmethod
     def read_record_length(file):
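Note: the two static helpers above implement the standard Fortran unformatted record framing: a 4-byte record length, the payload, then the same 4-byte length repeated. A consolidated sketch of the pattern (not part of the package, just what the pair of methods does together):

    import struct

    def read_fortran_record(f):
        # | record length (4 bytes) | data (N bytes) | record length again (4 bytes) |
        length_bytes = f.read(4)
        if not length_bytes:
            return None  # end of file
        (length,) = struct.unpack("i", length_bytes)
        payload = f.read(length)
        (trailing,) = struct.unpack("i", f.read(4))
        if length != trailing:
            raise ValueError("Record length mismatch in Fortran binary file")
        return payload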
@@ -505,18 +721,17 @@ class obs_sequence:
         record_length_bytes = file.read(4)
         if not record_length_bytes:
             return None  # End of file
-        return struct.unpack('i', record_length_bytes)[0]
-
+        return struct.unpack("i", record_length_bytes)[0]
 
     def obs_binary_reader(self, file, n):
         """Reads the obs sequence binary file and returns a generator of the obs"""
         header_length = len(self.header)
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             # Skip the first len(obs_seq.header) lines
-            for _ in range(header_length-1):
+            for _ in range(header_length - 1):
                 # Read the record length
                 record_length = obs_sequence.read_record_length(f)
-                if record_length is None:
+                if record_length is None:  # End of file
                     break
 
                 # Skip the actual record
@@ -529,78 +744,78 @@ class obs_sequence:
             while True:
                 obs = []
                 obs_num += 1
-                obs.append(f"OBS {obs_num}")
-                for _ in range(n):
+                obs.append(f"OBS {obs_num}")
+                for _ in range(n):  # number of copies
                     # Read the record length
                     record_length = obs_sequence.read_record_length(f)
                     if record_length is None:
                         break
                     # Read the actual record (copie)
                     record = f.read(record_length)
-                    obs.append(struct.unpack('d', record)[0])
+                    obs.append(struct.unpack("d", record)[0])
 
                     # Read the trailing record length (should match the leading one)
                     obs_sequence.check_trailing_record_length(f, record_length)
-
+
                 # linked list info
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
 
                 record = f.read(record_length)
-                int1, int2, int3 = struct.unpack('iii', record[:12])
+                int1, int2, int3 = struct.unpack("iii", record[:12])
                 linked_list_string = f"{int1:<12} {int2:<10} {int3:<12}"
                 obs.append(linked_list_string)
 
                 obs_sequence.check_trailing_record_length(f, record_length)
 
                 # location (note no location header "loc3d" or "loc1d" for binary files)
-                obs.append('loc3d')
+                obs.append("loc3d")
                 record_length = obs_sequence.read_record_length(f)
                 record = f.read(record_length)
-                x,y,z,vert = struct.unpack('dddi', record[:28])
-                location_string = f"{x} {y} {z} {vert}"
-                obs.append(location_string)
+                x, y, z, vert = struct.unpack("dddi", record[:28])
+                location_string = f"{x} {y} {z} {vert}"
+                obs.append(location_string)
 
                 obs_sequence.check_trailing_record_length(f, record_length)
-
+
                 # kind (type of observation) value
-                obs.append('kind')
+                obs.append("kind")
                 record_length_bytes = f.read(4)
-                record_length = struct.unpack('i', record_length_bytes)[0]
+                record_length = struct.unpack("i", record_length_bytes)[0]
                 record = f.read(record_length)
                 kind = f"{struct.unpack('i', record)[0]}"
                 obs.append(kind)
-
+
                 obs_sequence.check_trailing_record_length(f, record_length)
 
                 # time (seconds, days)
                 record_length = obs_sequence.read_record_length(f)
                 record = f.read(record_length)
-                seconds, days = struct.unpack('ii', record)[:8]
+                seconds, days = struct.unpack("ii", record)[:8]
                 time_string = f"{seconds} {days}"
                 obs.append(time_string)
 
                 obs_sequence.check_trailing_record_length(f, record_length)
-
+
                 # obs error variance
                 record_length = obs_sequence.read_record_length(f)
                 record = f.read(record_length)
-                obs.append(struct.unpack('d', record)[0])
-
+                obs.append(struct.unpack("d", record)[0])
+
                 obs_sequence.check_trailing_record_length(f, record_length)
 
                 yield obs
 
-    def composite_types(self, composite_types='use_default'):
+    def composite_types(self, composite_types="use_default"):
         """
         Set up and construct composite types for the DataFrame.
 
-        This function sets up composite types based on a provided YAML configuration or
-        a default configuration. It constructs new composite rows by combining specified
+        This function sets up composite types based on a provided YAML configuration or
+        a default configuration. It constructs new composite rows by combining specified
         components and adds them to the DataFrame.
 
-
+        Args:
             composite_types (str, optional): The YAML configuration for composite types. If 'use_default', the default configuration is used. Otherwise, a custom YAML configuration can be provided.
 
         Returns:
@@ -610,12 +825,12 @@ class obs_sequence:
             Exception: If there are repeat values in the components.
         """
 
-        if composite_types == 'use_default':
+        if composite_types == "use_default":
             composite_yaml = self.default_composite_types
         else:
             composite_yaml = composite_types
-        self.composite_types_dict = load_yaml_to_dict(composite_yaml)
-
+        self.composite_types_dict = load_yaml_to_dict(composite_yaml)
+
         components = []
         for value in self.composite_types_dict.values():
             components.extend(value["components"])
@@ -623,31 +838,234 @@ class obs_sequence:
         if len(components) != len(set(components)):
             raise Exception("There are repeat values in components.")
 
-        df_comp = self.df[self.df['type'].str.upper().isin([component.upper() for component in components])]
-        df_no_comp = self.df[~self.df['type'].str.upper().isin([component.upper() for component in components])]
+        df_comp = self.df[
+            self.df["type"]
+            .str.upper()
+            .isin([component.upper() for component in components])
+        ]
+        df_no_comp = self.df[
+            ~self.df["type"]
+            .str.upper()
+            .isin([component.upper() for component in components])
+        ]
 
         for key in self.composite_types_dict:
-            df_new = construct_composit(df_comp, key, self.composite_types_dict[key]['components'])
+            df_new = construct_composit(
+                df_comp, key, self.composite_types_dict[key]["components"]
+            )
             df_no_comp = pd.concat([df_no_comp, df_new], axis=0)
 
         return df_no_comp
-
+
+    @classmethod
+    def join(cls, obs_sequences, copies=None):
+        """
+        Join a list of observation sequences together.
+
+        This method combines the headers and observations from a list of obs_sequence objects
+        into a single obs_sequence object.
+
+        Args:
+            obs_sequences (list of obs_sequences): The list of observation sequences objects to join.
+            copies (list of str, optional): A list of copy names to include in the combined data.
+                If not provided, all copies are included.
+
+        Returns:
+            A new obs_sequence object containing the combined data.
+
+        Example:
+            .. code-block:: python
+
+                obs_seq1 = obs_sequence(file='obs_seq1.final')
+                obs_seq2 = obs_sequence(file='obs_seq2.final')
+                obs_seq3 = obs_sequence(file='obs_seq3.final')
+                combined = obs_sequence.join([obs_seq1, obs_seq2, obs_seq3])
+        """
+        if not obs_sequences:
+            raise ValueError("The list of observation sequences is empty.")
+
+        # Create a new obs_sequnece object with the combined data
+        combo = cls(file=None)
+
+        # Check if all obs_sequences have compatible attributes
+        first_loc_mod = obs_sequences[0].loc_mod
+        first_has_assimilation_info = obs_sequences[0].has_assimilation_info
+        first_has_posterior = obs_sequences[0].has_posterior
+        for obs_seq in obs_sequences:
+            if obs_seq.loc_mod != first_loc_mod:
+                raise ValueError(
+                    "All observation sequences must have the same loc_mod."
+                )
+            if obs_seq.has_assimilation_info != first_has_assimilation_info:
+                raise ValueError(
+                    "All observation sequences must have assimilation info."
+                )
+            if obs_seq.has_posterior != first_has_posterior:
+                raise ValueError(
+                    "All observation sequences must have the posterior info."
+                )
+        # HK @todo prior only
+        combo.loc_mod = first_loc_mod
+
+        # check the copies are compatible (list of copies to combine?)
+        # subset of copies if needed
+        if copies:
+            start_required_columns = ["obs_num", "observation"]
+            end_required_columns = [
+                "linked_list",
+                "longitude",
+                "latitude",
+                "vertical",
+                "vert_unit",
+                "type",
+                "metadata",
+                "external_FO",
+                "seconds",
+                "days",
+                "time",
+                "obs_err_var",
+            ]
+            required_columns = start_required_columns + end_required_columns
+
+            requested_columns = (
+                start_required_columns
+                + [item for item in copies if item not in required_columns]
+                + end_required_columns
+            )
+
+            for obs_seq in obs_sequences:
+                if not set(requested_columns).issubset(obs_seq.df.columns):
+                    raise ValueError(
+                        "All observation sequences must have the selected copies."
+                    )
+
+            # go through columns and create header
+            remove_list = [
+                "obs_num",
+                "linked_list",
+                "latitude",
+                "longitude",
+                "vertical",
+                "vert_unit",
+                "type",
+                "metadata",
+                "external_FO",
+                "time",
+                "seconds",
+                "days",
+                "obs_err_var",
+            ]
+            # using lists to retain copy order, non_qcs followed by qcs
+            combo.copie_names = [
+                item for item in requested_columns if item not in remove_list
+            ]
+            combo.non_qc_copie_names = [
+                item
+                for item in combo.copie_names
+                if item in obs_sequences[0].non_qc_copie_names
+            ]
+            combo.qc_copie_names = [
+                item
+                for item in combo.copie_names
+                if item in obs_sequences[0].qc_copie_names
+            ]
+
+            combo.n_copies = len(combo.copie_names)
+            combo.n_qc = len(combo.qc_copie_names)
+            combo.n_non_qc = len(combo.non_qc_copie_names)
+
+        else:
+            for obs_seq in obs_sequences:
+                if not obs_sequences[0].df.columns.isin(obs_seq.df.columns).all():
+                    raise ValueError(
+                        "All observation sequences must have the same copies."
+                    )
+            combo.n_copies = obs_sequences[0].n_copies
+            combo.n_qc = obs_sequences[0].n_qc
+            combo.n_non_qc = obs_sequences[0].n_non_qc
+            combo.copie_names = obs_sequences[0].copie_names
+
+        # todo HK @todo combine synonyms for obs?
+
+        # Initialize combined data
+        combined_types = []
+        combined_df = pd.DataFrame()
+        combo.all_obs = None  # set to none to force writing from the dataframe if write_obs_seq is called
+
+        # Iterate over the list of observation sequences and combine their data
+        for obs_seq in obs_sequences:
+            if copies:
+                combined_df = pd.concat(
+                    [combined_df, obs_seq.df[requested_columns]], ignore_index=True
+                )
+            else:
+                combined_df = pd.concat([combined_df, obs_seq.df], ignore_index=True)
+            combined_types.extend(list(obs_seq.reverse_types.keys()))
+
+        # create dictionary of types
+        keys = set(combined_types)
+        combo.reverse_types = {item: i + 1 for i, item in enumerate(keys)}
+        combo.types = {v: k for k, v in combo.reverse_types.items()}
+
+        # create linked list for obs
+        combo.df = combined_df.sort_values(by="time").reset_index(drop=True)
+        combo.df["linked_list"] = obs_sequence.generate_linked_list_pattern(
+            len(combo.df)
+        )
+        combo.df["obs_num"] = combined_df.index + 1
+        combo.create_header(len(combo.df))
+
+        # set assimilation info (mean and spread) (prior and posterior)
+        combo.has_assimilation_info = "prior_ensemble_mean".casefold() in map(
+            str.casefold, combo.df.columns
+        )
+        combo.has_assimilation_info = "prior_ensemble_spread".casefold() in map(
+            str.casefold, combo.df.columns
+        )
+        combo.has_posterior = "posterior_ensemble_mean".casefold() in map(
+            str.casefold, combo.df.columns
+        )
+        combo.has_posterior = "posterior_ensemble_spread".casefold() in map(
+            str.casefold, combo.df.columns
+        )
+
+        return combo
+
+    def create_header(self, n):
+        """Create a header for the obs_seq file from the obs_sequence object."""
+        assert (
+            self.n_copies == self.n_non_qc + self.n_qc
+        ), "n_copies must be equal to n_non_qc + n_qc"
+
+        self.header = []
+        self.header.append(f"obs_sequence")
+        self.header.append("obs_type_definitions")
+        self.header.append(f"{len(self.types)}")
+        for key, value in self.types.items():
+            self.header.append(f"{key} {value}")
+        self.header.append(f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}")
+        self.header.append(f"num_obs: {n} max_num_obs: {n}")
+        for copie in self.copie_names:
+            self.header.append(copie)
+        self.header.append(f"first: 1 last: {n}")
+
+
 def load_yaml_to_dict(file_path):
     """
     Load a YAML file and convert it to a dictionary.
 
-
+    Args:
         file_path (str): The path to the YAML file.
 
     Returns:
         dict: The YAML file content as a dictionary.
     """
     try:
-        with open(file_path, 'r') as file:
+        with open(file_path, "r") as file:
             return yaml.safe_load(file)
     except Exception as e:
         print(f"Error loading YAML file: {e}")
-        return None
+        return None
 
 
 def convert_dart_time(seconds, days):
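Note: the new `join` classmethod also accepts a `copies` argument to subset the copies while combining. A sketch building on the docstring's example (file and copy names are hypothetical; the requested copies must exist in every input sequence):

    obs_seq1 = obs_sequence(file="obs_seq1.final")
    obs_seq2 = obs_sequence(file="obs_seq2.final")

    combined = obs_sequence.join(
        [obs_seq1, obs_seq2],
        copies=["observation", "prior_ensemble_mean", "DART_quality_control"],
    )
    combined.write_obs_seq("obs_seq.combined")  # header is rebuilt via create_header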
@@ -657,66 +1075,8 @@ def convert_dart_time(seconds, days):
     - base year for Gregorian calendar is 1601
     - dart time is seconds, days since 1601
     """
-    time = dt.datetime(1601,1,1) + dt.timedelta(days=days, seconds=seconds)
+    time = dt.datetime(1601, 1, 1) + dt.timedelta(days=days, seconds=seconds)
     return time
-
-def select_by_dart_qc(df, dart_qc):
-    """
-    Selects rows from a DataFrame based on the DART quality control flag.
-
-    Parameters:
-        df (DataFrame): A pandas DataFrame.
-        dart_qc (int): The DART quality control flag to select.
-
-    Returns:
-        DataFrame: A DataFrame containing only the rows with the specified DART quality control flag.
-
-    Raises:
-        ValueError: If the DART quality control flag is not present in the DataFrame.
-    """
-    if dart_qc not in df['DART_quality_control'].unique():
-        raise ValueError(f"DART quality control flag '{dart_qc}' not found in DataFrame.")
-    else:
-        return df[df['DART_quality_control'] == dart_qc]
-
-def select_failed_qcs(df):
-    """
-    Selects rows from a DataFrame where the DART quality control flag is greater than 0.
-
-    Parameters:
-        df (DataFrame): A pandas DataFrame.
-
-    Returns:
-        DataFrame: A DataFrame containing only the rows with a DART quality control flag greater than 0.
-    """
-    return df[df['DART_quality_control'] > 0]
-
-def possible_vs_used(df):
-    """
-    Calculates the count of possible vs. used observations by type.
-
-    This function takes a DataFrame containing observation data, including a 'type' column for the observation
-    type and an 'observation' column. The number of used observations ('used'), is the total number
-    minus the observations that failed quality control checks (as determined by the `select_failed_qcs` function).
-    The result is a DataFrame with each observation type, the count of possible observations, and the count of
-    used observations.
-
-    Parameters:
-        df (pd.DataFrame): A DataFrame with at least two columns: 'type' for the observation type and 'observation'
-        for the observation data. It may also contain other columns required by the `select_failed_qcs` function
-        to determine failed quality control checks.
-
-    Returns:
-        pd.DataFrame: A DataFrame with three columns: 'type', 'possible', and 'used'. 'type' is the observation type,
-        'possible' is the count of all observations of that type, and 'used' is the count of observations of that type
-        that passed quality control checks.
-
-    """
-    possible = df.groupby('type')['observation'].count()
-    possible.rename('possible', inplace=True)
-    used = df.groupby('type')['observation'].count() - select_failed_qcs(df).groupby('type')['observation'].count()
-    used.rename('used', inplace=True)
-    return pd.concat([possible, used], axis=1).reset_index()
 
 
 def construct_composit(df_comp, composite, components):
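Note: this hunk deletes the module-level `select_by_dart_qc`, `select_failed_qcs`, and `possible_vs_used`; they were re-added earlier in this diff as methods. Code migrating from 0.0.42 would change roughly as follows:

    # 0.0.42: module-level function, takes a DataFrame
    failed = select_failed_qcs(obs_seq.df)

    # 0.5.0: method, uses obs_seq.df internally
    failed = obs_seq.select_failed_qcs()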
@@ -724,10 +1084,10 @@ def construct_composit(df_comp, composite, components):
     Construct a composite DataFrame by combining rows from two components.
 
     This function takes two DataFrames and combines rows from them based on matching
-    location and time. It creates a new row with a composite type by combining
+    location and time. It creates a new row with a composite type by combining
     specified columns using the square root of the sum of squares method.
 
-
+    Args:
         df_comp (pd.DataFrame): The DataFrame containing the component rows to be combined.
         composite (str): The type name for the new composite rows.
         components (list of str): A list containing the type names of the two components to be combined.
@@ -735,27 +1095,29 @@
     Returns:
        merged_df (pd.DataFrame): The updated DataFrame with the new composite rows added.
     """
-    selected_rows = df_comp[df_comp['type'] == components[0].upper()]
-    selected_rows_v = df_comp[df_comp['type'] == components[1].upper()]
+    selected_rows = df_comp[df_comp["type"] == components[0].upper()]
+    selected_rows_v = df_comp[df_comp["type"] == components[1].upper()]
 
-    columns_to_combine = df_comp.filter(regex='ensemble').columns.tolist()
-    columns_to_combine.append('observation') # TODO HK: bias, sq_err, obs_err_var
-    merge_columns = ['latitude', 'longitude', 'vertical', 'time']
+    columns_to_combine = df_comp.filter(regex="ensemble").columns.tolist()
+    columns_to_combine.append("observation")  # TODO HK: bias, sq_err, obs_err_var
+    merge_columns = ["latitude", "longitude", "vertical", "time"]
 
     print("duplicates in u: ", selected_rows[merge_columns].duplicated().sum())
-    print("duplicates in v: ",selected_rows_v[merge_columns].duplicated().sum())
+    print("duplicates in v: ", selected_rows_v[merge_columns].duplicated().sum())
 
     # Merge the two DataFrames on location and time columns
-    merged_df = pd.merge(selected_rows, selected_rows_v, on=merge_columns, suffixes=('', '_v'))
+    merged_df = pd.merge(
+        selected_rows, selected_rows_v, on=merge_columns, suffixes=("", "_v")
+    )
 
     # Apply the square root of the sum of squares method to the relevant columns
     for col in columns_to_combine:
-        merged_df[col] = np.sqrt(merged_df[col]**2 + merged_df[f'{col}_v']**2)
+        merged_df[col] = np.sqrt(merged_df[col] ** 2 + merged_df[f"{col}_v"] ** 2)
 
     # Create the new composite rows
-    merged_df['type'] = composite.upper()
-    merged_df = merged_df.drop(columns=[col for col in merged_df.columns if col.endswith('_v')])
+    merged_df["type"] = composite.upper()
+    merged_df = merged_df.drop(
+        columns=[col for col in merged_df.columns if col.endswith("_v")]
+    )
 
     return merged_df
-
-