pydartdiags 0.0.43__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydartdiags/matplots/__init__.py +0 -0
- pydartdiags/matplots/matplots.py +423 -0
- pydartdiags/obs_sequence/composite_types.yaml +35 -0
- pydartdiags/obs_sequence/obs_sequence.py +756 -343
- pydartdiags/plots/plots.py +80 -228
- pydartdiags/stats/__init__.py +0 -0
- pydartdiags/stats/stats.py +432 -0
- {pydartdiags-0.0.43.dist-info → pydartdiags-0.5.1.dist-info}/METADATA +10 -5
- pydartdiags-0.5.1.dist-info/RECORD +15 -0
- {pydartdiags-0.0.43.dist-info → pydartdiags-0.5.1.dist-info}/WHEEL +1 -1
- pydartdiags-0.0.43.dist-info/RECORD +0 -10
- {pydartdiags-0.0.43.dist-info → pydartdiags-0.5.1.dist-info/licenses}/LICENSE +0 -0
- {pydartdiags-0.0.43.dist-info → pydartdiags-0.5.1.dist-info}/top_level.txt +0 -0
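The rest of this diff is the rewrite of pydartdiags/obs_sequence/obs_sequence.py. A minimal usage sketch of the 0.5.1 reader API, pieced together from the docstrings in the hunks below; the import path is assumed from the file list above, and 'obs_seq.final' is a placeholder file name, not part of the release:

    from pydartdiags.obs_sequence.obs_sequence import obs_sequence

    seq = obs_sequence(file='obs_seq.final')  # ASCII or binary obs_seq file
    print(seq.df.head())     # observations as a pandas DataFrame
    print(seq.copie_names)   # copy names, spaces replaced by underscores
    seq.write_obs_seq('obs_seq.new')          # round-trip back to disk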
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 import pandas as pd
 import datetime as dt
 import numpy as np
@@ -5,106 +6,152 @@ import os
 import yaml
 import struct
 
+
 def requires_assimilation_info(func):
     def wrapper(self, *args, **kwargs):
-        if self.has_assimilation_info:
+        if self.has_assimilation_info():
             return func(self, *args, **kwargs)
         else:
-            raise ValueError(
-                "Assimilation information is required to call this function.")
+            raise ValueError(
+                "Assimilation information is required to call this function."
+            )
 
-def requires_posterior_info(func):
-    def wrapper(self, *args, **kwargs):
-        if self.has_posterior_info:
-            return func(self, *args, **kwargs)
-        else:
-            raise ValueError("Posterior information is required to call this function.")
     return wrapper
 
 
 class obs_sequence:
-    """
+    """
+    Initialize an obs_sequence object from an ASCII or binary observation sequence file,
+    or create an empty obs_sequence object from scratch.
+
+    Args:
+        file (str): The input observation sequence ASCII or binary file.
+            If None, an empty obs_sequence object is created from scratch.
+
+    Returns:
+        An obs_sequence object
 
     Attributes:
-        df (pandas.DataFrame): DataFrame containing
-
-
-
-        types (dict): Dictionary of types in the observation sequence file.
-        copie_names (list): Names of copies in the observation sequence file.
-            Spelled 'copie' to avoid conflict with the Python built-in copy function.
+        df (pandas.DataFrame): The DataFrame containing the observation sequence data.
+        header (list): The header of the observation sequence.
+        copie_names (list): The names of the copies in the observation sequence.
+            Spelled 'copie' to avoid conflict with the Python built-in 'copy'.
             Spaces are replaced with underscores in copie_names.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        non_qc_copie_names (list): The names of the copies not including quality control,
+            e.g. observation, mean, ensemble_members
+        qc_copie_names (list): The names of the quality control copies, e.g. DART_QC
+        n_copies (int): The total number of copies in the observation sequence.
+        n_non_qc (int): The number of copies not including quality control.
+        n_qc (int): The number of quality control copies.
+        vert (dict): A dictionary mapping DART vertical coordinate types to their
+            corresponding integer values.
+
+            - undefined: 'VERTISUNDEF'
+            - surface: 'VERTISSURFACE' (value is surface elevation in meters)
+            - model level: 'VERTISLEVEL'
+            - pressure: 'VERTISPRESSURE' (in Pascals)
+            - height: 'VERTISHEIGHT' (in meters)
+            - scale height: 'VERTISSCALEHEIGHT' (unitless)
+        loc_mod (str): The location model, either 'loc3d' or 'loc1d'.
+            For 3D sphere models: latitude and longitude are in degrees in the DataFrame.
+        types (dict): Dictionary of types of observations in the observation sequence,
+            e.g. {23: 'ACARS_TEMPERATURE'}
+        reverse_types (dict): Dictionary of types with keys and values reversed, e.g.
+            {'ACARS_TEMPERATURE': 23}
+        synonyms_for_obs (list): List of synonyms for the observation column in the DataFrame.
+            The default list is
+
+            .. code-block:: python
+
+                ['NCEP BUFR observation',
+                 'AIRS observation',
+                 'GTSPP observation',
+                 'SST observation',
+                 'observations',
+                 'WOD observation']
+
+            You can add more synonyms by providing a list of strings when
+            creating the obs_sequence object.
+
+            .. code-block:: python
+
+                obs_sequence(file, synonyms=['synonym1', 'synonym2']).df
+
+        seq (generator): Generator of observations from the observation sequence file.
+        all_obs (list): List of all observations, each observation is a list.
+            Valid when the obs_sequence is created from a file.
+            Set to None when the obs_sequence is created from scratch or multiple
+            obs_sequences are joined.
     """
-
-
-
-
-
-
-
-
-
-        -1: 'surface (m)',
-        1: 'model level',
-        2: 'pressure (Pa)',
-        3: 'height (m)',
-        4: 'scale height' }
+
+    vert = {
+        -2: "undefined",
+        -1: "surface (m)",
+        1: "model level",
+        2: "pressure (Pa)",
+        3: "height (m)",
+        4: "scale height",
+    }
 
     reversed_vert = {value: key for key, value in vert.items()}
 
-
     def __init__(self, file, synonyms=None):
-
-
-
+        """
+        Create an obs_sequence object from an ASCII or binary observation sequence file,
+        or create an empty obs_sequence object from scratch.
+
+        Args:
+            file (str): The input observation sequence ASCII or binary file.
+                If None, an empty obs_sequence object is created from scratch.
+            synonyms (list, optional): List of synonyms for the observation column in the DataFrame.
+
+        Returns:
+            an obs_sequence object
+                1D observations are given a datetime of days, seconds since 2000-01-01 00:00:00
+                3D observations are given a datetime of days, seconds since 1601-01-01 00:00:00 (DART Gregorian calendar)
+
+        Examples:
+
+            .. code-block:: python
+
+                obs_seq = obs_sequence(file='obs_seq.final')
+
+        """
+
+        self.loc_mod = "None"
         self.file = file
-        self.synonyms_for_obs = [
-
-
-
-
-
+        self.synonyms_for_obs = [
+            "NCEP BUFR observation",
+            "AIRS observation",
+            "GTSPP observation",
+            "SST observation",
+            "observations",
+            "WOD observation",
+        ]
         if synonyms:
             if isinstance(synonyms, list):
                 self.synonyms_for_obs.extend(synonyms)
             else:
                 self.synonyms_for_obs.append(synonyms)
 
+        module_dir = os.path.dirname(__file__)
+        self.default_composite_types = os.path.join(module_dir, "composite_types.yaml")
+
        if file is None:
-            # Early exit for testing purposes
+            # Early exit - for testing purposes or creating obs_seq objects from scratch
             self.df = pd.DataFrame()
             self.types = {}
             self.reverse_types = {}
             self.copie_names = []
-            self.
+            self.non_qc_copie_names = []
+            self.qc_copie_names = []
+            self.n_copies = 0  # copies including qc
+            self.n_non_qc = 0  # copies not including qc
+            self.n_qc = 0  # number of qc copies
             self.seq = []
             self.all_obs = []
             return
 
-        module_dir = os.path.dirname(__file__)
-        self.default_composite_types = os.path.join(module_dir,"composite_types.yaml")
-
         if self.is_binary(file):
             self.header = self.read_binary_header(file)
         else:
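One behavioral change above: 0.0.43 stored has_assimilation_info as a boolean attribute set in __init__, while 0.5.1 calls it as a method (defined near the end of this diff) that re-checks the DataFrame columns on every call. A hedged sketch of why that matters, using a stand-in class rather than the real one:

    # Sketch only: an attribute set once goes stale if columns are later
    # added or dropped; a method recomputes the answer each time.
    import pandas as pd

    class Demo:
        def __init__(self):
            self.df = pd.DataFrame()

        def has_assimilation_info(self):
            cols = list(map(str.casefold, self.df.columns))
            return "prior_ensemble_mean" in cols and "prior_ensemble_spread" in cols

    d = Demo()
    print(d.has_assimilation_info())  # False
    d.df = pd.DataFrame(columns=["prior_ensemble_mean", "prior_ensemble_spread"])
    print(d.has_assimilation_info())  # True, with no flag to keep in sync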
@@ -113,40 +160,41 @@ class obs_sequence:
         self.types = self.collect_obs_types(self.header)
         self.reverse_types = {v: k for k, v in self.types.items()}
         self.copie_names, self.n_copies = self.collect_copie_names(self.header)
+        self.n_non_qc, self.n_qc = self.num_qc_non_qc(self.header)
+        self.non_qc_copie_names = self.copie_names[: self.n_non_qc]
+        self.qc_copie_names = self.copie_names[self.n_non_qc :]
 
         if self.is_binary(file):
             self.seq = self.obs_binary_reader(file, self.n_copies)
-            self.loc_mod = 'loc3d'
+            self.loc_mod = "loc3d"  # only loc3d supported for binary, & no way to check
         else:
             self.seq = self.obs_reader(file, self.n_copies)
 
-        self.all_obs = self.create_all_obs()
+        self.all_obs = self.create_all_obs()  # uses up the generator
         # at this point you know if the seq is loc3d or loc1d
-        if self.loc_mod == 'None':
-            raise ValueError(
+        if self.loc_mod == "None":
+            raise ValueError(
+                "Neither 'loc3d' nor 'loc1d' could be found in the observation sequence."
+            )
         self.columns = self.column_headers()
-        self.df = pd.DataFrame(self.all_obs, columns=self.columns)
-        if self.loc_mod == 'loc3d':
-            self.df['longitude'] = np.rad2deg(self.df['longitude'])
-            self.df['latitude'] = np.rad2deg(self.df['latitude'])
+        self.df = pd.DataFrame(self.all_obs, columns=self.columns)
+        if self.loc_mod == "loc3d":
+            self.df["longitude"] = np.rad2deg(self.df["longitude"])
+            self.df["latitude"] = np.rad2deg(self.df["latitude"])
         # rename 'X observation' to observation
-        self.synonyms_for_obs = [
-
+        self.synonyms_for_obs = [
+            synonym.replace(" ", "_") for synonym in self.synonyms_for_obs
+        ]
+        rename_dict = {
+            old: "observation"
+            for old in self.synonyms_for_obs
+            if old in self.df.columns
+        }
         self.df = self.df.rename(columns=rename_dict)
 
-        # calculate bias and sq_err is the obs_seq is an obs_seq.final
-        if 'prior_ensemble_mean'.casefold() in map(str.casefold, self.columns):
-            self.has_assimilation_info = True
-            self.df['prior_bias'] = (self.df['prior_ensemble_mean'] - self.df['observation'])
-            self.df['prior_sq_err'] = self.df['prior_bias']**2 # squared error
-        if 'posterior_ensemble_mean'.casefold() in map(str.casefold, self.columns):
-            self.has_posterior_info = True
-            self.df['posterior_bias'] = (self.df['posterior_ensemble_mean'] - self.df['observation'])
-            self.df['posterior_sq_err'] = self.df['posterior_bias']**2
-
     def create_all_obs(self):
-        """
-
+        """steps through the generator to create a
+        list of all observations in the sequence
         """
         all_obs = []
         for obs in self.seq:
@@ -155,49 +203,54 @@ class obs_sequence:
         return all_obs
 
     def obs_to_list(self, obs):
-        """put single observation into a list
-
-        discards obs_def
-        """
+        """put single observation into a list"""
        data = []
-        data.append(obs[0].split()[1])
-        data.extend(list(map(float,obs[1:self.n_copies+1])))
-        data.append(obs[self.n_copies+1])
+        data.append(obs[0].split()[1])  # obs_num
+        data.extend(list(map(float, obs[1 : self.n_copies + 1])))  # all the copies
+        data.append(obs[self.n_copies + 1])  # linked list info
         try:  # HK todo only have to check loc3d or loc1d for the first observation, the whole file is the same
-            locI = obs.index('loc3d')
-            location = obs[locI+1].split()
+            locI = obs.index("loc3d")
+            location = obs[locI + 1].split()
             data.append(float(location[0]))  # location x
             data.append(float(location[1]))  # location y
             data.append(float(location[2]))  # location z
             data.append(obs_sequence.vert[int(location[3])])
-            self.loc_mod = 'loc3d'
+            self.loc_mod = "loc3d"
         except ValueError:
             try:
-                locI = obs.index('loc1d')
-                location = obs[locI+1]
-                data.append(float(location)) # 1d location
-                self.loc_mod = 'loc1d'
+                locI = obs.index("loc1d")
+                location = obs[locI + 1]
+                data.append(float(location))  # 1d location
+                self.loc_mod = "loc1d"
             except ValueError:
-                raise ValueError(
-
+                raise ValueError(
+                    "Neither 'loc3d' nor 'loc1d' could be found in the observation sequence."
+                )
+        typeI = obs.index("kind")  # type of observation
         type_value = obs[typeI + 1]
         if not self.types:
-            data.append('Identity')
+            data.append("Identity")
         else:
-            data.append(self.types[type_value])
-
+            data.append(self.types[type_value])  # observation type
+
         # any observation specific obs def info is between here and the end of the list
         # can be obs_def & external forward operator
-        metadata = obs[typeI+2:-2]
+        metadata = obs[typeI + 2 : -2]
         obs_def_metadata, external_metadata = self.split_metadata(metadata)
         data.append(obs_def_metadata)
         data.append(external_metadata)
 
         time = obs[-2].split()
-        data.append(int(time[0]))
-        data.append(int(time[1]))
-
-
+        data.append(int(time[0]))  # seconds
+        data.append(int(time[1]))  # days
+        if self.loc_mod == "loc3d":
+            data.append(convert_dart_time(int(time[0]), int(time[1])))
+        else:  # HK todo what is appropriate for 1d models?
+            data.append(
+                dt.datetime(2000, 1, 1)
+                + dt.timedelta(seconds=int(time[0]), days=int(time[1]))
+            )
+        data.append(float(obs[-1]))  # obs error variance ?convert to sd?
 
         return data
 
@@ -215,41 +268,49 @@ class obs_sequence:
         the first sublist contains the entire metadata list, and the second is empty.
         """
         for i, item in enumerate(metadata):
-            if item.startswith('external_FO'):
+            if item.startswith("external_FO"):
                 return metadata[:i], metadata[i:]
         return metadata, []
 
     def list_to_obs(self, data):
+        """convert a list of data to an observation
+
+        Assuming the order of the list is obs_seq.copie_names
+
+        """
         obs = []
-        obs.append(
-        obs.extend(data[1:self.n_copies+1]) # all the copies
-        obs.append(data[self.n_copies+1]) # linked list info
-        obs.append(
+        obs.append("OBS " + str(data[0]))  # obs_num lots of space
+        obs.extend(data[1 : self.n_copies + 1])  # all the copies
+        obs.append(data[self.n_copies + 1])  # linked list info
+        obs.append("obdef")  # TODO HK: extended_FO obs_def
         obs.append(self.loc_mod)
-        if self.loc_mod == 'loc3d':
-            obs.append(
-
+        if self.loc_mod == "loc3d":
+            obs.append(
+                " ".join(map(str, data[self.n_copies + 2 : self.n_copies + 5]))
+                + " "
+                + str(self.reversed_vert[data[self.n_copies + 5]])
+            )  # location x, y, z, vert
+            obs.append("kind")  # this is type of observation
             obs.append(self.reverse_types[data[self.n_copies + 6]])  # observation type
-            # Convert metadata to a string and append
+            # Convert metadata to a string and append !HK @todo you are not converting to string
             obs.extend(data[self.n_copies + 7])  # metadata
-
-
-            obs.append(
+            obs.extend(data[self.n_copies + 8])  # external forward operator
+        elif self.loc_mod == "loc1d":
+            obs.append(data[self.n_copies + 2])  # 1d location
+            obs.append("kind")  # this is type of observation
             obs.append(self.reverse_types[data[self.n_copies + 3]])  # observation type
-            #
-
-
-            obs.append(metadata) # metadata
-        obs.append(' '.join(map(str, data[-4:-2]))) # seconds, days
+            obs.extend(data[self.n_copies + 4])  # metadata
+            obs.extend(data[self.n_copies + 5])  # external forward operator
+        obs.append(" ".join(map(str, data[-4:-2])))  # seconds, days
         obs.append(data[-1])  # obs error variance
 
         return obs
 
     @staticmethod
     def generate_linked_list_pattern(n):
-        """Create a list of strings with the linked list pattern for n
+        """Create a list of strings with the linked list pattern for n observations."""
         result = []
-        for i in range(n-1):
+        for i in range(n - 1):
             col1 = i if i > 0 else -1
             col2 = i + 2
             col3 = -1
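The linked-list columns built by generate_linked_list_pattern chain each observation to its predecessor and successor in time order, for reading by DART programs. A sketch of the values it produces (the exact column formatting of the middle rows is not visible in this hunk; only the final row's f-string is shown in the next hunk):

    def linked_list_values(n):
        # (previous, next, -1) per observation; -1 marks no neighbor.
        rows = [(i if i > 0 else -1, i + 2, -1) for i in range(n - 1)]
        rows.append((n - 1, -1, -1))  # last obs, per f"{n-1:<12}{'-1':<11}{'-1'}"
        return rows

    print(linked_list_values(4))
    # [(-1, 2, -1), (1, 3, -1), (2, 4, -1), (3, -1, -1)]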
@@ -257,101 +318,190 @@ class obs_sequence:
         result.append(f"{n-1:<12}{'-1':<11}{'-1'}")
         return result
 
-    def write_obs_seq(self, file, df=None):
+    def write_obs_seq(self, file):
         """
         Write the observation sequence to a file.
-
-        This function writes the observation sequence to
-
-
-
-
-
-
-
-        Parameters:
+
+        This function writes the observation sequence stored in the obs_seq.DataFrame to a specified file.
+        It updates the header with the number of observations, converts coordinates back to radians
+        if necessary, drops unnecessary columns, sorts the DataFrame by time, and generates a linked
+        list pattern for reading by DART programs.
+
+        Args:
             file (str): The path to the file where the observation sequence will be written.
-
-
+
+        Notes:
+            - Longitude and latitude are converted back to radians if the location model is 'loc3d'.
+            - The 'bias' and 'sq_err' columns are dropped if they exist in the DataFrame.
+            - The DataFrame is sorted by the 'time' column.
+            - An 'obs_num' column is added to the DataFrame to number the observations in time order.
+            - A 'linked_list' column is generated to create a linked list pattern for the observations.
+
+        Example:
+            obsq.write_obs_seq('obs_seq.new')
+
+        """
+
+        self.create_header_from_dataframe()
+
+        with open(file, "w") as f:
+
+            for line in self.header:
+                f.write(str(line) + "\n")
+
+            # TODO HK is there something better than copying the whole thing here?
+            df_copy = self.df.copy()  # copy since you want to change for writing.
+            # back to radians for obs_seq
+            if self.loc_mod == "loc3d":
+                df_copy["longitude"] = np.deg2rad(self.df["longitude"]).round(16)
+                df_copy["latitude"] = np.deg2rad(self.df["latitude"]).round(16)
+            if "prior_bias" in df_copy.columns:
+                df_copy = df_copy.drop(
+                    columns=["prior_bias", "prior_sq_err", "prior_totalvar"]
+                )
+            if "posterior_bias" in df_copy.columns:
+                df_copy = df_copy.drop(
+                    columns=["posterior_bias", "posterior_sq_err", "posterior_totalvar"]
+                )
+            if "midpoint" in df_copy.columns:
+                df_copy = df_copy.drop(columns=["midpoint", "vlevels"])
+
+            # linked list for reading by dart programs
+            df_copy = df_copy.sort_values(
+                by=["time"], kind="stable"
+            )  # sort the DataFrame by time
+            df_copy.reset_index(drop=True, inplace=True)
+            df_copy["obs_num"] = df_copy.index + 1  # obs_num in time order
+            df_copy["linked_list"] = obs_sequence.generate_linked_list_pattern(
+                len(df_copy)
+            )  # linked list pattern
+
+            def write_row(row):
+                ob_write = self.list_to_obs(row.tolist())
+                for line in ob_write:
+                    f.write(str(line) + "\n")
+
+            df_copy.apply(write_row, axis=1)
+
+    @staticmethod
+    def update_types_dicts(df, reverse_types):
+        """
+        Ensure all unique observation types are in the reverse_types dictionary and create
+        the types dictionary.
+
+        Args:
+            df (pd.DataFrame): The DataFrame containing the observation sequence data.
+            reverse_types (dict): The dictionary mapping observation types to their corresponding integer values.
+
         Returns:
-
-
-        Examples:
-            ``obs_seq.write_obs_seq('/path/to/output/file')``
-            ``obs_seq.write_obs_seq('/path/to/output/file', df=obs_seq.df)``
+            dict: The updated reverse_types dictionary.
+            dict: The types dictionary with keys sorted in numerical order.
         """
-
-
-        if df is not None:
-            # If a DataFrame is provided, update the header with the number of observations
-            num_rows = len(df)
-            replacement_string = f'num_obs: {num_rows:>10} max_num_obs: {num_rows:>10}'
-            new_header = [replacement_string if 'num_obs' in element else element for element in self.header]
-
-            for line in new_header[:-1]:
-                f.write(str(line) + '\n')
-            first = 1
-            f.write(f"first: {first:>12} last: {num_rows:>12}\n")
-
-            # TODO HK is there something better than copying the whole thing here?
-            df_copy = df.copy() # copy since you want to change for writing.
-            # back to radians for obs_seq
-            if self.loc_mod == 'loc3d':
-                df_copy['longitude'] = np.deg2rad(self.df['longitude'])
-                df_copy['latitude'] = np.deg2rad(self.df['latitude'])
-            if 'bias' in df_copy.columns:
-                df_copy = df_copy.drop(columns=['bias', 'sq_err'])
-
-            # linked list for reading by dart programs
-            df_copy = df_copy.sort_values(by=['time']) # sort the DataFrame by time
-            df_copy['obs_num'] = df.index + 1 # obs_num in time order
-            df_copy['linked_list'] = obs_sequence.generate_linked_list_pattern(len(df_copy)) # linked list pattern
-
-            def write_row(row):
-                ob_write = self.list_to_obs(row.tolist())
-                for line in ob_write:
-                    f.write(str(line) + '\n')
-
-            df_copy.apply(write_row, axis=1)
-
-        else:
-            # If no DataFrame is provided, use self.header and self.all_obs
-            for line in self.header:
-                f.write(str(line) + '\n')
-            for obs in self.all_obs:
-                ob_write = self.list_to_obs(obs)
-                for line in ob_write:
-                    f.write(str(line) + '\n')
+        # Create a dictionary of observation types from the dataframe
+        unique_types = df["type"].unique()
 
+        # Ensure all unique types are in reverse_types
+        for obs_type in unique_types:
+            if obs_type not in reverse_types:
+                new_id = int(max(reverse_types.values(), default=0)) + 1
+                reverse_types[obs_type] = str(new_id)
+
+        not_sorted_types = {
+            reverse_types[obs_type]: obs_type for obs_type in unique_types
+        }
+        types = {
+            k: not_sorted_types[k] for k in sorted(not_sorted_types)
+        }  # to get keys in numerical order
+
+        return reverse_types, types
+
+    def create_header_from_dataframe(self):
+        """
+        Create a header for the observation sequence based on the data in the DataFrame.
+
+        It creates a dictionary of unique observation types, counts the
+        number of observations, and constructs the header with necessary information.
+
+        Example:
+            self.create_header_from_dataframe()
+
+        """
+
+        self.reverse_types, self.types = self.update_types_dicts(
+            self.df, self.reverse_types
+        )
+
+        num_obs = len(self.df)
+
+        self.header = []
+        self.header.append("obs_sequence")
+        self.header.append("obs_type_definitions")
+        self.header.append(f"{len(self.types)}")
+        for key, value in self.types.items():
+            self.header.append(f"{key} {value}")
+        self.header.append(
+            f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}"
+        )  # @todo HK not keeping track if num_qc changes
+        self.header.append(f"num_obs: {num_obs:>10} max_num_obs: {num_obs:>10}")
+        stats_cols = [
+            "prior_bias",
+            "prior_sq_err",
+            "prior_totalvar",
+            "posterior_bias",
+            "posterior_sq_err",
+            "posterior_totalvar",
+        ]
+        level_cols = ["vlevels", "midpoint"]
+        non_copie_cols = [
+            "obs_num",
+            "linked_list",
+            "longitude",
+            "latitude",
+            "vertical",
+            "vert_unit",
+            "type",
+            "metadata",
+            "external_FO",
+            "seconds",
+            "days",
+            "time",
+            "obs_err_var",
+            "location",
+        ]
+        for copie in self.df.columns:
+            if copie not in stats_cols + non_copie_cols + level_cols:
+                self.header.append(copie.replace("_", " "))
+        first = 1
+        self.header.append(f"first: {first:>12} last: {num_obs:>12}")
 
     def column_headers(self):
-        """define the columns for the dataframe
+        """define the columns for the dataframe"""
         heading = []
-        heading.append('obs_num')
+        heading.append("obs_num")
         heading.extend(self.copie_names)
-        heading.append('linked_list')
-        if self.loc_mod == 'loc3d':
-            heading.append('longitude')
-            heading.append('latitude')
-            heading.append('vertical')
-            heading.append('vert_unit')
-        elif self.loc_mod == 'loc1d':
-            heading.append('location')
-        heading.append('type')
-        heading.append('metadata')
-        heading.append('external_FO')
-        heading.append('seconds')
-        heading.append('days')
-        heading.append('time')
-        heading.append('obs_err_var')
+        heading.append("linked_list")
+        if self.loc_mod == "loc3d":
+            heading.append("longitude")
+            heading.append("latitude")
+            heading.append("vertical")
+            heading.append("vert_unit")
+        elif self.loc_mod == "loc1d":
+            heading.append("location")
+        heading.append("type")
+        heading.append("metadata")
+        heading.append("external_FO")
+        heading.append("seconds")
+        heading.append("days")
+        heading.append("time")
+        heading.append("obs_err_var")
         return heading
 
-    @requires_assimilation_info
+    @requires_assimilation_info
     def select_by_dart_qc(self, dart_qc):
         """
         Selects rows from a DataFrame based on the DART quality control flag.
 
-
+        Args:
             df (DataFrame): A pandas DataFrame.
             dart_qc (int): The DART quality control flag to select.
 
@@ -361,20 +511,26 @@ class obs_sequence:
         Raises:
             ValueError: If the DART quality control flag is not present in the DataFrame.
         """
-        if dart_qc not in self.df['DART_quality_control'].unique():
-            raise ValueError(
+        if dart_qc not in self.df["DART_quality_control"].unique():
+            raise ValueError(
+                f"DART quality control flag '{dart_qc}' not found in DataFrame."
+            )
         else:
-            return self.df[self.df['DART_quality_control'] == dart_qc]
+            return self.df[self.df["DART_quality_control"] == dart_qc]
 
     @requires_assimilation_info
-    def select_used_qcs(self):
+    def select_used_qcs(self):
         """
-        Select rows from the DataFrame where the
+        Select rows from the DataFrame where the observation was used.
+        Includes observations for which the posterior forward observation operators failed.
 
         Returns:
-            pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag
+            pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag 0 or 2.
         """
-        return self.df[
+        return self.df[
+            (self.df["DART_quality_control"] == 0)
+            | (self.df["DART_quality_control"] == 2)
+        ]
 
     @requires_assimilation_info
     def possible_vs_used(self):
@@ -383,7 +539,7 @@ class obs_sequence:
 
         This function takes a DataFrame containing observation data, including a 'type' column for the observation
         type and an 'observation' column. The number of used observations ('used'), is the total number
-
+        of assimilated observations (as determined by the `select_used_qcs` function).
         The result is a DataFrame with each observation type, the count of possible observations, and the count of
         used observations.
 
@@ -392,31 +548,29 @@ class obs_sequence:
         'possible' is the count of all observations of that type, and 'used' is the count of observations of that type
         that passed quality control checks.
         """
-        possible = self.df.groupby('type')['observation'].count()
-        possible.rename(
-
-
-        used =
-        used.rename(
-
-        return pd.concat([possible, used], axis=1).reset_index()
+        possible = self.df.groupby("type")["observation"].count()
+        possible.rename("possible", inplace=True)
+
+        used_qcs = self.select_used_qcs().groupby("type")["observation"].count()
+        used = used_qcs.reindex(possible.index, fill_value=0)
+        used.rename("used", inplace=True)
 
+        return pd.concat([possible, used], axis=1).reset_index()
 
     @staticmethod
     def is_binary(file):
         """Check if a file is binary file."""
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             chunk = f.read(1024)
-            if b'\0' in chunk:
+            if b"\0" in chunk:
                 return True
             return False
 
-
     @staticmethod
     def read_header(file):
         """Read the header and number of lines in the header of an ascii obs_seq file"""
         header = []
-        with open(file, 'r') as f:
+        with open(file, "r") as f:
             for line in f:
                 if "first:" in line and "last:" in line:
                     header.append(line.strip())
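A hedged usage sketch of the QC helpers above; all three require assimilation info in the DataFrame (enforced by the decorator), and the file name is a placeholder:

    seq = obs_sequence(file='obs_seq.final')
    used = seq.select_used_qcs()         # rows with DART QC flag 0 or 2
    rejected = seq.select_by_dart_qc(7)  # raises ValueError if flag 7 is absent
    counts = seq.possible_vs_used()      # one row per type: possible vs. used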
@@ -432,19 +586,19 @@ class obs_sequence:
         linecount = 0
         obs_types_definitions = -1000
         num_obs = 0
-        max_num_obs = 0
+        max_num_obs = 0
         # need to get:
         # number of obs_type_definitions
         # number of copies
         # number of qcs
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             while True:
                 # Read the record length
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
                 record = f.read(record_length)
-                if not record:
+                if not record:  # end of file
                     break
 
                 # Read the trailing record length (should match the leading one)
@@ -452,17 +606,19 @@ class obs_sequence:
 
                 linecount += 1
 
-                if linecount == 3:
-                    obs_types_definitions = struct.unpack('i', record)[0]
-                    continue
+                if linecount == 3:
+                    obs_types_definitions = struct.unpack("i", record)[0]
+                    continue
 
-                if linecount == 4+obs_types_definitions:
-                    num_copies, num_qcs, num_obs, max_num_obs = struct.unpack(
+                if linecount == 4 + obs_types_definitions:
+                    num_copies, num_qcs, num_obs, max_num_obs = struct.unpack(
+                        "iiii", record
+                    )[:16]
                     break
-
+
             # Go back to the beginning of the file
             f.seek(0)
-
+
             for _ in range(2):
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
@@ -472,14 +628,14 @@ class obs_sequence:
                 if not record:  # end of file
                     break
 
-                obs_sequence.check_trailing_record_length(f, record_length)
-                header.append(record.decode('utf-8').strip())
+                obs_sequence.check_trailing_record_length(f, record_length)
+                header.append(record.decode("utf-8").strip())
 
             header.append(str(obs_types_definitions))
 
             # obs_types_definitions
-            for _ in range(3,4+obs_types_definitions):
-
+            for _ in range(3, 4 + obs_types_definitions):
+                # Read the record length
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
@@ -489,21 +645,24 @@ class obs_sequence:
                 if not record:  # end of file
                     break
 
-                obs_sequence.check_trailing_record_length(f, record_length)
+                obs_sequence.check_trailing_record_length(f, record_length)
 
                 if _ == 3:
-                    continue
+                    continue  # num obs_types_definitions
                 # Read an integer and a string from the record
-                integer_value = struct.unpack('i', record[:4])[0]
-                string_value = record[4:].decode('utf-8').strip()
+                integer_value = struct.unpack("i", record[:4])[0]
+                string_value = record[4:].decode("utf-8").strip()
                 header.append(f"{integer_value} {string_value}")
 
             header.append(f"num_copies: {num_copies} num_qc: {num_qcs}")
             header.append(f"num_obs: {num_obs} max_num_obs: {max_num_obs}")
-
-            #copie names
-            for _ in range(
-
+
+            # copie names
+            for _ in range(
+                5 + obs_types_definitions,
+                5 + obs_types_definitions + num_copies + num_qcs + 1,
+            ):
+                # Read the record length
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
@@ -513,26 +672,26 @@ class obs_sequence:
                 if not record:
                     break
 
-                obs_sequence.check_trailing_record_length(f, record_length)
+                obs_sequence.check_trailing_record_length(f, record_length)
 
-                if _ == 5+obs_types_definitions:
+                if _ == 5 + obs_types_definitions:
                     continue
 
                 # Read the whole record as a string
-                string_value = record.decode('utf-8').strip()
+                string_value = record.decode("utf-8").strip()
                 header.append(string_value)
 
             # first and last obs
-            # Read the record length
+            # Read the record length
             record_length = obs_sequence.read_record_length(f)
 
             # Read the actual record
             record = f.read(record_length)
-
-            obs_sequence.check_trailing_record_length(f, record_length)
+
+            obs_sequence.check_trailing_record_length(f, record_length)
 
             # Read the whole record as a two integers
-            first, last = struct.unpack('ii', record)[:8]
+            first, last = struct.unpack("ii", record)[:8]
             header.append(f"first: {first} last: {last}")
 
             return header
@@ -541,7 +700,7 @@ class obs_sequence:
     def collect_obs_types(header):
         """Create a dictionary for the observation types in the obs_seq header"""
         num_obs_types = int(header[2])
-        types = dict([x.split() for x in header[3:num_obs_types+3]])
+        types = dict([x.split() for x in header[3 : num_obs_types + 3]])
         return types
 
     @staticmethod
@@ -549,32 +708,45 @@ class obs_sequence:
         """
         Extracts the names of the copies from the header of an obs_seq file.
 
-
+        Args:
             header (list): A list of strings representing the lines in the header of the obs_seq file.
 
         Returns:
-            tuple: A tuple containing two elements:
-                - copie_names (list): A list of strings representing the copy names with underscores for spaces.
+            tuple: A tuple containing two elements:
+                - copie_names (list): A list of strings representing the copy names with underscores for spaces.
                 - len(copie_names) (int): The number of copy names.
         """
         for i, line in enumerate(header):
             if "num_obs:" in line and "max_num_obs:" in line:
-                first_copie = i+1
+                first_copie = i + 1
                 break
-        copie_names = [
+        copie_names = [
+            "_".join(x.split()) for x in header[first_copie:-1]
+        ]  # first and last is last line of header
         return copie_names, len(copie_names)
 
+    @staticmethod
+    def num_qc_non_qc(header):
+        """Find the number of qc and non-qc copies in the header"""
+        for line in header:
+            if "num_copies:" in line and "num_qc:" in line:
+                num_non_qc = int(line.split()[1])
+                num_qc = int(line.split()[3])
+        return num_non_qc, num_qc
+
     @staticmethod
     def obs_reader(file, n):
         """Reads the ascii obs sequence file and returns a generator of the obs"""
-        previous_line = ''
-        with open(file, 'r') as f:
+        previous_line = ""
+        with open(file, "r") as f:
             for line in f:
                 if "OBS" in line or "OBS" in previous_line:
                     if "OBS" in line:
                         obs = []
-                        obs.append(line.strip())
-                        for i in range(
+                        obs.append(line.strip())
+                        for i in range(
+                            n + 100
+                        ):  # number of copies + 100. Needs to be bigger than any metadata
                             try:
                                 next_line = next(f)
                             except:
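The new num_qc_non_qc helper just splits the 'num_copies: N num_qc: M' header line on whitespace. A worked example with an illustrative header line:

    line = "num_copies: 3 num_qc: 2"
    num_non_qc = int(line.split()[1])  # 3 non-QC copies
    num_qc = int(line.split()[3])      # 2 QC copies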
@@ -587,11 +759,15 @@ class obs_sequence:
                             else:
                                 obs.append(next_line.strip())
                         yield obs
-                    elif "OBS" in previous_line:
+                    elif (
+                        "OBS" in previous_line
+                    ):  # previous line is because I cannot use f.tell with next
                         obs = []
-                        obs.append(previous_line.strip())
-                        obs.append(line.strip())
-                        for i in range(
+                        obs.append(previous_line.strip())
+                        obs.append(line.strip())
+                        for i in range(
+                            n + 100
+                        ):  # number of copies + 100. Needs to be bigger than any metadata
                             try:
                                 next_line = next(f)
                             except:
@@ -608,19 +784,19 @@ class obs_sequence:
 
     @staticmethod
     def check_trailing_record_length(file, expected_length):
-
+        """Reads and checks the trailing record length from the binary file written by Fortran.
 
-
-
-
+        Args:
+            file (file): The file object.
+            expected_length (int): The expected length of the trailing record.
 
-
-
-
-
-
-
-
+        Assuming 4 bytes:
+        | Record Length (4 bytes) | Data (N bytes) | Trailing Record Length (4 bytes) |
+        """
+        trailing_record_length_bytes = file.read(4)
+        trailing_record_length = struct.unpack("i", trailing_record_length_bytes)[0]
+        if expected_length != trailing_record_length:
+            raise ValueError("Record length mismatch in Fortran binary file")
 
     @staticmethod
     def read_record_length(file):
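check_trailing_record_length encodes the Fortran unformatted-sequential framing documented in its new docstring: a 4-byte length, the payload, then the same 4-byte length repeated. A self-contained sketch of reading one such record, mirroring the logic above:

    import struct

    def read_fortran_record(f):
        head = f.read(4)
        if not head:
            return None                          # end of file
        (length,) = struct.unpack("i", head)
        payload = f.read(length)                 # the actual data
        (tail,) = struct.unpack("i", f.read(4))  # trailing copy of the length
        if tail != length:
            raise ValueError("Record length mismatch in Fortran binary file")
        return payload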
@@ -628,18 +804,17 @@ class obs_sequence:
         record_length_bytes = file.read(4)
         if not record_length_bytes:
             return None  # End of file
-        return struct.unpack('i', record_length_bytes)[0]
-
+        return struct.unpack("i", record_length_bytes)[0]
 
     def obs_binary_reader(self, file, n):
         """Reads the obs sequence binary file and returns a generator of the obs"""
         header_length = len(self.header)
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             # Skip the first len(obs_seq.header) lines
-            for _ in range(header_length-1):
+            for _ in range(header_length - 1):
                 # Read the record length
                 record_length = obs_sequence.read_record_length(f)
-                if record_length is None:
+                if record_length is None:  # End of file
                     break
 
                 # Skip the actual record
@@ -652,79 +827,80 @@ class obs_sequence:
             while True:
                 obs = []
                 obs_num += 1
-                obs.append(f"OBS {obs_num}")
-                for _ in range(n):
+                obs.append(f"OBS {obs_num}")
+                for _ in range(n):  # number of copies
                     # Read the record length
                     record_length = obs_sequence.read_record_length(f)
                     if record_length is None:
                         break
                     # Read the actual record (copie)
                     record = f.read(record_length)
-                    obs.append(struct.unpack('d', record)[0])
+                    obs.append(struct.unpack("d", record)[0])
 
                     # Read the trailing record length (should match the leading one)
                     obs_sequence.check_trailing_record_length(f, record_length)
-
+
                 # linked list info
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
 
                 record = f.read(record_length)
-                int1, int2, int3 = struct.unpack('iii', record[:12])
+                int1, int2, int3 = struct.unpack("iii", record[:12])
                 linked_list_string = f"{int1:<12} {int2:<10} {int3:<12}"
                 obs.append(linked_list_string)
 
                 obs_sequence.check_trailing_record_length(f, record_length)
 
                 # location (note no location header "loc3d" or "loc1d" for binary files)
-                obs.append('loc3d')
+                obs.append("loc3d")
                 record_length = obs_sequence.read_record_length(f)
                 record = f.read(record_length)
-                x,y,z,vert = struct.unpack(
-                location_string = f"{x} {y} {z} {vert}"
-                obs.append(location_string)
+                x, y, z, vert = struct.unpack("dddi", record[:28])
+                location_string = f"{x} {y} {z} {vert}"
+                obs.append(location_string)
 
                 obs_sequence.check_trailing_record_length(f, record_length)
-
+
                 # kind (type of observation) value
-                obs.append('kind')
+                obs.append("kind")
                 record_length_bytes = f.read(4)
-                record_length = struct.unpack('i', record_length_bytes)[0]
+                record_length = struct.unpack("i", record_length_bytes)[0]
                 record = f.read(record_length)
                 kind = f"{struct.unpack('i', record)[0]}"
                 obs.append(kind)
-
+
                 obs_sequence.check_trailing_record_length(f, record_length)
 
                 # time (seconds, days)
                 record_length = obs_sequence.read_record_length(f)
                 record = f.read(record_length)
-                seconds, days = struct.unpack('ii', record)[:8]
+                seconds, days = struct.unpack("ii", record)[:8]
                 time_string = f"{seconds} {days}"
                 obs.append(time_string)
 
                 obs_sequence.check_trailing_record_length(f, record_length)
-
+
                 # obs error variance
                 record_length = obs_sequence.read_record_length(f)
                 record = f.read(record_length)
-                obs.append(struct.unpack('d', record)[0])
-
+                obs.append(struct.unpack("d", record)[0])
+
                 obs_sequence.check_trailing_record_length(f, record_length)
 
                 yield obs
 
-    def composite_types(self, composite_types='use_default'):
+    def composite_types(self, composite_types="use_default"):
         """
         Set up and construct composite types for the DataFrame.
 
-        This function sets up composite types based on a provided YAML configuration or
-        a default configuration. It constructs new composite rows by combining specified
+        This function sets up composite types based on a provided YAML configuration or
+        a default configuration. It constructs new composite rows by combining specified
         components and adds them to the DataFrame.
 
-
-            composite_types (str, optional): The YAML configuration for composite types.
+        Args:
+            composite_types (str, optional): The YAML configuration for composite types.
+                If 'use_default', the default configuration is used. Otherwise, a custom YAML configuration can be provided.
 
         Returns:
             pd.DataFrame: The updated DataFrame with the new composite rows added.
@@ -733,12 +909,12 @@ class obs_sequence:
             Exception: If there are repeat values in the components.
         """
 
-        if composite_types == 'use_default':
+        if composite_types == "use_default":
             composite_yaml = self.default_composite_types
         else:
             composite_yaml = composite_types
-        self.composite_types_dict
-
+        self.composite_types_dict = load_yaml_to_dict(composite_yaml)
+
         components = []
         for value in self.composite_types_dict.values():
             components.extend(value["components"])
@@ -746,31 +922,243 @@ class obs_sequence:
         if len(components) != len(set(components)):
             raise Exception("There are repeat values in components.")
 
-
-
+        # data frame for the composite types
+        df_comp = self.df[
+            self.df["type"]
+            .str.upper()
+            .isin([component.upper() for component in components])
+        ]
 
+        df = pd.DataFrame()
         for key in self.composite_types_dict:
-            df_new = construct_composit(
-
+            df_new = construct_composit(
+                df_comp, key, self.composite_types_dict[key]["components"]
+            )
+            df = pd.concat([df, df_new], axis=0)
+
+        # add the composite types to the DataFrame
+        self.df = pd.concat([self.df, df], axis=0)
+        return
+
+    @classmethod
+    def join(cls, obs_sequences, copies=None):
+        """
+        Join a list of observation sequences together.
+
+        This method combines the headers and observations from a list of obs_sequence objects
+        into a single obs_sequence object.
+
+        Args:
+            obs_sequences (list of obs_sequences): The list of observation sequence objects to join.
+            copies (list of str, optional): A list of copy names to include in the combined data.
+                If not provided, all copies are included.
+
+        Returns:
+            A new obs_sequence object containing the combined data.
+
+        Example:
+            .. code-block:: python
+
+                obs_seq1 = obs_sequence(file='obs_seq1.final')
+                obs_seq2 = obs_sequence(file='obs_seq2.final')
+                obs_seq3 = obs_sequence(file='obs_seq3.final')
+                combined = obs_sequence.join([obs_seq1, obs_seq2, obs_seq3])
+        """
+        if not obs_sequences:
+            raise ValueError("The list of observation sequences is empty.")
+
+        # Create a new obs_sequence object with the combined data
+        combo = cls(file=None)
+
+        # Check if all obs_sequences have compatible attributes
+        first_loc_mod = obs_sequences[0].loc_mod
+        first_has_assimilation_info = obs_sequences[0].has_assimilation_info()
+        first_has_posterior = obs_sequences[0].has_posterior()
+        for obs_seq in obs_sequences:
+            if obs_seq.loc_mod != first_loc_mod:
+                raise ValueError(
+                    "All observation sequences must have the same loc_mod."
+                )
+            if obs_seq.has_assimilation_info() != first_has_assimilation_info:
+                raise ValueError(
+                    "All observation sequences must have assimilation info."
+                )
+            if obs_seq.has_posterior() != first_has_posterior:
+                raise ValueError(
+                    "All observation sequences must have the posterior info."
+                )
+            # HK @todo prior only
+        combo.loc_mod = first_loc_mod
+
+        # check the copies are compatible (list of copies to combine?)
+        # subset of copies if needed  # @todo HK 1d or 3d
+        if copies:
+            start_required_columns = ["obs_num", "observation"]
+            end_required_columns = [
+                "linked_list",
+                "longitude",
+                "latitude",
+                "vertical",
+                "vert_unit",
+                "type",
+                "metadata",
+                "external_FO",
+                "seconds",
+                "days",
+                "time",
+                "obs_err_var",
+            ]
+            required_columns = start_required_columns + end_required_columns
+
+            requested_columns = (
+                start_required_columns
+                + [item for item in copies if item not in required_columns]
+                + end_required_columns
+            )
+
+            for obs_seq in obs_sequences:
+                if not set(requested_columns).issubset(obs_seq.df.columns):
+                    raise ValueError(
+                        "All observation sequences must have the selected copies."
+                    )
+
+            # go through columns and create header
+            remove_list = [
+                "obs_num",
+                "linked_list",
+                "latitude",
+                "longitude",
+                "vertical",
+                "vert_unit",
+                "type",
+                "metadata",
+                "external_FO",
+                "time",
+                "seconds",
+                "days",
+                "obs_err_var",
+            ]
+            # using lists to retain copy order, non_qcs followed by qcs
+            combo.copie_names = [
+                item for item in requested_columns if item not in remove_list
+            ]
+            combo.non_qc_copie_names = [
+                item
+                for item in combo.copie_names
+                if item in obs_sequences[0].non_qc_copie_names
+            ]
+            combo.qc_copie_names = [
+                item
+                for item in combo.copie_names
+                if item in obs_sequences[0].qc_copie_names
+            ]
+
+            combo.n_copies = len(combo.copie_names)
+            combo.n_qc = len(combo.qc_copie_names)
+            combo.n_non_qc = len(combo.non_qc_copie_names)
+
+        else:
+            for obs_seq in obs_sequences:
+                if not obs_sequences[0].df.columns.isin(obs_seq.df.columns).all():
+                    raise ValueError(
+                        "All observation sequences must have the same copies."
+                    )
+            combo.n_copies = obs_sequences[0].n_copies
+            combo.n_qc = obs_sequences[0].n_qc
+            combo.n_non_qc = obs_sequences[0].n_non_qc
+            combo.copie_names = obs_sequences[0].copie_names
+
+        # todo HK @todo combine synonyms for obs?
+
+        # Initialize combined data
+        combined_types = []
+        combined_df = pd.DataFrame()
+        combo.all_obs = None  # set to None to force writing from the dataframe if write_obs_seq is called
+
+        # Iterate over the list of observation sequences and combine their data
+        for obs_seq in obs_sequences:
+            if copies:
+                combined_df = pd.concat(
+                    [combined_df, obs_seq.df[requested_columns]], ignore_index=True
+                )
+            else:
+                combined_df = pd.concat([combined_df, obs_seq.df], ignore_index=True)
+            combined_types.extend(list(obs_seq.reverse_types.keys()))
+
+        # create dictionary of types
+        keys = set(combined_types)
+        combo.reverse_types = {item: i + 1 for i, item in enumerate(keys)}
+        combo.types = {v: k for k, v in combo.reverse_types.items()}
+
+        # create linked list for obs
+        combo.df = combined_df.sort_values(by="time").reset_index(drop=True)
+        combo.df["linked_list"] = obs_sequence.generate_linked_list_pattern(
+            len(combo.df)
+        )
+        combo.df["obs_num"] = combined_df.index + 1
+        combo.create_header(len(combo.df))
+
+        return combo
+
+    def has_assimilation_info(self):
+        """
+        Check if the DataFrame has prior information.
+
+        Returns:
+            bool: True if both 'prior_ensemble_mean' and 'prior_ensemble_spread' columns are present, False otherwise.
+        """
+        return "prior_ensemble_mean".casefold() in map(
+            str.casefold, self.df.columns
+        ) and "prior_ensemble_spread".casefold() in map(str.casefold, self.df.columns)
+
+    def has_posterior(self):
+        """
+        Check if the DataFrame has posterior information.
+
+        Returns:
+            bool: True if both 'posterior_ensemble_mean' and 'posterior_ensemble_spread' columns are present, False otherwise.
+        """
+        return "posterior_ensemble_mean".casefold() in map(
+            str.casefold, self.df.columns
+        ) and "posterior_ensemble_spread".casefold() in map(
+            str.casefold, self.df.columns
+        )
+
+    def create_header(self, n):
+        """Create a header for the obs_seq file from the obs_sequence object."""
+        assert (
+            self.n_copies == self.n_non_qc + self.n_qc
+        ), "n_copies must be equal to n_non_qc + n_qc"
+
+        self.header = []
+        self.header.append(f"obs_sequence")
+        self.header.append("obs_type_definitions")
+        self.header.append(f"{len(self.types)}")
+        for key, value in self.types.items():
+            self.header.append(f"{key} {value}")
+        self.header.append(f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}")
+        self.header.append(f"num_obs: {n} max_num_obs: {n}")
+        for copie in self.copie_names:
+            self.header.append(copie)
+        self.header.append(f"first: 1 last: {n}")
+
 
-        return df_no_comp
-
 def load_yaml_to_dict(file_path):
     """
     Load a YAML file and convert it to a dictionary.
 
-
+    Args:
         file_path (str): The path to the YAML file.
 
     Returns:
         dict: The YAML file content as a dictionary.
     """
     try:
-        with open(file_path, 'r') as file:
+        with open(file_path, "r") as file:
             return yaml.safe_load(file)
     except Exception as e:
         print(f"Error loading YAML file: {e}")
-
+        raise
 
 def convert_dart_time(seconds, days):
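A hedged sketch of the new join classmethod with the optional copies subset; the file and copy names are illustrative, and per the checks above every input sequence must share the same loc_mod and prior/posterior availability:

    s1 = obs_sequence(file='obs_seq1.final')
    s2 = obs_sequence(file='obs_seq2.final')
    combined = obs_sequence.join([s1, s2],
                                 copies=['observation', 'prior_ensemble_mean'])
    combined.write_obs_seq('obs_seq.combined')  # all_obs is None, so the
                                                # DataFrame path is used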
@@ -780,46 +1168,71 @@ def convert_dart_time(seconds, days):
     - base year for Gregorian calendar is 1601
     - dart time is seconds, days since 1601
     """
-    time = dt.datetime(1601,1,1) + dt.timedelta(days=days, seconds=seconds)
+    time = dt.datetime(1601, 1, 1) + dt.timedelta(days=days, seconds=seconds)
     return time
 
+
 def construct_composit(df_comp, composite, components):
     """
     Construct a composite DataFrame by combining rows from two components.
 
     This function takes two DataFrames and combines rows from them based on matching
-    location and time. It creates a new row with a composite type by combining
+    location and time. It creates a new row with a composite type by combining
     specified columns using the square root of the sum of squares method.
 
-
+    Args:
         df_comp (pd.DataFrame): The DataFrame containing the component rows to be combined.
         composite (str): The type name for the new composite rows.
         components (list of str): A list containing the type names of the two components to be combined.
 
     Returns:
-        merged_df (pd.DataFrame):
+        merged_df (pd.DataFrame): A DataFrame containing the new composite rows.
     """
-    selected_rows = df_comp[df_comp[
-    selected_rows_v = df_comp[df_comp[
-
-
-
-
-
-
-
+    selected_rows = df_comp[df_comp["type"] == components[0].upper()]
+    selected_rows_v = df_comp[df_comp["type"] == components[1].upper()]
+
+    prior_columns_to_combine = df_comp.filter(regex="prior_ensemble").columns.tolist()
+    posterior_columns_to_combine = df_comp.filter(
+        regex="posterior_ensemble"
+    ).columns.tolist()
+    columns_to_combine = (
+        prior_columns_to_combine
+        + posterior_columns_to_combine
+        + ["observation", "obs_err_var"]
+    )
+    merge_columns = ["latitude", "longitude", "vertical", "time"]
+    same_obs_columns = merge_columns + [
+        "observation",
+        "obs_err_var",
+    ]  # same observation is duplicated
+
+    if (
+        selected_rows[same_obs_columns].duplicated().sum() > 0
+        or selected_rows_v[same_obs_columns].duplicated().sum() > 0
+    ):
+        print(
+            f"{selected_rows[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+        )
+        print(f"{selected_rows[same_obs_columns]}")
+        print(
+            f"{selected_rows_v[same_obs_columns].duplicated().sum()} duplicates in {composite} component {components[0]}: "
+        )
+        print(f"{selected_rows_v[same_obs_columns]}")
+        raise Exception("There are duplicates in the components.")
 
     # Merge the two DataFrames on location and time columns
-    merged_df = pd.merge(
+    merged_df = pd.merge(
+        selected_rows, selected_rows_v, on=merge_columns, suffixes=("", "_v")
+    )
 
     # Apply the square root of the sum of squares method to the relevant columns
     for col in columns_to_combine:
-        merged_df[col] = np.sqrt(merged_df[col]**2 + merged_df[f'{col}_v']**2)
+        merged_df[col] = np.sqrt(merged_df[col] ** 2 + merged_df[f"{col}_v"] ** 2)
 
     # Create the new composite rows
-    merged_df['type'] = composite.upper()
-    merged_df = merged_df.drop(
+    merged_df["type"] = composite.upper()
+    merged_df = merged_df.drop(
+        columns=[col for col in merged_df.columns if col.endswith("_v")]
+    )
 
     return merged_df