pydartdiags 0.0.43__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- pydartdiags/matplots/__init__.py +0 -0
- pydartdiags/matplots/matplots.py +243 -0
- pydartdiags/obs_sequence/obs_sequence.py +618 -320
- pydartdiags/plots/plots.py +80 -228
- pydartdiags/stats/__init__.py +0 -0
- pydartdiags/stats/stats.py +323 -0
- {pydartdiags-0.0.43.dist-info → pydartdiags-0.5.0.dist-info}/METADATA +9 -5
- pydartdiags-0.5.0.dist-info/RECORD +14 -0
- {pydartdiags-0.0.43.dist-info → pydartdiags-0.5.0.dist-info}/WHEEL +1 -1
- pydartdiags-0.0.43.dist-info/RECORD +0 -10
- {pydartdiags-0.0.43.dist-info → pydartdiags-0.5.0.dist-info}/LICENSE +0 -0
- {pydartdiags-0.0.43.dist-info → pydartdiags-0.5.0.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 import pandas as pd
 import datetime as dt
 import numpy as np
@@ -5,86 +6,140 @@ import os
 import yaml
 import struct
 
+
 def requires_assimilation_info(func):
     def wrapper(self, *args, **kwargs):
         if self.has_assimilation_info:
             return func(self, *args, **kwargs)
         else:
-            raise ValueError("Assimilation information is required to call this function.")
+            raise ValueError(
+                "Assimilation information is required to call this function."
+            )
+
     return wrapper
 
+
 def requires_posterior_info(func):
     def wrapper(self, *args, **kwargs):
-        if self.has_posterior_info:
+        if self.has_posterior:
             return func(self, *args, **kwargs)
         else:
             raise ValueError("Posterior information is required to call this function.")
+
     return wrapper
 
 
 class obs_sequence:
-    """
+    """
+    Initialize an obs_sequence object from an ASCII or binary observation sequence file,
+    or create an empty obs_sequence object from scratch.
+
+    Args:
+        file (str): The input observation sequence ASCII or binary file.
+            If None, an empty obs_sequence object is created from scratch.
+
+    Returns:
+        An obs_sequence object
 
     Attributes:
-        df (pandas.DataFrame): DataFrame containing
-
-
-
-        types (dict): Dictionary of types in the observation sequence file.
-        copie_names (list): Names of copies in the observation sequence file.
-            Spelled 'copie' to avoid conflict with the Python built-in copy function.
+        df (pandas.DataFrame): The DataFrame containing the observation sequence data.
+        header (list): The header of the observation sequence.
+        copie_names (list): The names of the copies in the observation sequence.
+            Spelled 'copie' to avoid conflict with the Python built-in 'copy'.
             Spaces are replaced with underscores in copie_names.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        non_qc_copie_names (list): The names of the copies not including quality control,
+            e.g. observation, mean, ensemble_members
+        qc_copie_names (list): The names of the quality control copies, e.g. DART_QC
+        n_copies(int): The total number of copies in the observation sequence.
+        n_non_qc(int): The number of copies not including quality control.
+        n_qc(int): The number of quality control copies.
+        vert (dict): A dictionary mapping DART vertical coordinate types to their
+            corresponding integer values.
+
+            - undefined: 'VERTISUNDEF'
+            - surface: 'VERTISSURFACE' (value is surface elevation in meters)
+            - model level: 'VERTISLEVEL'
+            - pressure: 'VERTISPRESSURE' (in Pascals)
+            - height: 'VERTISHEIGHT' (in meters)
+            - scale height: 'VERTISSCALEHEIGHT' (unitless)
+        loc_mod (str): The location model, either 'loc3d' or 'loc1d'.
+            For 3D sphere models: latitude and longitude are in degrees in the DataFrame.
+        types (dict): Dictionary of types of observations the observation sequence,
+            e.g. {23: 'ACARS_TEMPERATURE'},
+        reverse_types (dict): Dictionary of types with keys and values reversed, e.g
+            {'ACARS_TEMPERATURE': 23}
+        synonyms_for_obs (list): List of synonyms for the observation column in the DataFrame.
+            The defualt list is
+
+            .. code-block:: python
+
+                [ 'NCEP BUFR observation',
+                'AIRS observation',
+                'GTSPP observation',
+                'SST observation',
+                'observations',
+                'WOD observation']
+
+            You can add more synonyms by providing a list of strings when
+            creating the obs_sequence object.
+
+            .. code-block:: python
+
+                obs_sequence(file, synonyms=['synonym1', 'synonym2']).df
+
+        has_assimilation_info (bool): Indicates if assimilation information is present.
+        has_posterior (bool): Indicates if posterior information is present.
+        seq (generator): Generator of observations from the observation sequence file.
+        all_obs (list): List of all observations, each observation is a list.
+            Valid when the obs_sequence is created from a file.
+            Set to None when the obs_sequence is created from scratch or multiple
+            obs_sequences are joined.
     """
-
-
-
-
-
-
-
-
-
-
-             -1: 'surface (m)',
-             1: 'model level',
-             2: 'pressure (Pa)',
-             3: 'height (m)',
-             4: 'scale height' }
+
+    vert = {
+        -2: "undefined",
+        -1: "surface (m)",
+        1: "model level",
+        2: "pressure (Pa)",
+        3: "height (m)",
+        4: "scale height",
+    }
 
     reversed_vert = {value: key for key, value in vert.items()}
 
-
     def __init__(self, file, synonyms=None):
-
+        """
+        Create an obs_sequence object from an ASCII or binary observation sequence file,
+        or create an empty obs_sequence object from scratch.
+
+        Args:
+            file (str): The input observation sequence ASCII or binary file.
+                If None, an empty obs_sequence object is created from scratch.
+            synonyms (list, optional): List of synonyms for the observation column in the DataFrame.
+
+        Returns:
+            an obs_sequence object
+
+        Examples:
+
+            .. code-block:: python
+
+                obs_seq = obs_sequence(file='obs_seq.final')
+
+        """
+
+        self.loc_mod = "None"
         self.has_assimilation_info = False
        self.has_posterior = False
         self.file = file
-        self.synonyms_for_obs = [
-
-
-
-
-
+        self.synonyms_for_obs = [
+            "NCEP BUFR observation",
+            "AIRS observation",
+            "GTSPP observation",
+            "SST observation",
+            "observations",
+            "WOD observation",
+        ]
         if synonyms:
             if isinstance(synonyms, list):
                 self.synonyms_for_obs.extend(synonyms)
@@ -92,18 +147,22 @@ class obs_sequence:
                 self.synonyms_for_obs.append(synonyms)
 
         if file is None:
-            # Early exit for testing purposes
+            # Early exit - for testing purposes or creating obs_seq objects from scratch
             self.df = pd.DataFrame()
             self.types = {}
             self.reverse_types = {}
             self.copie_names = []
-            self.
+            self.non_qc_copie_names = []
+            self.qc_copie_names = []
+            self.n_copies = 0  # copies including qc
+            self.n_non_qc = 0  # copies not including qc
+            self.n_qc = 0  # number of qc copies
             self.seq = []
             self.all_obs = []
             return
 
         module_dir = os.path.dirname(__file__)
-        self.default_composite_types = os.path.join(module_dir,"composite_types.yaml")
+        self.default_composite_types = os.path.join(module_dir, "composite_types.yaml")
 
         if self.is_binary(file):
             self.header = self.read_binary_header(file)
@@ -113,40 +172,47 @@ class obs_sequence:
         self.types = self.collect_obs_types(self.header)
         self.reverse_types = {v: k for k, v in self.types.items()}
         self.copie_names, self.n_copies = self.collect_copie_names(self.header)
+        self.n_non_qc, self.n_qc = self.num_qc_non_qc(self.header)
+        self.non_qc_copie_names = self.copie_names[: self.n_non_qc]
+        self.qc_copie_names = self.copie_names[self.n_non_qc :]
 
         if self.is_binary(file):
             self.seq = self.obs_binary_reader(file, self.n_copies)
-            self.loc_mod = 'loc3d'
+            self.loc_mod = "loc3d"  # only loc3d supported for binary, & no way to check
         else:
             self.seq = self.obs_reader(file, self.n_copies)
 
-        self.all_obs = self.create_all_obs()
+        self.all_obs = self.create_all_obs()  # uses up the generator
         # at this point you know if the seq is loc3d or loc1d
-        if self.loc_mod == 'None':
-            raise ValueError("Neither 'loc3d' nor 'loc1d' could be found in the observation sequence.")
+        if self.loc_mod == "None":
+            raise ValueError(
+                "Neither 'loc3d' nor 'loc1d' could be found in the observation sequence."
+            )
         self.columns = self.column_headers()
-        self.df = pd.DataFrame(self.all_obs, columns = self.columns)
-        if self.loc_mod == 'loc3d':
-            self.df['longitude'] = np.rad2deg(self.df['longitude'])
-            self.df['latitude'] = np.rad2deg(self.df['latitude'])
+        self.df = pd.DataFrame(self.all_obs, columns=self.columns)
+        if self.loc_mod == "loc3d":
+            self.df["longitude"] = np.rad2deg(self.df["longitude"])
+            self.df["latitude"] = np.rad2deg(self.df["latitude"])
         # rename 'X observation' to observation
-        self.synonyms_for_obs = [synonym.replace(' ', '_') for synonym in self.synonyms_for_obs]
-
+        self.synonyms_for_obs = [
+            synonym.replace(" ", "_") for synonym in self.synonyms_for_obs
+        ]
+        rename_dict = {
+            old: "observation"
+            for old in self.synonyms_for_obs
+            if old in self.df.columns
+        }
         self.df = self.df.rename(columns=rename_dict)
 
-        #check if the assimilation info is present
-        if 'prior_ensemble_mean'.casefold() in map(str.casefold, self.columns):
+        # check if the assimilation info is present
+        if "prior_ensemble_mean".casefold() in map(str.casefold, self.columns):
             self.has_assimilation_info = True
-            self.df['bias'] = (self.df['prior_ensemble_mean'] - self.df['observation'])
-            self.df['sq_err'] = self.df['bias']**2
-        if 'posterior_ensemble_mean'.casefold() in map(str.casefold, self.columns):
-            self.has_posterior_info = True
-            self.df['posterior_bias'] = (self.df['posterior_ensemble_mean'] - self.df['observation'])
-            self.df['posterior_sq_err'] = self.df['posterior_bias']**2
+        if "posterior_ensemble_mean".casefold() in map(str.casefold, self.columns):
+            self.has_posterior = True
 
     def create_all_obs(self):
-        """
-
+        """steps through the generator to create a
+        list of all observations in the sequence
         """
         all_obs = []
         for obs in self.seq:
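A note on the constructor rework above: after loading, 'loc3d' longitude and latitude are stored in degrees in the DataFrame, and the bias and squared-error columns that 0.0.43 computed in __init__ are gone (presumably superseded by the new stats module listed at the top of this diff). A minimal usage sketch, assuming the import path implied by the wheel's RECORD (pydartdiags/obs_sequence/obs_sequence.py) and an illustrative file name:

    from pydartdiags.obs_sequence import obs_sequence as obsq

    obs_seq = obsq.obs_sequence(file='obs_seq.final')  # 'obs_seq.final' is illustrative
    print(obs_seq.df['longitude'].head())              # degrees, not radians
    if obs_seq.has_assimilation_info:                  # set when a prior_ensemble_mean copy exists
        print(obs_seq.df['prior_ensemble_mean'].head())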
@@ -155,49 +221,50 @@ class obs_sequence:
         return all_obs
 
     def obs_to_list(self, obs):
-        """put single observation into a list
-
-        discards obs_def
-        """
+        """put single observation into a list"""
         data = []
-        data.append(obs[0].split()[1])
-        data.extend(list(map(float,obs[1:self.n_copies+1])))
-        data.append(obs[self.n_copies+1])
+        data.append(obs[0].split()[1])  # obs_num
+        data.extend(list(map(float, obs[1 : self.n_copies + 1])))  # all the copies
+        data.append(obs[self.n_copies + 1])  # linked list info
         try:  # HK todo only have to check loc3d or loc1d for the first observation, the whole file is the same
-            locI = obs.index('loc3d')
-            location = obs[locI+1].split()
+            locI = obs.index("loc3d")
+            location = obs[locI + 1].split()
             data.append(float(location[0]))  # location x
             data.append(float(location[1]))  # location y
             data.append(float(location[2]))  # location z
             data.append(obs_sequence.vert[int(location[3])])
-            self.loc_mod = 'loc3d'
+            self.loc_mod = "loc3d"
         except ValueError:
             try:
-                locI = obs.index('loc1d')
-                location = obs[locI+1]
-                data.append(float(location))  # 1d location
-                self.loc_mod = 'loc1d'
+                locI = obs.index("loc1d")
+                location = obs[locI + 1]
+                data.append(float(location))  # 1d location
+                self.loc_mod = "loc1d"
             except ValueError:
-                raise ValueError("Neither 'loc3d' nor 'loc1d' could be found in the observation sequence.")
-        typeI = obs.index('kind')  # type of observation
+                raise ValueError(
+                    "Neither 'loc3d' nor 'loc1d' could be found in the observation sequence."
+                )
+        typeI = obs.index("kind")  # type of observation
         type_value = obs[typeI + 1]
         if not self.types:
-            data.append('Identity')
+            data.append("Identity")
         else:
-            data.append(self.types[type_value])
-
+            data.append(self.types[type_value])  # observation type
+
         # any observation specific obs def info is between here and the end of the list
         # can be obs_def & external forward operator
-        metadata = obs[typeI+2:-2]
+        metadata = obs[typeI + 2 : -2]
         obs_def_metadata, external_metadata = self.split_metadata(metadata)
         data.append(obs_def_metadata)
         data.append(external_metadata)
 
         time = obs[-2].split()
-        data.append(int(time[0]))
-        data.append(int(time[1]))
-        data.append(convert_dart_time(int(time[0]), int(time[1])))
-        data.append(float(obs[-1]))
+        data.append(int(time[0]))  # seconds
+        data.append(int(time[1]))  # days
+        data.append(
+            convert_dart_time(int(time[0]), int(time[1]))
+        )  # datetime # HK todo what is approprate for 1d models?
+        data.append(float(obs[-1]))  # obs error variance ?convert to sd?
 
         return data
 
@@ -215,41 +282,49 @@ class obs_sequence:
         the first sublist contains the entire metadata list, and the second is empty.
         """
         for i, item in enumerate(metadata):
-            if item.startswith('external_FO'):
+            if item.startswith("external_FO"):
                 return metadata[:i], metadata[i:]
         return metadata, []
 
     def list_to_obs(self, data):
+        """convert a list of data to an observation
+
+        Assuming the order of the list is obs_seq.copie_names
+
+        """
         obs = []
-        obs.append('OBS ' + str(data[0]))  # obs_num lots of space
-        obs.extend(data[1:self.n_copies+1])  # all the copies
-        obs.append(data[self.n_copies+1])  # linked list info
-        obs.append('obdef')
+        obs.append("OBS " + str(data[0]))  # obs_num lots of space
+        obs.extend(data[1 : self.n_copies + 1])  # all the copies
+        obs.append(data[self.n_copies + 1])  # linked list info
+        obs.append("obdef")  # TODO HK: extended_FO obs_def
         obs.append(self.loc_mod)
-        if self.loc_mod == 'loc3d':
-            obs.append(' '.join(map(str, data[self.n_copies+2:self.n_copies+5])) + ' ' + str(self.reversed_vert[data[self.n_copies+5]]))  # location x, y, z, vert
-            obs.append('kind')  # this is type of observation
+        if self.loc_mod == "loc3d":
+            obs.append(
+                " ".join(map(str, data[self.n_copies + 2 : self.n_copies + 5]))
+                + " "
+                + str(self.reversed_vert[data[self.n_copies + 5]])
+            )  # location x, y, z, vert
+            obs.append("kind")  # this is type of observation
             obs.append(self.reverse_types[data[self.n_copies + 6]])  # observation type
-            # Convert metadata to a string and append
+            # Convert metadata to a string and append !HK @todo you are not converting to string
             obs.extend(data[self.n_copies + 7])  # metadata
-        elif self.loc_mod == 'loc1d':
-            obs.append(data[self.n_copies + 2])  # 1d location
-            obs.append('kind')  # this is type of observation
+            obs.extend(data[self.n_copies + 8])  # external forward operator
+        elif self.loc_mod == "loc1d":
+            obs.append(data[self.n_copies + 2])  # 1d location
+            obs.append("kind")  # this is type of observation
             obs.append(self.reverse_types[data[self.n_copies + 3]])  # observation type
-            #
-
-
-            obs.append(metadata)  # metadata
-        obs.append(' '.join(map(str, data[-4:-2])))  # seconds, days
+            obs.extend(data[self.n_copies + 4])  # metadata
+            obs.extend(data[self.n_copies + 5])  # external forward operator
+        obs.append(" ".join(map(str, data[-4:-2])))  # seconds, days
         obs.append(data[-1])  # obs error variance
 
         return obs
 
     @staticmethod
     def generate_linked_list_pattern(n):
-        """Create a list of strings with the linked list pattern for n
+        """Create a list of strings with the linked list pattern for n observations."""
         result = []
-        for i in range(n-1):
+        for i in range(n - 1):
             col1 = i if i > 0 else -1
             col2 = i + 2
             col3 = -1
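The linked-list bookkeeping reformatted here is easiest to see with a worked example. For n = 3, the loop body above plus the final result.append at the top of the next hunk produce one row per observation of the form previous/next/unused, with -1 marking no link (column widths are illustrative; the loop's own format string is unchanged context not shown in this diff):

    -1          2          -1
    1           3          -1
    2           -1         -1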
@@ -257,101 +332,97 @@ class obs_sequence:
         result.append(f"{n-1:<12}{'-1':<11}{'-1'}")
         return result
 
-    def write_obs_seq(self, file, df=None):
+    def write_obs_seq(self, file):
         """
         Write the observation sequence to a file.
-
-        This function writes the observation sequence to
-
-
-
-
-
-
-
-        Parameters:
+
+        This function writes the observation sequence stored in the obs_seq.DataFrame to a specified file.
+        It updates the header with the number of observations, converts coordinates back to radians
+        if necessary, drops unnecessary columns, sorts the DataFrame by time, and generates a linked
+        list pattern for reading by DART programs.
+
+        Args:
             file (str): The path to the file where the observation sequence will be written.
-            df (pandas.DataFrame, optional): A DataFrame containing the observation data. If not provided, the function uses self.header and self.all_obs.
-
-        Returns:
-            None
-
-        Examples:
-            ``obs_seq.write_obs_seq('/path/to/output/file')``
-            ``obs_seq.write_obs_seq('/path/to/output/file', df=obs_seq.df)``
-        """
-        with open(file, 'w') as f:
-
-            if df is not None:
-                # If a DataFrame is provided, update the header with the number of observations
-                num_rows = len(df)
-                replacement_string = f'num_obs: {num_rows:>10} max_num_obs: {num_rows:>10}'
-                new_header = [replacement_string if 'num_obs' in element else element for element in self.header]
-
-                for line in new_header[:-1]:
-                    f.write(str(line) + '\n')
-                first = 1
-                f.write(f"first: {first:>12} last: {num_rows:>12}\n")
-
-                # TODO HK is there something better than copying the whole thing here?
-                df_copy = df.copy()  # copy since you want to change for writing.
-                # back to radians for obs_seq
-                if self.loc_mod == 'loc3d':
-                    df_copy['longitude'] = np.deg2rad(self.df['longitude'])
-                    df_copy['latitude'] = np.deg2rad(self.df['latitude'])
-                if 'bias' in df_copy.columns:
-                    df_copy = df_copy.drop(columns=['bias', 'sq_err'])
-
-                # linked list for reading by dart programs
-                df_copy = df_copy.sort_values(by=['time'])  # sort the DataFrame by time
-                df_copy['obs_num'] = df.index + 1  # obs_num in time order
-                df_copy['linked_list'] = obs_sequence.generate_linked_list_pattern(len(df_copy))  # linked list pattern
-
-                def write_row(row):
-                    ob_write = self.list_to_obs(row.tolist())
-                    for line in ob_write:
-                        f.write(str(line) + '\n')
-
-                df_copy.apply(write_row, axis=1)
-
-            else:
-                # If no DataFrame is provided, use self.header and self.all_obs
-                for line in self.header:
-                    f.write(str(line) + '\n')
-                for obs in self.all_obs:
-                    ob_write = self.list_to_obs(obs)
-                    for line in ob_write:
-                        f.write(str(line) + '\n')
 
+        Notes:
+            - Longitude and latitude are converted back to radians if the location model is 'loc3d'.
+            - The 'bias' and 'sq_err' columns are dropped if they exist in the DataFrame.
+            - The DataFrame is sorted by the 'time' column.
+            - An 'obs_num' column is added to the DataFrame to number the observations in time order.
+            - A 'linked_list' column is generated to create a linked list pattern for the observations.
+
+        Example:
+            obsq.write_obs_seq('obs_seq.new')
+
+        """
+        with open(file, "w") as f:
+
+            # If a DataFrame is provided, update the header with the number of observations
+            num_rows = len(self.df)
+            replacement_string = f"num_obs: {num_rows:>10} max_num_obs: {num_rows:>10}"
+            new_header = [
+                replacement_string if "num_obs" in element else element
+                for element in self.header
+            ]
+
+            for line in new_header[:-1]:
+                f.write(str(line) + "\n")
+            first = 1
+            f.write(f"first: {first:>12} last: {num_rows:>12}\n")
+
+            # TODO HK is there something better than copying the whole thing here?
+            df_copy = self.df.copy()  # copy since you want to change for writing.
+            # back to radians for obs_seq
+            if self.loc_mod == "loc3d":
+                df_copy["longitude"] = np.deg2rad(self.df["longitude"]).round(16)
+                df_copy["latitude"] = np.deg2rad(self.df["latitude"]).round(16)
+            if "bias" in df_copy.columns:
+                df_copy = df_copy.drop(columns=["bias", "sq_err"])
+
+            # linked list for reading by dart programs
+            df_copy = df_copy.sort_values(
+                by=["time"], kind="stable"
+            )  # sort the DataFrame by time
+            df_copy["obs_num"] = self.df.index + 1  # obs_num in time order
+            df_copy["linked_list"] = obs_sequence.generate_linked_list_pattern(
+                len(df_copy)
+            )  # linked list pattern
+
+            def write_row(row):
+                ob_write = self.list_to_obs(row.tolist())
+                for line in ob_write:
+                    f.write(str(line) + "\n")
+
+            df_copy.apply(write_row, axis=1)
 
     def column_headers(self):
-        """define the columns for the dataframe
+        """define the columns for the dataframe"""
         heading = []
-        heading.append('obs_num')
+        heading.append("obs_num")
         heading.extend(self.copie_names)
-        heading.append('linked_list')
-        if self.loc_mod == 'loc3d':
-            heading.append('longitude')
-            heading.append('latitude')
-            heading.append('vertical')
-            heading.append('vert_unit')
-        elif self.loc_mod == 'loc1d':
-            heading.append('location')
-        heading.append('type')
-        heading.append('metadata')
-        heading.append('external_FO')
-        heading.append('seconds')
-        heading.append('days')
-        heading.append('time')
-        heading.append('obs_err_var')
+        heading.append("linked_list")
+        if self.loc_mod == "loc3d":
+            heading.append("longitude")
+            heading.append("latitude")
+            heading.append("vertical")
+            heading.append("vert_unit")
+        elif self.loc_mod == "loc1d":
+            heading.append("location")
+        heading.append("type")
+        heading.append("metadata")
+        heading.append("external_FO")
+        heading.append("seconds")
+        heading.append("days")
+        heading.append("time")
+        heading.append("obs_err_var")
         return heading
 
-    @requires_assimilation_info
+    @requires_assimilation_info
     def select_by_dart_qc(self, dart_qc):
         """
         Selects rows from a DataFrame based on the DART quality control flag.
 
-        Parameters:
+        Args:
             df (DataFrame): A pandas DataFrame.
             dart_qc (int): The DART quality control flag to select.
 
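Note the breaking change in this hunk: write_obs_seq(self, file, df=None) becomes write_obs_seq(self, file), and the old self.header/self.all_obs fallback path is deleted, so the method now always writes self.df. A hedged migration sketch for callers that passed a filtered frame through the removed df argument:

    # 0.0.43: obs_seq.write_obs_seq('obs_seq.new', df=filtered)
    # 0.5.0: assign the frame first, then write
    obs_seq.df = filtered
    obs_seq.write_obs_seq('obs_seq.new')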
@@ -361,10 +432,12 @@ class obs_sequence:
         Raises:
             ValueError: If the DART quality control flag is not present in the DataFrame.
         """
-        if dart_qc not in self.df['DART_quality_control'].unique():
-            raise ValueError(f"DART quality control flag '{dart_qc}' not found in DataFrame.")
+        if dart_qc not in self.df["DART_quality_control"].unique():
+            raise ValueError(
+                f"DART quality control flag '{dart_qc}' not found in DataFrame."
+            )
         else:
-            return self.df[self.df['DART_quality_control'] == dart_qc]
+            return self.df[self.df["DART_quality_control"] == dart_qc]
 
     @requires_assimilation_info
     def select_failed_qcs(self):
@@ -374,7 +447,7 @@ class obs_sequence:
         Returns:
             pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag greater than 0.
         """
-        return self.df[self.df['DART_quality_control'] > 0]
+        return self.df[self.df["DART_quality_control"] > 0]
 
     @requires_assimilation_info
     def possible_vs_used(self):
@@ -392,31 +465,29 @@ class obs_sequence:
         'possible' is the count of all observations of that type, and 'used' is the count of observations of that type
         that passed quality control checks.
         """
-        possible = self.df.groupby('type')['observation'].count()
-        possible.rename('possible', inplace=True)
-
-        failed_qcs = self.select_failed_qcs().groupby('type')['observation'].count()
+        possible = self.df.groupby("type")["observation"].count()
+        possible.rename("possible", inplace=True)
+
+        failed_qcs = self.select_failed_qcs().groupby("type")["observation"].count()
         used = possible - failed_qcs.reindex(possible.index, fill_value=0)
-        used.rename('used', inplace=True)
-
-        return pd.concat([possible, used], axis=1).reset_index()
+        used.rename("used", inplace=True)
 
+        return pd.concat([possible, used], axis=1).reset_index()
 
     @staticmethod
     def is_binary(file):
         """Check if a file is binary file."""
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             chunk = f.read(1024)
-            if b'\0' in chunk:
+            if b"\0" in chunk:
                 return True
             return False
 
-
     @staticmethod
     def read_header(file):
         """Read the header and number of lines in the header of an ascii obs_seq file"""
         header = []
-        with open(file, 'r') as f:
+        with open(file, "r") as f:
             for line in f:
                 if "first:" in line and "last:" in line:
                     header.append(line.strip())
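All three quality-control helpers in this region key off the DART_quality_control copy and are gated by @requires_assimilation_info. A usage sketch, assuming obs_seq was built from an assimilated file so the guard passes:

    counts = obs_seq.possible_vs_used()    # DataFrame with columns: type, possible, used
    rejected = obs_seq.select_failed_qcs() # rows with DART_quality_control > 0
    qc4 = obs_seq.select_by_dart_qc(4)     # raises ValueError if no row carries flag 4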
@@ -432,19 +503,19 @@ class obs_sequence:
         linecount = 0
         obs_types_definitions = -1000
         num_obs = 0
-        max_num_obs = 0 
+        max_num_obs = 0
         # need to get:
         # number of obs_type_definitions
         # number of copies
         # number of qcs
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             while True:
                 # Read the record length
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
                 record = f.read(record_length)
-                if not record:
+                if not record:  # end of file
                     break
 
                 # Read the trailing record length (should match the leading one)
@@ -452,17 +523,19 @@ class obs_sequence:
 
                 linecount += 1
 
-                if linecount == 3:
-                    obs_types_definitions = struct.unpack('i', record)[0]
-                    continue
+                if linecount == 3:
+                    obs_types_definitions = struct.unpack("i", record)[0]
+                    continue
 
-                if linecount == 4+obs_types_definitions:
-                    num_copies, num_qcs, num_obs, max_num_obs = struct.unpack('iiii', record)[:16]
+                if linecount == 4 + obs_types_definitions:
+                    num_copies, num_qcs, num_obs, max_num_obs = struct.unpack(
+                        "iiii", record
+                    )[:16]
                     break
-
+
             # Go back to the beginning of the file
             f.seek(0)
-
+
             for _ in range(2):
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
@@ -472,14 +545,14 @@ class obs_sequence:
                 if not record:  # end of file
                     break
 
-                obs_sequence.check_trailing_record_length(f, record_length)
-                header.append(record.decode('utf-8').strip())
+                obs_sequence.check_trailing_record_length(f, record_length)
+                header.append(record.decode("utf-8").strip())
 
             header.append(str(obs_types_definitions))
 
             # obs_types_definitions
-            for _ in range(3,4+obs_types_definitions):
-
+            for _ in range(3, 4 + obs_types_definitions):
+                # Read the record length
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
@@ -489,21 +562,24 @@ class obs_sequence:
                 if not record:  # end of file
                     break
 
-                obs_sequence.check_trailing_record_length(f, record_length)
+                obs_sequence.check_trailing_record_length(f, record_length)
 
                 if _ == 3:
-                    continue
+                    continue  # num obs_types_definitions
                 # Read an integer and a string from the record
-                integer_value = struct.unpack('i', record[:4])[0]
-                string_value = record[4:].decode('utf-8').strip()
+                integer_value = struct.unpack("i", record[:4])[0]
+                string_value = record[4:].decode("utf-8").strip()
                 header.append(f"{integer_value} {string_value}")
 
             header.append(f"num_copies: {num_copies} num_qc: {num_qcs}")
             header.append(f"num_obs: {num_obs} max_num_obs: {max_num_obs}")
-
-            #copie names
-            for _ in range(5+obs_types_definitions, 5+obs_types_definitions+num_copies+num_qcs+1):
-
+
+            # copie names
+            for _ in range(
+                5 + obs_types_definitions,
+                5 + obs_types_definitions + num_copies + num_qcs + 1,
+            ):
+                # Read the record length
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
@@ -513,26 +589,26 @@ class obs_sequence:
                 if not record:
                     break
 
-                obs_sequence.check_trailing_record_length(f, record_length)
+                obs_sequence.check_trailing_record_length(f, record_length)
 
-                if _ == 5+obs_types_definitions:
+                if _ == 5 + obs_types_definitions:
                     continue
 
                 # Read the whole record as a string
-                string_value = record.decode('utf-8').strip()
+                string_value = record.decode("utf-8").strip()
                 header.append(string_value)
 
             # first and last obs
-            # Read the record length
+            # Read the record length
             record_length = obs_sequence.read_record_length(f)
 
             # Read the actual record
             record = f.read(record_length)
-
-            obs_sequence.check_trailing_record_length(f, record_length)
+
+            obs_sequence.check_trailing_record_length(f, record_length)
 
             # Read the whole record as a two integers
-            first, last = struct.unpack('ii', record)[:8]
+            first, last = struct.unpack("ii", record)[:8]
             header.append(f"first: {first} last: {last}")
 
             return header
@@ -541,7 +617,7 @@ class obs_sequence:
     def collect_obs_types(header):
         """Create a dictionary for the observation types in the obs_seq header"""
         num_obs_types = int(header[2])
-        types = dict([x.split() for x in header[3:num_obs_types+3]])
+        types = dict([x.split() for x in header[3 : num_obs_types + 3]])
         return types
 
     @staticmethod
@@ -549,32 +625,45 @@ class obs_sequence:
         """
         Extracts the names of the copies from the header of an obs_seq file.
 
-        Parameters:
+        Args:
             header (list): A list of strings representing the lines in the header of the obs_seq file.
 
         Returns:
-        tuple: A tuple containing two elements:
-            - copie_names (list): A list of strings representing the copy names with underscores for spaces.
+            tuple: A tuple containing two elements:
+                - copie_names (list): A list of strings representing the copy names with underscores for spaces.
                 - len(copie_names) (int): The number of copy names.
         """
         for i, line in enumerate(header):
             if "num_obs:" in line and "max_num_obs:" in line:
-                first_copie = i+1
+                first_copie = i + 1
                 break
-        copie_names = ['_'.join(x.split()) for x in header[first_copie:-1]]  # first and last is last line of header
+        copie_names = [
+            "_".join(x.split()) for x in header[first_copie:-1]
+        ]  # first and last is last line of header
         return copie_names, len(copie_names)
 
+    @staticmethod
+    def num_qc_non_qc(header):
+        """Find the number of qc and non-qc copies in the header"""
+        for line in header:
+            if "num_copies:" in line and "num_qc:" in line:
+                num_non_qc = int(line.split()[1])
+                num_qc = int(line.split()[3])
+                return num_non_qc, num_qc
+
     @staticmethod
     def obs_reader(file, n):
         """Reads the ascii obs sequence file and returns a generator of the obs"""
-        previous_line = ''
-        with open(file, 'r') as f:
+        previous_line = ""
+        with open(file, "r") as f:
             for line in f:
                 if "OBS" in line or "OBS" in previous_line:
                     if "OBS" in line:
                         obs = []
-                        obs.append(line.strip())
-                        for i in range(n+100):  # number of copies + 100. Needs to be bigger than any metadata
+                        obs.append(line.strip())
+                        for i in range(
+                            n + 100
+                        ):  # number of copies + 100. Needs to be bigger than any metadata
                             try:
                                 next_line = next(f)
                             except:
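The new num_qc_non_qc static method reads the copy counts from the header line of the form 'num_copies: N num_qc: M', taking whitespace-split fields 1 and 3. A standalone sketch of the same parse on an illustrative header line:

    line = "num_copies:       2  num_qc:       1"
    num_non_qc, num_qc = int(line.split()[1]), int(line.split()[3])
    assert (num_non_qc, num_qc) == (2, 1)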
@@ -587,11 +676,15 @@ class obs_sequence:
                             else:
                                 obs.append(next_line.strip())
                         yield obs
-                elif "OBS" in previous_line:  # previous line is because I cannot use f.tell with next
+                elif (
+                    "OBS" in previous_line
+                ):  # previous line is because I cannot use f.tell with next
                     obs = []
-                    obs.append(previous_line.strip())
-                    obs.append(line.strip())
-                    for i in range(n+100):  # number of copies + 100. Needs to be bigger than any metadata
+                    obs.append(previous_line.strip())
+                    obs.append(line.strip())
+                    for i in range(
+                        n + 100
+                    ):  # number of copies + 100. Needs to be bigger than any metadata
                         try:
                             next_line = next(f)
                         except:
@@ -608,19 +701,19 @@ class obs_sequence:
 
     @staticmethod
     def check_trailing_record_length(file, expected_length):
-
+        """Reads and checks the trailing record length from the binary file written by Fortran.
 
-
-
-
+        Args:
+            file (file): The file object.
+            expected_length (int): The expected length of the trailing record.
 
-
-
-
-
-
-
-
+        Assuming 4 bytes:
+        | Record Length (4 bytes) | Data (N bytes) | Trailing Record Length (4 bytes) |
+        """
+        trailing_record_length_bytes = file.read(4)
+        trailing_record_length = struct.unpack("i", trailing_record_length_bytes)[0]
+        if expected_length != trailing_record_length:
+            raise ValueError("Record length mismatch in Fortran binary file")
 
     @staticmethod
     def read_record_length(file):
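check_trailing_record_length and read_record_length together handle the sequential record framing Fortran writes: a 4-byte record length, N payload bytes, then the same 4-byte length repeated. A minimal reader sketch built only from the struct calls visible in the diff (native byte order, as the code assumes):

    import struct

    def read_fortran_record(f):
        head = f.read(4)
        if not head:
            return None                           # end of file
        (n,) = struct.unpack("i", head)           # leading record length
        payload = f.read(n)
        (tail,) = struct.unpack("i", f.read(4))   # trailing record length
        if tail != n:
            raise ValueError("Record length mismatch in Fortran binary file")
        return payload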
@@ -628,18 +721,17 @@ class obs_sequence:
         record_length_bytes = file.read(4)
         if not record_length_bytes:
             return None  # End of file
-        return struct.unpack('i', record_length_bytes)[0]
-
+        return struct.unpack("i", record_length_bytes)[0]
 
     def obs_binary_reader(self, file, n):
         """Reads the obs sequence binary file and returns a generator of the obs"""
         header_length = len(self.header)
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             # Skip the first len(obs_seq.header) lines
-            for _ in range(header_length-1):
+            for _ in range(header_length - 1):
                 # Read the record length
                 record_length = obs_sequence.read_record_length(f)
-                if record_length is None:
+                if record_length is None:  # End of file
                     break
 
                 # Skip the actual record
@@ -652,78 +744,78 @@ class obs_sequence:
             while True:
                 obs = []
                 obs_num += 1
-                obs.append(f"OBS {obs_num}")
-                for _ in range(n):
+                obs.append(f"OBS {obs_num}")
+                for _ in range(n):  # number of copies
                     # Read the record length
                     record_length = obs_sequence.read_record_length(f)
                     if record_length is None:
                         break
                     # Read the actual record (copie)
                     record = f.read(record_length)
-                    obs.append(struct.unpack('d', record)[0])
+                    obs.append(struct.unpack("d", record)[0])
 
                     # Read the trailing record length (should match the leading one)
                     obs_sequence.check_trailing_record_length(f, record_length)
-
+
                 # linked list info
                 record_length = obs_sequence.read_record_length(f)
                 if record_length is None:
                     break
 
                 record = f.read(record_length)
-                int1, int2, int3 = struct.unpack('iii', record[:12])
+                int1, int2, int3 = struct.unpack("iii", record[:12])
                 linked_list_string = f"{int1:<12} {int2:<10} {int3:<12}"
                 obs.append(linked_list_string)
 
                 obs_sequence.check_trailing_record_length(f, record_length)
 
                 # location (note no location header "loc3d" or "loc1d" for binary files)
-                obs.append('loc3d')
+                obs.append("loc3d")
                 record_length = obs_sequence.read_record_length(f)
                 record = f.read(record_length)
-                x,y,z,vert = struct.unpack('dddi', record[:28])
-                location_string = f"{x} {y} {z} {vert}"
-                obs.append(location_string)
+                x, y, z, vert = struct.unpack("dddi", record[:28])
+                location_string = f"{x} {y} {z} {vert}"
+                obs.append(location_string)
 
                 obs_sequence.check_trailing_record_length(f, record_length)
-
+
                 # kind (type of observation) value
-                obs.append('kind')
+                obs.append("kind")
                 record_length_bytes = f.read(4)
-                record_length = struct.unpack('i', record_length_bytes)[0]
+                record_length = struct.unpack("i", record_length_bytes)[0]
                 record = f.read(record_length)
                 kind = f"{struct.unpack('i', record)[0]}"
                 obs.append(kind)
-
+
                 obs_sequence.check_trailing_record_length(f, record_length)
 
                 # time (seconds, days)
                 record_length = obs_sequence.read_record_length(f)
                 record = f.read(record_length)
-                seconds, days = struct.unpack('ii', record)[:8]
+                seconds, days = struct.unpack("ii", record)[:8]
                 time_string = f"{seconds} {days}"
                 obs.append(time_string)
 
                 obs_sequence.check_trailing_record_length(f, record_length)
-
+
                 # obs error variance
                 record_length = obs_sequence.read_record_length(f)
                 record = f.read(record_length)
-                obs.append(struct.unpack('d', record)[0])
-
+                obs.append(struct.unpack("d", record)[0])
+
                 obs_sequence.check_trailing_record_length(f, record_length)
 
                 yield obs
 
-    def composite_types(self, composite_types='use_default'):
+    def composite_types(self, composite_types="use_default"):
         """
         Set up and construct composite types for the DataFrame.
 
-        This function sets up composite types based on a provided YAML configuration or 
-        a default configuration. It constructs new composite rows by combining specified 
+        This function sets up composite types based on a provided YAML configuration or
+        a default configuration. It constructs new composite rows by combining specified
         components and adds them to the DataFrame.
 
-        Parameters:
+        Args:
             composite_types (str, optional): The YAML configuration for composite types. If 'use_default', the default configuration is used. Otherwise, a custom YAML configuration can be provided.
 
         Returns:
@@ -733,12 +825,12 @@ class obs_sequence:
             Exception: If there are repeat values in the components.
         """
 
-        if composite_types == 'use_default':
+        if composite_types == "use_default":
             composite_yaml = self.default_composite_types
         else:
             composite_yaml = composite_types
-        self.composite_types_dict = load_yaml_to_dict(composite_yaml)
-
+        self.composite_types_dict = load_yaml_to_dict(composite_yaml)
+
         components = []
         for value in self.composite_types_dict.values():
             components.extend(value["components"])
@@ -746,31 +838,234 @@ class obs_sequence:
         if len(components) != len(set(components)):
             raise Exception("There are repeat values in components.")
 
-        df_comp = self.df[self.df['type'].str.upper().isin([component.upper() for component in components])]
-        df_no_comp = self.df[~self.df['type'].str.upper().isin([component.upper() for component in components])]
+        df_comp = self.df[
+            self.df["type"]
+            .str.upper()
+            .isin([component.upper() for component in components])
+        ]
+        df_no_comp = self.df[
+            ~self.df["type"]
+            .str.upper()
+            .isin([component.upper() for component in components])
+        ]
 
         for key in self.composite_types_dict:
-            df_new = construct_composit(df_comp, key, self.composite_types_dict[key]['components'])
+            df_new = construct_composit(
+                df_comp, key, self.composite_types_dict[key]["components"]
+            )
             df_no_comp = pd.concat([df_no_comp, df_new], axis=0)
 
         return df_no_comp
-
+
+    @classmethod
+    def join(cls, obs_sequences, copies=None):
+        """
+        Join a list of observation sequences together.
+
+        This method combines the headers and observations from a list of obs_sequence objects
+        into a single obs_sequence object.
+
+        Args:
+            obs_sequences (list of obs_sequences): The list of observation sequences objects to join.
+            copies (list of str, optional): A list of copy names to include in the combined data.
+                If not provided, all copies are included.
+
+        Returns:
+            A new obs_sequence object containing the combined data.
+
+        Example:
+            .. code-block:: python
+
+                obs_seq1 = obs_sequence(file='obs_seq1.final')
+                obs_seq2 = obs_sequence(file='obs_seq2.final')
+                obs_seq3 = obs_sequence(file='obs_seq3.final')
+                combined = obs_sequence.join([obs_seq1, obs_seq2, obs_seq3])
+        """
+        if not obs_sequences:
+            raise ValueError("The list of observation sequences is empty.")
+
+        # Create a new obs_sequnece object with the combined data
+        combo = cls(file=None)
+
+        # Check if all obs_sequences have compatible attributes
+        first_loc_mod = obs_sequences[0].loc_mod
+        first_has_assimilation_info = obs_sequences[0].has_assimilation_info
+        first_has_posterior = obs_sequences[0].has_posterior
+        for obs_seq in obs_sequences:
+            if obs_seq.loc_mod != first_loc_mod:
+                raise ValueError(
+                    "All observation sequences must have the same loc_mod."
+                )
+            if obs_seq.has_assimilation_info != first_has_assimilation_info:
+                raise ValueError(
+                    "All observation sequences must have assimilation info."
+                )
+            if obs_seq.has_posterior != first_has_posterior:
+                raise ValueError(
+                    "All observation sequences must have the posterior info."
+                )
+        # HK @todo prior only
+        combo.loc_mod = first_loc_mod
+
+        # check the copies are compatible (list of copies to combine?)
+        # subset of copies if needed
+        if copies:
+            start_required_columns = ["obs_num", "observation"]
+            end_required_columns = [
+                "linked_list",
+                "longitude",
+                "latitude",
+                "vertical",
+                "vert_unit",
+                "type",
+                "metadata",
+                "external_FO",
+                "seconds",
+                "days",
+                "time",
+                "obs_err_var",
+            ]
+            required_columns = start_required_columns + end_required_columns
+
+            requested_columns = (
+                start_required_columns
+                + [item for item in copies if item not in required_columns]
+                + end_required_columns
+            )
+
+            for obs_seq in obs_sequences:
+                if not set(requested_columns).issubset(obs_seq.df.columns):
+                    raise ValueError(
+                        "All observation sequences must have the selected copies."
+                    )
+
+            # go through columns and create header
+            remove_list = [
+                "obs_num",
+                "linked_list",
+                "latitude",
+                "longitude",
+                "vertical",
+                "vert_unit",
+                "type",
+                "metadata",
+                "external_FO",
+                "time",
+                "seconds",
+                "days",
+                "obs_err_var",
+            ]
+            # using lists to retain copy order, non_qcs followed by qcs
+            combo.copie_names = [
+                item for item in requested_columns if item not in remove_list
+            ]
+            combo.non_qc_copie_names = [
+                item
+                for item in combo.copie_names
+                if item in obs_sequences[0].non_qc_copie_names
+            ]
+            combo.qc_copie_names = [
+                item
+                for item in combo.copie_names
+                if item in obs_sequences[0].qc_copie_names
+            ]
+
+            combo.n_copies = len(combo.copie_names)
+            combo.n_qc = len(combo.qc_copie_names)
+            combo.n_non_qc = len(combo.non_qc_copie_names)
+
+        else:
+            for obs_seq in obs_sequences:
+                if not obs_sequences[0].df.columns.isin(obs_seq.df.columns).all():
+                    raise ValueError(
+                        "All observation sequences must have the same copies."
+                    )
+            combo.n_copies = obs_sequences[0].n_copies
+            combo.n_qc = obs_sequences[0].n_qc
+            combo.n_non_qc = obs_sequences[0].n_non_qc
+            combo.copie_names = obs_sequences[0].copie_names
+
+        # todo HK @todo combine synonyms for obs?
+
+        # Initialize combined data
+        combined_types = []
+        combined_df = pd.DataFrame()
+        combo.all_obs = None  # set to none to force writing from the dataframe if write_obs_seq is called
+
+        # Iterate over the list of observation sequences and combine their data
+        for obs_seq in obs_sequences:
+            if copies:
+                combined_df = pd.concat(
+                    [combined_df, obs_seq.df[requested_columns]], ignore_index=True
+                )
+            else:
+                combined_df = pd.concat([combined_df, obs_seq.df], ignore_index=True)
+            combined_types.extend(list(obs_seq.reverse_types.keys()))
+
+        # create dictionary of types
+        keys = set(combined_types)
+        combo.reverse_types = {item: i + 1 for i, item in enumerate(keys)}
+        combo.types = {v: k for k, v in combo.reverse_types.items()}
+
+        # create linked list for obs
+        combo.df = combined_df.sort_values(by="time").reset_index(drop=True)
+        combo.df["linked_list"] = obs_sequence.generate_linked_list_pattern(
+            len(combo.df)
+        )
+        combo.df["obs_num"] = combined_df.index + 1
+        combo.create_header(len(combo.df))
+
+        # set assimilation info (mean and spread) (prior and posterior)
+        combo.has_assimilation_info = "prior_ensemble_mean".casefold() in map(
+            str.casefold, combo.df.columns
+        )
+        combo.has_assimilation_info = "prior_ensemble_spread".casefold() in map(
+            str.casefold, combo.df.columns
+        )
+        combo.has_posterior = "posterior_ensemble_mean".casefold() in map(
+            str.casefold, combo.df.columns
+        )
+        combo.has_posterior = "posterior_ensemble_spread".casefold() in map(
+            str.casefold, combo.df.columns
+        )
+
+        return combo
+
+    def create_header(self, n):
+        """Create a header for the obs_seq file from the obs_sequence object."""
+        assert (
+            self.n_copies == self.n_non_qc + self.n_qc
+        ), "n_copies must be equal to n_non_qc + n_qc"
+
+        self.header = []
+        self.header.append(f"obs_sequence")
+        self.header.append("obs_type_definitions")
+        self.header.append(f"{len(self.types)}")
+        for key, value in self.types.items():
+            self.header.append(f"{key} {value}")
+        self.header.append(f"num_copies: {self.n_non_qc} num_qc: {self.n_qc}")
+        self.header.append(f"num_obs: {n} max_num_obs: {n}")
+        for copie in self.copie_names:
+            self.header.append(copie)
+        self.header.append(f"first: 1 last: {n}")
+
+
 def load_yaml_to_dict(file_path):
     """
     Load a YAML file and convert it to a dictionary.
 
-    Parameters:
+    Args:
         file_path (str): The path to the YAML file.
 
     Returns:
         dict: The YAML file content as a dictionary.
     """
     try:
-        with open(file_path, 'r') as file:
+        with open(file_path, "r") as file:
             return yaml.safe_load(file)
     except Exception as e:
         print(f"Error loading YAML file: {e}")
-    return None
+        return None
 
 
 def convert_dart_time(seconds, days):
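Beyond the docstring's basic example, the copies argument added here joins on a subset of copies while the required location, time, and bookkeeping columns are kept automatically. A hedged sketch (the named copies must exist in every input sequence, otherwise the method raises ValueError):

    combined = obs_sequence.join(
        [obs_seq1, obs_seq2],
        copies=['observation', 'prior_ensemble_mean'],
    )
    # all_obs is None on the result, so writing goes through the DataFrame path
    combined.write_obs_seq('obs_seq.combined')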
@@ -780,18 +1075,19 @@ def convert_dart_time(seconds, days):
     - base year for Gregorian calendar is 1601
     - dart time is seconds, days since 1601
     """
-    time = dt.datetime(1601,1,1) + dt.timedelta(days=days, seconds=seconds)
+    time = dt.datetime(1601, 1, 1) + dt.timedelta(days=days, seconds=seconds)
     return time
 
+
 def construct_composit(df_comp, composite, components):
     """
     Construct a composite DataFrame by combining rows from two components.
 
     This function takes two DataFrames and combines rows from them based on matching
-    location and time. It creates a new row with a composite type by combining 
+    location and time. It creates a new row with a composite type by combining
     specified columns using the square root of the sum of squares method.
 
-    Parameters:
+    Args:
         df_comp (pd.DataFrame): The DataFrame containing the component rows to be combined.
         composite (str): The type name for the new composite rows.
         components (list of str): A list containing the type names of the two components to be combined.
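convert_dart_time anchors the (seconds, days) pair at the Gregorian base date 1601-01-01, so the conversion is plain datetime arithmetic. A worked check: 1601 is not a leap year, so days=366 with seconds=3600 lands at 1602-01-02 01:00:00:

    import datetime as dt

    time = dt.datetime(1601, 1, 1) + dt.timedelta(days=366, seconds=3600)
    assert time == dt.datetime(1602, 1, 2, 1, 0)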
@@ -799,27 +1095,29 @@ def construct_composit(df_comp, composite, components):
     Returns:
         merged_df (pd.DataFrame): The updated DataFrame with the new composite rows added.
     """
-    selected_rows = df_comp[df_comp['type'] == components[0].upper()]
-    selected_rows_v = df_comp[df_comp['type'] == components[1].upper()]
+    selected_rows = df_comp[df_comp["type"] == components[0].upper()]
+    selected_rows_v = df_comp[df_comp["type"] == components[1].upper()]
 
-    columns_to_combine = df_comp.filter(regex='ensemble').columns.tolist()
-    columns_to_combine.append('observation')  # TODO HK: bias, sq_err, obs_err_var
-    merge_columns = ['latitude', 'longitude', 'vertical', 'time']
+    columns_to_combine = df_comp.filter(regex="ensemble").columns.tolist()
+    columns_to_combine.append("observation")  # TODO HK: bias, sq_err, obs_err_var
+    merge_columns = ["latitude", "longitude", "vertical", "time"]
 
     print("duplicates in u: ", selected_rows[merge_columns].duplicated().sum())
-    print("duplicates in v: ",selected_rows_v[merge_columns].duplicated().sum())
+    print("duplicates in v: ", selected_rows_v[merge_columns].duplicated().sum())
 
     # Merge the two DataFrames on location and time columns
-    merged_df = pd.merge(selected_rows, selected_rows_v, on=merge_columns, suffixes=('', '_v'))
+    merged_df = pd.merge(
+        selected_rows, selected_rows_v, on=merge_columns, suffixes=("", "_v")
+    )
 
     # Apply the square root of the sum of squares method to the relevant columns
     for col in columns_to_combine:
-        merged_df[col] = np.sqrt(merged_df[col]**2 + merged_df[f'{col}_v']**2)
+        merged_df[col] = np.sqrt(merged_df[col] ** 2 + merged_df[f"{col}_v"] ** 2)
 
     # Create the new composite rows
-    merged_df['type'] = composite.upper()
-    merged_df = merged_df.drop(columns=[col for col in merged_df.columns if col.endswith('_v')])
+    merged_df["type"] = composite.upper()
+    merged_df = merged_df.drop(
+        columns=[col for col in merged_df.columns if col.endswith("_v")]
+    )
 
     return merged_df
-
-
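construct_composit pairs component rows matched on latitude, longitude, vertical, and time, then combines them with the root-sum-of-squares rule, which is how a horizontal wind speed would be built from U and V components. A worked instance of the rule applied in the loop above, with illustrative values:

    import numpy as np

    u, v = 3.0, 4.0               # matched U and V component values
    speed = np.sqrt(u**2 + v**2)  # root sum of squares, as in the loop above
    assert speed == 5.0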