pydartdiags 0.0.41__tar.gz → 0.0.43__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydartdiags-0.0.43/PKG-INFO +45 -0
- pydartdiags-0.0.43/README.md +24 -0
- {pydartdiags-0.0.41 → pydartdiags-0.0.43}/pyproject.toml +1 -1
- {pydartdiags-0.0.41 → pydartdiags-0.0.43}/src/pydartdiags/obs_sequence/obs_sequence.py +396 -77
- pydartdiags-0.0.43/src/pydartdiags/plots/plots.py +339 -0
- pydartdiags-0.0.43/src/pydartdiags.egg-info/PKG-INFO +45 -0
- pydartdiags-0.0.43/tests/test_obs_sequence.py +225 -0
- pydartdiags-0.0.41/PKG-INFO +0 -399
- pydartdiags-0.0.41/README.md +0 -378
- pydartdiags-0.0.41/src/pydartdiags/plots/plots.py +0 -161
- pydartdiags-0.0.41/src/pydartdiags.egg-info/PKG-INFO +0 -399
- pydartdiags-0.0.41/tests/test_obs_sequence.py +0 -29
- {pydartdiags-0.0.41 → pydartdiags-0.0.43}/LICENSE +0 -0
- {pydartdiags-0.0.41 → pydartdiags-0.0.43}/setup.cfg +0 -0
- {pydartdiags-0.0.41 → pydartdiags-0.0.43}/setup.py +0 -0
- {pydartdiags-0.0.41 → pydartdiags-0.0.43}/src/pydartdiags/__init__.py +0 -0
- {pydartdiags-0.0.41 → pydartdiags-0.0.43}/src/pydartdiags/obs_sequence/__init__.py +0 -0
- {pydartdiags-0.0.41 → pydartdiags-0.0.43}/src/pydartdiags/plots/__init__.py +0 -0
- {pydartdiags-0.0.41 → pydartdiags-0.0.43}/src/pydartdiags.egg-info/SOURCES.txt +0 -0
- {pydartdiags-0.0.41 → pydartdiags-0.0.43}/src/pydartdiags.egg-info/dependency_links.txt +0 -0
- {pydartdiags-0.0.41 → pydartdiags-0.0.43}/src/pydartdiags.egg-info/requires.txt +0 -0
- {pydartdiags-0.0.41 → pydartdiags-0.0.43}/src/pydartdiags.egg-info/top_level.txt +0 -0
- {pydartdiags-0.0.41 → pydartdiags-0.0.43}/tests/test_plots.py +0 -0
pydartdiags-0.0.43/PKG-INFO
@@ -0,0 +1,45 @@
+Metadata-Version: 2.1
+Name: pydartdiags
+Version: 0.0.43
+Summary: Observation Sequence Diagnostics for DART
+Home-page: https://github.com/NCAR/pyDARTdiags.git
+Author: Helen Kershaw
+Author-email: Helen Kershaw <hkershaw@ucar.edu>
+Project-URL: Homepage, https://github.com/NCAR/pyDARTdiags.git
+Project-URL: Issues, https://github.com/NCAR/pyDARTdiags/issues
+Project-URL: Documentation, https://ncar.github.io/pyDARTdiags
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pandas>=2.2.0
+Requires-Dist: numpy>=1.26
+Requires-Dist: plotly>=5.22.0
+Requires-Dist: pyyaml>=6.0.2
+
+[](https://opensource.org/licenses/Apache-2.0)
+[](https://codecov.io/gh/NCAR/pyDARTdiags)
+[](https://pypi.org/project/pydartdiags/)
+
+
+# pyDARTdiags
+
+pyDARTdiags is a Python library for observation space diagnostics for the Data Assimilation Research Testbed ([DART](https://github.com/NCAR/DART)).
+
+pyDARTdiags is under initial development, so please use caution.
+The MATLAB [observation space diagnostics](https://docs.dart.ucar.edu/en/latest/guide/matlab-observation-space.html) are available through [DART](https://github.com/NCAR/DART).
+
+
+pyDARTdiags can be installed through pip: https://pypi.org/project/pydartdiags/
+Documentation: https://ncar.github.io/pyDARTdiags/
+
+## Contributing
+Contributions are welcome! If you have a feature request, bug report, or a suggestion, please open an issue on our GitHub repository.
+Please read our [Contributors Guide](https://github.com/NCAR/pyDARTdiags/blob/main/CONTRIBUTING.md) if you would like to contribute to
+pyDARTdiags.
+
+## License
+
+pyDARTdiags is released under the Apache License 2.0. For more details, see the LICENSE file in the root directory of this source tree or visit [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0).
pydartdiags-0.0.43/README.md
@@ -0,0 +1,24 @@
+[](https://opensource.org/licenses/Apache-2.0)
+[](https://codecov.io/gh/NCAR/pyDARTdiags)
+[](https://pypi.org/project/pydartdiags/)
+
+
+# pyDARTdiags
+
+pyDARTdiags is a Python library for observation space diagnostics for the Data Assimilation Research Testbed ([DART](https://github.com/NCAR/DART)).
+
+pyDARTdiags is under initial development, so please use caution.
+The MATLAB [observation space diagnostics](https://docs.dart.ucar.edu/en/latest/guide/matlab-observation-space.html) are available through [DART](https://github.com/NCAR/DART).
+
+
+pyDARTdiags can be installed through pip: https://pypi.org/project/pydartdiags/
+Documentation: https://ncar.github.io/pyDARTdiags/
+
+## Contributing
+Contributions are welcome! If you have a feature request, bug report, or a suggestion, please open an issue on our GitHub repository.
+Please read our [Contributors Guide](https://github.com/NCAR/pyDARTdiags/blob/main/CONTRIBUTING.md) if you would like to contribute to
+pyDARTdiags.
+
+## License
+
+pyDARTdiags is released under the Apache License 2.0. For more details, see the LICENSE file in the root directory of this source tree or visit [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0).
{pydartdiags-0.0.41 → pydartdiags-0.0.43}/src/pydartdiags/obs_sequence/obs_sequence.py
@@ -3,6 +3,24 @@ import datetime as dt
 import numpy as np
 import os
 import yaml
+import struct
+
+def requires_assimilation_info(func):
+    def wrapper(self, *args, **kwargs):
+        if self.has_assimilation_info:
+            return func(self, *args, **kwargs)
+        else:
+            raise ValueError("Assimilation information is required to call this function.")
+    return wrapper
+
+def requires_posterior_info(func):
+    def wrapper(self, *args, **kwargs):
+        if self.has_posterior_info:
+            return func(self, *args, **kwargs)
+        else:
+            raise ValueError("Posterior information is required to call this function.")
+    return wrapper
+
 
 class obs_sequence:
     """Create an obs_sequence object from an ascii observation sequence file.
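
The two new decorators gate methods on boolean flags set during construction. A minimal sketch of the guard pattern (the `Example` class and `prior_stats` method are hypothetical, purely for illustration):

```python
from pydartdiags.obs_sequence.obs_sequence import requires_assimilation_info

class Example:
    """Hypothetical stand-in for obs_sequence, for illustration only."""
    def __init__(self, has_assimilation_info):
        self.has_assimilation_info = has_assimilation_info

    @requires_assimilation_info
    def prior_stats(self):
        return "prior statistics"

Example(True).prior_stats()       # runs normally
try:
    Example(False).prior_stats()  # wrapper raises before the body runs
except ValueError as err:
    print(err)  # Assimilation information is required to call this function.
```
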
@@ -55,22 +73,53 @@ class obs_sequence:
 
     reversed_vert = {value: key for key, value in vert.items()}
 
-    # synonyms for observation
-    synonyms_for_obs = ['NCEP BUFR observation',
-                        'AIRS observation',
-                        'GTSPP observation',
-                        'SST observation',
-                        'observations',
-                        'WOD observation']
 
-    def __init__(self, file):
+    def __init__(self, file, synonyms=None):
         self.loc_mod = 'None'
+        self.has_assimilation_info = False
+        self.has_posterior_info = False
         self.file = file
-        self.
+        self.synonyms_for_obs = ['NCEP BUFR observation',
+                                 'AIRS observation',
+                                 'GTSPP observation',
+                                 'SST observation',
+                                 'observations',
+                                 'WOD observation']
+        if synonyms:
+            if isinstance(synonyms, list):
+                self.synonyms_for_obs.extend(synonyms)
+            else:
+                self.synonyms_for_obs.append(synonyms)
+
+        if file is None:
+            # Early exit for testing purposes
+            self.df = pd.DataFrame()
+            self.types = {}
+            self.reverse_types = {}
+            self.copie_names = []
+            self.n_copies = 0
+            self.seq = []
+            self.all_obs = []
+            return
+
+        module_dir = os.path.dirname(__file__)
+        self.default_composite_types = os.path.join(module_dir, "composite_types.yaml")
+
+        if self.is_binary(file):
+            self.header = self.read_binary_header(file)
+        else:
+            self.header = self.read_header(file)
+
         self.types = self.collect_obs_types(self.header)
         self.reverse_types = {v: k for k, v in self.types.items()}
         self.copie_names, self.n_copies = self.collect_copie_names(self.header)
-
+
+        if self.is_binary(file):
+            self.seq = self.obs_binary_reader(file, self.n_copies)
+            self.loc_mod = 'loc3d' # only loc3d supported for binary, & no way to check
+        else:
+            self.seq = self.obs_reader(file, self.n_copies)
+
         self.all_obs = self.create_all_obs() # uses up the generator
         # at this point you know if the seq is loc3d or loc1d
         if self.loc_mod == 'None':
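
The reworked constructor takes an optional `synonyms` argument (a string or a list) that extends the per-instance synonym list, and `file=None` builds an empty object for testing. A usage sketch; the file name and synonym string are illustrative:

```python
from pydartdiags.obs_sequence.obs_sequence import obs_sequence

# Extra synonym columns get renamed to 'observation' later in __init__
# ('obs_seq.final' and the synonym string are illustrative values).
obs_seq = obs_sequence('obs_seq.final', synonyms='radiosonde observation')

# file=None takes the early-exit path: empty DataFrame, nothing parsed.
empty = obs_sequence(None)
assert empty.df.empty
```
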
@@ -84,12 +133,16 @@ class obs_sequence:
         self.synonyms_for_obs = [synonym.replace(' ', '_') for synonym in self.synonyms_for_obs]
         rename_dict = {old: 'observation' for old in self.synonyms_for_obs if old in self.df.columns}
         self.df = self.df.rename(columns=rename_dict)
+
         # calculate bias and sq_err if the obs_seq is an obs_seq.final
         if 'prior_ensemble_mean'.casefold() in map(str.casefold, self.columns):
-            self.
-            self.df['
-
-
+            self.has_assimilation_info = True
+            self.df['prior_bias'] = (self.df['prior_ensemble_mean'] - self.df['observation'])
+            self.df['prior_sq_err'] = self.df['prior_bias']**2 # squared error
+        if 'posterior_ensemble_mean'.casefold() in map(str.casefold, self.columns):
+            self.has_posterior_info = True
+            self.df['posterior_bias'] = (self.df['posterior_ensemble_mean'] - self.df['observation'])
+            self.df['posterior_sq_err'] = self.df['posterior_bias']**2
 
     def create_all_obs(self):
         """ steps through the generator to create a
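
The new derived columns are plain element-wise arithmetic on the copy columns; a toy DataFrame makes the definitions concrete:

```python
import pandas as pd

df = pd.DataFrame({'observation':         [10.0, 20.0],
                   'prior_ensemble_mean': [11.0, 18.5]})
df['prior_bias'] = df['prior_ensemble_mean'] - df['observation']
df['prior_sq_err'] = df['prior_bias']**2
print(df[['prior_bias', 'prior_sq_err']])
#    prior_bias  prior_sq_err
# 0         1.0          1.00
# 1        -1.5          2.25
```
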
@@ -128,16 +181,44 @@ class obs_sequence:
            raise ValueError("Neither 'loc3d' nor 'loc1d' could be found in the observation sequence.")
        typeI = obs.index('kind') # type of observation
        type_value = obs[typeI + 1]
-
+        if not self.types:
+            data.append('Identity')
+        else:
+            data.append(self.types[type_value]) # observation type
+
        # any observation specific obs def info is between here and the end of the list
+        # can be obs_def & external forward operator
+        metadata = obs[typeI+2:-2]
+        obs_def_metadata, external_metadata = self.split_metadata(metadata)
+        data.append(obs_def_metadata)
+        data.append(external_metadata)
+
        time = obs[-2].split()
        data.append(int(time[0])) # seconds
        data.append(int(time[1])) # days
        data.append(convert_dart_time(int(time[0]), int(time[1]))) # datetime # HK todo what is appropriate for 1d models?
        data.append(float(obs[-1])) # obs error variance ?convert to sd?
-
+
        return data
 
+    @staticmethod
+    def split_metadata(metadata):
+        """
+        Split the metadata list at the first occurrence of an element starting with 'external_FO'.
+
+        Args:
+            metadata (list of str): The metadata list to be split.
+
+        Returns:
+            tuple: Two sublists, the first containing elements before 'external_FO', and the second
+            containing 'external_FO' and all elements after it. If 'external_FO' is not found,
+            the first sublist contains the entire metadata list, and the second is empty.
+        """
+        for i, item in enumerate(metadata):
+            if item.startswith('external_FO'):
+                return metadata[:i], metadata[i:]
+        return metadata, []
+
    def list_to_obs(self, data):
        obs = []
        obs.append('OBS ' + str(data[0])) # obs_num lots of space
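
`split_metadata` is a pure static method, so it is easy to exercise in isolation (the metadata strings below are made up for illustration):

```python
from pydartdiags.obs_sequence.obs_sequence import obs_sequence

meta = ['obs_def line 1', 'obs_def line 2', 'external_FO1', 'fo metadata']
obs_def, external = obs_sequence.split_metadata(meta)
# obs_def  -> ['obs_def line 1', 'obs_def line 2']
# external -> ['external_FO1', 'fo metadata']

# Without an 'external_FO' marker, everything is obs_def metadata.
assert obs_sequence.split_metadata(['only obs_def']) == (['only obs_def'], [])
```
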
@@ -149,10 +230,16 @@ class obs_sequence:
             obs.append(' '.join(map(str, data[self.n_copies+2:self.n_copies+5])) + ' ' + str(self.reversed_vert[data[self.n_copies+5]])) # location x, y, z, vert
             obs.append('kind') # this is type of observation
             obs.append(self.reverse_types[data[self.n_copies + 6]]) # observation type
+            # Append the obs_def metadata
+            obs.extend(data[self.n_copies + 7]) # metadata
         elif self.loc_mod == 'loc1d':
             obs.append(data[self.n_copies+2]) # 1d location
             obs.append('kind') # this is type of observation
             obs.append(self.reverse_types[data[self.n_copies + 3]]) # observation type
+            # Convert metadata to a string and append
+            metadata = ' '.join(map(str, data[self.n_copies + 4:-4]))
+            if metadata:
+                obs.append(metadata) # metadata
         obs.append(' '.join(map(str, data[-4:-2]))) # seconds, days
         obs.append(data[-1]) # obs error variance
 
@@ -251,15 +338,83 @@ class obs_sequence:
         elif self.loc_mod == 'loc1d':
             heading.append('location')
             heading.append('type')
+        heading.append('metadata')
+        heading.append('external_FO')
         heading.append('seconds')
         heading.append('days')
         heading.append('time')
         heading.append('obs_err_var')
         return heading
 
+    @requires_assimilation_info
+    def select_by_dart_qc(self, dart_qc):
+        """
+        Selects rows from the DataFrame based on the DART quality control flag.
+
+        Parameters:
+            dart_qc (int): The DART quality control flag to select.
+
+        Returns:
+            DataFrame: A DataFrame containing only the rows with the specified DART quality control flag.
+
+        Raises:
+            ValueError: If the DART quality control flag is not present in the DataFrame.
+        """
+        if dart_qc not in self.df['DART_quality_control'].unique():
+            raise ValueError(f"DART quality control flag '{dart_qc}' not found in DataFrame.")
+        else:
+            return self.df[self.df['DART_quality_control'] == dart_qc]
+
+    @requires_assimilation_info
+    def select_failed_qcs(self):
+        """
+        Select rows from the DataFrame where the DART quality control flag is greater than 0.
+
+        Returns:
+            pandas.DataFrame: A DataFrame containing only the rows with a DART quality control flag greater than 0.
+        """
+        return self.df[self.df['DART_quality_control'] > 0]
+
+    @requires_assimilation_info
+    def possible_vs_used(self):
+        """
+        Calculates the count of possible vs. used observations by type.
+
+        This method uses the object's DataFrame, which includes a 'type' column for the observation
+        type and an 'observation' column. The number of used observations ('used') is the total number
+        minus the observations that failed quality control checks (as determined by the `select_failed_qcs` function).
+        The result is a DataFrame with each observation type, the count of possible observations, and the count of
+        used observations.
+
+        Returns:
+            pd.DataFrame: A DataFrame with three columns: 'type', 'possible', and 'used'. 'type' is the observation type,
+            'possible' is the count of all observations of that type, and 'used' is the count of observations of that type
+            that passed quality control checks.
+        """
+        possible = self.df.groupby('type')['observation'].count()
+        possible.rename('possible', inplace=True)
+
+        failed_qcs = self.select_failed_qcs().groupby('type')['observation'].count()
+        used = possible - failed_qcs.reindex(possible.index, fill_value=0)
+        used.rename('used', inplace=True)
+
+        return pd.concat([possible, used], axis=1).reset_index()
+
+
+    @staticmethod
+    def is_binary(file):
+        """Check if a file is a binary file."""
+        with open(file, 'rb') as f:
+            chunk = f.read(1024)
+            if b'\0' in chunk:
+                return True
+        return False
+
+
     @staticmethod
     def read_header(file):
-        """Read the header and number of lines in the header of an obs_seq file"""
+        """Read the header and number of lines in the header of an ascii obs_seq file"""
         header = []
         with open(file, 'r') as f:
             for line in f:
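
With the QC helpers now methods guarded by `@requires_assimilation_info`, a typical diagnostic session reads straight off the object. A sketch, assuming `obs_seq` was built from an obs_seq.final file so the DART_quality_control column exists; the flag value 7 is just an example:

```python
counts = obs_seq.possible_vs_used()      # DataFrame with columns: type, possible, used
failed = obs_seq.select_failed_qcs()     # rows with DART_quality_control > 0
flagged = obs_seq.select_by_dart_qc(7)   # rows with one specific QC flag
```
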
@@ -270,6 +425,118 @@ class obs_sequence:
                 header.append(line.strip())
         return header
 
+    @staticmethod
+    def read_binary_header(file):
+        """Read the header and number of lines in the header of a binary obs_seq file from Fortran output"""
+        header = []
+        linecount = 0
+        obs_types_definitions = -1000
+        num_obs = 0
+        max_num_obs = 0
+        # need to get:
+        #   number of obs_type_definitions
+        #   number of copies
+        #   number of qcs
+        with open(file, 'rb') as f:
+            while True:
+                # Read the record length
+                record_length = obs_sequence.read_record_length(f)
+                if record_length is None:
+                    break
+                record = f.read(record_length)
+                if not record: # end of file
+                    break
+
+                # Read the trailing record length (should match the leading one)
+                obs_sequence.check_trailing_record_length(f, record_length)
+
+                linecount += 1
+
+                if linecount == 3:
+                    obs_types_definitions = struct.unpack('i', record)[0]
+                    continue
+
+                if linecount == 4+obs_types_definitions:
+                    num_copies, num_qcs, num_obs, max_num_obs = struct.unpack('iiii', record)
+                    break
+
+            # Go back to the beginning of the file
+            f.seek(0)
+
+            for _ in range(2):
+                record_length = obs_sequence.read_record_length(f)
+                if record_length is None:
+                    break
+
+                record = f.read(record_length)
+                if not record: # end of file
+                    break
+
+                obs_sequence.check_trailing_record_length(f, record_length)
+                header.append(record.decode('utf-8').strip())
+
+            header.append(str(obs_types_definitions))
+
+            # obs_types_definitions
+            for _ in range(3, 4+obs_types_definitions):
+                # Read the record length
+                record_length = obs_sequence.read_record_length(f)
+                if record_length is None:
+                    break
+
+                # Read the actual record
+                record = f.read(record_length)
+                if not record: # end of file
+                    break
+
+                obs_sequence.check_trailing_record_length(f, record_length)
+
+                if _ == 3:
+                    continue # num obs_types_definitions
+                # Read an integer and a string from the record
+                integer_value = struct.unpack('i', record[:4])[0]
+                string_value = record[4:].decode('utf-8').strip()
+                header.append(f"{integer_value} {string_value}")
+
+            header.append(f"num_copies: {num_copies} num_qc: {num_qcs}")
+            header.append(f"num_obs: {num_obs} max_num_obs: {max_num_obs}")
+
+            # copie names
+            for _ in range(5+obs_types_definitions, 5+obs_types_definitions+num_copies+num_qcs+1):
+                # Read the record length
+                record_length = obs_sequence.read_record_length(f)
+                if record_length is None:
+                    break
+
+                # Read the actual record
+                record = f.read(record_length)
+                if not record:
+                    break
+
+                obs_sequence.check_trailing_record_length(f, record_length)
+
+                if _ == 5+obs_types_definitions:
+                    continue
+
+                # Read the whole record as a string
+                string_value = record.decode('utf-8').strip()
+                header.append(string_value)
+
+            # first and last obs
+            # Read the record length
+            record_length = obs_sequence.read_record_length(f)
+
+            # Read the actual record
+            record = f.read(record_length)
+
+            obs_sequence.check_trailing_record_length(f, record_length)
+
+            # Read the whole record as two integers
+            first, last = struct.unpack('ii', record)
+            header.append(f"first: {first} last: {last}")
+
+        return header
+
     @staticmethod
     def collect_obs_types(header):
         """Create a dictionary for the observation types in the obs_seq header"""
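
`read_binary_header` relies on the framing of Fortran sequential unformatted output: every record carries a leading and a trailing 4-byte length marker. A self-contained sketch that fabricates one such record in memory and parses it with the same `struct` calls (assumes 4-byte markers and native endianness, as the reader does):

```python
import io
import struct

# Fabricate one Fortran-style record: length marker, payload, length marker again.
payload = struct.pack('iiii', 3, 2, 1000, 1000)  # e.g. num_copies, num_qcs, num_obs, max_num_obs
buf = io.BytesIO(struct.pack('i', len(payload)) + payload + struct.pack('i', len(payload)))

(length,) = struct.unpack('i', buf.read(4))          # leading marker
num_copies, num_qcs, num_obs, max_num_obs = struct.unpack('iiii', buf.read(length))
assert struct.unpack('i', buf.read(4))[0] == length  # trailing marker matches
```
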
@@ -299,7 +566,7 @@ class obs_sequence:
 
     @staticmethod
     def obs_reader(file, n):
-        """Reads the obs sequence file and returns a generator of the obs"""
+        """Reads the ascii obs sequence file and returns a generator of the obs"""
         previous_line = ''
         with open(file, 'r') as f:
             for line in f:
@@ -339,6 +606,115 @@ class obs_sequence:
                     previous_line = next_line
                     yield obs
 
+    @staticmethod
+    def check_trailing_record_length(file, expected_length):
+        """Reads and checks the trailing record length from the binary file written by Fortran.
+
+        Parameters:
+            file (file): The file object.
+            expected_length (int): The expected length of the trailing record.
+
+        Assuming 4-byte record markers, each record is laid out as:
+            | Record Length (4 bytes) | Data (N bytes) | Trailing Record Length (4 bytes) |
+        """
+        trailing_record_length_bytes = file.read(4)
+        trailing_record_length = struct.unpack('i', trailing_record_length_bytes)[0]
+        if expected_length != trailing_record_length:
+            raise ValueError("Record length mismatch in Fortran binary file")
+
+    @staticmethod
+    def read_record_length(file):
+        """Reads and unpacks the record length from the file."""
+        record_length_bytes = file.read(4)
+        if not record_length_bytes:
+            return None # End of file
+        return struct.unpack('i', record_length_bytes)[0]
+
+
+    def obs_binary_reader(self, file, n):
+        """Reads the obs sequence binary file and returns a generator of the obs"""
+        header_length = len(self.header)
+        with open(file, 'rb') as f:
+            # Skip the header records (header_length-1 of them)
+            for _ in range(header_length-1):
+                # Read the record length
+                record_length = obs_sequence.read_record_length(f)
+                if record_length is None: # End of file
+                    break
+
+                # Skip the actual record
+                f.seek(record_length, 1)
+
+                # Skip the trailing record length
+                f.seek(4, 1)
+
+            obs_num = 0
+            while True:
+                obs = []
+                obs_num += 1
+                obs.append(f"OBS {obs_num}")
+                for _ in range(n): # number of copies
+                    # Read the record length
+                    record_length = obs_sequence.read_record_length(f)
+                    if record_length is None:
+                        break
+                    # Read the actual record (copie)
+                    record = f.read(record_length)
+                    obs.append(struct.unpack('d', record)[0])
+
+                    # Read the trailing record length (should match the leading one)
+                    obs_sequence.check_trailing_record_length(f, record_length)
+
+                # linked list info
+                record_length = obs_sequence.read_record_length(f)
+                if record_length is None:
+                    break
+
+                record = f.read(record_length)
+                int1, int2, int3 = struct.unpack('iii', record[:12])
+                linked_list_string = f"{int1:<12} {int2:<10} {int3:<12}"
+                obs.append(linked_list_string)
+
+                obs_sequence.check_trailing_record_length(f, record_length)
+
+                # location (note no location header "loc3d" or "loc1d" for binary files)
+                obs.append('loc3d')
+                record_length = obs_sequence.read_record_length(f)
+                record = f.read(record_length)
+                x, y, z, vert = struct.unpack('dddi', record[:28])
+                location_string = f"{x} {y} {z} {vert}"
+                obs.append(location_string)
+
+                obs_sequence.check_trailing_record_length(f, record_length)
+
+                # kind (type of observation) value
+                obs.append('kind')
+                record_length_bytes = f.read(4)
+                record_length = struct.unpack('i', record_length_bytes)[0]
+                record = f.read(record_length)
+                kind = f"{struct.unpack('i', record)[0]}"
+                obs.append(kind)
+
+                obs_sequence.check_trailing_record_length(f, record_length)
+
+                # time (seconds, days)
+                record_length = obs_sequence.read_record_length(f)
+                record = f.read(record_length)
+                seconds, days = struct.unpack('ii', record)
+                time_string = f"{seconds} {days}"
+                obs.append(time_string)
+
+                obs_sequence.check_trailing_record_length(f, record_length)
+
+                # obs error variance
+                record_length = obs_sequence.read_record_length(f)
+                record = f.read(record_length)
+                obs.append(struct.unpack('d', record)[0])
+
+                obs_sequence.check_trailing_record_length(f, record_length)
+
+                yield obs
+
     def composite_types(self, composite_types='use_default'):
         """
         Set up and construct composite types for the DataFrame.
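
`obs_binary_reader` yields each observation in the same list-of-fields shape the ascii reader produces, so everything downstream (`create_all_obs`, the DataFrame build) is shared. A sketch of what a caller sees; the file name is illustrative:

```python
obs_seq = obs_sequence('obs_seq.final.binary')  # format detected by is_binary()
# The generator is consumed in __init__ ('uses up the generator'), so
# parsed observations live in all_obs and the DataFrame, not in seq.
print(obs_seq.loc_mod)       # 'loc3d' -- the only location module for binary files
print(len(obs_seq.all_obs))  # number of observations read
```
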
@@ -406,65 +782,6 @@ def convert_dart_time(seconds, days):
     """
     time = dt.datetime(1601,1,1) + dt.timedelta(days=days, seconds=seconds)
     return time
-
-def select_by_dart_qc(df, dart_qc):
-    """
-    Selects rows from a DataFrame based on the DART quality control flag.
-
-    Parameters:
-        df (DataFrame): A pandas DataFrame.
-        dart_qc (int): The DART quality control flag to select.
-
-    Returns:
-        DataFrame: A DataFrame containing only the rows with the specified DART quality control flag.
-
-    Raises:
-        ValueError: If the DART quality control flag is not present in the DataFrame.
-    """
-    if dart_qc not in df['DART_quality_control'].unique():
-        raise ValueError(f"DART quality control flag '{dart_qc}' not found in DataFrame.")
-    else:
-        return df[df['DART_quality_control'] == dart_qc]
-
-def select_failed_qcs(df):
-    """
-    Selects rows from a DataFrame where the DART quality control flag is greater than 0.
-
-    Parameters:
-        df (DataFrame): A pandas DataFrame.
-
-    Returns:
-        DataFrame: A DataFrame containing only the rows with a DART quality control flag greater than 0.
-    """
-    return df[df['DART_quality_control'] > 0]
-
-def possible_vs_used(df):
-    """
-    Calculates the count of possible vs. used observations by type.
-
-    This function takes a DataFrame containing observation data, including a 'type' column for the observation
-    type and an 'observation' column. The number of used observations ('used'), is the total number
-    minus the observations that failed quality control checks (as determined by the `select_failed_qcs` function).
-    The result is a DataFrame with each observation type, the count of possible observations, and the count of
-    used observations.
-
-    Parameters:
-        df (pd.DataFrame): A DataFrame with at least two columns: 'type' for the observation type and 'observation'
-        for the observation data. It may also contain other columns required by the `select_failed_qcs` function
-        to determine failed quality control checks.
-
-    Returns:
-        pd.DataFrame: A DataFrame with three columns: 'type', 'possible', and 'used'. 'type' is the observation type,
-        'possible' is the count of all observations of that type, and 'used' is the count of observations of that type
-        that passed quality control checks.
-
-    """
-    possible = df.groupby('type')['observation'].count()
-    possible.rename('possible', inplace=True)
-    used = df.groupby('type')['observation'].count() - select_failed_qcs(df).groupby('type')['observation'].count()
-    used.rename('used', inplace=True)
-    return pd.concat([possible, used], axis=1).reset_index()
-
 
 def construct_composit(df_comp, composite, components):
     """
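
Since the three QC functions moved from module level onto the class, callers drop the DataFrame argument and call through the object instead:

```python
# 0.0.41 (module-level functions taking a DataFrame):
#   used = possible_vs_used(obs_seq.df)
#   bad  = select_failed_qcs(obs_seq.df)
# 0.0.43 (methods, guarded by @requires_assimilation_info):
used = obs_seq.possible_vs_used()
bad = obs_seq.select_failed_qcs()
```
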
@@ -504,3 +821,5 @@ def construct_composit(df_comp, composite, components):
     merged_df = merged_df.drop(columns=[col for col in merged_df.columns if col.endswith('_v')])
 
     return merged_df
+
+