xradio 0.0.27__py3-none-any.whl → 0.0.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xradio/__init__.py +5 -4
- xradio/_utils/array.py +90 -0
- xradio/_utils/zarr/common.py +48 -3
- xradio/image/_util/_fits/xds_from_fits.py +10 -5
- xradio/image/_util/_zarr/zarr_low_level.py +27 -24
- xradio/image/_util/common.py +4 -1
- xradio/image/_util/zarr.py +4 -1
- xradio/schema/__init__.py +24 -6
- xradio/schema/bases.py +440 -2
- xradio/schema/check.py +96 -55
- xradio/schema/dataclass.py +123 -27
- xradio/schema/metamodel.py +21 -4
- xradio/schema/typing.py +33 -18
- xradio/vis/__init__.py +5 -2
- xradio/vis/_processing_set.py +30 -9
- xradio/vis/_vis_utils/_ms/_tables/create_field_and_source_xds.py +710 -0
- xradio/vis/_vis_utils/_ms/_tables/load.py +23 -10
- xradio/vis/_vis_utils/_ms/_tables/load_main_table.py +145 -64
- xradio/vis/_vis_utils/_ms/_tables/read.py +782 -156
- xradio/vis/_vis_utils/_ms/_tables/read_main_table.py +176 -45
- xradio/vis/_vis_utils/_ms/_tables/read_subtables.py +79 -28
- xradio/vis/_vis_utils/_ms/_tables/write.py +102 -45
- xradio/vis/_vis_utils/_ms/_tables/write_exp_api.py +127 -65
- xradio/vis/_vis_utils/_ms/chunks.py +58 -21
- xradio/vis/_vis_utils/_ms/conversion.py +536 -67
- xradio/vis/_vis_utils/_ms/descr.py +52 -20
- xradio/vis/_vis_utils/_ms/msv2_to_msv4_meta.py +70 -35
- xradio/vis/_vis_utils/_ms/msv4_infos.py +0 -59
- xradio/vis/_vis_utils/_ms/msv4_sub_xdss.py +76 -9
- xradio/vis/_vis_utils/_ms/optimised_functions.py +0 -46
- xradio/vis/_vis_utils/_ms/partition_queries.py +308 -119
- xradio/vis/_vis_utils/_ms/partitions.py +82 -25
- xradio/vis/_vis_utils/_ms/subtables.py +32 -14
- xradio/vis/_vis_utils/_utils/partition_attrs.py +30 -11
- xradio/vis/_vis_utils/_utils/xds_helper.py +136 -45
- xradio/vis/_vis_utils/_zarr/read.py +60 -22
- xradio/vis/_vis_utils/_zarr/write.py +83 -9
- xradio/vis/_vis_utils/ms.py +48 -29
- xradio/vis/_vis_utils/zarr.py +44 -20
- xradio/vis/convert_msv2_to_processing_set.py +106 -32
- xradio/vis/load_processing_set.py +38 -61
- xradio/vis/read_processing_set.py +62 -96
- xradio/vis/schema.py +687 -0
- xradio/vis/vis_io.py +75 -43
- {xradio-0.0.27.dist-info → xradio-0.0.29.dist-info}/LICENSE.txt +6 -1
- {xradio-0.0.27.dist-info → xradio-0.0.29.dist-info}/METADATA +10 -5
- xradio-0.0.29.dist-info/RECORD +73 -0
- {xradio-0.0.27.dist-info → xradio-0.0.29.dist-info}/WHEEL +1 -1
- xradio/vis/model.py +0 -497
- xradio-0.0.27.dist-info/RECORD +0 -71
- {xradio-0.0.27.dist-info → xradio-0.0.29.dist-info}/top_level.txt +0 -0
xradio/vis/_vis_utils/_ms/_tables/read.py  +782 -156

@@ -2,7 +2,7 @@ import graphviper.utils.logger as logger
 import os
 from pathlib import Path
 import re
-from typing import Any,
+from typing import Any, Callable, Dict, List, Tuple, Union

 import numpy as np
 import pandas as pd
@@ -13,15 +13,18 @@ from casacore import tables

 from .table_query import open_query, open_table_ro

-CASACORE_TO_PD_TIME_CORRECTION =
+CASACORE_TO_PD_TIME_CORRECTION = 3_506_716_800.0
 SECS_IN_DAY = 86400
+MJD_DIF_UNIX = 40587


 def table_exists(path: str) -> bool:
     return tables.tableexists(path)


-def convert_casacore_time(
+def convert_casacore_time(
+    rawtimes: np.ndarray, convert_to_datetime: bool = True
+) -> np.ndarray:
     """
     Read time columns to datetime format
     pandas datetimes are referenced against a 0 of 1970-01-01
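The two constants added above are tied together by straightforward epoch arithmetic; the snippet below is only an illustrative sanity check (not part of the package) using the values shown in the hunk:

    # 1858-11-17 (casacore/MJD epoch) and 1970-01-01 (Unix/pandas epoch) are 40587 days apart
    MJD_DIF_UNIX = 40587
    SECS_IN_DAY = 86400
    assert MJD_DIF_UNIX * SECS_IN_DAY == 3_506_716_800  # == CASACORE_TO_PD_TIME_CORRECTION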
@@ -29,36 +32,218 @@ def convert_casacore_time(rawtimes: np.ndarray, convert_to_datetime=True) -> np.

     This requires a correction of 3506716800 seconds which is hardcoded to save time

-
-
+    Parameters
+    ----------
+    rawtimes : np.ndarray
+        times in casacore ref
+    convert_to_datetime : bool (Default value = True)
+        whether to produce pandas style datetime
+
+    Returns
+    -------
+    np.ndarray
+        times converted to pandas reference
     """
-
+    times_reref = np.array(rawtimes) - CASACORE_TO_PD_TIME_CORRECTION
     if convert_to_datetime:
-        return pd.to_datetime(
-            np.array(rawtimes) - CASACORE_TO_PD_TIME_CORRECTION, unit="s"
-        ).values
+        return pd.to_datetime(times_reref, unit="s").values
     else:
-        return
+        return times_reref
     # dt = pd.to_datetime(np.atleast_1d(rawtimes) - correction, unit='s').values
     # if len(np.array(rawtimes).shape) == 0: dt = dt[0]
     # return dt


 def convert_mjd_time(rawtimes: np.ndarray) -> np.ndarray:
-    """
+    """
+    Different time conversion needed for the MJD col of EPHEM{i}_*.tab
     files (only, as far as I've seen)

-
-
+    Parameters
+    ----------
+    rawtimes : np.ndarray
+        MJD times for example from the MJD col of ephemerides tables
+
+    Returns
+    -------
+    np.ndarray
+        times converted to pandas reference and datetime type
     """
-
-    rawtimes * SECS_IN_DAY
+    times_reref = pd.to_datetime(
+        (rawtimes - MJD_DIF_UNIX) * SECS_IN_DAY, unit="s"
     ).values

+    return times_reref
+
+
+def convert_casacore_time_to_mjd(rawtimes: np.ndarray) -> np.ndarray:
+    """
+    From CASA/casacore time (as used in the TIME column of the main table) to MJD
+    (as used in the EPHEMi*.tab ephemeris tables). As the epochs are the same, this
+    is just a conversion of units.
+
+    Parameters
+    ----------
+    rawtimes : np.ndarray
+        times from a TIME column (seconds, casacore time epoch)
+
+    Returns
+    -------
+    np.ndarray
+        times converted to (ephemeris) MJD (days since casacore time epoch (1858-11-17))
+    """
+    return rawtimes / SECS_IN_DAY
+
+
+def make_taql_where_between_min_max(
+    min_max: Tuple[np.float64, np.float64],
+    path: str,
+    table_name: str,
+    colname="TIME",
+) -> Union[str, None]:
+    """
+    From a numerical min/max range, produce a TaQL string to select between
+    those min/max values (example: times) in a table.
+    The table can be for example a POINTING subtable or an EPHEM* ephemeris
+    table.
+    This is meant to be used on MSv2 table columns that will be loaded as a
+    coordinate in MSv4s and their sub-xdss (example: POINTING/TIME ephemeris/MJD).
+
+    This can be used for example to produce a TaQL string to constraing loading of:
+    - POINTING rows (based on the min/max from the time coordinate of the main MSv4)
+    - ephemeris rows, from EPHEM* tables ((based on the MJD column and the min/max
+      from the main MSv4 time coordinate).
+
+    Parameters
+    ----------
+    min_max : Tuple[np.float64, np.float64]
+        min / max values of time or other column used as coordinate
+        (assumptions: float values, sortable, typically: time coord from MSv4)
+    path :
+        Path to input MS or location of the table
+    table_name :
+        Name of the table where to load a column (example: 'POINTING')
+    colname :
+        Name of the column to search for min/max values (examples: 'TIME', 'MJD')
+
+    Returns
+    -------
+    taql_where : str
+        TaQL (sub)string with the min/max time 'WHERE' constraint
+    """
+
+    min_max_range = find_projected_min_max_table(min_max, path, table_name, colname)
+    if min_max_range is None:
+        taql = None
+    else:
+        (min_val, max_val) = min_max_range
+        taql = f"where {colname} >= {min_val} AND {colname} <= {max_val}"
+
+    return taql
+
+
+def find_projected_min_max_table(
+    min_max: Tuple[np.float64, np.float64], path: str, table_name: str, colname: str
+) -> Union[Tuple[np.float64, np.float64], None]:
+    """
+    We have: min/max values that define a range (for example of time)
+    We want: to project that min/max range on a sortable column (for example a
+    range of times onto a TIME column), and find min and max values
+    derived from that table column such that the range between those min and max
+    values includes at least the input min/max range.
+
+    The returned min/max can then be used in a data selection TaQL query to
+    select at least the values within the input range (possibly extended if
+    the input range overlaps only partially or not at all with the column
+    values). A tolerance is added to the min/max to prevent numerical issues in
+    comparisons and conversios between numerical types and strings.
+
+    When the range given as input is wider than the range of values found in
+    the column, use the input range, as it is sufficient and more inclusive.
+
+    When the range given as input is narrow (projected on the target table/column)
+    and falls between two points of the column values, or overlaps with only one
+    point, the min/max are extended to include at least the two column values that
+    define a range within which the input range is included.
+    Example scenario: an ephemeris table is sampled at a coarse interval
+    (20 min) and we want to find a min/max range projected from the time min/max
+    of a main MSv4 time coordinate sampled at ~1s for a field-scan/intent
+    that spans ~2 min. Those ~2 min will typically fall between ephemeris samples.
+
+    Parameters
+    ----------
+    min_max : Tuple[np.float64, np.float64]
+        min / max values of time or other column used as coordinate
+        (assumptions: float values, sortable)
+    path :
+        Path to input MS or location of the table
+    table_name :
+        Name of the table where to load a column (example: 'POINTING')
+    colname :
+        Name of the column to search for min/max values (example: 'TIME')
+
+    Returns
+    -------
+    output_min_max : Union[Tuple[np.float64, np.float64], None]
+        min/max values derived from the input min/max and the column values
+    """
+    with open_table_ro(os.path.join(path, table_name)) as tb_tool:
+        if tb_tool.nrows() == 0:
+            return None
+        col = tb_tool.getcol(colname)
+
+    out_min_max = find_projected_min_max_array(min_max, col)
+    return out_min_max
+
+
+def find_projected_min_max_array(
+    min_max: Tuple[np.float64, np.float64], array: np.array
+) -> Tuple[np.float64, np.float64]:
+    """Does the min/max checks and search for find_projected_min_max_table()"""
+
+    sorted_array = np.sort(array)
+    (range_min, range_max) = min_max
+    if len(sorted_array) < 2:
+        tol = np.finfo(sorted_array.dtype).eps * 4
+    else:
+        tol = np.diff(sorted_array[np.nonzero(sorted_array)]).min() / 4
+
+    if range_max > sorted_array[-1]:
+        projected_max = range_max + tol
+    else:
+        max_idx = sorted_array.size - 1
+        max_array_idx = min(
+            max_idx, np.searchsorted(sorted_array, range_max, side="right")
+        )
+        projected_max = sorted_array[max_array_idx] + tol
+
+    if range_min < sorted_array[0]:
+        projected_min = range_min - tol
+    else:
+        min_array_idx = max(
+            0, np.searchsorted(sorted_array, range_min, side="left") - 1
+        )
+        # ensure 'sorted_array[min_array_idx] < range_min' when values ==
+        if sorted_array[min_array_idx] == range_min:
+            min_array_idx = max(0, min_array_idx - 1)
+        projected_min = sorted_array[min_array_idx] - tol
+
+    return (projected_min, projected_max)
+


 def extract_table_attributes(infile: str) -> Dict[str, Dict]:
     """
-
+    Return a dictionary of table attributes created from MS keywords and column descriptions
+
+    Parameters
+    ----------
+    infile : str
+        table file path
+
+    Returns
+    -------
+    Dict[str, Dict]
+        table attributes as a dictionary
     """
     with open_table_ro(infile) as tb_tool:
         kwd = tb_tool.getkeywords()
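To make the behaviour of the new find_projected_min_max_array() concrete, here is a small self-contained sketch of the same searchsorted logic on invented numbers (a coarsely sampled column and a narrow input range); it is illustrative only, not code from the package:

    import numpy as np

    col = np.array([0.0, 1200.0, 2400.0, 3600.0])  # e.g. a column sampled every 20 min
    range_min, range_max = 1300.0, 1420.0           # ~2 min of data falling between samples

    tol = np.diff(col[np.nonzero(col)]).min() / 4
    hi = col[min(col.size - 1, np.searchsorted(col, range_max, side="right"))] + tol
    lo = col[max(0, np.searchsorted(col, range_min, side="left") - 1)] - tol
    print(lo, hi)  # 900.0 2700.0: brackets the input range with the neighbouring samples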
@@ -79,9 +264,17 @@ def add_units_measures(
     """
     Add attributes with units and measure metainfo to the variables passed in the input dictionary

-
-
-    :
+    Parameters
+    ----------
+    mvars : Dict[str, xr.DataArray]
+        data variables where to populate units
+    cc_attrs : Dict[str, Any]
+        dictionary with casacore table attributes (from extract_table_attributes)
+
+    Returns
+    -------
+    Dict[str, xr.DataArray]
+        variables with units added in their attributes
     """
     col_descrs = cc_attrs["column_descriptions"]
     # TODO: Should probably loop the other way around, over mvars
@@ -90,6 +283,12 @@ def add_units_measures(
         if var_name in mvars and "keywords" in col_descrs[col]:
             if "QuantumUnits" in col_descrs[col]["keywords"]:
                 cc_units = col_descrs[col]["keywords"]["QuantumUnits"]
+
+                if isinstance(
+                    cc_units, str
+                ):  # Little fix for Meerkat data where the units are a string.
+                    cc_units = [cc_units]
+
                 if not isinstance(cc_units, list) or not cc_units:
                     logger.warning(
                         f"Invalid units found for column/variable {col}: {cc_units}"
@@ -128,7 +327,8 @@ def add_units_measures(


 def make_freq_attrs(spw_xds: xr.Dataset, spw_id: int) -> Dict[str, Any]:
-    """
+    """
+    Grab the units/measure metainfo for the xds.freq dimension of a
     parttion from the SPECTRAL_WINDOW subtable CTDS attributes.

     Has to read xds_spw.meas_freq_ref and use it as index in the CTDS
@@ -137,10 +337,17 @@ def make_freq_attrs(spw_xds: xr.Dataset, spw_id: int) -> Dict[str, Any]:
     (then the ref frame from the second will be pulled to
     xds.freq.attrs)

-
-
-    :
-
+    Parameters
+    ----------
+    spw_xds : xr.Dataset
+        (metainfo) SPECTRAL_WINDOW xds
+    spw_id : int
+        SPW id of a partition
+
+    Returns
+    -------
+    Dict[str, Any]
+        attributes (units/measure) for the freq dim of a partition
     """
     fallback_TabRefTypes = [
         "REST",
@@ -193,6 +400,11 @@ def get_pad_nan(col: np.ndarray) -> np.ndarray:
     ----------
     col : np.ndarray
         data being loaded from a table column
+
+    Returns
+    -------
+    np.ndarray
+        nan ("nan") value for the type of the input column
     """
     # This is causing frequent warnings for integers. Cast of nan to "int nan"
     # produces -2147483648 but also seems to trigger a
@@ -208,15 +420,24 @@ def get_pad_nan(col: np.ndarray) -> np.ndarray:


 def redimension_ms_subtable(xds: xr.Dataset, subt_name: str) -> xr.Dataset:
-    """
+    """
+    Expand a MeasurementSet subtable xds from single dimension (row)
     to multiple dimensions (such as (source_id, time, spectral_window)

     WIP: only works for source, experimenting

-
-
-
-
+    Parameters
+    ----------
+    xds : xr.Dataset
+        dataset to change the dimensions
+    subt_name : str
+        subtable name (SOURCE, etc.)
+
+    Returns
+    -------
+    xr.Dataset
+        transformed xds with data dimensions representing the MS subtable key
+        (one dimension for every columns)
     """
     subt_key_cols = {
         "DOPPLER": ["doppler_id", "source_id"],
@@ -239,8 +460,18 @@ def redimension_ms_subtable(xds: xr.Dataset, subt_name: str) -> xr.Dataset:

     rxds = xds.copy()
     try:
+        # drop_duplicates() needed (https://github.com/casangi/xradio/issues/185). Examples:
+        # - Some early ALMA datasets have bogus WEATHER tables with many/most rows with
+        # (ANTENNA_ID=0, TIME=0) and no other columns to figure out the right IDs, such
+        # as "NS_WX_STATION_ID" or similar. (example: X425.pm04.scan4.ms)
+        # - Some GBT MSs have duplicated (ANTENNA_ID=0, TIME=xxx). (example: analytic_variable.ms)
         with np.errstate(invalid="ignore"):
-            rxds =
+            rxds = (
+                rxds.set_index(row=key_dims)
+                .drop_duplicates("row")
+                .unstack("row")
+                .transpose(*key_dims, ...)
+            )
         # unstack changes type to float when it needs to introduce NaNs, so
         # we need to reset to the original type.
         for var in rxds.data_vars:
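A toy xarray example of why the new drop_duplicates("row") step matters before unstacking; the dataset below is invented (the real code applies this to MS subtables such as WEATHER) and assumes an xarray version that provides Dataset.drop_duplicates:

    import numpy as np
    import xarray as xr

    xds = xr.Dataset(
        {"temperature": ("row", np.array([280.0, 281.0, 281.5]))},
        coords={"antenna_id": ("row", [0, 0, 0]), "time": ("row", [0.0, 1.0, 1.0])},
    )
    rxds = (
        xds.set_index(row=["antenna_id", "time"])
        .drop_duplicates("row")  # without this, the duplicated (0, 1.0) key makes unstack fail
        .unstack("row")
        .transpose("antenna_id", "time")
    )
    print(rxds.temperature.shape)  # (1, 2)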
@@ -302,23 +533,37 @@ def read_generic_table(
     inpath: str,
     tname: str,
     timecols: Union[List[str], None] = None,
-    ignore=None,
+    ignore: Union[List[str], None] = None,
     rename_ids: Dict[str, str] = None,
+    taql_where: str = None,
 ) -> xr.Dataset:
-    """
-
+    """
+    load generic casacore (sub)table into memory resident xds (xarray wrapped
+    numpy arrays). This reads through the table columns and loads the data.
+
+    TODO: change read_ name to load_ name (and most if not all this module)

     Parameters
     ----------
-
-
-
-
-
-
-
-
-
+    inpath : str
+        path to the MS or directory containing the table
+    tname : str
+        (sub)table name, for example 'SOURCE' for myms.ms/SOURCE
+    timecols : Union[List[str], None] (Default value = None)
+        column names to convert to numpy datetime format.
+        leaves times as their original casacore format.
+    ignore : Union[List[str], None] (Default value = None)
+        list of column names to ignore and not try to read.
+    rename_ids : Dict[str, str] (Default value = None)
+        dict with dimension renaming mapping
+    taql_where : str (Default value = None)
+        TaQL string to optionally constain the rows/columns to read
+        (Default value = None)
+
+    Returns
+    -------
+    xr.Dataset
+        table loaded as XArray dataset
     """
     if timecols is None:
         timecols = []
@@ -340,13 +585,33 @@ def read_generic_table(
         )
         return xr.Dataset()

-    with open_table_ro(infile) as
-        if
+    with open_table_ro(infile) as gtable:
+        if gtable.nrows() == 0:
             logger.debug(f"table is empty: {inpath} {tname}")
             return xr.Dataset(attrs=attrs)

-
-
+        # if len(ignore) > 0: #This is needed because some SOURCE tables have a SOURCE_MODEL column that is corrupted and this causes the open_query to fail.
+        # select_columns = gtable.colnames()
+        # select_columns_str = str([item for item in select_columns if item not in ignore])[1:-1].replace("'", "") #Converts an array to a comma sepearted string. For example ['a', 'b', 'c'] to 'a, b, c'.
+        # taql_gtable = f"select " + select_columns_str + f" from $gtable {taql_where}"
+        # else:
+        # taql_gtable = f"select * from $gtable {taql_where}"
+
+        # relatively often broken columns that we do not need
+        exclude_pattern = ", !~p/SOURCE_MODEL/"
+        taql_gtable = f"select *{exclude_pattern} from $gtable {taql_where or ''}"
+
+        with open_query(gtable, taql_gtable) as tb_tool:
+            if tb_tool.nrows() == 0:
+                logger.debug(
+                    f"table query is empty: {inpath} {tname}, with where {taql_where}"
+                )
+                return xr.Dataset(attrs=attrs)
+
+            colnames = tb_tool.colnames()
+            mcoords, mvars = load_cols_into_coords_data_vars(
+                infile, tb_tool, timecols, ignore
+            )

     mvars = add_units_measures(mvars, cc_attrs)
     mcoords = add_units_measures(mcoords, cc_attrs)
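For reference, the TaQL string assembled by the new code above expands to something like the following; the WHERE clause is whatever make_taql_where_between_min_max() returned (the value here is invented):

    exclude_pattern = ", !~p/SOURCE_MODEL/"
    taql_where = "where TIME >= 4.8e9 AND TIME <= 4.9e9"  # hypothetical constraint
    taql_gtable = f"select *{exclude_pattern} from $gtable {taql_where or ''}"
    # -> "select *, !~p/SOURCE_MODEL/ from $gtable where TIME >= 4.8e9 AND TIME <= 4.9e9"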
@@ -379,49 +644,116 @@ def read_generic_table(
     return xds


-def
-
+def load_cols_into_coords_data_vars(
+    inpath: str,
+    tb_tool: tables.table,
+    timecols: Union[List[str], None] = None,
+    ignore: Union[List[str], None] = None,
 ) -> Tuple[Dict[str, xr.Dataset], Dict[str, xr.Dataset]]:
-    """
-
+    """
+    Produce a set of coordinate xarrays and a set of data variables xarrays
+    from the columns of a table.
+
+    Parameters
+    ----------
+    inpath : str
+        input path
+    tb_tool: tables.table
+        tool being used to load data
+    timecols: Union[List[str], None] (Default value = None)
+        list of columns to be considered as TIME-related
+    ignore: Union[List[str], None] (Default value = None)
+        columns to ignore
+
+    Returns
+    -------
+    Tuple[Dict[str, xr.Dataset], Dict[str, xr.Dataset]]
+        coordinates dictionary + variables dictionary
+    """
+    columns_loader = find_best_col_loader(inpath, tb_tool.nrows())
+
+    mcoords, mvars = columns_loader(inpath, tb_tool, timecols, ignore)
+
+    return mcoords, mvars

-    :param infile: path name of the MS
-    :param tb_tool: table to red the columns
-    :param timecols: columns names to convert to datetime format
-    :param ignore: list of column names to skip and not try to read.

-
+def find_best_col_loader(inpath: str, nrows: int) -> Callable:
     """
+    Simple heuristic: for any tables other than POINTING, use the generic_load_cols
+    function that is able to deal with variable size columns. For POINTING (and if it has
+    more rows than an arbitrary "small" threshold) use a more efficient load function that
+    loads the data by column (but is not able to deal with any generic table).
+    For now, all other subtables are loaded using the generic column loader.
+
+    Background: the POINTING subtable can have a very large number of rows. For example in
+    ALMA it is sampled at ~50ms intervals which typically produces of the order of
+    [10^5, 10^7] rows. This becomes a serious performance bottleneck when loading the
+    table using row() (and one dict allocated per row).
+    This function chooses an alternative "by-column" load function to load in the columns
+    when the table is POINTING. See xradio issue #128 for now this distinction is made
+    solely for performance reasons.

-
-
-
-
-
-
-
-
-
-
-
-
-
+    Parameters
+    ----------
+    inpath : str
+        path name of the MS table
+    nrows : int
+        number of rows found in the table
+
+    Returns
+    -------
+    Callable
+        function best suited to load the data from the columns of this table
+    """
+    # do not give up generic by-row() loading if nrows is (arbitrary) small
+    ARBITRARY_MIN_ROWS = 1000

-
-
-
-
-
-
+    if inpath.endswith("POINTING") and nrows >= ARBITRARY_MIN_ROWS:
+        columns_loader = load_fixed_size_cols
+    else:
+        columns_loader = load_generic_cols
+
+    return columns_loader
+
+
+def load_generic_cols(
+    inpath: str,
+    tb_tool: tables.table,
+    timecols: Union[List[str], None],
+    ignore: Union[List[str], None],
+) -> Tuple[Dict[str, xr.Dataset], Dict[str, xr.Dataset]]:
+    """
+    Loads data for each MS column (loading the data in memory) into Xarray datasets
+
+    This function is generic in that it can load variable size array columns. See also
+    load_fixed_size_cols() as a simpler and much better performing alternative
+    for tables that are large and expected/guaranteed to not have columns with variable
+    size cells.
+
+    Parameters
+    ----------
+    inpath : str
+        path name of the MS table
+    tb_tool : tables.table
+        table to load the columns
+    timecols : Union[List[str], None]
+        columns names to convert to datetime format
+    ignore : Union[List[str], None]
+        list of column names to skip and not try to load.
+
+    Returns
+    -------
+    Tuple[Dict[str, xr.Dataset], Dict[str, xr.Dataset]]
+        dict of coordinates and dict of data vars.
+    """
+
+    col_cells = find_loadable_filled_cols(tb_tool, ignore)

     trows = tb_tool.row(ignore, exclude=True)[:]

     # Produce coords and data vars from MS columns
     mcoords, mvars = {}, {}
     for col in col_cells.keys():
-        if tb_tool.coldatatype(col) == "record":
-            continue  # not supported
-
         try:
             # TODO
             # benchmark np.stack() performance
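The row counts that motivate the POINTING special case in find_best_col_loader() are easy to reproduce; the numbers below are illustrative (only the ~50 ms sampling figure comes from the docstring above):

    sampling_interval_s = 0.048  # ~50 ms ALMA pointing sampling, as noted in the docstring
    observation_hours = 2
    n_antennas = 43
    rows = int(observation_hours * 3600 / sampling_interval_s) * n_antennas
    print(rows)  # 6450000 rows, far above the ARBITRARY_MIN_ROWS = 1000 threshold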
@@ -446,87 +778,287 @@ def read_generic_cols(

             if len(set([isinstance(row[col], dict) for row in trows])) > 1:
                 continue  # can't deal with this case
-            mshape = np.array(max([np.array(row[col]).shape for row in trows]))
-            try:
-                pad_nan = get_pad_nan(col_cells[col])

-
-                # benchmark np.stack() performance
-                data = np.stack(
-                    [
-                        np.pad(
-                            (
-                                row[col]
-                                if len(row[col]) > 0
-                                else np.array(row[col]).reshape(
-                                    np.arange(len(mshape)) * 0
-                                )
-                            ),
-                            [(0, ss) for ss in mshape - np.array(row[col]).shape],
-                            "constant",
-                            constant_values=pad_nan,
-                        )
-                        for row in trows
-                    ]
-                )
-            except Exception as exc:
-                level = logger.WARNING
-                if col in known_misbehaving_cols:
-                    level = logger.DEBUG
-                logger.log(
-                    level, f"{infile}: failed to read data for column {col}: {exc}"
-                )
-                data = []
+            data = handle_variable_col_issues(inpath, col, col_cells, trows)

         if len(data) == 0:
             continue
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        array_type, array_data = raw_col_data_to_coords_vars(
+            inpath, tb_tool, col, data, timecols
+        )
+        if array_type == "coord":
+            mcoords[col.lower()] = array_data
+        elif array_type == "data_var":
+            mvars[col.lower()] = array_data
+
+    return mcoords, mvars
+
+
+def load_fixed_size_cols(
+    inpath: str,
+    tb_tool: tables.table,
+    timecols: Union[List[str], None],
+    ignore: Union[List[str], None],
+) -> Tuple[Dict[str, xr.Dataset], Dict[str, xr.Dataset]]:
+    """
+    Loads columns into memory via the table tool getcol() function, as opposed to
+    load_generic_cols() which loads on a per-row basis via row().
+    This function is 2+ orders of magnitude faster for large tables (pointing tables with
+    the order of >=10^5 rows)
+    Prefer this function for performance reasons when all rows can be assumed to be fixed
+    size (even if they are of array type).
+    This is performance-critical for the POINTING subtable.
+
+    Parameters
+    ----------
+    inpath : str
+        path name of the MS
+    tb_tool : tables.table
+        table to red the columns
+    timecols : Union[List[str], None]
+        columns names to convert to datetime format
+    ignore : Union[List[str], None]
+        list of column names to skip and not try to load.
+
+    Returns
+    -------
+    Tuple[Dict[str, xr.Dataset], Dict[str, xr.Dataset]]
+        dict of coordinates and dict of data vars, ready to construct an xr.Dataset
+    """
+
+    loadable_cols = find_loadable_filled_cols(tb_tool, ignore)
+
+    # Produce coords and data vars from MS columns
+    mcoords, mvars = {}, {}
+    for col in loadable_cols.keys():
+        try:
+            data = tb_tool.getcol(col)
+            if isinstance(data, dict):
+                data = data["array"].reshape(data["shape"])
+        except Exception as exc:
+            logger.warning(
+                f"{inpath}: failed to load data with getcol for column {col}: {exc}"
             )
+            data = []
+
+        if len(data) == 0:
+            continue
+
+        array_type, array_data = raw_col_data_to_coords_vars(
+            inpath, tb_tool, col, data, timecols
+        )
+        if array_type == "coord":
+            mcoords[col.lower()] = array_data
+        elif array_type == "data_var":
+            mvars[col.lower()] = array_data

     return mcoords, mvars


+def find_loadable_filled_cols(
+    tb_tool: tables.table, ignore: Union[List[str], None]
+) -> Dict:
+    """
+    For a table, finds the columns that are:
+    - loadable = not of record type, and not to be ignored
+    - filled = the column cells are populated.
+
+    Parameters
+    ----------
+    tb_tool : tables.table
+        table to red the columns
+    ignore : Union[List[str], None]
+        list of column names to skip and not try to load.
+
+    Returns
+    -------
+    Dict
+        dict of {column name => first cell} for columns that can/should be loaded
+    """
+
+    colnames = tb_tool.colnames()
+    # columns that are not populated are skipped. record columns are not supported
+    loadable_cols = {
+        col: tb_tool.getcell(col, 0)
+        for col in colnames
+        if (col not in ignore)
+        and (tb_tool.iscelldefined(col, 0))
+        and tb_tool.coldatatype(col) != "record"
+    }
+    return loadable_cols
+
+
+def raw_col_data_to_coords_vars(
+    inpath: str,
+    tb_tool: tables.table,
+    col: str,
+    data: np.ndarray,
+    timecols: Union[List[str], None],
+) -> Tuple[str, xr.DataArray]:
+    """
+    From a raw np array of data (freshly loaded from a table column), prepares either a
+    coord or a data_var ready to be added to an xr.Dataset
+
+    Parameters
+    ----------
+    inpath: str
+        input table path
+    tb_tool: tables.table :
+        table toold being used to load data
+    col: str :
+        column
+    data: np.ndarray :
+        column data
+    timecols: Union[List[str], None]
+        columns to be treated as TIME-related
+
+    Returns
+    -------
+    Tuple[str, xr.DataArray]
+        array type string (whether this column is a 'coord' or a 'data_var') + DataArray
+        with column data/coord values ready to be added to the table xds
+    """
+
+    # Almost sure that when TIME is present (in a standard MS subt) it
+    # is part of the key. But what about non-std subtables, ASDM subts?
+    subts_with_time_key = (
+        "FEED",
+        "FLAG_CMD",
+        "FREQ_OFFSET",
+        "HISTORY",
+        "POINTING",
+        "SOURCE",
+        "SYSCAL",
+        "WEATHER",
+    )
+    dim_prefix = "dim"
+
+    if col in timecols:
+        if col == "MJD":
+            data = convert_mjd_time(data).astype("float64") / 1e9
+        else:
+            try:
+                data = convert_casacore_time(data)
+            except pd.errors.OutOfBoundsDatetime as exc:
+                if inpath.endswith("WEATHER"):
+                    # intentionally not callling logging.exception
+                    logger.warning(
+                        f"Exception when converting WEATHER/TIME: {exc}. TIME data: {data}"
+                    )
+                else:
+                    raise
+    # should also probably add INTERVAL not only TIME
+    if col.endswith("_ID") or (inpath.endswith(subts_with_time_key) and col == "TIME"):
+        # weather table: importasdm produces very wrong "-1" ANTENNA_ID
+        if (
+            inpath.endswith("WEATHER")
+            and col == "ANTENNA_ID"
+            and "NS_WX_STATION_ID" in tb_tool.colnames()
+        ):
+            data = tb_tool.getcol("NS_WX_STATION_ID")
+
+        array_type = "coord"
+        array_data = xr.DataArray(
+            data,
+            dims=[
+                f"{dim_prefix}_{di}_{ds}" for di, ds in enumerate(np.array(data).shape)
+            ],
+        )
+    else:
+        array_type = "data_var"
+        array_data = xr.DataArray(
+            data,
+            dims=[
+                f"{dim_prefix}_{di}_{ds}" for di, ds in enumerate(np.array(data).shape)
+            ],
+        )
+
+    return array_type, array_data
+
+
+def handle_variable_col_issues(
+    inpath: str, col: str, col_cells: dict, trows: tables.tablerow
+) -> np.ndarray:
+    """
+    load variable-size array columns, padding with nans wherever
+    needed. This happens for example often in the SPECTRAL_WINDOW
+    table (CHAN_WIDTH, EFFECTIVE_BW, etc.).
+    Also handle exceptions gracefully when trying to load the rows.
+
+    Parameters
+    ----------
+    inpath : str
+        path name of the MS
+    col : str
+        column being loaded
+    col_cells : dict
+        col: cell} values
+    trows : tables.tablerow
+        rows from a table as loaded by tables.row()
+
+    Returns
+    -------
+    np.ndarray
+        array with column values (possibly padded if rows vary in size)
+    """
+
+    # Optional cols known to sometimes have inconsistent values
+    known_misbehaving_cols = ["ASSOC_NATURE"]
+
+    mshape = np.array(max([np.array(row[col]).shape for row in trows]))
+    try:
+        pad_nan = get_pad_nan(col_cells[col])
+
+        # TODO
+        # benchmark np.stack() performance
+        data = np.stack(
+            [
+                np.pad(
+                    (
+                        row[col]
+                        if len(row[col]) > 0
+                        else np.array(row[col]).reshape(np.arange(len(mshape)) * 0)
+                    ),
+                    [(0, ss) for ss in mshape - np.array(row[col]).shape],
+                    "constant",
+                    constant_values=pad_nan,
+                )
+                for row in trows
+            ]
+        )
+    except Exception as exc:
+        msg = f"{inpath}: failed to load data for column {col}: {exc}"
+        if col in known_misbehaving_cols:
+            logger.debug(msg)
+        else:
+            logger.warning(msg)
+        data = np.empty(0)
+
+    return data
+
+
 def read_flat_col_chunk(infile, col, cshape, ridxs, cstart, pstart) -> np.ndarray:
     """
     Extract data chunk for each table col, this is fed to dask.delayed
+
+    Parameters
+    ----------
+    infile :
+
+    col :
+
+    cshape :
+
+    ridxs :
+
+    cstart :
+
+    pstart :
+
+    Returns
+    -------
+    np.ndarray
     """

     with open_table_ro(infile) as tb_tool:
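The nan-padding that handle_variable_col_issues() applies to variable-size cells can be seen on a toy example (values invented, mimicking a SPECTRAL_WINDOW-style column with rows of different channel counts):

    import numpy as np

    rows = [np.array([1e6, 1e6, 1e6, 1e6]), np.array([2e6, 2e6])]
    mshape = np.array(max(r.shape for r in rows))
    data = np.stack(
        [
            np.pad(r, [(0, s) for s in mshape - np.array(r.shape)],
                   "constant", constant_values=np.nan)
            for r in rows
        ]
    )
    print(data.shape)  # (2, 4); the short row is padded with nan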
@@ -588,6 +1120,30 @@ def read_col_chunk(
 ) -> np.ndarray:
     """
     Function to perform delayed reads from table columns.
+
+    Parameters
+    ----------
+    infile : str
+
+    ts_taql : str
+
+    col : str
+
+    cshape : Tuple[int]
+
+    tidxs : np.ndarray
+
+    bidxs : np.ndarray
+
+    didxs : np.ndarray
+
+    d1: Tuple[int, int]
+
+    d2: Tuple[int, int]
+
+    Returns
+    -------
+    np.ndarray
     """
     # TODO: consider calling load_col_chunk() from inside the withs
     # for read_delayed_pointing_table and read_expanded_main_table
@@ -600,8 +1156,13 @@ def read_col_chunk(
             elif len(cshape) == 4:  # DATA and FLAG
                 data = query.getcolslice(col, (d1[0], d2[0]), (d1[1], d2[1]), [], 0, -1)

-
-
+            policy = "warn"
+            if np.issubdtype(data.dtype, np.integer):
+                policy = "ignore"
+            with np.errstate(invalid=policy):
+                # full data is the maximum of the data shape and chunk shape dimensions
+                fulldata = np.full(cshape, np.nan, dtype=data.dtype)
+
             if len(didxs) > 0:
                 fulldata[tidxs[didxs], bidxs[didxs]] = data[didxs]

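The dtype-dependent errstate policy above exists because filling an integer array from np.nan triggers a floating-point "invalid" warning on recent NumPy and silently produces INT_MIN fill values, as the get_pad_nan() comment earlier in this file notes; a minimal illustration (assumption: a NumPy version that emits the cast warning):

    import numpy as np

    with np.errstate(invalid="ignore"):
        fulldata = np.full((2, 2), np.nan, dtype=np.int32)  # nan cast to int32
    print(fulldata[0, 0])  # -2147483648 on typical platforms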
@@ -609,20 +1170,85 @@ def read_col_chunk(


 def read_col_conversion(
-    tb_tool,
+    tb_tool: tables.table,
     col: str,
     cshape: Tuple[int],
     tidxs: np.ndarray,
     bidxs: np.ndarray,
-):
+) -> np.ndarray:
     """
     Function to perform delayed reads from table columns when converting
     (no need for didxs)
+
+    Parameters
+    ----------
+    tb_tool : tables.table
+
+    col : str
+
+    cshape : Tuple[int]
+
+    tidxs : np.ndarray
+
+    bidxs : np.ndarray
+
+    Returns
+    -------
+    np.ndarray
     """
-    data = tb_tool.getcol(col)

-    #
-    #
-
-
-
+    # Workaround for https://github.com/casacore/python-casacore/issues/130
+    # WARNING: Assumes tb_tool is a single measurement set not an MMS.
+    # WARNING: Assumes the num_frequencies * num_polarizations < 2**29. If false,
+    # https://github.com/casacore/python-casacore/issues/130 isn't mitigated.
+
+    # Use casacore to get the shape of a row for this column
+    #################################################################################
+
+    # Get the total number of rows in the base measurement set
+    nrows_total = tb_tool.nrows()
+
+    # getcolshapestring() only works on columns where a row element is an
+    # array ie. fails for TIME
+    # Assumes the RuntimeError is because the column is a scalar
+    try:
+        shape_string = tb_tool.getcolshapestring(col)[0]
+        # Convert `shape_string` into a tuple that numpy understands
+        extra_dimensions = tuple(
+            [
+                int(idx)
+                for idx in shape_string.replace("[", "").replace("]", "").split(", ")
+            ]
+        )
+    except RuntimeError:
+        extra_dimensions = ()
+
+    #################################################################################
+
+    # Get dtype of the column. Only read first row from disk
+    col_dtype = np.array(tb_tool.col(col)[0]).dtype
+
+    # Construct a numpy array to populate. `data` has shape (n_times, n_baselines, n_frequencies, n_polarizations)
+    data = np.full(cshape + extra_dimensions, np.nan, dtype=col_dtype)
+
+    # Use built-in casacore table iterator to populate the data column by unique times.
+    start_row = 0
+    for ts in tb_tool.iter("TIME", sort=False):
+        num_rows = ts.nrows()
+
+        # Create small temporary array to store the partial column
+        tmp_arr = np.full((num_rows,) + extra_dimensions, np.nan, dtype=col_dtype)
+
+        # Note we don't use `getcol()` because it's less safe. See:
+        # https://github.com/casacore/python-casacore/issues/130#issuecomment-463202373
+        ts.getcolnp(col, tmp_arr)
+
+        # Get the slice of rows contained in `tmp_arr`.
+        # Used to get the relevant integer indexes from `tidxs` and `bidxs`
+        tmp_slice = slice(start_row, start_row + num_rows)
+
+        # Copy `tmp_arr` into correct elements of `tmp_arr`
+        data[tidxs[tmp_slice], bidxs[tmp_slice]] = tmp_arr
+        start_row += num_rows
+
+    return data