xradio 0.0.56__py3-none-any.whl → 0.0.59__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- xradio/__init__.py +2 -2
- xradio/_utils/_casacore/casacore_from_casatools.py +12 -2
- xradio/_utils/_casacore/tables.py +1 -0
- xradio/_utils/coord_math.py +22 -23
- xradio/_utils/dict_helpers.py +76 -11
- xradio/_utils/schema.py +5 -2
- xradio/_utils/zarr/common.py +1 -73
- xradio/image/_util/_casacore/xds_from_casacore.py +49 -33
- xradio/image/_util/_casacore/xds_to_casacore.py +41 -14
- xradio/image/_util/_fits/xds_from_fits.py +146 -35
- xradio/image/_util/casacore.py +4 -3
- xradio/image/_util/common.py +4 -4
- xradio/image/_util/image_factory.py +8 -8
- xradio/image/image.py +45 -5
- xradio/measurement_set/__init__.py +19 -9
- xradio/measurement_set/_utils/__init__.py +1 -3
- xradio/measurement_set/_utils/_msv2/__init__.py +0 -0
- xradio/measurement_set/_utils/_msv2/_tables/read.py +17 -76
- xradio/measurement_set/_utils/_msv2/_tables/read_main_table.py +2 -685
- xradio/measurement_set/_utils/_msv2/conversion.py +174 -156
- xradio/measurement_set/_utils/_msv2/create_antenna_xds.py +9 -16
- xradio/measurement_set/_utils/_msv2/create_field_and_source_xds.py +128 -222
- xradio/measurement_set/_utils/_msv2/msv2_to_msv4_meta.py +1 -2
- xradio/measurement_set/_utils/_msv2/msv4_info_dicts.py +8 -7
- xradio/measurement_set/_utils/_msv2/msv4_sub_xdss.py +31 -74
- xradio/measurement_set/_utils/_msv2/partition_queries.py +1 -261
- xradio/measurement_set/_utils/_msv2/subtables.py +0 -107
- xradio/measurement_set/_utils/_utils/interpolate.py +60 -0
- xradio/measurement_set/_utils/_zarr/encoding.py +2 -7
- xradio/measurement_set/convert_msv2_to_processing_set.py +0 -2
- xradio/measurement_set/load_processing_set.py +2 -2
- xradio/measurement_set/measurement_set_xdt.py +20 -16
- xradio/measurement_set/open_processing_set.py +1 -3
- xradio/measurement_set/processing_set_xdt.py +54 -841
- xradio/measurement_set/schema.py +122 -132
- xradio/schema/check.py +95 -101
- xradio/schema/dataclass.py +159 -22
- xradio/schema/export.py +99 -0
- xradio/schema/metamodel.py +51 -16
- xradio/schema/typing.py +5 -5
- xradio/sphinx/schema_table.py +41 -77
- {xradio-0.0.56.dist-info → xradio-0.0.59.dist-info}/METADATA +20 -5
- xradio-0.0.59.dist-info/RECORD +65 -0
- {xradio-0.0.56.dist-info → xradio-0.0.59.dist-info}/WHEEL +1 -1
- xradio/image/_util/fits.py +0 -13
- xradio/measurement_set/_utils/_msv2/_tables/load.py +0 -66
- xradio/measurement_set/_utils/_msv2/_tables/load_main_table.py +0 -490
- xradio/measurement_set/_utils/_msv2/_tables/read_subtables.py +0 -398
- xradio/measurement_set/_utils/_msv2/_tables/write.py +0 -323
- xradio/measurement_set/_utils/_msv2/_tables/write_exp_api.py +0 -388
- xradio/measurement_set/_utils/_msv2/chunks.py +0 -115
- xradio/measurement_set/_utils/_msv2/descr.py +0 -165
- xradio/measurement_set/_utils/_msv2/msv2_msv3.py +0 -7
- xradio/measurement_set/_utils/_msv2/partitions.py +0 -392
- xradio/measurement_set/_utils/_utils/cds.py +0 -40
- xradio/measurement_set/_utils/_utils/xds_helper.py +0 -404
- xradio/measurement_set/_utils/_zarr/read.py +0 -263
- xradio/measurement_set/_utils/_zarr/write.py +0 -329
- xradio/measurement_set/_utils/msv2.py +0 -106
- xradio/measurement_set/_utils/zarr.py +0 -133
- xradio-0.0.56.dist-info/RECORD +0 -78
- {xradio-0.0.56.dist-info → xradio-0.0.59.dist-info}/licenses/LICENSE.txt +0 -0
- {xradio-0.0.56.dist-info → xradio-0.0.59.dist-info}/top_level.txt +0 -0
--- xradio/measurement_set/_utils/_utils/xds_helper.py
+++ /dev/null
@@ -1,404 +0,0 @@
-from importlib.metadata import version
-import toolviper.utils.logger as logger, multiprocessing, psutil
-from typing import Any, Dict, List, Tuple, Union
-
-import numpy as np
-import xarray as xr
-
-from .cds import CASAVisSet
-from .stokes_types import stokes_types
-from xradio._utils.list_and_array import get_pad_value
-
-
-def make_coords(
-    xds: xr.Dataset, ddi: int, subtables: Tuple[xr.Dataset, ...]
-) -> Dict[str, np.ndarray]:
-    """
-    Make the coords to be added to a partition or chunk (besides
-    the time, baseline) basic structure
-
-    Grabs:
-    - channel (center) frequency values from the spw subtable
-    - pol idxs from the pol+ddi subtables -> pol names via the stokes_types
-    - antenna IDs from antenna subtable
-
-    Parameters
-    ----------
-    xds : xr.Dataset
-
-    ddi : int
-
-    subtables: Tuple[xr.Dataset, ...]
-
-
-    Returns
-    -------
-    Dict[str, np.ndarray]
-    """
-    ant_xds, ddi_xds, spw_xds, pol_xds = subtables
-    freq = spw_xds.CHAN_FREQ.values[
-        ddi_xds.SPECTRAL_WINDOW_ID.values[ddi], : xds.freq.shape[0]
-    ]
-    pol_ids = pol_xds.CORR_TYPE.values[
-        ddi_xds.POLARIZATION_ID.values[ddi], : xds.pol.shape[0]
-    ]
-    pol_names = np.vectorize(stokes_types.get)(pol_ids)
-    ant_id = ant_xds.antenna_id.values
-    coords = {
-        "freq": freq,
-        "pol": pol_names,
-        "antenna_id": ant_id,
-        # These will be metainfo in partitions
-        # "spw_id": [ddi_xds["spectral_window_id"].values[ddi]],
-        # "pol_id": [ddi_xds["polarization_id"].values[ddi]],
-    }
-    return coords
-
-
-def vis_xds_packager_cds(
-    subtables: List[Tuple[str, xr.Dataset]],
-    partitions: Dict[Any, xr.Dataset],
-    descr_add: str = "",
-) -> CASAVisSet:
-    """
-    Takes a a list of subtable xds datasets and a dictionary of data
-    partition xds datasets and and packages them as a CASA vis dataset
-    (cds)
-
-    Parameters
-    ----------
-    partitions : List[Tuple[str, xr.Dataset]]
-        data partiions as xds datasets
-    subtables : Dict[Any, xr.Dataset]
-        subtables as xds datasets
-    descr_add : str (Default value = "")
-        substring to add to the short descr string of the cds
-
-    Returns
-    -------
-    CASAVisSet
-        A "cds" - container for the metainfo subtables and data partitions
-    """
-    vers = version("xradio")
-
-    return CASAVisSet(
-        subtables,
-        partitions,
-        f"CASA vis set produced by xradio {vers}/{descr_add}",
-    )
-
-
-def vis_xds_packager_mxds(
-    partitions: Dict[Any, xr.Dataset],
-    subtables: List[Tuple[str, xr.Dataset]],
-    add_global_coords: bool = True,
-) -> xr.Dataset:
-    """
-    Takes a dictionary of data partition xds datasets and a list of
-    subtable xds datasets and packages them as a dataset of datasets
-    (mxds)
-
-    Parameters
-    ----------
-    partitions : Dict[Any, xr.Dataset]
-        data partiions as xds datasets
-    subtables : List[Tuple[str, xr.Dataset]]
-        subtables as xds datasets
-    :add_global_coords: whether to add coords to the output mxds
-    add_global_coords: bool (Default value = True)
-
-    Returns
-    -------
-    xr.Dataset
-        A "mxds" - xr.dataset of datasets
-    """
-    mxds = xr.Dataset(attrs={"metainfo": subtables, "partitions": partitions})
-
-    if add_global_coords:
-        mxds = mxds.assign_coords(make_global_coords(mxds))
-
-    return mxds
-
-
-def make_global_coords(mxds: xr.Dataset) -> Dict[str, xr.DataArray]:
-    coords = {}
-    metainfo = mxds.attrs["metainfo"]
-    if "antenna" in metainfo:
-        coords["antenna_ids"] = metainfo["antenna"].antenna_id.values
-        coords["antennas"] = xr.DataArray(
-            metainfo["antenna"].NAME.values, dims=["antenna_ids"]
-        )
-    if "field" in metainfo:
-        coords["field_ids"] = metainfo["field"].field_id.values
-        coords["fields"] = xr.DataArray(
-            metainfo["field"].NAME.values, dims=["field_ids"]
-        )
-    if "feed" in mxds.attrs:
-        coords["feed_ids"] = metainfo["feed"].FEED_ID.values
-    if "observation" in metainfo:
-        coords["observation_ids"] = metainfo["observation"].observation_id.values
-        coords["observations"] = xr.DataArray(
-            metainfo["observation"].PROJECT.values, dims=["observation_ids"]
-        )
-    if "polarization" in metainfo:
-        coords["polarization_ids"] = metainfo["polarization"].pol_setup_id.values
-    if "source" in metainfo:
-        coords["source_ids"] = metainfo["source"].SOURCE_ID.values
-        coords["sources"] = xr.DataArray(
-            metainfo["source"].NAME.values, dims=["source_ids"]
-        )
-    if "spectral_window" in metainfo:
-        coords["spw_ids"] = metainfo["spectral_window"].spw_id.values
-    if "state" in metainfo:
-        coords["state_ids"] = metainfo["state"].STATE_ID.values
-
-    return coords
-
-
-def expand_xds(xds: xr.Dataset) -> xr.Dataset:
-    """
-    expand single (row) dimension of xds to (time, baseline)
-
-    Parameters
-    ----------
-    xds : xr.Dataset
-        "flat" dataset (with row dimension - without (time, baseline) dimensions)
-
-    Returns
-    -------
-    xr.Dataset
-        expanded dataset, with (time, baseline) dimensions
-    """
-    assert "baseline" not in xds.coords
-
-    txds = xds.copy()
-
-    unique_baselines, baselines = np.unique(
-        [txds.baseline_ant1_id.values, txds.baseline_ant2_id.values],
-        axis=1,
-        return_inverse=True,
-    )
-
-    txds["baseline"] = xr.DataArray(baselines.astype("int32"), dims=["row"])
-
-    try:
-        txds = (
-            txds.set_index(row=["time", "baseline"])
-            .unstack("row")
-            .transpose("time", "baseline", ...)
-        )
-        # unstack changes type to float when it needs to introduce NaNs, so
-        # we need to reset to the proper type. Avoid if possible, as the
-        # astype are costly
-        for dv in txds.data_vars:
-            if txds[dv].dtype != xds[dv].dtype:
-                txds[dv] = txds[dv].astype(xds[dv].dtype)
-    except Exception as exc:
-        logger.warning(
-            f"WARNING: Cannot expand rows to (time, baseline), "
-            f"possibly duplicate values in (time, baseline). Exception: {exc}."
-            f"\nDataset: {txds=}"
-        )
-        txds = xds.copy()
-
-    return txds
-
-
-def flatten_xds(xds: xr.Dataset) -> xr.Dataset:
-    """
-    flatten (time, baseline) dimensions of xds back to single dimension (row)
-
-    Parameters
-    ----------
-    xds : xr.Dataset
-
-
-    Returns
-    -------
-    xr.Dataset
-        Dataset in flat form (back to 'row' dimension as read by casacore tables)
-    """
-    txds = xds.copy()
-
-    # flatten the time x baseline dimensions of main table
-    if ("time" in xds.sizes) and ("baseline" in xds.sizes):
-        txds = xds.stack({"row": ("time", "baseline")}).transpose("row", ...)
-        # compute for issue https://github.com/hainegroup/oceanspy/issues/332
-        # drop=True silently does compute (or at least used to)
-
-        fill_value_int32 = get_pad_value(np.int32)
-        txds = txds.where(
-            (
-                (txds.STATE_ID != fill_value_int32)
-                & (txds.FIELD_ID != fill_value_int32)
-            ).compute(),
-            drop=True,
-        )  # .unify_chunks()
-
-        # re-assigning (implicitly dropping index coords) one by one produces
-        # DeprecationWarnings: https://github.com/pydata/xarray/issues/6505
-        astyped_data_vars = dict(xds.data_vars)
-        for dv in list(txds.data_vars):
-            if txds[dv].dtype != xds[dv].dtype:
-                astyped_data_vars[dv] = txds[dv].astype(xds[dv].dtype)
-            else:
-                astyped_data_vars[dv] = txds[dv]
-
-        flat_xds = xr.Dataset(astyped_data_vars, coords=txds.coords, attrs=txds.attrs)
-        flat_xds = flat_xds.reset_index(["time", "baseline"])
-
-    else:
-        flat_xds = txds
-
-    return flat_xds
-
-
-####################################
-# xautomatically compute best data chunking
-def optimal_chunking(
-    ndim: Union[int, None] = None,
-    didxs: Union[Tuple[int], List[int], None] = None,
-    chunk_size: str = "auto",
-    data_shape: Union[tuple, None] = None,
-) -> tuple:
-    """
-    Determine the optimal chunk shape for reading an MS or Image based
-    on machine resources and intended operations
-
-    Parameters
-    ----------
-    ndim : Union[int, None] = None
-        number of dimensions to chunk. An MS is 3, an
-        expanded MS is 4. An image could be anywhere from 2 to 5. Not
-        needed if data_shape is given.
-    didxs : Union[Tuple[int], List[int], None] = None
-        dimension indices over which subsequent operations
-        will be performed. Values should be less than ndim. Tries to
-        reduce inter-process communication of data contents. Needs to
-        know the shape to do this well. Default None balances chunk size
-        across all dimensions.
-    chunk_size : str (Default value = "auto")
-        target chunk size ('large', 'small', 'auto').
-        Default 'auto' tries to guess by looking at CPU core count and
-        available memory.
-    data_shape : Union[tuple, None] = None
-        shape of the total MS DDI or Image data. Helps
-        to know. Default None does not optimize based on shape
-
-    Returns
-    -------
-    tuple
-        optimal chunking for reading the ms (row, chan, pol)
-    """
-    assert (ndim is not None) or (
-        data_shape is not None
-    ), "either ndim or data_shape must be given"
-    assert chunk_size in ["large", "small", "auto"], "invalid chunk_size parameter"
-    if ndim is None:
-        ndim = len(data_shape)
-
-    opt_dims = (
-        didxs if (didxs is not None) and (len(didxs) > 0) else np.arange(ndim)
-    )  # maximize these dim chunk sizes
-    nonopt_dims = np.setdiff1d(np.arange(ndim), opt_dims)  # at the expense of these
-
-    max_chunk_sizes = (
-        data_shape
-        if data_shape is not None
-        else [dd for ii, dd in enumerate([10000, 10000, 10000, 4, 10]) if ii < ndim]
-    )
-    min_chunk_sizes = (
-        np.ceil(np.array(data_shape) / 80).astype(int)
-        if data_shape is not None
-        else (
-            [1000, 1, 1]
-            if ndim == 3
-            else [dd for ii, dd in enumerate([10, 10, 1, 1, 1]) if ii < ndim]
-        )
-    )
-    target_size = 175 * 1024**2 / 8  # ~175 MB chunk worst case with 8-byte DATA column
-    bytes_per_core = int(
-        round(
-            ((psutil.virtual_memory().available * 0.10) / multiprocessing.cpu_count())
-        )
-    )
-    if data_shape is not None:
-        bytes_per_core = min(
-            bytes_per_core, np.prod(data_shape) * 8 / 2
-        )  # ensure at least two chunks
-    if chunk_size == "large":
-        target_size = target_size * 6  # ~1 GB
-    if chunk_size == "auto":
-        target_size = max(min(target_size * 6, bytes_per_core / 8), target_size)
-
-    # start by setting the optimized dims to their max size and non-optimized dims to their min size
-    chunks = np.zeros((ndim), dtype="int")
-    chunks[opt_dims] = np.array(max_chunk_sizes)[opt_dims]
-    chunks[nonopt_dims] = np.array(min_chunk_sizes)[nonopt_dims]
-
-    # iteratively walk towards an optimal chunk size
-    # iteration is needed because rounding to nearest integer index can make a big different (2x) in chunk size
-    # for small dimensions like pol
-    for ii in range(10):
-        # if the resulting size is too big, reduce the sizes of the optimized dimensions
-        if (np.prod(chunks) > target_size) and (len(opt_dims) > 0):
-            chunks[opt_dims] = np.round(
-                chunks[opt_dims]
-                * (target_size / np.prod(chunks)) ** (1 / len(opt_dims))
-            )
-        # else if the resulting size is too small, increase the sizes of the non-optimized dimensions
-        elif (np.prod(chunks) < target_size) and (len(nonopt_dims) > 0):
-            chunks[nonopt_dims] = np.round(
-                chunks[nonopt_dims]
-                * (target_size / np.prod(chunks)) ** (1 / len(nonopt_dims))
-            )
-        chunks = np.min((chunks, max_chunk_sizes), axis=0)
-        chunks = np.max((chunks, min_chunk_sizes), axis=0)
-
-    return tuple(chunks)
-
-
-def calc_optimal_ms_chunk_shape(
-    memory_available_in_bytes, shape, element_size_in_bytes, column_name
-) -> int:
-    """
-    Calculates the max number of rows (1st dim in shape) of a variable
-    that can be fit in the memory for a thread.
-
-    Parameters
-    ----------
-    memory_available_in_bytes :
-
-    shape :
-
-    element_size_in_bytes :
-
-    column_name :
-
-
-    Returns
-    -------
-    int
-    """
-    factor = 0.8  # Account for memory used by other objects in thread.
-    # total_mem = np.prod(shape)*element_size_in_bytes
-    single_row_mem = np.prod(shape[1:]) * element_size_in_bytes
-
-    if not single_row_mem < factor * memory_available_in_bytes:
-        msg = (
-            "Not engough memory in a thread to contain a row of "
-            f"{column_name}. Need at least {single_row_mem / factor}"
-            " bytes."
-        )
-        raise RuntimeError(msg)
-
-    rows_chunk_size = int((factor * memory_available_in_bytes) / single_row_mem)
-
-    if rows_chunk_size > shape[0]:
-        rows_chunk_size = shape[0]
-
-    logger.debug(
-        "Numbers of rows in chunk for " + column_name + ": " + str(rows_chunk_size)
-    )
-
-    return rows_chunk_size
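The removed `xds_helper.py` above centered on a plain xarray round-trip between a flat `row` dimension and a `(time, baseline)` grid. A minimal, self-contained sketch of that round-trip on toy data (not the xradio API; the real helpers also reset dtypes after `unstack` and handled duplicate `(time, baseline)` pairs):

```python
import numpy as np
import xarray as xr

# toy "flat" dataset: one row per (time, baseline) sample
flat = xr.Dataset(
    {"DATA": ("row", np.arange(6, dtype="complex64"))},
    coords={
        "time": ("row", np.repeat([0.0, 1.0, 2.0], 2)),
        "baseline": ("row", np.tile([0, 1], 3)),
    },
)

# expand_xds() core: index rows by (time, baseline), then unstack to a grid
grid = (
    flat.set_index(row=["time", "baseline"])
    .unstack("row")
    .transpose("time", "baseline", ...)
)
assert grid.DATA.shape == (3, 2)

# flatten_xds() core: stack back to a single row dimension
flat_again = grid.stack({"row": ("time", "baseline")}).transpose("row", ...)
assert flat_again.DATA.sizes["row"] == 6
```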
--- xradio/measurement_set/_utils/_zarr/read.py
+++ /dev/null
@@ -1,263 +0,0 @@
-import toolviper.utils.logger as logger
-import os
-from pathlib import Path
-from typing import Dict, List, Tuple, Union
-
-import numpy as np
-import xarray as xr
-
-
-def read_part_keys(inpath: str) -> List[Tuple]:
-    """
-    Reads the partition keys from a Zarr-stored cds.
-
-    Parameters
-    ----------
-    inpath : str
-        path to read from
-
-    Returns
-    -------
-    List[Tuple]
-        partition keys from a cds
-
-    """
-
-    xds_keys = xr.open_zarr(
-        os.path.join(inpath, "partition_keys"),
-    )
-
-    spw_ids = xds_keys.coords["spw_ids"]
-    pol_setup_ids = xds_keys.coords["pol_setup_ids"]
-    intents = xds_keys.coords["intents"]
-
-    return list(zip(spw_ids.values, pol_setup_ids.values, intents.values))
-
-
-def read_subtables(inpath: str, asdm_subtables: bool) -> Dict[str, xr.Dataset]:
-    """
-    Reads the metainfo subtables from a Zarr-stored cds.
-
-    Parameters
-    ----------
-    inpath : str
-        path to read from
-
-    asdm_subtables : bool
-
-
-    Returns
-    -------
-    Dict[str, xr.Dataset]
-        metainfo subtables from a cds
-
-    """
-
-    metainfo = {}
-    metadir = Path(inpath, "metainfo")
-    for subt in sorted(metadir.iterdir()):
-        if subt.is_dir():
-            if not asdm_subtables and subt.name.startswith("ASDM_"):
-                logger.debug(f"Not loading ASDM_ subtable {subt.name}...")
-                continue
-
-            metainfo[subt.name] = read_xds(subt, consolidated=True)
-
-    return metainfo
-
-
-def read_partitions(inpath: str, part_keys: List[Tuple]) -> Dict[str, xr.Dataset]:
-    """
-    Reads all the data partitions a Zarr-stored cds.
-
-    Parameters
-    ----------
-    inpath : str
-        path to read from
-    part_keys : List[Tuple]
-
-
-    Returns
-    -------
-    Dict[str, xr.Dataset]
-        partitions from a cds
-
-    """
-
-    partitions = {}
-    partdir = Path(inpath, "partitions")
-    xds_cnt = 0
-    for part in sorted(partdir.iterdir()):
-        if part.is_dir() and part.name.startswith("xds_"):
-            xds = read_xds(part, consolidated=True)
-            partitions[part_keys[xds_cnt]] = xds
-            xds_cnt += 1
-
-    return partitions
-
-
-def read_xds(
-    inpath: str,
-    chunks: Union[Dict, None] = None,
-    consolidated: bool = True,
-    overwrite_encoded_chunks: bool = True,
-) -> xr.Dataset:
-    """
-    Read single xds from zarr storage.
-
-    Parameters
-    ----------
-    inpath : str
-        path to read from
-    chunks : Union[Dict, None] (Default value = None)
-        set chunk size per dimension. Dict is in the form of
-        'dim':chunk_size, for example {'time':100, 'baseline':400, 'chan':32, 'pol':1}.
-        Default None uses the original chunking in the zarr input.
-    consolidated : boold (Default value = True)
-        use zarr consolidated metadata.
-    overwrite_encoded_chunks : bool (Default value = True)
-        drop the zarr chunks encoded for each variable
-        when a dataset is loaded with specified chunk sizes.
-
-    Returns
-    -------
-    xr.Dataset
-    """
-
-    xds = xr.open_zarr(
-        os.path.join(inpath),
-        chunks=chunks,
-        consolidated=consolidated,
-        overwrite_encoded_chunks=overwrite_encoded_chunks,
-    )
-
-    return xds
-
-
-def read_zarr(
-    infile: str,
-    sel_xds: Union[List, str] = None,
-    chunks: Dict = None,
-    consolidated: bool = True,
-    overwrite_encoded_chunks: bool = True,
-    **kwargs,
-):
-    """
-    Note: old, initial cngi-io format. To be removed, most likely.
-    Read zarr format Visibility data from disk to an ngCASA visibilities dataset
-    object consisting of dictionaries of xarray Datasets.
-
-    Parameters
-    ----------
-    infile : str
-        input Visibility filename
-    sel_xds : string or list
-        Select the ddi to open, for example ['xds0','xds1'] will open the first two ddi. Default None returns everything
-    chunks : dict
-        sets specified chunk size per dimension. Dict is in the form of
-        'dim':chunk_size, for example {'time':100, 'baseline':400, 'chan':32, 'pol':1}.
-        Default None uses the original zarr chunking.
-    consolidated : bool
-        use zarr consolidated metadata capability. Only works for stores that have
-        already been consolidated. Default True works with datasets produced by
-        convert_ms which automatically consolidates metadata.
-    overwrite_encoded_chunks : bool
-        drop the zarr chunks encoded for each variable when a dataset is loaded with
-        specified chunk sizes. Default True, only applies when chunks is not None.
-    **kwargs :
-
-
-    Returns
-    -------
-
-    """
-
-    if chunks is None:
-        chunks = "auto"
-        # overwrite_encoded_chunks = False
-        # print('overwrite_encoded_chunks',overwrite_encoded_chunks)
-
-    infile = os.path.expanduser(infile)
-    if sel_xds is None:
-        sel_xds = os.listdir(infile)
-    sel_xds = list(np.atleast_1d(sel_xds))
-
-    # print(os.path.join(infile, 'DDI_INDEX'))
-    mxds = xr.open_zarr(
-        os.path.join(infile, "DDI_INDEX"),
-        chunks=chunks,
-        consolidated=consolidated,
-        overwrite_encoded_chunks=overwrite_encoded_chunks,
-    )
-
-    for part in os.listdir(os.path.join(infile, "global")):
-        xds_temp = xr.open_zarr(
-            os.path.join(infile, "global/" + part),
-            chunks=chunks,
-            consolidated=consolidated,
-            overwrite_encoded_chunks=overwrite_encoded_chunks,
-        )
-        xds_temp = _fix_dict_for_ms(part, xds_temp)
-        mxds.attrs[part] = xds_temp.compute()
-
-    for part in os.listdir(infile):
-        if ("xds" in part) and (part in sel_xds):
-            xds_temp = xr.open_zarr(
-                os.path.join(infile, part),
-                chunks=chunks,
-                consolidated=consolidated,
-                overwrite_encoded_chunks=overwrite_encoded_chunks,
-            )
-            xds_temp = _fix_dict_for_ms(part, xds_temp)
-            mxds.attrs[part] = xds_temp
-
-    return mxds
-
-
-def _fix_dict_for_ms(name, xds):
-    # Used to be:
-    # xds.attrs["column_descriptions"] = xds.attrs["column_descriptions"][0]
-    # xds.attrs["info"] = xds.attrs["info"][0]
-
-    if "xds" in name:
-        xds.column_descriptions["UVW"]["shape"] = np.array(
-            xds.column_descriptions["UVW"]["shape"].split(",")
-        ).astype(int)
-
-    if "spectral_window" == name:
-        xds.column_descriptions["CHAN_FREQ"]["keywords"]["MEASINFO"]["TabRefCodes"] = (
-            np.array(
-                xds.column_descriptions["CHAN_FREQ"]["keywords"]["MEASINFO"][
-                    "TabRefCodes"
-                ].split(",")
-            ).astype(int)
-        )
-        xds.column_descriptions["REF_FREQUENCY"]["keywords"]["MEASINFO"][
-            "TabRefCodes"
-        ] = np.array(
-            xds.column_descriptions["REF_FREQUENCY"]["keywords"]["MEASINFO"][
-                "TabRefCodes"
-            ].split(",")
-        ).astype(
-            int
-        )
-
-    if "antenna" == name:
-        xds.column_descriptions["OFFSET"]["shape"] = np.array(
-            xds.column_descriptions["OFFSET"]["shape"].split(",")
-        ).astype(int)
-        xds.column_descriptions["POSITION"]["shape"] = np.array(
-            xds.column_descriptions["POSITION"]["shape"].split(",")
-        ).astype(int)
-
-    if "feed" == name:
-        xds.column_descriptions["POSITION"]["shape"] = np.array(
-            xds.column_descriptions["POSITION"]["shape"].split(",")
-        ).astype(int)
-
-    if "observation" == name:
-        xds.column_descriptions["TIME_RANGE"]["shape"] = np.array(
-            xds.column_descriptions["TIME_RANGE"]["shape"].split(",")
-        ).astype(int)
-
-    return xds
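The removed zarr reader above was, at its core, a thin wrapper around `xr.open_zarr` with consolidated metadata and caller-specified rechunking. A minimal sketch of that pattern (the temporary store below is a made-up example, not the cds/mxds directory layout):

```python
import os
import tempfile

import numpy as np
import xarray as xr

# write a small store with consolidated metadata, as the converters did
store = os.path.join(tempfile.mkdtemp(), "xds_0")
xr.Dataset({"DATA": (("time", "baseline"), np.zeros((4, 3)))}).to_zarr(
    store, consolidated=True
)

# read_xds() pattern: consolidated open, chunking overridden per dimension
xds = xr.open_zarr(store, chunks={"time": 2}, consolidated=True)
print(xds.DATA.chunks)  # e.g. ((2, 2), (3,))
```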