xradio 0.0.59__py3-none-any.whl → 1.0.0__py3-none-any.whl
- xradio/_utils/list_and_array.py +4 -2
- xradio/image/_util/_casacore/xds_to_casacore.py +11 -4
- xradio/image/image.py +4 -2
- xradio/measurement_set/_utils/_msv2/conversion.py +36 -15
- xradio/measurement_set/_utils/_msv2/create_field_and_source_xds.py +3 -1
- xradio/measurement_set/_utils/_msv2/msv4_info_dicts.py +214 -67
- xradio/measurement_set/_utils/_msv2/partition_queries.py +248 -61
- xradio/measurement_set/convert_msv2_to_processing_set.py +28 -10
- xradio/measurement_set/measurement_set_xdt.py +14 -4
- xradio/measurement_set/open_processing_set.py +6 -6
- xradio/measurement_set/processing_set_xdt.py +69 -12
- xradio/measurement_set/schema.py +137 -180
- xradio/schema/__init__.py +0 -3
- xradio/schema/bases.py +23 -28
- xradio/schema/check.py +23 -15
- xradio/schema/common.py +45 -0
- xradio/schema/export.py +23 -2
- xradio/schema/metamodel.py +12 -8
- xradio/schema/typing.py +7 -13
- {xradio-0.0.59.dist-info → xradio-1.0.0.dist-info}/METADATA +3 -3
- {xradio-0.0.59.dist-info → xradio-1.0.0.dist-info}/RECORD +24 -23
- {xradio-0.0.59.dist-info → xradio-1.0.0.dist-info}/WHEEL +0 -0
- {xradio-0.0.59.dist-info → xradio-1.0.0.dist-info}/licenses/LICENSE.txt +0 -0
- {xradio-0.0.59.dist-info → xradio-1.0.0.dist-info}/top_level.txt +0 -0
xradio/measurement_set/_utils/_msv2/partition_queries.py

@@ -1,5 +1,8 @@
 import itertools
+import time
 import toolviper.utils.logger as logger
+import os
+import pandas as pd
 
 import numpy as np
 
@@ -17,6 +20,9 @@ def enumerated_product(*args):
     )
 
 
+import pickle, gzip
+
+
 def create_partitions(in_file: str, partition_scheme: list) -> list[dict]:
     """Create a list of dictionaries with the partition information.
 
@@ -34,38 +40,53 @@ def create_partitions(in_file: str, partition_scheme: list) -> list[dict]:
     list
         list of dictionaries with the partition information.
     """
-    # vla_otf (bool, optional): The partioning of VLA OTF (on the fly) mosaics needs a special partitioning scheme. Defaults to False.
 
-
-
-
+    ### Test new implementation without
+    # Always start with these (if available); then extend with user scheme.
+    partition_scheme = [
+        "DATA_DESC_ID",
+        "OBS_MODE",
+        "OBSERVATION_ID",
+        "EPHEMERIS_ID",
+    ] + list(partition_scheme)
 
-    partition_scheme = ["DATA_DESC_ID", "OBS_MODE"
+    # partition_scheme = ["DATA_DESC_ID", "OBS_MODE"] + list(
+    # partition_scheme
+    # )
 
-
-
+    t0 = time.time()
+    # --------- Load base columns from MAIN table ----------
     main_tb = tables.table(
         in_file, readonly=True, lockoptions={"option": "usernoread"}, ack=False
     )
-    par_df["DATA_DESC_ID"] = main_tb.getcol("DATA_DESC_ID")
-    par_df["FIELD_ID"] = main_tb.getcol("FIELD_ID")
-    par_df["SCAN_NUMBER"] = main_tb.getcol("SCAN_NUMBER")
-    par_df["STATE_ID"] = main_tb.getcol("STATE_ID")
-    par_df["OBSERVATION_ID"] = main_tb.getcol("OBSERVATION_ID")
-    par_df["ANTENNA1"] = main_tb.getcol("ANTENNA1")
-    par_df = par_df.drop_duplicates()
 
+    # Build minimal DF once. Pull only columns we may need.
+    # Add columns here if you expect to aggregate them per-partition.
+    base_cols = {
+        "DATA_DESC_ID": main_tb.getcol("DATA_DESC_ID"),
+        "FIELD_ID": main_tb.getcol("FIELD_ID"),
+        "SCAN_NUMBER": main_tb.getcol("SCAN_NUMBER"),
+        "STATE_ID": main_tb.getcol("STATE_ID"),
+        "OBSERVATION_ID": main_tb.getcol("OBSERVATION_ID"),
+        "ANTENNA1": main_tb.getcol("ANTENNA1"),
+    }
+    par_df = pd.DataFrame(base_cols).drop_duplicates()
+    logger.debug(
+        f"Loaded MAIN columns in {time.time() - t0:.2f}s "
+        f"({len(par_df):,} unique MAIN rows)"
+    )
+
+    # --------- Optional SOURCE/STATE derived columns ----------
+    # SOURCE_ID (via FIELD table)
+    t1 = time.time()
+    source_id_added = False
     field_tb = tables.table(
         os.path.join(in_file, "FIELD"),
         readonly=True,
         lockoptions={"option": "usernoread"},
         ack=False,
     )
-
-    # par_df["FIELD_NAME"] = np.array(field_tb.getcol("NAME"))[par_df["FIELD_ID"]]
-
-    # Get source ids if available from source table.
-    if table_exists(os.path.join(os.path.join(in_file, "SOURCE"))):
+    if table_exists(os.path.join(in_file, "SOURCE")):
         source_tb = tables.table(
             os.path.join(in_file, "SOURCE"),
             readonly=True,
@@ -73,13 +94,31 @@
             ack=False,
         )
         if source_tb.nrows() != 0:
-
-
-
-
-
+            # Map SOURCE_ID via FIELD_ID
+            field_source = np.asarray(field_tb.getcol("SOURCE_ID"))
+            par_df["SOURCE_ID"] = field_source[par_df["FIELD_ID"]]
+            source_id_added = True
+    logger.debug(
+        f"SOURCE processing in {time.time() - t1:.2f}s "
+        f"(added SOURCE_ID={source_id_added})"
+    )
+
+    if "EPHEMERIS_ID" in field_tb.colnames():
+        ephemeris_id_added = False
+        if field_tb.nrows() != 0:
+            # Map EPHEMERIS_ID via FIELD_ID
+            field_ephemeris = np.asarray(field_tb.getcol("EPHEMERIS_ID"))
+            par_df["EPHEMERIS_ID"] = field_ephemeris[par_df["FIELD_ID"]]
+            ephemeris_id_added = True
+        logger.debug(
+            f"EPHEMERIS processing in {time.time() - t1:.2f}s "
+            f"(added EPHEMERIS_ID={ephemeris_id_added})"
+        )
 
-    #
+    # OBS_MODE & SUB_SCAN_NUMBER (via STATE table)
+    t2 = time.time()
+    obs_mode_added = False
+    sub_scan_added = False
     if table_exists(os.path.join(in_file, "STATE")):
         state_tb = tables.table(
             os.path.join(in_file, "STATE"),
@@ -88,30 +127,36 @@
             ack=False,
         )
         if state_tb.nrows() != 0:
-
-
-
-            ]
-            par_df["SUB_SCAN_NUMBER"] =
+            state_obs_mode = np.asarray(state_tb.getcol("OBS_MODE"))
+            state_sub_scan = np.asarray(state_tb.getcol("SUB_SCAN"))
+            # Index by STATE_ID into STATE columns
+            par_df["OBS_MODE"] = state_obs_mode[par_df["STATE_ID"]]
+            par_df["SUB_SCAN_NUMBER"] = state_sub_scan[par_df["STATE_ID"]]
+            obs_mode_added = True
+            sub_scan_added = True
         else:
-
+            # If STATE empty, drop STATE_ID (it cannot partition anything)
+            if "STATE_ID" in par_df.columns:
+                par_df.drop(columns=["STATE_ID"], inplace=True)
 
-
-
-    partition_criteria = {}
-    for par in partition_scheme:
-        if par in par_df.columns:
-            partition_criteria[par] = par_df[par].unique()
-            partition_scheme_updated.append(par)
-    logger.info(f"Partition scheme that will be used: {partition_scheme_updated}")
+            if "SUB_SCAN_NUMBER" in par_df.columns:
+                par_df.drop(columns=["SUB_SCAN_NUMBER"], inplace=True)
 
-
-
+    logger.debug(
+        f"STATE processing in {time.time() - t2:.2f}s "
+        f"(OBS_MODE={obs_mode_added}, SUB_SCAN_NUMBER={sub_scan_added})"
+    )
 
-    #
+    # --------- Decide which partition keys are actually available ----------
+    t3 = time.time()
+    partition_scheme_updated = [k for k in partition_scheme if k in par_df.columns]
+    logger.info(f"Updated partition scheme used: {partition_scheme_updated}")
 
-    #
-
+    # If none of the requested keys exist, there is a single partition of "everything"
+    if not partition_scheme_updated:
+        partition_scheme_updated = []
+
+    # These are the axes we report per partition (present => aggregate unique values)
     partition_axis_names = [
         "DATA_DESC_ID",
         "OBSERVATION_ID",
@@ -121,30 +166,172 @@
         "SOURCE_ID",
         "OBS_MODE",
         "SUB_SCAN_NUMBER",
+        "EPHEMERIS_ID",
     ]
+    # Only include ANTENNA1 if user asked for it (keeps output size down)
     if "ANTENNA1" in partition_scheme:
         partition_axis_names.append("ANTENNA1")
 
-
-
-
-
-
+    # --------- Group only by realized partitions (no Cartesian product!) ----------
+    # observed=True speeds up if categorical; here it’s harmless. sort=False keeps source order.
+    if partition_scheme_updated:
+        grp = par_df.groupby(partition_scheme_updated, sort=False, observed=False)
+        groups_iter = grp
+    else:
+        # Single group: everything
+        groups_iter = [(None, par_df)]
+
+    partitions = []
+    # Fast aggregation: use NumPy for uniques to avoid pandas overhead in the tight loop.
+    for _, gdf in groups_iter:
+        part = {}
+        for name in partition_axis_names:
+            if name in gdf.columns:
+                # Return Python lists to match your prior structure (can be np.ndarray if preferred)
+                part[name] = np.unique(gdf[name].to_numpy()).tolist()
             else:
-
-
-    sub_par_df = par_df.query(query).drop_duplicates()
+                part[name] = [None]
+        partitions.append(part)
 
-
-
+    logger.debug(
+        f"Partition build in {time.time() - t3:.2f}s; total {len(partitions):,} partitions"
+    )
+    logger.debug(f"Total create_partitions time: {time.time() - t0:.2f}s")
 
-
-
-        if col_name in sub_par_df.columns:
-            partition_info[col_name] = sub_par_df[col_name].unique()
-        else:
-            partition_info[col_name] = [None]
+    # # with gzip.open("partition_original_small.pkl.gz", "wb") as f:
+    # # pickle.dump(partitions, f, protocol=pickle.HIGHEST_PROTOCOL)
 
-
+    # #partitions[1]["DATA_DESC_ID"] = [999] # make a change to test comparison
+    # #org_partitions = load_dict_list("partition_original_small.pkl.gz")
+    # org_partitions = load_dict_list("partition_original.pkl.gz")
 
     return partitions
+
+
+from typing import Any, List, Dict
+
+
+def save_dict_list(filename: str, data: List[Dict[str, Any]]) -> None:
+    """
+    Save a list of dictionaries containing NumPy arrays (or other objects)
+    to a compressed pickle file.
+    """
+    with gzip.open(filename, "wb") as f:
+        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+
+def load_dict_list(filename: str) -> List[Dict[str, Any]]:
+    """
+    Load a list of dictionaries containing NumPy arrays (or other objects)
+    from a compressed pickle file.
+    """
+    with gzip.open(filename, "rb") as f:
+        return pickle.load(f)
+
+
+def dict_list_equal(a: List[Dict[str, Any]], b: List[Dict[str, Any]]) -> bool:
+    """
+    Compare two lists of dictionaries to ensure they are exactly the same.
+    NumPy arrays are compared with array_equal, other objects with ==.
+    """
+    if len(a) != len(b):
+        return False
+
+    for d1, d2 in zip(a, b):
+        if d1.keys() != d2.keys():
+            return False
+        for k in d1:
+            v1, v2 = d1[k], d2[k]
+            if isinstance(v1, np.ndarray) and isinstance(v2, np.ndarray):
+                if not np.array_equal(v1, v2):
+                    return False
+            else:
+                if v1 != v2:
+                    return False
+    return True
+
+
+from typing import Iterable, Mapping, Tuple, List, Dict, Any, Set
+import numpy as np
+
+
+def _to_python_scalar(x: Any) -> Any:
+    """Convert NumPy scalars to Python scalars; leave others unchanged."""
+    if isinstance(x, np.generic):
+        return x.item()
+    return x
+
+
+def _to_hashable_value_list(v: Any) -> Tuple[Any, ...]:
+    """
+    Normalize a dict value (often list/np.ndarray) into a sorted, hashable tuple.
+    - Accepts list/tuple/np.ndarray/scalars/None.
+    - Treats None as a value.
+    - Sorts with a stable key that stringifies items to avoid dtype hiccups.
+    """
+    if isinstance(v, np.ndarray):
+        v = v.tolist()
+    if v is None or isinstance(v, (str, bytes)):
+        # Treat a bare scalar as a single-element collection for consistency.
+        v = [v]
+    elif not isinstance(v, (list, tuple)):
+        v = [v]
+
+    py_vals = [_to_python_scalar(x) for x in v]
+    # Sort by (type name, repr) to keep mixed types stable if present
+    return tuple(sorted(py_vals, key=lambda x: (type(x).__name__, repr(x))))
+
+
+def _canon_partition(
+    d: Mapping[str, Any], ignore_keys: Iterable[str] = ()
+) -> Tuple[Tuple[str, Tuple[Any, ...]], ...]:
+    """
+    Canonicalize a partition dict into a hashable, order-insensitive representation.
+    - Drops keys in ignore_keys.
+    - Converts each value collection to a sorted tuple.
+    - Sorts keys.
+    """
+    ign: Set[str] = set(ignore_keys)
+    items = []
+    for k, v in d.items():
+        if k in ign:
+            continue
+        items.append((k, _to_hashable_value_list(v)))
+    items.sort(key=lambda kv: kv[0])
+    return tuple(items)
+
+
+def compare_partitions_subset(
+    new_partitions: List[Dict[str, Any]],
+    original_partitions: List[Dict[str, Any]],
+    ignore_keys: Iterable[str] = (),
+) -> Tuple[bool, List[Dict[str, Any]]]:
+    """
+    Check that every partition in `new_partitions` also appears in `original_partitions`,
+    ignoring ordering (of partitions and of values within each key).
+
+    Parameters
+    ----------
+    new_partitions : list of dict
+        Partitions produced by the optimized/new code.
+    original_partitions : list of dict
+        Partitions produced by the original code (the reference).
+    ignore_keys : iterable of str, optional
+        Keys to ignore when comparing partitions (e.g., timestamps or debug fields).
+
+    Returns
+    -------
+    (ok, missing)
+    ok : bool
+        True if every new partition is found in the original set.
+    missing : list of dict
+        The list of partitions (from `new_partitions`) that were NOT found in `original_partitions`,
+        useful for debugging diffs.
+    """
+    orig_set = {_canon_partition(p, ignore_keys) for p in original_partitions}
+    missing = []
+    for p in new_partitions:
+        cp = _canon_partition(p, ignore_keys)
+        if cp not in orig_set:
+            missing.append(p)
+    return (len(missing) == 0, missing)
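
A note on the appended helpers: save_dict_list, load_dict_list, dict_list_equal and compare_partitions_subset exist to regression-check the reworked create_partitions against output saved from the previous implementation (see the commented-out load_dict_list calls above return partitions). A minimal usage sketch, assuming the helpers stay importable from this module; the MS path and reference file name are placeholders:

# Sketch: validate the new partitioning against a saved reference.
from xradio.measurement_set._utils._msv2.partition_queries import (
    create_partitions,
    save_dict_list,
    load_dict_list,
    compare_partitions_subset,
)

in_file = "my_data.ms"  # hypothetical MSv2 path

# Build partitions with the new implementation.
new_partitions = create_partitions(in_file, partition_scheme=[])

# Persist once as a reference, then reload and compare on later runs.
save_dict_list("partition_reference.pkl.gz", new_partitions)
reference = load_dict_list("partition_reference.pkl.gz")

ok, missing = compare_partitions_subset(new_partitions, reference)
print(f"all partitions found in reference: {ok}; missing: {len(missing)}")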

xradio/measurement_set/convert_msv2_to_processing_set.py

@@ -1,10 +1,13 @@
 import toolviper.utils.logger as logger
 import numcodecs
-from typing import Dict, Union
+from typing import Dict, Union, Literal
+import time
 
 import dask
 
-from xradio.measurement_set._utils._msv2.partition_queries import
+from xradio.measurement_set._utils._msv2.partition_queries import (
+    create_partitions,
+)
 from xradio.measurement_set._utils._msv2.conversion import (
     convert_and_write_partition,
     estimate_memory_and_cores_for_partitions,
@@ -13,7 +16,7 @@ from xradio.measurement_set._utils._msv2.conversion import (
 
 def estimate_conversion_memory_and_cores(
     in_file: str,
-    partition_scheme: list = [
+    partition_scheme: list = [],
 ) -> tuple[float, int, int]:
     """
     Given an MSv2 and a partition_scheme to use when converting it to MSv4,
@@ -52,7 +55,7 @@ def estimate_conversion_memory_and_cores(
 def convert_msv2_to_processing_set(
     in_file: str,
     out_file: str,
-    partition_scheme: list = [
+    partition_scheme: list = [],
     main_chunksize: Union[Dict, float, None] = None,
     with_pointing: bool = True,
     pointing_chunksize: Union[Dict, float, None] = None,
@@ -62,8 +65,9 @@
     sys_cal_interpolate: bool = False,
     use_table_iter: bool = False,
     compressor: numcodecs.abc.Codec = numcodecs.Zstd(level=2),
-
-
+    add_reshaping_indices: bool = False,
+    storage_backend: Literal["zarr", "netcdf"] = "zarr",
+    parallel_mode: Literal["none", "partition", "time"] = "none",
     overwrite: bool = False,
 ):
     """Convert a Measurement Set v2 into a Processing Set of Measurement Set v4.
@@ -79,7 +83,7 @@
         In addition to data description and polarization setup a finer partitioning is possible by specifying a list of partitioning keys. Any combination of the following keys are possible:
         "FIELD_ID", "SCAN_NUMBER", "STATE_ID", "SOURCE_ID", "SUB_SCAN_NUMBER", "ANTENNA1".
         "ANTENNA1" is intended as a single-dish specific partitioning option.
-        For mosaics where the phase center is rapidly changing (such as VLA on the fly mosaics) partition_scheme should be set to an empty list []. By default, [
+        For mosaics where the phase center is rapidly changing (such as VLA on the fly mosaics) partition_scheme should be set to an empty list []. By default, [].
     main_chunksize : Union[Dict, float, None], optional
         Defines the chunk size of the main dataset. If given as a dictionary, defines the sizes of several dimensions, and acceptable keys are "time", "baseline_id", "antenna_id", "frequency", "polarization". If given as a float, gives the size of a chunk in GiB. By default, None.
     with_pointing : bool, optional
@@ -98,9 +102,11 @@
         Whether to use the table iterator to read the main table of the MS v2. This should be set to True when reading datasets with large number of rows and few partitions, by default False.
     compressor : numcodecs.abc.Codec, optional
         The Blosc compressor to use when saving the converted data to disk using Zarr, by default numcodecs.Zstd(level=2).
-
+    add_reshaping_indices : bool, optional
+        Whether to add the tidxs, bidxs and row_id variables to each partition of the main dataset. These can be used to reshape the data back to the original ordering in the MS v2. This is mainly intended for testing and debugging, by default False.
+    storage_backend : Literal["zarr", "netcdf"], optional
         The on-disk format to use. "netcdf" is not yet implemented.
-    parallel_mode :
+    parallel_mode : Literal["none", "partition", "time"], optional
         Choose whether to use Dask to execute conversion in parallel, by default "none" and conversion occurs serially.
         The option "partition", parallelises the conversion over partitions specified by `partition_scheme`. The option "time" can only be used for phased array interferometers where there are no partitions
         in the MS v2; instead the MS v2 is parallelised along the time dimension and can be controlled by `main_chunksize`.
@@ -131,6 +137,7 @@
         parallel_mode = "none"
 
     partitions = create_partitions(in_file, partition_scheme=partition_scheme)
+
     logger.info("Number of partitions: " + str(len(partitions)))
     if parallel_mode == "time":
         assert (
@@ -140,7 +147,6 @@
     delayed_list = []
 
     for ms_v4_id, partition_info in enumerate(partitions):
-        # print(ms_v4_id,len(partition_info['FIELD_ID']))
 
         logger.info(
             "OBSERVATION_ID "
@@ -153,6 +159,11 @@
             + str(partition_info["FIELD_ID"])
             + ", SCAN "
             + str(partition_info["SCAN_NUMBER"])
+            + (
+                ", EPHEMERIS " + str(partition_info["EPHEMERIS_ID"])
+                if "EPHEMERIS_ID" in partition_info
+                else ""
+            )
             + (
                 ", ANTENNA " + str(partition_info["ANTENNA1"])
                 if "ANTENNA1" in partition_info
@@ -178,12 +189,14 @@
                     ephemeris_interpolate=ephemeris_interpolate,
                     phase_cal_interpolate=phase_cal_interpolate,
                     sys_cal_interpolate=sys_cal_interpolate,
+                    add_reshaping_indices=add_reshaping_indices,
                     compressor=compressor,
                     parallel_mode=parallel_mode,
                     overwrite=overwrite,
                 )
             )
         else:
+            start_time = time.time()
             convert_and_write_partition(
                 in_file,
                 out_file,
@@ -198,10 +211,15 @@
                 ephemeris_interpolate=ephemeris_interpolate,
                 phase_cal_interpolate=phase_cal_interpolate,
                 sys_cal_interpolate=sys_cal_interpolate,
+                add_reshaping_indices=add_reshaping_indices,
                 compressor=compressor,
                 parallel_mode=parallel_mode,
                 overwrite=overwrite,
             )
+            end_time = time.time()
+            logger.debug(
+                f"Time to convert partition {ms_v4_id}: {end_time - start_time:.2f} seconds"
+            )
 
     if parallel_mode == "partition":
         dask.compute(delayed_list)
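
The convert_msv2_to_processing_set signature gains add_reshaping_indices plus typed storage_backend and parallel_mode keywords, and the partition_scheme default becomes [] (create_partitions now always prepends DATA_DESC_ID, OBS_MODE, OBSERVATION_ID and EPHEMERIS_ID). A hedged call sketch using the module path from the file list above; the input and output paths are placeholders:

# Sketch: estimate resources, then convert with the new 1.0.0 keywords.
from xradio.measurement_set.convert_msv2_to_processing_set import (
    convert_msv2_to_processing_set,
    estimate_conversion_memory_and_cores,
)

in_file = "uid___A002_X_target.ms"         # hypothetical MSv2 input
out_file = "uid___A002_X_target.vis.zarr"  # hypothetical Processing Set store

# Returns a (float, int, int) tuple per the signature shown in the diff.
estimate = estimate_conversion_memory_and_cores(in_file, partition_scheme=[])
print(estimate)

convert_msv2_to_processing_set(
    in_file=in_file,
    out_file=out_file,
    partition_scheme=[],          # base keys are added automatically by create_partitions
    add_reshaping_indices=False,  # tidxs/bidxs/row_id variables, mainly for debugging
    storage_backend="zarr",       # "netcdf" is not yet implemented
    parallel_mode="partition",    # "none", "partition", or "time"
    overwrite=True,
)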

xradio/measurement_set/measurement_set_xdt.py

@@ -203,16 +203,26 @@ class MeasurementSetXdt:
         else:
             line_name = []
 
+        if "spectral_window_intent" not in self._xdt.frequency.attrs:
+            spw_intent = "UNSPECIFIED"
+        else:
+            spw_intent = self._xdt.frequency.attrs["spectral_window_intents"]
+
+        if "intents" in self._xdt.observation_info:
+            scan_intents = self._xdt.observation_info["intents"]
+        else:
+            scan_intents = self._xdt.scan_name.attrs.get(
+                "scan_intents", ["UNSPECIFIED"]
+            )
+
         partition_info = {
             "spectral_window_name": self._xdt.frequency.attrs["spectral_window_name"],
-            "
-            "spectral_window_intent"
-            ],
+            "spectral_window_intents": spw_intent,
             "field_name": to_list(np.unique(field_and_source_xds.field_name.values)),
             "polarization_setup": to_list(self._xdt.polarization.values),
             "scan_name": to_list(np.unique(self._xdt.scan_name.values)),
             "source_name": to_list(np.unique(field_and_source_xds.source_name.values)),
-            "
+            "scan_intents": scan_intents,
             "line_name": line_name,
             "data_group_name": data_group_name,
         }

xradio/measurement_set/open_processing_set.py

@@ -5,7 +5,7 @@ import xarray as xr
 
 def open_processing_set(
     ps_store: str,
-
+    scan_intents: list | None = None,
 ) -> xr.DataTree:
     """Creates a lazy representation of a Processing Set (only meta-data is loaded into memory).
 
@@ -13,9 +13,9 @@ def open_processing_set(
     ----------
     ps_store : str
         String of the path and name of the processing set. For example '/users/user_1/uid___A002_Xf07bba_Xbe5c_target.lsrk.vis.zarr'.
-
-        A list of
-        By default None, which will include all
+    scan_intents : str | None, optional
+        A list of scan_intents to be opened for example ['OBSERVE_TARGET#ON_SOURCE']. The scan_intents in a processing_set_xdt can be seen by calling processing_set_xdt.ps.summary().
+        By default None, which will include all scan_intents.
 
     Returns
     -------
@@ -34,10 +34,10 @@ def open_processing_set(
 
     # Future work is to add ASDM backend
 
-    if
+    if scan_intents is None:
         return ps_xdt
     else:
-        return ps_xdt.xr_ps.query(
+        return ps_xdt.xr_ps.query(scan_intents=scan_intents)
 
 
 # def open_processing_set(