xradio 0.0.60__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xradio/_utils/list_and_array.py +4 -2
- xradio/image/_util/_casacore/xds_to_casacore.py +11 -4
- xradio/image/image.py +4 -2
- xradio/measurement_set/_utils/_msv2/conversion.py +27 -15
- xradio/measurement_set/_utils/_msv2/create_field_and_source_xds.py +3 -1
- xradio/measurement_set/_utils/_msv2/msv4_info_dicts.py +214 -67
- xradio/measurement_set/_utils/_msv2/partition_queries.py +248 -61
- xradio/measurement_set/convert_msv2_to_processing_set.py +23 -10
- xradio/measurement_set/measurement_set_xdt.py +11 -4
- xradio/measurement_set/open_processing_set.py +6 -6
- xradio/measurement_set/processing_set_xdt.py +70 -12
- xradio/measurement_set/schema.py +136 -179
- xradio/schema/__init__.py +0 -3
- xradio/schema/bases.py +23 -28
- xradio/schema/check.py +23 -15
- xradio/schema/common.py +45 -0
- xradio/schema/export.py +23 -2
- xradio/schema/metamodel.py +12 -8
- xradio/schema/typing.py +7 -13
- {xradio-0.0.60.dist-info → xradio-1.0.1.dist-info}/METADATA +3 -3
- {xradio-0.0.60.dist-info → xradio-1.0.1.dist-info}/RECORD +24 -23
- {xradio-0.0.60.dist-info → xradio-1.0.1.dist-info}/WHEEL +0 -0
- {xradio-0.0.60.dist-info → xradio-1.0.1.dist-info}/licenses/LICENSE.txt +0 -0
- {xradio-0.0.60.dist-info → xradio-1.0.1.dist-info}/top_level.txt +0 -0
xradio/measurement_set/_utils/_msv2/partition_queries.py

@@ -1,5 +1,8 @@
 import itertools
+import time
 import toolviper.utils.logger as logger
+import os
+import pandas as pd

 import numpy as np

@@ -17,6 +20,9 @@ def enumerated_product(*args):
     )


+import pickle, gzip
+
+
 def create_partitions(in_file: str, partition_scheme: list) -> list[dict]:
     """Create a list of dictionaries with the partition information.

@@ -34,38 +40,53 @@ def create_partitions(in_file: str, partition_scheme: list) -> list[dict]:
     list
         list of dictionaries with the partition information.
     """
-    # vla_otf (bool, optional): The partioning of VLA OTF (on the fly) mosaics needs a special partitioning scheme. Defaults to False.

-
-
-
+    ### Test new implementation without
+    # Always start with these (if available); then extend with user scheme.
+    partition_scheme = [
+        "DATA_DESC_ID",
+        "OBS_MODE",
+        "OBSERVATION_ID",
+        "EPHEMERIS_ID",
+    ] + list(partition_scheme)

-    partition_scheme = ["DATA_DESC_ID", "OBS_MODE"
+    # partition_scheme = ["DATA_DESC_ID", "OBS_MODE"] + list(
+    #     partition_scheme
+    # )

-
-
+    t0 = time.time()
+    # --------- Load base columns from MAIN table ----------
     main_tb = tables.table(
         in_file, readonly=True, lockoptions={"option": "usernoread"}, ack=False
     )
-    par_df["DATA_DESC_ID"] = main_tb.getcol("DATA_DESC_ID")
-    par_df["FIELD_ID"] = main_tb.getcol("FIELD_ID")
-    par_df["SCAN_NUMBER"] = main_tb.getcol("SCAN_NUMBER")
-    par_df["STATE_ID"] = main_tb.getcol("STATE_ID")
-    par_df["OBSERVATION_ID"] = main_tb.getcol("OBSERVATION_ID")
-    par_df["ANTENNA1"] = main_tb.getcol("ANTENNA1")
-    par_df = par_df.drop_duplicates()

+    # Build minimal DF once. Pull only columns we may need.
+    # Add columns here if you expect to aggregate them per-partition.
+    base_cols = {
+        "DATA_DESC_ID": main_tb.getcol("DATA_DESC_ID"),
+        "FIELD_ID": main_tb.getcol("FIELD_ID"),
+        "SCAN_NUMBER": main_tb.getcol("SCAN_NUMBER"),
+        "STATE_ID": main_tb.getcol("STATE_ID"),
+        "OBSERVATION_ID": main_tb.getcol("OBSERVATION_ID"),
+        "ANTENNA1": main_tb.getcol("ANTENNA1"),
+    }
+    par_df = pd.DataFrame(base_cols).drop_duplicates()
+    logger.debug(
+        f"Loaded MAIN columns in {time.time() - t0:.2f}s "
+        f"({len(par_df):,} unique MAIN rows)"
+    )
+
+    # --------- Optional SOURCE/STATE derived columns ----------
+    # SOURCE_ID (via FIELD table)
+    t1 = time.time()
+    source_id_added = False
     field_tb = tables.table(
         os.path.join(in_file, "FIELD"),
         readonly=True,
         lockoptions={"option": "usernoread"},
         ack=False,
     )
-
-    # par_df["FIELD_NAME"] = np.array(field_tb.getcol("NAME"))[par_df["FIELD_ID"]]
-
-    # Get source ids if available from source table.
-    if table_exists(os.path.join(os.path.join(in_file, "SOURCE"))):
+    if table_exists(os.path.join(in_file, "SOURCE")):
         source_tb = tables.table(
             os.path.join(in_file, "SOURCE"),
             readonly=True,
@@ -73,13 +94,31 @@ def create_partitions(in_file: str, partition_scheme: list) -> list[dict]:
             ack=False,
         )
         if source_tb.nrows() != 0:
-
-
-
-
-
+            # Map SOURCE_ID via FIELD_ID
+            field_source = np.asarray(field_tb.getcol("SOURCE_ID"))
+            par_df["SOURCE_ID"] = field_source[par_df["FIELD_ID"]]
+            source_id_added = True
+    logger.debug(
+        f"SOURCE processing in {time.time() - t1:.2f}s "
+        f"(added SOURCE_ID={source_id_added})"
+    )
+
+    if "EPHEMERIS_ID" in field_tb.colnames():
+        ephemeris_id_added = False
+        if field_tb.nrows() != 0:
+            # Map EPHEMERIS_ID via FIELD_ID
+            field_ephemeris = np.asarray(field_tb.getcol("EPHEMERIS_ID"))
+            par_df["EPHEMERIS_ID"] = field_ephemeris[par_df["FIELD_ID"]]
+            ephemeris_id_added = True
+        logger.debug(
+            f"EPHEMERIS processing in {time.time() - t1:.2f}s "
+            f"(added EPHEMERIS_ID={ephemeris_id_added})"
+        )

-    #
+    # OBS_MODE & SUB_SCAN_NUMBER (via STATE table)
+    t2 = time.time()
+    obs_mode_added = False
+    sub_scan_added = False
     if table_exists(os.path.join(in_file, "STATE")):
         state_tb = tables.table(
             os.path.join(in_file, "STATE"),
@@ -88,30 +127,36 @@ def create_partitions(in_file: str, partition_scheme: list) -> list[dict]:
             ack=False,
         )
         if state_tb.nrows() != 0:
-
-
-
-            ]
-            par_df["SUB_SCAN_NUMBER"] =
+            state_obs_mode = np.asarray(state_tb.getcol("OBS_MODE"))
+            state_sub_scan = np.asarray(state_tb.getcol("SUB_SCAN"))
+            # Index by STATE_ID into STATE columns
+            par_df["OBS_MODE"] = state_obs_mode[par_df["STATE_ID"]]
+            par_df["SUB_SCAN_NUMBER"] = state_sub_scan[par_df["STATE_ID"]]
+            obs_mode_added = True
+            sub_scan_added = True
         else:
-
+            # If STATE empty, drop STATE_ID (it cannot partition anything)
+            if "STATE_ID" in par_df.columns:
+                par_df.drop(columns=["STATE_ID"], inplace=True)

-
-
-    partition_criteria = {}
-    for par in partition_scheme:
-        if par in par_df.columns:
-            partition_criteria[par] = par_df[par].unique()
-            partition_scheme_updated.append(par)
-    logger.info(f"Partition scheme that will be used: {partition_scheme_updated}")
+            if "SUB_SCAN_NUMBER" in par_df.columns:
+                par_df.drop(columns=["SUB_SCAN_NUMBER"], inplace=True)

-
-
+    logger.debug(
+        f"STATE processing in {time.time() - t2:.2f}s "
+        f"(OBS_MODE={obs_mode_added}, SUB_SCAN_NUMBER={sub_scan_added})"
+    )

-    #
+    # --------- Decide which partition keys are actually available ----------
+    t3 = time.time()
+    partition_scheme_updated = [k for k in partition_scheme if k in par_df.columns]
+    logger.info(f"Updated partition scheme used: {partition_scheme_updated}")

-    #
-
+    # If none of the requested keys exist, there is a single partition of "everything"
+    if not partition_scheme_updated:
+        partition_scheme_updated = []
+
+    # These are the axes we report per partition (present => aggregate unique values)
     partition_axis_names = [
         "DATA_DESC_ID",
         "OBSERVATION_ID",
@@ -121,30 +166,172 @@ def create_partitions(in_file: str, partition_scheme: list) -> list[dict]:
         "SOURCE_ID",
         "OBS_MODE",
         "SUB_SCAN_NUMBER",
+        "EPHEMERIS_ID",
     ]
+    # Only include ANTENNA1 if user asked for it (keeps output size down)
     if "ANTENNA1" in partition_scheme:
         partition_axis_names.append("ANTENNA1")

-
-
-
-
-
+    # --------- Group only by realized partitions (no Cartesian product!) ----------
+    # observed=True speeds up if categorical; here it's harmless. sort=False keeps source order.
+    if partition_scheme_updated:
+        grp = par_df.groupby(partition_scheme_updated, sort=False, observed=False)
+        groups_iter = grp
+    else:
+        # Single group: everything
+        groups_iter = [(None, par_df)]
+
+    partitions = []
+    # Fast aggregation: use NumPy for uniques to avoid pandas overhead in the tight loop.
+    for _, gdf in groups_iter:
+        part = {}
+        for name in partition_axis_names:
+            if name in gdf.columns:
+                # Return Python lists to match your prior structure (can be np.ndarray if preferred)
+                part[name] = np.unique(gdf[name].to_numpy()).tolist()
             else:
-
-
-    sub_par_df = par_df.query(query).drop_duplicates()
+                part[name] = [None]
+        partitions.append(part)

-
-
+    logger.debug(
+        f"Partition build in {time.time() - t3:.2f}s; total {len(partitions):,} partitions"
+    )
+    logger.debug(f"Total create_partitions time: {time.time() - t0:.2f}s")

-
-
-        if col_name in sub_par_df.columns:
-            partition_info[col_name] = sub_par_df[col_name].unique()
-        else:
-            partition_info[col_name] = [None]
+    # # with gzip.open("partition_original_small.pkl.gz", "wb") as f:
+    # #     pickle.dump(partitions, f, protocol=pickle.HIGHEST_PROTOCOL)

-
+    # #partitions[1]["DATA_DESC_ID"] = [999] # make a change to test comparison
+    # #org_partitions = load_dict_list("partition_original_small.pkl.gz")
+    # org_partitions = load_dict_list("partition_original.pkl.gz")

     return partitions
+
+
+from typing import Any, List, Dict
+
+
+def save_dict_list(filename: str, data: List[Dict[str, Any]]) -> None:
+    """
+    Save a list of dictionaries containing NumPy arrays (or other objects)
+    to a compressed pickle file.
+    """
+    with gzip.open(filename, "wb") as f:
+        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+
+def load_dict_list(filename: str) -> List[Dict[str, Any]]:
+    """
+    Load a list of dictionaries containing NumPy arrays (or other objects)
+    from a compressed pickle file.
+    """
+    with gzip.open(filename, "rb") as f:
+        return pickle.load(f)
+
+
+def dict_list_equal(a: List[Dict[str, Any]], b: List[Dict[str, Any]]) -> bool:
+    """
+    Compare two lists of dictionaries to ensure they are exactly the same.
+    NumPy arrays are compared with array_equal, other objects with ==.
+    """
+    if len(a) != len(b):
+        return False
+
+    for d1, d2 in zip(a, b):
+        if d1.keys() != d2.keys():
+            return False
+        for k in d1:
+            v1, v2 = d1[k], d2[k]
+            if isinstance(v1, np.ndarray) and isinstance(v2, np.ndarray):
+                if not np.array_equal(v1, v2):
+                    return False
+            else:
+                if v1 != v2:
+                    return False
+    return True
+
+
+from typing import Iterable, Mapping, Tuple, List, Dict, Any, Set
+import numpy as np
+
+
+def _to_python_scalar(x: Any) -> Any:
+    """Convert NumPy scalars to Python scalars; leave others unchanged."""
+    if isinstance(x, np.generic):
+        return x.item()
+    return x
+
+
+def _to_hashable_value_list(v: Any) -> Tuple[Any, ...]:
+    """
+    Normalize a dict value (often list/np.ndarray) into a sorted, hashable tuple.
+    - Accepts list/tuple/np.ndarray/scalars/None.
+    - Treats None as a value.
+    - Sorts with a stable key that stringifies items to avoid dtype hiccups.
+    """
+    if isinstance(v, np.ndarray):
+        v = v.tolist()
+    if v is None or isinstance(v, (str, bytes)):
+        # Treat a bare scalar as a single-element collection for consistency.
+        v = [v]
+    elif not isinstance(v, (list, tuple)):
+        v = [v]
+
+    py_vals = [_to_python_scalar(x) for x in v]
+    # Sort by (type name, repr) to keep mixed types stable if present
+    return tuple(sorted(py_vals, key=lambda x: (type(x).__name__, repr(x))))
+
+
+def _canon_partition(
+    d: Mapping[str, Any], ignore_keys: Iterable[str] = ()
+) -> Tuple[Tuple[str, Tuple[Any, ...]], ...]:
+    """
+    Canonicalize a partition dict into a hashable, order-insensitive representation.
+    - Drops keys in ignore_keys.
+    - Converts each value collection to a sorted tuple.
+    - Sorts keys.
+    """
+    ign: Set[str] = set(ignore_keys)
+    items = []
+    for k, v in d.items():
+        if k in ign:
+            continue
+        items.append((k, _to_hashable_value_list(v)))
+    items.sort(key=lambda kv: kv[0])
+    return tuple(items)
+
+
+def compare_partitions_subset(
+    new_partitions: List[Dict[str, Any]],
+    original_partitions: List[Dict[str, Any]],
+    ignore_keys: Iterable[str] = (),
+) -> Tuple[bool, List[Dict[str, Any]]]:
+    """
+    Check that every partition in `new_partitions` also appears in `original_partitions`,
+    ignoring ordering (of partitions and of values within each key).
+
+    Parameters
+    ----------
+    new_partitions : list of dict
+        Partitions produced by the optimized/new code.
+    original_partitions : list of dict
+        Partitions produced by the original code (the reference).
+    ignore_keys : iterable of str, optional
+        Keys to ignore when comparing partitions (e.g., timestamps or debug fields).
+
+    Returns
+    -------
+    (ok, missing)
+    ok : bool
+        True if every new partition is found in the original set.
+    missing : list of dict
+        The list of partitions (from `new_partitions`) that were NOT found in `original_partitions`,
+        useful for debugging diffs.
+    """
+    orig_set = {_canon_partition(p, ignore_keys) for p in original_partitions}
+    missing = []
+    for p in new_partitions:
+        cp = _canon_partition(p, ignore_keys)
+        if cp not in orig_set:
+            missing.append(p)
+    return (len(missing) == 0, missing)
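The helpers added at the end of this file (save_dict_list, load_dict_list, dict_list_equal, compare_partitions_subset) read as validation tooling for the reworked create_partitions. A minimal sketch of how they could be exercised follows; the MS path, the reference pickle name, and the choice of ignore_keys are placeholders for illustration, not part of the package.

# Hypothetical validation run against a previously saved reference
# ("my_data.ms" and "partition_original.pkl.gz" are placeholder paths).
from xradio.measurement_set._utils._msv2.partition_queries import (
    compare_partitions_subset,
    create_partitions,
    load_dict_list,
)

new_partitions = create_partitions("my_data.ms", partition_scheme=[])
reference_partitions = load_dict_list("partition_original.pkl.gz")

# Ignore the new EPHEMERIS_ID axis, which an older reference would not contain (assumption).
ok, missing = compare_partitions_subset(
    new_partitions, reference_partitions, ignore_keys=("EPHEMERIS_ID",)
)
if not ok:
    print(f"{len(missing)} partition(s) not found in the reference set")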
xradio/measurement_set/convert_msv2_to_processing_set.py

@@ -1,10 +1,13 @@
 import toolviper.utils.logger as logger
 import numcodecs
-from typing import Dict, Union
+from typing import Dict, Union, Literal
+import time

 import dask

-from xradio.measurement_set._utils._msv2.partition_queries import
+from xradio.measurement_set._utils._msv2.partition_queries import (
+    create_partitions,
+)
 from xradio.measurement_set._utils._msv2.conversion import (
     convert_and_write_partition,
     estimate_memory_and_cores_for_partitions,
@@ -13,7 +16,7 @@ from xradio.measurement_set._utils._msv2.conversion import (

 def estimate_conversion_memory_and_cores(
     in_file: str,
-    partition_scheme: list = [
+    partition_scheme: list = [],
 ) -> tuple[float, int, int]:
     """
     Given an MSv2 and a partition_scheme to use when converting it to MSv4,
@@ -52,7 +55,7 @@ def estimate_conversion_memory_and_cores(
 def convert_msv2_to_processing_set(
     in_file: str,
     out_file: str,
-    partition_scheme: list = [
+    partition_scheme: list = [],
     main_chunksize: Union[Dict, float, None] = None,
     with_pointing: bool = True,
     pointing_chunksize: Union[Dict, float, None] = None,
@@ -63,8 +66,8 @@ def convert_msv2_to_processing_set(
     use_table_iter: bool = False,
     compressor: numcodecs.abc.Codec = numcodecs.Zstd(level=2),
     add_reshaping_indices: bool = False,
-    storage_backend:
-    parallel_mode:
+    storage_backend: Literal["zarr", "netcdf"] = "zarr",
+    parallel_mode: Literal["none", "partition", "time"] = "none",
     overwrite: bool = False,
 ):
     """Convert a Measurement Set v2 into a Processing Set of Measurement Set v4.
@@ -80,7 +83,7 @@ def convert_msv2_to_processing_set(
         In addition to data description and polarization setup a finer partitioning is possible by specifying a list of partitioning keys. Any combination of the following keys are possible:
         "FIELD_ID", "SCAN_NUMBER", "STATE_ID", "SOURCE_ID", "SUB_SCAN_NUMBER", "ANTENNA1".
         "ANTENNA1" is intended as a single-dish specific partitioning option.
-        For mosaics where the phase center is rapidly changing (such as VLA on the fly mosaics) partition_scheme should be set to an empty list []. By default, [
+        For mosaics where the phase center is rapidly changing (such as VLA on the fly mosaics) partition_scheme should be set to an empty list []. By default, [].
     main_chunksize : Union[Dict, float, None], optional
         Defines the chunk size of the main dataset. If given as a dictionary, defines the sizes of several dimensions, and acceptable keys are "time", "baseline_id", "antenna_id", "frequency", "polarization". If given as a float, gives the size of a chunk in GiB. By default, None.
     with_pointing : bool, optional
@@ -101,9 +104,9 @@ def convert_msv2_to_processing_set(
         The Blosc compressor to use when saving the converted data to disk using Zarr, by default numcodecs.Zstd(level=2).
     add_reshaping_indices : bool, optional
         Whether to add the tidxs, bidxs and row_id variables to each partition of the main dataset. These can be used to reshape the data back to the original ordering in the MS v2. This is mainly intended for testing and debugging, by default False.
-    storage_backend :
+    storage_backend : Literal["zarr", "netcdf"], optional
         The on-disk format to use. "netcdf" is not yet implemented.
-    parallel_mode :
+    parallel_mode : Literal["none", "partition", "time"], optional
         Choose whether to use Dask to execute conversion in parallel, by default "none" and conversion occurs serially.
         The option "partition", parallelises the conversion over partitions specified by `partition_scheme`. The option "time" can only be used for phased array interferometers where there are no partitions
         in the MS v2; instead the MS v2 is parallelised along the time dimension and can be controlled by `main_chunksize`.
@@ -134,6 +137,7 @@ def convert_msv2_to_processing_set(
         parallel_mode = "none"

     partitions = create_partitions(in_file, partition_scheme=partition_scheme)
+
     logger.info("Number of partitions: " + str(len(partitions)))
     if parallel_mode == "time":
         assert (
@@ -143,7 +147,6 @@ def convert_msv2_to_processing_set(
     delayed_list = []

     for ms_v4_id, partition_info in enumerate(partitions):
-        # print(ms_v4_id,len(partition_info['FIELD_ID']))

         logger.info(
             "OBSERVATION_ID "
@@ -156,6 +159,11 @@ def convert_msv2_to_processing_set(
             + str(partition_info["FIELD_ID"])
             + ", SCAN "
             + str(partition_info["SCAN_NUMBER"])
+            + (
+                ", EPHEMERIS " + str(partition_info["EPHEMERIS_ID"])
+                if "EPHEMERIS_ID" in partition_info
+                else ""
+            )
             + (
                 ", ANTENNA " + str(partition_info["ANTENNA1"])
                 if "ANTENNA1" in partition_info
@@ -188,6 +196,7 @@ def convert_msv2_to_processing_set(
                 )
             )
         else:
+            start_time = time.time()
             convert_and_write_partition(
                 in_file,
                 out_file,
@@ -207,6 +216,10 @@ def convert_msv2_to_processing_set(
                 parallel_mode=parallel_mode,
                 overwrite=overwrite,
             )
+            end_time = time.time()
+            logger.debug(
+                f"Time to convert partition {ms_v4_id}: {end_time - start_time:.2f} seconds"
+            )

     if parallel_mode == "partition":
         dask.compute(delayed_list)
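Taken together, these changes make partition_scheme default to an empty list, constrain storage_backend and parallel_mode to Literal values, and log per-partition conversion times. A usage sketch under those signatures follows; the file paths are placeholders, and importing directly from the module path shown above is an assumption (the package may also re-export these functions at a higher level).

# Hypothetical conversion call; "my_data.ms" and the output store are placeholders.
from xradio.measurement_set.convert_msv2_to_processing_set import (
    convert_msv2_to_processing_set,
    estimate_conversion_memory_and_cores,
)

# Returns a tuple[float, int, int], per the signature in the diff above.
estimate = estimate_conversion_memory_and_cores("my_data.ms", partition_scheme=[])

convert_msv2_to_processing_set(
    in_file="my_data.ms",
    out_file="my_data.vis.zarr",
    partition_scheme=[],          # default; DATA_DESC_ID, OBS_MODE, OBSERVATION_ID, EPHEMERIS_ID are always applied
    storage_backend="zarr",       # "netcdf" is not yet implemented
    parallel_mode="partition",    # parallelise over partitions with Dask
    overwrite=True,
)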
xradio/measurement_set/measurement_set_xdt.py

@@ -203,19 +203,26 @@ class MeasurementSetXdt:
         else:
             line_name = []

-        if "
+        if "spectral_window_intents" not in self._xdt.frequency.attrs:
             spw_intent = "UNSPECIFIED"
         else:
-            spw_intent = self._xdt.frequency.attrs["
+            spw_intent = self._xdt.frequency.attrs["spectral_window_intents"]
+
+        if "intents" in self._xdt.observation_info:
+            scan_intents = self._xdt.observation_info["intents"]
+        else:
+            scan_intents = self._xdt.scan_name.attrs.get(
+                "scan_intents", ["UNSPECIFIED"]
+            )

         partition_info = {
             "spectral_window_name": self._xdt.frequency.attrs["spectral_window_name"],
-            "
+            "spectral_window_intents": spw_intent,
             "field_name": to_list(np.unique(field_and_source_xds.field_name.values)),
             "polarization_setup": to_list(self._xdt.polarization.values),
             "scan_name": to_list(np.unique(self._xdt.scan_name.values)),
             "source_name": to_list(np.unique(field_and_source_xds.source_name.values)),
-            "
+            "scan_intents": scan_intents,
             "line_name": line_name,
             "data_group_name": data_group_name,
         }
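The block above now prefers the "spectral_window_intents" and "scan_intents" names and falls back to "UNSPECIFIED" when the metadata is absent. A reading-side sketch of the same defensive pattern follows; read_intents and the msv4_xdt argument are hypothetical, and only the attribute names used in the diff are assumed.

import xarray as xr

def read_intents(msv4_xdt: xr.DataTree) -> tuple:
    # Mirror the fallbacks used by MeasurementSetXdt above.
    spw_intents = msv4_xdt.frequency.attrs.get("spectral_window_intents", "UNSPECIFIED")
    scan_intents = msv4_xdt.scan_name.attrs.get("scan_intents", ["UNSPECIFIED"])
    return spw_intents, scan_intents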
xradio/measurement_set/open_processing_set.py

@@ -5,7 +5,7 @@ import xarray as xr

 def open_processing_set(
     ps_store: str,
-
+    scan_intents: list | None = None,
 ) -> xr.DataTree:
     """Creates a lazy representation of a Processing Set (only meta-data is loaded into memory).

@@ -13,9 +13,9 @@ def open_processing_set(
     ----------
     ps_store : str
         String of the path and name of the processing set. For example '/users/user_1/uid___A002_Xf07bba_Xbe5c_target.lsrk.vis.zarr'.
-
-        A list of
-        By default None, which will include all
+    scan_intents : str | None, optional
+        A list of scan_intents to be opened for example ['OBSERVE_TARGET#ON_SOURCE']. The scan_intents in a processing_set_xdt can be seen by calling processing_set_xdt.ps.summary().
+        By default None, which will include all scan_intents.

     Returns
     -------
@@ -34,10 +34,10 @@ def open_processing_set(

     # Future work is to add ASDM backend

-    if
+    if scan_intents is None:
         return ps_xdt
     else:
-        return ps_xdt.xr_ps.query(
+        return ps_xdt.xr_ps.query(scan_intents=scan_intents)


 # def open_processing_set(
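With the parameter now named scan_intents, opening and filtering a Processing Set could look like the sketch below; the store path is a placeholder and the intent string is the example from the docstring above.

# Hypothetical open of a Processing Set filtered by scan intent (placeholder path).
from xradio.measurement_set.open_processing_set import open_processing_set

ps_xdt = open_processing_set(
    "uid___A002_Xf07bba_Xbe5c_target.lsrk.vis.zarr",
    scan_intents=["OBSERVE_TARGET#ON_SOURCE"],
)

# Equivalent post-hoc filtering via the accessor the function uses internally:
ps_all = open_processing_set("uid___A002_Xf07bba_Xbe5c_target.lsrk.vis.zarr")
ps_filtered = ps_all.xr_ps.query(scan_intents=["OBSERVE_TARGET#ON_SOURCE"])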
|