xradio 0.0.48__py3-none-any.whl → 0.0.49__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. xradio/__init__.py +1 -0
  2. xradio/_utils/dict_helpers.py +69 -2
  3. xradio/image/_util/__init__.py +0 -3
  4. xradio/image/_util/_casacore/common.py +0 -13
  5. xradio/image/_util/_casacore/xds_from_casacore.py +102 -97
  6. xradio/image/_util/_casacore/xds_to_casacore.py +36 -24
  7. xradio/image/_util/_fits/xds_from_fits.py +81 -36
  8. xradio/image/_util/_zarr/zarr_low_level.py +3 -3
  9. xradio/image/_util/casacore.py +7 -5
  10. xradio/image/_util/common.py +13 -26
  11. xradio/image/_util/image_factory.py +143 -191
  12. xradio/image/image.py +10 -59
  13. xradio/measurement_set/__init__.py +11 -6
  14. xradio/measurement_set/_utils/_msv2/_tables/read.py +187 -46
  15. xradio/measurement_set/_utils/_msv2/_tables/table_query.py +22 -0
  16. xradio/measurement_set/_utils/_msv2/conversion.py +351 -318
  17. xradio/measurement_set/_utils/_msv2/msv4_info_dicts.py +20 -17
  18. xradio/measurement_set/convert_msv2_to_processing_set.py +46 -6
  19. xradio/measurement_set/load_processing_set.py +100 -53
  20. xradio/measurement_set/measurement_set_xdt.py +197 -0
  21. xradio/measurement_set/open_processing_set.py +122 -86
  22. xradio/measurement_set/processing_set_xdt.py +1552 -0
  23. xradio/measurement_set/schema.py +199 -94
  24. xradio/schema/bases.py +5 -1
  25. xradio/schema/check.py +97 -5
  26. {xradio-0.0.48.dist-info → xradio-0.0.49.dist-info}/METADATA +4 -4
  27. {xradio-0.0.48.dist-info → xradio-0.0.49.dist-info}/RECORD +30 -30
  28. {xradio-0.0.48.dist-info → xradio-0.0.49.dist-info}/WHEEL +1 -1
  29. xradio/measurement_set/measurement_set_xds.py +0 -117
  30. xradio/measurement_set/processing_set.py +0 -803
  31. {xradio-0.0.48.dist-info → xradio-0.0.49.dist-info/licenses}/LICENSE.txt +0 -0
  32. {xradio-0.0.48.dist-info → xradio-0.0.49.dist-info}/top_level.txt +0 -0
@@ -4,6 +4,7 @@ from pathlib import Path
4
4
  import re
5
5
  from typing import Any, Callable, Dict, List, Tuple, Union
6
6
 
7
+ import dask.array as da
7
8
  import numpy as np
8
9
  import pandas as pd
9
10
  import xarray as xr
@@ -11,7 +12,7 @@ import xarray as xr
11
12
  import astropy.units
12
13
  from casacore import tables
13
14
 
14
- from .table_query import open_query, open_table_ro
15
+ from .table_query import open_query, open_table_ro, TableManager
15
16
  from xradio._utils.list_and_array import get_pad_value
16
17
 
17
18
  CASACORE_TO_PD_TIME_CORRECTION = 3_506_716_800.0
@@ -1207,13 +1208,14 @@ def read_col_chunk(
1207
1208
  return fulldata
1208
1209
 
1209
1210
 
1210
- def read_col_conversion(
1211
- tb_tool: tables.table,
1211
def read_col_conversion_numpy(
    table_manager: TableManager,
    col: str,
    cshape: Tuple[int],
    tidxs: np.ndarray,
    bidxs: np.ndarray,
    use_table_iter: bool,
    time_chunksize: int,
) -> np.ndarray:
    """
    Eagerly read one table column into a padded numpy array indexed by
    (time, baseline, ...).

    Parameters
    ----------
    table_manager : TableManager
        Object whose ``get_table()`` yields the casacore table to read from.
    col : str
        Name of the column to read.
    cshape : Tuple[int]
        Leading shape of the output array. The per-row shape of the column
        is appended to it.
    tidxs : np.ndarray
        For every table row, the index along the first output axis.
    bidxs : np.ndarray
        For every table row, the index along the second output axis.
    use_table_iter : bool
        If True, read the column group by group using the casacore ``TIME``
        iterator; otherwise read the whole column with a single ``getcol``.
    time_chunksize : int
        Not used by this function.

    Returns
    -------
    np.ndarray
        Array of shape ``cshape + <row shape>``. Positions that no row maps
        to keep the value returned by ``get_pad_value`` for the column dtype.
    """

    # WARNING: Assumes the num_frequencies * num_polarizations < 2**29. If false,
    # https://github.com/casacore/python-casacore/issues/130 isn't mitigated.

    with table_manager.get_table() as tb_tool:

        # getcolshapestring() only works on columns where a row element is an
        # array (it fails for e.g. TIME). A RuntimeError is therefore taken to
        # mean that the column holds scalars.
        try:
            shape_string = tb_tool.getcolshapestring(col)[0]
        except RuntimeError:
            extra_dimensions = ()
        else:
            # Turn the "[a, b]" text into a tuple of ints numpy understands
            extra_dimensions = tuple(
                int(idx)
                for idx in shape_string.replace("[", "").replace("]", "").split(", ")
            )

        # Only the first row is read from disk to learn the column dtype
        col_dtype = np.array(tb_tool.col(col)[0]).dtype
        # Custom/safe fill value (https://github.com/casangi/xradio/issues/219)
        fill_value = get_pad_value(col_dtype)

        # Output buffer, pre-filled so that unassigned positions stay padded
        data = np.full(cshape + extra_dimensions, fill_value, dtype=col_dtype)

        if not use_table_iter:
            data[tidxs, bidxs] = tb_tool.getcol(col)
        else:
            # Walk the table with the built-in casacore iterator over TIME
            row_offset = 0
            for time_group in tb_tool.iter("TIME", sort=False):
                group_rows = time_group.nrows()
                next_offset = row_offset + group_rows

                # Small temporary buffer holding this group's part of the column
                group_buffer = np.full(
                    (group_rows,) + extra_dimensions, fill_value, dtype=col_dtype
                )

                # `getcolnp()` is used instead of `getcol()` because it is safer. See:
                # https://github.com/casacore/python-casacore/issues/130#issuecomment-463202373
                time_group.getcolnp(col, group_buffer)

                # Rows [row_offset, next_offset) of `tidxs`/`bidxs` belong to this
                # group; scatter the buffer into the matching elements of `data`
                data[
                    tidxs[row_offset:next_offset], bidxs[row_offset:next_offset]
                ] = group_buffer
                row_offset = next_offset

    return data
1306
+
1307
+
1308
def read_col_conversion_dask(
    table_manager: TableManager,
    col: str,
    cshape: Tuple[int],
    tidxs: np.ndarray,
    bidxs: np.ndarray,
    use_table_iter: bool,
    time_chunksize: int,
) -> da.Array:
    """
    Build a lazy dask array that reads a table column chunk by chunk along
    the first (time) axis.

    Parameters
    ----------
    table_manager : TableManager
        Object whose ``get_table()`` yields the casacore table. It is used
        here to inspect the first row and is forwarded to ``load_col_chunk``
        for the actual reads.
    col : str
        Name of the column to read.
    cshape : Tuple[int]
        Leading shape of the output. ``cshape[0]`` is taken as the number of
        unique times and ``cshape[1]`` as the number of rows per time.
    tidxs : np.ndarray
        Forwarded unchanged to ``load_col_chunk``.
    bidxs : np.ndarray
        Forwarded unchanged to ``load_col_chunk``.
    use_table_iter : bool
        Not used by this function.
    time_chunksize : int
        Chunk size along the first axis, handed to
        ``da.core.normalize_chunks``.

    Returns
    -------
    da.Array
        Lazy array with chunks of shape
        ``(<time chunk>, cshape[1]) + <row shape>``.
    """

    # Use the first row to learn the per-row shape and dtype of the column
    #################################################################################

    with table_manager.get_table() as tb_tool:
        first_row = tb_tool.row(col)[0][col]

    if isinstance(first_row, np.ndarray):
        extra_dimensions = first_row.shape
    else:
        extra_dimensions = ()

    # Use dask primitives to lazily read chunks of data from the MeasurementSet
    # Takes inspiration from dask_image https://image.dask.org/en/latest/
    #################################################################################

    # Get dtype of the column. Wrap in numpy array in case of scalar column
    col_dtype = np.array(first_row).dtype

    # Every unique time is taken to span exactly `rows_per_time` table rows
    num_utimes = cshape[0]
    rows_per_time = cshape[1]

    # Calculate the chunks of unique times that gives the target chunk sizes
    tmp_chunks = da.core.normalize_chunks(time_chunksize, (num_utimes,))[0]

    # Translate each chunk of unique times into a (start_row, end_row) pair.
    # `times_seen` counts the unique times covered by the previous chunks.
    times_seen = 0
    start_end_rows = []
    for chunk in tmp_chunks:
        start = times_seen * rows_per_time
        end = (times_seen + chunk) * rows_per_time
        start_end_rows.append((start, end))
        times_seen += chunk

    # One dask block per (start_row, end_row) pair
    arr_start_end_rows = da.from_array(start_end_rows, chunks=(1, 2))

    # Specify the output chunk layout produced by `load_col_chunk`
    output_chunkshape = (tmp_chunks, cshape[1]) + extra_dimensions

    # Apply `load_col_chunk` to each block. The length-2 axis holding the
    # row bounds is dropped and the baseline/row-shape axes are added.
    data = arr_start_end_rows.map_blocks(
        load_col_chunk,
        table_manager=table_manager,
        col_name=col,
        col_dtype=col_dtype,
        tidxs=tidxs,
        bidxs=bidxs,
        rows_per_time=rows_per_time,
        cshape=cshape,
        extra_dimensions=extra_dimensions,
        drop_axis=[1],
        new_axis=list(range(1, len(cshape + extra_dimensions))),
        meta=np.array([], dtype=col_dtype),
        chunks=output_chunkshape,
    )

    return data
1274
1397
 
1275
- # Use built-in casacore table iterator to populate the data column by unique times.
1276
- if use_table_iter:
1277
- start_row = 0
1278
- for ts in tb_tool.iter("TIME", sort=False):
1279
- num_rows = ts.nrows()
1280
1398
 
1281
- # Create small temporary array to store the partial column
1282
- tmp_arr = np.full(
1283
- (num_rows,) + extra_dimensions, fill_value, dtype=col_dtype
1284
- )
1399
def load_col_chunk(
    x: np.ndarray,
    table_manager: "TableManager",
    col_name: str,
    col_dtype: np.dtype,
    tidxs: np.ndarray,
    bidxs: np.ndarray,
    rows_per_time: int,
    cshape: Tuple[int, ...],
    extra_dimensions: Tuple[int, ...],
) -> np.ndarray:
    """
    Read one contiguous range of table rows and scatter it into a
    (time, baseline, ...) array.

    Parameters
    ----------
    x : np.ndarray
        Block whose first row holds ``(start_row, end_row)`` of the range to
        read (end exclusive).
    table_manager : TableManager
        Object whose ``get_table()`` yields the casacore table to read from.
    col_name : str
        Name of the column to read.
    col_dtype : np.dtype
        dtype of the buffers that are allocated and returned.
    tidxs : np.ndarray
        Per-row index along the time axis, for the whole table.
    bidxs : np.ndarray
        Per-row index along the second axis, for the whole table.
    rows_per_time : int
        Number of table rows per unique time; the row range must be a
        multiple of it.
    cshape : Tuple[int, ...]
        Only ``cshape[1]`` is used, as the size of the second output axis.
    extra_dimensions : Tuple[int, ...]
        Shape of a single row of the column (empty for scalar columns).

    Returns
    -------
    np.ndarray
        Array of shape ``(num_utimes, cshape[1]) + extra_dimensions`` where
        ``num_utimes = (end_row - start_row) // rows_per_time``.
    """
    start_row = x[0][0]
    end_row = x[0][1]
    num_rows = end_row - start_row

    # An empty row range has nothing to read. Return early so that neither
    # the modulo below (rows_per_time may be 0) nor `tidxs_slc[0]` can fail.
    if num_rows == 0:
        return np.empty((0, cshape[1]) + extra_dimensions, dtype=col_dtype)

    assert (num_rows % rows_per_time) == 0
    num_utimes = num_rows // rows_per_time

    # NOTE(review): np.nan is used as the fill value whatever `col_dtype` is,
    # whereas read_col_conversion_numpy in this file uses
    # get_pad_value(col_dtype). Confirm this is intended for non-floating
    # columns.

    # Create memory buffer to populate with data from disk
    row_data = np.full((num_rows,) + extra_dimensions, np.nan, dtype=col_dtype)

    # Load data from the column
    # Release the casacore table as soon as possible
    with table_manager.get_table() as tb_tool:
        tb_tool.getcolnp(col_name, row_data, startrow=start_row, nrow=num_rows)

    # Initialise reshaped numpy array
    reshaped_data = np.full(
        (num_utimes, cshape[1]) + extra_dimensions, np.nan, dtype=col_dtype
    )

    # Create slice object for readability
    slc = slice(start_row, end_row)

    # Indices of reshaped_data along time differ from values in tidxs, so
    # rebase them on the first row of the range. Assumes first time is
    # earliest time.
    tidxs_slc = tidxs[slc]
    tidxs_slc = tidxs_slc - tidxs_slc[0]
    bidxs_slc = bidxs[slc]

    # Populate `reshaped_data` with `row_data`
    reshaped_data[tidxs_slc, bidxs_slc] = row_data

    return reshaped_data
@@ -22,3 +22,25 @@ def open_query(table: tables.table, query: str) -> Generator[tables.table, None,
22
22
  yield ttq
23
23
  finally:
24
24
  ttq.close()
25
+
26
+
27
class TableManager:
    """
    Remember how to open a casacore table (path plus optional TaQL clause)
    so that it can be reopened on demand via ``get_table()``.
    """

    def __init__(
        self,
        infile: str,
        taql_where: str = "",
    ):
        """
        Parameters
        ----------
        infile : str
            Path handed to ``tables.table`` when the table is opened.
        taql_where : str
            Text appended to the ``select * from $mtable`` TaQL command.
        """
        self.infile, self.taql_where = infile, taql_where
        # Same clause with every "where " occurrence stripped out
        self.taql_query = taql_where.replace("where ", "")

    def get_table(self) -> "tables.table":
        """
        Open ``infile`` read-only and return the result of the TaQL selection
        built from ``taql_where``.
        """
        # Performance note:
        # table.query("(DATA_DESC_ID = 0)") is slightly faster than
        # tables.taql("select * from $table (DATA_DESC_ID = 0)")
        lock_settings = {"option": "usernoread"}
        # The local must stay named `mtable`: the TaQL text refers to it as
        # `$mtable` (presumably resolved by python-casacore from the calling
        # scope; verify before renaming).
        with tables.table(
            self.infile, readonly=True, lockoptions=lock_settings, ack=False
        ) as mtable:
            return tables.taql(f"select * from $mtable {self.taql_where}")