PyPI - xradio - Versions diffs - 0.0.48__py3-none-any.whl → 0.0.49__py3-none-any.whl - Mend

xradio 0.0.48__py3-none-any.whl → 0.0.49__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

xradio/__init__.py +1 -0
xradio/_utils/dict_helpers.py +69 -2
xradio/image/_util/__init__.py +0 -3
xradio/image/_util/_casacore/common.py +0 -13
xradio/image/_util/_casacore/xds_from_casacore.py +102 -97
xradio/image/_util/_casacore/xds_to_casacore.py +36 -24
xradio/image/_util/_fits/xds_from_fits.py +81 -36
xradio/image/_util/_zarr/zarr_low_level.py +3 -3
xradio/image/_util/casacore.py +7 -5
xradio/image/_util/common.py +13 -26
xradio/image/_util/image_factory.py +143 -191
xradio/image/image.py +10 -59
xradio/measurement_set/__init__.py +11 -6
xradio/measurement_set/_utils/_msv2/_tables/read.py +187 -46
xradio/measurement_set/_utils/_msv2/_tables/table_query.py +22 -0
xradio/measurement_set/_utils/_msv2/conversion.py +351 -318
xradio/measurement_set/_utils/_msv2/msv4_info_dicts.py +20 -17
xradio/measurement_set/convert_msv2_to_processing_set.py +46 -6
xradio/measurement_set/load_processing_set.py +100 -53
xradio/measurement_set/measurement_set_xdt.py +197 -0
xradio/measurement_set/open_processing_set.py +122 -86
xradio/measurement_set/processing_set_xdt.py +1552 -0
xradio/measurement_set/schema.py +199 -94
xradio/schema/bases.py +5 -1
xradio/schema/check.py +97 -5
{xradio-0.0.48.dist-info → xradio-0.0.49.dist-info}/METADATA +4 -4
{xradio-0.0.48.dist-info → xradio-0.0.49.dist-info}/RECORD +30 -30
{xradio-0.0.48.dist-info → xradio-0.0.49.dist-info}/WHEEL +1 -1
xradio/measurement_set/measurement_set_xds.py +0 -117
xradio/measurement_set/processing_set.py +0 -803
{xradio-0.0.48.dist-info → xradio-0.0.49.dist-info/licenses}/LICENSE.txt +0 -0
{xradio-0.0.48.dist-info → xradio-0.0.49.dist-info}/top_level.txt +0 -0

xradio/measurement_set/_utils/_msv2/_tables/read.py CHANGED Viewed

@@ -4,6 +4,7 @@ from pathlib import Path
 import re
 from typing import Any, Callable, Dict, List, Tuple, Union
+import dask.array as da
 import numpy as np
 import pandas as pd
 import xarray as xr
@@ -11,7 +12,7 @@ import xarray as xr
 import astropy.units
 from casacore import tables
-from .table_query import open_query, open_table_ro
+from .table_query import open_query, open_table_ro, TableManager
 from xradio._utils.list_and_array import get_pad_value
 CASACORE_TO_PD_TIME_CORRECTION = 3_506_716_800.0
@@ -1207,13 +1208,14 @@ def read_col_chunk(
     return fulldata
-def read_col_conversion(
-    tb_tool: tables.table,
+def read_col_conversion_numpy(
+    table_manager: TableManager,
     col: str,
     cshape: Tuple[int],
     tidxs: np.ndarray,
     bidxs: np.ndarray,
     use_table_iter: bool,
+    time_chunksize: int,
 ) -> np.ndarray:
     """
     Function to perform delayed reads from table columns when converting
@@ -1221,7 +1223,7 @@ def read_col_conversion(
     Parameters
     ----------
-    tb_tool : tables.table
+    table_manager : TableManager
     col : str
@@ -1231,6 +1233,8 @@ def read_col_conversion(
     bidxs : np.ndarray
+    use_table_iter : bool
     Returns
     -------
     np.ndarray
@@ -1241,60 +1245,197 @@ def read_col_conversion(
     # WARNING: Assumes the num_frequencies * num_polarizations < 2**29. If false,
     # https://github.com/casacore/python-casacore/issues/130 isn't mitigated.
+    with table_manager.get_table() as tb_tool:
+        # Use casacore to get the shape of a row for this column
+        #################################################################################
+        # getcolshapestring() only works on columns where a row element is an
+        # array ie. fails for TIME
+        # Assumes the RuntimeError is because the column is a scalar
+        try:
+            shape_string = tb_tool.getcolshapestring(col)[0]
+            # Convert `shape_string` into a tuple that numpy understands
+            extra_dimensions = tuple(
+                [
+                    int(idx)
+                    for idx in shape_string.replace("[", "")
+                    .replace("]", "")
+                    .split(", ")
+                ]
+            )
+        except RuntimeError:
+            extra_dimensions = ()
+        #################################################################################
+        # Get dtype of the column. Only read first row from disk
+        col_dtype = np.array(tb_tool.col(col)[0]).dtype
+        # Use a custom/safe fill value (https://github.com/casangi/xradio/issues/219)
+        fill_value = get_pad_value(col_dtype)
+        # Construct a numpy array to populate. `data` has shape (n_times, n_baselines, n_frequencies, n_polarizations)
+        data = np.full(cshape + extra_dimensions, fill_value, dtype=col_dtype)
+        # Use built-in casacore table iterator to populate the data column by unique times.
+        if use_table_iter:
+            start_row = 0
+            for ts in tb_tool.iter("TIME", sort=False):
+                num_rows = ts.nrows()
+                # Create small temporary array to store the partial column
+                tmp_arr = np.full(
+                    (num_rows,) + extra_dimensions, fill_value, dtype=col_dtype
+                )
+                # Note we don't use `getcol()` because it's less safe. See:
+                # https://github.com/casacore/python-casacore/issues/130#issuecomment-463202373
+                ts.getcolnp(col, tmp_arr)
+                # Get the slice of rows contained in `tmp_arr`.
+                # Used to get the relevant integer indexes from `tidxs` and `bidxs`
+                tmp_slice = slice(start_row, start_row + num_rows)
+                # Copy `tmp_arr` into correct elements of `tmp_arr`
+                data[tidxs[tmp_slice], bidxs[tmp_slice]] = tmp_arr
+                start_row += num_rows
+        else:
+            data[tidxs, bidxs] = tb_tool.getcol(col)
+    return data
+def read_col_conversion_dask(
+    table_manager: TableManager,
+    col: str,
+    cshape: Tuple[int],
+    tidxs: np.ndarray,
+    bidxs: np.ndarray,
+    use_table_iter: bool,
+    time_chunksize: int,
+) -> da.Array:
+    """
+    Function to perform delayed reads from table columns when converting
+    (no need for didxs)
+    Parameters
+    ----------
+    tb_tool : tables.table
+    col : str
+    cshape : Tuple[int]
+    tidxs : np.ndarray
+    bidxs : np.ndarray
+    Returns
+    -------
+    da.Array
+    """
     # Use casacore to get the shape of a row for this column
     #################################################################################
-    # Get the total number of rows in the base measurement set
-    nrows_total = tb_tool.nrows()
+    with table_manager.get_table() as tb_tool:
+        first_row = tb_tool.row(col)[0][col]
-    # getcolshapestring() only works on columns where a row element is an
-    # array ie. fails for TIME
-    # Assumes the RuntimeError is because the column is a scalar
-    try:
-        shape_string = tb_tool.getcolshapestring(col)[0]
-        # Convert `shape_string` into a tuple that numpy understands
-        extra_dimensions = tuple(
-            [
-                int(idx)
-                for idx in shape_string.replace("[", "").replace("]", "").split(", ")
-            ]
-        )
-    except RuntimeError:
+    if isinstance(first_row, np.ndarray):
+        extra_dimensions = first_row.shape
+    else:
         extra_dimensions = ()
+    # Use dask primitives to lazily read chunks of data from the MeasurementSet
+    # Takes inspiration from dask_image https://image.dask.org/en/latest/
     #################################################################################
-    # Get dtype of the column. Only read first row from disk
-    col_dtype = np.array(tb_tool.col(col)[0]).dtype
-    # Use a custom/safe fill value (https://github.com/casangi/xradio/issues/219)
-    fill_value = get_pad_value(col_dtype)
+    # Get dtype of the column. Wrap in numpy array in case of scalar column
+    col_dtype = np.array(first_row).dtype
+    # Get the number of rows for a single TIME value
+    num_utimes = cshape[0]
+    rows_per_time = cshape[1]
+    # Calculate the chunks of unique times that gives the target chunk sizes
+    tmp_chunks = da.core.normalize_chunks(time_chunksize, (num_utimes,))[0]
+    sum = 0
+    arr_start_end_rows = []
+    for chunk in tmp_chunks:
+        start = (sum) * rows_per_time
+        end = (sum + chunk) * rows_per_time
+        arr_start_end_rows.append((start, end))
+        sum += chunk
+    # Store the start and end rows that should be read for the chunk
+    arr_start_end_rows = da.from_array(arr_start_end_rows, chunks=(1, 2))
+    # Specify the output shape `load_col_chunk`
+    output_chunkshape = (tmp_chunks, cshape[1]) + extra_dimensions
+    # Apply `load_col_chunk` to each chunk
+    data = arr_start_end_rows.map_blocks(
+        load_col_chunk,
+        table_manager=table_manager,
+        col_name=col,
+        col_dtype=col_dtype,
+        tidxs=tidxs,
+        bidxs=bidxs,
+        rows_per_time=rows_per_time,
+        cshape=cshape,
+        extra_dimensions=extra_dimensions,
+        drop_axis=[1],
+        new_axis=list(range(1, len(cshape + extra_dimensions))),
+        meta=np.array([], dtype=col_dtype),
+        chunks=output_chunkshape,
+    )
-    # Construct a numpy array to populate. `data` has shape (n_times, n_baselines, n_frequencies, n_polarizations)
-    data = np.full(cshape + extra_dimensions, fill_value, dtype=col_dtype)
+    return data
-    # Use built-in casacore table iterator to populate the data column by unique times.
-    if use_table_iter:
-        start_row = 0
-        for ts in tb_tool.iter("TIME", sort=False):
-            num_rows = ts.nrows()
-            # Create small temporary array to store the partial column
-            tmp_arr = np.full(
-                (num_rows,) + extra_dimensions, fill_value, dtype=col_dtype
-            )
+def load_col_chunk(
+    x,
+    table_manager,
+    col_name,
+    col_dtype,
+    tidxs,
+    bidxs,
+    rows_per_time,
+    cshape,
+    extra_dimensions,
+):
+    start_row = x[0][0]
+    end_row = x[0][1]
+    num_rows = end_row - start_row
+    assert (num_rows % rows_per_time) == 0
+    num_utimes = num_rows // rows_per_time
+    # Create memory buffer to populate with data from disk
+    row_data = np.full((num_rows,) + extra_dimensions, np.nan, dtype=col_dtype)
+    # Load data from the column
+    # Release the casacore table as soon as possible
+    with table_manager.get_table() as tb_tool:
+        tb_tool.getcolnp(col_name, row_data, startrow=start_row, nrow=num_rows)
+    # Initialise reshaped numpy array
+    reshaped_data = np.full(
+        (num_utimes, cshape[1]) + extra_dimensions, np.nan, dtype=col_dtype
+    )
-            # Note we don't use `getcol()` because it's less safe. See:
-            # https://github.com/casacore/python-casacore/issues/130#issuecomment-463202373
-            ts.getcolnp(col, tmp_arr)
+    # Create slice object for readability
+    slc = slice(start_row, end_row)
+    tidxs_slc = tidxs[slc]
-            # Get the slice of rows contained in `tmp_arr`.
-            # Used to get the relevant integer indexes from `tidxs` and `bidxs`
-            tmp_slice = slice(start_row, start_row + num_rows)
+    tidxs_slc = (
+        tidxs_slc - tidxs_slc[0]
+    )  # Indices of reshaped_data along time differ from values in tidxs. Assumes first time is earliest time
+    bidxs_slc = bidxs[slc]
-            # Copy `tmp_arr` into correct elements of `tmp_arr`
-            data[tidxs[tmp_slice], bidxs[tmp_slice]] = tmp_arr
-            start_row += num_rows
-    else:
-        data[tidxs, bidxs] = tb_tool.getcol(col)
+    # Populate `reshaped_data` with `row_data`
+    reshaped_data[tidxs_slc, bidxs_slc] = row_data
-    return data
+    return reshaped_data

xradio/measurement_set/_utils/_msv2/_tables/table_query.py CHANGED Viewed

@@ -22,3 +22,25 @@ def open_query(table: tables.table, query: str) -> Generator[tables.table, None,
         yield ttq
     finally:
         ttq.close()
+class TableManager:
+    def __init__(
+        self,
+        infile: str,
+        taql_where: str = "",
+    ):
+        self.infile = infile
+        self.taql_where = taql_where
+        self.taql_query = taql_where.replace("where ", "")
+    def get_table(self):
+        # Performance note:
+        # table.query("(DATA_DESC_ID = 0)") is slightly faster than
+        # tables.taql("select * from $table (DATA_DESC_ID = 0)")
+        with tables.table(
+            self.infile, readonly=True, lockoptions={"option": "usernoread"}, ack=False
+        ) as mtable:
+            query = f"select * from $mtable {self.taql_where}"
+            return tables.taql(query)

xradio 0.0.48__py3-none-any.whl → 0.0.49__py3-none-any.whl

xradio 0.0.48__py3-none-any.whl → 0.0.49__py3-none-any.whl