xradio 0.0.28__py3-none-any.whl → 0.0.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. xradio/__init__.py +5 -4
  2. xradio/_utils/array.py +90 -0
  3. xradio/_utils/zarr/common.py +48 -3
  4. xradio/image/_util/zarr.py +4 -1
  5. xradio/schema/__init__.py +24 -6
  6. xradio/schema/bases.py +440 -2
  7. xradio/schema/check.py +96 -55
  8. xradio/schema/dataclass.py +123 -27
  9. xradio/schema/metamodel.py +21 -4
  10. xradio/schema/typing.py +33 -18
  11. xradio/vis/__init__.py +5 -2
  12. xradio/vis/_processing_set.py +28 -20
  13. xradio/vis/_vis_utils/_ms/_tables/create_field_and_source_xds.py +710 -0
  14. xradio/vis/_vis_utils/_ms/_tables/load.py +23 -10
  15. xradio/vis/_vis_utils/_ms/_tables/load_main_table.py +145 -64
  16. xradio/vis/_vis_utils/_ms/_tables/read.py +747 -172
  17. xradio/vis/_vis_utils/_ms/_tables/read_main_table.py +173 -44
  18. xradio/vis/_vis_utils/_ms/_tables/read_subtables.py +79 -28
  19. xradio/vis/_vis_utils/_ms/_tables/write.py +102 -45
  20. xradio/vis/_vis_utils/_ms/_tables/write_exp_api.py +127 -65
  21. xradio/vis/_vis_utils/_ms/chunks.py +58 -21
  22. xradio/vis/_vis_utils/_ms/conversion.py +536 -67
  23. xradio/vis/_vis_utils/_ms/descr.py +52 -20
  24. xradio/vis/_vis_utils/_ms/msv2_to_msv4_meta.py +70 -35
  25. xradio/vis/_vis_utils/_ms/msv4_infos.py +0 -59
  26. xradio/vis/_vis_utils/_ms/msv4_sub_xdss.py +76 -9
  27. xradio/vis/_vis_utils/_ms/optimised_functions.py +0 -46
  28. xradio/vis/_vis_utils/_ms/partition_queries.py +308 -119
  29. xradio/vis/_vis_utils/_ms/partitions.py +82 -25
  30. xradio/vis/_vis_utils/_ms/subtables.py +32 -14
  31. xradio/vis/_vis_utils/_utils/partition_attrs.py +30 -11
  32. xradio/vis/_vis_utils/_utils/xds_helper.py +136 -45
  33. xradio/vis/_vis_utils/_zarr/read.py +60 -22
  34. xradio/vis/_vis_utils/_zarr/write.py +83 -9
  35. xradio/vis/_vis_utils/ms.py +48 -29
  36. xradio/vis/_vis_utils/zarr.py +44 -20
  37. xradio/vis/convert_msv2_to_processing_set.py +106 -32
  38. xradio/vis/load_processing_set.py +38 -61
  39. xradio/vis/read_processing_set.py +62 -96
  40. xradio/vis/schema.py +687 -0
  41. xradio/vis/vis_io.py +75 -43
  42. {xradio-0.0.28.dist-info → xradio-0.0.29.dist-info}/LICENSE.txt +6 -1
  43. {xradio-0.0.28.dist-info → xradio-0.0.29.dist-info}/METADATA +10 -5
  44. xradio-0.0.29.dist-info/RECORD +73 -0
  45. {xradio-0.0.28.dist-info → xradio-0.0.29.dist-info}/WHEEL +1 -1
  46. xradio/vis/model.py +0 -497
  47. xradio-0.0.28.dist-info/RECORD +0 -71
  48. {xradio-0.0.28.dist-info → xradio-0.0.29.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,7 @@ import graphviper.utils.logger as logger
2
2
  import os
3
3
  from pathlib import Path
4
4
  import re
5
- from typing import Any, List, Dict, Tuple, Union
5
+ from typing import Any, Callable, Dict, List, Tuple, Union
6
6
 
7
7
  import numpy as np
8
8
  import pandas as pd
@@ -13,15 +13,18 @@ from casacore import tables
13
13
 
14
14
  from .table_query import open_query, open_table_ro
15
15
 
16
- CASACORE_TO_PD_TIME_CORRECTION = 3506716800.0
16
+ CASACORE_TO_PD_TIME_CORRECTION = 3_506_716_800.0
17
17
  SECS_IN_DAY = 86400
18
+ MJD_DIF_UNIX = 40587
18
19
 
19
20
 
20
21
  def table_exists(path: str) -> bool:
21
22
  return tables.tableexists(path)
22
23
 
23
24
 
24
- def convert_casacore_time(rawtimes: np.ndarray, convert_to_datetime=True) -> np.ndarray:
25
+ def convert_casacore_time(
26
+ rawtimes: np.ndarray, convert_to_datetime: bool = True
27
+ ) -> np.ndarray:
25
28
  """
26
29
  Read time columns to datetime format
27
30
  pandas datetimes are referenced against a 0 of 1970-01-01
@@ -29,36 +32,218 @@ def convert_casacore_time(rawtimes: np.ndarray, convert_to_datetime=True) -> np.
29
32
 
30
33
  This requires a correction of 3506716800 seconds which is hardcoded to save time
31
34
 
32
- :param rawtimes: times in casacore ref
33
- :return: times converted to pandas reference
35
+ Parameters
36
+ ----------
37
+ rawtimes : np.ndarray
38
+ times in casacore ref
39
+ convert_to_datetime : bool (Default value = True)
40
+ whether to produce pandas style datetime
41
+
42
+ Returns
43
+ -------
44
+ np.ndarray
45
+ times converted to pandas reference
34
46
  """
35
-
47
+ times_reref = np.array(rawtimes) - CASACORE_TO_PD_TIME_CORRECTION
36
48
  if convert_to_datetime:
37
- return pd.to_datetime(
38
- np.array(rawtimes) - CASACORE_TO_PD_TIME_CORRECTION, unit="s"
39
- ).values
49
+ return pd.to_datetime(times_reref, unit="s").values
40
50
  else:
41
- return np.array(rawtimes) - CASACORE_TO_PD_TIME_CORRECTION
51
+ return times_reref
42
52
  # dt = pd.to_datetime(np.atleast_1d(rawtimes) - correction, unit='s').values
43
53
  # if len(np.array(rawtimes).shape) == 0: dt = dt[0]
44
54
  # return dt
45
55
 
46
56
 
47
57
  def convert_mjd_time(rawtimes: np.ndarray) -> np.ndarray:
48
- """Different time conversion needed for the MJD col of EPHEM{i}_*.tab
58
+ """
59
+ Different time conversion needed for the MJD col of EPHEM{i}_*.tab
49
60
  files (only, as far as I've seen)
50
61
 
51
- :param rawtimes: MJD times for example from the MJD col of ephemerides tables
52
- :return: times converted to pandas reference and datetime type
62
+ Parameters
63
+ ----------
64
+ rawtimes : np.ndarray
65
+ MJD times for example from the MJD col of ephemerides tables
66
+
67
+ Returns
68
+ -------
69
+ np.ndarray
70
+ times converted to pandas reference and datetime type
53
71
  """
54
- return pd.to_datetime(
55
- rawtimes * SECS_IN_DAY - CASACORE_TO_PD_TIME_CORRECTION, unit="s"
72
+ times_reref = pd.to_datetime(
73
+ (rawtimes - MJD_DIF_UNIX) * SECS_IN_DAY, unit="s"
56
74
  ).values
57
75
 
76
+ return times_reref
77
+
78
+
79
+ def convert_casacore_time_to_mjd(rawtimes: np.ndarray) -> np.ndarray:
80
+ """
81
+ From CASA/casacore time (as used in the TIME column of the main table) to MJD
82
+ (as used in the EPHEMi*.tab ephemeris tables). As the epochs are the same, this
83
+ is just a conversion of units.
84
+
85
+ Parameters
86
+ ----------
87
+ rawtimes : np.ndarray
88
+ times from a TIME column (seconds, casacore time epoch)
89
+
90
+ Returns
91
+ -------
92
+ np.ndarray
93
+ times converted to (ephemeris) MJD (days since casacore time epoch (1858-11-17))
94
+ """
95
+ return rawtimes / SECS_IN_DAY
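For orientation, a minimal sketch (with a made-up TIME value) of the epoch arithmetic these three converters rely on: casacore times count seconds from 1858-11-17 (MJD 0), pandas datetimes count from 1970-01-01, and the two epochs differ by 40587 days, i.e. 3,506,716,800 s.

    import numpy as np
    import pandas as pd

    SECS_IN_DAY = 86400
    MJD_DIF_UNIX = 40587                               # days between 1858-11-17 and 1970-01-01
    CORRECTION = MJD_DIF_UNIX * SECS_IN_DAY            # 3_506_716_800 s

    raw = np.array([4.9e9])                            # hypothetical casacore TIME value, in seconds
    print(pd.to_datetime(raw - CORRECTION, unit="s"))  # pandas datetime, as convert_casacore_time does
    print(raw / SECS_IN_DAY)                           # same instant as an MJD-style day count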
96
+
97
+
98
+ def make_taql_where_between_min_max(
99
+ min_max: Tuple[np.float64, np.float64],
100
+ path: str,
101
+ table_name: str,
102
+ colname="TIME",
103
+ ) -> Union[str, None]:
104
+ """
105
+ From a numerical min/max range, produce a TaQL string to select between
106
+ those min/max values (example: times) in a table.
107
+ The table can be for example a POINTING subtable or an EPHEM* ephemeris
108
+ table.
109
+ This is meant to be used on MSv2 table columns that will be loaded as a
110
+ coordinate in MSv4s and their sub-xdss (examples: POINTING/TIME, ephemeris/MJD).
111
+
112
+ This can be used for example to produce a TaQL string to constrain loading of:
113
+ - POINTING rows (based on the min/max from the time coordinate of the main MSv4)
114
+ - ephemeris rows, from EPHEM* tables (based on the MJD column and the min/max
115
+ from the main MSv4 time coordinate).
116
+
117
+ Parameters
118
+ ----------
119
+ min_max : Tuple[np.float64, np.float64]
120
+ min / max values of time or other column used as coordinate
121
+ (assumptions: float values, sortable, typically: time coord from MSv4)
122
+ path :
123
+ Path to input MS or location of the table
124
+ table_name :
125
+ Name of the table where to load a column (example: 'POINTING')
126
+ colname :
127
+ Name of the column to search for min/max values (examples: 'TIME', 'MJD')
128
+
129
+ Returns
130
+ -------
131
+ taql_where : str
132
+ TaQL (sub)string with the min/max time 'WHERE' constraint
133
+ """
134
+
135
+ min_max_range = find_projected_min_max_table(min_max, path, table_name, colname)
136
+ if min_max_range is None:
137
+ taql = None
138
+ else:
139
+ (min_val, max_val) = min_max_range
140
+ taql = f"where {colname} >= {min_val} AND {colname} <= {max_val}"
141
+
142
+ return taql
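As a concrete illustration (all values made up), the WHERE fragment built above looks like:

    colname, min_val, max_val = "TIME", 5.0e9, 5.000012e9   # hypothetical projected min/max
    taql = f"where {colname} >= {min_val} AND {colname} <= {max_val}"
    # -> "where TIME >= 5000000000.0 AND TIME <= 5000012000.0"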
143
+
144
+
145
+ def find_projected_min_max_table(
146
+ min_max: Tuple[np.float64, np.float64], path: str, table_name: str, colname: str
147
+ ) -> Union[Tuple[np.float64, np.float64], None]:
148
+ """
149
+ We have: min/max values that define a range (for example of time)
150
+ We want: to project that min/max range on a sortable column (for example a
151
+ range of times onto a TIME column), and find min and max values
152
+ derived from that table column such that the range between those min and max
153
+ values includes at least the input min/max range.
154
+
155
+ The returned min/max can then be used in a data selection TaQL query to
156
+ select at least the values within the input range (possibly extended if
157
+ the input range overlaps only partially or not at all with the column
158
+ values). A tolerance is added to the min/max to prevent numerical issues in
159
+ comparisons and conversions between numerical types and strings.
160
+
161
+ When the range given as input is wider than the range of values found in
162
+ the column, use the input range, as it is sufficient and more inclusive.
163
+
164
+ When the range given as input is narrow (projected on the target table/column)
165
+ and falls between two points of the column values, or overlaps with only one
166
+ point, the min/max are extended to include at least the two column values that
167
+ define a range within which the input range is included.
168
+ Example scenario: an ephemeris table is sampled at a coarse interval
169
+ (20 min) and we want to find a min/max range projected from the time min/max
170
+ of a main MSv4 time coordinate sampled at ~1s for a field-scan/intent
171
+ that spans ~2 min. Those ~2 min will typically fall between ephemeris samples.
172
+
173
+ Parameters
174
+ ----------
175
+ min_max : Tuple[np.float64, np.float64]
176
+ min / max values of time or other column used as coordinate
177
+ (assumptions: float values, sortable)
178
+ path :
179
+ Path to input MS or location of the table
180
+ table_name :
181
+ Name of the table where to load a column (example: 'POINTING')
182
+ colname :
183
+ Name of the column to search for min/max values (example: 'TIME')
184
+
185
+ Returns
186
+ -------
187
+ output_min_max : Union[Tuple[np.float64, np.float64], None]
188
+ min/max values derived from the input min/max and the column values
189
+ """
190
+ with open_table_ro(os.path.join(path, table_name)) as tb_tool:
191
+ if tb_tool.nrows() == 0:
192
+ return None
193
+ col = tb_tool.getcol(colname)
194
+
195
+ out_min_max = find_projected_min_max_array(min_max, col)
196
+ return out_min_max
197
+
198
+
199
+ def find_projected_min_max_array(
200
+ min_max: Tuple[np.float64, np.float64], array: np.array
201
+ ) -> Tuple[np.float64, np.float64]:
202
+ """Does the min/max checks and search for find_projected_min_max_table()"""
203
+
204
+ sorted_array = np.sort(array)
205
+ (range_min, range_max) = min_max
206
+ if len(sorted_array) < 2:
207
+ tol = np.finfo(sorted_array.dtype).eps * 4
208
+ else:
209
+ tol = np.diff(sorted_array[np.nonzero(sorted_array)]).min() / 4
210
+
211
+ if range_max > sorted_array[-1]:
212
+ projected_max = range_max + tol
213
+ else:
214
+ max_idx = sorted_array.size - 1
215
+ max_array_idx = min(
216
+ max_idx, np.searchsorted(sorted_array, range_max, side="right")
217
+ )
218
+ projected_max = sorted_array[max_array_idx] + tol
219
+
220
+ if range_min < sorted_array[0]:
221
+ projected_min = range_min - tol
222
+ else:
223
+ min_array_idx = max(
224
+ 0, np.searchsorted(sorted_array, range_min, side="left") - 1
225
+ )
226
+ # ensure 'sorted_array[min_array_idx] < range_min' when values ==
227
+ if sorted_array[min_array_idx] == range_min:
228
+ min_array_idx = max(0, min_array_idx - 1)
229
+ projected_min = sorted_array[min_array_idx] - tol
230
+
231
+ return (projected_min, projected_max)
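A toy illustration, with made-up numbers, of the widening described above: a ~2 minute window that falls between 20 minute ephemeris samples is extended to the bracketing samples plus a small tolerance. This mimics the searchsorted logic, ignoring the edge cases handled above.

    import numpy as np

    samples = np.arange(0.0, 7200.0, 1200.0)      # hypothetical column sampled every 20 min
    range_min, range_max = 2500.0, 2620.0         # ~2 min window between two samples

    lo = samples[max(0, np.searchsorted(samples, range_min, side="left") - 1)]
    hi = samples[min(samples.size - 1, np.searchsorted(samples, range_max, side="right"))]
    tol = np.diff(samples[np.nonzero(samples)]).min() / 4
    print(lo - tol, hi + tol)                     # -> 2100.0 3900.0, brackets the input range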
232
+
58
233
 
59
234
  def extract_table_attributes(infile: str) -> Dict[str, Dict]:
60
235
  """
61
- return a dictionary of table attributes created from MS keywords and column descriptions
236
+ Return a dictionary of table attributes created from MS keywords and column descriptions
237
+
238
+ Parameters
239
+ ----------
240
+ infile : str
241
+ table file path
242
+
243
+ Returns
244
+ -------
245
+ Dict[str, Dict]
246
+ table attributes as a dictionary
62
247
  """
63
248
  with open_table_ro(infile) as tb_tool:
64
249
  kwd = tb_tool.getkeywords()
@@ -79,9 +264,17 @@ def add_units_measures(
79
264
  """
80
265
  Add attributes with units and measure metainfo to the variables passed in the input dictionary
81
266
 
82
- :param mvars: data variables where to populate units
83
- :param cc_attrs: dictionary with casacore table attributes (from extract_table_attributes)
84
- :return: variables with units added in their attributes
267
+ Parameters
268
+ ----------
269
+ mvars : Dict[str, xr.DataArray]
270
+ data variables where to populate units
271
+ cc_attrs : Dict[str, Any]
272
+ dictionary with casacore table attributes (from extract_table_attributes)
273
+
274
+ Returns
275
+ -------
276
+ Dict[str, xr.DataArray]
277
+ variables with units added in their attributes
85
278
  """
86
279
  col_descrs = cc_attrs["column_descriptions"]
87
280
  # TODO: Should probably loop the other way around, over mvars
@@ -90,6 +283,12 @@ def add_units_measures(
90
283
  if var_name in mvars and "keywords" in col_descrs[col]:
91
284
  if "QuantumUnits" in col_descrs[col]["keywords"]:
92
285
  cc_units = col_descrs[col]["keywords"]["QuantumUnits"]
286
+
287
+ if isinstance(
288
+ cc_units, str
289
+ ): # Little fix for Meerkat data where the units are a string.
290
+ cc_units = [cc_units]
291
+
93
292
  if not isinstance(cc_units, list) or not cc_units:
94
293
  logger.warning(
95
294
  f"Invalid units found for column/variable {col}: {cc_units}"
@@ -128,7 +327,8 @@ def add_units_measures(
128
327
 
129
328
 
130
329
  def make_freq_attrs(spw_xds: xr.Dataset, spw_id: int) -> Dict[str, Any]:
131
- """Grab the units/measure metainfo for the xds.freq dimension of a
330
+ """
331
+ Grab the units/measure metainfo for the xds.freq dimension of a
132
332
  partition from the SPECTRAL_WINDOW subtable CTDS attributes.
133
333
 
134
334
  Has to read xds_spw.meas_freq_ref and use it as index in the CTDS
@@ -137,10 +337,17 @@ def make_freq_attrs(spw_xds: xr.Dataset, spw_id: int) -> Dict[str, Any]:
137
337
  (then the ref frame from the second will be pulled to
138
338
  xds.freq.attrs)
139
339
 
140
- :param spw_xds: (metainfo) SPECTRAL_WINDOW xds
141
- :param spw_id: SPW id of a partition
142
- :return: attributes (units/measure) for the freq dim of a partition
143
-
340
+ Parameters
341
+ ----------
342
+ spw_xds : xr.Dataset
343
+ (metainfo) SPECTRAL_WINDOW xds
344
+ spw_id : int
345
+ SPW id of a partition
346
+
347
+ Returns
348
+ -------
349
+ Dict[str, Any]
350
+ attributes (units/measure) for the freq dim of a partition
144
351
  """
145
352
  fallback_TabRefTypes = [
146
353
  "REST",
@@ -193,6 +400,11 @@ def get_pad_nan(col: np.ndarray) -> np.ndarray:
193
400
  ----------
194
401
  col : np.ndarray
195
402
  data being loaded from a table column
403
+
404
+ Returns
405
+ -------
406
+ np.ndarray
407
+ nan ("nan") value for the type of the input column
196
408
  """
197
409
  # This is causing frequent warnings for integers. Cast of nan to "int nan"
198
410
  # produces -2147483648 but also seems to trigger a
@@ -208,15 +420,24 @@ def get_pad_nan(col: np.ndarray) -> np.ndarray:
208
420
 
209
421
 
210
422
  def redimension_ms_subtable(xds: xr.Dataset, subt_name: str) -> xr.Dataset:
211
- """Expand a MeasurementSet subtable xds from single dimension (row)
423
+ """
424
+ Expand a MeasurementSet subtable xds from single dimension (row)
212
425
  to multiple dimensions (such as (source_id, time, spectral_window)
213
426
 
214
427
  WIP: only works for source, experimenting
215
428
 
216
- :param xds: dataset to change the dimensions
217
- :param subt_name: subtable name (SOURCE, etc.)
218
- :return: transformed xds with data dimensions representing the MS subtable key
219
- (one dimension for every columns)
429
+ Parameters
430
+ ----------
431
+ xds : xr.Dataset
432
+ dataset to change the dimensions
433
+ subt_name : str
434
+ subtable name (SOURCE, etc.)
435
+
436
+ Returns
437
+ -------
438
+ xr.Dataset
439
+ transformed xds with data dimensions representing the MS subtable key
440
+ (one dimension for every columns)
220
441
  """
221
442
  subt_key_cols = {
222
443
  "DOPPLER": ["doppler_id", "source_id"],
@@ -239,8 +460,18 @@ def redimension_ms_subtable(xds: xr.Dataset, subt_name: str) -> xr.Dataset:
239
460
 
240
461
  rxds = xds.copy()
241
462
  try:
463
+ # drop_duplicates() needed (https://github.com/casangi/xradio/issues/185). Examples:
464
+ # - Some early ALMA datasets have bogus WEATHER tables with many/most rows with
465
+ # (ANTENNA_ID=0, TIME=0) and no other columns to figure out the right IDs, such
466
+ # as "NS_WX_STATION_ID" or similar. (example: X425.pm04.scan4.ms)
467
+ # - Some GBT MSs have duplicated (ANTENNA_ID=0, TIME=xxx). (example: analytic_variable.ms)
242
468
  with np.errstate(invalid="ignore"):
243
- rxds = rxds.set_index(row=key_dims).unstack("row").transpose(*key_dims, ...)
469
+ rxds = (
470
+ rxds.set_index(row=key_dims)
471
+ .drop_duplicates("row")
472
+ .unstack("row")
473
+ .transpose(*key_dims, ...)
474
+ )
244
475
  # unstack changes type to float when it needs to introduce NaNs, so
245
476
  # we need to reset to the original type.
246
477
  for var in rxds.data_vars:
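As an aside, a small sketch (toy dataset, invented values) of why the drop_duplicates() call above is needed before unstacking a multi-key row index:

    import xarray as xr

    # Hypothetical subtable with a duplicated (antenna_id, time) key, as in the bogus WEATHER rows
    xds = xr.Dataset(
        {"temperature": ("row", [280.0, 281.0, 282.0])},
        coords={"antenna_id": ("row", [0, 0, 1]), "time": ("row", [0.0, 0.0, 10.0])},
    )
    keyed = xds.set_index(row=["antenna_id", "time"])
    # keyed.unstack("row") typically fails on the duplicated (0, 0.0) key;
    # dropping duplicates first keeps one row per key and lets unstack succeed
    fixed = keyed.drop_duplicates("row").unstack("row")
    print(fixed.temperature.shape)   # (2, 2): antenna_id x time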
@@ -302,23 +533,37 @@ def read_generic_table(
302
533
  inpath: str,
303
534
  tname: str,
304
535
  timecols: Union[List[str], None] = None,
305
- ignore=None,
536
+ ignore: Union[List[str], None] = None,
306
537
  rename_ids: Dict[str, str] = None,
538
+ taql_where: str = None,
307
539
  ) -> xr.Dataset:
308
- """read generic casacore (sub)table into memory resident xds. This reads the table
309
- and and loads the data.
540
+ """
541
+ load generic casacore (sub)table into memory resident xds (xarray wrapped
542
+ numpy arrays). This reads through the table columns and loads the data.
543
+
544
+ TODO: change read_ name to load_ name (and most if not all this module)
310
545
 
311
546
  Parameters
312
547
  ----------
313
- :param inpath: path to the MS or directory containing the table
314
- :param tname: (sub)table name, for example 'SOURCE' for myms.ms/SOURCE
315
-
316
- :param timecols: column names to convert to numpy datetime format.
317
- leaves times as their original casacore format.
318
- :param ignore: list of column names to ignore and not try to read.
319
- :rename_ids: dict with dimension renaming mapping
320
-
321
- :return: table loaded as XArray dataset
548
+ inpath : str
549
+ path to the MS or directory containing the table
550
+ tname : str
551
+ (sub)table name, for example 'SOURCE' for myms.ms/SOURCE
552
+ timecols : Union[List[str], None] (Default value = None)
553
+ column names to convert to numpy datetime format.
554
+ leaves times as their original casacore format.
555
+ ignore : Union[List[str], None] (Default value = None)
556
+ list of column names to ignore and not try to read.
557
+ rename_ids : Dict[str, str] (Default value = None)
558
+ dict with dimension renaming mapping
559
+ taql_where : str (Default value = None)
560
+ TaQL string to optionally constrain the rows/columns to read
561
+ (Default value = None)
562
+
563
+ Returns
564
+ -------
565
+ xr.Dataset
566
+ table loaded as XArray dataset
322
567
  """
323
568
  if timecols is None:
324
569
  timecols = []
@@ -340,13 +585,33 @@ def read_generic_table(
340
585
  )
341
586
  return xr.Dataset()
342
587
 
343
- with open_table_ro(infile) as tb_tool:
344
- if tb_tool.nrows() == 0:
588
+ with open_table_ro(infile) as gtable:
589
+ if gtable.nrows() == 0:
345
590
  logger.debug(f"table is empty: {inpath} {tname}")
346
591
  return xr.Dataset(attrs=attrs)
347
592
 
348
- colnames = tb_tool.colnames()
349
- mcoords, mvars = read_generic_cols(infile, tb_tool, timecols, ignore)
593
+ # if len(ignore) > 0: #This is needed because some SOURCE tables have a SOURCE_MODEL column that is corrupted and this causes the open_query to fail.
594
+ # select_columns = gtable.colnames()
595
+ # select_columns_str = str([item for item in select_columns if item not in ignore])[1:-1].replace("'", "") #Converts an array to a comma sepearted string. For example ['a', 'b', 'c'] to 'a, b, c'.
596
+ # taql_gtable = f"select " + select_columns_str + f" from $gtable {taql_where}"
597
+ # else:
598
+ # taql_gtable = f"select * from $gtable {taql_where}"
599
+
600
+ # relatively often broken columns that we do not need
601
+ exclude_pattern = ", !~p/SOURCE_MODEL/"
602
+ taql_gtable = f"select *{exclude_pattern} from $gtable {taql_where or ''}"
603
+
604
+ with open_query(gtable, taql_gtable) as tb_tool:
605
+ if tb_tool.nrows() == 0:
606
+ logger.debug(
607
+ f"table query is empty: {inpath} {tname}, with where {taql_where}"
608
+ )
609
+ return xr.Dataset(attrs=attrs)
610
+
611
+ colnames = tb_tool.colnames()
612
+ mcoords, mvars = load_cols_into_coords_data_vars(
613
+ infile, tb_tool, timecols, ignore
614
+ )
350
615
 
351
616
  mvars = add_units_measures(mvars, cc_attrs)
352
617
  mcoords = add_units_measures(mcoords, cc_attrs)
@@ -379,49 +644,116 @@ def read_generic_table(
379
644
  return xds
380
645
 
381
646
 
382
- def read_generic_cols(
383
- infile: str, tb_tool, timecols, ignore, dim_prefix: str = "dim"
647
+ def load_cols_into_coords_data_vars(
648
+ inpath: str,
649
+ tb_tool: tables.table,
650
+ timecols: Union[List[str], None] = None,
651
+ ignore: Union[List[str], None] = None,
384
652
  ) -> Tuple[Dict[str, xr.Dataset], Dict[str, xr.Dataset]]:
385
- """Reads data for each MS column (loading the data in memory) into
386
- Xarray datasets
653
+ """
654
+ Produce a set of coordinate xarrays and a set of data variables xarrays
655
+ from the columns of a table.
656
+
657
+ Parameters
658
+ ----------
659
+ inpath : str
660
+ input path
661
+ tb_tool: tables.table
662
+ tool being used to load data
663
+ timecols: Union[List[str], None] (Default value = None)
664
+ list of columns to be considered as TIME-related
665
+ ignore: Union[List[str], None] (Default value = None)
666
+ columns to ignore
667
+
668
+ Returns
669
+ -------
670
+ Tuple[Dict[str, xr.Dataset], Dict[str, xr.Dataset]]
671
+ coordinates dictionary + variables dictionary
672
+ """
673
+ columns_loader = find_best_col_loader(inpath, tb_tool.nrows())
674
+
675
+ mcoords, mvars = columns_loader(inpath, tb_tool, timecols, ignore)
676
+
677
+ return mcoords, mvars
387
678
 
388
- :param infile: path name of the MS
389
- :param tb_tool: table to red the columns
390
- :param timecols: columns names to convert to datetime format
391
- :param ignore: list of column names to skip and not try to read.
392
679
 
393
- :return: dict of coordinates and dict of data vars.
680
+ def find_best_col_loader(inpath: str, nrows: int) -> Callable:
394
681
  """
682
+ Simple heuristic: for any tables other than POINTING, use the load_generic_cols
683
+ function that is able to deal with variable size columns. For POINTING (and if it has
684
+ more rows than an arbitrary "small" threshold) use a more efficient load function that
685
+ loads the data by column (but is not able to deal with any generic table).
686
+ For now, all other subtables are loaded using the generic column loader.
687
+
688
+ Background: the POINTING subtable can have a very large number of rows. For example in
689
+ ALMA it is sampled at ~50ms intervals which typically produces of the order of
690
+ [10^5, 10^7] rows. This becomes a serious performance bottleneck when loading the
691
+ table using row() (and one dict allocated per row).
692
+ This function chooses an alternative "by-column" load function to load in the columns
693
+ when the table is POINTING. See xradio issue #128; for now this distinction is made
694
+ solely for performance reasons.
395
695
 
396
- # Almost sure that when TIME is present (in a standard MS subt) it
397
- # is part of the key. But what about non-std subtables, ASDM subts?
398
- subts_with_time_key = (
399
- "FEED",
400
- "FLAG_CMD",
401
- "FREQ_OFFSET",
402
- "HISTORY",
403
- "SOURCE",
404
- "SYSCAL",
405
- "WEATHER",
406
- )
407
- # Optional cols known to sometimes have inconsistent values
408
- known_misbehaving_cols = ["ASSOC_NATURE"]
696
+ Parameters
697
+ ----------
698
+ inpath : str
699
+ path name of the MS table
700
+ nrows : int
701
+ number of rows found in the table
702
+
703
+ Returns
704
+ -------
705
+ Callable
706
+ function best suited to load the data from the columns of this table
707
+ """
708
+ # do not give up generic by-row() loading if nrows is (arbitrary) small
709
+ ARBITRARY_MIN_ROWS = 1000
409
710
 
410
- colnames = tb_tool.colnames()
411
- col_cells = {
412
- col: tb_tool.getcell(col, 0)
413
- for col in colnames
414
- if (col not in ignore) and (tb_tool.iscelldefined(col, 0))
415
- }
711
+ if inpath.endswith("POINTING") and nrows >= ARBITRARY_MIN_ROWS:
712
+ columns_loader = load_fixed_size_cols
713
+ else:
714
+ columns_loader = load_generic_cols
715
+
716
+ return columns_loader
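A hypothetical usage sketch of the heuristic (paths and row counts invented): a large POINTING table gets the by-column loader, any other subtable the generic by-row loader.

    loader = find_best_col_loader("/data/uid_X1.ms/POINTING", nrows=250_000)   # -> load_fixed_size_cols
    loader = find_best_col_loader("/data/uid_X1.ms/SOURCE", nrows=40)          # -> load_generic_cols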
717
+
718
+
719
+ def load_generic_cols(
720
+ inpath: str,
721
+ tb_tool: tables.table,
722
+ timecols: Union[List[str], None],
723
+ ignore: Union[List[str], None],
724
+ ) -> Tuple[Dict[str, xr.Dataset], Dict[str, xr.Dataset]]:
725
+ """
726
+ Loads data for each MS column (loading the data in memory) into Xarray datasets
727
+
728
+ This function is generic in that it can load variable size array columns. See also
729
+ load_fixed_size_cols() as a simpler and much better performing alternative
730
+ for tables that are large and expected/guaranteed to not have columns with variable
731
+ size cells.
732
+
733
+ Parameters
734
+ ----------
735
+ inpath : str
736
+ path name of the MS table
737
+ tb_tool : tables.table
738
+ table to load the columns
739
+ timecols : Union[List[str], None]
740
+ columns names to convert to datetime format
741
+ ignore : Union[List[str], None]
742
+ list of column names to skip and not try to load.
743
+
744
+ Returns
745
+ -------
746
+ Tuple[Dict[str, xr.Dataset], Dict[str, xr.Dataset]]
747
+ dict of coordinates and dict of data vars.
748
+ """
749
+
750
+ col_cells = find_loadable_filled_cols(tb_tool, ignore)
416
751
 
417
752
  trows = tb_tool.row(ignore, exclude=True)[:]
418
753
 
419
754
  # Produce coords and data vars from MS columns
420
755
  mcoords, mvars = {}, {}
421
756
  for col in col_cells.keys():
422
- if tb_tool.coldatatype(col) == "record":
423
- continue # not supported
424
-
425
757
  try:
426
758
  # TODO
427
759
  # benchmark np.stack() performance
@@ -446,87 +778,287 @@ def read_generic_cols(
446
778
 
447
779
  if len(set([isinstance(row[col], dict) for row in trows])) > 1:
448
780
  continue # can't deal with this case
449
- mshape = np.array(max([np.array(row[col]).shape for row in trows]))
450
- try:
451
- pad_nan = get_pad_nan(col_cells[col])
452
781
 
453
- # TODO
454
- # benchmark np.stack() performance
455
- data = np.stack(
456
- [
457
- np.pad(
458
- (
459
- row[col]
460
- if len(row[col]) > 0
461
- else np.array(row[col]).reshape(
462
- np.arange(len(mshape)) * 0
463
- )
464
- ),
465
- [(0, ss) for ss in mshape - np.array(row[col]).shape],
466
- "constant",
467
- constant_values=pad_nan,
468
- )
469
- for row in trows
470
- ]
471
- )
472
- except Exception as exc:
473
- level = logger.WARNING
474
- if col in known_misbehaving_cols:
475
- level = logger.DEBUG
476
- logger.log(
477
- level, f"{infile}: failed to read data for column {col}: {exc}"
478
- )
479
- data = []
782
+ data = handle_variable_col_issues(inpath, col, col_cells, trows)
480
783
 
481
784
  if len(data) == 0:
482
785
  continue
483
- if col in timecols:
484
- if col == "MJD":
485
- data = convert_mjd_time(data)
486
- else:
487
- try:
488
- data = convert_casacore_time(data)
489
- except pd.errors.OutOfBoundsDatetime as exc:
490
- if infile.endswith("WEATHER"):
491
- logger.error(
492
- f"Exception when converting WEATHER/TIME: {exc}. TIME data: {data}"
493
- )
494
- else:
495
- raise
496
- # should also probably add INTERVAL not only TIME
497
- if col.endswith("_ID") or (
498
- infile.endswith(subts_with_time_key) and col == "TIME"
499
- ):
500
- # weather table: importasdm produces very wrong "-1" ANTENNA_ID
501
- if (
502
- infile.endswith("WEATHER")
503
- and col == "ANTENNA_ID"
504
- and "NS_WX_STATION_ID" in colnames
505
- ):
506
- data = np.stack([row["NS_WX_STATION_ID"] for row in trows])
507
-
508
- mcoords[col.lower()] = xr.DataArray(
509
- data,
510
- dims=[
511
- f"{dim_prefix}_{di}_{ds}"
512
- for di, ds in enumerate(np.array(data).shape)
513
- ],
514
- )
515
- else:
516
- mvars[col.lower()] = xr.DataArray(
517
- data,
518
- dims=[
519
- f"{dim_prefix}_{di}_{ds}"
520
- for di, ds in enumerate(np.array(data).shape)
521
- ],
786
+
787
+ array_type, array_data = raw_col_data_to_coords_vars(
788
+ inpath, tb_tool, col, data, timecols
789
+ )
790
+ if array_type == "coord":
791
+ mcoords[col.lower()] = array_data
792
+ elif array_type == "data_var":
793
+ mvars[col.lower()] = array_data
794
+
795
+ return mcoords, mvars
796
+
797
+
798
+ def load_fixed_size_cols(
799
+ inpath: str,
800
+ tb_tool: tables.table,
801
+ timecols: Union[List[str], None],
802
+ ignore: Union[List[str], None],
803
+ ) -> Tuple[Dict[str, xr.Dataset], Dict[str, xr.Dataset]]:
804
+ """
805
+ Loads columns into memory via the table tool getcol() function, as opposed to
806
+ load_generic_cols() which loads on a per-row basis via row().
807
+ This function is 2+ orders of magnitude faster for large tables (pointing tables with
808
+ the order of >=10^5 rows)
809
+ Prefer this function for performance reasons when all rows can be assumed to be fixed
810
+ size (even if they are of array type).
811
+ This is performance-critical for the POINTING subtable.
812
+
813
+ Parameters
814
+ ----------
815
+ inpath : str
816
+ path name of the MS
817
+ tb_tool : tables.table
818
+ table to red the columns
819
+ timecols : Union[List[str], None]
820
+ columns names to convert to datetime format
821
+ ignore : Union[List[str], None]
822
+ list of column names to skip and not try to load.
823
+
824
+ Returns
825
+ -------
826
+ Tuple[Dict[str, xr.Dataset], Dict[str, xr.Dataset]]
827
+ dict of coordinates and dict of data vars, ready to construct an xr.Dataset
828
+ """
829
+
830
+ loadable_cols = find_loadable_filled_cols(tb_tool, ignore)
831
+
832
+ # Produce coords and data vars from MS columns
833
+ mcoords, mvars = {}, {}
834
+ for col in loadable_cols.keys():
835
+ try:
836
+ data = tb_tool.getcol(col)
837
+ if isinstance(data, dict):
838
+ data = data["array"].reshape(data["shape"])
839
+ except Exception as exc:
840
+ logger.warning(
841
+ f"{inpath}: failed to load data with getcol for column {col}: {exc}"
522
842
  )
843
+ data = []
844
+
845
+ if len(data) == 0:
846
+ continue
847
+
848
+ array_type, array_data = raw_col_data_to_coords_vars(
849
+ inpath, tb_tool, col, data, timecols
850
+ )
851
+ if array_type == "coord":
852
+ mcoords[col.lower()] = array_data
853
+ elif array_type == "data_var":
854
+ mvars[col.lower()] = array_data
523
855
 
524
856
  return mcoords, mvars
525
857
 
526
858
 
859
+ def find_loadable_filled_cols(
860
+ tb_tool: tables.table, ignore: Union[List[str], None]
861
+ ) -> Dict:
862
+ """
863
+ For a table, finds the columns that are:
864
+ - loadable = not of record type, and not to be ignored
865
+ - filled = the column cells are populated.
866
+
867
+ Parameters
868
+ ----------
869
+ tb_tool : tables.table
870
+ table to red the columns
871
+ ignore : Union[List[str], None]
872
+ list of column names to skip and not try to load.
873
+
874
+ Returns
875
+ -------
876
+ Dict
877
+ dict of {column name => first cell} for columns that can/should be loaded
878
+ """
879
+
880
+ colnames = tb_tool.colnames()
881
+ # columns that are not populated are skipped. record columns are not supported
882
+ loadable_cols = {
883
+ col: tb_tool.getcell(col, 0)
884
+ for col in colnames
885
+ if (col not in ignore)
886
+ and (tb_tool.iscelldefined(col, 0))
887
+ and tb_tool.coldatatype(col) != "record"
888
+ }
889
+ return loadable_cols
890
+
891
+
892
+ def raw_col_data_to_coords_vars(
893
+ inpath: str,
894
+ tb_tool: tables.table,
895
+ col: str,
896
+ data: np.ndarray,
897
+ timecols: Union[List[str], None],
898
+ ) -> Tuple[str, xr.DataArray]:
899
+ """
900
+ From a raw np array of data (freshly loaded from a table column), prepares either a
901
+ coord or a data_var ready to be added to an xr.Dataset
902
+
903
+ Parameters
904
+ ----------
905
+ inpath: str
906
+ input table path
907
+ tb_tool: tables.table :
908
+ table tool being used to load data
909
+ col: str :
910
+ column
911
+ data: np.ndarray :
912
+ column data
913
+ timecols: Union[List[str], None]
914
+ columns to be treated as TIME-related
915
+
916
+ Returns
917
+ -------
918
+ Tuple[str, xr.DataArray]
919
+ array type string (whether this column is a 'coord' or a 'data_var') + DataArray
920
+ with column data/coord values ready to be added to the table xds
921
+ """
922
+
923
+ # Almost sure that when TIME is present (in a standard MS subt) it
924
+ # is part of the key. But what about non-std subtables, ASDM subts?
925
+ subts_with_time_key = (
926
+ "FEED",
927
+ "FLAG_CMD",
928
+ "FREQ_OFFSET",
929
+ "HISTORY",
930
+ "POINTING",
931
+ "SOURCE",
932
+ "SYSCAL",
933
+ "WEATHER",
934
+ )
935
+ dim_prefix = "dim"
936
+
937
+ if col in timecols:
938
+ if col == "MJD":
939
+ data = convert_mjd_time(data).astype("float64") / 1e9
940
+ else:
941
+ try:
942
+ data = convert_casacore_time(data)
943
+ except pd.errors.OutOfBoundsDatetime as exc:
944
+ if inpath.endswith("WEATHER"):
945
+ # intentionally not calling logging.exception
946
+ logger.warning(
947
+ f"Exception when converting WEATHER/TIME: {exc}. TIME data: {data}"
948
+ )
949
+ else:
950
+ raise
951
+ # should also probably add INTERVAL not only TIME
952
+ if col.endswith("_ID") or (inpath.endswith(subts_with_time_key) and col == "TIME"):
953
+ # weather table: importasdm produces very wrong "-1" ANTENNA_ID
954
+ if (
955
+ inpath.endswith("WEATHER")
956
+ and col == "ANTENNA_ID"
957
+ and "NS_WX_STATION_ID" in tb_tool.colnames()
958
+ ):
959
+ data = tb_tool.getcol("NS_WX_STATION_ID")
960
+
961
+ array_type = "coord"
962
+ array_data = xr.DataArray(
963
+ data,
964
+ dims=[
965
+ f"{dim_prefix}_{di}_{ds}" for di, ds in enumerate(np.array(data).shape)
966
+ ],
967
+ )
968
+ else:
969
+ array_type = "data_var"
970
+ array_data = xr.DataArray(
971
+ data,
972
+ dims=[
973
+ f"{dim_prefix}_{di}_{ds}" for di, ds in enumerate(np.array(data).shape)
974
+ ],
975
+ )
976
+
977
+ return array_type, array_data
978
+
979
+
980
+ def handle_variable_col_issues(
981
+ inpath: str, col: str, col_cells: dict, trows: tables.tablerow
982
+ ) -> np.ndarray:
983
+ """
984
+ load variable-size array columns, padding with nans wherever
985
+ needed. This happens for example often in the SPECTRAL_WINDOW
986
+ table (CHAN_WIDTH, EFFECTIVE_BW, etc.).
987
+ Also handle exceptions gracefully when trying to load the rows.
988
+
989
+ Parameters
990
+ ----------
991
+ inpath : str
992
+ path name of the MS
993
+ col : str
994
+ column being loaded
995
+ col_cells : dict
996
+ {col: cell} values
997
+ trows : tables.tablerow
998
+ rows from a table as loaded by tables.row()
999
+
1000
+ Returns
1001
+ -------
1002
+ np.ndarray
1003
+ array with column values (possibly padded if rows vary in size)
1004
+ """
1005
+
1006
+ # Optional cols known to sometimes have inconsistent values
1007
+ known_misbehaving_cols = ["ASSOC_NATURE"]
1008
+
1009
+ mshape = np.array(max([np.array(row[col]).shape for row in trows]))
1010
+ try:
1011
+ pad_nan = get_pad_nan(col_cells[col])
1012
+
1013
+ # TODO
1014
+ # benchmark np.stack() performance
1015
+ data = np.stack(
1016
+ [
1017
+ np.pad(
1018
+ (
1019
+ row[col]
1020
+ if len(row[col]) > 0
1021
+ else np.array(row[col]).reshape(np.arange(len(mshape)) * 0)
1022
+ ),
1023
+ [(0, ss) for ss in mshape - np.array(row[col]).shape],
1024
+ "constant",
1025
+ constant_values=pad_nan,
1026
+ )
1027
+ for row in trows
1028
+ ]
1029
+ )
1030
+ except Exception as exc:
1031
+ msg = f"{inpath}: failed to load data for column {col}: {exc}"
1032
+ if col in known_misbehaving_cols:
1033
+ logger.debug(msg)
1034
+ else:
1035
+ logger.warning(msg)
1036
+ data = np.empty(0)
1037
+
1038
+ return data
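A toy, self-contained version (row values made up) of the nan-padding that handle_variable_col_issues() applies to variable-size cells:

    import numpy as np

    rows = [np.array([1.0, 2.0]), np.array([3.0]), np.array([])]      # cells of varying length
    mshape = np.array(max(np.array(r).shape for r in rows))           # target shape: (2,)
    data = np.stack(
        [
            np.pad(
                r if len(r) > 0 else np.array(r).reshape(np.arange(len(mshape)) * 0),
                [(0, s) for s in mshape - np.array(r).shape],
                "constant",
                constant_values=np.nan,
            )
            for r in rows
        ]
    )
    # data -> [[1., 2.], [3., nan], [nan, nan]]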
1039
+
1040
+
527
1041
  def read_flat_col_chunk(infile, col, cshape, ridxs, cstart, pstart) -> np.ndarray:
528
1042
  """
529
1043
  Extract data chunk for each table col, this is fed to dask.delayed
1044
+
1045
+ Parameters
1046
+ ----------
1047
+ infile :
1048
+
1049
+ col :
1050
+
1051
+ cshape :
1052
+
1053
+ ridxs :
1054
+
1055
+ cstart :
1056
+
1057
+ pstart :
1058
+
1059
+ Returns
1060
+ -------
1061
+ np.ndarray
530
1062
  """
531
1063
 
532
1064
  with open_table_ro(infile) as tb_tool:
@@ -588,6 +1120,30 @@ def read_col_chunk(
588
1120
  ) -> np.ndarray:
589
1121
  """
590
1122
  Function to perform delayed reads from table columns.
1123
+
1124
+ Parameters
1125
+ ----------
1126
+ infile : str
1127
+
1128
+ ts_taql : str
1129
+
1130
+ col : str
1131
+
1132
+ cshape : Tuple[int]
1133
+
1134
+ tidxs : np.ndarray
1135
+
1136
+ bidxs : np.ndarray
1137
+
1138
+ didxs : np.ndarray
1139
+
1140
+ d1: Tuple[int, int]
1141
+
1142
+ d2: Tuple[int, int]
1143
+
1144
+ Returns
1145
+ -------
1146
+ np.ndarray
591
1147
  """
592
1148
  # TODO: consider calling load_col_chunk() from inside the withs
593
1149
  # for read_delayed_pointing_table and read_expanded_main_table
@@ -600,8 +1156,13 @@ def read_col_chunk(
600
1156
  elif len(cshape) == 4: # DATA and FLAG
601
1157
  data = query.getcolslice(col, (d1[0], d2[0]), (d1[1], d2[1]), [], 0, -1)
602
1158
 
603
- # full data is the maximum of the data shape and chunk shape dimensions
604
- fulldata = np.full(cshape, np.nan, dtype=data.dtype)
1159
+ policy = "warn"
1160
+ if np.issubdtype(data.dtype, np.integer):
1161
+ policy = "ignore"
1162
+ with np.errstate(invalid=policy):
1163
+ # full data is the maximum of the data shape and chunk shape dimensions
1164
+ fulldata = np.full(cshape, np.nan, dtype=data.dtype)
1165
+
605
1166
  if len(didxs) > 0:
606
1167
  fulldata[tidxs[didxs], bidxs[didxs]] = data[didxs]
607
1168
 
@@ -609,71 +1170,85 @@ def read_col_chunk(
609
1170
 
610
1171
 
611
1172
  def read_col_conversion(
612
- tb_tool,
1173
+ tb_tool: tables.table,
613
1174
  col: str,
614
1175
  cshape: Tuple[int],
615
1176
  tidxs: np.ndarray,
616
1177
  bidxs: np.ndarray,
617
- ):
1178
+ ) -> np.ndarray:
618
1179
  """
619
1180
  Function to perform delayed reads from table columns when converting
620
1181
  (no need for didxs)
1182
+
1183
+ Parameters
1184
+ ----------
1185
+ tb_tool : tables.table
1186
+
1187
+ col : str
1188
+
1189
+ cshape : Tuple[int]
1190
+
1191
+ tidxs : np.ndarray
1192
+
1193
+ bidxs : np.ndarray
1194
+
1195
+ Returns
1196
+ -------
1197
+ np.ndarray
621
1198
  """
622
1199
 
623
1200
  # Workaround for https://github.com/casacore/python-casacore/issues/130
624
1201
  # WARNING: Assumes tb_tool is a single measurement set not an MMS.
625
- # WARNING: Assumes the num_frequencies * num_polarisations > 2**29. If false,
1202
+ # WARNING: Assumes the num_frequencies * num_polarizations < 2**29. If false,
626
1203
  # https://github.com/casacore/python-casacore/issues/130 isn't mitigated.
627
1204
 
628
1205
  # Use casacore to get the shape of a row for this column
629
-
630
1206
  #################################################################################
631
1207
 
632
1208
  # Get the total number of rows in the base measurement set
633
1209
  nrows_total = tb_tool.nrows()
634
1210
 
635
1211
  # getcolshapestring() only works on columns where a row element is an
636
- # array (ie fails for TIME, etc)
637
- # Assumes RuntimeError is because the column is a scalar
1212
+ # array ie. fails for TIME
1213
+ # Assumes the RuntimeError is because the column is a scalar
638
1214
  try:
639
-
640
1215
  shape_string = tb_tool.getcolshapestring(col)[0]
1216
+ # Convert `shape_string` into a tuple that numpy understands
641
1217
  extra_dimensions = tuple(
642
1218
  [
643
1219
  int(idx)
644
1220
  for idx in shape_string.replace("[", "").replace("]", "").split(", ")
645
1221
  ]
646
1222
  )
647
- full_shape = tuple(
648
- [nrows_total]
649
- + [
650
- int(idx)
651
- for idx in shape_string.replace("[", "").replace("]", "").split(", ")
652
- ]
653
- )
654
1223
  except RuntimeError:
655
1224
  extra_dimensions = ()
656
- full_shape = (nrows_total,)
657
1225
 
658
1226
  #################################################################################
659
1227
 
660
1228
  # Get dtype of the column. Only read first row from disk
661
1229
  col_dtype = np.array(tb_tool.col(col)[0]).dtype
662
1230
 
663
- # Construct the numpy array to populate with data
664
- data = np.empty(full_shape, dtype=col_dtype)
1231
+ # Construct a numpy array to populate. `data` has shape (n_times, n_baselines, n_frequencies, n_polarizations)
1232
+ data = np.full(cshape + extra_dimensions, np.nan, dtype=col_dtype)
665
1233
 
666
1234
  # Use built-in casacore table iterator to populate the data column by unique times.
667
1235
  start_row = 0
668
1236
  for ts in tb_tool.iter("TIME", sort=False):
669
1237
  num_rows = ts.nrows()
670
- # Note don't use getcol() because it's less safe. See:
1238
+
1239
+ # Create small temporary array to store the partial column
1240
+ tmp_arr = np.full((num_rows,) + extra_dimensions, np.nan, dtype=col_dtype)
1241
+
1242
+ # Note we don't use `getcol()` because it's less safe. See:
671
1243
  # https://github.com/casacore/python-casacore/issues/130#issuecomment-463202373
672
- ts.getcolnp(col, data[start_row : start_row + num_rows])
1244
+ ts.getcolnp(col, tmp_arr)
1245
+
1246
+ # Get the slice of rows contained in `tmp_arr`.
1247
+ # Used to get the relevant integer indexes from `tidxs` and `bidxs`
1248
+ tmp_slice = slice(start_row, start_row + num_rows)
1249
+
1250
+ # Copy `tmp_arr` into the correct elements of `data`
1251
+ data[tidxs[tmp_slice], bidxs[tmp_slice]] = tmp_arr
673
1252
  start_row += num_rows
674
1253
 
675
- # TODO
676
- # Can we return a view of `data` instead of copying?
677
- fulldata = np.full(cshape + extra_dimensions, np.nan, dtype=col_dtype)
678
- fulldata[tidxs, bidxs] = data
679
- return fulldata
1254
+ return data
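Finally, a minimal sketch (index arrays invented) of the scatter used above to place each MS row at its (time, baseline) cell of the output array:

    import numpy as np

    cshape = (2, 3)                         # (n_times, n_baselines)
    tidxs = np.array([0, 0, 1, 1])          # time index of each MS row
    bidxs = np.array([0, 2, 1, 2])          # baseline index of each MS row
    rows = np.array([1.0, 2.0, 3.0, 4.0])   # per-row values loaded for one column
    data = np.full(cshape, np.nan)
    data[tidxs, bidxs] = rows               # each row lands at its (time, baseline) cell
    # data -> [[1., nan, 2.], [nan, 3., 4.]]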