xarray-ms 0.2.4__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/PKG-INFO +1 -1
  2. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/pyproject.toml +2 -2
  3. xarray_ms-0.2.6/xarray_ms/backend/msv2/array.py +170 -0
  4. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/backend/msv2/factories/antenna.py +4 -3
  5. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/backend/msv2/factories/correlated.py +84 -36
  6. xarray_ms-0.2.6/xarray_ms/backend/msv2/imputation.py +95 -0
  7. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/backend/msv2/structure.py +45 -22
  8. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/errors.py +9 -0
  9. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/testing/simulator.py +15 -16
  10. xarray_ms-0.2.4/xarray_ms/backend/msv2/array.py +0 -87
  11. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/LICENSE +0 -0
  12. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/README.rst +0 -0
  13. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/__init__.py +0 -0
  14. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/backend/msv2/encoders.py +0 -0
  15. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/backend/msv2/entrypoint.py +0 -0
  16. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/backend/msv2/entrypoint_utils.py +0 -0
  17. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/backend/msv2/factories/__init__.py +0 -0
  18. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/backend/msv2/partition.py +0 -0
  19. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/casa_types.py +0 -0
  20. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/msv4_types.py +0 -0
  21. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/multiton.py +0 -0
  22. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/query.py +0 -0
  23. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/testing/__init__.py +0 -0
  24. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/testing/utils.py +0 -0
  25. {xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/utils.py +0 -0

{xarray_ms-0.2.4 → xarray_ms-0.2.6}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: xarray-ms
- Version: 0.2.4
+ Version: 0.2.6
  Summary: xarray MSv4 views over MSv2 Measurement Sets
  Author: Simon Perkins
  Author-email: simon.perkins@gmail.com

{xarray_ms-0.2.4 → xarray_ms-0.2.6}/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "xarray-ms"
- version = "0.2.4"
+ version = "0.2.6"
  description = "xarray MSv4 views over MSv2 Measurement Sets"
  authors = ["Simon Perkins <simon.perkins@gmail.com>"]
  readme = "README.rst"
@@ -58,7 +58,7 @@ build-backend = "poetry.core.masonry.api"
  # github_url = "https://github.com/<user or organization>/<project>/"

  [tool.tbump.version]
- current = "0.2.4"
+ current = "0.2.6"

  # Example of a semver regexp.
  # Make sure this matches current_version before

xarray_ms-0.2.6/xarray_ms/backend/msv2/array.py
@@ -0,0 +1,170 @@
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Any, Callable, Tuple
+
+ import numpy as np
+ from xarray.backends import BackendArray
+ from xarray.core.indexing import IndexingSupport, explicit_indexing_adapter
+
+ if TYPE_CHECKING:
+   import numpy.typing as npt
+
+   from xarray_ms.backend.msv2.structure import MSv2StructureFactory, PartitionKeyT
+   from xarray_ms.multiton import Multiton
+
+   TransformerT = Callable[[npt.NDArray], npt.NDArray]
+
+
+ def slice_length(s: npt.NDArray | slice, max_len) -> int:
+   if isinstance(s, np.ndarray):
+     if s.ndim != 1:
+       raise NotImplementedError("Slicing with non-1D numpy arrays")
+     return len(s)
+
+   start, stop, step = s.indices(max_len)
+   if step != 1:
+     raise NotImplementedError(f"Slicing with steps {s} other than 1 not supported")
+   return stop - start
+
+
+ class MSv2Array(BackendArray):
+   """Base MSv2Array backend array class,
+   containing required shape and data type"""
+
+   __slots__ = ("shape", "dtype")
+
+   shape: Tuple[int, ...]
+   dtype: npt.DTypeLike
+
+   def __init__(self, shape: Tuple[int, ...], dtype: npt.DTypeLike):
+     self.shape = shape
+     self.dtype = dtype
+
+   def __getitem__(self, key) -> npt.NDArray:
+     raise NotImplementedError
+
+   @property
+   def transform(self) -> TransformerT | None:
+     raise NotImplementedError
+
+   @transform.setter
+   def transform(self, value: TransformerT) -> None:
+     raise NotImplementedError
+
+
+ class MainMSv2Array(MSv2Array):
+   """Backend array containing functionality for reading an MSv2 column
+   from the MAIN table. Columns are assumed to have ("time", "baseline_id")
+   as the first dimensions. These are mapped onto the "row" dimension
+   via the partition row map"""
+
+   __slots__ = (
+     "_table_factory",
+     "_structure_factory",
+     "_partition",
+     "_column",
+     "_default",
+     "_transform",
+   )
+
+   _table_factory: Multiton
+   _structure_factory: MSv2StructureFactory
+   _partition: PartitionKeyT
+   _column: str
+   _default: Any | None
+   _transform: TransformerT | None
+
+   def __init__(
+     self,
+     table_factory: Multiton,
+     structure_factory: MSv2StructureFactory,
+     partition: PartitionKeyT,
+     column: str,
+     shape: Tuple[int, ...],
+     dtype: npt.DTypeLike,
+     default: Any | None = None,
+     transform: TransformerT | None = None,
+   ):
+     super().__init__(shape, dtype)
+     self._table_factory = table_factory
+     self._structure_factory = structure_factory
+     self._partition = partition
+     self._column = column
+     self._default = default
+     self._transform = transform
+
+     assert len(shape) >= 2, "(time, baseline_ids) required"
+
+   def __getitem__(self, key) -> npt.NDArray:
+     return explicit_indexing_adapter(
+       key, self.shape, IndexingSupport.OUTER, self._getitem
+     )
+
+   def _getitem(self, key) -> npt.NDArray:
+     assert len(key) == len(self.shape)
+     expected_shape = tuple(slice_length(k, s) for k, s in zip(key, self.shape))
+     # Map the (time, baseline_id) coordinates onto row indices
+     rows = self._structure_factory.instance[self._partition].row_map[key[:2]]
+     row_key = (rows.ravel(),) + key[2:]
+     row_shape = (rows.size,) + expected_shape[2:]
+     result = np.full(row_shape, self._default, dtype=self.dtype)
+     self._table_factory.instance.getcol(self._column, row_key, result)
+     result = result.reshape(rows.shape + expected_shape[2:])
+     return self._transform(result) if self._transform else result
+
+   @property
+   def transform(self) -> TransformerT | None:
+     return self._transform
+
+   @transform.setter
+   def transform(self, value: TransformerT) -> None:
+     self._transform = value
+
+
+ class BroadcastMSv2Array(MSv2Array):
+   """Broadcasts a MAIN table MSv2 Column up to an
+   MSv4 column. This can be inefficient for example,
+   if multiple frequency chunks are read for the same
+   ("time", "baseline_id") range as the same
+   low resolution data can be read multiple times.
+
+   However, this should be no worse than reading the
+   data for a full resolution column.
+
+   This is primarily useful for falling back to the
+   WEIGHT column when WEIGHT_SPECTRUM is missing, or
+   FLAG_ROW if FLAG is missing.
+   """
+
+   __slots__ = ("_low_res_array", "_low_res_index")
+
+   _low_res_array: MSv2Array
+   _low_res_index: Tuple[slice | None, ...]
+   shape: Tuple[int, ...]
+
+   def __init__(
+     self,
+     low_res_array: MSv2Array,
+     low_res_index: Tuple[slice | None, ...],
+     high_res_shape: Tuple[int, ...],
+   ):
+     self._low_res_array = low_res_array
+     self._low_res_index = low_res_index
+     self.shape = high_res_shape
+
+   @property
+   def dtype(self):
+     return self._low_res_array.dtype
+
+   @property
+   def transform(self) -> TransformerT | None:
+     return self._low_res_array.transform
+
+   @transform.setter
+   def transform(self, value: TransformerT) -> None:
+     self._low_res_array.transform = value
+
+   def __getitem__(self, key) -> npt.NDArray:
+     low_res_data = self._low_res_array.__getitem__(key)
+     low_res_data = low_res_data[self._low_res_index]
+     return np.broadcast_to(low_res_data, self.shape)
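
A note on the mechanism above: MainMSv2Array._getitem gathers MAIN table rows
through the partition row map, with -1 marking (time, baseline_id) slots that
have no backing row. A minimal numpy sketch of that gather, with the arcae
getcol call stubbed out and all values invented:

import numpy as np

# Hypothetical partition: 3 times x 2 baselines mapped onto 5 MAIN rows,
# with one (time, baseline_id) slot absent (-1)
row_map = np.array([[0, 1], [2, -1], [3, 4]])

key = (slice(0, 2), slice(0, 2))  # outer indices over (time, baseline_id)
rows = row_map[key]               # shape (2, 2)
row_key = rows.ravel()            # row numbers to request from the MAIN table

# Stand-in for table.getcol(column, row_key, result): pre-fill the default,
# then overwrite only the rows that exist
result = np.full(row_key.shape, np.nan)
valid = row_key >= 0
result[valid] = row_key[valid] * 10.0  # fake column data keyed on row number
print(result.reshape(rows.shape))      # the absent slot keeps the default
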

{xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/backend/msv2/factories/antenna.py
@@ -3,6 +3,7 @@ from typing import Dict, Mapping
  import numpy as np
  from xarray import Dataset, Variable

+ from xarray_ms.backend.msv2.imputation import maybe_impute_observation_table
  from xarray_ms.backend.msv2.structure import MSv2StructureFactory, PartitionKeyT
  from xarray_ms.errors import InvalidMeasurementSet
  from xarray_ms.multiton import Multiton
@@ -26,13 +27,13 @@ class AntennaDatasetFactory:
      self._subtable_factories = subtable_factories

    def get_dataset(self) -> Mapping[str, Variable]:
-     structure = self._structure_factory.instance
-     partition = structure[self._partition_key]
+     partition = self._structure_factory.instance[self._partition_key]
      ants = self._subtable_factories["ANTENNA"].instance
      feeds = self._subtable_factories["FEED"].instance
      obs = self._subtable_factories["OBSERVATION"].instance

-     telescope_name = obs["TELESCOPE_NAME"][partition.obs_id].as_py()
+     obs = maybe_impute_observation_table(obs, [partition.obs_id])
+     telescope_name = obs["TELESCOPE_NAME"][0].as_py()

      import pyarrow.compute as pac

{xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/backend/msv2/factories/correlated.py
@@ -8,12 +8,20 @@ from xarray.coding.variables import unpack_for_decoding
  from xarray.core.indexing import LazilyIndexedArray
  from xarray.core.utils import FrozenDict

- from xarray_ms.backend.msv2.array import MSv2Array
+ from xarray_ms.backend.msv2.array import (
+   BroadcastMSv2Array,
+   MainMSv2Array,
+   MSv2Array,
+ )
  from xarray_ms.backend.msv2.encoders import (
    CasaCoder,
    QuantityCoder,
    TimeCoder,
  )
+ from xarray_ms.backend.msv2.imputation import (
+   maybe_impute_field_table,
+   maybe_impute_observation_table,
+ )
  from xarray_ms.backend.msv2.structure import MSv2StructureFactory, PartitionKeyT
  from xarray_ms.casa_types import ColumnDesc, FrequencyMeasures, Polarisations
  from xarray_ms.errors import IrregularGridWarning
@@ -26,6 +34,7 @@ class MSv2ColumnSchema:
    dims: Tuple[str, ...]
    default: Any = None
    coder: Type[CasaCoder] | None = None
+   low_res_dims: Tuple[str, ...] | None = None


  MSV4_to_MSV2_COLUMN_SCHEMAS = {
@@ -34,10 +43,20 @@ MSV4_to_MSV2_COLUMN_SCHEMAS = {
    "TIME_CENTROID": MSv2ColumnSchema("TIME_CENTROID", (), np.nan, TimeCoder),
    "EFFECTIVE_INTEGRATION_TIME": MSv2ColumnSchema("EXPOSURE", (), np.nan, QuantityCoder),
    "UVW": MSv2ColumnSchema("UVW", ("uvw_label",), np.nan, None),
+   "FLAG_ROW": MSv2ColumnSchema(
+     "FLAG_ROW", ("frequency", "polarization"), 1, None, low_res_dims=()
+   ),
    "FLAG": MSv2ColumnSchema("FLAG", ("frequency", "polarization"), 1, None),
    "VISIBILITY": MSv2ColumnSchema(
      "DATA", ("frequency", "polarization"), np.nan + np.nan * 1j, None
    ),
+   "WEIGHT_ROW": MSv2ColumnSchema(
+     "WEIGHT",
+     ("frequency", "polarization"),
+     np.nan,
+     None,
+     low_res_dims=("polarization",),
+   ),
    "WEIGHT": MSv2ColumnSchema(
      "WEIGHT_SPECTRUM", ("frequency", "polarization"), np.nan, None
    ),
@@ -78,11 +97,8 @@ class CorrelatedDatasetFactory:
      c: ColumnDesc.from_descriptor(c, ms_table_desc) for c in ms.columns()
    }

- def _variable_from_column(self, column: str) -> Variable:
+ def _variable_from_column(self, column: str, dim_sizes: Dict[str, int]) -> Variable:
    """Derive an xarray Variable from the MSv2 column descriptor and schemas"""
-   structure = self._structure_factory.instance
-   partition = structure[self._partition_key]
-
    try:
      schema = MSV4_to_MSV2_COLUMN_SCHEMAS[column]
    except KeyError:
@@ -93,20 +109,6 @@ class CorrelatedDatasetFactory:
    except KeyError:
      raise KeyError(f"No Column Descriptor exist for {schema.name}")

-   spw = self._subtable_factories["SPECTRAL_WINDOW"].instance
-   pol = self._subtable_factories["POLARIZATION"].instance
-
-   chan_freq = spw["CHAN_FREQ"][partition.spw_id].as_py()
-   corr_type = pol["CORR_TYPE"][partition.pol_id].as_py()
-
-   dim_sizes = {
-     "time": len(partition.time),
-     "baseline_id": partition.nbl,
-     "frequency": len(chan_freq),
-     "polarization": len(corr_type),
-     **FIXED_DIMENSION_SIZES,
-   }
-
    dims = ("time", "baseline_id") + schema.dims

    try:
@@ -116,7 +118,20 @@

    default = column_desc.dtype.type(schema.default)

-   data = MSv2Array(
+   high_res_shape = shape
+   low_res_index: Tuple[slice | None, ...] = tuple(slice(None) for _ in shape)
+
+   if schema.low_res_dims:
+     low_res_dims = ("time", "baseline_id") + schema.low_res_dims
+     high_res_shape = shape
+     try:
+       shape_map = {d: dim_sizes[d] for d in low_res_dims}
+     except KeyError as e:
+       raise KeyError(f"No dimension size found for {e.args[0]}")
+     low_res_index = tuple(slice(None) if d in shape_map else None for d in dims)
+     shape = tuple(shape_map.values())
+
+   array: MSv2Array = MainMSv2Array(
      self._ms_factory,
      self._structure_factory,
      self._partition_key,
@@ -126,7 +141,10 @@
      default,
    )

-   var = Variable(dims, data, fastpath=True)
+   if schema.low_res_dims:
+     array = BroadcastMSv2Array(array, low_res_index, high_res_shape)
+
+   var = Variable(dims, array, fastpath=True)

    # Apply any measures encoding
    if schema.coder:
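
To make the low resolution path above concrete: for WEIGHT_ROW,
schema.low_res_dims is ("polarization",), so the MSv2 WEIGHT column is read at
(time, baseline_id, polarization) resolution and low_res_index leaves a None
(np.newaxis) in the frequency slot, which BroadcastMSv2Array expands to the
full shape. A small numpy sketch with invented sizes:

import numpy as np

dims = ("time", "baseline_id", "frequency", "polarization")
dim_sizes = {"time": 10, "baseline_id": 6, "frequency": 64, "polarization": 4}
low_res_dims = ("time", "baseline_id") + ("polarization",)  # the WEIGHT_ROW case

shape_map = {d: dim_sizes[d] for d in low_res_dims}
low_res_index = tuple(slice(None) if d in shape_map else None for d in dims)
low_res_shape = tuple(shape_map.values())
high_res_shape = tuple(dim_sizes[d] for d in dims)

weight = np.random.random(low_res_shape)          # stands in for the WEIGHT read
expanded = weight[low_res_index]                  # shape (10, 6, 1, 4)
full = np.broadcast_to(expanded, high_res_shape)  # (10, 6, 64, 4), no copy
assert (full[:, :, 0, :] == full[:, :, -1, :]).all()
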
@@ -144,8 +162,7 @@
    structure = self._structure_factory.instance
    partition = structure[self._partition_key]
    ant1, ant2 = partition.antenna_pairs
-   nbl = partition.nbl
-   assert (nbl,) == ant1.shape
+   assert (partition.nbl,) == ant1.shape

    antenna = self._subtable_factories["ANTENNA"].instance
    ant_names = antenna["NAME"].to_numpy()
@@ -156,6 +173,7 @@
    pol_id = partition.pol_id
    spw = self._subtable_factories["SPECTRAL_WINDOW"].instance
    pol = self._subtable_factories["POLARIZATION"].instance
+   field = self._subtable_factories["FIELD"].instance

    chan_freq = spw["CHAN_FREQ"][spw_id].as_py()
    uchan_width = np.unique(spw["CHAN_WIDTH"][spw_id].as_py())
@@ -167,6 +185,14 @@

    corr_type = Polarisations.from_values(pol["CORR_TYPE"][pol_id].as_py()).to_str()

+   dim_sizes = {
+     "time": len(partition.time),
+     "baseline_id": partition.nbl,
+     "frequency": len(chan_freq),
+     "polarization": len(corr_type),
+     **FIXED_DIMENSION_SIZES,
+   }
+
    row_map = partition.row_map
    missing = np.count_nonzero(row_map == -1)
    if missing > 0:
@@ -181,17 +207,28 @@
      )

    data_vars = [
-     (n, self._variable_from_column(n))
+     (n, self._variable_from_column(n, dim_sizes))
      for n in (
        "TIME_CENTROID",
        "EFFECTIVE_INTEGRATION_TIME",
        "UVW",
        "VISIBILITY",
-       "FLAG",
-       "WEIGHT",
      )
    ]

+   if "FLAG" in self._main_column_descs:
+     data_vars.append(("FLAG", self._variable_from_column("FLAG", dim_sizes)))
+   else:
+     data_vars.append(("FLAG", self._variable_from_column("FLAG_ROW", dim_sizes)))
+
+   if "WEIGHT_SPECTRUM" in self._main_column_descs:
+     data_vars.append(("WEIGHT", self._variable_from_column("WEIGHT", dim_sizes)))
+   else:
+     data_vars.append(("WEIGHT", self._variable_from_column("WEIGHT_ROW", dim_sizes)))
+
+   field = maybe_impute_field_table(field, partition.field_ids)
+   field_names = field.take(partition.field_ids)["NAME"].to_numpy()
+
    # Add coordinates indexing coordinates
    coordinates = [
      (
@@ -208,6 +245,12 @@
      ),
      ("polarization", (("polarization",), corr_type, None)),
      ("uvw_label", (("uvw_label",), ["u", "v", "w"], None)),
+     ("field_name", ("time", field_names, {"coordinates": "field_name"})),
+     ("scan_number", ("time", partition.scan_numbers, {"coordinates": "scan_number"})),
+     (
+       "sub_scan_number",
+       ("time", partition.sub_scan_numbers, {"coordinates": "sub_scan_number"}),
+     ),
    ]

    e = {"preferred_chunks": self._preferred_chunks} if self._preferred_chunks else None
@@ -217,8 +260,14 @@
    time_coder = TimeCoder("TIME", self._main_column_descs)

    if partition.interval.size == 1:
+     # Single unique value
      time_attrs = {"integration_time": partition.interval.item()}
+   elif np.allclose(partition.interval[:, None], partition.interval[None, :]):
+     # Tolerate some jitter in the unique values
+     time_attrs = {"integration_time": np.mean(partition.interval)}
    else:
+     # There are multiple unique interval values,
+     # a regular grid isn't possible
      warnings.warn(
        f"Missing/Multiple intervals {partition.interval} "
        f"found in partition {self._partition_key}. "
@@ -230,8 +279,11 @@
      time_attrs = {"integration_time": np.nan}
    data_vars.extend(
      [
-       ("TIME", self._variable_from_column("TIME")),
-       ("INTEGRATION_TIME", self._variable_from_column("INTEGRATION_TIME")),
+       ("TIME", self._variable_from_column("TIME", dim_sizes)),
+       (
+         "INTEGRATION_TIME",
+         self._variable_from_column("INTEGRATION_TIME", dim_sizes),
+       ),
      ]
    )

@@ -272,19 +324,15 @@
    return FrozenDict(sorted(data_vars + coordinates))

  def _observation_info(self) -> Dict[str, Any]:
-   structure = self._structure_factory.instance
-   partition = structure[self._partition_key]
+   partition = self._structure_factory.instance[self._partition_key]
    obs = self._subtable_factories["OBSERVATION"].instance
-   observer = obs["OBSERVER"][partition.obs_id].as_py()
-   project = obs["PROJECT"][partition.obs_id].as_py()
-   # TODO: A Measures conversions is needed here
-   release_date = obs["RELEASE_DATE"][partition.obs_id].as_py()  # noqa: F841
+   obs = maybe_impute_observation_table(obs, [partition.obs_id])

    return dict(
      sorted(
        {
-         "observer": observer,
-         "project": project,
+         "observer": obs["OBSERVER"][partition.obs_id].as_py(),
+         "project": obs["PROJECT"][partition.obs_id].as_py(),
        }.items()
      )
    )

xarray_ms-0.2.6/xarray_ms/backend/msv2/imputation.py
@@ -0,0 +1,95 @@
+ from __future__ import annotations
+
+ import warnings
+ from typing import TYPE_CHECKING
+
+ import numpy as np
+ import numpy.typing as npt
+
+ from xarray_ms.errors import ImputedMetadataWarning
+
+ if TYPE_CHECKING:
+   import pyarrow as pa
+
+
+ def _maybe_return_table_or_max_id(
+   table: pa.Table, table_name: str, ids: npt.NDArray[np.int32], id_column_name: str
+ ) -> pa.Table | int:
+   """Returns the existing table if a row entry exists,
+   else returns the maximum id"""
+   max_id = np.max(ids)
+
+   if max_id < len(table):
+     return table
+
+   warnings.warn(
+     f"No row exists in the {table_name} table of length {len(table)} "
+     f"for {id_column_name}={max_id}. "
+     f"Artificial metadata will be substituted.",
+     ImputedMetadataWarning,
+   )
+
+   return max_id
+
+
+ def maybe_impute_field_table(
+   field: pa.Table, field_id: npt.NDArray[np.int32]
+ ) -> pa.Table:
+   """Generates a FIELD subtable if there are no row ids
+   associated with the given FIELD_ID values"""
+
+   import pyarrow as pa
+
+   result = _maybe_return_table_or_max_id(field, "FIELD", field_id, "FIELD_ID")
+   if isinstance(result, pa.Table):
+     return result
+
+   return pa.Table.from_pydict(
+     {
+       "NAME": np.array([f"UNKNOWN-{i}" for i in range(result + 1)], dtype=object),
+       "SOURCE_ID": np.zeros(result + 1, np.int32),
+     }
+   )
+
+
+ def maybe_impute_state_table(
+   state: pa.Table, state_id: npt.NDArray[np.int32]
+ ) -> pa.Table:
+   """Generates a STATE subtable if there are no row ids
+   associated with the given STATE_ID values"""
+   import pyarrow as pa
+
+   result = _maybe_return_table_or_max_id(state, "STATE", state_id, "STATE_ID")
+   if isinstance(result, pa.Table):
+     return result
+
+   return pa.Table.from_pydict(
+     {
+       "OBS_MODE": np.array(["UNSPECIFIED"] * (result + 1), dtype=object),
+       "SUB_SCAN": np.zeros(result + 1, np.int32),
+     }
+   )
+
+
+ def maybe_impute_observation_table(
+   observation: pa.Table, observation_id: npt.NDArray[np.int32]
+ ) -> pa.Table:
+   """Generates an OBSERVATION table if there are no row ids
+   associated with the given OBSERVATION_ID values"""
+   import pyarrow as pa
+
+   result = _maybe_return_table_or_max_id(
+     observation, "OBSERVATION", observation_id, "OBSERVATION_ID"
+   )
+   if isinstance(result, pa.Table):
+     return result
+
+   unknown = np.array(["unknown"] * (result + 1), dtype=object)
+
+   return pa.Table.from_pydict(
+     {
+       "OBSERVER": unknown,
+       "PROJECT": unknown,
+       "TELESCOPE_NAME": unknown,
+     }
+   )
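
A sketch of the imputation behaviour when the MAIN table references an
OBSERVATION_ID with no corresponding OBSERVATION row (assumes xarray-ms 0.2.6
and pyarrow are installed; the table contents are invented):

import numpy as np
import pyarrow as pa

from xarray_ms.backend.msv2.imputation import maybe_impute_observation_table

# An OBSERVATION subtable with a single row
obs = pa.Table.from_pydict(
  {"OBSERVER": ["A. Observer"], "PROJECT": ["P1"], "TELESCOPE_NAME": ["MeerKAT"]}
)

# OBSERVATION_ID=2 has no row: an ImputedMetadataWarning is emitted and a
# replacement table covering ids 0..2 is returned instead
imputed = maybe_impute_observation_table(obs, np.array([2], dtype=np.int32))
print(imputed["TELESCOPE_NAME"].to_pylist())  # ['unknown', 'unknown', 'unknown']
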

{xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/backend/msv2/structure.py
@@ -27,6 +27,10 @@ import pyarrow as pa
  from arcae.lib.arrow_tables import Table
  from cacheout import Cache

+ from xarray_ms.backend.msv2.imputation import (
+   maybe_impute_field_table,
+   maybe_impute_state_table,
+ )
  from xarray_ms.backend.msv2.partition import PartitionKeyT, TablePartitioner
  from xarray_ms.errors import (
    InvalidMeasurementSet,
@@ -170,16 +174,16 @@ class PartitionData:
    spw_id: int  # unique from DATA_DESC_ID
    pol_id: int  # unique from DATA_DESC_ID
    # Multiple values per partition
-   antenna_ids: List[int]
-   feed_ids: List[int]
-   field_ids: List[int]
-   state_ids: List[int]
-   scan_numbers: List[int]
+   antenna_ids: npt.NDArray[np.int32]
+   feed_ids: npt.NDArray[np.int32]
+   field_ids: npt.NDArray[np.int32]
+   state_ids: npt.NDArray[np.int32]
+   scan_numbers: npt.NDArray[np.int32]
    # FIELD subtable
-   source_ids: List[int]
+   source_ids: npt.NDArray[np.int32]
    # STATE subtable
    obs_mode: str  # unique from STATE::OBS_MODE
-   sub_scan_numbers: List[int]
+   sub_scan_numbers: npt.NDArray[np.int32]

    # Row to baseline map
    row_map: npt.NDArray[np.int64]
@@ -233,7 +237,7 @@ class MSv2StructureFactory:
    _epoch: str
    _auto_corrs: bool
    _STRUCTURE_CACHE: ClassVar[Cache] = Cache(
-     maxsize=100, ttl=60, on_get=on_get_keep_alive
+     maxsize=100, ttl=5 * 60, on_get=on_get_keep_alive
    )

    def __init__(
@@ -388,6 +392,7 @@ class MSv2Structure(Mapping):
  ) -> npt.NDArray[np.int32]:
    """Constructs a SOURCE_ID array from MAIN.FIELD_ID
    broadcast against FIELD.SOURCE_ID"""
+   field = maybe_impute_field_table(field, field_id)
    field_source_id = field["SOURCE_ID"].to_numpy()
    source_id = np.empty_like(field_id)
    chunk = (len(source_id) + ncpus - 1) // ncpus
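
Chunking and the thread pool aside, the SOURCE_ID construction above is a
numpy gather: each MAIN row's FIELD_ID selects a SOURCE_ID from the (possibly
imputed) FIELD subtable. An equivalent single-threaded sketch with invented ids:

import numpy as np

field_source_id = np.array([5, 7, 9], dtype=np.int32)  # FIELD.SOURCE_ID
field_id = np.array([0, 0, 2, 1, 2], dtype=np.int32)   # MAIN.FIELD_ID per row

source_id = field_source_id[field_id]  # what the chunked loop computes
print(source_id)                       # [5 5 9 7 9]
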
@@ -411,6 +416,7 @@
  ) -> npt.NDArray[np.int32]:
    """Constructs a SUB_SCAN_NUMBER array from MAIN.STATE_ID
    broadcast against STATE.SUB_SCAN_NUMBER"""
+   state = maybe_impute_state_table(state, state_id)
    state_ssn = state["SUB_SCAN"].to_numpy()
    subscan_nr = np.empty_like(state_id)
    chunk = (len(state_id) + ncpus - 1) // ncpus
@@ -434,6 +440,8 @@
  ) -> Tuple[npt.NDArray[np.int32], Dict[str, List[int]]]:
    """Constructs an OBS_MODE_ID array from MAIN.STATE_ID broadcast
    against unique entries in STATE.OBS_MODE"""
+
+   state = maybe_impute_state_table(state, state_id)
    obs_mode = state["OBS_MODE"].to_numpy()

    # Map unique observation modes to state_ids
@@ -637,14 +645,24 @@
      """Return the group that the subtable column should be assigned to"""
      return partition_columns if s in subtable_columns else other_columns

-   def get_uid_column(column, dkey, ids) -> List[Any]:
+   def get_uid_column(column, dkey, ids) -> npt.NDArray:
      """Get the unique values for the given column, preferably from the
      partition key or failing that, from `ids`. Generally should be used with
      ID columns"""
      try:
-       return [dkey[column]]
+       return np.array([dkey[column]])
+     except KeyError:
+       return self.par_unique(pool, ncpus, ids)
+
+   def time_coord(column, dkey, ids, utime, time_ids) -> npt.NDArray:
+     try:
+       value = dkey[column]
      except KeyError:
-       return self.par_unique(pool, ncpus, ids).tolist()
+       result = np.empty(utime.shape, dtype=ids.dtype)
+       result[time_ids] = ids
+       return result
+     else:
+       return np.full(utime.shape, value)

    # Broadcast and add FIELD.SOURCE_ID column
    field_id = arrow_table["FIELD_ID"].to_numpy()
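
time_coord above scatters a per-row column onto the unique-time grid using the
inverse indices from np.unique, or simply repeats the value when the partition
key already fixes it. A compact illustration with invented rows:

import numpy as np

time = np.array([10.0, 10.0, 20.0, 20.0, 30.0])   # MAIN.TIME per row
scan = np.array([1, 1, 1, 2, 2], dtype=np.int32)  # MAIN.SCAN_NUMBER per row

utime, time_ids = np.unique(time, return_inverse=True)

# Scatter row values onto the unique time grid; later rows win any ties
result = np.empty(utime.shape, dtype=scan.dtype)
result[time_ids] = scan
print(utime)   # [10. 20. 30.]
print(result)  # [1 2 2], one scan number per unique timestep
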
@@ -696,21 +714,25 @@
    antenna2 = partition["ANTENNA2"]
    interval = partition["INTERVAL"]
    rows = partition["row"]
-   chunk = (len(rows) + ncpus - 1) // ncpus
+
+   # Unique sorting/other column values
+   utime, time_ids = self.par_unique(
+     pool, ncpus, partition["TIME"], return_inverse=True
+   )

    # Unique partition key values
-   ufield_ids = get_uid_column("FIELD_ID", dkey, partition["FIELD_ID"])
-   usubscan_nrs = get_uid_column(
-     "SUB_SCAN_NUMBER", dkey, partition["SUB_SCAN_NUMBER"]
+   ufield_ids = time_coord(
+     "FIELD_ID", dkey, partition["FIELD_ID"], utime, time_ids
+   )
+   usubscan_nrs = time_coord(
+     "SUB_SCAN_NUMBER", dkey, partition["SUB_SCAN_NUMBER"], utime, time_ids
+   )
+   uscan_nrs = time_coord(
+     "SCAN_NUMBER", dkey, partition["SCAN_NUMBER"], utime, time_ids
    )
-   uscan_nrs = get_uid_column("SCAN_NUMBER", dkey, partition["SCAN_NUMBER"])
    ustate_ids = get_uid_column("STATE_ID", dkey, partition["STATE_ID"])
    usource_ids = get_uid_column("SOURCE_ID", dkey, partition["SOURCE_ID"])

-   # Unique sorting/other column values
-   utime, time_ids = self.par_unique(
-     pool, ncpus, partition["TIME"], return_inverse=True
-   )
    uantenna1 = self.par_unique(pool, ncpus, antenna1)
    uantenna2 = self.par_unique(pool, ncpus, antenna2)
    uantennas = np.union1d(uantenna1, uantenna2)
@@ -730,6 +752,7 @@

    na = len(feed_antennas)
    nbl = nr_of_baselines(na, auto_corrs)
+   chunk = (len(rows) + ncpus - 1) // ncpus

    # Populate row map and interval grids
    row_map = np.full(utime.size * nbl, -1, dtype=np.int64)
@@ -794,8 +817,8 @@
      obs_id=obs_id,
      spw_id=spw_id,
      pol_id=pol_id,
-     antenna_ids=feed_antennas.tolist(),
-     feed_ids=ufeeds.tolist(),
+     antenna_ids=feed_antennas,
+     feed_ids=ufeeds,
      field_ids=ufield_ids,
      scan_numbers=uscan_nrs,
      source_ids=usource_ids,

{xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/errors.py
@@ -3,6 +3,15 @@ class IrregularGridWarning(UserWarning):
  with each timestep are not homogenous"""


+ class MissingMetadataWarning(UserWarning):
+   """Warning raised when metadata is missing"""
+
+
+ class ImputedMetadataWarning(MissingMetadataWarning):
+   """Warning raised when metadata is imputed
+   if the original metadata is missing"""
+
+
  class InvalidMeasurementSet(ValueError):
    """Raised when the Measurement Set foreign key indexing is invalid"""

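Because ImputedMetadataWarning subclasses MissingMetadataWarning, a single
filter on the base class silences both. A standalone sketch (the classes are
redeclared locally so the snippet runs without the package):

import warnings

class MissingMetadataWarning(UserWarning):
  """Warning raised when metadata is missing"""

class ImputedMetadataWarning(MissingMetadataWarning):
  """Warning raised when metadata is imputed"""

with warnings.catch_warnings():
  # Ignoring the base class also ignores the imputation subclass
  warnings.simplefilter("ignore", MissingMetadataWarning)
  warnings.warn("substituted metadata", ImputedMetadataWarning)
print("warning suppressed")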

{xarray_ms-0.2.4 → xarray_ms-0.2.6}/xarray_ms/testing/simulator.py
@@ -24,8 +24,8 @@ FIRST_FEB_2023_MJDS = 2459976.50000 * 86400
  # Default simulation parameters
  DEFAULT_SIM_PARAMS = {"ntime": 5, "data_description": [(8, ["XX", "XY", "YX", "YY"])]}

- # Additional Columns to add
- ADDITIONAL_COLUMNS = {
+ # Standard DATA Columns
+ STANDARD_DATA_COLUMNS = {
    "DATA": {
      "_c_order": True,
      "comment": "DATA column",
@@ -91,6 +91,8 @@ class PartitionDescriptor:

  DDIDArgType = List[Tuple[npt.NDArray[np.float64], List[str]]]
  PartitionDataType = Dict[str, Tuple[Tuple[str, ...], npt.NDArray]]
+ ChunkDescriptorTransformerT = Callable[[PartitionDescriptor], PartitionDescriptor]
+ DataTransformerT = Callable[[PartitionDescriptor, PartitionDataType], PartitionDataType]


  class MSStructureSimulator:
@@ -113,12 +115,11 @@
    partition_names: List[str]
    partition_indices: npt.NDArray[np.int32]
    simulate_data: bool
+   table_desc: Dict[str, Any]
    model: Dict[str, Any]
    data_description: DataDescription
-   transform_desc: Callable[[PartitionDescriptor], PartitionDescriptor] | None
-   transform_data: (
-     Callable[[PartitionDescriptor, PartitionDataType], PartitionDataType] | None
-   )
+   transform_chunk_desc: ChunkDescriptorTransformerT | None
+   transform_data: DataTransformerT | None

    def __init__(
      self,
@@ -134,11 +135,9 @@
      partition: Tuple[str, ...] = ("OBSERVATION_ID", "FIELD_ID", "DATA_DESC_ID"),
      auto_corrs: bool = True,
      simulate_data: bool = True,
-     transform_desc: Callable[[PartitionDescriptor], PartitionDescriptor] | None = None,
-     transform_data: Callable[
-       [PartitionDescriptor, PartitionDataType], PartitionDataType
-     ]
-     | None = None,
+     table_desc: Dict[str, Any] | None = None,
+     transform_chunk_desc: ChunkDescriptorTransformerT | None = None,
+     transform_data: DataTransformerT | None = None,
    ):
      assert ntime >= 1
      assert time_chunks > 0
@@ -194,9 +193,10 @@
    self.time_chunks = time_chunks
    self.time_start = time_start
    self.simulate_data = simulate_data
+   self.table_desc = STANDARD_DATA_COLUMNS if table_desc is None else table_desc
    self.partition_names = cbp_names
    self.partition_indices = bcbp_indices
-   self.transform_desc = transform_desc
+   self.transform_chunk_desc = transform_chunk_desc
    self.transform_data = transform_data
    self.model = {
      "data_description": self.data_description,
@@ -211,17 +211,16 @@

  def simulate_ms(self, output_ms: str) -> None:
    """Simulate data into the given measurement set name"""
-   table_desc = ADDITIONAL_COLUMNS if self.simulate_data else {}

    # Generate descriptors, create simulated data from the descriptors
    # and write simulated data to the main Measurement Set
-   with Table.ms_from_descriptor(output_ms, "MAIN", table_desc) as T:
+   with Table.ms_from_descriptor(output_ms, "MAIN", self.table_desc) as T:
      startrow = 0

      for chunk_desc in self.generate_descriptors():
        # Apply any chunk descriptor transforms
-       if self.transform_desc is not None:
-         chunk_desc = self.transform_desc(chunk_desc)
+       if self.transform_chunk_desc is not None:
+         chunk_desc = self.transform_chunk_desc(chunk_desc)

        # Generate the chunk data
        data_dict = self.data_factory(chunk_desc)

xarray_ms-0.2.4/xarray_ms/backend/msv2/array.py
@@ -1,87 +0,0 @@
- from __future__ import annotations
-
- from typing import TYPE_CHECKING, Any, Callable, Tuple
-
- import numpy as np
- from xarray.backends import BackendArray
- from xarray.core.indexing import IndexingSupport, explicit_indexing_adapter
-
- if TYPE_CHECKING:
-   import numpy.typing as npt
-
-   from xarray_ms.backend.msv2.structure import MSv2StructureFactory, PartitionKeyT
-   from xarray_ms.multiton import Multiton
-
-   TransformerT = Callable[[npt.NDArray], npt.NDArray] | None
-
-
- def slice_length(s, max_len):
-   if isinstance(s, np.ndarray):
-     if s.ndim != 1:
-       raise NotImplementedError("Slicing with non-1D numpy arrays")
-     return len(s)
-
-   start, stop, step = s.indices(max_len)
-   if step != 1:
-     raise NotImplementedError(f"Slicing with steps {s} other than 1 not supported")
-   return stop - start
-
-
- class MSv2Array(BackendArray):
-   """Backend array containing functionality for reading an MSv2 column"""
-
-   _table_factory: Multiton
-   _structure_factory: MSv2StructureFactory
-   _partition: PartitionKeyT
-   _column: str
-   _shape: Tuple[int, ...]
-   _dtype: npt.DTypeLike
-   _default: Any | None
-   _transform: TransformerT
-
-   def __init__(
-     self,
-     table_factory: Multiton,
-     structure_factory: MSv2StructureFactory,
-     partition: PartitionKeyT,
-     column: str,
-     shape: Tuple[int, ...],
-     dtype: npt.DTypeLike,
-     default: Any | None = None,
-     transform: TransformerT = None,
-   ):
-     self._table_factory = table_factory
-     self._structure_factory = structure_factory
-     self._partition = partition
-     self._column = column
-     self._default = default
-     self._transform = transform
-     self.shape = shape
-     self.dtype = np.dtype(dtype)
-
-     assert len(shape) >= 2, "(time, baseline_ids) required"
-
-   def __getitem__(self, key) -> npt.NDArray:
-     return explicit_indexing_adapter(
-       key, self.shape, IndexingSupport.OUTER, self._getitem
-     )
-
-   def _getitem(self, key) -> npt.NDArray:
-     assert len(key) == len(self.shape)
-     expected_shape = tuple(slice_length(k, s) for k, s in zip(key, self.shape))
-     # Map the (time, baseline_id) coordinates onto row indices
-     rows = self._structure_factory.instance[self._partition].row_map[key[:2]]
-     xkey = (rows.ravel(),) + key[2:]
-     row_shape = (rows.size,) + expected_shape[2:]
-     result = np.full(row_shape, self._default, dtype=self.dtype)
-     self._table_factory.instance.getcol(self._column, xkey, result)
-     result = result.reshape(rows.shape + expected_shape[2:])
-     return self._transform(result) if self._transform else result
-
-   @property
-   def transform(self) -> TransformerT:
-     return self._transform
-
-   @transform.setter
-   def transform(self, value: TransformerT):
-     self._transform = value