xradio 0.0.28__py3-none-any.whl → 0.0.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. xradio/__init__.py +5 -4
  2. xradio/_utils/array.py +90 -0
  3. xradio/_utils/zarr/common.py +48 -3
  4. xradio/image/_util/zarr.py +4 -1
  5. xradio/schema/__init__.py +24 -6
  6. xradio/schema/bases.py +440 -2
  7. xradio/schema/check.py +96 -55
  8. xradio/schema/dataclass.py +123 -27
  9. xradio/schema/metamodel.py +21 -4
  10. xradio/schema/typing.py +33 -18
  11. xradio/vis/__init__.py +5 -2
  12. xradio/vis/_processing_set.py +28 -20
  13. xradio/vis/_vis_utils/_ms/_tables/create_field_and_source_xds.py +710 -0
  14. xradio/vis/_vis_utils/_ms/_tables/load.py +23 -10
  15. xradio/vis/_vis_utils/_ms/_tables/load_main_table.py +145 -64
  16. xradio/vis/_vis_utils/_ms/_tables/read.py +747 -172
  17. xradio/vis/_vis_utils/_ms/_tables/read_main_table.py +173 -44
  18. xradio/vis/_vis_utils/_ms/_tables/read_subtables.py +79 -28
  19. xradio/vis/_vis_utils/_ms/_tables/write.py +102 -45
  20. xradio/vis/_vis_utils/_ms/_tables/write_exp_api.py +127 -65
  21. xradio/vis/_vis_utils/_ms/chunks.py +58 -21
  22. xradio/vis/_vis_utils/_ms/conversion.py +536 -67
  23. xradio/vis/_vis_utils/_ms/descr.py +52 -20
  24. xradio/vis/_vis_utils/_ms/msv2_to_msv4_meta.py +70 -35
  25. xradio/vis/_vis_utils/_ms/msv4_infos.py +0 -59
  26. xradio/vis/_vis_utils/_ms/msv4_sub_xdss.py +76 -9
  27. xradio/vis/_vis_utils/_ms/optimised_functions.py +0 -46
  28. xradio/vis/_vis_utils/_ms/partition_queries.py +308 -119
  29. xradio/vis/_vis_utils/_ms/partitions.py +82 -25
  30. xradio/vis/_vis_utils/_ms/subtables.py +32 -14
  31. xradio/vis/_vis_utils/_utils/partition_attrs.py +30 -11
  32. xradio/vis/_vis_utils/_utils/xds_helper.py +136 -45
  33. xradio/vis/_vis_utils/_zarr/read.py +60 -22
  34. xradio/vis/_vis_utils/_zarr/write.py +83 -9
  35. xradio/vis/_vis_utils/ms.py +48 -29
  36. xradio/vis/_vis_utils/zarr.py +44 -20
  37. xradio/vis/convert_msv2_to_processing_set.py +106 -32
  38. xradio/vis/load_processing_set.py +38 -61
  39. xradio/vis/read_processing_set.py +62 -96
  40. xradio/vis/schema.py +687 -0
  41. xradio/vis/vis_io.py +75 -43
  42. {xradio-0.0.28.dist-info → xradio-0.0.29.dist-info}/LICENSE.txt +6 -1
  43. {xradio-0.0.28.dist-info → xradio-0.0.29.dist-info}/METADATA +10 -5
  44. xradio-0.0.29.dist-info/RECORD +73 -0
  45. {xradio-0.0.28.dist-info → xradio-0.0.29.dist-info}/WHEEL +1 -1
  46. xradio/vis/model.py +0 -497
  47. xradio-0.0.28.dist-info/RECORD +0 -71
  48. {xradio-0.0.28.dist-info → xradio-0.0.29.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,7 @@ import graphviper.utils.logger as logger
2
2
  import os
3
3
  from pathlib import Path
4
4
  import re
5
- from typing import Any, List, Dict, Tuple, Union
5
+ from typing import Any, Callable, Dict, List, Tuple, Union
6
6
 
7
7
  import numpy as np
8
8
  import pandas as pd
@@ -13,15 +13,18 @@ from casacore import tables
13
13
 
14
14
  from .table_query import open_query, open_table_ro
15
15
 
16
- CASACORE_TO_PD_TIME_CORRECTION = 3506716800.0
16
+ CASACORE_TO_PD_TIME_CORRECTION = 3_506_716_800.0
17
17
  SECS_IN_DAY = 86400
18
+ MJD_DIF_UNIX = 40587
18
19
 
19
20
 
20
21
  def table_exists(path: str) -> bool:
21
22
  return tables.tableexists(path)
22
23
 
23
24
 
24
- def convert_casacore_time(rawtimes: np.ndarray, convert_to_datetime=True) -> np.ndarray:
25
+ def convert_casacore_time(
26
+ rawtimes: np.ndarray, convert_to_datetime: bool = True
27
+ ) -> np.ndarray:
25
28
  """
26
29
  Read time columns to datetime format
27
30
  pandas datetimes are referenced against a 0 of 1970-01-01
@@ -29,36 +32,218 @@ def convert_casacore_time(rawtimes: np.ndarray, convert_to_datetime=True) -> np.
29
32
 
30
33
  This requires a correction of 3506716800 seconds which is hardcoded to save time
31
34
 
32
- :param rawtimes: times in casacore ref
33
- :return: times converted to pandas reference
35
+ Parameters
36
+ ----------
37
+ rawtimes : np.ndarray
38
+ times in casacore ref
39
+ convert_to_datetime : bool (Default value = True)
40
+ whether to produce pandas style datetime
41
+
42
+ Returns
43
+ -------
44
+ np.ndarray
45
+ times converted to pandas reference
34
46
  """
35
-
47
+ times_reref = np.array(rawtimes) - CASACORE_TO_PD_TIME_CORRECTION
36
48
  if convert_to_datetime:
37
- return pd.to_datetime(
38
- np.array(rawtimes) - CASACORE_TO_PD_TIME_CORRECTION, unit="s"
39
- ).values
49
+ return pd.to_datetime(times_reref, unit="s").values
40
50
  else:
41
- return np.array(rawtimes) - CASACORE_TO_PD_TIME_CORRECTION
51
+ return times_reref
42
52
  # dt = pd.to_datetime(np.atleast_1d(rawtimes) - correction, unit='s').values
43
53
  # if len(np.array(rawtimes).shape) == 0: dt = dt[0]
44
54
  # return dt
45
55
 
46
56
 
47
57
  def convert_mjd_time(rawtimes: np.ndarray) -> np.ndarray:
48
- """Different time conversion needed for the MJD col of EPHEM{i}_*.tab
58
+ """
59
+ Different time conversion needed for the MJD col of EPHEM{i}_*.tab
49
60
  files (only, as far as I've seen)
50
61
 
51
- :param rawtimes: MJD times for example from the MJD col of ephemerides tables
52
- :return: times converted to pandas reference and datetime type
62
+ Parameters
63
+ ----------
64
+ rawtimes : np.ndarray
65
+ MJD times for example from the MJD col of ephemerides tables
66
+
67
+ Returns
68
+ -------
69
+ np.ndarray
70
+ times converted to pandas reference and datetime type
53
71
  """
54
- return pd.to_datetime(
55
- rawtimes * SECS_IN_DAY - CASACORE_TO_PD_TIME_CORRECTION, unit="s"
72
+ times_reref = pd.to_datetime(
73
+ (rawtimes - MJD_DIF_UNIX) * SECS_IN_DAY, unit="s"
56
74
  ).values
57
75
 
76
+ return times_reref
77
+
78
+
79
+ def convert_casacore_time_to_mjd(rawtimes: np.ndarray) -> np.ndarray:
80
+ """
81
+ From CASA/casacore time (as used in the TIME column of the main table) to MJD
82
+ (as used in the EPHEMi*.tab ephemeris tables). As the epochs are the same, this
83
+ is just a conversion of units.
84
+
85
+ Parameters
86
+ ----------
87
+ rawtimes : np.ndarray
88
+ times from a TIME column (seconds, casacore time epoch)
89
+
90
+ Returns
91
+ -------
92
+ np.ndarray
93
+ times converted to (ephemeris) MJD (days since casacore time epoch (1858-11-17))
94
+ """
95
+ return rawtimes / SECS_IN_DAY
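For orientation, a minimal sketch (with a made-up TIME value) of the epoch arithmetic these three converters rely on: casacore times count seconds from 1858-11-17 (MJD 0), pandas datetimes count from 1970-01-01, and the two epochs differ by 40587 days, i.e. 3,506,716,800 s.

    import numpy as np
    import pandas as pd

    SECS_IN_DAY = 86400
    MJD_DIF_UNIX = 40587                               # days between 1858-11-17 and 1970-01-01
    CORRECTION = MJD_DIF_UNIX * SECS_IN_DAY            # 3_506_716_800 s

    raw = np.array([4.9e9])                            # hypothetical casacore TIME value, in seconds
    print(pd.to_datetime(raw - CORRECTION, unit="s"))  # pandas datetime, as convert_casacore_time does
    print(raw / SECS_IN_DAY)                           # same instant as an MJD-style day count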
96
+
97
+
98
+ def make_taql_where_between_min_max(
99
+ min_max: Tuple[np.float64, np.float64],
100
+ path: str,
101
+ table_name: str,
102
+ colname="TIME",
103
+ ) -> Union[str, None]:
104
+ """
105
+ From a numerical min/max range, produce a TaQL string to select between
106
+ those min/max values (example: times) in a table.
107
+ The table can be for example a POINTING subtable or an EPHEM* ephemeris
108
+ table.
109
+ This is meant to be used on MSv2 table columns that will be loaded as a
110
+ coordinate in MSv4s and their sub-xdss (examples: POINTING/TIME, ephemeris/MJD).
111
+
112
+ This can be used for example to produce a TaQL string to constrain loading of:
113
+ - POINTING rows (based on the min/max from the time coordinate of the main MSv4)
114
+ - ephemeris rows, from EPHEM* tables (based on the MJD column and the min/max
115
+ from the main MSv4 time coordinate).
116
+
117
+ Parameters
118
+ ----------
119
+ min_max : Tuple[np.float64, np.float64]
120
+ min / max values of time or other column used as coordinate
121
+ (assumptions: float values, sortable, typically: time coord from MSv4)
122
+ path :
123
+ Path to input MS or location of the table
124
+ table_name :
125
+ Name of the table where to load a column (example: 'POINTING')
126
+ colname :
127
+ Name of the column to search for min/max values (examples: 'TIME', 'MJD')
128
+
129
+ Returns
130
+ -------
131
+ taql_where : str
132
+ TaQL (sub)string with the min/max time 'WHERE' constraint
133
+ """
134
+
135
+ min_max_range = find_projected_min_max_table(min_max, path, table_name, colname)
136
+ if min_max_range is None:
137
+ taql = None
138
+ else:
139
+ (min_val, max_val) = min_max_range
140
+ taql = f"where {colname} >= {min_val} AND {colname} <= {max_val}"
141
+
142
+ return taql
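As a concrete illustration (all values made up), the WHERE fragment built above looks like:

    colname, min_val, max_val = "TIME", 5.0e9, 5.000012e9   # hypothetical projected min/max
    taql = f"where {colname} >= {min_val} AND {colname} <= {max_val}"
    # -> "where TIME >= 5000000000.0 AND TIME <= 5000012000.0"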
143
+
144
+
145
+ def find_projected_min_max_table(
146
+ min_max: Tuple[np.float64, np.float64], path: str, table_name: str, colname: str
147
+ ) -> Union[Tuple[np.float64, np.float64], None]:
148
+ """
149
+ We have: min/max values that define a range (for example of time)
150
+ We want: to project that min/max range on a sortable column (for example a
151
+ range of times onto a TIME column), and find min and max values
152
+ derived from that table column such that the range between those min and max
153
+ values includes at least the input min/max range.
154
+
155
+ The returned min/max can then be used in a data selection TaQL query to
156
+ select at least the values within the input range (possibly extended if
157
+ the input range overlaps only partially or not at all with the column
158
+ values). A tolerance is added to the min/max to prevent numerical issues in
159
+ comparisons and conversions between numerical types and strings.
160
+
161
+ When the range given as input is wider than the range of values found in
162
+ the column, use the input range, as it is sufficient and more inclusive.
163
+
164
+ When the range given as input is narrow (projected on the target table/column)
165
+ and falls between two points of the column values, or overlaps with only one
166
+ point, the min/max are extended to include at least the two column values that
167
+ define a range within which the input range is included.
168
+ Example scenario: an ephemeris table is sampled at a coarse interval
169
+ (20 min) and we want to find a min/max range projected from the time min/max
170
+ of a main MSv4 time coordinate sampled at ~1s for a field-scan/intent
171
+ that spans ~2 min. Those ~2 min will typically fall between ephemeris samples.
172
+
173
+ Parameters
174
+ ----------
175
+ min_max : Tuple[np.float64, np.float64]
176
+ min / max values of time or other column used as coordinate
177
+ (assumptions: float values, sortable)
178
+ path :
179
+ Path to input MS or location of the table
180
+ table_name :
181
+ Name of the table where to load a column (example: 'POINTING')
182
+ colname :
183
+ Name of the column to search for min/max values (example: 'TIME')
184
+
185
+ Returns
186
+ -------
187
+ output_min_max : Union[Tuple[np.float64, np.float64], None]
188
+ min/max values derived from the input min/max and the column values
189
+ """
190
+ with open_table_ro(os.path.join(path, table_name)) as tb_tool:
191
+ if tb_tool.nrows() == 0:
192
+ return None
193
+ col = tb_tool.getcol(colname)
194
+
195
+ out_min_max = find_projected_min_max_array(min_max, col)
196
+ return out_min_max
197
+
198
+
199
+ def find_projected_min_max_array(
200
+ min_max: Tuple[np.float64, np.float64], array: np.array
201
+ ) -> Tuple[np.float64, np.float64]:
202
+ """Does the min/max checks and search for find_projected_min_max_table()"""
203
+
204
+ sorted_array = np.sort(array)
205
+ (range_min, range_max) = min_max
206
+ if len(sorted_array) < 2:
207
+ tol = np.finfo(sorted_array.dtype).eps * 4
208
+ else:
209
+ tol = np.diff(sorted_array[np.nonzero(sorted_array)]).min() / 4
210
+
211
+ if range_max > sorted_array[-1]:
212
+ projected_max = range_max + tol
213
+ else:
214
+ max_idx = sorted_array.size - 1
215
+ max_array_idx = min(
216
+ max_idx, np.searchsorted(sorted_array, range_max, side="right")
217
+ )
218
+ projected_max = sorted_array[max_array_idx] + tol
219
+
220
+ if range_min < sorted_array[0]:
221
+ projected_min = range_min - tol
222
+ else:
223
+ min_array_idx = max(
224
+ 0, np.searchsorted(sorted_array, range_min, side="left") - 1
225
+ )
226
+ # ensure 'sorted_array[min_array_idx] < range_min' when values ==
227
+ if sorted_array[min_array_idx] == range_min:
228
+ min_array_idx = max(0, min_array_idx - 1)
229
+ projected_min = sorted_array[min_array_idx] - tol
230
+
231
+ return (projected_min, projected_max)
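A toy illustration, with made-up numbers, of the widening described above: a ~2 minute window that falls between 20 minute ephemeris samples is extended to the bracketing samples plus a small tolerance. This mimics the searchsorted logic, ignoring the edge cases handled above.

    import numpy as np

    samples = np.arange(0.0, 7200.0, 1200.0)      # hypothetical column sampled every 20 min
    range_min, range_max = 2500.0, 2620.0         # ~2 min window between two samples

    lo = samples[max(0, np.searchsorted(samples, range_min, side="left") - 1)]
    hi = samples[min(samples.size - 1, np.searchsorted(samples, range_max, side="right"))]
    tol = np.diff(samples[np.nonzero(samples)]).min() / 4
    print(lo - tol, hi + tol)                     # -> 2100.0 3900.0, brackets the input range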
232
+
58
233
 
59
234
  def extract_table_attributes(infile: str) -> Dict[str, Dict]:
60
235
  """
61
- return a dictionary of table attributes created from MS keywords and column descriptions
236
+ Return a dictionary of table attributes created from MS keywords and column descriptions
237
+
238
+ Parameters
239
+ ----------
240
+ infile : str
241
+ table file path
242
+
243
+ Returns
244
+ -------
245
+ Dict[str, Dict]
246
+ table attributes as a dictionary
62
247
  """
63
248
  with open_table_ro(infile) as tb_tool:
64
249
  kwd = tb_tool.getkeywords()
@@ -79,9 +264,17 @@ def add_units_measures(
79
264
  """
80
265
  Add attributes with units and measure metainfo to the variables passed in the input dictionary
81
266
 
82
- :param mvars: data variables where to populate units
83
- :param cc_attrs: dictionary with casacore table attributes (from extract_table_attributes)
84
- :return: variables with units added in their attributes
267
+ Parameters
268
+ ----------
269
+ mvars : Dict[str, xr.DataArray]
270
+ data variables where to populate units
271
+ cc_attrs : Dict[str, Any]
272
+ dictionary with casacore table attributes (from extract_table_attributes)
273
+
274
+ Returns
275
+ -------
276
+ Dict[str, xr.DataArray]
277
+ variables with units added in their attributes
85
278
  """
86
279
  col_descrs = cc_attrs["column_descriptions"]
87
280
  # TODO: Should probably loop the other way around, over mvars
@@ -90,6 +283,12 @@ def add_units_measures(
90
283
  if var_name in mvars and "keywords" in col_descrs[col]:
91
284
  if "QuantumUnits" in col_descrs[col]["keywords"]:
92
285
  cc_units = col_descrs[col]["keywords"]["QuantumUnits"]
286
+
287
+ if isinstance(
288
+ cc_units, str
289
+ ): # Little fix for Meerkat data where the units are a string.
290
+ cc_units = [cc_units]
291
+
93
292
  if not isinstance(cc_units, list) or not cc_units:
94
293
  logger.warning(
95
294
  f"Invalid units found for column/variable {col}: {cc_units}"
@@ -128,7 +327,8 @@ def add_units_measures(
128
327
 
129
328
 
130
329
  def make_freq_attrs(spw_xds: xr.Dataset, spw_id: int) -> Dict[str, Any]:
131
- """Grab the units/measure metainfo for the xds.freq dimension of a
330
+ """
331
+ Grab the units/measure metainfo for the xds.freq dimension of a
132
332
  partition from the SPECTRAL_WINDOW subtable CTDS attributes.
133
333
 
134
334
  Has to read xds_spw.meas_freq_ref and use it as index in the CTDS
@@ -137,10 +337,17 @@ def make_freq_attrs(spw_xds: xr.Dataset, spw_id: int) -> Dict[str, Any]:
137
337
  (then the ref frame from the second will be pulled to
138
338
  xds.freq.attrs)
139
339
 
140
- :param spw_xds: (metainfo) SPECTRAL_WINDOW xds
141
- :param spw_id: SPW id of a partition
142
- :return: attributes (units/measure) for the freq dim of a partition
143
-
340
+ Parameters
341
+ ----------
342
+ spw_xds : xr.Dataset
343
+ (metainfo) SPECTRAL_WINDOW xds
344
+ spw_id : int
345
+ SPW id of a partition
346
+
347
+ Returns
348
+ -------
349
+ Dict[str, Any]
350
+ attributes (units/measure) for the freq dim of a partition
144
351
  """
145
352
  fallback_TabRefTypes = [
146
353
  "REST",
@@ -193,6 +400,11 @@ def get_pad_nan(col: np.ndarray) -> np.ndarray:
193
400
  ----------
194
401
  col : np.ndarray
195
402
  data being loaded from a table column
403
+
404
+ Returns
405
+ -------
406
+ np.ndarray
407
+ nan ("nan") value for the type of the input column
196
408
  """
197
409
  # This is causing frequent warnings for integers. Cast of nan to "int nan"
198
410
  # produces -2147483648 but also seems to trigger a
@@ -208,15 +420,24 @@ def get_pad_nan(col: np.ndarray) -> np.ndarray:
208
420
 
209
421
 
210
422
  def redimension_ms_subtable(xds: xr.Dataset, subt_name: str) -> xr.Dataset:
211
- """Expand a MeasurementSet subtable xds from single dimension (row)
423
+ """
424
+ Expand a MeasurementSet subtable xds from single dimension (row)
212
425
  to multiple dimensions (such as (source_id, time, spectral_window)
213
426
 
214
427
  WIP: only works for source, experimenting
215
428
 
216
- :param xds: dataset to change the dimensions
217
- :param subt_name: subtable name (SOURCE, etc.)
218
- :return: transformed xds with data dimensions representing the MS subtable key
219
- (one dimension for every columns)
429
+ Parameters
430
+ ----------
431
+ xds : xr.Dataset
432
+ dataset to change the dimensions
433
+ subt_name : str
434
+ subtable name (SOURCE, etc.)
435
+
436
+ Returns
437
+ -------
438
+ xr.Dataset
439
+ transformed xds with data dimensions representing the MS subtable key
440
+ (one dimension for every columns)
220
441
  """
221
442
  subt_key_cols = {
222
443
  "DOPPLER": ["doppler_id", "source_id"],
@@ -239,8 +460,18 @@ def redimension_ms_subtable(xds: xr.Dataset, subt_name: str) -> xr.Dataset:
239
460
 
240
461
  rxds = xds.copy()
241
462
  try:
463
+ # drop_duplicates() needed (https://github.com/casangi/xradio/issues/185). Examples:
464
+ # - Some early ALMA datasets have bogus WEATHER tables with many/most rows with
465
+ # (ANTENNA_ID=0, TIME=0) and no other columns to figure out the right IDs, such
466
+ # as "NS_WX_STATION_ID" or similar. (example: X425.pm04.scan4.ms)
467
+ # - Some GBT MSs have duplicated (ANTENNA_ID=0, TIME=xxx). (example: analytic_variable.ms)
242
468
  with np.errstate(invalid="ignore"):
243
- rxds = rxds.set_index(row=key_dims).unstack("row").transpose(*key_dims, ...)
469
+ rxds = (
470
+ rxds.set_index(row=key_dims)
471
+ .drop_duplicates("row")
472
+ .unstack("row")
473
+ .transpose(*key_dims, ...)
474
+ )
244
475
  # unstack changes type to float when it needs to introduce NaNs, so
245
476
  # we need to reset to the original type.
246
477
  for var in rxds.data_vars:
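As an aside, a small sketch (toy dataset, invented values) of why the drop_duplicates() call above is needed before unstacking a multi-key row index:

    import xarray as xr

    # Hypothetical subtable with a duplicated (antenna_id, time) key, as in the bogus WEATHER rows
    xds = xr.Dataset(
        {"temperature": ("row", [280.0, 281.0, 282.0])},
        coords={"antenna_id": ("row", [0, 0, 1]), "time": ("row", [0.0, 0.0, 10.0])},
    )
    keyed = xds.set_index(row=["antenna_id", "time"])
    # keyed.unstack("row") typically fails on the duplicated (0, 0.0) key;
    # dropping duplicates first keeps one row per key and lets unstack succeed
    fixed = keyed.drop_duplicates("row").unstack("row")
    print(fixed.temperature.shape)   # (2, 2): antenna_id x time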
@@ -302,23 +533,37 @@ def read_generic_table(
302
533
  inpath: str,
303
534
  tname: str,
304
535
  timecols: Union[List[str], None] = None,
305
- ignore=None,
536
+ ignore: Union[List[str], None] = None,
306
537
  rename_ids: Dict[str, str] = None,
538
+ taql_where: str = None,
307
539
  ) -> xr.Dataset:
308
- """read generic casacore (sub)table into memory resident xds. This reads the table
309
- and and loads the data.
540
+ """
541
+ load generic casacore (sub)table into memory resident xds (xarray wrapped
542
+ numpy arrays). This reads through the table columns and loads the data.
543
+
544
+ TODO: change read_ name to load_ name (and most if not all this module)
310
545
 
311
546
  Parameters
312
547
  ----------
313
- :param inpath: path to the MS or directory containing the table
314
- :param tname: (sub)table name, for example 'SOURCE' for myms.ms/SOURCE
315
-
316
- :param timecols: column names to convert to numpy datetime format.
317
- leaves times as their original casacore format.
318
- :param ignore: list of column names to ignore and not try to read.
319
- :rename_ids: dict with dimension renaming mapping
320
-
321
- :return: table loaded as XArray dataset
548
+ inpath : str
549
+ path to the MS or directory containing the table
550
+ tname : str
551
+ (sub)table name, for example 'SOURCE' for myms.ms/SOURCE
552
+ timecols : Union[List[str], None] (Default value = None)
553
+ column names to convert to numpy datetime format.
554
+ leaves times as their original casacore format.
555
+ ignore : Union[List[str], None] (Default value = None)
556
+ list of column names to ignore and not try to read.
557
+ rename_ids : Dict[str, str] (Default value = None)
558
+ dict with dimension renaming mapping
559
+ taql_where : str (Default value = None)
560
+ TaQL string to optionally constrain the rows/columns to read
561
+ (Default value = None)
562
+
563
+ Returns
564
+ -------
565
+ xr.Dataset
566
+ table loaded as XArray dataset
322
567
  """
323
568
  if timecols is None:
324
569
  timecols = []
@@ -340,13 +585,33 @@ def read_generic_table(
340
585
  )
341
586
  return xr.Dataset()
342
587
 
343
- with open_table_ro(infile) as tb_tool:
344
- if tb_tool.nrows() == 0:
588
+ with open_table_ro(infile) as gtable:
589
+ if gtable.nrows() == 0:
345
590
  logger.debug(f"table is empty: {inpath} {tname}")
346
591
  return xr.Dataset(attrs=attrs)
347
592
 
348
- colnames = tb_tool.colnames()
349
- mcoords, mvars = read_generic_cols(infile, tb_tool, timecols, ignore)
593
+ # if len(ignore) > 0: #This is needed because some SOURCE tables have a SOURCE_MODEL column that is corrupted and this causes the open_query to fail.
594
+ # select_columns = gtable.colnames()
595
+ # select_columns_str = str([item for item in select_columns if item not in ignore])[1:-1].replace("'", "") #Converts an array to a comma sepearted string. For example ['a', 'b', 'c'] to 'a, b, c'.
596
+ # taql_gtable = f"select " + select_columns_str + f" from $gtable {taql_where}"
597
+ # else:
598
+ # taql_gtable = f"select * from $gtable {taql_where}"
599
+
600
+ # relatively often broken columns that we do not need
601
+ exclude_pattern = ", !~p/SOURCE_MODEL/"
602
+ taql_gtable = f"select *{exclude_pattern} from $gtable {taql_where or ''}"
603
+
604
+ with open_query(gtable, taql_gtable) as tb_tool:
605
+ if tb_tool.nrows() == 0:
606
+ logger.debug(
607
+ f"table query is empty: {inpath} {tname}, with where {taql_where}"
608
+ )
609
+ return xr.Dataset(attrs=attrs)
610
+
611
+ colnames = tb_tool.colnames()
612
+ mcoords, mvars = load_cols_into_coords_data_vars(
613
+ infile, tb_tool, timecols, ignore
614
+ )
350
615
 
351
616
  mvars = add_units_measures(mvars, cc_attrs)
352
617
  mcoords = add_units_measures(mcoords, cc_attrs)
@@ -379,49 +644,116 @@ def read_generic_table(
379
644
  return xds
380
645
 
381
646
 
382
- def read_generic_cols(
383
- infile: str, tb_tool, timecols, ignore, dim_prefix: str = "dim"
647
+ def load_cols_into_coords_data_vars(
648
+ inpath: str,
649
+ tb_tool: tables.table,
650
+ timecols: Union[List[str], None] = None,
651
+ ignore: Union[List[str], None] = None,
384
652
  ) -> Tuple[Dict[str, xr.Dataset], Dict[str, xr.Dataset]]:
385
- """Reads data for each MS column (loading the data in memory) into
386
- Xarray datasets
653
+ """
654
+ Produce a set of coordinate xarrays and a set of data variables xarrays
655
+ from the columns of a table.
656
+
657
+ Parameters
658
+ ----------
659
+ inpath : str
660
+ input path
661
+ tb_tool: tables.table
662
+ tool being used to load data
663
+ timecols: Union[List[str], None] (Default value = None)
664
+ list of columns to be considered as TIME-related
665
+ ignore: Union[List[str], None] (Default value = None)
666
+ columns to ignore
667
+
668
+ Returns
669
+ -------
670
+ Tuple[Dict[str, xr.Dataset], Dict[str, xr.Dataset]]
671
+ coordinates dictionary + variables dictionary
672
+ """
673
+ columns_loader = find_best_col_loader(inpath, tb_tool.nrows())
674
+
675
+ mcoords, mvars = columns_loader(inpath, tb_tool, timecols, ignore)
676
+
677
+ return mcoords, mvars
387
678
 
388
- :param infile: path name of the MS
389
- :param tb_tool: table to red the columns
390
- :param timecols: columns names to convert to datetime format
391
- :param ignore: list of column names to skip and not try to read.
392
679
 
393
- :return: dict of coordinates and dict of data vars.
680
+ def find_best_col_loader(inpath: str, nrows: int) -> Callable:
394
681
  """
682
+ Simple heuristic: for any tables other than POINTING, use the load_generic_cols
683
+ function that is able to deal with variable size columns. For POINTING (and if it has
684
+ more rows than an arbitrary "small" threshold) use a more efficient load function that
685
+ loads the data by column (but is not able to deal with any generic table).
686
+ For now, all other subtables are loaded using the generic column loader.
687
+
688
+ Background: the POINTING subtable can have a very large number of rows. For example in
689
+ ALMA it is sampled at ~50ms intervals which typically produces of the order of
690
+ [10^5, 10^7] rows. This becomes a serious performance bottleneck when loading the
691
+ table using row() (and one dict allocated per row).
692
+ This function chooses an alternative "by-column" load function to load in the columns
693
+ when the table is POINTING. See xradio issue #128; for now this distinction is made
694
+ solely for performance reasons.
395
695
 
396
- # Almost sure that when TIME is present (in a standard MS subt) it
397
- # is part of the key. But what about non-std subtables, ASDM subts?
398
- subts_with_time_key = (
399
- "FEED",
400
- "FLAG_CMD",
401
- "FREQ_OFFSET",
402
- "HISTORY",
403
- "SOURCE",
404
- "SYSCAL",
405
- "WEATHER",
406
- )
407
- # Optional cols known to sometimes have inconsistent values
408
- known_misbehaving_cols = ["ASSOC_NATURE"]
696
+ Parameters
697
+ ----------
698
+ inpath : str
699
+ path name of the MS table
700
+ nrows : int
701
+ number of rows found in the table
702
+
703
+ Returns
704
+ -------
705
+ Callable
706
+ function best suited to load the data from the columns of this table
707
+ """
708
+ # do not give up generic by-row() loading if nrows is (arbitrary) small
709
+ ARBITRARY_MIN_ROWS = 1000
409
710
 
410
- colnames = tb_tool.colnames()
411
- col_cells = {
412
- col: tb_tool.getcell(col, 0)
413
- for col in colnames
414
- if (col not in ignore) and (tb_tool.iscelldefined(col, 0))
415
- }
711
+ if inpath.endswith("POINTING") and nrows >= ARBITRARY_MIN_ROWS:
712
+ columns_loader = load_fixed_size_cols
713
+ else:
714
+ columns_loader = load_generic_cols
715
+
716
+ return columns_loader
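A hypothetical usage sketch of the heuristic (paths and row counts invented): a large POINTING table gets the by-column loader, any other subtable the generic by-row loader.

    loader = find_best_col_loader("/data/uid_X1.ms/POINTING", nrows=250_000)   # -> load_fixed_size_cols
    loader = find_best_col_loader("/data/uid_X1.ms/SOURCE", nrows=40)          # -> load_generic_cols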
717
+
718
+
719
+ def load_generic_cols(
720
+ inpath: str,
721
+ tb_tool: tables.table,
722
+ timecols: Union[List[str], None],
723
+ ignore: Union[List[str], None],
724
+ ) -> Tuple[Dict[str, xr.Dataset], Dict[str, xr.Dataset]]:
725
+ """
726
+ Loads data for each MS column (loading the data in memory) into Xarray datasets
727
+
728
+ This function is generic in that it can load variable size array columns. See also
729
+ load_fixed_size_cols() as a simpler and much better performing alternative
730
+ for tables that are large and expected/guaranteed to not have columns with variable
731
+ size cells.
732
+
733
+ Parameters
734
+ ----------
735
+ inpath : str
736
+ path name of the MS table
737
+ tb_tool : tables.table
738
+ table to load the columns
739
+ timecols : Union[List[str], None]
740
+ columns names to convert to datetime format
741
+ ignore : Union[List[str], None]
742
+ list of column names to skip and not try to load.
743
+
744
+ Returns
745
+ -------
746
+ Tuple[Dict[str, xr.Dataset], Dict[str, xr.Dataset]]
747
+ dict of coordinates and dict of data vars.
748
+ """
749
+
750
+ col_cells = find_loadable_filled_cols(tb_tool, ignore)
416
751
 
417
752
  trows = tb_tool.row(ignore, exclude=True)[:]
418
753
 
419
754
  # Produce coords and data vars from MS columns
420
755
  mcoords, mvars = {}, {}
421
756
  for col in col_cells.keys():
422
- if tb_tool.coldatatype(col) == "record":
423
- continue # not supported
424
-
425
757
  try:
426
758
  # TODO
427
759
  # benchmark np.stack() performance
@@ -446,87 +778,287 @@ def read_generic_cols(
446
778
 
447
779
  if len(set([isinstance(row[col], dict) for row in trows])) > 1:
448
780
  continue # can't deal with this case
449
- mshape = np.array(max([np.array(row[col]).shape for row in trows]))
450
- try:
451
- pad_nan = get_pad_nan(col_cells[col])
452
781
 
453
- # TODO
454
- # benchmark np.stack() performance
455
- data = np.stack(
456
- [
457
- np.pad(
458
- (
459
- row[col]
460
- if len(row[col]) > 0
461
- else np.array(row[col]).reshape(
462
- np.arange(len(mshape)) * 0
463
- )
464
- ),
465
- [(0, ss) for ss in mshape - np.array(row[col]).shape],
466
- "constant",
467
- constant_values=pad_nan,
468
- )
469
- for row in trows
470
- ]
471
- )
472
- except Exception as exc:
473
- level = logger.WARNING
474
- if col in known_misbehaving_cols:
475
- level = logger.DEBUG
476
- logger.log(
477
- level, f"{infile}: failed to read data for column {col}: {exc}"
478
- )
479
- data = []
782
+ data = handle_variable_col_issues(inpath, col, col_cells, trows)
480
783
 
481
784
  if len(data) == 0:
482
785
  continue
483
- if col in timecols:
484
- if col == "MJD":
485
- data = convert_mjd_time(data)
486
- else:
487
- try:
488
- data = convert_casacore_time(data)
489
- except pd.errors.OutOfBoundsDatetime as exc:
490
- if infile.endswith("WEATHER"):
491
- logger.error(
492
- f"Exception when converting WEATHER/TIME: {exc}. TIME data: {data}"
493
- )
494
- else:
495
- raise
496
- # should also probably add INTERVAL not only TIME
497
- if col.endswith("_ID") or (
498
- infile.endswith(subts_with_time_key) and col == "TIME"
499
- ):
500
- # weather table: importasdm produces very wrong "-1" ANTENNA_ID
501
- if (
502
- infile.endswith("WEATHER")
503
- and col == "ANTENNA_ID"
504
- and "NS_WX_STATION_ID" in colnames
505
- ):
506
- data = np.stack([row["NS_WX_STATION_ID"] for row in trows])
507
-
508
- mcoords[col.lower()] = xr.DataArray(
509
- data,
510
- dims=[
511
- f"{dim_prefix}_{di}_{ds}"
512
- for di, ds in enumerate(np.array(data).shape)
513
- ],
514
- )
515
- else:
516
- mvars[col.lower()] = xr.DataArray(
517
- data,
518
- dims=[
519
- f"{dim_prefix}_{di}_{ds}"
520
- for di, ds in enumerate(np.array(data).shape)
521
- ],
786
+
787
+ array_type, array_data = raw_col_data_to_coords_vars(
788
+ inpath, tb_tool, col, data, timecols
789
+ )
790
+ if array_type == "coord":
791
+ mcoords[col.lower()] = array_data
792
+ elif array_type == "data_var":
793
+ mvars[col.lower()] = array_data
794
+
795
+ return mcoords, mvars
796
+
797
+
798
+ def load_fixed_size_cols(
799
+ inpath: str,
800
+ tb_tool: tables.table,
801
+ timecols: Union[List[str], None],
802
+ ignore: Union[List[str], None],
803
+ ) -> Tuple[Dict[str, xr.Dataset], Dict[str, xr.Dataset]]:
804
+ """
805
+ Loads columns into memory via the table tool getcol() function, as opposed to
806
+ load_generic_cols() which loads on a per-row basis via row().
807
+ This function is 2+ orders of magnitude faster for large tables (pointing tables with
808
+ the order of >=10^5 rows)
809
+ Prefer this function for performance reasons when all rows can be assumed to be fixed
810
+ size (even if they are of array type).
811
+ This is performance-critical for the POINTING subtable.
812
+
813
+ Parameters
814
+ ----------
815
+ inpath : str
816
+ path name of the MS
817
+ tb_tool : tables.table
818
+ table to red the columns
819
+ timecols : Union[List[str], None]
820
+ columns names to convert to datetime format
821
+ ignore : Union[List[str], None]
822
+ list of column names to skip and not try to load.
823
+
824
+ Returns
825
+ -------
826
+ Tuple[Dict[str, xr.Dataset], Dict[str, xr.Dataset]]
827
+ dict of coordinates and dict of data vars, ready to construct an xr.Dataset
828
+ """
829
+
830
+ loadable_cols = find_loadable_filled_cols(tb_tool, ignore)
831
+
832
+ # Produce coords and data vars from MS columns
833
+ mcoords, mvars = {}, {}
834
+ for col in loadable_cols.keys():
835
+ try:
836
+ data = tb_tool.getcol(col)
837
+ if isinstance(data, dict):
838
+ data = data["array"].reshape(data["shape"])
839
+ except Exception as exc:
840
+ logger.warning(
841
+ f"{inpath}: failed to load data with getcol for column {col}: {exc}"
522
842
  )
843
+ data = []
844
+
845
+ if len(data) == 0:
846
+ continue
847
+
848
+ array_type, array_data = raw_col_data_to_coords_vars(
849
+ inpath, tb_tool, col, data, timecols
850
+ )
851
+ if array_type == "coord":
852
+ mcoords[col.lower()] = array_data
853
+ elif array_type == "data_var":
854
+ mvars[col.lower()] = array_data
523
855
 
524
856
  return mcoords, mvars
525
857
 
526
858
 
859
+ def find_loadable_filled_cols(
860
+ tb_tool: tables.table, ignore: Union[List[str], None]
861
+ ) -> Dict:
862
+ """
863
+ For a table, finds the columns that are:
864
+ - loadable = not of record type, and not to be ignored
865
+ - filled = the column cells are populated.
866
+
867
+ Parameters
868
+ ----------
869
+ tb_tool : tables.table
870
+ table to red the columns
871
+ ignore : Union[List[str], None]
872
+ list of column names to skip and not try to load.
873
+
874
+ Returns
875
+ -------
876
+ Dict
877
+ dict of {column name => first cell} for columns that can/should be loaded
878
+ """
879
+
880
+ colnames = tb_tool.colnames()
881
+ # columns that are not populated are skipped. record columns are not supported
882
+ loadable_cols = {
883
+ col: tb_tool.getcell(col, 0)
884
+ for col in colnames
885
+ if (col not in ignore)
886
+ and (tb_tool.iscelldefined(col, 0))
887
+ and tb_tool.coldatatype(col) != "record"
888
+ }
889
+ return loadable_cols
890
+
891
+
892
+ def raw_col_data_to_coords_vars(
893
+ inpath: str,
894
+ tb_tool: tables.table,
895
+ col: str,
896
+ data: np.ndarray,
897
+ timecols: Union[List[str], None],
898
+ ) -> Tuple[str, xr.DataArray]:
899
+ """
900
+ From a raw np array of data (freshly loaded from a table column), prepares either a
901
+ coord or a data_var ready to be added to an xr.Dataset
902
+
903
+ Parameters
904
+ ----------
905
+ inpath: str
906
+ input table path
907
+ tb_tool: tables.table :
908
+ table tool being used to load data
909
+ col: str :
910
+ column
911
+ data: np.ndarray :
912
+ column data
913
+ timecols: Union[List[str], None]
914
+ columns to be treated as TIME-related
915
+
916
+ Returns
917
+ -------
918
+ Tuple[str, xr.DataArray]
919
+ array type string (whether this column is a 'coord' or a 'data_var') + DataArray
920
+ with column data/coord values ready to be added to the table xds
921
+ """
922
+
923
+ # Almost sure that when TIME is present (in a standard MS subt) it
924
+ # is part of the key. But what about non-std subtables, ASDM subts?
925
+ subts_with_time_key = (
926
+ "FEED",
927
+ "FLAG_CMD",
928
+ "FREQ_OFFSET",
929
+ "HISTORY",
930
+ "POINTING",
931
+ "SOURCE",
932
+ "SYSCAL",
933
+ "WEATHER",
934
+ )
935
+ dim_prefix = "dim"
936
+
937
+ if col in timecols:
938
+ if col == "MJD":
939
+ data = convert_mjd_time(data).astype("float64") / 1e9
940
+ else:
941
+ try:
942
+ data = convert_casacore_time(data)
943
+ except pd.errors.OutOfBoundsDatetime as exc:
944
+ if inpath.endswith("WEATHER"):
945
+ # intentionally not calling logging.exception
946
+ logger.warning(
947
+ f"Exception when converting WEATHER/TIME: {exc}. TIME data: {data}"
948
+ )
949
+ else:
950
+ raise
951
+ # should also probably add INTERVAL not only TIME
952
+ if col.endswith("_ID") or (inpath.endswith(subts_with_time_key) and col == "TIME"):
953
+ # weather table: importasdm produces very wrong "-1" ANTENNA_ID
954
+ if (
955
+ inpath.endswith("WEATHER")
956
+ and col == "ANTENNA_ID"
957
+ and "NS_WX_STATION_ID" in tb_tool.colnames()
958
+ ):
959
+ data = tb_tool.getcol("NS_WX_STATION_ID")
960
+
961
+ array_type = "coord"
962
+ array_data = xr.DataArray(
963
+ data,
964
+ dims=[
965
+ f"{dim_prefix}_{di}_{ds}" for di, ds in enumerate(np.array(data).shape)
966
+ ],
967
+ )
968
+ else:
969
+ array_type = "data_var"
970
+ array_data = xr.DataArray(
971
+ data,
972
+ dims=[
973
+ f"{dim_prefix}_{di}_{ds}" for di, ds in enumerate(np.array(data).shape)
974
+ ],
975
+ )
976
+
977
+ return array_type, array_data
978
+
979
+
980
+ def handle_variable_col_issues(
981
+ inpath: str, col: str, col_cells: dict, trows: tables.tablerow
982
+ ) -> np.ndarray:
983
+ """
984
+ load variable-size array columns, padding with nans wherever
985
+ needed. This happens for example often in the SPECTRAL_WINDOW
986
+ table (CHAN_WIDTH, EFFECTIVE_BW, etc.).
987
+ Also handle exceptions gracefully when trying to load the rows.
988
+
989
+ Parameters
990
+ ----------
991
+ inpath : str
992
+ path name of the MS
993
+ col : str
994
+ column being loaded
995
+ col_cells : dict
996
+ {col: cell} values
997
+ trows : tables.tablerow
998
+ rows from a table as loaded by tables.row()
999
+
1000
+ Returns
1001
+ -------
1002
+ np.ndarray
1003
+ array with column values (possibly padded if rows vary in size)
1004
+ """
1005
+
1006
+ # Optional cols known to sometimes have inconsistent values
1007
+ known_misbehaving_cols = ["ASSOC_NATURE"]
1008
+
1009
+ mshape = np.array(max([np.array(row[col]).shape for row in trows]))
1010
+ try:
1011
+ pad_nan = get_pad_nan(col_cells[col])
1012
+
1013
+ # TODO
1014
+ # benchmark np.stack() performance
1015
+ data = np.stack(
1016
+ [
1017
+ np.pad(
1018
+ (
1019
+ row[col]
1020
+ if len(row[col]) > 0
1021
+ else np.array(row[col]).reshape(np.arange(len(mshape)) * 0)
1022
+ ),
1023
+ [(0, ss) for ss in mshape - np.array(row[col]).shape],
1024
+ "constant",
1025
+ constant_values=pad_nan,
1026
+ )
1027
+ for row in trows
1028
+ ]
1029
+ )
1030
+ except Exception as exc:
1031
+ msg = f"{inpath}: failed to load data for column {col}: {exc}"
1032
+ if col in known_misbehaving_cols:
1033
+ logger.debug(msg)
1034
+ else:
1035
+ logger.warning(msg)
1036
+ data = np.empty(0)
1037
+
1038
+ return data
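A toy, self-contained version (row values made up) of the nan-padding that handle_variable_col_issues() applies to variable-size cells:

    import numpy as np

    rows = [np.array([1.0, 2.0]), np.array([3.0]), np.array([])]      # cells of varying length
    mshape = np.array(max(np.array(r).shape for r in rows))           # target shape: (2,)
    data = np.stack(
        [
            np.pad(
                r if len(r) > 0 else np.array(r).reshape(np.arange(len(mshape)) * 0),
                [(0, s) for s in mshape - np.array(r).shape],
                "constant",
                constant_values=np.nan,
            )
            for r in rows
        ]
    )
    # data -> [[1., 2.], [3., nan], [nan, nan]]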
1039
+
1040
+
527
1041
  def read_flat_col_chunk(infile, col, cshape, ridxs, cstart, pstart) -> np.ndarray:
528
1042
  """
529
1043
  Extract data chunk for each table col, this is fed to dask.delayed
1044
+
1045
+ Parameters
1046
+ ----------
1047
+ infile :
1048
+
1049
+ col :
1050
+
1051
+ cshape :
1052
+
1053
+ ridxs :
1054
+
1055
+ cstart :
1056
+
1057
+ pstart :
1058
+
1059
+ Returns
1060
+ -------
1061
+ np.ndarray
530
1062
  """
531
1063
 
532
1064
  with open_table_ro(infile) as tb_tool:
@@ -588,6 +1120,30 @@ def read_col_chunk(
588
1120
  ) -> np.ndarray:
589
1121
  """
590
1122
  Function to perform delayed reads from table columns.
1123
+
1124
+ Parameters
1125
+ ----------
1126
+ infile : str
1127
+
1128
+ ts_taql : str
1129
+
1130
+ col : str
1131
+
1132
+ cshape : Tuple[int]
1133
+
1134
+ tidxs : np.ndarray
1135
+
1136
+ bidxs : np.ndarray
1137
+
1138
+ didxs : np.ndarray
1139
+
1140
+ d1: Tuple[int, int]
1141
+
1142
+ d2: Tuple[int, int]
1143
+
1144
+ Returns
1145
+ -------
1146
+ np.ndarray
591
1147
  """
592
1148
  # TODO: consider calling load_col_chunk() from inside the withs
593
1149
  # for read_delayed_pointing_table and read_expanded_main_table
@@ -600,8 +1156,13 @@ def read_col_chunk(
600
1156
  elif len(cshape) == 4: # DATA and FLAG
601
1157
  data = query.getcolslice(col, (d1[0], d2[0]), (d1[1], d2[1]), [], 0, -1)
602
1158
 
603
- # full data is the maximum of the data shape and chunk shape dimensions
604
- fulldata = np.full(cshape, np.nan, dtype=data.dtype)
1159
+ policy = "warn"
1160
+ if np.issubdtype(data.dtype, np.integer):
1161
+ policy = "ignore"
1162
+ with np.errstate(invalid=policy):
1163
+ # full data is the maximum of the data shape and chunk shape dimensions
1164
+ fulldata = np.full(cshape, np.nan, dtype=data.dtype)
1165
+
605
1166
  if len(didxs) > 0:
606
1167
  fulldata[tidxs[didxs], bidxs[didxs]] = data[didxs]
607
1168
 
@@ -609,71 +1170,85 @@ def read_col_chunk(
609
1170
 
610
1171
 
611
1172
  def read_col_conversion(
612
- tb_tool,
1173
+ tb_tool: tables.table,
613
1174
  col: str,
614
1175
  cshape: Tuple[int],
615
1176
  tidxs: np.ndarray,
616
1177
  bidxs: np.ndarray,
617
- ):
1178
+ ) -> np.ndarray:
618
1179
  """
619
1180
  Function to perform delayed reads from table columns when converting
620
1181
  (no need for didxs)
1182
+
1183
+ Parameters
1184
+ ----------
1185
+ tb_tool : tables.table
1186
+
1187
+ col : str
1188
+
1189
+ cshape : Tuple[int]
1190
+
1191
+ tidxs : np.ndarray
1192
+
1193
+ bidxs : np.ndarray
1194
+
1195
+ Returns
1196
+ -------
1197
+ np.ndarray
621
1198
  """
622
1199
 
623
1200
  # Workaround for https://github.com/casacore/python-casacore/issues/130
624
1201
  # WARNING: Assumes tb_tool is a single measurement set not an MMS.
625
- # WARNING: Assumes the num_frequencies * num_polarisations > 2**29. If false,
1202
+ # WARNING: Assumes the num_frequencies * num_polarizations < 2**29. If false,
626
1203
  # https://github.com/casacore/python-casacore/issues/130 isn't mitigated.
627
1204
 
628
1205
  # Use casacore to get the shape of a row for this column
629
-
630
1206
  #################################################################################
631
1207
 
632
1208
  # Get the total number of rows in the base measurement set
633
1209
  nrows_total = tb_tool.nrows()
634
1210
 
635
1211
  # getcolshapestring() only works on columns where a row element is an
636
- # array (ie fails for TIME, etc)
637
- # Assumes RuntimeError is because the column is a scalar
1212
+ # array ie. fails for TIME
1213
+ # Assumes the RuntimeError is because the column is a scalar
638
1214
  try:
639
-
640
1215
  shape_string = tb_tool.getcolshapestring(col)[0]
1216
+ # Convert `shape_string` into a tuple that numpy understands
641
1217
  extra_dimensions = tuple(
642
1218
  [
643
1219
  int(idx)
644
1220
  for idx in shape_string.replace("[", "").replace("]", "").split(", ")
645
1221
  ]
646
1222
  )
647
- full_shape = tuple(
648
- [nrows_total]
649
- + [
650
- int(idx)
651
- for idx in shape_string.replace("[", "").replace("]", "").split(", ")
652
- ]
653
- )
654
1223
  except RuntimeError:
655
1224
  extra_dimensions = ()
656
- full_shape = (nrows_total,)
657
1225
 
658
1226
  #################################################################################
659
1227
 
660
1228
  # Get dtype of the column. Only read first row from disk
661
1229
  col_dtype = np.array(tb_tool.col(col)[0]).dtype
662
1230
 
663
- # Construct the numpy array to populate with data
664
- data = np.empty(full_shape, dtype=col_dtype)
1231
+ # Construct a numpy array to populate. `data` has shape (n_times, n_baselines, n_frequencies, n_polarizations)
1232
+ data = np.full(cshape + extra_dimensions, np.nan, dtype=col_dtype)
665
1233
 
666
1234
  # Use built-in casacore table iterator to populate the data column by unique times.
667
1235
  start_row = 0
668
1236
  for ts in tb_tool.iter("TIME", sort=False):
669
1237
  num_rows = ts.nrows()
670
- # Note don't use getcol() because it's less safe. See:
1238
+
1239
+ # Create small temporary array to store the partial column
1240
+ tmp_arr = np.full((num_rows,) + extra_dimensions, np.nan, dtype=col_dtype)
1241
+
1242
+ # Note we don't use `getcol()` because it's less safe. See:
671
1243
  # https://github.com/casacore/python-casacore/issues/130#issuecomment-463202373
672
- ts.getcolnp(col, data[start_row : start_row + num_rows])
1244
+ ts.getcolnp(col, tmp_arr)
1245
+
1246
+ # Get the slice of rows contained in `tmp_arr`.
1247
+ # Used to get the relevant integer indexes from `tidxs` and `bidxs`
1248
+ tmp_slice = slice(start_row, start_row + num_rows)
1249
+
1250
+ # Copy `tmp_arr` into the correct elements of `data`
1251
+ data[tidxs[tmp_slice], bidxs[tmp_slice]] = tmp_arr
673
1252
  start_row += num_rows
674
1253
 
675
- # TODO
676
- # Can we return a view of `data` instead of copying?
677
- fulldata = np.full(cshape + extra_dimensions, np.nan, dtype=col_dtype)
678
- fulldata[tidxs, bidxs] = data
679
- return fulldata
1254
+ return data
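Finally, a minimal sketch (index arrays invented) of the scatter used above to place each MS row at its (time, baseline) cell of the output array:

    import numpy as np

    cshape = (2, 3)                         # (n_times, n_baselines)
    tidxs = np.array([0, 0, 1, 1])          # time index of each MS row
    bidxs = np.array([0, 2, 1, 2])          # baseline index of each MS row
    rows = np.array([1.0, 2.0, 3.0, 4.0])   # per-row values loaded for one column
    data = np.full(cshape, np.nan)
    data[tidxs, bidxs] = rows               # each row lands at its (time, baseline) cell
    # data -> [[1., nan, 2.], [nan, 3., 4.]]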