sacc 1.0.2__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
sacc/sacc.py CHANGED
@@ -1,18 +1,18 @@
1
1
  import copy
2
- import warnings
3
2
  import os
4
- from io import BytesIO
3
+ import re
4
+ import warnings
5
5
 
6
- import numpy as np
7
6
  from astropy.io import fits
8
7
  from astropy.table import Table
8
+ import numpy as np
9
9
 
10
10
  from .tracers import BaseTracer
11
- from .windows import BaseWindow, BandpowerWindow
11
+ from .windows import BandpowerWindow
12
12
  from .covariance import BaseCovariance, concatenate_covariances
13
13
  from .utils import unique_list
14
14
  from .data_types import standard_types, DataPoint
15
-
15
+ from . import io
16
16
 
17
17
  class Sacc:
18
18
  """
@@ -29,6 +29,7 @@ class Sacc:
29
29
  self.tracers = {}
30
30
  self.covariance = None
31
31
  self.metadata = {}
32
+ self.tracer_uncertainties = {}
32
33
 
33
34
  def __len__(self):
34
35
  """
@@ -41,6 +42,48 @@ class Sacc:
41
42
  """
42
43
  return len(self.data)
43
44
 
45
+ def __eq__(self, other):
46
+ """
47
+ Test for equality between two Sacc instances.
48
+
49
+ Checks whether the two values are equal. This is a
50
+ complete equality check, and will check that the data points,
51
+ tracers, covariance and metadata are all the same.
52
+
53
+ Parameters
54
+ ----------
55
+ other: Sacc instance
56
+ The other data set to compare with
57
+
58
+ Returns
59
+ -------
60
+ equal: bool
61
+ True if the two data sets are the same, False otherwise.
62
+ """
63
+ if not isinstance(other, Sacc):
64
+ return False
65
+
66
+ if self.data != other.data:
67
+ return False
68
+
69
+ if len(self.tracers) != len(other.tracers):
70
+ return False
71
+ if set(self.tracers.keys()) != set(other.tracers.keys()):
72
+ return False
73
+ for k1, v1 in self.tracers.items():
74
+ v2 = other.tracers[k1]
75
+ if not v1 == v2:
76
+ return False
77
+
78
+ if self.covariance != other.covariance:
79
+ return False
80
+
81
+ if self.metadata != other.metadata:
82
+ return False
83
+
84
+ return True
85
+
86
+
44
87
  def copy(self):
45
88
  """
46
89
  Create a copy of the data set with no data shared with the original.
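
As context for the new __eq__ method added above, a minimal usage sketch (illustrative only; the file names are hypothetical):

    import sacc

    s1 = sacc.Sacc.load_fits("cls_run1.fits")
    s2 = sacc.Sacc.load_fits("cls_run2.fits")

    # Full comparison: data points, tracers, covariance and metadata
    if s1 == s2:
        print("The two data sets are identical")
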
@@ -72,10 +115,9 @@ class Sacc:
72
115
  # Otherwise just use whatever we have.
73
116
  if 'ell' in row.tags:
74
117
  return (dt, row.tracers, row.tags['ell'])
75
- elif 'theta' in row.tags:
118
+ if 'theta' in row.tags:
76
119
  return (dt, row.tracers, row.tags['theta'])
77
- else:
78
- return (dt, row.tracers, 0.0)
120
+ return (dt, row.tracers, 0.0)
79
121
  # This from
80
122
  # https://stackoverflow.com/questions/6422700/how-to-get-indices-of-a-sorted-array-in-python
81
123
  indices = [i[0] for i in sorted(enumerate(self.data),
@@ -105,14 +147,14 @@ class Sacc:
105
147
  # Builder methods for building up Sacc data from scratch in memory
106
148
  #
107
149
 
108
- def add_tracer(self, tracer_type, name,
150
+ def add_tracer(self, type_name, name,
109
151
  *args, **kwargs):
110
152
  """
111
153
  Add a new tracer
112
154
 
113
155
  Parameters
114
156
  ----------
115
- tracer_type: str
157
+ type_name: str
116
158
  A string corresponding to one of the known tracer types,
117
159
  or 'misc' to use a new tracer with no parameters.
118
160
  e.g. "NZ" for n(z) tracers
@@ -135,7 +177,7 @@ class Sacc:
135
177
  None
136
178
 
137
179
  """
138
- tracer = BaseTracer.make(tracer_type, name,
180
+ tracer = BaseTracer.make(type_name, name,
139
181
  *args, **kwargs)
140
182
  self.add_tracer_object(tracer)
141
183
 
@@ -152,6 +194,17 @@ class Sacc:
152
194
  """
153
195
  self.tracers[tracer.name] = tracer
154
196
 
197
+ def add_tracer_uncertainty_object(self, uncertainty):
198
+ """
199
+ Add a pre-constructed tracer uncertainty object to this data set.
200
+
201
+ Parameters
202
+ ----------
203
+ uncertainty: BaseTracerUncertainty instance
204
+ The uncertainty object to add to the data set
205
+ """
206
+ self.tracer_uncertainties[uncertainty.name] = uncertainty
207
+
155
208
  def add_data_point(self, data_type, tracers, value,
156
209
  tracers_later=False, **tags):
157
210
  """
@@ -347,7 +400,7 @@ class Sacc:
347
400
  # Skip things with the wrong type or tracer
348
401
  if not ((tracers is None) or (d.tracers == tracers)):
349
402
  continue
350
- if not ((data_type is None or d.data_type == data_type)):
403
+ if not (data_type is None or d.data_type == data_type):
351
404
  continue
352
405
  # Remove any objects that don't match the required tags,
353
406
  # including the fact that we can specify tag__lt and tag__gt
@@ -744,6 +797,7 @@ class Sacc:
744
797
  for tri in trs:
745
798
  if tri not in names:
746
799
  self.remove_selection(tracers=trs)
800
+ break
747
801
 
748
802
  trs_names = list(self.tracers.keys())
749
803
  for name in trs_names:
@@ -811,10 +865,48 @@ class Sacc:
811
865
  # Convert any window objects in the data set to tables,
812
866
  # and record a mapping from those objects to table references
813
867
  # This could easily be extended to other types
814
- all_windows = unique_list(d.get_tag('window') for d in self.data)
815
- window_ids = {w: id(w) for w in all_windows}
816
- tables = BaseWindow.to_tables(all_windows)
817
- return tables, window_ids
868
+ windows = []
869
+ for d in self.data:
870
+ w = d.get_tag("window")
871
+ if w is not None:
872
+ windows.append(w)
873
+
874
+ windows = unique_list(windows)
875
+ window_ids = {id(w):w for w in windows}
876
+ return window_ids
877
+
878
+ def to_tables(self):
879
+ """
880
+ Convert this data set to a collection of astropy tables.
881
+
882
+ Parameters
883
+ ----------
884
+ None
885
+
886
+ Returns
887
+ -------
888
+ tables: list of astropy Table objects
889
+ A list of tables, each corresponding to a different
890
+ type of object in the data set. The tables will have
891
+ metadata that can be used to reconstruct the data set.
892
+ """
893
+ # Get the tracers
894
+ objects = {
895
+ "tracer": self.tracers,
896
+ "data": self.data,
897
+ "window": self._make_window_tables(),
898
+ "metadata": self.metadata,
899
+ "traceruncertainty": self.tracer_uncertainties,
900
+ }
901
+
902
+ if self.has_covariance():
903
+ # For now the name will just be "cov", but in future
904
+ # we may support alternatives.
905
+ objects["covariance"] = {self.covariance.name: self.covariance}
906
+
907
+ tables = io.to_tables(objects)
908
+
909
+ return tables
818
910
 
819
911
  def save_fits(self, filename, overwrite=False):
820
912
  """
@@ -830,71 +922,30 @@ class Sacc:
830
922
  If True, overwrite the file silently.
831
923
  """
832
924
 
833
- # Since we don't want to re-order the file as a side effect
834
- # we first make a copy of ourself and re-order that.
835
- # Tables for the windows
836
- tables, window_ids = self._make_window_tables()
837
- lookup = {'window': window_ids}
925
+ if os.path.exists(filename) and not overwrite:
926
+ raise FileExistsError(f"File {filename} already exists. "
927
+ "Use overwrite=True to overwrite it.")
838
928
 
839
- # Tables for the tracers
840
- tables += BaseTracer.to_tables(self.tracers.values())
929
+ tables = self.to_tables()
841
930
 
842
- # Tables for the data sets
843
- for dt in self.get_data_types():
844
- indices = self.indices(dt)
845
- data = [self.data[i] for i in indices]
846
- table = DataPoint.to_table(data, lookup)
847
- table.add_column(indices, name='sacc_ordering')
848
- # Could move this inside to_table?
849
- table.meta['SACCTYPE'] = 'data'
850
- table.meta['SACCNAME'] = dt
851
- table.meta['EXTNAME'] = f'data:{dt}'
852
- tables.append(table)
931
+ # Add the EXTNAME metadata value to each table.
932
+ # This is used to set the HDU name in the FITS file.
933
+ for table in tables:
934
+ typ = table.meta['SACCTYPE']
935
+ name = table.meta['SACCNAME']
936
+ if typ != 'data':
937
+ cls = table.meta['SACCCLSS']
938
+ extname = f'{typ}:{cls}:{name}'
939
+ table.meta['EXTNAME'] = extname
853
940
 
854
941
  # Create the actual fits object
855
- hdr = fits.Header()
856
-
857
- # save any global metadata in the header.
858
- # We save the keys and values as separate header cards,
859
- # because otherwise the keys are all forced to upper case
860
- hdr['NMETA'] = len(self.metadata)
861
- for i, (k, v) in enumerate(self.metadata.items()):
862
- hdr[f'KEY{i}'] = k
863
- hdr[f'VAL{i}'] = v
864
- hdus = [fits.PrimaryHDU(header=hdr)] + \
865
- [fits.table_to_hdu(table) for table in tables]
866
-
867
- # Covariance, if needed.
868
- # All the other data elements become astropy tables first,
869
- # But covariances are a bit more complicated and dense, so we
870
- # allow them to convert straight to
871
- if self.covariance is not None:
872
- hdus.append(self.covariance.to_hdu())
873
-
874
- # Make and save the final FITS data
942
+ primary_header = fits.Header()
943
+ with warnings.catch_warnings():
944
+ warnings.simplefilter("ignore", category=fits.verify.VerifyWarning)
945
+ hdus = [fits.PrimaryHDU(header=primary_header)] + \
946
+ [fits.table_to_hdu(table) for table in tables]
875
947
  hdu_list = fits.HDUList(hdus)
876
-
877
- # The astropy writeto shows very poor performance
878
- # when writing lots of small metadata strings on
879
- # the NERSC Lustre file system. So we write to
880
- # a buffer first and then save that.
881
-
882
- # First we have to manually check for overwritten files
883
- # We raise the same error as astropy
884
- if os.path.exists(filename) and not overwrite:
885
- raise OSError(f"File {filename} already exists and overwrite=False")
886
-
887
- # Create the buffer and write the data to it
888
- buf = BytesIO()
889
- hdu_list.writeto(buf)
890
-
891
- # Rewind and read the binary data we just wrote
892
- buf.seek(0)
893
- output_data = buf.read()
894
-
895
- # Write the binary data to the target file
896
- with open(filename, "wb") as f:
897
- f.write(output_data)
948
+ io.astropy_buffered_fits_write(filename, hdu_list)
898
949
 
899
950
  @classmethod
900
951
  def load_fits(cls, filename):
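
With the rewrite above, trying to write over an existing file now raises FileExistsError (a subclass of the OSError raised previously). A quick sketch with a hypothetical path:

    s.save_fits("cls_v2.fits")                  # raises FileExistsError if the file exists
    s.save_fits("cls_v2.fits", overwrite=True)  # replaces the file silently
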
@@ -909,87 +960,195 @@ class Sacc:
909
960
  filename: str
910
961
  A FITS format sacc file
911
962
  """
912
- hdu_list = fits.open(filename, "readonly")
913
-
914
- # Split the HDU's into the different sacc types
915
- tracer_tables = [Table.read(hdu)
916
- for hdu in hdu_list
917
- if hdu.header.get('SACCTYPE') == 'tracer']
918
- window_tables = [Table.read(hdu)
919
- for hdu in hdu_list
920
- if hdu.header.get('SACCTYPE') == 'window']
921
- data_tables = [Table.read(hdu) for hdu in hdu_list
922
- if hdu.header.get('SACCTYPE') == 'data']
923
- cov = [hdu for hdu in hdu_list if hdu.header.get('SACCTYPE') == 'cov']
924
-
925
- # Pull out the classes for these components.
926
- tracers = BaseTracer.from_tables(tracer_tables)
927
- windows = BaseWindow.from_tables(window_tables)
928
-
929
- # The lookup table is used to convert from ID numbers to
930
- # Window objects.
931
- lookup = {'window': windows}
932
-
933
- # Check if all tables have the 'sacc_ordering' column
934
- if not all("sacc_ordering" in table.colnames for table in data_tables):
935
- warnings.warn(
936
- "The FITS format without the 'sacc_ordering' column is deprecated. "
937
- "Assuming data rows are in the correct order as it was before version 1.0."
938
- )
939
- last_index = 0
940
- for table in data_tables:
941
- # Create a sequential order assuming rows are stored contiguously
942
- order = range(last_index, last_index + len(table))
943
- # Update last_index for the next table
944
- last_index += len(table)
945
- # Add the 'sacc_ordering' column to the table
946
- table.add_column(order, name="sacc_ordering")
947
-
948
- # Collect together all the data points from the different sections
949
- data_unordered = []
950
- index = []
951
- for table in data_tables:
952
- index += table["sacc_ordering"].tolist()
953
- table.remove_column('sacc_ordering')
954
- data_unordered += DataPoint.from_table(table, lookup)
955
-
956
- # Put the data back in its original order, matching the
957
- # covariance.
958
- data = [None for i in range(len(data_unordered))]
959
- for i, d in zip(index, data_unordered):
960
- data[i] = d
961
-
962
- # Finally, take all the pieces that we have collected
963
- # and add them all into this data set.
964
- S = cls()
963
+ cov = None
964
+ metadata = None
965
+
966
+ with fits.open(filename, mode="readonly") as f:
967
+ tables = []
968
+ for hdu in f:
969
+ if hdu.name.lower() == 'primary':
970
+ # The primary table is not a data table,
971
+ # but in older files it was used to store metadata
972
+ header = hdu.header
973
+ if "NMETA" in header:
974
+ metadata = {}
975
+ # Older format metadata is kept in the primary
976
+ # header, with keys KEY0, VAL0, etc.
977
+ n_meta = header['NMETA']
978
+ for i in range(n_meta):
979
+ k = header[f'KEY{i}']
980
+ v = header[f'VAL{i}']
981
+ metadata[k] = v
982
+ elif hdu.name.lower() == 'covariance':
983
+ # Legacy covariance - HDU will just be called covariance
984
+ # instead of the full name given by BaseIO.
985
+ # Note that this will also allow us to use multiple
986
+ # covariances in future.
987
+ cov = BaseCovariance.from_hdu(hdu)
988
+ else:
989
+ tables.append(Table.read(hdu))
990
+
991
+ # add the metadata table, if we are in the legacy format
992
+ if metadata is not None:
993
+ tables.append(io.metadata_to_table(metadata))
994
+
995
+ return cls.from_tables(tables, cov=cov)
996
+
997
+ def save_hdf5(self, filename, overwrite=False, compression='gzip', compression_opts=4):
998
+ """
999
+ Save this data to a HDF5 format Sacc file.
1000
+
1001
+ Parameters
1002
+ ----------
1003
+ filename: str
1004
+ Destination HDF5 file name
1005
+ overwrite: bool
1006
+ If False (the default), raise an error if the file already exists
1007
+ If True, overwrite the file silently.
1008
+ compression: str, optional
1009
+ Compression filter to use ('gzip', 'lzf', 'szip', or None). Default is 'gzip'.
1010
+ compression_opts : int, optional
1011
+ Compression level (0-9 for gzip, where 0 is no compression and 9 is maximum).
1012
+ Default is 4 (moderate compression).
1013
+ """
1014
+ import h5py
1015
+ if os.path.exists(filename) and not overwrite:
1016
+ raise FileExistsError(f"File {filename} already exists. "
1017
+ "Use overwrite=True to overwrite it.")
1018
+ tables = self.to_tables()
1019
+
1020
+ # Add the EXTNAME metadata value to each table.
1021
+ for table in tables:
1022
+ typ = table.meta['SACCTYPE']
1023
+ name = table.meta['SACCNAME']
1024
+ if typ != 'data':
1025
+ cls = table.meta['SACCCLSS']
1026
+ extname = f'{typ}:{cls}:{name}'
1027
+ table.meta['EXTNAME'] = extname
1028
+
1029
+ with h5py.File(filename, 'w') as f:
1030
+ used_names = {}
1031
+ for table in tables:
1032
+ # Build a meaningful dataset name
1033
+ typ = table.meta.get('SACCTYPE', 'unknown')
1034
+ name = table.meta.get('SACCNAME', None)
1035
+ cls = table.meta.get('SACCCLSS', None)
1036
+ part = table.meta.get('SACCPART', None)
1037
+
1038
+ # Compose base dataset name
1039
+ if typ == 'data' and name:
1040
+ dset_name = f"data/{name}"
1041
+ elif typ == 'tracer' and name:
1042
+ dset_name = f"tracer/{name}"
1043
+ elif typ == 'traceruncertainty' and name:
1044
+ dset_name = f"traceruncertainty/{name}"
1045
+ elif typ == 'window' and name:
1046
+ dset_name = f"window/{name}"
1047
+ if part:
1048
+ dset_name += f"_{part}"
1049
+ elif typ == 'covariance' and name:
1050
+ dset_name = f"covariance_{name}"
1051
+ elif typ == 'metadata':
1052
+ dset_name = "metadata"
1053
+ elif name:
1054
+ dset_name = f"{typ}_{name}"
1055
+ else:
1056
+ dset_name = typ
1057
+
1058
+ # Ensure uniqueness by appending an index if needed
1059
+ base_name = dset_name
1060
+ idx = used_names.get(base_name, 0)
1061
+ while dset_name in f:
1062
+ idx += 1
1063
+ dset_name = f"{base_name}_{idx}"
1064
+ used_names[base_name] = idx
1065
+
1066
+ table.write(f,
1067
+ path=dset_name,
1068
+ serialize_meta=False,
1069
+ compression=compression,
1070
+ compression_opts=compression_opts
1071
+ )
1072
+
1073
+ @classmethod
1074
+ def load_hdf5(cls, filename):
1075
+ """
1076
+ Load a Sacc object from an HDF5 file.
1077
+
1078
+ Parameters
1079
+ ----------
1080
+ filename: str
1081
+ Path to the HDF5 file.
1082
+
1083
+ Returns
1084
+ -------
1085
+ sacc_obj: Sacc
1086
+ A Sacc object reconstructed from the tables in the HDF5 file.
1087
+ """
1088
+ import h5py
1089
+ recovered_tables = []
1090
+ with h5py.File(filename, 'r') as f:
1091
+ # Read all datasets (not groups) in the order they appear
1092
+ for key in f.keys():
1093
+ item = f[key]
1094
+ if isinstance(item, h5py.Dataset):
1095
+ table = Table.read(f, path=key)
1096
+ recovered_tables.append(table)
1097
+ elif isinstance(item, h5py.Group):
1098
+ for subkey in item.keys():
1099
+ subitem = item[subkey]
1100
+ if isinstance(subitem, h5py.Dataset):
1101
+ table = Table.read(item, path=f"{subkey}")
1102
+ recovered_tables.append(table)
1103
+ sacc_obj = cls.from_tables(recovered_tables)
1104
+ return sacc_obj
1105
+
1106
+ @classmethod
1107
+ def from_tables(cls, tables, cov=None):
1108
+ """
1109
+ Reassemble a Sacc object from a collection of tables.
1110
+
1111
+ Parameters
1112
+ ----------
1113
+ tables: list of astropy Table objects
1114
+ The tables to rebuild the data set from, covering some of 'tracer',
1115
+ 'data', 'window', 'covariance', 'traceruncertainty' and 'metadata'.
1116
+ cov: BaseCovariance instance, optional legacy covariance to attach.
1117
+ """
1118
+ s = cls()
1119
+
1120
+ objs = io.from_tables(tables)
1121
+
1122
+ # Add all the tracers
1123
+ tracers = objs.get('tracer', {})
965
1124
  for tracer in tracers.values():
966
- S.add_tracer_object(tracer)
1125
+ s.add_tracer_object(tracer)
967
1126
 
968
- # Add the data points manually instead of using the API, since we
969
- # have already constructed them.
1127
+ # Add the actual data points. The windows and any future
1128
+ # objects that are attached to individual data points
1129
+ # will be included in the data points themselves, there is
1130
+ # no need to add them separately.
1131
+ data = fix_data_ordering(objs.get('data', []))
970
1132
  for d in data:
971
- S.data.append(d)
1133
+ s.data.append(d)
972
1134
 
973
- # Assume there is only a single covariance extension,
974
- # if there are any
975
- if cov:
976
- S.add_covariance(BaseCovariance.from_hdu(cov[0]))
1135
+ # Add the covariance, if it is present.
1136
+ if "covariance" in objs:
1137
+ if cov is not None:
1138
+ raise ValueError("Found both a legacy covariance and a new one in the same file.")
1139
+ cov = objs["covariance"]["cov"]
977
1140
 
978
- # Load metadata from the primary heaer
979
- header = hdu_list[0].header
1141
+ # copy in metadata
1142
+ s.metadata.update(objs.get('metadata', {}))
980
1143
 
981
- # Load each key,value pair in turn.
982
- # This will work for normal scalar data types;
983
- # arrays etc. will need some thought.
984
- n_meta = header['NMETA']
985
- for i in range(n_meta):
986
- k = header[f'KEY{i}']
987
- v = header[f'VAL{i}']
988
- S.metadata[k] = v
1144
+ if cov is not None:
1145
+ s.add_covariance(cov)
989
1146
 
990
- hdu_list.close()
1147
+ for uncertainty in objs.get('traceruncertainty', {}).values():
1148
+ s.add_tracer_uncertainty_object(uncertainty)
1149
+
1150
+ return s
991
1151
 
992
- return S
993
1152
 
994
1153
  #
995
1154
  # Methods below here are helper functions for specific types of data.
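
The new HDF5 path mirrors the FITS one; a round-trip sketch using the signatures above (file names are hypothetical, and h5py must be installed):

    import sacc

    s = sacc.Sacc.load_fits("cls_v2.fits")
    s.save_hdf5("cls_v2.h5", overwrite=True,
                compression="gzip", compression_opts=4)
    s2 = sacc.Sacc.load_hdf5("cls_v2.h5")
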
@@ -1012,13 +1171,10 @@ class Sacc:
1012
1171
  cov_block = self.covariance.get_block(ind)
1013
1172
  if return_ind:
1014
1173
  return angle, mu, cov_block, ind
1015
- else:
1016
- return angle, mu, cov_block
1017
- else:
1018
- if return_ind:
1019
- return angle, mu, ind
1020
- else:
1021
- return angle, mu
1174
+ return angle, mu, cov_block
1175
+ if return_ind:
1176
+ return angle, mu, ind
1177
+ return angle, mu
1022
1178
 
1023
1179
  def get_bandpower_windows(self, indices):
1024
1180
  """
@@ -1047,10 +1203,8 @@ class Sacc:
1047
1203
  if not isinstance(ws, BandpowerWindow):
1048
1204
  warnings.warn("No bandpower windows associated with these data")
1049
1205
  return None
1050
- else:
1051
- w_inds = np.array(self._get_tags_by_index(['window_ind'],
1052
- indices)[0])
1053
- return ws.get_section(w_inds)
1206
+ w_inds = np.array(self._get_tags_by_index(['window_ind'],indices)[0])
1207
+ return ws.get_section(w_inds)
1054
1208
 
1055
1209
  def get_ell_cl(self, data_type, tracer1, tracer2,
1056
1210
  return_cov=False, return_ind=False):
@@ -1151,7 +1305,7 @@ class Sacc:
1151
1305
  tracers_later=tracers_later, **t)
1152
1306
  return
1153
1307
  # multiple ell/theta values but same bin
1154
- elif np.isscalar(tracer1):
1308
+ if np.isscalar(tracer1):
1155
1309
  n1 = len(x)
1156
1310
  n2 = len(tag_val)
1157
1311
  if tag_extra_name is None:
@@ -1159,7 +1313,7 @@ class Sacc:
1159
1313
  n3 = n1
1160
1314
  else:
1161
1315
  n3 = len(tag_extra)
1162
- if not (n1 == n2 == n3):
1316
+ if not n1 == n2 == n3:
1163
1317
  raise ValueError("Length of inputs do not match in"
1164
1318
  f"added 2pt data ({n1},{n2},{n3})")
1165
1319
  if window is None:
@@ -1184,7 +1338,7 @@ class Sacc:
1184
1338
  n5 = n1
1185
1339
  else:
1186
1340
  n5 = len(tag_extra)
1187
- if not (n1 == n2 == n3 == n4 == n5):
1341
+ if not n1 == n2 == n3 == n4 == n5:
1188
1342
  raise ValueError("Length of inputs do not match in "
1189
1343
  f"added 2pt data ({n1},{n2},{n3},{n4},{n5})")
1190
1344
  if window is None:
@@ -1213,7 +1367,7 @@ class Sacc:
1213
1367
  n6 = n1
1214
1368
  else:
1215
1369
  n6 = len(tag_extra)
1216
- if not (n1 == n2 == n3 == n4 == n5 == n6):
1370
+ if not n1 == n2 == n3 == n4 == n5 == n6:
1217
1371
  raise ValueError("Length of inputs do not match in added "
1218
1372
  f"2pt data ({n1},{n2},{n3},{n4},{n5},{n6})")
1219
1373
  if window is None:
@@ -1471,3 +1625,57 @@ def concatenate_data_sets(*data_sets, labels=None, same_tracers=None):
1471
1625
  output.metadata[key] = val
1472
1626
 
1473
1627
  return output
1628
+
1629
+
1630
+
1631
+
1632
+ def fix_data_ordering(data_points):
1633
+ """
1634
+ SACC data points have an ordering column called 'sacc_ordering'
1635
+ which is used to keep the data points in the same order as
1636
+ the covariance matrix. This function re-orders the data points
1637
+ accordingly
1638
+
1639
+ Parameters
1640
+ ----------
1641
+ data_points: list of DataPoint objects
1642
+
1643
+ Returns
1644
+ -------
1645
+ ordered_data_points: list of DataPoint objects
1646
+
1647
+ """
1648
+ # Older versions of SACC did not have this column, so we
1649
+ # check for that situation and if not then add it here, in the
1650
+ # order the data points were found in the file.
1651
+ # In the old sacc version this order automatically matched the
1652
+ # covariance matrix.
1653
+ have_ordering = ['sacc_ordering' in dp.tags for dp in data_points]
1654
+ if not all(have_ordering):
1655
+
1656
+ if any(have_ordering):
1657
+ raise ValueError(
1658
+ "Some data points have sacc ordering and some do not. "
1659
+ "Hybrid old/new version. This is very wrong. "
1660
+ "Please check your data files or ask on #desc-sacc for help."
1661
+ )
1662
+
1663
+ print("Warning: The FITS format without the 'sacc_ordering' column is deprecated")
1664
+ print("Assuming data rows are in the correct order as it was before version 1.0.")
1665
+ for i, dp in enumerate(data_points):
1666
+ dp.tags['sacc_ordering'] = i
1667
+
1668
+
1669
+
1670
+ # In either case, we now have the 'sacc_ordering' column,
1671
+ # so can re-order the data points.
1672
+ ordered_data_points = [None for i in range(len(data_points))]
1673
+ for dp in data_points:
1674
+ i = dp.tags['sacc_ordering']
1675
+ ordered_data_points[i] = dp
1676
+
1677
+ # We remove the ordering tag now, as it is not needed
1678
+ # in the main library
1679
+ del dp.tags['sacc_ordering']
1680
+
1681
+ return ordered_data_points
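
To illustrate what fix_data_ordering does, a self-contained sketch with stand-in objects (FakePoint is hypothetical; the real function receives DataPoint instances whose tags dict carries 'sacc_ordering', and it is assumed here to be importable from sacc.sacc):

    from sacc.sacc import fix_data_ordering

    class FakePoint:
        def __init__(self, value, order):
            self.value = value
            self.tags = {'sacc_ordering': order}

    points = [FakePoint('b', 1), FakePoint('a', 0), FakePoint('c', 2)]
    ordered = fix_data_ordering(points)
    print([p.value for p in ordered])   # ['a', 'b', 'c']
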