ml4gw 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ml4gw/augmentations.py +43 -0
- ml4gw/dataloading/__init__.py +2 -1
- ml4gw/dataloading/chunked_dataset.py +66 -212
- ml4gw/dataloading/hdf5_dataset.py +176 -0
- ml4gw/nn/__init__.py +0 -0
- ml4gw/nn/autoencoder/__init__.py +3 -0
- ml4gw/nn/autoencoder/base.py +89 -0
- ml4gw/nn/autoencoder/convolutional.py +156 -0
- ml4gw/nn/autoencoder/skip_connection.py +46 -0
- ml4gw/nn/autoencoder/utils.py +14 -0
- ml4gw/nn/norm.py +97 -0
- ml4gw/nn/resnet/__init__.py +2 -0
- ml4gw/nn/resnet/resnet_1d.py +413 -0
- ml4gw/nn/resnet/resnet_2d.py +413 -0
- ml4gw/nn/streaming/__init__.py +2 -0
- ml4gw/nn/streaming/online_average.py +121 -0
- ml4gw/nn/streaming/snapshotter.py +121 -0
- ml4gw/transforms/__init__.py +2 -0
- ml4gw/transforms/pearson.py +87 -0
- ml4gw/transforms/spectrogram.py +162 -0
- ml4gw/transforms/whitening.py +1 -1
- ml4gw/waveforms/__init__.py +2 -0
- ml4gw/waveforms/phenom_d.py +1359 -0
- ml4gw/waveforms/phenom_d_data.py +3026 -0
- ml4gw/waveforms/taylorf2.py +306 -0
- {ml4gw-0.2.0.dist-info → ml4gw-0.4.0.dist-info}/METADATA +14 -6
- ml4gw-0.4.0.dist-info/RECORD +43 -0
- {ml4gw-0.2.0.dist-info → ml4gw-0.4.0.dist-info}/WHEEL +1 -1
- ml4gw-0.2.0.dist-info/RECORD +0 -23
ml4gw/augmentations.py
ADDED
@@ -0,0 +1,43 @@
+import torch
+
+
+class SignalInverter(torch.nn.Module):
+    """
+    Takes a tensor of timeseries of arbitrary dimension
+    and randomly inverts (i.e. h(t) -> -h(t))
+    each timeseries with probability `prob`.
+
+    Args:
+        prob:
+            Probability that a timeseries is inverted
+    """
+
+    def __init__(self, prob: float = 0.5):
+        super().__init__()
+        self.prob = prob
+
+    def forward(self, X):
+        mask = torch.rand(size=X.shape[:-1]) < self.prob
+        X[mask] *= -1
+        return X
+
+
+class SignalReverser(torch.nn.Module):
+    """
+    Takes a tensor of timeseries of arbitrary dimension
+    and randomly reverses (i.e. h(t) -> h(-t))
+    each timeseries with probability `prob`.
+
+    Args:
+        prob:
+            Probability that a kernel is reversed
+    """
+
+    def __init__(self, prob: float = 0.5):
+        super().__init__()
+        self.prob = prob
+
+    def forward(self, X):
+        mask = torch.rand(size=X.shape[:-1]) < self.prob
+        X[mask] = X[mask].flip(-1)
+        return X
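For orientation, a minimal usage sketch of the two new augmentation modules; the batch shape, probability values, and the `torch.nn.Sequential` composition are illustrative assumptions rather than anything prescribed by the diff:

import torch

from ml4gw.augmentations import SignalInverter, SignalReverser  # assumed import path

# hypothetical batch: 32 samples, 2 interferometer channels, 2048 timesteps
X = torch.randn(32, 2, 2048)

# each (sample, channel) timeseries is independently sign-flipped and/or
# time-reversed with probability 0.5; note the modules modify X in place
augment = torch.nn.Sequential(SignalInverter(prob=0.5), SignalReverser(prob=0.5))
X = augment(X)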
ml4gw/dataloading/chunked_dataset.py
CHANGED
@@ -1,262 +1,113 @@
-from
+from collections.abc import Iterable
 
-import h5py
-import numpy as np
 import torch
 
 
-class
-    def __init__(
-        self,
-        fnames: List[str],
-        channels: List[str],
-        chunk_size: int,
-        reads_per_chunk: int,
-        chunks_per_epoch: int,
-        coincident: bool = True,
-    ) -> None:
-        self.fnames = fnames
-        self.channels = channels
-        self.chunk_size = chunk_size
-        self.reads_per_chunk = reads_per_chunk
-        self.chunks_per_epoch = chunks_per_epoch
-        self.coincident = coincident
-
-        sizes = []
-        for f in self.fnames:
-            with h5py.File(f, "r") as f:
-                size = len(f[self.channels[0]])
-                sizes.append(size)
-        total = sum(sizes)
-        self.probs = np.array([i / total for i in sizes])
-
-    def sample_fnames(self):
-        return np.random.choice(
-            self.fnames,
-            p=self.probs,
-            size=(self.reads_per_chunk,),
-            replace=True,
-        )
-
-    def load_coincident(self):
-        fnames = self.sample_fnames()
-        chunks = []
-        for fname in fnames:
-            with h5py.File(fname, "r") as f:
-                chunk, idx = [], None
-                for channel in self.channels:
-                    if idx is None:
-                        end = len(f[channel]) - self.chunk_size
-                        idx = np.random.randint(0, end)
-                    x = f[channel][idx : idx + self.chunk_size]
-                    chunk.append(x)
-                chunks.append(np.stack(chunk))
-        return np.stack(chunks)
-
-    def load_noncoincident(self):
-        chunks = []
-        for channel in self.channels:
-            fnames = self.sample_fnames()
-            chunk = []
-            for fname in fnames:
-                with h5py.File(fname, "r") as f:
-                    end = len(f[channel]) - self.chunk_size
-                    idx = np.random.randint(0, end)
-                    x = f[channel][idx : idx + self.chunk_size]
-                    chunk.append(x)
-            chunks.append(np.stack(chunk))
-        return np.stack(chunks, axis=1)
-
-    def iter_epoch(self):
-        for _ in range(self.chunks_per_epoch):
-            if self.coincident:
-                yield torch.Tensor(self.load_coincident())
-            else:
-                yield torch.Tensor(self.load_noncoincident())
-
-    def collate(self, xs):
-        return torch.cat(xs, axis=0)
-
-    def __iter__(self):
-        return self.iter_epoch()
-
-
-class ChunkedDataset(torch.utils.data.IterableDataset):
+class ChunkedTimeSeriesDataset(torch.utils.data.IterableDataset):
     """
-    background from `reads_per_chunk` randomly sampled
-    files up front, then samples `batches_per_chunk`
-    batches of kernels from this chunk before loading
-    in the next one. Terminates after `chunks_per_epoch`
-    chunks have been exhausted, which amounts to
-    `chunks_per_epoch * batches_per_chunk` batches.
-
-    Note that filenames are not sampled uniformly
-    at chunk-loading time, but are weighted according
-    to the amount of data each file contains. This ensures
-    a uniform sampling over time across the whole dataset.
-
-    To load chunks asynchronously in the background,
-    specify `num_workers > 0`. Note that if the
-    number of workers is not an even multiple of
-    `chunks_per_epoch`, the last chunks of an epoch
-    will be composed of fewer than `reads_per_chunk`
-    individual segments.
+    Wrapper dataset that will loop through chunks of timeseries
+    data produced by another iterable and sample windows from
+    these chunks.
 
     Args:
-            has been sampled.
+        chunk_it:
+            Iterator which will produce chunks of timeseries
+            data to sample windows from. Should have shape
+            `(N, C, T)`, where `N` is the number of chunks
+            to sample from, `C` is the number of channels,
+            and `T` is the number of samples along the
+            time dimension for each chunk.
+        kernel_size:
+            Size of windows to be sampled from each chunk.
+            Should be less than the size of each chunk
+            along the time dimension.
         batch_size:
-            Number of
-        reads_per_chunk:
-            Number of file reads to perform when generating
-            each chunk
-        chunk_length:
-            Amount of data to read for each segment loaded
-            into each chunk, in seconds
+            Number of windows to sample at each iteration
         batches_per_chunk:
-            Number of batches to sample from
-            before
+            Number of batches of windows to sample from
+            each chunk before moving on to the next one.
+            Sampling fewer batches from each chunk means
+            a lower likelihood of sampling duplicate windows,
+            but an increase in chunk-loading overhead.
         coincident:
-            asynchronously. If left as 0, chunk loading will
-            be performed in serial with batch sampling.
+            Whether the windows sampled from individual
+            channels in each batch element should be
+            sampled coincidentally, i.e. consisting of
+            the same timesteps, or whether each window
+            should be sample independently from the others.
         device:
+            Which device chunks should be moved to upon loading.
     """
 
     def __init__(
         self,
-        kernel_length: float,
-        sample_rate: float,
+        chunk_it: Iterable,
+        kernel_size: float,
         batch_size: int,
-        reads_per_chunk: int,
-        chunk_length: float,
         batches_per_chunk: int,
-        chunks_per_epoch: int,
         coincident: bool = True,
-        num_workers: int = 0,
         device: str = "cpu",
-        pin_memory: bool = False,
     ) -> None:
-        elif reads_per_chunk < num_workers:
-            raise ValueError(
-                "Too many workers {} for number of reads_per_chunk {}".format(
-                    num_workers, reads_per_chunk
-                )
-            )
-        else:
-            reads_per_worker = int(reads_per_chunk // num_workers)
-
-        if kernel_length > chunk_length:
-            raise ValueError(
-                "Kernel length {} must be shorter than "
-                "chunk length {}".format(kernel_length, chunk_length)
-            )
-        self.kernel_size = int(kernel_length * sample_rate)
-        self.chunk_size = int(chunk_length * sample_rate)
-
-        chunk_loader = ChunkLoader(
-            fnames,
-            channels,
-            self.chunk_size,
-            reads_per_worker,
-            chunks_per_epoch,
-            coincident=coincident,
-        )
-
-        if not num_workers:
-            self.chunk_loader = chunk_loader
-        else:
-            self.chunk_loader = torch.utils.data.DataLoader(
-                chunk_loader,
-                batch_size=num_workers,
-                num_workers=num_workers,
-                pin_memory=pin_memory,
-                collate_fn=chunk_loader.collate,
-            )
-
-        self.device = device
-        self.num_channels = len(channels)
-        self.coincident = coincident
-
+        self.chunk_it = chunk_it
+        self.kernel_size = kernel_size
         self.batch_size = batch_size
         self.batches_per_chunk = batches_per_chunk
-        self.
-        self.
+        self.coincident = coincident
+        self.device = device
 
     def __len__(self):
-        return self.chunks_per_epoch * self.batches_per_chunk
+        return len(self.chunk_it) * self.batches_per_chunk
 
+    def __iter__(self):
+        it = iter(self.chunk_it)
+        chunk = next(it)
+        num_chunks, num_channels, chunk_size = chunk.shape
+
+        # if we're sampling coincidentally, we only need
+        # to sample indices on a per-batch-element basis.
+        # Otherwise, we'll need indices for both each
+        # batch sample _and_ each channel with each sample
+        if self.coincident:
+            sample_size = (self.batch_size,)
+        else:
+            sample_size = (self.batch_size, num_channels)
 
-    def iter_epoch(self):
         # slice kernels out a flattened chunk tensor
         # index-for-index. We'll account for batch/
         # channel indices by introducing offsets later on
         idx = torch.arange(self.kernel_size, device=self.device)
         idx = idx.view(1, 1, -1)
-        idx = idx.repeat(self.batch_size,
+        idx = idx.repeat(self.batch_size, num_channels, 1)
 
         # this will just be a set of aranged channel indices
         # repeated to offset the kernel indices in the
         # flattened chunk tensor
-        channel_idx = torch.arange(
+        channel_idx = torch.arange(num_channels, device=self.device)
         channel_idx = channel_idx.view(1, -1, 1)
         channel_idx = channel_idx.repeat(self.batch_size, 1, self.kernel_size)
-        idx += channel_idx *
+        idx += channel_idx * chunk_size
 
+        while True:
             # record the number of rows in the chunk, then
             # flatten it to make it easier to slice
+            if chunk_size < self.kernel_size:
+                raise ValueError(
+                    "Can't sample kernels of size {} from chunk "
+                    "with size {}".format(self.kernel_size, chunk_size)
+                )
+            chunk = chunk.reshape(-1)
 
             # generate batches from the current chunk
             for _ in range(self.batches_per_chunk):
-                # if we're sampling coincidentally, we only need
-                # to sample indices on a per-batch-element basis.
-                # Otherwise, we'll need indices for both each
-                # batch sample _and_ each channel with each sample
-                if self.coincident:
-                    size = (self.batch_size,)
-                else:
-                    size = (self.batch_size, self.num_channels)
-
                 # first sample the indices of which chunk elements
                 # we're going to read batch elements from
                 chunk_idx = torch.randint(
-                    0, num_chunks, size=
+                    0, num_chunks, size=sample_size, device=self.device
                 )
 
                 # account for the offset this batch element
                 # introduces in the flattened array
-                chunk_idx *=
+                chunk_idx *= num_channels * chunk_size
                 chunk_idx = chunk_idx.view(self.batch_size, -1, 1)
                 chunk_idx = chunk_idx + idx
 
@@ -265,7 +116,7 @@ class ChunkedDataset(torch.utils.data.IterableDataset):
                 time_idx = torch.randint(
                     0,
                     chunk_size - self.kernel_size,
-                    size=
+                    size=sample_size,
                     device=self.device,
                 )
                 time_idx = time_idx.view(self.batch_size, -1, 1)
 
@@ -276,5 +127,8 @@ class ChunkedDataset(torch.utils.data.IterableDataset):
                 # now slice this 3D tensor from our flattened chunk
                 yield chunk[chunk_idx]
 
+            try:
+                chunk = next(it)
+            except StopIteration:
+                break
+            num_chunks, num_channels, chunk_size = chunk.shape
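A minimal sketch of how the renamed class might be driven with an arbitrary chunk-producing iterable; the toy `chunks` list, the shapes, and the hyperparameter values are illustrative assumptions, and the import path assumes the class is re-exported from `ml4gw.dataloading`:

import torch

from ml4gw.dataloading import ChunkedTimeSeriesDataset  # assumed import path

# toy stand-in for a chunk iterator: 3 chunks, each with shape
# (num_chunks=8, num_channels=2, chunk_size=16384)
chunks = [torch.randn(8, 2, 16384) for _ in range(3)]

dataset = ChunkedTimeSeriesDataset(
    chunk_it=chunks,
    kernel_size=2048,
    batch_size=32,
    batches_per_chunk=4,
    coincident=True,
    device="cpu",
)

# yields 3 chunks * 4 batches_per_chunk = 12 batches of windows
for X in dataset:
    assert X.shape == (32, 2, 2048)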
ml4gw/dataloading/hdf5_dataset.py
ADDED
@@ -0,0 +1,176 @@
+import warnings
+from typing import Sequence, Union
+
+import h5py
+import numpy as np
+import torch
+
+from ml4gw.types import WaveformTensor
+
+
+class ContiguousHdf5Warning(Warning):
+    pass
+
+
+class Hdf5TimeSeriesDataset(torch.utils.data.IterableDataset):
+    """
+    Iterable dataset that samples and loads windows of
+    timeseries data uniformly from a set of HDF5 files.
+    It is _strongly_ recommended that these files have been
+    written using [chunked storage]
+    (https://docs.h5py.org/en/stable/high/dataset.html#chunked-storage).
+    This has shown to produce increases in read-time speeds
+    of over an order of magnitude.
+
+    Args:
+        fnames:
+            Paths to HDF5 files from which to sample data.
+        channels:
+            Datasets to read from the indicated files, which
+            will be stacked along dim 1 of the generated batches
+            during iteration.
+        kernel_size:
+            Size of the windows to read, in number of samples.
+            This will be the size of the last dimension of the
+            generated batches.
+        batch_size:
+            Number of windows to sample at each iteration.
+        batches_per_epoch:
+            Number of batches to generate during each call
+            to `__iter__`.
+        coincident:
+            Whether windows for each channel in a given batch
+            element should be sampled coincidentally, i.e.
+            corresponding to the same time indices from the
+            same files, or should be sampled independently.
+            For the latter case, users can either specify
+            `False`, which will sample filenames independently
+            for each channel, or `"files"`, which will sample
+            windows independently within a given file for each
+            channel. The latter setting limits the amount of
+            entropy in the effective dataset, but can provide
+            over 2x improvement in total throughput.
+    """
+
+    def __init__(
+        self,
+        fnames: Sequence[str],
+        channels: Sequence[str],
+        kernel_size: int,
+        batch_size: int,
+        batches_per_epoch: int,
+        coincident: Union[bool, str],
+    ) -> None:
+        if not isinstance(coincident, bool) and coincident != "files":
+            raise ValueError(
+                "coincident must be either a boolean or 'files', "
+                "got unrecognized value {}".format(coincident)
+            )
+
+        self.fnames = fnames
+        self.channels = channels
+        self.num_channels = len(channels)
+        self.kernel_size = kernel_size
+        self.batch_size = batch_size
+        self.batches_per_epoch = batches_per_epoch
+        self.coincident = coincident
+
+        self.sizes = {}
+        for fname in self.fnames:
+            with h5py.File(fname, "r") as f:
+                dset = f[channels[0]]
+                if dset.chunks is None:
+                    warnings.warn(
+                        "File {} contains datasets that were generated "
+                        "without using chunked storage. This can have "
+                        "severe performance impacts at data loading time. "
+                        "If you need faster loading, try re-generating "
+                        "your datset with chunked storage turned on.".format(
+                            fname
+                        ),
+                        category=ContiguousHdf5Warning,
+                    )
+
+                self.sizes[fname] = len(dset)
+        total = sum(self.sizes.values())
+        self.probs = np.array([i / total for i in self.sizes.values()])
+
+    def __len__(self) -> int:
+        return self.batches_per_epoch
+
+    def sample_fnames(self, size) -> np.ndarray:
+        return np.random.choice(
+            self.fnames,
+            p=self.probs,
+            size=size,
+            replace=True,
+        )
+
+    def sample_batch(self) -> WaveformTensor:
+        """
+        Sample a single batch of multichannel timeseries
+        """
+
+        # allocate memory up front
+        x = np.zeros((self.batch_size, len(self.channels), self.kernel_size))
+
+        # sample filenames, but only loop through each unique
+        # filename once to avoid unnecessary I/O overhead
+        if self.coincident is not False:
+            size = (self.batch_size,)
+        else:
+            size = (self.batch_size, self.num_channels)
+        fnames = self.sample_fnames(size)
+
+        unique_fnames, inv, counts = np.unique(
+            fnames, return_inverse=True, return_counts=True
+        )
+        for i, (fname, count) in enumerate(zip(unique_fnames, counts)):
+            size = self.sizes[fname]
+            max_idx = size - self.kernel_size
+
+            # figure out which batch indices should be
+            # sampled from the current filename
+            indices = np.where(inv == i)[0]
+
+            # when sampling coincidentally either fully
+            # or at the file level, all channels will
+            # correspond to the same file
+            if self.coincident is not False:
+                batch_indices = np.repeat(indices, self.num_channels)
+                channel_indices = np.arange(self.num_channels)
+                channel_indices = np.concatenate([channel_indices] * count)
+            else:
+                batch_indices = indices // self.num_channels
+                channel_indices = indices % self.num_channels
+
+            # if we're sampling fully coincidentally, each
+            # channel will be the same in each file
+            if self.coincident is True:
+                idx = np.random.randint(max_idx, size=count)
+                idx = np.repeat(idx, self.num_channels)
+            else:
+                # otherwise, every channel will be different
+                # for the given file
+                idx = np.random.randint(max_idx, size=len(batch_indices))
+
+            # open the file and sample a different set of
+            # kernels for each batch element it occupies
+            with h5py.File(fname, "r") as f:
+                for b, c, i in zip(batch_indices, channel_indices, idx):
+                    x[b, c] = f[self.channels[c]][i : i + self.kernel_size]
+        return torch.Tensor(x)
+
+    def __iter__(self) -> torch.Tensor:
+        worker_info = torch.utils.data.get_worker_info()
+        if worker_info is None:
+            num_batches = self.batches_per_epoch
+        else:
+            num_batches, remainder = divmod(
+                self.batches_per_epoch, worker_info.num_workers
+            )
+            if worker_info.id < remainder:
+                num_batches += 1
+
+        for _ in range(num_batches):
+            yield self.sample_batch()
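To make the chunked-storage warning and the worker-splitting logic in `__iter__` concrete, here is a sketch that writes toy chunked HDF5 files and reads from them; the filenames, channel names, sample counts, and the `DataLoader` wrapping are illustrative assumptions:

import h5py
import numpy as np
import torch

from ml4gw.dataloading import Hdf5TimeSeriesDataset  # assumed import path

# write toy files with chunked storage, which the class strongly recommends
fnames = ["background-0.h5", "background-1.h5"]
for fname in fnames:
    with h5py.File(fname, "w") as f:
        for channel in ["H1", "L1"]:
            f.create_dataset(channel, data=np.random.randn(65536), chunks=True)

dataset = Hdf5TimeSeriesDataset(
    fnames=fnames,
    channels=["H1", "L1"],
    kernel_size=2048,
    batch_size=32,
    batches_per_epoch=10,
    coincident=True,
)

# batches_per_epoch is divided across workers inside __iter__, so a
# DataLoader with batch_size=None only parallelizes the file I/O
loader = torch.utils.data.DataLoader(dataset, batch_size=None, num_workers=2)
for X in loader:
    assert X.shape == (32, 2, 2048)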
ml4gw/nn/__init__.py
ADDED
|
File without changes
ml4gw/nn/autoencoder/base.py
ADDED
@@ -0,0 +1,89 @@
+from collections.abc import Sequence
+from typing import Optional
+
+import torch
+
+from ml4gw.nn.autoencoder.skip_connection import SkipConnection
+
+
+class Autoencoder(torch.nn.Module):
+    """
+    Base autoencoder class that defines some of the
+    basic methods and functionality. Autoencoders are
+    defined here as a set of sequential blocks that
+    have an `encode` method, which acts on the input
+    data to the autoencoder, and a `decode` method, which
+    acts on the encoded vector generated by the `encode`
+    method. `forward` just runs these steps one after the
+    other. Although it isn't explicitly enforced, a good
+    rule of thumb is that the ouput of a block's `decode`
+    method should have the same shape as the _input_ of its
+    `encode` method.
+
+    Accepts a `skip_connection` argument that defines how to
+    combine information from the input of one block's `encode`
+    layer with the output to its `decode` layer. See `skip_connections.py`
+    for more info about what these classes are expected to contain
+    and how they operate.
+    """
+
+    def __init__(self, skip_connection: Optional[SkipConnection] = None):
+        super().__init__()
+        self.skip_connection = skip_connection
+        self.blocks = torch.nn.ModuleList()
+
+    def encode(self, *X: torch.Tensor, return_states: bool = False):
+        states = []
+        for block in self.blocks:
+            if isinstance(X, tuple):
+                X = block.encode(*X)
+            else:
+                X = block.encode(X)
+            states.append(X)
+
+        # don't need to return the last
+        # state, since that's just equal
+        # to the output of this layer
+        if return_states:
+            return X, states[:-1]
+        return X
+
+    def decode(self, *X, states: Optional[Sequence[torch.Tensor]] = None):
+        if self.skip_connection is not None and states is None:
+            raise ValueError(
+                "Must pass intermediate states when autoencoder "
+                "has a skip connection function specified"
+            )
+        elif states is not None:
+            if len(states) != len(self.blocks) - 1:
+                raise ValueError(
+                    "Passed {} intermediate states, expected {}".format(
+                        len(states), len(self.blocks) - 1
+                    )
+                )
+
+            # Don't skip connect the output layer
+            states = states[::-1] + [None]
+
+        for i, block in enumerate(self.blocks[::-1]):
+            if isinstance(X, tuple):
+                X = block.decode(*X)
+            else:
+                X = block.decode(X)
+
+            state = states[-i - 1]
+            if state is not None:
+                X = self.skip_connection(X, state)
+        return X
+
+    def forward(self, *X):
+        return_states = self.skip_connection is not None
+        X = self.encode(*X, return_states=return_states)
+        if return_states:
+            *X, states = X
+        else:
+            states = None
+
+        if isinstance(X, torch.Tensor):
+            X = (X,)
+        return self.decode(*X, states=states)
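To illustrate the encode/decode block contract the docstring describes, here is a toy subclass; the block definition, channel sizes, and registering blocks via `self.blocks.append` are assumptions for illustration (only the encode path is exercised here, since no skip connection is configured):

import torch

from ml4gw.nn.autoencoder.base import Autoencoder  # assumed import path


class ToyBlock(torch.nn.Module):
    """Toy block whose decode output matches its encode input shape."""

    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        self.down = torch.nn.Conv1d(in_channels, out_channels, 4, stride=2, padding=1)
        self.up = torch.nn.ConvTranspose1d(out_channels, in_channels, 4, stride=2, padding=1)

    def encode(self, X):
        return torch.relu(self.down(X))

    def decode(self, X):
        return self.up(X)


class ToyAutoencoder(Autoencoder):
    def __init__(self):
        super().__init__(skip_connection=None)
        self.blocks.append(ToyBlock(2, 8))
        self.blocks.append(ToyBlock(8, 16))


model = ToyAutoencoder()
X = torch.randn(32, 2, 2048)

# encode runs each block's `encode` in turn; return_states=True also
# returns the intermediate outputs that a skip connection would consume
encoded, states = model.encode(X, return_states=True)
assert encoded.shape == (32, 16, 512)
assert [s.shape for s in states] == [torch.Size([32, 8, 1024])]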