madspace 0.3.1__cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- madspace/__init__.py +1 -0
- madspace/_madspace_py.cpython-311-x86_64-linux-gnu.so +0 -0
- madspace/_madspace_py.pyi +2189 -0
- madspace/_madspace_py_loader.py +111 -0
- madspace/include/madspace/constants.h +17 -0
- madspace/include/madspace/madcode/function.h +102 -0
- madspace/include/madspace/madcode/function_builder_mixin.h +591 -0
- madspace/include/madspace/madcode/instruction.h +208 -0
- madspace/include/madspace/madcode/opcode_mixin.h +134 -0
- madspace/include/madspace/madcode/optimizer.h +31 -0
- madspace/include/madspace/madcode/type.h +203 -0
- madspace/include/madspace/madcode.h +6 -0
- madspace/include/madspace/phasespace/base.h +74 -0
- madspace/include/madspace/phasespace/channel_weight_network.h +46 -0
- madspace/include/madspace/phasespace/channel_weights.h +51 -0
- madspace/include/madspace/phasespace/chili.h +32 -0
- madspace/include/madspace/phasespace/cross_section.h +47 -0
- madspace/include/madspace/phasespace/cuts.h +34 -0
- madspace/include/madspace/phasespace/discrete_flow.h +44 -0
- madspace/include/madspace/phasespace/discrete_sampler.h +53 -0
- madspace/include/madspace/phasespace/flow.h +53 -0
- madspace/include/madspace/phasespace/histograms.h +26 -0
- madspace/include/madspace/phasespace/integrand.h +204 -0
- madspace/include/madspace/phasespace/invariants.h +26 -0
- madspace/include/madspace/phasespace/luminosity.h +41 -0
- madspace/include/madspace/phasespace/matrix_element.h +70 -0
- madspace/include/madspace/phasespace/mlp.h +37 -0
- madspace/include/madspace/phasespace/multichannel.h +49 -0
- madspace/include/madspace/phasespace/observable.h +85 -0
- madspace/include/madspace/phasespace/pdf.h +78 -0
- madspace/include/madspace/phasespace/phasespace.h +67 -0
- madspace/include/madspace/phasespace/rambo.h +26 -0
- madspace/include/madspace/phasespace/scale.h +52 -0
- madspace/include/madspace/phasespace/t_propagator_mapping.h +34 -0
- madspace/include/madspace/phasespace/three_particle.h +68 -0
- madspace/include/madspace/phasespace/topology.h +116 -0
- madspace/include/madspace/phasespace/two_particle.h +63 -0
- madspace/include/madspace/phasespace/vegas.h +53 -0
- madspace/include/madspace/phasespace.h +27 -0
- madspace/include/madspace/runtime/context.h +147 -0
- madspace/include/madspace/runtime/discrete_optimizer.h +24 -0
- madspace/include/madspace/runtime/event_generator.h +257 -0
- madspace/include/madspace/runtime/format.h +68 -0
- madspace/include/madspace/runtime/io.h +343 -0
- madspace/include/madspace/runtime/lhe_output.h +132 -0
- madspace/include/madspace/runtime/logger.h +46 -0
- madspace/include/madspace/runtime/runtime_base.h +39 -0
- madspace/include/madspace/runtime/tensor.h +603 -0
- madspace/include/madspace/runtime/thread_pool.h +101 -0
- madspace/include/madspace/runtime/vegas_optimizer.h +26 -0
- madspace/include/madspace/runtime.h +12 -0
- madspace/include/madspace/umami.h +202 -0
- madspace/include/madspace/util.h +142 -0
- madspace/lib/libmadspace.so +0 -0
- madspace/lib/libmadspace_cpu.so +0 -0
- madspace/lib/libmadspace_cpu_avx2.so +0 -0
- madspace/lib/libmadspace_cpu_avx512.so +0 -0
- madspace/lib/libmadspace_cuda.so +0 -0
- madspace/lib/libmadspace_hip.so +0 -0
- madspace/madnis/__init__.py +44 -0
- madspace/madnis/buffer.py +167 -0
- madspace/madnis/channel_grouping.py +85 -0
- madspace/madnis/distribution.py +103 -0
- madspace/madnis/integrand.py +175 -0
- madspace/madnis/integrator.py +973 -0
- madspace/madnis/interface.py +191 -0
- madspace/madnis/losses.py +186 -0
- madspace/torch.py +82 -0
- madspace-0.3.1.dist-info/METADATA +71 -0
- madspace-0.3.1.dist-info/RECORD +75 -0
- madspace-0.3.1.dist-info/WHEEL +6 -0
- madspace-0.3.1.dist-info/licenses/LICENSE +21 -0
- madspace.libs/libgfortran-83c28eba.so.5.0.0 +0 -0
- madspace.libs/libopenblas-r0-11edc3fa.3.15.so +0 -0
- madspace.libs/libquadmath-2284e583.so.0.0.0 +0 -0
madspace/madnis/integrator.py

@@ -0,0 +1,973 @@
import itertools
import signal
import warnings
from collections.abc import Callable, Iterable
from dataclasses import astuple, dataclass
from typing import Any, Literal

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LRScheduler

from .buffer import Buffer
from .distribution import Distribution
from .integrand import Integrand
from .losses import MultiChannelLoss, kl_divergence, stratified_variance, variance


@dataclass
class TrainingStatus:
    """
    Contains the MadNIS training status to pass it to a callback function.

    Args:
        step: optimization step
        loss: loss from the optimization step
        buffered: whether the optimization was performed on buffered samples
        learning_rate: current learning rate if a learning rate scheduler is present
        dropped_channels: number of channels dropped after this optimization step
    """

    step: int
    loss: float
    buffered: bool
    learning_rate: float | None
    dropped_channels: int


@dataclass
class SampleBatch:
    """
    Contains a batch of samples

    Args:
        x: samples generated by the flow, shape (n, dim)
        y: remapped samples returned by the integrand, shape (n, remapped_dim)
        q_sample: probabilities of the samples, shape (n, )
        func_vals: integrand value, shape (n, )
        channels: channel indices for multi-channel integration, shape (n, ), otherwise None
        alphas_prior: prior channel weights, shape (n, channels), or None for single-channel
            integration
        alpha_channel_indices: channel indices if not all prior channel weights are stored,
            otherwise None
        integration_channels: index of the channel group in case the integration is performed
            at the level of channel groups, shape (n, ), otherwise None
        weights: integration weight, shape (n, ). Only set when returned from the
            Integrator.sample function, otherwise None.
        alphas: channel weights including the learned correction, shape (n, channels). Only
            set when returned from the Integrator.sample function, otherwise None.
        zero_counts: channel-wise counts of samples with zero weight that are not included in
            the batch, shape (channels, ). This field is ignored by most methods, as it does
            not have the batch size as its first dimension
    """

    x: torch.Tensor
    y: torch.Tensor | None
    q_sample: torch.Tensor
    func_vals: torch.Tensor
    channels: torch.Tensor | None
    alphas_prior: torch.Tensor | None = None
    alpha_channel_indices: torch.Tensor | None = None
    integration_channels: torch.Tensor | None = None
    weights: torch.Tensor | None = None
    alphas: torch.Tensor | None = None
    zero_counts: torch.Tensor | None = None

    def __iter__(self) -> Iterable[torch.Tensor | None]:
        """
        Returns an iterator over the fields of the class
        """
        return iter(astuple(self)[:-1])

    def map(self, func: Callable[[torch.Tensor], torch.Tensor]) -> "SampleBatch":
        """
        Applies a function to all fields in the batch that are not None and returns a new
        SampleBatch

        Args:
            func: function that is applied to all fields in the batch. Expects a tensor as
                argument and returns a new tensor
        Returns:
            Transformed SampleBatch
        """
        return SampleBatch(*(None if field is None else func(field) for field in self))

    def split(self, batch_size: int) -> Iterable["SampleBatch"]:
        """
        Splits up the fields into batches and yields SampleBatch objects for every batch.

        Args:
            batch_size: maximal size of the batches
        Returns:
            Iterator over the batches
        """
        for batch in zip(
            *(
                itertools.repeat(None) if field is None else field.split(batch_size)
                for field in self
            )
        ):
            yield SampleBatch(*batch)

    @staticmethod
    def cat(batches: Iterable["SampleBatch"]) -> "SampleBatch":
        """
        Concatenates multiple batches. If the field zero_counts is not None, the zero_counts
        of all batches are added.

        Args:
            batches: Iterable over SampleBatch objects
        Returns:
            New SampleBatch object containing the concatenated batches
        """
        cat_batch = SampleBatch(
            *(
                None if item[0] is None else torch.cat(item, dim=0)
                for item in zip(*batches)
            )
        )
        if batches[0].zero_counts is not None:
            cat_batch.zero_counts = torch.stack(
                [batch.zero_counts for batch in batches], dim=1
            ).sum(dim=1)
        return cat_batch

class Integrator(nn.Module):
    """
    Implements the MadNIS training and integration logic. MadNIS integrators are torch
    modules, so their state can easily be saved and loaded using the torch.save and
    torch.load methods.
    """

    def __init__(
        self,
        integrand: Callable[[torch.Tensor], torch.Tensor] | Integrand,
        dims: int = 0,
        flow: Distribution | None = None,
        flow_kwargs: dict[str, Any] = {},
        discrete_flow_kwargs: dict[str, Any] = {},
        discrete_model: Literal["made", "transformer"] = "made",
        train_channel_weights: bool = True,
        cwnet: nn.Module | None = None,
        cwnet_kwargs: dict[str, Any] = {},
        loss: MultiChannelLoss | None = None,
        optimizer: (
            Optimizer | Callable[[Iterable[nn.Parameter]], Optimizer] | None
        ) = None,
        batch_size: int = 1024,
        batch_size_per_channel: int = 0,
        learning_rate: float = 1e-3,
        scheduler: LRScheduler | Callable[[Optimizer], LRScheduler] | None = None,
        uniform_channel_ratio: float = 1.0,
        integration_history_length: int = 20,
        drop_zero_integrands: bool = False,
        batch_size_threshold: float = 0.5,
        buffer_capacity: int = 0,
        minimum_buffer_size: int = 50,
        buffered_steps: int = 0,
        max_stored_channel_weights: int | None = None,
        channel_dropping_threshold: float = 0.0,
        channel_dropping_interval: int = 100,
        channel_grouping_mode: Literal["none", "uniform", "learned"] = "none",
        freeze_cwnet_iteration: int | None = None,
        device: torch.device | None = None,
        dtype: torch.dtype | None = None,
    ):
        """
        Args:
            integrand: the function to be integrated. In the case of a simple single-channel
                integration, the integrand function can directly be passed to the integrator.
                In more complicated cases, like multi-channel integrals, use the
                ``Integrand`` class.
            dims: dimension of the integration space. Only required if a simple function is
                given as integrand.
            flow: sampling distribution used for the integration. If None, a flow is
                constructed using the ``Flow`` class. Otherwise, it has to be compatible with
                a normalizing flow, i.e. have the interface defined in the ``Distribution``
                class.
            flow_kwargs: If flow is None, these keyword arguments are passed to the ``Flow``
                constructor.
            discrete_flow_kwargs: If flow is None, these keyword arguments are passed to the
                ``MixedFlow`` or ``DiscreteMADE`` constructor.
            train_channel_weights: If True, construct a channel weight network and train it.
                Only necessary if cwnet is None.
            cwnet: network used for the trainable channel weights. If None and
                train_channel_weights is True, the cwnet is built using the ``MLP`` class.
            cwnet_kwargs: If cwnet is None and train_channel_weights is True, these keyword
                arguments are passed to the ``MLP`` constructor.
            loss: Loss function used for training. If not provided, the KL divergence is
                chosen in the single-channel case and the stratified variance is chosen in
                the multi-channel case.
            optimizer: optimizer for the training. Can be an optimizer object or a function
                that is called with the model parameters as argument and returns the
                optimizer. If None, the Adam optimizer is used.
            batch_size: Training batch size
            batch_size_per_channel: used to compute the batch size as a function of the
                number of active channels,
                ``batch_size + n_active_channels * batch_size_per_channel``
            learning_rate: learning rate used for the Adam optimizer
            scheduler: learning rate scheduler for the training. Can be a learning rate
                scheduler object or a function that gets the optimizer as argument and
                returns the scheduler. If None, a constant learning rate is used.
            uniform_channel_ratio: fraction of the samples in each batch that will be
                distributed equally between all channels; the value has to be between 0
                and 1.
            integration_history_length: number of batches for which the channel-wise means
                and variances are stored. This is used for stratified sampling during
                integration, and during the training if uniform_channel_ratio is different
                from one.
            drop_zero_integrands: If True, points where the integrand is zero are dropped
                and not used for the optimization.
            batch_size_threshold: New samples are drawn until the number of samples is at
                least batch_size_threshold * batch_size.
            buffer_capacity: number of samples that are stored for buffered training
            minimum_buffer_size: minimal size of the buffer to run buffered training
            buffered_steps: number of optimization steps on buffered samples after every
                online training step
            max_stored_channel_weights: number of prior channel weights that are buffered
                for each sample. If None, all prior channel weights are saved, otherwise
                only those for the channels with the largest contributions.
            channel_dropping_threshold: all channels whose cumulated contribution to the
                integrand is smaller than this threshold are dropped
            channel_dropping_interval: number of training steps after which channel dropping
                is performed
            channel_grouping_mode: If "none", all channels are treated as separate channels
                in the loss and integration, even when they are grouped together. If
                "uniform", the channels within each group are sampled with equal
                probability. If "learned", a discrete normalizing flow is used to sample the
                channel index within a group.
            freeze_cwnet_iteration: If not None, specifies the training iteration after
                which the channel weight network is frozen
            device: torch device used for training and integration. If None, use the default
                device.
            dtype: torch dtype used for training and integration. If None, use the default
                dtype.
        """
        super().__init__()

        if not isinstance(integrand, Integrand):
            integrand = Integrand(integrand, dims)
        self.integrand = integrand
        self.multichannel = integrand.channel_count is not None
        discrete_dims = integrand.discrete_dims
        input_dim = integrand.input_dim
        if integrand.channel_grouping is None or channel_grouping_mode == "none":
            self.integration_channel_count = integrand.channel_count
            self.group_channels = False
            self.group_channels_uniform = False
        elif channel_grouping_mode == "uniform":
            self.integration_channel_count = integrand.unique_channel_count()
            self.group_channels = True
            self.group_channels_uniform = True
        elif channel_grouping_mode == "learned":
            self.integration_channel_count = integrand.unique_channel_count()
            self.group_channels = True
            self.group_channels_uniform = False
            self.channel_group_dim = (
                0
                if integrand.discrete_dims_position == "first"
                else input_dim - len(discrete_dims)
            )
            # TODO: provide default implementation of discrete prior
            # discrete_dims.insert(0, max(len(group.channel_indices) for group in integrand.channel_grouping.groups))
            # input_dim += 1
        else:
            raise ValueError(f"Unknown channel grouping mode {channel_grouping_mode}")

        if self.group_channels:
            self.register_buffer(
                "channel_group_sizes",
                torch.tensor(
                    [
                        len(group.channel_indices)
                        for group in integrand.channel_grouping.groups
                    ]
                ),
            )
            self.register_buffer(
                "channel_group_remap",
                torch.zeros(
                    (len(self.channel_group_sizes), max(self.channel_group_sizes)),
                    dtype=torch.int64,
                ),
            )
            for group in integrand.channel_grouping.groups:
                for i, chan_index in enumerate(group.channel_indices):
                    self.channel_group_remap[group.group_index][i] = chan_index

        if flow is None:
            channel_remap_function = (
                None
                if self.group_channels and not self.group_channels_uniform
                else self.integrand.remap_channels
            )
            if len(discrete_dims) == 0:
                raise NotImplementedError("removed in MadSpace version of MadNIS")
            elif len(discrete_dims) == input_dim:
                if discrete_model == "made":
                    raise NotImplementedError("removed in MadSpace version of MadNIS")
                elif discrete_model == "transformer":
                    raise NotImplementedError("removed in MadSpace version of MadNIS")
                else:
                    raise ValueError("discrete_model must be 'made' or 'transformer'")
            else:
                discrete_kwargs = dict(
                    prior_prob_function=integrand.discrete_prior_prob_function,
                    **discrete_flow_kwargs,
                )
                if self.multichannel:
                    discrete_kwargs["channel_remap_function"] = channel_remap_function
                raise NotImplementedError("removed in MadSpace version of MadNIS")

        if cwnet is None and train_channel_weights and self.multichannel:
            raise NotImplementedError("removed in MadSpace version of MadNIS")
        if cwnet is None:
            parameters = flow.parameters()
        else:
            parameters = itertools.chain(flow.parameters(), cwnet.parameters())
        if optimizer is None:
            self.optimizer = torch.optim.Adam(parameters, learning_rate)
        elif isinstance(optimizer, Optimizer):
            self.optimizer = optimizer
        else:
            self.optimizer = optimizer(parameters)
        if scheduler is None or isinstance(scheduler, LRScheduler):
            self.scheduler = scheduler
        else:
            self.scheduler = scheduler(self.optimizer)

        self.flow = flow
        self.cwnet = cwnet
        self.batch_size_offset = batch_size
        self.batch_size_per_channel = batch_size_per_channel
        self.batch_size = batch_size + batch_size_per_channel * (
            self.integration_channel_count or 1
        )
        self.uniform_channel_ratio = uniform_channel_ratio
        self.drop_zero_integrands = drop_zero_integrands
        self.batch_size_threshold = batch_size_threshold
        if loss is None:
            self.loss = stratified_variance if self.multichannel else kl_divergence
        else:
            self.loss = loss

        self.minimum_buffer_size = minimum_buffer_size
        self.buffered_steps = buffered_steps
        self.max_stored_channel_weights = (
            None
            if max_stored_channel_weights is None
            or integrand.channel_count is None
            or max_stored_channel_weights >= integrand.channel_count
            else max_stored_channel_weights
        )
        if buffer_capacity > 0:
            channel_count = self.max_stored_channel_weights or integrand.channel_count
            buffer_fields = [
                (input_dim,),
                None if integrand.remapped_dim is None else (integrand.remapped_dim,),
                (),
                (),
                None if integrand.channel_count is None else (),
                None if not integrand.has_channel_weight_prior else (channel_count,),
                None if self.max_stored_channel_weights is None else (channel_count,),
                () if self.group_channels else None,
                None,
                None,
            ]
            buffer_dtypes = [
                None,
                None,
                None,
                None,
                torch.int64,
                None,
                torch.int64,
                torch.int64,
                None,
                None,
            ]
            self.buffer = Buffer(
                buffer_capacity, buffer_fields, persistent=False, dtypes=buffer_dtypes
            )
        else:
            self.buffer = None
        self.channel_dropping_threshold = channel_dropping_threshold
        self.channel_dropping_interval = channel_dropping_interval
        self.freeze_cwnet_iteration = freeze_cwnet_iteration
        hist_shape = (self.integration_channel_count or 1,)
        self.integration_history = Buffer(
            integration_history_length,
            [hist_shape, hist_shape, hist_shape],
            dtypes=[None, None, torch.int64],
        )
        self.step = 0
        self.step_type_count = 0
        if self.multichannel:
            self.register_buffer(
                "active_channels_mask",
                torch.ones((self.integration_channel_count,), dtype=torch.bool),
            )
        # Dummy to determine device and dtype
        self.register_buffer("dummy", torch.zeros((1,)))

        if device is not None:
            self.to(device)
        if dtype is not None:
            self.to(dtype)
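
    # Editor's illustrative sketch (not part of integrator.py): a minimal
    # single-channel construction, assuming `my_flow` implements the
    # ``Distribution`` interface. Note that in this MadSpace build every
    # flow=None branch above raises NotImplementedError, so a flow object must
    # always be supplied:
    #
    #     integrator = Integrator(
    #         lambda x: torch.exp(-((x - 0.5) ** 2).sum(dim=1)),
    #         dims=2,
    #         flow=my_flow,
    #         batch_size=4096,
    #     )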

    def _get_alphas(self, samples: SampleBatch) -> torch.Tensor:
        """
        Runs the channel weight network and returns the normalized channel weights, taking
        prior channel weights and dropped channels into account.

        Args:
            samples: batch of samples
        Returns:
            channel weights, shape (n, channels)
        """
        if self.cwnet is None:
            if samples.alphas_prior is None:
                return samples.x.new_full(
                    (samples.x.shape[0], self.integrand.channel_count),
                    1 / self.integrand.channel_count,
                )
            return self._restore_prior(samples)

        if samples.alphas_prior is None:
            alpha_prior = samples.x.new_ones(
                (samples.x.shape[0], self.integrand.channel_count)
            )
        else:
            alpha_prior = self._restore_prior(samples)

        if self.group_channels:
            active_channels_mask = self.active_channels_mask[
                self.integrand.remap_channels(
                    torch.arange(alpha_prior.shape[1], device=alpha_prior.device)
                )
            ]
        else:
            active_channels_mask = self.active_channels_mask

        alpha = alpha_prior * active_channels_mask
        mask = samples.func_vals != 0
        y = samples.x if samples.y is None else samples.y
        alpha[mask] *= self.cwnet(y[mask]).exp()
        ret = alpha / alpha.sum(dim=1, keepdim=True)
        return ret
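
    # Editor's note (sketch): the learned weights act multiplicatively on the
    # prior, alpha_c(y) ∝ alpha_prior_c * exp(cwnet(y)_c), evaluated only where
    # the integrand is nonzero; masked (dropped) channels stay at zero and the
    # final division renormalizes each row to sum to one.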

    def _compute_integral(
        self, samples: SampleBatch
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Computes the normalized integrand and the channel-wise means, variances and counts

        Args:
            samples: batch of samples
        Returns:
            A tuple containing

            - normalized integrand, shape (n, )
            - channel-wise means of the integral, shape (channels, )
            - channel-wise variances of the integral, shape (channels, )
            - channel-wise number of samples, shape (channels, )
        """
        if self.multichannel:
            alphas = torch.gather(
                self._get_alphas(samples), index=samples.channels[:, None], dim=1
            )[:, 0]
            f_true = alphas * samples.func_vals
            f_div_q = f_true.detach() / samples.q_sample
            channels = (
                samples.channels
                if samples.integration_channels is None
                else samples.integration_channels
            )
            counts = torch.bincount(channels, minlength=self.integration_channel_count)
            if samples.zero_counts is not None:
                counts += samples.zero_counts
            means = torch.bincount(
                channels,
                weights=f_div_q,
                minlength=self.integration_channel_count,
            ) / counts.clip(min=1)
            variances = (
                torch.bincount(
                    channels,
                    weights=(f_div_q - means[channels]).square(),
                    minlength=self.integration_channel_count,
                )
                / counts
            )
        else:
            f_div_q = samples.func_vals / samples.q_sample
            f_true = samples.func_vals
            means = f_div_q.mean(dim=0, keepdim=True)
            counts = torch.full((1,), f_div_q.shape[0], device=means.device)
            variances = f_div_q.var(dim=0, keepdim=True)
        return f_true, means, variances, counts
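
    # Editor's note (sketch): per channel c this implements the standard
    # importance-sampling estimate
    #     I_c ≈ (1 / N_c) Σ_{x_i in c} alpha_c(x_i) f(x_i) / q(x_i),
    # accumulated with torch.bincount; samples dropped earlier for having zero
    # weight re-enter the estimate only through `counts` via zero_counts.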

    def _optimization_step(
        self,
        samples: SampleBatch,
    ) -> tuple[float, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Perform one optimization step of the networks for the given samples

        Args:
            samples: batch of samples
        Returns:
            A tuple containing

            - value of the loss
            - channel-wise means of the integral, shape (channels, )
            - channel-wise variances of the integral, shape (channels, )
            - channel-wise number of samples, shape (channels, )
        """
        self.optimizer.zero_grad()
        # TODO: depending on the loss function and for drop_zero_weights=False, we can
        # encounter zero-weight events here and it might be sufficient to evaluate the flow
        # for events with func_val != 0. That might however give wrong results for other
        # loss functions
        q_test = self.flow.prob(
            samples.x,
            channel=(
                samples.integration_channels
                if self.group_channels and not self.group_channels_uniform
                else samples.channels
            ),
        )
        f_true, means, variances, counts = self._compute_integral(samples)
        loss = self.loss(
            f_true,
            q_test,
            q_sample=samples.q_sample,
            channels=(
                samples.channels
                if samples.integration_channels is None
                else samples.integration_channels
            ),
        )
        if loss.isnan().item():
            warnings.warn("nan batch: skipping optimization")
        else:
            loss.backward()
            self.optimizer.step()
        return loss.item(), means, variances, counts

    def _restore_prior(self, samples: SampleBatch) -> torch.Tensor:
        """
        Restores the full prior channel weights if only the largest channel weights and
        their indices were saved.

        Args:
            samples: batch of samples
        Returns:
            Tensor of prior channel weights with shape (n, channels)
        """
        if samples.alpha_channel_indices is None:
            return samples.alphas_prior

        alphas_prior_reduced = samples.alphas_prior
        epsilon = torch.finfo(alphas_prior_reduced.dtype).eps

        # strategy 1: distribute difference to 1 evenly among non-stored channels
        # n_rest = self.integrand.channel_count - self.max_stored_channel_weights
        # alphas_prior = torch.clamp(
        #     (1 - alphas_prior_reduced.sum(dim=1, keepdims=True)) / n_rest,
        #     min=epsilon,
        # ).repeat(1, self.integrand.channel_count)
        # alphas_prior.scatter_(1, samples.alpha_channel_indices, alphas_prior_reduced)
        # return alphas_prior

        # strategy 2: set non-stored channel alphas to epsilon, normalize again
        alphas_prior = alphas_prior_reduced.new_full(
            (alphas_prior_reduced.shape[0], self.integrand.channel_count), epsilon
        )
        alphas_prior.scatter_(1, samples.alpha_channel_indices, alphas_prior_reduced)
        return alphas_prior / alphas_prior.sum(dim=1, keepdims=True)
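
    # Editor's worked example (sketch): with 4 channels and stored weights
    # (0.7, 0.2) at indices (3, 1), strategy 2 first builds
    # (eps, 0.2, eps, 0.7) and then renormalizes the row, so the non-stored
    # channels keep a tiny but nonzero probability.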

    def _get_channel_contributions(
        self,
        expect_full_history: bool,
        channel_weight_mode: Literal["variance", "mean"],
    ) -> torch.Tensor:
        """
        Uses the list of saved variances or means to compute the contribution of each
        channel for stratified sampling.

        Args:
            expect_full_history: If True, the integration history has to be full, otherwise
                uniform weights are returned.
            channel_weight_mode: specifies whether the channels are weighted by their mean
                or variance. Note that weighting by mean can lead to problems for
                non-positive functions
        Returns:
            weights for sampling the channels with shape (channels,)
        """
        min_len = self.integration_history.capacity if expect_full_history else 1
        if self.integration_history.size < min_len:
            return torch.ones(
                self.integration_channel_count,
                device=self.dummy.device,
                dtype=self.dummy.dtype,
            )
        mean_hist, var_hist, count_hist = self.integration_history
        contrib_hist = mean_hist.abs() if channel_weight_mode == "mean" else var_hist
        count_hist = torch.where(
            contrib_hist.isnan(), np.nan, count_hist.to(contrib_hist.dtype)
        )
        hist_weights = count_hist / count_hist.nansum(dim=0)
        return torch.nansum(hist_weights * contrib_hist, dim=0).sqrt()
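
    # Editor's note (sketch): in "variance" mode the returned weights are the
    # square roots of the count-weighted historical variances, i.e. channel
    # standard deviations, which is the usual allocation rule for stratified
    # sampling (more samples where the integrand fluctuates more).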

    def _disable_unused_channels(self) -> int:
        """
        Determines channels with a total relative contribution below
        ``channel_dropping_threshold``, disables them and removes them from the buffer.

        Returns:
            Number of channels that were disabled
        """
        if (
            not self.multichannel
            or self.channel_dropping_threshold == 0.0
            or (self.step + 1) % self.channel_dropping_interval != 0
        ):
            return 0

        mean_hist, _, count_hist = self.integration_history
        count_hist = count_hist.to(mean_hist.dtype)
        mean_hist = torch.nan_to_num(mean_hist)
        hist_weights = count_hist / count_hist.sum(dim=0)
        channel_integrals = torch.nansum(hist_weights * mean_hist, dim=0)
        channel_rel_integrals = channel_integrals / channel_integrals.sum()
        cri_sort, cri_argsort = torch.sort(channel_rel_integrals)
        n_irrelevant = torch.count_nonzero(
            cri_sort.cumsum(dim=0) < self.channel_dropping_threshold
        )
        n_disabled = torch.count_nonzero(
            self.active_channels_mask[cri_argsort[:n_irrelevant]]
        )
        self.active_channels_mask[cri_argsort[:n_irrelevant]] = False
        self.integrand.update_active_channels_mask(self.active_channels_mask)
        self.batch_size = (
            self.batch_size_offset
            + torch.count_nonzero(self.active_channels_mask).item()
            * self.batch_size_per_channel
        )
        if self.buffer is not None:

            def filter_func(batch):
                samples = SampleBatch(*batch)
                channels = (
                    samples.channels
                    if samples.integration_channels is None
                    else samples.integration_channels
                )
                return self.active_channels_mask[channels]

            self.buffer.filter(filter_func)
        return n_disabled

    def _store_samples(self, samples: SampleBatch):
        """
        Stores the generated samples and probabilities for reuse during buffered training.
        If ``max_stored_channel_weights`` is set, the largest channel weights are determined
        and only those and their indices are stored.

        Args:
            samples: Object containing a batch of samples
        """
        if self.buffer is None:
            return

        if (
            self.max_stored_channel_weights is not None
            and self.integrand.has_channel_weight_prior
        ):
            # ensure that the alpha for the channel that the sample was generated with
            # is always stored
            alphas_prior_mod = torch.scatter(
                samples.alphas_prior,
                dim=1,
                index=samples.channels[:, None],
                src=torch.tensor(
                    [[2.0]], device=self.dummy.device, dtype=self.dummy.dtype
                ).expand(*samples.alphas_prior.shape),
            )
            largest_alphas, alpha_indices = torch.sort(
                alphas_prior_mod, descending=True, dim=1
            )
            largest_alphas[:, 0] = torch.gather(
                samples.alphas_prior, dim=1, index=samples.channels[:, None]
            )[:, 0]
            samples.alphas_prior = largest_alphas[
                :, : self.max_stored_channel_weights
            ].clone()
            samples.alpha_channel_indices = alpha_indices[
                :, : self.max_stored_channel_weights
            ].clone()

        self.buffer.store(*samples)

    def _get_channels(
        self,
        n: int,
        channel_weights: torch.Tensor,
        uniform_channel_ratio: float,
        return_counts: bool = False,
    ) -> torch.Tensor:
        """
        Creates a tensor of channel indices or numbers of samples per channel in two steps:
        1. Split up n * uniform_channel_ratio equally among all the channels
        2. Sample the rest of the events from the distribution given by channel_weights
           after correcting for the uniformly distributed samples
        This allows stratified sampling by variance weighting while ensuring stable training
        because there are events in every channel.

        Args:
            n: Number of samples as scalar integer tensor
            channel_weights: Weights of the channels (not normalized) with shape (channels,)
            uniform_channel_ratio: Number between 0.0 and 1.0 to determine the ratio of
                samples that will be distributed uniformly first
            return_counts: If True, return the number of samples per channel, otherwise the
                channel indices
        Returns:
            If return_counts is True, tensor with the number of samples per channel, shape
            (channels,). Otherwise, tensor of channel indices with shape (n,)
        """
        assert channel_weights.shape == (self.integration_channel_count,)
        n_active_channels = torch.count_nonzero(self.active_channels_mask).item()
        uniform_per_channel = int(
            np.ceil(n * uniform_channel_ratio / n_active_channels)
        )
        n_per_channel = torch.full(
            (self.integration_channel_count,),
            uniform_per_channel,
            device=self.dummy.device,
        )
        n_per_channel[~self.active_channels_mask] = 0

        n_weighted = max(n - n_per_channel.sum(), 0)
        if n_weighted > 0:
            normed_weights = (
                channel_weights / channel_weights[self.active_channels_mask].sum()
            )
            normed_weights[~self.active_channels_mask] = 0.0
            probs = torch.clamp(
                normed_weights - uniform_channel_ratio / n_active_channels, min=0
            )
            n_per_channel += torch.ceil(probs * n_weighted / probs.sum()).int()

        remove_chan = 0
        while n_per_channel.sum() > n:
            if n_per_channel[remove_chan] > 0:
                n_per_channel[remove_chan] -= 1
            remove_chan = (remove_chan + 1) % self.integration_channel_count
        assert n_per_channel.sum() == n

        if return_counts:
            return n_per_channel

        return torch.cat(
            [
                torch.full((npc,), i, device=self.dummy.device)
                for i, npc in enumerate(n_per_channel)
            ]
        )
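
    # Editor's worked example (sketch): n = 100, 4 active channels,
    # uniform_channel_ratio = 0.25 gives ceil(100 * 0.25 / 4) = 7 samples per
    # channel up front (28 total); the remaining 72 are split according to the
    # clipped, renormalized channel weights, and the while-loop then trims any
    # rounding excess so that exactly n samples are assigned.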

    def _get_samples(
        self,
        n: int,
        uniform_channel_ratio: float = 0.0,
        train: bool = False,
        channel_weight_mode: Literal["variance", "mean"] = "variance",
        channel: int | None = None,
    ) -> SampleBatch:
        """
        Draws samples from the flow and evaluates the integrand

        Args:
            n: number of samples
            uniform_channel_ratio: Number between 0.0 and 1.0 to determine the ratio of
                samples that will be distributed uniformly first
            train: If True, the function is used in training mode, i.e. samples where the
                integrand is zero will be removed if drop_zero_integrands is True
            channel_weight_mode: specifies whether the channels are weighted by their mean
                or variance. Note that weighting by mean can lead to problems for
                non-positive functions
            channel: if different from None, samples are only generated for this channel
        Returns:
            Object containing a batch of samples
        """
        if channel is None:
            batch_channels = (
                self._get_channels(
                    n,
                    self._get_channel_contributions(train, channel_weight_mode),
                    uniform_channel_ratio,
                )
                if self.multichannel
                else None
            )
        else:
            batch_channels = torch.full((n,), channel, device=self.dummy.device)

        batches_out = []
        current_batch_size = 0
        while True:
            integration_channels = None
            weight_factor = None
            if self.integrand.function_includes_sampling:
                integration_channels = batch_channels
                x, prob, weight, y, alphas_prior, channels = self.integrand.function(
                    batch_channels
                )
            else:
                if self.group_channels and self.group_channels_uniform:
                    group_sizes = self.channel_group_sizes[batch_channels]
                    chan_in_group = (
                        torch.rand(
                            (n,), device=self.dummy.device, dtype=self.dummy.dtype
                        )
                        * group_sizes
                    ).long()
                    weight_factor = group_sizes
                    integration_channels = batch_channels
                    channels = self.channel_group_remap[batch_channels, chan_in_group]
                else:
                    channels = batch_channels

                with torch.no_grad():
                    x, prob = self.flow.sample(
                        n,
                        channel=channels,
                        return_prob=True,
                        device=self.dummy.device,
                        dtype=self.dummy.dtype,
                    )
                weight, y, alphas_prior = self.integrand(x, channels)

                if self.group_channels and not self.group_channels_uniform:
                    chan_in_group = x[:, self.channel_group_dim].long()
                    integration_channels = batch_channels
                    channels = self.channel_group_remap[
                        integration_channels, chan_in_group
                    ]

            if weight_factor is not None:
                weight *= weight_factor
            batch = SampleBatch(
                x,
                y,
                prob,
                weight,
                channels,
                alphas_prior,
                integration_channels=integration_channels,
            )

            if not train:
                current_batch_size += batch.x.shape[0]
            elif self.drop_zero_integrands:
                mask = weight != 0.0
                batch = batch.map(lambda t: t[mask])
                if self.multichannel:
                    batch.zero_counts = torch.bincount(
                        (
                            channels[~mask]
                            if integration_channels is None
                            else integration_channels[~mask]
                        ),
                        minlength=self.integration_channel_count,
                    )
                else:
                    batch.zero_counts = torch.full(
                        (1,), torch.count_nonzero(~mask), device=x.device
                    )
                current_batch_size += batch.x.shape[0]
            else:
                current_batch_size += weight.count_nonzero()

            # mask = ~(weight.isnan() | x.isnan().any(dim=1)) & mask
            batches_out.append(batch)

            # check this condition at the end such that the sampling runs at least once
            if current_batch_size > self.batch_size_threshold * n:
                break

        return SampleBatch.cat(batches_out)
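
    # Editor's note (sketch): the while-loop keeps drawing fresh batches until
    # the number of retained (nonzero) samples exceeds batch_size_threshold * n,
    # so dropping zero-weight points cannot starve a training step of samples.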

    def train_step(self) -> TrainingStatus:
        """
        Performs a single training step

        Returns:
            Training status
        """

        if self.step == self.freeze_cwnet_iteration and self.cwnet is not None:
            for param in self.cwnet.parameters():
                param.requires_grad = False

        if self.step_type_count == 0:
            buffered = False
            samples = self._get_samples(
                self.batch_size, self.uniform_channel_ratio, train=True
            )
            loss, means, variances, counts = self._optimization_step(samples)
            self._store_samples(samples)
            self.integration_history.store(means[None], variances[None], counts[None])

            if self.buffered_steps != 0 and self.buffer.size > self.minimum_buffer_size:
                self.step_type_count += 1
        else:
            buffered = True
            samples = SampleBatch(*self.buffer.sample(self.batch_size))
            loss, _, _, _ = self._optimization_step(samples)
            self.step_type_count = (self.step_type_count + 1) % (
                self.buffered_steps + 1
            )

        dropped_channels = self._disable_unused_channels()
        status = TrainingStatus(
            step=self.step,
            loss=loss,
            buffered=buffered,
            learning_rate=(
                None if self.scheduler is None else self.scheduler.get_last_lr()[0]
            ),
            dropped_channels=dropped_channels,
        )

        if self.scheduler is not None:
            self.scheduler.step()
        self.step += 1
        return status

    def train(
        self,
        steps: int,
        callback: Callable[[TrainingStatus], None] | None = None,
        capture_keyboard_interrupt: bool = False,
    ):
        """
        Performs multiple training steps

        Args:
            steps: number of training steps
            callback: function that is called after each training step with the training
                status as argument
            capture_keyboard_interrupt: If True, a keyboard interrupt does not raise an
                exception. Instead, the current training step is finished and the training
                is aborted afterwards.
        """
        interrupted = False
        if capture_keyboard_interrupt:

            def handler(sig, frame):
                nonlocal interrupted
                interrupted = True

            old_handler = signal.signal(signal.SIGINT, handler)

        try:
            for _ in range(steps):
                status = self.train_step()
                if callback is not None:
                    callback(status)
                if interrupted:
                    break
        finally:
            if capture_keyboard_interrupt:
                signal.signal(signal.SIGINT, old_handler)
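
# Editor's illustrative usage sketch (not part of the package source; assumes a
# user-supplied `my_flow` implementing the Distribution interface and an
# integrand `f` over the unit hypercube):
#
#     integrator = Integrator(f, dims=2, flow=my_flow,
#                             buffer_capacity=100_000, buffered_steps=2)
#     integrator.train(1000, callback=lambda s: print(s.step, s.loss),
#                      capture_keyboard_interrupt=True)
#     torch.save(integrator.state_dict(), "integrator.pt")  # torch module state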