qpytorch-0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- qpytorch/__init__.py +327 -0
- qpytorch/constraints/__init__.py +3 -0
- qpytorch/distributions/__init__.py +21 -0
- qpytorch/distributions/delta.py +86 -0
- qpytorch/distributions/multitask_multivariate_qexponential.py +435 -0
- qpytorch/distributions/multivariate_qexponential.py +581 -0
- qpytorch/distributions/power.py +113 -0
- qpytorch/distributions/qexponential.py +153 -0
- qpytorch/functions/__init__.py +58 -0
- qpytorch/kernels/__init__.py +80 -0
- qpytorch/kernels/grid_interpolation_kernel.py +213 -0
- qpytorch/kernels/inducing_point_kernel.py +151 -0
- qpytorch/kernels/kernel.py +695 -0
- qpytorch/kernels/matern32_kernel_grad.py +155 -0
- qpytorch/kernels/matern52_kernel_grad.py +194 -0
- qpytorch/kernels/matern52_kernel_gradgrad.py +248 -0
- qpytorch/kernels/polynomial_kernel_grad.py +88 -0
- qpytorch/kernels/qexponential_symmetrized_kl_kernel.py +61 -0
- qpytorch/kernels/rbf_kernel_grad.py +125 -0
- qpytorch/kernels/rbf_kernel_gradgrad.py +186 -0
- qpytorch/kernels/rff_kernel.py +153 -0
- qpytorch/lazy/__init__.py +9 -0
- qpytorch/likelihoods/__init__.py +66 -0
- qpytorch/likelihoods/bernoulli_likelihood.py +75 -0
- qpytorch/likelihoods/beta_likelihood.py +76 -0
- qpytorch/likelihoods/gaussian_likelihood.py +472 -0
- qpytorch/likelihoods/laplace_likelihood.py +59 -0
- qpytorch/likelihoods/likelihood.py +437 -0
- qpytorch/likelihoods/likelihood_list.py +60 -0
- qpytorch/likelihoods/multitask_gaussian_likelihood.py +542 -0
- qpytorch/likelihoods/multitask_qexponential_likelihood.py +545 -0
- qpytorch/likelihoods/noise_models.py +184 -0
- qpytorch/likelihoods/qexponential_likelihood.py +494 -0
- qpytorch/likelihoods/softmax_likelihood.py +97 -0
- qpytorch/likelihoods/student_t_likelihood.py +90 -0
- qpytorch/means/__init__.py +23 -0
- qpytorch/metrics/__init__.py +17 -0
- qpytorch/mlls/__init__.py +53 -0
- qpytorch/mlls/_approximate_mll.py +79 -0
- qpytorch/mlls/deep_approximate_mll.py +30 -0
- qpytorch/mlls/deep_predictive_log_likelihood.py +32 -0
- qpytorch/mlls/exact_marginal_log_likelihood.py +96 -0
- qpytorch/mlls/gamma_robust_variational_elbo.py +106 -0
- qpytorch/mlls/inducing_point_kernel_added_loss_term.py +69 -0
- qpytorch/mlls/kl_qexponential_added_loss_term.py +41 -0
- qpytorch/mlls/leave_one_out_pseudo_likelihood.py +73 -0
- qpytorch/mlls/marginal_log_likelihood.py +48 -0
- qpytorch/mlls/predictive_log_likelihood.py +76 -0
- qpytorch/mlls/sum_marginal_log_likelihood.py +40 -0
- qpytorch/mlls/variational_elbo.py +77 -0
- qpytorch/models/__init__.py +72 -0
- qpytorch/models/approximate_qep.py +115 -0
- qpytorch/models/deep_qeps/__init__.py +22 -0
- qpytorch/models/deep_qeps/deep_qep.py +155 -0
- qpytorch/models/deep_qeps/dspp.py +114 -0
- qpytorch/models/exact_prediction_strategies.py +880 -0
- qpytorch/models/exact_qep.py +349 -0
- qpytorch/models/model_list.py +100 -0
- qpytorch/models/pyro/__init__.py +28 -0
- qpytorch/models/pyro/_pyro_mixin.py +57 -0
- qpytorch/models/pyro/distributions/__init__.py +5 -0
- qpytorch/models/pyro/pyro_qep.py +105 -0
- qpytorch/models/qep.py +7 -0
- qpytorch/models/qeplvm/__init__.py +6 -0
- qpytorch/models/qeplvm/bayesian_qeplvm.py +40 -0
- qpytorch/models/qeplvm/latent_variable.py +102 -0
- qpytorch/module.py +30 -0
- qpytorch/optim/__init__.py +5 -0
- qpytorch/priors/__init__.py +42 -0
- qpytorch/priors/qep_priors.py +81 -0
- qpytorch/test/__init__.py +22 -0
- qpytorch/test/base_likelihood_test_case.py +106 -0
- qpytorch/test/model_test_case.py +150 -0
- qpytorch/test/variational_test_case.py +400 -0
- qpytorch/utils/__init__.py +38 -0
- qpytorch/utils/warnings.py +37 -0
- qpytorch/variational/__init__.py +47 -0
- qpytorch/variational/_variational_distribution.py +61 -0
- qpytorch/variational/_variational_strategy.py +391 -0
- qpytorch/variational/additive_grid_interpolation_variational_strategy.py +90 -0
- qpytorch/variational/batch_decoupled_variational_strategy.py +256 -0
- qpytorch/variational/cholesky_variational_distribution.py +65 -0
- qpytorch/variational/ciq_variational_strategy.py +352 -0
- qpytorch/variational/delta_variational_distribution.py +41 -0
- qpytorch/variational/grid_interpolation_variational_strategy.py +113 -0
- qpytorch/variational/independent_multitask_variational_strategy.py +114 -0
- qpytorch/variational/lmc_variational_strategy.py +248 -0
- qpytorch/variational/mean_field_variational_distribution.py +58 -0
- qpytorch/variational/multitask_variational_strategy.py +317 -0
- qpytorch/variational/natural_variational_distribution.py +152 -0
- qpytorch/variational/nearest_neighbor_variational_strategy.py +487 -0
- qpytorch/variational/orthogonally_decoupled_variational_strategy.py +128 -0
- qpytorch/variational/tril_natural_variational_distribution.py +130 -0
- qpytorch/variational/uncorrelated_multitask_variational_strategy.py +114 -0
- qpytorch/variational/unwhitened_variational_strategy.py +225 -0
- qpytorch/variational/variational_strategy.py +280 -0
- qpytorch/version.py +4 -0
- qpytorch-0.1.dist-info/LICENSE +21 -0
- qpytorch-0.1.dist-info/METADATA +177 -0
- qpytorch-0.1.dist-info/RECORD +102 -0
- qpytorch-0.1.dist-info/WHEEL +5 -0
- qpytorch-0.1.dist-info/top_level.txt +1 -0
qpytorch/variational/batch_decoupled_variational_strategy.py
@@ -0,0 +1,256 @@
#!/usr/bin/env python3

from typing import Optional, Tuple, Union

import torch
from linear_operator.operators import LinearOperator, MatmulLinearOperator, SumLinearOperator
from torch import Tensor
from torch.distributions.kl import kl_divergence

from ..distributions import Delta, MultivariateNormal, MultivariateQExponential
from ..models import ApproximateGP, ApproximateQEP
from gpytorch.utils.errors import CachingError
from gpytorch.utils.memoize import pop_from_cache_ignore_args
from ._variational_distribution import _VariationalDistribution
from .delta_variational_distribution import DeltaVariationalDistribution
from .variational_strategy import VariationalStrategy


class BatchDecoupledVariationalStrategy(VariationalStrategy):
    r"""
    A VariationalStrategy that uses a different set of inducing points for the
    variational mean and variational covar. It follows the "decoupled" model
    proposed by `Jankowiak et al. (2020)`_ (which is roughly based on the strategies
    proposed by `Cheng et al. (2017)`_).

    Let :math:`\mathbf Z_\mu` and :math:`\mathbf Z_\sigma` be the mean/variance
    inducing points. The variational distribution for an input :math:`\mathbf x` is given by:

    .. math::

        \begin{align*}
            \mathbb E[ f(\mathbf x) ] &= \mathbf k_{\mathbf Z_\mu \mathbf x}^\top
                \mathbf K_{\mathbf Z_\mu \mathbf Z_\mu}^{-1} \mathbf m
            \\
            \text{Var}[ f(\mathbf x) ] &= k_{\mathbf x \mathbf x} - \mathbf k_{\mathbf Z_\sigma \mathbf x}^\top
                \mathbf K_{\mathbf Z_\sigma \mathbf Z_\sigma}^{-1}
                \left( \mathbf K_{\mathbf Z_\sigma \mathbf Z_\sigma} - \mathbf S \right)
                \mathbf K_{\mathbf Z_\sigma \mathbf Z_\sigma}^{-1}
                \mathbf k_{\mathbf Z_\sigma \mathbf x}
        \end{align*}

    where :math:`\mathbf m` and :math:`\mathbf S` are the variational parameters.
    Unlike the original proposed implementation, :math:`\mathbf Z_\mu` and :math:`\mathbf Z_\sigma`
    have **the same number of inducing points**, which allows us to perform batched operations.

    Additionally, you can use a different set of kernel hyperparameters for the mean and the variance function.
    We recommend using this feature only with the :obj:`~qpytorch.mlls.PredictiveLogLikelihood` objective function
    as proposed in "Parametric Gaussian Process Regressors" (`Jankowiak et al. (2020)`_).
    Use ``mean_var_batch_dim`` to indicate which batch dimension corresponds to the different mean/var
    kernels.

    .. note::
        We recommend using the "right-most" batch dimension (i.e. ``mean_var_batch_dim=-1``) for the dimension
        that corresponds to the different mean/variance kernel parameters.

    Assuming you want `b1` many independent GPs (uncorrelated QEPs), the :obj:`~qpytorch.variational._VariationalDistribution`
    objects should have a batch shape of `b1`, and the mean/covar modules
    of the GP (QEP) should have a batch shape of `b1 x 2`.
    (The 2 corresponds to the mean/variance hyperparameters.)

    .. seealso::
        :obj:`~qpytorch.variational.OrthogonallyDecoupledVariationalStrategy` (a variant proposed by
        `Salimbeni et al. (2018)`_ that uses orthogonal projections.)

    :param model: Model this strategy is applied to.
        Typically passed in when the VariationalStrategy is created in the
        __init__ method of the user defined model.
        It should carry a ``power`` attribute if a Q-Exponential distribution is involved.
    :param inducing_points: Tensor containing a set of inducing
        points to use for variational inference.
    :param variational_distribution: A
        VariationalDistribution object that represents the form of the variational distribution :math:`q(\mathbf u)`
    :param learn_inducing_locations: (Default True): Whether or not
        the inducing point locations :math:`\mathbf Z` should be learned (i.e. are they
        parameters of the model).
    :param mean_var_batch_dim: (Default `None`):
        Set this parameter (ideally to `-1`) to indicate which dimension corresponds to different
        kernel hyperparameters for the mean/variance functions.
    :param jitter_val: Amount of diagonal jitter to add for Cholesky factorization numerical stability

    .. _Cheng et al. (2017):
        https://arxiv.org/abs/1711.10127

    .. _Salimbeni et al. (2018):
        https://arxiv.org/abs/1809.08820

    .. _Jankowiak et al. (2020):
        https://arxiv.org/abs/1910.07123

    Example (**different** hypers for mean/variance):
        >>> class MeanFieldDecoupledModel(qpytorch.models.ApproximateGP or qpytorch.models.ApproximateQEP):
        >>>     '''
        >>>     A batch of 3 independent MeanFieldDecoupled PPGPR (PPQEP) models.
        >>>     '''
        >>>     def __init__(self, inducing_points):
        >>>         # The variational parameters have a batch_shape of [3]
        >>>         variational_distribution = qpytorch.variational.MeanFieldVariationalDistribution(
        >>>             inducing_points.size(-1), batch_shape=torch.Size([3]),
        >>>         )
        >>>         variational_strategy = qpytorch.variational.BatchDecoupledVariationalStrategy(
        >>>             self, inducing_points, variational_distribution, learn_inducing_locations=True,
        >>>             mean_var_batch_dim=-1
        >>>         )
        >>>
        >>>         # The mean/covar modules have a batch_shape of [3, 2]
        >>>         # where the last batch dim corresponds to the mean & variance hyperparameters
        >>>         super().__init__(variational_strategy)
        >>>         self.mean_module = qpytorch.means.ConstantMean(batch_shape=torch.Size([3, 2]))
        >>>         self.covar_module = qpytorch.kernels.ScaleKernel(
        >>>             qpytorch.kernels.RBFKernel(batch_shape=torch.Size([3, 2])),
        >>>             batch_shape=torch.Size([3, 2]),
        >>>         )

    Example (**shared** hypers for mean/variance):
        >>> class MeanFieldDecoupledModel(qpytorch.models.ApproximateGP or qpytorch.models.ApproximateQEP):
        >>>     '''
        >>>     A batch of 3 independent MeanFieldDecoupled PPGPR (PPQEP) models.
        >>>     '''
        >>>     def __init__(self, inducing_points):
        >>>         # The variational parameters have a batch_shape of [3]
        >>>         variational_distribution = qpytorch.variational.MeanFieldVariationalDistribution(
        >>>             inducing_points.size(-1), batch_shape=torch.Size([3]),
        >>>         )
        >>>         variational_strategy = qpytorch.variational.BatchDecoupledVariationalStrategy(
        >>>             self, inducing_points, variational_distribution, learn_inducing_locations=True,
        >>>         )
        >>>
        >>>         # The mean/covar modules have a batch_shape of [3, 1]
        >>>         # where the singleton dimension corresponds to the shared mean/variance hyperparameters
        >>>         super().__init__(variational_strategy)
        >>>         self.mean_module = qpytorch.means.ConstantMean(batch_shape=torch.Size([3, 1]))
        >>>         self.covar_module = qpytorch.kernels.ScaleKernel(
        >>>             qpytorch.kernels.RBFKernel(batch_shape=torch.Size([3, 1])),
        >>>             batch_shape=torch.Size([3, 1]),
        >>>         )
    """

    def __init__(
        self,
        model: Union[ApproximateGP, ApproximateQEP],
        inducing_points: Tensor,
        variational_distribution: _VariationalDistribution,
        learn_inducing_locations: bool = True,
        mean_var_batch_dim: Optional[int] = None,
        jitter_val: Optional[float] = None,
    ):
        if isinstance(variational_distribution, DeltaVariationalDistribution):
            raise NotImplementedError(
                "BatchDecoupledVariationalStrategy does not work with DeltaVariationalDistribution"
            )

        if mean_var_batch_dim is not None and mean_var_batch_dim >= 0:
            raise ValueError(f"mean_var_batch_dim should be negative indexed, got {mean_var_batch_dim}")
        self.mean_var_batch_dim = mean_var_batch_dim

        # Maybe unsqueeze inducing points
        if inducing_points.dim() == 1:
            inducing_points = inducing_points.unsqueeze(-1)

        # We're going to create two sets of inducing points
        # One set for computing the mean, one set for computing the variance
        if self.mean_var_batch_dim is not None:
            inducing_points = torch.stack([inducing_points, inducing_points], dim=(self.mean_var_batch_dim - 2))
        else:
            inducing_points = torch.stack([inducing_points, inducing_points], dim=-3)
        super().__init__(
            model, inducing_points, variational_distribution, learn_inducing_locations, jitter_val=jitter_val
        )

    def _expand_inputs(self, x: Tensor, inducing_points: Tensor) -> Tuple[Tensor, Tensor]:
        # If we haven't explicitly marked a dimension as batch, add the corresponding batch dimension to the input
        if self.mean_var_batch_dim is None:
            x = x.unsqueeze(-3)
        else:
            x = x.unsqueeze(self.mean_var_batch_dim - 2)
        return super()._expand_inputs(x, inducing_points)

    def forward(
        self,
        x: Tensor,
        inducing_points: Tensor,
        inducing_values: Tensor,
        variational_inducing_covar: Optional[LinearOperator] = None,
        **kwargs,
    ) -> Union[MultivariateNormal, MultivariateQExponential]:
        # We'll compute the covariance, and cross-covariance terms for both the
        # pred-mean and pred-covar, using their different inducing points (and maybe kernel hypers)

        mean_var_batch_dim = self.mean_var_batch_dim or -1

        # Compute full prior distribution
        full_inputs = torch.cat([inducing_points, x], dim=-2)
        full_output = self.model.forward(full_inputs, **kwargs)
        full_covar = full_output.lazy_covariance_matrix

        # Covariance terms
        num_induc = inducing_points.size(-2)
        test_mean = full_output.mean[..., num_induc:]
        induc_induc_covar = full_covar[..., :num_induc, :num_induc].add_jitter(self.jitter_val)
        induc_data_covar = full_covar[..., :num_induc, num_induc:].to_dense()
        data_data_covar = full_covar[..., num_induc:, num_induc:]

        # Compute interpolation terms
        # K_ZZ^{-1/2} K_ZX
        # K_ZZ^{-1/2} \mu_Z
        L = self._cholesky_factor(induc_induc_covar)
        if L.shape != induc_induc_covar.shape:
            # Aggressive caching can cause nasty shape incompatibilities when evaluating with different batch shapes
            # TODO: Use a hook to make this cleaner
            try:
                pop_from_cache_ignore_args(self, "cholesky_factor")
            except CachingError:
                pass
            L = self._cholesky_factor(induc_induc_covar)
        interp_term = L.solve(induc_data_covar.double()).to(full_inputs.dtype)
        mean_interp_term = interp_term.select(mean_var_batch_dim - 2, 0)
        var_interp_term = interp_term.select(mean_var_batch_dim - 2, 1)

        # Compute the mean of q(f)
        # k_XZ K_ZZ^{-1/2} m + \mu_X
        # Here we're using the terms that correspond to the mean's inducing points
        predictive_mean = torch.add(
            torch.matmul(mean_interp_term.transpose(-1, -2), inducing_values.unsqueeze(-1)).squeeze(-1),
            test_mean.select(mean_var_batch_dim - 1, 0),
        )

        # Compute the covariance of q(f)
        # K_XX + k_XZ K_ZZ^{-1/2} (S - I) K_ZZ^{-1/2} k_ZX
        middle_term = self.prior_distribution.lazy_covariance_matrix.mul(-1)
        if variational_inducing_covar is not None:
            middle_term = SumLinearOperator(variational_inducing_covar, middle_term)
        predictive_covar = SumLinearOperator(
            data_data_covar.add_jitter(self.jitter_val).to_dense().select(mean_var_batch_dim - 2, 1),
            MatmulLinearOperator(var_interp_term.transpose(-1, -2), middle_term @ var_interp_term),
        )

        if hasattr(self.model, 'power'):
            return MultivariateQExponential(predictive_mean, predictive_covar, power=self.model.power)
        else:
            return MultivariateNormal(predictive_mean, predictive_covar)

    def kl_divergence(self) -> Tensor:
        variational_dist = self.variational_distribution
        prior_dist = self.prior_distribution

        mean_dist = Delta(variational_dist.mean)
        if hasattr(self.model, 'power'):
            covar_dist = MultivariateQExponential(
                torch.zeros_like(variational_dist.mean), variational_dist.lazy_covariance_matrix, power=self.model.power
            )
        else:
            covar_dist = MultivariateNormal(
                torch.zeros_like(variational_dist.mean), variational_dist.lazy_covariance_matrix
            )
        return kl_divergence(mean_dist, prior_dist) + kl_divergence(covar_dist, prior_dist)
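Editor's usage note: the docstring above recommends pairing this strategy with the :obj:`~qpytorch.mlls.PredictiveLogLikelihood` objective. The following is a minimal, untested sketch of that pairing for a single (non-batched) GP model, written on the assumption that the qpytorch training API mirrors GPyTorch's; the class name, data, learning rate, and shapes below are illustrative and not part of the package.

# Minimal training sketch (assumption: qpytorch mirrors the GPyTorch API; everything below is illustrative).
import torch
import qpytorch

class DecoupledSVGP(qpytorch.models.ApproximateGP):
    def __init__(self, inducing_points):  # inducing_points: (M, d)
        variational_distribution = qpytorch.variational.MeanFieldVariationalDistribution(inducing_points.size(-2))
        variational_strategy = qpytorch.variational.BatchDecoupledVariationalStrategy(
            self, inducing_points, variational_distribution,
            learn_inducing_locations=True, mean_var_batch_dim=-1,
        )
        super().__init__(variational_strategy)
        # Batch shape [2]: one set of hyperparameters for the mean, one for the variance
        self.mean_module = qpytorch.means.ConstantMean(batch_shape=torch.Size([2]))
        self.covar_module = qpytorch.kernels.ScaleKernel(
            qpytorch.kernels.RBFKernel(batch_shape=torch.Size([2])), batch_shape=torch.Size([2]),
        )

    def forward(self, x):
        return qpytorch.distributions.MultivariateNormal(self.mean_module(x), self.covar_module(x))

# Synthetic 1-d regression data (placeholders)
train_x = torch.linspace(0, 1, 100).unsqueeze(-1)
train_y = torch.sin(6 * train_x).squeeze(-1) + 0.1 * torch.randn(100)

model = DecoupledSVGP(inducing_points=torch.linspace(0, 1, 16).unsqueeze(-1))
likelihood = qpytorch.likelihoods.GaussianLikelihood()
# The docstring recommends the predictive log likelihood objective for decoupled models
mll = qpytorch.mlls.PredictiveLogLikelihood(likelihood, model, num_data=train_y.numel())

optimizer = torch.optim.Adam([*model.parameters(), *likelihood.parameters()], lr=0.01)
model.train()
likelihood.train()
for _ in range(200):
    optimizer.zero_grad()
    loss = -mll(model(train_x), train_y)  # q(f) is built by BatchDecoupledVariationalStrategy.forward
    loss.backward()
    optimizer.step()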
qpytorch/variational/cholesky_variational_distribution.py
@@ -0,0 +1,65 @@
#!/usr/bin/env python3

from typing import Union

import torch
from linear_operator.operators import CholLinearOperator, TriangularLinearOperator

from ..distributions import MultivariateNormal, MultivariateQExponential
from ._variational_distribution import _VariationalDistribution


class CholeskyVariationalDistribution(_VariationalDistribution):
    """
    A :obj:`~qpytorch.variational._VariationalDistribution` that is defined to be a multivariate normal (q-exponential) distribution
    with a full covariance matrix.

    The most common way this distribution is defined is to parameterize it in terms of a mean vector and a covariance
    matrix. In order to ensure that the covariance matrix remains positive definite, we only consider the lower
    triangle.

    :param num_inducing_points: Size of the variational distribution. This implies that the variational mean
        should be this size, and the variational covariance matrix should have this many rows and columns.
    :param batch_shape: Specifies an optional batch size
        for the variational parameters. This is useful for example when doing additive variational inference.
    :param mean_init_std: (Default: 1e-3) Standard deviation of Gaussian (q-exponential) noise to add to the mean initialization.
    """

    def __init__(
        self,
        num_inducing_points: int,
        batch_shape: torch.Size = torch.Size([]),
        mean_init_std: float = 1e-3,
        **kwargs,
    ):
        super().__init__(num_inducing_points=num_inducing_points, batch_shape=batch_shape, mean_init_std=mean_init_std)
        mean_init = torch.zeros(num_inducing_points)
        covar_init = torch.eye(num_inducing_points, num_inducing_points)
        mean_init = mean_init.repeat(*batch_shape, 1)
        covar_init = covar_init.repeat(*batch_shape, 1, 1)

        self.register_parameter(name="variational_mean", parameter=torch.nn.Parameter(mean_init))
        self.register_parameter(name="chol_variational_covar", parameter=torch.nn.Parameter(covar_init))

        if 'power' in kwargs: self.power = kwargs.pop('power')

    def forward(self) -> Union[MultivariateNormal, MultivariateQExponential]:
        chol_variational_covar = self.chol_variational_covar
        dtype = chol_variational_covar.dtype
        device = chol_variational_covar.device

        # First make sure the Cholesky factor is lower triangular
        lower_mask = torch.ones(self.chol_variational_covar.shape[-2:], dtype=dtype, device=device).tril(0)
        chol_variational_covar = TriangularLinearOperator(chol_variational_covar.mul(lower_mask))

        # Now construct the actual matrix
        variational_covar = CholLinearOperator(chol_variational_covar)
        if not hasattr(self, 'power'):
            return MultivariateNormal(self.variational_mean, variational_covar)
        else:
            return MultivariateQExponential(self.variational_mean, variational_covar, power=self.power)

    def initialize_variational_distribution(self, prior_dist: Union[MultivariateNormal, MultivariateQExponential]) -> None:
        self.variational_mean.data.copy_(prior_dist.mean)
        self.variational_mean.data.add_(torch.randn_like(prior_dist.mean), alpha=self.mean_init_std)
        self.chol_variational_covar.data.copy_(prior_dist.lazy_covariance_matrix.cholesky().to_dense())
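Editor's note: the lower-triangle masking in ``forward`` is the entire positive-(semi)definiteness trick: the unconstrained square parameter is projected onto its lower triangle L, and the covariance consumed downstream is S = L Lᵀ. Below is a small plain-PyTorch illustration of that construction, independent of the class and intended only for intuition; it is not part of the package.

# Plain-torch sketch of the masked-Cholesky parameterization used above (illustrative only).
import torch

M = 4
raw = torch.randn(M, M, requires_grad=True)   # unconstrained parameter, analogous to chol_variational_covar
lower_mask = torch.ones(M, M).tril(0)         # keep only the lower triangle
L = raw * lower_mask                          # masked Cholesky factor
S = L @ L.transpose(-1, -2)                   # implied covariance, positive semi-definite by construction

# Gradient signal flows only through the lower-triangular entries of `raw`;
# the strictly upper-triangular entries receive exactly zero gradient.
S.trace().backward()
assert torch.equal(raw.grad * (1 - lower_mask), torch.zeros(M, M))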
qpytorch/variational/ciq_variational_strategy.py
@@ -0,0 +1,352 @@
#!/usr/bin/env python3

from typing import Optional, Tuple, Union

import torch
from linear_operator import to_linear_operator
from linear_operator.operators import DiagLinearOperator, LinearOperator, MatmulLinearOperator, SumLinearOperator
from linear_operator.utils import linear_cg
from torch import Tensor
from torch.autograd.function import FunctionCtx

from .. import settings
from ..distributions import Delta, Distribution, MultivariateNormal, MultivariateQExponential
from ..module import Module
from gpytorch.utils.memoize import cached
from ._variational_strategy import _VariationalStrategy
from .natural_variational_distribution import NaturalVariationalDistribution


class _NgdInterpTerms(torch.autograd.Function):
    """
    This function takes in

    - the kernel interpolation term K_ZZ^{-1/2} k_ZX
    - the natural parameters of the variational distribution

    and returns

    - the predictive distribution mean/covariance
    - the inducing KL divergence KL( q(u) || p(u))

    However, the gradients will be with respect to the **canonical parameters**
    of the variational distribution, rather than the **natural parameters**.
    This corresponds to performing natural gradient descent on the variational distribution.
    """

    @staticmethod
    def forward(
        ctx: FunctionCtx,
        interp_term: torch.Tensor,
        natural_vec: torch.Tensor,
        natural_mat: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        # Compute precision
        prec = natural_mat.mul(-2.0)
        diag = prec.diagonal(dim1=-1, dim2=-2).unsqueeze(-1)

        # Make sure that interp_term and natural_vec are the same batch shape
        batch_shape = torch.broadcast_shapes(interp_term.shape[:-2], natural_vec.shape[:-1])
        expanded_interp_term = interp_term.expand(*batch_shape, *interp_term.shape[-2:])
        expanded_natural_vec = natural_vec.expand(*batch_shape, natural_vec.size(-1))

        # Compute necessary solves with the precision. We need
        # m = expec_vec = S * natural_vec
        # S K^{-1/2} k
        solves = linear_cg(
            prec.matmul,
            torch.cat([expanded_natural_vec.unsqueeze(-1), expanded_interp_term], dim=-1),
            n_tridiag=0,
            max_iter=settings.max_cg_iterations.value(),
            tolerance=min(settings.eval_cg_tolerance.value(), settings.cg_tolerance.value()),
            max_tridiag_iter=settings.max_lanczos_quadrature_iterations.value(),
            preconditioner=lambda x: x / diag,
        )
        expec_vec = solves[..., 0]
        s_times_interp_term = solves[..., 1:]

        # Compute the interpolated mean
        # k^T K^{-1/2} m
        interp_mean = (s_times_interp_term.transpose(-1, -2) @ natural_vec.unsqueeze(-1)).squeeze(-1)

        # Compute the interpolated variance
        # k^T K^{-1/2} S K^{-1/2} k = k^T K^{-1/2} (expec_mat - expec_vec expec_vec^T) K^{-1/2} k
        interp_var = (s_times_interp_term * interp_term).sum(dim=-2)

        # Let's not bother actually computing the KL-div in the forward pass
        # 1/2 ( -log | S | + tr(S) + m^T m - len(m) )
        # = 1/2 ( -log | expec_mat - expec_vec expec_vec^T | + tr(expec_mat) - len(m) )
        kl_div = torch.zeros_like(interp_mean[..., 0])

        # We're done!
        ctx.save_for_backward(interp_term, s_times_interp_term, interp_mean, natural_vec, expec_vec, prec)
        return interp_mean, interp_var, kl_div

    @staticmethod
    def backward(
        ctx: FunctionCtx, interp_mean_grad: torch.Tensor, interp_var_grad: torch.Tensor, kl_div_grad: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, None]:
        # Get the saved terms
        interp_term, s_times_interp_term, interp_mean, natural_vec, expec_vec, prec = ctx.saved_tensors

        # Expand data-dependent gradients
        interp_mean_grad = interp_mean_grad.unsqueeze(-2)
        interp_var_grad = interp_var_grad.unsqueeze(-2)

        # Compute gradient of interp term (K^{-1/2} k)
        # interp_mean component: m
        # interp_var component: S K^{-1/2} k
        # kl component: 0
        interp_term_grad = (interp_var_grad * s_times_interp_term).mul(2.0) + (
            interp_mean_grad * expec_vec.unsqueeze(-1)
        )

        # Compute gradient of expected vector (m)
        # interp_mean component: K^{-1/2} k
        # interp_var component: (k^T K^{-1/2} m) K^{-1/2} k
        # kl component: S^{-1} m
        expec_vec_grad = (
            (interp_var_grad * interp_mean.unsqueeze(-2) * interp_term).sum(dim=-1).mul(-2)
            + (interp_mean_grad * interp_term).sum(dim=-1)
            + (kl_div_grad.unsqueeze(-1) * natural_vec)
        )

        # Compute gradient of expected matrix (mm^T + S)
        # interp_mean component: 0
        # interp_var component: K^{-1/2} k k^T K^{-1/2}
        # kl component: 1/2 ( I - S^{-1} )
        eye = torch.eye(expec_vec.size(-1), device=expec_vec.device, dtype=expec_vec.dtype)
        expec_mat_grad = torch.add(
            (interp_var_grad * interp_term) @ interp_term.transpose(-1, -2),
            (kl_div_grad.unsqueeze(-1).unsqueeze(-1) * (eye - prec).mul(0.5)),
        )

        # We're done!
        return interp_term_grad, expec_vec_grad, expec_mat_grad, None  # Extra "None" for the kwarg


class CiqVariationalStrategy(_VariationalStrategy):
    r"""
    Similar to :class:`~qpytorch.variational.VariationalStrategy`,
    except the whitening operation is performed using Contour Integral Quadrature
    rather than Cholesky (see `Pleiss et al. (2020)`_ for more info).
    See the `CIQ-SVGP tutorial`_ for an example.

    Contour Integral Quadrature uses iterative matrix-vector multiplication to approximate
    the :math:`\mathbf K_{\mathbf Z \mathbf Z}^{-1/2}` matrix used for the whitening operation.
    This can be more efficient than the standard variational strategy for large numbers
    of inducing points (e.g. :math:`M > 1000`) or when the inducing points have structure
    (e.g. they lie on an evenly-spaced grid).

    .. note::

        It is recommended that this object is used in conjunction with
        :obj:`~qpytorch.variational.NaturalVariationalDistribution` and
        `natural gradient descent`_.

    :param model: Model this strategy is applied to.
        Typically passed in when the VariationalStrategy is created in the
        __init__ method of the user defined model.
        It should carry a ``power`` attribute if a Q-Exponential distribution is involved.
    :param inducing_points: Tensor containing a set of inducing
        points to use for variational inference.
    :param variational_distribution: A
        VariationalDistribution object that represents the form of the variational distribution :math:`q(\mathbf u)`
    :param learn_inducing_locations: (Default True): Whether or not
        the inducing point locations :math:`\mathbf Z` should be learned (i.e. are they
        parameters of the model).
    :param jitter_val: Amount of diagonal jitter to add for Cholesky factorization numerical stability

    .. _Pleiss et al. (2020):
        https://arxiv.org/pdf/2006.11267.pdf
    .. _CIQ-SVGP tutorial:
        examples/04_Variational_and_Approximate_GPs/SVGP_CIQ.html
    .. _natural gradient descent:
        examples/04_Variational_and_Approximate_GPs/Natural_Gradient_Descent.html
    """

    def _ngd(self) -> bool:
        return isinstance(self._variational_distribution, NaturalVariationalDistribution)

    @property
    @cached(name="prior_distribution_memo")
    def prior_distribution(self) -> Union[MultivariateNormal, MultivariateQExponential]:
        zeros = torch.zeros(
            self._variational_distribution.shape(),
            dtype=self._variational_distribution.dtype,
            device=self._variational_distribution.device,
        )
        ones = torch.ones_like(zeros)
        if hasattr(self.model, 'power'):
            res = MultivariateQExponential(zeros, DiagLinearOperator(ones), power=self.model.power)
        else:
            res = MultivariateNormal(zeros, DiagLinearOperator(ones))
        return res

    @property
    @cached(name="variational_distribution_memo")
    def variational_distribution(self) -> Distribution:
        if self._ngd():
            raise RuntimeError(
                "Variational distribution for NGD-CIQ should be computed during forward calls. "
                "This is probably a bug in GPyTorch."
            )
        return super().variational_distribution

    def forward(
        self,
        x: torch.Tensor,
        inducing_points: torch.Tensor,
        inducing_values: torch.Tensor,
        variational_inducing_covar: Optional[LinearOperator] = None,
        *params,
        **kwargs,
    ) -> Union[MultivariateNormal, MultivariateQExponential]:
        # Compute full prior distribution
        full_inputs = torch.cat([inducing_points, x], dim=-2)
        full_output = self.model.forward(full_inputs, *params, **kwargs)
        full_covar = full_output.lazy_covariance_matrix

        # Covariance terms
        num_induc = inducing_points.size(-2)
        test_mean = full_output.mean[..., num_induc:]
        induc_induc_covar = full_covar[..., :num_induc, :num_induc].evaluate_kernel().add_jitter(self.jitter_val)
        induc_data_covar = full_covar[..., :num_induc, num_induc:].to_dense()
        data_data_covar = full_covar[..., num_induc:, num_induc:].add_jitter(self.jitter_val)

        # Compute interpolation terms
        # K_XZ K_ZZ^{-1} \mu_z
        # K_XZ K_ZZ^{-1/2} \mu_Z
        with settings.max_preconditioner_size(0):  # Turn off preconditioning for CIQ
            interp_term = to_linear_operator(induc_induc_covar).sqrt_inv_matmul(induc_data_covar)

        # Compute interpolated mean and variance terms
        # We have separate computation rules for NGD versus standard GD
        if self._ngd():
            interp_mean, interp_var, kl_div = _NgdInterpTerms.apply(
                interp_term,
                self._variational_distribution.natural_vec,
                self._variational_distribution.natural_mat,
            )

            # Compute the covariance of q(f)
            predictive_var = data_data_covar.diagonal(dim1=-1, dim2=-2) - interp_term.pow(2).sum(dim=-2) + interp_var
            predictive_var = torch.clamp_min(predictive_var, settings.min_variance.value(predictive_var.dtype))
            predictive_covar = DiagLinearOperator(predictive_var)

            # Also compute and cache the KL divergence
            if not hasattr(self, "_memoize_cache"):
                self._memoize_cache = dict()
            self._memoize_cache["kl"] = kl_div

        else:
            # Compute interpolated mean term
            interp_mean = torch.matmul(
                interp_term.transpose(-1, -2), (inducing_values - self.prior_distribution.mean).unsqueeze(-1)
            ).squeeze(-1)

            # Compute the covariance of q(f)
            middle_term = self.prior_distribution.lazy_covariance_matrix.mul(-1)
            if variational_inducing_covar is not None:
                middle_term = SumLinearOperator(variational_inducing_covar, middle_term)
            predictive_covar = SumLinearOperator(
                data_data_covar.add_jitter(self.jitter_val),
                MatmulLinearOperator(interp_term.transpose(-1, -2), middle_term @ interp_term),
            )

        # Compute the mean of q(f)
        # k_XZ K_ZZ^{-1/2} (m - K_ZZ^{-1/2} \mu_Z) + \mu_X
        predictive_mean = interp_mean + test_mean

        # Return the distribution
        if hasattr(self.model, 'power'):
            return MultivariateQExponential(predictive_mean, predictive_covar, power=self.model.power)
        else:
            return MultivariateNormal(predictive_mean, predictive_covar)

    def kl_divergence(self) -> Tensor:
        r"""
        Compute the KL divergence between the variational inducing distribution :math:`q(\mathbf u)`
        and the prior inducing distribution :math:`p(\mathbf u)`.

        :rtype: torch.Tensor
        """
        if self._ngd():
            if hasattr(self, "_memoize_cache") and "kl" in self._memoize_cache:
                return self._memoize_cache["kl"]
            else:
                raise RuntimeError(
                    "KL divergence for NGD-CIQ should be computed during forward calls. "
                    "This is probably a bug in GPyTorch."
                )
        else:
            return super().kl_divergence()

    def __call__(self, x: torch.Tensor, prior: bool = False, *params, **kwargs) -> Union[MultivariateNormal, MultivariateQExponential]:
        # This is mostly the same as _VariationalStrategy.__call__()
        # but with special rules for natural gradient descent (to prevent O(M^3) computation)

        # If we're in prior mode, then we're done!
        if prior:
            return self.model.forward(x)

        # Delete previously cached items from the training distribution
        if self.training:
            self._clear_cache()

        # (Maybe) initialize variational distribution
        if not self.variational_params_initialized.item():
            if self._ngd():
                noise = torch.randn_like(self.prior_distribution.mean).mul_(1e-3)
                eye = torch.eye(noise.size(-1), dtype=noise.dtype, device=noise.device).mul(-0.5)
                self._variational_distribution.natural_vec.data.copy_(noise)
                self._variational_distribution.natural_mat.data.copy_(eye)
                self.variational_params_initialized.fill_(1)
            else:
                prior_dist = self.prior_distribution
                self._variational_distribution.initialize_variational_distribution(prior_dist)
                self.variational_params_initialized.fill_(1)

        # Ensure inducing_points and x are the same size
        inducing_points = self.inducing_points
        if inducing_points.shape[:-2] != x.shape[:-2]:
            x, inducing_points = self._expand_inputs(x, inducing_points)

        # Get q(f)
        if self._ngd():
            return Module.__call__(
                self,
                x,
                inducing_points,
                inducing_values=None,
                variational_inducing_covar=None,
                *params,
                **kwargs,
            )
        else:
            # Get p(u)/q(u)
            variational_dist_u = self.variational_distribution

            if isinstance(variational_dist_u, (MultivariateNormal, MultivariateQExponential)):
                return Module.__call__(
                    self,
                    x,
                    inducing_points,
                    inducing_values=variational_dist_u.mean,
                    variational_inducing_covar=variational_dist_u.lazy_covariance_matrix,
                    **kwargs,
                )
            elif isinstance(variational_dist_u, Delta):
                return Module.__call__(
                    self,
                    x,
                    inducing_points,
                    inducing_values=variational_dist_u.mean,
                    variational_inducing_covar=None,
                    **kwargs,
                )
            else:
                raise RuntimeError(
                    f"Invalid variational distribution ({type(variational_dist_u)}). "
                    "Expected a multivariate normal (q-exponential) or a delta distribution."
                )
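Editor's usage note: the docstring above recommends pairing CiqVariationalStrategy with NaturalVariationalDistribution and natural gradient descent. The sketch below illustrates that pairing under explicit assumptions: it assumes qpytorch exposes a GPyTorch-style NGD optimizer as qpytorch.optim.NGD and GPyTorch-style model.variational_parameters()/model.hyperparameters() helpers; the class name, data, iteration count, and learning rates are illustrative only and not part of the package.

# Hypothetical NGD + CIQ training sketch (qpytorch.optim.NGD and the helper methods are assumptions).
import torch
import qpytorch

class CiqSVGP(qpytorch.models.ApproximateGP):
    def __init__(self, inducing_points):  # inducing_points: (M, d)
        variational_distribution = qpytorch.variational.NaturalVariationalDistribution(inducing_points.size(-2))
        variational_strategy = qpytorch.variational.CiqVariationalStrategy(
            self, inducing_points, variational_distribution, learn_inducing_locations=True,
        )
        super().__init__(variational_strategy)
        self.mean_module = qpytorch.means.ConstantMean()
        self.covar_module = qpytorch.kernels.ScaleKernel(qpytorch.kernels.RBFKernel())

    def forward(self, x):
        return qpytorch.distributions.MultivariateNormal(self.mean_module(x), self.covar_module(x))

# Synthetic data (placeholders)
train_x = torch.linspace(0, 1, 500).unsqueeze(-1)
train_y = torch.sin(12 * train_x).squeeze(-1) + 0.2 * torch.randn(500)

model = CiqSVGP(inducing_points=torch.linspace(0, 1, 64).unsqueeze(-1))
likelihood = qpytorch.likelihoods.GaussianLikelihood()
mll = qpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.numel())

# Natural gradient steps on the variational parameters; ordinary Adam on the hyperparameters
variational_ngd_optimizer = qpytorch.optim.NGD(model.variational_parameters(), num_data=train_y.numel(), lr=0.1)
hyperparameter_optimizer = torch.optim.Adam(
    [{"params": model.hyperparameters()}, {"params": likelihood.parameters()}], lr=0.01
)

model.train()
likelihood.train()
for _ in range(100):
    variational_ngd_optimizer.zero_grad()
    hyperparameter_optimizer.zero_grad()
    loss = -mll(model(train_x), train_y)  # forward pass caches the NGD KL term used by kl_divergence()
    loss.backward()
    variational_ngd_optimizer.step()
    hyperparameter_optimizer.step()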