PyPI - google-meridian - Versions diffs - 1.2.0__py3-none-any.whl → 1.3.0__py3-none-any.whl - Mend

google-meridian 1.2.0 (py3-none-any.whl) → 1.3.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)

{google_meridian-1.2.0.dist-info → google_meridian-1.3.0.dist-info}/METADATA +10 -10
google_meridian-1.3.0.dist-info/RECORD +62 -0
meridian/analysis/__init__.py +2 -0
meridian/analysis/analyzer.py +280 -142
meridian/analysis/formatter.py +2 -2
meridian/analysis/optimizer.py +353 -169
meridian/analysis/review/__init__.py +20 -0
meridian/analysis/review/checks.py +721 -0
meridian/analysis/review/configs.py +110 -0
meridian/analysis/review/constants.py +40 -0
meridian/analysis/review/results.py +544 -0
meridian/analysis/review/reviewer.py +186 -0
meridian/analysis/summarizer.py +14 -12
meridian/analysis/templates/chips.html.jinja +12 -0
meridian/analysis/test_utils.py +27 -5
meridian/analysis/visualizer.py +45 -50
meridian/backend/__init__.py +698 -55
meridian/backend/config.py +75 -16
meridian/backend/test_utils.py +127 -1
meridian/constants.py +52 -11
meridian/data/input_data.py +7 -2
meridian/data/test_utils.py +5 -3
meridian/mlflow/autolog.py +2 -2
meridian/model/__init__.py +1 -0
meridian/model/adstock_hill.py +10 -9
meridian/model/eda/__init__.py +3 -0
meridian/model/eda/constants.py +21 -0
meridian/model/eda/eda_engine.py +1580 -84
meridian/model/eda/eda_outcome.py +200 -0
meridian/model/eda/eda_spec.py +84 -0
meridian/model/eda/meridian_eda.py +220 -0
meridian/model/knots.py +56 -50
meridian/model/media.py +10 -8
meridian/model/model.py +79 -16
meridian/model/model_test_data.py +53 -9
meridian/model/posterior_sampler.py +398 -391
meridian/model/prior_distribution.py +114 -39
meridian/model/prior_sampler.py +146 -90
meridian/model/spec.py +7 -8
meridian/model/transformers.py +16 -8
meridian/version.py +1 -1
google_meridian-1.2.0.dist-info/RECORD +0 -52
{google_meridian-1.2.0.dist-info → google_meridian-1.3.0.dist-info}/WHEEL +0 -0
{google_meridian-1.2.0.dist-info → google_meridian-1.3.0.dist-info}/licenses/LICENSE +0 -0
{google_meridian-1.2.0.dist-info → google_meridian-1.3.0.dist-info}/top_level.txt +0 -0

meridian/analysis/analyzer.py (changed)

@@ -15,6 +15,7 @@
 """Methods to compute analysis metrics of the model and the data."""
 from collections.abc import Mapping, Sequence
+import dataclasses
 import itertools
 import numbers
 from typing import Any, Optional
@@ -53,6 +54,7 @@ def _validate_non_media_baseline_values_numbers(
 # TODO: Refactor the related unit tests to be under DataTensors.
+@dataclasses.dataclass
 class DataTensors(backend.ExtensionType):
   """Container for data variable arguments of Analyzer methods.
@@ -175,12 +177,31 @@ class DataTensors(backend.ExtensionType):
         else None
     )
     self.time = (
-        backend.to_tensor(time, dtype="string") if time is not None else None
+        backend.to_tensor(time, dtype=backend.string)
+        if time is not None
+        else None
     )
-  def __validate__(self):
     self._validate_n_dims()
+  def __eq__(self, other: Any) -> bool:
+    """Provides safe equality comparison for mixed tensor/non-tensor fields."""
+    if type(self) is not type(other):
+      return NotImplemented
+    for field in dataclasses.fields(self):
+      a = getattr(self, field.name)
+      b = getattr(other, field.name)
+      if a is None and b is None:
+        continue
+      if a is None or b is None:
+        return False
+      try:
+        if not bool(np.all(backend.to_tensor(backend.equal(a, b)))):
+          return False
+      except (ValueError, TypeError):
+        if a != b:
+          return False
+    return True
   def total_spend(self) -> backend.Tensor | None:
     """Returns the total spend tensor.
@@ -216,7 +237,7 @@ class DataTensors(backend.ExtensionType):
       of the corresponding tensor in the `meridian` object. If all time
       dimensions are the same, returns `None`.
     """
-    for field in self._tf_extension_type_fields():
+    for field in dataclasses.fields(self):
       new_tensor = getattr(self, field.name)
       if field.name == constants.RF_IMPRESSIONS:
         old_tensor = getattr(meridian.rf_tensors, field.name)
@@ -282,7 +303,7 @@ class DataTensors(backend.ExtensionType):
   def _validate_n_dims(self):
     """Raises an error if the tensors have the wrong number of dimensions."""
-    for field in self._tf_extension_type_fields():
+    for field in dataclasses.fields(self):
       tensor = getattr(self, field.name)
       if tensor is None:
         continue
@@ -315,7 +336,7 @@ class DataTensors(backend.ExtensionType):
       Warning: If an attribute exists in the `DataTensors` object that is not in
         the `required_variables` list, it will be ignored.
     """
-    for field in self._tf_extension_type_fields():
+    for field in dataclasses.fields(self):
       tensor = getattr(self, field.name)
       if tensor is None:
         continue
@@ -468,7 +489,7 @@ class DataTensors(backend.ExtensionType):
   ) -> Self:
     """Fills default values and returns a new DataTensors object."""
     output = {}
-    for field in self._tf_extension_type_fields():
+    for field in dataclasses.fields(self):
       var_name = field.name
       if var_name not in required_fields:
         continue
@@ -489,7 +510,7 @@ class DataTensors(backend.ExtensionType):
         old_tensor = meridian.revenue_per_kpi
       elif var_name == constants.TIME:
         old_tensor = backend.to_tensor(
-            meridian.input_data.time.values.tolist(), dtype="string"
+            meridian.input_data.time.values.tolist(), dtype=backend.string
         )
       else:
         continue
@@ -500,6 +521,7 @@ class DataTensors(backend.ExtensionType):
     return DataTensors(**output)
+@dataclasses.dataclass
 class DistributionTensors(backend.ExtensionType):
   """Container for parameters distributions arguments of Analyzer methods."""
@@ -583,17 +605,19 @@ def _transformed_new_or_scaled(
 def _calc_rsquared(expected, actual):
   """Calculates r-squared between actual and expected outcome."""
-  return 1 - np.nanmean((expected - actual) ** 2) / np.nanvar(actual)
+  return 1 - backend.nanmean((expected - actual) ** 2) / backend.nanvar(actual)
 def _calc_mape(expected, actual):
   """Calculates MAPE between actual and expected outcome."""
-  return np.nanmean(np.abs((actual - expected) / actual))
+  return backend.nanmean(backend.absolute((actual - expected) / actual))
 def _calc_weighted_mape(expected, actual):
   """Calculates wMAPE between actual and expected outcome (weighted by actual)."""
-  return np.nansum(np.abs(actual - expected)) / np.nansum(actual)
+  return backend.nansum(backend.absolute(actual - expected)) / backend.nansum(
+      actual
+  )
 def _warn_if_geo_arg_in_kwargs(**kwargs):
@@ -675,43 +699,66 @@ def _validate_flexible_selected_times(
     selected_times: Sequence[str] | Sequence[bool] | None,
     media_selected_times: Sequence[str] | Sequence[bool] | None,
     new_n_media_times: int,
+    new_time: Sequence[str] | None = None,
 ):
   """Raises an error if selected times or media selected times is invalid.
-  This checks that the `selected_times` and `media_selected_times` arguments
-  are lists of booleans with the same number of elements as `new_n_media_times`.
-  This is only relevant if the time dimension of any of the variables in
-  `new_data` used in the analysis is modified.
+  This checks that (1) the `selected_times` and `media_selected_times` arguments
+  are lists of booleans with the same number of elements as `new_n_media_times`,
+  or (2) the `selected_times` and `media_selected_times` arguments are lists of
+  strings and the `new_time` list is provided and `selected_times` and
+  `media_selected_times` are subsets of `new_time`. This is only relevant if the
+  time dimension of any of the variables in `new_data` used in the analysis is
+  modified.
   Args:
     selected_times: Optional list of times to validate.
     media_selected_times: Optional list of media times to validate.
     new_n_media_times: The number of time periods in the new data.
+    new_time: The optional time dimension of the new data.
   """
   if selected_times and (
-      not _is_bool_list(selected_times)
-      or len(selected_times) != new_n_media_times
+      not (
+          _is_bool_list(selected_times)
+          and len(selected_times) == new_n_media_times
+      )
+      and not (
+          _is_str_list(selected_times)
+          and new_time is not None
+          and set(selected_times) <= set(new_time)
+      )
   ):
     raise ValueError(
         "If `media`, `reach`, `frequency`, `organic_media`,"
         " `organic_reach`, `organic_frequency`, `non_media_treatments`, or"
         " `revenue_per_kpi` is provided with a different number of time"
-        " periods than in `InputData`, then `selected_times` must be a list"
+        " periods than in `InputData`, then (1) `selected_times` must be a list"
         " of booleans with length equal to the number of time periods in"
-        " the new data."
+        " the new data, or (2) `selected_times` must be a list of strings and"
+        " `new_time` must be provided and `selected_times` must be a subset of"
+        " `new_time`."
     )
   if media_selected_times and (
-      not _is_bool_list(media_selected_times)
-      or len(media_selected_times) != new_n_media_times
+      not (
+          _is_bool_list(media_selected_times)
+          and len(media_selected_times) == new_n_media_times
+      )
+      and not (
+          _is_str_list(media_selected_times)
+          and new_time is not None
+          and set(media_selected_times) <= set(new_time)
+      )
   ):
     raise ValueError(
         "If `media`, `reach`, `frequency`, `organic_media`,"
         " `organic_reach`, `organic_frequency`, `non_media_treatments`, or"
         " `revenue_per_kpi` is provided with a different number of time"
-        " periods than in `InputData`, then `media_selected_times` must be"
+        " periods than in `InputData`, then (1) `media_selected_times` must be"
         " a list of booleans with length equal to the number of time"
-        " periods in the new data."
+        " periods in the new data, or (2) `media_selected_times` must be a list"
+        " of strings and `new_time` must be provided and"
+        " `media_selected_times` must be a subset of `new_time`."
     )
@@ -870,42 +917,37 @@ class Analyzer:
       )
     return result
-  def _check_revenue_data_exists(self, use_kpi: bool = False):
-    """Checks if the revenue data is available for the analysis.
+  def _use_kpi(self, use_kpi: bool = False) -> bool:
+    """Checks if KPI analysis should be used.
-    In the `kpi_type=NON_REVENUE` case, `revenue_per_kpi` is required to perform
-    the revenue analysis. If `revenue_per_kpi` is not defined, then the revenue
-    data is not available and the revenue analysis (`use_kpi=False`) is not
-    possible. Only the KPI analysis (`use_kpi=True`) is possible in this case.
+    If `use_kpi` is `True` but `kpi_type=REVENUE`, then `use_kpi` is ignored.
-    In the `kpi_type=REVENUE` case, KPI is equal to revenue and setting
-    `use_kpi=True` has no effect. Therefore, a warning is issued if the default
-    `False` value of `use_kpi` is overridden by the user.
+    If `use_kpi` is `False`, then `revenue_per_kpi` is required to perform
+    the revenue analysis. If `revenue_per_kpi` is not available, then
+    `use_kpi=False` is ignored and KPI analysis is used instead.
     Args:
-      use_kpi: A boolean flag indicating whether to use KPI instead of revenue.
+      use_kpi: A boolean flag indicating whether KPI analysis should be used.
+    Returns:
+      A boolean flag indicating whether KPI analysis should be used.
     Raises:
-      ValueError: If `use_kpi` is `False` and `revenue_per_kpi` is not defined.
-      UserWarning: If `use_kpi` is `True` in the `kpi_type=REVENUE` case.
+      UserWarning: If the KPI type is revenue and use_kpi is True or if
+      `use_kpi=False` but `revenue_per_kpi` is not available.
     """
-    if self._meridian.input_data.kpi_type == constants.NON_REVENUE:
-      if not use_kpi and self._meridian.revenue_per_kpi is None:
-        raise ValueError(
-            "Revenue analysis is not available when `revenue_per_kpi` is"
-            " unknown. Set `use_kpi=True` to perform KPI analysis instead."
-        )
+    if use_kpi and self._meridian.input_data.kpi_type == constants.REVENUE:
+      warnings.warn(
+          "Setting `use_kpi=True` has no effect when `kpi_type=REVENUE`"
+          " since in this case, KPI is equal to revenue."
+      )
+      return False
-    if self._meridian.input_data.kpi_type == constants.REVENUE:
-      # In the `kpi_type=REVENUE` case, KPI is equal to revenue and
-      # `revenue_per_kpi` is set to a tensor of 1s in the initialization of the
-      # `InputData` object.
-      assert self._meridian.revenue_per_kpi is not None
-      if use_kpi:
-        warnings.warn(
-            "Setting `use_kpi=True` has no effect when `kpi_type=REVENUE`"
-            " since in this case, KPI is equal to revenue."
-        )
+    if not use_kpi and self._meridian.input_data.revenue_per_kpi is None:
+      warnings.warn(
+          "Revenue analysis is not available when `revenue_per_kpi` is"
+          " unknown. Defaulting to KPI analysis."
+      )
+    return use_kpi or self._meridian.input_data.revenue_per_kpi is None
   def _get_adstock_dataframe(
       self,
@@ -1381,8 +1423,14 @@ class Analyzer:
             "`selected_geos` must match the geo dimension names from "
             "meridian.InputData."
         )
-      geo_mask = [x in selected_geos for x in mmm.input_data.geo]
-      tensor = backend.boolean_mask(tensor, geo_mask, axis=geo_dim)
+      geo_indices = [
+          i for i, x in enumerate(mmm.input_data.geo) if x in selected_geos
+      ]
+      tensor = backend.gather(
+          tensor,
+          backend.to_tensor(geo_indices, dtype=backend.int32),
+          axis=geo_dim,
+      )
     if selected_times is not None:
       _validate_selected_times(
@@ -1393,10 +1441,21 @@ class Analyzer:
           comparison_arg_name="`tensor`",
       )
       if _is_str_list(selected_times):
-        time_mask = [x in selected_times for x in mmm.input_data.time]
-        tensor = backend.boolean_mask(tensor, time_mask, axis=time_dim)
+        time_indices = [
+            i for i, x in enumerate(mmm.input_data.time) if x in selected_times
+        ]
+        tensor = backend.gather(
+            tensor,
+            backend.to_tensor(time_indices, dtype=backend.int32),
+            axis=time_dim,
+        )
       elif _is_bool_list(selected_times):
-        tensor = backend.boolean_mask(tensor, selected_times, axis=time_dim)
+        time_indices = [i for i, x in enumerate(selected_times) if x]
+        tensor = backend.gather(
+            tensor,
+            backend.to_tensor(time_indices, dtype=backend.int32),
+            axis=time_dim,
+        )
     tensor_dims = "...gt" + "m" * has_media_dim
     output_dims = (
@@ -1452,19 +1511,19 @@ class Analyzer:
         calculated.
       new_data: An optional `DataTensors` container with optional new tensors:
         `media`, `reach`, `frequency`, `organic_media`, `organic_reach`,
-        `organic_frequency`, `non_media_treatments`, `controls`. If `None`,
-        expected outcome is calculated conditional on the original values of the
-        data tensors that the Meridian object was initialized with. If
-        `new_data` argument is used, expected outcome is calculated conditional
-        on the values of the tensors passed in `new_data` and on the original
-        values of the remaining unset tensors. For example,
+        `organic_frequency`, `non_media_treatments`, `revenue_per_kpi`,
+        `controls`. If `None`, expected outcome is calculated conditional on the
+        original values of the data tensors that the Meridian object was
+        initialized with. If `new_data` argument is used, expected outcome is
+        calculated conditional on the values of the tensors passed in `new_data`
+        and on the original values of the remaining unset tensors. For example,
         `expected_outcome(new_data=DataTensors(reach=new_reach,
         frequency=new_frequency))` calculates expected outcome conditional on
         the original `media`, `organic_media`, `organic_reach`,
-        `organic_frequency`, `non_media_treatments` and `controls` tensors and
-        on the new given values for `reach` and `frequency` tensors. The new
-        tensors' dimensions must match the dimensions of the corresponding
-        original tensors from `input_data`.
+        `organic_frequency`, `non_media_treatments`, `revenue_per_kpi`, and
+        `controls` tensors and on the new given values for `reach` and
+        `frequency` tensors. The new tensors' dimensions must match the
+        dimensions of the corresponding original tensors from `input_data`.
       selected_geos: Optional list containing a subset of geos to include. By
         default, all geos are included.
       selected_times: Optional list containing a subset of dates to include.
@@ -1498,8 +1557,7 @@ class Analyzer:
         or `sample_prior()` (for `use_posterior=False`) has not been called
         prior to calling this method.
     """
-    self._check_revenue_data_exists(use_kpi)
+    use_kpi = self._use_kpi(use_kpi)
     self._check_kpi_transformation(inverse_transform_outcome, use_kpi)
     if self._meridian.is_national:
       _warn_if_geo_arg_in_kwargs(
@@ -1515,7 +1573,9 @@ class Analyzer:
     if new_data is None:
       new_data = DataTensors()
-    required_fields = constants.NON_REVENUE_DATA
+    required_fields = (
+        constants.PAID_DATA + constants.NON_PAID_DATA + (constants.CONTROLS,)
+    )
     filled_tensors = new_data.validate_and_fill_missing_data(
         required_tensors_names=required_fields,
         meridian=self._meridian,
@@ -1569,7 +1629,7 @@ class Analyzer:
     if inverse_transform_outcome:
       outcome_means = self._meridian.kpi_transformer.inverse(outcome_means)
       if not use_kpi:
-        outcome_means *= self._meridian.revenue_per_kpi
+        outcome_means *= filled_tensors.revenue_per_kpi
     return self.filter_and_aggregate_geos_and_times(
         outcome_means,
@@ -1698,7 +1758,7 @@ class Analyzer:
     Returns:
        Tensor of incremental outcome returned in terms of revenue or KPI.
     """
-    self._check_revenue_data_exists(use_kpi)
+    use_kpi = self._use_kpi(use_kpi)
     if revenue_per_kpi is None:
       revenue_per_kpi = self._meridian.revenue_per_kpi
     t1 = self._meridian.kpi_transformer.inverse(
@@ -1711,7 +1771,17 @@ class Analyzer:
       return kpi
     return backend.einsum("gt,...gtm->...gtm", revenue_per_kpi, kpi)
-  @backend.function(jit_compile=True)
+  @backend.function(
+      jit_compile=True,
+      static_argnames=[
+          "inverse_transform_outcome",
+          "use_kpi",
+          "selected_geos",
+          "selected_times",
+          "aggregate_geos",
+          "aggregate_times",
+      ],
+  )
   def _incremental_outcome_impl(
       self,
       data_tensors: DataTensors,
@@ -1781,7 +1851,7 @@ class Analyzer:
     Returns:
       Tensor containing the incremental outcome distribution.
     """
-    self._check_revenue_data_exists(use_kpi)
+    use_kpi = self._use_kpi(use_kpi)
     if (
         data_tensors.non_media_treatments is not None
         and non_media_treatments_baseline_normalized is None
@@ -1982,7 +2052,7 @@ class Analyzer:
         with matching time dimensions.
     """
     mmm = self._meridian
-    self._check_revenue_data_exists(use_kpi)
+    use_kpi = self._use_kpi(use_kpi)
     self._check_kpi_transformation(inverse_transform_outcome, use_kpi)
     if self._meridian.is_national:
       _warn_if_geo_arg_in_kwargs(
@@ -2123,8 +2193,12 @@ class Analyzer:
     )
     incremental_outcome_temps = [None] * len(batch_starting_indices)
     dim_kwargs = {
-        "selected_geos": selected_geos,
-        "selected_times": selected_times,
+        "selected_geos": (
+            tuple(selected_geos) if selected_geos is not None else None
+        ),
+        "selected_times": (
+            tuple(selected_times) if selected_times is not None else None
+        ),
         "aggregate_geos": aggregate_geos,
         "aggregate_times": aggregate_times,
     }
@@ -2299,7 +2373,7 @@ class Analyzer:
         "selected_times": selected_times,
         "aggregate_geos": aggregate_geos,
     }
-    self._check_revenue_data_exists(use_kpi)
+    use_kpi = self._use_kpi(use_kpi)
     self._validate_geo_and_time_granularity(**dim_kwargs)
     required_values = constants.PERFORMANCE_DATA
     if not new_data:
@@ -2408,6 +2482,7 @@ class Analyzer:
       (n_media_channels + n_rf_channels))`. The `n_geos` dimension is dropped if
       `aggregate_geos=True`.
     """
+    use_kpi = self._use_kpi(use_kpi)
     dim_kwargs = {
         "selected_geos": selected_geos,
         "selected_times": selected_times,
@@ -2421,7 +2496,6 @@ class Analyzer:
         "include_non_paid_channels": False,
         "aggregate_times": True,
     }
-    self._check_revenue_data_exists(use_kpi)
     self._validate_geo_and_time_granularity(**dim_kwargs)
     required_values = constants.PERFORMANCE_DATA
     if not new_data:
@@ -2609,6 +2683,7 @@ class Analyzer:
       self,
       aggregate_geos: bool = False,
       aggregate_times: bool = False,
+      use_kpi: bool = False,
       split_by_holdout_id: bool = False,
       non_media_baseline_values: Sequence[float] | None = None,
       confidence_level: float = constants.DEFAULT_CONFIDENCE_LEVEL,
@@ -2620,6 +2695,8 @@ class Analyzer:
         summed over all of the regions.
       aggregate_times: Boolean. If `True`, the expected, baseline, and actual
         are summed over all of the time periods.
+      use_kpi: If `True`, calculate the incremental KPI. Otherwise, calculate
+        the incremental revenue using the revenue per KPI (if available).
       split_by_holdout_id: Boolean. If `True` and `holdout_id` exists, the data
         is split into `'Train'`, `'Test'`, and `'All Data'` subsections.
       non_media_baseline_values: Optional list of shape
@@ -2636,8 +2713,8 @@ class Analyzer:
       A dataset with the expected, baseline, and actual outcome metrics.
     """
     _validate_non_media_baseline_values_numbers(non_media_baseline_values)
+    use_kpi = self._use_kpi(use_kpi)
     mmm = self._meridian
-    use_kpi = self._meridian.input_data.revenue_per_kpi is None
     can_split_by_holdout = self._can_split_by_holdout_id(split_by_holdout_id)
     expected_outcome = self.expected_outcome(
         aggregate_geos=False, aggregate_times=False, use_kpi=use_kpi
@@ -2805,7 +2882,7 @@ class Analyzer:
       self,
       use_posterior: bool,
       new_data: DataTensors | None = None,
-      use_kpi: bool | None = None,
+      use_kpi: bool = False,
       include_non_paid_channels: bool = True,
       non_media_baseline_values: Sequence[float] | None = None,
       **kwargs,
@@ -2852,7 +2929,7 @@ class Analyzer:
       the end containing the total incremental outcome of all channels.
     """
     _validate_non_media_baseline_values_numbers(non_media_baseline_values)
-    use_kpi = use_kpi or self._meridian.input_data.revenue_per_kpi is None
+    use_kpi = self._use_kpi(use_kpi)
     incremental_outcome_m = self.incremental_outcome(
         use_posterior=use_posterior,
         new_data=new_data,
@@ -2981,6 +3058,7 @@ class Analyzer:
       interpretation by time period.
     """
     _validate_non_media_baseline_values_numbers(non_media_baseline_values)
+    use_kpi = self._use_kpi(use_kpi)
     dim_kwargs = {
         "selected_geos": selected_geos,
         "selected_times": selected_times,
@@ -3123,16 +3201,19 @@ class Analyzer:
     ).where(lambda ds: ds.channel != constants.ALL_CHANNELS)
     if new_data.get_modified_times(self._meridian) is None:
+      expected_outcome_fields = list(
+          constants.PAID_DATA + constants.NON_PAID_DATA + (constants.CONTROLS,)
+      )
       expected_outcome_prior = self.expected_outcome(
           use_posterior=False,
-          new_data=new_data.filter_fields(constants.NON_REVENUE_DATA),
+          new_data=new_data.filter_fields(expected_outcome_fields),
           use_kpi=use_kpi,
           **dim_kwargs,
           **batched_kwargs,
       )
       expected_outcome_posterior = self.expected_outcome(
           use_posterior=True,
-          new_data=new_data.filter_fields(constants.NON_REVENUE_DATA),
+          new_data=new_data.filter_fields(expected_outcome_fields),
           use_kpi=use_kpi,
           **dim_kwargs,
           **batched_kwargs,
@@ -3376,6 +3457,7 @@ class Analyzer:
       aggregate_geos: bool = True,
       aggregate_times: bool = True,
       non_media_baseline_values: Sequence[float] | None = None,
+      use_kpi: bool = False,
       confidence_level: float = constants.DEFAULT_CONFIDENCE_LEVEL,
       batch_size: int = constants.DEFAULT_BATCH_SIZE,
   ) -> xr.Dataset:
@@ -3397,6 +3479,8 @@ class Analyzer:
         `model_spec.non_media_population_scaling_id` is `True`. If `None`, the
         `model_spec.non_media_baseline_values` is used, which defaults to the
         minimum value for each non_media treatment channel.
+      use_kpi: Boolean. If `True`, the baseline summary metrics are calculated
+        using KPI. If `False`, the metrics are calculated using revenue.
       confidence_level: Confidence level for media summary metrics credible
         intervals, represented as a value between zero and one.
       batch_size: Integer representing the maximum draws per chain in each
@@ -3412,7 +3496,7 @@ class Analyzer:
     _validate_non_media_baseline_values_numbers(non_media_baseline_values)
     # TODO: Change "pct_of_contribution" to a more accurate term.
-    use_kpi = self._meridian.input_data.revenue_per_kpi is None
+    use_kpi = self._use_kpi(use_kpi)
     dim_kwargs = {
         "selected_geos": selected_geos,
         "selected_times": selected_times,
@@ -3595,6 +3679,7 @@ class Analyzer:
       ValueError: If there are no channels with reach and frequency data.
     """
     dist_type = constants.POSTERIOR if use_posterior else constants.PRIOR
+    use_kpi = self._use_kpi(use_kpi)
     new_data = new_data or DataTensors()
     if self._meridian.n_rf_channels == 0:
       raise ValueError(
@@ -3673,9 +3758,11 @@ class Analyzer:
     )
     optimal_frequency = [freq_grid[i] for i in optimal_freq_idx]
-    optimal_frequency_tensor = backend.to_tensor(
-        backend.ones_like(filled_data.rf_impressions) * optimal_frequency,
-        backend.float32,
+    optimal_frequency_values = backend.to_tensor(
+        optimal_frequency, dtype=backend.float32
+    )
+    optimal_frequency_tensor = (
+        backend.ones_like(filled_data.rf_impressions) * optimal_frequency_values
     )
     optimal_reach = filled_data.rf_impressions / optimal_frequency_tensor
@@ -3760,10 +3847,7 @@ class Analyzer:
         attrs={
             constants.CONFIDENCE_LEVEL: confidence_level,
             constants.USE_POSTERIOR: use_posterior,
-            constants.IS_REVENUE_KPI: (
-                self._meridian.input_data.kpi_type == constants.REVENUE
-                or not use_kpi
-            ),
+            constants.IS_REVENUE_KPI: not use_kpi,
         },
     )
@@ -3771,6 +3855,7 @@ class Analyzer:
       self,
       selected_geos: Sequence[str] | None = None,
       selected_times: Sequence[str] | None = None,
+      use_kpi: bool = False,
       batch_size: int = constants.DEFAULT_BATCH_SIZE,
   ) -> xr.Dataset:
     """Calculates `R-Squared`, `MAPE`, and `wMAPE` goodness of fit metrics.
@@ -3801,6 +3886,8 @@ class Analyzer:
         default, all geos are included.
       selected_times: Optional list containing a subset of dates to include. By
         default, all time periods are included.
+      use_kpi: Whether to use KPI or revenue scale for the predictive accuracy
+        metrics.
       batch_size: Integer representing the maximum draws per chain in each
         batch. By default, `batch_size` is `100`. The calculation is run in
         batches to avoid memory exhaustion. If a memory error occurs, try
@@ -3814,7 +3901,7 @@ class Analyzer:
       is split into `'Train'`, `'Test'`, and `'All Data'` subsections, and the
       three metrics are computed for each.
     """
-    use_kpi = self._meridian.input_data.revenue_per_kpi is None
+    use_kpi = self._use_kpi(use_kpi)
     if self._meridian.is_national:
       _warn_if_geo_arg_in_kwargs(
           selected_geos=selected_geos,
@@ -3835,10 +3922,10 @@ class Analyzer:
         ],
         constants.GEO_GRANULARITY: [constants.GEO, constants.NATIONAL],
     }
-    if self._meridian.revenue_per_kpi is not None:
-      input_tensor = self._meridian.kpi * self._meridian.revenue_per_kpi
-    else:
+    if use_kpi:
       input_tensor = self._meridian.kpi
+    else:
+      input_tensor = self._meridian.kpi * self._meridian.revenue_per_kpi
     actual = np.asarray(
         self.filter_and_aggregate_geos_and_times(
             tensor=input_tensor,
@@ -3967,10 +4054,11 @@ class Analyzer:
           "sample_posterior() must be called prior to calling this method."
       )
-    def _transpose_first_two_dims(x: backend.Tensor) -> backend.Tensor:
-      n_dim = len(x.shape)
+    def _transpose_first_two_dims(x: Any) -> backend.Tensor:
+      x_tensor = backend.to_tensor(x)
+      n_dim = len(x_tensor.shape)
       perm = [1, 0] + list(range(2, n_dim))
-      return backend.transpose(x, perm)
+      return backend.transpose(x_tensor, perm)
     rhat = backend.mcmc.potential_scale_reduction({
         k: _transpose_first_two_dims(v)
@@ -4003,8 +4091,6 @@ class Analyzer:
     Returns:
       A DataFrame with the following columns:
-      *   `n_params`: The number of respective parameters in the model.
-      *   `avg_rhat`: The average R-hat value for the respective parameter.
       *   `n_params`: The number of respective parameters in the model.
       *   `avg_rhat`: The average R-hat value for the respective parameter.
       *   `max_rhat`: The maximum R-hat value for the respective parameter.
@@ -4056,6 +4142,7 @@ class Analyzer:
   def response_curves(
       self,
+      new_data: DataTensors | None = None,
       spend_multipliers: list[float] | None = None,
       use_posterior: bool = True,
       selected_geos: Sequence[str] | None = None,
@@ -4081,6 +4168,15 @@ class Analyzer:
     `selected_times` are also scaled by the multiplier.)
     Args:
+      new_data: Optional `DataTensors` object with optional new tensors:
+        `media`, `reach`, `frequency`, `media_spend`, `rf_spend`,
+        `revenue_per_kpi`, `time`. If provided, the response curves are
+        calculated using the values of the tensors passed in `new_data` and the
+        original values of all the remaining tensors. If `None`, the response
+        curves are calculated using the original values of all the tensors. If
+        any of the tensors in `new_data` is provided with a different number of
+        time periods than in `InputData`, then all tensors must be provided with
+        the same number of time periods and the `time` tensor must be provided.
       spend_multipliers: List of multipliers. Each channel's total spend is
         multiplied by these factors to obtain the values at which the curve is
         calculated for that channel.
@@ -4088,8 +4184,11 @@ class Analyzer:
         generated. If `False`, prior response curves are generated.
       selected_geos: Optional list containing a subset of geos to include. By
         default, all geos are included.
-      selected_times: Optional list containing a subset of dates to include. By
-        default, all time periods are included.
+      selected_times: Optional list containing a subset of dates to include. If
+        `new_data` is provided with modified time periods, then `selected_times`
+        must be a subset of `new_data.times`. Otherwise, `selected_times` must
+        be a subset of `self._meridian.input_data.time`. By default, all time
+        periods are included.
       by_reach: Boolean. For channels with reach and frequency. If `True`, plots
         the response curve by reach. If `False`, plots the response curve by
         frequency.
@@ -4118,11 +4217,49 @@ class Analyzer:
         "aggregate_geos": True,
         "aggregate_times": True,
     }
+    if new_data is None:
+      new_data = DataTensors()
+    # TODO: b/442920356 - Support flexible time without providing exact dates.
+    required_tensors_names = constants.PERFORMANCE_DATA + (constants.TIME,)
+    filled_data = new_data.validate_and_fill_missing_data(
+        required_tensors_names=required_tensors_names,
+        meridian=self._meridian,
+        allow_modified_times=True,
+    )
+    new_n_media_times = filled_data.get_modified_times(self._meridian)
+    if new_n_media_times is None:
+      _validate_selected_times(
+          selected_times=selected_times,
+          input_times=self._meridian.input_data.time,
+          n_times=self._meridian.n_times,
+          arg_name="selected_times",
+          comparison_arg_name="the input data",
+      )
+    else:
+      new_time = np.asarray(filled_data.time).astype(str).tolist()
+      _validate_flexible_selected_times(
+          selected_times=selected_times,
+          media_selected_times=None,
+          new_n_media_times=new_n_media_times,
+          new_time=new_time,
+      )
+      # TODO: b/407847021 - Switch to Sequence[str] once it is supported.
+      if selected_times is not None:
+        selected_times = [x in selected_times for x in new_time]
+        dim_kwargs["selected_times"] = selected_times
     if self._meridian.n_rf_channels > 0 and use_optimal_frequency:
-      frequency = backend.ones_like(
-          self._meridian.rf_tensors.frequency
-      ) * backend.to_tensor(
+      opt_freq_data = DataTensors(
+          media=filled_data.media,
+          rf_impressions=filled_data.reach * filled_data.frequency,
+          media_spend=filled_data.media_spend,
+          rf_spend=filled_data.rf_spend,
+          revenue_per_kpi=filled_data.revenue_per_kpi,
+      )
+      frequency = backend.ones_like(filled_data.frequency) * backend.to_tensor(
           self.optimal_freq(
+              new_data=opt_freq_data,
               selected_geos=selected_geos,
               selected_times=selected_times,
               use_kpi=use_kpi,
@@ -4130,12 +4267,12 @@ class Analyzer:
           dtype=backend.float32,
       )
       reach = backend.divide_no_nan(
-          self._meridian.rf_tensors.reach * self._meridian.rf_tensors.frequency,
+          filled_data.reach * filled_data.frequency,
           frequency,
       )
     else:
-      frequency = self._meridian.rf_tensors.frequency
-      reach = self._meridian.rf_tensors.reach
+      frequency = filled_data.frequency
+      reach = filled_data.reach
     if spend_multipliers is None:
       spend_multipliers = list(np.arange(0, 2.2, 0.2))
     incremental_outcome = np.zeros((
@@ -4149,18 +4286,19 @@ class Analyzer:
             (len(self._meridian.input_data.get_all_paid_channels()), 3)
         )  # Last dimension = 3 for the mean, ci_lo and ci_hi.
         continue
-      new_data = _scale_tensors_by_multiplier(
+      scaled_data = _scale_tensors_by_multiplier(
           data=DataTensors(
-              media=self._meridian.media_tensors.media,
+              media=filled_data.media,
               reach=reach,
               frequency=frequency,
+              revenue_per_kpi=filled_data.revenue_per_kpi,
           ),
           multiplier=multiplier,
           by_reach=by_reach,
       )
       inc_outcome_temp = self.incremental_outcome(
           use_posterior=use_posterior,
-          new_data=new_data.filter_fields(constants.PAID_DATA),
+          new_data=scaled_data.filter_fields(constants.PAID_DATA),
           inverse_transform_outcome=True,
           batch_size=batch_size,
           use_kpi=use_kpi,
@@ -4171,22 +4309,11 @@ class Analyzer:
           inc_outcome_temp, confidence_level
       )
-    if self._meridian.n_media_channels > 0 and self._meridian.n_rf_channels > 0:
-      spend = backend.concatenate(
-          [
-              self._meridian.media_tensors.media_spend,
-              self._meridian.rf_tensors.rf_spend,
-          ],
-          axis=-1,
-      )
-    elif self._meridian.n_media_channels > 0:
-      spend = self._meridian.media_tensors.media_spend
-    else:
-      spend = self._meridian.rf_tensors.rf_spend
-    if backend.rank(spend) == 3:
+    spend = filled_data.total_spend()
+    if spend is not None and spend.ndim == 3:
       spend = self.filter_and_aggregate_geos_and_times(
           tensor=spend,
+          flexible_time_dim=True,
           **dim_kwargs,
       )
     spend_einsum = backend.einsum("k,m->km", np.array(spend_multipliers), spend)
@@ -4880,11 +5007,12 @@ class Analyzer:
   def get_aggregated_spend(
       self,
       new_data: DataTensors | None = None,
+      selected_geos: Sequence[str] | None = None,
       selected_times: Sequence[str] | Sequence[bool] | None = None,
       include_media: bool = True,
       include_rf: bool = True,
   ) -> xr.DataArray:
-    """Gets the aggregated spend based on the selected time.
+    """Gets the aggregated spend based on the selected geos and time.
     Args:
       new_data: An optional `DataTensors` object containing the new `media`,
@@ -4895,6 +5023,9 @@ class Analyzer:
         of all the remaining tensors.  If any of the tensors in `new_data` is
         provided with a different number of time periods than in `InputData`,
         then all tensors must be provided with the same number of time periods.
+      selected_geos: Optional list containing a subset of geos to include. By
+        default, all geos are included. The selected geos should match those in
+        `InputData.geo`.
       selected_times: Optional list containing either a subset of dates to
         include or booleans with length equal to the number of time periods in
         KPI data. By default, all time periods are included.
@@ -4939,10 +5070,11 @@ class Analyzer:
       aggregated_media_spend = empty_da
     else:
       aggregated_media_spend = self._impute_and_aggregate_spend(
-          selected_times,
-          filled_data.media,
-          filled_data.media_spend,
-          list(self._meridian.input_data.media_channel.values),
+          selected_geos=selected_geos,
+          selected_times=selected_times,
+          media_execution_values=filled_data.media,
+          channel_spend=filled_data.media_spend,
+          channel_names=list(self._meridian.input_data.media_channel.values),
       )
     if not include_rf:
@@ -4961,10 +5093,11 @@ class Analyzer:
     else:
       rf_execution_values = filled_data.reach * filled_data.frequency
       aggregated_rf_spend = self._impute_and_aggregate_spend(
-          selected_times,
-          rf_execution_values,
-          filled_data.rf_spend,
-          list(self._meridian.input_data.rf_channel.values),
+          selected_geos=selected_geos,
+          selected_times=selected_times,
+          media_execution_values=rf_execution_values,
+          channel_spend=filled_data.rf_spend,
+          channel_names=list(self._meridian.input_data.rf_channel.values),
       )
     return xr.concat(
@@ -4973,21 +5106,26 @@ class Analyzer:
   def _impute_and_aggregate_spend(
       self,
+      selected_geos: Sequence[str] | None,
       selected_times: Sequence[str] | Sequence[bool] | None,
       media_execution_values: backend.Tensor,
       channel_spend: backend.Tensor,
       channel_names: Sequence[str],
   ) -> xr.DataArray:
-    """Imputes and aggregates the spend over the selected time period.
+    """Imputes and aggregates the spend within selected dimensions.
-    This function is used to aggregate the spend over the selected time period.
-    Imputation is required when `channel_spend` has only one dimension and the
-    aggregation is applied to only a subset of times, as specified by
-    `selected_times`. The `media_execution_values` argument only serves the
-    purpose of imputation. Although `media_execution_values` is a required
-    argument, its values only affect the output when imputation is required.
+    This function is used to aggregate the spend within selected geos over the
+    selected time period. Imputation is required when `channel_spend` has only
+    one dimension and the aggregation is applied to only a subset of geos or
+    times, as specified by `selected_geos` and `selected_times`. The
+    `media_execution_values` argument only serves the purpose of imputation.
+    Although `media_execution_values` is a required argument, its values only
+    affect the output when imputation is required.
     Args:
+      selected_geos: Optional list containing a subset of geos to include. By
+        default, all geos are included. The selected geos should match those in
+        `InputData.geo`.
       selected_times: Optional list containing either a subset of dates to
         include or booleans with length equal to the number of time periods in
         KPI data. By default, all time periods are included.
@@ -5002,7 +5140,7 @@ class Analyzer:
       variable `spend`.
     """
     dim_kwargs = {
-        "selected_geos": None,
+        "selected_geos": selected_geos,
         "selected_times": selected_times,
         "aggregate_geos": True,
         "aggregate_times": True,

google-meridian 1.2.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

google-meridian 1.2.0__py3-none-any.whl → 1.3.0__py3-none-any.whl