PyPI - google-meridian - Versions diffs - 1.0.9__py3-none-any.whl → 1.1.1__py3-none-any.whl - Mend

google-meridian 1.0.9__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

{google_meridian-1.0.9.dist-info → google_meridian-1.1.1.dist-info}/METADATA +2 -2
google_meridian-1.1.1.dist-info/RECORD +41 -0
{google_meridian-1.0.9.dist-info → google_meridian-1.1.1.dist-info}/WHEEL +1 -1
meridian/__init__.py +2 -2
meridian/analysis/__init__.py +1 -1
meridian/analysis/analyzer.py +213 -206
meridian/analysis/formatter.py +1 -1
meridian/analysis/optimizer.py +264 -66
meridian/analysis/summarizer.py +5 -5
meridian/analysis/summary_text.py +1 -1
meridian/analysis/test_utils.py +82 -82
meridian/analysis/visualizer.py +14 -19
meridian/constants.py +103 -19
meridian/data/__init__.py +1 -1
meridian/data/arg_builder.py +1 -1
meridian/data/input_data.py +127 -27
meridian/data/load.py +53 -40
meridian/data/test_utils.py +172 -44
meridian/data/time_coordinates.py +4 -4
meridian/model/__init__.py +1 -1
meridian/model/adstock_hill.py +1 -1
meridian/model/knots.py +1 -1
meridian/model/media.py +134 -99
meridian/model/model.py +494 -84
meridian/model/model_test_data.py +86 -1
meridian/model/posterior_sampler.py +139 -58
meridian/model/prior_distribution.py +97 -52
meridian/model/prior_sampler.py +209 -233
meridian/model/spec.py +197 -37
meridian/model/transformers.py +16 -4
google_meridian-1.0.9.dist-info/RECORD +0 -41
{google_meridian-1.0.9.dist-info → google_meridian-1.1.1.dist-info}/licenses/LICENSE +0 -0
{google_meridian-1.0.9.dist-info → google_meridian-1.1.1.dist-info}/top_level.txt +0 -0

meridian/data/input_data.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright 2024 The Meridian Authors.
+# Copyright 2025 The Meridian Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -60,7 +60,7 @@ def _check_dim_collection(
     )
-def _check_dim_match(dim: str, arrays: Sequence[xr.DataArray]):
+def _check_dim_match(dim: str, arrays: Sequence[xr.DataArray | None]):
   """Verifies that the dimensions of the appropriate arrays match."""
   lengths = [len(array.coords[dim]) for array in arrays if array is not None]
   names = [array.name for array in arrays if array is not None]
@@ -83,6 +83,31 @@ def _check_coords_match(dim: str, arrays: Sequence[xr.DataArray]):
       )
+def _aggregate_spend(
+    spend: xr.DataArray, calibration_period: np.ndarray | None
+) -> np.ndarray | None:
+  """Aggregates spend for each channel over the calibration period.
+  Args:
+    spend: An array with shape `(n_geos, n_times, n_channels)` to aggregate.
+    calibration_period: An optional boolean array of shape `(n_media_times,
+      n_channels)`. If provided, spend is filtered according to this period.
+  Returns:
+    A 1-D array of aggregated media spend per channel, or `None` if `spend` is
+    `None`.
+  """
+  if spend is None:
+    return None
+  if calibration_period is None:
+    return np.sum(spend, axis=(0, 1))
+  # Select the last `n_times` from the `calibration_period`
+  factors = np.where(calibration_period[-spend.shape[1] :, :], 1, 0)
+  return np.einsum("gtm,tm->m", spend, factors)
 @dataclasses.dataclass
 class InputData:
   """A data container for advertising data in a format supported by Meridian.
@@ -96,11 +121,11 @@ class InputData:
       `revenue_per_kpi` exists, ROI calibration is used and the analysis is run
       on revenue. When the `revenue_per_kpi` doesn't exist for the same
       `kpi_type`, custom ROI calibration is used and the analysis is run on KPI.
-    controls: A DataArray of dimensions `(n_geos, n_times, n_controls)`
-      containing control variable values.
     population: A DataArray of dimensions `(n_geos,)` containing the population
       of each group. This variable is used to scale the KPI and media for
       modeling.
+    controls: An optional DataArray of dimensions `(n_geos, n_times,
+      n_controls)` containing control variable values.
     revenue_per_kpi: An optional DataArray of dimensions `(n_geos, n_times)`
       containing the average revenue amount per KPI unit. Although modeling is
       done on `kpi`, model analysis and optimization are done on `KPI *
@@ -120,8 +145,14 @@ class InputData:
       in the same order. If either of these arguments is passed, then the other
       is not optional.
     media_spend: An optional `DataArray` containing the cost of each media
-      channel. This is used as the denominator for ROI calculations. The
-      DataArray shape can be `(n_geos, n_times, n_media_channels)` or
+      channel. This is used as the denominator for ROI calculations. It is also
+      used to calculate an assumed cost per media unit for post-modeling
+      analysis such as response curves and budget optimization. Only the
+      aggregate spend (across geos and time periods) is required for these
+      calculations. However, a spend breakdown by geo and time period is
+      required if `roi_calibration_period` is specified or if conducting
+      post-modeling analysis on a specific subset of geos and/or time periods.
+      The DataArray shape can be `(n_geos, n_times, n_media_channels)` or
       `(n_media_channels,)` if the data is aggregated over `geo` and `time`
       dimensions. We recommend that the spend total aligns with the time window
       of the `kpi` and `controls` data, which is the time window over which
@@ -131,7 +162,9 @@ class InputData:
       time window of media executed during the time window. `media` and
       `media_spend` must contain the same number of media channels in the same
       order. If either of these arguments is passed, then the other is not
-      optional.
+      optional. If a tensor of shape `(n_media_channels,)` is passed as
+      `media_spend`, then it will be automatically allocated across geos and
+      times proportionally to `media`.
     reach: An optional `DataArray` of dimensions `(n_geos, n_media_times,
       n_rf_channels)` containing non-negative `reach` values. It is required
       that `n_media_times` ≥ `n_times`, and the final `n_times` time periods
@@ -164,18 +197,26 @@ class InputData:
       others are not optional.
     rf_spend: An optional `DataArray` containing the cost of each reach and
       frequency channel. This is used as the denominator for ROI calculations.
-      The DataArray shape can be `(n_rf_channels,)`, `(n_geos, n_times,
-      n_rf_channels)`, or `(n_geos, n_rf_channels)`. The spend should be
-      aggregated over geo and/or time dimensions that are not represented. We
-      recommend that the spend total aligns with the time window of the `kpi`
-      and `controls` data, which is the time window over which incremental
-      outcome of the ROI numerator is calculated. However, note that incremental
-      outcome is influenced by media execution prior to this time window,
-      through lagged effects, and excludes lagged effects beyond the time window
-      of media executed during the time window. If only `media` data is used,
-      `rf_spend` will be `None`. `reach`, `frequency`, and `rf_spend` must
-      contain the same number of media channels in the same order. If any of
-      these arguments is passed, then the others are not optional.
+      It is also used to calculate an assumed cost per media unit for
+      post-modeling analysis such as response curves and budget optimization.
+      Only the aggregate spend (across geos and time periods) is required for
+      these calculations. However, a spend breakdown by geo and time period is
+      required if `rf_roi_calibration_period` is specified or if conducting
+      post-modeling analysis on a specific subset of geos and/or time periods.
+      The DataArray shape can be `(n_rf_channels,)` or `(n_geos, n_times,
+      n_rf_channels)`. The spend should be aggregated over geo and/or time
+      dimensions that are not represented. We recommend that the spend total
+      aligns with the time window of the `kpi` and `controls` data, which is the
+      time window over which incremental outcome of the ROI numerator is
+      calculated. However, note that incremental outcome is influenced by media
+      execution prior to this time window, through lagged effects, and excludes
+      lagged effects beyond the time window of media executed during the time
+      window. If only `media` data is used, `rf_spend` will be `None`. `reach`,
+      `frequency`, and `rf_spend` must contain the same number of media channels
+      in the same order. If any of these arguments is passed, then the others
+      are not optional. If a tensor of shape `(n_rf_channels,)` is passed as
+      `rf_spend`, then it will be automatically allocated across geos and times
+      proportionally to `(reach * frequency)`.
     organic_media: An optional `DataArray` of dimensions `(n_geos,
       n_media_times, n_organic_media_channels)` containing non-negative organic
       media values. Organic media variables are media activities that have no
@@ -234,8 +275,8 @@ class InputData:
   kpi: xr.DataArray
   kpi_type: str
-  controls: xr.DataArray
   population: xr.DataArray
+  controls: xr.DataArray | None = None
   revenue_per_kpi: xr.DataArray | None = None
   media: xr.DataArray | None = None
   media_spend: xr.DataArray | None = None
@@ -265,6 +306,40 @@ class InputData:
       if isinstance(array, xr.DataArray) and constants.GEO in array.dims:
         array.coords[constants.GEO] = array.coords[constants.GEO].astype(str)
+  # TODO: b/416775065 - Combine with Analyzer._impute_and_aggregate_spend
+  @functools.cached_property
+  def allocated_media_spend(self) -> xr.DataArray | None:
+    """Returns the allocated media spend for each geo and time."""
+    if self.media_spend is not None and len(self.media_spend.shape) == 1:
+      return self._allocate_spend(self.media_spend, self.media)
+    else:
+      return self.media_spend
+  @property
+  def allocated_rf_spend(self) -> xr.DataArray | None:
+    """Returns the allocated RF spend for each geo and time."""
+    if self.rf_spend is not None and len(self.rf_spend.shape) == 1:
+      return self._allocate_spend(self.rf_spend, self.reach * self.frequency)
+    else:
+      return self.rf_spend
+  def aggregate_media_spend(
+      self, calibration_period: np.ndarray | None = None
+  ) -> np.ndarray | None:
+    """Aggregates media spend by channel over the calibration period."""
+    return _aggregate_spend(
+        spend=self.allocated_media_spend, calibration_period=calibration_period
+    )
+  def aggregate_rf_spend(
+      self, calibration_period: np.ndarray | None = None
+  ) -> np.ndarray | None:
+    """Aggregates RF spend by channel over the calibration period."""
+    return _aggregate_spend(
+        spend=self.allocated_rf_spend,
+        calibration_period=calibration_period,
+    )
   @property
   def geo(self) -> xr.DataArray:
     """Returns the geo dimension."""
@@ -334,9 +409,12 @@ class InputData:
       return None
   @property
-  def control_variable(self) -> xr.DataArray:
+  def control_variable(self) -> xr.DataArray | None:
     """Returns the control variable dimension."""
-    return self.controls[constants.CONTROL_VARIABLE]
+    if self.controls is not None:
+      return self.controls[constants.CONTROL_VARIABLE]
+    else:
+      return None
   @property
   def media_spend_has_geo_dimension(self) -> bool:
@@ -424,10 +502,11 @@ class InputData:
   def _validate_names(self):
     """Verifies that the names of the data arrays are correct."""
-    arrays = [
+    # Must match the order of constants.POSSIBLE_INPUT_DATA_ARRAY_NAMES!
+    arrays = (
         self.kpi,
-        self.controls,
         self.population,
+        self.controls,
         self.revenue_per_kpi,
         self.organic_media,
         self.organic_reach,
@@ -438,7 +517,7 @@ class InputData:
         self.reach,
         self.frequency,
         self.rf_spend,
-    ]
+    )
     for array, name in zip(arrays, constants.POSSIBLE_INPUT_DATA_ARRAY_NAMES):
       if array is not None and array.name != name:
@@ -479,7 +558,6 @@ class InputData:
         [
             [constants.RF_CHANNEL],
             [constants.GEO, constants.TIME, constants.RF_CHANNEL],
-            [constants.GEO, constants.RF_CHANNEL],
         ],
     )
     _check_dim_collection(
@@ -711,9 +789,10 @@ class InputData:
     """Returns data as a single `xarray.Dataset` object."""
     data = [
         self.kpi,
-        self.controls,
         self.population,
     ]
+    if self.controls is not None:
+      data.append(self.controls)
     if self.revenue_per_kpi is not None:
       data.append(self.revenue_per_kpi)
     if self.media is not None:
@@ -848,3 +927,24 @@ class InputData:
       return self.media_spend.values
     else:
       raise ValueError("Both RF and Media are missing.")
+  def get_total_outcome(self) -> np.ndarray:
+    """Returns total outcome, aggregated over geos and times."""
+    if self.revenue_per_kpi is None:
+      return np.sum(self.kpi.values)
+    return np.sum(self.kpi.values * self.revenue_per_kpi.values)
+  def _allocate_spend(self, spend: xr.DataArray, media_units: xr.DataArray):
+    """Allocates spend across geo and time proportionally to media units."""
+    n_times = len(self.kpi.coords[constants.TIME])
+    selected_media_units = media_units.isel(media_time=slice(-n_times, None))
+    total_media_units_per_channel = selected_media_units.sum(
+        dim=["geo", "media_time"]
+    )
+    proportions = selected_media_units / total_media_units_per_channel
+    expanded_spend = spend.expand_dims({
+        "geo": selected_media_units["geo"],
+        "media_time": selected_media_units["media_time"],
+    })
+    allocated_spend = expanded_spend * proportions
+    return allocated_spend.rename({"media_time": "time"})

meridian/data/load.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright 2024 The Meridian Authors.
+# Copyright 2025 The Meridian Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -79,7 +79,7 @@ class XrDatasetDataLoader(InputDataLoader):
     """Constructor.
     The coordinates of the input dataset should be: `time`, `media_time`,
-    `control_variable`, `geo` (optional for a national model),
+    `control_variable` (optional), `geo` (optional for a national model),
     `non_media_channel` (optional), `organic_media_channel` (optional),
     `organic_rf_channel` (optional), and
     either `media_channel`, `rf_channel`, or both.
@@ -93,7 +93,7 @@ class XrDatasetDataLoader(InputDataLoader):
     *   `kpi`: `(geo, time)`
     *   `revenue_per_kpi`: `(geo, time)`
-    *   `controls`: `(geo, time, control_variable)`
+    *   `controls`: `(geo, time, control_variable)` - optional
     *   `population`: `(geo)`
     *   `media`: `(geo, media_time, media_channel)` - optional
     *   `media_spend`: `(geo, time, media_channel)`, `(1, time, media_channel)`,
@@ -113,7 +113,7 @@ class XrDatasetDataLoader(InputDataLoader):
     *   `kpi`: `([1,] time)`
     *   `revenue_per_kpi`: `([1,] time)`
-    *   `controls`: `([1,] time, control_variable)`
+    *   `controls`: `([1,] time, control_variable)` - optional
     *   `population`: `([1],)` - this array is optional for national data
     *   `media`: `([1,] media_time, media_channel)` - optional
     *   `media_spend`: `([1,] time, media_channel)` or
@@ -198,7 +198,7 @@ class XrDatasetDataLoader(InputDataLoader):
       self.dataset = dataset.rename(name_mapping)
     # Add a `geo` dimension if it is not already present.
-    if (constants.GEO) not in self.dataset.dims.keys():
+    if (constants.GEO) not in self.dataset.sizes.keys():
       self.dataset = self.dataset.expand_dims(dim=[constants.GEO], axis=0)
     if len(self.dataset.coords[constants.GEO]) == 1:
@@ -228,7 +228,7 @@ class XrDatasetDataLoader(InputDataLoader):
           compat='override',
       )
-    if constants.MEDIA_TIME not in self.dataset.dims.keys():
+    if constants.MEDIA_TIME not in self.dataset.sizes.keys():
       self._add_media_time()
     self._normalize_time_coordinates(constants.TIME)
     self._normalize_time_coordinates(constants.MEDIA_TIME)
@@ -349,14 +349,17 @@ class XrDatasetDataLoader(InputDataLoader):
     # Arrays in which NAs are expected in the lagged-media period.
     na_arrays = [
         constants.KPI,
-        constants.CONTROLS,
     ]
-    na_mask = self.dataset[constants.KPI].isnull().any(
-        dim=constants.GEO
-    ) | self.dataset[constants.CONTROLS].isnull().any(
-        dim=[constants.GEO, constants.CONTROL_VARIABLE]
-    )
+    na_mask = self.dataset[constants.KPI].isnull().any(dim=constants.GEO)
+    if constants.CONTROLS in self.dataset.data_vars.keys():
+      na_arrays.append(constants.CONTROLS)
+      na_mask |= (
+          self.dataset[constants.CONTROLS]
+          .isnull()
+          .any(dim=[constants.GEO, constants.CONTROL_VARIABLE])
+      )
     if constants.NON_MEDIA_TREATMENTS in self.dataset.data_vars.keys():
       na_arrays.append(constants.NON_MEDIA_TREATMENTS)
@@ -427,11 +430,12 @@ class XrDatasetDataLoader(InputDataLoader):
         .dropna(dim=constants.TIME)
         .rename({constants.TIME: new_time})
     )
-    new_dataset[constants.CONTROLS] = (
-        new_dataset[constants.CONTROLS]
-        .dropna(dim=constants.TIME)
-        .rename({constants.TIME: new_time})
-    )
+    if constants.CONTROLS in new_dataset.data_vars.keys():
+      new_dataset[constants.CONTROLS] = (
+          new_dataset[constants.CONTROLS]
+          .dropna(dim=constants.TIME)
+          .rename({constants.TIME: new_time})
+      )
     if constants.NON_MEDIA_TREATMENTS in new_dataset.data_vars.keys():
       new_dataset[constants.NON_MEDIA_TREATMENTS] = (
           new_dataset[constants.NON_MEDIA_TREATMENTS]
@@ -466,6 +470,11 @@ class XrDatasetDataLoader(InputDataLoader):
   def load(self) -> input_data.InputData:
     """Returns an `InputData` object containing the data from the dataset."""
+    controls = (
+        self.dataset.controls
+        if constants.CONTROLS in self.dataset.data_vars.keys()
+        else None
+    )
     revenue_per_kpi = (
         self.dataset.revenue_per_kpi
         if constants.REVENUE_PER_KPI in self.dataset.data_vars.keys()
@@ -519,9 +528,9 @@ class XrDatasetDataLoader(InputDataLoader):
     return input_data.InputData(
         kpi=self.dataset.kpi,
         kpi_type=self.kpi_type,
-        revenue_per_kpi=revenue_per_kpi,
-        controls=self.dataset.controls,
         population=self.dataset.population,
+        controls=controls,
+        revenue_per_kpi=revenue_per_kpi,
         media=media,
         media_spend=media_spend,
         reach=reach,
@@ -539,14 +548,14 @@ class CoordToColumns:
   """A mapping between the desired and actual column names in the input data.
   Attributes:
-    controls: List of column names containing `controls` values in the input
-      data.
     time: Name of column containing `time` values in the input data.
-    kpi: Name of column containing `kpi` values in the input data.
-    revenue_per_kpi: Name of column containing `revenue_per_kpi` values in the
-      input data.
     geo:  Name of column containing `geo` values in the input data. This field
       is optional for a national model.
+    kpi: Name of column containing `kpi` values in the input data.
+    controls: List of column names containing `controls` values in the input
+      data. Optional.
+    revenue_per_kpi: Name of column containing `revenue_per_kpi` values in the
+      input data. Optional. Will be overridden if model KPI type is "revenue".
     population: Name of column containing `population` values in the input data.
       This field is optional for a national model.
     media: List of column names containing `media` values in the input data.
@@ -567,11 +576,11 @@ class CoordToColumns:
       values in the input data.
   """
-  controls: Sequence[str]
   time: str = constants.TIME
+  geo: str = constants.GEO
   kpi: str = constants.KPI
+  controls: Sequence[str] | None = None
   revenue_per_kpi: str | None = None
-  geo: str = constants.GEO
   population: str = constants.POPULATION
   # Media data
   media: Sequence[str] | None = None
@@ -607,7 +616,7 @@ class DataFrameDataLoader(InputDataLoader):
   to the DataFrame column names if they are different. The fields are:
   *   `geo`, `time`, `kpi`, `revenue_per_kpi`, `population` (single column)
-  *   `controls` (multiple columns)
+  *   `controls` (multiple columns, optional)
   *   (1) `media`, `media_spend` (multiple columns)
   *   (2) `reach`, `frequency`, `rf_spend` (multiple columns)
   *   `non_media_treatments` (multiple columns, optional)
@@ -953,9 +962,10 @@ class DataFrameDataLoader(InputDataLoader):
     not_lagged_columns = []
     coords = [
         constants.KPI,
-        constants.CONTROLS,
         constants.POPULATION,
     ]
+    if self.coord_to_columns.controls is not None:
+      coords.append(constants.CONTROLS)
     if self.coord_to_columns.revenue_per_kpi is not None:
       coords.append(constants.REVENUE_PER_KPI)
     if self.coord_to_columns.media_spend is not None:
@@ -1042,17 +1052,20 @@ class DataFrameDataLoader(InputDataLoader):
         .to_frame()
         .to_xarray()
     )
-    controls_xr = (
-        df_indexed[self.coord_to_columns.controls]
-        .stack()
-        .rename(constants.CONTROLS)
-        .rename_axis(
-            [constants.GEO, constants.TIME, constants.CONTROL_VARIABLE]
-        )
-        .to_frame()
-        .to_xarray()
-    )
-    dataset = xr.combine_by_coords([kpi_xr, population_xr, controls_xr])
+    dataset = xr.combine_by_coords([kpi_xr, population_xr])
+    if self.coord_to_columns.controls is not None:
+      controls_xr = (
+          df_indexed[self.coord_to_columns.controls]
+          .stack()
+          .rename(constants.CONTROLS)
+          .rename_axis(
+              [constants.GEO, constants.TIME, constants.CONTROL_VARIABLE]
+          )
+          .to_frame()
+          .to_xarray()
+      )
+      dataset = xr.combine_by_coords([dataset, controls_xr])
     if self.coord_to_columns.non_media_treatments is not None:
       non_media_xr = (
@@ -1224,7 +1237,7 @@ class CsvDataLoader(InputDataLoader):
   CSV column names, if they are different. The fields are:
   *   `geo`, `time`, `kpi`, `revenue_per_kpi`, `population` (single column)
-  *   `controls` (multiple columns)
+  *   `controls` (multiple columns, optional)
   *   (1) `media`, `media_spend` (multiple columns)
   *   (2) `reach`, `frequency`, `rf_spend` (multiple columns)
   *   `non_media_treatments` (multiple columns, optional)

google-meridian 1.0.9__py3-none-any.whl → 1.1.1__py3-none-any.whl

google-meridian 1.0.9__py3-none-any.whl → 1.1.1__py3-none-any.whl