google-meridian 1.1.3__py3-none-any.whl → 1.1.5__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
@@ -30,25 +30,112 @@ __all__ = [
30
30
  class DataFrameInputDataBuilder(input_data_builder.InputDataBuilder):
31
31
  """Builds `InputData` from DataFrames."""
32
32
 
33
+ def __init__(
34
+ self,
35
+ kpi_type: str,
36
+ default_geo_column: str = constants.GEO,
37
+ default_time_column: str = constants.TIME,
38
+ default_media_time_column: str = constants.TIME,
39
+ default_population_column: str = constants.POPULATION,
40
+ default_kpi_column: str = constants.KPI,
41
+ default_revenue_per_kpi_column: str = constants.REVENUE_PER_KPI,
42
+ ):
43
+ super().__init__(kpi_type)
44
+
45
+ self._default_geo_column = default_geo_column
46
+ self._default_time_column = default_time_column
47
+ self._default_media_time_column = default_media_time_column
48
+ self._default_population_column = default_population_column
49
+ self._default_kpi_column = default_kpi_column
50
+ self._default_revenue_per_kpi_column = default_revenue_per_kpi_column
51
+
52
+ @property
53
+ def default_geo_column(self) -> str:
54
+ """The default geo column name for this builder to use.
55
+
56
+ This column name is used when `geo_col` is not explicitly provided to a data
57
+ setter method.
58
+
59
+ By default, this is `"geo"`.
60
+ """
61
+ return self._default_geo_column
62
+
63
+ @property
64
+ def default_time_column(self) -> str:
65
+ """The default time column name for this builder to use.
66
+
67
+ This column name is used when `time_col` is not explicitly provided to a
68
+ data setter method.
69
+
70
+ By default, this is `"time"`.
71
+ """
72
+ return self._default_time_column
73
+
74
+ @property
75
+ def default_media_time_column(self) -> str:
76
+ """The default *media* time column name for this builder to use.
77
+
78
+ This column name is used when `media_time_col` is not explicitly provided to
79
+ a data setter method.
80
+
81
+ By default, this is also `"time"`, since most input dataframes are likely
82
+ to use the same time column for both their media execution and media spend
83
+ data.
84
+ """
85
+ return self._default_media_time_column
86
+
87
+ @property
88
+ def default_population_column(self) -> str:
89
+ """The default population column name for this builder to use.
90
+
91
+ This column name is used when `population_col` is not explicitly provided to
92
+ a data setter method.
93
+
94
+ By default, this is `"population"`.
95
+ """
96
+ return self._default_population_column
97
+
98
+ @property
99
+ def default_kpi_column(self) -> str:
100
+ """The default kpi column name for this builder to use.
101
+
102
+ This column name is used when `kpi_col` is not explicitly provided to a data
103
+ setter method.
104
+
105
+ By default, this is `"kpi"`.
106
+ """
107
+ return self._default_kpi_column
108
+
109
+ @property
110
+ def default_revenue_per_kpi_column(self) -> str:
111
+ """The default revenue per kpi column name for this builder to use.
112
+
113
+ This column name is used when `revenue_per_kpi_col` is not explicitly
114
+ provided to a data setter method.
115
+
116
+ By default, this is `"revenue_per_kpi"`.
117
+ """
118
+ return self._default_revenue_per_kpi_column
119
+
33
120
  def with_kpi(
34
121
  self,
35
122
  df: pd.DataFrame,
36
- kpi_col: str = constants.KPI,
37
- time_col: str = constants.TIME,
38
- geo_col: str = constants.GEO,
123
+ kpi_col: str | None = None,
124
+ time_col: str | None = None,
125
+ geo_col: str | None = None,
39
126
  ) -> 'DataFrameInputDataBuilder':
40
127
  """Reads KPI data from a DataFrame.
41
128
 
42
129
  Args:
43
130
  df: The DataFrame to read the KPI data from.
44
131
  kpi_col: The name of the column containing the KPI values. If not
45
- provided, the default name is `kpi`.
132
+ provided, `self.default_kpi_column` is used.
46
133
  time_col: The name of the column containing the time coordinates. If not
47
- provided, the default name is `time`.
134
+ provided, `self.default_time_column` is used.
48
135
  geo_col: (Optional) The name of the column containing the geo coordinates.
49
- If not provided, the default name is `geo`. If the DataFrame provided
50
- has no geo column, a national model data is assumed and a geo dimension
51
- will be created internally with a single coordinate value
136
+ If not provided, `self.default_geo_column` is used. If the DataFrame
137
+ provided has no geo column, a national model data is assumed and a geo
138
+ dimension will be created internally with a single coordinate value
52
139
  `national_geo`.
53
140
 
54
141
  Returns:
@@ -56,6 +143,10 @@ class DataFrameInputDataBuilder(input_data_builder.InputDataBuilder):
56
143
  """
57
144
  kpi_df = df.copy()
58
145
 
146
+ kpi_col = kpi_col or self.default_kpi_column
147
+ time_col = time_col or self.default_time_column
148
+ geo_col = geo_col or self.default_geo_column
149
+
59
150
  ### Validate ###
60
151
  self._validate_cols(kpi_df, [kpi_col, time_col], [geo_col])
61
152
  self._validate_coords(kpi_df, geo_col, time_col)
@@ -73,8 +164,8 @@ class DataFrameInputDataBuilder(input_data_builder.InputDataBuilder):
73
164
  self,
74
165
  df: pd.DataFrame,
75
166
  control_cols: list[str],
76
- time_col: str = constants.TIME,
77
- geo_col: str = constants.GEO,
167
+ time_col: str | None = None,
168
+ geo_col: str | None = None,
78
169
  ) -> 'DataFrameInputDataBuilder':
79
170
  """Reads controls data from a DataFrame.
80
171
 
@@ -82,18 +173,25 @@ class DataFrameInputDataBuilder(input_data_builder.InputDataBuilder):
82
173
  df: The DataFrame to read the controls data from.
83
174
  control_cols: The names of the columns containing the controls values.
84
175
  time_col: The name of the column containing the time coordinates. If not
85
- provided, the default name is `time`.
176
+ provided, `self.default_time_column` is used.
86
177
  geo_col: (Optional) The name of the column containing the geo coordinates.
87
- If not provided, the default name is `geo`. If the DataFrame provided
88
- has no geo column, a national model data is assumed and a geo dimension
89
- will be created internally with a single coordinate value
178
+ If not provided, `self.default_geo_column` is used. If the DataFrame
179
+ provided has no geo column, a national model data is assumed and a geo
180
+ dimension will be created internally with a single coordinate value
90
181
  `national_geo`.
91
182
 
92
183
  Returns:
93
184
  The `DataFrameInputDataBuilder` with the added controls data.
94
185
  """
186
+ if not control_cols:
187
+ warnings.warn('No control columns provided. Not adding controls data.')
188
+ return self
189
+
95
190
  controls_df = df.copy()
96
191
 
192
+ time_col = time_col or self.default_time_column
193
+ geo_col = geo_col or self.default_geo_column
194
+
97
195
  ### Validate ###
98
196
  self._validate_cols(
99
197
  controls_df,
@@ -116,19 +214,19 @@ class DataFrameInputDataBuilder(input_data_builder.InputDataBuilder):
116
214
  def with_population(
117
215
  self,
118
216
  df: pd.DataFrame,
119
- population_col: str = constants.POPULATION,
120
- geo_col: str = constants.GEO,
217
+ population_col: str | None = None,
218
+ geo_col: str | None = None,
121
219
  ) -> 'DataFrameInputDataBuilder':
122
220
  """Reads population data from a DataFrame.
123
221
 
124
222
  Args:
125
223
  df: The DataFrame to read the population data from.
126
224
  population_col: The name of the column containing the population values.
127
- If not provided, the default name is `population`.
225
+ If not provided, `self.default_population_column` is used.
128
226
  geo_col: (Optional) The name of the column containing the geo coordinates.
129
- If not provided, the default name is `geo`. If the DataFrame provided
130
- has no geo column, a national model data is assumed and a geo dimension
131
- will be created internally with a single coordinate value
227
+ If not provided, `self.default_geo_column` is used. If the DataFrame
228
+ provided has no geo column, a national model data is assumed and a geo
229
+ dimension will be created internally with a single coordinate value
132
230
  `national_geo`.
133
231
 
134
232
  Returns:
@@ -136,6 +234,9 @@ class DataFrameInputDataBuilder(input_data_builder.InputDataBuilder):
136
234
  """
137
235
  population_df = df.copy()
138
236
 
237
+ population_col = population_col or self.default_population_column
238
+ geo_col = geo_col or self.default_geo_column
239
+
139
240
  ### Validate ###
140
241
  self._validate_cols(population_df, [population_col], [geo_col])
141
242
  self._validate_coords(population_df, geo_col)
@@ -157,22 +258,22 @@ class DataFrameInputDataBuilder(input_data_builder.InputDataBuilder):
157
258
  def with_revenue_per_kpi(
158
259
  self,
159
260
  df: pd.DataFrame,
160
- revenue_per_kpi_col: str = constants.REVENUE_PER_KPI,
161
- time_col: str = constants.TIME,
162
- geo_col: str = constants.GEO,
261
+ revenue_per_kpi_col: str | None = None,
262
+ time_col: str | None = None,
263
+ geo_col: str | None = None,
163
264
  ) -> 'DataFrameInputDataBuilder':
164
265
  """Reads revenue per KPI data from a DataFrame.
165
266
 
166
267
  Args:
167
268
  df: The DataFrame to read the revenue per KPI data from.
168
269
  revenue_per_kpi_col: The name of the column containing the revenue per KPI
169
- values. If not provided, the default name is `revenue_per_kpi`.
270
+ values. If not provided, `self.default_revenue_per_kpi_column` is used.
170
271
  time_col: The name of the column containing the time coordinates. If not
171
- provided, the default name is `time`.
272
+ provided, `self.default_time_column` is used.
172
273
  geo_col: (Optional) The name of the column containing the geo coordinates.
173
- If not provided, the default name is `geo`. If the DataFrame provided
174
- has no geo column, a national model data is assumed and a geo dimension
175
- will be created internally with a single coordinate value
274
+ If not provided, `self.default_geo_column` is used. If the DataFrame
275
+ provided has no geo column, a national model data is assumed and a geo
276
+ dimension will be created internally with a single coordinate value
176
277
  `national_geo`.
177
278
 
178
279
  Returns:
@@ -180,6 +281,12 @@ class DataFrameInputDataBuilder(input_data_builder.InputDataBuilder):
180
281
  """
181
282
  revenue_per_kpi_df = df.copy()
182
283
 
284
+ revenue_per_kpi_col = (
285
+ revenue_per_kpi_col or self.default_revenue_per_kpi_column
286
+ )
287
+ time_col = time_col or self.default_time_column
288
+ geo_col = geo_col or self.default_geo_column
289
+
183
290
  ### Validate ###
184
291
  self._validate_cols(
185
292
  revenue_per_kpi_df,
@@ -209,8 +316,8 @@ class DataFrameInputDataBuilder(input_data_builder.InputDataBuilder):
209
316
  media_cols: list[str],
210
317
  media_spend_cols: list[str],
211
318
  media_channels: list[str],
212
- time_col: str = constants.TIME,
213
- geo_col: str = constants.GEO,
319
+ time_col: str | None = None,
320
+ geo_col: str | None = None,
214
321
  ) -> 'DataFrameInputDataBuilder':
215
322
  """Reads media and media spend data from a DataFrame.
216
323
 
@@ -223,21 +330,31 @@ class DataFrameInputDataBuilder(input_data_builder.InputDataBuilder):
223
330
  `media_cols` and `media_spend_cols` in length. These are also index
224
331
  mapped.
225
332
  time_col: The name of the column containing the time coordinates for media
226
- spend and media time coordinates for media. If not provided, the default
227
- name is `time`. Media time coordinates will be shorter than time
333
+ spend and media time coordinates for media. If not provided,
334
+ `self.default_time_column` is used. Media time coordinates are inferred
335
+ from the same `time_col` and are potentially shorter than time
228
336
  coordinates if media spend values are missing (NaN) for some t in
229
337
  `time`. Media time must be equal or a subset of time.
230
338
  geo_col: (Optional) The name of the column containing the geo coordinates.
231
- If not provided, the default name is `geo`. If the DataFrame provided
232
- has no geo column, a national model data is assumed and a geo dimension
233
- will be created internally with a single coordinate value
339
+ If not provided, `self.default_geo_column` is used. If the DataFrame
340
+ provided has no geo column, a national model data is assumed and a geo
341
+ dimension will be created internally with a single coordinate value
234
342
  `national_geo`.
235
343
 
236
344
  Returns:
237
345
  The `DataFrameInputDataBuilder` with the added media and media spend data.
238
346
  """
347
+ if not media_cols or not media_spend_cols or not media_channels:
348
+ raise ValueError(
349
+ '`media_cols`, `media_spend_cols`, and `media_channels` must not be '
350
+ 'empty.'
351
+ )
352
+
239
353
  media_df = df.copy()
240
354
 
355
+ time_col = time_col or self.default_time_column
356
+ geo_col = geo_col or self.default_geo_column
357
+
241
358
  ### Validate ###
242
359
  # For a media dataframe, media and media_spend columns may be the same
243
360
  # (e.g. if using media spend as media execution value), so here we validate
@@ -280,8 +397,8 @@ class DataFrameInputDataBuilder(input_data_builder.InputDataBuilder):
280
397
  frequency_cols: list[str],
281
398
  rf_spend_cols: list[str],
282
399
  rf_channels: list[str],
283
- time_col: str = constants.TIME,
284
- geo_col: str = constants.GEO,
400
+ time_col: str | None = None,
401
+ geo_col: str | None = None,
285
402
  ) -> 'DataFrameInputDataBuilder':
286
403
  """Reads reach, frequency, and rf spend data from a DataFrame.
287
404
 
@@ -295,21 +412,36 @@ class DataFrameInputDataBuilder(input_data_builder.InputDataBuilder):
295
412
  also index mapped.
296
413
  time_col: The name of the column containing the time coordinates for rf
297
414
  spend and media time coordinates for reach and frequency. If not
298
- provided, the default name is `time`. Media time coordinates will be
299
- shorter than time coordinates if media spend values are missing (NaN)
300
- for some t in `time`. Media time must be equal or a subset of time.
415
+ provided, `self.default_time_column` is used. Media time coordinates are
416
+ inferred from the same `time_col` and are potentially shorter than time
417
+ coordinates if media spend values are missing (NaN) for some t in
418
+ `time`. Media time must be equal or a subset of time.
301
419
  geo_col: (Optional) The name of the column containing the geo coordinates.
302
- If not provided, the default name is `geo`. If the DataFrame provided
303
- has no geo column, a national model data is assumed and a geo dimension
304
- will be created internally with a single coordinate value
420
+ If not provided, `self.default_geo_column` is used. If the DataFrame
421
+ provided has no geo column, a national model data is assumed and a geo
422
+ dimension will be created internally with a single coordinate value
305
423
  `national_geo`.
306
424
 
307
425
  Returns:
308
426
  The `DataFrameInputDataBuilder` with the added reach, frequency, and rf
309
427
  spend data.
310
428
  """
429
+ if (
430
+ not reach_cols
431
+ or not frequency_cols
432
+ or not rf_spend_cols
433
+ or not rf_channels
434
+ ):
435
+ raise ValueError(
436
+ '`reach_cols`, `frequency_cols`, `rf_spend_cols`, and `rf_channels` '
437
+ 'must not be empty.'
438
+ )
439
+
311
440
  reach_df = df.copy()
312
441
 
442
+ time_col = time_col or self.default_time_column
443
+ geo_col = geo_col or self.default_geo_column
444
+
313
445
  ### Validate ###
314
446
  self._validate_cols(
315
447
  reach_df,
@@ -368,8 +500,8 @@ class DataFrameInputDataBuilder(input_data_builder.InputDataBuilder):
368
500
  df: pd.DataFrame,
369
501
  organic_media_cols: list[str],
370
502
  organic_media_channels: list[str] | None = None,
371
- media_time_col: str = constants.MEDIA_TIME,
372
- geo_col: str = constants.GEO,
503
+ media_time_col: str | None = None,
504
+ geo_col: str | None = None,
373
505
  ) -> 'DataFrameInputDataBuilder':
374
506
  """Reads organic media data from a DataFrame.
375
507
 
@@ -382,18 +514,24 @@ class DataFrameInputDataBuilder(input_data_builder.InputDataBuilder):
382
514
  provided, must match `organic_media_cols` in length. This is index
383
515
  mapped.
384
516
  media_time_col: The name of the column containing the media time
385
- coordinates. If not provided, the default name is `media_time`.
517
+ coordinates. If not provided, `self.default_media_time_column` is used.
386
518
  geo_col: (Optional) The name of the column containing the geo coordinates.
387
- If not provided, the default name is `geo`. If the DataFrame provided
388
- has no geo column, a national model data is assumed and a geo dimension
389
- will be created internally with a single coordinate value
519
+ If not provided, `self.default_geo_column` is used. If the DataFrame
520
+ provided has no geo column, a national model data is assumed and a geo
521
+ dimension will be created internally with a single coordinate value
390
522
  `national_geo`.
391
523
 
392
524
  Returns:
393
525
  The `DataFrameInputDataBuilder` with the added organic media data.
394
526
  """
527
+ if not organic_media_cols:
528
+ raise ValueError('`organic_media_cols` must not be empty.')
529
+
395
530
  organic_media_df = df.copy()
396
531
 
532
+ media_time_col = media_time_col or self.default_media_time_column
533
+ geo_col = geo_col or self.default_geo_column
534
+
397
535
  ### Validate ###
398
536
  if not organic_media_channels:
399
537
  organic_media_channels = organic_media_cols
@@ -432,8 +570,8 @@ class DataFrameInputDataBuilder(input_data_builder.InputDataBuilder):
432
570
  organic_reach_cols: list[str],
433
571
  organic_frequency_cols: list[str],
434
572
  organic_rf_channels: list[str],
435
- media_time_col: str = constants.MEDIA_TIME,
436
- geo_col: str = constants.GEO,
573
+ media_time_col: str | None = None,
574
+ geo_col: str | None = None,
437
575
  ) -> 'DataFrameInputDataBuilder':
438
576
  """Reads organic reach and organic frequency data from a DataFrame.
439
577
 
@@ -447,19 +585,32 @@ class DataFrameInputDataBuilder(input_data_builder.InputDataBuilder):
447
585
  match `organic_reach_cols` and `organic_frequency_cols` in length. These
448
586
  are also index mapped.
449
587
  media_time_col: The name of the column containing the media time
450
- coordinates. If not provided, the default name is `media_time`.
588
+ coordinates. If not provided, `self.default_media_time_column` is used.
451
589
  geo_col: (Optional) The name of the column containing the geo coordinates.
452
- If not provided, the default name is `geo`. If the DataFrame provided
453
- has no geo column, a national model data is assumed and a geo dimension
454
- will be created internally with a single coordinate value
590
+ If not provided, `self.default_geo_column` is used. If the DataFrame
591
+ provided has no geo column, a national model data is assumed and a geo
592
+ dimension will be created internally with a single coordinate value
455
593
  `national_geo`.
456
594
 
457
595
  Returns:
458
596
  The `DataFrameInputDataBuilder` with the added organic reach and organic
459
597
  frequency data.
460
598
  """
599
+ if (
600
+ not organic_reach_cols
601
+ or not organic_frequency_cols
602
+ or not organic_rf_channels
603
+ ):
604
+ raise ValueError(
605
+ '`organic_reach_cols`, `organic_frequency_cols`, and'
606
+ ' `organic_rf_channels` must not be empty.'
607
+ )
608
+
461
609
  organic_reach_frequency_df = df.copy()
462
610
 
611
+ media_time_col = media_time_col or self.default_media_time_column
612
+ geo_col = geo_col or self.default_geo_column
613
+
463
614
  ### Validate ###
464
615
  self._validate_cols(
465
616
  organic_reach_frequency_df,
@@ -506,8 +657,8 @@ class DataFrameInputDataBuilder(input_data_builder.InputDataBuilder):
506
657
  self,
507
658
  df: pd.DataFrame,
508
659
  non_media_treatment_cols: list[str],
509
- time_col: str = constants.TIME,
510
- geo_col: str = constants.GEO,
660
+ time_col: str | None = None,
661
+ geo_col: str | None = None,
511
662
  ) -> 'DataFrameInputDataBuilder':
512
663
  """Reads non-media treatments data from a DataFrame.
513
664
 
@@ -516,18 +667,28 @@ class DataFrameInputDataBuilder(input_data_builder.InputDataBuilder):
516
667
  non_media_treatment_cols: The names of the columns containing the
517
668
  non-media treatments values.
518
669
  time_col: The name of the column containing the time coordinates. If not
519
- provided, the default name is `time`.
670
+ provided, `self.default_time_column` is used.
520
671
  geo_col: (Optional) The name of the column containing the geo coordinates.
521
- If not provided, the default name is `geo`. If the DataFrame provided
522
- has no geo column, a national model data is assumed and a geo dimension
523
- will be created internally with a single coordinate value
672
+ If not provided, `self.default_geo_column` is used. If the DataFrame
673
+ provided has no geo column, a national model data is assumed and a geo
674
+ dimension will be created internally with a single coordinate value
524
675
  `national_geo`.
525
676
 
526
677
  Returns:
527
678
  The `DataFrameInputDataBuilder` with the added non-media treatments data.
528
679
  """
680
+ if not non_media_treatment_cols:
681
+ warnings.warn(
682
+ 'No non-media treatment columns were provided. Not adding non-media '
683
+ 'treatments data.'
684
+ )
685
+ return self
686
+
529
687
  non_media_treatments_df = df.copy()
530
688
 
689
+ time_col = time_col or self.default_time_column
690
+ geo_col = geo_col or self.default_geo_column
691
+
531
692
  ### Validate ###
532
693
  self._validate_cols(
533
694
  non_media_treatments_df,
@@ -134,7 +134,9 @@ class InputDataBuilder(abc.ABC):
134
134
  if len(value) != len(set(value)):
135
135
  raise ValueError('Geos must be unique.')
136
136
  if self.geos is not None and set(self.geos) != set(value):
137
- raise ValueError(f'geos already set to {self.geos}.')
137
+ raise ValueError(
138
+ f'geos already set to {self.geos}. Cannot reassign to {value}.'
139
+ )
138
140
  self._geos = value
139
141
 
140
142
  @property