imsciences 0.9.7.0__tar.gz → 1.0.1__tar.gz

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: imsciences
- Version: 0.9.7.0
+ Version: 1.0.1
  Summary: IMS Data Processing Package
  Author: IMS
  Author-email: cam@im-sciences.com
@@ -1,4 +1,4 @@
+ from .geo import geoprocessing
  from .mmm import dataprocessing
  from .pull import datapull
- from .geo import geoprocessing
- from .vis import datavis
+ from .vis import datavis
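In both versions the package root re-exports the same four entry points (geoprocessing, dataprocessing, datapull, datavis); 1.0.1 only reorders the imports. A minimal sketch, assuming these names remain importable from the package root as the __init__ above suggests:

    # Hypothetical import sketch; the 'geo' instance is reused in the sketches further down.
    from imsciences import geoprocessing

    geo = geoprocessing()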
@@ -1,65 +1,88 @@
- import pandas as pd
- from google.analytics.data_v1beta import BetaAnalyticsDataClient
- from google.analytics.data_v1beta.types import DateRange
- from google.analytics.data_v1beta.types import Dimension
- from google.analytics.data_v1beta.types import Metric
- from google.analytics.data_v1beta.types import RunReportRequest
- from google.analytics.data_v1beta.types import OrderBy
- from google.analytics.data_v1beta.types import Filter
- from google.analytics.data_v1beta.types import FilterExpression
- from google.analytics.data_v1beta.types import FilterExpressionList
- from google.auth.exceptions import DefaultCredentialsError
  import logging
- from datetime import datetime, timedelta
  import os
+ from datetime import datetime, timedelta
+
  import numpy as np
+ import pandas as pd
+ from google.analytics.data_v1beta import BetaAnalyticsDataClient
+ from google.analytics.data_v1beta.types import (
+ DateRange,
+ Dimension,
+ Filter,
+ FilterExpression,
+ FilterExpressionList,
+ Metric,
+ OrderBy,
+ RunReportRequest,
+ )
+ from google.auth.exceptions import DefaultCredentialsError
  from scipy.spatial import cKDTree

+
  class geoprocessing:
-
  def help(self):
-
  print("\n1. pull_ga")
  print(" - Description: Pull in GA4 data for geo experiments.")
- print(" - Usage: pull_ga(credentials_file, property_id, start_date, country, metrics)")
- print(" - Example: pull_ga('GeoExperiment-31c5f5db2c39.json', '111111111', '2023-10-15', 'United Kingdom', ['totalUsers', 'newUsers'])")
+ print(
+ " - Usage: pull_ga(credentials_file, property_id, start_date, country, metrics)",
+ )
+ print(
+ " - Example: pull_ga('GeoExperiment-31c5f5db2c39.json', '111111111', '2023-10-15', 'United Kingdom', ['totalUsers', 'newUsers'])",
+ )

  print("\n2. process_itv_analysis")
- print(" - Description: Processes region-level data for geo experiments by mapping ITV regions, grouping selected metrics, merging with media spend data, and saving the result.")
- print(" - Usage: process_itv_analysis(raw_df, itv_path, cities_path, media_spend_path, output_path, test_group, control_group, columns_to_aggregate, aggregator_list")
- print(" - Example: process_itv_analysis(df, 'itv_regional_mapping.csv', 'Geo_Mappings_with_Coordinates.xlsx', 'IMS.xlsx', 'itv_for_test_analysis_itvx.csv', ['West', 'Westcountry', 'Tyne Tees'], ['Central Scotland', 'North Scotland'], ['newUsers', 'transactions'], ['sum', 'sum']")
-
+ print(
+ " - Description: Processes region-level data for geo experiments by mapping ITV regions, grouping selected metrics, merging with media spend data, and saving the result.",
+ )
+ print(
+ " - Usage: process_itv_analysis(raw_df, itv_path, cities_path, media_spend_path, output_path, test_group, control_group, columns_to_aggregate, aggregator_list",
+ )
+ print(
+ " - Example: process_itv_analysis(df, 'itv_regional_mapping.csv', 'Geo_Mappings_with_Coordinates.xlsx', 'IMS.xlsx', 'itv_for_test_analysis_itvx.csv', ['West', 'Westcountry', 'Tyne Tees'], ['Central Scotland', 'North Scotland'], ['newUsers', 'transactions'], ['sum', 'sum']",
+ )
+
  print("\n3. process_city_analysis")
- print(" - Description: Processes city-level data for geo experiments by grouping selected metrics, merging with media spend data, and saving the result.")
- print(" - Usage: process_city_analysis(raw_data, spend_data, output_path, test_group, control_group, columns_to_aggregate, aggregator_list)")
- print(" - Example: process_city_analysis(df, spend, 'output.csv', ['Barnsley'], ['Aberdeen'], ['newUsers', 'transactions'], ['sum', 'mean'])")
-
+ print(
+ " - Description: Processes city-level data for geo experiments by grouping selected metrics, merging with media spend data, and saving the result.",
+ )
+ print(
+ " - Usage: process_city_analysis(raw_data, spend_data, output_path, test_group, control_group, columns_to_aggregate, aggregator_list)",
+ )
+ print(
+ " - Example: process_city_analysis(df, spend, 'output.csv', ['Barnsley'], ['Aberdeen'], ['newUsers', 'transactions'], ['sum', 'mean'])",
+ )
+
  def pull_ga(self, credentials_file, property_id, start_date, country, metrics):
  """
  Pulls Google Analytics data using the BetaAnalyticsDataClient.

- Parameters:
+ Parameters
+ ----------
  credentials_file (str): Path to the JSON credentials file.
  property_id (str): Google Analytics property ID.
  start_date (str): Start date in 'YYYY-MM-DD' format.
  country (str): Country to filter the data by.
  metrics (list): List of metrics to retrieve (e.g., ["totalUsers", "sessions"]).

- Returns:
+ Returns
+ -------
  pd.DataFrame: A pandas DataFrame containing the fetched data.
+
  """
  try:
- end_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
+ end_date = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")

  if not os.path.exists(credentials_file):
- raise FileNotFoundError(f"Credentials file '{credentials_file}' not found.")
+ raise FileNotFoundError(
+ f"Credentials file '{credentials_file}' not found.",
+ )
  os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_file

  try:
  client = BetaAnalyticsDataClient()
  except DefaultCredentialsError as e:
  raise DefaultCredentialsError(
- f"Failed to initialize Google Analytics client: {e}"
+ f"Failed to initialize Google Analytics client: {e}",
  )

  def format_report(request):
@@ -68,32 +91,44 @@ class geoprocessing:
  row_index_names = [header.name for header in response.dimension_headers]
  row_header = []
  for i in range(len(row_index_names)):
- row_header.append([row.dimension_values[i].value for row in response.rows])
+ row_header.append(
+ [row.dimension_values[i].value for row in response.rows],
+ )

- row_index_named = pd.MultiIndex.from_arrays(np.array(row_header), names=np.array(row_index_names))
+ row_index_named = pd.MultiIndex.from_arrays(
+ np.array(row_header),
+ names=np.array(row_index_names),
+ )
  # Row flat data
  metric_names = [header.name for header in response.metric_headers]
  data_values = []
  for i in range(len(metric_names)):
- data_values.append([row.metric_values[i].value for row in response.rows])
+ data_values.append(
+ [row.metric_values[i].value for row in response.rows],
+ )

- output = pd.DataFrame(data=np.transpose(np.array(data_values, dtype='f')),
- index=row_index_named, columns=metric_names)
+ output = pd.DataFrame(
+ data=np.transpose(np.array(data_values, dtype="f")),
+ index=row_index_named,
+ columns=metric_names,
+ )
  return output

  all_dfs = []
  offset_value = 0
- batch_size = 100000
+ batch_size = 100000

  while True:
  metric_objects = [Metric(name=metric) for metric in metrics]

  request = RunReportRequest(
- property='properties/' + property_id,
+ property="properties/" + property_id,
  dimensions=[Dimension(name="date"), Dimension(name="city")],
  metrics=metric_objects,
- order_bys=[OrderBy(dimension={'dimension_name': 'date'}),
- OrderBy(dimension={'dimension_name': 'city'})],
+ order_bys=[
+ OrderBy(dimension={"dimension_name": "date"}),
+ OrderBy(dimension={"dimension_name": "city"}),
+ ],
  date_ranges=[DateRange(start_date=start_date, end_date=end_date)],
  limit=batch_size,
  offset=offset_value,
@@ -103,37 +138,39 @@ class geoprocessing:
  FilterExpression(
  filter=Filter(
  field_name="country",
- string_filter=Filter.StringFilter(value=country),
- )
+ string_filter=Filter.StringFilter(
+ value=country,
+ ),
+ ),
  ),
- ]
- )
- )
+ ],
+ ),
+ ),
  )

  df = format_report(request)
  if df.empty:
- break
+ break

  df = df.reset_index()
- df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
+ df["date"] = pd.to_datetime(df["date"], format="%Y%m%d")
  all_dfs.append(df)
  offset_value += batch_size

  if not all_dfs:
- return pd.DataFrame()
+ return pd.DataFrame()

  final_df = pd.concat(all_dfs, ignore_index=True)
  return final_df

  except FileNotFoundError as e:
- logging.error(f"FileNotFoundError: {e}")
+ logging.exception(f"FileNotFoundError: {e}")
  raise
  except DefaultCredentialsError as e:
- logging.error(f"DefaultCredentialsError: {e}")
+ logging.exception(f"DefaultCredentialsError: {e}")
  raise
  except Exception as e:
- logging.error(f"An unexpected error occurred: {e}")
+ logging.exception(f"An unexpected error occurred: {e}")
  raise

  def process_itv_analysis(self, raw_df, city_lut, itv_lut, test_list, control_list):
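For orientation, a hedged sketch of calling the reworked pull_ga, using the argument shapes from the help() text and docstring above; the credentials path and property ID are the placeholder values from the package's own example, not real credentials:

    # Sketch only: assumes a valid GA4 service-account JSON key and property ID.
    df = geo.pull_ga(
        credentials_file="GeoExperiment-31c5f5db2c39.json",  # path to the JSON credentials file
        property_id="111111111",                             # GA4 property ID
        start_date="2023-10-15",                             # end date defaults to yesterday
        country="United Kingdom",                            # country filter
        metrics=["totalUsers", "newUsers"],                  # GA4 metric names
    )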
@@ -143,7 +180,8 @@ class geoprocessing:
  automatically aggregating metric columns (all columns from raw_df except 'date' and 'geo'),
  and assigning a numerical group based on provided test and control lists.

- Parameters:
+ Parameters
+ ----------
  raw_df (pd.DataFrame): Raw data containing at least the columns 'date' and 'geo'
  plus metric columns that should be summed.
  city_lut (pd.DataFrame): City lookup table DataFrame with a column 'geo' and coordinate data.
@@ -152,77 +190,94 @@ class geoprocessing:
  test_list (list): List of region names (strings) to be assigned the value 1.
  control_list (list): List of region names (strings) to be assigned the value 2.

- Returns:
+ Returns
+ -------
  pd.DataFrame: Aggregated DataFrame grouped by 'date' and 'geo' (where 'geo' comes from ITV Region),
  with the metric columns summed and an additional 'assignment' column.
+
  """
-
  # Ensure the ITV lookup table has valid coordinate data
- itv_lut = itv_lut.dropna(subset=['Latitude', 'Longitude'])
-
+ itv_lut = itv_lut.dropna(subset=["Latitude", "Longitude"])
+
  # Merge raw_df with the city lookup table on 'geo'
- merged_df = pd.merge(raw_df, city_lut, on='geo', how='left')
-
+ merged_df = pd.merge(raw_df, city_lut, on="geo", how="left")
+
  # Build a KD-tree from the ITV lookup table's coordinates for an efficient nearest-neighbor search
- tree = cKDTree(itv_lut[['Latitude', 'Longitude']].values)
-
+ tree = cKDTree(itv_lut[["Latitude", "Longitude"]].values)
+
  # For each record in merged_df, find the nearest ITV region based on coordinates
- distances, indices = tree.query(merged_df[['Latitude', 'Longitude']].values, k=1)
-
+ distances, indices = tree.query(
+ merged_df[["Latitude", "Longitude"]].values,
+ k=1,
+ )
+
  # Map the nearest ITV Region back to merged_df
- merged_df['ITV Region'] = itv_lut.iloc[indices]['ITV Region'].values
-
+ merged_df["ITV Region"] = itv_lut.iloc[indices]["ITV Region"].values
+
  # Automatically determine the metric columns from raw_df (all columns except 'date' and 'geo')
- metric_cols = [col for col in raw_df.columns if col not in ['date', 'geo']]
-
+ metric_cols = [col for col in raw_df.columns if col not in ["date", "geo"]]
+
  # Aggregate (sum) the metric columns, grouping by 'date' and the nearest ITV Region
- aggregated_df = merged_df.groupby(['date', 'ITV Region'], as_index=False)[metric_cols].sum()
-
+ aggregated_df = merged_df.groupby(["date", "ITV Region"], as_index=False)[
+ metric_cols
+ ].sum()
+
  # Rename 'ITV Region' to 'geo' to be consistent with your downstream usage
- aggregated_df.rename(columns={'ITV Region': 'geo'}, inplace=True)
-
+ aggregated_df.rename(columns={"ITV Region": "geo"}, inplace=True)
+
  # Define a function to assign group values based on the region name
  def assign_value(region):
  if region in test_list:
  return 1
- elif region in control_list:
+ if region in control_list:
  return 2
- else:
- return np.nan # Or another default value if desired
-
+ return np.nan # Or another default value if desired
+
  # Apply the assignment function and remove rows without a valid assignment
- aggregated_df['assignment'] = aggregated_df['geo'].apply(assign_value)
- aggregated_df.dropna(subset=['assignment'], inplace=True)
- aggregated_df['assignment'] = aggregated_df['assignment'].astype(int)
-
+ aggregated_df["assignment"] = aggregated_df["geo"].apply(assign_value)
+ aggregated_df.dropna(subset=["assignment"], inplace=True)
+ aggregated_df["assignment"] = aggregated_df["assignment"].astype(int)
+
  return aggregated_df
-
- def process_city_analysis(self, raw_data, spend_data, output_path, test_group, control_group, columns_to_aggregate, aggregator_list):
+
+ def process_city_analysis(
+ self,
+ raw_data,
+ spend_data,
+ output_path,
+ test_group,
+ control_group,
+ columns_to_aggregate,
+ aggregator_list,
+ ):
  """
- Process city-level analysis by grouping data, applying custom aggregations,
+ Process city-level analysis by grouping data, applying custom aggregations,
  and merging with spend data.

- Parameters:
+ Parameters
+ ----------
  raw_data (str or pd.DataFrame):
  - Raw input data as a file path (CSV/XLSX) or a DataFrame.
  - Must contain 'date' and 'city' columns, plus any columns to be aggregated.
  spend_data (str or pd.DataFrame):
  - Spend data as a file path (CSV/XLSX) or a DataFrame.
  - Must contain 'date', 'geo', and 'cost' columns.
- output_path (str):
+ output_path (str):
  - Path to save the final output file (CSV or XLSX).
- group1 (list):
+ group1 (list):
  - List of city regions to be considered "Test Group" or "Group 1".
- group2 (list):
+ group2 (list):
  - List of city regions to be considered "Control Group" or "Group 2".
- columns_to_aggregate (list):
+ columns_to_aggregate (list):
  - List of columns to apply aggregation to, e.g. ['newUsers', 'transactions'].
- aggregator_list (list):
+ aggregator_list (list):
  - List of corresponding aggregation functions, e.g. ['sum', 'mean'].
  - Must be the same length as columns_to_aggregate.

- Returns:
+ Returns
+ -------
  pd.DataFrame: The final merged, aggregated DataFrame.
+
  """

  def read_file(data):
@@ -230,82 +285,85 @@ class geoprocessing:
  if isinstance(data, pd.DataFrame):
  return data
  ext = os.path.splitext(data)[1].lower()
- if ext == '.csv':
+ if ext == ".csv":
  return pd.read_csv(data)
- elif ext in ['.xlsx', '.xls']:
+ if ext in [".xlsx", ".xls"]:
  return pd.read_excel(data)
- else:
- raise ValueError("Unsupported file type. Please use a CSV or XLSX file.")
+ raise ValueError(
+ "Unsupported file type. Please use a CSV or XLSX file.",
+ )

  def write_file(df, file_path):
  """Helper function to write DataFrame to CSV or XLSX files."""
  ext = os.path.splitext(file_path)[1].lower()
- if ext == '.csv':
+ if ext == ".csv":
  df.to_csv(file_path, index=False)
- elif ext in ['.xlsx', '.xls']:
- df.to_excel(file_path, index=False, engine='openpyxl')
+ elif ext in [".xlsx", ".xls"]:
+ df.to_excel(file_path, index=False, engine="openpyxl")
  else:
- raise ValueError("Unsupported file type. Please use a CSV or XLSX file.")
+ raise ValueError(
+ "Unsupported file type. Please use a CSV or XLSX file.",
+ )

  # -----------------------
  # 1. Read and validate data
  # -----------------------
  raw_df = read_file(raw_data)
- raw_df = raw_df.rename(columns={'city': 'geo'})
- spend_df = read_file(spend_data).rename(columns={'Cost': 'cost'})
+ raw_df = raw_df.rename(columns={"city": "geo"})
+ spend_df = read_file(spend_data).rename(columns={"Cost": "cost"})

  # Columns we minimally need in raw_df
- required_columns = {'date', 'geo'}
+ required_columns = {"date", "geo"}
  # Ensure the columns to aggregate are there
  required_columns = required_columns.union(set(columns_to_aggregate))
  missing_in_raw = required_columns - set(raw_df.columns)
  if missing_in_raw:
  raise ValueError(
- f"The raw data is missing the following required columns: {missing_in_raw}"
+ f"The raw data is missing the following required columns: {missing_in_raw}",
  )

  # Validate spend data
- spend_required_columns = {'date', 'geo', 'cost'}
+ spend_required_columns = {"date", "geo", "cost"}
  missing_in_spend = spend_required_columns - set(spend_df.columns)
  if missing_in_spend:
  raise ValueError(
- f"The spend data is missing the following required columns: {missing_in_spend}"
+ f"The spend data is missing the following required columns: {missing_in_spend}",
  )

  # -----------------------
  # 2. Clean and prepare spend data
  # -----------------------
  # Convert cost column to numeric after stripping currency symbols and commas
- spend_df['cost'] = (
- spend_df['cost']
- .replace('[^\\d.]', '', regex=True)
- .astype(float)
+ spend_df["cost"] = (
+ spend_df["cost"].replace("[^\\d.]", "", regex=True).astype(float)
  )

  # -----------------------
  # 3. Prepare raw data
  # -----------------------
  # Filter only the relevant geos
- filtered_df = raw_df[raw_df['geo'].isin(test_group + control_group)].copy()
+ filtered_df = raw_df[raw_df["geo"].isin(test_group + control_group)].copy()
  # -----------------------
  # 4. Group and aggregate
  # -----------------------
  # Create a dictionary of {col: agg_function}
  if len(columns_to_aggregate) != len(aggregator_list):
  raise ValueError(
- "columns_to_aggregate and aggregator_list must have the same length."
+ "columns_to_aggregate and aggregator_list must have the same length.",
  )
  aggregation_dict = dict(zip(columns_to_aggregate, aggregator_list))

  # Perform groupby using the aggregator dictionary
- grouped_df = filtered_df.groupby(['date', 'geo'], as_index=False).agg(aggregation_dict)
+ grouped_df = filtered_df.groupby(["date", "geo"], as_index=False).agg(
+ aggregation_dict,
+ )

  # -----------------------
  # 5. Map groups (Test vs. Control)
- # -----------------------
- assignment_map = {city: 1 for city in test_group}
- assignment_map.update({city: 2 for city in control_group})
- grouped_df['assignment'] = grouped_df['geo'].map(assignment_map)
+ # -----------------------
+ assignment_map = dict.fromkeys(test_group, 1)
+ assignment_map.update(dict.fromkeys(control_group, 2))
+ grouped_df["assignment"] = grouped_df["geo"].map(assignment_map)

  # -----------------------
  # 6. Merge with spend data
@@ -313,16 +371,16 @@ class geoprocessing:
  merged_df = pd.merge(
  grouped_df,
  spend_df, # has date, geo, cost
- on=['date', 'geo'],
- how='left'
+ on=["date", "geo"],
+ how="left",
  )

  # Fill missing cost with 0
- merged_df['cost'] = merged_df['cost'].fillna(0)
+ merged_df["cost"] = merged_df["cost"].fillna(0)

  # -----------------------
  # 7. Write out results
  # -----------------------
  write_file(merged_df, output_path)

- return merged_df
+ return merged_df
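Closing out the diff, a hedged usage sketch of process_city_analysis built from the docstring and the help() example above; df and spend stand in for a city-level GA4 pull and a spend table with date, geo and cost columns:

    # Sketch only: df and spend are assumed inputs, not objects defined in the package.
    merged = geo.process_city_analysis(
        raw_data=df,                        # needs 'date', 'city' and the metric columns
        spend_data=spend,                   # needs 'date', 'geo', 'cost'
        output_path="output.csv",           # written out via the write_file helper
        test_group=["Barnsley"],            # mapped to assignment == 1
        control_group=["Aberdeen"],         # mapped to assignment == 2
        columns_to_aggregate=["newUsers", "transactions"],
        aggregator_list=["sum", "mean"],    # one aggregator per column
    )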