imsciences-0.8-py3-none-any.whl → imsciences-0.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,6 +16,7 @@ import xml.etree.ElementTree as ET
  from bs4 import BeautifulSoup
  import yfinance as yf
  import holidays
+ from dateutil.easter import easter

  class dataprocessing:

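The only functional change in the import block is `dateutil.easter`, which the new seasonality code below leans on. For reference, a minimal sketch of its behaviour (standard `python-dateutil`, Western/Gregorian calculation by default):

```python
from datetime import timedelta
from dateutil.easter import easter

# easter(year) returns a datetime.date for Easter Sunday (Gregorian method by default)
print(easter(2024))                      # 2024-03-31
print(easter(2024) - timedelta(days=2))  # Good Friday:   2024-03-29
print(easter(2024) + timedelta(days=1))  # Easter Monday: 2024-04-01
```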
@@ -2123,18 +2124,35 @@ class datapull:
  ############################################################### Seasonality ##########################################################################

  def pull_seasonality(self, week_commencing, start_date, countries):
-     # Week commencing dictionary
+     # ---------------------------------------------------------------------
+     # 0. Setup: dictionary for 'week_commencing' to Python weekday() integer
+     # ---------------------------------------------------------------------
      day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
-
-     # Create daily date range dataframe starting from start_date
-     date_range = pd.date_range(start=pd.to_datetime(start_date), end=datetime.today(), freq="d")
+
+     # ---------------------------------------------------------------------
+     # 1. Create daily date range from start_date to today
+     # ---------------------------------------------------------------------
+     date_range = pd.date_range(
+         start=pd.to_datetime(start_date),
+         end=datetime.today(),
+         freq="D"
+     )
      df_daily = pd.DataFrame(date_range, columns=["Date"])

-     # Create weekly date range dataframe
-     df_daily['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
+     # ---------------------------------------------------------------------
+     # 1.1 Identify "week_start" for each daily row, based on week_commencing
+     # ---------------------------------------------------------------------
+     df_daily['week_start'] = df_daily["Date"].apply(
+         lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
+     )
+
+     # ---------------------------------------------------------------------
+     # 2. Build a weekly index (df_weekly_start) with dummy columns
+     # ---------------------------------------------------------------------
      df_weekly_start = df_daily[['week_start']].drop_duplicates().reset_index(drop=True)
      df_weekly_start.rename(columns={'week_start': "Date"}, inplace=True)

+     # Set index to weekly "start of week"
      df_weekly_start.index = np.arange(1, len(df_weekly_start) + 1)
      df_weekly_start.set_index("Date", inplace=True)

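The `(x.weekday() - day_dict[week_commencing]) % 7` expression snaps every date back to the most recent occurrence of the chosen start day, so all dates in the same week share one `week_start`. A standalone sketch of the arithmetic (the `week_start` helper name is hypothetical; the formula is the one above):

```python
import pandas as pd

day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}

def week_start(ts: pd.Timestamp, week_commencing: str) -> pd.Timestamp:
    # Days elapsed since the last occurrence of the chosen weekday (always 0-6)
    return ts - pd.Timedelta(days=(ts.weekday() - day_dict[week_commencing]) % 7)

print(week_start(pd.Timestamp("2024-01-10"), "mon"))  # Wed 10 Jan -> Mon 8 Jan
print(week_start(pd.Timestamp("2024-01-10"), "sun"))  # Wed 10 Jan -> Sun 7 Jan
```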
@@ -2144,79 +2162,236 @@ class datapull:
          col_name = f"dum_{df_weekly_start.index[i].strftime('%Y_%m_%d')}"
          dummy_columns[col_name] = [0] * len(df_weekly_start)
          dummy_columns[col_name][i] = 1
-
+
      df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
      df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)

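The loop above (its `for i in range(...)` header sits just outside the hunk) builds one `dum_YYYY_MM_DD` column per week with a single 1 on that week's row, i.e. an identity matrix over the weekly index. An equivalent sketch on a toy index:

```python
import numpy as np
import pandas as pd

weeks = pd.to_datetime(["2024-01-01", "2024-01-08", "2024-01-15"])
# One dum_<week> column per week, 1 only on its own row: an identity matrix
dummies = pd.DataFrame(
    np.eye(len(weeks), dtype=int),
    index=weeks,
    columns=[f"dum_{w.strftime('%Y_%m_%d')}" for w in weeks],
)
print(dummies)
```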
-     # Add public holidays for each country and holiday type
+     # ---------------------------------------------------------------------
+     # 3. Public holidays (daily) from 'holidays' package + each holiday name
+     # ---------------------------------------------------------------------
      for country in countries:
-         country_holidays = holidays.CountryHoliday(country, years=range(int(start_date[:4]), datetime.today().year + 1))
-         df_daily[f"seas_holiday_{country.lower()}"] = df_daily["Date"].apply(lambda x: 1 if x in country_holidays else 0)
-
-         # Extract specific holidays
-         for date, name in country_holidays.items():
+         country_holidays = holidays.CountryHoliday(
+             country,
+             years=range(int(start_date[:4]), datetime.today().year + 1)
+         )
+         # Daily indicator: 1 if that date is a holiday
+         df_daily[f"seas_holiday_{country.lower()}"] = df_daily["Date"].apply(
+             lambda x: 1 if x in country_holidays else 0
+         )
+         # Create columns for specific holiday names
+         for date_hol, name in country_holidays.items():
              col_name = f"seas_{name.replace(' ', '_').lower()}_{country.lower()}"
              if col_name not in df_daily.columns:
                  df_daily[col_name] = 0
-             df_daily.loc[df_daily["Date"] == pd.Timestamp(date), col_name] = 1
+             df_daily.loc[df_daily["Date"] == pd.Timestamp(date_hol), col_name] = 1
+
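This block is behaviourally the same as in 0.8 apart from renaming the loop variable `date` to `date_hol`, which avoids shadowing a common name. For reference, the `holidays` objects involved behave like dicts keyed by `datetime.date` (a quick sketch; the exact holiday names come from the `holidays` package):

```python
import datetime
import holidays

us_holidays = holidays.CountryHoliday("US", years=[2024])

print(datetime.date(2024, 7, 4) in us_holidays)    # True
print(us_holidays.get(datetime.date(2024, 7, 4)))  # 'Independence Day'
for day, name in sorted(us_holidays.items())[:3]:  # (date, name) pairs
    print(day, name)
```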
+     # ---------------------------------------------------------------------
+     # 3.1 Additional Special Days (Father's Day, Mother's Day, etc.)
+     #     We'll add daily columns for each.
+     # ---------------------------------------------------------------------
+     # Initialize columns
+     extra_cols = [
+         "seas_valentines_day",
+         "seas_halloween",
+         "seas_fathers_day_us_uk",
+         "seas_mothers_day_us",
+         "seas_mothers_day_uk",
+         "seas_good_friday",
+         "seas_easter_monday",
+         "seas_black_friday",
+         "seas_cyber_monday",
+     ]
+     for c in extra_cols:
+         df_daily[c] = 0  # default zero

-     # Map daily holidays to weekly aggregation
-     df_daily['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
-     df_holidays = df_daily.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
-     df_holidays.set_index("Date", inplace=True)
+     # Helper: nth_weekday_of_month(year, month, weekday, nth=1 => first, 2 => second, etc.)
+     # weekday: Monday=0, Tuesday=1, ... Sunday=6
+     def nth_weekday_of_month(year, month, weekday, nth):
+         """
+         Returns date of the nth <weekday> in <month> of <year>.
+         E.g. nth_weekday_of_month(2023, 6, 6, 3) => 3rd Sunday of June 2023.
+         """
+         # 1st day of the month
+         d = datetime(year, month, 1)
+         # What is the weekday of day #1?
+         w = d.weekday()  # Monday=0, Tuesday=1, ... Sunday=6
+         # If we want, e.g. Sunday=6, we see how many days to add
+         delta = (weekday - w) % 7
+         # This is the first <weekday> in that month
+         first_weekday = d + timedelta(days=delta)
+         # Now add 7*(nth-1) days
+         return first_weekday + timedelta(days=7 * (nth - 1))
+
+     def get_good_friday(year):
+         """Good Friday is 2 days before Easter Sunday."""
+         return easter(year) - timedelta(days=2)
+
+     def get_easter_monday(year):
+         """Easter Monday is 1 day after Easter Sunday."""
+         return easter(year) + timedelta(days=1)
+
+     def get_black_friday(year):
+         """
+         Black Friday = day after US Thanksgiving,
+         and US Thanksgiving is the 4th Thursday in November.
+         """
+         # 4th Thursday in November
+         fourth_thursday = nth_weekday_of_month(year, 11, 3, 4)  # weekday=3 => Thursday
+         return fourth_thursday + timedelta(days=1)
+
+     def get_cyber_monday(year):
+         """Cyber Monday = Monday after US Thanksgiving, i.e. 4 days after 4th Thursday in Nov."""
+         # 4th Thursday in November
+         fourth_thursday = nth_weekday_of_month(year, 11, 3, 4)
+         return fourth_thursday + timedelta(days=4)  # Monday after Thanksgiving
+
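These helpers can be sanity-checked against known dates: Father's Day 2023 fell on 18 June and US Thanksgiving 2023 on 23 November. A standalone copy of the helper for quick verification:

```python
from datetime import datetime, timedelta

def nth_weekday_of_month(year, month, weekday, nth):
    # First day of the month, then step forward to the first <weekday>
    d = datetime(year, month, 1)
    first = d + timedelta(days=(weekday - d.weekday()) % 7)
    return first + timedelta(days=7 * (nth - 1))

print(nth_weekday_of_month(2023, 6, 6, 3).date())   # 2023-06-18: 3rd Sunday of June
thanksgiving = nth_weekday_of_month(2023, 11, 3, 4)
print(thanksgiving.date())                          # 2023-11-23: 4th Thursday of November
print((thanksgiving + timedelta(days=1)).date())    # Black Friday  2023-11-24
print((thanksgiving + timedelta(days=4)).date())    # Cyber Monday  2023-11-27
```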
+     # Loop over each year in range
+     start_yr = int(start_date[:4])
+     end_yr = datetime.today().year
+
+     for yr in range(start_yr, end_yr + 1):
+         # Valentines = Feb 14
+         valentines_day = datetime(yr, 2, 14)
+         # Halloween = Oct 31
+         halloween_day = datetime(yr, 10, 31)
+         # Father's Day (US & UK) = 3rd Sunday in June
+         fathers_day = nth_weekday_of_month(yr, 6, 6, 3)  # Sunday=6
+         # Mother's Day US = 2nd Sunday in May
+         mothers_day_us = nth_weekday_of_month(yr, 5, 6, 2)
+         # Mother's Day UK: 4th Sunday in Lent => "Mothering Sunday".
+         # Officially it falls 3 weeks before Easter Sunday, so Easter - 21 days
+         # should itself be a Sunday; we confirm with a weekday check below.
+         mothering_sunday = easter(yr) - timedelta(days=21)
+         # If for some reason that's not a Sunday (rare corner cases), shift to Sunday:
+         while mothering_sunday.weekday() != 6:  # Sunday=6
+             mothering_sunday -= timedelta(days=1)
+
+         # Good Friday, Easter Monday
+         gf = get_good_friday(yr)
+         em = get_easter_monday(yr)
+
+         # Black Friday, Cyber Monday
+         bf = get_black_friday(yr)
+         cm = get_cyber_monday(yr)
+
+         # Mark them in df_daily if in range
+         for special_date, col in [
+             (valentines_day, "seas_valentines_day"),
+             (halloween_day, "seas_halloween"),
+             (fathers_day, "seas_fathers_day_us_uk"),
+             (mothers_day_us, "seas_mothers_day_us"),
+             (mothering_sunday, "seas_mothers_day_uk"),
+             (gf, "seas_good_friday"),
+             (em, "seas_easter_monday"),
+             (bf, "seas_black_friday"),
+             (cm, "seas_cyber_monday"),
+         ]:
+             # Convert to pd.Timestamp:
+             special_ts = pd.Timestamp(special_date)
+
+             # Only set if it's within your daily range
+             if (special_ts >= df_daily["Date"].min()) and (special_ts <= df_daily["Date"].max()):
+                 df_daily.loc[df_daily["Date"] == special_ts, col] = 1
+
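One observation on the Mothering Sunday logic: `easter(yr)` always returns a Sunday, so `easter(yr) - timedelta(days=21)` is itself always a Sunday and the `while` guard is a defensive no-op. Checking a recent year (Mothering Sunday 2024 fell on 10 March):

```python
from datetime import timedelta
from dateutil.easter import easter

mothering_sunday = easter(2024) - timedelta(days=21)
print(mothering_sunday)            # 2024-03-10 (Easter 2024 was 31 March)
print(mothering_sunday.weekday())  # 6 == Sunday, so the while-loop never fires
```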
+     # ---------------------------------------------------------------------
+     # 4. Add daily indicators for last day & last Friday of month,
+     #    then aggregate them to weekly level using .max()
+     # ---------------------------------------------------------------------
+     # Last day of month (daily)
+     df_daily["seas_last_day_of_month"] = df_daily["Date"].apply(
+         lambda d: 1 if d == d.to_period("M").to_timestamp("M") else 0
+     )

-     # Create monthly dummies (separately from holidays)
-     df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
-     df_monthly_dummies = pd.get_dummies(df_daily, prefix="seas", columns=["Month"], dtype=int)
-     df_monthly_dummies['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
-     df_monthly_dummies = df_monthly_dummies.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
-     df_monthly_dummies.set_index("Date", inplace=True)
+     # Last Friday of month (daily)
+     def is_last_friday(date):
+         # last day of the month
+         last_day_of_month = date.to_period("M").to_timestamp("M")
+         last_day_weekday = last_day_of_month.weekday()  # Monday=0,...Sunday=6
+         # Determine how many days we go back from the last day to get Friday (weekday=4)
+         if last_day_weekday >= 4:
+             days_to_subtract = last_day_weekday - 4
+         else:
+             days_to_subtract = last_day_weekday + 3
+         last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
+         return 1 if date == last_friday else 0

-     # Divide only the monthly dummy columns by 7 (exclude holiday-related columns)
-     monthly_cols = [col for col in df_monthly_dummies.columns if not col.startswith("seas_holiday") and not col.startswith("seas_")]
+     df_daily["seas_last_friday_of_month"] = df_daily["Date"].apply(is_last_friday)
+
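The branch in `is_last_friday` measures the gap from the month's final day back to the preceding (or same) Friday: months ending Friday to Sunday (weekday 4 to 6) step back 0 to 2 days, months ending Monday to Thursday step back 3 to 6. A standalone check, using `pd.offsets.MonthEnd(0)` as an assumed equivalent of the `to_period("M").to_timestamp("M")` month-end idiom:

```python
import pandas as pd

def last_friday_of_month(date: pd.Timestamp) -> pd.Timestamp:
    last_day = date + pd.offsets.MonthEnd(0)  # month end (rolls forward, or stays put if already there)
    wd = last_day.weekday()                   # Mon=0 .. Sun=6
    back = wd - 4 if wd >= 4 else wd + 3      # days back to the preceding Friday
    return last_day - pd.Timedelta(days=back)

print(last_friday_of_month(pd.Timestamp("2024-11-15")))  # 2024-11-29 (Nov 2024 ends on a Saturday)
print(last_friday_of_month(pd.Timestamp("2024-09-15")))  # 2024-09-27 (Sep 2024 ends on a Monday)
```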
+     # ---------------------------------------------------------------------
+     # 5. Weekly aggregation for holiday columns & monthly dummies
+     # ---------------------------------------------------------------------
+     # For monthly dummies, create a daily col "Month", then get_dummies
+     df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
+     df_monthly_dummies = pd.get_dummies(
+         df_daily,
+         prefix="seas",
+         columns=["Month"],
+         dtype=int
+     )
+     # Recalculate 'week_start' (already in df_daily, but just to be sure)
+     df_monthly_dummies['week_start'] = df_daily['week_start']
+
+     # Group monthly dummies with .sum(); they are spread across the week below
+     df_monthly_dummies = (
+         df_monthly_dummies
+         .groupby('week_start')
+         .sum(numeric_only=True)   # sum the daily flags
+         .reset_index()
+         .rename(columns={'week_start': "Date"})
+         .set_index("Date")
+     )
+     # Divide monthly dummies by 7 to distribute them across that week
+     monthly_cols = [c for c in df_monthly_dummies.columns if c.startswith("seas_month_")]
      df_monthly_dummies[monthly_cols] = df_monthly_dummies[monthly_cols] / 7

-     # Merge weekly dummies, monthly dummies, and holidays
-     df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)  # Combine weekly and monthly first
-     df_combined = pd.concat([df_combined, df_holidays], axis=1)  # Add holidays separately
+     # Group holiday & special-day columns by .max() => binary at weekly level
+     df_holidays = (
+         df_daily
+         .groupby('week_start')
+         .max(numeric_only=True)   # if any day=1 in that week, entire week=1
+         .reset_index()
+         .rename(columns={'week_start': "Date"})
+         .set_index("Date")
+     )

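The two aggregations differ on purpose: monthly dummies are summed and divided by 7, giving fractional weights for weeks that straddle a month boundary, while holiday and special-day flags use `.max()` and stay binary. One caveat worth flagging: `pd.get_dummies(..., prefix="seas", columns=["Month"])` names its output `seas_january`, `seas_february`, and so on, so the `seas_month_` filter above may not match any column; the prefix is worth verifying against the actual dummy names. A toy illustration of the intended weighting, for a week running Mon 29 Jan to Sun 4 Feb 2024:

```python
import pandas as pd

daily = pd.DataFrame({
    "week_start":      ["2024-01-29"] * 7,
    "seas_january":    [1, 1, 1, 0, 0, 0, 0],  # Mon 29 - Wed 31 Jan
    "seas_february":   [0, 0, 0, 1, 1, 1, 1],  # Thu 1 - Sun 4 Feb
    "seas_holiday_gb": [0, 1, 0, 0, 0, 0, 0],  # one holiday that week
})

print(daily.groupby("week_start")[["seas_january", "seas_february"]].sum() / 7)
# seas_january ~0.43, seas_february ~0.57: the week is split across the two months

print(daily.groupby("week_start")[["seas_holiday_gb"]].max())
# stays 1: any holiday day marks the whole week
```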
-     # Drop duplicate columns if any exist (this ensures holidays are not duplicated)
+     # ---------------------------------------------------------------------
+     # 6. Combine weekly start, monthly dummies, holiday flags
+     # ---------------------------------------------------------------------
+     df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
+     df_combined = pd.concat([df_combined, df_holidays], axis=1)
      df_combined = df_combined.loc[:, ~df_combined.columns.duplicated()]

-     # Create weekly dummies
+     # ---------------------------------------------------------------------
+     # 7. Create weekly dummies for Week of Year & yearly dummies
+     # ---------------------------------------------------------------------
      df_combined.reset_index(inplace=True)
+     df_combined.rename(columns={"index": "old_index"}, inplace=True)  # just in case
+
      df_combined["Week"] = df_combined["Date"].dt.isocalendar().week
      df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Week"], dtype=int)

-     # Create yearly dummies
      df_combined["Year"] = df_combined["Date"].dt.year
      df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int)

-     # Add constant
+     # ---------------------------------------------------------------------
+     # 8. Add constant & trend
+     # ---------------------------------------------------------------------
      df_combined["Constant"] = 1
-
-     # Add trend
      df_combined["Trend"] = df_combined.index + 1
-
-     # Create seasonal indicators for the last day and last Friday of the month
-     df_combined['seas_last_day_of_month'] = df_combined["Date"].apply(lambda x: 1 if x == x.to_period('M').to_timestamp('M') else 0)

-     def is_last_friday(date):
-         last_day_of_month = date.to_period('M').to_timestamp('M')
-         last_day_weekday = last_day_of_month.dayofweek
-         if last_day_weekday >= 4:
-             days_to_subtract = last_day_weekday - 4
-         else:
-             days_to_subtract = last_day_weekday + 3
-         last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
-         return 1 if date == last_friday else 0
-
-     df_combined['seas_last_friday_of_month'] = df_combined["Date"].apply(is_last_friday)
-
-     # Rename Date to OBS
+     # ---------------------------------------------------------------------
+     # 9. Rename Date -> OBS and return
+     # ---------------------------------------------------------------------
      df_combined.rename(columns={"Date": "OBS"}, inplace=True)

      return df_combined
+

  def pull_weather(self, week_commencing, country) -> pd.DataFrame:
      import pandas as pd
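To round off the new seasonality pull, a hypothetical call (the import path and country codes are assumptions; `datapull` is the class named in the hunk header). The method returns one row per week, keyed by an `OBS` date column, carrying the weekly dummy, holiday, month, week, year, trend, and constant regressors built above:

```python
from imsciences.datafunctions import datapull  # assumed import path for this sketch

ims = datapull()
seas = ims.pull_seasonality(
    week_commencing="mon",    # weeks start on Monday
    start_date="2022-01-03",  # daily range runs from here to today
    countries=["US", "GB"],   # forwarded to holidays.CountryHoliday
)
print(seas[["OBS", "Constant", "Trend"]].head())
```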
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: imsciences
- Version: 0.8
+ Version: 0.9
  Summary: IMS Data Processing Package
  Author: IMS
  Author-email: cam@im-sciences.com
@@ -35,97 +35,97 @@ The **IMSciences package** is a Python library designed to process incoming data

  ---

- ## Table of Contents
+ Table of Contents
+ =================

- 1. [Data Processing](#data-processing)
- 2. [Data Pulling](#data-pulling)
- 3. [Installation](#installation)
- 4. [Useage](#useage)
- 5. [License](#license)
+ 1. [Data Processing](#Data-Processing)
+ 2. [Data Pulling](#Data-Pulling)
+ 3. [Installation](#Installation)
+ 4. [Useage](#Useage)
+ 5. [License](#License)

  ---

  ## Data Processing

-
- ## 1. `get_wd_levels`
+ ## 1. get_wd_levels
  - **Description**: Get the working directory with the option of moving up parents.
  - **Usage**: `get_wd_levels(levels)`
  - **Example**: `get_wd_levels(0)`

  ---

- ## 2. `remove_rows`
+ ## 2. remove_rows
  - **Description**: Removes a specified number of rows from a pandas DataFrame.
  - **Usage**: `remove_rows(data_frame, num_rows_to_remove)`
  - **Example**: `remove_rows(df, 2)`

  ---

- ## 3. `aggregate_daily_to_wc_long`
+ ## 3. aggregate_daily_to_wc_long
  - **Description**: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.
  - **Usage**: `aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation='sum')`
  - **Example**: `aggregate_daily_to_wc_long(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average')`

  ---

- ## 4. `convert_monthly_to_daily`
+ ## 4. convert_monthly_to_daily
  - **Description**: Converts monthly data in a DataFrame to daily data by expanding and dividing the numeric values.
  - **Usage**: `convert_monthly_to_daily(df, date_column, divide)`
  - **Example**: `convert_monthly_to_daily(df, 'date')`

  ---

- ## 5. `plot_two`
+ ## 5. plot_two
  - **Description**: Plots specified columns from two different DataFrames using a shared date column. Useful for comparing data.
  - **Usage**: `plot_two(df1, col1, df2, col2, date_column, same_axis=True)`
  - **Example**: `plot_two(df1, 'cost', df2, 'cost', 'obs', True)`

  ---

- ## 6. `remove_nan_rows`
+ ## 6. remove_nan_rows
  - **Description**: Removes rows from a DataFrame where the specified column has NaN values.
  - **Usage**: `remove_nan_rows(df, col_to_remove_rows)`
  - **Example**: `remove_nan_rows(df, 'date')`

  ---

- ## 7. `filter_rows`
+ ## 7. filter_rows
  - **Description**: Filters the DataFrame based on whether the values in a specified column are in a provided list.
  - **Usage**: `filter_rows(df, col_to_filter, list_of_filters)`
  - **Example**: `filter_rows(df, 'country', ['UK', 'IE'])`

  ---

- ## 8. `plot_one`
+ ## 8. plot_one
  - **Description**: Plots a specified column from a DataFrame.
  - **Usage**: `plot_one(df1, col1, date_column)`
  - **Example**: `plot_one(df, 'Spend', 'OBS')`

  ---

- ## 9. `week_of_year_mapping`
+ ## 9. week_of_year_mapping
  - **Description**: Converts a week column in `yyyy-Www` or `yyyy-ww` format to week commencing date.
  - **Usage**: `week_of_year_mapping(df, week_col, start_day_str)`
  - **Example**: `week_of_year_mapping(df, 'week', 'mon')`

  ---

- ## 10. `exclude_rows`
+ ## 10. exclude_rows
  - **Description**: Removes rows from a DataFrame based on whether the values in a specified column are not in a provided list.
  - **Usage**: `exclude_rows(df, col_to_filter, list_of_filters)`
  - **Example**: `exclude_rows(df, 'week', ['2022-W20', '2022-W21'])`

  ---

- ## 11. `rename_cols`
+ ## 11. rename_cols
  - **Description**: Renames columns in a pandas DataFrame.
  - **Usage**: `rename_cols(df, name)`
  - **Example**: `rename_cols(df, 'ame_facebook')`

  ---

- ## 12. `merge_new_and_old`
+ ## 12. merge_new_and_old
  - **Description**: Creates a new DataFrame with two columns: one for dates and one for merged numeric values.
  - Merges numeric values from specified columns in the old and new DataFrames based on a given cutoff date.
  - **Usage**: `merge_new_and_old(old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS')`
@@ -133,21 +133,21 @@ The **IMSciences package** is a Python library designed to process incoming data

  ---

- ## 13. `merge_dataframes_on_date`
+ ## 13. merge_dataframes_on_date
  - **Description**: Merge a list of DataFrames on a common column.
  - **Usage**: `merge_dataframes_on_date(dataframes, common_column='OBS', merge_how='outer')`
  - **Example**: `merge_dataframes_on_date([df1, df2, df3], common_column='OBS', merge_how='outer')`

  ---

- ## 14. `merge_and_update_dfs`
+ ## 14. merge_and_update_dfs
  - **Description**: Merges two dataframes on a key column, updates the first dataframe's columns with the second's where available, and returns a dataframe sorted by the key column.
  - **Usage**: `merge_and_update_dfs(df1, df2, key_column)`
  - **Example**: `merge_and_update_dfs(processed_facebook, finalised_meta, 'OBS')`

  ---

- ## 15. `convert_us_to_uk_dates`
+ ## 15. convert_us_to_uk_dates
  - **Description**: Convert a DataFrame column with mixed date formats to datetime.
  - **Usage**: `convert_us_to_uk_dates(df, date_col)`
  - **Example**: `convert_us_to_uk_dates(df, 'date')`
@@ -161,189 +161,189 @@ The **IMSciences package** is a Python library designed to process incoming data

  ---

- ## 17. `pivot_table`
+ ## 17. pivot_table
  - **Description**: Dynamically pivots a DataFrame based on specified columns.
  - **Usage**: `pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name='Total', datetime_trans_needed=True, reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing='W-MON')`
  - **Example**: `pivot_table(df, 'OBS', 'Channel Short Names', 'Value', filters_dict={'Master Include': ' == 1', 'OBS': ' >= datetime(2019,9,9)', 'Metric Short Names': ' == spd'}, fill_value=0, aggfunc='sum', margins=False, margins_name='Total', datetime_trans_needed=True, reverse_header_order=True, fill_missing_weekly_dates=True, week_commencing='W-MON')`

  ---

- ## 18. `apply_lookup_table_for_columns`
+ ## 18. apply_lookup_table_for_columns
  - **Description**: Equivalent of XLOOKUP in Excel. Allows mapping of a dictionary of substrings within a column.
  - **Usage**: `apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict='Other', new_column_name='Mapping')`
  - **Example**: `apply_lookup_table_for_columns(df, col_names, {'spend': 'spd', 'clicks': 'clk'}, if_not_in_dict='Other', new_column_name='Metrics Short')`

  ---

- ## 19. `aggregate_daily_to_wc_wide`
+ ## 19. aggregate_daily_to_wc_wide
  - **Description**: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.
  - **Usage**: `aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc, aggregation='sum', include_totals=False)`
  - **Example**: `aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average', True)`

  ---

- ## 20. `merge_cols_with_seperator`
+ ## 20. merge_cols_with_seperator
  - **Description**: Merges multiple columns in a DataFrame into one column with a separator `_`. Useful for lookup tables.
  - **Usage**: `merge_cols_with_seperator(df, col_names, seperator='_', output_column_name='Merged', starting_prefix_str=None, ending_prefix_str=None)`
  - **Example**: `merge_cols_with_seperator(df, ['Campaign', 'Product'], seperator='|', output_column_name='Merged Columns', starting_prefix_str='start_', ending_prefix_str='_end')`

  ---

- ## 21. `check_sum_of_df_cols_are_equal`
+ ## 21. check_sum_of_df_cols_are_equal
  - **Description**: Checks if the sum of two columns in two DataFrames are the same, and provides the sums and differences.
  - **Usage**: `check_sum_of_df_cols_are_equal(df_1, df_2, cols_1, cols_2)`
  - **Example**: `check_sum_of_df_cols_are_equal(df_1, df_2, 'Media Cost', 'Spend')`

  ---

- ## 22. `convert_2_df_cols_to_dict`
+ ## 22. convert_2_df_cols_to_dict
  - **Description**: Creates a dictionary using two columns in a DataFrame.
  - **Usage**: `convert_2_df_cols_to_dict(df, key_col, value_col)`
  - **Example**: `convert_2_df_cols_to_dict(df, 'Campaign', 'Channel')`

  ---

- ## 23. `create_FY_and_H_columns`
+ ## 23. create_FY_and_H_columns
  - **Description**: Creates financial year, half-year, and financial half-year columns.
  - **Usage**: `create_FY_and_H_columns(df, index_col, start_date, starting_FY, short_format='No', half_years='No', combined_FY_and_H='No')`
  - **Example**: `create_FY_and_H_columns(df, 'Week (M-S)', '2022-10-03', 'FY2023', short_format='Yes', half_years='Yes', combined_FY_and_H='Yes')`

  ---

- ## 24. `keyword_lookup_replacement`
+ ## 24. keyword_lookup_replacement
  - **Description**: Updates chosen values in a specified column of the DataFrame based on a lookup dictionary.
  - **Usage**: `keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name='Updated Column')`
  - **Example**: `keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel', 'segment', 'product'], qlik_dict_for_channel, output_column_name='Channel New')`

  ---

- ## 25. `create_new_version_of_col_using_LUT`
+ ## 25. create_new_version_of_col_using_LUT
  - **Description**: Creates a new column in a DataFrame by mapping values from an old column using a lookup table.
  - **Usage**: `create_new_version_of_col_using_LUT(df, keys_col, value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')`
  - **Example**: `create_new_version_of_col_using_LUT(df, 'Campaign Name', 'Campaign Type', search_campaign_name_retag_lut, 'Campaign Name New')`

  ---

- ## 26. `convert_df_wide_2_long`
+ ## 26. convert_df_wide_2_long
  - **Description**: Converts a DataFrame from wide to long format.
  - **Usage**: `convert_df_wide_2_long(df, value_cols, variable_col_name='Stacked', value_col_name='Value')`
  - **Example**: `convert_df_wide_2_long(df, ['Media Cost', 'Impressions', 'Clicks'], variable_col_name='Metric')`

  ---

- ## 27. `manually_edit_data`
+ ## 27. manually_edit_data
  - **Description**: Enables manual updates to DataFrame cells by applying filters and editing a column.
  - **Usage**: `manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)`
  - **Example**: `manually_edit_data(df, {'OBS': ' <= datetime(2023,1,23)', 'File_Name': ' == France media'}, 'Master Include', 1, change_in_existing_df_col='Yes', new_col_to_change_name='Master Include', manual_edit_col_name='Manual Changes')`

  ---

- ## 28. `format_numbers_with_commas`
+ ## 28. format_numbers_with_commas
  - **Description**: Formats numeric data into numbers with commas and specified decimal places.
  - **Usage**: `format_numbers_with_commas(df, decimal_length_chosen=2)`
  - **Example**: `format_numbers_with_commas(df, 1)`

  ---

- ## 29. `filter_df_on_multiple_conditions`
+ ## 29. filter_df_on_multiple_conditions
  - **Description**: Filters a DataFrame based on multiple conditions from a dictionary.
  - **Usage**: `filter_df_on_multiple_conditions(df, filters_dict)`
  - **Example**: `filter_df_on_multiple_conditions(df, {'OBS': ' <= datetime(2023,1,23)', 'File_Name': ' == France media'})`

  ---

- ## 30. `read_and_concatenate_files`
+ ## 30. read_and_concatenate_files
  - **Description**: Reads and concatenates all files of a specified type in a folder.
  - **Usage**: `read_and_concatenate_files(folder_path, file_type='csv')`
  - **Example**: `read_and_concatenate_files(folder_path, file_type='csv')`

  ---

- ## 31. `remove_zero_values`
+ ## 31. remove_zero_values
  - **Description**: Removes rows with zero values in a specified column.
  - **Usage**: `remove_zero_values(data_frame, column_to_filter)`
  - **Example**: `remove_zero_values(df, 'Funeral_Delivery')`

  ---

- ## 32. `upgrade_outdated_packages`
+ ## 32. upgrade_outdated_packages
  - **Description**: Upgrades all outdated packages in the environment.
  - **Usage**: `upgrade_outdated_packages()`
  - **Example**: `upgrade_outdated_packages()`

  ---

- ## 33. `convert_mixed_formats_dates`
+ ## 33. convert_mixed_formats_dates
  - **Description**: Converts a mix of US and UK date formats to datetime.
  - **Usage**: `convert_mixed_formats_dates(df, date_col)`
  - **Example**: `convert_mixed_formats_dates(df, 'OBS')`

  ---

- ## 34. `fill_weekly_date_range`
+ ## 34. fill_weekly_date_range
  - **Description**: Fills in missing weeks with zero values.
  - **Usage**: `fill_weekly_date_range(df, date_column, freq)`
  - **Example**: `fill_weekly_date_range(df, 'OBS', 'W-MON')`

  ---

- ## 35. `add_prefix_and_suffix`
+ ## 35. add_prefix_and_suffix
  - **Description**: Adds prefixes and/or suffixes to column headers.
  - **Usage**: `add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)`
  - **Example**: `add_prefix_and_suffix(df, prefix='media_', suffix='_spd', date_col='obs')`

  ---

- ## 36. `create_dummies`
+ ## 36. create_dummies
  - **Description**: Converts time series into binary indicators based on a threshold.
  - **Usage**: `create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')`
  - **Example**: `create_dummies(df, date_col='obs', dummy_threshold=100, add_total_dummy_col='Yes', total_col_name='med_total_dum')`

  ---

- ## 37. `replace_substrings`
+ ## 37. replace_substrings
  - **Description**: Replaces substrings in a column of strings using a dictionary and can change column values to lowercase.
  - **Usage**: `replace_substrings(df, column, replacements, to_lower=False, new_column=None)`
  - **Example**: `replace_substrings(df, 'Influencer Handle', replacement_dict, to_lower=True, new_column='Short Version')`

  ---

- ## 38. `add_total_column`
+ ## 38. add_total_column
  - **Description**: Sums all columns (excluding a specified column) to create a total column.
  - **Usage**: `add_total_column(df, exclude_col=None, total_col_name='Total')`
  - **Example**: `add_total_column(df, exclude_col='obs', total_col_name='total_media_spd')`

  ---

- ## 39. `apply_lookup_table_based_on_substring`
+ ## 39. apply_lookup_table_based_on_substring
  - **Description**: Maps substrings in a column to values using a lookup dictionary.
  - **Usage**: `apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')`
  - **Example**: `apply_lookup_table_based_on_substring(df, 'Campaign Name', campaign_dict, new_col_name='Campaign Name Short', other_label='Full Funnel')`

  ---

- ## 40. `compare_overlap`
+ ## 40. compare_overlap
  - **Description**: Compares matching rows and columns in two DataFrames and outputs the differences.
  - **Usage**: `compare_overlap(df1, df2, date_col)`
  - **Example**: `compare_overlap(df_1, df_2, 'obs')`

  ---

- ## 41. `week_commencing_2_week_commencing_conversion`
+ ## 41. week_commencing_2_week_commencing_conversion
  - **Description**: Converts a week commencing column to a different start day.
  - **Usage**: `week_commencing_2_week_commencing_conversion(df, date_col, week_commencing='sun')`
  - **Example**: `week_commencing_2_week_commencing_conversion(df, 'obs', week_commencing='mon')`

  ---

- ## 42. `plot_chart`
+ ## 42. plot_chart
  - **Description**: Plots various chart types including line, area, scatter, and bar.
  - **Usage**: `plot_chart(df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values', **kwargs)`
  - **Example**: `plot_chart(df, 'obs', df.cols, chart_type='line', title='Spend Over Time', x_title='Date', y_title='Spend')`

  ---

- ## 43. `plot_two_with_common_cols`
+ ## 43. plot_two_with_common_cols
  - **Description**: Plots charts for two DataFrames based on common column names.
  - **Usage**: `plot_two_with_common_cols(df1, df2, date_column, same_axis=True)`
  - **Example**: `plot_two_with_common_cols(df_1, df_2, date_column='obs')`
@@ -411,7 +411,7 @@ The **IMSciences package** is a Python library designed to process incoming data
  Install the IMS package via pip:

  ```bash
- pip install ims-package
+ pip install imsciences
  ```

  ---
@@ -3,15 +3,15 @@ dataprocessing/data-processing-functions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nF
  dataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
  imsciences/__init__.py,sha256=7CfK2dMjPnBBw6I4st-20MdMlLjZULviFVXF2eMD9NI,80
  imsciences/datafunctions-IMS-24Ltp-3.py,sha256=3Snv-0iE_03StmyjtT-riOU9f4v8TaJWLoyZLJp6l8Y,141406
- imsciences/datafunctions.py,sha256=KbZuvjJF-1gydPsb2qFlvpbVLwuG6y-lhLKt-wZ5JDI,156389
+ imsciences/datafunctions.py,sha256=NGJ3j1HIXX2G2bE529Tlvq6AXaAxSye6YjobUF_QpL4,164562
  imsciences/datapull.py,sha256=TPY0LDgOkcKTBk8OekbD0Grg5x0SomAK2dZ7MuT6X1E,19000
  imsciences/unittesting.py,sha256=d9H5HN8y7oof59hqN9mGqkjulExqFd93BEW-X8w_Id8,58142
  imsciencesdataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_ai9JfTPk,32
  imsciencesdataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
  imsdataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_ai9JfTPk,32
  imsdataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
- imsciences-0.8.dist-info/METADATA,sha256=moylR64i_w4kk3TPPZMpFmAPc9f0A4xJgjAY-Zy-Tac,17845
- imsciences-0.8.dist-info/PKG-INFO-IMS-24Ltp-3,sha256=yqZbigwHjnYoqyI81PGz_AeofRFfOrwH_Vyawyef-mg,854
- imsciences-0.8.dist-info/WHEEL,sha256=ixB2d4u7mugx_bCBycvM9OzZ5yD7NmPXFRtKlORZS2Y,91
- imsciences-0.8.dist-info/top_level.txt,sha256=hsENS-AlDVRh8tQJ6-426iUQlla9bPcGc0-UlFF0_iU,11
- imsciences-0.8.dist-info/RECORD,,
+ imsciences-0.9.dist-info/METADATA,sha256=KZSjJgbi89Oon07qhMCo9nlP_kE3GIUeRM29vs50tds,17775
+ imsciences-0.9.dist-info/PKG-INFO-IMS-24Ltp-3,sha256=yqZbigwHjnYoqyI81PGz_AeofRFfOrwH_Vyawyef-mg,854
+ imsciences-0.9.dist-info/WHEEL,sha256=ixB2d4u7mugx_bCBycvM9OzZ5yD7NmPXFRtKlORZS2Y,91
+ imsciences-0.9.dist-info/top_level.txt,sha256=hsENS-AlDVRh8tQJ6-426iUQlla9bPcGc0-UlFF0_iU,11
+ imsciences-0.9.dist-info/RECORD,,