imsciences 0.8.1__tar.gz → 0.9__tar.gz

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of imsciences might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: imsciences
-Version: 0.8.1
+Version: 0.9
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com
@@ -39,11 +39,11 @@ The **IMSciences package** is a Python library designed to process incoming data
 Table of Contents
 =================
 
-1. `Data Processing <#data-processing>`_
-2. `Data Pulling <#data-pulling>`_
-3. `Installation <#installation>`_
-4. `Usage <#usage>`_
-5. `License <#license>`_
+1. [Data Processing](#Data-Processing)
+2. [Data Pulling](#Data-Pulling)
+3. [Installation](#Installation)
+4. [Useage](#Useage)
+5. [License](#License)
 
 ---
 
@@ -14,11 +14,11 @@ The **IMSciences package** is a Python library designed to process incoming data
 Table of Contents
 =================
 
-1. `Data Processing <#data-processing>`_
-2. `Data Pulling <#data-pulling>`_
-3. `Installation <#installation>`_
-4. `Usage <#usage>`_
-5. `License <#license>`_
+1. [Data Processing](#Data-Processing)
+2. [Data Pulling](#Data-Pulling)
+3. [Installation](#Installation)
+4. [Useage](#Useage)
+5. [License](#License)
 
 ---
 
@@ -16,6 +16,7 @@ import xml.etree.ElementTree as ET
 from bs4 import BeautifulSoup
 import yfinance as yf
 import holidays
+from dateutil.easter import easter
 
 class dataprocessing:
 
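The only functional change in the hunk above is the new `dateutil.easter` import, which the seasonality code further down uses to derive Easter-anchored dates. A minimal illustration of what that helper returns (2024 is shown because its Easter dates are well known):

```python
from datetime import timedelta
from dateutil.easter import easter

# easter(year) returns a datetime.date for (Western) Easter Sunday.
print(easter(2024))                      # 2024-03-31
print(easter(2024) - timedelta(days=2))  # Good Friday: 2024-03-29
print(easter(2024) + timedelta(days=1))  # Easter Monday: 2024-04-01
```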
@@ -2123,27 +2124,31 @@ class datapull:
     ############################################################### Seasonality ##########################################################################
 
     def pull_seasonality(self, week_commencing, start_date, countries):
-        # Week commencing dictionary
+        # ---------------------------------------------------------------------
+        # 0. Setup: dictionary for 'week_commencing' to Python weekday() integer
+        # ---------------------------------------------------------------------
         day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
-
-        # Create daily date range dataframe starting from start_date
+
+        # ---------------------------------------------------------------------
+        # 1. Create daily date range from start_date to today
+        # ---------------------------------------------------------------------
         date_range = pd.date_range(
             start=pd.to_datetime(start_date),
             end=datetime.today(),
             freq="D"
         )
         df_daily = pd.DataFrame(date_range, columns=["Date"])
-
-        # ------------------------------------------------
-        # 1. Identify "week_start" for each daily row
-        # ------------------------------------------------
+
+        # ---------------------------------------------------------------------
+        # 1.1 Identify "week_start" for each daily row, based on week_commencing
+        # ---------------------------------------------------------------------
         df_daily['week_start'] = df_daily["Date"].apply(
             lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
         )
-
-        # ------------------------------------------------
+
+        # ---------------------------------------------------------------------
         # 2. Build a weekly index (df_weekly_start) with dummy columns
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         df_weekly_start = df_daily[['week_start']].drop_duplicates().reset_index(drop=True)
         df_weekly_start.rename(columns={'week_start': "Date"}, inplace=True)
 
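The lambda added in the hunk above snaps every daily date back to the most recent occurrence of the chosen week-commencing day. A small standalone check of that modular arithmetic (the wrapper function and sample dates are illustrative, not part of the package):

```python
import pandas as pd

day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}

def week_start(date: pd.Timestamp, week_commencing: str) -> pd.Timestamp:
    """Snap a date back to the most recent week-commencing day (possibly itself)."""
    return date - pd.Timedelta(days=(date.weekday() - day_dict[week_commencing]) % 7)

# 2025-01-08 is a Wednesday: with weeks commencing Monday it snaps to 2025-01-06,
# and with weeks commencing Sunday it snaps to 2025-01-05.
assert week_start(pd.Timestamp("2025-01-08"), "mon") == pd.Timestamp("2025-01-06")
assert week_start(pd.Timestamp("2025-01-08"), "sun") == pd.Timestamp("2025-01-05")
```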
@@ -2160,10 +2165,10 @@ class datapull:
 
         df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
         df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
-
-        # ------------------------------------------------
-        # 3. Public holidays (daily) and specific holiday columns
-        # ------------------------------------------------
+
+        # ---------------------------------------------------------------------
+        # 3. Public holidays (daily) from 'holidays' package + each holiday name
+        # ---------------------------------------------------------------------
         for country in countries:
             country_holidays = holidays.CountryHoliday(
                 country,
@@ -2180,10 +2185,124 @@ class datapull:
                 df_daily[col_name] = 0
                 df_daily.loc[df_daily["Date"] == pd.Timestamp(date_hol), col_name] = 1
 
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
+        # 3.1 Additional Special Days (Father's Day, Mother's Day, etc.)
+        # We'll add daily columns for each.
+        # ---------------------------------------------------------------------
+        # Initialize columns
+        extra_cols = [
+            "seas_valentines_day",
+            "seas_halloween",
+            "seas_fathers_day_us_uk",
+            "seas_mothers_day_us",
+            "seas_mothers_day_uk",
+            "seas_good_friday",
+            "seas_easter_monday",
+            "seas_black_friday",
+            "seas_cyber_monday",
+        ]
+        for c in extra_cols:
+            df_daily[c] = 0 # default zero
+
+        # Helper: nth_weekday_of_month(year, month, weekday, nth=1 => first, 2 => second, etc.)
+        # weekday: Monday=0, Tuesday=1, ... Sunday=6
+        def nth_weekday_of_month(year, month, weekday, nth):
+            """
+            Returns date of the nth <weekday> in <month> of <year>.
+            E.g. nth_weekday_of_month(2023, 6, 6, 3) => 3rd Sunday of June 2023.
+            """
+            # 1st day of the month
+            d = datetime(year, month, 1)
+            # What is the weekday of day #1?
+            w = d.weekday() # Monday=0, Tuesday=1, ... Sunday=6
+            # If we want, e.g. Sunday=6, we see how many days to add
+            delta = (weekday - w) % 7
+            # This is the first <weekday> in that month
+            first_weekday = d + timedelta(days=delta)
+            # Now add 7*(nth-1) days
+            return first_weekday + timedelta(days=7 * (nth-1))
+
+        def get_good_friday(year):
+            """Good Friday is 2 days before Easter Sunday."""
+            return easter(year) - timedelta(days=2)
+
+        def get_easter_monday(year):
+            """Easter Monday is 1 day after Easter Sunday."""
+            return easter(year) + timedelta(days=1)
+
+        def get_black_friday(year):
+            """
+            Black Friday = day after US Thanksgiving,
+            and US Thanksgiving is the 4th Thursday in November.
+            """
+            # 4th Thursday in November
+            fourth_thursday = nth_weekday_of_month(year, 11, 3, 4) # weekday=3 => Thursday
+            return fourth_thursday + timedelta(days=1)
+
+        def get_cyber_monday(year):
+            """Cyber Monday = Monday after US Thanksgiving, i.e. 4 days after 4th Thursday in Nov."""
+            # 4th Thursday in November
+            fourth_thursday = nth_weekday_of_month(year, 11, 3, 4)
+            return fourth_thursday + timedelta(days=4) # Monday after Thanksgiving
+
+        # Loop over each year in range
+        start_yr = int(start_date[:4])
+        end_yr = datetime.today().year
+
+        for yr in range(start_yr, end_yr + 1):
+            # Valentines = Feb 14
+            valentines_day = datetime(yr, 2, 14)
+            # Halloween = Oct 31
+            halloween_day = datetime(yr, 10, 31)
+            # Father's Day (US & UK) = 3rd Sunday in June
+            fathers_day = nth_weekday_of_month(yr, 6, 6, 3) # Sunday=6
+            # Mother's Day US = 2nd Sunday in May
+            mothers_day_us = nth_weekday_of_month(yr, 5, 6, 2)
+            # Mother's Day UK: 4th Sunday in Lent => "Mothering Sunday"
+            # We can approximate as: Easter Sunday - 21 days
+            # BUT we also must ensure it's actually Sunday
+            # (the 4th Sunday in Lent can shift. We'll do the official approach below.)
+            # Another approach: Easter Sunday - 7 * (4 weeks) is the 4th Sunday prior to Easter.
+            # But that might overshoot if Lent started mid-week.
+            # Let's do a quick approach:
+            # Officially: Mothering Sunday = 3 weeks before Easter Sunday (the 4th Sunday is Easter Sunday itself).
+            # So Easter - 21 days should be the Sunday, but let's confirm with weekday check.
+            mothering_sunday = easter(yr) - timedelta(days=21)
+            # If for some reason that's not a Sunday (rare corner cases), shift to Sunday:
+            while mothering_sunday.weekday() != 6: # Sunday=6
+                mothering_sunday -= timedelta(days=1)
+
+            # Good Friday, Easter Monday
+            gf = get_good_friday(yr)
+            em = get_easter_monday(yr)
+
+            # Black Friday, Cyber Monday
+            bf = get_black_friday(yr)
+            cm = get_cyber_monday(yr)
+
+            # Mark them in df_daily if in range
+            for special_date, col in [
+                (valentines_day, "seas_valentines_day"),
+                (halloween_day, "seas_halloween"),
+                (fathers_day, "seas_fathers_day_us_uk"),
+                (mothers_day_us, "seas_mothers_day_us"),
+                (mothering_sunday, "seas_mothers_day_uk"),
+                (gf, "seas_good_friday"),
+                (em, "seas_easter_monday"),
+                (bf, "seas_black_friday"),
+                (cm, "seas_cyber_monday"),
+            ]:
+                # Convert to pd.Timestamp:
+                special_ts = pd.Timestamp(special_date)
+
+                # Only set if it's within your daily range
+                if (special_ts >= df_daily["Date"].min()) and (special_ts <= df_daily["Date"].max()):
+                    df_daily.loc[df_daily["Date"] == special_ts, col] = 1
+
+        # ---------------------------------------------------------------------
         # 4. Add daily indicators for last day & last Friday of month
         # Then aggregate them to weekly level using .max()
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         # Last day of month (daily)
         df_daily["seas_last_day_of_month"] = df_daily["Date"].apply(
             lambda d: 1 if d == d.to_period("M").to_timestamp("M") else 0
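The nth-weekday arithmetic introduced in the hunk above is the part most worth sanity-checking, since every movable special day except the Easter-derived ones flows through it. A standalone sketch that mirrors (rather than imports) the nested helper, checked against well-known 2023 dates:

```python
from datetime import datetime, timedelta
from dateutil.easter import easter

def nth_weekday_of_month(year, month, weekday, nth):
    """Return the nth <weekday> (Mon=0 .. Sun=6) of the given month."""
    first = datetime(year, month, 1)
    offset = (weekday - first.weekday()) % 7   # days from the 1st to the first such weekday
    return first + timedelta(days=offset + 7 * (nth - 1))

# Spot-checks against known 2023 dates:
assert nth_weekday_of_month(2023, 6, 6, 3) == datetime(2023, 6, 18)     # Father's Day (US/UK): 3rd Sunday of June
assert nth_weekday_of_month(2023, 11, 3, 4) == datetime(2023, 11, 23)   # US Thanksgiving: 4th Thursday of November
assert nth_weekday_of_month(2023, 11, 3, 4) + timedelta(days=1) == datetime(2023, 11, 24)  # Black Friday
assert easter(2023) - timedelta(days=21) == datetime(2023, 3, 19).date()  # Mothering Sunday (UK)
```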
@@ -2193,8 +2312,8 @@ class datapull:
         def is_last_friday(date):
             # last day of the month
             last_day_of_month = date.to_period("M").to_timestamp("M")
-            last_day_weekday = last_day_of_month.dayofweek
-            # Determine how many days we go back from the last day to get Friday
+            last_day_weekday = last_day_of_month.weekday() # Monday=0,...Sunday=6
+            # Determine how many days we go back from the last day to get Friday (weekday=4)
             if last_day_weekday >= 4:
                 days_to_subtract = last_day_weekday - 4
             else:
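The rewritten lines above only swap `dayofweek` for `weekday()` (both count Monday as 0 on a pandas Timestamp), but the step-back arithmetic is easy to get wrong, so here is an isolated check. The month-end step is written with `pd.offsets.MonthEnd(0)` for brevity rather than the `to_period`/`to_timestamp` chain used in the method:

```python
import pandas as pd

def last_friday_of_month(ts: pd.Timestamp) -> pd.Timestamp:
    """Return the last Friday (weekday 4) of the month containing ts."""
    last_day = ts + pd.offsets.MonthEnd(0)   # roll forward to the month end
    w = last_day.weekday()                   # Monday=0 .. Sunday=6
    back = w - 4 if w >= 4 else w + 3        # days to step back to reach a Friday
    return last_day - pd.Timedelta(days=back)

# December 2024 ends on a Tuesday (the 31st), so its last Friday is the 27th.
assert last_friday_of_month(pd.Timestamp("2024-12-10")) == pd.Timestamp("2024-12-27")
```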
@@ -2204,10 +2323,9 @@ class datapull:
 
         df_daily["seas_last_friday_of_month"] = df_daily["Date"].apply(is_last_friday)
 
-        # ------------------------------------------------
-        # 5. Weekly aggregation for HOLIDAYS & monthly dummies
-        # (Using .max() for holiday indicators so they become binary)
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
+        # 5. Weekly aggregation for holiday columns & monthly dummies
+        # ---------------------------------------------------------------------
         # For monthly dummies, create a daily col "Month", then get_dummies
         df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
         df_monthly_dummies = pd.get_dummies(
@@ -2218,8 +2336,8 @@ class datapull:
         )
         # Recalculate 'week_start' (already in df_daily, but just to be sure)
         df_monthly_dummies['week_start'] = df_daily['week_start']
-
-        # Group monthly dummies by .sum() or .mean()—often we average across the week
+
+        # Group monthly dummies by .sum() or .mean()—we often spread them across the week
         df_monthly_dummies = (
             df_monthly_dummies
             .groupby('week_start')
@@ -2228,33 +2346,30 @@ class datapull:
             .rename(columns={'week_start': "Date"})
             .set_index("Date")
         )
-        # Divide the monthly dummy columns by 7 to spread them across the week
-        monthly_cols = [
-            c for c in df_monthly_dummies.columns
-            if c.startswith("seas_month_")
-        ]
+        # Spread monthly dummies by 7 to distribute across that week
+        monthly_cols = [c for c in df_monthly_dummies.columns if c.startswith("seas_month_")]
         df_monthly_dummies[monthly_cols] = df_monthly_dummies[monthly_cols] / 7
-
-        # Group holiday columns (and last-day-of-month columns) by .max() => binary
+
+        # Group holiday & special-day columns by .max() => binary at weekly level
         df_holidays = (
             df_daily
             .groupby('week_start')
-            .max(numeric_only=True) # use max => if any day=1, entire week=1
+            .max(numeric_only=True) # if any day=1 in that week, entire week=1
             .reset_index()
             .rename(columns={'week_start': "Date"})
             .set_index("Date")
         )
-
-        # ------------------------------------------------
+
+        # ---------------------------------------------------------------------
         # 6. Combine weekly start, monthly dummies, holiday flags
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
         df_combined = pd.concat([df_combined, df_holidays], axis=1)
         df_combined = df_combined.loc[:, ~df_combined.columns.duplicated()]
 
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         # 7. Create weekly dummies for Week of Year & yearly dummies
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         df_combined.reset_index(inplace=True)
         df_combined.rename(columns={"index": "old_index"}, inplace=True) # just in case
 
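Two aggregation rules drive the weekly roll-up in the hunks above: the monthly dummies are grouped per week and divided by 7 so they read as the fraction of the week falling in each month (assuming the group step, which falls outside the visible hunks, sums the daily flags, as the division by 7 suggests), while holiday and special-day flags are collapsed with `.max()` so any flagged day marks the whole week. A toy reproduction of both rules on made-up data:

```python
import pandas as pd

# One week commencing Monday 2025-01-27: five days in January, two in February.
daily = pd.DataFrame({
    "week_start": pd.to_datetime(["2025-01-27"] * 7),
    "Date": pd.date_range("2025-01-27", periods=7, freq="D"),
})
daily["seas_month_january"] = (daily["Date"].dt.month == 1).astype(int)
daily["seas_month_february"] = (daily["Date"].dt.month == 2).astype(int)
daily["seas_valentines_day"] = 0   # no special day falls in this week

# Monthly dummies: sum the daily flags, then divide by 7 -> fraction of the week in each month.
monthly = daily.groupby("week_start")[["seas_month_january", "seas_month_february"]].sum() / 7

# Holiday / special-day flags: max -> 1 if any day in the week is flagged.
flags = daily.groupby("week_start")[["seas_valentines_day"]].max()

print(monthly)   # january = 5/7, february = 2/7
print(flags)     # seas_valentines_day stays 0 for this week
```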
@@ -2264,18 +2379,19 @@ class datapull:
         df_combined["Year"] = df_combined["Date"].dt.year
         df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int)
 
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         # 8. Add constant & trend
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         df_combined["Constant"] = 1
         df_combined["Trend"] = df_combined.index + 1
 
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         # 9. Rename Date -> OBS and return
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         df_combined.rename(columns={"Date": "OBS"}, inplace=True)
 
         return df_combined
+
 
     def pull_weather(self, week_commencing, country) -> pd.DataFrame:
         import pandas as pd
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: imsciences
-Version: 0.8.1
+Version: 0.9
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com
@@ -39,11 +39,11 @@ The **IMSciences package** is a Python library designed to process incoming data
 Table of Contents
 =================
 
-1. `Data Processing <#data-processing>`_
-2. `Data Pulling <#data-pulling>`_
-3. `Installation <#installation>`_
-4. `Usage <#usage>`_
-5. `License <#license>`_
+1. [Data Processing](#Data-Processing)
+2. [Data Pulling](#Data-Pulling)
+3. [Installation](#Installation)
+4. [Useage](#Useage)
+5. [License](#License)
 
 ---
 
@@ -8,7 +8,7 @@ def read_md(file_name):
         return f.read()
     return ''
 
-VERSION = '0.8.1'
+VERSION = '0.9'
 DESCRIPTION = 'IMS Data Processing Package'
 LONG_DESCRIPTION = read_md('README.md')
 