imsciences 0.8.1__tar.gz → 0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of imsciences might be problematic. Click here for more details.
- {imsciences-0.8.1 → imsciences-0.9}/PKG-INFO +6 -6
- {imsciences-0.8.1 → imsciences-0.9}/README.md +5 -5
- {imsciences-0.8.1 → imsciences-0.9}/imsciences/datafunctions.py +157 -41
- {imsciences-0.8.1 → imsciences-0.9}/imsciences.egg-info/PKG-INFO +6 -6
- {imsciences-0.8.1 → imsciences-0.9}/setup.py +1 -1
- {imsciences-0.8.1 → imsciences-0.9}/imsciences/__init__.py +0 -0
- {imsciences-0.8.1 → imsciences-0.9}/imsciences/unittesting.py +0 -0
- {imsciences-0.8.1 → imsciences-0.9}/imsciences.egg-info/PKG-INFO-IMS-24Ltp-3 +0 -0
- {imsciences-0.8.1 → imsciences-0.9}/imsciences.egg-info/SOURCES.txt +0 -0
- {imsciences-0.8.1 → imsciences-0.9}/imsciences.egg-info/dependency_links.txt +0 -0
- {imsciences-0.8.1 → imsciences-0.9}/imsciences.egg-info/requires.txt +0 -0
- {imsciences-0.8.1 → imsciences-0.9}/imsciences.egg-info/top_level.txt +0 -0
- {imsciences-0.8.1 → imsciences-0.9}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: imsciences
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.9
|
|
4
4
|
Summary: IMS Data Processing Package
|
|
5
5
|
Author: IMS
|
|
6
6
|
Author-email: cam@im-sciences.com
|
|
@@ -39,11 +39,11 @@ The **IMSciences package** is a Python library designed to process incoming data
|
|
|
39
39
|
Table of Contents
|
|
40
40
|
=================
|
|
41
41
|
|
|
42
|
-
1.
|
|
43
|
-
2.
|
|
44
|
-
3.
|
|
45
|
-
4.
|
|
46
|
-
5.
|
|
42
|
+
1. [Data Processing](#Data-Processing)
|
|
43
|
+
2. [Data Pulling](#Data-Pulling)
|
|
44
|
+
3. [Installation](#Installation)
|
|
45
|
+
4. [Usage](#Useage)
|
|
46
|
+
5. [License](#License)
|
|
47
47
|
|
|
48
48
|
---
|
|
49
49
|
|
|
@@ -14,11 +14,11 @@ The **IMSciences package** is a Python library designed to process incoming data
|
|
|
14
14
|
Table of Contents
|
|
15
15
|
=================
|
|
16
16
|
|
|
17
|
-
1.
|
|
18
|
-
2.
|
|
19
|
-
3.
|
|
20
|
-
4.
|
|
21
|
-
5.
|
|
17
|
+
1. [Data Processing](#Data-Processing)
|
|
18
|
+
2. [Data Pulling](#Data-Pulling)
|
|
19
|
+
3. [Installation](#Installation)
|
|
20
|
+
4. [Usage](#Useage)
|
|
21
|
+
5. [License](#License)
|
|
22
22
|
|
|
23
23
|
---
|
|
24
24
|
|
|
@@ -16,6 +16,7 @@ import xml.etree.ElementTree as ET
|
|
|
16
16
|
from bs4 import BeautifulSoup
|
|
17
17
|
import yfinance as yf
|
|
18
18
|
import holidays
|
|
19
|
+
from dateutil.easter import easter
|
|
19
20
|
|
|
20
21
|
class dataprocessing:
|
|
21
22
|
|
|
@@ -2123,27 +2124,31 @@ class datapull:
|
|
|
2123
2124
|
############################################################### Seasonality ##########################################################################
|
|
2124
2125
|
|
|
2125
2126
|
def pull_seasonality(self, week_commencing, start_date, countries):
|
|
2126
|
-
#
|
|
2127
|
+
# ---------------------------------------------------------------------
|
|
2128
|
+
# 0. Setup: dictionary for 'week_commencing' to Python weekday() integer
|
|
2129
|
+
# ---------------------------------------------------------------------
|
|
2127
2130
|
day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
|
|
2128
|
-
|
|
2129
|
-
#
|
|
2131
|
+
|
|
2132
|
+
# ---------------------------------------------------------------------
|
|
2133
|
+
# 1. Create daily date range from start_date to today
|
|
2134
|
+
# ---------------------------------------------------------------------
|
|
2130
2135
|
date_range = pd.date_range(
|
|
2131
2136
|
start=pd.to_datetime(start_date),
|
|
2132
2137
|
end=datetime.today(),
|
|
2133
2138
|
freq="D"
|
|
2134
2139
|
)
|
|
2135
2140
|
df_daily = pd.DataFrame(date_range, columns=["Date"])
|
|
2136
|
-
|
|
2137
|
-
#
|
|
2138
|
-
# 1. Identify "week_start" for each daily row
|
|
2139
|
-
#
|
|
2141
|
+
|
|
2142
|
+
# ---------------------------------------------------------------------
|
|
2143
|
+
# 1.1 Identify "week_start" for each daily row, based on week_commencing
|
|
2144
|
+
# ---------------------------------------------------------------------
|
|
2140
2145
|
df_daily['week_start'] = df_daily["Date"].apply(
|
|
2141
2146
|
lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
|
|
2142
2147
|
)
|
|
2143
|
-
|
|
2144
|
-
#
|
|
2148
|
+
|
|
2149
|
+
# ---------------------------------------------------------------------
|
|
2145
2150
|
# 2. Build a weekly index (df_weekly_start) with dummy columns
|
|
2146
|
-
#
|
|
2151
|
+
# ---------------------------------------------------------------------
|
|
2147
2152
|
df_weekly_start = df_daily[['week_start']].drop_duplicates().reset_index(drop=True)
|
|
2148
2153
|
df_weekly_start.rename(columns={'week_start': "Date"}, inplace=True)
|
|
2149
2154
|
|
|
@@ -2160,10 +2165,10 @@ class datapull:
|
|
|
2160
2165
|
|
|
2161
2166
|
df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
|
|
2162
2167
|
df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
|
|
2163
|
-
|
|
2164
|
-
#
|
|
2165
|
-
# 3. Public holidays (daily)
|
|
2166
|
-
#
|
|
2168
|
+
|
|
2169
|
+
# ---------------------------------------------------------------------
|
|
2170
|
+
# 3. Public holidays (daily) from 'holidays' package + each holiday name
|
|
2171
|
+
# ---------------------------------------------------------------------
|
|
2167
2172
|
for country in countries:
|
|
2168
2173
|
country_holidays = holidays.CountryHoliday(
|
|
2169
2174
|
country,
|
|
@@ -2180,10 +2185,124 @@ class datapull:
|
|
|
2180
2185
|
df_daily[col_name] = 0
|
|
2181
2186
|
df_daily.loc[df_daily["Date"] == pd.Timestamp(date_hol), col_name] = 1
|
|
2182
2187
|
|
|
2183
|
-
#
|
|
2188
|
+
# ---------------------------------------------------------------------
|
|
2189
|
+
# 3.1 Additional Special Days (Father's Day, Mother's Day, etc.)
|
|
2190
|
+
# We'll add daily columns for each.
|
|
2191
|
+
# ---------------------------------------------------------------------
|
|
2192
|
+
# Initialize columns
|
|
2193
|
+
extra_cols = [
|
|
2194
|
+
"seas_valentines_day",
|
|
2195
|
+
"seas_halloween",
|
|
2196
|
+
"seas_fathers_day_us_uk",
|
|
2197
|
+
"seas_mothers_day_us",
|
|
2198
|
+
"seas_mothers_day_uk",
|
|
2199
|
+
"seas_good_friday",
|
|
2200
|
+
"seas_easter_monday",
|
|
2201
|
+
"seas_black_friday",
|
|
2202
|
+
"seas_cyber_monday",
|
|
2203
|
+
]
|
|
2204
|
+
for c in extra_cols:
|
|
2205
|
+
df_daily[c] = 0 # default zero
|
|
2206
|
+
|
|
2207
|
+
# Helper: nth_weekday_of_month(year, month, weekday, nth=1 => first, 2 => second, etc.)
|
|
2208
|
+
# weekday: Monday=0, Tuesday=1, ... Sunday=6
|
|
2209
|
+
def nth_weekday_of_month(year, month, weekday, nth):
|
|
2210
|
+
"""
|
|
2211
|
+
Returns date of the nth <weekday> in <month> of <year>.
|
|
2212
|
+
E.g. nth_weekday_of_month(2023, 6, 6, 3) => 3rd Sunday of June 2023.
|
|
2213
|
+
"""
|
|
2214
|
+
# 1st day of the month
|
|
2215
|
+
d = datetime(year, month, 1)
|
|
2216
|
+
# What is the weekday of day #1?
|
|
2217
|
+
w = d.weekday() # Monday=0, Tuesday=1, ... Sunday=6
|
|
2218
|
+
# If we want, e.g. Sunday=6, we see how many days to add
|
|
2219
|
+
delta = (weekday - w) % 7
|
|
2220
|
+
# This is the first <weekday> in that month
|
|
2221
|
+
first_weekday = d + timedelta(days=delta)
|
|
2222
|
+
# Now add 7*(nth-1) days
|
|
2223
|
+
return first_weekday + timedelta(days=7 * (nth-1))
|
|
2224
|
+
|
|
2225
|
+
def get_good_friday(year):
|
|
2226
|
+
"""Good Friday is 2 days before Easter Sunday."""
|
|
2227
|
+
return easter(year) - timedelta(days=2)
|
|
2228
|
+
|
|
2229
|
+
def get_easter_monday(year):
|
|
2230
|
+
"""Easter Monday is 1 day after Easter Sunday."""
|
|
2231
|
+
return easter(year) + timedelta(days=1)
|
|
2232
|
+
|
|
2233
|
+
def get_black_friday(year):
|
|
2234
|
+
"""
|
|
2235
|
+
Black Friday = day after US Thanksgiving,
|
|
2236
|
+
and US Thanksgiving is the 4th Thursday in November.
|
|
2237
|
+
"""
|
|
2238
|
+
# 4th Thursday in November
|
|
2239
|
+
fourth_thursday = nth_weekday_of_month(year, 11, 3, 4) # weekday=3 => Thursday
|
|
2240
|
+
return fourth_thursday + timedelta(days=1)
|
|
2241
|
+
|
|
2242
|
+
def get_cyber_monday(year):
|
|
2243
|
+
"""Cyber Monday = Monday after US Thanksgiving, i.e. 4 days after 4th Thursday in Nov."""
|
|
2244
|
+
# 4th Thursday in November
|
|
2245
|
+
fourth_thursday = nth_weekday_of_month(year, 11, 3, 4)
|
|
2246
|
+
return fourth_thursday + timedelta(days=4) # Monday after Thanksgiving
|
|
2247
|
+
|
|
2248
|
+
# Loop over each year in range
|
|
2249
|
+
start_yr = int(start_date[:4])
|
|
2250
|
+
end_yr = datetime.today().year
|
|
2251
|
+
|
|
2252
|
+
for yr in range(start_yr, end_yr + 1):
|
|
2253
|
+
# Valentines = Feb 14
|
|
2254
|
+
valentines_day = datetime(yr, 2, 14)
|
|
2255
|
+
# Halloween = Oct 31
|
|
2256
|
+
halloween_day = datetime(yr, 10, 31)
|
|
2257
|
+
# Father's Day (US & UK) = 3rd Sunday in June
|
|
2258
|
+
fathers_day = nth_weekday_of_month(yr, 6, 6, 3) # Sunday=6
|
|
2259
|
+
# Mother's Day US = 2nd Sunday in May
|
|
2260
|
+
mothers_day_us = nth_weekday_of_month(yr, 5, 6, 2)
|
|
2261
|
+
# Mother's Day UK: 4th Sunday in Lent => "Mothering Sunday"
|
|
2262
|
+
# We can approximate as: Easter Sunday - 21 days
|
|
2263
|
+
# BUT we also must ensure it's actually Sunday
|
|
2264
|
+
# (the 4th Sunday in Lent can shift. We'll do the official approach below.)
|
|
2265
|
+
# Another approach: Easter Sunday - 7 * (4 weeks) is the 4th Sunday prior to Easter.
|
|
2266
|
+
# But that might overshoot if Lent started mid-week.
|
|
2267
|
+
# Let's do a quick approach:
|
|
2268
|
+
# Officially: Mothering Sunday is the 4th Sunday in Lent, which falls exactly 3 weeks (21 days) before Easter Sunday.
|
|
2269
|
+
# So Easter - 21 days should be the Sunday, but let's confirm with weekday check.
|
|
2270
|
+
mothering_sunday = easter(yr) - timedelta(days=21)
|
|
2271
|
+
# If for some reason that's not a Sunday (rare corner cases), shift to Sunday:
|
|
2272
|
+
while mothering_sunday.weekday() != 6: # Sunday=6
|
|
2273
|
+
mothering_sunday -= timedelta(days=1)
|
|
2274
|
+
|
|
2275
|
+
# Good Friday, Easter Monday
|
|
2276
|
+
gf = get_good_friday(yr)
|
|
2277
|
+
em = get_easter_monday(yr)
|
|
2278
|
+
|
|
2279
|
+
# Black Friday, Cyber Monday
|
|
2280
|
+
bf = get_black_friday(yr)
|
|
2281
|
+
cm = get_cyber_monday(yr)
|
|
2282
|
+
|
|
2283
|
+
# Mark them in df_daily if in range
|
|
2284
|
+
for special_date, col in [
|
|
2285
|
+
(valentines_day, "seas_valentines_day"),
|
|
2286
|
+
(halloween_day, "seas_halloween"),
|
|
2287
|
+
(fathers_day, "seas_fathers_day_us_uk"),
|
|
2288
|
+
(mothers_day_us, "seas_mothers_day_us"),
|
|
2289
|
+
(mothering_sunday, "seas_mothers_day_uk"),
|
|
2290
|
+
(gf, "seas_good_friday"),
|
|
2291
|
+
(em, "seas_easter_monday"),
|
|
2292
|
+
(bf, "seas_black_friday"),
|
|
2293
|
+
(cm, "seas_cyber_monday"),
|
|
2294
|
+
]:
|
|
2295
|
+
# Convert to pd.Timestamp:
|
|
2296
|
+
special_ts = pd.Timestamp(special_date)
|
|
2297
|
+
|
|
2298
|
+
# Only set if it's within your daily range
|
|
2299
|
+
if (special_ts >= df_daily["Date"].min()) and (special_ts <= df_daily["Date"].max()):
|
|
2300
|
+
df_daily.loc[df_daily["Date"] == special_ts, col] = 1
|
|
2301
|
+
|
|
2302
|
+
# ---------------------------------------------------------------------
|
|
2184
2303
|
# 4. Add daily indicators for last day & last Friday of month
|
|
2185
2304
|
# Then aggregate them to weekly level using .max()
|
|
2186
|
-
#
|
|
2305
|
+
# ---------------------------------------------------------------------
|
|
2187
2306
|
# Last day of month (daily)
|
|
2188
2307
|
df_daily["seas_last_day_of_month"] = df_daily["Date"].apply(
|
|
2189
2308
|
lambda d: 1 if d == d.to_period("M").to_timestamp("M") else 0
|
|
@@ -2193,8 +2312,8 @@ class datapull:
|
|
|
2193
2312
|
def is_last_friday(date):
|
|
2194
2313
|
# last day of the month
|
|
2195
2314
|
last_day_of_month = date.to_period("M").to_timestamp("M")
|
|
2196
|
-
last_day_weekday = last_day_of_month.
|
|
2197
|
-
# Determine how many days we go back from the last day to get Friday
|
|
2315
|
+
last_day_weekday = last_day_of_month.weekday() # Monday=0,...Sunday=6
|
|
2316
|
+
# Determine how many days we go back from the last day to get Friday (weekday=4)
|
|
2198
2317
|
if last_day_weekday >= 4:
|
|
2199
2318
|
days_to_subtract = last_day_weekday - 4
|
|
2200
2319
|
else:
|
|
@@ -2204,10 +2323,9 @@ class datapull:
|
|
|
2204
2323
|
|
|
2205
2324
|
df_daily["seas_last_friday_of_month"] = df_daily["Date"].apply(is_last_friday)
|
|
2206
2325
|
|
|
2207
|
-
#
|
|
2208
|
-
# 5. Weekly aggregation for
|
|
2209
|
-
#
|
|
2210
|
-
# ------------------------------------------------
|
|
2326
|
+
# ---------------------------------------------------------------------
|
|
2327
|
+
# 5. Weekly aggregation for holiday columns & monthly dummies
|
|
2328
|
+
# ---------------------------------------------------------------------
|
|
2211
2329
|
# For monthly dummies, create a daily col "Month", then get_dummies
|
|
2212
2330
|
df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
|
|
2213
2331
|
df_monthly_dummies = pd.get_dummies(
|
|
@@ -2218,8 +2336,8 @@ class datapull:
|
|
|
2218
2336
|
)
|
|
2219
2337
|
# Recalculate 'week_start' (already in df_daily, but just to be sure)
|
|
2220
2338
|
df_monthly_dummies['week_start'] = df_daily['week_start']
|
|
2221
|
-
|
|
2222
|
-
# Group monthly dummies by .sum() or .mean()—often
|
|
2339
|
+
|
|
2340
|
+
# Group monthly dummies by .sum() or .mean()—we often spread them across the week
|
|
2223
2341
|
df_monthly_dummies = (
|
|
2224
2342
|
df_monthly_dummies
|
|
2225
2343
|
.groupby('week_start')
|
|
@@ -2228,33 +2346,30 @@ class datapull:
|
|
|
2228
2346
|
.rename(columns={'week_start': "Date"})
|
|
2229
2347
|
.set_index("Date")
|
|
2230
2348
|
)
|
|
2231
|
-
#
|
|
2232
|
-
monthly_cols = [
|
|
2233
|
-
c for c in df_monthly_dummies.columns
|
|
2234
|
-
if c.startswith("seas_month_")
|
|
2235
|
-
]
|
|
2349
|
+
# Divide the monthly dummies by 7 to spread them across the days of that week
|
|
2350
|
+
monthly_cols = [c for c in df_monthly_dummies.columns if c.startswith("seas_month_")]
|
|
2236
2351
|
df_monthly_dummies[monthly_cols] = df_monthly_dummies[monthly_cols] / 7
|
|
2237
|
-
|
|
2238
|
-
# Group holiday
|
|
2352
|
+
|
|
2353
|
+
# Group holiday & special-day columns by .max() => binary at weekly level
|
|
2239
2354
|
df_holidays = (
|
|
2240
2355
|
df_daily
|
|
2241
2356
|
.groupby('week_start')
|
|
2242
|
-
.max(numeric_only=True) #
|
|
2357
|
+
.max(numeric_only=True) # if any day=1 in that week, entire week=1
|
|
2243
2358
|
.reset_index()
|
|
2244
2359
|
.rename(columns={'week_start': "Date"})
|
|
2245
2360
|
.set_index("Date")
|
|
2246
2361
|
)
|
|
2247
|
-
|
|
2248
|
-
#
|
|
2362
|
+
|
|
2363
|
+
# ---------------------------------------------------------------------
|
|
2249
2364
|
# 6. Combine weekly start, monthly dummies, holiday flags
|
|
2250
|
-
#
|
|
2365
|
+
# ---------------------------------------------------------------------
|
|
2251
2366
|
df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
|
|
2252
2367
|
df_combined = pd.concat([df_combined, df_holidays], axis=1)
|
|
2253
2368
|
df_combined = df_combined.loc[:, ~df_combined.columns.duplicated()]
|
|
2254
2369
|
|
|
2255
|
-
#
|
|
2370
|
+
# ---------------------------------------------------------------------
|
|
2256
2371
|
# 7. Create weekly dummies for Week of Year & yearly dummies
|
|
2257
|
-
#
|
|
2372
|
+
# ---------------------------------------------------------------------
|
|
2258
2373
|
df_combined.reset_index(inplace=True)
|
|
2259
2374
|
df_combined.rename(columns={"index": "old_index"}, inplace=True) # just in case
|
|
2260
2375
|
|
|
@@ -2264,18 +2379,19 @@ class datapull:
|
|
|
2264
2379
|
df_combined["Year"] = df_combined["Date"].dt.year
|
|
2265
2380
|
df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int)
|
|
2266
2381
|
|
|
2267
|
-
#
|
|
2382
|
+
# ---------------------------------------------------------------------
|
|
2268
2383
|
# 8. Add constant & trend
|
|
2269
|
-
#
|
|
2384
|
+
# ---------------------------------------------------------------------
|
|
2270
2385
|
df_combined["Constant"] = 1
|
|
2271
2386
|
df_combined["Trend"] = df_combined.index + 1
|
|
2272
2387
|
|
|
2273
|
-
#
|
|
2388
|
+
# ---------------------------------------------------------------------
|
|
2274
2389
|
# 9. Rename Date -> OBS and return
|
|
2275
|
-
#
|
|
2390
|
+
# ---------------------------------------------------------------------
|
|
2276
2391
|
df_combined.rename(columns={"Date": "OBS"}, inplace=True)
|
|
2277
2392
|
|
|
2278
2393
|
return df_combined
|
|
2394
|
+
|
|
2279
2395
|
|
|
2280
2396
|
def pull_weather(self, week_commencing, country) -> pd.DataFrame:
|
|
2281
2397
|
import pandas as pd
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: imsciences
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.9
|
|
4
4
|
Summary: IMS Data Processing Package
|
|
5
5
|
Author: IMS
|
|
6
6
|
Author-email: cam@im-sciences.com
|
|
@@ -39,11 +39,11 @@ The **IMSciences package** is a Python library designed to process incoming data
|
|
|
39
39
|
Table of Contents
|
|
40
40
|
=================
|
|
41
41
|
|
|
42
|
-
1.
|
|
43
|
-
2.
|
|
44
|
-
3.
|
|
45
|
-
4.
|
|
46
|
-
5.
|
|
42
|
+
1. [Data Processing](#Data-Processing)
|
|
43
|
+
2. [Data Pulling](#Data-Pulling)
|
|
44
|
+
3. [Installation](#Installation)
|
|
45
|
+
4. [Usage](#Useage)
|
|
46
|
+
5. [License](#License)
|
|
47
47
|
|
|
48
48
|
---
|
|
49
49
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|