PyPI - imsciences - Versions diffs - 0.8.1__tar.gz → 0.9__tar.gz - Mend

imsciences 0.8.1 (tar.gz) → 0.9 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of imsciences might be problematic. Click here for more details.

Files changed (13) hide show

{imsciences-0.8.1 → imsciences-0.9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: imsciences
-Version: 0.8.1
+Version: 0.9
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com
@@ -39,11 +39,11 @@ The **IMSciences package** is a Python library designed to process incoming data
 Table of Contents
 =================
-1. `Data Processing <#data-processing>`_
-2. `Data Pulling <#data-pulling>`_
-3. `Installation <#installation>`_
-4. `Usage <#usage>`_
-5. `License <#license>`_
+1. [Data Processing](#Data-Processing)
+2. [Data Pulling](#Data-Pulling)
+3. [Installation](#Installation)
+4. [Usage](#Usage)
+5. [License](#License)
 ---

{imsciences-0.8.1 → imsciences-0.9}/README.md RENAMED Viewed

@@ -14,11 +14,11 @@ The **IMSciences package** is a Python library designed to process incoming data
 Table of Contents
 =================
-1. `Data Processing <#data-processing>`_
-2. `Data Pulling <#data-pulling>`_
-3. `Installation <#installation>`_
-4. `Usage <#usage>`_
-5. `License <#license>`_
+1. [Data Processing](#Data-Processing)
+2. [Data Pulling](#Data-Pulling)
+3. [Installation](#Installation)
+4. [Usage](#Usage)
+5. [License](#License)
 ---

{imsciences-0.8.1 → imsciences-0.9}/imsciences/datafunctions.py RENAMED Viewed

@@ -16,6 +16,7 @@ import xml.etree.ElementTree as ET
 from bs4 import BeautifulSoup
 import yfinance as yf
 import holidays
+from dateutil.easter import easter
 class dataprocessing:
@@ -2123,27 +2124,31 @@ class datapull:
     ###############################################################  Seasonality  ##########################################################################
     def pull_seasonality(self, week_commencing, start_date, countries):
-        # Week commencing dictionary
+        # ---------------------------------------------------------------------
+        # 0. Setup: dictionary for 'week_commencing' to Python weekday() integer
+        # ---------------------------------------------------------------------
         day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
-        # Create daily date range dataframe starting from start_date
+        # ---------------------------------------------------------------------
+        # 1. Create daily date range from start_date to today
+        # ---------------------------------------------------------------------
         date_range = pd.date_range(
             start=pd.to_datetime(start_date),
             end=datetime.today(),
             freq="D"
         )
         df_daily = pd.DataFrame(date_range, columns=["Date"])
-        # ------------------------------------------------
-        # 1. Identify "week_start" for each daily row
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
+        # 1.1 Identify "week_start" for each daily row, based on week_commencing
+        # ---------------------------------------------------------------------
         df_daily['week_start'] = df_daily["Date"].apply(
             lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
         )
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         # 2. Build a weekly index (df_weekly_start) with dummy columns
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         df_weekly_start = df_daily[['week_start']].drop_duplicates().reset_index(drop=True)
         df_weekly_start.rename(columns={'week_start': "Date"}, inplace=True)
@@ -2160,10 +2165,10 @@ class datapull:
         df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
         df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
-        # ------------------------------------------------
-        # 3. Public holidays (daily) and specific holiday columns
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
+        # 3. Public holidays (daily) from 'holidays' package + each holiday name
+        # ---------------------------------------------------------------------
         for country in countries:
             country_holidays = holidays.CountryHoliday(
                 country,
@@ -2180,10 +2185,124 @@ class datapull:
                     df_daily[col_name] = 0
                 df_daily.loc[df_daily["Date"] == pd.Timestamp(date_hol), col_name] = 1
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
+        # 3.1 Additional Special Days (Father's Day, Mother's Day, etc.)
+        #     We'll add daily columns for each.
+        # ---------------------------------------------------------------------
+        # Initialize columns
+        extra_cols = [
+            "seas_valentines_day",
+            "seas_halloween",
+            "seas_fathers_day_us_uk",
+            "seas_mothers_day_us",
+            "seas_mothers_day_uk",
+            "seas_good_friday",
+            "seas_easter_monday",
+            "seas_black_friday",
+            "seas_cyber_monday",
+        ]
+        for c in extra_cols:
+            df_daily[c] = 0  # default zero
+        # Helper: nth_weekday_of_month(year, month, weekday, nth=1 => first, 2 => second, etc.)
+        # weekday: Monday=0, Tuesday=1, ... Sunday=6
+        def nth_weekday_of_month(year, month, weekday, nth):
+            """
+            Returns date of the nth <weekday> in <month> of <year>.
+            E.g. nth_weekday_of_month(2023, 6, 6, 3) => 3rd Sunday of June 2023.
+            """
+            # 1st day of the month
+            d = datetime(year, month, 1)
+            # What is the weekday of day #1?
+            w = d.weekday()  # Monday=0, Tuesday=1, ... Sunday=6
+            # If we want, e.g. Sunday=6, we see how many days to add
+            delta = (weekday - w) % 7
+            # This is the first <weekday> in that month
+            first_weekday = d + timedelta(days=delta)
+            # Now add 7*(nth-1) days
+            return first_weekday + timedelta(days=7 * (nth-1))
+        def get_good_friday(year):
+            """Good Friday is 2 days before Easter Sunday."""
+            return easter(year) - timedelta(days=2)
+        def get_easter_monday(year):
+            """Easter Monday is 1 day after Easter Sunday."""
+            return easter(year) + timedelta(days=1)
+        def get_black_friday(year):
+            """
+            Black Friday = day after US Thanksgiving,
+            and US Thanksgiving is the 4th Thursday in November.
+            """
+            # 4th Thursday in November
+            fourth_thursday = nth_weekday_of_month(year, 11, 3, 4)  # weekday=3 => Thursday
+            return fourth_thursday + timedelta(days=1)
+        def get_cyber_monday(year):
+            """Cyber Monday = Monday after US Thanksgiving, i.e. 4 days after 4th Thursday in Nov."""
+            # 4th Thursday in November
+            fourth_thursday = nth_weekday_of_month(year, 11, 3, 4)
+            return fourth_thursday + timedelta(days=4)  # Monday after Thanksgiving
+        # Loop over each year in range
+        start_yr = int(start_date[:4])
+        end_yr = datetime.today().year
+        for yr in range(start_yr, end_yr + 1):
+            # Valentines = Feb 14
+            valentines_day = datetime(yr, 2, 14)
+            # Halloween = Oct 31
+            halloween_day  = datetime(yr, 10, 31)
+            # Father's Day (US & UK) = 3rd Sunday in June
+            fathers_day    = nth_weekday_of_month(yr, 6, 6, 3)  # Sunday=6
+            # Mother's Day US = 2nd Sunday in May
+            mothers_day_us = nth_weekday_of_month(yr, 5, 6, 2)
+            # Mother's Day UK ("Mothering Sunday") is the 4th Sunday in Lent,
+            # which falls exactly three weeks (21 days) before Easter Sunday.
+            # Easter Sunday itself is not a Sunday in Lent, so subtracting 28 days
+            # would overshoot to the 3rd Sunday in Lent. Easter is always a Sunday,
+            # so Easter - 21 days is always a Sunday as well.
+            mothering_sunday = easter(yr) - timedelta(days=21)
+            # Defensive guard: Easter - 21 days should already be a Sunday, so this loop is not expected to run; if it ever is not, step back to the preceding Sunday.
+            while mothering_sunday.weekday() != 6:  # Sunday=6
+                mothering_sunday -= timedelta(days=1)
+            # Good Friday, Easter Monday
+            gf = get_good_friday(yr)
+            em = get_easter_monday(yr)
+            # Black Friday, Cyber Monday
+            bf = get_black_friday(yr)
+            cm = get_cyber_monday(yr)
+            # Mark them in df_daily if in range
+            for special_date, col in [
+                (valentines_day, "seas_valentines_day"),
+                (halloween_day,  "seas_halloween"),
+                (fathers_day,    "seas_fathers_day_us_uk"),
+                (mothers_day_us, "seas_mothers_day_us"),
+                (mothering_sunday, "seas_mothers_day_uk"),
+                (gf, "seas_good_friday"),
+                (em, "seas_easter_monday"),
+                (bf, "seas_black_friday"),
+                (cm, "seas_cyber_monday"),
+            ]:
+                # Convert to pd.Timestamp:
+                special_ts = pd.Timestamp(special_date)
+                # Only set if it's within your daily range
+                if (special_ts >= df_daily["Date"].min()) and (special_ts <= df_daily["Date"].max()):
+                    df_daily.loc[df_daily["Date"] == special_ts, col] = 1
+        # ---------------------------------------------------------------------
         # 4. Add daily indicators for last day & last Friday of month
         #    Then aggregate them to weekly level using .max()
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         # Last day of month (daily)
         df_daily["seas_last_day_of_month"] = df_daily["Date"].apply(
             lambda d: 1 if d == d.to_period("M").to_timestamp("M") else 0
@@ -2193,8 +2312,8 @@ class datapull:
         def is_last_friday(date):
             # last day of the month
             last_day_of_month = date.to_period("M").to_timestamp("M")
-            last_day_weekday = last_day_of_month.dayofweek
-            # Determine how many days we go back from the last day to get Friday
+            last_day_weekday = last_day_of_month.weekday()  # Monday=0,...Sunday=6
+            # Determine how many days we go back from the last day to get Friday (weekday=4)
             if last_day_weekday >= 4:
                 days_to_subtract = last_day_weekday - 4
             else:
@@ -2204,10 +2323,9 @@ class datapull:
         df_daily["seas_last_friday_of_month"] = df_daily["Date"].apply(is_last_friday)
-        # ------------------------------------------------
-        # 5. Weekly aggregation for HOLIDAYS & monthly dummies
-        #    (Using .max() for holiday indicators so they become binary)
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
+        # 5. Weekly aggregation for holiday columns & monthly dummies
+        # ---------------------------------------------------------------------
         # For monthly dummies, create a daily col "Month", then get_dummies
         df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
         df_monthly_dummies = pd.get_dummies(
@@ -2218,8 +2336,8 @@ class datapull:
         )
         # Attach 'week_start' from df_daily so the monthly dummies can be grouped by week
         df_monthly_dummies['week_start'] = df_daily['week_start']
-        # Group monthly dummies by .sum() or .mean()—often we average across the week
+        # Group monthly dummies by .sum() or .mean()—we often spread them across the week
         df_monthly_dummies = (
             df_monthly_dummies
             .groupby('week_start')
@@ -2228,33 +2346,30 @@ class datapull:
             .rename(columns={'week_start': "Date"})
             .set_index("Date")
         )
-        # Divide the monthly dummy columns by 7 to spread them across the week
-        monthly_cols = [
-            c for c in df_monthly_dummies.columns
-            if c.startswith("seas_month_")
-        ]
+        # Divide the monthly dummy columns by 7 to spread them across the week
+        monthly_cols = [c for c in df_monthly_dummies.columns if c.startswith("seas_month_")]
         df_monthly_dummies[monthly_cols] = df_monthly_dummies[monthly_cols] / 7
-        # Group holiday columns (and last-day-of-month columns) by .max() => binary
+        # Group holiday & special-day columns by .max() => binary at weekly level
         df_holidays = (
             df_daily
             .groupby('week_start')
-            .max(numeric_only=True)   # use max => if any day=1, entire week=1
+            .max(numeric_only=True)   # if any day=1 in that week, entire week=1
             .reset_index()
             .rename(columns={'week_start': "Date"})
             .set_index("Date")
         )
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         # 6. Combine weekly start, monthly dummies, holiday flags
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
         df_combined = pd.concat([df_combined, df_holidays], axis=1)
         df_combined = df_combined.loc[:, ~df_combined.columns.duplicated()]
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         # 7. Create weekly dummies for Week of Year & yearly dummies
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         df_combined.reset_index(inplace=True)
         df_combined.rename(columns={"index": "old_index"}, inplace=True)  # just in case
@@ -2264,18 +2379,19 @@ class datapull:
         df_combined["Year"] = df_combined["Date"].dt.year
         df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int)
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         # 8. Add constant & trend
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         df_combined["Constant"] = 1
         df_combined["Trend"] = df_combined.index + 1
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         # 9. Rename Date -> OBS and return
-        # ------------------------------------------------
+        # ---------------------------------------------------------------------
         df_combined.rename(columns={"Date": "OBS"}, inplace=True)
         return df_combined
     def pull_weather(self, week_commencing, country) -> pd.DataFrame:
         import pandas as pd

{imsciences-0.8.1 → imsciences-0.9}/imsciences.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: imsciences
-Version: 0.8.1
+Version: 0.9
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com
@@ -39,11 +39,11 @@ The **IMSciences package** is a Python library designed to process incoming data
 Table of Contents
 =================
-1. `Data Processing <#data-processing>`_
-2. `Data Pulling <#data-pulling>`_
-3. `Installation <#installation>`_
-4. `Usage <#usage>`_
-5. `License <#license>`_
+1. [Data Processing](#Data-Processing)
+2. [Data Pulling](#Data-Pulling)
+3. [Installation](#Installation)
+4. [Usage](#Usage)
+5. [License](#License)
 ---

{imsciences-0.8.1 → imsciences-0.9}/setup.py RENAMED Viewed

@@ -8,7 +8,7 @@ def read_md(file_name):
             return f.read()
     return ''
-VERSION = '0.8.1'
+VERSION = '0.9'
 DESCRIPTION = 'IMS Data Processing Package'
 LONG_DESCRIPTION = read_md('README.md')