imsciences 0.9.5.9__tar.gz → 0.9.6.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: imsciences
- Version: 0.9.5.9
+ Version: 0.9.6.3
  Summary: IMS Data Processing Package
  Author: IMS
  Author-email: cam@im-sciences.com
@@ -133,7 +133,7 @@ class datapull:
 
  Args:
  week_commencing (str): The starting day of the week for aggregation.
- Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+ Options are "mon", "tue", "wed", "thu", "fri", "sat", "sun".
  Default is "mon".
  max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 5.
  delay (int): Delay in seconds between retry attempts. Default is 5.
@@ -144,7 +144,7 @@ class datapull:
  and 'macro_boe_intr_rate' contains the average interest rate for the week.
  """
  # Week commencing dictionary
- day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+ day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
 
  # URL of the Bank of England data page
  url = 'https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp'
@@ -209,7 +209,7 @@ class datapull:
  Args:
  country (list): A string containing the 3-letter code of the country of interest (e.g. "GBR", "FRA", "USA", "DEU")
  week_commencing (str): The starting day of the week for aggregation.
- Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+ Options are "mon", "tue", "wed", "thu", "fri", "sat", "sun".
  start_date (str): Dataset start date in the format "YYYY-MM-DD"
 
  Returns:
@@ -383,7 +383,7 @@ class datapull:
  # ---------------------------------------------------------------------
  # 0. Setup: dictionary for 'week_commencing' to Python weekday() integer
  # ---------------------------------------------------------------------
- day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+ day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
 
  # ---------------------------------------------------------------------
  # 1. Create daily date range from start_date to today
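The recurring change in these hunks is the rename of the "thur" key to "thu", making every accepted day string a consistent three letters. The dictionary values follow Python's `weekday()` convention (Monday = 0); a minimal standard-library sketch of the week-commencing roll-back these functions share (illustrative, not part of the package diff):

```python
from datetime import date, timedelta

# Keys match the renamed three-letter options; values are date.weekday() numbers.
day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}

def week_commencing(d: date, wc: str = "mon") -> date:
    """Roll a date back to the most recent occurrence of the chosen week start."""
    return d - timedelta(days=(d.weekday() - day_dict[wc]) % 7)

print(week_commencing(date(2024, 5, 18), "mon"))  # 2024-05-13 (Saturday -> Monday)
print(week_commencing(date(2024, 5, 16), "thu"))  # 2024-05-16 (already a Thursday)
```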
@@ -668,7 +668,7 @@ class datapull:
  raise ValueError("country_codes must be a list/tuple or a single string.")
 
  # --- Setup / Constants --- #
- day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+ day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
  # Map each 2-letter code to a key
  country_dict = {
  "US": "US_STATES",
@@ -1171,74 +1171,110 @@ class datapull:
 
  def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
  """
- Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
+ Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
  aggregates it to weekly averages, and renames variables based on specified rules.
 
  Parameters:
- cdid_list (list): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
- week_start_day (str): The day the week starts on (e.g., 'Monday', 'Sunday').
- sector (str): The sector for which the standard CDIDs are fetched (e.g., 'fast_food', 'retail').
+ cdid_list (list, optional): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
+ week_start_day (str, optional): The day the week starts on ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'). Defaults to 'mon'.
+ sector (str or list, optional): The sector(s) for which the standard CDIDs are fetched
+ (e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).
 
  Returns:
- pd.DataFrame: A DataFrame with weekly frequency, containing a 'week_commencing' column
- and all series as renamed columns.
+ pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column (week commencing date)
+ and all series as renamed columns (e.g., 'macro_retail_sales_uk').
+ Returns an empty DataFrame if no data is fetched or processed.
  """
  # Define CDIDs for sectors and defaults
- sector_cdids = {
+ sector_cdids_map = {
  "fast_food": ["L7TD", "L78Q", "DOAD"],
+ "clothing_footwear": ["D7BW","D7GO","CHBJ"],
+ "fuel": ["A9FS","L7FP","CHOL"],
+ "cars":["D7E8","D7E9","D7CO"],
  "default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
  }
 
- default_cdids = sector_cdids["default"]
- sector_specific_cdids = sector_cdids.get(sector, [])
- standard_cdids = list(set(default_cdids + sector_specific_cdids)) # Avoid duplicates
+ default_cdids = sector_cdids_map["default"]
+ sector_specific_cdids = [] # Initialize empty list for sector CDIDs
+
+ if sector: # Check if sector is not None or empty
+ if isinstance(sector, str):
+ # If it's a single string, wrap it in a list
+ sector_list = [sector]
+ elif isinstance(sector, list):
+ # If it's already a list, use it directly
+ sector_list = sector
+ else:
+ raise TypeError("`sector` parameter must be a string or a list of strings.")
+
+ # Iterate through the list of sectors and collect their CDIDs
+ for sec in sector_list:
+ sector_specific_cdids.extend(sector_cdids_map.get(sec, [])) # Use extend to add items from the list
+
+ standard_cdids = list(set(default_cdids + sector_specific_cdids)) # Combine default and selected sector CDIDs, ensure uniqueness
 
- # Combine standard CDIDs and additional CDIDs
+ # Combine standard CDIDs and any additional user-provided CDIDs
  if cdid_list is None:
  cdid_list = []
- cdid_list = list(set(standard_cdids + cdid_list)) # Avoid duplicates
+ final_cdid_list = list(set(standard_cdids + cdid_list)) # Ensure uniqueness in the final list
 
  base_search_url = "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
  base_data_url = "https://api.beta.ons.gov.uk/v1/data?uri="
  combined_df = pd.DataFrame()
 
  # Map week start day to pandas weekday convention
- days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
- if week_start_day not in days_map:
+ days_map = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
+ if week_start_day.lower() not in days_map:
  raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
- week_start = days_map[week_start_day]
+ week_start = days_map[week_start_day.lower()] # Use lower() for case-insensitivity
 
- for cdid in cdid_list:
+ for cdid in final_cdid_list: # Use the final combined list
  try:
  # Search for the series
  search_url = f"{base_search_url}{cdid}"
- search_response = requests.get(search_url)
+ search_response = requests.get(search_url, timeout=30) # Add timeout
  search_response.raise_for_status()
  search_data = search_response.json()
 
  items = search_data.get("items", [])
  if not items:
- print(f"No data found for CDID: {cdid}")
+ print(f"Warning: No data found for CDID: {cdid}")
  continue
 
  # Extract series name and latest release URI
- series_name = items[0].get("title", f"Series_{cdid}")
- latest_date = max(
- datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
- for item in items if "release_date" in item
- )
- latest_uri = next(
- item["uri"] for item in items
- if "release_date" in item and datetime.fromisoformat(item["release_date"].replace("Z", "+00:00")) == latest_date
- )
+ # Find the item with the most recent release_date
+ latest_item = None
+ latest_date = None
+ for item in items:
+ if "release_date" in item:
+ try:
+ # Ensure timezone awareness for comparison
+ current_date = datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
+ if latest_date is None or current_date > latest_date:
+ latest_date = current_date
+ latest_item = item
+ except ValueError:
+ print(f"Warning: Could not parse release_date '{item['release_date']}' for CDID {cdid}")
+ continue # Skip this item if date is invalid
+
+ if latest_item is None:
+ print(f"Warning: No valid release date found for CDID: {cdid}")
+ continue
+
+ series_name = latest_item.get("title", f"Series_{cdid}") # Use title from the latest item
+ latest_uri = latest_item.get("uri")
+ if not latest_uri:
+ print(f"Warning: No URI found for the latest release of CDID: {cdid}")
+ continue
 
  # Fetch the dataset
  data_url = f"{base_data_url}{latest_uri}"
- data_response = requests.get(data_url)
+ data_response = requests.get(data_url, timeout=30) # Add timeout
  data_response.raise_for_status()
  data_json = data_response.json()
 
  # Detect the frequency and process accordingly
+ frequency_key = None
  if "months" in data_json and data_json["months"]:
  frequency_key = "months"
  elif "quarters" in data_json and data_json["quarters"]:
@@ -1246,72 +1282,142 @@ class datapull:
  elif "years" in data_json and data_json["years"]:
  frequency_key = "years"
  else:
- print(f"Unsupported frequency or no data for CDID: {cdid}")
+ print(f"Warning: Unsupported frequency or no data values found for CDID: {cdid} at URI {latest_uri}")
  continue
 
  # Prepare the DataFrame
+ if not data_json[frequency_key]: # Check if the list of values is empty
+ print(f"Warning: Empty data list for frequency '{frequency_key}' for CDID: {cdid}")
+ continue
+
  df = pd.DataFrame(data_json[frequency_key])
 
- # Parse the 'date' field based on frequency
- if frequency_key == "months":
- df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
- elif frequency_key == "quarters":
- def parse_quarter(quarter_str):
- year, qtr = quarter_str.split(" Q")
- month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
- return datetime(int(year), month, 1)
- df["date"] = df["date"].apply(parse_quarter)
- elif frequency_key == "years":
- df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
+ # Check if essential columns exist
+ if "date" not in df.columns or "value" not in df.columns:
+ print(f"Warning: Missing 'date' or 'value' column for CDID: {cdid}")
+ continue
 
+ # Parse the 'date' field based on frequency
+ try:
+ if frequency_key == "months":
+ # Handles "YYYY Mon" format (e.g., "2023 FEB") - adjust if format differs
+ df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
+ elif frequency_key == "quarters":
+ def parse_quarter(quarter_str):
+ try:
+ year, qtr = quarter_str.split(" Q")
+ month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
+ return datetime(int(year), month, 1)
+ except (ValueError, KeyError):
+ return pd.NaT # Return Not a Time for parsing errors
+ df["date"] = df["date"].apply(parse_quarter)
+ elif frequency_key == "years":
+ df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
+ except Exception as e:
+ print(f"Error parsing date for CDID {cdid} with frequency {frequency_key}: {e}")
+ continue # Skip this series if date parsing fails
+
+ # Coerce value to numeric, handle potential errors
  df["value"] = pd.to_numeric(df["value"], errors="coerce")
+
+ # Drop rows where date or value parsing failed
+ df.dropna(subset=["date", "value"], inplace=True)
+
+ if df.empty:
+ print(f"Warning: No valid data points after processing for CDID: {cdid}")
+ continue
+
  df.rename(columns={"value": series_name}, inplace=True)
 
  # Combine data
- df = df.loc[:, ["date", series_name]].dropna().reset_index(drop=True)
+ df_subset = df.loc[:, ["date", series_name]].reset_index(drop=True) # Explicitly select columns
  if combined_df.empty:
- combined_df = df
+ combined_df = df_subset
  else:
- combined_df = pd.merge(combined_df, df, on="date", how="outer")
+ # Use outer merge to keep all dates, sort afterwards
+ combined_df = pd.merge(combined_df, df_subset, on="date", how="outer")
 
  except requests.exceptions.RequestException as e:
  print(f"Error fetching data for CDID {cdid}: {e}")
- except (KeyError, ValueError) as e:
+ except (KeyError, ValueError, TypeError) as e: # Added TypeError
  print(f"Error processing data for CDID {cdid}: {e}")
+ except Exception as e: # Catch unexpected errors
+ print(f"An unexpected error occurred for CDID {cdid}: {e}")
+
 
  if not combined_df.empty:
+ # Sort by date after merging to ensure correct forward fill
+ combined_df.sort_values(by="date", inplace=True)
+ combined_df.reset_index(drop=True, inplace=True)
+
+ # Create a complete daily date range
  min_date = combined_df["date"].min()
- max_date = datetime.today()
+ # Ensure max_date is timezone-naive if min_date is, or consistent otherwise
+ max_date = pd.Timestamp(datetime.today().date()) # Use today's date, timezone-naive
+
+ if pd.isna(min_date):
+ print("Error: Minimum date is NaT, cannot create date range.")
+ return pd.DataFrame()
+
+ # Make sure min_date is not NaT before creating the range
  date_range = pd.date_range(start=min_date, end=max_date, freq='D')
  daily_df = pd.DataFrame(date_range, columns=['date'])
+
+ # Merge with original data and forward fill
  daily_df = pd.merge(daily_df, combined_df, on="date", how="left")
  daily_df = daily_df.ffill()
 
+ # Drop rows before the first valid data point after ffill
+ first_valid_index = daily_df.dropna(subset=daily_df.columns.difference(['date'])).index.min()
+ if pd.notna(first_valid_index):
+ daily_df = daily_df.loc[first_valid_index:]
+ else:
+ print("Warning: No valid data points found after forward filling.")
+ return pd.DataFrame() # Return empty if ffill results in no data
+
+
  # Aggregate to weekly frequency
- daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start) % 7, unit='D')
+ # Ensure 'date' column is datetime type before dt accessor
+ daily_df['date'] = pd.to_datetime(daily_df['date'])
+ daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start + 7) % 7, unit='D') # Corrected logic for week start
+ # Group by week_commencing and calculate mean for numeric columns only
  weekly_df = daily_df.groupby("week_commencing").mean(numeric_only=True).reset_index()
 
+
  def clean_column_name(name):
+ # Remove content within parentheses (e.g., CPI INDEX 00: ALL ITEMS 2015=100)
  name = re.sub(r"\(.*?\)", "", name)
+ # Take only the part before the first colon if present
  name = re.split(r":", name)[0]
- name = re.sub(r"\d+", "", name)
+ # Remove digits
+ #name = re.sub(r"\d+", "", name) # Reconsider removing all digits, might be needed for some series
+ # Remove specific words like 'annual', 'rate' case-insensitively
  name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
+ # Remove non-alphanumeric characters (except underscore and space)
  name = re.sub(r"[^\w\s]", "", name)
+ # Replace spaces with underscores
+ name = name.strip() # Remove leading/trailing whitespace
  name = name.replace(" ", "_")
+ # Replace multiple underscores with a single one
  name = re.sub(r"_+", "_", name)
+ # Remove trailing underscores
  name = name.rstrip("_")
+ # Add prefix and suffix
  return f"macro_{name.lower()}_uk"
 
+ # Apply cleaning function to relevant columns
  weekly_df.columns = [clean_column_name(col) if col != "week_commencing" else col for col in weekly_df.columns]
- weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True)
+ weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True) # Rename week commencing col
 
- weekly_df = weekly_df.fillna(0)
+ # Optional: Fill remaining NaNs (e.g., at the beginning if ffill didn't cover) with 0
+ # Consider if 0 is the appropriate fill value for your use case
+ # weekly_df = weekly_df.fillna(0)
 
  return weekly_df
  else:
- print("No data available to process.")
+ print("No data successfully fetched or processed.")
  return pd.DataFrame()
-
+
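The rewritten `pull_macro_ons_uk` keeps the same two-step ONS beta API flow: search by CDID to find the series URI, then fetch the timeseries JSON at that URI. A stripped-down sketch of that flow using only `requests` (the function above additionally picks the most recent release, validates columns, and handles monthly, quarterly and yearly frequencies):

```python
import requests

BASE_SEARCH = "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
BASE_DATA = "https://api.beta.ons.gov.uk/v1/data?uri="

def fetch_cdid(cdid: str) -> dict:
    """Look up a CDID, then fetch the JSON for the first matching series."""
    items = requests.get(f"{BASE_SEARCH}{cdid}", timeout=30).json().get("items", [])
    if not items:
        raise LookupError(f"No series found for CDID {cdid}")
    return requests.get(f"{BASE_DATA}{items[0]['uri']}", timeout=30).json()

data = fetch_cdid("D7G7")  # one of the function's default CDIDs
# One of 'months', 'quarters' or 'years' holds the observations:
print([k for k in ("months", "quarters", "years") if data.get(k)])
```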
  def pull_yfinance(self, tickers=None, week_start_day="mon"):
  """
  Fetches stock data for multiple tickers from Yahoo Finance, converts it to daily frequency,
@@ -1337,7 +1443,7 @@ class datapull:
  end_date = datetime.today().strftime("%Y-%m-%d")
 
  # Mapping week start day to pandas weekday convention
- days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+ days_map = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
  if week_start_day not in days_map:
  raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
  week_start = days_map[week_start_day]
@@ -1497,9 +1603,9 @@ class datapull:
  # Aggregate by week commencing
  day_offsets = {
  'mon': 'W-MON',
- 'tues': 'W-TUE',
+ 'tue': 'W-TUE',
  'wed': 'W-WED',
- 'thurs': 'W-THU',
+ 'thu': 'W-THU',
  'fri': 'W-FRI',
  'sat': 'W-SAT',
  'sun': 'W-SUN'
@@ -1592,9 +1698,9 @@ class datapull:
  # Resample by week
  day_offsets = {
  'mon': 'W-MON',
- 'tues': 'W-TUE',
+ 'tue': 'W-TUE',
  'wed': 'W-WED',
- 'thurs': 'W-THU',
+ 'thu': 'W-THU',
  'fri': 'W-FRI',
  'sat': 'W-SAT',
  'sun': 'W-SUN'
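The `day_offsets` values here are pandas anchored weekly frequencies; both functions use them to bucket daily rows into weeks labelled by their start date. A small plain-pandas illustration of how such an anchor behaves (not package code):

```python
import pandas as pd

# 14 days of daily data starting on a Monday.
s = pd.Series(1, index=pd.date_range("2024-01-01", periods=14, freq="D"))

# 'W-MON' anchors weeks on Mondays; label/closed='left' labels each bucket
# by its week-commencing Monday rather than by the week's end.
weekly = s.resample("W-MON", label="left", closed="left").sum()
print(weekly)  # two buckets of 7, labelled 2024-01-01 and 2024-01-08
```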
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: imsciences
- Version: 0.9.5.9
+ Version: 0.9.6.3
  Summary: IMS Data Processing Package
  Author: IMS
  Author-email: cam@im-sciences.com
@@ -0,0 +1,355 @@
+ Metadata-Version: 2.1
+ Name: imsciences
+ Version: 0.9.6.3
+ Summary: IMS Data Processing Package
+ Author: IMS
+ Author-email: cam@im-sciences.com
+ Keywords: data processing,apis,data analysis,data visualization,machine learning
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Operating System :: Unix
+ Classifier: Operating System :: MacOS :: MacOS X
+ Classifier: Operating System :: Microsoft :: Windows
+ Description-Content-Type: text/markdown
+ License-File: LICENSE.txt
+ Requires-Dist: pandas
+ Requires-Dist: plotly
+ Requires-Dist: numpy
+ Requires-Dist: fredapi
+ Requires-Dist: xgboost
+ Requires-Dist: scikit-learn
+ Requires-Dist: bs4
+ Requires-Dist: yfinance
+ Requires-Dist: holidays
+ Requires-Dist: google-analytics-data
+ Requires-Dist: geopandas
+ Requires-Dist: geopy
+
+ # IMS Package Documentation
+
+ The **Independent Marketing Sciences** package is a Python library designed to process incoming data into a format tailored for projects, particularly those utilising weekly time series data. This package offers a suite of functions for efficient data collection, manipulation, visualisation and analysis.
+
+ ---
+
+ ## Key Features
+ - Seamless data processing for time series workflows.
+ - Aggregation, filtering, and transformation of time series data.
+ - Visualisation of data.
+ - Integration with external data sources like FRED, Bank of England and ONS.
+
+ ---
+
+ Table of Contents
+ =================
+
+ 1. [Usage](#usage)
+ 2. [Data Processing for Time Series](#data-processing-for-time-series)
+ 3. [Data Processing for Incrementality Testing](#data-processing-for-incrementality-testing)
+ 4. [Data Visualisations](#data-visualisations)
+ 5. [Data Pulling](#data-pulling)
+ 6. [Installation](#installation)
+ 7. [License](#license)
+ 8. [Roadmap](#roadmap)
+
+ ---
+
+ ## Usage
+
+ ```python
+ from imsciences import dataprocessing, geoprocessing, datapull, datavis
+ ims_proc = dataprocessing()
+ ims_geo = geoprocessing()
+ ims_pull = datapull()
+ ims_vis = datavis()
+ ```
+
+ ## Data Processing for Time Series
+
+ ## 1. `get_wd_levels`
+ - **Description**: Get the working directory with the option of moving up parents.
+ - **Usage**: `get_wd_levels(levels)`
+ - **Example**: `get_wd_levels(0)`
+
+ ## 2. `aggregate_daily_to_wc_long`
+ - **Description**: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.
+ - **Usage**: `aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation='sum')`
+ - **Example**: `aggregate_daily_to_wc_long(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average')`
+
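For intuition, here is the kind of daily-to-weekly roll-up the aggregation helper above performs, written in plain pandas rather than via the package (illustrative only; column names are invented):

```python
import pandas as pd

daily = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=14, freq="D"),  # starts on a Monday
    "platform": ["search"] * 14,
    "cost": range(14),
})

# Snap each date back to its Monday week-commencing, then group and sum.
wc = daily["date"] - pd.to_timedelta(daily["date"].dt.weekday, unit="D")
weekly = daily.assign(OBS=wc).groupby(["OBS", "platform"], as_index=False)["cost"].sum()
print(weekly)  # one row per (week, platform) with the summed cost
```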
+ ## 3. `convert_monthly_to_daily`
+ - **Description**: Converts monthly data in a DataFrame to daily data by expanding and dividing the numeric values.
+ - **Usage**: `convert_monthly_to_daily(df, date_column, divide=True)`
+ - **Example**: `convert_monthly_to_daily(df, 'date')`
+
+ ## 4. `week_of_year_mapping`
+ - **Description**: Converts a week column in 'yyyy-Www' or 'yyyy-ww' format to week commencing date.
+ - **Usage**: `week_of_year_mapping(df, week_col, start_day_str)`
+ - **Example**: `week_of_year_mapping(df, 'week', 'mon')`
+
+ ## 5. `rename_cols`
+ - **Description**: Renames columns in a pandas DataFrame with a specified prefix or format.
+ - **Usage**: `rename_cols(df, name='ame_')`
+ - **Example**: `rename_cols(df, 'ame_facebook')`
+
+ ## 6. `merge_new_and_old`
+ - **Description**: Creates a new DataFrame by merging old and new dataframes based on a cutoff date.
+ - **Usage**: `merge_new_and_old(old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS')`
+ - **Example**: `merge_new_and_old(df1, 'old_col', df2, 'new_col', '2023-01-15')`
+
+ ## 7. `merge_dataframes_on_column`
+ - **Description**: Merge a list of DataFrames on a common column.
+ - **Usage**: `merge_dataframes_on_column(dataframes, common_column='OBS', merge_how='outer')`
+ - **Example**: `merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='outer')`
+
+ ## 8. `merge_and_update_dfs`
+ - **Description**: Merges two dataframes, updating columns from the second dataframe where values are available.
+ - **Usage**: `merge_and_update_dfs(df1, df2, key_column)`
+ - **Example**: `merge_and_update_dfs(processed_facebook, finalised_meta, 'OBS')`
+
+ ## 9. `convert_us_to_uk_dates`
+ - **Description**: Convert a DataFrame column with mixed US and UK date formats to datetime.
+ - **Usage**: `convert_us_to_uk_dates(df, date_col)`
+ - **Example**: `convert_us_to_uk_dates(df, 'date')`
+
+ ## 10. `combine_sheets`
+ - **Description**: Combines multiple DataFrames from a dictionary into a single DataFrame.
+ - **Usage**: `combine_sheets(all_sheets)`
+ - **Example**: `combine_sheets({'Sheet1': df1, 'Sheet2': df2})`
+
+ ## 11. `pivot_table`
+ - **Description**: Dynamically pivots a DataFrame based on specified columns.
+ - **Usage**: `pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name='Total', datetime_trans_needed=True, reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing='W-MON')`
+ - **Example**: `pivot_table(df, 'OBS', 'Channel Short Names', 'Value', filters_dict={'Master Include': ' == 1'}, fill_value=0)`
+
+ ## 12. `apply_lookup_table_for_columns`
+ - **Description**: Maps substrings in columns to new values based on a dictionary.
+ - **Usage**: `apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict='Other', new_column_name='Mapping')`
+ - **Example**: `apply_lookup_table_for_columns(df, col_names, {'spend': 'spd'}, if_not_in_dict='Other', new_column_name='Metrics Short')`
+
+ ## 13. `aggregate_daily_to_wc_wide`
+ - **Description**: Aggregates daily data into weekly data and pivots it to wide format.
+ - **Usage**: `aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc='sun', aggregation='sum', include_totals=False)`
+ - **Example**: `aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions'], 'mon', 'average', True)`
+
+ ## 14. `merge_cols_with_seperator`
+ - **Description**: Merges multiple columns in a DataFrame into one column with a specified separator.
+ - **Usage**: `merge_cols_with_seperator(df, col_names, separator='_', output_column_name='Merged')`
+ - **Example**: `merge_cols_with_seperator(df, ['Campaign', 'Product'], separator='|', output_column_name='Merged Columns')`
+
+ ## 15. `check_sum_of_df_cols_are_equal`
+ - **Description**: Checks if the sums of two columns in two DataFrames are equal and provides the difference.
+ - **Usage**: `check_sum_of_df_cols_are_equal(df_1, df_2, cols_1, cols_2)`
+ - **Example**: `check_sum_of_df_cols_are_equal(df_1, df_2, 'Media Cost', 'Spend')`
+
+ ## 16. `convert_2_df_cols_to_dict`
+ - **Description**: Creates a dictionary from two DataFrame columns.
+ - **Usage**: `convert_2_df_cols_to_dict(df, key_col, value_col)`
+ - **Example**: `convert_2_df_cols_to_dict(df, 'Campaign', 'Channel')`
+
+ ## 17. `create_FY_and_H_columns`
+ - **Description**: Adds financial year and half-year columns to a DataFrame based on a start date.
+ - **Usage**: `create_FY_and_H_columns(df, index_col, start_date, starting_FY, short_format='No', half_years='No', combined_FY_and_H='No')`
+ - **Example**: `create_FY_and_H_columns(df, 'Week', '2022-10-03', 'FY2023', short_format='Yes')`
+
+ ## 18. `keyword_lookup_replacement`
+ - **Description**: Updates values in a column based on a lookup dictionary with conditional logic.
+ - **Usage**: `keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name='Updated Column')`
+ - **Example**: `keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel', 'segment'], lookup_dict, output_column_name='Channel New')`
+
+ ## 19. `create_new_version_of_col_using_LUT`
+ - **Description**: Creates a new column based on a lookup table applied to an existing column.
+ - **Usage**: `create_new_version_of_col_using_LUT(df, keys_col, value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')`
+ - **Example**: `create_new_version_of_col_using_LUT(df, 'Campaign Name', 'Campaign Type', lookup_dict)`
+
+ ## 20. `convert_df_wide_2_long`
+ - **Description**: Converts a wide-format DataFrame into a long-format DataFrame.
+ - **Usage**: `convert_df_wide_2_long(df, value_cols, variable_col_name='Stacked', value_col_name='Value')`
+ - **Example**: `convert_df_wide_2_long(df, ['col1', 'col2'], variable_col_name='Var', value_col_name='Val')`
+
+ ## 21. `manually_edit_data`
+ - **Description**: Manually updates specified cells in a DataFrame based on filters.
+ - **Usage**: `manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)`
+ - **Example**: `manually_edit_data(df, {'col1': '== 1'}, 'col2', 'new_val', add_notes='Yes', note='Manual Update')`
+
+ ## 22. `format_numbers_with_commas`
+ - **Description**: Formats numerical columns with commas and a specified number of decimal places.
+ - **Usage**: `format_numbers_with_commas(df, decimal_length_chosen=2)`
+ - **Example**: `format_numbers_with_commas(df, decimal_length_chosen=1)`
+
+ ## 23. `filter_df_on_multiple_conditions`
+ - **Description**: Filters a DataFrame based on multiple column conditions.
+ - **Usage**: `filter_df_on_multiple_conditions(df, filters_dict)`
+ - **Example**: `filter_df_on_multiple_conditions(df, {'col1': '>= 5', 'col2': "== 'val'"})`
+
+ ## 24. `read_and_concatenate_files`
+ - **Description**: Reads and concatenates files from a specified folder into a single DataFrame.
+ - **Usage**: `read_and_concatenate_files(folder_path, file_type='csv')`
+ - **Example**: `read_and_concatenate_files('/path/to/files', file_type='xlsx')`
+
+ ## 25. `upgrade_outdated_packages`
+ - **Description**: Upgrades all outdated Python packages except specified ones.
+ - **Usage**: `upgrade_outdated_packages(exclude_packages=['twine'])`
+ - **Example**: `upgrade_outdated_packages(exclude_packages=['pip', 'setuptools'])`
+
+ ## 26. `convert_mixed_formats_dates`
+ - **Description**: Converts mixed-format date columns into standardized datetime format.
+ - **Usage**: `convert_mixed_formats_dates(df, column_name)`
+ - **Example**: `convert_mixed_formats_dates(df, 'date_col')`
+
+ ## 27. `fill_weekly_date_range`
+ - **Description**: Fills in missing weekly dates in a DataFrame with a specified frequency.
+ - **Usage**: `fill_weekly_date_range(df, date_column, freq='W-MON')`
+ - **Example**: `fill_weekly_date_range(df, 'date_col')`
+
+ ## 28. `add_prefix_and_suffix`
+ - **Description**: Adds prefixes and/or suffixes to column names, with an option to exclude a date column.
+ - **Usage**: `add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)`
+ - **Example**: `add_prefix_and_suffix(df, prefix='pre_', suffix='_suf', date_col='date_col')`
+
+ ## 29. `create_dummies`
+ - **Description**: Creates dummy variables for columns, with an option to add a total dummy column.
+ - **Usage**: `create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')`
+ - **Example**: `create_dummies(df, date_col='date_col', dummy_threshold=1)`
+
+ ## 30. `replace_substrings`
+ - **Description**: Replaces substrings in a column based on a dictionary, with options for case conversion and new column creation.
+ - **Usage**: `replace_substrings(df, column, replacements, to_lower=False, new_column=None)`
+ - **Example**: `replace_substrings(df, 'text_col', {'old': 'new'}, to_lower=True, new_column='updated_text')`
+
+ ## 31. `add_total_column`
+ - **Description**: Adds a total column to a DataFrame by summing values across columns, optionally excluding one.
+ - **Usage**: `add_total_column(df, exclude_col=None, total_col_name='Total')`
+ - **Example**: `add_total_column(df, exclude_col='date_col')`
+
+ ## 32. `apply_lookup_table_based_on_substring`
+ - **Description**: Categorizes text in a column using a lookup table based on substrings.
+ - **Usage**: `apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')`
+ - **Example**: `apply_lookup_table_based_on_substring(df, 'text_col', {'sub1': 'cat1', 'sub2': 'cat2'})`
+
+ ## 33. `compare_overlap`
+ - **Description**: Compares overlapping periods between two DataFrames and summarizes differences.
+ - **Usage**: `compare_overlap(df1, df2, date_col)`
+ - **Example**: `compare_overlap(df1, df2, 'date_col')`
+
+ ## 34. `week_commencing_2_week_commencing_conversion_isoweekday`
+ - **Description**: Maps dates to the start of the current ISO week based on a specified weekday.
+ - **Usage**: `week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')`
+ - **Example**: `week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')`
+
+ ## 35. `seasonality_feature_extraction`
+ - **Description**: Splits data into train/test sets, trains XGBoost and Random Forest on all features, extracts the top features by importance, merges them, optionally retrains the models on the top and combined feature sets, and returns a dict of results.
+ - **Usage**: `seasonality_feature_extraction(df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False)`
+ - **Example**: `seasonality_feature_extraction(df, 'kpi_total_sales', n_features=5, test_size=0.2, random_state=123, shuffle=True)`
+
+ ---
+
+ ## Data Processing for Incrementality Testing
+
+ ## 1. `pull_ga`
+ - **Description**: Pull in GA4 data for geo experiments.
+ - **Usage**: `pull_ga(credentials_file, property_id, start_date, country, metrics)`
+ - **Example**: `pull_ga('GeoExperiment-31c5f5db2c39.json', '111111111', '2023-10-15', 'United Kingdom', ['totalUsers', 'newUsers'])`
+
+ ## 2. `process_itv_analysis`
+ - **Description**: Processes region-level data for geo experiments by mapping ITV regions, grouping selected metrics, merging with media spend data, and saving the result.
+ - **Usage**: `process_itv_analysis(raw_df, itv_path, cities_path, media_spend_path, output_path, test_group, control_group, columns_to_aggregate, aggregator_list)`
+ - **Example**: `process_itv_analysis(df, 'itv regional mapping.csv', 'Geo_Mappings_with_Coordinates.xlsx', 'IMS.xlsx', 'itv_for_test_analysis_itvx.csv', ['West', 'Westcountry', 'Tyne Tees'], ['Central Scotland', 'North Scotland'], ['newUsers', 'transactions'], ['sum', 'sum'])`
+
+ ## 3. `process_city_analysis`
+ - **Description**: Processes city-level data for geo experiments by grouping selected metrics, merging with media spend data, and saving the result.
+ - **Usage**: `process_city_analysis(raw_df, spend_df, output_path, test_group, control_group, columns_to_aggregate, aggregator_list)`
+ - **Example**: `process_city_analysis(df, spend, output, ['Barnsley'], ['Aberdeen'], ['newUsers', 'transactions'], ['sum', 'sum'])`
+
+ ---
+
+ ## Data Visualisations
+
+ ## 1. `plot_one`
+ - **Description**: Plots a specified column from a DataFrame with a white background and black axes.
+ - **Usage**: `plot_one(df1, col1, date_column)`
+ - **Example**: `plot_one(df, 'sales', 'date')`
+
+ ## 2. `plot_two`
+ - **Description**: Plots specified columns from two DataFrames, optionally on the same or separate y-axes.
+ - **Usage**: `plot_two(df1, col1, df2, col2, date_column, same_axis=True)`
+ - **Example**: `plot_two(df1, 'sales', df2, 'revenue', 'date', same_axis=False)`
+
+ ## 3. `plot_chart`
+ - **Description**: Plots various chart types using Plotly, including line, bar, scatter, area, pie, etc.
+ - **Usage**: `plot_chart(df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values')`
+ - **Example**: `plot_chart(df, 'date', ['sales', 'revenue'], chart_type='line', title='Sales and Revenue')`
+
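A hedged sketch of how the plotting helpers slot into a workflow, assuming a weekly DataFrame `df` with an 'OBS' date column and two metric columns (hypothetical names):

```python
# 'df' is assumed: weekly data with an 'OBS' date column plus two metrics.
ims_vis.plot_one(df, "kpi_sales", "OBS")
ims_vis.plot_chart(df, "OBS", ["kpi_sales", "media_spend"],
                   chart_type="line", title="Sales vs Spend",
                   x_title="Week commencing", y_title="Value")
```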
+ ---
+
+ ## Data Pulling
+
+ ## 1. `pull_fred_data`
+ - **Description**: Fetch data from FRED using series ID tokens.
+ - **Usage**: `pull_fred_data(week_commencing, series_id_list)`
+ - **Example**: `pull_fred_data('mon', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])`
+
+ ## 2. `pull_boe_data`
+ - **Description**: Fetch and process Bank of England interest rate data.
+ - **Usage**: `pull_boe_data(week_commencing)`
+ - **Example**: `pull_boe_data('mon')`
+
+ ## 3. `pull_oecd`
+ - **Description**: Fetch macroeconomic data from OECD for a specified country.
+ - **Usage**: `pull_oecd(country='GBR', week_commencing='mon', start_date='2020-01-01')`
+ - **Example**: `pull_oecd('GBR', 'mon', '2000-01-01')`
+
+ ## 4. `get_google_mobility_data`
+ - **Description**: Fetch Google Mobility data for the specified country.
+ - **Usage**: `get_google_mobility_data(country, wc)`
+ - **Example**: `get_google_mobility_data('United Kingdom', 'mon')`
+
+ ## 5. `pull_seasonality`
+ - **Description**: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.
+ - **Usage**: `pull_seasonality(week_commencing, start_date, countries)`
+ - **Example**: `pull_seasonality('mon', '2020-01-01', ['US', 'GB'])`
+
+ ## 6. `pull_weather`
+ - **Description**: Fetch and process historical weather data for the specified country.
+ - **Usage**: `pull_weather(week_commencing, start_date, country)`
+ - **Example**: `pull_weather('mon', '2020-01-01', 'GBR')`
+
+ ## 7. `pull_macro_ons_uk`
+ - **Description**: Fetch and process time series data from the beta ONS API.
+ - **Usage**: `pull_macro_ons_uk(cdid_list, week_start_day, sector)`
+ - **Example**: `pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')`
+
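As of 0.9.6.3, `sector` also accepts a list, and sector CDID maps exist for 'fast_food', 'clothing_footwear', 'fuel' and 'cars'. A sketch of a call using the new form; per the updated docstring, columns come back renamed 'macro_<series>_uk' alongside an 'OBS' week-commencing column:

```python
# Default UK macro CDIDs plus two sector bundles and one extra series.
ims_pull = datapull()
macro = ims_pull.pull_macro_ons_uk(cdid_list=["HBOI"],
                                   week_start_day="mon",
                                   sector=["fast_food", "fuel"])
print(macro.columns)  # 'OBS' plus 'macro_..._uk' series
```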
+ ## 8. `pull_yfinance`
+ - **Description**: Fetch and process time series data from Yahoo Finance.
+ - **Usage**: `pull_yfinance(tickers, week_start_day)`
+ - **Example**: `pull_yfinance(['^FTMC', '^IXIC'], 'mon')`
+
+ ## 9. `pull_sports_events`
+ - **Description**: Pull a variety of sports events, primarily football and rugby.
+ - **Usage**: `pull_sports_events(start_date, week_commencing)`
+ - **Example**: `pull_sports_events('2020-01-01', 'mon')`
+
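These pulls are designed to be joined on their shared week-commencing column; a sketch of building a combined weekly base table with the functions as documented above ('OBS' is the common key):

```python
ims_proc = dataprocessing()
ims_pull = datapull()

boe = ims_pull.pull_boe_data("mon")
seas = ims_pull.pull_seasonality("mon", "2020-01-01", ["GB"])
macro = ims_pull.pull_macro_ons_uk(["HBOI"], "mon", "fast_food")

base = ims_proc.merge_dataframes_on_column([boe, seas, macro],
                                           common_column="OBS",
                                           merge_how="outer")
```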
+ ---
+
+ ## Installation
+
+ Install the IMS package via pip:
+
+ ```bash
+ pip install imsciences
+ ```
+
+ ---
+
+ ## License
+
+ This project is licensed under the MIT License. ![License](https://img.shields.io/badge/license-MIT-blue.svg)
+
+ ---
+
+ ## Roadmap
+
+ - [Fixes]: Naming conventions are inconsistent and have changed from previous seasonality tools (e.g. 'seas_nyd' is now 'seas_new_years_day', 'week_1' is now 'seas_1').
+ - [Fixes]: Naming conventions can be inconsistent within the data pulls: some variables carry a 'gb' suffix, some 'uk', and others none. Global holidays and events (Christmas, Easter, Halloween, etc.) are similarly inconsistent - some have a regional suffix and others don't.
+ - [Additions]: Add new data pulls for more macro and seasonal variables.
+
+ ---
@@ -8,7 +8,7 @@ imsciences/pull.py
  imsciences/unittesting.py
  imsciences/vis.py
  imsciences.egg-info/PKG-INFO
- imsciences.egg-info/PKG-INFO-IMS-24Ltp-3
+ imsciences.egg-info/PKG-INFO-TomG-HP-290722
  imsciences.egg-info/SOURCES.txt
  imsciences.egg-info/dependency_links.txt
  imsciences.egg-info/requires.txt
@@ -8,7 +8,7 @@ def read_md(file_name):
  return f.read()
  return ''
 
- VERSION = '0.9.5.9'
+ VERSION = '0.9.6.3'
  DESCRIPTION = 'IMS Data Processing Package'
  LONG_DESCRIPTION = read_md('README.md')
 
@@ -34,5 +34,5 @@ setup(
  "Operating System :: Unix",
  "Operating System :: MacOS :: MacOS X",
  "Operating System :: Microsoft :: Windows",
- ]
+ ],
  )
@@ -1,24 +0,0 @@
- Metadata-Version: 2.1
- Name: imsciences
- Version: 0.6.1.1
- Summary: IMS Data Processing Package
- Author: IMS
- Author-email: cam@im-sciences.com
- Keywords: python,data processing
- Classifier: Development Status :: 3 - Alpha
- Classifier: Intended Audience :: Developers
- Classifier: Programming Language :: Python :: 3
- Classifier: Operating System :: Unix
- Classifier: Operating System :: MacOS :: MacOS X
- Classifier: Operating System :: Microsoft :: Windows
- Description-Content-Type: text/markdown
- Requires-Dist: pandas
-
- # IMS Package Documentation
-
- The IMS package is a python library for processing incoming data into a format that can be used for projects. IMS processing offers a variety of functions to manipulate and analyze data efficiently. Here are the functionalities provided by the package:
-
- ## Data Processing
-
- ## Data Pulling
-
File without changes
File without changes
File without changes