rgwfuncs 0.0.63__tar.gz → 0.0.65__tar.gz

@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: rgwfuncs
- Version: 0.0.63
+ Version: 0.0.65
  Summary: A functional programming paradigm for mathematical modelling and data science
  Home-page: https://github.com/ryangerardwilson/rgwfunc
  Author: Ryan Gerard Wilson
@@ -1713,7 +1713,34 @@ Processes and saves a DataFrame to an SQLite database, adding a timestamp column

  --------------------------------------------------------------------------------

+ ### 46. `load_fresh_data_or_pull_from_cache`
+ Retrieves data from a cache if a recent cache file exists, or fetches fresh data, saves it to the cache, and returns it. If the cache is too old or doesn't exist, it uses a fetching function to get new data, which it caches and returns.

+ • Parameters:
+ - `fetch_func` (typing.Callable[[], pd.DataFrame]): A callable function that fetches fresh data and returns it as a pandas DataFrame.
+ - `cache_dir` (str): The directory where cache files are stored.
+ - `file_prefix` (str): The prefix used for cache filenames to identify relevant cache files.
+ - `cache_cutoff_hours` (int): The age in hours beyond which a cache file is considered obsolete.
+
+ • Returns:
+ - `pd.DataFrame`: The DataFrame containing cached or freshly fetched data.
+
+ • Example:
+
+ from rgwfuncs import load_fresh_data_or_pull_from_cache
+ import pandas as pd
+
+ def fetch_data():
+     # This is your data-fetching logic. Replace with real fetching code.
+     return pd.DataFrame({'Column1': [1, 2, 3], 'Column2': [4, 5, 6]})
+
+ cache_dir = 'cache_directory'
+ file_prefix = 'cached_data'
+ cache_cutoff_hours = 24
+
+ df = load_fresh_data_or_pull_from_cache(fetch_data, cache_dir, file_prefix, cache_cutoff_hours)
+
+ --------------------------------------------------------------------------------

  ## Additional Info

@@ -1686,7 +1686,34 @@ Processes and saves a DataFrame to an SQLite database, adding a timestamp column

  --------------------------------------------------------------------------------

+ ### 46. `load_fresh_data_or_pull_from_cache`
+ Retrieves data from a cache if a recent cache file exists, or fetches fresh data, saves it to the cache, and returns it. If the cache is too old or doesn't exist, it uses a fetching function to get new data, which it caches and returns.

+ • Parameters:
+ - `fetch_func` (typing.Callable[[], pd.DataFrame]): A callable function that fetches fresh data and returns it as a pandas DataFrame.
+ - `cache_dir` (str): The directory where cache files are stored.
+ - `file_prefix` (str): The prefix used for cache filenames to identify relevant cache files.
+ - `cache_cutoff_hours` (int): The age in hours beyond which a cache file is considered obsolete.
+
+ • Returns:
+ - `pd.DataFrame`: The DataFrame containing cached or freshly fetched data.
+
+ • Example:
+
+ from rgwfuncs import load_fresh_data_or_pull_from_cache
+ import pandas as pd
+
+ def fetch_data():
+     # This is your data-fetching logic. Replace with real fetching code.
+     return pd.DataFrame({'Column1': [1, 2, 3], 'Column2': [4, 5, 6]})
+
+ cache_dir = 'cache_directory'
+ file_prefix = 'cached_data'
+ cache_cutoff_hours = 24
+
+ df = load_fresh_data_or_pull_from_cache(fetch_data, cache_dir, file_prefix, cache_cutoff_hours)
+
+ --------------------------------------------------------------------------------

  ## Additional Info

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "rgwfuncs"
- version = "0.0.63"
+ version = "0.0.65"
  authors = [
      { name = "Ryan Gerard Wilson", email = "ryangerardwilson@gmail.com" },
  ]
@@ -1,6 +1,6 @@
  [metadata]
  name = rgwfuncs
- version = 0.0.63
+ version = 0.0.65
  author = Ryan Gerard Wilson
  author_email = ryangerardwilson@gmail.com
  description = A functional programming paradigm for mathematical modelling and data science
@@ -1,8 +1,8 @@
  # This file is automatically generated
  # Dynamically importing functions from modules

+ from .df_lib import append_columns, append_percentile_classification_column, append_ranged_classification_column, append_ranged_date_classification_column, append_rows, append_xgb_labels, append_xgb_logistic_regression_predictions, append_xgb_regression_predictions, bag_union_join, bottom_n_unique_values, cascade_sort, delete_rows, drop_duplicates, drop_duplicates_retain_first, drop_duplicates_retain_last, filter_dataframe, filter_indian_mobiles, first_n_rows, from_raw_data, insert_dataframe_in_sqlite_database, last_n_rows, left_join, limit_dataframe, load_data_from_path, load_data_from_query, load_data_from_sqlite_path, load_fresh_data_or_pull_from_cache, mask_against_dataframe, mask_against_dataframe_converse, numeric_clean, order_columns, print_correlation, print_dataframe, print_memory_usage, print_n_frequency_cascading, print_n_frequency_linear, rename_columns, retain_columns, right_join, send_data_to_email, send_data_to_slack, send_dataframe_via_telegram, sync_dataframe_to_sqlite_database, top_n_unique_values, union_join, update_rows
+ from .interactive_shell_lib import interactive_shell
  from .algebra_lib import cancel_polynomial_expression, compute_constant_expression, compute_constant_expression_involving_matrices, compute_constant_expression_involving_ordered_series, compute_prime_factors, expand_polynomial_expression, factor_polynomial_expression, plot_polynomial_functions, plot_x_points_of_polynomial_functions, python_polynomial_expression_to_latex, simplify_polynomial_expression, solve_homogeneous_polynomial_expression
- from .df_lib import append_columns, append_percentile_classification_column, append_ranged_classification_column, append_ranged_date_classification_column, append_rows, append_xgb_labels, append_xgb_logistic_regression_predictions, append_xgb_regression_predictions, bag_union_join, bottom_n_unique_values, cascade_sort, delete_rows, drop_duplicates, drop_duplicates_retain_first, drop_duplicates_retain_last, filter_dataframe, filter_indian_mobiles, first_n_rows, from_raw_data, insert_dataframe_in_sqlite_database, last_n_rows, left_join, limit_dataframe, load_data_from_path, load_data_from_query, load_data_from_sqlite_path, mask_against_dataframe, mask_against_dataframe_converse, numeric_clean, order_columns, print_correlation, print_dataframe, print_memory_usage, print_n_frequency_cascading, print_n_frequency_linear, rename_columns, retain_columns, right_join, send_data_to_email, send_data_to_slack, send_dataframe_via_telegram, sync_dataframe_to_sqlite_database, top_n_unique_values, union_join, update_rows
  from .docs_lib import docs
- from .interactive_shell_lib import interactive_shell
  from .str_lib import send_telegram_message
@@ -23,7 +23,7 @@ from googleapiclient.discovery import build
  import base64
  import boto3
  # import inspect
- from typing import Optional, Dict, List, Tuple, Any
+ from typing import Optional, Dict, List, Tuple, Any, Callable
  import warnings

  # Suppress all FutureWarnings
@@ -685,7 +685,7 @@ def bottom_n_unique_values(
  report[column] = {
  str(value): str(count) for value,
  count in bottom_n_values.items()}
- print(f"Bottom {n} unique values for column '{column}':\n{json.dumps(report[column],indent=2)}\n")
+ print(f"Bottom {n} unique values for column '{column}':\n{json.dumps(report[column], indent=2)}\n")
  else:
  print(f"Column '{column}' does not exist in the DataFrame.")
  else:
@@ -1221,7 +1221,7 @@ def append_ranged_classification_column(
  for r in range_list
  )

- labels = [f"{pad_number(range_list[i],max_integer_length)} to {pad_number(range_list[i + 1], max_integer_length)}" for i in range(len(range_list) - 1)]
+ labels = [f"{pad_number(range_list[i], max_integer_length)} to {pad_number(range_list[i + 1], max_integer_length)}" for i in range(len(range_list) - 1)]

  # Ensure the target column is numeric
  df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
@@ -1946,8 +1946,7 @@ def insert_dataframe_in_sqlite_database(
  cursor = conn.cursor()

  if not table_exists(cursor, tablename):
- columns_with_types = ', '.join(
- f'"{col}" {map_dtype(dtype)}' for col, dtype in zip(df.columns, df.dtypes))
+ columns_with_types = ', '.join(f'"{col}" {map_dtype(dtype)}' for col, dtype in zip(df.columns, df.dtypes))
  create_table_query = f'CREATE TABLE "{tablename}" ({columns_with_types})'
  conn.execute(create_table_query)

@@ -1992,7 +1991,7 @@ def sync_dataframe_to_sqlite_database(
  cursor.execute(f"PRAGMA table_info({new_table_name})")
  if cursor.fetchall() == []: # Table does not exist
  # Create a table using the DataFrame's column names and types
- columns_with_types = ', '.join(f'"{col}" {map_dtype(dtype)}' for col,dtype in zip(df.columns,df.dtypes))
+ columns_with_types = ', '.join(f'"{col}" {map_dtype(dtype)}' for col, dtype in zip(df.columns, df.dtypes))
  create_table_query = f'CREATE TABLE "{new_table_name}" ({columns_with_types})'
  conn.execute(create_table_query)

@@ -2009,3 +2008,65 @@ def sync_dataframe_to_sqlite_database(
  conn.execute(f"DROP TABLE IF EXISTS {tablename}")
  # Rename the new table to the old table name
  conn.execute(f"ALTER TABLE {new_table_name} RENAME TO {tablename}")
+
+
+ def load_fresh_data_or_pull_from_cache(fetch_func: Callable[[], pd.DataFrame], cache_dir: str, file_prefix: str, cache_cutoff_hours: int) -> pd.DataFrame:
+     """
+     Retrieve data from a cache if a recent cache file exists, or fetch fresh data, save it to the cache, and return it.
+
+     This function checks a specified directory for the most recent cache file matching a specified prefix.
+     If a recent cache file (within the cutoff time in hours) is found, the data is read from there.
+     Otherwise, it calls the data-fetching function, saves the newly fetched data to a new cache file, and returns it.
+
+     Parameters:
+     - fetch_func (typing.Callable[[], pd.DataFrame]):
+         A callable function that, when executed, returns a pandas DataFrame with fresh data.
+     - cache_dir (str):
+         The directory where cache files are stored.
+     - file_prefix (str):
+         The prefix used for cache filenames to identify relevant cache files.
+     - cache_cutoff_hours (int):
+         The maximum age of a cache file (in hours) to be considered valid.
+         If no file is fresh enough, fresh data will be fetched.
+
+     Returns:
+     - pd.DataFrame:
+         The pandas DataFrame containing either cached or freshly fetched data.
+     """
+
+     # Ensure the directory exists
+     os.makedirs(cache_dir, exist_ok=True)
+
+     # Generate the current timestamp in the required format
+     now: datetime = datetime.now()
+
+     # Initialize cache file details
+     latest_cache_filename: str = None
+     latest_cache_time: datetime = None
+
+     # Retrieve the latest cache file if it exists
+     for filename in os.listdir(cache_dir):
+         if filename.startswith(file_prefix) and filename.endswith(".csv"):
+             timestamp_str: str = filename[len(file_prefix)+1:].replace('.csv', '')
+             try:
+                 file_time: datetime = datetime.strptime(timestamp_str, '%Y%m%d%H%M%S')
+                 if latest_cache_time is None or file_time > latest_cache_time:
+                     latest_cache_time = file_time
+                     latest_cache_filename = filename
+             except ValueError:
+                 continue
+
+     # If a valid cache exists and is within the cutoff time, read from it
+     if latest_cache_time and now - latest_cache_time < timedelta(hours=cache_cutoff_hours):
+         df: pd.DataFrame = pd.read_csv(os.path.join(cache_dir, latest_cache_filename))
+     else:
+         # Fetch new data via the provided function
+         df = fetch_func()
+
+         # Save the new data in a cache file
+         current_time_str: str = now.strftime('%Y%m%d%H%M%S')
+         cache_filename: str = f"{file_prefix}_{current_time_str}.csv"
+         df.to_csv(os.path.join(cache_dir, cache_filename), index=False)
+
+     return df
+
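
The hunk above shows the new caching helper in full. As a quick orientation, here is a minimal sketch of how it is meant to be called; the `fetch_quotes` function, the `.cache` directory, and the `quotes` prefix are illustrative placeholders rather than anything shipped with the package.

    from rgwfuncs import load_fresh_data_or_pull_from_cache
    import pandas as pd

    def fetch_quotes() -> pd.DataFrame:
        # Stand-in for a slow or rate-limited data source.
        return pd.DataFrame({'symbol': ['AAA', 'BBB'], 'price': [10.5, 20.1]})

    # First call: no quotes_*.csv exists yet, so fetch_quotes() runs and the
    # result is written to .cache/quotes_<YYYYMMDDHHMMSS>.csv.
    df1 = load_fresh_data_or_pull_from_cache(fetch_quotes, '.cache', 'quotes', cache_cutoff_hours=6)

    # Second call within 6 hours: the newest quotes_*.csv is younger than the
    # cutoff, so the DataFrame is read back from disk and fetch_quotes() is not called.
    df2 = load_fresh_data_or_pull_from_cache(fetch_quotes, '.cache', 'quotes', cache_cutoff_hours=6)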