rgwfuncs 0.0.63__tar.gz → 0.0.65__tar.gz

@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: rgwfuncs
- Version: 0.0.63
+ Version: 0.0.65
  Summary: A functional programming paradigm for mathematical modelling and data science
  Home-page: https://github.com/ryangerardwilson/rgwfunc
  Author: Ryan Gerard Wilson
@@ -1713,7 +1713,34 @@ Processes and saves a DataFrame to an SQLite database, adding a timestamp column

  --------------------------------------------------------------------------------

+ ### 46. `load_fresh_data_or_pull_from_cache`
+ Retrieves data from a cache if a recent cache file exists, or fetches fresh data, saves it to the cache, and returns it. If the cache is too old or doesn't exist, it uses a fetching function to get new data, which it caches and returns.

+ • Parameters:
+ - `fetch_func` (typing.Callable[[], pd.DataFrame]): A callable function that fetches fresh data and returns it as a pandas DataFrame.
+ - `cache_dir` (str): The directory where cache files are stored.
+ - `file_prefix` (str): The prefix used for cache filenames to identify relevant cache files.
+ - `cache_cutoff_hours` (int): The age in hours beyond which a cache file is considered obsolete.
+
+ • Returns:
+ - `pd.DataFrame`: The DataFrame containing cached or freshly fetched data.
+
+ • Example:
+
+ from rgwfuncs import load_fresh_data_or_pull_from_cache
+ import pandas as pd
+
+ def fetch_data():
+     # This is your data-fetching logic. Replace with real fetching code.
+     return pd.DataFrame({'Column1': [1, 2, 3], 'Column2': [4, 5, 6]})
+
+ cache_dir = 'cache_directory'
+ file_prefix = 'cached_data'
+ cache_cutoff_hours = 24
+
+ df = load_fresh_data_or_pull_from_cache(fetch_data, cache_dir, file_prefix, cache_cutoff_hours)
+
+ --------------------------------------------------------------------------------

  ## Additional Info

@@ -1686,7 +1686,34 @@ Processes and saves a DataFrame to an SQLite database, adding a timestamp column

  --------------------------------------------------------------------------------

+ ### 46. `load_fresh_data_or_pull_from_cache`
+ Retrieves data from a cache if a recent cache file exists, or fetches fresh data, saves it to the cache, and returns it. If the cache is too old or doesn't exist, it uses a fetching function to get new data, which it caches and returns.

+ • Parameters:
+ - `fetch_func` (typing.Callable[[], pd.DataFrame]): A callable function that fetches fresh data and returns it as a pandas DataFrame.
+ - `cache_dir` (str): The directory where cache files are stored.
+ - `file_prefix` (str): The prefix used for cache filenames to identify relevant cache files.
+ - `cache_cutoff_hours` (int): The age in hours beyond which a cache file is considered obsolete.
+
+ • Returns:
+ - `pd.DataFrame`: The DataFrame containing cached or freshly fetched data.
+
+ • Example:
+
+ from rgwfuncs import load_fresh_data_or_pull_from_cache
+ import pandas as pd
+
+ def fetch_data():
+     # This is your data-fetching logic. Replace with real fetching code.
+     return pd.DataFrame({'Column1': [1, 2, 3], 'Column2': [4, 5, 6]})
+
+ cache_dir = 'cache_directory'
+ file_prefix = 'cached_data'
+ cache_cutoff_hours = 24
+
+ df = load_fresh_data_or_pull_from_cache(fetch_data, cache_dir, file_prefix, cache_cutoff_hours)
+
+ --------------------------------------------------------------------------------

  ## Additional Info

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "rgwfuncs"
- version = "0.0.63"
+ version = "0.0.65"
  authors = [
      { name = "Ryan Gerard Wilson", email = "ryangerardwilson@gmail.com" },
  ]
@@ -1,6 +1,6 @@
  [metadata]
  name = rgwfuncs
- version = 0.0.63
+ version = 0.0.65
  author = Ryan Gerard Wilson
  author_email = ryangerardwilson@gmail.com
  description = A functional programming paradigm for mathematical modelling and data science
@@ -1,8 +1,8 @@
  # This file is automatically generated
  # Dynamically importing functions from modules

+ from .df_lib import append_columns, append_percentile_classification_column, append_ranged_classification_column, append_ranged_date_classification_column, append_rows, append_xgb_labels, append_xgb_logistic_regression_predictions, append_xgb_regression_predictions, bag_union_join, bottom_n_unique_values, cascade_sort, delete_rows, drop_duplicates, drop_duplicates_retain_first, drop_duplicates_retain_last, filter_dataframe, filter_indian_mobiles, first_n_rows, from_raw_data, insert_dataframe_in_sqlite_database, last_n_rows, left_join, limit_dataframe, load_data_from_path, load_data_from_query, load_data_from_sqlite_path, load_fresh_data_or_pull_from_cache, mask_against_dataframe, mask_against_dataframe_converse, numeric_clean, order_columns, print_correlation, print_dataframe, print_memory_usage, print_n_frequency_cascading, print_n_frequency_linear, rename_columns, retain_columns, right_join, send_data_to_email, send_data_to_slack, send_dataframe_via_telegram, sync_dataframe_to_sqlite_database, top_n_unique_values, union_join, update_rows
+ from .interactive_shell_lib import interactive_shell
  from .algebra_lib import cancel_polynomial_expression, compute_constant_expression, compute_constant_expression_involving_matrices, compute_constant_expression_involving_ordered_series, compute_prime_factors, expand_polynomial_expression, factor_polynomial_expression, plot_polynomial_functions, plot_x_points_of_polynomial_functions, python_polynomial_expression_to_latex, simplify_polynomial_expression, solve_homogeneous_polynomial_expression
- from .df_lib import append_columns, append_percentile_classification_column, append_ranged_classification_column, append_ranged_date_classification_column, append_rows, append_xgb_labels, append_xgb_logistic_regression_predictions, append_xgb_regression_predictions, bag_union_join, bottom_n_unique_values, cascade_sort, delete_rows, drop_duplicates, drop_duplicates_retain_first, drop_duplicates_retain_last, filter_dataframe, filter_indian_mobiles, first_n_rows, from_raw_data, insert_dataframe_in_sqlite_database, last_n_rows, left_join, limit_dataframe, load_data_from_path, load_data_from_query, load_data_from_sqlite_path, mask_against_dataframe, mask_against_dataframe_converse, numeric_clean, order_columns, print_correlation, print_dataframe, print_memory_usage, print_n_frequency_cascading, print_n_frequency_linear, rename_columns, retain_columns, right_join, send_data_to_email, send_data_to_slack, send_dataframe_via_telegram, sync_dataframe_to_sqlite_database, top_n_unique_values, union_join, update_rows
  from .docs_lib import docs
- from .interactive_shell_lib import interactive_shell
  from .str_lib import send_telegram_message
@@ -23,7 +23,7 @@ from googleapiclient.discovery import build
  import base64
  import boto3
  # import inspect
- from typing import Optional, Dict, List, Tuple, Any
+ from typing import Optional, Dict, List, Tuple, Any, Callable
  import warnings

  # Suppress all FutureWarnings
@@ -685,7 +685,7 @@ def bottom_n_unique_values(
  report[column] = {
  str(value): str(count) for value,
  count in bottom_n_values.items()}
- print(f"Bottom {n} unique values for column '{column}':\n{json.dumps(report[column],indent=2)}\n")
+ print(f"Bottom {n} unique values for column '{column}':\n{json.dumps(report[column], indent=2)}\n")
  else:
  print(f"Column '{column}' does not exist in the DataFrame.")
  else:
@@ -1221,7 +1221,7 @@ def append_ranged_classification_column(
  for r in range_list
  )

- labels = [f"{pad_number(range_list[i],max_integer_length)} to {pad_number(range_list[i + 1], max_integer_length)}" for i in range(len(range_list) - 1)]
+ labels = [f"{pad_number(range_list[i], max_integer_length)} to {pad_number(range_list[i + 1], max_integer_length)}" for i in range(len(range_list) - 1)]

  # Ensure the target column is numeric
  df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
@@ -1946,8 +1946,7 @@ def insert_dataframe_in_sqlite_database(
  cursor = conn.cursor()

  if not table_exists(cursor, tablename):
- columns_with_types = ', '.join(
- f'"{col}" {map_dtype(dtype)}' for col, dtype in zip(df.columns, df.dtypes))
+ columns_with_types = ', '.join(f'"{col}" {map_dtype(dtype)}' for col, dtype in zip(df.columns, df.dtypes))
  create_table_query = f'CREATE TABLE "{tablename}" ({columns_with_types})'
  conn.execute(create_table_query)

@@ -1992,7 +1991,7 @@ def sync_dataframe_to_sqlite_database(
  cursor.execute(f"PRAGMA table_info({new_table_name})")
  if cursor.fetchall() == []: # Table does not exist
  # Create a table using the DataFrame's column names and types
- columns_with_types = ', '.join(f'"{col}" {map_dtype(dtype)}' for col,dtype in zip(df.columns,df.dtypes))
+ columns_with_types = ', '.join(f'"{col}" {map_dtype(dtype)}' for col, dtype in zip(df.columns, df.dtypes))
  create_table_query = f'CREATE TABLE "{new_table_name}" ({columns_with_types})'
  conn.execute(create_table_query)

@@ -2009,3 +2008,65 @@ def sync_dataframe_to_sqlite_database(
  conn.execute(f"DROP TABLE IF EXISTS {tablename}")
  # Rename the new table to the old table name
  conn.execute(f"ALTER TABLE {new_table_name} RENAME TO {tablename}")
+
+
+ def load_fresh_data_or_pull_from_cache(fetch_func: Callable[[], pd.DataFrame], cache_dir: str, file_prefix: str, cache_cutoff_hours: int) -> pd.DataFrame:
+     """
+     Retrieve data from a cache if a recent cache file exists, or fetch fresh data, save it to the cache, and return it.
+
+     This function checks a specified directory for the most recent cache file matching a specified prefix.
+     If a recent cache file (within the cutoff time in hours) is found, the data is read from there.
+     Otherwise, it calls the data-fetching function, saves the newly fetched data to a new cache file, and returns it.
+
+     Parameters:
+     - fetch_func (typing.Callable[[], pd.DataFrame]):
+         A callable function that, when executed, returns a pandas DataFrame with fresh data.
+     - cache_dir (str):
+         The directory where cache files are stored.
+     - file_prefix (str):
+         The prefix used for cache filenames to identify relevant cache files.
+     - cache_cutoff_hours (int):
+         The maximum age of a cache file (in hours) to be considered valid.
+         If no file is fresh enough, fresh data will be fetched.
+
+     Returns:
+     - pd.DataFrame:
+         The pandas DataFrame containing either cached or freshly fetched data.
+     """
+
+     # Ensure the directory exists
+     os.makedirs(cache_dir, exist_ok=True)
+
+     # Generate the current timestamp in the required format
+     now: datetime = datetime.now()
+
+     # Initialize cache file details
+     latest_cache_filename: str = None
+     latest_cache_time: datetime = None
+
+     # Retrieve the latest cache file if it exists
+     for filename in os.listdir(cache_dir):
+         if filename.startswith(file_prefix) and filename.endswith(".csv"):
+             timestamp_str: str = filename[len(file_prefix)+1:].replace('.csv', '')
+             try:
+                 file_time: datetime = datetime.strptime(timestamp_str, '%Y%m%d%H%M%S')
+                 if latest_cache_time is None or file_time > latest_cache_time:
+                     latest_cache_time = file_time
+                     latest_cache_filename = filename
+             except ValueError:
+                 continue
+
+     # If a valid cache exists and is within the cutoff time, read from it
+     if latest_cache_time and now - latest_cache_time < timedelta(hours=cache_cutoff_hours):
+         df: pd.DataFrame = pd.read_csv(os.path.join(cache_dir, latest_cache_filename))
+     else:
+         # Fetch new data via the provided function
+         df = fetch_func()
+
+         # Save the new data in a cache file
+         current_time_str: str = now.strftime('%Y%m%d%H%M%S')
+         cache_filename: str = f"{file_prefix}_{current_time_str}.csv"
+         df.to_csv(os.path.join(cache_dir, cache_filename), index=False)
+
+     return df
+
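
The hunk above shows the new caching helper in full. As a quick orientation, here is a minimal sketch of how it is meant to be called; the `fetch_quotes` function, the `.cache` directory, and the `quotes` prefix are illustrative placeholders rather than anything shipped with the package.

    from rgwfuncs import load_fresh_data_or_pull_from_cache
    import pandas as pd

    def fetch_quotes() -> pd.DataFrame:
        # Stand-in for a slow or rate-limited data source.
        return pd.DataFrame({'symbol': ['AAA', 'BBB'], 'price': [10.5, 20.1]})

    # First call: no quotes_*.csv exists yet, so fetch_quotes() runs and the
    # result is written to .cache/quotes_<YYYYMMDDHHMMSS>.csv.
    df1 = load_fresh_data_or_pull_from_cache(fetch_quotes, '.cache', 'quotes', cache_cutoff_hours=6)

    # Second call within 6 hours: the newest quotes_*.csv is younger than the
    # cutoff, so the DataFrame is read back from disk and fetch_quotes() is not called.
    df2 = load_fresh_data_or_pull_from_cache(fetch_quotes, '.cache', 'quotes', cache_cutoff_hours=6)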