rgwfuncs 0.0.92__py3-none-any.whl → 0.0.94__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rgwfuncs/df_lib.py CHANGED
@@ -1279,7 +1279,7 @@ def append_ranged_classification_column(df: pd.DataFrame, ranges: List[Union[int
1279
1279
  decimal=True
1280
1280
  )
1281
1281
  if i == len(ranges) - 1:
1282
- label = f"{start} to infinity"
1282
+ label = f"{start}+"
1283
1283
  else:
1284
1284
  end = pad_number(
1285
1285
  ranges[i + 1],
@@ -1287,7 +1287,7 @@ def append_ranged_classification_column(df: pd.DataFrame, ranges: List[Union[int
1287
1287
  max_decimal_length,
1288
1288
  decimal=True
1289
1289
  )
1290
- label = f"{start} to {end}"
1290
+ label = f"{start} - {end}"
1291
1291
  labels.append(label)
1292
1292
 
1293
1293
  else:
@@ -1298,10 +1298,10 @@ def append_ranged_classification_column(df: pd.DataFrame, ranges: List[Union[int
1298
1298
  for i in range(len(ranges)):
1299
1299
  start = pad_number(ranges[i], max_integer_length)
1300
1300
  if i == len(ranges) - 1:
1301
- label = f"{start} to infinity"
1301
+ label = f"{start}+"
1302
1302
  else:
1303
1303
  end = pad_number(ranges[i + 1], max_integer_length)
1304
- label = f"{start} to {end}"
1304
+ label = f"{start} - {end}"
1305
1305
  labels.append(label)
1306
1306
 
1307
1307
  # Ensure the target column is numeric
@@ -1365,7 +1365,7 @@ def append_percentile_classification_column(df: pd.DataFrame, percentiles: List[
1365
1365
  max_decimal_length,
1366
1366
  decimal=True
1367
1367
  )
1368
- label = f"{start} to {end}"
1368
+ label = f"{start} - {end}"
1369
1369
  labels.append(label)
1370
1370
  else:
1371
1371
  percentiles_list = [int(p) for p in percentiles]
@@ -1375,7 +1375,7 @@ def append_percentile_classification_column(df: pd.DataFrame, percentiles: List[
1375
1375
  for i in range(len(percentiles_list) - 1):
1376
1376
  start = pad_number(percentiles_list[i], max_integer_length)
1377
1377
  end = pad_number(percentiles_list[i + 1], max_integer_length)
1378
- label = f"{start} to {end}"
1378
+ label = f"{start} - {end}"
1379
1379
  labels.append(label)
1380
1380
 
1381
1381
  # Ensure the target column is numeric
@@ -1412,7 +1412,7 @@ def append_ranged_date_classification_column(df: pd.DataFrame, date_ranges: list
1412
1412
  for i in range(len(date_list) - 1):
1413
1413
  start_date = date_list[i].strftime('%Y-%m-%d')
1414
1414
  end_date = date_list[i + 1].strftime('%Y-%m-%d')
1415
- label = f"{start_date} to {end_date}"
1415
+ label = f"{start_date} - {end_date}"
1416
1416
  labels.append(label)
1417
1417
 
1418
1418
  df[new_col_name] = pd.cut(
@@ -2085,7 +2085,7 @@ def sync_dataframe_to_sqlite_database(
2085
2085
  conn.execute(f"ALTER TABLE {new_table_name} RENAME TO {tablename}")
2086
2086
 
2087
2087
 
2088
- def load_fresh_data_or_pull_from_cache(fetch_func: Callable[[], pd.DataFrame], cache_dir: str, file_prefix: str, cache_cutoff_hours: int) -> pd.DataFrame:
2088
+ def load_fresh_data_or_pull_from_cache(fetch_func: Callable[[], pd.DataFrame], cache_dir: str, file_prefix: str, cache_cutoff_hours: int, dtype: dict = None) -> pd.DataFrame:
2089
2089
  """
2090
2090
  Retrieve data from a cache if a recent cache file exists, or fetch fresh data, save it to the cache, and return it.
2091
2091
 
@@ -2103,16 +2103,18 @@ def load_fresh_data_or_pull_from_cache(fetch_func: Callable[[], pd.DataFrame], c
2103
2103
  - cache_cutoff_hours (int):
2104
2104
  The maximum age of a cache file (in hours) to be considered valid.
2105
2105
  If no file is fresh enough, fresh data will be fetched.
2106
+ - dtype (dict, optional):
2107
+ A dictionary specifying the data types for columns when reading the CSV cache file.
2108
+ Passed to pd.read_csv() to handle mixed-type columns explicitly. Defaults to None.
2106
2109
 
2107
2110
  Returns:
2108
2111
  - pd.DataFrame:
2109
2112
  The pandas DataFrame containing either cached or freshly fetched data.
2110
2113
  """
2111
-
2112
2114
  # Ensure the directory exists
2113
2115
  os.makedirs(cache_dir, exist_ok=True)
2114
2116
 
2115
- # Generate the current timestamp in the required format
2117
+ # Generate the current timestamp
2116
2118
  now: datetime = datetime.now()
2117
2119
 
2118
2120
  # Initialize cache file details
@@ -2133,7 +2135,7 @@ def load_fresh_data_or_pull_from_cache(fetch_func: Callable[[], pd.DataFrame], c
2133
2135
 
2134
2136
  # If a valid cache exists and is within the cutoff time, read from it
2135
2137
  if latest_cache_time and now - latest_cache_time < timedelta(hours=cache_cutoff_hours):
2136
- df: pd.DataFrame = pd.read_csv(os.path.join(cache_dir, latest_cache_filename))
2138
+ df: pd.DataFrame = pd.read_csv(os.path.join(cache_dir, latest_cache_filename), dtype=dtype)
2137
2139
  else:
2138
2140
  # Fetch new data via the provided function
2139
2141
  df = fetch_func()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rgwfuncs
3
- Version: 0.0.92
3
+ Version: 0.0.94
4
4
  Summary: A functional programming paradigm for mathematical modelling and data science
5
5
  Home-page: https://github.com/ryangerardwilson/rgwfunc
6
6
  Author: Ryan Gerard Wilson
@@ -1734,32 +1734,39 @@ Processes and saves a DataFrame to an SQLite database, adding a timestamp column
1734
1734
  --------------------------------------------------------------------------------
1735
1735
 
1736
1736
  ### 46. `load_fresh_data_or_pull_from_cache`
1737
- Retrieves data from a cache if a recent cache file exists, or fetches fresh data, saves it to the cache, and returns it. If the cache is too old or doesn't exist, it uses a fetching function to get new data, which it caches and returns.
1738
1737
 
1739
- Parameters:
1740
- - `fetch_func` (typing.Callable[[], pd.DataFrame]): A callable function that fetches fresh data and returns it as a pandas DataFrame.
1741
- - `cache_dir` (str): The directory where cache files are stored.
1742
- - `file_prefix` (str): The prefix used for cache filenames to identify relevant cache files.
1743
- - `cache_cutoff_hours` (int): The age in hours beyond which a cache file is considered obsolete.
1738
+ Retrieves data from a cache if a recent cache file exists, or fetches fresh data, saves it to the cache, and returns it. If the cache is too old or doesn’t exist, it uses a fetching function to get new data, which it caches and returns. An optional `dtype` parameter allows specifying column data types when reading from the cache, preventing issues with mixed-type columns.
1744
1739
 
1745
- Returns:
1746
- - `pd.DataFrame`: The DataFrame containing cached or freshly fetched data.
1740
+ #### Parameters:
1741
+ - **`fetch_func` (typing.Callable[[], pd.DataFrame])**: A callable function that fetches fresh data and returns it as a pandas DataFrame.
1742
+ - **`cache_dir` (str)**: The directory where cache files are stored.
1743
+ - **`file_prefix` (str)**: The prefix used for cache filenames to identify relevant cache files.
1744
+ - **`cache_cutoff_hours` (int)**: The age in hours beyond which a cache file is considered obsolete.
1745
+ - **`dtype` (dict, optional)**: A dictionary specifying the data types for columns when reading the CSV cache file. Passed to `pd.read_csv()` to handle mixed-type columns explicitly. Defaults to `None`, in which case pandas infers the types.
1747
1746
 
1748
- Example:
1747
+ #### Returns:
1748
+ - **`pd.DataFrame`**: The DataFrame containing cached or freshly fetched data.
1749
+
1750
+ #### Example:
1749
1751
 
1750
1752
  from rgwfuncs import load_fresh_data_or_pull_from_cache
1751
1753
  import pandas as pd
1752
1754
 
1753
1755
  def fetch_data():
1754
1756
  # This is your data-fetching logic. Replace with real fetching code.
1755
- return pd.DataFrame({'Column1': [1, 2, 3], 'Column2': [4, 5, 6]})
1757
+ return pd.DataFrame({'Column1': [1, 2, 3], 'Column2': ['4', '5', '6']})
1756
1758
 
1757
1759
  cache_dir = 'cache_directory'
1758
1760
  file_prefix = 'cached_data'
1759
1761
  cache_cutoff_hours = 24
1760
1762
 
1763
+ # Without dtype (pandas infers types)
1761
1764
  df = load_fresh_data_or_pull_from_cache(fetch_data, cache_dir, file_prefix, cache_cutoff_hours)
1762
1765
 
1766
+ # With dtype to handle mixed types
1767
+ df = load_fresh_data_or_pull_from_cache(fetch_data, cache_dir, file_prefix, cache_cutoff_hours, dtype={'Column2': str})
1768
+ print(df)
1769
+
1763
1770
  --------------------------------------------------------------------------------
1764
1771
 
1765
1772
  ## Additional Info
@@ -1,12 +1,12 @@
1
1
  rgwfuncs/__init__.py,sha256=LSn54Tlyskcb6Wab_wUpPLB6UGMe5LdrB3GU88mDEbU,1712
2
2
  rgwfuncs/algebra_lib.py,sha256=rKFITfpWfgdBswnbMUuS41XgndEt-jUVz2ObO_ik7eM,42234
3
- rgwfuncs/df_lib.py,sha256=uhP5qv1PTBNTuZSzUe_-Qwwtm20rPU8JpEQa8OEetHk,75555
3
+ rgwfuncs/df_lib.py,sha256=SUEjUc8kCELtbQE2luMsBGh18aTWS97Wb5s3RdMcmHc,75750
4
4
  rgwfuncs/docs_lib.py,sha256=i63NzX-V8cGhikYdtkRGAEe2VcuwpXxDUyTRa9xI7l8,1972
5
5
  rgwfuncs/interactive_shell_lib.py,sha256=YN0ZnM5twIsOeDKuOQ9ZGURCvvBX0RZjM4a1vO1C3E8,4281
6
6
  rgwfuncs/str_lib.py,sha256=hE0VfP6rhQpczsKyCZvH3G1aMRwngKnkW3NTYCEc0Po,3208
7
- rgwfuncs-0.0.92.dist-info/licenses/LICENSE,sha256=jLvt20gcUZYB8UOvyBvyKQ1qhYYhD__qP7ZDx2lPFkU,1062
8
- rgwfuncs-0.0.92.dist-info/METADATA,sha256=Vx7bicfYGVHY2ER5s4gpjDdNsVYsfQx6_2kbLGS6EVU,61443
9
- rgwfuncs-0.0.92.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
10
- rgwfuncs-0.0.92.dist-info/entry_points.txt,sha256=j-c5IOPIQ0252EaOV6j6STio56sbXl2C4ym_fQ0lXx0,43
11
- rgwfuncs-0.0.92.dist-info/top_level.txt,sha256=aGuVIzWsKiV1f2gCb6mynx0zx5ma0B1EwPGFKVEMTi4,9
12
- rgwfuncs-0.0.92.dist-info/RECORD,,
7
+ rgwfuncs-0.0.94.dist-info/licenses/LICENSE,sha256=jLvt20gcUZYB8UOvyBvyKQ1qhYYhD__qP7ZDx2lPFkU,1062
8
+ rgwfuncs-0.0.94.dist-info/METADATA,sha256=K0ehKuNHmsn7IbtRWM8o7_323F8RUnYNtry2QHosFWo,62066
9
+ rgwfuncs-0.0.94.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
10
+ rgwfuncs-0.0.94.dist-info/entry_points.txt,sha256=j-c5IOPIQ0252EaOV6j6STio56sbXl2C4ym_fQ0lXx0,43
11
+ rgwfuncs-0.0.94.dist-info/top_level.txt,sha256=aGuVIzWsKiV1f2gCb6mynx0zx5ma0B1EwPGFKVEMTi4,9
12
+ rgwfuncs-0.0.94.dist-info/RECORD,,