pandas-plots 0.11.22__tar.gz → 0.11.23__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pandas-plots
3
- Version: 0.11.22
3
+ Version: 0.11.23
4
4
  Summary: A collection of helpers for table handling and visualization
5
5
  Home-page: https://github.com/smeisegeier/pandas-plots
6
6
  Author: smeisegeier
@@ -100,14 +100,15 @@ tbl.show_num_df(
100
100
  - `show_venn3()` displays a venn diagram for 3 sets
101
101
 
102
102
  - `hlp` contains a variety of helper functions
103
- - `df_to_series()` converts a dataframe to a series
103
+ - `to_series()` converts a dataframe to a series (`🚨 breaking change`)
104
104
  - `mean_confidence_interval()` calculates mean and confidence interval for a series
105
105
  - `wrap_text()` formats strings or lists to a given width to fit nicely on the screen
106
106
  - `replace_delimiter_outside_quotes()` when manual import of csv files is needed: replaces delimiters only outside of quotes
107
107
  - `create_barcode_from_url()` creates a barcode from a given URL
108
- - `add_datetime_col()` adds a datetime columns to a dataframe
108
+ - `add_datetime_columns()` adds datetime columns to a dataframe (chainable)
109
109
  - `show_package_version` prints version of a list of packages
110
110
  - `get_os` helps to identify and ensure operating system at runtime
111
+ - `🆕 add_bitmask_label()` adds a column to the data that resolves a bitmask column into human-readable labels
111
112
 
112
113
  - `pii` has routines for handling of personally identifiable information
113
114
  - `remove_pii()` logs and deletes pii from a series
@@ -66,14 +66,15 @@ tbl.show_num_df(
66
66
  - `show_venn3()` displays a venn diagram for 3 sets
67
67
 
68
68
  - `hlp` contains a variety of helper functions
69
- - `df_to_series()` converts a dataframe to a series
69
+ - `to_series()` converts a dataframe to a series (`🚨 breaking change`)
70
70
  - `mean_confidence_interval()` calculates mean and confidence interval for a series
71
71
  - `wrap_text()` formats strings or lists to a given width to fit nicely on the screen
72
72
  - `replace_delimiter_outside_quotes()` when manual import of csv files is needed: replaces delimiters only outside of quotes
73
73
  - `create_barcode_from_url()` creates a barcode from a given URL
74
- - `add_datetime_col()` adds a datetime columns to a dataframe
74
+ - `add_datetime_columns()` adds datetime columns to a dataframe (chainable)
75
75
  - `show_package_version` prints version of a list of packages
76
76
  - `get_os` helps to identify and ensure operating system at runtime
77
+ - `🆕 add_bitmask_label()` adds a column to the data that resolves a bitmask column into human-readable labels
77
78
 
78
79
  - `pii` has routines for handling of personally identifiable information
79
80
  - `remove_pii()` logs and deletes pii from a series
@@ -1,6 +1,6 @@
1
1
  [metadata]
2
2
  name = pandas-plots
3
- version = 0.11.22
3
+ version = 0.11.23
4
4
  author = smeisegeier
5
5
  author_email = dexterDSDo@googlemail.com
6
6
  description = A collection of helpers for table handling and visualization
@@ -1,19 +1,19 @@
1
- import pandas as pd
2
- import numpy as np
3
- import scipy.stats
4
1
  import importlib.metadata as md
5
- from platform import python_version
6
- from typing import Literal, List
7
-
8
- from enum import Enum, auto
9
- import platform
10
2
  import os
11
-
3
+ import platform
4
+ import re
5
+ from enum import Enum, auto
12
6
  from io import BytesIO
7
+ from platform import python_version
8
+ from typing import List, Literal
9
+
10
+ import duckdb as ddb
11
+ import numpy as np
12
+ import pandas as pd
13
+ import requests
14
+ import scipy.stats
13
15
  from matplotlib import pyplot as plt
14
16
  from PIL import Image
15
- import requests
16
- import re
17
17
 
18
18
  # from devtools import debug
19
19
 
@@ -32,7 +32,7 @@ def mean_confidence_interval(df, confidence=0.95):
32
32
  Returns:
33
33
  tuple: A tuple containing the mean, interval, lower bound, and upper bound.
34
34
  """
35
- df = df_to_series(df)
35
+ df = to_series(df)
36
36
  if df is None:
37
37
  return None
38
38
  a = 1.0 * np.array(df)
@@ -53,7 +53,7 @@ def mean_confidence_interval(df, confidence=0.95):
53
53
  # return dist.mean - h, dist.mean + h
54
54
 
55
55
 
56
- def df_to_series(df) -> pd.Series | None:
56
+ def to_series(df) -> pd.Series | None:
57
57
  """
58
58
  Converts a pandas DataFrame to a pandas Series.
59
59
 
@@ -103,6 +103,10 @@ def df_to_series(df) -> pd.Series | None:
103
103
  s.name = _data_col.name
104
104
  return s
105
105
 
106
+ # * extend objects to enable chaining
107
+ pd.DataFrame.to_series = to_series
108
+ pd.Series.to_series = to_series
109
+
106
110
 
107
111
  def replace_delimiter_outside_quotes(
108
112
  input: str, delimiter_old: str = ",", delimiter_new: str = ";", quotechar: str = '"'
@@ -234,6 +238,26 @@ def create_barcode_from_url(
234
238
 
235
239
 
236
240
  def add_datetime_columns(df: pd.DataFrame, date_column: str = None) -> pd.DataFrame:
241
+ """
242
+ Add datetime columns to a given DataFrame.
243
+
244
+ Adds the following columns to the given DataFrame:
245
+ - YYYY: Year of date_column
246
+ - MM: Month of date_column
247
+ - Q: Quarter of date_column
248
+ - YYYY-MM: Year-month of date_column
249
+ - YYYYQ: Year-quarter of date_column
250
+ - YYYY-WW: Year-week of date_column
251
+ - DDD: Day of the week of date_column
252
+
253
+ Args:
254
+ df (pd.DataFrame): The DataFrame to add datetime columns to.
255
+ date_column (str, optional): The column to base the added datetime columns off of. Defaults to None.
256
+
257
+ Returns:
258
+ pd.DataFrame: The DataFrame with the added datetime columns.
259
+ This command can be chained.
260
+ """
237
261
  df_ = df.copy()
238
262
  if not date_column:
239
263
  date_column = [
@@ -269,6 +293,9 @@ def add_datetime_columns(df: pd.DataFrame, date_column: str = None) -> pd.DataFr
269
293
 
270
294
  return df_
271
295
 
296
+ # * extend objects to enable chaining
297
+ pd.DataFrame.add_datetime_columns = add_datetime_columns
298
+
272
299
 
273
300
  def show_package_version(
274
301
  packages: list[str] = None,
@@ -289,7 +316,7 @@ def show_package_version(
289
316
  # ! avoid empty list in signature, it will NOT be empty in runtime
290
317
  if packages is None:
291
318
  packages = []
292
-
319
+
293
320
  if not isinstance(packages, List):
294
321
  print(f"❌ A list of str must be provided")
295
322
  return
@@ -315,6 +342,7 @@ def show_package_version(
315
342
  print(out)
316
343
  return
317
344
 
345
+
318
346
  class OperatingSystem(Enum):
319
347
  WINDOWS = auto()
320
348
  LINUX = auto()
@@ -333,7 +361,7 @@ def get_os(is_os: OperatingSystem = None, verbose: bool = False) -> bool | str:
333
361
  - OperatingSystem.MAC
334
362
 
335
363
  Returns:
336
- bool: True if the desired operating system matches the current operating system, False otherwise.
364
+ bool: True if the desired operating system matches the current operating system, False otherwise.
337
365
  str: Returns the current operating system (platform.system()) if is_os is None.
338
366
  """
339
367
  if verbose:
@@ -352,3 +380,90 @@ def get_os(is_os: OperatingSystem = None, verbose: bool = False) -> bool | str:
352
380
  return True
353
381
  else:
354
382
  return False
383
+
384
+
385
+ def add_bitmask_label(
386
+ data: pd.DataFrame | pd.Series | ddb.DuckDBPyRelation,
387
+ bitmask_col: str,
388
+ labels: list[str],
389
+ separator: str = "|",
390
+ zero_code: str = "-",
391
+ keep_col: bool = True,
392
+ con: ddb.DuckDBPyConnection = None,
393
+ ) -> pd.DataFrame | ddb.DuckDBPyRelation:
394
+ """
395
+ adds a column to the data (DataFrame, Series, or DuckDB Relation) that resolves a bitmask column into human-readable labels.
396
+ - bitmask_col must have been generated before. its value must be constructed as a bitmask, e.g:
397
+ - with the labels ["r","g","b"], bit 0 stands for red, bit 1 for green and bit 2 for blue; the binary value 110 therefore means green and blue are set
398
+ - its value is 6, which will be resolved into "g|b" if the list ["r","g","b"] is given
399
+
400
+ if the bitmask value is 0, it will be replaced with the zero_code.
401
+ the method can be chained in pandas as well as in duckdb: df.add_bitmask_label(...)
402
+
403
+ Parameters:
404
+ - data (pd.DataFrame | pd.Series | duckdb.DuckDBPyRelation): Input data.
405
+ - bitmask_col (str): The name of the column containing bitmask values (ignored if input is Series).
406
+ - labels (list[str]): Labels corresponding to the bits, in the correct order.
407
+ - separator (str): Separator for combining labels. Default is "|".
408
+ - zero_code (str): Value to return for bitmask value 0. Default is "-".
409
+ - keep_col (bool): If True, retains the bitmask column. If False, removes it. Default is True.
410
+ - con (duckdb.DuckDBPyConnection): DuckDB connection object. Required if data is a DuckDB Relation. Whenever a connection is given, the result is returned as a DuckDB Relation.
411
+
412
+ Returns:
413
+ - pd.DataFrame | duckdb.DuckDBPyRelation: The modified data with the new column added.
414
+ """
415
+ # * check possible input formats
416
+ if isinstance(data, ddb.DuckDBPyRelation):
417
+ if con is None:
418
+ raise ValueError(
419
+ "A DuckDB connection must be provided when the input is a DuckDB Relation."
420
+ )
421
+ data = data.df() # * Convert DuckDB Relation to DataFrame
422
+
423
+ if isinstance(data, pd.Series):
424
+ bitmask_col = data.name if data.name else "bitmask"
425
+ data = data.to_frame(name=bitmask_col)
426
+
427
+ if not isinstance(data, pd.DataFrame):
428
+ raise ValueError(
429
+ "Input must be a pandas DataFrame, Series, or DuckDB Relation."
430
+ )
431
+
432
+ # * get max allowed value by bitshift, e.g. for 4 labels it is 2^4 - 1 = 15
433
+ max_allowable_value = (1 << len(labels)) - 1
434
+ # * compare against max in col
435
+ max_value_in_column = data[bitmask_col].max()
436
+ if max_value_in_column > max_allowable_value:
437
+ raise ValueError(
438
+ f"The maximum value in column '{bitmask_col}' ({max_value_in_column}) exceeds "
439
+ f"the maximum allowable value for {len(labels)} labels ({max_allowable_value}). "
440
+ f"Ensure the number of labels matches the possible bitmask range."
441
+ )
442
+
443
+ # ? Core logic
444
+ # * exit if 0
445
+ def decode_bitmask(value):
446
+ if value == 0:
447
+ return zero_code
448
+ # * iterate over each value as bitfield, on binary 1 fetch assigned label from [labels]
449
+ return separator.join(
450
+ [label for i, label in enumerate(labels) if value & (1 << i)]
451
+ )
452
+
453
+ label_col = f"{bitmask_col}_label"
454
+ data[label_col] = data[bitmask_col].apply(decode_bitmask)
455
+
456
+ # * drop value col if not to be kept
457
+ if not keep_col:
458
+ data = data.drop(columns=[bitmask_col])
459
+
460
+ # * Convert to a DuckDB Relation whenever a connection was provided (always the case for Relation input)
461
+ if isinstance(data, pd.DataFrame) and con is not None:
462
+ return con.from_df(data)
463
+
464
+ return data
465
+
466
+
467
+ # * extend objects to enable chaining
468
+ pd.DataFrame.add_bitmask_label = add_bitmask_label
469
+ ddb.DuckDBPyRelation.add_bitmask_label = add_bitmask_label
@@ -610,12 +610,12 @@ def plot_histogram(
610
610
  Returns:
611
611
  None
612
612
  """
613
-
613
+
614
614
  # * convert to df if series
615
615
  if isinstance(df_ser, pd.Series):
616
616
  df = df_ser.to_frame()
617
617
  else:
618
- df=df_ser
618
+ df = df_ser
619
619
 
620
620
  col_not_num = df.select_dtypes(exclude="number").columns
621
621
  if any(col_not_num):
@@ -628,7 +628,7 @@ def plot_histogram(
628
628
  df = df.applymap(lambda x: round(x, precision))
629
629
 
630
630
  # ! plot
631
- _caption=_set_caption(caption)
631
+ _caption = _set_caption(caption)
632
632
  fig = px.histogram(
633
633
  data_frame=df,
634
634
  histnorm=histnorm,
@@ -653,7 +653,7 @@ def plot_histogram(
653
653
  "size": 24,
654
654
  },
655
655
  },
656
- showlegend=False if df.shape[1]==1 else True,
656
+ showlegend=False if df.shape[1] == 1 else True,
657
657
  )
658
658
 
659
659
  fig.show(renderer)
@@ -702,7 +702,7 @@ def plot_joint(
702
702
  # * set theme and palette
703
703
  sb.set_theme(style="darkgrid", palette="tab10")
704
704
  if os.getenv("THEME") == "dark":
705
- _style = "dark_background"
705
+ _style = "dark_background"
706
706
  _cmap = "rocket"
707
707
  else:
708
708
  _style = "bmh"
@@ -720,19 +720,21 @@ def plot_joint(
720
720
  "dropna": dropna,
721
721
  # "title": f"{caption}[{ser.name}], n = {len(ser):_}" if not title else title,
722
722
  }
723
- dict_hex={"cmap": _cmap}
724
- dict_kde={"fill": True, "cmap": _cmap}
725
-
726
- if kind=="hex":
723
+ dict_hex = {"cmap": _cmap}
724
+ dict_kde = {"fill": True, "cmap": _cmap}
725
+
726
+ if kind == "hex":
727
727
  fig = sb.jointplot(**dict_base, **dict_hex)
728
- elif kind=="kde":
728
+ elif kind == "kde":
729
729
  fig = sb.jointplot(**dict_base, **dict_kde)
730
730
  else:
731
731
  fig = sb.jointplot(**dict_base)
732
-
732
+
733
733
  # * emojis dont work in good ol seaborn
734
- _caption="" if not caption else f"#{caption}, "
735
- fig.figure.suptitle(title or f"{_caption}[{df.columns[0]}] vs [{df.columns[1]}], n = {len(df):_}")
734
+ _caption = "" if not caption else f"#{caption}, "
735
+ fig.figure.suptitle(
736
+ title or f"{_caption}[{df.columns[0]}] vs [{df.columns[1]}], n = {len(df):_}"
737
+ )
736
738
  # * leave some room for the title
737
739
  fig.figure.tight_layout()
738
740
  fig.figure.subplots_adjust(top=0.90)
@@ -783,10 +785,10 @@ def plot_box(
783
785
  Returns:
784
786
  None
785
787
  """
786
- ser = df_to_series(ser)
788
+ ser = to_series(ser)
787
789
  if ser is None:
788
790
  return
789
-
791
+
790
792
  # * drop na to keep scipy sane
791
793
  n_ = len(ser)
792
794
  ser.dropna(inplace=True)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pandas-plots
3
- Version: 0.11.22
3
+ Version: 0.11.23
4
4
  Summary: A collection of helpers for table handling and visualization
5
5
  Home-page: https://github.com/smeisegeier/pandas-plots
6
6
  Author: smeisegeier
@@ -100,14 +100,15 @@ tbl.show_num_df(
100
100
  - `show_venn3()` displays a venn diagram for 3 sets
101
101
 
102
102
  - `hlp` contains a variety of helper functions
103
- - `df_to_series()` converts a dataframe to a series
103
+ - `to_series()` converts a dataframe to a series (`🚨 breaking change`)
104
104
  - `mean_confidence_interval()` calculates mean and confidence interval for a series
105
105
  - `wrap_text()` formats strings or lists to a given width to fit nicely on the screen
106
106
  - `replace_delimiter_outside_quotes()` when manual import of csv files is needed: replaces delimiters only outside of quotes
107
107
  - `create_barcode_from_url()` creates a barcode from a given URL
108
- - `add_datetime_col()` adds a datetime columns to a dataframe
108
+ - `add_datetime_columns()` adds datetime columns to a dataframe (chainable)
109
109
  - `show_package_version` prints version of a list of packages
110
110
  - `get_os` helps to identify and ensure operating system at runtime
111
+ - `🆕 add_bitmask_label()` adds a column to the data that resolves a bitmask column into human-readable labels
111
112
 
112
113
  - `pii` has routines for handling of personally identifiable information
113
114
  - `remove_pii()` logs and deletes pii from a series
File without changes