pandas-plots 0.11.22__tar.gz → 0.11.23__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pandas_plots-0.11.22/src/pandas_plots.egg-info → pandas_plots-0.11.23}/PKG-INFO +4 -3
- {pandas_plots-0.11.22 → pandas_plots-0.11.23}/README.md +3 -2
- {pandas_plots-0.11.22 → pandas_plots-0.11.23}/setup.cfg +1 -1
- {pandas_plots-0.11.22 → pandas_plots-0.11.23}/src/pandas_plots/hlp.py +130 -15
- {pandas_plots-0.11.22 → pandas_plots-0.11.23}/src/pandas_plots/pls.py +17 -15
- {pandas_plots-0.11.22 → pandas_plots-0.11.23/src/pandas_plots.egg-info}/PKG-INFO +4 -3
- {pandas_plots-0.11.22 → pandas_plots-0.11.23}/LICENSE +0 -0
- {pandas_plots-0.11.22 → pandas_plots-0.11.23}/pyproject.toml +0 -0
- {pandas_plots-0.11.22 → pandas_plots-0.11.23}/src/pandas_plots/pii.py +0 -0
- {pandas_plots-0.11.22 → pandas_plots-0.11.23}/src/pandas_plots/tbl.py +0 -0
- {pandas_plots-0.11.22 → pandas_plots-0.11.23}/src/pandas_plots/ven.py +0 -0
- {pandas_plots-0.11.22 → pandas_plots-0.11.23}/src/pandas_plots.egg-info/SOURCES.txt +0 -0
- {pandas_plots-0.11.22 → pandas_plots-0.11.23}/src/pandas_plots.egg-info/dependency_links.txt +0 -0
- {pandas_plots-0.11.22 → pandas_plots-0.11.23}/src/pandas_plots.egg-info/requires.txt +0 -0
- {pandas_plots-0.11.22 → pandas_plots-0.11.23}/src/pandas_plots.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: pandas-plots
|
3
|
-
Version: 0.11.22
|
3
|
+
Version: 0.11.23
|
4
4
|
Summary: A collection of helper for table handling and vizualization
|
5
5
|
Home-page: https://github.com/smeisegeier/pandas-plots
|
6
6
|
Author: smeisegeier
|
@@ -100,14 +100,15 @@ tbl.show_num_df(
|
|
100
100
|
- `show_venn3()` displays a venn diagram for 3 sets
|
101
101
|
|
102
102
|
- `hlp` contains some (variety) helper functions
|
103
|
-
- `df_to_series()` converts a dataframe to a series
|
103
|
+
- `to_series()` converts a dataframe to a series (`🚨 breaking change`)
|
104
104
|
- `mean_confidence_interval()` calculates mean and confidence interval for a series
|
105
105
|
- `wrap_text()` formats strings or lists to a given width to fit nicely on the screen
|
106
106
|
- `replace_delimiter_outside_quotes()` when manual import of csv files is needed: replaces delimiters only outside of quotes
|
107
107
|
- `create_barcode_from_url()` creates a barcode from a given URL
|
108
|
-
- `add_datetime_col()` adds a datetime columns to a dataframe
|
108
|
+
- `add_datetime_col()` adds a datetime columns to a dataframe (chainable)
|
109
109
|
- `show_package_version` prints version of a list of packages
|
110
110
|
- `get_os` helps to identify and ensure operating system at runtime
|
111
|
+
- `🆕 add_bitmask_label()` adds a column to the data that resolves a bitmask column into human-readable labels
|
111
112
|
|
112
113
|
- `pii` has routines for handling of personally identifiable information
|
113
114
|
- `remove_pii()` logs and deletes pii from a series
|
@@ -66,14 +66,15 @@ tbl.show_num_df(
|
|
66
66
|
- `show_venn3()` displays a venn diagram for 3 sets
|
67
67
|
|
68
68
|
- `hlp` contains some (variety) helper functions
|
69
|
-
- `df_to_series()` converts a dataframe to a series
|
69
|
+
- `to_series()` converts a dataframe to a series (`🚨 breaking change`)
|
70
70
|
- `mean_confidence_interval()` calculates mean and confidence interval for a series
|
71
71
|
- `wrap_text()` formats strings or lists to a given width to fit nicely on the screen
|
72
72
|
- `replace_delimiter_outside_quotes()` when manual import of csv files is needed: replaces delimiters only outside of quotes
|
73
73
|
- `create_barcode_from_url()` creates a barcode from a given URL
|
74
|
-
- `add_datetime_col()` adds a datetime columns to a dataframe
|
74
|
+
- `add_datetime_col()` adds a datetime columns to a dataframe (chainable)
|
75
75
|
- `show_package_version` prints version of a list of packages
|
76
76
|
- `get_os` helps to identify and ensure operating system at runtime
|
77
|
+
- `🆕 add_bitmask_label()` adds a column to the data that resolves a bitmask column into human-readable labels
|
77
78
|
|
78
79
|
- `pii` has routines for handling of personally identifiable information
|
79
80
|
- `remove_pii()` logs and deletes pii from a series
|
@@ -1,19 +1,19 @@
|
|
1
|
-
import pandas as pd
|
2
|
-
import numpy as np
|
3
|
-
import scipy.stats
|
4
1
|
import importlib.metadata as md
|
5
|
-
from platform import python_version
|
6
|
-
from typing import Literal, List
|
7
|
-
|
8
|
-
from enum import Enum, auto
|
9
|
-
import platform
|
10
2
|
import os
|
11
|
-
|
3
|
+
import platform
|
4
|
+
import re
|
5
|
+
from enum import Enum, auto
|
12
6
|
from io import BytesIO
|
7
|
+
from platform import python_version
|
8
|
+
from typing import List, Literal
|
9
|
+
|
10
|
+
import duckdb as ddb
|
11
|
+
import numpy as np
|
12
|
+
import pandas as pd
|
13
|
+
import requests
|
14
|
+
import scipy.stats
|
13
15
|
from matplotlib import pyplot as plt
|
14
16
|
from PIL import Image
|
15
|
-
import requests
|
16
|
-
import re
|
17
17
|
|
18
18
|
# from devtools import debug
|
19
19
|
|
@@ -32,7 +32,7 @@ def mean_confidence_interval(df, confidence=0.95):
|
|
32
32
|
Returns:
|
33
33
|
tuple: A tuple containing the mean, interval, lower bound, and upper bound.
|
34
34
|
"""
|
35
|
-
df = df_to_series(df)
|
35
|
+
df = to_series(df)
|
36
36
|
if df is None:
|
37
37
|
return None
|
38
38
|
a = 1.0 * np.array(df)
|
@@ -53,7 +53,7 @@ def mean_confidence_interval(df, confidence=0.95):
|
|
53
53
|
# return dist.mean - h, dist.mean + h
|
54
54
|
|
55
55
|
|
56
|
-
def df_to_series(df) -> pd.Series | None:
|
56
|
+
def to_series(df) -> pd.Series | None:
|
57
57
|
"""
|
58
58
|
Converts a pandas DataFrame to a pandas Series.
|
59
59
|
|
@@ -103,6 +103,10 @@ def df_to_series(df) -> pd.Series | None:
|
|
103
103
|
s.name = _data_col.name
|
104
104
|
return s
|
105
105
|
|
106
|
+
# * extend objects to enable chaining
|
107
|
+
pd.DataFrame.to_series = to_series
|
108
|
+
pd.Series.to_series = to_series
|
109
|
+
|
106
110
|
|
107
111
|
def replace_delimiter_outside_quotes(
|
108
112
|
input: str, delimiter_old: str = ",", delimiter_new: str = ";", quotechar: str = '"'
|
@@ -234,6 +238,26 @@ def create_barcode_from_url(
|
|
234
238
|
|
235
239
|
|
236
240
|
def add_datetime_columns(df: pd.DataFrame, date_column: str = None) -> pd.DataFrame:
|
241
|
+
"""
|
242
|
+
Add datetime columns to a given DataFrame.
|
243
|
+
|
244
|
+
Adds the following columns to the given DataFrame:
|
245
|
+
- YYYY: Year of date_column
|
246
|
+
- MM: Month of date_column
|
247
|
+
- Q: Quarter of date_column
|
248
|
+
- YYYY-MM: Year-month of date_column
|
249
|
+
- YYYYQ: Year-quarter of date_column
|
250
|
+
- YYYY-WW: Year-week of date_column
|
251
|
+
- DDD: Day of the week of date_column
|
252
|
+
|
253
|
+
Args:
|
254
|
+
df (pd.DataFrame): The DataFrame to add datetime columns to.
|
255
|
+
date_column (str, optional): The column to base the added datetime columns off of. Defaults to None.
|
256
|
+
|
257
|
+
Returns:
|
258
|
+
pd.DataFrame: The DataFrame with the added datetime columns.
|
259
|
+
This command can be chained.
|
260
|
+
"""
|
237
261
|
df_ = df.copy()
|
238
262
|
if not date_column:
|
239
263
|
date_column = [
|
@@ -269,6 +293,9 @@ def add_datetime_columns(df: pd.DataFrame, date_column: str = None) -> pd.DataFr
|
|
269
293
|
|
270
294
|
return df_
|
271
295
|
|
296
|
+
# * extend objects to enable chaining
|
297
|
+
pd.DataFrame.add_datetime_columns = add_datetime_columns
|
298
|
+
|
272
299
|
|
273
300
|
def show_package_version(
|
274
301
|
packages: list[str] = None,
|
@@ -289,7 +316,7 @@ def show_package_version(
|
|
289
316
|
# ! avoid empty list in signature, it will NOT be empty in runtime
|
290
317
|
if packages is None:
|
291
318
|
packages = []
|
292
|
-
|
319
|
+
|
293
320
|
if not isinstance(packages, List):
|
294
321
|
print(f"❌ A list of str must be provided")
|
295
322
|
return
|
@@ -315,6 +342,7 @@ def show_package_version(
|
|
315
342
|
print(out)
|
316
343
|
return
|
317
344
|
|
345
|
+
|
318
346
|
class OperatingSystem(Enum):
|
319
347
|
WINDOWS = auto()
|
320
348
|
LINUX = auto()
|
@@ -333,7 +361,7 @@ def get_os(is_os: OperatingSystem = None, verbose: bool = False) -> bool | str:
|
|
333
361
|
- OperatingSystem.MAC
|
334
362
|
|
335
363
|
Returns:
|
336
|
-
bool: True if the desired operating system matches the current operating system, False otherwise.
|
364
|
+
bool: True if the desired operating system matches the current operating system, False otherwise.
|
337
365
|
str: Returns the current operating system (platform.system()) if is_os is None.
|
338
366
|
"""
|
339
367
|
if verbose:
|
@@ -352,3 +380,90 @@ def get_os(is_os: OperatingSystem = None, verbose: bool = False) -> bool | str:
|
|
352
380
|
return True
|
353
381
|
else:
|
354
382
|
return False
|
383
|
+
|
384
|
+
|
385
|
+
def add_bitmask_label(
|
386
|
+
data: pd.DataFrame | pd.Series | ddb.DuckDBPyRelation,
|
387
|
+
bitmask_col: str,
|
388
|
+
labels: list[str],
|
389
|
+
separator: str = "|",
|
390
|
+
zero_code: str = "-",
|
391
|
+
keep_col: bool = True,
|
392
|
+
con: ddb.DuckDBPyConnection = None,
|
393
|
+
) -> pd.DataFrame | ddb.DuckDBPyRelation:
|
394
|
+
"""
|
395
|
+
adds a column to the data (DataFrame, Series, or DuckDB Relation) that resolves a bitmask column into human-readable labels.
|
396
|
+
- bitmask_col must have been generated before. its value must be constructed as a bitmask, e.g:
|
397
|
+
- a red, green, blue combination is rendered into binary 110, which means it has green and blue
|
398
|
+
- its value is 6, which will resolved into "g|b" if the list ["r","g","b"] is given
|
399
|
+
|
400
|
+
if the bitmask value is 0, it will be replaced with the zero_code.
|
401
|
+
the method can be chained in pandas as well as in duckdb: df.add_bitmask_label(...)
|
402
|
+
|
403
|
+
Parameters:
|
404
|
+
- data (pd.DataFrame | pd.Series | duckdb.DuckDBPyRelation): Input data.
|
405
|
+
- bitmask_col (str): The name of the column containing bitmask values (ignored if input is Series).
|
406
|
+
- labels (list[str]): Labels corresponding to the bits, in the correct order.
|
407
|
+
- separator (str): Separator for combining labels. Default is "|".
|
408
|
+
- zero_code (str): Value to return for bitmask value 0. Default is "-".
|
409
|
+
- keep_col (bool): If True, retains the bitmask column. If False, removes it. Default is True.
|
410
|
+
- con (duckdb.Connection): DuckDB connection object. Required if data is a DuckDB Relation.
|
411
|
+
|
412
|
+
Returns:
|
413
|
+
- pd.DataFrame | duckdb.DuckDBPyRelation: The modified data with the new column added.
|
414
|
+
"""
|
415
|
+
# * check possible input formats
|
416
|
+
if isinstance(data, ddb.DuckDBPyRelation):
|
417
|
+
if con is None:
|
418
|
+
raise ValueError(
|
419
|
+
"A DuckDB connection must be provided when the input is a DuckDB Relation."
|
420
|
+
)
|
421
|
+
data = data.df() # * Convert DuckDB Relation to DataFrame
|
422
|
+
|
423
|
+
if isinstance(data, pd.Series):
|
424
|
+
bitmask_col = data.name if data.name else "bitmask"
|
425
|
+
data = data.to_frame(name=bitmask_col)
|
426
|
+
|
427
|
+
if not isinstance(data, pd.DataFrame):
|
428
|
+
raise ValueError(
|
429
|
+
"Input must be a pandas DataFrame, Series, or DuckDB Relation."
|
430
|
+
)
|
431
|
+
|
432
|
+
# * get max allowed value by bitshift, eg for 4 labels its 2^4 -1 = 15
|
433
|
+
max_allowable_value = (1 << len(labels)) - 1
|
434
|
+
# * compare against max in col
|
435
|
+
max_value_in_column = data[bitmask_col].max()
|
436
|
+
if max_value_in_column > max_allowable_value:
|
437
|
+
raise ValueError(
|
438
|
+
f"The maximum value in column '{bitmask_col}' ({max_value_in_column}) exceeds "
|
439
|
+
f"the maximum allowable value for {len(labels)} labels ({max_allowable_value}). "
|
440
|
+
f"Ensure the number of labels matches the possible bitmask range."
|
441
|
+
)
|
442
|
+
|
443
|
+
# ? Core logic
|
444
|
+
# * exit if 0
|
445
|
+
def decode_bitmask(value):
|
446
|
+
if value == 0:
|
447
|
+
return zero_code
|
448
|
+
# * iterate over each value as bitfield, on binary 1 fetch assigned label from [labels]
|
449
|
+
return separator.join(
|
450
|
+
[label for i, label in enumerate(labels) if value & (1 << i)]
|
451
|
+
)
|
452
|
+
|
453
|
+
label_col = f"{bitmask_col}_label"
|
454
|
+
data[label_col] = data[bitmask_col].apply(decode_bitmask)
|
455
|
+
|
456
|
+
# * drop value col if not to be kept
|
457
|
+
if not keep_col:
|
458
|
+
data = data.drop(columns=[bitmask_col])
|
459
|
+
|
460
|
+
# * Convert back to DuckDB Relation if original input was a Relation
|
461
|
+
if isinstance(data, pd.DataFrame) and con is not None:
|
462
|
+
return con.from_df(data)
|
463
|
+
|
464
|
+
return data
|
465
|
+
|
466
|
+
|
467
|
+
# * extend objects to enable chaining
|
468
|
+
pd.DataFrame.add_bitmask_label = add_bitmask_label
|
469
|
+
ddb.DuckDBPyRelation.add_bitmask_label = add_bitmask_label
|
@@ -610,12 +610,12 @@ def plot_histogram(
|
|
610
610
|
Returns:
|
611
611
|
None
|
612
612
|
"""
|
613
|
-
|
613
|
+
|
614
614
|
# * convert to df if series
|
615
615
|
if isinstance(df_ser, pd.Series):
|
616
616
|
df = df_ser.to_frame()
|
617
617
|
else:
|
618
|
-
df=df_ser
|
618
|
+
df = df_ser
|
619
619
|
|
620
620
|
col_not_num = df.select_dtypes(exclude="number").columns
|
621
621
|
if any(col_not_num):
|
@@ -628,7 +628,7 @@ def plot_histogram(
|
|
628
628
|
df = df.applymap(lambda x: round(x, precision))
|
629
629
|
|
630
630
|
# ! plot
|
631
|
-
_caption=_set_caption(caption)
|
631
|
+
_caption = _set_caption(caption)
|
632
632
|
fig = px.histogram(
|
633
633
|
data_frame=df,
|
634
634
|
histnorm=histnorm,
|
@@ -653,7 +653,7 @@ def plot_histogram(
|
|
653
653
|
"size": 24,
|
654
654
|
},
|
655
655
|
},
|
656
|
-
showlegend=False if df.shape[1]==1 else True,
|
656
|
+
showlegend=False if df.shape[1] == 1 else True,
|
657
657
|
)
|
658
658
|
|
659
659
|
fig.show(renderer)
|
@@ -702,7 +702,7 @@ def plot_joint(
|
|
702
702
|
# * set theme and palette
|
703
703
|
sb.set_theme(style="darkgrid", palette="tab10")
|
704
704
|
if os.getenv("THEME") == "dark":
|
705
|
-
_style = "dark_background"
|
705
|
+
_style = "dark_background"
|
706
706
|
_cmap = "rocket"
|
707
707
|
else:
|
708
708
|
_style = "bmh"
|
@@ -720,19 +720,21 @@ def plot_joint(
|
|
720
720
|
"dropna": dropna,
|
721
721
|
# "title": f"{caption}[{ser.name}], n = {len(ser):_}" if not title else title,
|
722
722
|
}
|
723
|
-
dict_hex={"cmap": _cmap}
|
724
|
-
dict_kde={"fill": True, "cmap": _cmap}
|
725
|
-
|
726
|
-
if kind=="hex":
|
723
|
+
dict_hex = {"cmap": _cmap}
|
724
|
+
dict_kde = {"fill": True, "cmap": _cmap}
|
725
|
+
|
726
|
+
if kind == "hex":
|
727
727
|
fig = sb.jointplot(**dict_base, **dict_hex)
|
728
|
-
elif kind=="kde":
|
728
|
+
elif kind == "kde":
|
729
729
|
fig = sb.jointplot(**dict_base, **dict_kde)
|
730
730
|
else:
|
731
731
|
fig = sb.jointplot(**dict_base)
|
732
|
-
|
732
|
+
|
733
733
|
# * emojis dont work in good ol seaborn
|
734
|
-
_caption="" if not caption else f"#{caption}, "
|
735
|
-
fig.figure.suptitle(title or f"{_caption}[{df.columns[0]}] vs [{df.columns[1]}], n = {len(df):_}")
|
734
|
+
_caption = "" if not caption else f"#{caption}, "
|
735
|
+
fig.figure.suptitle(
|
736
|
+
title or f"{_caption}[{df.columns[0]}] vs [{df.columns[1]}], n = {len(df):_}"
|
737
|
+
)
|
736
738
|
# * leave some room for the title
|
737
739
|
fig.figure.tight_layout()
|
738
740
|
fig.figure.subplots_adjust(top=0.90)
|
@@ -783,10 +785,10 @@ def plot_box(
|
|
783
785
|
Returns:
|
784
786
|
None
|
785
787
|
"""
|
786
|
-
ser = df_to_series(ser)
|
788
|
+
ser = to_series(ser)
|
787
789
|
if ser is None:
|
788
790
|
return
|
789
|
-
|
791
|
+
|
790
792
|
# * drop na to keep scipy sane
|
791
793
|
n_ = len(ser)
|
792
794
|
ser.dropna(inplace=True)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: pandas-plots
|
3
|
-
Version: 0.11.22
|
3
|
+
Version: 0.11.23
|
4
4
|
Summary: A collection of helper for table handling and vizualization
|
5
5
|
Home-page: https://github.com/smeisegeier/pandas-plots
|
6
6
|
Author: smeisegeier
|
@@ -100,14 +100,15 @@ tbl.show_num_df(
|
|
100
100
|
- `show_venn3()` displays a venn diagram for 3 sets
|
101
101
|
|
102
102
|
- `hlp` contains some (variety) helper functions
|
103
|
-
- `df_to_series()` converts a dataframe to a series
|
103
|
+
- `to_series()` converts a dataframe to a series (`🚨 breaking change`)
|
104
104
|
- `mean_confidence_interval()` calculates mean and confidence interval for a series
|
105
105
|
- `wrap_text()` formats strings or lists to a given width to fit nicely on the screen
|
106
106
|
- `replace_delimiter_outside_quotes()` when manual import of csv files is needed: replaces delimiters only outside of quotes
|
107
107
|
- `create_barcode_from_url()` creates a barcode from a given URL
|
108
|
-
- `add_datetime_col()` adds a datetime columns to a dataframe
|
108
|
+
- `add_datetime_col()` adds a datetime columns to a dataframe (chainable)
|
109
109
|
- `show_package_version` prints version of a list of packages
|
110
110
|
- `get_os` helps to identify and ensure operating system at runtime
|
111
|
+
- `🆕 add_bitmask_label()` adds a column to the data that resolves a bitmask column into human-readable labels
|
111
112
|
|
112
113
|
- `pii` has routines for handling of personally identifiable information
|
113
114
|
- `remove_pii()` logs and deletes pii from a series
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{pandas_plots-0.11.22 → pandas_plots-0.11.23}/src/pandas_plots.egg-info/dependency_links.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|