opsci-toolbox 0.0.16__py3-none-any.whl → 0.0.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/helpers/common.py +27 -4
- opsci_toolbox/helpers/dataviz.py +110 -3
- opsci_toolbox/helpers/nlp.py +6 -10
- opsci_toolbox/helpers/nlp_cuml.py +7 -4
- opsci_toolbox/helpers/sna.py +1 -1
- {opsci_toolbox-0.0.16.dist-info → opsci_toolbox-0.0.17.dist-info}/METADATA +1 -1
- {opsci_toolbox-0.0.16.dist-info → opsci_toolbox-0.0.17.dist-info}/RECORD +10 -10
- {opsci_toolbox-0.0.16.dist-info → opsci_toolbox-0.0.17.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.16.dist-info → opsci_toolbox-0.0.17.dist-info}/dependency_links.txt +0 -0
- {opsci_toolbox-0.0.16.dist-info → opsci_toolbox-0.0.17.dist-info}/top_level.txt +0 -0
opsci_toolbox/helpers/common.py
CHANGED
@@ -1502,6 +1502,28 @@ def custom_ordering(df : pd.DataFrame, col_to_order : str, custom_order : list)
|
|
1502
1502
|
df[col_to_order] = pd.Categorical(df[col_to_order], categories=custom_order, ordered=True).to_numpy()
|
1503
1503
|
return df
|
1504
1504
|
|
1505
|
+
# def calcul_total_et_pourcentage(df : pd.DataFrame, col_gb : list, metrics : dict) -> pd.DataFrame:
|
1506
|
+
# """
|
1507
|
+
# Calculates the total and percentage values for the given metrics based on a grouping column.
|
1508
|
+
# Args:
|
1509
|
+
# df (DataFrame): The input DataFrame.
|
1510
|
+
# col_gb (list): Names of the columns to group by.
|
1511
|
+
# metrics (dict): A dictionary of metrics to calculate.
|
1512
|
+
# Returns:
|
1513
|
+
# DataFrame: The modified DataFrame with total and percentage values added.
|
1514
|
+
|
1515
|
+
# """
|
1516
|
+
# percentage_agregations = {f'per_{key}': lambda x: x[key] / x[f"total_{key}"] for key in list(metrics.keys())}
|
1517
|
+
|
1518
|
+
# df = (df.join(df.groupby(col_gb)
|
1519
|
+
# .agg(metrics)
|
1520
|
+
# .add_prefix("total_"), on=col_gb
|
1521
|
+
# )
|
1522
|
+
# .assign(**percentage_agregations).fillna(0)
|
1523
|
+
# )
|
1524
|
+
|
1525
|
+
# return df
|
1526
|
+
|
1505
1527
|
def calcul_total_et_pourcentage(df : pd.DataFrame, col_gb : list, metrics : dict) -> pd.DataFrame:
|
1506
1528
|
"""
|
1507
1529
|
Calculates the total and percentage values for the given metrics based on a grouping column.
|
@@ -1513,14 +1535,15 @@ def calcul_total_et_pourcentage(df : pd.DataFrame, col_gb : list, metrics : dict
|
|
1513
1535
|
DataFrame: The modified DataFrame with total and percentage values added.
|
1514
1536
|
|
1515
1537
|
"""
|
1516
|
-
percentage_agregations = {f'per_{key}': lambda x: x[key] / x[f"total_{key}"] for key in list(metrics.keys())}
|
1538
|
+
# percentage_agregations = {f'per_{key}': lambda x: x[key] / x[f"total_{key}"] for key in list(metrics.keys())}
|
1517
1539
|
|
1518
1540
|
df = (df.join(df.groupby(col_gb)
|
1519
1541
|
.agg(metrics)
|
1520
1542
|
.add_prefix("total_"), on=col_gb
|
1521
1543
|
)
|
1522
|
-
.assign(**percentage_agregations).fillna(0)
|
1523
1544
|
)
|
1524
|
-
|
1525
|
-
|
1545
|
+
for key in list(metrics.keys()):
|
1546
|
+
df['per_' + key] = df[key] / df['total_' + key]
|
1547
|
+
df['per_' + key] = df['per_' + key].fillna(0)
|
1526
1548
|
|
1549
|
+
return df
|
opsci_toolbox/helpers/dataviz.py
CHANGED
@@ -2007,6 +2007,115 @@ def horizontal_stacked_bars(df: pd.DataFrame,
|
|
2007
2007
|
|
2008
2008
|
return fig
|
2009
2009
|
|
2010
|
+
def bar_stacked(df: pd.DataFrame,
                col_x: str,
                col_y: str,
                col_cat: str,
                col_color: str,
                **kwargs) -> go.Figure:
    """
    Create stacked bar plots with one bar trace per category.

    Args:
        df (pd.DataFrame): DataFrame containing data for the bar plots.
        col_x (str): Name of the column containing x-axis values. These values are also displayed as text on the bars.
        col_y (str): Name of the column containing y-axis values.
        col_cat (str): Name of the column containing categories. One trace is added per unique value.
        col_color (str): Name of the column containing the marker color of each bar.
        **kwargs: Additional keyword arguments overriding the defaults returned by ``general_kwargs()`` (for instance ``orientation``, ``col_hover``, ``title_text``, ``width``, ``height``).

    Returns:
        go.Figure: Plotly Figure object representing the stacked bar plots.
    """
    params = general_kwargs()
    params.update(kwargs)

    categories = df[col_cat].unique()
    col_hover = params["col_hover"]

    fig = go.Figure()

    for cat in categories:
        current_df = df[df[col_cat] == cat]

        # One hover string per row: category, then x value, then y value.
        # Each entry starts on its own line ("<br>") with a bold label.
        hovertemplate = (
            "<b>Catégorie</b> : " + str(cat)
            + "<br><b>" + str(col_x) + "</b> : " + current_df[col_x].astype(str)
            + "<br><b>" + str(col_y) + "</b> : " + current_df[col_y].astype(str)
        )

        # Append the optional extra hover columns, wrapped by wrap_text.
        for c in col_hover:
            hovertemplate += (
                "<br><b>"
                + str(c)
                + "</b>:"
                + current_df[c].astype(str).apply(wrap_text)
            )

        fig.add_trace(
            go.Bar(
                x=current_df[col_x],
                y=current_df[col_y],
                orientation=params["orientation"],
                text=current_df[col_x],
                textposition=params["textposition"],
                name=cat,
                marker=dict(color=current_df[col_color]),
                # "<extra></extra>" is appended to every per-row hover string.
                hovertemplate=hovertemplate + "<extra></extra>",
                # The bar text reuses the x-axis tick angle setting.
                textangle=params["xaxis_tickangle"],
            )
        )

    fig.update_layout(
        barmode="stack",
        title_text=params["title_text"],
        showlegend=params["showlegend"],
        width=params["width"],
        height=params["height"],
        font_family=params["font_family"],
        font_size=params["font_size"],
        template=params["template"],
        plot_bgcolor=params["plot_bgcolor"],  # background color (plot)
        paper_bgcolor=params["paper_bgcolor"],
        uniformtext_minsize=params["uniformtext_minsize"],
        uniformtext_mode=params["uniformtext_mode"],
    )

    fig.update_yaxes(
        title_text=params["yaxis_title"],
        title_font_size=params["yaxis_title_font_size"],
        tickangle=params["yaxis_tickangle"],
        tickfont_size=params["yaxis_tickfont_size"],
        range=params["yaxis_range"],
        showgrid=params["yaxis_showgrid"],
        showline=params["yaxis_showline"],
        zeroline=params["yaxis_zeroline"],
        gridwidth=params["yaxis_gridwidth"],
        gridcolor=params["yaxis_gridcolor"],
        linewidth=params["yaxis_linewidth"],
        linecolor=params["yaxis_linecolor"],
        mirror=params["yaxis_mirror"],
        # Tick labels on the y-axis are always hidden for this chart.
        showticklabels=False,
    )

    # NOTE: unlike the y-axis, no explicit range is applied to the x-axis.
    fig.update_xaxes(
        title_text=params["xaxis_title"],
        title_font_size=params["xaxis_title_font_size"],
        tickangle=params["xaxis_tickangle"],
        tickfont_size=params["xaxis_tickfont_size"],
        showgrid=params["xaxis_showgrid"],
        showline=params["xaxis_showline"],
        zeroline=params["xaxis_zeroline"],
        gridwidth=params["xaxis_gridwidth"],
        gridcolor=params["xaxis_gridcolor"],
        linewidth=params["xaxis_linewidth"],
        linecolor=params["xaxis_linecolor"],
        mirror=params["xaxis_mirror"],
    )

    return fig
|
2118
|
+
|
2010
2119
|
def bar_trend_per_cat(df: pd.DataFrame,
|
2011
2120
|
col_x: str,
|
2012
2121
|
col_cat: str,
|
@@ -3597,13 +3706,11 @@ def density_map(df_posts: pd.DataFrame,
|
|
3597
3706
|
show_topics: bool = True,
|
3598
3707
|
show_halo: bool = False,
|
3599
3708
|
show_histogram: bool = True,
|
3600
|
-
|
3601
3709
|
colorscale: str = "Portland",
|
3602
3710
|
marker_color: str = "#ff7f0e",
|
3603
3711
|
arrow_color: str = "#ff7f0e",
|
3604
3712
|
width: int = 1000,
|
3605
3713
|
height: int = 1000,
|
3606
|
-
|
3607
3714
|
label_size_ratio: int = 100,
|
3608
3715
|
n_words: int = 3,
|
3609
3716
|
title_text: str = "Clustering",
|
@@ -3625,7 +3732,7 @@ def density_map(df_posts: pd.DataFrame,
|
|
3625
3732
|
col_engagement (str): Column name corresponding to a metric.
|
3626
3733
|
col_text (str): Column name corresponding to a text separated by |.
|
3627
3734
|
col_text_dots (str): Column name corresponding to the text for dots.
|
3628
|
-
colorscale (str, optional): Possible values are
|
3735
|
+
colorscale (str, optional): Possible values are ``https://plotly.com/python/builtin-colorscales/``. Defaults to "Portland".
|
3629
3736
|
marker_color (str, optional): Dots color value. Defaults to "#ff7f0e".
|
3630
3737
|
arrow_color (str, optional): Arrow pointing to topic centroid color value. Defaults to "#ff7f0e".
|
3631
3738
|
width (int, optional): Width of the plot. Defaults to 1000.
|
opsci_toolbox/helpers/nlp.py
CHANGED
@@ -2074,9 +2074,9 @@ def encode_chunked_files(chunk_files_paths: list,
|
|
2074
2074
|
Encode text from files and save the results in another pickle file.
|
2075
2075
|
|
2076
2076
|
Parameters:
|
2077
|
-
chunk_files_paths (
|
2077
|
+
chunk_files_paths (list): List of file paths containing documents.
|
2078
2078
|
HF_encoder (Encoder): Encoder object for text vectorization.
|
2079
|
-
cols (
|
2079
|
+
cols (list): Columns to keep in the resulting DataFrame.
|
2080
2080
|
col_text (str): Column containing text data in the DataFrame.
|
2081
2081
|
path_embedded_chunks (str): Path to save the embedded chunks.
|
2082
2082
|
reencode (bool, optional): Whether to re-encode files even if they already exist. Defaults to False.
|
@@ -2118,12 +2118,10 @@ def encode_labels(data_to_encode: np.ndarray) -> tuple:
|
|
2118
2118
|
Encodes a list of labels using a LabelEncoder.
|
2119
2119
|
|
2120
2120
|
Args:
|
2121
|
-
|
2122
|
-
but strings or integers are typical.
|
2121
|
+
data_to_encode (List[Union[str, int]]): The list of labels to encode. Labels can be of any hashable type, but strings or integers are typical.
|
2123
2122
|
|
2124
2123
|
Returns:
|
2125
|
-
|
2126
|
-
of encoded labels.
|
2124
|
+
Tuple[LabelEncoder, np.ndarray]: A tuple containing the fitted LabelEncoder instance and a numpy array of encoded labels.
|
2127
2125
|
"""
|
2128
2126
|
label_encoder = LabelEncoder()
|
2129
2127
|
label_encoder.fit(data_to_encode)
|
@@ -2150,12 +2148,10 @@ def one_hot_encode(data_to_encode:np.ndarray) -> tuple:
|
|
2150
2148
|
One-hot encodes a list of categorical values using OneHotEncoder.
|
2151
2149
|
|
2152
2150
|
Args:
|
2153
|
-
- data_to_encode (List[Union[str, int]]): The list of categorical values to encode. The values can be of
|
2154
|
-
any hashable type, typically strings or integers.
|
2151
|
+
- data_to_encode (List[Union[str, int]]): The list of categorical values to encode. The values can be of any hashable type, typically strings or integers.
|
2155
2152
|
|
2156
2153
|
Returns:
|
2157
|
-
- Tuple[OneHotEncoder, np.ndarray]: A tuple containing the fitted OneHotEncoder instance and a numpy array
|
2158
|
-
of one-hot encoded values.
|
2154
|
+
- Tuple[OneHotEncoder, np.ndarray]: A tuple containing the fitted OneHotEncoder instance and a numpy array of one-hot encoded values.
|
2159
2155
|
"""
|
2160
2156
|
one_hot_encoder = OneHotEncoder(sparse=False)
|
2161
2157
|
data_to_encode_reshaped = np.array(data_to_encode).reshape(-1, 1) # Reshape for OneHotEncoder
|
@@ -258,7 +258,7 @@ def process_UMAP(embedded_chunks_paths: list, path_reduced_embeddings_id: str, r
|
|
258
258
|
new_file_paths=[]
|
259
259
|
for file_path in tqdm(embedded_chunks_paths, total=len(embedded_chunks_paths), desc="UMAP transform from files"):
|
260
260
|
|
261
|
-
filename = os.path.splitext(os.path.basename(file_path))[0]
|
261
|
+
filename = os.path.splitext(os.path.basename(file_path))[0]
|
262
262
|
new_filename = filename+"_reduce_embeddings.parquet"
|
263
263
|
new_file_path = os.path.join(path_reduced_embeddings_id, new_filename)
|
264
264
|
|
@@ -309,7 +309,7 @@ def process_HDBSCAN(clusterer,
|
|
309
309
|
new_file_paths=[]
|
310
310
|
for file_path in tqdm(reduced_embeddings_paths, total=len(reduced_embeddings_paths), desc="HDBSCAN transform from files"):
|
311
311
|
|
312
|
-
filename = os.path.splitext(os.path.basename(file_path))[0]
|
312
|
+
filename = os.path.splitext(os.path.basename(file_path))[0]
|
313
313
|
new_filename = filename+ "_predictions.parquet"
|
314
314
|
new_file_path = os.path.join(path_predictions_dataset_id, new_filename)
|
315
315
|
if not os.path.exists(new_file_path) or reencode:
|
@@ -566,7 +566,7 @@ def cudf_write_parquet(df: cudf.DataFrame, path: str, filename: str) -> str:
|
|
566
566
|
df.to_parquet(file_path)
|
567
567
|
return file_path
|
568
568
|
|
569
|
-
def cudf_read_parquet(path: str) -> cudf.DataFrame:
|
569
|
+
def cudf_read_parquet(path: str, cols : list = None) -> cudf.DataFrame:
|
570
570
|
"""
|
571
571
|
Read a Parquet file into a cuDF DataFrame.
|
572
572
|
|
@@ -576,7 +576,10 @@ def cudf_read_parquet(path: str) -> cudf.DataFrame:
|
|
576
576
|
Returns:
|
577
577
|
cudf.DataFrame: The read cuDF DataFrame.
|
578
578
|
"""
|
579
|
-
|
579
|
+
if cols :
|
580
|
+
df = cudf.read_parquet(path, columns=cols)
|
581
|
+
else :
|
582
|
+
df = cudf.read_parquet(path)
|
580
583
|
return df
|
581
584
|
|
582
585
|
def convert_df_to_cudf(df: pd.DataFrame) -> cudf.DataFrame:
|
opsci_toolbox/helpers/sna.py
CHANGED
@@ -44,7 +44,7 @@ def create_subgraph_min_metric(G: nx.Graph, metric: str = "degree", min_value: f
|
|
44
44
|
|
45
45
|
subgraph = G.subgraph(nodes_with_min_metric).copy()
|
46
46
|
return subgraph
|
47
|
-
|
47
|
+
|
48
48
|
def group_nodes_by_values(dictionnary : dict) -> dict:
|
49
49
|
"""
|
50
50
|
Group nodes by their values from a dictionary.
|
@@ -6,21 +6,21 @@ opsci_toolbox/apis/telegram.py,sha256=JjmAk6tKvpnFIYpZDKthxS_mgqhWQpDPUOvyC7SiWP
|
|
6
6
|
opsci_toolbox/apis/webscraping.py,sha256=fo6H2OaH0m_LHJB9IyN-q0Vkk8L9OvHxNn4O_A6a6yc,21572
|
7
7
|
opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
|
8
8
|
opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
-
opsci_toolbox/helpers/common.py,sha256=
|
9
|
+
opsci_toolbox/helpers/common.py,sha256=gM0QzLsdjMQTTT522CqzpFO86YWaxPaK48EXemjw9nI,54298
|
10
10
|
opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
|
11
|
-
opsci_toolbox/helpers/dataviz.py,sha256=
|
11
|
+
opsci_toolbox/helpers/dataviz.py,sha256=viIrTrnxFzCRLY5sJDEz3jJtsB-gZTZb2uLoq0yvTlU,212762
|
12
12
|
opsci_toolbox/helpers/dates.py,sha256=Pq-SKP2n1z0_jzU8NxGSv8CHLH_MOKjP_rNYeny0Tb8,4752
|
13
13
|
opsci_toolbox/helpers/gliner.py,sha256=qLkpuoCDezQyYmg_TE3XYETSpobHods6WBjCLo0Gjqw,3579
|
14
|
-
opsci_toolbox/helpers/nlp.py,sha256=
|
15
|
-
opsci_toolbox/helpers/nlp_cuml.py,sha256=
|
16
|
-
opsci_toolbox/helpers/sna.py,sha256=
|
14
|
+
opsci_toolbox/helpers/nlp.py,sha256=MC2ibMi0j9BCysloEPXpvpvRlzlMvRn8krOAcFF-4VU,108286
|
15
|
+
opsci_toolbox/helpers/nlp_cuml.py,sha256=sLvaDfVL0aoGi3mNXUkW47tWVrrYK5wxbf8QPgljQNA,30991
|
16
|
+
opsci_toolbox/helpers/sna.py,sha256=yzBTQXYXow_lKGhlSMz8hYl2JcSlle95YEDht9v-_fY,33734
|
17
17
|
opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
|
18
18
|
opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
|
19
19
|
opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
20
|
opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
|
21
21
|
opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
|
22
|
-
opsci_toolbox-0.0.
|
23
|
-
opsci_toolbox-0.0.
|
24
|
-
opsci_toolbox-0.0.
|
25
|
-
opsci_toolbox-0.0.
|
26
|
-
opsci_toolbox-0.0.
|
22
|
+
opsci_toolbox-0.0.17.dist-info/METADATA,sha256=RvPoecg-cflzmh0PcNj9dDZm_RLp5KsK2n-hRTXdEUs,1727
|
23
|
+
opsci_toolbox-0.0.17.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
24
|
+
opsci_toolbox-0.0.17.dist-info/dependency_links.txt,sha256=bEiJsgyh9M0F_pGpJBwUYDefiTNq9F6QEGfQS5RH1Os,39
|
25
|
+
opsci_toolbox-0.0.17.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
|
26
|
+
opsci_toolbox-0.0.17.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|