opsci-toolbox 0.0.16__py3-none-any.whl → 0.0.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1502,6 +1502,28 @@ def custom_ordering(df : pd.DataFrame, col_to_order : str, custom_order : list)
1502
1502
  df[col_to_order] = pd.Categorical(df[col_to_order], categories=custom_order, ordered=True).to_numpy()
1503
1503
  return df
1504
1504
 
1505
+ # def calcul_total_et_pourcentage(df : pd.DataFrame, col_gb : list, metrics : dict) -> pd.DataFrame:
1506
+ # """
1507
+ # Calculates the total and percentage values for the given metrics based on a grouping column.
1508
+ # Args:
1509
+ # df (DataFrame): The input DataFrame.
1510
+ # col_gb (list): Names of the columns to group by.
1511
+ # metrics (dict): A dictionary of metrics to calculate.
1512
+ # Returns:
1513
+ # DataFrame: The modified DataFrame with total and percentage values added.
1514
+
1515
+ # """
1516
+ # percentage_agregations = {f'per_{key}': lambda x: x[key] / x[f"total_{key}"] for key in list(metrics.keys())}
1517
+
1518
+ # df = (df.join(df.groupby(col_gb)
1519
+ # .agg(metrics)
1520
+ # .add_prefix("total_"), on=col_gb
1521
+ # )
1522
+ # .assign(**percentage_agregations).fillna(0)
1523
+ # )
1524
+
1525
+ # return df
1526
+
1505
1527
  def calcul_total_et_pourcentage(df : pd.DataFrame, col_gb : list, metrics : dict) -> pd.DataFrame:
1506
1528
  """
1507
1529
  Calculates the total and percentage values for the given metrics based on a grouping column.
@@ -1513,14 +1535,15 @@ def calcul_total_et_pourcentage(df : pd.DataFrame, col_gb : list, metrics : dict
1513
1535
  DataFrame: The modified DataFrame with total and percentage values added.
1514
1536
 
1515
1537
  """
1516
- percentage_agregations = {f'per_{key}': lambda x: x[key] / x[f"total_{key}"] for key in list(metrics.keys())}
1538
+ # percentage_agregations = {f'per_{key}': lambda x: x[key] / x[f"total_{key}"] for key in list(metrics.keys())}
1517
1539
 
1518
1540
  df = (df.join(df.groupby(col_gb)
1519
1541
  .agg(metrics)
1520
1542
  .add_prefix("total_"), on=col_gb
1521
1543
  )
1522
- .assign(**percentage_agregations).fillna(0)
1523
1544
  )
1524
-
1525
- return df
1545
+ for key in list(metrics.keys()):
1546
+ df['per_' + key] = df[key] / df['total_' + key]
1547
+ df['per_' + key] = df['per_' + key].fillna(0)
1526
1548
 
1549
+ return df
@@ -2007,6 +2007,115 @@ def horizontal_stacked_bars(df: pd.DataFrame,
2007
2007
 
2008
2008
  return fig
2009
2009
 
2010
+ def bar_stacked(df: pd.DataFrame,
2011
+ col_x: str,
2012
+ col_y: str,
2013
+ col_cat: str,
2014
+ col_color: str,
2015
+ **kwargs) -> go.Figure:
2016
+ """
2017
+ Create horizontal stacked bar plots.
2018
+
2019
+ Args:
2020
+ df (pd.DataFrame): DataFrame containing data for the bar plots.
2021
+ col_x (str): Name of the column containing x-axis values.
2022
+ col_y (str): Name of the column containing y-axis values.
2023
+ col_percentage (str): Name of the column containing percentage values.
2024
+ col_cat (str): Name of the column containing categories.
2025
+ col_color (str): Name of the column containing colors.
2026
+ **kwargs: Additional keyword arguments to update default plotting parameters.
2027
+
2028
+ Returns:
2029
+ go.Figure: Plotly Figure object representing the horizontal stacked bar plots.
2030
+ """
2031
+ params = general_kwargs()
2032
+ params.update(kwargs)
2033
+
2034
+ categories = df[col_cat].unique()
2035
+
2036
+ col_hover = params["col_hover"]
2037
+
2038
+ fig = go.Figure()
2039
+
2040
+ for cat in categories:
2041
+ current_df = df[df[col_cat] == cat]
2042
+ hovertemplate= "<b>Catégorie</b> : "+str(cat)+"<br><b>"+str(col_x)+"</b> : "+current_df[col_x].astype(str)+ str(col_y) + "</b> : "+current_df[col_y].astype(str)
2043
+
2044
+ for c in col_hover:
2045
+ hovertemplate += (
2046
+ "<br><b>"
2047
+ + str(c)
2048
+ + "</b>:"
2049
+ + current_df[c].astype(str).apply(wrap_text)
2050
+ )
2051
+
2052
+ fig.add_trace(
2053
+ go.Bar(
2054
+ x=current_df[col_x],
2055
+ y=current_df[col_y],
2056
+ orientation=params['orientation'],
2057
+ text = current_df[col_x],
2058
+ textposition=params["textposition"],
2059
+ name=cat,
2060
+ marker=dict(color=current_df[col_color]),
2061
+ hovertemplate=hovertemplate+'<extra></extra>',
2062
+ textangle=params["xaxis_tickangle"],
2063
+ )
2064
+ )
2065
+
2066
+ fig.update_layout(
2067
+ barmode='stack',
2068
+ title_text=params["title_text"],
2069
+ showlegend=params['showlegend'],
2070
+ width = params["width"],
2071
+ height= params["height"],
2072
+ font_family=params["font_family"],
2073
+ font_size=params["font_size"],
2074
+ template=params["template"],
2075
+ plot_bgcolor=params["plot_bgcolor"], # background color (plot)
2076
+ paper_bgcolor=params["paper_bgcolor"],
2077
+ uniformtext_minsize=params["uniformtext_minsize"],
2078
+ uniformtext_mode=params["uniformtext_mode"],
2079
+
2080
+ )
2081
+
2082
+ fig.update_yaxes(
2083
+ # title=params["yaxis_title"],
2084
+ title_font_size=params["yaxis_title_font_size"],
2085
+ tickangle=params["yaxis_tickangle"],
2086
+ tickfont_size=params["yaxis_tickfont_size"],
2087
+ range=params["yaxis_range"],
2088
+ showgrid=params["yaxis_showgrid"],
2089
+ showline=params["yaxis_showline"],
2090
+ zeroline=params["yaxis_zeroline"],
2091
+ gridwidth=params["yaxis_gridwidth"],
2092
+ gridcolor=params["yaxis_gridcolor"],
2093
+ linewidth=params["yaxis_linewidth"],
2094
+ linecolor=params["yaxis_linecolor"],
2095
+ mirror=params["yaxis_mirror"],
2096
+ )
2097
+
2098
+ fig.update_xaxes(
2099
+ # title=params["xaxis_title"],
2100
+ title_font_size=params["xaxis_title_font_size"],
2101
+ tickangle=params["xaxis_tickangle"],
2102
+ tickfont_size=params["xaxis_tickfont_size"],
2103
+ # range=params["xaxis_range"],
2104
+ showgrid=params["xaxis_showgrid"],
2105
+ showline=params["xaxis_showline"],
2106
+ zeroline=params["xaxis_zeroline"],
2107
+ gridwidth=params["xaxis_gridwidth"],
2108
+ gridcolor=params["xaxis_gridcolor"],
2109
+ linewidth=params["xaxis_linewidth"],
2110
+ linecolor=params["xaxis_linecolor"],
2111
+ mirror=params["xaxis_mirror"]
2112
+ )
2113
+ fig.update_xaxes(title_text=params["xaxis_title"])
2114
+ fig.update_yaxes(title_text=params["yaxis_title"])
2115
+ fig.update_yaxes(showticklabels = False)
2116
+
2117
+ return fig
2118
+
2010
2119
  def bar_trend_per_cat(df: pd.DataFrame,
2011
2120
  col_x: str,
2012
2121
  col_cat: str,
@@ -3597,13 +3706,11 @@ def density_map(df_posts: pd.DataFrame,
3597
3706
  show_topics: bool = True,
3598
3707
  show_halo: bool = False,
3599
3708
  show_histogram: bool = True,
3600
-
3601
3709
  colorscale: str = "Portland",
3602
3710
  marker_color: str = "#ff7f0e",
3603
3711
  arrow_color: str = "#ff7f0e",
3604
3712
  width: int = 1000,
3605
3713
  height: int = 1000,
3606
-
3607
3714
  label_size_ratio: int = 100,
3608
3715
  n_words: int = 3,
3609
3716
  title_text: str = "Clustering",
@@ -3625,7 +3732,7 @@ def density_map(df_posts: pd.DataFrame,
3625
3732
  col_engagement (str): Column name corresponding to a metric.
3626
3733
  col_text (str): Column name corresponding to a text separated by |.
3627
3734
  col_text_dots (str): Column name corresponding to the text for dots.
3628
- colorscale (str, optional): Possible values are 'https://plotly.com/python/builtin-colorscales/'. Defaults to "Portland".
3735
+ colorscale (str, optional): Possible values are ``https://plotly.com/python/builtin-colorscales/``. Defaults to "Portland".
3629
3736
  marker_color (str, optional): Dots color value. Defaults to "#ff7f0e".
3630
3737
  arrow_color (str, optional): Arrow pointing to topic centroid color value. Defaults to "#ff7f0e".
3631
3738
  width (int, optional): Width of the plot. Defaults to 1000.
@@ -2074,9 +2074,9 @@ def encode_chunked_files(chunk_files_paths: list,
2074
2074
  Encode text from files and save the results in another pickle file.
2075
2075
 
2076
2076
  Parameters:
2077
- chunk_files_paths (List[str]): List of file paths containing documents.
2077
+ chunk_files_paths (list): List of file paths containing documents.
2078
2078
  HF_encoder (Encoder): Encoder object for text vectorization.
2079
- cols (List[str]): Columns to keep in the resulting DataFrame.
2079
+ cols (list): Columns to keep in the resulting DataFrame.
2080
2080
  col_text (str): Column containing text data in the DataFrame.
2081
2081
  path_embedded_chunks (str): Path to save the embedded chunks.
2082
2082
  reencode (bool, optional): Whether to re-encode files even if they already exist. Defaults to False.
@@ -2118,12 +2118,10 @@ def encode_labels(data_to_encode: np.ndarray) -> tuple:
2118
2118
  Encodes a list of labels using a LabelEncoder.
2119
2119
 
2120
2120
  Args:
2121
- - data_to_encode (List[Union[str, int]]): The list of labels to encode. Labels can be of any hashable type,
2122
- but strings or integers are typical.
2121
+ data_to_encode (List[Union[str, int]]): The list of labels to encode. Labels can be of any hashable type, but strings or integers are typical.
2123
2122
 
2124
2123
  Returns:
2125
- - Tuple[LabelEncoder, np.ndarray]: A tuple containing the fitted LabelEncoder instance and a numpy array
2126
- of encoded labels.
2124
+ Tuple[LabelEncoder, np.ndarray]: A tuple containing the fitted LabelEncoder instance and a numpy array of encoded labels.
2127
2125
  """
2128
2126
  label_encoder = LabelEncoder()
2129
2127
  label_encoder.fit(data_to_encode)
@@ -2150,12 +2148,10 @@ def one_hot_encode(data_to_encode:np.ndarray) -> tuple:
2150
2148
  One-hot encodes a list of categorical values using OneHotEncoder.
2151
2149
 
2152
2150
  Args:
2153
- - data_to_encode (List[Union[str, int]]): The list of categorical values to encode. The values can be of
2154
- any hashable type, typically strings or integers.
2151
+ - data_to_encode (List[Union[str, int]]): The list of categorical values to encode. The values can be of any hashable type, typically strings or integers.
2155
2152
 
2156
2153
  Returns:
2157
- - Tuple[OneHotEncoder, np.ndarray]: A tuple containing the fitted OneHotEncoder instance and a numpy array
2158
- of one-hot encoded values.
2154
+ - Tuple[OneHotEncoder, np.ndarray]: A tuple containing the fitted OneHotEncoder instance and a numpy array of one-hot encoded values.
2159
2155
  """
2160
2156
  one_hot_encoder = OneHotEncoder(sparse=False)
2161
2157
  data_to_encode_reshaped = np.array(data_to_encode).reshape(-1, 1) # Reshape for OneHotEncoder
@@ -258,7 +258,7 @@ def process_UMAP(embedded_chunks_paths: list, path_reduced_embeddings_id: str, r
258
258
  new_file_paths=[]
259
259
  for file_path in tqdm(embedded_chunks_paths, total=len(embedded_chunks_paths), desc="UMAP transform from files"):
260
260
 
261
- filename = os.path.splitext(os.path.basename(file_path))[0][:-9]
261
+ filename = os.path.splitext(os.path.basename(file_path))[0]
262
262
  new_filename = filename+"_reduce_embeddings.parquet"
263
263
  new_file_path = os.path.join(path_reduced_embeddings_id, new_filename)
264
264
 
@@ -309,7 +309,7 @@ def process_HDBSCAN(clusterer,
309
309
  new_file_paths=[]
310
310
  for file_path in tqdm(reduced_embeddings_paths, total=len(reduced_embeddings_paths), desc="HDBSCAN transform from files"):
311
311
 
312
- filename = os.path.splitext(os.path.basename(file_path))[0][:-18]
312
+ filename = os.path.splitext(os.path.basename(file_path))[0]
313
313
  new_filename = filename+ "_predictions.parquet"
314
314
  new_file_path = os.path.join(path_predictions_dataset_id, new_filename)
315
315
  if not os.path.exists(new_file_path) or reencode:
@@ -566,7 +566,7 @@ def cudf_write_parquet(df: cudf.DataFrame, path: str, filename: str) -> str:
566
566
  df.to_parquet(file_path)
567
567
  return file_path
568
568
 
569
- def cudf_read_parquet(path: str) -> cudf.DataFrame:
569
+ def cudf_read_parquet(path: str, cols : list = None) -> cudf.DataFrame:
570
570
  """
571
571
  Read a Parquet file into a cuDF DataFrame.
572
572
 
@@ -576,7 +576,10 @@ def cudf_read_parquet(path: str) -> cudf.DataFrame:
576
576
  Returns:
577
577
  cudf.DataFrame: The read cuDF DataFrame.
578
578
  """
579
- df = cudf.read_parquet(path)
579
+ if cols :
580
+ df = cudf.read_parquet(path, columns=cols)
581
+ else :
582
+ df = cudf.read_parquet(path)
580
583
  return df
581
584
 
582
585
  def convert_df_to_cudf(df: pd.DataFrame) -> cudf.DataFrame:
@@ -44,7 +44,7 @@ def create_subgraph_min_metric(G: nx.Graph, metric: str = "degree", min_value: f
44
44
 
45
45
  subgraph = G.subgraph(nodes_with_min_metric).copy()
46
46
  return subgraph
47
-
47
+
48
48
  def group_nodes_by_values(dictionnary : dict) -> dict:
49
49
  """
50
50
  Group nodes by their values from a dictionary.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: opsci-toolbox
3
- Version: 0.0.16
3
+ Version: 0.0.17
4
4
  Summary: a complete toolbox
5
5
  Home-page: UNKNOWN
6
6
  Author: Erwan Le Nagard
@@ -6,21 +6,21 @@ opsci_toolbox/apis/telegram.py,sha256=JjmAk6tKvpnFIYpZDKthxS_mgqhWQpDPUOvyC7SiWP
6
6
  opsci_toolbox/apis/webscraping.py,sha256=fo6H2OaH0m_LHJB9IyN-q0Vkk8L9OvHxNn4O_A6a6yc,21572
7
7
  opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
8
8
  opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
- opsci_toolbox/helpers/common.py,sha256=zmi-FbN39Rci_hGEKj2bmkcucrVwnHhMgKU6AAIap3Q,53327
9
+ opsci_toolbox/helpers/common.py,sha256=gM0QzLsdjMQTTT522CqzpFO86YWaxPaK48EXemjw9nI,54298
10
10
  opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
11
- opsci_toolbox/helpers/dataviz.py,sha256=U2Kj-xoF1wHvYXUKxLsrSvKnhky9PrPUy61s1WEKp44,208743
11
+ opsci_toolbox/helpers/dataviz.py,sha256=viIrTrnxFzCRLY5sJDEz3jJtsB-gZTZb2uLoq0yvTlU,212762
12
12
  opsci_toolbox/helpers/dates.py,sha256=Pq-SKP2n1z0_jzU8NxGSv8CHLH_MOKjP_rNYeny0Tb8,4752
13
13
  opsci_toolbox/helpers/gliner.py,sha256=qLkpuoCDezQyYmg_TE3XYETSpobHods6WBjCLo0Gjqw,3579
14
- opsci_toolbox/helpers/nlp.py,sha256=4edA5JZ4vzpU4U9w-INNspW2oTQ-yYpm5rFXExKB4YI,108324
15
- opsci_toolbox/helpers/nlp_cuml.py,sha256=KfgC0hMqLCKoOME2DOu3Wje4ormV19fEB8Fyq8G7D-E,30901
16
- opsci_toolbox/helpers/sna.py,sha256=3qx1WBQwLKpZNGR0bLSMB2-LBRx-vtNHp8puzoj-84A,33730
14
+ opsci_toolbox/helpers/nlp.py,sha256=MC2ibMi0j9BCysloEPXpvpvRlzlMvRn8krOAcFF-4VU,108286
15
+ opsci_toolbox/helpers/nlp_cuml.py,sha256=sLvaDfVL0aoGi3mNXUkW47tWVrrYK5wxbf8QPgljQNA,30991
16
+ opsci_toolbox/helpers/sna.py,sha256=yzBTQXYXow_lKGhlSMz8hYl2JcSlle95YEDht9v-_fY,33734
17
17
  opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
18
18
  opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
19
19
  opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
21
21
  opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
22
- opsci_toolbox-0.0.16.dist-info/METADATA,sha256=-SCFUBnwnWlUrOGgQwxib8ZfCjWxXm3iVVwnfErQ9Fk,1727
23
- opsci_toolbox-0.0.16.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
24
- opsci_toolbox-0.0.16.dist-info/dependency_links.txt,sha256=bEiJsgyh9M0F_pGpJBwUYDefiTNq9F6QEGfQS5RH1Os,39
25
- opsci_toolbox-0.0.16.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
26
- opsci_toolbox-0.0.16.dist-info/RECORD,,
22
+ opsci_toolbox-0.0.17.dist-info/METADATA,sha256=RvPoecg-cflzmh0PcNj9dDZm_RLp5KsK2n-hRTXdEUs,1727
23
+ opsci_toolbox-0.0.17.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
24
+ opsci_toolbox-0.0.17.dist-info/dependency_links.txt,sha256=bEiJsgyh9M0F_pGpJBwUYDefiTNq9F6QEGfQS5RH1Os,39
25
+ opsci_toolbox-0.0.17.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
26
+ opsci_toolbox-0.0.17.dist-info/RECORD,,