opsci-toolbox 0.0.5__py3-none-any.whl → 0.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,27 @@ from tqdm import tqdm
 import os
 from opsci_toolbox.helpers.common import load_pickle, create_dir, write_pickle

- def reduce_with_cuml_UMAP(embeddings, n_neighbors = 5, n_components = 3, min_dist = 0.0, metric = "cosine", spread = 1.0):
+ def reduce_with_cuml_UMAP(embeddings: np.ndarray,
+ n_neighbors: int = 5,
+ n_components: int = 3,
+ min_dist: float = 0.0,
+ metric: str = "cosine",
+ spread: float = 1.0) -> tuple:
+ """
+ Reduces the dimensionality of embeddings using UMAP with cuML library.
+
+ Parameters:
+ - embeddings (np.ndarray): The input embeddings to be reduced.
+ - n_neighbors (int, optional): The number of nearest neighbors to consider. Defaults to 5.
+ - n_components (int, optional): The number of dimensions of the embedded space. Defaults to 3.
+ - min_dist (float, optional): The minimum distance between embedded points. Defaults to 0.0.
+ - metric (str, optional): The metric to use for distance computation. Defaults to "cosine".
+ - spread (float, optional): The effective scale of embedded points. Defaults to 1.0.
+
+ Returns:
+ - reducer (UMAP): The UMAP reducer object.
+ - reduced_embeddings (np.ndarray): The reduced embeddings.
+ """
 reducer = UMAP(n_neighbors=n_neighbors,
 n_components=n_components,
 min_dist=min_dist,
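Note: the following is a minimal usage sketch added for orientation, not part of the published diff. It assumes the helper lives in opsci_toolbox/helpers/nlp_cuml.py (as the RECORD below suggests), that RAPIDS cuML is installed with GPU support, and the embeddings are made up.

import numpy as np
from opsci_toolbox.helpers.nlp_cuml import reduce_with_cuml_UMAP  # assumed module path

# hypothetical sentence embeddings: 1000 vectors of dimension 768
embeddings = np.random.rand(1000, 768).astype(np.float32)

# fit UMAP on the GPU and keep the reducer for transforming future batches
reducer, reduced_embeddings = reduce_with_cuml_UMAP(embeddings, n_neighbors=15, n_components=5)
print(reduced_embeddings.shape)  # expected: (1000, 5)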
@@ -15,61 +35,77 @@ def reduce_with_cuml_UMAP(embeddings, n_neighbors = 5, n_components = 3, min_dis
 reduced_embeddings = reducer.transform(embeddings)
 return reducer, reduced_embeddings

- def transform_with_cuml_UMAP(reducer, new_embeddings):
+ def transform_with_cuml_UMAP(reducer,
+ new_embeddings: np.ndarray) -> np.ndarray:
 """
- Transform new data points using a UMAP object
+ Transform new data points using a UMAP object.
+
+ Parameters:
+ - reducer (UMAP): The UMAP reducer object.
+ - new_embeddings (np.ndarray): The new data points to be transformed.
+
+ Returns:
+ - reduced_embeddings (np.ndarray): The transformed embeddings.
 """
 reduced_embeddings = reducer.transform(new_embeddings)
 return reduced_embeddings


- def hdbscan_cuml_clustering(embeddings, min_cluster_size=5, min_samples=None, max_cluster_size = 0, metric='euclidean', alpha=1.0, p=2, cluster_selection_epsilon=0.0, cluster_selection_method='eom',
- approx_min_span_tree=True, gen_min_span_tree = False, gen_condensed_tree = False, gen_single_linkage_tree_ = False, prediction_data=True):
-
+ def hdbscan_cuml_clustering(embeddings: np.ndarray,
+ min_cluster_size: int = 5,
+ min_samples: int = None,
+ max_cluster_size: int = 0,
+ metric: str = 'euclidean',
+ alpha: float = 1.0,
+ p: int = 2,
+ cluster_selection_epsilon: float = 0.0,
+ cluster_selection_method: str = 'eom',
+ approx_min_span_tree: bool = True,
+ gen_min_span_tree: bool = False,
+ gen_condensed_tree: bool = False,
+ gen_single_linkage_tree_: bool = False,
+ prediction_data: bool = True) -> tuple:
 """
+ Perform clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm.
+
 Parameters:
- embeddings : array-like or sparse matrix, shape (n_samples, n_features)
- The input data to be clustered.
- min_cluster_size : int, optional
- The minimum number of samples in a group for that group to be considered a cluster; groupings smaller than this size will be left as noise.
- min_samples : int or None, optional
- The number of samples in a neighborhood for a point to be considered as a core point. This includes the point itself. If ‘None’, it defaults to the min_cluster_size.
- max_cluster_size : int, optional (default=0)
- A limit to the size of clusters returned by the eom algorithm. Has no effect when using leaf clustering (where clusters are usually small regardless) and can also be overridden in rare cases by a high value for cluster_selection_epsilon.
- Note that this should not be used if we want to predict the cluster labels for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument.
- metric : str or callable, optional
- The metric to use for distance computation. Default is 'euclidean'.
- alpha : float, optional
- distance scaling parameter as used in robust single linkage.
- p : int, optional
- The Minkowski p-norm distance metric parameter. Default is None.
- cluster_selection_epsilon : float, optional
- A distance threshold. Clusters below this value will be merged. Note that this should not be used if we want to predict the cluster labels for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument.
- cluster_selection_method : {'eom', 'leaf'}, optional
- The method used to select clusters from the condensed tree. The standard approach for HDBSCAN* is to use an Excess of Mass algorithm to find the most persistent clusters. Alternatively you can instead select the clusters at the leaves of the tree – this provides the most fine grained and homogeneous clusters. Options are:
- approx_min_span_tree : bool, optional
- Whether to compute an approximation of the minimum spanning tree. Default is True.
- gen_min_span_tree : bool, optional
- Whether to populate the minimum_spanning_tree_ member for utilizing plotting tools. This requires the hdbscan CPU Python package to be installed
- gen_condensed_tree : bool, optional
- Whether to populate the condensed_tree_ member for utilizing plotting tools.
- gen_single_linkage_tree_ : bool
- Whether to populate the single_linkage_tree_ member for utilizing plotting tools.
- prediction_data : bool, optional
- Whether the data is prediction data or not. Default is True.
-
- Returns:
- clusterer : hdbscan.hdbscan_.HDBSCAN
- HDBSCAN clusterer object.
- labels : array, shape (n_samples,)
- Cluster labels for each point. Noisy samples are given the label -1.
- probabilities : array, shape (n_samples,)
- The probability of each sample being an outlier.
-
- Description:
- This function performs clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm.
- It clusters the input data based on the specified parameters and returns the clusterer object, cluster labels for each point, and the
- probability of each sample being an outlier.
+ embeddings : array-like or sparse matrix, shape (n_samples, n_features)
+ The input data to be clustered.
+ min_cluster_size : int, optional
+ The minimum number of samples in a group for that group to be considered a cluster; groupings smaller than this size will be left as noise.
+ min_samples : int or None, optional
+ The number of samples in a neighborhood for a point to be considered as a core point. This includes the point itself. If ‘None’, it defaults to the min_cluster_size.
+ max_cluster_size : int, optional (default=0)
+ A limit to the size of clusters returned by the eom algorithm. Has no effect when using leaf clustering (where clusters are usually small regardless) and can also be overridden in rare cases by a high value for cluster_selection_epsilon.
+ Note that this should not be used if we want to predict the cluster labels for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument.
+ metric : str or callable, optional
+ The metric to use for distance computation. Default is 'euclidean'.
+ alpha : float, optional
+ Distance scaling parameter as used in robust single linkage.
+ p : int, optional
+ The Minkowski p-norm distance metric parameter. Default is None.
+ cluster_selection_epsilon : float, optional
+ A distance threshold. Clusters below this value will be merged. Note that this should not be used if we want to predict the cluster labels for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument.
+ cluster_selection_method : {'eom', 'leaf'}, optional
+ The method used to select clusters from the condensed tree. The standard approach for HDBSCAN* is to use an Excess of Mass algorithm to find the most persistent clusters. Alternatively you can instead select the clusters at the leaves of the tree – this provides the most fine grained and homogeneous clusters. Options are:
+ approx_min_span_tree : bool, optional
+ Whether to compute an approximation of the minimum spanning tree. Default is True.
+ gen_min_span_tree : bool, optional
+ Whether to populate the minimum_spanning_tree_ member for utilizing plotting tools. This requires the hdbscan CPU Python package to be installed.
+ gen_condensed_tree : bool, optional
+ Whether to populate the condensed_tree_ member for utilizing plotting tools.
+ gen_single_linkage_tree_ : bool
+ Whether to populate the single_linkage_tree_ member for utilizing plotting tools.
+ prediction_data : bool, optional
+ Whether the data is prediction data or not. Default is True.
+
+ Returns:
+ clusterer : hdbscan.hdbscan_.HDBSCAN
+ HDBSCAN clusterer object.
+ labels : array, shape (n_samples,)
+ Cluster labels for each point. Noisy samples are given the label -1.
+ probabilities : array, shape (n_samples,)
+ The probability of each sample being an outlier.
 """
 clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
 min_samples=min_samples,
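A hedged sketch of how the clustering helper documented above might be called on the reduced embeddings from the previous sketch (not part of the diff; the import path is an assumption):

from opsci_toolbox.helpers.nlp_cuml import hdbscan_cuml_clustering  # assumed module path

# cluster the UMAP output from the earlier sketch
clusterer, labels, probabilities = hdbscan_cuml_clustering(
    reduced_embeddings,
    min_cluster_size=20,
    min_samples=10,
    prediction_data=True,  # keeps prediction data so approximate_predict can be used later
)
print(set(labels))  # cluster ids; -1 marks noise points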
@@ -89,17 +125,39 @@ Description:

 return clusterer, clusterer.labels_, clusterer.probabilities_

- def transform_with_cuml_HDBSCAN(clusterer, new_embeddings):
+ def transform_with_cuml_HDBSCAN(clusterer, new_embeddings: np.ndarray) -> tuple:
 """
- Transform new data points using a HDBSCAN object
+ Transform new data points using an HDBSCAN object.
+
+ Parameters:
+ clusterer : hdbscan.hdbscan_.HDBSCAN
+ The HDBSCAN clusterer object trained on the original data.
+ new_embeddings : array-like or sparse matrix, shape (n_samples, n_features)
+ The new data points to be transformed.
+
+ Returns:
+ new_data_topic : array, shape (n_samples,)
+ Predicted cluster labels for each new data point.
+ new_data_proba : array, shape (n_samples,)
+ The probability of each new data point being an outlier.
 """
 new_data_topic, new_data_proba = approximate_predict(clusterer, new_embeddings)
 return new_data_topic, new_data_proba


- def cuml_soft_clustering(clusterer):
+ def cuml_soft_clustering(clusterer) -> tuple:
 """
- HDBSCAN SOFT CLUSTERING
+ Perform soft clustering using HDBSCAN.
+
+ Parameters:
+ clusterer : hdbscan.hdbscan_.HDBSCAN
+ The HDBSCAN clusterer object trained on the original data.
+
+ Returns:
+ soft_clusters_val : list of str
+ Predicted cluster labels for each data point, represented as strings.
+ soft_clusters_proba : list of float
+ The maximum probability of each data point belonging to any cluster.
 """
 soft_clusters = all_points_membership_vectors(clusterer)
 soft_clusters_val = [str(np.argmax(x)) for x in soft_clusters]
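Continuing the sketches above, the prediction and soft-clustering helpers documented in this hunk could be used on a fresh batch as follows (not part of the diff; the import path and data are assumptions, and the reducer/clusterer come from the earlier sketches):

import numpy as np
from opsci_toolbox.helpers.nlp_cuml import (
    transform_with_cuml_UMAP,
    transform_with_cuml_HDBSCAN,
    cuml_soft_clustering,
)  # assumed module path

# project and label a hypothetical new batch with the fitted reducer and clusterer
new_embeddings = np.random.rand(200, 768).astype(np.float32)
new_reduced = transform_with_cuml_UMAP(reducer, new_embeddings)
new_topics, new_probas = transform_with_cuml_HDBSCAN(clusterer, new_reduced)

# soft cluster memberships for the original training points
soft_labels, soft_probas = cuml_soft_clustering(clusterer)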
@@ -107,17 +165,45 @@ def cuml_soft_clustering(clusterer):
 return soft_clusters_val, soft_clusters_proba


- def soft_cuml_clustering_new_data(clusterer, embeddings):
+ def soft_cuml_clustering_new_data(clusterer, embeddings: np.ndarray) -> tuple:
 """
- PREDICT NEW DATA POINTS HDBSCAN SOFT CLUSTERING
+ Predict cluster memberships for new data points using HDBSCAN soft clustering.
+
+ Parameters:
+ clusterer : hdbscan.hdbscan_.HDBSCAN
+ The HDBSCAN clusterer object trained on the original data.
+ embeddings : array-like or sparse matrix, shape (n_samples, n_features)
+ The new data points to be clustered.
+
+ Returns:
+ soft_clusters_val : list of str
+ Predicted cluster labels for each new data point, represented as strings.
+ soft_clusters_proba : list of float
+ The maximum probability of each new data point belonging to any cluster.
 """
- soft_clusters =membership_vector(clusterer, embeddings)
+ soft_clusters = membership_vector(clusterer, embeddings)
 soft_clusters_val = [str(np.argmax(x)) for x in soft_clusters]
 soft_clusters_proba = [np.max(x) for x in soft_clusters]
 return soft_clusters_val, soft_clusters_proba

- def process_UMAP(embedded_chunks_paths, path_reduced_embeddings_id, reducer, reencode = False):
+ def process_UMAP(embedded_chunks_paths: list, path_reduced_embeddings_id: str, reducer, reencode: bool = False) -> list:
+ """
+ Process embeddings using UMAP reduction.

+ Parameters:
+ embedded_chunks_paths : list of str
+ List of file paths containing the embedded chunks.
+ path_reduced_embeddings_id : str
+ Path to store the reduced embeddings.
+ reducer : UMAP object
+ The UMAP reducer object used for dimensionality reduction.
+ reencode : bool, optional
+ Whether to reencode the embeddings even if the reduced file already exists. Default is False.
+
+ Returns:
+ new_file_paths : list of str
+ List of file paths to the reduced embeddings.
+ """
 new_file_paths=[]
 for file_path in tqdm(embedded_chunks_paths, total=len(embedded_chunks_paths), desc="UMAP transform from files"):

@@ -144,7 +230,30 @@ def process_UMAP(embedded_chunks_paths, path_reduced_embeddings_id, reducer, ree



- def process_HDBSCAN(clusterer, reduced_embeddings_paths, path_predictions_dataset_id, run_soft_clustering= False, reencode = False):
+ def process_HDBSCAN(clusterer,
+ reduced_embeddings_paths: list,
+ path_predictions_dataset_id: str,
+ run_soft_clustering: bool = False,
+ reencode: bool = False) -> list:
+ """
+ Process reduced embeddings using HDBSCAN clustering.
+
+ Parameters:
+ clusterer : hdbscan.hdbscan_.HDBSCAN
+ The HDBSCAN clusterer object.
+ reduced_embeddings_paths : list of str
+ List of file paths containing the reduced embeddings.
+ path_predictions_dataset_id : str
+ Path to store the clustering predictions.
+ run_soft_clustering : bool, optional
+ Whether to perform soft clustering in addition to regular clustering. Default is False.
+ reencode : bool, optional
+ Whether to reencode the embeddings even if the clustering file already exists. Default is False.
+
+ Returns:
+ new_file_paths : list of str
+ List of file paths to the clustering predictions.
+ """
 new_file_paths=[]
 for file_path in tqdm(reduced_embeddings_paths, total=len(reduced_embeddings_paths), desc="HDBSCAN transform from files"):

@@ -8,7 +8,21 @@ from opsci_toolbox.helpers.common import scale_list
 import pandas as pd
 import math

- def create_collocations(lst_text, word_freq, coloc_freq, stop_words):
+ def create_collocations(lst_text : list, word_freq : int, coloc_freq : int, stop_words : list) -> tuple:
+ """
+ Creates collocations (bigrams) from a list of texts and returns their relative frequencies and a DataFrame of word sizes.
+
+ Args:
+ lst_text (List[str]): A list of text documents.
+ word_freq (int): Minimum document frequency for words to be included.
+ coloc_freq (int): Minimum frequency for collocations (bigrams) to be included.
+ stop_words (Set[str]): A set of stop words to be excluded from tokenization.
+
+ Returns:
+ Tuple[List[Tuple[str, str, float]], pd.DataFrame]:
+ - A list of tuples where each tuple contains two words and their relative bigram frequency.
+ - A DataFrame containing words and their sizes based on their counts in the documents.
+ """
 # Tokenize the documents into words using scikit-learn's CountVectorizer
 vectorizer = CountVectorizer(token_pattern=r'[^\s]+', stop_words=stop_words, min_df=word_freq)
 tokenized_documents = vectorizer.fit_transform(lst_text)
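A short usage sketch of the collocation helper documented above (not part of the diff; the module path opsci_toolbox.helpers.sna is an assumption based on the RECORD below, and the corpus is made up):

from opsci_toolbox.helpers.sna import create_collocations  # assumed module path

corpus = [
    "the cat sat on the mat",
    "the cat chased the mouse",
    "a mouse ran under the mat",
]
stop_words = ["the", "a", "on", "under"]

# bigram edges (word, word, relative frequency) and a node table with word sizes
edges, df_nodes = create_collocations(corpus, word_freq=1, coloc_freq=1, stop_words=stop_words)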
@@ -42,7 +56,19 @@ def create_collocations(lst_text, word_freq, coloc_freq, stop_words):
 return edges, df_nodes


- def create_maximum_tree(edges, df_nodes):
+ def create_maximum_tree(edges : list, df_nodes : pd.DataFrame) -> tuple:
+ """
+ Creates a network graph from edges and node attributes, then generates its maximum spanning tree.
+
+ Args:
+ edges (List[Tuple[str, str, float]]): A list of tuples where each tuple contains two nodes and the weight of the edge between them.
+ df_nodes (pd.DataFrame): A DataFrame containing node attributes, where 'word' is the node identifier.
+
+ Returns:
+ Tuple[nx.Graph, nx.Graph]:
+ - The original network graph with node attributes.
+ - The maximum spanning tree of the network graph.
+ """
 attributs=df_nodes.set_index('word')
 dictionnaire=attributs.to_dict('index')

@@ -54,7 +80,17 @@ def create_maximum_tree(edges, df_nodes):

 return network, tree

- def words_partitions(network, resolution = 1.0):
+ def words_partitions(network : nx.Graph, resolution : float = 1.0) -> None:
+ """
+ Partitions the network using the Louvain method and calculates the modularity of the partition.
+
+ Args:
+ network (nx.Graph): The network graph to partition.
+ resolution (float): The resolution parameter for the Louvain method. Higher values lead to smaller communities.
+
+ Returns:
+ None
+ """
 try:
 partition = community_louvain.best_partition(network, resolution=resolution)
 modularity = community_louvain.modularity(partition, network)
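Continuing the collocation sketch, the two graph helpers documented in the hunks above could be chained like this (not part of the diff; import path assumed as before):

from opsci_toolbox.helpers.sna import create_maximum_tree, words_partitions  # assumed module path

# build the co-occurrence graph and its maximum spanning tree, then detect Louvain communities
network, tree = create_maximum_tree(edges, df_nodes)
words_partitions(tree, resolution=1.0)  # stores community ids on nodes under "modularity"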
@@ -69,8 +105,16 @@ def words_partitions(network, resolution = 1.0):
 nx.set_node_attributes(network, partition, "modularity")


- def compute_metrics(network):
- ### CALCUL DE LA CENTRALITE DE DEGRES
+ def compute_metrics(network : nx.Graph) -> None :
+ """
+ Computes and sets centrality metrics for the nodes in the network graph.
+
+ Args:
+ network (nx.Graph): The network graph on which to compute centrality.
+
+ Returns:
+ None
+ """
 try:
 degree_cent = nx.degree_centrality(network)
 nx.set_node_attributes(network, degree_cent, "degree_centrality")
@@ -82,7 +126,6 @@ def compute_metrics(network):
 degree_cent = {node: 0 for node in network.nodes()}
 nx.set_node_attributes(network, degree_cent, "degree_centrality")

-
 ### CALCUL DE LA CENTRALITE DE VECTEUR PROPRE
 try:
 centrality = nx.eigenvector_centrality(network)
@@ -106,7 +149,20 @@ def compute_metrics(network):
 betweenness_cent = {node: 0 for node in network.nodes()}
 nx.set_node_attributes(network, betweenness_cent, "betweenness_centrality")

- def prepare_nodes(T, layout_positions, colormap, min_node_size = 8, max_node_size = 40):
+ def prepare_nodes(T : nx.Graph, layout_positions : dict, colormap : str, min_node_size : int = 8, max_node_size : int = 40) -> None:
+ """
+ Prepares and sets node attributes for a graph based on various centrality measures and colors them using a colormap.
+
+ Args:
+ T (nx.Graph): The input graph.
+ layout_positions (Dict[str, Tuple[float, float]]): A dictionary of node positions for layout.
+ colormap (Colormap): A colormap for generating node colors.
+ min_node_size (int): Minimum node size for scaling. Default is 8.
+ max_node_size (int): Maximum node size for scaling. Default is 40.
+
+ Returns:
+ None
+ """

 # on génère une palette de couleur à partir de colormap
 modularity_palette = generate_color_palette_with_colormap(set(nx.get_node_attributes(T,"modularity").values()), colormap=colormap)
@@ -147,17 +203,52 @@ def prepare_nodes(T, layout_positions, colormap, min_node_size = 8, max_node_siz
 for n, p in layout_positions.items():
 T.nodes[n]['pos'] = p

- def prepare_edges(T, min_edge_size=1, max_edge_size=5):
+ def prepare_edges(T : nx.Graph, min_edge_size : int =1, max_edge_size : int =5) -> None:
+ """
+ Prepares and sets edge attributes for a graph by scaling edge weights.
+
+ Args:
+ T (nx.Graph): The input graph.
+ min_edge_size (int): Minimum edge size for scaling. Default is 1.
+ max_edge_size (int): Maximum edge size for scaling. Default is 5.
+
+ Returns:
+ None
+ """
 w = [e[2]['weight'] for e in T.edges(data=True)]
 scaled_w = scale_list(w, min_edge_size, max_edge_size)
 edges_attributes_dict = {(e[0], e[1]): {'scaled_weight': scaled_w[i]} for i, e in enumerate(T.edges(data=True))}
 nx.set_edge_attributes(T, edges_attributes_dict)


- def layout_graphviz(network, layout ="fdp", args=""):
+ def layout_graphviz(network : nx.Graph, layout : str = "fdp", args : str ="") -> dict:
+ """
+ Generates node positions for a graph using Graphviz layout algorithms.
+
+ Args:
+ network (nx.Graph): The input graph.
+ layout (str): The Graphviz layout algorithm to use (e.g., "dot", "fdp", "sfdp"). Default is "fdp".
+ args (str): Additional arguments to pass to the Graphviz layout algorithm. Default is an empty string.
+
+ Returns:
+ Dict[str, Tuple[float, float]]: A dictionary of node positions.
+ """
 layout_positions = nx.nx_agraph.graphviz_layout(network, prog=layout, args=args)
 return layout_positions

- def layout_spring(network, k = 0.08, scale = 2, iterations = 200, weight="weight"):
+ def layout_spring(network : nx.Graph, k : float = 0.08, scale : int = 2, iterations : int = 200, weight : str ="weight") -> dict:
+ """
+ Generates node positions for a graph using the spring layout algorithm.
+
+ Args:
+ network (nx.Graph): The input graph.
+ k (float): Optimal distance between nodes. Default is 0.08.
+ scale (float): Scale factor for the layout. Default is 2.
+ iterations (int): Number of iterations for the spring layout algorithm. Default is 200.
+ weight (str): Edge attribute to use as weight. Default is "weight".
+
+ Returns:
+ Dict[str, Tuple[float, float]]: A dictionary of node positions.
+ """
 layout_positions = nx.spring_layout(network, k=k, scale=scale, iterations=iterations, weight=weight)
 return layout_positions
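To round off the graph sketches, the styling and layout helpers documented above might be applied to the spanning tree as follows (not part of the diff; the import path and the "tab20" colormap name are assumptions):

from opsci_toolbox.helpers.sna import compute_metrics, layout_spring, prepare_nodes, prepare_edges  # assumed module path

compute_metrics(tree)  # degree, eigenvector and betweenness centralities stored as node attributes
layout_positions = layout_spring(tree, k=0.08, scale=2, iterations=200)
prepare_nodes(tree, layout_positions, colormap="tab20", min_node_size=8, max_node_size=40)  # "tab20" assumed to be an accepted colormap name
prepare_edges(tree, min_edge_size=1, max_edge_size=5)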
@@ -1,9 +1,17 @@
 import pandas as pd
 from tqdm import tqdm

- def generate_index(df, col_author_id ='author_id', col_date='created_time'):
+ def generate_index(df : pd.DataFrame, col_author_id : str ='author_id', col_date : str = 'created_time') -> pd.DataFrame:
 """
- Generates an index based on user_id and date
+ Generates an index based on author ID and creation date.
+
+ Args:
+ df (pd.DataFrame): The input DataFrame containing author IDs and creation dates.
+ col_author_id (str): The column name for author IDs. Default is 'author_id'.
+ col_date (str): The column name for creation dates. Default is 'created_time'.
+
+ Returns:
+ pd.DataFrame: The DataFrame with a new 'index' column containing the generated indices.
 """
 res=[]
 for i, row in tqdm(df.iterrows(), total=df.shape[0], desc="generation des index"):
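A minimal sketch of the indexing helper documented above (not part of the diff; the module path opsci_toolbox.helpers.surreaction is an assumption based on the RECORD below, and the DataFrame is made up):

import pandas as pd
from opsci_toolbox.helpers.surreaction import generate_index  # assumed module path

df = pd.DataFrame({
    "author_id": ["a1", "a1", "a2"],
    "created_time": pd.to_datetime(["2024-01-01", "2024-01-05", "2024-01-03"]),
    "shares": [3, 10, 1],
    "comments": [2, 7, 0],
})

df = generate_index(df)  # adds an 'index' column built from author id and date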
@@ -13,15 +21,25 @@ def generate_index(df, col_author_id ='author_id', col_date='created_time'):

 return df

- def avg_performance(df,
- col_date='created_time',
- col_author_id='author_id',
- col_engagement=['shares', 'comments', 'reactions', 'likes','top_comments', 'love', 'wow', 'haha',
+ def avg_performance(df : pd.DataFrame,
+ col_date : str ='created_time',
+ col_author_id : str ='author_id',
+ col_engagement : list =['shares', 'comments', 'reactions', 'likes','top_comments', 'love', 'wow', 'haha',
 'sad', 'angry','total_engagement', 'replies', 'percentage_replies'],
- rolling_period='7D'):
+ rolling_period : str ='7D') -> pd.DataFrame:

 """
- Function to compute average performance on a rolling period for a list of metrics
+ Computes average performance on a rolling period for a list of engagement metrics.
+
+ Args:
+ df (pd.DataFrame): The input DataFrame containing engagement metrics.
+ col_date (str): The column name for creation dates. Default is 'created_time'.
+ col_author_id (str): The column name for author IDs. Default is 'author_id'.
+ col_engagement (List[str]): A list of columns representing engagement metrics.
+ rolling_period (str): The rolling period for calculating the average. Default is '7D'.
+
+ Returns:
+ pd.DataFrame: The DataFrame with additional columns containing the rolling average of engagement metrics.
 """

 # Nettoyage au cas où
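Building on the DataFrame from the previous sketch, the rolling-average helper documented above could be driven like this (not part of the diff; import path and column choices are assumptions):

from opsci_toolbox.helpers.surreaction import avg_performance  # assumed module path

# 7-day rolling average of the selected engagement metrics, per author
df = avg_performance(df, col_engagement=["shares", "comments"], rolling_period="7D")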
@@ -47,18 +65,32 @@ def avg_performance(df,

 return df

- def kpi_reaction(df, cols):
+ def kpi_reaction(df : pd.DataFrame, cols : list) -> pd.DataFrame:
 """
- Cette fonction prend un dataframe et une liste de colonnes en entrée.
- Pour chaque colonne, on va calculer le taux de sur-réaction.
+ Computes the overreaction rate for each column in the DataFrame.
+
+ Args:
+ df (pd.DataFrame): The input DataFrame containing engagement metrics.
+ cols (List[str]): A list of column names for which to calculate the overreaction rate.
+
+ Returns:
+ pd.DataFrame: The DataFrame with additional columns containing the overreaction rates.
 """
 for col in cols:
 df['tx_'+col]=(df[col]-df[col+'_avg'])/(df[col]+df[col+'_avg'])
 return df

- def get_reactions_type(df, cols, col_dest):
+ def get_reactions_type(df : pd.DataFrame, cols : list, col_dest : str) -> pd.DataFrame:
 """
- Conditional function to return the reaction type based on a list of metrics
+ Returns the reaction type based on a list of metrics for each row in the DataFrame.
+
+ Args:
+ df (pd.DataFrame): The input DataFrame containing engagement metrics.
+ cols (List[str]): A list of column names for which to determine the reaction type.
+ col_dest (str): The name of the column to store the reaction type in.
+
+ Returns:
+ pd.DataFrame: The DataFrame with additional column containing the reaction types.
 """
 all_val=[]

@@ -80,10 +112,20 @@ def get_reactions_type(df, cols, col_dest):
 df[col_dest]=all_val
 return df

- def compute_surreaction(df, col_date, col_author_id, cols_sureaction_metrics, cols_typologie_sureaction, rolling_period_sureaction = '7D'):
+ def compute_surreaction(df : pd.DataFrame, col_date : str, col_author_id : str, cols_sureaction_metrics : list, cols_typologie_sureaction : list, rolling_period_sureaction : str = '7D') -> pd.DataFrame:
 """
- Helpers to compute surreaction and return a dataframe with reaction rates and typology
-
+ Computes surreaction rates and typology for a DataFrame containing engagement metrics.
+
+ Args:
+ df (pd.DataFrame): The input DataFrame containing engagement metrics.
+ col_date (str): The column name for creation dates.
+ col_author_id (str): The column name for author IDs.
+ cols_sureaction_metrics (List[str]): A list of column names for which to calculate surreaction rates.
+ cols_typologie_sureaction (List[str]): A list of column names for categorizing the forms of reaction.
+ rolling_period_sureaction (str): The rolling period for calculating the average and surreaction rates. Default is '7D'.
+
+ Returns:
+ pd.DataFrame: The DataFrame with additional columns containing surreaction rates and typology.
 """
 # on désactive temporairement les messages d'alerte
 pd.options.mode.chained_assignment = None # default='warn'
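Finally, the top-level surreaction helper documented above could be applied to the same DataFrame (not part of the diff; import path and column choices are assumptions):

from opsci_toolbox.helpers.surreaction import compute_surreaction  # assumed module path

# overreaction rates ("tx_" columns) plus a reaction typology, computed over a 7-day window
df = compute_surreaction(
    df,
    col_date="created_time",
    col_author_id="author_id",
    cols_sureaction_metrics=["shares", "comments"],
    cols_typologie_sureaction=["shares", "comments"],
    rolling_period_sureaction="7D",
)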
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: opsci-toolbox
- Version: 0.0.5
+ Version: 0.0.7
 Summary: a complete toolbox
 Home-page: UNKNOWN
 Author: Erwan Le Nagard
@@ -23,9 +23,10 @@ Requires-Dist: networkx (==3.2.1)
 Requires-Dist: nltk (==3.8.1)
 Requires-Dist: numpy (<1.25.0,>=1.21.5)
 Requires-Dist: opencv-python-headless (==4.9.0.80)
+ Requires-Dist: openpyxl (==3.1.3)
 Requires-Dist: pandas (==1.5.3)
 Requires-Dist: plotly (==5.19.0)
- Requires-Dist: protobuf (==5.26.1)
+ Requires-Dist: protobuf (<5,>=3.20)
 Requires-Dist: pyarrow (==14.0.2)
 Requires-Dist: python-louvain (==0.16)
 Requires-Dist: scikit-learn (==1.4.1.post1)
@@ -0,0 +1,21 @@
+ opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ opsci_toolbox/apis/rapidapi_helpers.py,sha256=f2o4ItMZwoAt3ow5bSK-MPkqzP3wzJ857xU0CzDZIyI,23207
+ opsci_toolbox/apis/webscraping.py,sha256=D1A_ixjImPOncbWrKf6Nem2SR4NQraxTbcYqiE64VTY,12263
+ opsci_toolbox/apis/youtube_helpers.py,sha256=CZQ4mP43eA3STWNJ0HjSoJpvz3iHzohSGxmp5ntEgpA,13115
+ opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ opsci_toolbox/helpers/common.py,sha256=TvlGcCdpkfKUaDkahILq3wFLgxwAtgRv5KJRoNy9brw,40339
+ opsci_toolbox/helpers/cv.py,sha256=-uXHncyAr8sDF0ip32LAz7Xae9Z4-T9MH6palpIzq-c,21109
+ opsci_toolbox/helpers/dataviz.py,sha256=JbudfwWPCEEEzP8Vpmu1CMEKaE6O2vtk9xsflW2pT1M,112451
+ opsci_toolbox/helpers/dates.py,sha256=EvNqut2s6S4CaaVFQhIDR-W00TZbt3J04yRYKYhxCkU,2638
+ opsci_toolbox/helpers/nlp.py,sha256=jpZRyTkYeoVH8tzqIT0opZn5unt8cdU1qPdFzXxEOw8,86638
+ opsci_toolbox/helpers/nlp_cuml.py,sha256=w-pkch2Sk_FfVrm1j8NUmmxVvoJXJHuXzGnXGV_FWSE,14153
+ opsci_toolbox/helpers/sna.py,sha256=SZjS21qfBmlkHDJaXi7CaHpj6KhefcsDmJ1A9NRtVeQ,12006
+ opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
+ opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
+ opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
+ opsci_toolbox-0.0.7.dist-info/METADATA,sha256=ErIa8rDRfvT52LjZJcSKU7zougC_1hZa3oWnvPPTzJQ,1601
+ opsci_toolbox-0.0.7.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+ opsci_toolbox-0.0.7.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+ opsci_toolbox-0.0.7.dist-info/RECORD,,
@@ -1,21 +0,0 @@
- opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/apis/rapidapi_helpers.py,sha256=5QbF6ehsmmdTrzp7Q8cF5wrf4DmO91v8YexbybczyHA,23183
- opsci_toolbox/apis/webscraping.py,sha256=D1A_ixjImPOncbWrKf6Nem2SR4NQraxTbcYqiE64VTY,12263
- opsci_toolbox/apis/youtube_helpers.py,sha256=CZQ4mP43eA3STWNJ0HjSoJpvz3iHzohSGxmp5ntEgpA,13115
- opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/helpers/common.py,sha256=41EsQ2pTwQYnUUM1ggwaPueFVj2Qcm_UG7o_Zj41FU8,26152
- opsci_toolbox/helpers/cv.py,sha256=z0HecreIi-vqiOGpDa4VVnHIX_rvkObngrqwTwkWT44,12403
- opsci_toolbox/helpers/dataviz.py,sha256=4wFi0wCMgvIEQEL8okiVJOWxz-eJq5cZ7svHoBbZjnk,77393
- opsci_toolbox/helpers/dates.py,sha256=yQm9pUQAeLTFNPcgeumhi8oErustQJhaoL_HqxSxhiA,996
- opsci_toolbox/helpers/nlp.py,sha256=LGW8CIjrkQvGLKEnxYu7RNrBNViQ5dUygK67EhkBHZo,57999
- opsci_toolbox/helpers/nlp_cuml.py,sha256=Mkbtl9ewbv3aa9rFvhH9VOM5Y0G-XIsXtR_6IeYpebY,9450
- opsci_toolbox/helpers/sna.py,sha256=D6nwgUgbuApXGpT2zoIMip8262hynEwfppVdvaZ4Qm0,8053
- opsci_toolbox/helpers/surreaction.py,sha256=k5hcZZlXnJ-zczRpwfwthggEgFCr9lQsHHKVOPlm7fc,4606
- opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
- opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
- opsci_toolbox-0.0.5.dist-info/METADATA,sha256=Nhp2oK-KXD4JVivU37-T_MsN-VJfbPtJsWlUq7Kp5-A,1566
- opsci_toolbox-0.0.5.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
- opsci_toolbox-0.0.5.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
- opsci_toolbox-0.0.5.dist-info/RECORD,,