opsci-toolbox 0.0.5__py3-none-any.whl → 0.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +1 -0
- opsci_toolbox/helpers/common.py +557 -207
- opsci_toolbox/helpers/cv.py +298 -123
- opsci_toolbox/helpers/dataviz.py +875 -191
- opsci_toolbox/helpers/dates.py +55 -8
- opsci_toolbox/helpers/nlp.py +746 -97
- opsci_toolbox/helpers/nlp_cuml.py +166 -57
- opsci_toolbox/helpers/sna.py +101 -10
- opsci_toolbox/helpers/surreaction.py +58 -16
- {opsci_toolbox-0.0.5.dist-info → opsci_toolbox-0.0.7.dist-info}/METADATA +3 -2
- opsci_toolbox-0.0.7.dist-info/RECORD +21 -0
- opsci_toolbox-0.0.5.dist-info/RECORD +0 -21
- {opsci_toolbox-0.0.5.dist-info → opsci_toolbox-0.0.7.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.5.dist-info → opsci_toolbox-0.0.7.dist-info}/top_level.txt +0 -0
opsci_toolbox/helpers/nlp_cuml.py
CHANGED
@@ -5,7 +5,27 @@ from tqdm import tqdm
 import os
 from opsci_toolbox.helpers.common import load_pickle, create_dir, write_pickle
 
-def reduce_with_cuml_UMAP(embeddings
+def reduce_with_cuml_UMAP(embeddings: np.ndarray,
+                          n_neighbors: int = 5,
+                          n_components: int = 3,
+                          min_dist: float = 0.0,
+                          metric: str = "cosine",
+                          spread: float = 1.0) -> tuple:
+    """
+    Reduces the dimensionality of embeddings using UMAP with cuML library.
+
+    Parameters:
+    - embeddings (np.ndarray): The input embeddings to be reduced.
+    - n_neighbors (int, optional): The number of nearest neighbors to consider. Defaults to 5.
+    - n_components (int, optional): The number of dimensions of the embedded space. Defaults to 3.
+    - min_dist (float, optional): The minimum distance between embedded points. Defaults to 0.0.
+    - metric (str, optional): The metric to use for distance computation. Defaults to "cosine".
+    - spread (float, optional): The effective scale of embedded points. Defaults to 1.0.
+
+    Returns:
+    - reducer (UMAP): The UMAP reducer object.
+    - reduced_embeddings (np.ndarray): The reduced embeddings.
+    """
     reducer = UMAP(n_neighbors=n_neighbors,
                    n_components=n_components,
                    min_dist=min_dist,
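The fully typed signature above makes the expected inputs and defaults explicit. A minimal usage sketch, assuming cuML is installed and the helper is imported from opsci_toolbox.helpers.nlp_cuml (input shapes and variable names are illustrative):

    import numpy as np
    from opsci_toolbox.helpers.nlp_cuml import reduce_with_cuml_UMAP

    # Illustrative input: 1,000 embeddings of dimension 768
    embeddings = np.random.rand(1000, 768).astype(np.float32)

    # Fit UMAP on the GPU and project the embeddings down to 3 dimensions
    reducer, reduced_embeddings = reduce_with_cuml_UMAP(embeddings, n_neighbors=5, n_components=3)
    print(reduced_embeddings.shape)  # (1000, 3)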
@@ -15,61 +35,77 @@ def reduce_with_cuml_UMAP(embeddings, n_neighbors = 5, n_components = 3, min_dis
     reduced_embeddings = reducer.transform(embeddings)
     return reducer, reduced_embeddings
 
-def transform_with_cuml_UMAP(reducer,
+def transform_with_cuml_UMAP(reducer,
+                             new_embeddings: np.ndarray) -> np.ndarray:
     """
-    Transform new data points using a UMAP object
+    Transform new data points using a UMAP object.
+
+    Parameters:
+    - reducer (UMAP): The UMAP reducer object.
+    - new_embeddings (np.ndarray): The new data points to be transformed.
+
+    Returns:
+    - reduced_embeddings (np.ndarray): The transformed embeddings.
     """
     reduced_embeddings = reducer.transform(new_embeddings)
     return reduced_embeddings
 
 
-def hdbscan_cuml_clustering(embeddings
-
-
+def hdbscan_cuml_clustering(embeddings: np.ndarray,
+                            min_cluster_size: int = 5,
+                            min_samples: int = None,
+                            max_cluster_size: int = 0,
+                            metric: str = 'euclidean',
+                            alpha: float = 1.0,
+                            p: int = 2,
+                            cluster_selection_epsilon: float = 0.0,
+                            cluster_selection_method: str = 'eom',
+                            approx_min_span_tree: bool = True,
+                            gen_min_span_tree: bool = False,
+                            gen_condensed_tree: bool = False,
+                            gen_single_linkage_tree_: bool = False,
+                            prediction_data: bool = True) -> tuple:
     """
+    Perform clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm.
+
     Parameters:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    Returns:
-
-
-
-
-
-
-
-    Description:
-    This function performs clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm.
-    It clusters the input data based on the specified parameters and returns the clusterer object, cluster labels for each point, and the
-    probability of each sample being an outlier.
+    embeddings : array-like or sparse matrix, shape (n_samples, n_features)
+        The input data to be clustered.
+    min_cluster_size : int, optional
+        The minimum number of samples in a group for that group to be considered a cluster; groupings smaller than this size will be left as noise.
+    min_samples : int or None, optional
+        The number of samples in a neighborhood for a point to be considered as a core point. This includes the point itself. If ‘None’, it defaults to the min_cluster_size.
+    max_cluster_size : int, optional (default=0)
+        A limit to the size of clusters returned by the eom algorithm. Has no effect when using leaf clustering (where clusters are usually small regardless) and can also be overridden in rare cases by a high value for cluster_selection_epsilon.
+        Note that this should not be used if we want to predict the cluster labels for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument.
+    metric : str or callable, optional
+        The metric to use for distance computation. Default is 'euclidean'.
+    alpha : float, optional
+        Distance scaling parameter as used in robust single linkage.
+    p : int, optional
+        The Minkowski p-norm distance metric parameter. Default is None.
+    cluster_selection_epsilon : float, optional
+        A distance threshold. Clusters below this value will be merged. Note that this should not be used if we want to predict the cluster labels for new points in future (e.g. using approximate_predict), as the approximate_predict function is not aware of this argument.
+    cluster_selection_method : {'eom', 'leaf'}, optional
+        The method used to select clusters from the condensed tree. The standard approach for HDBSCAN* is to use an Excess of Mass algorithm to find the most persistent clusters. Alternatively you can instead select the clusters at the leaves of the tree – this provides the most fine grained and homogeneous clusters. Options are:
+    approx_min_span_tree : bool, optional
+        Whether to compute an approximation of the minimum spanning tree. Default is True.
+    gen_min_span_tree : bool, optional
+        Whether to populate the minimum_spanning_tree_ member for utilizing plotting tools. This requires the hdbscan CPU Python package to be installed.
+    gen_condensed_tree : bool, optional
+        Whether to populate the condensed_tree_ member for utilizing plotting tools.
+    gen_single_linkage_tree_ : bool
+        Whether to populate the single_linkage_tree_ member for utilizing plotting tools.
+    prediction_data : bool, optional
+        Whether the data is prediction data or not. Default is True.
+
+    Returns:
+    clusterer : hdbscan.hdbscan_.HDBSCAN
+        HDBSCAN clusterer object.
+    labels : array, shape (n_samples,)
+        Cluster labels for each point. Noisy samples are given the label -1.
+    probabilities : array, shape (n_samples,)
+        The probability of each sample being an outlier.
     """
     clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                         min_samples=min_samples,
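With the clustering parameters now documented, a minimal sketch of clustering the reduced embeddings from the previous sketch, assuming the cuML HDBSCAN backend is available:

    from opsci_toolbox.helpers.nlp_cuml import hdbscan_cuml_clustering

    # Cluster the 3-D UMAP projection; prediction_data=True keeps the structures
    # needed later to assign labels to unseen points with approximate_predict.
    clusterer, labels, probabilities = hdbscan_cuml_clustering(
        reduced_embeddings,
        min_cluster_size=5,
        metric="euclidean",
        prediction_data=True,
    )
    print(set(labels))  # label -1 marks points treated as noise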
@@ -89,17 +125,39 @@ Description:
 
     return clusterer, clusterer.labels_, clusterer.probabilities_
 
-def transform_with_cuml_HDBSCAN(clusterer, new_embeddings):
+def transform_with_cuml_HDBSCAN(clusterer, new_embeddings: np.ndarray) -> tuple:
     """
-    Transform new data points using
+    Transform new data points using an HDBSCAN object.
+
+    Parameters:
+    clusterer : hdbscan.hdbscan_.HDBSCAN
+        The HDBSCAN clusterer object trained on the original data.
+    new_embeddings : array-like or sparse matrix, shape (n_samples, n_features)
+        The new data points to be transformed.
+
+    Returns:
+    new_data_topic : array, shape (n_samples,)
+        Predicted cluster labels for each new data point.
+    new_data_proba : array, shape (n_samples,)
+        The probability of each new data point being an outlier.
     """
     new_data_topic, new_data_proba = approximate_predict(clusterer, new_embeddings)
     return new_data_topic, new_data_proba
 
 
-def cuml_soft_clustering(clusterer):
+def cuml_soft_clustering(clusterer) -> tuple:
     """
-
+    Perform soft clustering using HDBSCAN.
+
+    Parameters:
+    clusterer : hdbscan.hdbscan_.HDBSCAN
+        The HDBSCAN clusterer object trained on the original data.
+
+    Returns:
+    soft_clusters_val : list of str
+        Predicted cluster labels for each data point, represented as strings.
+    soft_clusters_proba : list of float
+        The maximum probability of each data point belonging to any cluster.
     """
     soft_clusters = all_points_membership_vectors(clusterer)
     soft_clusters_val = [str(np.argmax(x)) for x in soft_clusters]
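A short sketch combining the two helpers above, assuming clusterer was fitted with prediction_data=True and new_embeddings was produced by transform_with_cuml_UMAP:

    from opsci_toolbox.helpers.nlp_cuml import transform_with_cuml_HDBSCAN, cuml_soft_clustering

    # Hard assignment of unseen points via approximate_predict
    new_data_topic, new_data_proba = transform_with_cuml_HDBSCAN(clusterer, new_embeddings)

    # Soft (membership-vector) assignment for the points the clusterer was fitted on
    soft_clusters_val, soft_clusters_proba = cuml_soft_clustering(clusterer)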
@@ -107,17 +165,45 @@ def cuml_soft_clustering(clusterer):
     return soft_clusters_val, soft_clusters_proba
 
 
-def soft_cuml_clustering_new_data(clusterer, embeddings):
+def soft_cuml_clustering_new_data(clusterer, embeddings: np.ndarray) -> tuple:
     """
-
+    Predict cluster memberships for new data points using HDBSCAN soft clustering.
+
+    Parameters:
+    clusterer : hdbscan.hdbscan_.HDBSCAN
+        The HDBSCAN clusterer object trained on the original data.
+    embeddings : array-like or sparse matrix, shape (n_samples, n_features)
+        The new data points to be clustered.
+
+    Returns:
+    soft_clusters_val : list of str
+        Predicted cluster labels for each new data point, represented as strings.
+    soft_clusters_proba : list of float
+        The maximum probability of each new data point belonging to any cluster.
     """
-    soft_clusters =membership_vector(clusterer, embeddings)
+    soft_clusters = membership_vector(clusterer, embeddings)
     soft_clusters_val = [str(np.argmax(x)) for x in soft_clusters]
     soft_clusters_proba = [np.max(x) for x in soft_clusters]
     return soft_clusters_val, soft_clusters_proba
 
-def process_UMAP(embedded_chunks_paths, path_reduced_embeddings_id, reducer, reencode =
+def process_UMAP(embedded_chunks_paths: list, path_reduced_embeddings_id: str, reducer, reencode: bool = False) -> list:
+    """
+    Process embeddings using UMAP reduction.
 
+    Parameters:
+    embedded_chunks_paths : list of str
+        List of file paths containing the embedded chunks.
+    path_reduced_embeddings_id : str
+        Path to store the reduced embeddings.
+    reducer : UMAP object
+        The UMAP reducer object used for dimensionality reduction.
+    reencode : bool, optional
+        Whether to reencode the embeddings even if the reduced file already exists. Default is False.
+
+    Returns:
+    new_file_paths : list of str
+        List of file paths to the reduced embeddings.
+    """
     new_file_paths=[]
     for file_path in tqdm(embedded_chunks_paths, total=len(embedded_chunks_paths), desc="UMAP transform from files"):
 
@@ -144,7 +230,30 @@ def process_UMAP(embedded_chunks_paths, path_reduced_embeddings_id, reducer, ree
 
 
 
-def process_HDBSCAN(clusterer,
+def process_HDBSCAN(clusterer,
+                    reduced_embeddings_paths: list,
+                    path_predictions_dataset_id: str,
+                    run_soft_clustering: bool = False,
+                    reencode: bool = False) -> list:
+    """
+    Process reduced embeddings using HDBSCAN clustering.
+
+    Parameters:
+    clusterer : hdbscan.hdbscan_.HDBSCAN
+        The HDBSCAN clusterer object.
+    reduced_embeddings_paths : list of str
+        List of file paths containing the reduced embeddings.
+    path_predictions_dataset_id : str
+        Path to store the clustering predictions.
+    run_soft_clustering : bool, optional
+        Whether to perform soft clustering in addition to regular clustering. Default is False.
+    reencode : bool, optional
+        Whether to reencode the embeddings even if the clustering file already exists. Default is False.
+
+    Returns:
+    new_file_paths : list of str
+        List of file paths to the clustering predictions.
+    """
     new_file_paths=[]
     for file_path in tqdm(reduced_embeddings_paths, total=len(reduced_embeddings_paths), desc="HDBSCAN transform from files"):
 
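Taken together, the two process_* helpers describe a file-based pipeline: each pickled chunk of embeddings is reduced with the fitted reducer, then clustered and written out as a predictions dataset. A sketch of that flow, assuming reducer and clusterer were fitted as in the earlier sketches (all paths are illustrative):

    from opsci_toolbox.helpers.nlp_cuml import process_UMAP, process_HDBSCAN

    # Illustrative chunk files produced by an upstream embedding step
    embedded_chunks_paths = ["data/embeddings/chunk_0.pickle", "data/embeddings/chunk_1.pickle"]

    # Reduce every chunk and collect the paths of the reduced files
    reduced_paths = process_UMAP(embedded_chunks_paths, "data/reduced/run_01", reducer, reencode=False)

    # Cluster every reduced chunk and collect the paths of the prediction datasets
    prediction_paths = process_HDBSCAN(clusterer, reduced_paths, "data/predictions/run_01",
                                       run_soft_clustering=True, reencode=False)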
opsci_toolbox/helpers/sna.py
CHANGED
@@ -8,7 +8,21 @@ from opsci_toolbox.helpers.common import scale_list
 import pandas as pd
 import math
 
-def create_collocations(lst_text, word_freq, coloc_freq, stop_words):
+def create_collocations(lst_text : list, word_freq : int, coloc_freq : int, stop_words : list) -> tuple:
+    """
+    Creates collocations (bigrams) from a list of texts and returns their relative frequencies and a DataFrame of word sizes.
+
+    Args:
+        lst_text (List[str]): A list of text documents.
+        word_freq (int): Minimum document frequency for words to be included.
+        coloc_freq (int): Minimum frequency for collocations (bigrams) to be included.
+        stop_words (Set[str]): A set of stop words to be excluded from tokenization.
+
+    Returns:
+        Tuple[List[Tuple[str, str, float]], pd.DataFrame]:
+            - A list of tuples where each tuple contains two words and their relative bigram frequency.
+            - A DataFrame containing words and their sizes based on their counts in the documents.
+    """
     # Tokenize the documents into words using scikit-learn's CountVectorizer
     vectorizer = CountVectorizer(token_pattern=r'[^\s]+', stop_words=stop_words, min_df=word_freq)
     tokenized_documents = vectorizer.fit_transform(lst_text)
@@ -42,7 +56,19 @@ def create_collocations(lst_text, word_freq, coloc_freq, stop_words):
     return edges, df_nodes
 
 
-def create_maximum_tree(edges, df_nodes):
+def create_maximum_tree(edges : list, df_nodes : pd.DataFrame) -> tuple:
+    """
+    Creates a network graph from edges and node attributes, then generates its maximum spanning tree.
+
+    Args:
+        edges (List[Tuple[str, str, float]]): A list of tuples where each tuple contains two nodes and the weight of the edge between them.
+        df_nodes (pd.DataFrame): A DataFrame containing node attributes, where 'word' is the node identifier.
+
+    Returns:
+        Tuple[nx.Graph, nx.Graph]:
+            - The original network graph with node attributes.
+            - The maximum spanning tree of the network graph.
+    """
     attributs=df_nodes.set_index('word')
     dictionnaire=attributs.to_dict('index')
 
@@ -54,7 +80,17 @@ def create_maximum_tree(edges, df_nodes):
 
     return network, tree
 
-def words_partitions(network, resolution = 1.0):
+def words_partitions(network : nx.Graph, resolution : float = 1.0) -> None:
+    """
+    Partitions the network using the Louvain method and calculates the modularity of the partition.
+
+    Args:
+        network (nx.Graph): The network graph to partition.
+        resolution (float): The resolution parameter for the Louvain method. Higher values lead to smaller communities.
+
+    Returns:
+        None
+    """
     try:
         partition = community_louvain.best_partition(network, resolution=resolution)
         modularity = community_louvain.modularity(partition, network)
@@ -69,8 +105,16 @@ def words_partitions(network, resolution = 1.0):
     nx.set_node_attributes(network, partition, "modularity")
 
 
-def compute_metrics(network):
-
+def compute_metrics(network : nx.Graph) -> None :
+    """
+    Computes and sets centrality metrics for the nodes in the network graph.
+
+    Args:
+        network (nx.Graph): The network graph on which to compute centrality.
+
+    Returns:
+        None
+    """
     try:
         degree_cent = nx.degree_centrality(network)
         nx.set_node_attributes(network, degree_cent, "degree_centrality")
@@ -82,7 +126,6 @@ def compute_metrics(network):
         degree_cent = {node: 0 for node in network.nodes()}
         nx.set_node_attributes(network, degree_cent, "degree_centrality")
 
-
     ### CALCUL DE LA CENTRALITE DE VECTEUR PROPRE
     try:
         centrality = nx.eigenvector_centrality(network)
@@ -106,7 +149,20 @@ def compute_metrics(network):
         betweenness_cent = {node: 0 for node in network.nodes()}
         nx.set_node_attributes(network, betweenness_cent, "betweenness_centrality")
 
-def prepare_nodes(T, layout_positions, colormap, min_node_size = 8, max_node_size = 40):
+def prepare_nodes(T : nx.Graph, layout_positions : dict, colormap : str, min_node_size : int = 8, max_node_size : int = 40) -> None:
+    """
+    Prepares and sets node attributes for a graph based on various centrality measures and colors them using a colormap.
+
+    Args:
+        T (nx.Graph): The input graph.
+        layout_positions (Dict[str, Tuple[float, float]]): A dictionary of node positions for layout.
+        colormap (Colormap): A colormap for generating node colors.
+        min_node_size (int): Minimum node size for scaling. Default is 8.
+        max_node_size (int): Maximum node size for scaling. Default is 40.
+
+    Returns:
+        None
+    """
 
     # on génère une palette de couleur à partir de colormap
     modularity_palette = generate_color_palette_with_colormap(set(nx.get_node_attributes(T,"modularity").values()), colormap=colormap)
@@ -147,17 +203,52 @@ def prepare_nodes(T, layout_positions, colormap, min_node_siz
     for n, p in layout_positions.items():
         T.nodes[n]['pos'] = p
 
-def prepare_edges(T, min_edge_size=1, max_edge_size=5):
+def prepare_edges(T : nx.Graph, min_edge_size : int =1, max_edge_size : int =5) -> None:
+    """
+    Prepares and sets edge attributes for a graph by scaling edge weights.
+
+    Args:
+        T (nx.Graph): The input graph.
+        min_edge_size (int): Minimum edge size for scaling. Default is 1.
+        max_edge_size (int): Maximum edge size for scaling. Default is 5.
+
+    Returns:
+        None
+    """
     w = [e[2]['weight'] for e in T.edges(data=True)]
     scaled_w = scale_list(w, min_edge_size, max_edge_size)
     edges_attributes_dict = {(e[0], e[1]): {'scaled_weight': scaled_w[i]} for i, e in enumerate(T.edges(data=True))}
     nx.set_edge_attributes(T, edges_attributes_dict)
 
 
-def layout_graphviz(network, layout ="fdp", args=""):
+def layout_graphviz(network : nx.Graph, layout : str = "fdp", args : str ="") -> dict:
+    """
+    Generates node positions for a graph using Graphviz layout algorithms.
+
+    Args:
+        network (nx.Graph): The input graph.
+        layout (str): The Graphviz layout algorithm to use (e.g., "dot", "fdp", "sfdp"). Default is "fdp".
+        args (str): Additional arguments to pass to the Graphviz layout algorithm. Default is an empty string.
+
+    Returns:
+        Dict[str, Tuple[float, float]]: A dictionary of node positions.
+    """
     layout_positions = nx.nx_agraph.graphviz_layout(network, prog=layout, args=args)
     return layout_positions
 
-def layout_spring(network, k = 0.08, scale = 2, iterations = 200, weight="weight"):
+def layout_spring(network : nx.Graph, k : float = 0.08, scale : int = 2, iterations : int = 200, weight : str ="weight") -> dict:
+    """
+    Generates node positions for a graph using the spring layout algorithm.
+
+    Args:
+        network (nx.Graph): The input graph.
+        k (float): Optimal distance between nodes. Default is 0.08.
+        scale (float): Scale factor for the layout. Default is 2.
+        iterations (int): Number of iterations for the spring layout algorithm. Default is 200.
+        weight (str): Edge attribute to use as weight. Default is "weight".
+
+    Returns:
+        Dict[str, Tuple[float, float]]: A dictionary of node positions.
+    """
     layout_positions = nx.spring_layout(network, k=k, scale=scale, iterations=iterations, weight=weight)
     return layout_positions
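The sna.py helpers now document an end-to-end word-network workflow: build bigram collocations, derive a maximum spanning tree, partition it with Louvain, compute centralities, then lay it out and style nodes and edges. A minimal sketch of that chain, assuming the colormap argument follows the new str annotation (texts and parameter values are illustrative):

    from opsci_toolbox.helpers.sna import (create_collocations, create_maximum_tree,
                                           words_partitions, compute_metrics,
                                           layout_spring, prepare_nodes, prepare_edges)

    texts = ["gpu accelerated clustering", "gpu accelerated umap", "word network example"]
    edges, df_nodes = create_collocations(texts, word_freq=1, coloc_freq=1, stop_words=[])

    network, tree = create_maximum_tree(edges, df_nodes)  # full graph + maximum spanning tree
    words_partitions(tree, resolution=1.0)                # Louvain communities stored as node attributes
    compute_metrics(tree)                                 # centrality measures stored as node attributes

    positions = layout_spring(tree, k=0.08, scale=2, iterations=200)
    prepare_nodes(tree, positions, colormap="viridis", min_node_size=8, max_node_size=40)
    prepare_edges(tree, min_edge_size=1, max_edge_size=5)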
opsci_toolbox/helpers/surreaction.py
CHANGED
@@ -1,9 +1,17 @@
 import pandas as pd
 from tqdm import tqdm
 
-def generate_index(df, col_author_id ='author_id', col_date='created_time'):
+def generate_index(df : pd.DataFrame, col_author_id : str ='author_id', col_date : str = 'created_time') -> pd.DataFrame:
     """
-    Generates an index based on
+    Generates an index based on author ID and creation date.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame containing author IDs and creation dates.
+        col_author_id (str): The column name for author IDs. Default is 'author_id'.
+        col_date (str): The column name for creation dates. Default is 'created_time'.
+
+    Returns:
+        pd.DataFrame: The DataFrame with a new 'index' column containing the generated indices.
     """
     res=[]
     for i, row in tqdm(df.iterrows(), total=df.shape[0], desc="generation des index"):
@@ -13,15 +21,25 @@ def generate_index(df, col_author_id ='author_id', col_date='created_time'):
 
     return df
 
-def avg_performance(df,
-                    col_date='created_time',
-                    col_author_id='author_id',
-                    col_engagement=['shares', 'comments', 'reactions', 'likes','top_comments', 'love', 'wow', 'haha',
+def avg_performance(df : pd.DataFrame,
+                    col_date : str ='created_time',
+                    col_author_id : str ='author_id',
+                    col_engagement : list =['shares', 'comments', 'reactions', 'likes','top_comments', 'love', 'wow', 'haha',
                         'sad', 'angry','total_engagement', 'replies', 'percentage_replies'],
-                    rolling_period='7D'):
+                    rolling_period : str ='7D') -> pd.DataFrame:
 
     """
-
+    Computes average performance on a rolling period for a list of engagement metrics.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame containing engagement metrics.
+        col_date (str): The column name for creation dates. Default is 'created_time'.
+        col_author_id (str): The column name for author IDs. Default is 'author_id'.
+        col_engagement (List[str]): A list of columns representing engagement metrics.
+        rolling_period (str): The rolling period for calculating the average. Default is '7D'.
+
+    Returns:
+        pd.DataFrame: The DataFrame with additional columns containing the rolling average of engagement metrics.
     """
 
     # Nettoyage au cas où
@@ -47,18 +65,32 @@ def avg_performance(df,
 
     return df
 
-def kpi_reaction(df, cols):
+def kpi_reaction(df : pd.DataFrame, cols : list) -> pd.DataFrame:
     """
-
-
+    Computes the overreaction rate for each column in the DataFrame.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame containing engagement metrics.
+        cols (List[str]): A list of column names for which to calculate the overreaction rate.
+
+    Returns:
+        pd.DataFrame: The DataFrame with additional columns containing the overreaction rates.
     """
     for col in cols:
         df['tx_'+col]=(df[col]-df[col+'_avg'])/(df[col]+df[col+'_avg'])
     return df
 
-def get_reactions_type(df, cols, col_dest):
+def get_reactions_type(df : pd.DataFrame, cols : list, col_dest : str) -> pd.DataFrame:
     """
-
+    Returns the reaction type based on a list of metrics for each row in the DataFrame.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame containing engagement metrics.
+        cols (List[str]): A list of column names for which to determine the reaction type.
+        col_dest (str): The name of the column to store the reaction type in.
+
+    Returns:
+        pd.DataFrame: The DataFrame with additional column containing the reaction types.
     """
     all_val=[]
 
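The overreaction rate written by kpi_reaction is a normalized difference, tx_col = (col - col_avg) / (col + col_avg), so it ranges from -1 (far below the rolling average) to +1 (far above it). A small worked sketch, assuming the *_avg columns were produced upstream by avg_performance:

    import pandas as pd
    from opsci_toolbox.helpers.surreaction import kpi_reaction

    # Two posts: one well above its 7-day average of 10 shares, one well below
    df = pd.DataFrame({"shares": [30, 5], "shares_avg": [10.0, 10.0]})
    df = kpi_reaction(df, ["shares"])
    print(df["tx_shares"].tolist())  # [0.5, -0.333...] since (30-10)/(30+10)=0.5 and (5-10)/(5+10)=-1/3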
@@ -80,10 +112,20 @@ def get_reactions_type(df, cols, col_dest):
     df[col_dest]=all_val
     return df
 
-def compute_surreaction(df, col_date, col_author_id, cols_sureaction_metrics, cols_typologie_sureaction, rolling_period_sureaction = '7D'):
+def compute_surreaction(df : pd.DataFrame, col_date : str, col_author_id : str, cols_sureaction_metrics : list, cols_typologie_sureaction : list, rolling_period_sureaction : str = '7D') -> pd.DataFrame:
     """
-
-
+    Computes surreaction rates and typology for a DataFrame containing engagement metrics.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame containing engagement metrics.
+        col_date (str): The column name for creation dates.
+        col_author_id (str): The column name for author IDs.
+        cols_sureaction_metrics (List[str]): A list of column names for which to calculate surreaction rates.
+        cols_typologie_sureaction (List[str]): A list of column names for categorizing the forms of reaction.
+        rolling_period_sureaction (str): The rolling period for calculating the average and surreaction rates. Default is '7D'.
+
+    Returns:
+        pd.DataFrame: The DataFrame with additional columns containing surreaction rates and typology.
     """
     # on désactive temporairement les messages d'alerte
     pd.options.mode.chained_assignment = None # default='warn'
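compute_surreaction chains the steps above (per-author rolling average, overreaction rate, reaction typology) in a single call. A hedged call sketch, assuming df holds the engagement columns plus 'created_time' and 'author_id':

    from opsci_toolbox.helpers.surreaction import compute_surreaction

    df = compute_surreaction(
        df,
        col_date="created_time",
        col_author_id="author_id",
        cols_sureaction_metrics=["shares", "comments", "likes"],
        cols_typologie_sureaction=["shares", "comments", "likes"],
        rolling_period_sureaction="7D",
    )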
{opsci_toolbox-0.0.5.dist-info → opsci_toolbox-0.0.7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: opsci-toolbox
-Version: 0.0.5
+Version: 0.0.7
 Summary: a complete toolbox
 Home-page: UNKNOWN
 Author: Erwan Le Nagard
@@ -23,9 +23,10 @@ Requires-Dist: networkx (==3.2.1)
 Requires-Dist: nltk (==3.8.1)
 Requires-Dist: numpy (<1.25.0,>=1.21.5)
 Requires-Dist: opencv-python-headless (==4.9.0.80)
+Requires-Dist: openpyxl (==3.1.3)
 Requires-Dist: pandas (==1.5.3)
 Requires-Dist: plotly (==5.19.0)
-Requires-Dist: protobuf (
+Requires-Dist: protobuf (<5,>=3.20)
 Requires-Dist: pyarrow (==14.0.2)
 Requires-Dist: python-louvain (==0.16)
 Requires-Dist: scikit-learn (==1.4.1.post1)
opsci_toolbox-0.0.7.dist-info/RECORD
ADDED
@@ -0,0 +1,21 @@
+opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/apis/rapidapi_helpers.py,sha256=f2o4ItMZwoAt3ow5bSK-MPkqzP3wzJ857xU0CzDZIyI,23207
+opsci_toolbox/apis/webscraping.py,sha256=D1A_ixjImPOncbWrKf6Nem2SR4NQraxTbcYqiE64VTY,12263
+opsci_toolbox/apis/youtube_helpers.py,sha256=CZQ4mP43eA3STWNJ0HjSoJpvz3iHzohSGxmp5ntEgpA,13115
+opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/helpers/common.py,sha256=TvlGcCdpkfKUaDkahILq3wFLgxwAtgRv5KJRoNy9brw,40339
+opsci_toolbox/helpers/cv.py,sha256=-uXHncyAr8sDF0ip32LAz7Xae9Z4-T9MH6palpIzq-c,21109
+opsci_toolbox/helpers/dataviz.py,sha256=JbudfwWPCEEEzP8Vpmu1CMEKaE6O2vtk9xsflW2pT1M,112451
+opsci_toolbox/helpers/dates.py,sha256=EvNqut2s6S4CaaVFQhIDR-W00TZbt3J04yRYKYhxCkU,2638
+opsci_toolbox/helpers/nlp.py,sha256=jpZRyTkYeoVH8tzqIT0opZn5unt8cdU1qPdFzXxEOw8,86638
+opsci_toolbox/helpers/nlp_cuml.py,sha256=w-pkch2Sk_FfVrm1j8NUmmxVvoJXJHuXzGnXGV_FWSE,14153
+opsci_toolbox/helpers/sna.py,sha256=SZjS21qfBmlkHDJaXi7CaHpj6KhefcsDmJ1A9NRtVeQ,12006
+opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
+opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
+opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
+opsci_toolbox-0.0.7.dist-info/METADATA,sha256=ErIa8rDRfvT52LjZJcSKU7zougC_1hZa3oWnvPPTzJQ,1601
+opsci_toolbox-0.0.7.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+opsci_toolbox-0.0.7.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+opsci_toolbox-0.0.7.dist-info/RECORD,,
opsci_toolbox-0.0.5.dist-info/RECORD
DELETED
@@ -1,21 +0,0 @@
-opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/apis/rapidapi_helpers.py,sha256=5QbF6ehsmmdTrzp7Q8cF5wrf4DmO91v8YexbybczyHA,23183
-opsci_toolbox/apis/webscraping.py,sha256=D1A_ixjImPOncbWrKf6Nem2SR4NQraxTbcYqiE64VTY,12263
-opsci_toolbox/apis/youtube_helpers.py,sha256=CZQ4mP43eA3STWNJ0HjSoJpvz3iHzohSGxmp5ntEgpA,13115
-opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/helpers/common.py,sha256=41EsQ2pTwQYnUUM1ggwaPueFVj2Qcm_UG7o_Zj41FU8,26152
-opsci_toolbox/helpers/cv.py,sha256=z0HecreIi-vqiOGpDa4VVnHIX_rvkObngrqwTwkWT44,12403
-opsci_toolbox/helpers/dataviz.py,sha256=4wFi0wCMgvIEQEL8okiVJOWxz-eJq5cZ7svHoBbZjnk,77393
-opsci_toolbox/helpers/dates.py,sha256=yQm9pUQAeLTFNPcgeumhi8oErustQJhaoL_HqxSxhiA,996
-opsci_toolbox/helpers/nlp.py,sha256=LGW8CIjrkQvGLKEnxYu7RNrBNViQ5dUygK67EhkBHZo,57999
-opsci_toolbox/helpers/nlp_cuml.py,sha256=Mkbtl9ewbv3aa9rFvhH9VOM5Y0G-XIsXtR_6IeYpebY,9450
-opsci_toolbox/helpers/sna.py,sha256=D6nwgUgbuApXGpT2zoIMip8262hynEwfppVdvaZ4Qm0,8053
-opsci_toolbox/helpers/surreaction.py,sha256=k5hcZZlXnJ-zczRpwfwthggEgFCr9lQsHHKVOPlm7fc,4606
-opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
-opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
-opsci_toolbox-0.0.5.dist-info/METADATA,sha256=Nhp2oK-KXD4JVivU37-T_MsN-VJfbPtJsWlUq7Kp5-A,1566
-opsci_toolbox-0.0.5.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
-opsci_toolbox-0.0.5.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
-opsci_toolbox-0.0.5.dist-info/RECORD,,
{opsci_toolbox-0.0.5.dist-info → opsci_toolbox-0.0.7.dist-info}/WHEEL
File without changes
{opsci_toolbox-0.0.5.dist-info → opsci_toolbox-0.0.7.dist-info}/top_level.txt
File without changes