opsci-toolbox 0.0.7__py3-none-any.whl → 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +120 -21
- opsci_toolbox/apis/webscraping.py +186 -59
- opsci_toolbox/apis/youtube_helpers.py +103 -16
- opsci_toolbox/helpers/common.py +368 -254
- opsci_toolbox/helpers/cv.py +50 -60
- opsci_toolbox/helpers/dataviz.py +255 -184
- opsci_toolbox/helpers/dates.py +17 -18
- opsci_toolbox/helpers/nlp.py +154 -114
- opsci_toolbox/helpers/nlp_cuml.py +389 -36
- opsci_toolbox/helpers/sna.py +509 -0
- opsci_toolbox/helpers/sql.py +53 -0
- {opsci_toolbox-0.0.7.dist-info → opsci_toolbox-0.0.8.dist-info}/METADATA +14 -9
- opsci_toolbox-0.0.8.dist-info/RECORD +22 -0
- opsci_toolbox-0.0.7.dist-info/RECORD +0 -21
- {opsci_toolbox-0.0.7.dist-info → opsci_toolbox-0.0.8.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.7.dist-info → opsci_toolbox-0.0.8.dist-info}/top_level.txt +0 -0
opsci_toolbox/helpers/sna.py
CHANGED
@@ -7,6 +7,515 @@ from opsci_toolbox.helpers.dataviz import generate_color_palette_with_colormap,
 from opsci_toolbox.helpers.common import scale_list
 import pandas as pd
 import math
+from collections import Counter
+from opsci_toolbox.helpers.dataviz import boxplot
+from fa2_modified import ForceAtlas2
+
+def group_nodes_by_values(dictionnary : dict) -> dict:
+    """
+    Group nodes by their values from a dictionary.
+
+    Args:
+        dictionnary (Dict[Any, Any]): A dictionary where keys are nodes and values are attributes
+        or categories.
+
+    Returns:
+        Dict[Any, List[Any]]: A dictionary where each key is a unique value from the input dictionary,
+        and the corresponding value is a list of nodes that have that value.
+
+    """
+    new_dict = {}
+    for node, comm in dictionnary.items():
+        if comm not in new_dict:
+            new_dict[comm] = []
+        new_dict[comm].append(node)
+    return new_dict
+
+def graph_key_metrics(G : nx.Graph) -> dict:
+    """
+    Calculate key metrics for a NetworkX graph.
+
+    Args:
+        G (nx.Graph): The NetworkX graph for which to calculate metrics.
+
+    Returns:
+        Dict[str, float]: A dictionary containing the following metrics:
+            - "nodes": Number of nodes in the graph.
+            - "edges": Number of edges in the graph.
+            - "density": Density of the graph.
+            - "average_degree": Average degree of nodes in the graph.
+            - "assortativity": Degree assortativity coefficient of the graph.
+            - "transitivity": Transitivity (global clustering coefficient) of the graph.
+    """
+    num_nodes = G.number_of_nodes()
+    num_edges = G.number_of_edges()
+    density = nx.density(G)
+    total_degree = sum(degree for _, degree in G.degree())
+    avg_degree = total_degree / num_nodes if num_nodes > 0 else 0
+    assortativity = nx.degree_assortativity_coefficient(G)
+    transitivity = nx.transitivity(G)
+    key_metrics = {
+        "nodes": num_nodes,
+        "edges" : num_edges,
+        "density" : density,
+        "average_degree" : avg_degree,
+        "assortativity" : assortativity,
+        "transitivity" : transitivity
+    }
+    return key_metrics
+
+def communities_metrics(G : nx.Graph, nodes_by_community : dict) -> dict:
+    """
+    Calculate various metrics for communities within a subgraph.
+
+    Args:
+        G (nx.Graph): The NetworkX graph containing the communities.
+        nodes_by_community (Dict[Any, List[Any]]): A dictionary where keys are community identifiers and
+        values are lists of nodes in each community.
+
+    Returns:
+        Dict[Any, Dict[str, float]]: A dictionary where each key is a community identifier, and the value
+        is another dictionary containing various metrics for that community.
+    """
+    communities_metrics = {}
+    for comm, nodes in nodes_by_community.items():
+        subgraph = G.subgraph(nodes)
+        num_nodes = subgraph.number_of_nodes()
+        num_edges = subgraph.number_of_edges()
+        density = nx.density(subgraph)
+        total_degree = sum(degree for _, degree in subgraph.degree())
+        avg_degree = total_degree / num_nodes if num_nodes > 0 else 0
+        assortativity = nx.degree_assortativity_coefficient(subgraph)
+        transitivity = nx.transitivity(subgraph)
+        communities_metrics[comm] = {
+            "nodes": num_nodes,
+            "edges": num_edges,
+            "density": density,
+            "average_degree": avg_degree,
+            "assortativity": assortativity,
+            "transitivity": transitivity,
+        }
+    return communities_metrics
+
+def remove_attributes(G : nx.Graph, attributes : list = ['degree_centrality', 'in_degree_centrality', 'out_degree_centrality', 'eigenvector_centrality', 'degree', 'in_degree', 'out_degree', 'composante', 'betweenness_centrality', 'viz']) -> nx.Graph:
+    """
+    Remove specified attributes from all nodes in a NetworkX graph.
+
+    Args:
+        G (nx.Graph): The NetworkX graph from which to remove node attributes.
+        attributes (List[str], optional): List of attribute names to remove from each node.
+        Defaults to common graph attributes.
+
+    Returns:
+        nx.Graph: The graph with the specified attributes removed from each node.
+    """
+    for node, attrs in G.nodes(data=True):
+        for attr in attributes:
+            attrs.pop(attr, None)
+    return G
+
+def compute_modularity(G : nx.Graph, resolution : float =1, col_name : str = "modularity") -> dict:
+    """
+    Compute modularity of a graph using the Louvain method and assign community labels as node attributes.
+
+    Args:
+        G (nx.Graph): The input graph for which to compute modularity.
+        resolution (float, optional): The resolution parameter for the Louvain method. Default is 1.
+        col_name (str, optional): The name of the node attribute to store community labels. Default is "modularity".
+
+    Returns:
+        Dict[int, int]: A dictionary mapping each node to its community.
+    """
+    try :
+        communities = nx.community.louvain_communities(G, resolution=resolution)
+        community_dict=transform_dict_of_nodes(communities)
+    except Exception as e:
+        pass
+        print(e)
+        community_dict = {node: 0 for node in G.nodes()}
+    nx.set_node_attributes(G, community_dict, col_name)
+    return community_dict
+
+def compute_degrees(G : nx.Graph, col_name : str = "degree") -> dict:
+    """
+    Compute the degrees of nodes in a graph and assign them as node attributes.
+
+    Args:
+        G (nx.Graph): The input graph for which to compute node degrees.
+        col_name (str, optional): The name of the node attribute to store degrees. Default is "degree".
+
+    Returns:
+        Dict[int, int]: A dictionary mapping each node to its degree.
+    """
+    try:
+        degree_dict = {node[0] : node[1] for node in list(G.degree())}
+    except Exception as e:
+        pass
+        print(e)
+        degree_dict = {node: 0 for node in G.nodes()}
+    nx.set_node_attributes(G, degree_dict, col_name)
+    return degree_dict
+
+def compute_in_degrees(G: nx.Graph, col_name : str = "in_degree") -> dict:
+    """
+    Compute the in degrees of nodes in a graph and assign them as node attributes.
+
+    Args:
+        G (nx.Graph): The input graph for which to compute node in degrees.
+        col_name (str, optional): The name of the node attribute to store in degrees. Default is "in_degree".
+
+    Returns:
+        Dict[int, int]: A dictionary mapping each node to its degree.
+    """
+    try:
+        in_degree_dict = {node[0] : node[1] for node in list(G.in_degree())}
+    except Exception as e :
+        pass
+        print(e)
+        in_degree_dict = {node: 0 for node in G.nodes()}
+    nx.set_node_attributes(G, in_degree_dict, col_name)
+    return in_degree_dict
+
+def compute_out_degrees(G : nx.Graph, col_name : str = "out_degree") -> dict:
+    """
+    Compute the out degrees of nodes in a graph and assign them as node attributes.
+
+    Args:
+        G (nx.Graph): The input graph for which to compute node out degrees.
+        col_name (str, optional): The name of the node attribute to store in degrees. Default is "out_degree".
+
+    Returns:
+        Dict[int, int]: A dictionary mapping each node to its degree.
+    """
+    try:
+        out_degree_dict = {node[0] : node[1] for node in list(G.out_degree())}
+    except Exception as e:
+        pass
+        print(e)
+        out_degree_dict = {node: 0 for node in G.nodes()}
+    nx.set_node_attributes(G, out_degree_dict, col_name)
+    return out_degree_dict
+
+def compute_degree_centrality(G : nx.Graph, col_name : str = "degree_centrality") -> dict :
+    """
+    Computes and sets Degree centrality metric for the nodes in the network graph.
+
+    Args:
+        network (nx.Graph): The network graph on which to compute centrality.
+
+    Returns:
+        None
+    """
+    try:
+        degree_cent = nx.degree_centrality(G)
+        nx.set_node_attributes(G, degree_cent, col_name)
+        # print("Calcul de la centralité de degrés effectué")
+    except Exception as e:
+        pass
+        # print(e, "Calcul de la centralité de degrés impossible")
+        # Set a default value for degree centrality
+        degree_cent = {node: 0 for node in G.nodes()}
+        nx.set_node_attributes(G, degree_cent, col_name)
+    return degree_cent
+
+def compute_in_degree_centrality(G : nx.Graph, col_name : str = "in_degree_centrality") -> dict :
+    """
+    Computes and sets In Degree centrality metric for the nodes in the network graph.
+
+    Args:
+        network (nx.Graph): The network graph on which to compute centrality.
+
+    Returns:
+        None
+    """
+    try:
+        in_degree_cent = nx.in_degree_centrality(G)
+        nx.set_node_attributes(G, in_degree_cent, col_name)
+    except Exception as e:
+        pass
+        # Set a default value for degree centrality
+        in_degree_cent = {node: 0 for node in G.nodes()}
+        nx.set_node_attributes(G, in_degree_cent, col_name)
+    return in_degree_cent
+
+def compute_out_degree_centrality(G : nx.Graph, col_name : str = "out_degree_centrality") -> dict :
+    """
+    Computes and sets Out Degree centrality metric for the nodes in the network graph.
+
+    Args:
+        network (nx.Graph): The network graph on which to compute centrality.
+
+    Returns:
+        None
+    """
+    try:
+        out_degree_cent = nx.out_degree_centrality(G)
+        nx.set_node_attributes(G, out_degree_cent, col_name)
+    except Exception as e:
+        pass
+        # Set a default value for degree centrality
+        out_degree_cent = {node: 0 for node in G.nodes()}
+        nx.set_node_attributes(G, out_degree_cent, col_name)
+    return out_degree_cent
+
+
+def compute_eigenvector_centrality(G : nx.Graph, col_name : str = "eigenvector_centrality") -> dict :
+    """
+    Computes and sets Eigenvector centrality metric for the nodes in the network graph.
+
+    Args:
+        network (nx.Graph): The network graph on which to compute centrality.
+
+    Returns:
+        None
+    """
+    ### CALCUL DE LA CENTRALITE DE VECTEUR PROPRE
+    try:
+        eigenvector_centrality = nx.eigenvector_centrality(G)
+        nx.set_node_attributes(G, eigenvector_centrality, col_name)
+        # print("Calcul de la centralité de vecteur propre effectué")
+    except Exception as e:
+        pass
+        # print(e, "Calcul de la centralité de vecteur propre impossible")
+        # Set a default value for centrality
+        eigenvector_centrality = {node: 0 for node in G.nodes()}
+        nx.set_node_attributes(G, eigenvector_centrality, col_name)
+    return eigenvector_centrality
+
+def compute_betweenness_centrality(G : nx.Graph, col_name : str = "betweenness_centrality") -> dict :
+    """
+    Computes and sets Betweeness centrality metric for the nodes in the network graph.
+
+    Args:
+        network (nx.Graph): The network graph on which to compute centrality.
+
+    Returns:
+        None
+    """
+    try:
+        betweenness_cent = nx.betweenness_centrality(G, k=None, normalized=True, weight='weight', endpoints=False, seed=None)
+        nx.set_node_attributes(G, betweenness_cent, col_name)
+        # print("Calcul de l'intermédiarité effectué")
+    except Exception as e:
+        pass
+        # print(e, "Calcul de l'intermédiarité impossible")
+        # Set a default value for betweenness centrality
+        betweenness_cent = {node: 0 for node in G.nodes()}
+        nx.set_node_attributes(G, betweenness_cent, col_name)
+    return betweenness_cent
+
+def calcul_composantes_connexes(G : nx.Graph, col_name : str = "composante") -> dict:
+    """
+    Calculate weakly connected components in a graph and assign component labels as node attributes.
+
+    Args:
+        G (nx.Graph): The input graph.
+        col_name (str, optional): The name of the node attribute to store component labels. Default is "composante".
+
+    Returns:
+        List[set]: A list of sets, each set containing nodes belonging to a weakly connected component.
+    """
+    composantes_connexes = sorted(nx.weakly_connected_components(G),
+                                  key=len, # clé de tri - len = longueur de la composante
+                                  reverse=True)
+
+    composantes_dict = transform_dict_of_nodes(composantes_connexes)
+    nx.set_node_attributes(G, composantes_dict, col_name)
+    return composantes_connexes
+
+def filtrer_composante_principale(G: nx.Graph, composantes_connexes : dict) -> nx.Graph:
+    """
+    Filter the main component (largest weakly connected component) from a graph.
+
+    Args:
+        G (nx.Graph): The input graph.
+        composantes_connexes (Dict[int, set]): Dictionary mapping component indices to sets of nodes.
+
+    Returns:
+        nx.Graph: The largest weakly connected component as a subgraph of the original graph.
+    """
+    composante_principale = G.subgraph(composantes_connexes[0])
+    return composante_principale
+
+def select_mutual_relationships(G: nx.Graph) -> set:
+    """
+    Select mutual relationships (edges) in a graph.
+
+    Args:
+        G (nx.Graph): The input graph.
+
+    Returns:
+        Set[Tuple[int, int]]: A set of tuples representing mutual edges.
+    """
+    mutual_edges = set()
+    for u, v in G.edges():
+        if G.has_edge(v, u):
+            mutual_edges.add((u, v))
+            mutual_edges.add((v, u))
+    return mutual_edges
+
+def select_top_nodes_by_metric(G: nx.Graph, metric : str = "degree_centrality", N : int =1000) -> nx.Graph:
+    """
+    Selects the top N nodes in the graph based on a specified node attribute (metric) and returns the subgraph of these nodes.
+
+    Args:
+        G (nx.Graph): The input graph.
+        metric (str, optional): The node attribute used to rank and select the top nodes. Default is "degree_centrality".
+        N (int, optional): The number of top nodes to select. Default is 1000.
+
+    Returns:
+        subgraph (Optional[nx.Graph]): A subgraph containing the top N nodes based on the specified metric. Returns None if the metric is not found.
+
+    """
+    if metric in G.nodes[list(G.nodes)[0]].keys():
+        metric_selection = select_attribute(G, metric)
+        sorted_nodes = sorted(dict(metric_selection).items(), key=lambda x: x[1], reverse=True)
+        top_N_nodes = [node for node, degree in sorted_nodes[:N]]
+        subgraph = G.subgraph(top_N_nodes)
+        return subgraph
+    else:
+        print(metric, "not found in nodes attribute")
+        return None
+
+def select_attribute(G : nx.Graph, attribute : str) -> dict:
+    """
+    Extracts a specified attribute from each node in the graph and returns it as a dictionary.
+
+    Args:
+        G (nx.Graph): The input graph.
+        attribute (str): The node attribute to extract.
+
+    Returns:
+        attribute_dict (Dict): A dictionary where keys are node identifiers and values are the attribute values.
+    """
+    attribute_dict = {node[0] : node[1][attribute] for node in G.nodes(data=True)}
+    return attribute_dict
+
+def select_top_nodes_by_degrees(G: nx.Graph, degree_type : str = "degree", N : int = 1000) -> nx.Graph:
+    """
+    Selects the top N nodes from a graph based on their degree and returns a subgraph.
+
+    Args:
+        G : nx.Graph The input graph, which can be undirected or directed.
+        degree_type : str, optional, default="degree". The type of degree to consider for selection. Valid values are "degree", "in degree", and "out degree".
+        N : int, optional, default=1000. The number of top nodes to select based on degree.
+    Returns:
+        nx.Graph : A subgraph containing the top N nodes based on the specified degree type.
+
+    Raises:
+        ValueError : If an invalid degree_type is provided.
+    """
+    if degree_type == "degree":
+        degree_selection = G.degree()
+    elif degree_type == "in degree":
+        degree_selection = G.in_degree()
+    elif degree_type == "out degree":
+        degree_selection = G.out_degree()
+    else:
+        raise ValueError("Invalid degree_type. Must be one of: 'degree', 'in degree', 'out degree'.")
+
+    sorted_nodes_by_degree = sorted(dict(degree_selection).items(), key=lambda x: x[1], reverse=True)
+    top_N_nodes = [node for node, degree in sorted_nodes_by_degree[:N]]
+    subgraph = G.subgraph(top_N_nodes)
+
+    return subgraph
+
+
+def scale_size(G, size_attribute, min_node_size = 10, max_node_size = 100):
+    """
+    Scale the sizes of nodes in a graph based on a specified attribute.
+
+    Args:
+        G (nx.Graph): The graph containing nodes with attributes.
+        size_attribute (str): The node attribute to scale the sizes by.
+        min_node_size (int, optional): The minimum size to scale to. Default is 10.
+        max_node_size (int, optional): The maximum size to scale to. Default is 100.
+
+    Returns:
+        List[int]: A list of scaled node sizes.
+    """
+    sizes=[n[1].get(size_attribute,0) for n in G.nodes(data=True)]
+    scaled_sizes = scale_list(sizes, min_node_size, max_node_size)
+    return scaled_sizes
+
+def transform_dict_of_nodes(dict_of_nodes : dict) -> dict:
+    """
+    Dictionnary format transformation
+    Args:
+        dict_of_nodes (dict) : dictionnary returned by networkx
+    Returns:
+        transformed_dict (dict)
+
+    """
+    transformed_dict={}
+    for idx, nodes in enumerate(dict_of_nodes):
+        for node_id in nodes:
+            transformed_dict[node_id] = idx
+    return transformed_dict
+
+def layout_forceatlas(G: nx.Graph, dissuade_hubs: bool = True, edge_weight_influence: float = 1.0, scalingRatio: float = 5.0, gravity: float = 0.5, iterations: int = 200) -> dict:
+    """
+    Computes a ForceAtlas2 layout for a NetworkX graph.
+
+    Args:
+        G : nx.Graph
+            The input graph.
+        dissuade_hubs : bool, optional, default=True
+            Whether to apply the outbound attraction distribution, which dissuades hubs.
+        edge_weight_influence : float, optional, default=1.0
+            The influence of edge weights on the layout.
+        scalingRatio : float, optional, default=5.0
+            The scaling ratio for the layout.
+        gravity : float, optional, default=0.5
+            The gravity force applied to the layout.
+        iterations : int, optional, default=200
+            The number of iterations to run the layout algorithm.
+
+    Returns:
+        dict : a dictionary mapping node IDs to their positions in 2D space.
+    """
+
+    forceatlas2 = ForceAtlas2(
+        # Behavior alternatives
+        outboundAttractionDistribution=dissuade_hubs,  # Dissuade hubs
+        linLogMode=False,  # NOT IMPLEMENTED
+        adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
+        edgeWeightInfluence=edge_weight_influence,
+
+        # Performance
+        jitterTolerance=1.0,  # Tolerance
+        barnesHutOptimize=True,
+        barnesHutTheta=1.2,
+        multiThreaded=False,  # NOT IMPLEMENTED
+
+        # Tuning
+        scalingRatio=scalingRatio,
+        strongGravityMode=False,
+        gravity=gravity,
+
+        # Log
+        verbose=True)
+
+    layout_positions = forceatlas2.forceatlas2_networkx_layout(G, pos=None, iterations=iterations)
+    return layout_positions
+
+def distribution(metric_dict : dict, metric_name : str) -> tuple:
+    """
+    Generate a distribution DataFrame and a boxplot for a given metric.
+
+    Args:
+        metric_dict (dict): Dictionary containing metric data, with keys as nodes and values as metric values.
+        metric_name (str): The name of the metric to be used as the column name in the DataFrame and plot titles.
+
+    Returns:
+        DataFrame containing the distribution of metric values.
+        Boxplot figure visualizing the distribution of the metric.
+    """
+    metric_count = Counter(metric_dict.values())
+    df = pd.DataFrame(list(metric_count.items()), columns=[metric_name, "nodes"]).sort_values(by="nodes", ascending=False)
+    fig = boxplot(df, col_y = metric_name, title =f"{metric_name} - Nodes distribution", yaxis_title = metric_name)
+    return df, fig
+
 
 def create_collocations(lst_text : list, word_freq : int, coloc_freq : int, stop_words : list) -> tuple:
     """
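The new sna helpers above are designed to be chained: compute graph-level metrics, attach degrees and communities as node attributes, then derive per-community metrics and a layout. The sketch below is not part of the diff; it is one possible flow, assuming opsci-toolbox 0.0.8 and its fa2-modified dependency are installed, and it uses an arbitrary example graph.

import networkx as nx
from opsci_toolbox.helpers import sna

G = nx.karate_club_graph()                                   # arbitrary example graph

metrics = sna.graph_key_metrics(G)                           # nodes, edges, density, average_degree, ...
communities = sna.compute_modularity(G, resolution=1)        # node -> community id, stored as "modularity"
nodes_by_community = sna.group_nodes_by_values(communities)  # community id -> list of nodes
per_community = sna.communities_metrics(G, nodes_by_community)

sna.compute_degrees(G)                                       # degrees stored as the "degree" node attribute
positions = sna.layout_forceatlas(G, iterations=100)         # node -> (x, y) positions from ForceAtlas2

print(metrics)
print({comm: m["nodes"] for comm, m in per_community.items()})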
opsci_toolbox/helpers/sql.py
@@ -0,0 +1,53 @@
+import mysql.connector
+import pandas as pd
+from mysql.connector import errorcode
+
+def connect_to_mysql_database(host: str = 'localhost', user: str = 'root', password: str = 'password', database: str = 'subs'):
+    """
+    Connect to a MySQL database.
+
+    Parameters:
+    - host (str): The host of the database. Default is 'localhost'.
+    - user (str): The username to use for connecting to the database. Default is 'root'.
+    - password (str): The password to use for connecting to the database. Default is 'password'.
+    - database (str): The name of the database to connect to. Default is 'subs'.
+
+    Returns:
+    - mysql.connector.connection.MySQLConnection: The connection object if connection is successful.
+    - None: If the connection fails.
+    """
+    try:
+        conn = mysql.connector.connect(
+            host=host,
+            user=user,
+            password=password,
+            database=database
+        )
+        return conn
+    except mysql.connector.Error as err:
+        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
+            print("Something is wrong with your user name or password")
+        elif err.errno == errorcode.ER_BAD_DB_ERROR:
+            print("Database does not exist")
+        else:
+            print(err)
+        return None
+
+def execute_query(conn: mysql.connector.connection.MySQLConnection, query: str) -> pd.DataFrame:
+    """
+    Execute a SQL query and return the results as a pandas DataFrame.
+
+    Parameters:
+    - conn (mysql.connector.connection.MySQLConnection): The connection object to the database.
+    - query (str): The SQL query to be executed.
+
+    Returns:
+    - pd.DataFrame: A DataFrame containing the results of the query.
+    """
+    cursor = conn.cursor()
+    cursor.execute(query)
+    results = cursor.fetchall()
+    column_names = [i[0] for i in cursor.description]
+    df = pd.DataFrame(results, columns=column_names)
+    cursor.close()
+    return df
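The two sql helpers above are meant to be used together: open a connection, then pull query results straight into pandas. The sketch below is illustrative only; the host, credentials, database, and table name are placeholders, not values shipped with the package.

from opsci_toolbox.helpers import sql

# Placeholders only: point these at a real MySQL instance.
conn = sql.connect_to_mysql_database(host="localhost", user="root", password="secret", database="subs")
if conn is not None:
    df = sql.execute_query(conn, "SELECT * FROM some_table LIMIT 10")  # results as a pandas DataFrame
    print(df.head())
    conn.close()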
{opsci_toolbox-0.0.7.dist-info → opsci_toolbox-0.0.8.dist-info}/METADATA
@@ -1,33 +1,38 @@
 Metadata-Version: 2.1
 Name: opsci-toolbox
-Version: 0.0.7
+Version: 0.0.8
 Summary: a complete toolbox
 Home-page: UNKNOWN
 Author: Erwan Le Nagard
 Author-email: erwan@opsci.ai
 License: MIT
 Platform: UNKNOWN
-Requires-Dist: Pillow (
-Requires-Dist: Requests (==2.
+Requires-Dist: Pillow (>=9.0.1)
+Requires-Dist: Requests (==2.32.3)
 Requires-Dist: beautifulsoup4 (==4.10.0)
+Requires-Dist: chardet (>=4.0.0)
 Requires-Dist: chart-studio (==1.1.0)
+Requires-Dist: cudf (==0.6.1.post1)
+Requires-Dist: cuml (==0.6.1.post1)
 Requires-Dist: eldar (==0.0.8)
 Requires-Dist: emoji (==2.10.1)
+Requires-Dist: fa2-modified (==0.3.10)
 Requires-Dist: google-api-python-client (==2.122.0)
-Requires-Dist: gspread (==6.1.
+Requires-Dist: gspread (==6.1.2)
 Requires-Dist: hdbscan (==0.8.33)
 Requires-Dist: jusText (==3.0.0)
 Requires-Dist: langchain (==0.1.20)
-Requires-Dist: matplotlib (>=3.
+Requires-Dist: matplotlib (>=3.9.0)
+Requires-Dist: mysql-connector-repackaged (==0.3.1)
 Requires-Dist: networkx (==3.2.1)
 Requires-Dist: nltk (==3.8.1)
 Requires-Dist: numpy (<1.25.0,>=1.21.5)
 Requires-Dist: opencv-python-headless (==4.9.0.80)
 Requires-Dist: openpyxl (==3.1.3)
-Requires-Dist: pandas (
+Requires-Dist: pandas (>=1.5.3)
 Requires-Dist: plotly (==5.19.0)
-Requires-Dist: protobuf (
-Requires-Dist: pyarrow (
+Requires-Dist: protobuf (==5.27.2)
+Requires-Dist: pyarrow (>=14.0.2)
 Requires-Dist: python-louvain (==0.16)
 Requires-Dist: scikit-learn (==1.4.1.post1)
 Requires-Dist: scipy (<2.0.0,>=1.8.0)
@@ -36,7 +41,7 @@ Requires-Dist: setuptools (==59.6.0)
 Requires-Dist: spacy (==3.7.4)
 Requires-Dist: spacy-language-detection (==0.2.1)
 Requires-Dist: spacymoji (==3.1.0)
-Requires-Dist: supervision (==0.
+Requires-Dist: supervision (==0.21.0)
 Requires-Dist: textacy (==0.13.0)
 Requires-Dist: torch (==2.0.1)
 Requires-Dist: tqdm (==4.66.2)
opsci_toolbox-0.0.8.dist-info/RECORD
@@ -0,0 +1,22 @@
+opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/apis/rapidapi_helpers.py,sha256=k_hYcRNww5noNkX7zyz5Htggxb15BPoKSlbY7NLuQXI,26696
+opsci_toolbox/apis/webscraping.py,sha256=Gz3hOfhOHUpwHU1Pzj3mB2WdBAcKa2WisYBHMi3lcVE,18343
+opsci_toolbox/apis/youtube_helpers.py,sha256=j4hwCS2BEWRJjd9Q5XBN9FeCrL3lqteyz5dqbtfypdo,17418
+opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/helpers/common.py,sha256=lemGhNwWIxaMwo-X7UsksUMGLV-IOuX_XwC82a50GD4,44672
+opsci_toolbox/helpers/cv.py,sha256=N3hnLX223UQbdw_YEdUYj10xUXT_95O6BpQt6TbAE08,21092
+opsci_toolbox/helpers/dataviz.py,sha256=IfHByNWAU2rErZMfs3LuwZwJApLN5w320JEbBPuVp6U,115856
+opsci_toolbox/helpers/dates.py,sha256=Wf7HxaUY62IRrY3XPdRIuoaMbGi3QqWf-vStqbRRY_o,2633
+opsci_toolbox/helpers/nlp.py,sha256=r4o7V9tJrj3xt34O_4hN0szbSB4RmveP8qmwCqHOxEY,87988
+opsci_toolbox/helpers/nlp_cuml.py,sha256=XzBfoFMpVIehpRbp60E4wGokpoqJP0lJxs1plOxQqBY,28882
+opsci_toolbox/helpers/sna.py,sha256=XL1BZ-x83xWRNbGsvh7-m8Mdy6iOrWx8vjgaL2_TSmo,31905
+opsci_toolbox/helpers/sql.py,sha256=LMrDWcv1QpfE8HyyrqiKuhhkt930lvME3-AKU89LF38,1928
+opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
+opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
+opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
+opsci_toolbox-0.0.8.dist-info/METADATA,sha256=CkDzhMlMim64kRmfCp5Ae5ujYNlzB6EccXByt_QvjUw,1788
+opsci_toolbox-0.0.8.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+opsci_toolbox-0.0.8.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
+opsci_toolbox-0.0.8.dist-info/RECORD,,
opsci_toolbox-0.0.7.dist-info/RECORD
@@ -1,21 +0,0 @@
-opsci_toolbox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/apis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/apis/rapidapi_helpers.py,sha256=f2o4ItMZwoAt3ow5bSK-MPkqzP3wzJ857xU0CzDZIyI,23207
-opsci_toolbox/apis/webscraping.py,sha256=D1A_ixjImPOncbWrKf6Nem2SR4NQraxTbcYqiE64VTY,12263
-opsci_toolbox/apis/youtube_helpers.py,sha256=CZQ4mP43eA3STWNJ0HjSoJpvz3iHzohSGxmp5ntEgpA,13115
-opsci_toolbox/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/helpers/common.py,sha256=TvlGcCdpkfKUaDkahILq3wFLgxwAtgRv5KJRoNy9brw,40339
-opsci_toolbox/helpers/cv.py,sha256=-uXHncyAr8sDF0ip32LAz7Xae9Z4-T9MH6palpIzq-c,21109
-opsci_toolbox/helpers/dataviz.py,sha256=JbudfwWPCEEEzP8Vpmu1CMEKaE6O2vtk9xsflW2pT1M,112451
-opsci_toolbox/helpers/dates.py,sha256=EvNqut2s6S4CaaVFQhIDR-W00TZbt3J04yRYKYhxCkU,2638
-opsci_toolbox/helpers/nlp.py,sha256=jpZRyTkYeoVH8tzqIT0opZn5unt8cdU1qPdFzXxEOw8,86638
-opsci_toolbox/helpers/nlp_cuml.py,sha256=w-pkch2Sk_FfVrm1j8NUmmxVvoJXJHuXzGnXGV_FWSE,14153
-opsci_toolbox/helpers/sna.py,sha256=SZjS21qfBmlkHDJaXi7CaHpj6KhefcsDmJ1A9NRtVeQ,12006
-opsci_toolbox/helpers/surreaction.py,sha256=JjVvHs7Sf9IJxX0QdHpQ_3E8-c_OS6q_bfUKvurl1z4,7093
-opsci_toolbox/lexicons/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-opsci_toolbox/lexicons/stop_words_en.csv,sha256=4lzjBZHCn_b3lg_CUNkmA_MDQ7DLEpS83k6-dWpkC2o,1957
-opsci_toolbox/lexicons/stop_words_fr.csv,sha256=sPdA8VmyNYbiHg-M8O3tg7ayHvCE3GDg6cF-oSZxICM,6776
-opsci_toolbox-0.0.7.dist-info/METADATA,sha256=ErIa8rDRfvT52LjZJcSKU7zougC_1hZa3oWnvPPTzJQ,1601
-opsci_toolbox-0.0.7.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
-opsci_toolbox-0.0.7.dist-info/top_level.txt,sha256=fUiqxou4FPec_tOfauTLCKAuepeYLfRyhedycWxVnq4,14
-opsci_toolbox-0.0.7.dist-info/RECORD,,
{opsci_toolbox-0.0.7.dist-info → opsci_toolbox-0.0.8.dist-info}/WHEEL
File without changes
{opsci_toolbox-0.0.7.dist-info → opsci_toolbox-0.0.8.dist-info}/top_level.txt
File without changes