opsci-toolbox 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +120 -21
- opsci_toolbox/apis/webscraping.py +186 -59
- opsci_toolbox/apis/youtube_helpers.py +103 -16
- opsci_toolbox/helpers/common.py +368 -254
- opsci_toolbox/helpers/cv.py +50 -60
- opsci_toolbox/helpers/dataviz.py +255 -184
- opsci_toolbox/helpers/dates.py +17 -18
- opsci_toolbox/helpers/nlp.py +154 -114
- opsci_toolbox/helpers/nlp_cuml.py +389 -36
- opsci_toolbox/helpers/sna.py +509 -0
- opsci_toolbox/helpers/sql.py +53 -0
- {opsci_toolbox-0.0.6.dist-info → opsci_toolbox-0.0.8.dist-info}/METADATA +14 -9
- opsci_toolbox-0.0.8.dist-info/RECORD +22 -0
- opsci_toolbox-0.0.6.dist-info/RECORD +0 -21
- {opsci_toolbox-0.0.6.dist-info → opsci_toolbox-0.0.8.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.6.dist-info → opsci_toolbox-0.0.8.dist-info}/top_level.txt +0 -0
opsci_toolbox/helpers/nlp_cuml.py

@@ -1,36 +1,47 @@
 from cuml import UMAP
+import cudf
+from sklearn.feature_selection import chi2
+from cuml.feature_extraction.text import CountVectorizer
 from cuml.cluster.hdbscan import HDBSCAN, all_points_membership_vectors, approximate_predict, membership_vector
 import numpy as np
 from tqdm import tqdm
 import os
 from opsci_toolbox.helpers.common import load_pickle, create_dir, write_pickle
+import cudf.pandas
+cudf.pandas.install()
+import pandas as pd
 
 def reduce_with_cuml_UMAP(embeddings: np.ndarray,
                           n_neighbors: int = 5,
                           n_components: int = 3,
                           min_dist: float = 0.0,
                           metric: str = "cosine",
-                          spread: float = 1.0
+                          spread: float = 1.0,
+                          learning_rate: float = 1.0,
+                          n_epochs:int = 300
+                          ) -> tuple:
     """
     Reduces the dimensionality of embeddings using UMAP with cuML library.
 
-
-
-
-
-
-
-
+    Args:
+        embeddings (np.ndarray): The input embeddings to be reduced.
+        n_neighbors (int, optional): The number of nearest neighbors to consider. Defaults to 5.
+        n_components (int, optional): The number of dimensions of the embedded space. Defaults to 3.
+        min_dist (float, optional): The minimum distance between embedded points. Defaults to 0.0.
+        metric (str, optional): The metric to use for distance computation. Defaults to "cosine".
+        spread (float, optional): The effective scale of embedded points. Defaults to 1.0.
 
     Returns:
-
-
+        reducer (UMAP): The UMAP reducer object.
+        reduced_embeddings (np.ndarray): The reduced embeddings.
     """
     reducer = UMAP(n_neighbors=n_neighbors,
                    n_components=n_components,
                    min_dist=min_dist,
                    metric=metric,
-                   spread =
+                   spread = spread,
+                   n_epochs=n_epochs,
+                   learning_rate=learning_rate).fit(embeddings)
 
     reduced_embeddings = reducer.transform(embeddings)
     return reducer, reduced_embeddings
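A minimal usage sketch of the updated signature, assuming a CUDA environment with cuML installed (the sample array below is hypothetical):

```python
import numpy as np
from opsci_toolbox.helpers.nlp_cuml import reduce_with_cuml_UMAP

# Hypothetical embeddings: 1,000 documents, 384 dimensions.
embeddings = np.random.rand(1000, 384).astype(np.float32)

# 0.0.8 adds learning_rate and n_epochs to the signature shown above.
reducer, reduced_embeddings = reduce_with_cuml_UMAP(
    embeddings,
    n_neighbors=5,
    n_components=3,
    min_dist=0.0,
    metric="cosine",
    spread=1.0,
    learning_rate=1.0,
    n_epochs=300,
)
print(reduced_embeddings.shape)  # expected: (1000, 3)
```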
@@ -40,12 +51,12 @@ def transform_with_cuml_UMAP(reducer,
     """
     Transform new data points using a UMAP object.
 
-
-
-
+    Args:
+        reducer (UMAP): The UMAP reducer object.
+        new_embeddings (np.ndarray): The new data points to be transformed.
 
     Returns:
-
+        reduced_embeddings (np.ndarray): The transformed embeddings.
     """
     reduced_embeddings = reducer.transform(new_embeddings)
     return reduced_embeddings
@@ -68,7 +79,7 @@ def hdbscan_cuml_clustering(embeddings: np.ndarray,
     """
     Perform clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm.
 
-
+    Args:
     embeddings : array-like or sparse matrix, shape (n_samples, n_features)
         The input data to be clustered.
     min_cluster_size : int, optional
@@ -100,7 +111,7 @@ def hdbscan_cuml_clustering(embeddings: np.ndarray,
         Whether the data is prediction data or not. Default is True.
 
     Returns:
-    clusterer : hdbscan.
+    clusterer : hdbscan.HDBSCAN
         HDBSCAN clusterer object.
     labels : array, shape (n_samples,)
         Cluster labels for each point. Noisy samples are given the label -1.
@@ -129,8 +140,8 @@ def transform_with_cuml_HDBSCAN(clusterer, new_embeddings: np.ndarray) -> tuple:
     """
     Transform new data points using an HDBSCAN object.
 
-
-    clusterer : hdbscan.
+    Args:
+    clusterer : hdbscan.HDBSCAN
         The HDBSCAN clusterer object trained on the original data.
     new_embeddings : array-like or sparse matrix, shape (n_samples, n_features)
         The new data points to be transformed.
@@ -149,15 +160,13 @@ def cuml_soft_clustering(clusterer) -> tuple:
     """
     Perform soft clustering using HDBSCAN.
 
-
-    clusterer : hdbscan.
+    Args:
+    clusterer : hdbscan.HDBSCAN
         The HDBSCAN clusterer object trained on the original data.
 
     Returns:
-    soft_clusters_val : list of str
-
-    soft_clusters_proba : list of float
-        The maximum probability of each data point belonging to any cluster.
+    soft_clusters_val : list of str. Predicted cluster labels for each data point, represented as strings.
+    soft_clusters_proba : list of float. The maximum probability of each data point belonging to any cluster.
     """
     soft_clusters = all_points_membership_vectors(clusterer)
     soft_clusters_val = [str(np.argmax(x)) for x in soft_clusters]
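A hedged sketch of how a fitted clusterer is reused by the two helpers above. `hdbscan_cuml_clustering` is only indexed for its first return value (the clusterer), since its full return tuple is not shown in these hunks, and the toy data is hypothetical:

```python
import numpy as np
from opsci_toolbox.helpers.nlp_cuml import (
    reduce_with_cuml_UMAP,
    hdbscan_cuml_clustering,
    transform_with_cuml_HDBSCAN,
    cuml_soft_clustering,
)

# Hypothetical embeddings; requires a CUDA environment with cuML.
embeddings = np.random.rand(2000, 384).astype(np.float32)
_, reduced = reduce_with_cuml_UMAP(embeddings, n_components=5)

# The docstring documents the clusterer as the first value returned.
clusterer = hdbscan_cuml_clustering(reduced, min_cluster_size=30)[0]

# Hard assignments for (here, the same) points, then soft assignments
# for the points the clusterer was trained on.
topics, probas = transform_with_cuml_HDBSCAN(clusterer, reduced)
soft_topics, soft_probas = cuml_soft_clustering(clusterer)
```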
@@ -169,7 +178,7 @@ def soft_cuml_clustering_new_data(clusterer, embeddings: np.ndarray) -> tuple:
     """
     Predict cluster memberships for new data points using HDBSCAN soft clustering.
 
-
+    Args:
     clusterer : hdbscan.hdbscan_.HDBSCAN
         The HDBSCAN clusterer object trained on the original data.
     embeddings : array-like or sparse matrix, shape (n_samples, n_features)
@@ -190,7 +199,7 @@ def process_UMAP(embedded_chunks_paths: list, path_reduced_embeddings_id: str, r
     """
     Process embeddings using UMAP reduction.
 
-
+    Args:
     embedded_chunks_paths : list of str
         List of file paths containing the embedded chunks.
     path_reduced_embeddings_id : str
@@ -208,20 +217,21 @@ def process_UMAP(embedded_chunks_paths: list, path_reduced_embeddings_id: str, r
     for file_path in tqdm(embedded_chunks_paths, total=len(embedded_chunks_paths), desc="UMAP transform from files"):
 
         filename = os.path.splitext(os.path.basename(file_path))[0][:-9]
-        new_filename = filename+"_reduce_embeddings.
+        new_filename = filename+"_reduce_embeddings.parquet"
         new_file_path = os.path.join(path_reduced_embeddings_id, new_filename)
 
         if not os.path.exists(new_file_path) or reencode:
-            df =
+            df = cudf_read_parquet(file_path)
             create_dir(path_reduced_embeddings_id)
             # embeddings = df["embeddings"].to_list()
-            embeddings = np.vstack(df['embeddings'].values)
+            # embeddings = np.vstack(df['embeddings'].values)
+            embeddings = np.vstack(df['embeddings'].to_pandas().tolist())
             reduced_embeddings = transform_with_cuml_UMAP(reducer, embeddings)
             reduced_embeddings_transformed=[list(e) for e in reduced_embeddings]
             df['reduced_embeddings'] = reduced_embeddings_transformed
             df.drop(columns=["embeddings"], inplace=True)
             print(path_reduced_embeddings_id, filename+"_reduce_embeddings")
-
+            cudf_write_parquet(df, path_reduced_embeddings_id, filename+"_reduce_embeddings")
             new_file_paths.append(new_file_path)
         else:
             print("REDUCED EMBEDDINGS ALREADY EXISTS", file_path)
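The `[:-9]` slice above strips the `_embedded` suffix added during encoding, so the reduced file keeps the original chunk name with a `_reduce_embeddings.parquet` extension; a quick illustration with a hypothetical path:

```python
import os

file_path = "/data/embedded/corpus_0_embedded.parquet"  # hypothetical input chunk
filename = os.path.splitext(os.path.basename(file_path))[0][:-9]
new_filename = filename + "_reduce_embeddings.parquet"
print(filename, new_filename)  # corpus_0 corpus_0_reduce_embeddings.parquet
```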
@@ -238,7 +248,7 @@ def process_HDBSCAN(clusterer,
     """
     Process reduced embeddings using HDBSCAN clustering.
 
-
+    Args:
     clusterer : hdbscan.hdbscan_.HDBSCAN
         The HDBSCAN clusterer object.
     reduced_embeddings_paths : list of str
@@ -258,12 +268,13 @@ def process_HDBSCAN(
     for file_path in tqdm(reduced_embeddings_paths, total=len(reduced_embeddings_paths), desc="HDBSCAN transform from files"):
 
         filename = os.path.splitext(os.path.basename(file_path))[0][:-18]
-        new_filename = filename+ "_predictions.
+        new_filename = filename+ "_predictions.parquet"
         new_file_path = os.path.join(path_predictions_dataset_id, new_filename)
         if not os.path.exists(new_file_path) or reencode:
-            df =
+            df = cudf_read_parquet(file_path)
             # reduced_embeddings = df["reduced_embeddings"].to_list()
-            reduced_embeddings = np.vstack(df['reduced_embeddings'].values)
+            # reduced_embeddings = np.vstack(df['reduced_embeddings'].values)
+            reduced_embeddings = np.vstack(df['reduced_embeddings'].to_pandas().tolist())
             topics, probas = transform_with_cuml_HDBSCAN(clusterer, reduced_embeddings)
             df["topic"]=topics.astype(int).astype(str)
             df["proba"]=probas
@@ -272,9 +283,351 @@ def process_HDBSCAN(
             df["soft_topic"]=soft_clusters
             df["soft_proba"]=soft_proba
 
-
+            cudf_write_parquet(df, path_predictions_dataset_id, filename+ "_predictions")
             new_file_paths.append(new_file_path)
         else:
             print("CLUSTERING ALREADY EXISTS", file_path)
             new_file_paths.append(new_file_path)
     return new_file_paths
+
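Each predictions file written by `process_HDBSCAN` therefore carries the cluster columns added in this hunk; a minimal read-back sketch using the new parquet helper (the path is hypothetical):

```python
from opsci_toolbox.helpers.nlp_cuml import cudf_read_parquet

# Hypothetical output produced by process_HDBSCAN.
df = cudf_read_parquet("/data/predictions/corpus_0_predictions.parquet")
print(df[["topic", "proba", "soft_topic", "soft_proba"]].head())
```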
+# def cuml_word_frequency_per_categorie(df: pd.DataFrame, col_text: str, col_cat: str, ngram_range: tuple = (1, 1), stop_words: list = [], n_words: int = 20, min_freq: int = 3) -> pd.DataFrame:
+#     """
+#     Calculate word frequency per category using cuML for GPU acceleration.
+
+#     Parameters:
+#     df : pandas DataFrame
+#         DataFrame containing text data and corresponding categories.
+#     col_text : str
+#         Name of the column containing the text data.
+#     col_cat : str
+#         Name of the column containing the categories.
+#     ngram_range : tuple, optional
+#         The range for n-grams. Default is (1, 1) for unigrams.
+#     stop_words : list, optional
+#         List of stopwords to be ignored during frequency calculation. Default is an empty list.
+#     n_words : int, optional
+#         Number of top words to display per category. Default is 20.
+#     min_freq : int, optional
+#         Minimum frequency threshold for word occurrences per category. Default is 3.
+
+#     Returns:
+#     DataFrame
+#         DataFrame containing word frequencies per category.
+
+#     Description:
+#     This function calculates word frequencies per category based on the provided DataFrame, considering the text data and corresponding categories.
+#     It filters out words with frequencies below the specified minimum frequency threshold and returns the top words for each category.
+#     """
+#     # Convert pandas DataFrame to cuDF DataFrame
+#     gdf = cudf.DataFrame.from_pandas(df)
+
+#     # Initialize cuML's CountVectorizer
+#     count_vectorizer = CountVectorizer(analyzer='word', ngram_range=ngram_range, stop_words=stop_words)
+
+#     # Fit and transform the text data
+#     X_train_count = count_vectorizer.fit_transform(gdf[col_text])
+#     X_names_count = count_vectorizer.get_feature_names()
+
+#     # Initialize the resulting DataFrame
+#     df_count = cudf.DataFrame()
+
+#     # Calculate word frequencies per category
+#     for cat in gdf[col_cat].unique().to_pandas().tolist():
+#         word_count = X_train_count[gdf[col_cat] == cat].sum(axis=0)
+#         df_count_tmp = cudf.DataFrame({col_cat: [cat]*len(X_names_count), "word": X_names_count, "freq": word_count.tolist()[0]}).sort_values(by="freq", ascending=False)
+
+#         # Apply frequency and n_words filters
+#         if n_words:
+#             df_count_tmp = df_count_tmp.head(n_words)
+#         if min_freq:
+#             df_count_tmp = df_count_tmp[df_count_tmp["freq"] > min_freq]
+
+#         # Concatenate the result to the main DataFrame
+#         df_count = cudf.concat([df_count, df_count_tmp])
+
+#     # Convert the result back to pandas DataFrame
+#     return df_count.to_pandas()
+
+def cuml_word_frequency_per_categorie(gdf: pd.DataFrame, col_text: str, col_cat: str, ngram_range: tuple = (1, 1), stop_words: list = [], n_words: int = 20, min_freq: int = 3) -> pd.DataFrame:
+    """
+    Calculate word frequency per category using cuML for GPU acceleration.
+
+    Args:
+        df : pandas DataFrame
+            DataFrame containing text data and corresponding categories.
+        col_text : str
+            Name of the column containing the text data.
+        col_cat : str
+            Name of the column containing the categories.
+        ngram_range : tuple, optional
+            The range for n-grams. Default is (1, 1) for unigrams.
+        stop_words : list, optional
+            List of stopwords to be ignored during frequency calculation. Default is an empty list.
+        n_words : int, optional
+            Number of top words to display per category. Default is 20.
+        min_freq : int, optional
+            Minimum frequency threshold for word occurrences per category. Default is 3.
+
+    Returns:
+        DataFrame
+            DataFrame containing word frequencies per category.
+
+    Description:
+        This function calculates word frequencies per category based on the provided DataFrame, considering the text data and corresponding categories.
+        It filters out words with frequencies below the specified minimum frequency threshold and returns the top words for each category.
+    """
+    # Convert pandas DataFrame to cuDF DataFrame
+    # gdf = cudf.DataFrame.from_pandas(df))
+    # print(type(gdf))
+    # gdf = convert_df_to_cudf(gdf)
+
+    # Initialize cuML's CountVectorizer
+    count_vectorizer = CountVectorizer(analyzer='word', ngram_range=ngram_range, stop_words=stop_words)
+
+    print(type(gdf[col_text]))
+    # Fit and transform the text data
+    X_train_count = count_vectorizer.fit_transform(cudf.Series(gdf[col_text]))
+    X_names_count = count_vectorizer.get_feature_names()
+
+    # Initialize the resulting DataFrame
+    df_count = cudf.DataFrame()
+
+    # Calculate word frequencies per category
+    for cat in gdf[col_cat].unique().tolist():
+        word_count = X_train_count[gdf[col_cat] == cat].sum(axis=0)
+        df_count_tmp = cudf.DataFrame({col_cat: [cat]*len(X_names_count), "word": X_names_count, "freq": word_count.tolist()[0]}).sort_values(by="freq", ascending=False)
+
+        # Apply frequency and n_words filters
+        if n_words:
+            df_count_tmp = df_count_tmp.head(n_words)
+        if min_freq:
+            df_count_tmp = df_count_tmp[df_count_tmp["freq"] > min_freq]
+
+        # Concatenate the result to the main DataFrame
+        df_count = cudf.concat([df_count, df_count_tmp])
+
+    # Convert the result back to pandas DataFrame
+    return df_count.to_pandas()
+
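A usage sketch for the rewritten `cuml_word_frequency_per_categorie`, which now receives the DataFrame directly as `gdf` and wraps the text column in a `cudf.Series` itself (the toy corpus is hypothetical; `min_freq=0` keeps the tiny toy counts from being filtered out):

```python
import pandas as pd
from opsci_toolbox.helpers.nlp_cuml import cuml_word_frequency_per_categorie

# Hypothetical toy corpus.
df = pd.DataFrame({
    "text": ["gpu accelerated nlp", "nlp on gpu", "classic cpu pipeline", "cpu only baseline"],
    "label": ["gpu", "gpu", "cpu", "cpu"],
})

freq = cuml_word_frequency_per_categorie(df, col_text="text", col_cat="label", n_words=10, min_freq=0)
print(freq.head())
```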
+# def cuml_chi2_per_category(lst_text: list, lst_categorie: list, col_cat: str, n_words: int = 10, p_value_limit: float = 0.95, min_freq: int = 3) -> pd.DataFrame:
+
+#     # Convert input lists to cuDF Series
+#     gdf_text = cudf.Series(lst_text)
+#     gdf_categorie = cudf.Series(lst_categorie)
+
+#     # Initialize cuML's CountVectorizer
+#     count_vectorizer = CountVectorizer(analyzer='word')
+
+#     # Fit and transform the text data
+#     X_train_count = count_vectorizer.fit_transform(gdf_text)
+#     X_names_count = count_vectorizer.get_feature_names()
+
+#     # Initialize the resulting DataFrame
+#     df_chi = cudf.DataFrame()
+
+#     # Calculate Chi-squared statistics per category
+#     unique_categories = gdf_categorie.unique().to_pandas().tolist()
+#     for cat in unique_categories:
+#         cat_series = (gdf_categorie == cat).astype(int).to_pandas()
+#         chi2_scores, p_values = chi2(X_train_count.get(), cat_series)
+#         word_count = X_train_count[cat_series.astype(bool)].sum(axis=0).get()[0]
+
+#         df_chi_tmp = cudf.DataFrame({
+#             col_cat: cat,
+#             "relevant_words_chi2": X_names_count,
+#             "chi2": chi2_scores,
+#             "p_values": 1 - p_values,
+#             "word_count_per_class": word_count
+#         }).sort_values(by="chi2", ascending=False).head(n_words)
+
+#         # Filter based on p_values and word_count
+#         df_chi_tmp = df_chi_tmp[df_chi_tmp["p_values"] > p_value_limit]
+#         df_chi_tmp = df_chi_tmp[df_chi_tmp["word_count_per_class"] > min_freq]
+
+#         df_chi = cudf.concat([df_chi, df_chi_tmp])
+
+#     # Reset index
+#     df_chi.reset_index(drop=True, inplace=True)
+#     return df_chi.to_pandas()
+
+def cuml_chi2_per_category(lst_text: list, lst_categorie: list, col_cat: str, n_words: int = 10, p_value_limit: float = 0.95, min_freq: int = 3) -> pd.DataFrame:
+    """
+    Calculate Chi-squared statistics for each category and return a DataFrame
+    of relevant words per category.
+
+    Args:
+        lst_text (List[str]): List of text documents.
+        lst_categorie (List[str]): List of categories corresponding to each document.
+        col_cat (str): Name of the category column in the resulting DataFrame.
+        n_words (int, optional): Number of top words to return per category. Default is 10.
+        p_value_limit (float, optional): The minimum p-value to filter relevant words. Default is 0.95.
+        min_freq (int, optional): The minimum frequency of words to be considered relevant. Default is 3.
+
+    Returns:
+        pd.DataFrame: A pandas DataFrame containing the relevant words for each category.
+    """
+    # Convert input lists to cuDF Series
+    gdf_text = cudf.Series(lst_text)
+    gdf_categorie = lst_categorie
+
+    # Initialize cuML's CountVectorizer
+    count_vectorizer = CountVectorizer(analyzer='word')
+
+    # Fit and transform the text data
+    X_train_count = count_vectorizer.fit_transform(gdf_text)
+    X_names_count = count_vectorizer.get_feature_names()
+
+    # Initialize the resulting DataFrame
+    df_chi = cudf.DataFrame()
+
+    # Calculate Chi-squared statistics per category
+    unique_categories = gdf_categorie.unique().tolist()
+    for cat in unique_categories:
+        cat_series = (gdf_categorie == cat).astype(int)
+        chi2_scores, p_values = chi2(X_train_count.get(), cat_series)
+        word_count = X_train_count[cat_series.astype(bool)].sum(axis=0).get()[0]
+
+        df_chi_tmp = cudf.DataFrame({
+            col_cat: cat,
+            "relevant_words_chi2": X_names_count,
+            "chi2": chi2_scores,
+            "p_values": 1 - p_values,
+            "word_count_per_class": word_count
+        }).sort_values(by="chi2", ascending=False).head(n_words)
+
+        # Filter based on p_values and word_count
+        df_chi_tmp = df_chi_tmp[df_chi_tmp["p_values"] > p_value_limit]
+        df_chi_tmp = df_chi_tmp[df_chi_tmp["word_count_per_class"] > min_freq]
+
+        df_chi = cudf.concat([df_chi, df_chi_tmp])
+
+    # Reset index
+    df_chi.reset_index(drop=True, inplace=True)
+    return df_chi.to_pandas()
+
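A usage sketch for `cuml_chi2_per_category`; note that `lst_categorie` is used with `.unique()` and vectorised comparisons inside the function, so a pandas Series is the safer input despite the `list` annotation, and the thresholds below are relaxed purely for the hypothetical toy data:

```python
import pandas as pd
from opsci_toolbox.helpers.nlp_cuml import cuml_chi2_per_category

texts = ["gpu accelerated nlp", "nlp on gpu", "classic cpu pipeline", "cpu only baseline"]
labels = pd.Series(["gpu", "gpu", "cpu", "cpu"])

df_chi = cuml_chi2_per_category(
    texts, labels, col_cat="label", n_words=5, p_value_limit=0.5, min_freq=0
)
print(df_chi)
```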
+def cudf_write_parquet(df: cudf.DataFrame, path: str, filename: str) -> str:
+    """
+    Write a cuDF DataFrame to a Parquet file.
+
+    Args:
+        df (cudf.DataFrame): The cuDF DataFrame to be written.
+        path (str): The directory path where the file should be saved.
+        filename (str): The name of the file without extension.
+
+    Returns:
+        str: The file path of the saved Parquet file.
+    """
+    file_path = os.path.join(path, str(filename)+".parquet")
+    df.to_parquet(file_path)
+    return file_path
+
+def cudf_read_parquet(path: str) -> cudf.DataFrame:
+    """
+    Read a Parquet file into a cuDF DataFrame.
+
+    Args:
+        path (str): The file path to the Parquet file.
+
+    Returns:
+        cudf.DataFrame: The read cuDF DataFrame.
+    """
+    df = cudf.read_parquet(path)
+    return df
+
+def convert_df_to_cudf(df: pd.DataFrame) -> cudf.DataFrame:
+    """
+    Convert a pandas DataFrame to a cuDF DataFrame.
+
+    Args:
+        df (pd.DataFrame): The pandas DataFrame to convert.
+
+    Returns:
+        cudf.DataFrame: The resulting cuDF DataFrame.
+    """
+    return cudf.DataFrame.from_pandas(df)
+
+def convert_cudf_to_df(cdf: cudf.DataFrame) -> pd.DataFrame:
+    """
+    Convert a cuDF DataFrame to a pandas DataFrame.
+
+    Args:
+        cdf (cudf.DataFrame): The cuDF DataFrame to convert.
+
+    Returns:
+        pd.DataFrame: The resulting pandas DataFrame.
+    """
+    return cdf.to_pandas()
+
+
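A round-trip sketch of the four small I/O helpers above (the target directory is hypothetical and must already exist, since `cudf_write_parquet` only joins the path and writes):

```python
import pandas as pd
from opsci_toolbox.helpers.nlp_cuml import (
    convert_df_to_cudf,
    convert_cudf_to_df,
    cudf_write_parquet,
    cudf_read_parquet,
)

gdf = convert_df_to_cudf(pd.DataFrame({"text": ["a", "b"], "score": [0.1, 0.2]}))
file_path = cudf_write_parquet(gdf, "/tmp/opsci_parquet", "demo")  # -> /tmp/opsci_parquet/demo.parquet
round_trip = convert_cudf_to_df(cudf_read_parquet(file_path))
print(round_trip)
```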
+def cudf_encode_chunked_files(chunk_files_paths: list,
+                              HF_encoder,
+                              cols: list,
+                              col_text: str,
+                              path_embedded_chunks: str,
+                              reencode: bool = False) -> list:
+    """
+    Encode text from files and save the results in another pickle file.
+
+    Args:
+        chunk_files_paths (List[str]): List of file paths containing documents.
+        HF_encoder (Encoder): Encoder object for text vectorization.
+        cols (List[str]): Columns to keep in the resulting DataFrame.
+        col_text (str): Column containing text data in the DataFrame.
+        path_embedded_chunks (str): Path to save the embedded chunks.
+        reencode (bool, optional): Whether to re-encode files even if they already exist. Defaults to False.
+
+    Returns:
+        List[str]: List of paths for newly created files.
+    """
+    new_file_paths=[]
+    for file in tqdm(chunk_files_paths, total=len(chunk_files_paths), desc="Encoding text from files"):
+        new_filename = os.path.splitext(os.path.basename(file))[0]+"_embedded"
+        new_file_path = os.path.join(path_embedded_chunks, new_filename+".parquet")
+        # check whether encoding has already been done; if reencode == True, run the procedure anyway
+        if not os.path.exists(new_file_path) or reencode:
+            current_df = cudf_read_parquet(file)
+
+            text_list = current_df[col_text].to_arrow().to_pylist()
+
+            # text vectorization
+            embeddings = HF_encoder.embed_documents(text_list)
+
+            # build a dataframe with the embeddings
+            current_df = current_df[cols]
+            current_df['embeddings'] = embeddings
+
+            # save the result
+            new_file_path = cudf_write_parquet(current_df, path_embedded_chunks, new_filename)
+            new_file_paths.append(new_file_path)
+        else :
+            new_file_paths.append(new_file_path)
+
+    return new_file_paths
+
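A self-contained sketch of `cudf_encode_chunked_files` with a stand-in encoder; in practice `HF_encoder` would be a Hugging Face style wrapper exposing `embed_documents`, and the paths, columns, and dummy vectors below are purely hypothetical:

```python
import pandas as pd
from opsci_toolbox.helpers.nlp_cuml import (
    convert_df_to_cudf,
    cudf_write_parquet,
    cudf_encode_chunked_files,
)

class DummyEncoder:
    """Stand-in for HF_encoder: returns one fixed-size vector per document."""
    def embed_documents(self, texts):
        return [[float(len(t)), 0.0, 1.0] for t in texts]

# Hypothetical input chunk; /tmp/chunks and /tmp/embedded must already exist.
chunk = convert_df_to_cudf(pd.DataFrame({"id": [1, 2], "text": ["hello gpu", "hello cpu"]}))
chunk_path = cudf_write_parquet(chunk, "/tmp/chunks", "corpus_0")

embedded_paths = cudf_encode_chunked_files(
    chunk_files_paths=[chunk_path],
    HF_encoder=DummyEncoder(),
    cols=["id", "text"],
    col_text="text",
    path_embedded_chunks="/tmp/embedded",
    reencode=True,
)
print(embedded_paths)  # ['/tmp/embedded/corpus_0_embedded.parquet']
```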
+def split_df_into_chunks(df: pd.DataFrame, path: str, name: str, chunk_size: int = 10000) -> list[str]:
+    """
+    Split a DataFrame into multiple pickle files with a specified chunk size.
+
+    Args:
+        df (pd.DataFrame): The DataFrame to be split.
+        path (str): The directory path where the pickle files will be saved.
+        name (str): The base name for the pickle files.
+        chunk_size (int, optional): The size of each chunk. Default is 10000.
+
+    Returns:
+        list[str]: A list of file paths to the saved pickle files.
+    """
+    num_chunks = -(-len(df) // chunk_size)  # Calculate the number of chunks using ceil division
+
+    file_paths = []
+
+    # create smaller datasets of chunk_size each
+    for i in range(num_chunks):
+        start = i * chunk_size
+        end = (i + 1) * chunk_size
+        chunk = df.iloc[start:end]
+        filename = f"{name}_{i}"  # Adjust the filename format as needed
+        file_path = cudf_write_parquet(chunk, path, filename)
+        file_paths.append(file_path)
+
+    return file_paths
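Finally, a sketch of the chunking step that feeds the rest of the pipeline; the docstring still says "pickle files", but the helper now writes parquet through `cudf_write_parquet` (the corpus and directory below are hypothetical):

```python
import pandas as pd
from opsci_toolbox.helpers.nlp_cuml import split_df_into_chunks

# Hypothetical corpus of 25,000 rows; chunk_size=10000 yields 3 parquet files.
df = pd.DataFrame({"id": range(25_000), "text": ["some document"] * 25_000})

# The target directory is assumed to exist already.
chunk_paths = split_df_into_chunks(df, path="/tmp/chunks", name="corpus", chunk_size=10_000)
print(chunk_paths)
# ['/tmp/chunks/corpus_0.parquet', '/tmp/chunks/corpus_1.parquet', '/tmp/chunks/corpus_2.parquet']
```

From there, the chunk files flow through `cudf_encode_chunked_files`, `process_UMAP`, and `process_HDBSCAN` as shown in the hunks above.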