opsci-toolbox 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,36 +1,47 @@
  from cuml import UMAP
+ import cudf
+ from sklearn.feature_selection import chi2
+ from cuml.feature_extraction.text import CountVectorizer
  from cuml.cluster.hdbscan import HDBSCAN, all_points_membership_vectors, approximate_predict, membership_vector
  import numpy as np
  from tqdm import tqdm
  import os
  from opsci_toolbox.helpers.common import load_pickle, create_dir, write_pickle
+ import cudf.pandas
+ cudf.pandas.install()
+ import pandas as pd
 
  def reduce_with_cuml_UMAP(embeddings: np.ndarray,
  n_neighbors: int = 5,
  n_components: int = 3,
  min_dist: float = 0.0,
  metric: str = "cosine",
- spread: float = 1.0) -> tuple:
+ spread: float = 1.0,
+ learning_rate: float = 1.0,
+ n_epochs:int = 300
+ ) -> tuple:
  """
  Reduces the dimensionality of embeddings using UMAP with cuML library.
 
- Parameters:
- - embeddings (np.ndarray): The input embeddings to be reduced.
- - n_neighbors (int, optional): The number of nearest neighbors to consider. Defaults to 5.
- - n_components (int, optional): The number of dimensions of the embedded space. Defaults to 3.
- - min_dist (float, optional): The minimum distance between embedded points. Defaults to 0.0.
- - metric (str, optional): The metric to use for distance computation. Defaults to "cosine".
- - spread (float, optional): The effective scale of embedded points. Defaults to 1.0.
+ Args:
+ embeddings (np.ndarray): The input embeddings to be reduced.
+ n_neighbors (int, optional): The number of nearest neighbors to consider. Defaults to 5.
+ n_components (int, optional): The number of dimensions of the embedded space. Defaults to 3.
+ min_dist (float, optional): The minimum distance between embedded points. Defaults to 0.0.
+ metric (str, optional): The metric to use for distance computation. Defaults to "cosine".
+ spread (float, optional): The effective scale of embedded points. Defaults to 1.0.
 
  Returns:
- - reducer (UMAP): The UMAP reducer object.
- - reduced_embeddings (np.ndarray): The reduced embeddings.
+ reducer (UMAP): The UMAP reducer object.
+ reduced_embeddings (np.ndarray): The reduced embeddings.
  """
  reducer = UMAP(n_neighbors=n_neighbors,
  n_components=n_components,
  min_dist=min_dist,
  metric=metric,
- spread = spread).fit(embeddings)
+ spread = spread,
+ n_epochs=n_epochs,
+ learning_rate=learning_rate).fit(embeddings)
 
  reduced_embeddings = reducer.transform(embeddings)
  return reducer, reduced_embeddings
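The hunk above adds learning_rate and n_epochs keyword arguments to reduce_with_cuml_UMAP and passes them through to cuML's UMAP. A minimal usage sketch, not part of the diff, assuming the functions are imported from the module shown here (import path, embedding shapes and values are placeholders):

import numpy as np

embeddings = np.random.rand(1000, 384).astype(np.float32)  # placeholder embedding matrix

# Fit the reducer, exercising the new 0.0.9 arguments.
reducer, reduced = reduce_with_cuml_UMAP(
    embeddings,
    n_neighbors=15,
    n_components=5,
    n_epochs=300,
    learning_rate=1.0,
)

# Project unseen points with the fitted reducer (see transform_with_cuml_UMAP below).
new_reduced = transform_with_cuml_UMAP(reducer, np.random.rand(100, 384).astype(np.float32))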
@@ -40,12 +51,12 @@ def transform_with_cuml_UMAP(reducer,
  """
  Transform new data points using a UMAP object.
 
- Parameters:
- - reducer (UMAP): The UMAP reducer object.
- - new_embeddings (np.ndarray): The new data points to be transformed.
+ Args:
+ reducer (UMAP): The UMAP reducer object.
+ new_embeddings (np.ndarray): The new data points to be transformed.
 
  Returns:
- - reduced_embeddings (np.ndarray): The transformed embeddings.
+ reduced_embeddings (np.ndarray): The transformed embeddings.
  """
  reduced_embeddings = reducer.transform(new_embeddings)
  return reduced_embeddings
@@ -68,7 +79,7 @@ def hdbscan_cuml_clustering(embeddings: np.ndarray,
  """
  Perform clustering using the HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) algorithm.
 
- Parameters:
+ Args:
  embeddings : array-like or sparse matrix, shape (n_samples, n_features)
  The input data to be clustered.
  min_cluster_size : int, optional
@@ -100,7 +111,7 @@ def hdbscan_cuml_clustering(embeddings: np.ndarray,
  Whether the data is prediction data or not. Default is True.
 
  Returns:
- clusterer : hdbscan.hdbscan_.HDBSCAN
+ clusterer : hdbscan.HDBSCAN
  HDBSCAN clusterer object.
  labels : array, shape (n_samples,)
  Cluster labels for each point. Noisy samples are given the label -1.
@@ -129,8 +140,8 @@ def transform_with_cuml_HDBSCAN(clusterer, new_embeddings: np.ndarray) -> tuple:
  """
  Transform new data points using an HDBSCAN object.
 
- Parameters:
- clusterer : hdbscan.hdbscan_.HDBSCAN
+ Args:
+ clusterer : hdbscan.HDBSCAN
  The HDBSCAN clusterer object trained on the original data.
  new_embeddings : array-like or sparse matrix, shape (n_samples, n_features)
  The new data points to be transformed.
@@ -149,15 +160,13 @@ def cuml_soft_clustering(clusterer) -> tuple:
  """
  Perform soft clustering using HDBSCAN.
 
- Parameters:
- clusterer : hdbscan.hdbscan_.HDBSCAN
+ Args:
+ clusterer : hdbscan.HDBSCAN
  The HDBSCAN clusterer object trained on the original data.
 
  Returns:
- soft_clusters_val : list of str
- Predicted cluster labels for each data point, represented as strings.
- soft_clusters_proba : list of float
- The maximum probability of each data point belonging to any cluster.
+ soft_clusters_val : list of str. Predicted cluster labels for each data point, represented as strings.
+ soft_clusters_proba : list of float. The maximum probability of each data point belonging to any cluster.
  """
  soft_clusters = all_points_membership_vectors(clusterer)
  soft_clusters_val = [str(np.argmax(x)) for x in soft_clusters]
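These hunks only touch docstrings (Parameters becomes Args, and the clusterer type reference is simplified), but together the two helpers cover hard and soft assignments. A short sketch, not part of the diff, assuming clusterer was fitted by hdbscan_cuml_clustering with prediction data enabled and reduced_embeddings is an (n_samples, n_components) array:

# Hard labels and probabilities for new points (used the same way in process_HDBSCAN below).
topics, probas = transform_with_cuml_HDBSCAN(clusterer, reduced_embeddings)

# Soft membership for the points the clusterer was trained on:
# argmax cluster id as a string, plus the maximum membership probability per point.
soft_topics, soft_probas = cuml_soft_clustering(clusterer)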
@@ -169,7 +178,7 @@ def soft_cuml_clustering_new_data(clusterer, embeddings: np.ndarray) -> tuple:
  """
  Predict cluster memberships for new data points using HDBSCAN soft clustering.
 
- Parameters:
+ Args:
  clusterer : hdbscan.hdbscan_.HDBSCAN
  The HDBSCAN clusterer object trained on the original data.
  embeddings : array-like or sparse matrix, shape (n_samples, n_features)
@@ -190,7 +199,7 @@ def process_UMAP(embedded_chunks_paths: list, path_reduced_embeddings_id: str, r
  """
  Process embeddings using UMAP reduction.
 
- Parameters:
+ Args:
  embedded_chunks_paths : list of str
  List of file paths containing the embedded chunks.
  path_reduced_embeddings_id : str
@@ -208,20 +217,21 @@ def process_UMAP(embedded_chunks_paths: list, path_reduced_embeddings_id: str, r
  for file_path in tqdm(embedded_chunks_paths, total=len(embedded_chunks_paths), desc="UMAP transform from files"):
 
  filename = os.path.splitext(os.path.basename(file_path))[0][:-9]
- new_filename = filename+"_reduce_embeddings.pickle"
+ new_filename = filename+"_reduce_embeddings.parquet"
  new_file_path = os.path.join(path_reduced_embeddings_id, new_filename)
 
  if not os.path.exists(new_file_path) or reencode:
- df = load_pickle(file_path)
+ df = cudf_read_parquet(file_path)
  create_dir(path_reduced_embeddings_id)
  # embeddings = df["embeddings"].to_list()
- embeddings = np.vstack(df['embeddings'].values)
+ # embeddings = np.vstack(df['embeddings'].values)
+ embeddings = np.vstack(df['embeddings'].to_pandas().tolist())
  reduced_embeddings = transform_with_cuml_UMAP(reducer, embeddings)
  reduced_embeddings_transformed=[list(e) for e in reduced_embeddings]
  df['reduced_embeddings'] = reduced_embeddings_transformed
  df.drop(columns=["embeddings"], inplace=True)
  print(path_reduced_embeddings_id, filename+"_reduce_embeddings")
- write_pickle(df, path_reduced_embeddings_id, filename+"_reduce_embeddings")
+ cudf_write_parquet(df, path_reduced_embeddings_id, filename+"_reduce_embeddings")
  new_file_paths.append(new_file_path)
  else:
  print("REDUCED EMBEDDINGS ALREADY EXISTS", file_path)
@@ -238,7 +248,7 @@ def process_HDBSCAN(clusterer,
  """
  Process reduced embeddings using HDBSCAN clustering.
 
- Parameters:
+ Args:
  clusterer : hdbscan.hdbscan_.HDBSCAN
  The HDBSCAN clusterer object.
  reduced_embeddings_paths : list of str
@@ -258,12 +268,13 @@ def process_HDBSCAN(clusterer,
  for file_path in tqdm(reduced_embeddings_paths, total=len(reduced_embeddings_paths), desc="HDBSCAN transform from files"):
 
  filename = os.path.splitext(os.path.basename(file_path))[0][:-18]
- new_filename = filename+ "_predictions.pickle"
+ new_filename = filename+ "_predictions.parquet"
  new_file_path = os.path.join(path_predictions_dataset_id, new_filename)
  if not os.path.exists(new_file_path) or reencode:
- df = load_pickle(file_path)
+ df = cudf_read_parquet(file_path)
  # reduced_embeddings = df["reduced_embeddings"].to_list()
- reduced_embeddings = np.vstack(df['reduced_embeddings'].values)
+ # reduced_embeddings = np.vstack(df['reduced_embeddings'].values)
+ reduced_embeddings = np.vstack(df['reduced_embeddings'].to_pandas().tolist())
  topics, probas = transform_with_cuml_HDBSCAN(clusterer, reduced_embeddings)
  df["topic"]=topics.astype(int).astype(str)
  df["proba"]=probas
@@ -272,9 +283,351 @@ def process_HDBSCAN(clusterer,
  df["soft_topic"]=soft_clusters
  df["soft_proba"]=soft_proba
 
- write_pickle(df, path_predictions_dataset_id, filename+ "_predictions")
+ cudf_write_parquet(df, path_predictions_dataset_id, filename+ "_predictions")
  new_file_paths.append(new_file_path)
  else:
  print("CLUSTERING ALREADY EXISTS", file_path)
  new_file_paths.append(new_file_path)
  return new_file_paths
+
+ # def cuml_word_frequency_per_categorie(df: pd.DataFrame, col_text: str, col_cat: str, ngram_range: tuple = (1, 1), stop_words: list = [], n_words: int = 20, min_freq: int = 3) -> pd.DataFrame:
+ # """
+ # Calculate word frequency per category using cuML for GPU acceleration.
+
+ # Parameters:
+ # df : pandas DataFrame
+ # DataFrame containing text data and corresponding categories.
+ # col_text : str
+ # Name of the column containing the text data.
+ # col_cat : str
+ # Name of the column containing the categories.
+ # ngram_range : tuple, optional
+ # The range for n-grams. Default is (1, 1) for unigrams.
+ # stop_words : list, optional
+ # List of stopwords to be ignored during frequency calculation. Default is an empty list.
+ # n_words : int, optional
+ # Number of top words to display per category. Default is 20.
+ # min_freq : int, optional
+ # Minimum frequency threshold for word occurrences per category. Default is 3.
+
+ # Returns:
+ # DataFrame
+ # DataFrame containing word frequencies per category.
+
+ # Description:
+ # This function calculates word frequencies per category based on the provided DataFrame, considering the text data and corresponding categories.
+ # It filters out words with frequencies below the specified minimum frequency threshold and returns the top words for each category.
+ # """
+ # # Convert pandas DataFrame to cuDF DataFrame
+ # gdf = cudf.DataFrame.from_pandas(df)
+
+ # # Initialize cuML's CountVectorizer
+ # count_vectorizer = CountVectorizer(analyzer='word', ngram_range=ngram_range, stop_words=stop_words)
+
+ # # Fit and transform the text data
+ # X_train_count = count_vectorizer.fit_transform(gdf[col_text])
+ # X_names_count = count_vectorizer.get_feature_names()
+
+ # # Initialize the resulting DataFrame
+ # df_count = cudf.DataFrame()
+
+ # # Calculate word frequencies per category
+ # for cat in gdf[col_cat].unique().to_pandas().tolist():
+ # word_count = X_train_count[gdf[col_cat] == cat].sum(axis=0)
+ # df_count_tmp = cudf.DataFrame({col_cat: [cat]*len(X_names_count), "word": X_names_count, "freq": word_count.tolist()[0]}).sort_values(by="freq", ascending=False)
+
+ # # Apply frequency and n_words filters
+ # if n_words:
+ # df_count_tmp = df_count_tmp.head(n_words)
+ # if min_freq:
+ # df_count_tmp = df_count_tmp[df_count_tmp["freq"] > min_freq]
+
+ # # Concatenate the result to the main DataFrame
+ # df_count = cudf.concat([df_count, df_count_tmp])
+
+ # # Convert the result back to pandas DataFrame
+ # return df_count.to_pandas()
+
+ def cuml_word_frequency_per_categorie(gdf: pd.DataFrame, col_text: str, col_cat: str, ngram_range: tuple = (1, 1), stop_words: list = [], n_words: int = 20, min_freq: int = 3) -> pd.DataFrame:
+ """
+ Calculate word frequency per category using cuML for GPU acceleration.
+
+ Args:
+ df : pandas DataFrame
+ DataFrame containing text data and corresponding categories.
+ col_text : str
+ Name of the column containing the text data.
+ col_cat : str
+ Name of the column containing the categories.
+ ngram_range : tuple, optional
+ The range for n-grams. Default is (1, 1) for unigrams.
+ stop_words : list, optional
+ List of stopwords to be ignored during frequency calculation. Default is an empty list.
+ n_words : int, optional
+ Number of top words to display per category. Default is 20.
+ min_freq : int, optional
+ Minimum frequency threshold for word occurrences per category. Default is 3.
+
+ Returns:
+ DataFrame
+ DataFrame containing word frequencies per category.
+
+ Description:
+ This function calculates word frequencies per category based on the provided DataFrame, considering the text data and corresponding categories.
+ It filters out words with frequencies below the specified minimum frequency threshold and returns the top words for each category.
+ """
+ # Convert pandas DataFrame to cuDF DataFrame
+ # gdf = cudf.DataFrame.from_pandas(df))
+ # print(type(gdf))
+ # gdf = convert_df_to_cudf(gdf)
+
+ # Initialize cuML's CountVectorizer
+ count_vectorizer = CountVectorizer(analyzer='word', ngram_range=ngram_range, stop_words=stop_words)
+
+ print(type(gdf[col_text]))
+ # Fit and transform the text data
+ X_train_count = count_vectorizer.fit_transform(cudf.Series(gdf[col_text]))
+ X_names_count = count_vectorizer.get_feature_names()
+
+ # Initialize the resulting DataFrame
+ df_count = cudf.DataFrame()
+
+ # Calculate word frequencies per category
+ for cat in gdf[col_cat].unique().tolist():
+ word_count = X_train_count[gdf[col_cat] == cat].sum(axis=0)
+ df_count_tmp = cudf.DataFrame({col_cat: [cat]*len(X_names_count), "word": X_names_count, "freq": word_count.tolist()[0]}).sort_values(by="freq", ascending=False)
+
+ # Apply frequency and n_words filters
+ if n_words:
+ df_count_tmp = df_count_tmp.head(n_words)
+ if min_freq:
+ df_count_tmp = df_count_tmp[df_count_tmp["freq"] > min_freq]
+
+ # Concatenate the result to the main DataFrame
+ df_count = cudf.concat([df_count, df_count_tmp])
+
+ # Convert the result back to pandas DataFrame
+ return df_count.to_pandas()
+
+ # def cuml_chi2_per_category(lst_text: list, lst_categorie: list, col_cat: str, n_words: int = 10, p_value_limit: float = 0.95, min_freq: int = 3) -> pd.DataFrame:
+
+ # # Convert input lists to cuDF Series
+ # gdf_text = cudf.Series(lst_text)
+ # gdf_categorie = cudf.Series(lst_categorie)
+
+ # # Initialize cuML's CountVectorizer
+ # count_vectorizer = CountVectorizer(analyzer='word')
+
+ # # Fit and transform the text data
+ # X_train_count = count_vectorizer.fit_transform(gdf_text)
+ # X_names_count = count_vectorizer.get_feature_names()
+
+ # # Initialize the resulting DataFrame
+ # df_chi = cudf.DataFrame()
+
+ # # Calculate Chi-squared statistics per category
+ # unique_categories = gdf_categorie.unique().to_pandas().tolist()
+ # for cat in unique_categories:
+ # cat_series = (gdf_categorie == cat).astype(int).to_pandas()
+ # chi2_scores, p_values = chi2(X_train_count.get(), cat_series)
+ # word_count = X_train_count[cat_series.astype(bool)].sum(axis=0).get()[0]
+
+ # df_chi_tmp = cudf.DataFrame({
+ # col_cat: cat,
+ # "relevant_words_chi2": X_names_count,
+ # "chi2": chi2_scores,
+ # "p_values": 1 - p_values,
+ # "word_count_per_class": word_count
+ # }).sort_values(by="chi2", ascending=False).head(n_words)
+
+ # # Filter based on p_values and word_count
+ # df_chi_tmp = df_chi_tmp[df_chi_tmp["p_values"] > p_value_limit]
+ # df_chi_tmp = df_chi_tmp[df_chi_tmp["word_count_per_class"] > min_freq]
+
+ # df_chi = cudf.concat([df_chi, df_chi_tmp])
+
+ # # Reset index
+ # df_chi.reset_index(drop=True, inplace=True)
+ # return df_chi.to_pandas()
+
+ def cuml_chi2_per_category(lst_text: list, lst_categorie: list, col_cat: str, n_words: int = 10, p_value_limit: float = 0.95, min_freq: int = 3) -> pd.DataFrame:
+ """
+ Calculate Chi-squared statistics for each category and return a DataFrame
+ of relevant words per category.
+
+ Args:
+ lst_text (List[str]): List of text documents.
+ lst_categorie (List[str]): List of categories corresponding to each document.
+ col_cat (str): Name of the category column in the resulting DataFrame.
+ n_words (int, optional): Number of top words to return per category. Default is 10.
+ p_value_limit (float, optional): The minimum p-value to filter relevant words. Default is 0.95.
+ min_freq (int, optional): The minimum frequency of words to be considered relevant. Default is 3.
+
+ Returns:
+ pd.DataFrame: A pandas DataFrame containing the relevant words for each category.
+ """
+ # Convert input lists to cuDF Series
+ gdf_text = cudf.Series(lst_text)
+ gdf_categorie = lst_categorie
+
+ # Initialize cuML's CountVectorizer
+ count_vectorizer = CountVectorizer(analyzer='word')
+
+ # Fit and transform the text data
+ X_train_count = count_vectorizer.fit_transform(gdf_text)
+ X_names_count = count_vectorizer.get_feature_names()
+
+ # Initialize the resulting DataFrame
+ df_chi = cudf.DataFrame()
+
+ # Calculate Chi-squared statistics per category
+ unique_categories = gdf_categorie.unique().tolist()
+ for cat in unique_categories:
+ cat_series = (gdf_categorie == cat).astype(int)
+ chi2_scores, p_values = chi2(X_train_count.get(), cat_series)
+ word_count = X_train_count[cat_series.astype(bool)].sum(axis=0).get()[0]
+
+ df_chi_tmp = cudf.DataFrame({
+ col_cat: cat,
+ "relevant_words_chi2": X_names_count,
+ "chi2": chi2_scores,
+ "p_values": 1 - p_values,
+ "word_count_per_class": word_count
+ }).sort_values(by="chi2", ascending=False).head(n_words)
+
+ # Filter based on p_values and word_count
+ df_chi_tmp = df_chi_tmp[df_chi_tmp["p_values"] > p_value_limit]
+ df_chi_tmp = df_chi_tmp[df_chi_tmp["word_count_per_class"] > min_freq]
+
+ df_chi = cudf.concat([df_chi, df_chi_tmp])
+
+ # Reset index
+ df_chi.reset_index(drop=True, inplace=True)
+ return df_chi.to_pandas()
+
+ def cudf_write_parquet(df: cudf.DataFrame, path: str, filename: str) -> str:
+ """
+ Write a cuDF DataFrame to a Parquet file.
+
+ Args:
+ df (cudf.DataFrame): The cuDF DataFrame to be written.
+ path (str): The directory path where the file should be saved.
+ filename (str): The name of the file without extension.
+
+ Returns:
+ str: The file path of the saved Parquet file.
+ """
+ file_path = os.path.join(path, str(filename)+".parquet")
+ df.to_parquet(file_path)
+ return file_path
+
+ def cudf_read_parquet(path: str) -> cudf.DataFrame:
+ """
+ Read a Parquet file into a cuDF DataFrame.
+
+ Args:
+ path (str): The file path to the Parquet file.
+
+ Returns:
+ cudf.DataFrame: The read cuDF DataFrame.
+ """
+ df = cudf.read_parquet(path)
+ return df
+
+ def convert_df_to_cudf(df: pd.DataFrame) -> cudf.DataFrame:
+ """
+ Convert a pandas DataFrame to a cuDF DataFrame.
+
+ Args:
+ df (pd.DataFrame): The pandas DataFrame to convert.
+
+ Returns:
+ cudf.DataFrame: The resulting cuDF DataFrame.
+ """
+ return cudf.DataFrame.from_pandas(df)
+
+ def convert_cudf_to_df(cdf: cudf.DataFrame) -> pd.DataFrame:
+ """
+ Convert a cuDF DataFrame to a pandas DataFrame.
+
+ Args:
+ cdf (cudf.DataFrame): The cuDF DataFrame to convert.
+
+ Returns:
+ pd.DataFrame: The resulting pandas DataFrame.
+ """
+ return cdf.to_pandas()
+
+
+ def cudf_encode_chunked_files(chunk_files_paths: list,
+ HF_encoder,
+ cols: list,
+ col_text: str,
+ path_embedded_chunks: str,
+ reencode: bool = False) -> list:
+ """
+ Encode text from files and save the results in another pickle file.
+
+ Args:
+ chunk_files_paths (List[str]): List of file paths containing documents.
+ HF_encoder (Encoder): Encoder object for text vectorization.
+ cols (List[str]): Columns to keep in the resulting DataFrame.
+ col_text (str): Column containing text data in the DataFrame.
+ path_embedded_chunks (str): Path to save the embedded chunks.
+ reencode (bool, optional): Whether to re-encode files even if they already exist. Defaults to False.
+
+ Returns:
+ List[str]: List of paths for newly created files.
+ """
+ new_file_paths=[]
+ for file in tqdm(chunk_files_paths, total=len(chunk_files_paths), desc="Encoding text from files"):
+ new_filename = os.path.splitext(os.path.basename(file))[0]+"_embedded"
+ new_file_path = os.path.join(path_embedded_chunks, new_filename+".parquet")
+ # check whether the encoding has already been done; if reencode == True, run the procedure anyway
+ if not os.path.exists(new_file_path) or reencode:
+ current_df = cudf_read_parquet(file)
+
+ text_list = current_df[col_text].to_arrow().to_pylist()
+
+ # text vectorization
+ embeddings = HF_encoder.embed_documents(text_list)
+
+ # build a dataframe with the embeddings
+ current_df = current_df[cols]
+ current_df['embeddings'] = embeddings
+
+ # save
+ new_file_path = cudf_write_parquet(current_df, path_embedded_chunks, new_filename)
+ new_file_paths.append(new_file_path)
+ else :
+ new_file_paths.append(new_file_path)
+
+ return new_file_paths
+
+ def split_df_into_chunks(df: pd.DataFrame, path: str, name: str, chunk_size: int = 10000) -> list[str]:
+ """
+ Split a DataFrame into multiple pickle files with a specified chunk size.
+
+ Args:
+ df (pd.DataFrame): The DataFrame to be split.
+ path (str): The directory path where the pickle files will be saved.
+ name (str): The base name for the pickle files.
+ chunk_size (int, optional): The size of each chunk. Default is 10000.
+
+ Returns:
+ list[str]: A list of file paths to the saved pickle files.
+ """
+ num_chunks = -(-len(df) // chunk_size) # Calculate the number of chunks using ceil division
+
+ file_paths = []
+
+ # create smaller datasets of chunk_size each
+ for i in range(num_chunks):
+ start = i * chunk_size
+ end = (i + 1) * chunk_size
+ chunk = df.iloc[start:end]
+ filename = f"{name}_{i}" # Adjust the filename format as needed
+ file_path = cudf_write_parquet(chunk, path, filename)
+ file_paths.append(file_path)
+
+ return file_paths
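Taken together, the 0.0.9 additions form a GPU-backed chunked pipeline: split a corpus into Parquet chunks, encode them, reduce with UMAP, cluster with HDBSCAN, then compute per-category keyword statistics. An illustrative end-to-end sketch, not part of the diff: paths, column names and the HF_encoder, reducer and clusterer objects are placeholders assumed to exist (reducer from reduce_with_cuml_UMAP, clusterer from hdbscan_cuml_clustering), and the positional arguments follow the truncated signatures shown in the hunks above.

import pandas as pd

df = pd.DataFrame({"text": ["..."], "category": ["..."]})   # placeholder corpus

# 1. Split the corpus into Parquet chunks and encode each chunk on GPU.
chunk_paths = split_df_into_chunks(df, "chunks/", "corpus", chunk_size=10000)
embedded_paths = cudf_encode_chunked_files(chunk_paths, HF_encoder,
                                           cols=["text", "category"],
                                           col_text="text",
                                           path_embedded_chunks="embedded/")

# 2. Reduce and cluster chunk by chunk with the previously fitted reducer / clusterer.
reduced_paths = process_UMAP(embedded_paths, "reduced/", reducer)
prediction_paths = process_HDBSCAN(clusterer, reduced_paths, "predictions/")

# 3. GPU-accelerated keyword statistics per category. Note that despite the list type
#    hint, cuml_chi2_per_category calls .unique() on its category argument, so a Series
#    is passed here rather than a plain list.
df_freq = cuml_word_frequency_per_categorie(df, col_text="text", col_cat="category")
df_chi = cuml_chi2_per_category(df["text"].tolist(), df["category"], col_cat="category")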